skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +452 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +21 -1
- sky/data/storage.py +12 -0
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/provision/runpod/utils.py +27 -12
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/daemons.py +164 -0
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +2 -107
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +2 -1
- sky/server/uvicorn.py +2 -1
- sky/sky_logging.py +30 -0
- sky/skylet/constants.py +2 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +47 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/serve/service.py
CHANGED
@@ -222,7 +222,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
             requested_resources_str=backend_utils.get_task_resources_str(task),
             load_balancing_policy=service_spec.load_balancing_policy,
             status=serve_state.ServiceStatus.CONTROLLER_INIT,
-            tls_encrypted=service_spec.tls_credential is not None
+            tls_encrypted=service_spec.tls_credential is not None,
+            pool=service_spec.pool)
         # Directly throw an error here. See sky/serve/api.py::up
         # for more details.
         if not success:
@@ -292,14 +293,17 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
     # TODO(tian): Probably we could enable multiple ports specified in
     # service spec and we could start multiple load balancers.
     # After that, we will have a mapping from replica port to endpoint.
-
-
-
-
-
-
-
-
+    # NOTE(tian): We don't need the load balancer for cluster pool.
+    # Skip the load balancer process for cluster pool.
+    if not service_spec.pool:
+        load_balancer_process = multiprocessing.Process(
+            target=ux_utils.RedirectOutputForProcess(
+                load_balancer.run_load_balancer,
+                load_balancer_log_file).run,
+            args=(controller_addr, load_balancer_port,
+                  service_spec.load_balancing_policy,
+                  service_spec.tls_credential))
+        load_balancer_process.start()

     if not is_recovery:
         serve_state.set_service_load_balancer_port(
sky/serve/service_spec.py
CHANGED
@@ -43,7 +43,33 @@ class SkyServiceSpec:
         upscale_delay_seconds: Optional[int] = None,
         downscale_delay_seconds: Optional[int] = None,
         load_balancing_policy: Optional[str] = None,
+        pool: Optional[bool] = None,
     ) -> None:
+        if pool:
+            for unsupported_field in [
+                    'max_replicas',
+                    'num_overprovision',
+                    'target_qps_per_replica',
+                    'upscale_delay_seconds',
+                    'downscale_delay_seconds',
+                    'base_ondemand_fallback_replicas',
+                    'dynamic_ondemand_fallback',
+                    'spot_placer',
+                    'load_balancing_policy',
+                    'ports',
+                    'post_data',
+                    'tls_credential',
+                    'readiness_headers',
+            ]:
+                if locals()[unsupported_field] is not None:
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(
+                            f'{unsupported_field} is not supported for pool.')
+            if max_replicas is not None and max_replicas != min_replicas:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError('Autoscaling is not supported for pool '
+                                     'for now.')
+
         if max_replicas is not None and max_replicas < min_replicas:
             with ux_utils.print_exception_no_traceback():
                 raise ValueError('max_replicas must be greater than or '
@@ -96,6 +122,7 @@ class SkyServiceSpec:
         self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
         self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
         self._load_balancing_policy: Optional[str] = load_balancing_policy
+        self._pool: Optional[bool] = pool

         self._use_ondemand_fallback: bool = (
             self.dynamic_ondemand_fallback is not None and
@@ -115,7 +142,7 @@ class SkyServiceSpec:

         service_config: Dict[str, Any] = {}

-        readiness_section = config
+        readiness_section = config.get('readiness_probe', '/')
         if isinstance(readiness_section, str):
             service_config['readiness_path'] = readiness_section
             initial_delay_seconds = None
@@ -157,8 +184,29 @@ class SkyServiceSpec:
                 raise ValueError('Port must be between 1 and 65535.')
         service_config['ports'] = str(ports) if ports is not None else None

+        pool_config = config.get('pool', None)
+        if pool_config is not None:
+            service_config['pool'] = pool_config
+
         policy_section = config.get('replica_policy', None)
+        if policy_section is not None and pool_config:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify `replica_policy` for cluster '
+                                 'pool. Only `workers: <num>` is supported '
+                                 'for cluster pool now.')
+
         simplified_policy_section = config.get('replicas', None)
+        workers_config = config.get('workers', None)
+        if simplified_policy_section is not None and workers_config is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify both `replicas` and `workers`.'
+                                 ' Please use one of them.')
+        if simplified_policy_section is not None and pool_config:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify `replicas` for cluster pool. '
+                                 'Please use `workers` instead.')
+        if simplified_policy_section is None:
+            simplified_policy_section = workers_config
         if policy_section is None or simplified_policy_section is not None:
             if simplified_policy_section is not None:
                 min_replicas = simplified_policy_section
@@ -239,6 +287,13 @@ class SkyServiceSpec:
                 config[section] = dict()
             config[section][key] = value

+        add_if_not_none('pool', None, self._pool)
+
+        if self.pool:
+            # For pool, currently only `workers: <num>` is supported.
+            add_if_not_none('workers', None, self.min_replicas)
+            return config
+
         add_if_not_none('readiness_probe', 'path', self.readiness_path)
         add_if_not_none('readiness_probe', 'initial_delay_seconds',
                         self.initial_delay_seconds)
@@ -306,10 +361,14 @@ class SkyServiceSpec:
         return ' '.join(policy_strs)

     def autoscaling_policy_str(self):
+        if self.pool:
+            # We only support fixed-size pool for now.
+            return f'Fixed-size ({self.min_replicas} workers)'
         # TODO(MaoZiming): Update policy_str
+        noun = 'worker' if self.pool else 'replica'
         min_plural = '' if self.min_replicas == 1 else 's'
         if self.max_replicas == self.min_replicas or self.max_replicas is None:
-            return f'Fixed {self.min_replicas}
+            return f'Fixed {self.min_replicas} {noun}{min_plural}'
         # Already checked in __init__.
         assert self.target_qps_per_replica is not None
         # TODO(tian): Refactor to contain more information
@@ -319,8 +378,8 @@ class SkyServiceSpec:
             overprovision_str = (
                 f' with {self.num_overprovision} overprovisioned replicas')
         return (f'Autoscaling from {self.min_replicas} to {self.max_replicas} '
-                f'
-                f'
+                f'{noun}{max_plural}{overprovision_str} (target QPS per '
+                f'{noun}: {self.target_qps_per_replica})')

     def set_ports(self, ports: str) -> None:
         self._ports = ports
@@ -332,6 +391,10 @@ class SkyServiceSpec:
                 f'Certfile: {self.tls_credential.certfile}')

     def __repr__(self) -> str:
+        if self.pool:
+            return textwrap.dedent(f"""\
+                Worker policy: {self.autoscaling_policy_str()}
+                """)
         return textwrap.dedent(f"""\
             Readiness probe method: {self.probe_str()}
             Readiness initial delay seconds: {self.initial_delay_seconds}
@@ -420,3 +483,10 @@ class SkyServiceSpec:
     def load_balancing_policy(self) -> str:
         return lb_policies.LoadBalancingPolicy.make_policy_name(
             self._load_balancing_policy)
+
+    @property
+    def pool(self) -> bool:
+        # This can happen for backward compatibility.
+        if not hasattr(self, '_pool'):
+            return False
+        return bool(self._pool)
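For orientation, the config parsing above now recognizes two new top-level keys, `pool` and `workers`. Below is a minimal sketch of the accepted shape, inferred only from the `config.get(...)` calls in this diff; the semantics described in the comments are assumptions, not documented behavior.

# Sketch only: a cluster-pool service config as the parsing code above reads it.
# Keys come from the diff; the values are invented for illustration.
pool_service_config = {
    'pool': True,   # marks the spec as a cluster pool; the load balancer is skipped
    'workers': 4,   # fixed pool size; `replicas` and `replica_policy` are rejected
}
# With `pool` set, SkyServiceSpec.__init__ raises ValueError for replica-only
# fields such as `ports`, `tls_credential`, `load_balancing_policy` or any
# autoscaling knob, and autoscaling_policy_str() reports 'Fixed-size (<n> workers)'.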
sky/server/constants.py
CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION =
+API_VERSION = 12

 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/daemons.py
ADDED
@@ -0,0 +1,164 @@
"""Internal server daemons that run in the background."""
import dataclasses
import os
import time
from typing import Callable

from sky import sky_logging
from sky import skypilot_config
from sky.server import constants as server_constants
from sky.utils import common
from sky.utils import env_options
from sky.utils import ux_utils

logger = sky_logging.init_logger(__name__)


@dataclasses.dataclass
class InternalRequestDaemon:
    """Internal daemon that runs an event in the background."""

    id: str
    name: str
    event_fn: Callable[[], None]
    default_log_level: str = 'INFO'

    def refresh_log_level(self) -> int:
        # pylint: disable=import-outside-toplevel
        import logging

        try:
            # Refresh config within the while loop.
            # Since this is a long running daemon,
            # reload_config_for_new_request()
            # is not called in between the event runs.
            skypilot_config.safe_reload_config()
            # Get the configured log level for the daemon inside the event loop
            # in case the log level changes after the API server is started.
            level_str = skypilot_config.get_nested(
                ('daemons', self.id, 'log_level'), self.default_log_level)
            return getattr(logging, level_str.upper())
        except AttributeError:
            # Bad level should be rejected by
            # schema validation, just in case.
            logger.warning(f'Invalid log level: {level_str}, using DEBUG')
            return logging.DEBUG
        except Exception as e:  # pylint: disable=broad-except
            logger.exception(f'Error refreshing log level for {self.id}: {e}')
            return logging.DEBUG

    def run_event(self):
        """Run the event."""

        # Disable logging for periodic refresh to avoid the usage message being
        # sent multiple times.
        os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'

        level = self.refresh_log_level()
        while True:
            try:
                with ux_utils.enable_traceback(), \
                    sky_logging.set_sky_logging_levels(level):
                    sky_logging.reload_logger()
                    level = self.refresh_log_level()
                    self.event_fn()
            except Exception:  # pylint: disable=broad-except
                # It is OK to fail to run the event, as the event is not
                # critical, but we should log the error.
                logger.exception(
                    f'Error running {self.name} event. '
                    f'Restarting in '
                    f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
                    'seconds...')
                time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)


def refresh_cluster_status_event():
    """Periodically refresh the cluster status."""
    # pylint: disable=import-outside-toplevel
    from sky import core

    logger.info('=== Refreshing cluster status ===')
    # This periodically refresh will hold the lock for the cluster being
    # refreshed, but it is OK because other operations will just wait for
    # the lock and get the just refreshed status without refreshing again.
    core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
    logger.info('Status refreshed. Sleeping '
                f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
                ' seconds for the next refresh...\n')
    time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)


def refresh_volume_status_event():
    """Periodically refresh the volume status."""
    # pylint: disable=import-outside-toplevel
    from sky.volumes.server import core

    # Disable logging for periodic refresh to avoid the usage message being
    # sent multiple times.
    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'

    logger.info('=== Refreshing volume status ===')
    core.volume_refresh()
    logger.info('Volume status refreshed. Sleeping '
                f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
                ' seconds for the next refresh...\n')
    time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)


def managed_job_status_refresh_event():
    """Refresh the managed job status for controller consolidation mode."""
    # pylint: disable=import-outside-toplevel
    from sky.jobs import utils as managed_job_utils
    if not managed_job_utils.is_consolidation_mode():
        return
    # We run the recovery logic before starting the event loop as those two are
    # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
    from sky.utils import controller_utils
    if controller_utils.high_availability_specified(
            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
        managed_job_utils.ha_recovery_for_consolidation_mode()
    # After recovery, we start the event loop.
    from sky.skylet import events
    refresh_event = events.ManagedJobEvent()
    scheduling_event = events.ManagedJobSchedulingEvent()
    logger.info('=== Running managed job event ===')
    refresh_event.run()
    scheduling_event.run()
    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)


def sky_serve_status_refresh_event():
    """Refresh the sky serve status for controller consolidation mode."""
    # pylint: disable=import-outside-toplevel
    from sky.serve import serve_utils
    if not serve_utils.is_consolidation_mode():
        return
    # TODO(tian): Add HA recovery logic.
    from sky.skylet import events
    event = events.ServiceUpdateEvent()
    logger.info('=== Running serve status refresh event ===')
    event.run()
    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)


# Register the events to run in the background.
INTERNAL_REQUEST_DAEMONS = [
    # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
    # set to updated status automatically, without showing users the hint of
    # cluster being stopped or down when `sky status -r` is called.
    InternalRequestDaemon(id='skypilot-status-refresh-daemon',
                          name='status',
                          event_fn=refresh_cluster_status_event,
                          default_log_level='DEBUG'),
    # Volume status refresh daemon to update the volume status periodically.
    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
                          name='volume',
                          event_fn=refresh_volume_status_event),
    InternalRequestDaemon(id='managed-job-status-refresh-daemon',
                          name='managed-job-status',
                          event_fn=managed_job_status_refresh_event),
    InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
                          name='sky-serve-status',
                          event_fn=sky_serve_status_refresh_event),
]
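To make the new module concrete, here is a hypothetical sketch of what one entry in `INTERNAL_REQUEST_DAEMONS` looks like; the daemon id, name, and event function below are invented and only illustrate the dataclass introduced above.

# Hypothetical example (not part of the diff): the shape of a daemon entry.
from sky.server import daemons


def example_refresh_event() -> None:
    # One iteration of work. run_event() loops forever, re-reads the configured
    # log level from `daemons.<id>.log_level`, and restarts after failures.
    pass


example_daemon = daemons.InternalRequestDaemon(
    id='example-refresh-daemon',    # hypothetical id
    name='example-refresh',         # hypothetical name
    event_fn=example_refresh_event,
    default_log_level='INFO')

At API server startup (see sky/server/server.py below), each registered daemon is scheduled as an internal request whose request_id equals the daemon's id, which is also how kill_requests() knows to skip them.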
sky/server/requests/payloads.py
CHANGED
@@ -478,6 +478,8 @@ class JobsLaunchBody(RequestBody):
     """The request body for the jobs launch endpoint."""
     task: str
     name: Optional[str]
+    pool: Optional[str] = None
+    num_jobs: Optional[int] = None

     def to_kwargs(self) -> Dict[str, Any]:
         kwargs = super().to_kwargs()
@@ -500,6 +502,7 @@ class JobsCancelBody(RequestBody):
     job_ids: Optional[List[int]] = None
     all: bool = False
     all_users: bool = False
+    pool: Optional[str] = None


 class JobsLogsBody(RequestBody):
@@ -671,6 +674,36 @@ class JobsDownloadLogsBody(RequestBody):
     local_dir: str = constants.SKY_LOGS_DIRECTORY


+class JobsPoolApplyBody(RequestBody):
+    """The request body for the jobs pool apply endpoint."""
+    task: str
+    pool_name: str
+    mode: serve.UpdateMode
+
+    def to_kwargs(self) -> Dict[str, Any]:
+        kwargs = super().to_kwargs()
+        dag = common.process_mounts_in_task_on_api_server(self.task,
+                                                          self.env_vars,
+                                                          workdir_only=False)
+        assert len(
+            dag.tasks) == 1, ('Must only specify one task in the DAG for '
+                              'a pool.', dag)
+        kwargs['task'] = dag.tasks[0]
+        return kwargs
+
+
+class JobsPoolDownBody(RequestBody):
+    """The request body for the jobs pool down endpoint."""
+    pool_names: Optional[Union[str, List[str]]]
+    all: bool = False
+    purge: bool = False
+
+
+class JobsPoolStatusBody(RequestBody):
+    """The request body for the jobs pool status endpoint."""
+    pool_names: Optional[Union[str, List[str]]]
+
+
 class UploadZipFileResponse(pydantic.BaseModel):
     """The response body for the upload zip file endpoint."""
     status: str
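As a rough illustration of the new fields on the launch body (field names come from the diff; the reading of `num_jobs` is an assumption), a jobs-launch request might carry:

# Illustrative payload shape only; values are invented.
jobs_launch_payload = {
    'task': '<serialized task sent to the API server>',  # unchanged field
    'name': 'my-job',      # hypothetical job name
    'pool': 'my-pool',     # new: submit the job into an existing cluster pool
    'num_jobs': 3,         # new: presumably the number of jobs to submit at once
}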
sky/server/requests/requests.py
CHANGED
@@ -24,12 +24,11 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.server import common as server_common
 from sky.server import constants as server_constants
+from sky.server import daemons
 from sky.server.requests import payloads
 from sky.server.requests.serializers import decoders
 from sky.server.requests.serializers import encoders
-from sky.utils import common
 from sky.utils import common_utils
-from sky.utils import env_options
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 from sky.utils.db import db_utils
@@ -307,110 +306,6 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
     kill_requests(request_ids)


-def refresh_cluster_status_event():
-    """Periodically refresh the cluster status."""
-    # pylint: disable=import-outside-toplevel
-    from sky import core
-
-    # Disable logging for periodic refresh to avoid the usage message being
-    # sent multiple times.
-    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
-
-    while True:
-        logger.info('=== Refreshing cluster status ===')
-        # This periodically refresh will hold the lock for the cluster being
-        # refreshed, but it is OK because other operations will just wait for
-        # the lock and get the just refreshed status without refreshing again.
-        core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
-        logger.info(
-            'Status refreshed. Sleeping '
-            f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
-            ' seconds for the next refresh...\n')
-        time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
-
-
-def refresh_volume_status_event():
-    """Periodically refresh the volume status."""
-    # pylint: disable=import-outside-toplevel
-    from sky.volumes.server import core
-
-    # Disable logging for periodic refresh to avoid the usage message being
-    # sent multiple times.
-    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
-
-    while True:
-        logger.info('=== Refreshing volume status ===')
-        core.volume_refresh()
-        logger.info('Volume status refreshed. Sleeping '
-                    f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
-                    ' seconds for the next refresh...\n')
-        time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
-
-
-def managed_job_status_refresh_event():
-    """Refresh the managed job status for controller consolidation mode."""
-    # pylint: disable=import-outside-toplevel
-    from sky.jobs import utils as managed_job_utils
-    if not managed_job_utils.is_consolidation_mode():
-        return
-    # We run the recovery logic before starting the event loop as those two are
-    # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
-    from sky.utils import controller_utils
-    if controller_utils.high_availability_specified(
-            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
-        managed_job_utils.ha_recovery_for_consolidation_mode()
-    # After recovery, we start the event loop.
-    from sky.skylet import events
-    event = events.ManagedJobEvent()
-    while True:
-        time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
-        event.run()
-
-
-@dataclasses.dataclass
-class InternalRequestDaemon:
-    """Internal daemon that runs an event in the background."""
-
-    id: str
-    name: str
-    event_fn: Callable[[], None]
-
-    def run_event(self):
-        """Run the event."""
-        while True:
-            with ux_utils.enable_traceback():
-                try:
-                    self.event_fn()
-                    break
-                except Exception:  # pylint: disable=broad-except
-                    # It is OK to fail to run the event, as the event is not
-                    # critical, but we should log the error.
-                    logger.exception(
-                        f'Error running {self.name} event. '
-                        f'Restarting in '
-                        f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
-                        'seconds...')
-                    time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
-
-
-# Register the events to run in the background.
-INTERNAL_REQUEST_DAEMONS = [
-    # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
-    # set to updated status automatically, without showing users the hint of
-    # cluster being stopped or down when `sky status -r` is called.
-    InternalRequestDaemon(id='skypilot-status-refresh-daemon',
-                          name='status',
-                          event_fn=refresh_cluster_status_event),
-    # Volume status refresh daemon to update the volume status periodically.
-    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
-                          name='volume',
-                          event_fn=refresh_volume_status_event),
-    InternalRequestDaemon(id='managed-job-status-refresh-daemon',
-                          name='managed-job-status',
-                          event_fn=managed_job_status_refresh_event),
-]
-
-
 def kill_requests(request_ids: Optional[List[str]] = None,
                   user_id: Optional[str] = None) -> List[str]:
     """Kill a SkyPilot API request and set its status to cancelled.
@@ -441,7 +336,7 @@ def kill_requests(request_ids: Optional[List[str]] = None,
         # Skip internal requests. The internal requests are scheduled with
         # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
         if request_record.request_id in set(
-                event.id for event in INTERNAL_REQUEST_DAEMONS):
+                event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
             continue
         if request_record.status > RequestStatus.RUNNING:
             logger.debug(f'Request {request_id} already finished')
sky/server/requests/serializers/decoders.py
CHANGED
@@ -109,9 +109,8 @@ def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
     return jobs


-
-
-    service_statuses = return_value
+def _decode_serve_status(
+        service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     for service_status in service_statuses:
         service_status['status'] = serve_state.ServiceStatus(
             service_status['status'])
@@ -122,6 +121,16 @@ def decode_serve_status(return_value: List[dict]) -> List[Dict[str, Any]]:
     return service_statuses


+@register_decoders('serve.status')
+def decode_serve_status(return_value: List[dict]) -> List[Dict[str, Any]]:
+    return _decode_serve_status(return_value)
+
+
+@register_decoders('jobs.pool_status')
+def decode_jobs_pool_status(return_value: List[dict]) -> List[Dict[str, Any]]:
+    return _decode_serve_status(return_value)
+
+
 @register_decoders('cost_report')
 def decode_cost_report(
         return_value: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
sky/server/requests/serializers/encoders.py
CHANGED
@@ -112,8 +112,7 @@ def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
     return jobs


-
-def encode_serve_status(
+def _encode_serve_status(
         service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     for service_status in service_statuses:
         service_status['status'] = service_status['status'].value
@@ -123,6 +122,18 @@ def encode_serve_status(
     return service_statuses


+@register_encoder('serve.status')
+def encode_serve_status(
+        service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    return _encode_serve_status(service_statuses)
+
+
+@register_encoder('jobs.pool_status')
+def encode_jobs_pool_status(
+        pool_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    return _encode_serve_status(pool_statuses)
+
+
 @register_encoder('cost_report')
 def encode_cost_report(
         cost_report: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
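The new `jobs.pool_status` handlers reuse the serve-status serializers: the encoder flattens the enum status to a plain value for transport and the decoder restores it. Below is a minimal sketch of that per-record status round trip (illustrative only; it uses just the names that appear in this diff).

from sky.serve import serve_state

# What _encode_serve_status does to each record's status field...
record = {'status': serve_state.ServiceStatus.CONTROLLER_INIT}
wire = dict(record, status=record['status'].value)
# ...and what _decode_serve_status does on the way back.
restored = dict(wire, status=serve_state.ServiceStatus(wire['status']))
assert restored['status'] is record['status']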
sky/server/server.py
CHANGED
@@ -46,6 +46,7 @@ from sky.serve.server import server as serve_rest
 from sky.server import common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
+from sky.server import daemons
 from sky.server import metrics
 from sky.server import state
 from sky.server import stream_utils
@@ -482,7 +483,7 @@ async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-nam
     """FastAPI lifespan context manager."""
     del app  # unused
     # Startup: Run background tasks
-    for event in
+    for event in daemons.INTERNAL_REQUEST_DAEMONS:
         try:
             executor.schedule_request(
                 request_id=event.id,
sky/server/uvicorn.py
CHANGED
@@ -16,6 +16,7 @@ import uvicorn
 from uvicorn.supervisors import multiprocess

 from sky import sky_logging
+from sky.server import daemons
 from sky.server import state
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
@@ -120,7 +121,7 @@ class Server(uvicorn.Server):
                 # Proactively cancel internal requests and logs requests since
                 # they can run for infinite time.
                 internal_request_ids = [
-                    d.id for d in
+                    d.id for d in daemons.INTERNAL_REQUEST_DAEMONS
                 ]
                 if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
                     logger.warning('Timeout waiting for on-going requests to '
sky/sky_logging.py
CHANGED
@@ -171,6 +171,36 @@ def set_logging_level(logger: str, level: int):
         logger.setLevel(original_level)


+@contextlib.contextmanager
+def set_sky_logging_levels(level: int):
+    """Set the logging level for all loggers."""
+    # Turn off logger
+    previous_levels = {}
+    for logger_name in logging.Logger.manager.loggerDict:
+        if logger_name.startswith('sky'):
+            logger = logging.getLogger(logger_name)
+            previous_levels[logger_name] = logger.level
+            logger.setLevel(level)
+    if level == logging.DEBUG:
+        previous_show_debug_info = env_options.Options.SHOW_DEBUG_INFO.get()
+        os.environ[env_options.Options.SHOW_DEBUG_INFO.env_key] = '1'
+    try:
+        yield
+    finally:
+        # Restore logger
+        for logger_name in logging.Logger.manager.loggerDict:
+            if logger_name.startswith('sky'):
+                logger = logging.getLogger(logger_name)
+                try:
+                    logger.setLevel(previous_levels[logger_name])
+                except KeyError:
+                    # New loggers maybe initialized after the context manager,
+                    # no need to restore the level.
+                    pass
+        if level == logging.DEBUG and not previous_show_debug_info:
+            os.environ.pop(env_options.Options.SHOW_DEBUG_INFO.env_key)
+
+
 def logging_enabled(logger: logging.Logger, level: int) -> bool:
     return logger.level <= level
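A short usage sketch of the new context manager (illustrative only; the daemon loop in sky/server/daemons.py combines it with reload_logger() and refresh_log_level()):

import logging

from sky import sky_logging

# Temporarily raise every `sky.*` logger to DEBUG; the previous levels (and the
# SHOW_DEBUG_INFO environment toggle) are restored when the block exits.
with sky_logging.set_sky_logging_levels(logging.DEBUG):
    sky_logging.init_logger('sky.example').debug('verbose diagnostics enabled')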