skypilot-nightly 1.0.0.dev20250311__py3-none-any.whl → 1.0.0.dev20250312__py3-none-any.whl
This diff shows the changes between two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages' contents as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +11 -1
- sky/backends/backend_utils.py +38 -15
- sky/backends/cloud_vm_ray_backend.py +17 -52
- sky/clouds/nebius.py +8 -6
- sky/exceptions.py +11 -3
- sky/provision/kubernetes/utils.py +1 -1
- sky/server/requests/event_loop.py +31 -0
- sky/server/requests/executor.py +50 -22
- sky/server/requests/preconditions.py +174 -0
- sky/server/requests/requests.py +42 -3
- sky/server/server.py +29 -8
- sky/server/stream_utils.py +9 -6
- sky/server/uvicorn.py +81 -0
- sky/utils/accelerator_registry.py +1 -1
- sky/utils/subprocess_utils.py +56 -1
- {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250312.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250312.dist-info}/RECORD +22 -19
- {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250312.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250312.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250312.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250312.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '78a42b6e733bbc29b68efe0e9c79191eaaca9fcd'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250312'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/adaptors/nebius.py
CHANGED
@@ -6,9 +6,11 @@ from sky.adaptors import common
 NEBIUS_TENANT_ID_FILENAME = 'NEBIUS_TENANT_ID.txt'
 NEBIUS_IAM_TOKEN_FILENAME = 'NEBIUS_IAM_TOKEN.txt'
 NEBIUS_PROJECT_ID_FILENAME = 'NEBIUS_PROJECT_ID.txt'
+NEBIUS_CREDENTIALS_FILENAME = 'credentials.json'
 NEBIUS_TENANT_ID_PATH = '~/.nebius/' + NEBIUS_TENANT_ID_FILENAME
 NEBIUS_IAM_TOKEN_PATH = '~/.nebius/' + NEBIUS_IAM_TOKEN_FILENAME
 NEBIUS_PROJECT_ID_PATH = '~/.nebius/' + NEBIUS_PROJECT_ID_FILENAME
+NEBIUS_CREDENTIALS_PATH = '~/.nebius/' + NEBIUS_CREDENTIALS_FILENAME

 MAX_RETRIES_TO_DISK_CREATE = 120
 MAX_RETRIES_TO_INSTANCE_STOP = 120
@@ -72,6 +74,11 @@ def get_iam_token():
     return _iam_token


+def is_token_or_cred_file_exist():
+    return (os.path.exists(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH)) or
+            os.path.exists(os.path.expanduser(NEBIUS_CREDENTIALS_PATH)))
+
+
 def get_project_id():
     global _project_id
     if _project_id is None:
@@ -97,4 +104,7 @@ def get_tenant_id():


 def sdk():
-
+    if get_iam_token() is not None:
+        return nebius.sdk.SDK(credentials=get_iam_token())
+    return nebius.sdk.SDK(
+        credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))
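The new `sdk()` falls back from an explicit IAM token to the `credentials.json` file. A minimal, self-contained sketch of the same lookup order, with `pick_credentials()` as a hypothetical stand-in (it is not part of the adaptor), assuming only the paths shown in the diff:

import os

# Paths mirror NEBIUS_IAM_TOKEN_PATH and NEBIUS_CREDENTIALS_PATH above.
IAM_TOKEN_PATH = os.path.expanduser('~/.nebius/NEBIUS_IAM_TOKEN.txt')
CREDENTIALS_PATH = os.path.expanduser('~/.nebius/credentials.json')

def pick_credentials():
    """Prefer an explicit IAM token; fall back to the credentials file."""
    if os.path.exists(IAM_TOKEN_PATH):
        with open(IAM_TOKEN_PATH, encoding='utf-8') as f:
            return ('token', f.read().strip())
    if os.path.exists(CREDENTIALS_PATH):
        return ('file', CREDENTIALS_PATH)
    # Neither exists: mirrors is_token_or_cred_file_exist() returning False.
    return (None, None)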
sky/backends/backend_utils.py
CHANGED
@@ -1802,6 +1802,21 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             status == status_lib.ClusterStatus.UP for status in node_statuses) and
                        len(node_statuses) == handle.launched_nodes)

+    def get_node_counts_from_ray_status(
+            runner: command_runner.CommandRunner) -> Tuple[int, int, str, str]:
+        rc, output, stderr = runner.run(
+            instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
+            stream_logs=False,
+            require_outputs=True,
+            separate_stderr=True)
+        if rc:
+            raise RuntimeError(
+                f'Refreshing status ({cluster_name!r}): Failed to check '
+                f'ray cluster\'s healthiness with '
+                f'{instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND}.\n'
+                f'-- stdout --\n{output}\n-- stderr --\n{stderr}')
+        return (*_count_healthy_nodes_from_ray(output), output, stderr)
+
     def run_ray_status_to_check_ray_cluster_healthy() -> bool:
         try:
             # NOTE: fetching the IPs is very slow as it calls into
@@ -1822,26 +1837,34 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
                 raise exceptions.FetchClusterInfoError(
                     reason=exceptions.FetchClusterInfoError.Reason.HEAD)
             head_runner = runners[0]
-            rc, output, stderr = head_runner.run(
-                instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
-                stream_logs=False,
-                require_outputs=True,
-                separate_stderr=True)
-            if rc:
-                raise RuntimeError(
-                    f'Refreshing status ({cluster_name!r}): Failed to check '
-                    f'ray cluster\'s healthiness with '
-                    f'{instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND}.\n'
-                    f'-- stdout --\n{output}\n-- stderr --\n{stderr}')

-            ready_head, ready_workers = _count_healthy_nodes_from_ray(output)
             total_nodes = handle.launched_nodes * handle.num_ips_per_node
-
-
+
+            for i in range(5):
+                ready_head, ready_workers, output, stderr = (
+                    get_node_counts_from_ray_status(head_runner))
+                if ready_head + ready_workers == total_nodes:
+                    return True
+                logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
+                             f'{i}: ray status not showing all nodes '
+                             f'({ready_head + ready_workers}/{total_nodes});\n'
+                             f'output:\n{output}\nstderr:\n{stderr}')
+
+                # If cluster JUST started, maybe not all the nodes have shown
+                # up. Try again for a few seconds.
+                # Note: We are okay with this performance hit because it's very
+                # rare to normally hit this case. It requires:
+                # - All the instances in the cluster are up on the cloud side
+                #   (not preempted), but
+                # - The ray cluster is somehow degraded so not all instances are
+                #   showing up
+                time.sleep(1)
+
             raise RuntimeError(
                 f'Refreshing status ({cluster_name!r}): ray status not showing '
                 f'all nodes ({ready_head + ready_workers}/'
-                f'{total_nodes})
+                f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')
+
         except exceptions.FetchClusterInfoError:
             logger.debug(
                 f'Refreshing status ({cluster_name!r}) failed to get IPs.')
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -772,32 +772,6 @@ class FailoverCloudErrorHandlerV1:
         setattr(e, 'detailed_reason', detailed_reason)
         raise e

-    @staticmethod
-    def _lambda_handler(blocked_resources: Set['resources_lib.Resources'],
-                        launchable_resources: 'resources_lib.Resources',
-                        region: 'clouds.Region',
-                        zones: Optional[List['clouds.Zone']], stdout: str,
-                        stderr: str):
-        del region, zones  # Unused.
-        errors = FailoverCloudErrorHandlerV1._handle_errors(
-            stdout,
-            stderr,
-            is_error_str_known=lambda x: 'LambdaCloudError:' in x.strip())
-        messages = '\n  '.join(errors)
-        style = colorama.Style
-        logger.warning(f'  {style.DIM}{messages}{style.RESET_ALL}')
-        _add_to_blocked_resources(blocked_resources,
-                                  launchable_resources.copy(zone=None))
-
-        # Sometimes, LambdaCloudError will list available regions.
-        for e in errors:
-            if e.find('Regions with capacity available:') != -1:
-                for r in service_catalog.regions('lambda'):
-                    if e.find(r.name) == -1:
-                        _add_to_blocked_resources(
-                            blocked_resources,
-                            launchable_resources.copy(region=r.name, zone=None))
-
     @staticmethod
     def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
                      launchable_resources: 'resources_lib.Resources',
@@ -846,32 +820,6 @@ class FailoverCloudErrorHandlerV1:
         _add_to_blocked_resources(blocked_resources,
                                   launchable_resources.copy(zone=zone.name))

-    # Apr, 2023 by Hysun(hysun.he@oracle.com): Added support for OCI
-    @staticmethod
-    def _oci_handler(blocked_resources: Set['resources_lib.Resources'],
-                     launchable_resources: 'resources_lib.Resources',
-                     region: 'clouds.Region',
-                     zones: Optional[List['clouds.Zone']], stdout: str,
-                     stderr: str):
-        known_service_errors = [
-            'NotAuthorizedOrNotFound', 'CannotParseRequest', 'InternalError',
-            'LimitExceeded', 'NotAuthenticated'
-        ]
-        errors = FailoverCloudErrorHandlerV1._handle_errors(
-            stdout, stderr, lambda x: 'VcnSubnetNotFound' in x.strip() or
-            ('oci.exceptions.ServiceError' in x.strip() and any(
-                known_err in x.strip() for known_err in known_service_errors)))
-        logger.warning(f'Got error(s) in {region.name}:')
-        messages = '\n\t'.join(errors)
-        style = colorama.Style
-        logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
-
-        if zones is not None:
-            for zone in zones:
-                _add_to_blocked_resources(
-                    blocked_resources,
-                    launchable_resources.copy(zone=zone.name))
-
     @staticmethod
     def update_blocklist_on_error(
             blocked_resources: Set['resources_lib.Resources'],
@@ -1123,6 +1071,23 @@ class FailoverCloudErrorHandlerV2:
                     blocked_resources,
                     launchable_resources.copy(zone=zone.name))

+    @staticmethod
+    def _lambda_handler(blocked_resources: Set['resources_lib.Resources'],
+                        launchable_resources: 'resources_lib.Resources',
+                        region: 'clouds.Region',
+                        zones: Optional[List['clouds.Zone']], error: Exception):
+        output = str(error)
+        # Sometimes, lambda cloud error will list available regions.
+        if output.find('Regions with capacity available:') != -1:
+            for r in service_catalog.regions('lambda'):
+                if output.find(r.name) == -1:
+                    _add_to_blocked_resources(
+                        blocked_resources,
+                        launchable_resources.copy(region=r.name, zone=None))
+        else:
+            FailoverCloudErrorHandlerV2._default_handler(
+                blocked_resources, launchable_resources, region, zones, error)
+
     @staticmethod
     def _default_handler(blocked_resources: Set['resources_lib.Resources'],
                          launchable_resources: 'resources_lib.Resources',
sky/clouds/nebius.py
CHANGED
@@ -17,6 +17,7 @@ _CREDENTIAL_FILES = [
     nebius.NEBIUS_TENANT_ID_FILENAME,
     nebius.NEBIUS_IAM_TOKEN_FILENAME,
     nebius.NEBIUS_PROJECT_ID_FILENAME,
+    nebius.NEBIUS_CREDENTIALS_FILENAME
 ]

@@ -252,15 +253,16 @@ class Nebius(clouds.Cloud):
     def check_credentials(cls) -> Tuple[bool, Optional[str]]:
         """ Verify that the user has valid credentials for Nebius. """
         logging.debug('Nebius cloud check credentials')
-
-
-
+        token_cred_msg = (' Credentials can be set up by running: \n'\
+            f'  $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n'\
+            ' or generate  ~/.nebius/credentials.json')  # pylint: disable=line-too-long
+
         tenant_msg = ('   Copy your tenat ID from the web console and save it to file \n'  # pylint: disable=line-too-long
                       f'  $ echo $NEBIUS_TENANT_ID_PATH > {nebius.NEBIUS_TENANT_ID_PATH} \n'  # pylint: disable=line-too-long
                       '   Or if you have 1 tenant you can run:\n'  # pylint: disable=line-too-long
                       f'  $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n')  # pylint: disable=line-too-long
-        if
-        return False, f'{
+        if not nebius.is_token_or_cred_file_exist():
+            return False, f'{token_cred_msg}'
         sdk = nebius.sdk()
         tenant_id = nebius.get_tenant_id()
         if tenant_id is None:
@@ -272,7 +274,7 @@ class Nebius(clouds.Cloud):
         except nebius.request_error() as e:
             return False, (
                 f'{e.status} \n'  # First line is indented by 4 spaces
-                f'{
+                f'{token_cred_msg}'
                 f'{tenant_msg}')
         return True, None

sky/exceptions.py
CHANGED
@@ -28,12 +28,19 @@ GIT_FATAL_EXIT_CODE = 128
 ARCH_NOT_SUPPORTED_EXIT_CODE = 133


-def is_safe_exception(exc:
+def is_safe_exception(exc: BaseException) -> bool:
     """Returns True if the exception is safe to send to clients.

     Safe exceptions are:
     1. Built-in exceptions
     2. SkyPilot's own exceptions
+
+    Args:
+        exc: The exception to check, accept BaseException to handle SystemExit
+          and KeyboardInterrupt.
+
+    Returns:
+        True if the exception is safe to send to clients, False otherwise.
     """
     module = type(exc).__module__

@@ -48,7 +55,7 @@ def is_safe_exception(exc: Exception) -> bool:
     return False


-def wrap_exception(exc:
+def wrap_exception(exc: BaseException) -> BaseException:
     """Wraps non-safe exceptions into SkyPilot exceptions

     This is used to wrap exceptions that are not safe to deserialize at clients.
@@ -64,7 +71,8 @@ def wrap_exception(exc: Exception) -> Exception:
                            error_type=type(exc).__name__)


-
+# Accept BaseException to handle SystemExit and KeyboardInterrupt
+def serialize_exception(e: BaseException) -> Dict[str, Any]:
     """Serialize the exception.

     This function also wraps any unsafe exceptions (e.g., cloud exceptions)
sky/provision/kubernetes/utils.py
CHANGED
@@ -853,7 +853,7 @@ def get_accelerator_label_key_value(
     for label, value in label_list:
         if (label_formatter.match_label_key(label) and
                 label_formatter.get_accelerator_from_label_value(
-                    value) == acc_type):
+                    value).lower() == acc_type.lower()):
             if is_tpu_on_gke(acc_type):
                 assert isinstance(label_formatter,
                                   GKELabelFormatter)
sky/server/requests/event_loop.py
ADDED
@@ -0,0 +1,31 @@
+"""Executor event loop to process tasks in coroutines."""
+import asyncio
+import concurrent.futures
+import threading
+from typing import Coroutine, Optional
+
+# Dedicated event loop for requests, isolated with the event loop managed
+# by uvicorn. This is responsible for light-weight async tasks or sub-tasks,
+# refer to `executor.py` for more details about cooperation between the event
+# loop and executor process pool.
+_EVENT_LOOP: Optional[asyncio.AbstractEventLoop] = None
+_LOCK = threading.Lock()
+
+
+def run(coro: Coroutine) -> concurrent.futures.Future:
+    """Run a coroutine asynchronously in the request event loop."""
+    return asyncio.run_coroutine_threadsafe(coro, get_event_loop())
+
+
+def get_event_loop() -> asyncio.AbstractEventLoop:
+    """Open and get the event loop."""
+    global _EVENT_LOOP
+    if _EVENT_LOOP is not None and not _EVENT_LOOP.is_closed():
+        return _EVENT_LOOP
+    with _LOCK:
+        if _EVENT_LOOP is None or _EVENT_LOOP.is_closed():
+            _EVENT_LOOP = asyncio.new_event_loop()
+            loop_thread = threading.Thread(target=_EVENT_LOOP.run_forever,
+                                           daemon=True)
+            loop_thread.start()
+    return _EVENT_LOOP
sky/server/requests/executor.py
CHANGED
@@ -27,8 +27,8 @@ import os
 import queue as queue_lib
 import signal
 import sys
+import threading
 import time
-import traceback
 import typing
 from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple

@@ -41,11 +41,13 @@ from sky import skypilot_config
 from sky.server import common as server_common
 from sky.server import constants as server_constants
 from sky.server.requests import payloads
+from sky.server.requests import preconditions
 from sky.server.requests import requests as api_requests
 from sky.server.requests.queues import mp_queue
 from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils

@@ -262,13 +264,7 @@ def _request_execution_wrapper(request_id: str,
         _restore_output(original_stdout, original_stderr)
         return
     except (Exception, SystemExit) as e:  # pylint: disable=broad-except
-
-        stacktrace = traceback.format_exc()
-        setattr(e, 'stacktrace', stacktrace)
-        with api_requests.update_request(request_id) as request_task:
-            assert request_task is not None, request_id
-            request_task.status = api_requests.RequestStatus.FAILED
-            request_task.set_error(e)
+        api_requests.set_request_failed(request_id, e)
         _restore_output(original_stdout, original_stderr)
         logger.info(f'Request {request_id} failed due to '
                     f'{common_utils.format_exception(e)}')
@@ -283,16 +279,37 @@ def _request_execution_wrapper(request_id: str,
     logger.info(f'Request {request_id} finished')


-def schedule_request(
-
-
-
-
-
-
-
-
-
+def schedule_request(
+        request_id: str,
+        request_name: str,
+        request_body: payloads.RequestBody,
+        func: Callable[P, Any],
+        request_cluster_name: Optional[str] = None,
+        ignore_return_value: bool = False,
+        schedule_type: api_requests.ScheduleType = (
+            api_requests.ScheduleType.LONG),
+        is_skypilot_system: bool = False,
+        precondition: Optional[preconditions.Precondition] = None) -> None:
+    """Enqueue a request to the request queue.
+
+    Args:
+        request_id: ID of the request.
+        request_name: Name of the request type, e.g. "sky.launch".
+        request_body: The request body containing parameters and environment
+            variables.
+        func: The function to execute when the request is processed.
+        request_cluster_name: The name of the cluster associated with this
+            request, if any.
+        ignore_return_value: If True, the return value of the function will be
+            ignored.
+        schedule_type: The type of scheduling to use for this request, refer to
+            `api_requests.ScheduleType` for more details.
+        is_skypilot_system: Denote whether the request is from SkyPilot system.
+        precondition: If a precondition is provided, the request will only be
+            scheduled for execution when the precondition is met (returns True).
+            The precondition is waited asynchronously and does not block the
+            caller.
+    """
     user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
     if is_skypilot_system:
         user_id = server_constants.SKYPILOT_SYSTEM_USER_ID
@@ -314,10 +331,17 @@ def schedule_request(request_id: str,
         return

     request.log_path.touch()
-    input_tuple = (request_id, ignore_return_value)

-
-
+    def enqueue():
+        input_tuple = (request_id, ignore_return_value)
+        logger.info(f'Queuing request: {request_id}')
+        _get_queue(schedule_type).put(input_tuple)
+
+    if precondition is not None:
+        # Wait async to avoid blocking caller.
+        precondition.wait_async(on_condition_met=enqueue)
+    else:
+        enqueue()


 def executor_initializer(proc_group: str):
@@ -431,13 +455,17 @@ def start(deploy: bool) -> List[multiprocessing.Process]:

     logger.info('Request queues created')

+    long_workers = []
     for worker_id in range(max_parallel_for_long):
         worker = RequestWorker(id=worker_id,
                                schedule_type=api_requests.ScheduleType.LONG)
         worker_proc = multiprocessing.Process(target=request_worker,
                                               args=(worker, 1))
-
+        long_workers.append(worker_proc)
         sub_procs.append(worker_proc)
+    threading.Thread(target=subprocess_utils.slow_start_processes,
+                     args=(long_workers,),
+                     daemon=True).start()

     # Start a worker for short requests.
     worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
sky/server/requests/preconditions.py
ADDED
@@ -0,0 +1,174 @@
+"""Precondition for a request to be executed.
+
+Preconditions are introduced so that:
+- Wait for precondition does not block executor process, which is expensive;
+- Cross requests knowledge (e.g. waiting for other requests to be completed)
+  can be handled at precondition level, instead of invading the execution
+  logic of specific requests.
+"""
+import abc
+import asyncio
+import time
+from typing import Callable, Optional, Tuple
+
+from sky import exceptions
+from sky import global_user_state
+from sky import sky_logging
+from sky.server.requests import event_loop
+from sky.server.requests import requests as api_requests
+from sky.utils import common_utils
+from sky.utils import status_lib
+
+# The default interval seconds to check the precondition.
+_PRECONDITION_CHECK_INTERVAL = 1
+# The default timeout seconds to wait for the precondition to be met.
+_PRECONDITION_TIMEOUT = 60 * 60
+
+logger = sky_logging.init_logger(__name__)
+
+
+class Precondition(abc.ABC):
+    """Abstract base class for a precondition for a request to be executed.
+
+    A Precondition can be waited in either of the following ways:
+    - await Precondition: wait for the precondition to be met.
+    - Precondition.wait_async: wait for the precondition to be met in background
+      and execute the given callback on met.
+    """
+
+    def __init__(self,
+                 request_id: str,
+                 check_interval: float = _PRECONDITION_CHECK_INTERVAL,
+                 timeout: float = _PRECONDITION_TIMEOUT):
+        self.request_id = request_id
+        self.check_interval = check_interval
+        self.timeout = timeout
+
+    def __await__(self):
+        """Make Precondition awaitable."""
+        return self._wait().__await__()
+
+    def wait_async(
+            self,
+            on_condition_met: Optional[Callable[[], None]] = None) -> None:
+        """Wait precondition asynchronously and execute the callback on met."""
+
+        async def wait_with_callback():
+            met = await self
+            if met and on_condition_met is not None:
+                on_condition_met()
+
+        event_loop.run(wait_with_callback())
+
+    @abc.abstractmethod
+    async def check(self) -> Tuple[bool, Optional[str]]:
+        """Check if the precondition is met.
+
+        Note that compared to _request_execution_wrapper, the env vars and
+        skypilot config here are not overridden since the lack of process
+        isolation, which may cause issues if the check accidentally depends on
+        these. Make sure the check function is independent of the request
+        environment.
+        TODO(aylei): a new request context isolation mechanism is needed to
+        enable more tasks/sub-tasks to be processed in coroutines or threads.
+
+        Returns:
+            A tuple of (bool, Optional[str]).
+            The bool indicates if the precondition is met.
+            The str is the current status of the precondition if any.
+        """
+        raise NotImplementedError
+
+    async def _wait(self) -> bool:
+        """Wait for the precondition to be met.
+
+        Args:
+            on_condition_met: Callback to execute when the precondition is met.
+        """
+        start_time = time.time()
+        last_status_msg = ''
+        while True:
+            if self.timeout > 0 and time.time() - start_time > self.timeout:
+                # Cancel the request on timeout.
+                api_requests.set_request_failed(
+                    self.request_id,
+                    exceptions.RequestCancelled(
+                        f'Request {self.request_id} precondition wait timed '
+                        f'out after {self.timeout}s'))
+                return False
+
+            # Check if the request has been cancelled
+            request = api_requests.get_request(self.request_id)
+            if request is None:
+                logger.error(f'Request {self.request_id} not found')
+                return False
+            if request.status == api_requests.RequestStatus.CANCELLED:
+                logger.debug(f'Request {self.request_id} cancelled')
+                return False
+
+            try:
+                met, status_msg = await self.check()
+                if met:
+                    return True
+                if status_msg is not None and status_msg != last_status_msg:
+                    # Update the status message if it has changed.
+                    with api_requests.update_request(self.request_id) as req:
+                        assert req is not None, self.request_id
+                        req.status_msg = status_msg
+                    last_status_msg = status_msg
+            except (Exception, SystemExit, KeyboardInterrupt) as e:  # pylint: disable=broad-except
+                api_requests.set_request_failed(self.request_id, e)
+                logger.info(f'Request {self.request_id} failed due to '
+                            f'{common_utils.format_exception(e)}')
+                return False
+
+            await asyncio.sleep(self.check_interval)
+
+
+class ClusterStartCompletePrecondition(Precondition):
+    """Whether the start process of a cluster is complete.
+
+    This condition only waits the start process of a cluster to complete, e.g.
+    `sky launch` or `sky start`.
+    For cluster that has been started but not in UP status, bypass the waiting
+    in favor of:
+    - allowing the task to refresh cluster status from cloud vendor;
+    - unified error message in task handlers.
+
+    Args:
+        request_id: The request ID of the task.
+        cluster_name: The name of the cluster to wait for.
+    """
+
+    def __init__(self, request_id: str, cluster_name: str, **kwargs):
+        super().__init__(request_id=request_id, **kwargs)
+        self.cluster_name = cluster_name
+
+    async def check(self) -> Tuple[bool, Optional[str]]:
+        cluster_record = global_user_state.get_cluster_from_name(
+            self.cluster_name)
+        if (cluster_record and
+                cluster_record['status'] is status_lib.ClusterStatus.UP):
+            # Shortcut for started clusters, ignore cluster not found
+            # since the cluster record might not yet be created by the
+            # launch task.
+            return True, None
+        # Check if there is a task starting the cluster, we do not check
+        # SUCCEEDED requests since successfully launched cluster can be
+        # restarted later on.
+        # Note that since the requests are not persistent yet between restarts,
+        # a cluster might be started in halfway and requests are lost.
+        # We unify these situations into a single state: the process of starting
+        # the cluster is done (either normally or abnormally) but cluster is not
+        # in UP status.
+        requests = api_requests.get_request_tasks(
+            status=[
+                api_requests.RequestStatus.RUNNING,
+                api_requests.RequestStatus.PENDING
+            ],
+            include_request_names=['sky.launch', 'sky.start'],
+            cluster_names=[self.cluster_name])
+        if len(requests) == 0:
+            # No runnning or pending tasks, the start process is done.
+            return True, None
+        return False, f'Waiting for cluster {self.cluster_name} to be UP.'
sky/server/requests/requests.py
CHANGED
@@ -10,6 +10,7 @@ import shutil
 import signal
 import sqlite3
 import time
+import traceback
 from typing import Any, Callable, Dict, List, Optional, Tuple

 import colorama
@@ -27,6 +28,7 @@ from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import db_utils
 from sky.utils import env_options
+from sky.utils import ux_utils

 logger = sky_logging.init_logger(__name__)

@@ -34,6 +36,7 @@ logger = sky_logging.init_logger(__name__)
 REQUEST_TABLE = 'requests'
 COL_CLUSTER_NAME = 'cluster_name'
 COL_USER_ID = 'user_id'
+COL_STATUS_MSG = 'status_msg'
 REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'

 # TODO(zhwu): For scalability, there are several TODOs:
@@ -81,6 +84,7 @@ REQUEST_COLUMNS = [
     COL_CLUSTER_NAME,
     'schedule_type',
     COL_USER_ID,
+    COL_STATUS_MSG,
 ]

@@ -109,6 +113,7 @@ class RequestPayload:
     user_name: Optional[str] = None
     # Resources the request operates on.
     cluster_name: Optional[str] = None
+    status_msg: Optional[str] = None


 @dataclasses.dataclass
@@ -129,6 +134,8 @@ class Request:
     schedule_type: ScheduleType = ScheduleType.LONG
     # Resources the request operates on.
    cluster_name: Optional[str] = None
+    # Status message of the request, indicates the reason of current status.
+    status_msg: Optional[str] = None

     @property
     def log_path(self) -> pathlib.Path:
@@ -138,7 +145,7 @@ class Request:
         log_path = (log_path_prefix / self.request_id).with_suffix('.log')
         return log_path

-    def set_error(self, error:
+    def set_error(self, error: BaseException) -> None:
         """Set the error."""
         # TODO(zhwu): pickle.dump does not work well with custom exceptions if
         # it has more than 1 arguments.
@@ -212,6 +219,7 @@ class Request:
             user_id=self.user_id,
             user_name=user_name,
             cluster_name=self.cluster_name,
+            status_msg=self.status_msg,
         )

     def encode(self) -> RequestPayload:
@@ -232,6 +240,7 @@ class Request:
                 schedule_type=self.schedule_type.value,
                 user_id=self.user_id,
                 cluster_name=self.cluster_name,
+                status_msg=self.status_msg,
             )
         except (TypeError, ValueError) as e:
             # The error is unexpected, so we don't suppress the stack trace.
@@ -262,6 +271,7 @@ class Request:
                 schedule_type=ScheduleType(payload.schedule_type),
                 user_id=payload.user_id,
                 cluster_name=payload.cluster_name,
+                status_msg=payload.status_msg,
             )
         except (TypeError, ValueError) as e:
             logger.error(
@@ -415,7 +425,8 @@ def create_table(cursor, conn):
         pid INTEGER,
         {COL_CLUSTER_NAME} TEXT,
         schedule_type TEXT,
-        {COL_USER_ID} TEXT
+        {COL_USER_ID} TEXT,
+        {COL_STATUS_MSG} TEXT)""")


 _DB = None
@@ -507,8 +518,9 @@ def create_if_not_exists(request: Request) -> bool:
 def get_request_tasks(
     status: Optional[List[RequestStatus]] = None,
     cluster_names: Optional[List[str]] = None,
-    exclude_request_names: Optional[List[str]] = None,
     user_id: Optional[str] = None,
+    exclude_request_names: Optional[List[str]] = None,
+    include_request_names: Optional[List[str]] = None,
 ) -> List[Request]:
     """Get a list of requests that match the given filters.

@@ -516,9 +528,21 @@ def get_request_tasks(
         status: a list of statuses of the requests to filter on.
         cluster_names: a list of cluster names to filter requests on.
         exclude_request_names: a list of request names to exclude from results.
+            Mutually exclusive with include_request_names.
         user_id: the user ID to filter requests on.
             If None, all users are included.
+        include_request_names: a list of request names to filter on.
+            Mutually exclusive with exclude_request_names.
+
+    Raises:
+        ValueError: If both exclude_request_names and include_request_names are
+            provided.
     """
+    if exclude_request_names is not None and include_request_names is not None:
+        raise ValueError(
+            'Only one of exclude_request_names or include_request_names can be '
+            'provided, not both.')
+
     filters = []
     filter_params = []
     if status is not None:
@@ -534,6 +558,10 @@ def get_request_tasks(
     if user_id is not None:
         filters.append(f'{COL_USER_ID} = ?')
         filter_params.append(user_id)
+    if include_request_names is not None:
+        request_names_str = ','.join(
+            repr(name) for name in include_request_names)
+        filters.append(f'name IN ({request_names_str})')
     assert _DB is not None
     with _DB.conn:
         cursor = _DB.conn.cursor()
@@ -565,3 +593,14 @@ def _add_or_update_request_no_lock(request: Request):
     cursor.execute(
         f'INSERT OR REPLACE INTO {REQUEST_TABLE} ({key_str}) '
         f'VALUES ({fill_str})', row)
+
+
+def set_request_failed(request_id: str, e: BaseException) -> None:
+    """Set a request to failed and populate the error message."""
+    with ux_utils.enable_traceback():
+        stacktrace = traceback.format_exc()
+    setattr(e, 'stacktrace', stacktrace)
+    with update_request(request_id) as request_task:
+        assert request_task is not None, request_id
+        request_task.status = RequestStatus.FAILED
+        request_task.set_error(e)
sky/server/server.py
CHANGED
@@ -6,6 +6,7 @@ import contextlib
 import dataclasses
 import datetime
 import logging
+import multiprocessing
 import os
 import pathlib
 import re
@@ -38,6 +39,7 @@ from sky.server import constants as server_constants
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import preconditions
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
 from sky.usage import usage_lib
@@ -47,6 +49,7 @@ from sky.utils import common_utils
 from sky.utils import dag_utils
 from sky.utils import env_options
 from sky.utils import status_lib
+from sky.utils import subprocess_utils

 # pylint: disable=ungrouped-imports
 if sys.version_info >= (3, 10):
@@ -496,13 +499,18 @@ async def launch(launch_body: payloads.LaunchBody,
 # pylint: disable=redefined-builtin
 async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
     """Executes a task on an existing cluster."""
+    cluster_name = exec_body.cluster_name
     executor.schedule_request(
         request_id=request.state.request_id,
         request_name='exec',
         request_body=exec_body,
         func=execution.exec,
+        precondition=preconditions.ClusterStartCompletePrecondition(
+            request_id=request.state.request_id,
+            cluster_name=cluster_name,
+        ),
         schedule_type=requests_lib.ScheduleType.LONG,
-        request_cluster_name=
+        request_cluster_name=cluster_name,
     )

@@ -1088,6 +1096,9 @@ async def complete_storage_name(incomplete: str,) -> List[str]:

 if __name__ == '__main__':
     import uvicorn
+
+    from sky.server import uvicorn as skyuvicorn
+
     requests_lib.reset_db_and_logs()

     parser = argparse.ArgumentParser()
@@ -1109,16 +1120,26 @@ if __name__ == '__main__':
         logger.info(f'Starting SkyPilot API server, workers={num_workers}')
         # We don't support reload for now, since it may cause leakage of request
         # workers or interrupt running requests.
-        uvicorn.
-
-
-
+        config = uvicorn.Config('sky.server.server:app',
+                                host=cmd_args.host,
+                                port=cmd_args.port,
+                                workers=num_workers)
+        skyuvicorn.run(config)
     except Exception as exc:  # pylint: disable=broad-except
         logger.error(f'Failed to start SkyPilot API server: '
                      f'{common_utils.format_exception(exc, use_bracket=True)}')
         raise
     finally:
         logger.info('Shutting down SkyPilot API server...')
-
-
-
+
+        def cleanup(proc: multiprocessing.Process) -> None:
+            try:
+                proc.terminate()
+                proc.join()
+            finally:
+                # The process may not be started yet, close it anyway.
+                proc.close()
+
+        subprocess_utils.run_in_parallel(cleanup,
+                                         sub_procs,
+                                         num_threads=len(sub_procs))
sky/server/stream_utils.py
CHANGED
@@ -55,19 +55,22 @@ async def log_streamer(request_id: Optional[str],
         if show_request_waiting_spinner:
             yield status_msg.init()
             yield status_msg.start()
-
+        last_waiting_msg = ''
         waiting_msg = (f'Waiting for {request_task.name!r} request to be '
                        f'scheduled: {request_id}')
         while request_task.status < requests_lib.RequestStatus.RUNNING:
+            if request_task.status_msg is not None:
+                waiting_msg = request_task.status_msg
             if show_request_waiting_spinner:
                 yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
-            elif plain_logs and
-
+            elif plain_logs and waiting_msg != last_waiting_msg:
+                # Only log when waiting message changes.
+                last_waiting_msg = waiting_msg
                 # Use smaller padding (1024 bytes) to force browser rendering
                 yield f'{waiting_msg}' + ' ' * 4096 + '\n'
-            # Sleep
-            #
-            # not want to yield too long.
+            # Sleep shortly to avoid storming the DB and CPU and allow other
+            # coroutines to run. This busy waiting loop is performance critical
+            # for short-running requests, so we do not want to yield too long.
             await asyncio.sleep(0.1)
             request_task = requests_lib.get_request(request_id)
             if not follow:
sky/server/uvicorn.py
ADDED
@@ -0,0 +1,81 @@
+"""Uvicorn wrapper for SkyPilot API server.
+
+This module is a wrapper around uvicorn to customize the behavior of the
+server.
+"""
+import os
+import threading
+from typing import Optional
+
+import uvicorn
+from uvicorn.supervisors import multiprocess
+
+from sky.utils import subprocess_utils
+
+
+def run(config: uvicorn.Config):
+    """Run unvicorn server."""
+    if config.reload:
+        # Reload and multi-workers are mutually exclusive
+        # in uvicorn. Since we do not use reload now, simply
+        # guard by an exception.
+        raise ValueError('Reload is not supported yet.')
+    server = uvicorn.Server(config=config)
+    try:
+        if config.workers is not None and config.workers > 1:
+            sock = config.bind_socket()
+            SlowStartMultiprocess(config, target=server.run,
+                                  sockets=[sock]).run()
+        else:
+            server.run()
+    finally:
+        # Copied from unvicorn.run()
+        if config.uds and os.path.exists(config.uds):
+            os.remove(config.uds)
+
+
+class SlowStartMultiprocess(multiprocess.Multiprocess):
+    """Uvicorn Multiprocess wrapper with slow start.
+
+    Slow start offers faster and more stable start time.
+    Profile shows the start time is more stable and accelerated from
+    ~7s to ~3.3s on a 12-core machine after switching LONG workers and
+    Uvicorn workers to slow start.
+    Refer to subprocess_utils.slow_start_processes() for more details.
+    """
+
+    def __init__(self, config: uvicorn.Config, **kwargs):
+        """Initialize the multiprocess wrapper.
+
+        Args:
+            config: The uvicorn config.
+        """
+        super().__init__(config, **kwargs)
+        self._init_thread: Optional[threading.Thread] = None
+
+    def init_processes(self) -> None:
+        # Slow start worker processes asynchronously to avoid blocking signal
+        # handling of uvicorn.
+        self._init_thread = threading.Thread(target=self.slow_start_processes,
+                                             daemon=True)
+        self._init_thread.start()
+
+    def slow_start_processes(self) -> None:
+        """Initialize processes with slow start."""
+        to_start = []
+        # Init N worker processes
+        for _ in range(self.processes_num):
+            to_start.append(
+                multiprocess.Process(self.config, self.target, self.sockets))
+        # Start the processes with slow start, we only append start to
+        # self.processes because Uvicorn periodically restarts unstarted
+        # workers.
+        subprocess_utils.slow_start_processes(to_start,
+                                              on_start=self.processes.append,
+                                              should_exit=self.should_exit)
+
+    def terminate_all(self) -> None:
+        """Wait init thread to finish before terminating all processes."""
+        if self._init_thread is not None:
+            self._init_thread.join()
+        super().terminate_all()
sky/utils/accelerator_registry.py
CHANGED
@@ -77,7 +77,7 @@ def canonicalize_accelerator_name(accelerator: str,
     # Look for Kubernetes accelerators online if the accelerator is not found
     # in the public cloud catalog. This is to make sure custom accelerators
     # on Kubernetes can be correctly canonicalized.
-    if not names and cloud_str in ['
+    if not names and cloud_str in ['Kubernetes', None]:
         with rich_utils.safe_status(
                 ux_utils.spinner_message('Listing accelerators on Kubernetes')):
             searched = service_catalog.list_accelerators(
sky/utils/subprocess_utils.py
CHANGED
@@ -5,8 +5,9 @@ import random
 import resource
 import shlex
 import subprocess
+import threading
 import time
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union

 import colorama
 import psutil
@@ -15,6 +16,7 @@ from sky import exceptions
 from sky import sky_logging
 from sky.skylet import constants
 from sky.skylet import log_lib
+from sky.utils import common_utils
 from sky.utils import timeline
 from sky.utils import ux_utils

@@ -353,3 +355,56 @@ def launch_new_process_tree(cmd: str, log_output: str = '/dev/null') -> int:
                           text=True)
     # Get the PID of the detached process
     return int(proc.stdout.strip())
+
+
+# A protocol for objects that can be started, designed to be used with
+# slow_start_processes() so that we can handle different wrappers of
+# multiprocessing.Process in a uniform way.
+class Startable(Protocol):
+
+    def start(self) -> None:
+        ...
+
+
+OnStartFn = Callable[[Startable], None]
+
+
+def slow_start_processes(processes: List[Startable],
+                         delay: float = 2.0,
+                         on_start: Optional[OnStartFn] = None,
+                         should_exit: Optional[threading.Event] = None) -> None:
+    """Start processes with slow start.
+
+    Profile shows that it takes 1~2 seconds to start a worker process when
+    CPU is relatively idle. However, starting all workers simultaneously will
+    overwhelm the CPU and cause the time for the first worker to be ready to
+    be delayed. Slow start start a group of workers slowly to accelerate the
+    start time (i.e. the time for the first worker to be ready), while
+    gradually increasing the batch size in exponential manner to make the
+    time of achieving full parallelism as short as possible.
+
+    Args:
+        processes: The list of processes to start.
+        delay: The delay between starting each process, default to 2.0 seconds,
+            based on profile.
+        on_start: An optional function to callback when a process starts.
+        should_exit: An optional event to check if the function should exit
+            before starting all the processes.
+    """
+    max_batch_size = max(1, int(common_utils.get_cpu_count() / 2))
+    batch_size = 1
+    left = len(processes)
+    while left > 0:
+        if should_exit and should_exit.is_set():
+            break
+        current_batch = min(batch_size, left)
+        for i in range(current_batch):
+            worker_idx = len(processes) - left + i
+            processes[worker_idx].start()
+            if on_start:
+                on_start(processes[worker_idx])
+        left -= current_batch
+        if left <= 0:
+            break
+        batch_size = min(batch_size * 2, max_batch_size)
+        time.sleep(delay)
{skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250312.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
-sky/__init__.py,sha256=
+sky/__init__.py,sha256=15ZYL6HUs43go7VjSHq7_BlZEptubkQ6aeBIx534zkU,6428
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=hCEqi77nprQEg3ktfRL51xiiw16zwZOmFEDB_Z7fWVU,22384
 sky/check.py,sha256=NDKx_Zm7YRxPjMv82wz3ESLnGIPljaACyqVdVNM0PzY,11258
@@ -6,7 +6,7 @@ sky/cli.py,sha256=qBRqtKVV_GurbCFZBHkF2UIahy3A7bsOsmfCNm6mZ54,221503
 sky/cloud_stores.py,sha256=kEHXd2divyra-1c3EusHxKyM5yTQlTXc6cKVXofsefA,23978
 sky/core.py,sha256=MU9hcTdh8baMGrr2ZXmbxx12vNlhajrkeyg5QtV717c,47609
 sky/dag.py,sha256=Yl7Ry26Vql5cv4YMz8g9kOUgtoCihJnw7c8NgZYakMY,3242
-sky/exceptions.py,sha256=
+sky/exceptions.py,sha256=cEZ5nm7RhTW22Npw-oYS5Wp9rtxoHxdPQHfkNa92wOo,16641
 sky/execution.py,sha256=0M4RTEzWn-B9oz221XdZOIGH12XOACmNq0j-WGUT_No,28023
 sky/global_user_state.py,sha256=sUDdSsJeiJkbgmZNwy8YGFK0XeNh-RBr1VDUvbmjf0g,33246
 sky/models.py,sha256=4xSW05BdDPEjW8Ubvj3VlVOVnzv0TbrolsFvR5R5v1U,638
@@ -26,15 +26,15 @@ sky/adaptors/docker.py,sha256=_kzpZ0fkWHqqQAVVl0llTsCE31KYz3Sjn8psTBQHVkA,468
 sky/adaptors/gcp.py,sha256=OQ9RaqjR0r0iaWYpjvEtIx5vnEhyB4LhUCwbtdxsmVk,3115
 sky/adaptors/ibm.py,sha256=H87vD6izq_wQI8oQC7cx9iVtRgPi_QkAcrfa1Z3PNqU,4906
 sky/adaptors/kubernetes.py,sha256=UIUc3zI0MgWcv1GTBu-pZUSx_NTLf0zRI20JUdtA1HI,6594
-sky/adaptors/nebius.py,sha256=
+sky/adaptors/nebius.py,sha256=QAqU_reFk7MKQ39TE1FiNgNnDPH5L5-HT19j6CtJcJE,3175
 sky/adaptors/oci.py,sha256=LfMSFUmkkNT6Yoz9FZHNl6UFSg4X1lJO4-x4ZbDdXTs,2831
 sky/adaptors/runpod.py,sha256=4Nt_BfZhJAKQNA3wO8cxvvNI8x4NsDGHu_4EhRDlGYQ,225
 sky/adaptors/vast.py,sha256=tpvmHi7IkQNzbbHVkeo04kUSajoEpSzXr2XgeO_I1LU,695
 sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
 sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
 sky/backends/backend.py,sha256=4BOqKZ-bwBTpjNnZF4JAHX2m2Iga7EmEn8Ao3tEivaM,7527
-sky/backends/backend_utils.py,sha256=
-sky/backends/cloud_vm_ray_backend.py,sha256=
+sky/backends/backend_utils.py,sha256=lOkufcDQiBFHKf5TYppaQ1SKCRmUxAM-71q3EmXM_QY,134525
+sky/backends/cloud_vm_ray_backend.py,sha256=aNRjxeVe_1GmYYbU3KUCCr2_-PW9KWUeCO-atAg9RKU,246171
 sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
 sky/backends/local_docker_backend.py,sha256=nSYCjms3HOPjPNOrcCqsUKm1WV3AAovRFjEQ7hcEXW4,17021
 sky/backends/wheel_utils.py,sha256=meypuMaygSXXjGdXfq6dhWl-OrpAybg9KVRoup4D0wU,9098
@@ -57,7 +57,7 @@ sky/clouds/gcp.py,sha256=FUCUq94yGUZ_yyKxA3prRKTqetObbIMkfjAPTPbhXyA,55824
 sky/clouds/ibm.py,sha256=R4JR96YfXstZ2B_IgFNVEX2SBAq3q0lSWz4y7FoFoeE,21474
 sky/clouds/kubernetes.py,sha256=xsYX8HhdcRzsdx6Gd_3kumNqjMjpo_l4cinhs3ZMwZM,35067
 sky/clouds/lambda_cloud.py,sha256=ejqA_Wj5-325Y_QjQ__FY4HMO8sv_2tSRsufmaldcmI,12699
-sky/clouds/nebius.py,sha256=
+sky/clouds/nebius.py,sha256=G3v73NZjLzGoCi0ZfHj6VkOt-fs1i6DDxCpNiE88BdA,12676
 sky/clouds/oci.py,sha256=irINbQsQ6YxRxGTMaCNsms3mZkIun2oJMMA1fMCRJyA,27072
 sky/clouds/paperspace.py,sha256=O7bH8YaHBLFuyj6rDz2bPDz_6OYWmNB9OLqnZH70yfY,10922
 sky/clouds/runpod.py,sha256=hzYB4td6qaged83xMAVKZ96bH40oZnrHXL7a_CKxXIw,11926
@@ -165,7 +165,7 @@ sky/provision/kubernetes/constants.py,sha256=dZCUV8FOO9Gct80sdqeubKnxeW3CGl-u5mx
 sky/provision/kubernetes/instance.py,sha256=oag17OtuiqU-1RjkgW9NvEpxSGUFIYdI7M61S-YmPu8,50503
 sky/provision/kubernetes/network.py,sha256=AtcOM8wPs_-UlQJhGEQGP6Lh4HIgdx63Y0iWEhP5jyc,12673
 sky/provision/kubernetes/network_utils.py,sha256=Bwy5ZQb62ejC7ZHM4htjzhs86UNACK7AXN-NfQ9IJrE,11454
-sky/provision/kubernetes/utils.py,sha256=
+sky/provision/kubernetes/utils.py,sha256=aGIYhGFnvInMqd8INwR7RirKrstSsMQxM0RvZUFia4Q,109762
 sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
 sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
 sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -230,13 +230,16 @@ sky/serve/server/server.py,sha256=gQGVU9nHYdGbaLhGjIUNIYn4xwKjRASRJkiiTL5AI1Y,32
 sky/server/__init__.py,sha256=MPPBqFzXz6Jv5QSk6td_IcvnfXfNErDZVcizu4MLRow,27
 sky/server/common.py,sha256=pEa-q3P5aOm6RMlit0pVzlDoJnZU_6zViO7aK_7htn0,17843
 sky/server/constants.py,sha256=_ZNrxYh8vmgbf3DmkGDduxjvO2y43ZSPTkH5rCNsVjU,770
-sky/server/server.py,sha256=
-sky/server/stream_utils.py,sha256
+sky/server/server.py,sha256=kEjwRjA7PJDZzx6KqD_NAFxryVLkzwCnuPfbmY_p30A,44232
+sky/server/stream_utils.py,sha256=4JMHgtoXPpCT8JwtqyUcDQ9IdZFir9om0JaCRr8rvbQ,5849
+sky/server/uvicorn.py,sha256=wajwPHJ3IEEP3GMNOCc0S81-1v2qT5F-ejUkLFVhUzk,2953
 sky/server/html/log.html,sha256=TSGZktua9Ysl_ysg3w60rjxAxhH61AJnsYDHdtqrjmI,6929
 sky/server/requests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sky/server/requests/
+sky/server/requests/event_loop.py,sha256=OhpPbuce65bbjpGRlcJa78AVnYSm08SzFKt70ypCUuQ,1211
+sky/server/requests/executor.py,sha256=SuSr-cVrRnMzf-1SEz6O8HpcLzGM3mrbNc8re7QduYk,20862
 sky/server/requests/payloads.py,sha256=nVb7vr1SNAq6ay2dNe9301zLHp7NrM79M7nsWAECBms,16340
-sky/server/requests/
+sky/server/requests/preconditions.py,sha256=ipxIb_3JXG6S3-ymcOdqQNb7VDvoPqADxu9ZK7-nQWc,7179
+sky/server/requests/requests.py,sha256=HrBDrJyWPaKk52ykHp34A6UjipXPH-f2Eh2sNvhWt4g,21228
 sky/server/requests/queues/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/server/requests/queues/mp_queue.py,sha256=_7AFas__0b1L8e7Bwy4lu0VYU18R85YwMlDHPhQCfh0,2998
 sky/server/requests/serializers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -303,7 +306,7 @@ sky/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/usage/constants.py,sha256=mFrTgrFIfFf4kpcl-M1VDU7_moD5_mJazUJTUDrybms,1102
 sky/usage/usage_lib.py,sha256=rInJW2kj2O1wwXUZAbeVVLhnoa7T_xBHqDhbBBrUqfI,21400
 sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sky/utils/accelerator_registry.py,sha256=
+sky/utils/accelerator_registry.py,sha256=rZniDbqqPAF-vjkrwxGwEErFSAp6puOimkRj3ppOSRY,3905
 sky/utils/admin_policy_utils.py,sha256=y_do0VH6qh163EqSuRW1uGeKvTnJhiYNrHUs77uoOcA,6013
 sky/utils/annotations.py,sha256=-rfacB30Sl0xkFriejGvxma3oKctGfXXLZkQPHG33eo,1626
 sky/utils/cluster_utils.py,sha256=s6DFRXktv6_gF_DnwDEXJ7CniifHp8CAPeGciRCbXgI,14432
@@ -325,7 +328,7 @@ sky/utils/resources_utils.py,sha256=URp6OS9B9nc9tIB5ibZCgGK4XSABmI4kRG0wOM6qgvs,
 sky/utils/rich_utils.py,sha256=3xdDzmn-TQXAE83EevAtOf9N4aak3Bl4ZeD33xIxjOo,11931
 sky/utils/schemas.py,sha256=KJCHrn1nMZ3XqzddWuu_nFQoRQw01cZh9qh19OrRtps,30145
 sky/utils/status_lib.py,sha256=zn_MSuRYQdNKF8pnFOGQ54X_s_R7dyqWS6Q3a9zENw8,1512
-sky/utils/subprocess_utils.py,sha256=
+sky/utils/subprocess_utils.py,sha256=Q42CyjDNICXze2WCGuGxgpEjtjlka43_2ihRqKhSnQw,14916
 sky/utils/timeline.py,sha256=ob6s3bc7nwAuSI76yLKBrSR5bzOHnOhbozz1avwoet4,4070
 sky/utils/ux_utils.py,sha256=ngcOCg1K44p-SOk6XfwxJGXwjoP__PRvNuEzj7t05Yc,10185
 sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
@@ -344,9 +347,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
 sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=otzHzpliHDCpzYT-nU9Q0ZExbiFpDPWvhxwkvchZj7k,10073
 sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
+skypilot_nightly-1.0.0.dev20250312.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20250312.dist-info/METADATA,sha256=q1Bn6vuOOsagTfsfIAPxoyhpt2hWE2H6hzCmLvH65jM,18051
+skypilot_nightly-1.0.0.dev20250312.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+skypilot_nightly-1.0.0.dev20250312.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20250312.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20250312.dist-info/RECORD,,
{skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250312.dist-info}/LICENSE
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250312.dist-info}/WHEEL
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250312.dist-info}/entry_points.txt
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250312.dist-info}/top_level.txt
RENAMED
File without changes