skypilot-nightly 1.0.0.dev20250407__py3-none-any.whl → 1.0.0.dev20250410__py3-none-any.whl

This diff shows the changes between two publicly released package versions, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (37)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/azure.py +1 -1
  3. sky/adaptors/nebius.py +5 -27
  4. sky/backends/backend.py +9 -7
  5. sky/backends/cloud_vm_ray_backend.py +8 -11
  6. sky/backends/local_docker_backend.py +3 -3
  7. sky/cloud_stores.py +0 -4
  8. sky/clouds/do.py +4 -5
  9. sky/clouds/gcp.py +5 -3
  10. sky/clouds/nebius.py +22 -12
  11. sky/clouds/service_catalog/data_fetchers/fetch_ibm.py +1 -2
  12. sky/clouds/service_catalog/gcp_catalog.py +37 -10
  13. sky/core.py +6 -6
  14. sky/data/data_utils.py +5 -9
  15. sky/data/mounting_utils.py +1 -1
  16. sky/data/storage.py +25 -31
  17. sky/data/storage_utils.py +36 -20
  18. sky/execution.py +11 -4
  19. sky/jobs/server/server.py +5 -1
  20. sky/provision/do/utils.py +19 -16
  21. sky/provision/gcp/config.py +30 -20
  22. sky/server/requests/executor.py +204 -126
  23. sky/server/requests/process.py +212 -0
  24. sky/server/requests/queues/local_queue.py +16 -0
  25. sky/setup_files/dependencies.py +1 -1
  26. sky/skylet/log_lib.py +4 -0
  27. sky/task.py +27 -7
  28. sky/utils/atomic.py +52 -0
  29. sky/utils/common_utils.py +2 -2
  30. sky/utils/schemas.py +25 -7
  31. sky/utils/validator.py +1 -8
  32. {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/METADATA +2 -2
  33. {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/RECORD +37 -34
  34. {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/WHEEL +0 -0
  35. {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/entry_points.txt +0 -0
  36. {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/licenses/LICENSE +0 -0
  37. {skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250410.dist-info}/top_level.txt +0 -0
sky/server/requests/executor.py CHANGED
@@ -18,9 +18,7 @@ The number of the workers is determined by the system resources.
 
 See the [README.md](../README.md) for detailed architecture of the executor.
 """
-import concurrent.futures
 import contextlib
-import dataclasses
 import enum
 import multiprocessing
 import os
@@ -42,7 +40,9 @@ from sky.server import common as server_common
 from sky.server import constants as server_constants
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
+from sky.server.requests import process
 from sky.server.requests import requests as api_requests
+from sky.server.requests.queues import local_queue
 from sky.server.requests.queues import mp_queue
 from sky.skylet import constants
 from sky.utils import annotations
@@ -101,22 +101,23 @@ _MIN_LONG_WORKERS = 1
 # workers so at least 2 workers are needed to ensure responsiveness.
 _MIN_SHORT_WORKERS = 2
 
+# Default number of burstable workers for local API server. A heuristic number
+# that is large enough for most local cases.
+# TODO(aylei): the number of burstable workers should be auto-tuned based on the
+# system usage stats.
+_BURSTABLE_WORKERS_FOR_LOCAL = 1024
+
 
 class QueueBackend(enum.Enum):
+    # Local queue backend serves queues in each process locally, which has
+    # lower resource usage but the consumer must be in the same process, i.e.
+    # this only works in single-process mode.
+    LOCAL = 'local'
+    # Multi-process queue backend starts a dedicated process for serving queues.
     MULTIPROCESSING = 'multiprocessing'
     # TODO(zhwu): we can add redis backend in the future.
 
 
-@dataclasses.dataclass
-class RequestWorker:
-    id: int
-    # The type of queue this worker works on.
-    schedule_type: api_requests.ScheduleType
-
-    def __str__(self) -> str:
-        return f'Worker(id={self.id}, schedule_type={self.schedule_type.value})'
-
-
 class RequestQueue:
     """The queue for the requests, either redis or multiprocessing.
 
@@ -128,9 +129,12 @@ class RequestQueue:
                  backend: Optional[QueueBackend] = None) -> None:
         self.name = schedule_type.value
         self.backend = backend
-        assert (backend is None or
-                backend == QueueBackend.MULTIPROCESSING), backend
-        self.queue = mp_queue.get_queue(self.name)
+        if backend == QueueBackend.MULTIPROCESSING:
+            self.queue = mp_queue.get_queue(self.name)
+        elif backend == QueueBackend.LOCAL:
+            self.queue = local_queue.get_queue(self.name)
+        else:
+            raise RuntimeError(f'Invalid queue backend: {backend}')
 
     def put(self, request: Tuple[str, bool]) -> None:
         """Put and request to the queue.
@@ -161,6 +165,104 @@ class RequestQueue:
 queue_backend = QueueBackend.MULTIPROCESSING
 
 
+def executor_initializer(proc_group: str):
+    setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
+                              f'{multiprocessing.current_process().pid}')
+
+
+class RequestWorker:
+    """A worker that polls requests from the queue and runs them.
+
+    The worker can run at least `garanteed_parallelism` requests in parallel.
+    If there are more resources available, it can spin up extra workers up to
+    `garanteed_parallelism + burstable_parallelism`.
+    """
+
+    # The type of queue this worker works on.
+    schedule_type: api_requests.ScheduleType
+    # The least number of requests that this worker can run in parallel.
+    garanteed_parallelism: int
+    # The extra number of requests that this worker can run in parallel
+    # if there are available CPU/memory resources.
+    burstable_parallelism: int = 0
+
+    def __init__(self,
+                 schedule_type: api_requests.ScheduleType,
+                 garanteed_parallelism: int,
+                 burstable_parallelism: int = 0) -> None:
+        self.schedule_type = schedule_type
+        self.garanteed_parallelism = garanteed_parallelism
+        self.burstable_parallelism = burstable_parallelism
+
+    def __str__(self) -> str:
+        return f'Worker(schedule_type={self.schedule_type.value})'
+
+    def process_request(self, executor: process.BurstableExecutor,
+                        queue: RequestQueue) -> None:
+        try:
+            request_element = queue.get()
+            if request_element is None:
+                time.sleep(0.1)
+                return
+            request_id, ignore_return_value = request_element
+            request = api_requests.get_request(request_id)
+            assert request is not None, f'Request with ID {request_id} is None'
+            if request.status == api_requests.RequestStatus.CANCELLED:
+                return
+            logger.info(f'[{self}] Submitting request: {request_id}')
+            # Start additional process to run the request, so that it can be
+            # cancelled when requested by a user.
+            # TODO(zhwu): since the executor is reusing the request process,
+            # multiple requests can share the same process pid, which may cause
+            # issues with SkyPilot core functions if they rely on the exit of
+            # the process, such as subprocess_daemon.py.
+            executor.submit_until_success(_request_execution_wrapper,
+                                          request_id, ignore_return_value)
+
+            logger.info(f'[{self}] Submitted request: {request_id}')
+        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
+            # Catch any other exceptions to avoid crashing the worker process.
+            logger.error(
+                f'[{self}] Error processing request: '
+                f'{request_id if "request_id" in locals() else ""} '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
+
+    def run(self) -> None:
+        # Handle the SIGTERM signal to abort the executor process gracefully.
+        proc_group = f'{self.schedule_type.value}'
+        if threading.current_thread() is threading.main_thread():
+            signal.signal(signal.SIGTERM, _sigterm_handler)
+        setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
+        queue = _get_queue(self.schedule_type)
+
+        # Use concurrent.futures.ProcessPoolExecutor instead of
+        # multiprocessing.Pool because the former is more efficient with the
+        # support of lazy creation of worker processes.
+        # We use executor instead of individual multiprocessing.Process to avoid
+        # the overhead of forking a new process for each request, which can be
+        # about 1s delay.
+        try:
+            executor = process.BurstableExecutor(
+                garanteed_workers=self.garanteed_parallelism,
+                burst_workers=self.burstable_parallelism,
+                initializer=executor_initializer,
+                initargs=(proc_group,))
+            while True:
+                self.process_request(executor, queue)
+        # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
+        except KeyboardInterrupt:
+            pass
+        finally:
+            # In most cases, here we receive either ctrl-c in foreground
+            # execution or SIGTERM on server exiting. Gracefully exit the
+            # worker process and the executor.
+            # TODO(aylei): worker may also be killed by system daemons like
+            # OOM killer, crash the API server or recreate the worker process
+            # to avoid broken state in such cases.
+            logger.info(f'[{self}] Worker process interrupted')
+            executor.shutdown()
+
+
 @annotations.lru_cache(scope='global', maxsize=None)
 def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
     return RequestQueue(schedule_type, backend=queue_backend)
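
To make the new worker lifecycle concrete, a hedged sketch of wiring up a RequestWorker the way start() does further below (the parallelism numbers here are invented for illustration):

    import threading

    from sky.server.requests import requests as api_requests
    from sky.server.requests.executor import RequestWorker

    # Guarantee 4 concurrent long requests; burst up to 16 more when
    # CPU/memory allow (illustrative numbers only).
    worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
                           garanteed_parallelism=4,
                           burstable_parallelism=16)
    # run() polls the queue forever, so dispatch it to a daemon thread.
    threading.Thread(target=worker.run, daemon=True).start()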
@@ -349,110 +451,77 @@ def schedule_request(
     enqueue()
 
 
-def executor_initializer(proc_group: str):
-    setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
-                              f'{multiprocessing.current_process().pid}')
-
-
-def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
-    """Worker for the requests.
-
-    Args:
-        max_parallel_size: Maximum number of parallel jobs this worker can run.
-    """
-    # Handle the SIGTERM signal to abort the executor process gracefully.
-    signal.signal(signal.SIGTERM, _sigterm_handler)
-    proc_group = f'{worker.schedule_type.value}-{worker.id}'
-    setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
-    queue = _get_queue(worker.schedule_type)
-
-    def process_request(executor: concurrent.futures.ProcessPoolExecutor):
-        try:
-            request_element = queue.get()
-            if request_element is None:
-                time.sleep(0.1)
-                return
-            request_id, ignore_return_value = request_element
-            request = api_requests.get_request(request_id)
-            assert request is not None, f'Request with ID {request_id} is None'
-            if request.status == api_requests.RequestStatus.CANCELLED:
-                return
-            logger.info(f'[{worker}] Submitting request: {request_id}')
-            # Start additional process to run the request, so that it can be
-            # cancelled when requested by a user.
-            # TODO(zhwu): since the executor is reusing the request process,
-            # multiple requests can share the same process pid, which may cause
-            # issues with SkyPilot core functions if they rely on the exit of
-            # the process, such as subprocess_daemon.py.
-            future = executor.submit(_request_execution_wrapper, request_id,
-                                     ignore_return_value)
-
-            if worker.schedule_type == api_requests.ScheduleType.LONG:
-                try:
-                    future.result(timeout=None)
-                except Exception as e:  # pylint: disable=broad-except
-                    logger.error(f'[{worker}] Request {request_id} failed: {e}')
-                logger.info(f'[{worker}] Finished request: {request_id}')
-            else:
-                logger.info(f'[{worker}] Submitted request: {request_id}')
-        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
-            # Catch any other exceptions to avoid crashing the worker process.
-            logger.error(
-                f'[{worker}] Error processing request: '
-                f'{request_id if "request_id" in locals() else ""} '
-                f'{common_utils.format_exception(e, use_bracket=True)}')
-
-    # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
-    # because the former is more efficient with the support of lazy creation of
-    # worker processes.
-    # We use executor instead of individual multiprocessing.Process to avoid
-    # the overhead of forking a new process for each request, which can be about
-    # 1s delay.
-    try:
-        executor = concurrent.futures.ProcessPoolExecutor(
-            max_workers=max_parallel_size,
-            initializer=executor_initializer,
-            initargs=(proc_group,))
-        while True:
-            process_request(executor)
-    # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
-    except KeyboardInterrupt:
-        pass
-    finally:
-        # In most cases, here we receive either ctrl-c in foreground execution
-        # or SIGTERM on server exiting. Gracefully exit the worker process and
-        # the executor.
-        # TODO(aylei): worker may also be killed by system daemons like OOM
-        # killer, crash the API server or recreate the worker process to avoid
-        # broken state in such cases.
-        logger.info(f'[{worker}] Worker process interrupted')
-        executor_processes = list(executor._processes.values())  # pylint: disable=protected-access,line-too-long
-        # Shutdown the executor so that executor process can exit once the
-        # running task is finished or interrupted.
-        executor.shutdown(wait=False)
-        # Proactively interrupt the running task to avoid indefinite waiting.
-        subprocess_utils.run_in_parallel(
-            subprocess_utils.kill_process_with_grace_period,
-            executor_processes,
-            num_threads=len(executor_processes))
-
-
 def start(deploy: bool) -> List[multiprocessing.Process]:
-    """Start the request workers."""
+    """Start the request workers.
+
+    Request workers run in background, schedule the requests and delegate the
+    request execution to executor processes. We have different assumptions for
+    the resources in different deployment modes, which leads to different
+    worker setups:
+
+    - Deployment mode (deploy=True), we assume the resources are dedicated to
+      the API server and the resources will be tuned for serious use cases, so:
+      - Use multiprocessing queue backend and dedicated workers processes to
+        avoid GIL contention.
+      - Parallelism (number of executor processes) is fixed and executor
+        processes have same lifecycle with the server, which ensures
+        best-effort cache reusing and stable resources consumption.
+      - Reject to start in low resource environments, to avoid flaky
+        deployments.
+    - Local mode (deploy=False), we assume the server is running in a shared
+      environment (e.g. laptop) and users typically do not pay attention to
+      the resource setup of the server. Moreover, existing users may expect
+      some consistent behaviors with old versions, i.e. before API server was
+      introduced, so:
+      - The max number of long-running executor processes are limited, to avoid
+        high memory consumption when the server is idle.
+      - Allow burstable workers to handle requests when all long-running
+        workers are busy, which mimics the behavior of local sky CLI before
+        API server was introduced.
+      - Works in low resources environments, and further reduce the memory
+        consumption in low resource environments.
+
+    Note that there is still significant overhead for SDK users when migrate to
+    local API server. Since the users are free to run sky operations in Threads
+    when using SDK but all client operations will occupy at least one worker
+    process after API server was introduced.
+    """
     # Determine the job capacity of the workers based on the system resources.
     cpu_count = common_utils.get_cpu_count()
     mem_size_gb = common_utils.get_mem_size_gb()
     mem_size_gb = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
+    # Runs in low resource mode if the available memory is less than
+    # server_constants.MIN_AVAIL_MEM_GB.
     max_parallel_for_long = _max_long_worker_parallism(cpu_count,
                                                        mem_size_gb,
                                                        local=not deploy)
     max_parallel_for_short = _max_short_worker_parallism(
         mem_size_gb, max_parallel_for_long)
-    logger.info(
-        f'SkyPilot API server will start {max_parallel_for_long} workers for '
-        f'long requests and will allow at max '
-        f'{max_parallel_for_short} short requests in parallel.')
-
+    if mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
+        # Permanent worker process may have significant memory consumption
+        # (~350MB per worker) after running commands like `sky check`, so we
+        # don't start any permanent workers in low resource local mode. This
+        # mimics the behavior of local sky CLI before API server was
+        # introduced, where the CLI will start new process everytime and
+        # never reject to start due to resource constraints.
+        # Note that the refresh daemon will still occupy one worker
+        # permanently because it never exits.
+        max_parallel_for_long = 0
+        max_parallel_for_short = 0
+        logger.warning(
+            'SkyPilot API server will run in low resource mode because '
+            'the available memory is less than '
+            f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+    else:
+        logger.info(
+            f'SkyPilot API server will start {max_parallel_for_long} workers '
+            f'for long requests and will allow at max '
+            f'{max_parallel_for_short} short requests in parallel.')
+    if not deploy:
+        # For local mode, use local queue backend since we only run 1 uvicorn
+        # worker in local mode.
+        global queue_backend
+        queue_backend = QueueBackend.LOCAL
     sub_procs = []
     # Setup the queues.
     if queue_backend == QueueBackend.MULTIPROCESSING:
@@ -471,28 +540,37 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
             target=mp_queue.start_queue_manager, args=(queue_names, port))
         queue_server.start()
         sub_procs.append(queue_server)
-        mp_queue.wait_for_queues_to_be_ready(queue_names, queue_server, port)
+        mp_queue.wait_for_queues_to_be_ready(queue_names,
+                                             queue_server,
+                                             port=port)
+    elif queue_backend == QueueBackend.LOCAL:
+        # No setup is needed for local queue backend.
+        pass
+    else:
+        # Should be checked earlier, but just in case.
+        raise RuntimeError(f'Invalid queue backend: {queue_backend}')
 
     logger.info('Request queues created')
 
-    long_workers = []
-    for worker_id in range(max_parallel_for_long):
-        worker = RequestWorker(id=worker_id,
-                               schedule_type=api_requests.ScheduleType.LONG)
-        worker_proc = multiprocessing.Process(target=request_worker,
-                                              args=(worker, 1))
-        long_workers.append(worker_proc)
-        sub_procs.append(worker_proc)
-    threading.Thread(target=subprocess_utils.slow_start_processes,
-                     args=(long_workers,),
-                     daemon=True).start()
+    def run_worker_in_background(worker: RequestWorker):
+        # Thread dispatcher is sufficient for current scale, refer to
+        # tests/load_tests/test_queue_dispatcher.py for more details.
+        # Use daemon thread for automatic cleanup.
+        thread = threading.Thread(target=worker.run, daemon=True)
+        thread.start()
+
+    burstable_parallelism = _BURSTABLE_WORKERS_FOR_LOCAL if not deploy else 0
+    # Start a worker for long requests.
+    long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
+                                garanteed_parallelism=max_parallel_for_long,
+                                burstable_parallelism=burstable_parallelism)
+    run_worker_in_background(long_worker)
 
     # Start a worker for short requests.
-    worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
-    worker_proc = multiprocessing.Process(target=request_worker,
-                                          args=(worker, max_parallel_for_short))
-    worker_proc.start()
-    sub_procs.append(worker_proc)
+    short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
+                                 garanteed_parallelism=max_parallel_for_short,
+                                 burstable_parallelism=burstable_parallelism)
+    run_worker_in_background(short_worker)
     return sub_procs
 
 
sky/server/requests/process.py ADDED
@@ -0,0 +1,212 @@
+"""ProcessPoolExecutor with additional supports for skypilot."""
+import concurrent.futures
+import logging
+import multiprocessing
+import threading
+import time
+from typing import Callable, Dict, Optional, Tuple
+
+from sky.utils import atomic
+from sky.utils import subprocess_utils
+
+logger = logging.getLogger(__name__)
+
+
+class PoolExecutor(concurrent.futures.ProcessPoolExecutor):
+    """A custom ProcessPoolExecutor with additional supports for skypilot.
+
+    The additional supports include:
+    1. Disposable workers: support control whether the worker process should
+       exit after complete a task.
+    2. Idle check: support check if there are any idle workers.
+    3. Proactive shutdown: SIGTERM worker processes when the executor is
+       shutting down instead of indefinitely waiting.
+    """
+
+    def __init__(self, max_workers: int, **kwargs):
+        super().__init__(max_workers=max_workers, **kwargs)
+        self.max_workers: int = max_workers
+        # The number of workers that are handling tasks, atomicity across
+        # multiple threads is sufficient since the idleness check is
+        # best-effort and does not affect the correctness.
+        # E.g. the following case is totally fine:
+        # 1. Thread 1 checks running == max_workers
+        # 2. Thread 2 decrements running
+        # 3. Thread 1 schedules the task to other pool even if the pool is
+        #    currently idle.
+        self.running: atomic.AtomicInt = atomic.AtomicInt(0)
+
+    def submit(self, fn, *args, **kwargs) -> concurrent.futures.Future:
+        """Submit a task for execution.
+
+        If reuse_worker is False, wraps the function to exit after completion.
+        """
+        self.running.increment()
+        future = super().submit(fn, *args, **kwargs)
+        future.add_done_callback(lambda _: self.running.decrement())
+        return future
+
+    def has_idle_workers(self) -> bool:
+        """Check if there are any idle workers."""
+        return self.running.get() < self.max_workers
+
+    def shutdown(self, wait: bool = True) -> None:
+        """Shutdown the executor."""
+        # Here wait means wait for the proactive cancellation complete.
+        # TODO(aylei): we may support wait=True in the future if needed.
+        assert wait is True, 'wait=False is not supported'
+        executor_processes = list(self._processes.values())
+        # Shutdown the executor so that executor process can exit once the
+        # running task is finished or interrupted.
+        super().shutdown(wait=False)
+        # Proactively interrupt the running task to avoid indefinite waiting.
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            executor_processes,
+            num_threads=len(executor_processes))
+
+
+# Define the worker function outside of the class to avoid pickling self
+def _disposable_worker(fn, initializer: Optional[Callable], initargs: Tuple,
+                       args, kwargs):
+    try:
+        if initializer is not None:
+            initializer(*initargs)
+        fn(*args, **kwargs)
+    except BaseException as e:  # pylint: disable=broad-except
+        return e
+
+
+class DisposableExecutor:
+    """A simple wrapper that creates a new process for each task.
+
+    This is a workaround for Python 3.10 since `max_tasks_per_child` of
+    ProcessPoolExecutor was introduced in 3.11. There is no way to control
+    the worker lifetime in 3.10.
+    Ref: https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor # pylint: disable=line-too-long
+    TODO(aylei): use the official `max_tasks_per_child` when upgrade to 3.11
+    """
+
+    def __init__(self,
+                 max_workers: Optional[int] = None,
+                 initializer: Optional[Callable] = None,
+                 initargs: Tuple = ()):
+        self.max_workers: Optional[int] = max_workers
+        self.workers: Dict[int, multiprocessing.Process] = {}
+        self._shutdown: bool = False
+        self._lock: threading.Lock = threading.Lock()
+        self._initializer: Optional[Callable] = initializer
+        self._initargs: Tuple = initargs
+
+    def _monitor_worker(self, process: multiprocessing.Process) -> None:
+        """Monitor the worker process and cleanup when it's done."""
+        process.join()
+        if process.pid:
+            with self._lock:
+                if process.pid in self.workers:
+                    del self.workers[process.pid]
+
+    # Submit is not compatible with ProcessPoolExecutor because we does not
+    # bother to return a Future. Can be improved if needed.
+    def submit(self, fn, *args, **kwargs) -> bool:
+        """Submit a task for execution."""
+        if self._shutdown:
+            return False
+        with self._lock:
+            if (self.max_workers is not None and
+                    len(self.workers) >= self.max_workers):
+                return False
+
+        process = multiprocessing.Process(target=_disposable_worker,
+                                          args=(fn, self._initializer,
+                                                self._initargs, args, kwargs))
+        process.start()
+
+        with self._lock:
+            pid = process.pid or 0
+            if pid == 0:
+                raise RuntimeError('Failed to start process')
+            self.workers[pid] = process
+
+        # Start monitor thread to cleanup the worker process when it's done.
+        monitor_thread = threading.Thread(target=self._monitor_worker,
+                                          args=(process,),
+                                          daemon=True)
+        monitor_thread.start()
+
+        return True
+
+    def has_idle_workers(self) -> bool:
+        """Check if there are any idle workers."""
+        if self.max_workers is None:
+            return True
+        with self._lock:
+            return len(self.workers) < self.max_workers
+
+    def shutdown(self):
+        """Shutdown the executor."""
+        with self._lock:
+            self._shutdown = True
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            list(self.workers.values()),  # Convert dict values to list
+            num_threads=len(self.workers))
+
+
+class BurstableExecutor:
+    """An multiprocessing executor that supports bursting worker processes."""
+
+    # _executor is a PoolExecutor that is used to run guaranteed requests.
+    _executor: Optional[PoolExecutor] = None
+    # _burst_executor is a ProcessPoolExecutor that is used to run burst
+    # requests.
+    _burst_executor: Optional[DisposableExecutor] = None
+
+    def __init__(self,
+                 garanteed_workers: int,
+                 burst_workers: int = 0,
+                 **kwargs):
+        if garanteed_workers > 0:
+            self._executor = PoolExecutor(max_workers=garanteed_workers,
+                                          **kwargs)
+        if burst_workers > 0:
+            self._burst_executor = DisposableExecutor(max_workers=burst_workers,
+                                                      **kwargs)
+
+    def submit_until_success(self, fn, *args, **kwargs):
+        """Submit a task for execution until success.
+
+        Prioritizes submitting to the guaranteed pool. If no idle workers
+        are available in the guaranteed pool, it will submit to the burst
+        pool.
+        TODO(aylei): this is coupled with executor.RequestWorker since we
+        know the worker is dedicated to request scheduling and it either
+        blocks on request polling or request submitting. So it is no harm
+        to make submit blocking here. But for general cases, we need an
+        internal queue to decouple submit and run.
+        """
+
+        while True:
+            if self._executor is not None and self._executor.has_idle_workers():
+                self._executor.submit(fn, *args, **kwargs)
+                break
+            if (self._burst_executor is not None and
+                    self._burst_executor.has_idle_workers()):
+                self._burst_executor.submit(fn, *args, **kwargs)
+                break
+            if self._executor is not None:
+                # No idle workers in either pool, still queue the request
+                # to the guaranteed pool to keep behavior consistent.
+                self._executor.submit(fn, *args, **kwargs)
+                break
+            logger.debug('No guaranteed pool set and the burst pool is full, '
+                         'retry later.')
+            time.sleep(0.1)
+
+    def shutdown(self) -> None:
+        """Shutdown the executor."""
+
+        if self._burst_executor is not None:
+            self._burst_executor.shutdown()
+        if self._executor is not None:
+            self._executor.shutdown(wait=True)
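
A hedged usage sketch of the new module (the task function and worker counts are invented; submitted functions must be module-level so they can be pickled for the guaranteed ProcessPoolExecutor pool):

    import time

    from sky.server.requests import process


    def _task(name: str) -> None:
        # Stand-in for _request_execution_wrapper in the executor.
        print(f'running {name}')
        time.sleep(1)


    if __name__ == '__main__':
        executor = process.BurstableExecutor(garanteed_workers=2,
                                             burst_workers=4)
        # Blocks until some pool accepts the task: the guaranteed pool
        # first, then a disposable burst worker, then the guaranteed
        # pool's internal queue as a last resort.
        for i in range(10):
            executor.submit_until_success(_task, f'task-{i}')
        executor.shutdown()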
sky/server/requests/queues/local_queue.py ADDED
@@ -0,0 +1,16 @@
+"""Process-local queue implementation."""
+import queue
+import threading
+from typing import Dict
+
+# Global dict to store queues
+_queues: Dict[str, queue.Queue] = {}
+_lock = threading.Lock()
+
+
+def get_queue(queue_name: str) -> queue.Queue:
+    """Get or create a queue by name."""
+    with _lock:
+        if queue_name not in _queues:
+            _queues[queue_name] = queue.Queue()
+        return _queues[queue_name]
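
Because the queues live in module-level state guarded by a lock, every caller in the same process gets the same queue object for a given name; a quick sketch (the queue name below is arbitrary):

    from sky.server.requests.queues import local_queue

    q1 = local_queue.get_queue('short')
    q2 = local_queue.get_queue('short')
    assert q1 is q2  # same name, same process -> same queue.Queue
    q1.put(('req-abc', True))
    print(q2.get())  # ('req-abc', True)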
sky/setup_files/dependencies.py CHANGED
@@ -9,7 +9,7 @@ import sys
 from typing import Dict, List
 
 install_requires = [
-    'wheel',
+    'wheel<0.46.0',  # https://github.com/skypilot-org/skypilot/issues/5153
     'cachetools',
     # NOTE: ray requires click>=7.0.
     'click >= 7.0',
sky/skylet/log_lib.py CHANGED
@@ -149,6 +149,7 @@ def run_with_log(
     process_stream: bool = True,
     line_processor: Optional[log_utils.LineProcessor] = None,
     streaming_prefix: Optional[str] = None,
+    log_cmd: bool = False,
     **kwargs,
 ) -> Union[int, Tuple[int, str, str]]:
     """Runs a command and logs its output to a file.
@@ -182,6 +183,9 @@ def run_with_log(
     # the terminal output when typing in the terminal that starts the API
     # server.
     stdin = kwargs.pop('stdin', subprocess.DEVNULL)
+    if log_cmd:
+        with open(log_path, 'a', encoding='utf-8') as f:
+            print(f'Running command: {cmd}', file=f)
     with subprocess.Popen(cmd,
                           stdout=stdout_arg,
                           stderr=stderr_arg,
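
Finally, a hedged sketch of the new log_cmd flag, assuming run_with_log's leading positional parameters are the command and the log path (as the hunk's use of cmd and log_path suggests):

    from sky.skylet import log_lib

    returncode = log_lib.run_with_log(['echo', 'hello'],
                                      '/tmp/echo.log',
                                      log_cmd=True)
    # With log_cmd=True the log file now begins with:
    #   Running command: ['echo', 'hello']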