skypilot-nightly 1.0.0.dev20250407__py3-none-any.whl → 1.0.0.dev20250408__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = 'c5039370280815a3f347e76622dc154ede36d6c3'
+_SKYPILOT_COMMIT_SHA = 'e0674be528e87191ade88961c44c6449d01232fa'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250407'
+__version__ = '1.0.0.dev20250408'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -655,12 +655,9 @@ class RayCodeGen:
             rclone_flush_script = {rclone_flush_script!r}
         if run_fn is not None:
             script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
-            if script is not None:
-                script += rclone_flush_script
-            else:
-                script = rclone_flush_script
 
         if script is not None:
+            script += rclone_flush_script
             sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
             # Backward compatibility: Environment starting with `SKY_` is
             # deprecated. Remove it in v0.9.0.
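
Note: this hunk slightly changes behavior. Previously, when `run_fn` returned `None`, `script` was set to the flush script alone and the flush still ran; after the change, the flush script is only appended when a script already exists. A standalone sketch of the two control flows (the string values are stand-ins, not the generated Ray program):

```python
# Standalone sketch of the control-flow change above; values are stand-ins.
RCLONE_FLUSH = 'rclone_flush'

def old_behavior(script):
    # Before: the flush script ran even when run_fn produced no script.
    if script is not None:
        script += RCLONE_FLUSH
    else:
        script = RCLONE_FLUSH
    return script

def new_behavior(script):
    # After: the flush script is appended only when a script exists, so a
    # None script now skips the flush entirely.
    if script is not None:
        script += RCLONE_FLUSH
    return script

assert old_behavior(None) == RCLONE_FLUSH
assert new_behavior(None) is None
assert old_behavior('run;') == new_behavior('run;') == 'run;' + RCLONE_FLUSH
```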
sky/data/storage_utils.py CHANGED
@@ -227,6 +227,9 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
     expand_src_dir_path = os.path.expanduser(src_dir_path)
     skyignore_path = os.path.join(expand_src_dir_path,
                                   constants.SKY_IGNORE_FILE)
+    # Fail fast if the source is a file.
+    if os.path.isfile(expand_src_dir_path):
+        raise ValueError(f'{src_dir_path} is a file, not a directory.')
     if os.path.exists(skyignore_path):
         logger.debug(f' {colorama.Style.DIM}'
                      f'Excluded files to sync to cluster based on '
@@ -267,11 +270,15 @@ def zip_files_and_folders(items: List[str],
             item = os.path.expanduser(item)
             if not os.path.isfile(item) and not os.path.isdir(item):
                 raise ValueError(f'{item} does not exist.')
-            excluded_files = set(
-                [os.path.join(item, f) for f in get_excluded_files(item)])
-            if os.path.isfile(item) and item not in excluded_files:
+            if os.path.isfile(item):
+                # Add the file to the zip archive even if it matches
+                # patterns in dot ignore files, as it was explicitly
+                # specified by user.
                 zipf.write(item)
             elif os.path.isdir(item):
+                excluded_files = set([
+                    os.path.join(item, f) for f in get_excluded_files(item)
+                ])
                 for root, dirs, files in os.walk(item, followlinks=False):
                     # Modify dirs in-place to control os.walk()'s traversal
                     # behavior. This filters out excluded directories BEFORE
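
Note: the rewrite above moves the exclusion computation under the directory branch and always archives explicitly listed files. A minimal sketch of the resulting semantics, using a throwaway directory so it is self-contained (assumes an environment where these SkyPilot modules are importable):

```python
import os
import tempfile

from sky.data import storage_utils

tmp = tempfile.mkdtemp()
file_path = os.path.join(tmp, 'explicit.log')
with open(file_path, 'w', encoding='utf-8') as f:
    f.write('archived even if an ignore pattern matches it\n')

# Directories: exclusions are computed once, then honored during the walk.
excluded = storage_utils.get_excluded_files(tmp)

# Files: get_excluded_files() now fails fast instead of walking a
# non-directory.
try:
    storage_utils.get_excluded_files(file_path)
except ValueError as e:
    print(e)  # '<path> is a file, not a directory.'
```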
sky/server/requests/executor.py CHANGED
@@ -18,9 +18,7 @@ The number of the workers is determined by the system resources.
 
 See the [README.md](../README.md) for detailed architecture of the executor.
 """
-import concurrent.futures
 import contextlib
-import dataclasses
 import enum
 import multiprocessing
 import os
@@ -42,7 +40,9 @@ from sky.server import common as server_common
 from sky.server import constants as server_constants
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
+from sky.server.requests import process
 from sky.server.requests import requests as api_requests
+from sky.server.requests.queues import local_queue
 from sky.server.requests.queues import mp_queue
 from sky.skylet import constants
 from sky.utils import annotations
@@ -101,22 +101,23 @@ _MIN_LONG_WORKERS = 1
 # workers so at least 2 workers are needed to ensure responsiveness.
 _MIN_SHORT_WORKERS = 2
 
+# Default number of burstable workers for local API server. A heuristic number
+# that is large enough for most local cases.
+# TODO(aylei): the number of burstable workers should be auto-tuned based on the
+# system usage stats.
+_BURSTABLE_WORKERS_FOR_LOCAL = 1024
+
 
 class QueueBackend(enum.Enum):
+    # Local queue backend serves queues in each process locally, which has
+    # lower resource usage but the consumer must be in the same process, i.e.
+    # this only works in single-process mode.
+    LOCAL = 'local'
+    # Multi-process queue backend starts a dedicated process for serving queues.
     MULTIPROCESSING = 'multiprocessing'
     # TODO(zhwu): we can add redis backend in the future.
 
 
-@dataclasses.dataclass
-class RequestWorker:
-    id: int
-    # The type of queue this worker works on.
-    schedule_type: api_requests.ScheduleType
-
-    def __str__(self) -> str:
-        return f'Worker(id={self.id}, schedule_type={self.schedule_type.value})'
-
-
 class RequestQueue:
     """The queue for the requests, either redis or multiprocessing.
 
@@ -128,9 +129,12 @@ class RequestQueue:
                  backend: Optional[QueueBackend] = None) -> None:
         self.name = schedule_type.value
         self.backend = backend
-        assert (backend is None or
-                backend == QueueBackend.MULTIPROCESSING), backend
-        self.queue = mp_queue.get_queue(self.name)
+        if backend == QueueBackend.MULTIPROCESSING:
+            self.queue = mp_queue.get_queue(self.name)
+        elif backend == QueueBackend.LOCAL:
+            self.queue = local_queue.get_queue(self.name)
+        else:
+            raise RuntimeError(f'Invalid queue backend: {backend}')
 
     def put(self, request: Tuple[str, bool]) -> None:
         """Put and request to the queue.
@@ -161,6 +165,104 @@ class RequestQueue:
 queue_backend = QueueBackend.MULTIPROCESSING
 
 
+def executor_initializer(proc_group: str):
+    setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
+                              f'{multiprocessing.current_process().pid}')
+
+
+class RequestWorker:
+    """A worker that polls requests from the queue and runs them.
+
+    The worker can run at least `garanteed_parallelism` requests in parallel.
+    If there are more resources available, it can spin up extra workers up to
+    `garanteed_parallelism + burstable_parallelism`.
+    """
+
+    # The type of queue this worker works on.
+    schedule_type: api_requests.ScheduleType
+    # The least number of requests that this worker can run in parallel.
+    garanteed_parallelism: int
+    # The extra number of requests that this worker can run in parallel
+    # if there are available CPU/memory resources.
+    burstable_parallelism: int = 0
+
+    def __init__(self,
+                 schedule_type: api_requests.ScheduleType,
+                 garanteed_parallelism: int,
+                 burstable_parallelism: int = 0) -> None:
+        self.schedule_type = schedule_type
+        self.garanteed_parallelism = garanteed_parallelism
+        self.burstable_parallelism = burstable_parallelism
+
+    def __str__(self) -> str:
+        return f'Worker(schedule_type={self.schedule_type.value})'
+
+    def process_request(self, executor: process.BurstableExecutor,
+                        queue: RequestQueue) -> None:
+        try:
+            request_element = queue.get()
+            if request_element is None:
+                time.sleep(0.1)
+                return
+            request_id, ignore_return_value = request_element
+            request = api_requests.get_request(request_id)
+            assert request is not None, f'Request with ID {request_id} is None'
+            if request.status == api_requests.RequestStatus.CANCELLED:
+                return
+            logger.info(f'[{self}] Submitting request: {request_id}')
+            # Start additional process to run the request, so that it can be
+            # cancelled when requested by a user.
+            # TODO(zhwu): since the executor is reusing the request process,
+            # multiple requests can share the same process pid, which may cause
+            # issues with SkyPilot core functions if they rely on the exit of
+            # the process, such as subprocess_daemon.py.
+            executor.submit_until_success(_request_execution_wrapper,
+                                          request_id, ignore_return_value)
+
+            logger.info(f'[{self}] Submitted request: {request_id}')
+        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
+            # Catch any other exceptions to avoid crashing the worker process.
+            logger.error(
+                f'[{self}] Error processing request: '
+                f'{request_id if "request_id" in locals() else ""} '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
+
+    def run(self) -> None:
+        # Handle the SIGTERM signal to abort the executor process gracefully.
+        proc_group = f'{self.schedule_type.value}'
+        if threading.current_thread() is threading.main_thread():
+            signal.signal(signal.SIGTERM, _sigterm_handler)
+        setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
+        queue = _get_queue(self.schedule_type)
+
+        # Use concurrent.futures.ProcessPoolExecutor instead of
+        # multiprocessing.Pool because the former is more efficient with the
+        # support of lazy creation of worker processes.
+        # We use executor instead of individual multiprocessing.Process to avoid
+        # the overhead of forking a new process for each request, which can be
+        # about 1s delay.
+        try:
+            executor = process.BurstableExecutor(
+                garanteed_workers=self.garanteed_parallelism,
+                burst_workers=self.burstable_parallelism,
+                initializer=executor_initializer,
+                initargs=(proc_group,))
+            while True:
+                self.process_request(executor, queue)
+        # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
+        except KeyboardInterrupt:
+            pass
+        finally:
+            # In most cases, here we receive either ctrl-c in foreground
+            # execution or SIGTERM on server exiting. Gracefully exit the
+            # worker process and the executor.
+            # TODO(aylei): worker may also be killed by system daemons like
+            # OOM killer, crash the API server or recreate the worker process
+            # to avoid broken state in such cases.
+            logger.info(f'[{self}] Worker process interrupted')
+            executor.shutdown()
+
+
 @annotations.lru_cache(scope='global', maxsize=None)
 def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
     return RequestQueue(schedule_type, backend=queue_backend)
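
Note: `RequestWorker.run()` above combines a main-thread-only SIGTERM hook, a polling loop with a 0.1s idle backoff, and a `finally` block that tears the executor down. A condensed, standalone sketch of the same pattern (the queue and handler are hypothetical stand-ins, not SkyPilot APIs):

```python
import queue
import signal
import threading
import time

work_queue: 'queue.Queue[str]' = queue.Queue()

def _sigterm_to_interrupt(_signum, _frame):
    # Route SIGTERM into the same exit path as ctrl-c.
    raise KeyboardInterrupt

def run_worker(handle) -> None:
    # signal.signal() is only legal on the main thread, hence the guard
    # mirrored from RequestWorker.run().
    if threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGTERM, _sigterm_to_interrupt)
    try:
        while True:
            try:
                item = work_queue.get_nowait()
            except queue.Empty:
                time.sleep(0.1)  # Idle backoff, as in the real worker.
                continue
            handle(item)
    except KeyboardInterrupt:
        pass  # Graceful exit; the real worker also shuts down its executor.
```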
@@ -349,110 +451,77 @@ def schedule_request(
     enqueue()
 
 
-def executor_initializer(proc_group: str):
-    setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
-                              f'{multiprocessing.current_process().pid}')
-
-
-def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
-    """Worker for the requests.
-
-    Args:
-        max_parallel_size: Maximum number of parallel jobs this worker can run.
-    """
-    # Handle the SIGTERM signal to abort the executor process gracefully.
-    signal.signal(signal.SIGTERM, _sigterm_handler)
-    proc_group = f'{worker.schedule_type.value}-{worker.id}'
-    setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
-    queue = _get_queue(worker.schedule_type)
-
-    def process_request(executor: concurrent.futures.ProcessPoolExecutor):
-        try:
-            request_element = queue.get()
-            if request_element is None:
-                time.sleep(0.1)
-                return
-            request_id, ignore_return_value = request_element
-            request = api_requests.get_request(request_id)
-            assert request is not None, f'Request with ID {request_id} is None'
-            if request.status == api_requests.RequestStatus.CANCELLED:
-                return
-            logger.info(f'[{worker}] Submitting request: {request_id}')
-            # Start additional process to run the request, so that it can be
-            # cancelled when requested by a user.
-            # TODO(zhwu): since the executor is reusing the request process,
-            # multiple requests can share the same process pid, which may cause
-            # issues with SkyPilot core functions if they rely on the exit of
-            # the process, such as subprocess_daemon.py.
-            future = executor.submit(_request_execution_wrapper, request_id,
-                                     ignore_return_value)
-
-            if worker.schedule_type == api_requests.ScheduleType.LONG:
-                try:
-                    future.result(timeout=None)
-                except Exception as e:  # pylint: disable=broad-except
-                    logger.error(f'[{worker}] Request {request_id} failed: {e}')
-                logger.info(f'[{worker}] Finished request: {request_id}')
-            else:
-                logger.info(f'[{worker}] Submitted request: {request_id}')
-        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
-            # Catch any other exceptions to avoid crashing the worker process.
-            logger.error(
-                f'[{worker}] Error processing request: '
-                f'{request_id if "request_id" in locals() else ""} '
-                f'{common_utils.format_exception(e, use_bracket=True)}')
-
-    # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
-    # because the former is more efficient with the support of lazy creation of
-    # worker processes.
-    # We use executor instead of individual multiprocessing.Process to avoid
-    # the overhead of forking a new process for each request, which can be about
-    # 1s delay.
-    try:
-        executor = concurrent.futures.ProcessPoolExecutor(
-            max_workers=max_parallel_size,
-            initializer=executor_initializer,
-            initargs=(proc_group,))
-        while True:
-            process_request(executor)
-    # TODO(aylei): better to distinct between KeyboardInterrupt and SIGTERM.
-    except KeyboardInterrupt:
-        pass
-    finally:
-        # In most cases, here we receive either ctrl-c in foreground execution
-        # or SIGTERM on server exiting. Gracefully exit the worker process and
-        # the executor.
-        # TODO(aylei): worker may also be killed by system daemons like OOM
-        # killer, crash the API server or recreate the worker process to avoid
-        # broken state in such cases.
-        logger.info(f'[{worker}] Worker process interrupted')
-        executor_processes = list(executor._processes.values())  # pylint: disable=protected-access,line-too-long
-        # Shutdown the executor so that executor process can exit once the
-        # running task is finished or interrupted.
-        executor.shutdown(wait=False)
-        # Proactively interrupt the running task to avoid indefinite waiting.
-        subprocess_utils.run_in_parallel(
-            subprocess_utils.kill_process_with_grace_period,
-            executor_processes,
-            num_threads=len(executor_processes))
-
-
 def start(deploy: bool) -> List[multiprocessing.Process]:
-    """Start the request workers."""
+    """Start the request workers.
+
+    Request workers run in background, schedule the requests and delegate the
+    request execution to executor processes. We have different assumptions for
+    the resources in different deployment modes, which leads to different
+    worker setups:
+
+    - Deployment mode (deploy=True), we assume the resources are dedicated to
+      the API server and the resources will be tuned for serious use cases, so:
+      - Use multiprocessing queue backend and dedicated workers processes to
+        avoid GIL contention.
+      - Parallelism (number of executor processes) is fixed and executor
+        processes have same lifecycle with the server, which ensures
+        best-effort cache reusing and stable resources consumption.
+      - Reject to start in low resource environments, to avoid flaky
+        deployments.
+    - Local mode (deploy=False), we assume the server is running in a shared
+      environment (e.g. laptop) and users typically do not pay attention to
+      the resource setup of the server. Moreover, existing users may expect
+      some consistent behaviors with old versions, i.e. before API server was
+      introduced, so:
+      - The max number of long-running executor processes are limited, to avoid
+        high memory consumption when the server is idle.
+      - Allow burstable workers to handle requests when all long-running
+        workers are busy, which mimics the behavior of local sky CLI before
+        API server was introduced.
+      - Works in low resources environments, and further reduce the memory
+        consumption in low resource environments.
+
+    Note that there is still significant overhead for SDK users when migrate to
+    local API server. Since the users are free to run sky operations in Threads
+    when using SDK but all client operations will occupy at least one worker
+    process after API server was introduced.
+    """
     # Determine the job capacity of the workers based on the system resources.
     cpu_count = common_utils.get_cpu_count()
     mem_size_gb = common_utils.get_mem_size_gb()
     mem_size_gb = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
+    # Runs in low resource mode if the available memory is less than
+    # server_constants.MIN_AVAIL_MEM_GB.
     max_parallel_for_long = _max_long_worker_parallism(cpu_count,
                                                        mem_size_gb,
                                                        local=not deploy)
     max_parallel_for_short = _max_short_worker_parallism(
         mem_size_gb, max_parallel_for_long)
-    logger.info(
-        f'SkyPilot API server will start {max_parallel_for_long} workers for '
-        f'long requests and will allow at max '
-        f'{max_parallel_for_short} short requests in parallel.')
-
+    if mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
+        # Permanent worker process may have significant memory consumption
+        # (~350MB per worker) after running commands like `sky check`, so we
+        # don't start any permanent workers in low resource local mode. This
+        # mimics the behavior of local sky CLI before API server was
+        # introduced, where the CLI will start new process everytime and
+        # never reject to start due to resource constraints.
+        # Note that the refresh daemon will still occupy one worker
+        # permanently because it never exits.
+        max_parallel_for_long = 0
+        max_parallel_for_short = 0
+        logger.warning(
+            'SkyPilot API server will run in low resource mode because '
+            'the available memory is less than '
+            f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+    else:
+        logger.info(
+            f'SkyPilot API server will start {max_parallel_for_long} workers '
+            f'for long requests and will allow at max '
+            f'{max_parallel_for_short} short requests in parallel.')
+    if not deploy:
+        # For local mode, use local queue backend since we only run 1 uvicorn
+        # worker in local mode.
+        global queue_backend
+        queue_backend = QueueBackend.LOCAL
     sub_procs = []
     # Setup the queues.
     if queue_backend == QueueBackend.MULTIPROCESSING:
@@ -471,28 +540,37 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
             target=mp_queue.start_queue_manager, args=(queue_names, port))
         queue_server.start()
         sub_procs.append(queue_server)
-        mp_queue.wait_for_queues_to_be_ready(queue_names, queue_server, port)
+        mp_queue.wait_for_queues_to_be_ready(queue_names,
+                                             queue_server,
+                                             port=port)
+    elif queue_backend == QueueBackend.LOCAL:
+        # No setup is needed for local queue backend.
+        pass
+    else:
+        # Should be checked earlier, but just in case.
+        raise RuntimeError(f'Invalid queue backend: {queue_backend}')
 
     logger.info('Request queues created')
 
-    long_workers = []
-    for worker_id in range(max_parallel_for_long):
-        worker = RequestWorker(id=worker_id,
-                               schedule_type=api_requests.ScheduleType.LONG)
-        worker_proc = multiprocessing.Process(target=request_worker,
-                                              args=(worker, 1))
-        long_workers.append(worker_proc)
-        sub_procs.append(worker_proc)
-    threading.Thread(target=subprocess_utils.slow_start_processes,
-                     args=(long_workers,),
-                     daemon=True).start()
+    def run_worker_in_background(worker: RequestWorker):
+        # Thread dispatcher is sufficient for current scale, refer to
+        # tests/load_tests/test_queue_dispatcher.py for more details.
+        # Use daemon thread for automatic cleanup.
+        thread = threading.Thread(target=worker.run, daemon=True)
+        thread.start()
+
+    burstable_parallelism = _BURSTABLE_WORKERS_FOR_LOCAL if not deploy else 0
+    # Start a worker for long requests.
+    long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
+                                garanteed_parallelism=max_parallel_for_long,
+                                burstable_parallelism=burstable_parallelism)
+    run_worker_in_background(long_worker)
 
     # Start a worker for short requests.
-    worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
-    worker_proc = multiprocessing.Process(target=request_worker,
-                                          args=(worker, max_parallel_for_short))
-    worker_proc.start()
-    sub_procs.append(worker_proc)
+    short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
+                                 garanteed_parallelism=max_parallel_for_short,
+                                 burstable_parallelism=burstable_parallelism)
+    run_worker_in_background(short_worker)
     return sub_procs
 
 
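Note: a usage sketch of the two modes described in the new `start()` docstring (the surrounding uvicorn/server wiring is not part of this diff):

```python
from sky.server.requests import executor

# Local mode: LOCAL queue backend, workers run as daemon threads, and up to
# _BURSTABLE_WORKERS_FOR_LOCAL disposable processes absorb load spikes.
sub_procs = executor.start(deploy=False)

# Deployment mode: multiprocessing queue backend and a fixed pool sized from
# CPU/memory; burstable parallelism is 0.
# sub_procs = executor.start(deploy=True)
```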
sky/server/requests/process.py ADDED
@@ -0,0 +1,212 @@
+"""ProcessPoolExecutor with additional supports for skypilot."""
+import concurrent.futures
+import logging
+import multiprocessing
+import threading
+import time
+from typing import Callable, Dict, Optional, Tuple
+
+from sky.utils import atomic
+from sky.utils import subprocess_utils
+
+logger = logging.getLogger(__name__)
+
+
+class PoolExecutor(concurrent.futures.ProcessPoolExecutor):
+    """A custom ProcessPoolExecutor with additional supports for skypilot.
+
+    The additional supports include:
+    1. Disposable workers: support control whether the worker process should
+       exit after complete a task.
+    2. Idle check: support check if there are any idle workers.
+    3. Proactive shutdown: SIGTERM worker processes when the executor is
+       shutting down instead of indefinitely waiting.
+    """
+
+    def __init__(self, max_workers: int, **kwargs):
+        super().__init__(max_workers=max_workers, **kwargs)
+        self.max_workers: int = max_workers
+        # The number of workers that are handling tasks, atomicity across
+        # multiple threads is sufficient since the idleness check is
+        # best-effort and does not affect the correctness.
+        # E.g. the following case is totally fine:
+        # 1. Thread 1 checks running == max_workers
+        # 2. Thread 2 decrements running
+        # 3. Thread 1 schedules the task to other pool even if the pool is
+        #    currently idle.
+        self.running: atomic.AtomicInt = atomic.AtomicInt(0)
+
+    def submit(self, fn, *args, **kwargs) -> concurrent.futures.Future:
+        """Submit a task for execution.
+
+        If reuse_worker is False, wraps the function to exit after completion.
+        """
+        self.running.increment()
+        future = super().submit(fn, *args, **kwargs)
+        future.add_done_callback(lambda _: self.running.decrement())
+        return future
+
+    def has_idle_workers(self) -> bool:
+        """Check if there are any idle workers."""
+        return self.running.get() < self.max_workers
+
+    def shutdown(self, wait: bool = True) -> None:
+        """Shutdown the executor."""
+        # Here wait means wait for the proactive cancellation complete.
+        # TODO(aylei): we may support wait=True in the future if needed.
+        assert wait is True, 'wait=False is not supported'
+        executor_processes = list(self._processes.values())
+        # Shutdown the executor so that executor process can exit once the
+        # running task is finished or interrupted.
+        super().shutdown(wait=False)
+        # Proactively interrupt the running task to avoid indefinite waiting.
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            executor_processes,
+            num_threads=len(executor_processes))
+
+
+# Define the worker function outside of the class to avoid pickling self
+def _disposable_worker(fn, initializer: Optional[Callable], initargs: Tuple,
+                       args, kwargs):
+    try:
+        if initializer is not None:
+            initializer(*initargs)
+        fn(*args, **kwargs)
+    except BaseException as e:  # pylint: disable=broad-except
+        return e
+
+
+class DisposableExecutor:
+    """A simple wrapper that creates a new process for each task.
+
+    This is a workaround for Python 3.10 since `max_tasks_per_child` of
+    ProcessPoolExecutor was introduced in 3.11. There is no way to control
+    the worker lifetime in 3.10.
+    Ref: https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor # pylint: disable=line-too-long
+    TODO(aylei): use the official `max_tasks_per_child` when upgrade to 3.11
+    """
+
+    def __init__(self,
+                 max_workers: Optional[int] = None,
+                 initializer: Optional[Callable] = None,
+                 initargs: Tuple = ()):
+        self.max_workers: Optional[int] = max_workers
+        self.workers: Dict[int, multiprocessing.Process] = {}
+        self._shutdown: bool = False
+        self._lock: threading.Lock = threading.Lock()
+        self._initializer: Optional[Callable] = initializer
+        self._initargs: Tuple = initargs
+
+    def _monitor_worker(self, process: multiprocessing.Process) -> None:
+        """Monitor the worker process and cleanup when it's done."""
+        process.join()
+        if process.pid:
+            with self._lock:
+                if process.pid in self.workers:
+                    del self.workers[process.pid]
+
+    # Submit is not compatible with ProcessPoolExecutor because we does not
+    # bother to return a Future. Can be improved if needed.
+    def submit(self, fn, *args, **kwargs) -> bool:
+        """Submit a task for execution."""
+        if self._shutdown:
+            return False
+        with self._lock:
+            if (self.max_workers is not None and
+                    len(self.workers) >= self.max_workers):
+                return False
+
+        process = multiprocessing.Process(target=_disposable_worker,
+                                          args=(fn, self._initializer,
+                                                self._initargs, args, kwargs))
+        process.start()
+
+        with self._lock:
+            pid = process.pid or 0
+            if pid == 0:
+                raise RuntimeError('Failed to start process')
+            self.workers[pid] = process
+
+        # Start monitor thread to cleanup the worker process when it's done.
+        monitor_thread = threading.Thread(target=self._monitor_worker,
+                                          args=(process,),
+                                          daemon=True)
+        monitor_thread.start()
+
+        return True
+
+    def has_idle_workers(self) -> bool:
+        """Check if there are any idle workers."""
+        if self.max_workers is None:
+            return True
+        with self._lock:
+            return len(self.workers) < self.max_workers
+
+    def shutdown(self):
+        """Shutdown the executor."""
+        with self._lock:
+            self._shutdown = True
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            list(self.workers.values()),  # Convert dict values to list
+            num_threads=len(self.workers))
+
+
+class BurstableExecutor:
+    """An multiprocessing executor that supports bursting worker processes."""
+
+    # _executor is a PoolExecutor that is used to run guaranteed requests.
+    _executor: Optional[PoolExecutor] = None
+    # _burst_executor is a ProcessPoolExecutor that is used to run burst
+    # requests.
+    _burst_executor: Optional[DisposableExecutor] = None
+
+    def __init__(self,
+                 garanteed_workers: int,
+                 burst_workers: int = 0,
+                 **kwargs):
+        if garanteed_workers > 0:
+            self._executor = PoolExecutor(max_workers=garanteed_workers,
+                                          **kwargs)
+        if burst_workers > 0:
+            self._burst_executor = DisposableExecutor(max_workers=burst_workers,
+                                                      **kwargs)
+
+    def submit_until_success(self, fn, *args, **kwargs):
+        """Submit a task for execution until success.
+
+        Prioritizes submitting to the guaranteed pool. If no idle workers
+        are available in the guaranteed pool, it will submit to the burst
+        pool.
+        TODO(aylei): this is coupled with executor.RequestWorker since we
+        know the worker is dedicated to request scheduling and it either
+        blocks on request polling or request submitting. So it is no harm
+        to make submit blocking here. But for general cases, we need an
+        internal queue to decouple submit and run.
+        """
+
+        while True:
+            if self._executor is not None and self._executor.has_idle_workers():
+                self._executor.submit(fn, *args, **kwargs)
+                break
+            if (self._burst_executor is not None and
+                    self._burst_executor.has_idle_workers()):
+                self._burst_executor.submit(fn, *args, **kwargs)
+                break
+            if self._executor is not None:
+                # No idle workers in either pool, still queue the request
+                # to the guaranteed pool to keep behavior consistent.
+                self._executor.submit(fn, *args, **kwargs)
+                break
+            logger.debug('No guaranteed pool set and the burst pool is full, '
+                         'retry later.')
+            time.sleep(0.1)
+
+    def shutdown(self) -> None:
+        """Shutdown the executor."""
+
+        if self._burst_executor is not None:
+            self._burst_executor.shutdown()
+        if self._executor is not None:
+            self._executor.shutdown(wait=True)
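
Note: a usage sketch for `BurstableExecutor`, the core of the new file: submissions go to the guaranteed pool first, then the disposable burst pool, and finally queue on the guaranteed pool when both are busy. Names mirror the code above; the task function is hypothetical:

```python
from sky.server.requests import process

def task(name: str) -> None:
    print(f'running {name}')

if __name__ == '__main__':  # Needed for multiprocessing on spawn platforms.
    executor = process.BurstableExecutor(garanteed_workers=2, burst_workers=4)
    try:
        # submit_until_success() only blocks (polling every 0.1s) when there
        # is no guaranteed pool and the burst pool is saturated.
        for i in range(8):
            executor.submit_until_success(task, f'req-{i}')
    finally:
        # shutdown() SIGTERMs burst workers and proactively interrupts the
        # guaranteed pool instead of waiting indefinitely.
        executor.shutdown()
```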
sky/server/requests/queues/local_queue.py ADDED
@@ -0,0 +1,16 @@
+"""Process-local queue implementation."""
+import queue
+import threading
+from typing import Dict
+
+# Global dict to store queues
+_queues: Dict[str, queue.Queue] = {}
+_lock = threading.Lock()
+
+
+def get_queue(queue_name: str) -> queue.Queue:
+    """Get or create a queue by name."""
+    with _lock:
+        if queue_name not in _queues:
+            _queues[queue_name] = queue.Queue()
+        return _queues[queue_name]
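
Note: a usage sketch; as the `QueueBackend.LOCAL` comment above states, producer and consumer must live in the same process:

```python
from sky.server.requests.queues import local_queue

q = local_queue.get_queue('long')
q.put(('request-id-123', False))
assert local_queue.get_queue('long') is q  # Same name, same queue object.
print(q.get())  # ('request-id-123', False)
```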
sky/utils/atomic.py ADDED
@@ -0,0 +1,52 @@
+"""Atomic structures and utilties."""
+
+import threading
+
+
+class AtomicInt:
+    """A thread-safe atomic integer implementation."""
+
+    def __init__(self, initial_value: int = 0):
+        self._value = initial_value
+        self._lock = threading.Lock()
+
+    def get(self) -> int:
+        """Get the current value atomically.
+
+        Returns:
+            The current integer value.
+        """
+        with self._lock:
+            return self._value
+
+    def increment(self, delta: int = 1) -> int:
+        """Atomically increment by delta and return new value.
+
+        Args:
+            delta: Amount to increment by (default: 1)
+
+        Returns:
+            The new value after incrementing.
+        """
+        with self._lock:
+            self._value += delta
+            return self._value
+
+    def decrement(self, delta: int = 1) -> int:
+        """Atomically decrement by delta and return new value.
+
+        Args:
+            delta: Amount to decrement by (default: 1)
+
+        Returns:
+            The new value after decrementing.
+        """
+        with self._lock:
+            self._value -= delta
+            return self._value
+
+    def __str__(self) -> str:
+        return str(self.get())
+
+    def __repr__(self) -> str:
+        return f'AtomicInt({self.get()})'
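
Note: a usage sketch for `AtomicInt`, mirroring how `PoolExecutor` in sky/server/requests/process.py tracks in-flight tasks:

```python
import threading

from sky.utils import atomic

running = atomic.AtomicInt(0)

def work() -> None:
    running.increment()
    try:
        pass  # ... run the task ...
    finally:
        running.decrement()

threads = [threading.Thread(target=work) for _ in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()
assert running.get() == 0  # Every increment was matched by a decrement.
```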
sky/utils/common_utils.py CHANGED
@@ -17,6 +17,8 @@ import typing
 from typing import Any, Callable, Dict, List, Optional, Union
 import uuid
 
+import jsonschema
+
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
@@ -28,12 +30,10 @@ from sky.utils import validator
 
 if typing.TYPE_CHECKING:
     import jinja2
-    import jsonschema
     import psutil
     import yaml
 else:
     jinja2 = adaptors_common.LazyImport('jinja2')
-    jsonschema = adaptors_common.LazyImport('jsonschema')
     psutil = adaptors_common.LazyImport('psutil')
     yaml = adaptors_common.LazyImport('yaml')
 
sky/utils/validator.py CHANGED
@@ -4,14 +4,7 @@ The main motivation behind extending the existing JSON Schema validator is to
 allow for case-insensitive enum matching since this is currently not supported
 by the JSON Schema specification.
 """
-import typing
-
-from sky.adaptors import common as adaptors_common
-
-if typing.TYPE_CHECKING:
-    import jsonschema
-else:
-    jsonschema = adaptors_common.LazyImport('jsonschema')
+import jsonschema
 
 
 def case_insensitive_enum(validator, enums, instance, schema):
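
Note: this hunk and the common_utils.py one above both drop the lazy-import indirection for jsonschema in favor of a plain eager import. For reference, a minimal sketch of the removed pattern, assuming `sky.adaptors.common.LazyImport` defers the real import until first attribute access (consistent with its use elsewhere in the codebase):

```python
import typing

from sky.adaptors import common as adaptors_common

if typing.TYPE_CHECKING:
    import jsonschema  # Real module, visible to type checkers only.
else:
    # Deferred: the actual `import jsonschema` happens on first use.
    jsonschema = adaptors_common.LazyImport('jsonschema')
```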
{skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250408.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20250407
+Version: 1.0.0.dev20250408
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
{skypilot_nightly-1.0.0.dev20250407.dist-info → skypilot_nightly-1.0.0.dev20250408.dist-info}/RECORD RENAMED
@@ -1,4 +1,4 @@
-sky/__init__.py,sha256=-f-rcPq-1NRczFhvTmgyY0eGeL4xNdnjClhgY-sPx5I,6428
+sky/__init__.py,sha256=q1bqMlklbkN76ppGuGrZUg38yFnoTcFONAreuXS5ffY,6428
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=ND011K_-Ud1dVZF37A9KrwYir_ihJXcHc7iDWmuBc8Q,22872
 sky/check.py,sha256=PPNQnaaZBA9_aogJpN4gnG4XWnTqkd74c-rBYDkDRDY,16101
@@ -34,7 +34,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
 sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
 sky/backends/backend.py,sha256=4BOqKZ-bwBTpjNnZF4JAHX2m2Iga7EmEn8Ao3tEivaM,7527
 sky/backends/backend_utils.py,sha256=ndY4IPs1F9QovyiKAnB1FNYGWm52_ylwf_K7wY50cv0,134922
-sky/backends/cloud_vm_ray_backend.py,sha256=ICo21xsKd1Ipy_nBHbP2FUWllOmdS0Pvr4mfypSYhXI,252012
+sky/backends/cloud_vm_ray_backend.py,sha256=mjedyasnvINYz9pIFThBqscIvjqiXs1DKZyVD8twnc0,251926
 sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
 sky/backends/local_docker_backend.py,sha256=nSYCjms3HOPjPNOrcCqsUKm1WV3AAovRFjEQ7hcEXW4,17021
 sky/backends/wheel_utils.py,sha256=meypuMaygSXXjGdXfq6dhWl-OrpAybg9KVRoup4D0wU,9098
@@ -106,7 +106,7 @@ sky/data/data_transfer.py,sha256=-JcnVa_LT0kQejcSCnBwYtxhuuaNDPf_Q5oz62p186c,119
 sky/data/data_utils.py,sha256=ryKUPgNBdeDmGIttqK-J7AKdfc70INTuYH5GOWm3C9g,33581
 sky/data/mounting_utils.py,sha256=ph2p8cYB28FODgxK5ibiD4B4iMD7T3or99zNQaD9HLs,20162
 sky/data/storage.py,sha256=85LcC64yxfd5bzTijGZVyMZV41NyzUhOn0xJZieK2Dc,236652
-sky/data/storage_utils.py,sha256=fDEEErxu97XhOtwPdnNBqRukWcfRT4eTBUhrSGrAvsY,13255
+sky/data/storage_utils.py,sha256=_0NYCWPSjyEGiLNckOl8NzclO5Rd03jRS-hgbQMofBs,13597
 sky/jobs/__init__.py,sha256=qoI53-xXE0-SOkrLWigvhgFXjk7dWE0OTqGPYIk-kmM,1458
 sky/jobs/constants.py,sha256=1XiIqdR5dEgGgepLKWkZCRT3MYSsMBR-dO7N4RTsjwg,3088
 sky/jobs/controller.py,sha256=d5qQYHadesfFgU7-dYtt2trZwyd5IzvlVJeNh5O8OiA,31386
@@ -238,11 +238,13 @@ sky/server/uvicorn.py,sha256=wajwPHJ3IEEP3GMNOCc0S81-1v2qT5F-ejUkLFVhUzk,2953
 sky/server/html/log.html,sha256=TSGZktua9Ysl_ysg3w60rjxAxhH61AJnsYDHdtqrjmI,6929
 sky/server/requests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/server/requests/event_loop.py,sha256=OhpPbuce65bbjpGRlcJa78AVnYSm08SzFKt70ypCUuQ,1211
-sky/server/requests/executor.py,sha256=txzvCUKLafRzEoY4Snk9xVFbIdw5cnu7_wkHTldQdmE,22085
+sky/server/requests/executor.py,sha256=z9DaLJOy__7BUddMhXCODmxqD3iAblo6-siEsmO9DiU,26495
 sky/server/requests/payloads.py,sha256=3sF36Z9_PLzpEncW0AplJtOz-_nsn5PJaM5lS-3Y8bw,16558
 sky/server/requests/preconditions.py,sha256=ipxIb_3JXG6S3-ymcOdqQNb7VDvoPqADxu9ZK7-nQWc,7179
+sky/server/requests/process.py,sha256=uv6JmqdT1vR6S5j3a0CEmxz3fUoKQoZCryQsjZpZE7E,8734
 sky/server/requests/requests.py,sha256=9ovdQE-zv_Mvc6IbGATHVyQlOxSKjg_OankZbgDVGeE,21338
 sky/server/requests/queues/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sky/server/requests/queues/local_queue.py,sha256=X6VkBiUmgd_kfqIK1hCtMWG1b8GiZbY70TBiBR6c6GY,416
 sky/server/requests/queues/mp_queue.py,sha256=jDqP4Jd28U3ibSFyMR1DF9I2OWZrPZqFJrG5S6RFpyw,3403
 sky/server/requests/serializers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/server/requests/serializers/decoders.py,sha256=0cpg80uAqkdK_LqcQPkpKswhcNUUztG9luDLm_0eUow,6811
@@ -311,11 +313,12 @@ sky/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/utils/accelerator_registry.py,sha256=rZniDbqqPAF-vjkrwxGwEErFSAp6puOimkRj3ppOSRY,3905
 sky/utils/admin_policy_utils.py,sha256=y_do0VH6qh163EqSuRW1uGeKvTnJhiYNrHUs77uoOcA,6013
 sky/utils/annotations.py,sha256=-rfacB30Sl0xkFriejGvxma3oKctGfXXLZkQPHG33eo,1626
+sky/utils/atomic.py,sha256=vrw-7XCnckF0xCx-ttamao7evPdGtVsnjaTtgMlBXIE,1280
 sky/utils/cluster_utils.py,sha256=s6DFRXktv6_gF_DnwDEXJ7CniifHp8CAPeGciRCbXgI,14432
 sky/utils/command_runner.py,sha256=aEBs4Km8b6PqDklNc63tVYMK0w3PBGQEEP21_wmhG1k,39191
 sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
 sky/utils/common.py,sha256=P4oVXFATUYgkruHX92cN12SJBtfb8DiOOYZtbN1kvP0,1927
-sky/utils/common_utils.py,sha256=s5YIo9wtFCwWLfLRW7fCjlC9BzqQKPGatWQjrEyYqpc,31680
+sky/utils/common_utils.py,sha256=UM2eSQNdXRvAzlbfC839E7-7DXC9BMMUkquLsmYpu8w,31619
 sky/utils/config_utils.py,sha256=VQ2E3DQ2XysD-kul-diSrxn_pXWsDMfKAev91OiJQ1Q,9041
 sky/utils/control_master_utils.py,sha256=iD4M0onjYOdZ2RuxjwMBl4KhafHXJzuHjvqlBUnu-VE,1450
 sky/utils/controller_utils.py,sha256=mrmkerYyeu7gsCQ56cB3AjCz0r9WaN7teqXUItA47oQ,49805
@@ -334,7 +337,7 @@ sky/utils/status_lib.py,sha256=zn_MSuRYQdNKF8pnFOGQ54X_s_R7dyqWS6Q3a9zENw8,1512
 sky/utils/subprocess_utils.py,sha256=yM2WumV49gSKuZs0v6E3R8XKl5Q9b6veIzi6us5ORU8,15927
 sky/utils/timeline.py,sha256=ob6s3bc7nwAuSI76yLKBrSR5bzOHnOhbozz1avwoet4,4070
 sky/utils/ux_utils.py,sha256=R-ddrqcwKngziZz5haHufxiUnABaMMbmRVsaUljrPBg,10181
-sky/utils/validator.py,sha256=moqe3T_PBKmri_SEtpgoJiKuf_PbdSJxsa8CQlcTbxI,1016
+sky/utils/validator.py,sha256=yo5cPUjGxqfa0ZxGyEYZMCWZ8O35G-k3VOEAtAoA_3w,856
 sky/utils/cli_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/utils/cli_utils/status_utils.py,sha256=LwGXzMgvnQeGR1fCC24q38hRLuAPeeSDkQ387eG6YSs,13495
 sky/utils/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -350,9 +353,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
 sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=HPVgNt-wbCVPd9dpDFiA7t2mzQLpjXHJ61eiwRbEr-c,10378
 sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
-skypilot_nightly-1.0.0.dev20250407.dist-info/licenses/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
-skypilot_nightly-1.0.0.dev20250407.dist-info/METADATA,sha256=hqvdfiv3pR-AR3iUrwYaDHD9U1Qra2EFlv8mwLdtAmk,18552
-skypilot_nightly-1.0.0.dev20250407.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-skypilot_nightly-1.0.0.dev20250407.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
-skypilot_nightly-1.0.0.dev20250407.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
-skypilot_nightly-1.0.0.dev20250407.dist-info/RECORD,,
+skypilot_nightly-1.0.0.dev20250408.dist-info/licenses/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20250408.dist-info/METADATA,sha256=EO_QBlBiR_CcaunlS8EDv2fOBCqiy0SQACbeUa6Pd88,18552
+skypilot_nightly-1.0.0.dev20250408.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+skypilot_nightly-1.0.0.dev20250408.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20250408.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20250408.dist-info/RECORD,,