PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (97) hide show

sky/__init__.py +2 -2
sky/authentication.py +19 -4
sky/backends/backend_utils.py +160 -23
sky/backends/cloud_vm_ray_backend.py +226 -74
sky/catalog/__init__.py +7 -0
sky/catalog/aws_catalog.py +4 -0
sky/catalog/common.py +18 -0
sky/catalog/data_fetchers/fetch_aws.py +13 -1
sky/client/cli/command.py +2 -71
sky/client/sdk.py +20 -0
sky/client/sdk_async.py +23 -18
sky/clouds/aws.py +26 -6
sky/clouds/cloud.py +8 -0
sky/dashboard/out/404.html +1 -1
sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
sky/dashboard/out/clusters/[cluster].html +1 -1
sky/dashboard/out/clusters.html +1 -1
sky/dashboard/out/config.html +1 -1
sky/dashboard/out/index.html +1 -1
sky/dashboard/out/infra/[context].html +1 -1
sky/dashboard/out/infra.html +1 -1
sky/dashboard/out/jobs/[job].html +1 -1
sky/dashboard/out/jobs/pools/[pool].html +1 -1
sky/dashboard/out/jobs.html +1 -1
sky/dashboard/out/users.html +1 -1
sky/dashboard/out/volumes.html +1 -1
sky/dashboard/out/workspace/new.html +1 -1
sky/dashboard/out/workspaces/[name].html +1 -1
sky/dashboard/out/workspaces.html +1 -1
sky/data/storage.py +5 -1
sky/execution.py +21 -14
sky/global_user_state.py +34 -0
sky/jobs/client/sdk_async.py +4 -2
sky/jobs/constants.py +3 -0
sky/jobs/controller.py +734 -310
sky/jobs/recovery_strategy.py +251 -129
sky/jobs/scheduler.py +247 -174
sky/jobs/server/core.py +20 -4
sky/jobs/server/utils.py +2 -2
sky/jobs/state.py +709 -508
sky/jobs/utils.py +90 -40
sky/logs/agent.py +10 -2
sky/provision/aws/config.py +4 -1
sky/provision/gcp/config.py +6 -1
sky/provision/kubernetes/config.py +7 -2
sky/provision/kubernetes/instance.py +84 -41
sky/provision/kubernetes/utils.py +17 -8
sky/provision/provisioner.py +1 -0
sky/provision/vast/instance.py +1 -1
sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
sky/serve/replica_managers.py +0 -7
sky/serve/serve_utils.py +5 -0
sky/serve/server/impl.py +1 -2
sky/serve/service.py +0 -2
sky/server/common.py +8 -3
sky/server/config.py +55 -27
sky/server/constants.py +1 -0
sky/server/daemons.py +7 -11
sky/server/metrics.py +41 -8
sky/server/requests/executor.py +41 -4
sky/server/requests/serializers/encoders.py +1 -1
sky/server/server.py +9 -1
sky/server/uvicorn.py +11 -5
sky/setup_files/dependencies.py +4 -2
sky/skylet/attempt_skylet.py +1 -0
sky/skylet/constants.py +14 -7
sky/skylet/events.py +2 -10
sky/skylet/log_lib.py +11 -0
sky/skylet/log_lib.pyi +9 -0
sky/task.py +62 -0
sky/templates/kubernetes-ray.yml.j2 +120 -3
sky/utils/accelerator_registry.py +3 -1
sky/utils/command_runner.py +35 -11
sky/utils/command_runner.pyi +25 -3
sky/utils/common_utils.py +11 -1
sky/utils/context_utils.py +15 -2
sky/utils/controller_utils.py +5 -0
sky/utils/db/db_utils.py +31 -2
sky/utils/db/migration_utils.py +1 -1
sky/utils/git.py +559 -1
sky/utils/resource_checker.py +8 -7
sky/utils/rich_utils.py +3 -1
sky/utils/subprocess_utils.py +9 -0
sky/volumes/volume.py +2 -0
sky/workspaces/core.py +57 -21
{skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +38 -36
{skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +95 -95
sky/client/cli/git.py +0 -549
sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
/sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
/sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
{skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
{skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0

sky/serve/replica_managers.py CHANGED Viewed

@@ -22,7 +22,6 @@ from sky import global_user_state
 from sky import sky_logging
 from sky import task as task_lib
 from sky.backends import backend_utils
-from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
@@ -1052,7 +1051,6 @@ class SkyPilotReplicaManager(ReplicaManager):
                     self._service_name, replica_id)
                 assert info is not None, replica_id
                 error_in_sky_launch = False
-                schedule_next_jobs = False
                 if info.status == serve_state.ReplicaStatus.PENDING:
                     # sky.launch not started yet
                     if controller_utils.can_provision():
@@ -1080,7 +1078,6 @@ class SkyPilotReplicaManager(ReplicaManager):
                     else:
                         info.status_property.sky_launch_status = (
                             common_utils.ProcessStatus.SUCCEEDED)
-                        schedule_next_jobs = True
                     if self._spot_placer is not None and info.is_spot:
                         # TODO(tian): Currently, we set the location to
                         # preemptive if the launch process failed. This is
@@ -1100,16 +1097,12 @@ class SkyPilotReplicaManager(ReplicaManager):
                             self._spot_placer.set_active(location)
                 serve_state.add_or_update_replica(self._service_name,
                                                   replica_id, info)
-                if schedule_next_jobs and self._is_pool:
-                    jobs_scheduler.maybe_schedule_next_jobs()
                 if error_in_sky_launch:
                     # Teardown after update replica info since
                     # _terminate_replica will update the replica info too.
                     self._terminate_replica(replica_id,
                                             sync_down_logs=True,
                                             replica_drain_delay_seconds=0)
-            # Try schedule next job after acquiring the lock.
-            jobs_scheduler.maybe_schedule_next_jobs()
         down_process_pool_snapshot = list(self._down_process_pool.items())
         for replica_id, p in down_process_pool_snapshot:
             if p.is_alive():

sky/serve/serve_utils.py CHANGED Viewed

@@ -294,6 +294,11 @@ def is_consolidation_mode(pool: bool = False) -> bool:
     # We should only do this check on API server, as the controller will not
     # have related config and will always seemingly disabled for consolidation
     # mode. Check #6611 for more details.
+    if (os.environ.get(skylet_constants.OVERRIDE_CONSOLIDATION_MODE) is not None
+            and controller.controller_type == 'jobs'):
+        # if we are in the job controller, we must always be in consolidation
+        # mode.
+        return True
     if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
         _validate_consolidation_mode_config(consolidation_mode, pool)
     return consolidation_mode

sky/serve/server/impl.py CHANGED Viewed

@@ -280,8 +280,7 @@ def up(
             ]
             run_script = '\n'.join(env_cmds + [run_script])
             # Dump script for high availability recovery.
-            if controller_utils.high_availability_specified(controller_name):
-                serve_state.set_ha_recovery_script(service_name, run_script)
+            serve_state.set_ha_recovery_script(service_name, run_script)
             backend.run_on_head(controller_handle, run_script)
         style = colorama.Style

sky/serve/service.py CHANGED Viewed

@@ -21,7 +21,6 @@ from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.data import data_utils
-from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants
 from sky.serve import controller
 from sky.serve import load_balancer
@@ -278,7 +277,6 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
                 pool=service_spec.pool,
                 controller_pid=os.getpid(),
                 entrypoint=entrypoint)
-        jobs_scheduler.maybe_schedule_next_jobs()
         # Directly throw an error here. See sky/serve/api.py::up
         # for more details.
         if not success:

sky/server/common.py CHANGED Viewed

@@ -538,12 +538,17 @@ def _start_api_server(deploy: bool = False,
         # Check available memory before starting the server.
         avail_mem_size_gb: float = common_utils.get_mem_size_gb()
-        if avail_mem_size_gb <= server_constants.MIN_AVAIL_MEM_GB:
+        # pylint: disable=import-outside-toplevel
+        import sky.jobs.utils as job_utils
+        max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                      if job_utils.is_consolidation_mode() else
+                      server_constants.MIN_AVAIL_MEM_GB)
+        if avail_mem_size_gb <= max_memory:
             logger.warning(
                 f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
                 f'has {avail_mem_size_gb:.1f}GB memory available. '
-                f'At least {server_constants.MIN_AVAIL_MEM_GB}GB is '
-                'recommended to support higher load with better performance.'
+                f'At least {max_memory}GB is recommended to support higher '
+                'load with better performance.'
                 f'{colorama.Style.RESET_ALL}')
         args = [sys.executable, *API_SERVER_CMD.split()]

sky/server/config.py CHANGED Viewed

@@ -6,6 +6,7 @@ from typing import Optional
 from sky import sky_logging
 from sky.server import constants as server_constants
+from sky.server import daemons
 from sky.utils import common_utils
 # Constants based on profiling the peak memory usage while serving various
@@ -19,8 +20,9 @@ from sky.utils import common_utils
 # TODO(aylei): maintaining these constants is error-prone, we may need to
 # automatically tune parallelism at runtime according to system usage stats
 # in the future.
-_LONG_WORKER_MEM_GB = 0.4
-_SHORT_WORKER_MEM_GB = 0.25
+# TODO(luca): The future is now! ^^^
+LONG_WORKER_MEM_GB = 0.4
+SHORT_WORKER_MEM_GB = 0.3
 # To control the number of long workers.
 _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
 # Limit the number of long workers of local API server, since local server is
@@ -35,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
 _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
 # Minimal number of long workers to ensure responsiveness.
 _MIN_LONG_WORKERS = 1
-# Minimal number of short workers, there is a daemon task running on short
-# workers so at least 2 workers are needed to ensure responsiveness.
-_MIN_SHORT_WORKERS = 2
+# Minimal number of idle short workers to ensure responsiveness.
+_MIN_IDLE_SHORT_WORKERS = 1
 # Default number of burstable workers for local API server. A heuristic number
 # that is large enough for most local cases.
@@ -75,8 +76,8 @@ class ServerConfig:
 def compute_server_config(deploy: bool,
-                          max_db_connections: Optional[int] = None
-                         ) -> ServerConfig:
+                          max_db_connections: Optional[int] = None,
+                          quiet: bool = False) -> ServerConfig:
     """Compute the server config based on environment.
     We have different assumptions for the resources in different deployment
@@ -140,7 +141,12 @@ def compute_server_config(deploy: bool,
         burstable_parallel_for_short = _BURSTABLE_WORKERS_FOR_LOCAL
         # Runs in low resource mode if the available memory is less than
         # server_constants.MIN_AVAIL_MEM_GB.
-        if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
+        # pylint: disable=import-outside-toplevel
+        import sky.jobs.utils as job_utils
+        max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                      if job_utils.is_consolidation_mode() else
+                      server_constants.MIN_AVAIL_MEM_GB)
+        if not deploy and mem_size_gb < max_memory:
             # Permanent worker process may have significant memory consumption
             # (~350MB per worker) after running commands like `sky check`, so we
             # don't start any permanent workers in low resource local mode. This
@@ -151,25 +157,29 @@ def compute_server_config(deploy: bool,
             # permanently because it never exits.
             max_parallel_for_long = 0
             max_parallel_for_short = 0
-            logger.warning(
-                'SkyPilot API server will run in low resource mode because '
-                'the available memory is less than '
-                f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+            if not quiet:
+                logger.warning(
+                    'SkyPilot API server will run in low resource mode because '
+                    'the available memory is less than '
+                    f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
     elif max_db_connections is not None:
         if max_parallel_all_workers > max_db_connections:
-            logger.warning(
-                f'Max parallel all workers ({max_parallel_all_workers}) '
-                f'is greater than max db connections ({max_db_connections}). '
-                'Increase the number of max db connections to '
-                f'at least {max_parallel_all_workers} for optimal performance.')
+            if not quiet:
+                logger.warning(
+                    f'Max parallel all workers ({max_parallel_all_workers}) '
+                    'is greater than max db connections '
+                    f'({max_db_connections}). Increase the number of max db '
+                    f'connections to at least {max_parallel_all_workers} for '
+                    'optimal performance.')
         else:
             num_db_connections_per_worker = 1
-    logger.info(
-        f'SkyPilot API server will start {num_server_workers} server processes '
-        f'with {max_parallel_for_long} background workers for long requests '
-        f'and will allow at max {max_parallel_for_short} short requests in '
-        f'parallel.')
+    if not quiet:
+        logger.info(
+            f'SkyPilot API server will start {num_server_workers} server '
+            f'processes with {max_parallel_for_long} background workers for '
+            f'long requests and will allow at max {max_parallel_for_short} '
+            'short requests in parallel.')
     return ServerConfig(
         num_server_workers=num_server_workers,
         queue_backend=queue_backend,
@@ -190,10 +200,15 @@ def _max_long_worker_parallism(cpu_count: int,
                                local=False) -> int:
     """Max parallelism for long workers."""
     # Reserve min available memory to avoid OOM.
-    available_mem = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
+    # pylint: disable=import-outside-toplevel
+    import sky.jobs.utils as job_utils
+    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                  if job_utils.is_consolidation_mode() else
+                  server_constants.MIN_AVAIL_MEM_GB)
+    available_mem = max(0, mem_size_gb - max_memory)
     cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
     mem_based_max_parallel = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
-                                 _LONG_WORKER_MEM_GB)
+                                 LONG_WORKER_MEM_GB)
     n = max(_MIN_LONG_WORKERS,
             min(cpu_based_max_parallel, mem_based_max_parallel))
     if local:
@@ -201,12 +216,25 @@ def _max_long_worker_parallism(cpu_count: int,
     return n
+def _get_min_short_workers() -> int:
+    """Min number of short workers."""
+    daemon_count = 0
+    for daemon in daemons.INTERNAL_REQUEST_DAEMONS:
+        if not daemon.should_skip():
+            daemon_count += 1
+    return _MIN_IDLE_SHORT_WORKERS + daemon_count
 def _max_short_worker_parallism(mem_size_gb: float,
                                 long_worker_parallism: int) -> int:
     """Max parallelism for short workers."""
     # Reserve memory for long workers and min available memory.
-    reserved_mem = server_constants.MIN_AVAIL_MEM_GB + (long_worker_parallism *
-                                                        _LONG_WORKER_MEM_GB)
+    # pylint: disable=import-outside-toplevel
+    import sky.jobs.utils as job_utils
+    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                  if job_utils.is_consolidation_mode() else
+                  server_constants.MIN_AVAIL_MEM_GB)
+    reserved_mem = max_memory + (long_worker_parallism * LONG_WORKER_MEM_GB)
     available_mem = max(0, mem_size_gb - reserved_mem)
-    n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
+    n = max(_get_min_short_workers(), int(available_mem / SHORT_WORKER_MEM_GB))
     return n

sky/server/constants.py CHANGED Viewed

@@ -34,6 +34,7 @@ VERSION_HEADER = 'X-SkyPilot-Version'
 REQUEST_NAME_PREFIX = 'sky.'
 # The memory (GB) that SkyPilot tries to not use to prevent OOM.
 MIN_AVAIL_MEM_GB = 2
+MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE = 4
 # Default encoder/decoder handler name.
 DEFAULT_HANDLER_NAME = 'default'
 # The path to the API request database.

sky/server/daemons.py CHANGED Viewed

@@ -11,6 +11,7 @@ from sky.utils import annotations
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import env_options
+from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
@@ -74,6 +75,10 @@ class InternalRequestDaemon:
                 # using too much memory.
                 annotations.clear_request_level_cache()
                 timeline.save_timeline()
+                # Kill all children processes related to this request.
+                # Each executor handles a single request, so we can safely
+                # kill all children processes related to this request.
+                subprocess_utils.kill_children_processes()
                 common_utils.release_memory()
             except Exception:  # pylint: disable=broad-except
                 # It is OK to fail to run the event, as the event is not
@@ -123,21 +128,16 @@ def managed_job_status_refresh_event():
     """Refresh the managed job status for controller consolidation mode."""
     # pylint: disable=import-outside-toplevel
     from sky.jobs import utils as managed_job_utils
-    from sky.utils import controller_utils
     # We run the recovery logic before starting the event loop as those two are
     # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
-    if controller_utils.high_availability_specified(
-            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
-        managed_job_utils.ha_recovery_for_consolidation_mode()
+    managed_job_utils.ha_recovery_for_consolidation_mode()
     # After recovery, we start the event loop.
     from sky.skylet import events
     refresh_event = events.ManagedJobEvent()
-    scheduling_event = events.ManagedJobSchedulingEvent()
     logger.info('=== Running managed job event ===')
     refresh_event.run()
-    scheduling_event.run()
     time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
@@ -152,14 +152,10 @@ def _serve_status_refresh_event(pool: bool):
     """Refresh the sky serve status for controller consolidation mode."""
     # pylint: disable=import-outside-toplevel
     from sky.serve import serve_utils
-    from sky.utils import controller_utils
     # We run the recovery logic before starting the event loop as those two are
     # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
-    controller = controller_utils.get_controller_for_pool(pool)
-    if controller_utils.high_availability_specified(
-            controller.value.cluster_name):
-        serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
+    serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
     # After recovery, we start the event loop.
     from sky.skylet import events

sky/server/metrics.py CHANGED Viewed

@@ -4,6 +4,7 @@ import contextlib
 import functools
 import multiprocessing
 import os
+import threading
 import time
 import fastapi
@@ -21,6 +22,24 @@ from sky.skylet import constants
 METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
                                  'false').lower() == 'true'
+_KB = 2**10
+_MB = 2**20
+_MEM_BUCKETS = [
+    _KB,
+    256 * _KB,
+    512 * _KB,
+    _MB,
+    2 * _MB,
+    4 * _MB,
+    8 * _MB,
+    16 * _MB,
+    32 * _MB,
+    64 * _MB,
+    128 * _MB,
+    256 * _MB,
+    float('inf'),
+]
 logger = sky_logging.init_logger(__name__)
 # Total number of API server requests, grouped by path, method, and status.
@@ -92,6 +111,16 @@ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
     ['pid', 'type', 'mode'],
 )
+SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
+    'sky_apiserver_request_memory_usage_bytes',
+    'Peak memory usage of requests', ['name'],
+    buckets=_MEM_BUCKETS)
+SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
+    'sky_apiserver_request_rss_incr_bytes',
+    'RSS increment after requests', ['name'],
+    buckets=_MEM_BUCKETS)
 metrics_app = fastapi.FastAPI()
@@ -208,19 +237,23 @@ def time_me_async(func):
     return async_wrapper
-def process_monitor(process_type: str):
+peak_rss_bytes = 0
+def process_monitor(process_type: str, stop: threading.Event):
     pid = multiprocessing.current_process().pid
     proc = psutil.Process(pid)
-    peak_rss = 0
     last_bucket_end = time.time()
-    while True:
+    bucket_peak = 0
+    global peak_rss_bytes
+    while not stop.is_set():
         if time.time() - last_bucket_end >= 30:
-            # Reset peak RSS every 30 seconds.
+            # Reset peak RSS for the next time bucket.
             last_bucket_end = time.time()
-            peak_rss = 0
-        peak_rss = max(peak_rss, proc.memory_info().rss)
-        SKY_APISERVER_PROCESS_PEAK_RSS.labels(pid=pid,
-                                              type=process_type).set(peak_rss)
+            bucket_peak = 0
+        peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
+        SKY_APISERVER_PROCESS_PEAK_RSS.labels(
+            pid=pid, type=process_type).set(peak_rss_bytes)
         ctimes = proc.cpu_times()
         SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
                                                type=process_type,

sky/server/requests/executor.py CHANGED Viewed

@@ -31,6 +31,7 @@ import time
 import typing
 from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
+import psutil
 import setproctitle
 from sky import exceptions
@@ -130,8 +131,9 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
 def executor_initializer(proc_group: str):
     setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
                               f'{multiprocessing.current_process().pid}')
+    # Executor never stops, unless the whole process is killed.
     threading.Thread(target=metrics_lib.process_monitor,
-                     args=(f'worker:{proc_group}',),
+                     args=(f'worker:{proc_group}', threading.Event()),
                      daemon=True).start()
@@ -373,11 +375,13 @@ def _request_execution_wrapper(request_id: str,
     4. Handle the SIGTERM signal to abort the request gracefully.
     5. Maintain the lifecycle of the temp dir used by the request.
     """
+    pid = multiprocessing.current_process().pid
+    proc = psutil.Process(pid)
+    rss_begin = proc.memory_info().rss
     db_utils.set_max_connections(num_db_connections_per_worker)
     # Handle the SIGTERM signal to abort the request processing gracefully.
     signal.signal(signal.SIGTERM, _sigterm_handler)
-    pid = multiprocessing.current_process().pid
     logger.info(f'Running request {request_id} with pid {pid}')
     with api_requests.update_request(request_id) as request_task:
         assert request_task is not None, request_id
@@ -443,8 +447,41 @@ def _request_execution_wrapper(request_id: str,
             _restore_output(original_stdout, original_stderr)
             logger.info(f'Request {request_id} finished')
         finally:
-            with metrics_lib.time_it(name='release_memory', group='internal'):
-                common_utils.release_memory()
+            try:
+                # Capture the peak RSS before GC.
+                peak_rss = max(proc.memory_info().rss,
+                               metrics_lib.peak_rss_bytes)
+                with metrics_lib.time_it(name='release_memory',
+                                         group='internal'):
+                    common_utils.release_memory()
+                _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Failed to record memory metrics: '
+                             f'{common_utils.format_exception(e)}')
+_first_request = True
+def _record_memory_metrics(request_name: str, proc: psutil.Process,
+                           rss_begin: int, peak_rss: int) -> None:
+    """Record the memory metrics for a request."""
+    # Do not record full memory delta for the first request as it
+    # will loads the sky core modules and make the memory usage
+    # estimation inaccurate.
+    global _first_request
+    if _first_request:
+        _first_request = False
+        return
+    rss_end = proc.memory_info().rss
+    # Answer "how much RSS this request contributed?"
+    metrics_lib.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
+        name=request_name).observe(max(rss_end - rss_begin, 0))
+    # Estimate the memory usage by the request by capturing the
+    # peak memory delta during the request execution.
+    metrics_lib.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
+        name=request_name).observe(max(peak_rss - rss_begin, 0))
 async def execute_request_coroutine(request: api_requests.Request):

sky/server/requests/serializers/encoders.py CHANGED Viewed

@@ -131,7 +131,7 @@ def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
 def encode_jobs_queue_v2(
         jobs_or_tuple) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
     # Support returning either a plain jobs list or a (jobs, total) tuple
-    status_counts = {}
+    status_counts: Dict[str, int] = {}
     if isinstance(jobs_or_tuple, tuple):
         if len(jobs_or_tuple) == 2:
             jobs, total = jobs_or_tuple

sky/server/server.py CHANGED Viewed

@@ -625,6 +625,9 @@ app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
 app.include_router(ssh_node_pools_rest.router,
                    prefix='/ssh_node_pools',
                    tags=['ssh_node_pools'])
+# increase the resource limit for the server
+soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
 # Increase the limit of files we can open to our hard limit. This fixes bugs
 # where we can not aquire file locks or open enough logs and the API server
@@ -1211,6 +1214,7 @@ async def logs(
         request_body=cluster_job_body,
         func=core.tail_logs,
         schedule_type=requests_lib.ScheduleType.SHORT,
+        request_cluster_name=cluster_job_body.cluster_name,
     )
     task = asyncio.create_task(executor.execute_request_coroutine(request_task))
@@ -1826,7 +1830,7 @@ async def all_contexts(request: fastapi.Request) -> None:
 async def gpu_metrics() -> fastapi.Response:
     """Gets the GPU metrics from multiple external k8s clusters"""
     contexts = core.get_all_contexts()
-    all_metrics = []
+    all_metrics: List[str] = []
     successful_contexts = 0
     tasks = [
@@ -1841,6 +1845,10 @@ async def gpu_metrics() -> fastapi.Response:
         if isinstance(result, Exception):
             logger.error(
                 f'Failed to get metrics for context {contexts[i]}: {result}')
+        elif isinstance(result, BaseException):
+            # Avoid changing behavior for non-Exception BaseExceptions
+            # like KeyboardInterrupt/SystemExit: re-raise them.
+            raise result
         else:
             metrics_text = result
             all_metrics.append(metrics_text)

sky/server/uvicorn.py CHANGED Viewed

@@ -213,11 +213,17 @@ class Server(uvicorn.Server):
             # Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
             event_loop.set_debug(True)
             event_loop.slow_callback_duration = lag_threshold
-        threading.Thread(target=metrics_lib.process_monitor,
-                         args=('server',),
-                         daemon=True).start()
-        with self.capture_signals():
-            asyncio.run(self.serve(*args, **kwargs))
+        stop_monitor = threading.Event()
+        monitor = threading.Thread(target=metrics_lib.process_monitor,
+                                   args=('server', stop_monitor),
+                                   daemon=True)
+        monitor.start()
+        try:
+            with self.capture_signals():
+                asyncio.run(self.serve(*args, **kwargs))
+        finally:
+            stop_monitor.set()
+            monitor.join()
 def run(config: uvicorn.Config, max_db_connections: Optional[int] = None):

sky/setup_files/dependencies.py CHANGED Viewed

@@ -63,6 +63,8 @@ install_requires = [
     'setproctitle',
     'sqlalchemy',
     'psycopg2-binary',
+    'aiosqlite',
+    'asyncpg',
     # TODO(hailong): These three dependencies should be removed after we make
     # the client-side actually not importing them.
     'casbin',
@@ -108,9 +110,9 @@ server_dependencies = [
 local_ray = [
     # Lower version of ray will cause dependency conflict for
     # click/grpcio/protobuf.
-    # Excluded 2.6.0 as it has a bug in the cluster launcher:
+    # Ray 2.6.1+ resolved cluster launcher bugs and grpcio issues on Apple Silicon.
     # https://github.com/ray-project/ray/releases/tag/ray-2.6.1
-    'ray[default] >= 2.2.0, != 2.6.0',
+    'ray[default] >= 2.6.1',
 ]
 remote = [

sky/skylet/attempt_skylet.py CHANGED Viewed

@@ -12,6 +12,7 @@ def restart_skylet():
     # Kills old skylet if it is running.
     # TODO(zhwu): make the killing graceful, e.g., use a signal to tell
     # skylet to exit, instead of directly killing it.
     subprocess.run(
         # We use -m to grep instead of {constants.SKY_PYTHON_CMD} -m to grep
         # because need to handle the backward compatibility of the old skylet

sky/skylet/constants.py CHANGED Viewed

@@ -62,11 +62,14 @@ SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
                       'curl -LsSf https://astral.sh/uv/install.sh '
                       f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
 SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
-# Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH to deactivate the
-# environment. `deactivate` command does not work when conda is used.
+SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run')
+# Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
+# VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
+# not work when conda is used.
 DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
     'export PATH='
-    f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||")')
+    f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||") && '
+    'unset VIRTUAL_ENV && unset VIRTUAL_ENV_PROMPT')
 # Prefix for SkyPilot environment variables
 SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
@@ -91,14 +94,14 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '17'
+SKYLET_VERSION = '18'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
 SKYLET_LIB_VERSION = 4
 SKYLET_VERSION_FILE = '~/.sky/skylet_version'
 SKYLET_GRPC_PORT = 46590
-SKYLET_GRPC_TIMEOUT_SECONDS = 5
+SKYLET_GRPC_TIMEOUT_SECONDS = 10
 # Docker default options
 DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
@@ -229,7 +232,7 @@ RAY_INSTALLATION_COMMANDS = (
     'export PATH=$PATH:$HOME/.local/bin; '
     # Writes ray path to file if it does not exist or the file is empty.
     f'[ -s {SKY_RAY_PATH_FILE} ] || '
-    f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && '
+    f'{{ {SKY_UV_RUN_CMD} '
     f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
 SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
@@ -421,6 +424,7 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
 # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
 # Environment variable that is set to 'true' if this is a skypilot server.
 ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
+OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
 # Environment variable that is set to 'true' if metrics are enabled.
 ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
@@ -447,7 +451,7 @@ SKYPILOT_DEFAULT_WORKSPACE = 'default'
 # BEGIN constants used for service catalog.
 HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs'  # pylint: disable=line-too-long
 HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs'  # pylint: disable=line-too-long
-CATALOG_SCHEMA_VERSION = 'v7'
+CATALOG_SCHEMA_VERSION = 'v8'
 CATALOG_DIR = '~/.sky/catalogs'
 ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
               'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
@@ -508,3 +512,6 @@ SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
 ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
                                  'DEBUG_LOOP_LAG_THRESHOLD_MS')
+ARM64_ARCH = 'arm64'
+X86_64_ARCH = 'x86_64'

skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl

Potentially problematic release.

skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl