skypilot-nightly 1.0.0.dev20250908__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (70)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -4
  3. sky/backends/backend_utils.py +35 -1
  4. sky/backends/cloud_vm_ray_backend.py +2 -2
  5. sky/client/sdk.py +20 -0
  6. sky/client/sdk_async.py +18 -16
  7. sky/clouds/aws.py +3 -1
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/{webpack-47c64cc05717f8a3.js → webpack-1d7e11230da3ca89.js} +1 -1
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/config.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra/[context].html +1 -1
  16. sky/dashboard/out/infra.html +1 -1
  17. sky/dashboard/out/jobs/[job].html +1 -1
  18. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/storage.py +5 -1
  26. sky/execution.py +21 -14
  27. sky/jobs/constants.py +3 -0
  28. sky/jobs/controller.py +732 -310
  29. sky/jobs/recovery_strategy.py +251 -129
  30. sky/jobs/scheduler.py +247 -174
  31. sky/jobs/server/core.py +20 -4
  32. sky/jobs/server/utils.py +2 -2
  33. sky/jobs/state.py +702 -511
  34. sky/jobs/utils.py +94 -39
  35. sky/provision/aws/config.py +4 -1
  36. sky/provision/gcp/config.py +6 -1
  37. sky/provision/kubernetes/utils.py +17 -8
  38. sky/provision/provisioner.py +1 -0
  39. sky/serve/replica_managers.py +0 -7
  40. sky/serve/serve_utils.py +5 -0
  41. sky/serve/server/impl.py +1 -2
  42. sky/serve/service.py +0 -2
  43. sky/server/common.py +8 -3
  44. sky/server/config.py +43 -24
  45. sky/server/constants.py +1 -0
  46. sky/server/daemons.py +7 -11
  47. sky/server/metrics.py +60 -9
  48. sky/server/requests/executor.py +33 -32
  49. sky/server/requests/serializers/encoders.py +1 -1
  50. sky/server/server.py +57 -4
  51. sky/server/uvicorn.py +4 -0
  52. sky/setup_files/dependencies.py +4 -2
  53. sky/skylet/attempt_skylet.py +1 -0
  54. sky/skylet/constants.py +3 -1
  55. sky/skylet/events.py +2 -10
  56. sky/utils/command_runner.pyi +3 -3
  57. sky/utils/common_utils.py +19 -7
  58. sky/utils/controller_utils.py +5 -0
  59. sky/utils/db/db_utils.py +31 -2
  60. sky/utils/rich_utils.py +3 -1
  61. sky/utils/subprocess_utils.py +9 -0
  62. sky/volumes/volume.py +2 -0
  63. {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +37 -35
  64. {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +70 -70
  65. /sky/dashboard/out/_next/static/{qikl6zGuwe8BKavteei3r → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
  66. /sky/dashboard/out/_next/static/{qikl6zGuwe8BKavteei3r → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
  67. {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
  68. {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
  69. {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
  70. {skypilot_nightly-1.0.0.dev20250908.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
sky/server/daemons.py CHANGED
@@ -11,6 +11,7 @@ from sky.utils import annotations
  from sky.utils import common
  from sky.utils import common_utils
  from sky.utils import env_options
+ from sky.utils import subprocess_utils
  from sky.utils import timeline
  from sky.utils import ux_utils

@@ -74,6 +75,10 @@ class InternalRequestDaemon:
  # using too much memory.
  annotations.clear_request_level_cache()
  timeline.save_timeline()
+ # Kill all children processes related to this request.
+ # Each executor handles a single request, so we can safely
+ # kill all children processes related to this request.
+ subprocess_utils.kill_children_processes()
  common_utils.release_memory()
  except Exception: # pylint: disable=broad-except
  # It is OK to fail to run the event, as the event is not
@@ -123,21 +128,16 @@ def managed_job_status_refresh_event():
  """Refresh the managed job status for controller consolidation mode."""
  # pylint: disable=import-outside-toplevel
  from sky.jobs import utils as managed_job_utils
- from sky.utils import controller_utils

  # We run the recovery logic before starting the event loop as those two are
  # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
- if controller_utils.high_availability_specified(
- controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
- managed_job_utils.ha_recovery_for_consolidation_mode()
+ managed_job_utils.ha_recovery_for_consolidation_mode()

  # After recovery, we start the event loop.
  from sky.skylet import events
  refresh_event = events.ManagedJobEvent()
- scheduling_event = events.ManagedJobSchedulingEvent()
  logger.info('=== Running managed job event ===')
  refresh_event.run()
- scheduling_event.run()
  time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)

@@ -152,14 +152,10 @@ def _serve_status_refresh_event(pool: bool):
  """Refresh the sky serve status for controller consolidation mode."""
  # pylint: disable=import-outside-toplevel
  from sky.serve import serve_utils
- from sky.utils import controller_utils

  # We run the recovery logic before starting the event loop as those two are
  # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
- controller = controller_utils.get_controller_for_pool(pool)
- if controller_utils.high_availability_specified(
- controller.value.cluster_name):
- serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
+ serve_utils.ha_recovery_for_consolidation_mode(pool=pool)

  # After recovery, we start the event loop.
  from sky.skylet import events
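
The new cleanup step in InternalRequestDaemon reaps every process a request spawned once the daemon event finishes. The signature of subprocess_utils.kill_children_processes is not shown in this diff, so the following is only a minimal psutil-based sketch of the same idea, with illustrative names:

    import psutil

    def kill_children_sketch(grace_period: float = 5.0) -> None:
        # Terminate every descendant of the current process, then force-kill
        # any that are still alive after the grace period.
        children = psutil.Process().children(recursive=True)
        for child in children:
            try:
                child.terminate()
            except psutil.NoSuchProcess:
                pass
        _, alive = psutil.wait_procs(children, timeout=grace_period)
        for child in alive:
            try:
                child.kill()
            except psutil.NoSuchProcess:
                pass
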
sky/server/metrics.py CHANGED
@@ -2,6 +2,7 @@

  import contextlib
  import functools
+ import multiprocessing
  import os
  import time

@@ -9,6 +10,7 @@ import fastapi
  from prometheus_client import generate_latest
  from prometheus_client import multiprocess
  import prometheus_client as prom
+ import psutil
  import starlette.middleware.base
  import uvicorn

@@ -38,15 +40,6 @@ SKY_APISERVER_REQUEST_DURATION_SECONDS = prom.Histogram(
  60.0, 120.0, float('inf')),
  )

- # Time spent processing requests in executor.
- SKY_APISERVER_REQUEST_EXECUTION_DURATION_SECONDS = prom.Histogram(
- 'sky_apiserver_request_execution_duration_seconds',
- 'Time spent executing requests in executor',
- ['request', 'worker'],
- buckets=(0.5, 1, 2.5, 5.0, 10.0, 15.0, 25.0, 40.0, 60.0, 90.0, 120.0, 180.0,
- float('inf')),
- )
-
  # Time spent processing a piece of code, refer to time_it().
  SKY_APISERVER_CODE_DURATION_SECONDS = prom.Histogram(
  'sky_apiserver_code_duration_seconds',
@@ -64,6 +57,41 @@ SKY_APISERVER_EVENT_LOOP_LAG_SECONDS = prom.Histogram(
  60.0, float('inf')),
  )

+ SKY_APISERVER_WEBSOCKET_CONNECTIONS = prom.Gauge(
+ 'sky_apiserver_websocket_connections',
+ 'Number of websocket connections',
+ ['pid'],
+ multiprocess_mode='livesum',
+ )
+
+ SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL = prom.Counter(
+ 'sky_apiserver_websocket_closed_total',
+ 'Number of websocket closed',
+ ['pid', 'reason'],
+ )
+
+ # The number of execution starts in each worker process, we do not record
+ # histogram here as the duration has been measured in
+ # SKY_APISERVER_CODE_DURATION_SECONDS without the worker label (process id).
+ # Recording histogram WITH worker label will cause high cardinality.
+ SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL = prom.Counter(
+ 'sky_apiserver_process_execution_start_total',
+ 'Total number of execution starts in each worker process',
+ ['request', 'pid'],
+ )
+
+ SKY_APISERVER_PROCESS_PEAK_RSS = prom.Gauge(
+ 'sky_apiserver_process_peak_rss',
+ 'Peak RSS we saw in each process in last 30 seconds',
+ ['pid', 'type'],
+ )
+
+ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
+ 'sky_apiserver_process_cpu_total',
+ 'Total CPU times a worker process has been running',
+ ['pid', 'type', 'mode'],
+ )
+
  metrics_app = fastapi.FastAPI()

@@ -178,3 +206,26 @@ def time_me_async(func):
  return await func(*args, **kwargs)

  return async_wrapper
+
+
+ def process_monitor(process_type: str):
+ pid = multiprocessing.current_process().pid
+ proc = psutil.Process(pid)
+ peak_rss = 0
+ last_bucket_end = time.time()
+ while True:
+ if time.time() - last_bucket_end >= 30:
+ # Reset peak RSS every 30 seconds.
+ last_bucket_end = time.time()
+ peak_rss = 0
+ peak_rss = max(peak_rss, proc.memory_info().rss)
+ SKY_APISERVER_PROCESS_PEAK_RSS.labels(pid=pid,
+ type=process_type).set(peak_rss)
+ ctimes = proc.cpu_times()
+ SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+ type=process_type,
+ mode='user').set(ctimes.user)
+ SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
+ type=process_type,
+ mode='system').set(ctimes.system)
+ time.sleep(1)
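
process_monitor is meant to run for the lifetime of a process: it samples RSS and cumulative CPU times once per second and resets the peak-RSS window every 30 seconds. The executor and uvicorn changes below start it in a daemon thread, along these lines (the 'worker:short' label is only an example):

    import threading

    from sky.server import metrics as metrics_lib

    # Daemon thread, so the sampler never blocks interpreter shutdown.
    threading.Thread(target=metrics_lib.process_monitor,
                     args=('worker:short',),
                     daemon=True).start()

Each worker exports its own pid-labeled series; on the websocket gauge, multiprocess_mode='livesum' tells prometheus_client's multiprocess collector to sum the values of currently-live processes at scrape time.
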
sky/server/requests/executor.py CHANGED
@@ -130,6 +130,9 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
  def executor_initializer(proc_group: str):
  setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
  f'{multiprocessing.current_process().pid}')
+ threading.Thread(target=metrics_lib.process_monitor,
+ args=(f'worker:{proc_group}',),
+ daemon=True).start()


  class RequestWorker:
@@ -281,34 +284,34 @@ def override_request_env_and_config(
  request_id: str) -> Generator[None, None, None]:
  """Override the environment and SkyPilot config for a request."""
  original_env = os.environ.copy()
- # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API server
- # affecting client requests. If set on the client side, it will be
- # overridden by the request body.
- os.environ.pop('SKYPILOT_DEBUG', None)
- # Remove the db connection uri from client supplied env vars, as the
- # client should not set the db string on server side.
- request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
- os.environ.update(request_body.env_vars)
- # Note: may be overridden by AuthProxyMiddleware.
- # TODO(zhwu): we need to make the entire request a context available to the
- # entire request execution, so that we can access info like user through
- # the execution.
- user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
- name=request_body.env_vars[constants.USER_ENV_VAR])
- global_user_state.add_or_update_user(user)
- # Refetch the user to get the latest user info, including the created_at
- # field.
- user = global_user_state.get_user(user.id)
-
- # Force color to be enabled.
- os.environ['CLICOLOR_FORCE'] = '1'
- server_common.reload_for_new_request(
- client_entrypoint=request_body.entrypoint,
- client_command=request_body.entrypoint_command,
- using_remote_api_server=request_body.using_remote_api_server,
- user=user,
- request_id=request_id)
  try:
+ # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API
+ # server affecting client requests. If set on the client side, it will
+ # be overridden by the request body.
+ os.environ.pop('SKYPILOT_DEBUG', None)
+ # Remove the db connection uri from client supplied env vars, as the
+ # client should not set the db string on server side.
+ request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
+ os.environ.update(request_body.env_vars)
+ # Note: may be overridden by AuthProxyMiddleware.
+ # TODO(zhwu): we need to make the entire request a context available to
+ # the entire request execution, so that we can access info like user
+ # through the execution.
+ user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
+ name=request_body.env_vars[constants.USER_ENV_VAR])
+ global_user_state.add_or_update_user(user)
+ # Refetch the user to get the latest user info, including the created_at
+ # field.
+ user = global_user_state.get_user(user.id)
+
+ # Force color to be enabled.
+ os.environ['CLICOLOR_FORCE'] = '1'
+ server_common.reload_for_new_request(
+ client_entrypoint=request_body.entrypoint,
+ client_command=request_body.entrypoint_command,
+ using_remote_api_server=request_body.using_remote_api_server,
+ user=user,
+ request_id=request_id)
  logger.debug(
  f'override path: {request_body.override_skypilot_config_path}')
  with skypilot_config.override_skypilot_config(
@@ -401,6 +404,8 @@ def _request_execution_wrapper(request_id: str,
  config = skypilot_config.to_dict()
  logger.debug(f'request config: \n'
  f'{yaml_utils.dump_yaml_str(dict(config))}')
+ metrics_lib.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.labels(
+ request=request_name, pid=pid).inc()
  with metrics_lib.time_it(name=request_name,
  group='request_execution'):
  return_value = func(**request_body.to_kwargs())
@@ -439,11 +444,7 @@ def _request_execution_wrapper(request_id: str,
  logger.info(f'Request {request_id} finished')
  finally:
  with metrics_lib.time_it(name='release_memory', group='internal'):
- try:
- common_utils.release_memory()
- except Exception as e: # pylint: disable=broad-except
- logger.error(f'Failed to release memory: '
- f'{common_utils.format_exception(e)}')
+ common_utils.release_memory()


  async def execute_request_coroutine(request: api_requests.Request):
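
The reshuffle in override_request_env_and_config moves all environment and config mutation inside the try block; the likely intent is that whatever restore logic follows in the function's finally clause (outside this hunk) also runs when applying the overrides fails partway. A minimal sketch of that pattern, not the actual SkyPilot implementation:

    import contextlib
    import os
    from typing import Dict, Iterator

    @contextlib.contextmanager
    def override_env_sketch(overrides: Dict[str, str]) -> Iterator[None]:
        # Snapshot before mutating; every mutation happens inside try so the
        # snapshot is restored even if an override raises halfway through.
        original_env = os.environ.copy()
        try:
            os.environ.update(overrides)
            yield
        finally:
            os.environ.clear()
            os.environ.update(original_env)
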
sky/server/requests/serializers/encoders.py CHANGED
@@ -131,7 +131,7 @@ def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
  def encode_jobs_queue_v2(
  jobs_or_tuple) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
  # Support returning either a plain jobs list or a (jobs, total) tuple
- status_counts = {}
+ status_counts: Dict[str, int] = {}
  if isinstance(jobs_or_tuple, tuple):
  if len(jobs_or_tuple) == 2:
  jobs, total = jobs_or_tuple
sky/server/server.py CHANGED
@@ -625,6 +625,9 @@ app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
  app.include_router(ssh_node_pools_rest.router,
  prefix='/ssh_node_pools',
  tags=['ssh_node_pools'])
+ # increase the resource limit for the server
+ soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+ resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))

  # Increase the limit of files we can open to our hard limit. This fixes bugs
  # where we can not aquire file locks or open enough logs and the API server
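
For reference, an unprivileged process can raise its soft RLIMIT_NOFILE up to, but not beyond, the hard limit, so (hard, hard) is the largest setting that succeeds without extra privileges. A standalone illustration of the same calls:

    import resource

    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    print(f'before: soft={soft} hard={hard}')
    # Raise the soft limit to the hard limit; raising the hard limit itself
    # would need elevated privileges (e.g. CAP_SYS_RESOURCE on Linux).
    resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
    print('after:', resource.getrlimit(resource.RLIMIT_NOFILE))
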
@@ -1734,7 +1737,12 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
  return

  logger.info(f'Starting port-forward to local port: {local_port}')
+ conn_gauge = metrics.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
+ pid=os.getpid())
+ ssh_failed = False
+ websocket_closed = False
  try:
+ conn_gauge.inc()
  # Connect to the local port
  reader, writer = await asyncio.open_connection('127.0.0.1', local_port)

@@ -1742,9 +1750,21 @@
  try:
  async for message in websocket.iter_bytes():
  writer.write(message)
- await writer.drain()
+ try:
+ await writer.drain()
+ except Exception as e: # pylint: disable=broad-except
+ # Typically we will not reach here, if the ssh to pod
+ # is disconnected, ssh_to_websocket will exit first.
+ # But just in case.
+ logger.error('Failed to write to pod through '
+ f'port-forward connection: {e}')
+ nonlocal ssh_failed
+ ssh_failed = True
+ break
  except fastapi.WebSocketDisconnect:
  pass
+ nonlocal websocket_closed
+ websocket_closed = True
  writer.close()

  async def ssh_to_websocket():
@@ -1752,15 +1772,44 @@
  while True:
  data = await reader.read(1024)
  if not data:
+ if not websocket_closed:
+ logger.warning('SSH connection to pod is '
+ 'disconnected before websocket '
+ 'connection is closed')
+ nonlocal ssh_failed
+ ssh_failed = True
  break
  await websocket.send_bytes(data)
  except Exception: # pylint: disable=broad-except
  pass
- await websocket.close()
+ try:
+ await websocket.close()
+ except Exception: # pylint: disable=broad-except
+ # The websocket might has been closed by the client.
+ pass

  await asyncio.gather(websocket_to_ssh(), ssh_to_websocket())
  finally:
- proc.terminate()
+ conn_gauge.dec()
+ reason = ''
+ try:
+ logger.info('Terminating kubectl port-forward process')
+ proc.terminate()
+ except ProcessLookupError:
+ stdout = await proc.stdout.read()
+ logger.error('kubectl port-forward was terminated before the '
+ 'ssh websocket connection was closed. Remaining '
+ f'output: {str(stdout)}')
+ reason = 'KubectlPortForwardExit'
+ metrics.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
+ pid=os.getpid(), reason='KubectlPortForwardExit').inc()
+ else:
+ if ssh_failed:
+ reason = 'SSHToPodDisconnected'
+ else:
+ reason = 'ClientClosed'
+ metrics.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
+ pid=os.getpid(), reason=reason).inc()


  @app.get('/all_contexts')
@@ -1780,7 +1829,7 @@ async def all_contexts(request: fastapi.Request) -> None:
  async def gpu_metrics() -> fastapi.Response:
  """Gets the GPU metrics from multiple external k8s clusters"""
  contexts = core.get_all_contexts()
- all_metrics = []
+ all_metrics: List[str] = []
  successful_contexts = 0

  tasks = [
@@ -1795,6 +1844,10 @@
  if isinstance(result, Exception):
  logger.error(
  f'Failed to get metrics for context {contexts[i]}: {result}')
+ elif isinstance(result, BaseException):
+ # Avoid changing behavior for non-Exception BaseExceptions
+ # like KeyboardInterrupt/SystemExit: re-raise them.
+ raise result
  else:
  metrics_text = result
  all_metrics.append(metrics_text)
sky/server/uvicorn.py CHANGED
@@ -19,6 +19,7 @@ from uvicorn.supervisors import multiprocess

  from sky import sky_logging
  from sky.server import daemons
+ from sky.server import metrics as metrics_lib
  from sky.server import state
  from sky.server.requests import requests as requests_lib
  from sky.skylet import constants
@@ -212,6 +213,9 @@ class Server(uvicorn.Server):
  # Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
  event_loop.set_debug(True)
  event_loop.slow_callback_duration = lag_threshold
+ threading.Thread(target=metrics_lib.process_monitor,
+ args=('server',),
+ daemon=True).start()
  with self.capture_signals():
  asyncio.run(self.serve(*args, **kwargs))

sky/setup_files/dependencies.py CHANGED
@@ -63,6 +63,8 @@ install_requires = [
  'setproctitle',
  'sqlalchemy',
  'psycopg2-binary',
+ 'aiosqlite',
+ 'asyncpg',
  # TODO(hailong): These three dependencies should be removed after we make
  # the client-side actually not importing them.
  'casbin',
@@ -108,9 +110,9 @@ server_dependencies = [
  local_ray = [
  # Lower version of ray will cause dependency conflict for
  # click/grpcio/protobuf.
- # Excluded 2.6.0 as it has a bug in the cluster launcher:
+ # Ray 2.6.1+ resolved cluster launcher bugs and grpcio issues on Apple Silicon.
  # https://github.com/ray-project/ray/releases/tag/ray-2.6.1
- 'ray[default] >= 2.2.0, != 2.6.0',
+ 'ray[default] >= 2.6.1',
  ]

  remote = [
sky/skylet/attempt_skylet.py CHANGED
@@ -12,6 +12,7 @@ def restart_skylet():
  # Kills old skylet if it is running.
  # TODO(zhwu): make the killing graceful, e.g., use a signal to tell
  # skylet to exit, instead of directly killing it.
+
  subprocess.run(
  # We use -m to grep instead of {constants.SKY_PYTHON_CMD} -m to grep
  # because need to handle the backward compatibility of the old skylet
sky/skylet/constants.py CHANGED
@@ -91,7 +91,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
  # cluster yaml is updated.
  #
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
- SKYLET_VERSION = '17'
+ SKYLET_VERSION = '18'
  # The version of the lib files that skylet/jobs use. Whenever there is an API
  # change for the job_lib or log_lib, we need to bump this version, so that the
  # user can be notified to update their SkyPilot version on the remote cluster.
@@ -374,6 +374,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
  ('ssh', 'pod_config'),
  ('kubernetes', 'custom_metadata'),
  ('kubernetes', 'pod_config'),
+ ('kubernetes', 'context_configs'),
  ('kubernetes', 'provision_timeout'),
  ('kubernetes', 'dws'),
  ('kubernetes', 'kueue'),
@@ -421,6 +422,7 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
  # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
  # Environment variable that is set to 'true' if this is a skypilot server.
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
+ OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'

  # Environment variable that is set to 'true' if metrics are enabled.
  ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
sky/skylet/events.py CHANGED
@@ -11,7 +11,7 @@ import psutil
  from sky import clouds
  from sky import sky_logging
  from sky.backends import cloud_vm_ray_backend
- from sky.jobs import scheduler as managed_job_scheduler
+ from sky.jobs import scheduler
  from sky.jobs import state as managed_job_state
  from sky.jobs import utils as managed_job_utils
  from sky.serve import serve_utils
@@ -76,15 +76,7 @@ class ManagedJobEvent(SkyletEvent):
  def _run(self):
  logger.info('=== Updating managed job status ===')
  managed_job_utils.update_managed_jobs_statuses()
-
-
- class ManagedJobSchedulingEvent(SkyletEvent):
- """Skylet event for scheduling managed jobs."""
- EVENT_INTERVAL_SECONDS = 20
-
- def _run(self):
- logger.info('=== Scheduling next jobs ===')
- managed_job_scheduler.maybe_schedule_next_jobs()
+ scheduler.maybe_start_controllers()


  class ServiceUpdateEvent(SkyletEvent):
sky/utils/command_runner.pyi CHANGED
@@ -36,9 +36,9 @@ def ssh_options_list(


  class SshMode(enum.Enum):
- NON_INTERACTIVE: int
- INTERACTIVE: int
- LOGIN: int
+ NON_INTERACTIVE = ...
+ INTERACTIVE = ...
+ LOGIN = ...


  class CommandRunner:
sky/utils/common_utils.py CHANGED
@@ -996,7 +996,17 @@ def get_mem_size_gb() -> float:
  except ValueError as e:
  with ux_utils.print_exception_no_traceback():
  raise ValueError(
- f'Failed to parse the memory size from {mem_size}') from e
+ f'Failed to parse the memory size from {mem_size} (GB)'
+ ) from e
+ mem_size = os.getenv('SKYPILOT_POD_MEMORY_BYTES_LIMIT')
+ if mem_size is not None:
+ try:
+ return float(mem_size) / (1024**3)
+ except ValueError as e:
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError(
+ f'Failed to parse the memory size from {mem_size} (bytes)'
+ ) from e
  return _mem_size_gb()


@@ -1098,13 +1108,15 @@ def release_memory():
  """Release the process memory"""
  # Do the best effort to release the python heap and let malloc_trim
  # be more efficient.
- gc.collect()
- if sys.platform.startswith('linux'):
- try:
+ try:
+ gc.collect()
+ if sys.platform.startswith('linux'):
  # Will fail on musl (alpine), but at least it works on our
  # offical docker images.
  libc = ctypes.CDLL('libc.so.6')
  return libc.malloc_trim(0)
- except (AttributeError, OSError):
- return 0
- return 0
+ return 0
+ except Exception as e: # pylint: disable=broad-except
+ logger.error(f'Failed to release memory: '
+ f'{format_exception(e)}')
+ return 0
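
The new fallback in get_mem_size_gb reads a byte-denominated limit (the first environment variable appears to be in GB, per the updated error messages) and converts it with a plain division. A quick check of the arithmetic with an assumed 8 GiB pod limit:

    import os

    # Hypothetical value: an 8 GiB limit expressed in bytes, as a container
    # runtime might inject it.
    os.environ['SKYPILOT_POD_MEMORY_BYTES_LIMIT'] = str(8 * 1024**3)
    bytes_limit = float(os.environ['SKYPILOT_POD_MEMORY_BYTES_LIMIT'])
    print(bytes_limit / (1024**3))  # 8.0
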
sky/utils/controller_utils.py CHANGED
@@ -228,6 +228,11 @@ def get_controller_for_pool(pool: bool) -> Controllers:
  def high_availability_specified(cluster_name: Optional[str]) -> bool:
  """Check if the controller high availability is specified in user config.
  """
+ # pylint: disable=import-outside-toplevel
+ from sky.jobs import utils as managed_job_utils
+ if managed_job_utils.is_consolidation_mode():
+ return True
+
  controller = Controllers.from_name(cluster_name)
  if controller is None:
  return False
sky/utils/db/db_utils.py CHANGED
@@ -7,12 +7,13 @@ import pathlib
  import sqlite3
  import threading
  import typing
- from typing import Any, Callable, Dict, Iterable, Optional
+ from typing import Any, Callable, Dict, Iterable, Literal, Optional, Union

  import aiosqlite
  import aiosqlite.context
  import sqlalchemy
  from sqlalchemy import exc as sqlalchemy_exc
+ from sqlalchemy.ext import asyncio as sqlalchemy_async

  from sky import sky_logging
  from sky.skylet import constants
@@ -375,11 +376,34 @@ def get_max_connections():
  return _max_connections


- def get_engine(db_name: str):
+ @typing.overload
+ def get_engine(
+ db_name: str,
+ async_engine: Literal[False] = False) -> sqlalchemy.engine.Engine:
+ ...
+
+
+ @typing.overload
+ def get_engine(db_name: str,
+ async_engine: Literal[True]) -> sqlalchemy_async.AsyncEngine:
+ ...
+
+
+ def get_engine(
+ db_name: str,
+ async_engine: bool = False
+ ) -> Union[sqlalchemy.engine.Engine, sqlalchemy_async.AsyncEngine]:
  conn_string = None
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
  conn_string = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
  if conn_string:
+ if async_engine:
+ conn_string = conn_string.replace('postgresql://',
+ 'postgresql+asyncpg://')
+ # This is an AsyncEngine, instead of a (normal, synchronous) Engine,
+ # so we should not put it in the cache. Instead, just return.
+ return sqlalchemy_async.create_async_engine(
+ conn_string, poolclass=sqlalchemy.NullPool)
  with _db_creation_lock:
  if conn_string not in _postgres_engine_cache:
  if _max_connections == 0:
@@ -401,6 +425,11 @@ def get_engine(db_name: str):
  else:
  db_path = os.path.expanduser(f'~/.sky/{db_name}.db')
  pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
+ if async_engine:
+ # This is an AsyncEngine, instead of a (normal, synchronous) Engine,
+ # so we should not put it in the cache. Instead, just return.
+ return sqlalchemy_async.create_async_engine(
+ 'sqlite+aiosqlite:///' + db_path, connect_args={'timeout': 30})
  if db_path not in _sqlite_engine_cache:
  _sqlite_engine_cache[db_path] = sqlalchemy.create_engine(
  'sqlite:///' + db_path)
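
With async_engine=True, get_engine now returns a SQLAlchemy AsyncEngine backed by asyncpg (Postgres) or aiosqlite (SQLite) and, per the comments above, skips the engine cache, so the caller owns its lifecycle. A hedged usage sketch; the database name and table here are placeholders, not real SkyPilot schema:

    import asyncio

    import sqlalchemy

    from sky.utils.db import db_utils

    async def count_rows() -> None:
        engine = db_utils.get_engine('example_db', async_engine=True)
        try:
            async with engine.connect() as conn:
                result = await conn.execute(
                    sqlalchemy.text('SELECT COUNT(*) FROM example_table'))
                print(result.scalar())
        finally:
            # Async engines are not cached by get_engine, so dispose explicitly.
            await engine.dispose()

    asyncio.run(count_rows())
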
sky/utils/rich_utils.py CHANGED
@@ -421,7 +421,7 @@ async def decode_rich_status_async(
  undecoded_buffer = b''

  # Iterate over the response content in chunks
- async for chunk in response.content.iter_chunked(8192):
+ async for chunk, _ in response.content.iter_chunks():
  if chunk is None:
  return

@@ -481,6 +481,8 @@
  line = line[:-2] + '\n'
  is_payload, line = message_utils.decode_payload(
  line, raise_for_mismatch=False)
+ if line is None:
+ continue
  control = None
  if is_payload:
  control, encoded_status = Control.decode(line)
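
The switch from iter_chunked(8192) to iter_chunks() matters because aiohttp's iter_chunks() yields (data, end_of_http_chunk) tuples rather than bare bytes, which is why the loop now unpacks and discards the second element. A small standalone illustration:

    import asyncio

    import aiohttp

    async def stream(url: str) -> None:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                # Each item is (bytes, bool); the flag marks the end of an
                # HTTP chunk for chunked transfer encoding.
                async for data, end_of_chunk in response.content.iter_chunks():
                    print(len(data), end_of_chunk)

    asyncio.run(stream('https://example.com'))
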
sky/utils/subprocess_utils.py CHANGED
@@ -437,3 +437,12 @@ def slow_start_processes(processes: List[Startable],
  break
  batch_size = min(batch_size * 2, max_batch_size)
  time.sleep(delay)
+
+
+ def is_process_alive(pid: int) -> bool:
+ """Check if a process is alive."""
+ try:
+ process = psutil.Process(pid)
+ return process.is_running()
+ except psutil.NoSuchProcess:
+ return False
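
A usage sketch for the new helper. One caveat worth knowing: psutil reports a process as running as long as its PID still exists, including zombie processes, so callers that need stricter liveness may also want to inspect the process status.

    import os

    from sky.utils import subprocess_utils

    print(subprocess_utils.is_process_alive(os.getpid()))  # True: this process.
    # A PID that almost certainly does not exist; psutil raises NoSuchProcess
    # internally and the helper returns False.
    print(subprocess_utils.is_process_alive(99999999))
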
sky/volumes/volume.py CHANGED
@@ -150,6 +150,8 @@ class Volume:
  self.region, self.zone = cloud_obj.validate_region_zone(
  self.region, self.zone)

+ # Name must be set by factory before validation.
+ assert self.name is not None
  valid, err_msg = cloud_obj.is_volume_name_valid(self.name)
  if not valid:
  raise ValueError(f'Invalid volume name: {err_msg}')