skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (114)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/backends/backend_utils.py +9 -6
  5. sky/backends/cloud_vm_ray_backend.py +2 -3
  6. sky/check.py +25 -13
  7. sky/client/cli/command.py +52 -24
  8. sky/cloud_stores.py +73 -0
  9. sky/clouds/aws.py +59 -11
  10. sky/core.py +7 -5
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
  13. sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
  15. sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
  28. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  29. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  30. sky/dashboard/out/clusters/[cluster].html +1 -1
  31. sky/dashboard/out/clusters.html +1 -1
  32. sky/dashboard/out/config.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra/[context].html +1 -1
  35. sky/dashboard/out/infra.html +1 -1
  36. sky/dashboard/out/jobs/[job].html +1 -1
  37. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  38. sky/dashboard/out/jobs.html +1 -1
  39. sky/dashboard/out/users.html +1 -1
  40. sky/dashboard/out/volumes.html +1 -1
  41. sky/dashboard/out/workspace/new.html +1 -1
  42. sky/dashboard/out/workspaces/[name].html +1 -1
  43. sky/dashboard/out/workspaces.html +1 -1
  44. sky/data/data_utils.py +92 -1
  45. sky/data/mounting_utils.py +71 -2
  46. sky/data/storage.py +166 -9
  47. sky/global_user_state.py +14 -18
  48. sky/jobs/constants.py +2 -0
  49. sky/jobs/controller.py +62 -67
  50. sky/jobs/file_content_utils.py +80 -0
  51. sky/jobs/log_gc.py +201 -0
  52. sky/jobs/scheduler.py +15 -2
  53. sky/jobs/server/core.py +85 -13
  54. sky/jobs/server/server.py +14 -13
  55. sky/jobs/server/utils.py +28 -10
  56. sky/jobs/state.py +216 -40
  57. sky/jobs/utils.py +65 -28
  58. sky/metrics/utils.py +18 -0
  59. sky/optimizer.py +1 -1
  60. sky/provision/kubernetes/instance.py +88 -19
  61. sky/provision/kubernetes/volume.py +2 -2
  62. sky/schemas/api/responses.py +3 -5
  63. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  64. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  65. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  66. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  67. sky/serve/replica_managers.py +2 -2
  68. sky/serve/serve_utils.py +9 -2
  69. sky/serve/server/server.py +8 -7
  70. sky/server/common.py +21 -15
  71. sky/server/constants.py +1 -1
  72. sky/server/daemons.py +23 -17
  73. sky/server/requests/executor.py +7 -3
  74. sky/server/requests/payloads.py +2 -0
  75. sky/server/requests/request_names.py +80 -0
  76. sky/server/requests/requests.py +137 -102
  77. sky/server/requests/serializers/decoders.py +0 -6
  78. sky/server/requests/serializers/encoders.py +33 -6
  79. sky/server/server.py +105 -36
  80. sky/server/stream_utils.py +56 -13
  81. sky/setup_files/dependencies.py +2 -0
  82. sky/skylet/constants.py +6 -1
  83. sky/skylet/events.py +7 -0
  84. sky/skylet/services.py +18 -7
  85. sky/ssh_node_pools/server.py +5 -4
  86. sky/task.py +14 -42
  87. sky/templates/kubernetes-ray.yml.j2 +1 -1
  88. sky/templates/nebius-ray.yml.j2 +1 -0
  89. sky/templates/websocket_proxy.py +140 -12
  90. sky/users/permission.py +4 -1
  91. sky/utils/cli_utils/status_utils.py +8 -2
  92. sky/utils/context_utils.py +13 -1
  93. sky/utils/db/migration_utils.py +1 -1
  94. sky/utils/resource_checker.py +4 -1
  95. sky/utils/resources_utils.py +53 -29
  96. sky/utils/schemas.py +23 -4
  97. sky/volumes/server/server.py +4 -3
  98. sky/workspaces/server.py +7 -6
  99. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
  100. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
  101. sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
  102. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
  108. sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
  109. /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
  110. /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
  111. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
  112. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
  113. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
  114. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/server/common.py CHANGED
@@ -539,19 +539,27 @@ def _start_api_server(deploy: bool = False,
                          'is not a local URL')
 
     # Check available memory before starting the server.
-    avail_mem_size_gb: float = common_utils.get_mem_size_gb()
-    # pylint: disable=import-outside-toplevel
-    import sky.jobs.utils as job_utils
-    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
-                  if job_utils.is_consolidation_mode(on_api_restart=True)
-                  else server_constants.MIN_AVAIL_MEM_GB)
-    if avail_mem_size_gb <= max_memory:
-        logger.warning(
-            f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
-            f'has {avail_mem_size_gb:.1f}GB memory available. '
-            f'At least {max_memory}GB is recommended to support higher '
-            'load with better performance.'
-            f'{colorama.Style.RESET_ALL}')
+    # Skip this warning if postgres is used, as:
+    # 1) that's almost certainly a remote API server;
+    # 2) the actual consolidation mode config is stashed in the database,
+    #    and the value of `job_utils.is_consolidation_mode` will not be
+    #    the actual value in the db but None, since in this case the
+    #    whole YAML config is really just `db: <URI>`.
+    if skypilot_config.get_nested(('db',), None) is None:
+        avail_mem_size_gb: float = common_utils.get_mem_size_gb()
+        # pylint: disable=import-outside-toplevel
+        import sky.jobs.utils as job_utils
+        max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                      if job_utils.is_consolidation_mode(
+                          on_api_restart=True) else
+                      server_constants.MIN_AVAIL_MEM_GB)
+        if avail_mem_size_gb <= max_memory:
+            logger.warning(
+                f'{colorama.Fore.YELLOW}Your SkyPilot API server machine '
+                f'only has {avail_mem_size_gb:.1f}GB memory available. '
+                f'At least {max_memory}GB is recommended to support higher '
+                'load with better performance.'
+                f'{colorama.Style.RESET_ALL}')
 
     args = [sys.executable, *API_SERVER_CMD.split()]
     if deploy:
@@ -560,8 +568,6 @@ def _start_api_server(deploy: bool = False,
         args += [f'--host={host}']
     if metrics_port is not None:
         args += [f'--metrics-port={metrics_port}']
-    # Use this argument to disable the internal signal file check.
-    args += ['--start-with-python']
 
     if foreground:
         # Replaces the current process with the API server
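
The new guard above is just a presence check on the top-level `db` key of the loaded SkyPilot config. A rough sketch of the decision logic, using a hypothetical config value and a simplified stand-in for `skypilot_config.get_nested` (not the real implementation):

# Hypothetical: a postgres-backed (and thus almost certainly remote) API
# server whose entire YAML config is a single `db` entry.
config = {'db': 'postgresql://user:pass@host:5432/skypilot'}

def get_nested(keys, default):
    # Simplified stand-in for skypilot_config.get_nested.
    node = config
    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return default
        node = node[key]
    return node

if get_nested(('db',), None) is None:
    print('no db configured: run the available-memory check')
else:
    print('postgres-backed server: skip the memory warning')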
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = 21
+API_VERSION = 22
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/daemons.py CHANGED
@@ -7,6 +7,7 @@ from typing import Callable
 from sky import sky_logging
 from sky import skypilot_config
 from sky.server import constants as server_constants
+from sky.server.requests import request_names
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import env_options
@@ -26,7 +27,7 @@ class InternalRequestDaemon:
     """Internal daemon that runs an event in the background."""
 
     id: str
-    name: str
+    name: request_names.RequestName
     event_fn: Callable[[], None]
     default_log_level: str = 'INFO'
     should_skip: Callable[[], bool] = _default_should_skip
@@ -195,26 +196,31 @@ INTERNAL_REQUEST_DAEMONS = [
     # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
     # set to updated status automatically, without showing users the hint of
     # cluster being stopped or down when `sky status -r` is called.
-    InternalRequestDaemon(id='skypilot-status-refresh-daemon',
-                          name='status-refresh',
-                          event_fn=refresh_cluster_status_event,
-                          default_log_level='DEBUG'),
+    InternalRequestDaemon(
+        id='skypilot-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_STATUS_REFRESH,
+        event_fn=refresh_cluster_status_event,
+        default_log_level='DEBUG'),
     # Volume status refresh daemon to update the volume status periodically.
-    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
-                          name='volume-refresh',
-                          event_fn=refresh_volume_status_event),
+    InternalRequestDaemon(
+        id='skypilot-volume-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_VOLUME_REFRESH,
+        event_fn=refresh_volume_status_event),
     InternalRequestDaemon(id='managed-job-status-refresh-daemon',
-                          name='managed-job-status-refresh',
+                          name=request_names.RequestName.
+                          REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH,
                           event_fn=managed_job_status_refresh_event,
                           should_skip=should_skip_managed_job_status_refresh),
-    InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
-                          name='sky-serve-status-refresh',
-                          event_fn=sky_serve_status_refresh_event,
-                          should_skip=should_skip_sky_serve_status_refresh),
-    InternalRequestDaemon(id='pool-status-refresh-daemon',
-                          name='pool-status-refresh',
-                          event_fn=pool_status_refresh_event,
-                          should_skip=should_skip_pool_status_refresh),
+    InternalRequestDaemon(
+        id='sky-serve-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH,
+        event_fn=sky_serve_status_refresh_event,
+        should_skip=should_skip_sky_serve_status_refresh),
+    InternalRequestDaemon(
+        id='pool-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_POOL_STATUS_REFRESH,
+        event_fn=pool_status_refresh_event,
+        should_skip=should_skip_pool_status_refresh),
 ]
sky/server/requests/executor.py CHANGED
@@ -47,6 +47,7 @@ from sky.server import metrics as metrics_lib
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
 from sky.server.requests import process
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
 from sky.server.requests import threads
 from sky.server.requests.queues import local_queue
@@ -395,7 +396,10 @@ def _request_execution_wrapper(request_id: str,
     rss_begin = proc.memory_info().rss
     db_utils.set_max_connections(num_db_connections_per_worker)
     # Handle the SIGTERM signal to abort the request processing gracefully.
-    signal.signal(signal.SIGTERM, _sigterm_handler)
+    # Only set up signal handlers in the main thread, as signal.signal() raises
+    # ValueError if called from a non-main thread (e.g., in tests).
+    if threading.current_thread() is threading.main_thread():
+        signal.signal(signal.SIGTERM, _sigterm_handler)
 
     logger.info(f'Running request {request_id} with pid {pid}')
 
@@ -688,7 +692,7 @@ async def _execute_request_coroutine(request: api_requests.Request):
 
 async def prepare_request_async(
     request_id: str,
-    request_name: str,
+    request_name: request_names.RequestName,
     request_body: payloads.RequestBody,
     func: Callable[P, Any],
     request_cluster_name: Optional[str] = None,
@@ -721,7 +725,7 @@ async def prepare_request_async(
 
 
 async def schedule_request_async(request_id: str,
-                                 request_name: str,
+                                 request_name: request_names.RequestName,
                                  request_body: payloads.RequestBody,
                                  func: Callable[P, Any],
                                  request_cluster_name: Optional[str] = None,
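
For context on the guard added in `_request_execution_wrapper` above: `signal.signal()` may only be called from the main thread of the main interpreter and raises ValueError anywhere else. A self-contained sketch of the pattern (the handler body here is a placeholder, not SkyPilot's actual `_sigterm_handler`):

import signal
import threading

def _sigterm_handler(signum, frame):
    # Placeholder handler for illustration only.
    raise SystemExit('request aborted by SIGTERM')

def install_handler():
    # Same guard as the diff: skip installation off the main thread,
    # where signal.signal() would raise ValueError.
    if threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGTERM, _sigterm_handler)

t = threading.Thread(target=install_handler)
t.start()          # worker thread: the guard makes this a no-op
t.join()
install_handler()  # main thread: the handler is actually installed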
sky/server/requests/payloads.py CHANGED
@@ -319,6 +319,8 @@ class StatusBody(RequestBody):
     # Only return fields that are needed for the
     # dashboard / CLI summary response
     summary_response: bool = False
+    # Include the cluster handle in the response
+    include_handle: bool = True
 
 
 class StartBody(RequestBody):
sky/server/requests/request_names.py ADDED
@@ -0,0 +1,80 @@
+"""Request names."""
+import enum
+
+
+class RequestName(str, enum.Enum):
+    """Enum of all the request names."""
+    # General requests
+    CHECK = 'check'
+    ENABLED_CLOUDS = 'enabled_clouds'
+    REALTIME_KUBERNETES_GPU_AVAILABILITY = (
+        'realtime_kubernetes_gpu_availability')
+    KUBERNETES_NODE_INFO = 'kubernetes_node_info'
+    STATUS_KUBERNETES = 'status_kubernetes'
+    LIST_ACCELERATORS = 'list_accelerators'
+    LIST_ACCELERATOR_COUNTS = 'list_accelerator_counts'
+    OPTIMIZE = 'optimize'
+    # Cluster requests
+    CLUSTER_LAUNCH = 'launch'
+    CLUSTER_EXEC = 'exec'
+    CLUSTER_STOP = 'stop'
+    CLUSTER_STATUS = 'status'
+    CLUSTER_ENDPOINTS = 'endpoints'
+    CLUSTER_DOWN = 'down'
+    CLUSTER_START = 'start'
+    CLUSTER_AUTOSTOP = 'autostop'
+    CLUSTER_QUEUE = 'queue'
+    CLUSTER_JOB_STATUS = 'job_status'
+    CLUSTER_JOB_CANCEL = 'cancel'
+    CLUSTER_JOB_LOGS = 'logs'
+    CLUSTER_JOB_DOWNLOAD_LOGS = 'download_logs'
+    CLUSTER_COST_REPORT = 'cost_report'
+    # Storage requests
+    STORAGE_LS = 'storage_ls'
+    STORAGE_DELETE = 'storage_delete'
+    # Local requests
+    LOCAL_UP = 'local_up'
+    LOCAL_DOWN = 'local_down'
+    # API requests
+    API_CANCEL = 'api_cancel'
+    ALL_CONTEXTS = 'all_contexts'
+    # Managed jobs requests
+    JOBS_LAUNCH = 'jobs.launch'
+    JOBS_QUEUE = 'jobs.queue'
+    JOBS_QUEUE_V2 = 'jobs.queue_v2'
+    JOBS_CANCEL = 'jobs.cancel'
+    JOBS_LOGS = 'jobs.logs'
+    JOBS_DOWNLOAD_LOGS = 'jobs.download_logs'
+    JOBS_POOL_APPLY = 'jobs.pool_apply'
+    JOBS_POOL_DOWN = 'jobs.pool_down'
+    JOBS_POOL_STATUS = 'jobs.pool_status'
+    JOBS_POOL_LOGS = 'jobs.pool_logs'
+    JOBS_POOL_SYNC_DOWN_LOGS = 'jobs.pool_sync_down_logs'
+    # Serve requests
+    SERVE_UP = 'serve.up'
+    SERVE_UPDATE = 'serve.update'
+    SERVE_DOWN = 'serve.down'
+    SERVE_TERMINATE_REPLICA = 'serve.terminate_replica'
+    SERVE_STATUS = 'serve.status'
+    SERVE_LOGS = 'serve.logs'
+    SERVE_SYNC_DOWN_LOGS = 'serve.sync_down_logs'
+    # Volumes requests
+    VOLUME_LIST = 'volume_list'
+    VOLUME_DELETE = 'volume_delete'
+    VOLUME_APPLY = 'volume_apply'
+    # Workspaces requests
+    WORKSPACES_GET = 'workspaces.get'
+    WORKSPACES_UPDATE = 'workspaces.update'
+    WORKSPACES_CREATE = 'workspaces.create'
+    WORKSPACES_DELETE = 'workspaces.delete'
+    WORKSPACES_GET_CONFIG = 'workspaces.get_config'
+    WORKSPACES_UPDATE_CONFIG = 'workspaces.update_config'
+    # SSH node pools requests
+    SSH_NODE_POOLS_UP = 'ssh_node_pools.up'
+    SSH_NODE_POOLS_DOWN = 'ssh_node_pools.down'
+    # Internal request daemons
+    REQUEST_DAEMON_STATUS_REFRESH = 'status-refresh'
+    REQUEST_DAEMON_VOLUME_REFRESH = 'volume-refresh'
+    REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH = 'managed-job-status-refresh'
+    REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH = 'sky-serve-status-refresh'
+    REQUEST_DAEMON_POOL_STATUS_REFRESH = 'pool-status-refresh'
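
Because `RequestName` subclasses both `str` and `enum.Enum`, its members behave as plain strings, so code paths and database rows that still carry raw request-name strings keep working. A quick illustration of that standard str-enum behavior (not code from this release):

from sky.server.requests.request_names import RequestName

name = RequestName.CLUSTER_LAUNCH
assert name == 'launch'       # members compare equal to their string value
assert isinstance(name, str)  # and can be passed wherever a str is expected
assert RequestName('status') is RequestName.CLUSTER_STATUS  # lookup by value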
sky/server/requests/requests.py CHANGED
@@ -5,7 +5,6 @@ import contextlib
 import dataclasses
 import enum
 import functools
-import json
 import os
 import pathlib
 import shutil
@@ -21,6 +20,7 @@ import uuid
 import anyio
 import colorama
 import filelock
+import orjson
 
 from sky import exceptions
 from sky import global_user_state
@@ -213,8 +213,8 @@ class Request:
             entrypoint=self.entrypoint.__name__,
             request_body=self.request_body.model_dump_json(),
             status=self.status.value,
-            return_value=json.dumps(None),
-            error=json.dumps(None),
+            return_value=orjson.dumps(None).decode('utf-8'),
+            error=orjson.dumps(None).decode('utf-8'),
             pid=None,
             created_at=self.created_at,
             schedule_type=self.schedule_type.value,
@@ -237,8 +237,8 @@ class Request:
             entrypoint=encoders.pickle_and_encode(self.entrypoint),
             request_body=encoders.pickle_and_encode(self.request_body),
             status=self.status.value,
-            return_value=json.dumps(self.return_value),
-            error=json.dumps(self.error),
+            return_value=orjson.dumps(self.return_value).decode('utf-8'),
+            error=orjson.dumps(self.error).decode('utf-8'),
             pid=self.pid,
             created_at=self.created_at,
             schedule_type=self.schedule_type.value,
@@ -270,8 +270,8 @@ class Request:
             entrypoint=decoders.decode_and_unpickle(payload.entrypoint),
             request_body=decoders.decode_and_unpickle(payload.request_body),
             status=RequestStatus(payload.status),
-            return_value=json.loads(payload.return_value),
-            error=json.loads(payload.error),
+            return_value=orjson.loads(payload.return_value),
+            error=orjson.loads(payload.error),
             pid=payload.pid,
             created_at=payload.created_at,
             schedule_type=ScheduleType(payload.schedule_type),
@@ -328,10 +328,11 @@ def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
             entrypoint=request.entrypoint.__name__
             if request.entrypoint is not None else '',
             request_body=request.request_body.model_dump_json()
-            if request.request_body is not None else json.dumps(None),
+            if request.request_body is not None else
+            orjson.dumps(None).decode('utf-8'),
             status=request.status.value,
-            return_value=json.dumps(None),
-            error=json.dumps(None),
+            return_value=orjson.dumps(None).decode('utf-8'),
+            error=orjson.dumps(None).decode('utf-8'),
             pid=None,
             created_at=request.created_at,
             schedule_type=request.schedule_type.value,
@@ -372,9 +373,9 @@ def _update_request_row_fields(
     if 'user_id' not in fields:
         content['user_id'] = ''
     if 'return_value' not in fields:
-        content['return_value'] = json.dumps(None)
+        content['return_value'] = orjson.dumps(None).decode('utf-8')
     if 'error' not in fields:
-        content['error'] = json.dumps(None)
+        content['error'] = orjson.dumps(None).decode('utf-8')
     if 'schedule_type' not in fields:
         content['schedule_type'] = ScheduleType.SHORT.value
     # Optional fields in RequestPayload
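
The recurring `.decode('utf-8')` in the hunks above is there because `orjson.dumps` returns `bytes`, whereas `json.dumps` returns `str`; decoding keeps the values written to the requests table identical to what the old code produced. A minimal demonstration:

import json
import orjson

assert json.dumps(None) == 'null'     # str
assert orjson.dumps(None) == b'null'  # bytes
assert orjson.dumps(None).decode('utf-8') == json.dumps(None)
assert orjson.loads('null') is None   # orjson.loads accepts str or bytes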
@@ -393,94 +394,6 @@
     return tuple(content[col] for col in REQUEST_COLUMNS)
 
 
-def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
-    """Kill all pending and running requests for a cluster.
-
-    Args:
-        cluster_name: the name of the cluster.
-        exclude_request_names: exclude requests with these names. This is to
-            prevent killing the caller request.
-    """
-    request_ids = [
-        request_task.request_id
-        for request_task in get_request_tasks(req_filter=RequestTaskFilter(
-            status=[RequestStatus.PENDING, RequestStatus.RUNNING],
-            exclude_request_names=[exclude_request_name],
-            cluster_names=[cluster_name],
-            fields=['request_id']))
-    ]
-    kill_requests(request_ids)
-
-
-def kill_requests_with_prefix(request_ids: Optional[List[str]] = None,
-                              user_id: Optional[str] = None) -> List[str]:
-    """Kill requests with a given request ID prefix."""
-    expanded_request_ids: Optional[List[str]] = None
-    if request_ids is not None:
-        expanded_request_ids = []
-        for request_id in request_ids:
-            request_tasks = get_requests_with_prefix(request_id,
-                                                     fields=['request_id'])
-            if request_tasks is None or len(request_tasks) == 0:
-                continue
-            if len(request_tasks) > 1:
-                raise ValueError(f'Multiple requests found for '
-                                 f'request ID prefix: {request_id}')
-            expanded_request_ids.append(request_tasks[0].request_id)
-    return kill_requests(request_ids=expanded_request_ids, user_id=user_id)
-
-
-def kill_requests(request_ids: Optional[List[str]] = None,
-                  user_id: Optional[str] = None) -> List[str]:
-    """Kill a SkyPilot API request and set its status to cancelled.
-
-    Args:
-        request_ids: The request IDs to kill. If None, all requests for the
-            user are killed.
-        user_id: The user ID to kill requests for. If None, all users are
-            killed.
-
-    Returns:
-        A list of request IDs that were cancelled.
-    """
-    if request_ids is None:
-        request_ids = [
-            request_task.request_id
-            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
-                status=[RequestStatus.PENDING, RequestStatus.RUNNING],
-                # Avoid cancelling the cancel request itself.
-                exclude_request_names=['sky.api_cancel'],
-                user_id=user_id,
-                fields=['request_id']))
-        ]
-    cancelled_request_ids = []
-    for request_id in request_ids:
-        with update_request(request_id) as request_record:
-            if request_record is None:
-                logger.debug(f'No request ID {request_id}')
-                continue
-            # Skip internal requests. The internal requests are scheduled with
-            # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
-            if request_record.request_id in set(
-                    event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
-                continue
-            if request_record.status > RequestStatus.RUNNING:
-                logger.debug(f'Request {request_id} already finished')
-                continue
-            if request_record.pid is not None:
-                logger.debug(f'Killing request process {request_record.pid}')
-                # Use SIGTERM instead of SIGKILL:
-                # - The executor can handle SIGTERM gracefully
-                # - After SIGTERM, the executor can reuse the request process
-                #   for other requests, avoiding the overhead of forking a new
-                #   process for each request.
-                os.kill(request_record.pid, signal.SIGTERM)
-            request_record.status = RequestStatus.CANCELLED
-            request_record.finished_at = time.time()
-            cancelled_request_ids.append(request_id)
-    return cancelled_request_ids
-
-
 def create_table(cursor, conn):
     # Enable WAL mode to avoid locking issues.
     # See: issue #1441 and PR #1509
@@ -624,6 +537,128 @@ def request_lock_path(request_id: str) -> str:
     return os.path.join(lock_path, f'.{request_id}.lock')
 
 
+def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
+    """Kill all pending and running requests for a cluster.
+
+    Args:
+        cluster_name: the name of the cluster.
+        exclude_request_names: exclude requests with these names. This is to
+            prevent killing the caller request.
+    """
+    request_ids = [
+        request_task.request_id
+        for request_task in get_request_tasks(req_filter=RequestTaskFilter(
+            status=[RequestStatus.PENDING, RequestStatus.RUNNING],
+            exclude_request_names=[exclude_request_name],
+            cluster_names=[cluster_name],
+            fields=['request_id']))
+    ]
+    _kill_requests(request_ids)
+
+
+def kill_requests_with_prefix(request_ids: Optional[List[str]] = None,
+                              user_id: Optional[str] = None) -> List[str]:
+    """Kill requests with a given request ID prefix."""
+    expanded_request_ids: Optional[List[str]] = None
+    if request_ids is not None:
+        expanded_request_ids = []
+        for request_id in request_ids:
+            request_tasks = get_requests_with_prefix(request_id,
+                                                     fields=['request_id'])
+            if request_tasks is None or len(request_tasks) == 0:
+                continue
+            if len(request_tasks) > 1:
+                raise ValueError(f'Multiple requests found for '
+                                 f'request ID prefix: {request_id}')
+            expanded_request_ids.append(request_tasks[0].request_id)
+    return _kill_requests(request_ids=expanded_request_ids, user_id=user_id)
+
+
+def _should_kill_request(request_id: str,
+                         request_record: Optional[Request]) -> bool:
+    if request_record is None:
+        logger.debug(f'No request ID {request_id}')
+        return False
+    # Skip internal requests. The internal requests are scheduled with
+    # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
+    if request_record.request_id in set(
+            event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
+        return False
+    if request_record.status > RequestStatus.RUNNING:
+        logger.debug(f'Request {request_id} already finished')
+        return False
+    return True
+
+
+def _kill_requests(request_ids: Optional[List[str]] = None,
+                   user_id: Optional[str] = None) -> List[str]:
+    """Kill a SkyPilot API request and set its status to cancelled.
+
+    Args:
+        request_ids: The request IDs to kill. If None, all requests for the
+            user are killed.
+        user_id: The user ID to kill requests for. If None, all users are
+            killed.
+
+    Returns:
+        A list of request IDs that were cancelled.
+    """
+    if request_ids is None:
+        request_ids = [
+            request_task.request_id
+            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
+                status=[RequestStatus.PENDING, RequestStatus.RUNNING],
+                # Avoid cancelling the cancel request itself.
+                exclude_request_names=['sky.api_cancel'],
+                user_id=user_id,
+                fields=['request_id']))
+        ]
+    cancelled_request_ids = []
+    for request_id in request_ids:
+        with update_request(request_id) as request_record:
+            if not _should_kill_request(request_id, request_record):
+                continue
+            if request_record.pid is not None:
+                logger.debug(f'Killing request process {request_record.pid}')
+                # Use SIGTERM instead of SIGKILL:
+                # - The executor can handle SIGTERM gracefully
+                # - After SIGTERM, the executor can reuse the request process
+                #   for other requests, avoiding the overhead of forking a new
+                #   process for each request.
+                os.kill(request_record.pid, signal.SIGTERM)
+            request_record.status = RequestStatus.CANCELLED
+            request_record.finished_at = time.time()
+            cancelled_request_ids.append(request_id)
+    return cancelled_request_ids
+
+
+@init_db_async
+@asyncio_utils.shield
+async def kill_request_async(request_id: str) -> bool:
+    """Kill a SkyPilot API request and set its status to cancelled.
+
+    Returns:
+        True if the request was killed, False otherwise.
+    """
+    async with filelock.AsyncFileLock(request_lock_path(request_id)):
+        request = await _get_request_no_lock_async(request_id)
+        if not _should_kill_request(request_id, request):
+            return False
+        assert request is not None
+        if request.pid is not None:
+            logger.debug(f'Killing request process {request.pid}')
+            # Use SIGTERM instead of SIGKILL:
+            # - The executor can handle SIGTERM gracefully
+            # - After SIGTERM, the executor can reuse the request process
+            #   for other requests, avoiding the overhead of forking a new
+            #   process for each request.
+            os.kill(request.pid, signal.SIGTERM)
+        request.status = RequestStatus.CANCELLED
+        request.finished_at = time.time()
+        await _add_or_update_request_no_lock_async(request)
+        return True
+
+
 @contextlib.contextmanager
 @init_db
 @metrics_lib.time_me
@@ -638,7 +673,7 @@ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
         _add_or_update_request_no_lock(request)
 
 
-@init_db
+@init_db_async
 @metrics_lib.time_me
 @asyncio_utils.shield
 async def update_status_async(request_id: str, status: RequestStatus) -> None:
@@ -650,7 +685,7 @@ async def update_status_async(request_id: str, status: RequestStatus) -> None:
     await _add_or_update_request_no_lock_async(request)
 
 
-@init_db
+@init_db_async
 @metrics_lib.time_me
 @asyncio_utils.shield
 async def update_status_msg_async(request_id: str, status_msg: str) -> None:
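
Note that `kill_request_async` above takes a per-request file lock asynchronously before the read-modify-write of the request record. A minimal sketch of that pattern, assuming a filelock version that ships `AsyncFileLock` (3.10+):

import asyncio
import filelock

async def update_record(lock_path: str) -> None:
    # Serialize the read-modify-write across processes without blocking
    # the event loop while waiting on the lock.
    async with filelock.AsyncFileLock(lock_path):
        ...  # read the record, mutate it, write it back

asyncio.run(update_record('/tmp/.example-request.lock'))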
sky/server/requests/serializers/decoders.py CHANGED
@@ -60,12 +60,6 @@ def decode_status(
     if 'handle' in cluster and cluster['handle'] is not None:
         cluster['handle'] = decode_and_unpickle(cluster['handle'])
     cluster['status'] = status_lib.ClusterStatus(cluster['status'])
-    # this field is to be deprecated in the future.
-    # do not decode this field if it is not present.
-    if ('storage_mounts_metadata' in cluster and
-            cluster['storage_mounts_metadata'] is not None):
-        cluster['storage_mounts_metadata'] = decode_and_unpickle(
-            cluster['storage_mounts_metadata'])
     if 'is_managed' not in cluster:
         cluster['is_managed'] = False
     response.append(responses.StatusResponse.model_validate(cluster))
sky/server/requests/serializers/encoders.py CHANGED
@@ -60,13 +60,23 @@ def encode_status(
         clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
     response = []
     for cluster in clusters:
-        response_cluster = cluster.model_dump()
+        response_cluster = cluster.model_dump(exclude_none=True)
+        # These defaults are needed because last_use and status_updated_at
+        # used to be non-optional.
+        # TODO(syang): remove this after v0.10.7 or v0.11.0
+        if 'last_use' not in response_cluster:
+            response_cluster['last_use'] = ''
+        if 'status_updated_at' not in response_cluster:
+            response_cluster['status_updated_at'] = 0
         response_cluster['status'] = cluster['status'].value
         handle = serialize_utils.prepare_handle_for_backwards_compatibility(
             cluster['handle'])
         response_cluster['handle'] = pickle_and_encode(handle)
+        # TODO (syang) We still need to return this field for backwards
+        # compatibility.
+        # Remove this field at or after v0.10.7 or v0.11.0
         response_cluster['storage_mounts_metadata'] = pickle_and_encode(
-            response_cluster['storage_mounts_metadata'])
+            None)  # Always returns None.
         response.append(response_cluster)
     return response
 
@@ -206,10 +216,11 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
 @register_encoder('storage_ls')
 def encode_storage_ls(
         return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
-    for storage_info in return_value:
+    response_list = [storage_info.model_dump() for storage_info in return_value]
+    for storage_info in response_list:
         storage_info['status'] = storage_info['status'].value
         storage_info['store'] = [store.value for store in storage_info['store']]
-    return [storage_info.model_dump() for storage_info in return_value]
+    return response_list
 
 
 @register_encoder('volume_list')
@@ -219,11 +230,11 @@ def encode_volume_list(
 
 
 @register_encoder('job_status')
-def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
+def encode_job_status(return_value: Dict[int, Any]) -> Dict[str, str]:
     for job_id in return_value.keys():
         if return_value[job_id] is not None:
             return_value[job_id] = return_value[job_id].value
-    return return_value
+    return {str(k): v for k, v in return_value.items()}
 
 
 @register_encoder('kubernetes_node_info')
@@ -235,3 +246,19 @@ def encode_kubernetes_node_info(
 @register_encoder('endpoints')
 def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
     return {str(k): v for k, v in return_value.items()}
+
+
+@register_encoder('realtime_kubernetes_gpu_availability')
+def encode_realtime_gpu_availability(
+    return_value: List[Tuple[str, List[Any]]]
+) -> List[Tuple[str, List[List[Any]]]]:
+    # Convert RealtimeGpuAvailability namedtuples to lists
+    # for JSON serialization.
+    result = []
+    for context, gpu_list in return_value:
+        gpu_availability_list = []
+        for gpu in gpu_list:
+            gpu_list_item = [gpu.gpu, gpu.counts, gpu.capacity, gpu.available]
+            gpu_availability_list.append(gpu_list_item)
+        result.append((context, gpu_availability_list))
+    return result
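
A plausible motivation for `encode_job_status` now stringifying its keys (an inference from the orjson migration above, not stated in this release): JSON objects only permit string keys, and orjson rejects non-str dict keys by default, whereas `json.dumps` silently coerces them:

import json
import orjson

statuses = {1: 'RUNNING', 8: 'SUCCEEDED'}
print(json.dumps(statuses))  # {"1": "RUNNING", "8": "SUCCEEDED"} (keys coerced)
try:
    orjson.dumps(statuses)   # raises: Dict key must be str
except TypeError as exc:
    print(exc)
print(orjson.dumps({str(k): v for k, v in statuses.items()}))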