PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (165) hide show

sky/__init__.py +2 -2
sky/admin_policy.py +16 -5
sky/backends/__init__.py +2 -1
sky/backends/backend_utils.py +38 -11
sky/backends/cloud_vm_ray_backend.py +52 -18
sky/client/cli/command.py +264 -25
sky/client/sdk.py +119 -85
sky/clouds/aws.py +10 -7
sky/clouds/azure.py +10 -7
sky/clouds/cloud.py +2 -0
sky/clouds/cudo.py +2 -0
sky/clouds/do.py +10 -7
sky/clouds/fluidstack.py +2 -0
sky/clouds/gcp.py +10 -7
sky/clouds/hyperbolic.py +10 -7
sky/clouds/ibm.py +2 -0
sky/clouds/kubernetes.py +27 -9
sky/clouds/lambda_cloud.py +10 -7
sky/clouds/nebius.py +10 -7
sky/clouds/oci.py +10 -7
sky/clouds/paperspace.py +10 -7
sky/clouds/runpod.py +10 -7
sky/clouds/scp.py +10 -7
sky/clouds/vast.py +10 -7
sky/clouds/vsphere.py +2 -0
sky/core.py +89 -15
sky/dag.py +14 -0
sky/dashboard/out/404.html +1 -1
sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
sky/dashboard/out/clusters/[cluster].html +1 -1
sky/dashboard/out/clusters.html +1 -1
sky/dashboard/out/config.html +1 -1
sky/dashboard/out/index.html +1 -1
sky/dashboard/out/infra/[context].html +1 -1
sky/dashboard/out/infra.html +1 -1
sky/dashboard/out/jobs/[job].html +1 -1
sky/dashboard/out/jobs.html +1 -1
sky/dashboard/out/users.html +1 -1
sky/dashboard/out/volumes.html +1 -0
sky/dashboard/out/workspace/new.html +1 -1
sky/dashboard/out/workspaces/[name].html +1 -1
sky/dashboard/out/workspaces.html +1 -1
sky/data/storage_utils.py +2 -4
sky/exceptions.py +26 -0
sky/execution.py +5 -0
sky/global_user_state.py +263 -20
sky/jobs/client/sdk.py +13 -12
sky/jobs/controller.py +5 -1
sky/jobs/scheduler.py +4 -3
sky/jobs/server/core.py +121 -51
sky/jobs/state.py +15 -0
sky/jobs/utils.py +114 -8
sky/models.py +16 -0
sky/provision/__init__.py +26 -0
sky/provision/kubernetes/__init__.py +3 -0
sky/provision/kubernetes/instance.py +38 -77
sky/provision/kubernetes/utils.py +52 -2
sky/provision/kubernetes/volume.py +147 -0
sky/resources.py +20 -76
sky/serve/client/sdk.py +13 -13
sky/serve/server/core.py +5 -1
sky/server/common.py +40 -5
sky/server/constants.py +5 -1
sky/server/metrics.py +105 -0
sky/server/requests/executor.py +30 -14
sky/server/requests/payloads.py +22 -3
sky/server/requests/requests.py +59 -2
sky/server/rest.py +152 -0
sky/server/server.py +70 -19
sky/server/state.py +20 -0
sky/server/stream_utils.py +8 -3
sky/server/uvicorn.py +153 -13
sky/setup_files/dependencies.py +2 -0
sky/skylet/constants.py +19 -14
sky/task.py +141 -43
sky/templates/jobs-controller.yaml.j2 +12 -1
sky/templates/kubernetes-ray.yml.j2 +31 -2
sky/users/permission.py +2 -0
sky/utils/admin_policy_utils.py +5 -1
sky/utils/cli_utils/status_utils.py +25 -17
sky/utils/command_runner.py +118 -12
sky/utils/command_runner.pyi +57 -0
sky/utils/common_utils.py +9 -1
sky/utils/context.py +3 -1
sky/utils/controller_utils.py +1 -2
sky/utils/resources_utils.py +66 -0
sky/utils/rich_utils.py +6 -0
sky/utils/schemas.py +180 -38
sky/utils/status_lib.py +10 -0
sky/utils/validator.py +11 -1
sky/volumes/__init__.py +0 -0
sky/volumes/client/__init__.py +0 -0
sky/volumes/client/sdk.py +64 -0
sky/volumes/server/__init__.py +0 -0
sky/volumes/server/core.py +199 -0
sky/volumes/server/server.py +85 -0
sky/volumes/utils.py +158 -0
sky/volumes/volume.py +198 -0
{skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
{skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
/sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
/sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
/sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
{skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
{skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0

sky/server/server.py CHANGED Viewed

@@ -16,6 +16,7 @@ import posixpath
 import re
 import shutil
 import sys
+import threading
 from typing import Any, Dict, List, Literal, Optional, Set, Tuple
 import uuid
 import zipfile
@@ -43,6 +44,8 @@ from sky.serve.server import server as serve_rest
 from sky.server import common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
+from sky.server import metrics
+from sky.server import state
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
@@ -61,6 +64,7 @@ from sky.utils import dag_utils
 from sky.utils import env_options
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
+from sky.volumes.server import server as volumes_rest
 from sky.workspaces import server as workspaces_rest
 # pylint: disable=ungrouped-imports
@@ -378,9 +382,32 @@ class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
         return await call_next(request)
+class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to control requests when server is shutting down."""
+    async def dispatch(self, request: fastapi.Request, call_next):
+        if state.get_block_requests():
+            # Allow /api/ paths to continue, which are critical to operate
+            # on-going requests but will not submit new requests.
+            if not request.url.path.startswith('/api/'):
+                # Client will retry on 503 error.
+                return fastapi.responses.JSONResponse(
+                    status_code=503,
+                    content={
+                        'detail': 'Server is shutting down, '
+                                  'please try again later.'
+                    })
+        return await call_next(request)
 app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
+# Use environment variable to make the metrics middleware optional.
+if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
+    app.add_middleware(metrics.PrometheusMiddleware)
 app.add_middleware(RBACMiddleware)
 app.add_middleware(InternalDashboardPrefixMiddleware)
+app.add_middleware(GracefulShutdownMiddleware)
 app.add_middleware(PathCleanMiddleware)
 app.add_middleware(CacheControlStaticMiddleware)
 app.add_middleware(
@@ -404,6 +431,7 @@ app.include_router(users_rest.router, prefix='/users', tags=['users'])
 app.include_router(workspaces_rest.router,
                    prefix='/workspaces',
                    tags=['workspaces'])
+app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
 @app.get('/token')
@@ -564,6 +592,8 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
     ctx.override_envs(validate_body.env_vars)
     def validate_dag(dag: dag_utils.dag_lib.Dag):
+        # Resolve the volumes before admin policy and validation.
+        dag.resolve_and_validate_volumes()
         # TODO: Admin policy may contain arbitrary code, which may be expensive
         # to run and may block the server thread. However, moving it into the
         # executor adds a ~150ms penalty on the local API server because of
@@ -826,6 +856,10 @@ async def status(
     status_body: payloads.StatusBody = payloads.StatusBody()
 ) -> None:
     """Gets cluster statuses."""
+    if state.get_block_requests():
+        raise fastapi.HTTPException(
+            status_code=503,
+            detail='Server is shutting down, please try again later.')
     executor.schedule_request(
         request_id=request.state.request_id,
         request_name='status',
@@ -1044,13 +1078,14 @@ async def download(download_body: payloads.DownloadBody) -> None:
                                     detail=f'Error creating zip file: {str(e)}')
-@app.get('/cost_report')
-async def cost_report(request: fastapi.Request) -> None:
+@app.post('/cost_report')
+async def cost_report(request: fastapi.Request,
+                      cost_report_body: payloads.CostReportBody) -> None:
     """Gets the cost report of a cluster."""
     executor.schedule_request(
         request_id=request.state.request_id,
         request_name='cost_report',
-        request_body=payloads.RequestBody(),
+        request_body=cost_report_body,
         func=core.cost_report,
         schedule_type=requests_lib.ScheduleType.SHORT,
     )
@@ -1144,6 +1179,10 @@ async def api_get(request_id: str) -> requests_lib.RequestPayload:
             raise fastapi.HTTPException(
                 status_code=404, detail=f'Request {request_id!r} not found')
         if request_task.status > requests_lib.RequestStatus.RUNNING:
+            if request_task.should_retry:
+                raise fastapi.HTTPException(
+                    status_code=503,
+                    detail=f'Request {request_id!r} should be retried')
             request_error = request_task.get_error()
             if request_error is not None:
                 raise fastapi.HTTPException(status_code=500,
@@ -1434,6 +1473,11 @@ async def complete_storage_name(incomplete: str,) -> List[str]:
     return global_user_state.get_storage_names_start_with(incomplete)
+@app.get('/api/completion/volume_name')
+async def complete_volume_name(incomplete: str,) -> List[str]:
+    return global_user_state.get_volume_names_start_with(incomplete)
 @app.get('/dashboard/{full_path:path}')
 async def serve_dashboard(full_path: str):
     """Serves the Next.js dashboard application.
@@ -1460,6 +1504,7 @@ async def serve_dashboard(full_path: str):
     try:
         with open(index_path, 'r', encoding='utf-8') as f:
             content = f.read()
         return fastapi.responses.HTMLResponse(content=content)
     except Exception as e:
         logger.error(f'Error serving dashboard: {e}')
@@ -1483,7 +1528,13 @@ if __name__ == '__main__':
     parser.add_argument('--host', default='127.0.0.1')
     parser.add_argument('--port', default=46580, type=int)
     parser.add_argument('--deploy', action='store_true')
+    # Serve metrics on a separate port to isolate it from the application APIs:
+    # metrics port will not be exposed to the public network typically.
+    parser.add_argument('--metrics-port', default=9090, type=int)
     cmd_args = parser.parse_args()
+    if cmd_args.port == cmd_args.metrics_port:
+        raise ValueError('port and metrics-port cannot be the same')
     # Show the privacy policy if it is not already shown. We place it here so
     # that it is shown only when the API server is started.
     usage_lib.maybe_show_privacy_policy()
@@ -1491,9 +1542,17 @@ if __name__ == '__main__':
     config = server_config.compute_server_config(cmd_args.deploy)
     num_workers = config.num_server_workers
-    sub_procs = []
+    queue_server: Optional[multiprocessing.Process] = None
+    workers: List[executor.RequestWorker] = []
     try:
-        sub_procs = executor.start(config)
+        if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
+            metrics_thread = threading.Thread(target=metrics.run_metrics_server,
+                                              args=(cmd_args.host,
+                                                    cmd_args.metrics_port),
+                                              daemon=True)
+            metrics_thread.start()
+        queue_server, workers = executor.start(config)
         logger.info(f'Starting SkyPilot API server, workers={num_workers}')
         # We don't support reload for now, since it may cause leakage of request
         # workers or interrupt running requests.
@@ -1509,17 +1568,9 @@ if __name__ == '__main__':
     finally:
         logger.info('Shutting down SkyPilot API server...')
-        def cleanup(proc: multiprocessing.Process) -> None:
-            try:
-                proc.terminate()
-                proc.join()
-            finally:
-                # The process may not be started yet, close it anyway.
-                proc.close()
-        # Terminate processes in reverse order in case dependency, especially
-        # queue server. Terminate queue server first does not affect the
-        # correctness of cleanup but introduce redundant error messages.
-        subprocess_utils.run_in_parallel(cleanup,
-                                         list(reversed(sub_procs)),
-                                         num_threads=len(sub_procs))
+        subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
+                                         workers,
+                                         num_threads=len(workers))
+        if queue_server is not None:
+            queue_server.kill()
+            queue_server.join()

sky/server/state.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""State for API server process."""
+# This state is used to block requests except /api operations, which is useful
+# when a server is shutting down: new requests will be blocked, but existing
+# requests will be allowed to finish and be operated via /api operations, e.g.
+# /api/logs, /api/cancel, etc.
+_block_requests = False
+# TODO(aylei): refactor, state should be an instance property of API server app
+# instead of a global variable.
+def get_block_requests() -> bool:
+    """Whether block requests except /api operations."""
+    return _block_requests
+def set_block_requests(shutting_down: bool) -> None:
+    """Set the API server to block requests except /api operations."""
+    global _block_requests
+    _block_requests = shutting_down

sky/server/stream_utils.py CHANGED Viewed

@@ -155,9 +155,14 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
                 if request_task.status > requests_lib.RequestStatus.RUNNING:
                     if (request_task.status ==
                             requests_lib.RequestStatus.CANCELLED):
-                        buffer.append(
-                            f'{request_task.name!r} request {request_id}'
-                            ' cancelled\n')
+                        if request_task.should_retry:
+                            buffer.append(
+                                message_utils.encode_payload(
+                                    rich_utils.Control.RETRY.encode('')))
+                        else:
+                            buffer.append(
+                                f'{request_task.name!r} request {request_id}'
+                                ' cancelled\n')
                     break
             if not follow:
                 break

sky/server/uvicorn.py CHANGED Viewed

@@ -3,17 +3,165 @@
 This module is a wrapper around uvicorn to customize the behavior of the
 server.
 """
-import functools
+import asyncio
 import os
+import signal
 import threading
-from typing import Optional
+import time
+from types import FrameType
+from typing import Optional, Union
+import filelock
 import uvicorn
 from uvicorn.supervisors import multiprocess
+from sky import sky_logging
+from sky.server import state
+from sky.server.requests import requests as requests_lib
+from sky.skylet import constants
 from sky.utils import context_utils
 from sky.utils import subprocess_utils
+logger = sky_logging.init_logger(__name__)
+# File lock path for coordinating graceful shutdown across processes
+_GRACEFUL_SHUTDOWN_LOCK_PATH = '/tmp/skypilot_graceful_shutdown.lock'
+# Interval to check for on-going requests.
+_WAIT_REQUESTS_INTERVAL_SECONDS = 5
+# Timeout for waiting for on-going requests to finish.
+try:
+    _WAIT_REQUESTS_TIMEOUT_SECONDS = int(
+        os.environ.get(constants.GRACE_PERIOD_SECONDS_ENV_VAR, '60'))
+except ValueError:
+    _WAIT_REQUESTS_TIMEOUT_SECONDS = 60
+# TODO(aylei): use decorator to register requests that need to be proactively
+# cancelled instead of hardcoding here.
+_RETRIABLE_REQUEST_NAMES = [
+    'sky.logs',
+    'sky.jobs.logs',
+    'sky.serve.logs',
+]
+class Server(uvicorn.Server):
+    """Server wrapper for uvicorn.
+    Extended functionalities:
+    - Handle exit signal and perform custom graceful shutdown.
+    - Run the server process in a context-aware manner.
+    """
+    def __init__(self, config: uvicorn.Config):
+        super().__init__(config=config)
+        self.exiting: bool = False
+    def handle_exit(self, sig: int, frame: Union[FrameType, None]) -> None:
+        """Handle exit signal.
+        When a server process receives a SIGTERM or SIGINT signal, a graceful
+        shutdown will be initiated. If a SIGINT signal is received again, the
+        server will be forcefully shut down.
+        """
+        if self.exiting and sig == signal.SIGINT:
+            # The server has been signaled to exit and received a SIGINT again,
+            # do force shutdown.
+            logger.info('Force shutdown.')
+            self.should_exit = True
+            super().handle_exit(sig, frame)
+            return
+        if not self.exiting:
+            self.exiting = True
+            # Perform graceful shutdown in a separate thread to avoid blocking
+            # the main thread.
+            threading.Thread(target=self._graceful_shutdown,
+                             args=(sig, frame),
+                             daemon=True).start()
+    def _graceful_shutdown(self, sig: int, frame: Union[FrameType,
+                                                        None]) -> None:
+        """Perform graceful shutdown."""
+        # Block new requests so that we can wait until all on-going requests
+        # are finished. Note that /api/$verb operations are still allowed in
+        # this stage to ensure the client can still operate the on-going
+        # requests, e.g. /api/logs, /api/cancel, etc.
+        logger.info('Block new requests being submitted in worker '
+                    f'{os.getpid()}.')
+        state.set_block_requests(True)
+        # Ensure the block-requests flag is set on all workers before next step.
+        # TODO(aylei): hacky, need a reliable solution.
+        time.sleep(1)
+        lock = filelock.FileLock(_GRACEFUL_SHUTDOWN_LOCK_PATH)
+        # Elect a coordinator process to handle on-going requests check
+        with lock.acquire():
+            logger.info(f'Worker {os.getpid()} elected as shutdown coordinator')
+            self._wait_requests()
+        logger.info('Shutting down server...')
+        self.should_exit = True
+        super().handle_exit(sig, frame)
+    def _wait_requests(self) -> None:
+        """Wait until all on-going requests are finished or cancelled."""
+        start_time = time.time()
+        while True:
+            statuses = [
+                requests_lib.RequestStatus.PENDING,
+                requests_lib.RequestStatus.RUNNING,
+            ]
+            reqs = requests_lib.get_request_tasks(status=statuses)
+            if not reqs:
+                break
+            logger.info(f'{len(reqs)} on-going requests '
+                        'found, waiting for them to finish...')
+            # Proactively cancel internal requests and logs requests since
+            # they can run for infinite time.
+            internal_request_ids = [
+                d.id for d in requests_lib.INTERNAL_REQUEST_DAEMONS
+            ]
+            if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
+                logger.warning('Timeout waiting for on-going requests to '
+                               'finish, cancelling all on-going requests.')
+                for req in reqs:
+                    self.interrupt_request_for_retry(req.request_id)
+                break
+            interrupted = 0
+            for req in reqs:
+                if req.request_id in internal_request_ids:
+                    self.interrupt_request_for_retry(req.request_id)
+                    interrupted += 1
+                elif req.name in _RETRIABLE_REQUEST_NAMES:
+                    self.interrupt_request_for_retry(req.request_id)
+                    interrupted += 1
+                # TODO(aylei): interrupt pending requests to accelerate the
+                # shutdown.
+            # If some requests are not interrupted, wait for them to finish,
+            # otherwise we just check again immediately to accelerate the
+            # shutdown process.
+            if interrupted < len(reqs):
+                time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)
+    def interrupt_request_for_retry(self, request_id: str) -> None:
+        """Interrupt a request for retry."""
+        with requests_lib.update_request(request_id) as req:
+            if req is None:
+                return
+            if req.pid is not None:
+                os.kill(req.pid, signal.SIGTERM)
+            req.status = requests_lib.RequestStatus.CANCELLED
+            req.should_retry = True
+        logger.info(
+            f'Request {request_id} interrupted and will be retried by client.')
+    def run(self, *args, **kwargs):
+        """Run the server process."""
+        context_utils.hijack_sys_attrs()
+        with self.capture_signals():
+            asyncio.run(self.serve(*args, **kwargs))
 def run(config: uvicorn.Config):
     """Run unvicorn server."""
@@ -22,28 +170,20 @@ def run(config: uvicorn.Config):
         # in uvicorn. Since we do not use reload now, simply
         # guard by an exception.
         raise ValueError('Reload is not supported yet.')
-    server = uvicorn.Server(config=config)
-    run_server_process = functools.partial(_run_server_process, server)
+    server = Server(config=config)
     try:
         if config.workers is not None and config.workers > 1:
             sock = config.bind_socket()
-            SlowStartMultiprocess(config,
-                                  target=run_server_process,
+            SlowStartMultiprocess(config, target=server.run,
                                   sockets=[sock]).run()
         else:
-            run_server_process()
+            server.run()
     finally:
         # Copied from uvicorn.run()
         if config.uds and os.path.exists(config.uds):
             os.remove(config.uds)
-def _run_server_process(server: uvicorn.Server, *args, **kwargs):
-    """Run the server process with contextually aware."""
-    context_utils.hijack_sys_attrs()
-    server.run(*args, **kwargs)
 class SlowStartMultiprocess(multiprocess.Multiprocess):
     """Uvicorn Multiprocess wrapper with slow start.

sky/setup_files/dependencies.py CHANGED Viewed

@@ -62,6 +62,8 @@ install_requires = [
     # the client-side actually not importing them.
     'casbin',
     'sqlalchemy_adapter',
+    # Required for API server metrics
+    'prometheus_client>=0.8.0',
     'passlib',
 ]

sky/skylet/constants.py CHANGED Viewed

@@ -401,6 +401,8 @@ PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
 PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
     '~/.sky/.controller_recovery_restarting_signal')
+HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/ha_recovery.log'
 # The placeholder for the local skypilot config path in file mounts for
 # controllers.
 LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
@@ -411,6 +413,8 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
 # Environment variable that is set to 'true' if this is a skypilot server.
 ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
+# Environment variable that is set to 'true' if metrics are enabled.
+ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
 # Environment variable that is set to 'true' if basic
 # authentication is enabled in the API server.
 ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
@@ -436,39 +440,40 @@ LOGGING_CONFIG_DIR = '~/.sky/logging'
 # Resources constants
 TIME_UNITS = {
-    's': 1 / 60,
-    'sec': 1 / 60,
     'm': 1,
-    'min': 1,
     'h': 60,
-    'hr': 60,
     'd': 24 * 60,
-    'day': 24 * 60,
+    'w': 7 * 24 * 60,
 }
 TIME_PATTERN: str = (
     f'^[0-9]+({"|".join([unit.lower() for unit in TIME_UNITS])})?$/i')
 MEMORY_SIZE_UNITS = {
-    'b': 1,
-    'k': 2**10,
     'kb': 2**10,
-    'm': 2**20,
+    'ki': 2**10,
     'mb': 2**20,
-    'g': 2**30,
+    'mi': 2**20,
     'gb': 2**30,
-    't': 2**40,
+    'gi': 2**30,
     'tb': 2**40,
-    'p': 2**50,
+    'ti': 2**40,
     'pb': 2**50,
+    'pi': 2**50,
 }
 MEMORY_SIZE_PATTERN = (
     '^[0-9]+('
-    f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}'
-    ')?$/i')
-MEMORY_SIZE_PLUS_PATTERN = f'{MEMORY_SIZE_PATTERN[:-3]}+?$/i'
+    f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}|'
+    f'{"|".join([unit.upper() for unit in MEMORY_SIZE_UNITS])}|'
+    f'{"|".join([unit[0].upper() + unit[1:] for unit in MEMORY_SIZE_UNITS if len(unit) > 1])}'  # pylint: disable=line-too-long
+    ')?$')
+LAST_USE_TRUNC_LENGTH = 25
 MIN_PRIORITY = -1000
 MAX_PRIORITY = 1000
 DEFAULT_PRIORITY = 0
+GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
+COST_REPORT_DEFAULT_DAYS = 30

skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl