skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250626__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +1 -6
  3. sky/backends/backend_utils.py +26 -11
  4. sky/backends/cloud_vm_ray_backend.py +16 -5
  5. sky/client/cli/command.py +232 -9
  6. sky/client/sdk.py +195 -91
  7. sky/clouds/aws.py +10 -7
  8. sky/clouds/azure.py +10 -7
  9. sky/clouds/cloud.py +2 -0
  10. sky/clouds/cudo.py +2 -0
  11. sky/clouds/do.py +10 -7
  12. sky/clouds/fluidstack.py +2 -0
  13. sky/clouds/gcp.py +10 -7
  14. sky/clouds/hyperbolic.py +10 -7
  15. sky/clouds/ibm.py +2 -0
  16. sky/clouds/kubernetes.py +26 -9
  17. sky/clouds/lambda_cloud.py +10 -7
  18. sky/clouds/nebius.py +10 -7
  19. sky/clouds/oci.py +10 -7
  20. sky/clouds/paperspace.py +10 -7
  21. sky/clouds/runpod.py +10 -7
  22. sky/clouds/scp.py +10 -7
  23. sky/clouds/ssh.py +36 -0
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +21 -0
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/bs6UB9V4Jq10TIZ5x-kBK/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/141-fa5a20cbf401b351.js +11 -0
  31. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/25.76c246239df93d50.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +16 -0
  39. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  40. sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +6 -0
  42. sky/dashboard/out/_next/static/chunks/875.52c962183328b3f2.js +25 -0
  43. sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-9a3ce3170d2edcec.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  49. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/pages/{clusters-7e9736af1c6345a6.js → clusters-f119a5630a1efd61.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  55. sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/workspace/{new-31aa8bdcb7592635.js → new-5b59bce9eb208d84.js} +1 -1
  59. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +1 -0
  62. sky/dashboard/out/_next/static/css/52082cf558ec9705.css +3 -0
  63. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  64. sky/dashboard/out/clusters/[cluster].html +1 -1
  65. sky/dashboard/out/clusters.html +1 -1
  66. sky/dashboard/out/config.html +1 -1
  67. sky/dashboard/out/index.html +1 -1
  68. sky/dashboard/out/infra/[context].html +1 -1
  69. sky/dashboard/out/infra.html +1 -1
  70. sky/dashboard/out/jobs/[job].html +1 -1
  71. sky/dashboard/out/jobs.html +1 -1
  72. sky/dashboard/out/users.html +1 -1
  73. sky/dashboard/out/volumes.html +1 -0
  74. sky/dashboard/out/workspace/new.html +1 -1
  75. sky/dashboard/out/workspaces/[name].html +1 -1
  76. sky/dashboard/out/workspaces.html +1 -1
  77. sky/data/storage_utils.py +2 -4
  78. sky/exceptions.py +15 -0
  79. sky/execution.py +5 -0
  80. sky/global_user_state.py +129 -0
  81. sky/jobs/client/sdk.py +13 -11
  82. sky/jobs/server/core.py +4 -0
  83. sky/models.py +16 -0
  84. sky/provision/__init__.py +26 -0
  85. sky/provision/kubernetes/__init__.py +3 -0
  86. sky/provision/kubernetes/instance.py +38 -77
  87. sky/provision/kubernetes/utils.py +70 -4
  88. sky/provision/kubernetes/volume.py +147 -0
  89. sky/resources.py +20 -76
  90. sky/serve/client/sdk.py +13 -13
  91. sky/serve/server/core.py +5 -1
  92. sky/server/common.py +40 -5
  93. sky/server/constants.py +5 -1
  94. sky/server/metrics.py +105 -0
  95. sky/server/requests/executor.py +30 -14
  96. sky/server/requests/payloads.py +16 -0
  97. sky/server/requests/requests.py +35 -1
  98. sky/server/rest.py +153 -0
  99. sky/server/server.py +70 -43
  100. sky/server/state.py +20 -0
  101. sky/server/stream_utils.py +8 -3
  102. sky/server/uvicorn.py +153 -13
  103. sky/setup_files/dependencies.py +2 -0
  104. sky/skylet/constants.py +19 -3
  105. sky/skypilot_config.py +3 -0
  106. sky/ssh_node_pools/__init__.py +1 -0
  107. sky/ssh_node_pools/core.py +133 -0
  108. sky/ssh_node_pools/server.py +232 -0
  109. sky/task.py +141 -18
  110. sky/templates/kubernetes-ray.yml.j2 +30 -1
  111. sky/users/permission.py +2 -0
  112. sky/utils/context.py +3 -1
  113. sky/utils/kubernetes/deploy_remote_cluster.py +12 -185
  114. sky/utils/kubernetes/ssh_utils.py +221 -0
  115. sky/utils/resources_utils.py +66 -0
  116. sky/utils/rich_utils.py +6 -0
  117. sky/utils/schemas.py +146 -3
  118. sky/utils/status_lib.py +10 -0
  119. sky/utils/validator.py +11 -1
  120. sky/volumes/__init__.py +0 -0
  121. sky/volumes/client/__init__.py +0 -0
  122. sky/volumes/client/sdk.py +64 -0
  123. sky/volumes/server/__init__.py +0 -0
  124. sky/volumes/server/core.py +199 -0
  125. sky/volumes/server/server.py +85 -0
  126. sky/volumes/utils.py +158 -0
  127. sky/volumes/volume.py +198 -0
  128. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/METADATA +2 -1
  129. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/RECORD +135 -115
  130. sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +0 -6
  133. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
  134. sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/513.309df9e18a9ff005.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  139. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  140. sky/dashboard/out/_next/static/chunks/843-bde186946d353355.js +0 -11
  141. sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/973-56412c7976b4655b.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  145. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
  151. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-ecc5a7003776cfa7.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
  156. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
  157. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
  158. /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → bs6UB9V4Jq10TIZ5x-kBK}/_ssgManifest.js +0 -0
  159. /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
  160. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/WHEEL +0 -0
  161. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/entry_points.txt +0 -0
  162. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/licenses/LICENSE +0 -0
  163. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/top_level.txt +0 -0
sky/server/requests/requests.py CHANGED
@@ -38,6 +38,7 @@ REQUEST_TABLE = 'requests'
 COL_CLUSTER_NAME = 'cluster_name'
 COL_USER_ID = 'user_id'
 COL_STATUS_MSG = 'status_msg'
+COL_SHOULD_RETRY = 'should_retry'
 REQUEST_LOG_PATH_PREFIX = '~/sky_logs/api_server/requests'
 
 # TODO(zhwu): For scalability, there are several TODOs:
@@ -86,6 +87,7 @@ REQUEST_COLUMNS = [
     'schedule_type',
     COL_USER_ID,
     COL_STATUS_MSG,
+    COL_SHOULD_RETRY,
 ]
 
 
@@ -115,6 +117,7 @@ class RequestPayload:
     # Resources the request operates on.
     cluster_name: Optional[str] = None
     status_msg: Optional[str] = None
+    should_retry: bool = False
 
 
 @dataclasses.dataclass
@@ -137,6 +140,8 @@ class Request:
     cluster_name: Optional[str] = None
     # Status message of the request, indicates the reason of current status.
    status_msg: Optional[str] = None
+    # Whether the request should be retried.
+    should_retry: bool = False
 
     @property
     def log_path(self) -> pathlib.Path:
@@ -222,6 +227,7 @@ class Request:
            user_name=user_name,
            cluster_name=self.cluster_name,
            status_msg=self.status_msg,
+           should_retry=self.should_retry,
         )
 
     def encode(self) -> RequestPayload:
@@ -243,6 +249,7 @@ class Request:
                user_id=self.user_id,
                cluster_name=self.cluster_name,
                status_msg=self.status_msg,
+               should_retry=self.should_retry,
             )
         except (TypeError, ValueError) as e:
             # The error is unexpected, so we don't suppress the stack trace.
@@ -274,6 +281,7 @@ class Request:
                user_id=payload.user_id,
                cluster_name=payload.cluster_name,
                status_msg=payload.status_msg,
+               should_retry=payload.should_retry,
             )
         except (TypeError, ValueError) as e:
             logger.error(
@@ -327,6 +335,24 @@ def refresh_cluster_status_event():
         time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
 
 
+def refresh_volume_status_event():
+    """Periodically refresh the volume status."""
+    # pylint: disable=import-outside-toplevel
+    from sky.volumes.server import core
+
+    # Disable logging for periodic refresh to avoid the usage message being
+    # sent multiple times.
+    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
+
+    while True:
+        logger.info('=== Refreshing volume status ===')
+        core.volume_refresh()
+        logger.info('Volume status refreshed. Sleeping '
+                    f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
+                    ' seconds for the next refresh...\n')
+        time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
+
+
 def managed_job_status_refresh_event():
     """Refresh the managed job status for controller consolidation mode."""
     # pylint: disable=import-outside-toplevel
@@ -362,6 +388,10 @@ INTERNAL_REQUEST_DAEMONS = [
     InternalRequestDaemon(id='skypilot-status-refresh-daemon',
                           name='status',
                           event_fn=refresh_cluster_status_event),
+    # Volume status refresh daemon to update the volume status periodically.
+    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
+                          name='volume',
+                          event_fn=refresh_volume_status_event),
     InternalRequestDaemon(id='managed-job-status-refresh-daemon',
                           name='managed-job-status',
                           event_fn=managed_job_status_refresh_event),
@@ -446,10 +476,14 @@ def create_table(cursor, conn):
         {COL_CLUSTER_NAME} TEXT,
         schedule_type TEXT,
         {COL_USER_ID} TEXT,
-        {COL_STATUS_MSG} TEXT)""")
+        {COL_STATUS_MSG} TEXT,
+        {COL_SHOULD_RETRY} INTEGER
+        )""")
 
     db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_STATUS_MSG,
                                  'TEXT')
+    db_utils.add_column_to_table(cursor, conn, REQUEST_TABLE, COL_SHOULD_RETRY,
+                                 'INTEGER')
 
 
 _DB = None
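
Note on the schema change above: `create_table` now includes the `should_retry` column for freshly created databases, while `db_utils.add_column_to_table` upgrades existing ones in place. The snippet below is a minimal standalone sketch of that idempotent add-column pattern using plain sqlite3; it is illustrative only and is not SkyPilot's actual `db_utils` helper.

# Illustrative only: an idempotent "add column if missing" helper in plain
# sqlite3, mirroring the pattern above (CREATE TABLE for fresh databases plus
# an add-column call for existing ones). Not SkyPilot's db_utils.
import sqlite3


def add_column_if_missing(conn: sqlite3.Connection, table: str, column: str,
                          col_type: str) -> None:
    """Add `column` to `table` unless it already exists."""
    cols = {row[1] for row in conn.execute(f'PRAGMA table_info({table})')}
    if column not in cols:
        conn.execute(f'ALTER TABLE {table} ADD COLUMN {column} {col_type}')
        conn.commit()


conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE requests (request_id TEXT, status_msg TEXT)')
# Simulate upgrading an existing database to the new schema.
add_column_if_missing(conn, 'requests', 'should_retry', 'INTEGER')
add_column_if_missing(conn, 'requests', 'should_retry', 'INTEGER')  # No-op.
print([row[1] for row in conn.execute('PRAGMA table_info(requests)')])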
sky/server/rest.py ADDED
@@ -0,0 +1,153 @@
+"""REST API client of SkyPilot API server"""
+
+import contextlib
+import contextvars
+import functools
+import time
+import typing
+from typing import Any, Callable, cast, Optional, TypeVar
+
+import colorama
+
+from sky import exceptions
+from sky import sky_logging
+from sky.adaptors import common as adaptors_common
+from sky.utils import common_utils
+from sky.utils import rich_utils
+from sky.utils import ux_utils
+
+logger = sky_logging.init_logger(__name__)
+
+if typing.TYPE_CHECKING:
+    import requests
+
+else:
+    requests = adaptors_common.LazyImport('requests')
+
+F = TypeVar('F', bound=Callable[..., Any])
+
+_RETRY_CONTEXT = contextvars.ContextVar('retry_context', default=None)
+
+
+class RetryContext:
+
+    def __init__(self):
+        self.line_processed = 0
+
+
+def retry_on_server_unavailable(max_wait_seconds: int = 600,
+                                initial_backoff: float = 5.0,
+                                max_backoff_factor: int = 5):
+    """Decorator that retries a function when ServerTemporarilyUnavailableError
+    is caught.
+
+    Args:
+        max_wait_seconds: Maximum number of seconds to wait for the server to
+            be healthy
+        initial_backoff: Initial backoff time in seconds
+        max_backoff_factor: Maximum backoff factor for exponential backoff
+
+    Notes(dev):
+        This decorator is mainly used in two scenarios:
+        1. Decorate a Restful API call to make the API call wait for server
+           recovery when server is temporarily unavailable. APIs like /api/get
+           and /api/stream should not be retried since sending them to a new
+           replica of API server will not work.
+        2. Decorate a SDK function to make the entire SDK function call get
+           retried when /api/get or /logs raises a retryable error. This
+           is typically triggered by a graceful upgrade of the API server,
+           where the pending requests and logs requests will be interrupted.
+    """
+
+    def decorator(func: F) -> F:
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs) -> Any:
+            msg = (
+                f'{colorama.Fore.YELLOW}API server is temporarily unavailable: '
+                'upgrade in progress. Waiting to resume...'
+                f'{colorama.Style.RESET_ALL}')
+            backoff = common_utils.Backoff(
+                initial_backoff=initial_backoff,
+                max_backoff_factor=max_backoff_factor)
+            start_time = time.time()
+            attempt = 0
+
+            with _retry_in_context():
+                while True:
+                    attempt += 1
+                    try:
+                        return func(*args, **kwargs)
+                    except exceptions.ServerTemporarilyUnavailableError as e:
+                        # This will cause the status spinner being stopped and
+                        # restarted in every retry loop. But it is necessary to
+                        # stop the status spinner before retrying func() to
+                        # avoid the status spinner get stuck if the func() runs
+                        # for a long time without update status, e.g. sky logs.
+                        with rich_utils.client_status(msg):
+                            if time.time() - start_time > max_wait_seconds:
+                                # pylint: disable=line-too-long
+                                raise exceptions.ServerTemporarilyUnavailableError(
+                                    'Timeout waiting for the API server to be '
+                                    f'available after {max_wait_seconds}s.') \
+                                    from e
+
+                            sleep_time = backoff.current_backoff()
+                            time.sleep(sleep_time)
+                            logger.debug('The API server is unavailable. '
+                                         f'Retrying {func.__name__} '
+                                         f'(attempt {attempt}, '
+                                         f'backoff {sleep_time}s).')
+
+        return cast(F, wrapper)
+
+    return decorator
+
+
+@contextlib.contextmanager
+def _retry_in_context():
+    token = _RETRY_CONTEXT.set(RetryContext())
+    try:
+        yield
+    finally:
+        _RETRY_CONTEXT.reset(token)
+
+
+def get_retry_context() -> Optional[RetryContext]:
+    return _RETRY_CONTEXT.get()
+
+
+def handle_server_unavailable(response: 'requests.Response') -> None:
+    if response.status_code == 503:
+        # TODO(aylei): Hacky, depends on how nginx controller handles backends
+        # with no ready endpoints. Should use self-defined status code or header
+        # to distinguish retryable server error from general 503 errors.
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.ServerTemporarilyUnavailableError(
+                'SkyPilot API server is temporarily unavailable. '
+                'Please try again later.')
+
+
+@retry_on_server_unavailable()
+def post(url, data=None, json=None, **kwargs) -> 'requests.Response':
+    """Send a POST request to the API server, retry on server temporarily
+    unavailable."""
+    response = requests.post(url, data=data, json=json, **kwargs)
+    handle_server_unavailable(response)
+    return response
+
+
+@retry_on_server_unavailable()
+def get(url, params=None, **kwargs) -> 'requests.Response':
+    """Send a GET request to the API server, retry on server temporarily
+    unavailable."""
+    response = requests.get(url, params=params, **kwargs)
+    handle_server_unavailable(response)
+    return response
+
+
+def get_without_retry(url, params=None, **kwargs) -> 'requests.Response':
+    """Send a GET request to the API server without retry."""
+    response = requests.get(url, params=params, **kwargs)
+    handle_server_unavailable(response)
+    return response
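
The new module above wraps `requests.get`/`requests.post` so that a 503 from the API server (e.g. during a rolling upgrade) becomes `ServerTemporarilyUnavailableError` and the call is retried with exponential backoff for up to `max_wait_seconds`. Below is a minimal standalone sketch of the same pattern, assuming only the third-party `requests` library and a made-up exception class; it omits the spinner and the contextvars-based `RetryContext` and is not the SkyPilot implementation.

# Minimal sketch: retry a request-issuing function while the server answers 503.
import functools
import time

import requests


class ServerTemporarilyUnavailable(Exception):
    pass


def retry_on_503(max_wait_seconds: float = 600, initial_backoff: float = 5.0):

    def decorator(func):

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start, backoff = time.time(), initial_backoff
            while True:
                try:
                    return func(*args, **kwargs)
                except ServerTemporarilyUnavailable:
                    if time.time() - start > max_wait_seconds:
                        raise
                    time.sleep(backoff)
                    # Cap the exponential backoff at 32x the initial value.
                    backoff = min(backoff * 2, initial_backoff * 32)

        return wrapper

    return decorator


@retry_on_503(max_wait_seconds=60)
def get(url: str, **kwargs) -> requests.Response:
    response = requests.get(url, **kwargs)
    if response.status_code == 503:
        raise ServerTemporarilyUnavailable(f'{url} returned 503')
    return response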
sky/server/server.py CHANGED
@@ -16,6 +16,7 @@ import posixpath
 import re
 import shutil
 import sys
+import threading
 from typing import Any, Dict, List, Literal, Optional, Set, Tuple
 import uuid
 import zipfile
@@ -43,12 +44,15 @@ from sky.serve.server import server as serve_rest
 from sky.server import common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
+from sky.server import metrics
+from sky.server import state
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
+from sky.ssh_node_pools import server as ssh_node_pools_rest
 from sky.usage import usage_lib
 from sky.users import permission
 from sky.users import server as users_rest
@@ -61,6 +65,7 @@ from sky.utils import dag_utils
 from sky.utils import env_options
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
+from sky.volumes.server import server as volumes_rest
 from sky.workspaces import server as workspaces_rest
 
 # pylint: disable=ungrouped-imports
@@ -378,9 +383,32 @@ class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
         return await call_next(request)
 
 
+class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware to control requests when server is shutting down."""
+
+    async def dispatch(self, request: fastapi.Request, call_next):
+        if state.get_block_requests():
+            # Allow /api/ paths to continue, which are critical to operate
+            # on-going requests but will not submit new requests.
+            if not request.url.path.startswith('/api/'):
+                # Client will retry on 503 error.
+                return fastapi.responses.JSONResponse(
+                    status_code=503,
+                    content={
+                        'detail': 'Server is shutting down, '
+                                  'please try again later.'
+                    })
+
+        return await call_next(request)
+
+
 app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
+# Use environment variable to make the metrics middleware optional.
+if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
+    app.add_middleware(metrics.PrometheusMiddleware)
 app.add_middleware(RBACMiddleware)
 app.add_middleware(InternalDashboardPrefixMiddleware)
+app.add_middleware(GracefulShutdownMiddleware)
 app.add_middleware(PathCleanMiddleware)
 app.add_middleware(CacheControlStaticMiddleware)
 app.add_middleware(
@@ -404,6 +432,10 @@ app.include_router(users_rest.router, prefix='/users', tags=['users'])
 app.include_router(workspaces_rest.router,
                    prefix='/workspaces',
                    tags=['workspaces'])
+app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
+app.include_router(ssh_node_pools_rest.router,
+                   prefix='/ssh_node_pools',
+                   tags=['ssh_node_pools'])
 
 
 @app.get('/token')
@@ -564,6 +596,8 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
         ctx.override_envs(validate_body.env_vars)
 
     def validate_dag(dag: dag_utils.dag_lib.Dag):
+        # Resolve the volumes before admin policy and validation.
+        dag.resolve_and_validate_volumes()
         # TODO: Admin policy may contain arbitrary code, which may be expensive
         # to run and may block the server thread. However, moving it into the
         # executor adds a ~150ms penalty on the local API server because of
@@ -826,6 +860,10 @@ async def status(
     status_body: payloads.StatusBody = payloads.StatusBody()
 ) -> None:
     """Gets cluster statuses."""
+    if state.get_block_requests():
+        raise fastapi.HTTPException(
+            status_code=503,
+            detail='Server is shutting down, please try again later.')
     executor.schedule_request(
         request_id=request.state.request_id,
         request_name='status',
@@ -1107,33 +1145,6 @@ async def local_down(request: fastapi.Request) -> None:
     )
 
 
-@app.post('/ssh_up')
-async def ssh_up(request: fastapi.Request,
-                 ssh_up_body: payloads.SSHUpBody) -> None:
-    """Deploys a Kubernetes cluster on SSH targets."""
-    executor.schedule_request(
-        request_id=request.state.request_id,
-        request_name='ssh_up',
-        request_body=ssh_up_body,
-        func=core.ssh_up,
-        schedule_type=requests_lib.ScheduleType.LONG,
-    )
-
-
-@app.post('/ssh_down')
-async def ssh_down(request: fastapi.Request,
-                   ssh_up_body: payloads.SSHUpBody) -> None:
-    """Tears down a Kubernetes cluster on SSH targets."""
-    # We still call ssh_up but with cleanup=True
-    executor.schedule_request(
-        request_id=request.state.request_id,
-        request_name='ssh_down',
-        request_body=ssh_up_body,
-        func=core.ssh_up,  # Reuse ssh_up function with cleanup=True
-        schedule_type=requests_lib.ScheduleType.LONG,
-    )
-
-
 # === API server related APIs ===
 @app.get('/api/get')
 async def api_get(request_id: str) -> requests_lib.RequestPayload:
@@ -1145,6 +1156,10 @@ async def api_get(request_id: str) -> requests_lib.RequestPayload:
         raise fastapi.HTTPException(
             status_code=404, detail=f'Request {request_id!r} not found')
     if request_task.status > requests_lib.RequestStatus.RUNNING:
+        if request_task.should_retry:
+            raise fastapi.HTTPException(
+                status_code=503,
+                detail=f'Request {request_id!r} should be retried')
         request_error = request_task.get_error()
         if request_error is not None:
             raise fastapi.HTTPException(status_code=500,
@@ -1435,6 +1450,11 @@ async def complete_storage_name(incomplete: str,) -> List[str]:
     return global_user_state.get_storage_names_start_with(incomplete)
 
 
+@app.get('/api/completion/volume_name')
+async def complete_volume_name(incomplete: str,) -> List[str]:
+    return global_user_state.get_volume_names_start_with(incomplete)
+
+
 @app.get('/dashboard/{full_path:path}')
 async def serve_dashboard(full_path: str):
     """Serves the Next.js dashboard application.
@@ -1461,6 +1481,7 @@ async def serve_dashboard(full_path: str):
     try:
         with open(index_path, 'r', encoding='utf-8') as f:
             content = f.read()
+
         return fastapi.responses.HTMLResponse(content=content)
     except Exception as e:
         logger.error(f'Error serving dashboard: {e}')
@@ -1484,7 +1505,13 @@ if __name__ == '__main__':
     parser.add_argument('--host', default='127.0.0.1')
     parser.add_argument('--port', default=46580, type=int)
     parser.add_argument('--deploy', action='store_true')
+    # Serve metrics on a separate port to isolate it from the application APIs:
+    # metrics port will not be exposed to the public network typically.
+    parser.add_argument('--metrics-port', default=9090, type=int)
     cmd_args = parser.parse_args()
+    if cmd_args.port == cmd_args.metrics_port:
+        raise ValueError('port and metrics-port cannot be the same')
+
     # Show the privacy policy if it is not already shown. We place it here so
     # that it is shown only when the API server is started.
     usage_lib.maybe_show_privacy_policy()
@@ -1492,9 +1519,17 @@ if __name__ == '__main__':
     config = server_config.compute_server_config(cmd_args.deploy)
     num_workers = config.num_server_workers
 
-    sub_procs = []
+    queue_server: Optional[multiprocessing.Process] = None
+    workers: List[executor.RequestWorker] = []
     try:
-        sub_procs = executor.start(config)
+        if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
+            metrics_thread = threading.Thread(target=metrics.run_metrics_server,
+                                              args=(cmd_args.host,
+                                                    cmd_args.metrics_port),
+                                              daemon=True)
+            metrics_thread.start()
+        queue_server, workers = executor.start(config)
+
         logger.info(f'Starting SkyPilot API server, workers={num_workers}')
         # We don't support reload for now, since it may cause leakage of request
         # workers or interrupt running requests.
@@ -1510,17 +1545,9 @@ if __name__ == '__main__':
     finally:
         logger.info('Shutting down SkyPilot API server...')
 
-        def cleanup(proc: multiprocessing.Process) -> None:
-            try:
-                proc.terminate()
-                proc.join()
-            finally:
-                # The process may not be started yet, close it anyway.
-                proc.close()
-
-        # Terminate processes in reverse order in case dependency, especially
-        # queue server. Terminate queue server first does not affect the
-        # correctness of cleanup but introduce redundant error messages.
-        subprocess_utils.run_in_parallel(cleanup,
-                                         list(reversed(sub_procs)),
-                                         num_threads=len(sub_procs))
+        subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
+                                         workers,
+                                         num_threads=len(workers))
+        if queue_server is not None:
+            queue_server.kill()
+            queue_server.join()
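
The new `GracefulShutdownMiddleware` above rejects new work with 503 once the server starts draining, while keeping `/api/*` endpoints reachable so clients can still poll, stream, or cancel in-flight requests (the client in sky/server/rest.py then retries on 503). Below is a stripped-down sketch of the same gate with plain FastAPI, using a module-level flag and made-up endpoint names rather than SkyPilot's real routes.

# Illustrative shutdown gate: new work gets 503, /api/* stays reachable.
import fastapi
from starlette.middleware.base import BaseHTTPMiddleware

blocking = False  # In SkyPilot this flag lives in sky/server/state.py.

app = fastapi.FastAPI()


class ShutdownGate(BaseHTTPMiddleware):

    async def dispatch(self, request: fastapi.Request, call_next):
        # Reject new work while draining, but keep /api/* (get/stream/cancel)
        # available so clients can finish or retry in-flight requests.
        if blocking and not request.url.path.startswith('/api/'):
            return fastapi.responses.JSONResponse(
                status_code=503,
                content={'detail': 'Server is shutting down.'})
        return await call_next(request)


app.add_middleware(ShutdownGate)


@app.get('/api/get')
async def api_get() -> dict:
    return {'status': 'ok'}  # Still reachable while draining.


@app.post('/launch')
async def launch() -> dict:
    return {'status': 'scheduled'}  # Returns 503 once `blocking` is True.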
sky/server/state.py ADDED
@@ -0,0 +1,20 @@
+"""State for API server process."""
+
+# This state is used to block requests except /api operations, which is useful
+# when a server is shutting down: new requests will be blocked, but existing
+# requests will be allowed to finish and be operated via /api operations, e.g.
+# /api/logs, /api/cancel, etc.
+_block_requests = False
+
+
+# TODO(aylei): refactor, state should be a instance property of API server app
+# instead of a global variable.
+def get_block_requests() -> bool:
+    """Whether block requests except /api operations."""
+    return _block_requests
+
+
+def set_block_requests(shutting_down: bool) -> None:
+    """Set the API server to block requests except /api operations."""
+    global _block_requests
+    _block_requests = shutting_down
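
`_block_requests` is a plain module-level flag flipped inside the API server process. A hypothetical drain sequence built on it could look like the sketch below; the actual wiring (in sky/server/uvicorn.py, whose diff is not shown in this section) may differ, and `active_request_count` here is an assumed callable supplied by the caller.

# Hypothetical drain helper: stop accepting new requests, then wait for
# in-flight ones to finish or for the timeout to expire.
import time
from typing import Callable

from sky.server import state


def drain(active_request_count: Callable[[], int],
          poll_interval: float = 1.0,
          timeout: float = 60.0) -> None:
    state.set_block_requests(True)  # New non-/api requests now get 503.
    deadline = time.time() + timeout
    while active_request_count() > 0 and time.time() < deadline:
        time.sleep(poll_interval)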
sky/server/stream_utils.py CHANGED
@@ -155,9 +155,14 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
         if request_task.status > requests_lib.RequestStatus.RUNNING:
             if (request_task.status ==
                     requests_lib.RequestStatus.CANCELLED):
-                buffer.append(
-                    f'{request_task.name!r} request {request_id}'
-                    ' cancelled\n')
+                if request_task.should_retry:
+                    buffer.append(
+                        message_utils.encode_payload(
+                            rich_utils.Control.RETRY.encode('')))
+                else:
+                    buffer.append(
+                        f'{request_task.name!r} request {request_id}'
+                        ' cancelled\n')
             break
         if not follow:
             break
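
With this change, a streamed request that is cancelled because it should be retried (e.g. interrupted by an API server upgrade) emits an encoded RETRY control payload instead of the plain "cancelled" message, so the client can transparently re-issue the request and skip lines it has already shown (cf. `RetryContext.line_processed` in sky/server/rest.py). The sketch below is a hypothetical client-side resume loop with a stand-in sentinel string instead of SkyPilot's encoded control payload.

# Hypothetical resume loop: re-open the stream on a retry signal and skip
# lines that were already yielded.
from typing import Callable, Iterable, Iterator

RETRY_SENTINEL = '<retry>'  # Stand-in for the encoded RETRY control payload.


def stream_with_resume(open_stream: Callable[[], Iterable[str]],
                       max_attempts: int = 5) -> Iterator[str]:
    """Yield log lines, re-opening the stream when a retry signal arrives."""
    lines_shown = 0
    for _ in range(max_attempts):
        retry = False
        for i, line in enumerate(open_stream()):
            if line == RETRY_SENTINEL:
                retry = True
                break
            if i >= lines_shown:
                yield line
                lines_shown += 1
        if not retry:
            return
    raise RuntimeError('Gave up after repeated retry signals.')


# Example: a fake stream that replays from the start after a retry signal.
attempts = [['l1', 'l2', RETRY_SENTINEL], ['l1', 'l2', 'l3', 'l4']]
print(list(stream_with_resume(lambda: attempts.pop(0))))  # ['l1', 'l2', 'l3', 'l4']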