skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (120) hide show
  1. sky/__init__.py +4 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/backends/backend_utils.py +102 -8
  4. sky/backends/cloud_vm_ray_backend.py +197 -31
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +60 -77
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +19 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +14 -0
  14. sky/core.py +5 -0
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
  18. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-00c0a51d21157453.js} +1 -1
  25. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  26. sky/dashboard/out/clusters/[cluster].html +1 -1
  27. sky/dashboard/out/clusters.html +1 -1
  28. sky/dashboard/out/config.html +1 -1
  29. sky/dashboard/out/index.html +1 -1
  30. sky/dashboard/out/infra/[context].html +1 -1
  31. sky/dashboard/out/infra.html +1 -1
  32. sky/dashboard/out/jobs/[job].html +1 -1
  33. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  34. sky/dashboard/out/jobs.html +1 -1
  35. sky/dashboard/out/users.html +1 -1
  36. sky/dashboard/out/volumes.html +1 -1
  37. sky/dashboard/out/workspace/new.html +1 -1
  38. sky/dashboard/out/workspaces/[name].html +1 -1
  39. sky/dashboard/out/workspaces.html +1 -1
  40. sky/data/storage.py +11 -1
  41. sky/exceptions.py +5 -0
  42. sky/execution.py +15 -0
  43. sky/global_user_state.py +160 -2
  44. sky/jobs/constants.py +1 -1
  45. sky/jobs/controller.py +0 -1
  46. sky/jobs/recovery_strategy.py +6 -3
  47. sky/jobs/scheduler.py +23 -68
  48. sky/jobs/server/core.py +22 -12
  49. sky/jobs/state.py +6 -2
  50. sky/jobs/utils.py +17 -2
  51. sky/provision/__init__.py +4 -2
  52. sky/provision/aws/config.py +9 -0
  53. sky/provision/aws/instance.py +41 -17
  54. sky/provision/azure/instance.py +7 -4
  55. sky/provision/cudo/cudo_wrapper.py +1 -1
  56. sky/provision/cudo/instance.py +7 -4
  57. sky/provision/do/instance.py +7 -4
  58. sky/provision/fluidstack/instance.py +7 -4
  59. sky/provision/gcp/instance.py +7 -4
  60. sky/provision/hyperbolic/instance.py +7 -5
  61. sky/provision/kubernetes/instance.py +169 -6
  62. sky/provision/lambda_cloud/instance.py +7 -4
  63. sky/provision/nebius/instance.py +7 -4
  64. sky/provision/oci/instance.py +7 -4
  65. sky/provision/paperspace/instance.py +7 -5
  66. sky/provision/paperspace/utils.py +1 -1
  67. sky/provision/provisioner.py +6 -0
  68. sky/provision/runpod/instance.py +7 -4
  69. sky/provision/runpod/utils.py +1 -1
  70. sky/provision/scp/instance.py +7 -5
  71. sky/provision/vast/instance.py +7 -5
  72. sky/provision/vsphere/instance.py +7 -4
  73. sky/resources.py +1 -2
  74. sky/schemas/__init__.py +0 -0
  75. sky/schemas/api/__init__.py +0 -0
  76. sky/schemas/api/responses.py +70 -0
  77. sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
  78. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  79. sky/schemas/db/serve_state/001_initial_schema.py +1 -1
  80. sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
  81. sky/schemas/generated/__init__.py +0 -0
  82. sky/schemas/generated/autostopv1_pb2.py +36 -0
  83. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  84. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  85. sky/serve/constants.py +3 -7
  86. sky/serve/replica_managers.py +15 -16
  87. sky/serve/serve_state.py +10 -0
  88. sky/serve/serve_utils.py +58 -23
  89. sky/serve/server/impl.py +15 -19
  90. sky/serve/service.py +31 -16
  91. sky/server/server.py +20 -14
  92. sky/setup_files/dependencies.py +11 -10
  93. sky/skylet/autostop_lib.py +38 -5
  94. sky/skylet/constants.py +3 -1
  95. sky/skylet/services.py +44 -0
  96. sky/skylet/skylet.py +49 -4
  97. sky/skypilot_config.py +4 -4
  98. sky/task.py +19 -16
  99. sky/templates/aws-ray.yml.j2 +2 -2
  100. sky/templates/jobs-controller.yaml.j2 +6 -0
  101. sky/users/permission.py +1 -1
  102. sky/utils/cli_utils/status_utils.py +9 -0
  103. sky/utils/command_runner.py +1 -1
  104. sky/utils/config_utils.py +29 -5
  105. sky/utils/controller_utils.py +73 -0
  106. sky/utils/db/db_utils.py +39 -1
  107. sky/utils/db/migration_utils.py +1 -1
  108. sky/utils/schemas.py +3 -0
  109. sky/volumes/server/core.py +2 -2
  110. sky/volumes/server/server.py +2 -2
  111. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
  112. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +117 -108
  113. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
  115. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
  116. /sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
File without changes
File without changes
@@ -0,0 +1,70 @@
1
+ """Responses for the API server."""
2
+
3
+ from typing import Optional
4
+
5
+ import pydantic
6
+
7
+ from sky import models
8
+ from sky.server import common
9
+
10
+
11
+ class ResponseBaseModel(pydantic.BaseModel):
12
+ """A pydantic model that acts like a dict.
13
+
14
+ Supports the following syntax:
15
+ class SampleResponse(DictLikePayload):
16
+ field: str
17
+
18
+ response = SampleResponse(field='value')
19
+ print(response['field']) # prints 'value'
20
+ response['field'] = 'value2'
21
+ print(response['field']) # prints 'value2'
22
+ print('field' in response) # prints True
23
+
24
+ This model exists for backwards compatibility with the
25
+ old SDK that used to return a dict.
26
+
27
+ The backward compatibility may be removed
28
+ in the future.
29
+ """
30
+ # Ignore extra fields in the request body, which is useful for backward
31
+ # compatibility. The difference with `allow` is that `ignore` will not
32
+ # include the unknown fields when dump the model, i.e., we can add new
33
+ # fields to the request body without breaking the existing old API server
34
+ # where the handler function does not accept the new field in function
35
+ # signature.
36
+ model_config = pydantic.ConfigDict(extra='ignore')
37
+
38
+ # backward compatibility with dict
39
+ # TODO(syang): remove this in v0.13.0
40
+ def __getitem__(self, key):
41
+ try:
42
+ return getattr(self, key)
43
+ except AttributeError as e:
44
+ raise KeyError(key) from e
45
+
46
+ def __setitem__(self, key, value):
47
+ setattr(self, key, value)
48
+
49
+ def __contains__(self, key):
50
+ return hasattr(self, key)
51
+
52
+ def keys(self):
53
+ return self.model_dump().keys()
54
+
55
+ def values(self):
56
+ return self.model_dump().values()
57
+
58
+ def items(self):
59
+ return self.model_dump().items()
60
+
61
+
62
+ class APIHealthResponse(ResponseBaseModel):
63
+ """Response for the API health endpoint."""
64
+ status: common.ApiServerStatus
65
+ api_version: str = ''
66
+ version: str = ''
67
+ version_on_disk: str = ''
68
+ commit: str = ''
69
+ basic_auth_enabled: bool = False
70
+ user: Optional[models.User] = None
@@ -22,7 +22,7 @@ depends_on = None
22
22
  def upgrade():
23
23
  with op.get_context().autocommit_block():
24
24
  # Create any missing tables with current schema first
25
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
25
+ db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
26
26
 
27
27
  # Add all missing columns to clusters table
28
28
  # This allows each column addition to fail independently without rolling
@@ -0,0 +1,32 @@
1
+ """Columns for whether the cluster is managed.
2
+
3
+ Revision ID: 005
4
+ Revises: 004
5
+ Create Date: 2025-08-08
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from typing import Sequence, Union
10
+
11
+ from alembic import op
12
+
13
+ from sky.global_user_state import Base
14
+ from sky.utils.db import db_utils
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = '005'
18
+ down_revision: Union[str, Sequence[str], None] = '004'
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
+ def upgrade():
24
+ """Add new table for cluster events."""
25
+ with op.get_context().autocommit_block():
26
+ # Add new table for cluster events.
27
+ db_utils.add_table_to_db_sqlalchemy(Base.metadata, op.get_bind(),
28
+ 'cluster_events')
29
+
30
+
31
+ def downgrade():
32
+ pass
@@ -26,7 +26,7 @@ def upgrade():
26
26
  """Create initial schema and add all backwards compatibility columns"""
27
27
  with op.get_context().autocommit_block():
28
28
  # Create all tables with their current schema
29
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
29
+ db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
30
30
 
31
31
  # Add backwards compatibility columns using helper function that matches
32
32
  # original add_column_to_table_sqlalchemy behavior exactly
@@ -26,7 +26,7 @@ def upgrade():
26
26
  """Create initial schema and add all backwards compatibility columns"""
27
27
  with op.get_context().autocommit_block():
28
28
  # Create all tables with their current schema
29
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
29
+ db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
30
30
 
31
31
  # Add backwards compatibility columns using helper function that matches
32
32
  # original add_column_to_table_sqlalchemy behavior exactly
File without changes
@@ -0,0 +1,36 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
3
+ # source: sky/schemas/generated/autostopv1.proto
4
+ # Protobuf Python Version: 5.26.1
5
+ """Generated protocol buffer code."""
6
+ from google.protobuf import descriptor as _descriptor
7
+ from google.protobuf import descriptor_pool as _descriptor_pool
8
+ from google.protobuf import symbol_database as _symbol_database
9
+ from google.protobuf.internal import builder as _builder
10
+ # @@protoc_insertion_point(imports)
11
+
12
+ _sym_db = _symbol_database.Default()
13
+
14
+
15
+
16
+
17
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n&sky/schemas/generated/autostopv1.proto\x12\x0b\x61utostop.v1\"y\n\x12SetAutostopRequest\x12\x14\n\x0cidle_minutes\x18\x01 \x01(\x05\x12\x0f\n\x07\x62\x61\x63kend\x18\x02 \x01(\t\x12.\n\x08wait_for\x18\x03 \x01(\x0e\x32\x1c.autostop.v1.AutostopWaitFor\x12\x0c\n\x04\x64own\x18\x04 \x01(\x08\"\x15\n\x13SetAutostopResponse\"\x17\n\x15IsAutostoppingRequest\"1\n\x16IsAutostoppingResponse\x12\x17\n\x0fis_autostopping\x18\x01 \x01(\x08*\x90\x01\n\x0f\x41utostopWaitFor\x12!\n\x1d\x41UTOSTOP_WAIT_FOR_UNSPECIFIED\x10\x00\x12\"\n\x1e\x41UTOSTOP_WAIT_FOR_JOBS_AND_SSH\x10\x01\x12\x1a\n\x16\x41UTOSTOP_WAIT_FOR_JOBS\x10\x02\x12\x1a\n\x16\x41UTOSTOP_WAIT_FOR_NONE\x10\x03\x32\xbe\x01\n\x0f\x41utostopService\x12P\n\x0bSetAutostop\x12\x1f.autostop.v1.SetAutostopRequest\x1a .autostop.v1.SetAutostopResponse\x12Y\n\x0eIsAutostopping\x12\".autostop.v1.IsAutostoppingRequest\x1a#.autostop.v1.IsAutostoppingResponseb\x06proto3')
18
+
19
+ _globals = globals()
20
+ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
21
+ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sky.schemas.generated.autostopv1_pb2', _globals)
22
+ if not _descriptor._USE_C_DESCRIPTORS:
23
+ DESCRIPTOR._loaded_options = None
24
+ _globals['_AUTOSTOPWAITFOR']._serialized_start=278
25
+ _globals['_AUTOSTOPWAITFOR']._serialized_end=422
26
+ _globals['_SETAUTOSTOPREQUEST']._serialized_start=55
27
+ _globals['_SETAUTOSTOPREQUEST']._serialized_end=176
28
+ _globals['_SETAUTOSTOPRESPONSE']._serialized_start=178
29
+ _globals['_SETAUTOSTOPRESPONSE']._serialized_end=199
30
+ _globals['_ISAUTOSTOPPINGREQUEST']._serialized_start=201
31
+ _globals['_ISAUTOSTOPPINGREQUEST']._serialized_end=224
32
+ _globals['_ISAUTOSTOPPINGRESPONSE']._serialized_start=226
33
+ _globals['_ISAUTOSTOPPINGRESPONSE']._serialized_end=275
34
+ _globals['_AUTOSTOPSERVICE']._serialized_start=425
35
+ _globals['_AUTOSTOPSERVICE']._serialized_end=615
36
+ # @@protoc_insertion_point(module_scope)
@@ -0,0 +1,43 @@
1
+ from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
2
+ from google.protobuf import descriptor as _descriptor
3
+ from google.protobuf import message as _message
4
+ from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
5
+
6
+ DESCRIPTOR: _descriptor.FileDescriptor
7
+
8
+ class AutostopWaitFor(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
9
+ __slots__ = ()
10
+ AUTOSTOP_WAIT_FOR_UNSPECIFIED: _ClassVar[AutostopWaitFor]
11
+ AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: _ClassVar[AutostopWaitFor]
12
+ AUTOSTOP_WAIT_FOR_JOBS: _ClassVar[AutostopWaitFor]
13
+ AUTOSTOP_WAIT_FOR_NONE: _ClassVar[AutostopWaitFor]
14
+ AUTOSTOP_WAIT_FOR_UNSPECIFIED: AutostopWaitFor
15
+ AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: AutostopWaitFor
16
+ AUTOSTOP_WAIT_FOR_JOBS: AutostopWaitFor
17
+ AUTOSTOP_WAIT_FOR_NONE: AutostopWaitFor
18
+
19
+ class SetAutostopRequest(_message.Message):
20
+ __slots__ = ("idle_minutes", "backend", "wait_for", "down")
21
+ IDLE_MINUTES_FIELD_NUMBER: _ClassVar[int]
22
+ BACKEND_FIELD_NUMBER: _ClassVar[int]
23
+ WAIT_FOR_FIELD_NUMBER: _ClassVar[int]
24
+ DOWN_FIELD_NUMBER: _ClassVar[int]
25
+ idle_minutes: int
26
+ backend: str
27
+ wait_for: AutostopWaitFor
28
+ down: bool
29
+ def __init__(self, idle_minutes: _Optional[int] = ..., backend: _Optional[str] = ..., wait_for: _Optional[_Union[AutostopWaitFor, str]] = ..., down: bool = ...) -> None: ...
30
+
31
+ class SetAutostopResponse(_message.Message):
32
+ __slots__ = ()
33
+ def __init__(self) -> None: ...
34
+
35
+ class IsAutostoppingRequest(_message.Message):
36
+ __slots__ = ()
37
+ def __init__(self) -> None: ...
38
+
39
+ class IsAutostoppingResponse(_message.Message):
40
+ __slots__ = ("is_autostopping",)
41
+ IS_AUTOSTOPPING_FIELD_NUMBER: _ClassVar[int]
42
+ is_autostopping: bool
43
+ def __init__(self, is_autostopping: bool = ...) -> None: ...
@@ -0,0 +1,146 @@
1
+ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
2
+ """Client and server classes corresponding to protobuf-defined services."""
3
+ import grpc
4
+ import warnings
5
+
6
+ from sky.schemas.generated import autostopv1_pb2 as sky_dot_schemas_dot_generated_dot_autostopv1__pb2
7
+
8
+ GRPC_GENERATED_VERSION = '1.63.0'
9
+ GRPC_VERSION = grpc.__version__
10
+ EXPECTED_ERROR_RELEASE = '1.65.0'
11
+ SCHEDULED_RELEASE_DATE = 'June 25, 2024'
12
+ _version_not_supported = False
13
+
14
+ try:
15
+ from grpc._utilities import first_version_is_lower
16
+ _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
17
+ except ImportError:
18
+ _version_not_supported = True
19
+
20
+ if _version_not_supported:
21
+ warnings.warn(
22
+ f'The grpc package installed is at version {GRPC_VERSION},'
23
+ + f' but the generated code in sky/schemas/generated/autostopv1_pb2_grpc.py depends on'
24
+ + f' grpcio>={GRPC_GENERATED_VERSION}.'
25
+ + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
26
+ + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
27
+ + f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
28
+ + f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
29
+ RuntimeWarning
30
+ )
31
+
32
+
33
+ class AutostopServiceStub(object):
34
+ """Missing associated documentation comment in .proto file."""
35
+
36
+ def __init__(self, channel):
37
+ """Constructor.
38
+
39
+ Args:
40
+ channel: A grpc.Channel.
41
+ """
42
+ self.SetAutostop = channel.unary_unary(
43
+ '/autostop.v1.AutostopService/SetAutostop',
44
+ request_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.SerializeToString,
45
+ response_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.FromString,
46
+ _registered_method=True)
47
+ self.IsAutostopping = channel.unary_unary(
48
+ '/autostop.v1.AutostopService/IsAutostopping',
49
+ request_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.SerializeToString,
50
+ response_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.FromString,
51
+ _registered_method=True)
52
+
53
+
54
+ class AutostopServiceServicer(object):
55
+ """Missing associated documentation comment in .proto file."""
56
+
57
+ def SetAutostop(self, request, context):
58
+ """Set autostop configuration for the cluster.
59
+ """
60
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
61
+ context.set_details('Method not implemented!')
62
+ raise NotImplementedError('Method not implemented!')
63
+
64
+ def IsAutostopping(self, request, context):
65
+ """Check if the cluster is currently autostopping.
66
+ """
67
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
68
+ context.set_details('Method not implemented!')
69
+ raise NotImplementedError('Method not implemented!')
70
+
71
+
72
+ def add_AutostopServiceServicer_to_server(servicer, server):
73
+ rpc_method_handlers = {
74
+ 'SetAutostop': grpc.unary_unary_rpc_method_handler(
75
+ servicer.SetAutostop,
76
+ request_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.FromString,
77
+ response_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.SerializeToString,
78
+ ),
79
+ 'IsAutostopping': grpc.unary_unary_rpc_method_handler(
80
+ servicer.IsAutostopping,
81
+ request_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.FromString,
82
+ response_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.SerializeToString,
83
+ ),
84
+ }
85
+ generic_handler = grpc.method_handlers_generic_handler(
86
+ 'autostop.v1.AutostopService', rpc_method_handlers)
87
+ server.add_generic_rpc_handlers((generic_handler,))
88
+
89
+
90
+ # This class is part of an EXPERIMENTAL API.
91
+ class AutostopService(object):
92
+ """Missing associated documentation comment in .proto file."""
93
+
94
+ @staticmethod
95
+ def SetAutostop(request,
96
+ target,
97
+ options=(),
98
+ channel_credentials=None,
99
+ call_credentials=None,
100
+ insecure=False,
101
+ compression=None,
102
+ wait_for_ready=None,
103
+ timeout=None,
104
+ metadata=None):
105
+ return grpc.experimental.unary_unary(
106
+ request,
107
+ target,
108
+ '/autostop.v1.AutostopService/SetAutostop',
109
+ sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.SerializeToString,
110
+ sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.FromString,
111
+ options,
112
+ channel_credentials,
113
+ insecure,
114
+ call_credentials,
115
+ compression,
116
+ wait_for_ready,
117
+ timeout,
118
+ metadata,
119
+ _registered_method=True)
120
+
121
+ @staticmethod
122
+ def IsAutostopping(request,
123
+ target,
124
+ options=(),
125
+ channel_credentials=None,
126
+ call_credentials=None,
127
+ insecure=False,
128
+ compression=None,
129
+ wait_for_ready=None,
130
+ timeout=None,
131
+ metadata=None):
132
+ return grpc.experimental.unary_unary(
133
+ request,
134
+ target,
135
+ '/autostop.v1.AutostopService/IsAutostopping',
136
+ sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.SerializeToString,
137
+ sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.FromString,
138
+ options,
139
+ channel_credentials,
140
+ insecure,
141
+ call_credentials,
142
+ compression,
143
+ wait_for_ready,
144
+ timeout,
145
+ metadata,
146
+ _registered_method=True)
sky/serve/constants.py CHANGED
@@ -73,13 +73,6 @@ CONTROLLER_AUTOSTOP = {
73
73
  'down': False,
74
74
  }
75
75
 
76
- # Due to the CPU/memory usage of the controller process launched with a job on
77
- # controller VM (use ray job under the hood), we need to reserve some CPU/memory
78
- # for each serve controller process.
79
- # Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16
80
- # services.
81
- CONTROLLER_MEMORY_USAGE_GB = 1.0
82
-
83
76
  # A period of time to initialize your service. Any readiness probe failures
84
77
  # during this period will be ignored.
85
78
  DEFAULT_INITIAL_DELAY_SECONDS = 1200
@@ -115,3 +108,6 @@ TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
115
108
 
116
109
  # Dummy run command for cluster pool.
117
110
  POOL_DUMMY_RUN_COMMAND = 'echo "setup done"'
111
+
112
+ # Error message for max number of services reached.
113
+ MAX_NUMBER_OF_SERVICES_REACHED_ERROR = 'Max number of services reached.'
@@ -13,16 +13,16 @@ import typing
13
13
  from typing import Any, Dict, List, Optional, Tuple
14
14
 
15
15
  import colorama
16
- import psutil
16
+ import filelock
17
17
  import requests
18
18
 
19
- import sky
20
19
  from sky import backends
21
20
  from sky import core
22
21
  from sky import exceptions
23
22
  from sky import execution
24
23
  from sky import global_user_state
25
24
  from sky import sky_logging
25
+ from sky import task as task_lib
26
26
  from sky.backends import backend_utils
27
27
  from sky.jobs import scheduler as jobs_scheduler
28
28
  from sky.serve import constants as serve_constants
@@ -41,7 +41,6 @@ from sky.utils import status_lib
41
41
  from sky.utils import ux_utils
42
42
 
43
43
  if typing.TYPE_CHECKING:
44
- from sky import resources
45
44
  from sky.serve import service_spec
46
45
 
47
46
  logger = sky_logging.init_logger(__name__)
@@ -51,10 +50,6 @@ _PROCESS_POOL_REFRESH_INTERVAL = 20
51
50
  _RETRY_INIT_GAP_SECONDS = 60
52
51
  _DEFAULT_DRAIN_SECONDS = 120
53
52
 
54
- # Since sky.launch is very resource demanding, we limit the number of
55
- # concurrent sky.launch process to avoid overloading the machine.
56
- _MAX_NUM_LAUNCH = psutil.cpu_count() * 2
57
-
58
53
 
59
54
  # TODO(tian): Combine this with
60
55
  # sky/spot/recovery_strategy.py::StrategyExecutor::launch
@@ -81,7 +76,7 @@ def launch_cluster(replica_id: int,
81
76
  try:
82
77
  config = common_utils.read_yaml(
83
78
  os.path.expanduser(service_task_yaml_path))
84
- task = sky.Task.from_yaml_config(config)
79
+ task = task_lib.Task.from_yaml_config(config)
85
80
  if resources_override is not None:
86
81
  resources = task.resources
87
82
  overrided_resources = [
@@ -177,7 +172,7 @@ def terminate_cluster(cluster_name: str,
177
172
 
178
173
  def _get_resources_ports(service_task_yaml_path: str) -> str:
179
174
  """Get the resources ports used by the task."""
180
- task = sky.Task.from_yaml(service_task_yaml_path)
175
+ task = task_lib.Task.from_yaml(service_task_yaml_path)
181
176
  # Already checked all ports are valid in sky.serve.core.up
182
177
  assert task.resources, task
183
178
  assert task.service is not None, task
@@ -195,7 +190,7 @@ def _should_use_spot(service_task_yaml_path: str,
195
190
  if use_spot_override is not None:
196
191
  assert isinstance(use_spot_override, bool)
197
192
  return use_spot_override
198
- task = sky.Task.from_yaml(service_task_yaml_path)
193
+ task = task_lib.Task.from_yaml(service_task_yaml_path)
199
194
  spot_use_resources = [
200
195
  resources for resources in task.resources if resources.use_spot
201
196
  ]
@@ -688,7 +683,7 @@ class SkyPilotReplicaManager(ReplicaManager):
688
683
  service_task_yaml_path: str) -> None:
689
684
  super().__init__(service_name, spec)
690
685
  self.service_task_yaml_path = service_task_yaml_path
691
- task = sky.Task.from_yaml(service_task_yaml_path)
686
+ task = task_lib.Task.from_yaml(service_task_yaml_path)
692
687
  self._spot_placer: Optional[spot_placer.SpotPlacer] = (
693
688
  spot_placer.SpotPlacer.from_task(spec, task))
694
689
  # TODO(tian): Store launch/down pid in the replica table, to make the
@@ -872,8 +867,9 @@ class SkyPilotReplicaManager(ReplicaManager):
872
867
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
873
868
  replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
874
869
  'replica_jobs')
875
- job_log_file_name = (controller_utils.download_and_stream_job_log(
876
- backend, handle, replica_job_logs_dir))
870
+ job_ids = ['1'] if self._is_pool else None
871
+ job_log_file_name = controller_utils.download_and_stream_job_log(
872
+ backend, handle, replica_job_logs_dir, job_ids)
877
873
  if job_log_file_name is not None:
878
874
  logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
879
875
  with open(log_file_name, 'a',
@@ -981,7 +977,9 @@ class SkyPilotReplicaManager(ReplicaManager):
981
977
  # To avoid `dictionary changed size during iteration` error.
982
978
  launch_process_pool_snapshot = list(self._launch_process_pool.items())
983
979
  for replica_id, p in launch_process_pool_snapshot:
984
- if not p.is_alive():
980
+ if p.is_alive():
981
+ continue
982
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
985
983
  info = serve_state.get_replica_info_from_id(
986
984
  self._service_name, replica_id)
987
985
  assert info is not None, replica_id
@@ -989,8 +987,7 @@ class SkyPilotReplicaManager(ReplicaManager):
989
987
  schedule_next_jobs = False
990
988
  if info.status == serve_state.ReplicaStatus.PENDING:
991
989
  # sky.launch not started yet
992
- if (serve_state.total_number_provisioning_replicas() <
993
- _MAX_NUM_LAUNCH):
990
+ if controller_utils.can_provision():
994
991
  p.start()
995
992
  info.status_property.sky_launch_status = (
996
993
  ProcessStatus.RUNNING)
@@ -1044,6 +1041,8 @@ class SkyPilotReplicaManager(ReplicaManager):
1044
1041
  self._terminate_replica(replica_id,
1045
1042
  sync_down_logs=True,
1046
1043
  replica_drain_delay_seconds=0)
1044
+ # Try schedule next job after acquiring the lock.
1045
+ jobs_scheduler.maybe_schedule_next_jobs()
1047
1046
  down_process_pool_snapshot = list(self._down_process_pool.items())
1048
1047
  for replica_id, p in down_process_pool_snapshot:
1049
1048
  if not p.is_alive():
sky/serve/serve_state.py CHANGED
@@ -502,6 +502,16 @@ def get_services() -> List[Dict[str, Any]]:
502
502
  return records
503
503
 
504
504
 
505
+ @init_db
506
+ def get_num_services() -> int:
507
+ """Get the number of services."""
508
+ assert _SQLALCHEMY_ENGINE is not None
509
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
510
+ return session.execute(
511
+ sqlalchemy.select(sqlalchemy.func.count() # pylint: disable=not-callable
512
+ ).select_from(services_table)).fetchone()[0]
513
+
514
+
505
515
  @init_db
506
516
  def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]:
507
517
  """Get all existing service records."""
sky/serve/serve_utils.py CHANGED
@@ -37,6 +37,7 @@ from sky.skylet import job_lib
37
37
  from sky.utils import annotations
38
38
  from sky.utils import command_runner
39
39
  from sky.utils import common_utils
40
+ from sky.utils import controller_utils
40
41
  from sky.utils import log_utils
41
42
  from sky.utils import message_utils
42
43
  from sky.utils import resources_utils
@@ -56,14 +57,6 @@ else:
56
57
 
57
58
  logger = sky_logging.init_logger(__name__)
58
59
 
59
-
60
- @annotations.lru_cache(scope='request')
61
- def get_num_service_threshold():
62
- """Get number of services threshold, calculating it only when needed."""
63
- system_memory_gb = psutil.virtual_memory().total // (1024**3)
64
- return system_memory_gb // constants.CONTROLLER_MEMORY_USAGE_GB
65
-
66
-
67
60
  _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
68
61
 
69
62
  # NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
@@ -259,14 +252,47 @@ def get_service_filelock_path(pool: str) -> str:
259
252
  return str(path)
260
253
 
261
254
 
255
+ def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
256
+ pool: bool) -> None:
257
+ """Validate the consolidation mode config."""
258
+ # Check whether the consolidation mode config is changed.
259
+ controller = controller_utils.get_controller_for_pool(pool).value
260
+ if current_is_consolidation_mode:
261
+ controller_cn = controller.cluster_name
262
+ if global_user_state.get_cluster_from_name(controller_cn) is not None:
263
+ with ux_utils.print_exception_no_traceback():
264
+ raise exceptions.InconsistentConsolidationModeError(
265
+ f'{colorama.Fore.RED}Consolidation mode for '
266
+ f'{controller.controller_type} is enabled, but the '
267
+ f'controller cluster {controller_cn} is still running. '
268
+ 'Please terminate the controller cluster first.'
269
+ f'{colorama.Style.RESET_ALL}')
270
+ else:
271
+ noun = 'pool' if pool else 'service'
272
+ all_services = [
273
+ svc for svc in serve_state.get_services() if svc['pool'] == pool
274
+ ]
275
+ if all_services:
276
+ with ux_utils.print_exception_no_traceback():
277
+ raise exceptions.InconsistentConsolidationModeError(
278
+ f'{colorama.Fore.RED}Consolidation mode for '
279
+ f'{controller.controller_type} is disabled, but there are '
280
+ f'still {len(all_services)} {noun}s running. Please '
281
+ f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
282
+
283
+
262
284
  @annotations.lru_cache(scope='request', maxsize=1)
263
285
  def is_consolidation_mode(pool: bool = False) -> bool:
264
286
  # Use jobs config for pool consolidation mode.
265
- controller_type = 'jobs' if pool else 'serve'
287
+ controller = controller_utils.get_controller_for_pool(pool).value
266
288
  consolidation_mode = skypilot_config.get_nested(
267
- (controller_type, 'controller', 'consolidation_mode'),
289
+ (controller.controller_type, 'controller', 'consolidation_mode'),
268
290
  default_value=False)
269
- # _check_consolidation_mode_consistency(consolidation_mode, pool)
291
+ # We should only do this check on API server, as the controller will not
292
+ # have related config and will always seemingly disabled for consolidation
293
+ # mode. Check #6611 for more details.
294
+ if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
295
+ _validate_consolidation_mode_config(consolidation_mode, pool)
270
296
  return consolidation_mode
271
297
 
272
298
 
@@ -490,6 +516,8 @@ def generate_remote_tls_certfile_name(service_name: str) -> str:
490
516
 
491
517
 
492
518
  def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
519
+ # NOTE(dev): This format is used in sky/serve/service.py::_cleanup, for
520
+ # checking replica cluster existence. Be careful when changing it.
493
521
  return f'{service_name}-{replica_id}'
494
522
 
495
523
 
@@ -762,9 +790,13 @@ def load_version_string(payload: str) -> str:
762
790
  return message_utils.decode_payload(payload)
763
791
 
764
792
 
765
- def num_replicas(service_name: str) -> int:
793
+ def get_ready_replicas(
794
+ service_name: str) -> List['replica_managers.ReplicaInfo']:
766
795
  logger.info(f'Get number of replicas for pool {service_name!r}')
767
- return len(serve_state.get_replica_infos(service_name))
796
+ return [
797
+ info for info in serve_state.get_replica_infos(service_name)
798
+ if info.status == serve_state.ReplicaStatus.READY
799
+ ]
768
800
 
769
801
 
770
802
  def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
@@ -789,12 +821,8 @@ def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
789
821
  logger.error(f'Service {service_name!r} is not a cluster pool.')
790
822
  return None
791
823
  with filelock.FileLock(get_service_filelock_path(service_name)):
792
-
793
824
  logger.debug(f'Get next cluster name for pool {service_name!r}')
794
- ready_replicas = [
795
- info for info in serve_state.get_replica_infos(service_name)
796
- if info.status == serve_state.ReplicaStatus.READY
797
- ]
825
+ ready_replicas = get_ready_replicas(service_name)
798
826
  idle_replicas: List['replica_managers.ReplicaInfo'] = []
799
827
  for replica_info in ready_replicas:
800
828
  jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
@@ -1010,11 +1038,18 @@ def wait_service_registration(service_name: str, job_id: int,
1010
1038
  lb_port = record['load_balancer_port']
1011
1039
  if lb_port is not None:
1012
1040
  return message_utils.encode_payload(lb_port)
1013
- elif len(serve_state.get_services()) >= get_num_service_threshold():
1014
- with ux_utils.print_exception_no_traceback():
1015
- raise RuntimeError('Max number of services reached. '
1016
- 'To spin up more services, please '
1017
- 'tear down some existing services.')
1041
+ else:
1042
+ controller_log_path = os.path.expanduser(
1043
+ generate_remote_controller_log_file_name(service_name))
1044
+ if os.path.exists(controller_log_path):
1045
+ with open(controller_log_path, 'r', encoding='utf-8') as f:
1046
+ log_content = f.read()
1047
+ if (constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR
1048
+ in log_content):
1049
+ with ux_utils.print_exception_no_traceback():
1050
+ raise RuntimeError('Max number of services reached. '
1051
+ 'To spin up more services, please '
1052
+ 'tear down some existing services.')
1018
1053
  elapsed = time.time() - start_time
1019
1054
  if elapsed > constants.SERVICE_REGISTER_TIMEOUT_SECONDS:
1020
1055
  # Print the controller log to help user debug.