skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (120) hide show
  1. sky/__init__.py +4 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/backends/backend_utils.py +102 -8
  4. sky/backends/cloud_vm_ray_backend.py +197 -31
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +60 -77
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +19 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +14 -0
  14. sky/core.py +5 -0
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
  18. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-00c0a51d21157453.js} +1 -1
  25. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  26. sky/dashboard/out/clusters/[cluster].html +1 -1
  27. sky/dashboard/out/clusters.html +1 -1
  28. sky/dashboard/out/config.html +1 -1
  29. sky/dashboard/out/index.html +1 -1
  30. sky/dashboard/out/infra/[context].html +1 -1
  31. sky/dashboard/out/infra.html +1 -1
  32. sky/dashboard/out/jobs/[job].html +1 -1
  33. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  34. sky/dashboard/out/jobs.html +1 -1
  35. sky/dashboard/out/users.html +1 -1
  36. sky/dashboard/out/volumes.html +1 -1
  37. sky/dashboard/out/workspace/new.html +1 -1
  38. sky/dashboard/out/workspaces/[name].html +1 -1
  39. sky/dashboard/out/workspaces.html +1 -1
  40. sky/data/storage.py +11 -1
  41. sky/exceptions.py +5 -0
  42. sky/execution.py +15 -0
  43. sky/global_user_state.py +160 -2
  44. sky/jobs/constants.py +1 -1
  45. sky/jobs/controller.py +0 -1
  46. sky/jobs/recovery_strategy.py +6 -3
  47. sky/jobs/scheduler.py +23 -68
  48. sky/jobs/server/core.py +22 -12
  49. sky/jobs/state.py +6 -2
  50. sky/jobs/utils.py +17 -2
  51. sky/provision/__init__.py +4 -2
  52. sky/provision/aws/config.py +9 -0
  53. sky/provision/aws/instance.py +41 -17
  54. sky/provision/azure/instance.py +7 -4
  55. sky/provision/cudo/cudo_wrapper.py +1 -1
  56. sky/provision/cudo/instance.py +7 -4
  57. sky/provision/do/instance.py +7 -4
  58. sky/provision/fluidstack/instance.py +7 -4
  59. sky/provision/gcp/instance.py +7 -4
  60. sky/provision/hyperbolic/instance.py +7 -5
  61. sky/provision/kubernetes/instance.py +169 -6
  62. sky/provision/lambda_cloud/instance.py +7 -4
  63. sky/provision/nebius/instance.py +7 -4
  64. sky/provision/oci/instance.py +7 -4
  65. sky/provision/paperspace/instance.py +7 -5
  66. sky/provision/paperspace/utils.py +1 -1
  67. sky/provision/provisioner.py +6 -0
  68. sky/provision/runpod/instance.py +7 -4
  69. sky/provision/runpod/utils.py +1 -1
  70. sky/provision/scp/instance.py +7 -5
  71. sky/provision/vast/instance.py +7 -5
  72. sky/provision/vsphere/instance.py +7 -4
  73. sky/resources.py +1 -2
  74. sky/schemas/__init__.py +0 -0
  75. sky/schemas/api/__init__.py +0 -0
  76. sky/schemas/api/responses.py +70 -0
  77. sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
  78. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  79. sky/schemas/db/serve_state/001_initial_schema.py +1 -1
  80. sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
  81. sky/schemas/generated/__init__.py +0 -0
  82. sky/schemas/generated/autostopv1_pb2.py +36 -0
  83. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  84. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  85. sky/serve/constants.py +3 -7
  86. sky/serve/replica_managers.py +15 -16
  87. sky/serve/serve_state.py +10 -0
  88. sky/serve/serve_utils.py +58 -23
  89. sky/serve/server/impl.py +15 -19
  90. sky/serve/service.py +31 -16
  91. sky/server/server.py +20 -14
  92. sky/setup_files/dependencies.py +11 -10
  93. sky/skylet/autostop_lib.py +38 -5
  94. sky/skylet/constants.py +3 -1
  95. sky/skylet/services.py +44 -0
  96. sky/skylet/skylet.py +49 -4
  97. sky/skypilot_config.py +4 -4
  98. sky/task.py +19 -16
  99. sky/templates/aws-ray.yml.j2 +2 -2
  100. sky/templates/jobs-controller.yaml.j2 +6 -0
  101. sky/users/permission.py +1 -1
  102. sky/utils/cli_utils/status_utils.py +9 -0
  103. sky/utils/command_runner.py +1 -1
  104. sky/utils/config_utils.py +29 -5
  105. sky/utils/controller_utils.py +73 -0
  106. sky/utils/db/db_utils.py +39 -1
  107. sky/utils/db/migration_utils.py +1 -1
  108. sky/utils/schemas.py +3 -0
  109. sky/volumes/server/core.py +2 -2
  110. sky/volumes/server/server.py +2 -2
  111. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
  112. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +117 -108
  113. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
  115. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
  116. /sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
File without changes
File without changes
@@ -0,0 +1,70 @@
1
+ """Responses for the API server."""
2
+
3
+ from typing import Optional
4
+
5
+ import pydantic
6
+
7
+ from sky import models
8
+ from sky.server import common
9
+
10
+
11
+ class ResponseBaseModel(pydantic.BaseModel):
12
+ """A pydantic model that acts like a dict.
13
+
14
+ Supports the following syntax:
15
+ class SampleResponse(DictLikePayload):
16
+ field: str
17
+
18
+ response = SampleResponse(field='value')
19
+ print(response['field']) # prints 'value'
20
+ response['field'] = 'value2'
21
+ print(response['field']) # prints 'value2'
22
+ print('field' in response) # prints True
23
+
24
+ This model exists for backwards compatibility with the
25
+ old SDK that used to return a dict.
26
+
27
+ The backward compatibility may be removed
28
+ in the future.
29
+ """
30
+ # Ignore extra fields in the request body, which is useful for backward
31
+ # compatibility. The difference with `allow` is that `ignore` will not
32
+ # include the unknown fields when dump the model, i.e., we can add new
33
+ # fields to the request body without breaking the existing old API server
34
+ # where the handler function does not accept the new field in function
35
+ # signature.
36
+ model_config = pydantic.ConfigDict(extra='ignore')
37
+
38
+ # backward compatibility with dict
39
+ # TODO(syang): remove this in v0.13.0
40
+ def __getitem__(self, key):
41
+ try:
42
+ return getattr(self, key)
43
+ except AttributeError as e:
44
+ raise KeyError(key) from e
45
+
46
+ def __setitem__(self, key, value):
47
+ setattr(self, key, value)
48
+
49
+ def __contains__(self, key):
50
+ return hasattr(self, key)
51
+
52
+ def keys(self):
53
+ return self.model_dump().keys()
54
+
55
+ def values(self):
56
+ return self.model_dump().values()
57
+
58
+ def items(self):
59
+ return self.model_dump().items()
60
+
61
+
62
+ class APIHealthResponse(ResponseBaseModel):
63
+ """Response for the API health endpoint."""
64
+ status: common.ApiServerStatus
65
+ api_version: str = ''
66
+ version: str = ''
67
+ version_on_disk: str = ''
68
+ commit: str = ''
69
+ basic_auth_enabled: bool = False
70
+ user: Optional[models.User] = None
@@ -22,7 +22,7 @@ depends_on = None
22
22
  def upgrade():
23
23
  with op.get_context().autocommit_block():
24
24
  # Create any missing tables with current schema first
25
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
25
+ db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
26
26
 
27
27
  # Add all missing columns to clusters table
28
28
  # This allows each column addition to fail independently without rolling
@@ -0,0 +1,32 @@
1
+ """Columns for whether the cluster is managed.
2
+
3
+ Revision ID: 005
4
+ Revises: 004
5
+ Create Date: 2025-08-08
6
+
7
+ """
8
+ # pylint: disable=invalid-name
9
+ from typing import Sequence, Union
10
+
11
+ from alembic import op
12
+
13
+ from sky.global_user_state import Base
14
+ from sky.utils.db import db_utils
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = '005'
18
+ down_revision: Union[str, Sequence[str], None] = '004'
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
+ def upgrade():
24
+ """Add new table for cluster events."""
25
+ with op.get_context().autocommit_block():
26
+ # Add new table for cluster events.
27
+ db_utils.add_table_to_db_sqlalchemy(Base.metadata, op.get_bind(),
28
+ 'cluster_events')
29
+
30
+
31
+ def downgrade():
32
+ pass
@@ -26,7 +26,7 @@ def upgrade():
26
26
  """Create initial schema and add all backwards compatibility columns"""
27
27
  with op.get_context().autocommit_block():
28
28
  # Create all tables with their current schema
29
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
29
+ db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
30
30
 
31
31
  # Add backwards compatibility columns using helper function that matches
32
32
  # original add_column_to_table_sqlalchemy behavior exactly
@@ -26,7 +26,7 @@ def upgrade():
26
26
  """Create initial schema and add all backwards compatibility columns"""
27
27
  with op.get_context().autocommit_block():
28
28
  # Create all tables with their current schema
29
- db_utils.add_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
29
+ db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
30
30
 
31
31
  # Add backwards compatibility columns using helper function that matches
32
32
  # original add_column_to_table_sqlalchemy behavior exactly
File without changes
@@ -0,0 +1,36 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
3
+ # source: sky/schemas/generated/autostopv1.proto
4
+ # Protobuf Python Version: 5.26.1
5
+ """Generated protocol buffer code."""
6
+ from google.protobuf import descriptor as _descriptor
7
+ from google.protobuf import descriptor_pool as _descriptor_pool
8
+ from google.protobuf import symbol_database as _symbol_database
9
+ from google.protobuf.internal import builder as _builder
10
+ # @@protoc_insertion_point(imports)
11
+
12
+ _sym_db = _symbol_database.Default()
13
+
14
+
15
+
16
+
17
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n&sky/schemas/generated/autostopv1.proto\x12\x0b\x61utostop.v1\"y\n\x12SetAutostopRequest\x12\x14\n\x0cidle_minutes\x18\x01 \x01(\x05\x12\x0f\n\x07\x62\x61\x63kend\x18\x02 \x01(\t\x12.\n\x08wait_for\x18\x03 \x01(\x0e\x32\x1c.autostop.v1.AutostopWaitFor\x12\x0c\n\x04\x64own\x18\x04 \x01(\x08\"\x15\n\x13SetAutostopResponse\"\x17\n\x15IsAutostoppingRequest\"1\n\x16IsAutostoppingResponse\x12\x17\n\x0fis_autostopping\x18\x01 \x01(\x08*\x90\x01\n\x0f\x41utostopWaitFor\x12!\n\x1d\x41UTOSTOP_WAIT_FOR_UNSPECIFIED\x10\x00\x12\"\n\x1e\x41UTOSTOP_WAIT_FOR_JOBS_AND_SSH\x10\x01\x12\x1a\n\x16\x41UTOSTOP_WAIT_FOR_JOBS\x10\x02\x12\x1a\n\x16\x41UTOSTOP_WAIT_FOR_NONE\x10\x03\x32\xbe\x01\n\x0f\x41utostopService\x12P\n\x0bSetAutostop\x12\x1f.autostop.v1.SetAutostopRequest\x1a .autostop.v1.SetAutostopResponse\x12Y\n\x0eIsAutostopping\x12\".autostop.v1.IsAutostoppingRequest\x1a#.autostop.v1.IsAutostoppingResponseb\x06proto3')
18
+
19
+ _globals = globals()
20
+ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
21
+ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sky.schemas.generated.autostopv1_pb2', _globals)
22
+ if not _descriptor._USE_C_DESCRIPTORS:
23
+ DESCRIPTOR._loaded_options = None
24
+ _globals['_AUTOSTOPWAITFOR']._serialized_start=278
25
+ _globals['_AUTOSTOPWAITFOR']._serialized_end=422
26
+ _globals['_SETAUTOSTOPREQUEST']._serialized_start=55
27
+ _globals['_SETAUTOSTOPREQUEST']._serialized_end=176
28
+ _globals['_SETAUTOSTOPRESPONSE']._serialized_start=178
29
+ _globals['_SETAUTOSTOPRESPONSE']._serialized_end=199
30
+ _globals['_ISAUTOSTOPPINGREQUEST']._serialized_start=201
31
+ _globals['_ISAUTOSTOPPINGREQUEST']._serialized_end=224
32
+ _globals['_ISAUTOSTOPPINGRESPONSE']._serialized_start=226
33
+ _globals['_ISAUTOSTOPPINGRESPONSE']._serialized_end=275
34
+ _globals['_AUTOSTOPSERVICE']._serialized_start=425
35
+ _globals['_AUTOSTOPSERVICE']._serialized_end=615
36
+ # @@protoc_insertion_point(module_scope)
@@ -0,0 +1,43 @@
1
+ from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
2
+ from google.protobuf import descriptor as _descriptor
3
+ from google.protobuf import message as _message
4
+ from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
5
+
6
+ DESCRIPTOR: _descriptor.FileDescriptor
7
+
8
+ class AutostopWaitFor(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
9
+ __slots__ = ()
10
+ AUTOSTOP_WAIT_FOR_UNSPECIFIED: _ClassVar[AutostopWaitFor]
11
+ AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: _ClassVar[AutostopWaitFor]
12
+ AUTOSTOP_WAIT_FOR_JOBS: _ClassVar[AutostopWaitFor]
13
+ AUTOSTOP_WAIT_FOR_NONE: _ClassVar[AutostopWaitFor]
14
+ AUTOSTOP_WAIT_FOR_UNSPECIFIED: AutostopWaitFor
15
+ AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: AutostopWaitFor
16
+ AUTOSTOP_WAIT_FOR_JOBS: AutostopWaitFor
17
+ AUTOSTOP_WAIT_FOR_NONE: AutostopWaitFor
18
+
19
+ class SetAutostopRequest(_message.Message):
20
+ __slots__ = ("idle_minutes", "backend", "wait_for", "down")
21
+ IDLE_MINUTES_FIELD_NUMBER: _ClassVar[int]
22
+ BACKEND_FIELD_NUMBER: _ClassVar[int]
23
+ WAIT_FOR_FIELD_NUMBER: _ClassVar[int]
24
+ DOWN_FIELD_NUMBER: _ClassVar[int]
25
+ idle_minutes: int
26
+ backend: str
27
+ wait_for: AutostopWaitFor
28
+ down: bool
29
+ def __init__(self, idle_minutes: _Optional[int] = ..., backend: _Optional[str] = ..., wait_for: _Optional[_Union[AutostopWaitFor, str]] = ..., down: bool = ...) -> None: ...
30
+
31
+ class SetAutostopResponse(_message.Message):
32
+ __slots__ = ()
33
+ def __init__(self) -> None: ...
34
+
35
+ class IsAutostoppingRequest(_message.Message):
36
+ __slots__ = ()
37
+ def __init__(self) -> None: ...
38
+
39
+ class IsAutostoppingResponse(_message.Message):
40
+ __slots__ = ("is_autostopping",)
41
+ IS_AUTOSTOPPING_FIELD_NUMBER: _ClassVar[int]
42
+ is_autostopping: bool
43
+ def __init__(self, is_autostopping: bool = ...) -> None: ...
@@ -0,0 +1,146 @@
1
+ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
2
+ """Client and server classes corresponding to protobuf-defined services."""
3
+ import grpc
4
+ import warnings
5
+
6
+ from sky.schemas.generated import autostopv1_pb2 as sky_dot_schemas_dot_generated_dot_autostopv1__pb2
7
+
8
+ GRPC_GENERATED_VERSION = '1.63.0'
9
+ GRPC_VERSION = grpc.__version__
10
+ EXPECTED_ERROR_RELEASE = '1.65.0'
11
+ SCHEDULED_RELEASE_DATE = 'June 25, 2024'
12
+ _version_not_supported = False
13
+
14
+ try:
15
+ from grpc._utilities import first_version_is_lower
16
+ _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
17
+ except ImportError:
18
+ _version_not_supported = True
19
+
20
+ if _version_not_supported:
21
+ warnings.warn(
22
+ f'The grpc package installed is at version {GRPC_VERSION},'
23
+ + f' but the generated code in sky/schemas/generated/autostopv1_pb2_grpc.py depends on'
24
+ + f' grpcio>={GRPC_GENERATED_VERSION}.'
25
+ + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
26
+ + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
27
+ + f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
28
+ + f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
29
+ RuntimeWarning
30
+ )
31
+
32
+
33
+ class AutostopServiceStub(object):
34
+ """Missing associated documentation comment in .proto file."""
35
+
36
+ def __init__(self, channel):
37
+ """Constructor.
38
+
39
+ Args:
40
+ channel: A grpc.Channel.
41
+ """
42
+ self.SetAutostop = channel.unary_unary(
43
+ '/autostop.v1.AutostopService/SetAutostop',
44
+ request_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.SerializeToString,
45
+ response_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.FromString,
46
+ _registered_method=True)
47
+ self.IsAutostopping = channel.unary_unary(
48
+ '/autostop.v1.AutostopService/IsAutostopping',
49
+ request_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.SerializeToString,
50
+ response_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.FromString,
51
+ _registered_method=True)
52
+
53
+
54
+ class AutostopServiceServicer(object):
55
+ """Missing associated documentation comment in .proto file."""
56
+
57
+ def SetAutostop(self, request, context):
58
+ """Set autostop configuration for the cluster.
59
+ """
60
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
61
+ context.set_details('Method not implemented!')
62
+ raise NotImplementedError('Method not implemented!')
63
+
64
+ def IsAutostopping(self, request, context):
65
+ """Check if the cluster is currently autostopping.
66
+ """
67
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
68
+ context.set_details('Method not implemented!')
69
+ raise NotImplementedError('Method not implemented!')
70
+
71
+
72
+ def add_AutostopServiceServicer_to_server(servicer, server):
73
+ rpc_method_handlers = {
74
+ 'SetAutostop': grpc.unary_unary_rpc_method_handler(
75
+ servicer.SetAutostop,
76
+ request_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.FromString,
77
+ response_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.SerializeToString,
78
+ ),
79
+ 'IsAutostopping': grpc.unary_unary_rpc_method_handler(
80
+ servicer.IsAutostopping,
81
+ request_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.FromString,
82
+ response_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.SerializeToString,
83
+ ),
84
+ }
85
+ generic_handler = grpc.method_handlers_generic_handler(
86
+ 'autostop.v1.AutostopService', rpc_method_handlers)
87
+ server.add_generic_rpc_handlers((generic_handler,))
88
+
89
+
90
+ # This class is part of an EXPERIMENTAL API.
91
+ class AutostopService(object):
92
+ """Missing associated documentation comment in .proto file."""
93
+
94
+ @staticmethod
95
+ def SetAutostop(request,
96
+ target,
97
+ options=(),
98
+ channel_credentials=None,
99
+ call_credentials=None,
100
+ insecure=False,
101
+ compression=None,
102
+ wait_for_ready=None,
103
+ timeout=None,
104
+ metadata=None):
105
+ return grpc.experimental.unary_unary(
106
+ request,
107
+ target,
108
+ '/autostop.v1.AutostopService/SetAutostop',
109
+ sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.SerializeToString,
110
+ sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.FromString,
111
+ options,
112
+ channel_credentials,
113
+ insecure,
114
+ call_credentials,
115
+ compression,
116
+ wait_for_ready,
117
+ timeout,
118
+ metadata,
119
+ _registered_method=True)
120
+
121
+ @staticmethod
122
+ def IsAutostopping(request,
123
+ target,
124
+ options=(),
125
+ channel_credentials=None,
126
+ call_credentials=None,
127
+ insecure=False,
128
+ compression=None,
129
+ wait_for_ready=None,
130
+ timeout=None,
131
+ metadata=None):
132
+ return grpc.experimental.unary_unary(
133
+ request,
134
+ target,
135
+ '/autostop.v1.AutostopService/IsAutostopping',
136
+ sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.SerializeToString,
137
+ sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.FromString,
138
+ options,
139
+ channel_credentials,
140
+ insecure,
141
+ call_credentials,
142
+ compression,
143
+ wait_for_ready,
144
+ timeout,
145
+ metadata,
146
+ _registered_method=True)
sky/serve/constants.py CHANGED
@@ -73,13 +73,6 @@ CONTROLLER_AUTOSTOP = {
73
73
  'down': False,
74
74
  }
75
75
 
76
- # Due to the CPU/memory usage of the controller process launched with a job on
77
- # controller VM (use ray job under the hood), we need to reserve some CPU/memory
78
- # for each serve controller process.
79
- # Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16
80
- # services.
81
- CONTROLLER_MEMORY_USAGE_GB = 1.0
82
-
83
76
  # A period of time to initialize your service. Any readiness probe failures
84
77
  # during this period will be ignored.
85
78
  DEFAULT_INITIAL_DELAY_SECONDS = 1200
@@ -115,3 +108,6 @@ TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
115
108
 
116
109
  # Dummy run command for cluster pool.
117
110
  POOL_DUMMY_RUN_COMMAND = 'echo "setup done"'
111
+
112
+ # Error message for max number of services reached.
113
+ MAX_NUMBER_OF_SERVICES_REACHED_ERROR = 'Max number of services reached.'
@@ -13,16 +13,16 @@ import typing
13
13
  from typing import Any, Dict, List, Optional, Tuple
14
14
 
15
15
  import colorama
16
- import psutil
16
+ import filelock
17
17
  import requests
18
18
 
19
- import sky
20
19
  from sky import backends
21
20
  from sky import core
22
21
  from sky import exceptions
23
22
  from sky import execution
24
23
  from sky import global_user_state
25
24
  from sky import sky_logging
25
+ from sky import task as task_lib
26
26
  from sky.backends import backend_utils
27
27
  from sky.jobs import scheduler as jobs_scheduler
28
28
  from sky.serve import constants as serve_constants
@@ -41,7 +41,6 @@ from sky.utils import status_lib
41
41
  from sky.utils import ux_utils
42
42
 
43
43
  if typing.TYPE_CHECKING:
44
- from sky import resources
45
44
  from sky.serve import service_spec
46
45
 
47
46
  logger = sky_logging.init_logger(__name__)
@@ -51,10 +50,6 @@ _PROCESS_POOL_REFRESH_INTERVAL = 20
51
50
  _RETRY_INIT_GAP_SECONDS = 60
52
51
  _DEFAULT_DRAIN_SECONDS = 120
53
52
 
54
- # Since sky.launch is very resource demanding, we limit the number of
55
- # concurrent sky.launch process to avoid overloading the machine.
56
- _MAX_NUM_LAUNCH = psutil.cpu_count() * 2
57
-
58
53
 
59
54
  # TODO(tian): Combine this with
60
55
  # sky/spot/recovery_strategy.py::StrategyExecutor::launch
@@ -81,7 +76,7 @@ def launch_cluster(replica_id: int,
81
76
  try:
82
77
  config = common_utils.read_yaml(
83
78
  os.path.expanduser(service_task_yaml_path))
84
- task = sky.Task.from_yaml_config(config)
79
+ task = task_lib.Task.from_yaml_config(config)
85
80
  if resources_override is not None:
86
81
  resources = task.resources
87
82
  overrided_resources = [
@@ -177,7 +172,7 @@ def terminate_cluster(cluster_name: str,
177
172
 
178
173
  def _get_resources_ports(service_task_yaml_path: str) -> str:
179
174
  """Get the resources ports used by the task."""
180
- task = sky.Task.from_yaml(service_task_yaml_path)
175
+ task = task_lib.Task.from_yaml(service_task_yaml_path)
181
176
  # Already checked all ports are valid in sky.serve.core.up
182
177
  assert task.resources, task
183
178
  assert task.service is not None, task
@@ -195,7 +190,7 @@ def _should_use_spot(service_task_yaml_path: str,
195
190
  if use_spot_override is not None:
196
191
  assert isinstance(use_spot_override, bool)
197
192
  return use_spot_override
198
- task = sky.Task.from_yaml(service_task_yaml_path)
193
+ task = task_lib.Task.from_yaml(service_task_yaml_path)
199
194
  spot_use_resources = [
200
195
  resources for resources in task.resources if resources.use_spot
201
196
  ]
@@ -688,7 +683,7 @@ class SkyPilotReplicaManager(ReplicaManager):
688
683
  service_task_yaml_path: str) -> None:
689
684
  super().__init__(service_name, spec)
690
685
  self.service_task_yaml_path = service_task_yaml_path
691
- task = sky.Task.from_yaml(service_task_yaml_path)
686
+ task = task_lib.Task.from_yaml(service_task_yaml_path)
692
687
  self._spot_placer: Optional[spot_placer.SpotPlacer] = (
693
688
  spot_placer.SpotPlacer.from_task(spec, task))
694
689
  # TODO(tian): Store launch/down pid in the replica table, to make the
@@ -872,8 +867,9 @@ class SkyPilotReplicaManager(ReplicaManager):
872
867
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
873
868
  replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
874
869
  'replica_jobs')
875
- job_log_file_name = (controller_utils.download_and_stream_job_log(
876
- backend, handle, replica_job_logs_dir))
870
+ job_ids = ['1'] if self._is_pool else None
871
+ job_log_file_name = controller_utils.download_and_stream_job_log(
872
+ backend, handle, replica_job_logs_dir, job_ids)
877
873
  if job_log_file_name is not None:
878
874
  logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
879
875
  with open(log_file_name, 'a',
@@ -981,7 +977,9 @@ class SkyPilotReplicaManager(ReplicaManager):
981
977
  # To avoid `dictionary changed size during iteration` error.
982
978
  launch_process_pool_snapshot = list(self._launch_process_pool.items())
983
979
  for replica_id, p in launch_process_pool_snapshot:
984
- if not p.is_alive():
980
+ if p.is_alive():
981
+ continue
982
+ with filelock.FileLock(controller_utils.get_resources_lock_path()):
985
983
  info = serve_state.get_replica_info_from_id(
986
984
  self._service_name, replica_id)
987
985
  assert info is not None, replica_id
@@ -989,8 +987,7 @@ class SkyPilotReplicaManager(ReplicaManager):
989
987
  schedule_next_jobs = False
990
988
  if info.status == serve_state.ReplicaStatus.PENDING:
991
989
  # sky.launch not started yet
992
- if (serve_state.total_number_provisioning_replicas() <
993
- _MAX_NUM_LAUNCH):
990
+ if controller_utils.can_provision():
994
991
  p.start()
995
992
  info.status_property.sky_launch_status = (
996
993
  ProcessStatus.RUNNING)
@@ -1044,6 +1041,8 @@ class SkyPilotReplicaManager(ReplicaManager):
1044
1041
  self._terminate_replica(replica_id,
1045
1042
  sync_down_logs=True,
1046
1043
  replica_drain_delay_seconds=0)
1044
+ # Try schedule next job after acquiring the lock.
1045
+ jobs_scheduler.maybe_schedule_next_jobs()
1047
1046
  down_process_pool_snapshot = list(self._down_process_pool.items())
1048
1047
  for replica_id, p in down_process_pool_snapshot:
1049
1048
  if not p.is_alive():
sky/serve/serve_state.py CHANGED
@@ -502,6 +502,16 @@ def get_services() -> List[Dict[str, Any]]:
502
502
  return records
503
503
 
504
504
 
505
+ @init_db
506
+ def get_num_services() -> int:
507
+ """Get the number of services."""
508
+ assert _SQLALCHEMY_ENGINE is not None
509
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
510
+ return session.execute(
511
+ sqlalchemy.select(sqlalchemy.func.count() # pylint: disable=not-callable
512
+ ).select_from(services_table)).fetchone()[0]
513
+
514
+
505
515
  @init_db
506
516
  def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]:
507
517
  """Get all existing service records."""
sky/serve/serve_utils.py CHANGED
@@ -37,6 +37,7 @@ from sky.skylet import job_lib
37
37
  from sky.utils import annotations
38
38
  from sky.utils import command_runner
39
39
  from sky.utils import common_utils
40
+ from sky.utils import controller_utils
40
41
  from sky.utils import log_utils
41
42
  from sky.utils import message_utils
42
43
  from sky.utils import resources_utils
@@ -56,14 +57,6 @@ else:
56
57
 
57
58
  logger = sky_logging.init_logger(__name__)
58
59
 
59
-
60
- @annotations.lru_cache(scope='request')
61
- def get_num_service_threshold():
62
- """Get number of services threshold, calculating it only when needed."""
63
- system_memory_gb = psutil.virtual_memory().total // (1024**3)
64
- return system_memory_gb // constants.CONTROLLER_MEMORY_USAGE_GB
65
-
66
-
67
60
  _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
68
61
 
69
62
  # NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
@@ -259,14 +252,47 @@ def get_service_filelock_path(pool: str) -> str:
259
252
  return str(path)
260
253
 
261
254
 
255
+ def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
256
+ pool: bool) -> None:
257
+ """Validate the consolidation mode config."""
258
+ # Check whether the consolidation mode config is changed.
259
+ controller = controller_utils.get_controller_for_pool(pool).value
260
+ if current_is_consolidation_mode:
261
+ controller_cn = controller.cluster_name
262
+ if global_user_state.get_cluster_from_name(controller_cn) is not None:
263
+ with ux_utils.print_exception_no_traceback():
264
+ raise exceptions.InconsistentConsolidationModeError(
265
+ f'{colorama.Fore.RED}Consolidation mode for '
266
+ f'{controller.controller_type} is enabled, but the '
267
+ f'controller cluster {controller_cn} is still running. '
268
+ 'Please terminate the controller cluster first.'
269
+ f'{colorama.Style.RESET_ALL}')
270
+ else:
271
+ noun = 'pool' if pool else 'service'
272
+ all_services = [
273
+ svc for svc in serve_state.get_services() if svc['pool'] == pool
274
+ ]
275
+ if all_services:
276
+ with ux_utils.print_exception_no_traceback():
277
+ raise exceptions.InconsistentConsolidationModeError(
278
+ f'{colorama.Fore.RED}Consolidation mode for '
279
+ f'{controller.controller_type} is disabled, but there are '
280
+ f'still {len(all_services)} {noun}s running. Please '
281
+ f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
282
+
283
+
262
284
  @annotations.lru_cache(scope='request', maxsize=1)
263
285
  def is_consolidation_mode(pool: bool = False) -> bool:
264
286
  # Use jobs config for pool consolidation mode.
265
- controller_type = 'jobs' if pool else 'serve'
287
+ controller = controller_utils.get_controller_for_pool(pool).value
266
288
  consolidation_mode = skypilot_config.get_nested(
267
- (controller_type, 'controller', 'consolidation_mode'),
289
+ (controller.controller_type, 'controller', 'consolidation_mode'),
268
290
  default_value=False)
269
- # _check_consolidation_mode_consistency(consolidation_mode, pool)
291
+ # We should only do this check on API server, as the controller will not
292
+ # have related config and will always seemingly disabled for consolidation
293
+ # mode. Check #6611 for more details.
294
+ if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
295
+ _validate_consolidation_mode_config(consolidation_mode, pool)
270
296
  return consolidation_mode
271
297
 
272
298
 
@@ -490,6 +516,8 @@ def generate_remote_tls_certfile_name(service_name: str) -> str:
490
516
 
491
517
 
492
518
  def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
519
+ # NOTE(dev): This format is used in sky/serve/service.py::_cleanup, for
520
+ # checking replica cluster existence. Be careful when changing it.
493
521
  return f'{service_name}-{replica_id}'
494
522
 
495
523
 
@@ -762,9 +790,13 @@ def load_version_string(payload: str) -> str:
762
790
  return message_utils.decode_payload(payload)
763
791
 
764
792
 
765
- def num_replicas(service_name: str) -> int:
793
+ def get_ready_replicas(
794
+ service_name: str) -> List['replica_managers.ReplicaInfo']:
766
795
  logger.info(f'Get number of replicas for pool {service_name!r}')
767
- return len(serve_state.get_replica_infos(service_name))
796
+ return [
797
+ info for info in serve_state.get_replica_infos(service_name)
798
+ if info.status == serve_state.ReplicaStatus.READY
799
+ ]
768
800
 
769
801
 
770
802
  def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
@@ -789,12 +821,8 @@ def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
789
821
  logger.error(f'Service {service_name!r} is not a cluster pool.')
790
822
  return None
791
823
  with filelock.FileLock(get_service_filelock_path(service_name)):
792
-
793
824
  logger.debug(f'Get next cluster name for pool {service_name!r}')
794
- ready_replicas = [
795
- info for info in serve_state.get_replica_infos(service_name)
796
- if info.status == serve_state.ReplicaStatus.READY
797
- ]
825
+ ready_replicas = get_ready_replicas(service_name)
798
826
  idle_replicas: List['replica_managers.ReplicaInfo'] = []
799
827
  for replica_info in ready_replicas:
800
828
  jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
@@ -1010,11 +1038,18 @@ def wait_service_registration(service_name: str, job_id: int,
1010
1038
  lb_port = record['load_balancer_port']
1011
1039
  if lb_port is not None:
1012
1040
  return message_utils.encode_payload(lb_port)
1013
- elif len(serve_state.get_services()) >= get_num_service_threshold():
1014
- with ux_utils.print_exception_no_traceback():
1015
- raise RuntimeError('Max number of services reached. '
1016
- 'To spin up more services, please '
1017
- 'tear down some existing services.')
1041
+ else:
1042
+ controller_log_path = os.path.expanduser(
1043
+ generate_remote_controller_log_file_name(service_name))
1044
+ if os.path.exists(controller_log_path):
1045
+ with open(controller_log_path, 'r', encoding='utf-8') as f:
1046
+ log_content = f.read()
1047
+ if (constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR
1048
+ in log_content):
1049
+ with ux_utils.print_exception_no_traceback():
1050
+ raise RuntimeError('Max number of services reached. '
1051
+ 'To spin up more services, please '
1052
+ 'tear down some existing services.')
1018
1053
  elapsed = time.time() - start_time
1019
1054
  if elapsed > constants.SERVICE_REGISTER_TIMEOUT_SECONDS:
1020
1055
  # Print the controller log to help user debug.