skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/backends/backend_utils.py +69 -6
- sky/backends/cloud_vm_ray_backend.py +156 -25
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +40 -77
- sky/client/common.py +1 -1
- sky/client/sdk.py +19 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +14 -0
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-7fd0cf9dbecff10f.js → webpack-00c0a51d21157453.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +63 -7
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +23 -68
- sky/jobs/server/core.py +18 -12
- sky/jobs/state.py +6 -2
- sky/jobs/utils.py +8 -0
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +9 -0
- sky/provision/aws/instance.py +36 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +1 -2
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +15 -16
- sky/serve/serve_state.py +10 -0
- sky/serve/serve_utils.py +21 -20
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +31 -16
- sky/server/server.py +20 -14
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +73 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/schemas.py +3 -0
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +102 -94
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
|
|
2
|
+
"""Client and server classes corresponding to protobuf-defined services."""
|
|
3
|
+
import grpc
|
|
4
|
+
import warnings
|
|
5
|
+
|
|
6
|
+
from sky.schemas.generated import autostopv1_pb2 as sky_dot_schemas_dot_generated_dot_autostopv1__pb2
|
|
7
|
+
|
|
8
|
+
GRPC_GENERATED_VERSION = '1.63.0'
|
|
9
|
+
GRPC_VERSION = grpc.__version__
|
|
10
|
+
EXPECTED_ERROR_RELEASE = '1.65.0'
|
|
11
|
+
SCHEDULED_RELEASE_DATE = 'June 25, 2024'
|
|
12
|
+
_version_not_supported = False
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
from grpc._utilities import first_version_is_lower
|
|
16
|
+
_version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
|
|
17
|
+
except ImportError:
|
|
18
|
+
_version_not_supported = True
|
|
19
|
+
|
|
20
|
+
if _version_not_supported:
|
|
21
|
+
warnings.warn(
|
|
22
|
+
f'The grpc package installed is at version {GRPC_VERSION},'
|
|
23
|
+
+ f' but the generated code in sky/schemas/generated/autostopv1_pb2_grpc.py depends on'
|
|
24
|
+
+ f' grpcio>={GRPC_GENERATED_VERSION}.'
|
|
25
|
+
+ f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
|
|
26
|
+
+ f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
|
|
27
|
+
+ f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
|
|
28
|
+
+ f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
|
|
29
|
+
RuntimeWarning
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class AutostopServiceStub(object):
|
|
34
|
+
"""Missing associated documentation comment in .proto file."""
|
|
35
|
+
|
|
36
|
+
def __init__(self, channel):
|
|
37
|
+
"""Constructor.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
channel: A grpc.Channel.
|
|
41
|
+
"""
|
|
42
|
+
self.SetAutostop = channel.unary_unary(
|
|
43
|
+
'/autostop.v1.AutostopService/SetAutostop',
|
|
44
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.SerializeToString,
|
|
45
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.FromString,
|
|
46
|
+
_registered_method=True)
|
|
47
|
+
self.IsAutostopping = channel.unary_unary(
|
|
48
|
+
'/autostop.v1.AutostopService/IsAutostopping',
|
|
49
|
+
request_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.SerializeToString,
|
|
50
|
+
response_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.FromString,
|
|
51
|
+
_registered_method=True)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class AutostopServiceServicer(object):
|
|
55
|
+
"""Missing associated documentation comment in .proto file."""
|
|
56
|
+
|
|
57
|
+
def SetAutostop(self, request, context):
|
|
58
|
+
"""Set autostop configuration for the cluster.
|
|
59
|
+
"""
|
|
60
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
61
|
+
context.set_details('Method not implemented!')
|
|
62
|
+
raise NotImplementedError('Method not implemented!')
|
|
63
|
+
|
|
64
|
+
def IsAutostopping(self, request, context):
|
|
65
|
+
"""Check if the cluster is currently autostopping.
|
|
66
|
+
"""
|
|
67
|
+
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
|
68
|
+
context.set_details('Method not implemented!')
|
|
69
|
+
raise NotImplementedError('Method not implemented!')
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def add_AutostopServiceServicer_to_server(servicer, server):
|
|
73
|
+
rpc_method_handlers = {
|
|
74
|
+
'SetAutostop': grpc.unary_unary_rpc_method_handler(
|
|
75
|
+
servicer.SetAutostop,
|
|
76
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.FromString,
|
|
77
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.SerializeToString,
|
|
78
|
+
),
|
|
79
|
+
'IsAutostopping': grpc.unary_unary_rpc_method_handler(
|
|
80
|
+
servicer.IsAutostopping,
|
|
81
|
+
request_deserializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.FromString,
|
|
82
|
+
response_serializer=sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.SerializeToString,
|
|
83
|
+
),
|
|
84
|
+
}
|
|
85
|
+
generic_handler = grpc.method_handlers_generic_handler(
|
|
86
|
+
'autostop.v1.AutostopService', rpc_method_handlers)
|
|
87
|
+
server.add_generic_rpc_handlers((generic_handler,))
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# This class is part of an EXPERIMENTAL API.
|
|
91
|
+
class AutostopService(object):
|
|
92
|
+
"""Missing associated documentation comment in .proto file."""
|
|
93
|
+
|
|
94
|
+
@staticmethod
|
|
95
|
+
def SetAutostop(request,
|
|
96
|
+
target,
|
|
97
|
+
options=(),
|
|
98
|
+
channel_credentials=None,
|
|
99
|
+
call_credentials=None,
|
|
100
|
+
insecure=False,
|
|
101
|
+
compression=None,
|
|
102
|
+
wait_for_ready=None,
|
|
103
|
+
timeout=None,
|
|
104
|
+
metadata=None):
|
|
105
|
+
return grpc.experimental.unary_unary(
|
|
106
|
+
request,
|
|
107
|
+
target,
|
|
108
|
+
'/autostop.v1.AutostopService/SetAutostop',
|
|
109
|
+
sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopRequest.SerializeToString,
|
|
110
|
+
sky_dot_schemas_dot_generated_dot_autostopv1__pb2.SetAutostopResponse.FromString,
|
|
111
|
+
options,
|
|
112
|
+
channel_credentials,
|
|
113
|
+
insecure,
|
|
114
|
+
call_credentials,
|
|
115
|
+
compression,
|
|
116
|
+
wait_for_ready,
|
|
117
|
+
timeout,
|
|
118
|
+
metadata,
|
|
119
|
+
_registered_method=True)
|
|
120
|
+
|
|
121
|
+
@staticmethod
|
|
122
|
+
def IsAutostopping(request,
|
|
123
|
+
target,
|
|
124
|
+
options=(),
|
|
125
|
+
channel_credentials=None,
|
|
126
|
+
call_credentials=None,
|
|
127
|
+
insecure=False,
|
|
128
|
+
compression=None,
|
|
129
|
+
wait_for_ready=None,
|
|
130
|
+
timeout=None,
|
|
131
|
+
metadata=None):
|
|
132
|
+
return grpc.experimental.unary_unary(
|
|
133
|
+
request,
|
|
134
|
+
target,
|
|
135
|
+
'/autostop.v1.AutostopService/IsAutostopping',
|
|
136
|
+
sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingRequest.SerializeToString,
|
|
137
|
+
sky_dot_schemas_dot_generated_dot_autostopv1__pb2.IsAutostoppingResponse.FromString,
|
|
138
|
+
options,
|
|
139
|
+
channel_credentials,
|
|
140
|
+
insecure,
|
|
141
|
+
call_credentials,
|
|
142
|
+
compression,
|
|
143
|
+
wait_for_ready,
|
|
144
|
+
timeout,
|
|
145
|
+
metadata,
|
|
146
|
+
_registered_method=True)
|
sky/serve/constants.py
CHANGED
|
@@ -73,13 +73,6 @@ CONTROLLER_AUTOSTOP = {
|
|
|
73
73
|
'down': False,
|
|
74
74
|
}
|
|
75
75
|
|
|
76
|
-
# Due to the CPU/memory usage of the controller process launched with a job on
|
|
77
|
-
# controller VM (use ray job under the hood), we need to reserve some CPU/memory
|
|
78
|
-
# for each serve controller process.
|
|
79
|
-
# Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16
|
|
80
|
-
# services.
|
|
81
|
-
CONTROLLER_MEMORY_USAGE_GB = 1.0
|
|
82
|
-
|
|
83
76
|
# A period of time to initialize your service. Any readiness probe failures
|
|
84
77
|
# during this period will be ignored.
|
|
85
78
|
DEFAULT_INITIAL_DELAY_SECONDS = 1200
|
|
@@ -115,3 +108,6 @@ TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
|
|
|
115
108
|
|
|
116
109
|
# Dummy run command for cluster pool.
|
|
117
110
|
POOL_DUMMY_RUN_COMMAND = 'echo "setup done"'
|
|
111
|
+
|
|
112
|
+
# Error message for max number of services reached.
|
|
113
|
+
MAX_NUMBER_OF_SERVICES_REACHED_ERROR = 'Max number of services reached.'
|
sky/serve/replica_managers.py
CHANGED
|
@@ -13,16 +13,16 @@ import typing
|
|
|
13
13
|
from typing import Any, Dict, List, Optional, Tuple
|
|
14
14
|
|
|
15
15
|
import colorama
|
|
16
|
-
import
|
|
16
|
+
import filelock
|
|
17
17
|
import requests
|
|
18
18
|
|
|
19
|
-
import sky
|
|
20
19
|
from sky import backends
|
|
21
20
|
from sky import core
|
|
22
21
|
from sky import exceptions
|
|
23
22
|
from sky import execution
|
|
24
23
|
from sky import global_user_state
|
|
25
24
|
from sky import sky_logging
|
|
25
|
+
from sky import task as task_lib
|
|
26
26
|
from sky.backends import backend_utils
|
|
27
27
|
from sky.jobs import scheduler as jobs_scheduler
|
|
28
28
|
from sky.serve import constants as serve_constants
|
|
@@ -41,7 +41,6 @@ from sky.utils import status_lib
|
|
|
41
41
|
from sky.utils import ux_utils
|
|
42
42
|
|
|
43
43
|
if typing.TYPE_CHECKING:
|
|
44
|
-
from sky import resources
|
|
45
44
|
from sky.serve import service_spec
|
|
46
45
|
|
|
47
46
|
logger = sky_logging.init_logger(__name__)
|
|
@@ -51,10 +50,6 @@ _PROCESS_POOL_REFRESH_INTERVAL = 20
|
|
|
51
50
|
_RETRY_INIT_GAP_SECONDS = 60
|
|
52
51
|
_DEFAULT_DRAIN_SECONDS = 120
|
|
53
52
|
|
|
54
|
-
# Since sky.launch is very resource demanding, we limit the number of
|
|
55
|
-
# concurrent sky.launch process to avoid overloading the machine.
|
|
56
|
-
_MAX_NUM_LAUNCH = psutil.cpu_count() * 2
|
|
57
|
-
|
|
58
53
|
|
|
59
54
|
# TODO(tian): Combine this with
|
|
60
55
|
# sky/spot/recovery_strategy.py::StrategyExecutor::launch
|
|
@@ -81,7 +76,7 @@ def launch_cluster(replica_id: int,
|
|
|
81
76
|
try:
|
|
82
77
|
config = common_utils.read_yaml(
|
|
83
78
|
os.path.expanduser(service_task_yaml_path))
|
|
84
|
-
task =
|
|
79
|
+
task = task_lib.Task.from_yaml_config(config)
|
|
85
80
|
if resources_override is not None:
|
|
86
81
|
resources = task.resources
|
|
87
82
|
overrided_resources = [
|
|
@@ -177,7 +172,7 @@ def terminate_cluster(cluster_name: str,
|
|
|
177
172
|
|
|
178
173
|
def _get_resources_ports(service_task_yaml_path: str) -> str:
|
|
179
174
|
"""Get the resources ports used by the task."""
|
|
180
|
-
task =
|
|
175
|
+
task = task_lib.Task.from_yaml(service_task_yaml_path)
|
|
181
176
|
# Already checked all ports are valid in sky.serve.core.up
|
|
182
177
|
assert task.resources, task
|
|
183
178
|
assert task.service is not None, task
|
|
@@ -195,7 +190,7 @@ def _should_use_spot(service_task_yaml_path: str,
|
|
|
195
190
|
if use_spot_override is not None:
|
|
196
191
|
assert isinstance(use_spot_override, bool)
|
|
197
192
|
return use_spot_override
|
|
198
|
-
task =
|
|
193
|
+
task = task_lib.Task.from_yaml(service_task_yaml_path)
|
|
199
194
|
spot_use_resources = [
|
|
200
195
|
resources for resources in task.resources if resources.use_spot
|
|
201
196
|
]
|
|
@@ -688,7 +683,7 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
688
683
|
service_task_yaml_path: str) -> None:
|
|
689
684
|
super().__init__(service_name, spec)
|
|
690
685
|
self.service_task_yaml_path = service_task_yaml_path
|
|
691
|
-
task =
|
|
686
|
+
task = task_lib.Task.from_yaml(service_task_yaml_path)
|
|
692
687
|
self._spot_placer: Optional[spot_placer.SpotPlacer] = (
|
|
693
688
|
spot_placer.SpotPlacer.from_task(spec, task))
|
|
694
689
|
# TODO(tian): Store launch/down pid in the replica table, to make the
|
|
@@ -872,8 +867,9 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
872
867
|
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
873
868
|
replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
|
|
874
869
|
'replica_jobs')
|
|
875
|
-
|
|
876
|
-
|
|
870
|
+
job_ids = ['1'] if self._is_pool else None
|
|
871
|
+
job_log_file_name = controller_utils.download_and_stream_job_log(
|
|
872
|
+
backend, handle, replica_job_logs_dir, job_ids)
|
|
877
873
|
if job_log_file_name is not None:
|
|
878
874
|
logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
|
|
879
875
|
with open(log_file_name, 'a',
|
|
@@ -981,7 +977,9 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
981
977
|
# To avoid `dictionary changed size during iteration` error.
|
|
982
978
|
launch_process_pool_snapshot = list(self._launch_process_pool.items())
|
|
983
979
|
for replica_id, p in launch_process_pool_snapshot:
|
|
984
|
-
if
|
|
980
|
+
if p.is_alive():
|
|
981
|
+
continue
|
|
982
|
+
with filelock.FileLock(controller_utils.get_resources_lock_path()):
|
|
985
983
|
info = serve_state.get_replica_info_from_id(
|
|
986
984
|
self._service_name, replica_id)
|
|
987
985
|
assert info is not None, replica_id
|
|
@@ -989,8 +987,7 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
989
987
|
schedule_next_jobs = False
|
|
990
988
|
if info.status == serve_state.ReplicaStatus.PENDING:
|
|
991
989
|
# sky.launch not started yet
|
|
992
|
-
if
|
|
993
|
-
_MAX_NUM_LAUNCH):
|
|
990
|
+
if controller_utils.can_provision():
|
|
994
991
|
p.start()
|
|
995
992
|
info.status_property.sky_launch_status = (
|
|
996
993
|
ProcessStatus.RUNNING)
|
|
@@ -1044,6 +1041,8 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
|
1044
1041
|
self._terminate_replica(replica_id,
|
|
1045
1042
|
sync_down_logs=True,
|
|
1046
1043
|
replica_drain_delay_seconds=0)
|
|
1044
|
+
# Try schedule next job after acquiring the lock.
|
|
1045
|
+
jobs_scheduler.maybe_schedule_next_jobs()
|
|
1047
1046
|
down_process_pool_snapshot = list(self._down_process_pool.items())
|
|
1048
1047
|
for replica_id, p in down_process_pool_snapshot:
|
|
1049
1048
|
if not p.is_alive():
|
sky/serve/serve_state.py
CHANGED
|
@@ -502,6 +502,16 @@ def get_services() -> List[Dict[str, Any]]:
|
|
|
502
502
|
return records
|
|
503
503
|
|
|
504
504
|
|
|
505
|
+
@init_db
|
|
506
|
+
def get_num_services() -> int:
|
|
507
|
+
"""Get the number of services."""
|
|
508
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
509
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
510
|
+
return session.execute(
|
|
511
|
+
sqlalchemy.select(sqlalchemy.func.count() # pylint: disable=not-callable
|
|
512
|
+
).select_from(services_table)).fetchone()[0]
|
|
513
|
+
|
|
514
|
+
|
|
505
515
|
@init_db
|
|
506
516
|
def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]:
|
|
507
517
|
"""Get all existing service records."""
|
sky/serve/serve_utils.py
CHANGED
|
@@ -57,14 +57,6 @@ else:
|
|
|
57
57
|
|
|
58
58
|
logger = sky_logging.init_logger(__name__)
|
|
59
59
|
|
|
60
|
-
|
|
61
|
-
@annotations.lru_cache(scope='request')
|
|
62
|
-
def get_num_service_threshold():
|
|
63
|
-
"""Get number of services threshold, calculating it only when needed."""
|
|
64
|
-
system_memory_gb = psutil.virtual_memory().total // (1024**3)
|
|
65
|
-
return system_memory_gb // constants.CONTROLLER_MEMORY_USAGE_GB
|
|
66
|
-
|
|
67
|
-
|
|
68
60
|
_CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
|
|
69
61
|
|
|
70
62
|
# NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
|
|
@@ -524,6 +516,8 @@ def generate_remote_tls_certfile_name(service_name: str) -> str:
|
|
|
524
516
|
|
|
525
517
|
|
|
526
518
|
def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
|
|
519
|
+
# NOTE(dev): This format is used in sky/serve/service.py::_cleanup, for
|
|
520
|
+
# checking replica cluster existence. Be careful when changing it.
|
|
527
521
|
return f'{service_name}-{replica_id}'
|
|
528
522
|
|
|
529
523
|
|
|
@@ -796,9 +790,13 @@ def load_version_string(payload: str) -> str:
|
|
|
796
790
|
return message_utils.decode_payload(payload)
|
|
797
791
|
|
|
798
792
|
|
|
799
|
-
def
|
|
793
|
+
def get_ready_replicas(
|
|
794
|
+
service_name: str) -> List['replica_managers.ReplicaInfo']:
|
|
800
795
|
logger.info(f'Get number of replicas for pool {service_name!r}')
|
|
801
|
-
return
|
|
796
|
+
return [
|
|
797
|
+
info for info in serve_state.get_replica_infos(service_name)
|
|
798
|
+
if info.status == serve_state.ReplicaStatus.READY
|
|
799
|
+
]
|
|
802
800
|
|
|
803
801
|
|
|
804
802
|
def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
|
|
@@ -823,12 +821,8 @@ def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
|
|
|
823
821
|
logger.error(f'Service {service_name!r} is not a cluster pool.')
|
|
824
822
|
return None
|
|
825
823
|
with filelock.FileLock(get_service_filelock_path(service_name)):
|
|
826
|
-
|
|
827
824
|
logger.debug(f'Get next cluster name for pool {service_name!r}')
|
|
828
|
-
ready_replicas =
|
|
829
|
-
info for info in serve_state.get_replica_infos(service_name)
|
|
830
|
-
if info.status == serve_state.ReplicaStatus.READY
|
|
831
|
-
]
|
|
825
|
+
ready_replicas = get_ready_replicas(service_name)
|
|
832
826
|
idle_replicas: List['replica_managers.ReplicaInfo'] = []
|
|
833
827
|
for replica_info in ready_replicas:
|
|
834
828
|
jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
|
|
@@ -1044,11 +1038,18 @@ def wait_service_registration(service_name: str, job_id: int,
|
|
|
1044
1038
|
lb_port = record['load_balancer_port']
|
|
1045
1039
|
if lb_port is not None:
|
|
1046
1040
|
return message_utils.encode_payload(lb_port)
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1041
|
+
else:
|
|
1042
|
+
controller_log_path = os.path.expanduser(
|
|
1043
|
+
generate_remote_controller_log_file_name(service_name))
|
|
1044
|
+
if os.path.exists(controller_log_path):
|
|
1045
|
+
with open(controller_log_path, 'r', encoding='utf-8') as f:
|
|
1046
|
+
log_content = f.read()
|
|
1047
|
+
if (constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR
|
|
1048
|
+
in log_content):
|
|
1049
|
+
with ux_utils.print_exception_no_traceback():
|
|
1050
|
+
raise RuntimeError('Max number of services reached. '
|
|
1051
|
+
'To spin up more services, please '
|
|
1052
|
+
'tear down some existing services.')
|
|
1052
1053
|
elapsed = time.time() - start_time
|
|
1053
1054
|
if elapsed > constants.SERVICE_REGISTER_TIMEOUT_SECONDS:
|
|
1054
1055
|
# Print the controller log to help user debug.
|
sky/serve/server/impl.py
CHANGED
|
@@ -11,7 +11,6 @@ import uuid
|
|
|
11
11
|
import colorama
|
|
12
12
|
import filelock
|
|
13
13
|
|
|
14
|
-
import sky
|
|
15
14
|
from sky import backends
|
|
16
15
|
from sky import exceptions
|
|
17
16
|
from sky import execution
|
|
@@ -25,6 +24,7 @@ from sky.serve import constants as serve_constants
|
|
|
25
24
|
from sky.serve import serve_state
|
|
26
25
|
from sky.serve import serve_utils
|
|
27
26
|
from sky.skylet import constants
|
|
27
|
+
from sky.skylet import job_lib
|
|
28
28
|
from sky.utils import admin_policy_utils
|
|
29
29
|
from sky.utils import command_runner
|
|
30
30
|
from sky.utils import common
|
|
@@ -39,7 +39,7 @@ logger = sky_logging.init_logger(__name__)
|
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
def _rewrite_tls_credential_paths_and_get_tls_env_vars(
|
|
42
|
-
service_name: str, task: '
|
|
42
|
+
service_name: str, task: 'task_lib.Task') -> Dict[str, Any]:
|
|
43
43
|
"""Rewrite the paths of TLS credentials in the task.
|
|
44
44
|
|
|
45
45
|
Args:
|
|
@@ -103,15 +103,11 @@ def _get_service_record(
|
|
|
103
103
|
|
|
104
104
|
|
|
105
105
|
def up(
|
|
106
|
-
task: '
|
|
106
|
+
task: 'task_lib.Task',
|
|
107
107
|
service_name: Optional[str] = None,
|
|
108
108
|
pool: bool = False,
|
|
109
109
|
) -> Tuple[str, str]:
|
|
110
110
|
"""Spins up a service or a pool."""
|
|
111
|
-
if pool and not serve_utils.is_consolidation_mode(pool):
|
|
112
|
-
raise ValueError(
|
|
113
|
-
'Pool is only supported in consolidation mode. To fix, set '
|
|
114
|
-
'`jobs.controller.consolidation_mode: true` in SkyPilot config.')
|
|
115
111
|
task.validate()
|
|
116
112
|
serve_utils.validate_service_task(task, pool=pool)
|
|
117
113
|
assert task.service is not None
|
|
@@ -191,8 +187,7 @@ def up(
|
|
|
191
187
|
controller_log_file = (
|
|
192
188
|
serve_utils.generate_remote_controller_log_file_name(service_name))
|
|
193
189
|
controller_resources = controller_utils.get_controller_resources(
|
|
194
|
-
controller=
|
|
195
|
-
task_resources=task.resources)
|
|
190
|
+
controller=controller, task_resources=task.resources)
|
|
196
191
|
controller_job_id = None
|
|
197
192
|
if serve_utils.is_consolidation_mode(pool):
|
|
198
193
|
# We need a unique integer per sky.serve.up call to avoid name
|
|
@@ -228,10 +223,11 @@ def up(
|
|
|
228
223
|
# balancer port from the controller? So we don't need to open so many
|
|
229
224
|
# ports here. Or, we should have a nginx traffic control to refuse
|
|
230
225
|
# any connection to the unregistered ports.
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
226
|
+
if not pool:
|
|
227
|
+
controller_resources = {
|
|
228
|
+
r.copy(ports=[serve_constants.LOAD_BALANCER_PORT_RANGE])
|
|
229
|
+
for r in controller_resources
|
|
230
|
+
}
|
|
235
231
|
controller_task.set_resources(controller_resources)
|
|
236
232
|
|
|
237
233
|
# # Set service_name so the backend will know to modify default ray
|
|
@@ -325,7 +321,7 @@ def up(
|
|
|
325
321
|
[controller_job_id],
|
|
326
322
|
stream_logs=False)
|
|
327
323
|
controller_job_status = list(statuses.values())[0]
|
|
328
|
-
if controller_job_status ==
|
|
324
|
+
if controller_job_status == job_lib.JobStatus.PENDING:
|
|
329
325
|
# Max number of services reached due to vCPU constraint.
|
|
330
326
|
# The controller job is pending due to ray job scheduling.
|
|
331
327
|
# We manually cancel the job here.
|
|
@@ -350,7 +346,7 @@ def up(
|
|
|
350
346
|
else:
|
|
351
347
|
lb_port = serve_utils.load_service_initialization_result(
|
|
352
348
|
lb_port_payload)
|
|
353
|
-
if not serve_utils.is_consolidation_mode(pool):
|
|
349
|
+
if not serve_utils.is_consolidation_mode(pool) and not pool:
|
|
354
350
|
socket_endpoint = backend_utils.get_endpoints(
|
|
355
351
|
controller_handle.cluster_name,
|
|
356
352
|
lb_port,
|
|
@@ -374,10 +370,10 @@ def up(
|
|
|
374
370
|
f'\n📋 Useful Commands'
|
|
375
371
|
f'\n{ux_utils.INDENT_SYMBOL}To submit jobs to the pool:\t'
|
|
376
372
|
f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
|
|
377
|
-
f'<
|
|
373
|
+
f'<yaml_file>{ux_utils.RESET_BOLD}'
|
|
378
374
|
f'\n{ux_utils.INDENT_SYMBOL}To submit multiple jobs:\t'
|
|
379
375
|
f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
|
|
380
|
-
f'--num-jobs 10 <
|
|
376
|
+
f'--num-jobs 10 <yaml_file>{ux_utils.RESET_BOLD}'
|
|
381
377
|
f'\n{ux_utils.INDENT_SYMBOL}To check the pool status:\t'
|
|
382
378
|
f'{ux_utils.BOLD}sky jobs pool status {service_name}'
|
|
383
379
|
f'{ux_utils.RESET_BOLD}'
|
|
@@ -421,7 +417,7 @@ def up(
|
|
|
421
417
|
|
|
422
418
|
|
|
423
419
|
def update(
|
|
424
|
-
task: '
|
|
420
|
+
task: 'task_lib.Task',
|
|
425
421
|
service_name: str,
|
|
426
422
|
mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
|
|
427
423
|
pool: bool = False,
|
|
@@ -576,7 +572,7 @@ def update(
|
|
|
576
572
|
|
|
577
573
|
|
|
578
574
|
def apply(
|
|
579
|
-
task: '
|
|
575
|
+
task: 'task_lib.Task',
|
|
580
576
|
service_name: str,
|
|
581
577
|
mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
|
|
582
578
|
pool: bool = False,
|
sky/serve/service.py
CHANGED
|
@@ -15,11 +15,13 @@ import filelock
|
|
|
15
15
|
|
|
16
16
|
from sky import authentication
|
|
17
17
|
from sky import exceptions
|
|
18
|
+
from sky import global_user_state
|
|
18
19
|
from sky import sky_logging
|
|
19
20
|
from sky import task as task_lib
|
|
20
21
|
from sky.backends import backend_utils
|
|
21
22
|
from sky.backends import cloud_vm_ray_backend
|
|
22
23
|
from sky.data import data_utils
|
|
24
|
+
from sky.jobs import scheduler as jobs_scheduler
|
|
23
25
|
from sky.serve import constants
|
|
24
26
|
from sky.serve import controller
|
|
25
27
|
from sky.serve import load_balancer
|
|
@@ -28,6 +30,7 @@ from sky.serve import serve_state
|
|
|
28
30
|
from sky.serve import serve_utils
|
|
29
31
|
from sky.skylet import constants as skylet_constants
|
|
30
32
|
from sky.utils import common_utils
|
|
33
|
+
from sky.utils import controller_utils
|
|
31
34
|
from sky.utils import subprocess_utils
|
|
32
35
|
from sky.utils import ux_utils
|
|
33
36
|
|
|
@@ -120,7 +123,16 @@ def _cleanup(service_name: str) -> bool:
|
|
|
120
123
|
replica_infos = serve_state.get_replica_infos(service_name)
|
|
121
124
|
info2proc: Dict[replica_managers.ReplicaInfo,
|
|
122
125
|
multiprocessing.Process] = dict()
|
|
126
|
+
# NOTE(dev): This relies on `sky/serve/serve_utils.py::
|
|
127
|
+
# generate_replica_cluster_name`. Change it if you change the function.
|
|
128
|
+
existing_cluster_names = global_user_state.get_cluster_names_start_with(
|
|
129
|
+
service_name)
|
|
123
130
|
for info in replica_infos:
|
|
131
|
+
if info.cluster_name not in existing_cluster_names:
|
|
132
|
+
logger.info(f'Cluster {info.cluster_name} for replica '
|
|
133
|
+
f'{info.replica_id} not found. Might be a failed '
|
|
134
|
+
'cluster. Skipping.')
|
|
135
|
+
continue
|
|
124
136
|
p = multiprocessing.Process(target=replica_managers.terminate_cluster,
|
|
125
137
|
args=(info.cluster_name,))
|
|
126
138
|
p.start()
|
|
@@ -214,22 +226,25 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
|
|
|
214
226
|
service_name, version)
|
|
215
227
|
|
|
216
228
|
if not is_recovery:
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
229
|
+
with filelock.FileLock(controller_utils.get_resources_lock_path()):
|
|
230
|
+
if not controller_utils.can_start_new_process():
|
|
231
|
+
cleanup_storage(tmp_task_yaml)
|
|
232
|
+
with ux_utils.print_exception_no_traceback():
|
|
233
|
+
raise RuntimeError(
|
|
234
|
+
constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR)
|
|
235
|
+
success = serve_state.add_service(
|
|
236
|
+
service_name,
|
|
237
|
+
controller_job_id=job_id,
|
|
238
|
+
policy=service_spec.autoscaling_policy_str(),
|
|
239
|
+
requested_resources_str=backend_utils.get_task_resources_str(
|
|
240
|
+
task),
|
|
241
|
+
load_balancing_policy=service_spec.load_balancing_policy,
|
|
242
|
+
status=serve_state.ServiceStatus.CONTROLLER_INIT,
|
|
243
|
+
tls_encrypted=service_spec.tls_credential is not None,
|
|
244
|
+
pool=service_spec.pool,
|
|
245
|
+
controller_pid=os.getpid(),
|
|
246
|
+
entrypoint=entrypoint)
|
|
247
|
+
jobs_scheduler.maybe_schedule_next_jobs()
|
|
233
248
|
# Directly throw an error here. See sky/serve/api.py::up
|
|
234
249
|
# for more details.
|
|
235
250
|
if not success:
|
sky/server/server.py
CHANGED
|
@@ -17,7 +17,7 @@ import resource
|
|
|
17
17
|
import shutil
|
|
18
18
|
import sys
|
|
19
19
|
import threading
|
|
20
|
-
from typing import
|
|
20
|
+
from typing import Dict, List, Literal, Optional, Set, Tuple
|
|
21
21
|
import uuid
|
|
22
22
|
import zipfile
|
|
23
23
|
|
|
@@ -42,6 +42,7 @@ from sky.data import storage_utils
|
|
|
42
42
|
from sky.jobs.server import server as jobs_rest
|
|
43
43
|
from sky.metrics import utils as metrics_utils
|
|
44
44
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
45
|
+
from sky.schemas.api import responses
|
|
45
46
|
from sky.serve.server import server as serve_rest
|
|
46
47
|
from sky.server import common
|
|
47
48
|
from sky.server import config as server_config
|
|
@@ -1531,8 +1532,12 @@ async def api_status(
|
|
|
1531
1532
|
return encoded_request_tasks
|
|
1532
1533
|
|
|
1533
1534
|
|
|
1534
|
-
@app.get(
|
|
1535
|
-
|
|
1535
|
+
@app.get(
|
|
1536
|
+
'/api/health',
|
|
1537
|
+
# response_model_exclude_unset omits unset fields
|
|
1538
|
+
# in the response JSON.
|
|
1539
|
+
response_model_exclude_unset=True)
|
|
1540
|
+
async def health(request: fastapi.Request) -> responses.APIHealthResponse:
|
|
1536
1541
|
"""Checks the health of the API server.
|
|
1537
1542
|
|
|
1538
1543
|
Returns:
|
|
@@ -1570,7 +1575,8 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
|
|
|
1570
1575
|
# - There is no harm when an malicious client calls /api/health
|
|
1571
1576
|
# without authentication since no sensitive information is
|
|
1572
1577
|
# returned.
|
|
1573
|
-
return
|
|
1578
|
+
return responses.APIHealthResponse(
|
|
1579
|
+
status=common.ApiServerStatus.HEALTHY,)
|
|
1574
1580
|
# TODO(aylei): remove this after min_compatible_api_version >= 14.
|
|
1575
1581
|
if client_version < 14:
|
|
1576
1582
|
# For Client with API version < 14, the NEEDS_AUTH status is not
|
|
@@ -1579,19 +1585,19 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
|
|
|
1579
1585
|
detail='Authentication required')
|
|
1580
1586
|
|
|
1581
1587
|
logger.debug(f'Health endpoint: request.state.auth_user = {user}')
|
|
1582
|
-
return
|
|
1583
|
-
|
|
1588
|
+
return responses.APIHealthResponse(
|
|
1589
|
+
status=server_status,
|
|
1584
1590
|
# Kept for backward compatibility, clients before 0.11.0 will read this
|
|
1585
1591
|
# field to check compatibility and hint the user to upgrade the CLI.
|
|
1586
1592
|
# TODO(aylei): remove this field after 0.13.0
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
1590
|
-
|
|
1591
|
-
|
|
1592
|
-
|
|
1593
|
-
|
|
1594
|
-
|
|
1593
|
+
api_version=str(server_constants.API_VERSION),
|
|
1594
|
+
version=sky.__version__,
|
|
1595
|
+
version_on_disk=common.get_skypilot_version_on_disk(),
|
|
1596
|
+
commit=sky.__commit__,
|
|
1597
|
+
basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
|
|
1598
|
+
'false').lower() == 'true',
|
|
1599
|
+
user=user if user is not None else None,
|
|
1600
|
+
)
|
|
1595
1601
|
|
|
1596
1602
|
|
|
1597
1603
|
@app.websocket('/kubernetes-pod-ssh-proxy')
|