skypilot-nightly 1.0.0.dev20250925__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +38 -14
- sky/backends/cloud_vm_ray_backend.py +151 -36
- sky/client/cli/command.py +18 -9
- sky/client/cli/table_utils.py +34 -0
- sky/client/common.py +4 -2
- sky/client/sdk.py +11 -7
- sky/client/sdk_async.py +5 -5
- sky/core.py +6 -6
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
- sky/dashboard/out/_next/static/chunks/{webpack-16ba1d7187d2e3b1.js → webpack-7340bc0f0dd8ae74.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +0 -1
- sky/global_user_state.py +57 -34
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +4 -0
- sky/jobs/server/core.py +98 -26
- sky/jobs/server/utils.py +65 -32
- sky/jobs/state.py +145 -3
- sky/jobs/utils.py +85 -7
- sky/provision/runpod/__init__.py +2 -0
- sky/schemas/api/responses.py +18 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/serve/serve_utils.py +16 -0
- sky/serve/server/core.py +1 -1
- sky/serve/server/impl.py +6 -6
- sky/server/requests/payloads.py +2 -1
- sky/server/requests/serializers/decoders.py +2 -2
- sky/server/requests/serializers/encoders.py +7 -3
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/constants.py +4 -1
- sky/skylet/events.py +42 -0
- sky/skylet/job_lib.py +2 -32
- sky/skylet/log_lib.py +211 -0
- sky/skylet/log_lib.pyi +30 -1
- sky/skylet/services.py +208 -2
- sky/skylet/skylet.py +3 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +8 -3
- sky/utils/db/db_utils.py +5 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/kubernetes_deploy_utils.py +35 -12
- sky/volumes/server/core.py +1 -0
- sky/volumes/volume.py +16 -17
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +36 -36
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +74 -69
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +0 -16
- /sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/schemas/generated/managed_jobsv1_pb2_grpc.py
ADDED
@@ -0,0 +1,278 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+import warnings
+
+from sky.schemas.generated import managed_jobsv1_pb2 as sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2
+
+GRPC_GENERATED_VERSION = '1.63.0'
+GRPC_VERSION = grpc.__version__
+EXPECTED_ERROR_RELEASE = '1.65.0'
+SCHEDULED_RELEASE_DATE = 'June 25, 2024'
+_version_not_supported = False
+
+try:
+    from grpc._utilities import first_version_is_lower
+    _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
+except ImportError:
+    _version_not_supported = True
+
+if _version_not_supported:
+    warnings.warn(
+        f'The grpc package installed is at version {GRPC_VERSION},'
+        + f' but the generated code in sky/schemas/generated/managed_jobsv1_pb2_grpc.py depends on'
+        + f' grpcio>={GRPC_GENERATED_VERSION}.'
+        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
+        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
+        + f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
+        + f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
+        RuntimeWarning
+    )
+
+
+class ManagedJobsServiceStub(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.GetVersion = channel.unary_unary(
+                '/managed_jobs.v1.ManagedJobsService/GetVersion',
+                request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionRequest.SerializeToString,
+                response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionResponse.FromString,
+                _registered_method=True)
+        self.GetJobTable = channel.unary_unary(
+                '/managed_jobs.v1.ManagedJobsService/GetJobTable',
+                request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableRequest.SerializeToString,
+                response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableResponse.FromString,
+                _registered_method=True)
+        self.GetAllJobIdsByName = channel.unary_unary(
+                '/managed_jobs.v1.ManagedJobsService/GetAllJobIdsByName',
+                request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameRequest.SerializeToString,
+                response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameResponse.FromString,
+                _registered_method=True)
+        self.CancelJobs = channel.unary_unary(
+                '/managed_jobs.v1.ManagedJobsService/CancelJobs',
+                request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsRequest.SerializeToString,
+                response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsResponse.FromString,
+                _registered_method=True)
+        self.StreamLogs = channel.unary_stream(
+                '/managed_jobs.v1.ManagedJobsService/StreamLogs',
+                request_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsRequest.SerializeToString,
+                response_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsResponse.FromString,
+                _registered_method=True)
+
+
+class ManagedJobsServiceServicer(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def GetVersion(self, request, context):
+        """Get controller version.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def GetJobTable(self, request, context):
+        """Get the managed job queue with advanced filtering.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def GetAllJobIdsByName(self, request, context):
+        """Get all job IDs by name.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def CancelJobs(self, request, context):
+        """Cancel managed jobs.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def StreamLogs(self, request, context):
+        """Stream managed job logs.
+        """
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_ManagedJobsServiceServicer_to_server(servicer, server):
+    rpc_method_handlers = {
+            'GetVersion': grpc.unary_unary_rpc_method_handler(
+                    servicer.GetVersion,
+                    request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionRequest.FromString,
+                    response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionResponse.SerializeToString,
+            ),
+            'GetJobTable': grpc.unary_unary_rpc_method_handler(
+                    servicer.GetJobTable,
+                    request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableRequest.FromString,
+                    response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableResponse.SerializeToString,
+            ),
+            'GetAllJobIdsByName': grpc.unary_unary_rpc_method_handler(
+                    servicer.GetAllJobIdsByName,
+                    request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameRequest.FromString,
+                    response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameResponse.SerializeToString,
+            ),
+            'CancelJobs': grpc.unary_unary_rpc_method_handler(
+                    servicer.CancelJobs,
+                    request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsRequest.FromString,
+                    response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsResponse.SerializeToString,
+            ),
+            'StreamLogs': grpc.unary_stream_rpc_method_handler(
+                    servicer.StreamLogs,
+                    request_deserializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsRequest.FromString,
+                    response_serializer=sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsResponse.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+            'managed_jobs.v1.ManagedJobsService', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+
+
+# This class is part of an EXPERIMENTAL API.
+class ManagedJobsService(object):
+    """Missing associated documentation comment in .proto file."""
+
+    @staticmethod
+    def GetVersion(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(
+            request,
+            target,
+            '/managed_jobs.v1.ManagedJobsService/GetVersion',
+            sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionRequest.SerializeToString,
+            sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetVersionResponse.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
+
+    @staticmethod
+    def GetJobTable(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(
+            request,
+            target,
+            '/managed_jobs.v1.ManagedJobsService/GetJobTable',
+            sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableRequest.SerializeToString,
+            sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetJobTableResponse.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
+
+    @staticmethod
+    def GetAllJobIdsByName(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(
+            request,
+            target,
+            '/managed_jobs.v1.ManagedJobsService/GetAllJobIdsByName',
+            sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameRequest.SerializeToString,
+            sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.GetAllJobIdsByNameResponse.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
+
+    @staticmethod
+    def CancelJobs(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(
+            request,
+            target,
+            '/managed_jobs.v1.ManagedJobsService/CancelJobs',
+            sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsRequest.SerializeToString,
+            sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.CancelJobsResponse.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
+
+    @staticmethod
+    def StreamLogs(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_stream(
+            request,
+            target,
+            '/managed_jobs.v1.ManagedJobsService/StreamLogs',
+            sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsRequest.SerializeToString,
+            sky_dot_schemas_dot_generated_dot_managed__jobsv1__pb2.StreamLogsResponse.FromString,
+            options,
+            channel_credentials,
+            insecure,
+            call_credentials,
+            compression,
+            wait_for_ready,
+            timeout,
+            metadata,
+            _registered_method=True)
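
For orientation, here is a minimal client-side sketch of how the generated stub above could be used. Only the stub, method, and message names come from the generated module; the channel target (a local insecure port) and calling the service directly like this are illustrative assumptions, not necessarily how SkyPilot wires it up.

import grpc

from sky.schemas.generated import managed_jobsv1_pb2
from sky.schemas.generated import managed_jobsv1_pb2_grpc

# Hypothetical endpoint for the jobs controller's ManagedJobsService.
channel = grpc.insecure_channel('localhost:50051')
stub = managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel)

# Unary RPC: ask the controller for its version.
version_response = stub.GetVersion(managed_jobsv1_pb2.GetVersionRequest())

# Server-streaming RPC: iterate over log responses as they arrive.
for log_response in stub.StreamLogs(managed_jobsv1_pb2.StreamLogsRequest()):
    print(log_response)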
sky/serve/serve_utils.py
CHANGED
@@ -408,6 +408,22 @@ def validate_service_task(task: 'sky.Task', pool: bool) -> None:
                 f'{sys_name} will replenish preempted spot '
                 f'with {policy_description} instances.')
 
+    if pool:
+        accelerators = set()
+        for resource in task.resources:
+            if resource.accelerators is not None:
+                if isinstance(resource.accelerators, str):
+                    accelerators.add(resource.accelerators)
+                elif isinstance(resource.accelerators, dict):
+                    accelerators.update(resource.accelerators.keys())
+                elif isinstance(resource.accelerators, list):
+                    accelerators.update(resource.accelerators)
+        if len(accelerators) > 1:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Heterogeneous clusters are not supported for '
+                                 'cluster pools please specify one accelerator '
+                                 'for all workers.')
+
     # Try to create a spot placer from the task yaml. Check if the task yaml
     # is valid for spot placer.
     spot_placer.SpotPlacer.from_task(task.service, task)
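
The new pool check above collects every accelerator name requested across the task's resources and rejects the pool if more than one distinct name appears. A self-contained sketch of that rule, using plain strings/dicts as stand-ins for the Resources.accelerators values (the inputs here are hypothetical examples, not taken from the diff):

def pool_accelerator_names(accelerator_specs):
    # Mirrors the set-collection logic added to validate_service_task.
    names = set()
    for acc in accelerator_specs:
        if acc is None:
            continue
        if isinstance(acc, str):
            names.add(acc)
        elif isinstance(acc, dict):
            names.update(acc.keys())
        elif isinstance(acc, list):
            names.update(acc)
    return names

assert pool_accelerator_names([{'A100': 1}, {'A100': 1}]) == {'A100'}  # homogeneous: accepted
assert len(pool_accelerator_names([{'A100': 1}, {'V100': 1}])) > 1     # heterogeneous: would raise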
sky/serve/server/core.py
CHANGED
@@ -117,7 +117,7 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> None:
     assert isinstance(handle, backends.CloudVmRayResourceHandle)
     use_legacy = not handle.is_grpc_enabled_with_flag
 
-    if
+    if not use_legacy:
         try:
             stdout = serve_rpc_utils.RpcRunner.terminate_replica(
                 handle, service_name, replica_id, purge)
sky/serve/server/impl.py
CHANGED
@@ -89,7 +89,7 @@ def _get_service_record(
     assert isinstance(handle, backends.CloudVmRayResourceHandle)
     use_legacy = not handle.is_grpc_enabled_with_flag
 
-    if
+    if not use_legacy:
         try:
             service_statuses = serve_rpc_utils.RpcRunner.get_service_status(
                 handle, [service_name], pool)

@@ -589,7 +589,7 @@ def update(
 
     use_legacy = not handle.is_grpc_enabled_with_flag
 
-    if
+    if not use_legacy:
         try:
             current_version = serve_rpc_utils.RpcRunner.add_version(
                 handle, service_name)

@@ -636,7 +636,7 @@ def update(
 
     use_legacy = not handle.is_grpc_enabled_with_flag
 
-    if
+    if not use_legacy:
         try:
             serve_rpc_utils.RpcRunner.update_service(
                 handle, service_name, current_version, mode, pool)

@@ -730,7 +730,7 @@ def down(
     assert isinstance(handle, backends.CloudVmRayResourceHandle)
     use_legacy = not handle.is_grpc_enabled_with_flag
 
-    if
+    if not use_legacy:
         try:
             stdout = serve_rpc_utils.RpcRunner.terminate_services(
                 handle, service_names, purge, pool)

@@ -792,7 +792,7 @@ def status(
     assert isinstance(handle, backends.CloudVmRayResourceHandle)
     use_legacy = not handle.is_grpc_enabled_with_flag
 
-    if
+    if not use_legacy:
         try:
             service_records = serve_rpc_utils.RpcRunner.get_service_status(
                 handle, service_names, pool)

@@ -928,7 +928,7 @@ def _get_all_replica_targets(
     assert isinstance(handle, backends.CloudVmRayResourceHandle)
     use_legacy = not handle.is_grpc_enabled_with_flag
 
-    if
+    if not use_legacy:
         try:
             service_records = serve_rpc_utils.RpcRunner.get_service_status(
                 handle, [service_name], pool)
sky/server/requests/payloads.py
CHANGED
@@ -683,8 +683,9 @@ class LocalUpBody(RequestBody):
     ssh_key: Optional[str] = None
     cleanup: bool = False
     context_name: Optional[str] = None
-    name: Optional[str] = None
     password: Optional[str] = None
+    name: Optional[str] = None
+    port_start: Optional[int] = None
 
 
 class LocalDownBody(RequestBody):
sky/server/requests/serializers/decoders.py
CHANGED
@@ -101,11 +101,11 @@ def decode_start(return_value: str) -> 'backends.CloudVmRayResourceHandle':
 
 
 @register_decoders('queue')
-def decode_queue(return_value: List[dict],) -> List[
+def decode_queue(return_value: List[dict],) -> List[responses.ClusterJobRecord]:
     jobs = return_value
     for job in jobs:
         job['status'] = job_lib.JobStatus(job['status'])
-    return jobs
+    return [responses.ClusterJobRecord.model_validate(job) for job in jobs]
 
 
 @register_decoders('jobs.queue')
sky/server/requests/serializers/encoders.py
CHANGED
@@ -92,10 +92,14 @@ def encode_start(resource_handle: 'backends.CloudVmRayResourceHandle') -> str:
 
 
 @register_encoder('queue')
-def encode_queue(
+def encode_queue(
+        jobs: List[responses.ClusterJobRecord],) -> List[Dict[str, Any]]:
+    response = []
     for job in jobs:
-
-
+        response_job = job.model_dump()
+        response_job['status'] = job['status'].value
+        response.append(response_job)
+    return response
 
 
 @register_encoder('status_kubernetes')
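
The decoder/encoder pair above moves the cluster job queue payload from raw dicts to typed records while keeping the status round-trip: the server sends the enum's raw value over the wire, and the client rebuilds the enum before validating into the record type. A toy, self-contained sketch of that pattern (JobStatus and JobRecord here are illustrative stand-ins, not the real job_lib.JobStatus or responses.ClusterJobRecord):

import enum
from typing import Any, Dict

import pydantic


class JobStatus(enum.Enum):           # stand-in for job_lib.JobStatus
    PENDING = 'PENDING'
    RUNNING = 'RUNNING'


class JobRecord(pydantic.BaseModel):  # stand-in for responses.ClusterJobRecord
    job_id: int
    status: JobStatus


def encode(job: JobRecord) -> Dict[str, Any]:
    # Server side: dump the model and send the enum's raw value on the wire.
    payload = job.model_dump()
    payload['status'] = job.status.value
    return payload


def decode(payload: Dict[str, Any]) -> JobRecord:
    # Client side: rebuild the enum, then validate into the typed record.
    payload['status'] = JobStatus(payload['status'])
    return JobRecord.model_validate(payload)


roundtripped = decode(encode(JobRecord(job_id=1, status=JobStatus.RUNNING)))
assert roundtripped.status is JobStatus.RUNNING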
sky/setup_files/dependencies.py
CHANGED
sky/skylet/constants.py
CHANGED
@@ -57,6 +57,9 @@ SKY_REMOTE_PYTHON_ENV: str = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
 ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
 # uv is used for venv and pip, much faster than python implementations.
 SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
+# set UV_SYSTEM_PYTHON to false in case the
+# user provided docker image set it to true.
+# unset PYTHONPATH in case the user provided docker image set it.
 SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
               f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
 # This won't reinstall uv if it's already installed, so it's safe to re-run.

@@ -97,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '
+SKYLET_VERSION = '21'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
sky/skylet/events.py
CHANGED
@@ -11,6 +11,7 @@ import psutil
 from sky import clouds
 from sky import sky_logging
 from sky.backends import cloud_vm_ray_backend
+from sky.jobs import constants as managed_job_constants
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils

@@ -21,6 +22,7 @@ from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import cluster_utils
 from sky.utils import registry
+from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 from sky.utils import yaml_utils
 

@@ -74,6 +76,46 @@ class ManagedJobEvent(SkyletEvent):
     EVENT_INTERVAL_SECONDS = 300
 
     def _run(self):
+        if not os.path.exists(
+                os.path.expanduser(
+                    managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE)):
+            # Note: since the skylet is started before the user setup (in
+            # jobs-controller.yaml.j2) runs, it's possible that we hit this
+            # before the indicator file is written. However, since we will wait
+            # EVENT_INTERVAL_SECONDS before the first run, this should be very
+            # unlikely.
+            logger.info('No jobs controller indicator file found.')
+            all_job_ids = managed_job_state.get_all_job_ids_by_name(None)
+            if not all_job_ids:
+                logger.info('No jobs running. Stopping controllers.')
+                # TODO(cooperc): Move this to a shared function also called by
+                # sdk.api_stop(). (#7229)
+                try:
+                    with open(os.path.expanduser(
+                            scheduler.JOB_CONTROLLER_PID_PATH),
+                              'r',
+                              encoding='utf-8') as f:
+                        pids = f.read().split('\n')[:-1]
+                        for pid in pids:
+                            if subprocess_utils.is_process_alive(
+                                    int(pid.strip())):
+                                subprocess_utils.kill_children_processes(
+                                    parent_pids=[int(pid.strip())], force=True)
+                    os.remove(
+                        os.path.expanduser(scheduler.JOB_CONTROLLER_PID_PATH))
+                except FileNotFoundError:
+                    # its fine we will create it
+                    pass
+                except Exception as e:  # pylint: disable=broad-except
+                    # in case we get perm issues or something is messed up, just
+                    # ignore it and assume the process is dead
+                    logger.error(
+                        f'Error looking at job controller pid file: {e}')
+                    pass
+            logger.info(f'{len(all_job_ids)} jobs running. Assuming the '
+                        'indicator file hasn\'t been written yet.')
+            return
+
         logger.info('=== Updating managed job status ===')
         managed_job_utils.update_managed_jobs_statuses()
         scheduler.maybe_start_controllers()
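
The cleanup added to ManagedJobEvent._run reads a newline-terminated PID file and kills any controller processes that are still alive before removing the file. A self-contained sketch of that file format and liveness check, using psutil directly (the real code goes through subprocess_utils and also kills child processes; the helper name here is hypothetical):

import os

import psutil  # already imported at the top of events.py


def stop_controllers_from_pid_file(pid_path: str) -> None:
    """Kill every live process listed (one PID per line) in pid_path."""
    path = os.path.expanduser(pid_path)
    try:
        with open(path, 'r', encoding='utf-8') as f:
            # The file ends with a newline, so drop the trailing empty entry.
            pids = f.read().split('\n')[:-1]
        for pid in pids:
            if psutil.pid_exists(int(pid.strip())):
                psutil.Process(int(pid.strip())).kill()
        os.remove(path)
    except FileNotFoundError:
        # No controllers were ever started; nothing to clean up.
        pass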
sky/skylet/job_lib.py
CHANGED
@@ -24,7 +24,6 @@ from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.skylet import constants
 from sky.utils import common_utils
-from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import subprocess_utils
 from sky.utils.db import db_utils

@@ -612,8 +611,8 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
     PENDING state.
 
     The normal job duration will use `start_at` instead of `submitted_at` (in
-    `format_job_queue()`), because the job may stay in PENDING if
-    busy.
+    `table_utils.format_job_queue()`), because the job may stay in PENDING if
+    the cluster is busy.
     """
     return message_utils.encode_payload(
         get_job_submitted_or_ended_timestamp(job_id, get_ended_time))

@@ -941,35 +940,6 @@ def is_cluster_idle() -> bool:
     assert False, 'Should not reach here'
 
 
-def format_job_queue(jobs: List[Dict[str, Any]]):
-    """Format the job queue for display.
-
-    Usage:
-      jobs = get_job_queue()
-      print(format_job_queue(jobs))
-    """
-    job_table = log_utils.create_table([
-        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
-        'STATUS', 'LOG', 'GIT COMMIT'
-    ])
-    for job in jobs:
-        job_table.add_row([
-            job['job_id'],
-            job['job_name'],
-            job['username'],
-            log_utils.readable_time_duration(job['submitted_at']),
-            log_utils.readable_time_duration(job['start_at']),
-            log_utils.readable_time_duration(job['start_at'],
-                                             job['end_at'],
-                                             absolute=True),
-            job['resources'],
-            job['status'].colored_str(),
-            job['log_path'],
-            job.get('metadata', {}).get('git_commit', '-'),
-        ])
-    return job_table
-
-
 def dump_job_queue(user_hash: Optional[str], all_jobs: bool) -> str:
     """Get the job queue in encoded json format.
 