skypilot-nightly 1.0.0.dev20251019__py3-none-any.whl → 1.0.0.dev20251022__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/backends/backend_utils.py +11 -11
- sky/backends/cloud_vm_ray_backend.py +15 -4
- sky/client/cli/command.py +39 -10
- sky/client/cli/flags.py +4 -2
- sky/client/sdk.py +26 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
- sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
- sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +2 -2
- sky/global_user_state.py +137 -37
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +4 -2
- sky/jobs/server/server.py +21 -12
- sky/jobs/state.py +307 -55
- sky/jobs/utils.py +248 -144
- sky/provision/kubernetes/network.py +9 -6
- sky/provision/provisioner.py +8 -0
- sky/schemas/api/responses.py +2 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/serve/server/server.py +8 -7
- sky/server/common.py +10 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +4 -2
- sky/server/requests/executor.py +30 -28
- sky/server/requests/payloads.py +5 -1
- sky/server/requests/preconditions.py +9 -4
- sky/server/requests/requests.py +130 -53
- sky/server/requests/serializers/encoders.py +3 -3
- sky/server/server.py +91 -58
- sky/server/stream_utils.py +127 -38
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/skylet/services.py +5 -5
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +4 -4
- sky/users/permission.py +4 -0
- sky/utils/asyncio_utils.py +63 -3
- sky/utils/db/db_utils.py +11 -3
- sky/utils/db/migration_utils.py +7 -3
- sky/volumes/server/server.py +3 -3
- sky/workspaces/server.py +6 -6
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +37 -37
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +87 -86
- sky/dashboard/out/_next/static/8e35zdobdd0bK_Nkba03m/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/webpack-3c431f6c9086e487.js +0 -1
- /sky/dashboard/out/_next/static/{8e35zdobdd0bK_Nkba03m → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
sky/server/stream_utils.py
CHANGED
@@ -25,6 +25,17 @@ logger = sky_logging.init_logger(__name__)
 _BUFFER_SIZE = 8 * 1024  # 8KB
 _BUFFER_TIMEOUT = 0.02  # 20ms
 _HEARTBEAT_INTERVAL = 30
+# If a SHORT request has been stuck in pending for
+# _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
+_SHORT_REQUEST_SPINNER_TIMEOUT = 2
+# If there is an issue during provisioning that causes the cluster to be stuck
+# in INIT state, we use this timeout to break the loop and stop streaming
+# provision logs.
+_PROVISION_LOG_TIMEOUT = 3
+# Maximum time to wait for new log files to appear when streaming worker node
+# provision logs. Worker logs are created sequentially during the provisioning
+# process, so we need to wait for new files to appear.
+_MAX_WAIT_FOR_NEW_LOG_FILES = 3  # seconds

 LONG_REQUEST_POLL_INTERVAL = 1
 DEFAULT_POLL_INTERVAL = 0.1
@@ -45,7 +56,7 @@ async def _yield_log_file_with_payloads_skipped(

 async def log_streamer(
     request_id: Optional[str],
-    log_path: pathlib.Path,
+    log_path: Optional[pathlib.Path] = None,
     plain_logs: bool = False,
     tail: Optional[int] = None,
     follow: bool = True,
@@ -57,7 +68,9 @@ async def log_streamer(
     Args:
         request_id: The request ID to check whether the log tailing process
            should be stopped.
-        log_path: The path to the log file
+        log_path: The path to the log file or directory containing the log
+          files. If it is a directory, all *.log files in the directory will be
+          streamed.
        plain_logs: Whether to show plain logs.
        tail: The number of lines to tail. If None, tail the whole file.
        follow: Whether to follow the log file.
@@ -66,17 +79,26 @@ async def log_streamer(
     """

     if request_id is not None:
+        start_time = asyncio.get_event_loop().time()
         status_msg = rich_utils.EncodedStatusMessage(
             f'[dim]Checking request: {request_id}[/dim]')
-        request_task = await requests_lib.get_request_async(request_id)
+        request_task = await requests_lib.get_request_async(request_id,
+                                                            fields=[
+                                                                'request_id',
+                                                                'name',
+                                                                'schedule_type',
+                                                                'status',
+                                                                'status_msg'
+                                                            ])

         if request_task is None:
             raise fastapi.HTTPException(
                 status_code=404, detail=f'Request {request_id} not found')
         request_id = request_task.request_id

-        #
-        # request
+        # By default, do not show the waiting spinner for SHORT requests.
+        # If the request has been stuck in pending for
+        # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
         show_request_waiting_spinner = (not plain_logs and
                                         request_task.schedule_type
                                         == requests_lib.ScheduleType.LONG)
@@ -89,14 +111,23 @@ async def log_streamer(
                     f'scheduled: {request_id}')
         req_status = request_task.status
         req_msg = request_task.status_msg
+        del request_task
         # Slowly back off the database polling up to every 1 second, to avoid
         # overloading the CPU and DB.
         backoff = common_utils.Backoff(initial_backoff=polling_interval,
                                        max_backoff_factor=10,
                                        multiplier=1.2)
         while req_status < requests_lib.RequestStatus.RUNNING:
+            current_time = asyncio.get_event_loop().time()
+            # Show the waiting spinner for a SHORT request if it has been stuck
+            # in pending for _SHORT_REQUEST_SPINNER_TIMEOUT seconds
+            if not show_request_waiting_spinner and (
+                    current_time - start_time > _SHORT_REQUEST_SPINNER_TIMEOUT):
+                show_request_waiting_spinner = True
+                yield status_msg.init()
+                yield status_msg.start()
             if req_msg is not None:
-                waiting_msg =
+                waiting_msg = req_msg
             if show_request_waiting_spinner:
                 yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
             elif plain_logs and waiting_msg != last_waiting_msg:
@@ -119,11 +150,57 @@ async def log_streamer(
         if show_request_waiting_spinner:
             yield status_msg.stop()

-
-
-
-
-
+    if log_path is not None and log_path.is_dir():
+        # Track which log files we've already streamed
+        streamed_files = set()
+        no_new_files_count = 0
+
+        while True:
+            # Get all *.log files in the log_path
+            log_files = sorted(log_path.glob('*.log'))
+
+            # Filter out already streamed files
+            new_files = [f for f in log_files if f not in streamed_files]
+
+            if len(new_files) == 0:
+                if not follow:
+                    break
+                # Wait a bit to see if new files appear
+                await asyncio.sleep(0.5)
+                no_new_files_count += 1
+                # Check if we've waited too long for new files
+                if no_new_files_count > _MAX_WAIT_FOR_NEW_LOG_FILES * 2:
+                    break
+                continue
+
+            # Reset the no-new-files counter when we find new files
+            no_new_files_count = 0
+
+            for log_file_path in new_files:
+                # Add header before each file (similar to tail -f behavior)
+                header = f'\n==> {log_file_path} <==\n\n'
+                yield header
+
+                async with aiofiles.open(log_file_path, 'rb') as f:
+                    async for chunk in _tail_log_file(f, request_id, plain_logs,
+                                                      tail, follow,
+                                                      cluster_name,
+                                                      polling_interval):
+                        yield chunk
+
+                # Mark this file as streamed
+                streamed_files.add(log_file_path)
+
+            # If not following, break after streaming all current files
+            if not follow:
+                break
+    else:
+        assert log_path is not None, (request_id, log_path)
+        async with aiofiles.open(log_path, 'rb') as f:
+            async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
+                                              follow, cluster_name,
+                                              polling_interval):
+                yield chunk


 async def _tail_log_file(
@@ -197,7 +274,7 @@ async def _tail_log_file(
                     if (req_status.status ==
                             requests_lib.RequestStatus.CANCELLED):
                         request_task = await requests_lib.get_request_async(
-                            request_id)
+                            request_id, fields=['name', 'should_retry'])
                         if request_task.should_retry:
                             buffer.append(
                                 message_utils.encode_payload(
@@ -206,6 +283,7 @@ async def _tail_log_file(
                         buffer.append(
                             f'{request_task.name!r} request {request_id}'
                             ' cancelled\n')
+                        del request_task
                     break
                 if not follow:
                     # The below checks (cluster status, heartbeat) are not needed
@@ -213,21 +291,24 @@ async def _tail_log_file(
                     break
                 # Provision logs pass in cluster_name, check cluster status
                 # periodically to see if provisioning is done.
-                if cluster_name is not None
-
-                    cluster_status = await (
-                        global_user_state.get_status_from_cluster_name_async(
-                            cluster_name))
-                    if cluster_status is None:
-                        logger.debug(
-                            'Stop tailing provision logs for cluster'
-                            f' status for cluster {cluster_name} not found')
-                        break
-                    if cluster_status != status_lib.ClusterStatus.INIT:
-                        logger.debug(f'Stop tailing provision logs for cluster'
-                                     f' {cluster_name} has status {cluster_status} '
-                                     '(not in INIT state)')
+                if cluster_name is not None:
+                    if current_time - last_flush_time > _PROVISION_LOG_TIMEOUT:
                         break
+                    if should_check_status:
+                        last_status_check_time = current_time
+                        cluster_status = await (
+                            global_user_state.get_status_from_cluster_name_async(
+                                cluster_name))
+                        if cluster_status is None:
+                            logger.debug(
+                                'Stop tailing provision logs for cluster'
+                                f' status for cluster {cluster_name} not found')
+                            break
+                        if cluster_status != status_lib.ClusterStatus.INIT:
+                            logger.debug(
+                                f'Stop tailing provision logs for cluster'
+                                f' {cluster_name} has status {cluster_status} '
+                                '(not in INIT state)')
                 if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
                     # Currently just used to keep the connection busy, refer to
                     # https://github.com/skypilot-org/skypilot/issues/5750 for
@@ -267,28 +348,36 @@ def stream_response_for_long_request(
     request_id: str,
     logs_path: pathlib.Path,
     background_tasks: fastapi.BackgroundTasks,
+    kill_request_on_disconnect: bool = True,
 ) -> fastapi.responses.StreamingResponse:
-
-
-
-
+    """Stream the logs of a long request."""
+    return stream_response(
+        request_id,
+        logs_path,
+        background_tasks,
+        polling_interval=LONG_REQUEST_POLL_INTERVAL,
+        kill_request_on_disconnect=kill_request_on_disconnect,
+    )


 def stream_response(
     request_id: str,
     logs_path: pathlib.Path,
     background_tasks: fastapi.BackgroundTasks,
-    polling_interval: float = DEFAULT_POLL_INTERVAL
+    polling_interval: float = DEFAULT_POLL_INTERVAL,
+    kill_request_on_disconnect: bool = True,
 ) -> fastapi.responses.StreamingResponse:

-
-
-
-
+    if kill_request_on_disconnect:
+
+        async def on_disconnect():
+            logger.info(f'User terminated the connection for request '
+                        f'{request_id}')
+            requests_lib.kill_requests([request_id])

-
-
-
+        # The background task will be run after returning a response.
+        # https://fastapi.tiangolo.com/tutorial/background-tasks/
+        background_tasks.add_task(on_disconnect)

     return fastapi.responses.StreamingResponse(
         log_streamer(request_id, logs_path, polling_interval=polling_interval),
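The directory branch added to log_streamer above multiplexes a growing set of per-node provision logs, tail -f style. Below is a minimal standalone sketch of the same pattern using only the standard library; the polling cadence, file pattern, and function name are illustrative, not SkyPilot's actual API:

import asyncio
import pathlib
from typing import AsyncIterator, Set

MAX_EMPTY_POLLS = 6  # give up after ~3s with no new files (0.5s per poll)

async def stream_log_dir(log_dir: pathlib.Path,
                         follow: bool = True) -> AsyncIterator[str]:
    """Yield the contents of each *.log file in log_dir as it appears.

    New files that show up while streaming are picked up, mirroring the
    sequential creation of worker provision logs.
    """
    streamed: Set[pathlib.Path] = set()
    empty_polls = 0
    while True:
        new_files = [p for p in sorted(log_dir.glob('*.log'))
                     if p not in streamed]
        if not new_files:
            if not follow:
                break
            await asyncio.sleep(0.5)  # wait for the next file to appear
            empty_polls += 1
            if empty_polls > MAX_EMPTY_POLLS:
                break  # assume provisioning finished writing logs
            continue
        empty_polls = 0
        for path in new_files:
            yield f'\n==> {path} <==\n\n'  # tail -f style header
            yield path.read_text()
            streamed.add(path)
        if not follow:
            break

The key design choice, visible in both the real diff and this sketch, is bounding the wait for new files so a stuck provisioning run cannot hold the stream open forever.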
sky/server/uvicorn.py
CHANGED
@@ -46,11 +46,11 @@ except ValueError:

 # TODO(aylei): use decorator to register requests that need to be proactively
 # cancelled instead of hardcoding here.
-_RETRIABLE_REQUEST_NAMES =
+_RETRIABLE_REQUEST_NAMES = {
     'sky.logs',
     'sky.jobs.logs',
     'sky.serve.logs',
-
+}


 def add_timestamp_prefix_for_server_logs() -> None:
@@ -151,37 +151,38 @@ class Server(uvicorn.Server):
                 requests_lib.RequestStatus.PENDING,
                 requests_lib.RequestStatus.RUNNING,
             ]
-
-
-
+            requests = [(request_task.request_id, request_task.name)
+                        for request_task in requests_lib.get_request_tasks(
+                            req_filter=requests_lib.RequestTaskFilter(
+                                status=statuses, fields=['request_id', 'name']))
+                       ]
+            if not requests:
                 break
-            logger.info(f'{len(
+            logger.info(f'{len(requests)} on-going requests '
                         'found, waiting for them to finish...')
             # Proactively cancel internal requests and logs requests since
             # they can run for infinite time.
-            internal_request_ids =
+            internal_request_ids = {
                 d.id for d in daemons.INTERNAL_REQUEST_DAEMONS
-
+            }
             if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
                 logger.warning('Timeout waiting for on-going requests to '
                                'finish, cancelling all on-going requests.')
-                for
-                    self.interrupt_request_for_retry(
+                for request_id, _ in requests:
+                    self.interrupt_request_for_retry(request_id)
                 break
             interrupted = 0
-            for
-                if
-
-
-            elif req.name in _RETRIABLE_REQUEST_NAMES:
-                self.interrupt_request_for_retry(req.request_id)
+            for request_id, name in requests:
+                if (name in _RETRIABLE_REQUEST_NAMES or
+                        request_id in internal_request_ids):
+                    self.interrupt_request_for_retry(request_id)
                 interrupted += 1
             # TODO(aylei): interrupt pending requests to accelerate the
             # shutdown.
             # If some requests are not interrupted, wait for them to finish,
             # otherwise we just check again immediately to accelerate the
             # shutdown process.
-            if interrupted < len(
+            if interrupted < len(requests):
                 time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)

     def interrupt_request_for_retry(self, request_id: str) -> None:
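The shutdown loop above drains in-flight requests, immediately interrupting the ones known to be retriable (log streams can run forever) and hard-cancelling everything after a timeout. A self-contained sketch of that policy, with the timeout values and callback names being illustrative stand-ins for SkyPilot's internals:

import time
from typing import Callable, List, Tuple

RETRIABLE = {'sky.logs', 'sky.jobs.logs', 'sky.serve.logs'}
TIMEOUT_S = 60.0
POLL_S = 1.0

def drain_requests(list_active: Callable[[], List[Tuple[str, str]]],
                   interrupt: Callable[[str], None]) -> None:
    """Wait for active (request_id, name) pairs to finish, interrupting
    retriable ones immediately and everything else after TIMEOUT_S."""
    start = time.time()
    while True:
        active = list_active()
        if not active:
            return
        if time.time() - start > TIMEOUT_S:
            for request_id, _ in active:  # hard deadline: cancel all
                interrupt(request_id)
            return
        interrupted = 0
        for request_id, name in active:
            if name in RETRIABLE:  # safe to cancel: clients will retry
                interrupt(request_id)
                interrupted += 1
        if interrupted < len(active):
            time.sleep(POLL_S)  # some requests must finish naturally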
sky/setup_files/alembic.ini
CHANGED
@@ -98,6 +98,10 @@ version_table = alembic_version_spot_jobs_db
 version_locations = %(here)s/../schemas/db/serve_state
 version_table = alembic_version_serve_state_db

+[sky_config_db]
+version_locations = %(here)s/../schemas/db/skypilot_config
+version_table = alembic_version_sky_config_db
+
 [post_write_hooks]
 # post_write_hooks defines scripts or Python functions that are run
 # on newly generated revision scripts. See the documentation for further
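Each section in this ini file gives one logical database its own migration scripts and its own Alembic version table, so their upgrade histories never collide. A minimal sketch of driving the new [sky_config_db] section programmatically; SkyPilot actually wraps this in sky.utils.db.migration_utils.safe_alembic_upgrade, and the URL below plus the assumption that the project's env.py honors the section's version_table are illustrative:

from alembic import command
from alembic.config import Config

# Point Alembic at the [sky_config_db] section instead of the default
# [alembic] section, so it picks up that section's version_locations
# and version_table.
cfg = Config('alembic.ini', ini_section='sky_config_db')
cfg.set_main_option('sqlalchemy.url', 'sqlite:///skypilot_config.db')

# Upgrade the config database to the latest revision; its migration
# head is tracked in alembic_version_sky_config_db, separate from the
# other databases' version tables.
command.upgrade(cfg, 'head')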
sky/skylet/services.py
CHANGED
@@ -408,17 +408,17 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
     ) -> managed_jobsv1_pb2.GetJobTableResponse:
         try:
             accessible_workspaces = list(request.accessible_workspaces)
-            job_ids = list(request.job_ids.ids)
+            job_ids = (list(request.job_ids.ids)
+                       if request.HasField('job_ids') else None)
             user_hashes: Optional[List[Optional[str]]] = None
-            if request.user_hashes:
+            if request.HasField('user_hashes'):
                 user_hashes = list(request.user_hashes.hashes)
                 # For backwards compatibility, we show jobs that do not have a
                 # user_hash. TODO: Remove before 0.12.0.
                 if request.show_jobs_without_user_hash:
                     user_hashes.append(None)
-            statuses = list(
-
-
+            statuses = (list(request.statuses.statuses)
+                        if request.HasField('statuses') else None)
             job_queue = managed_job_utils.get_managed_job_queue(
                 skip_finished=request.skip_finished,
                 accessible_workspaces=accessible_workspaces,
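The change above swaps truthiness checks for protobuf HasField so that an unset optional message field ("no filter requested") is distinguished from a present-but-empty one ("filter that matches nothing"). The same distinction expressed in plain Python, with a hypothetical helper that is not part of SkyPilot's API:

from typing import List, Optional

ALL_JOBS = [1, 2, 3]

def filter_jobs(job_ids: Optional[List[int]]) -> List[int]:
    """None means 'no filter'; [] means 'match nothing'.

    A truthiness check (`if job_ids:`) would wrongly treat [] like None,
    which is exactly the ambiguity HasField resolves for proto fields.
    """
    if job_ids is None:
        return ALL_JOBS
    return [j for j in ALL_JOBS if j in job_ids]

assert filter_jobs(None) == [1, 2, 3]  # unset: return everything
assert filter_jobs([]) == []           # empty filter: return nothing
assert filter_jobs([2]) == [2]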
sky/skypilot_config.py
CHANGED
@@ -64,7 +64,6 @@ from sqlalchemy import orm
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.ext import declarative
-from sqlalchemy.pool import NullPool

 from sky import exceptions
 from sky import sky_logging
@@ -77,6 +76,7 @@ from sky.utils import schemas
 from sky.utils import ux_utils
 from sky.utils import yaml_utils
 from sky.utils.db import db_utils
+from sky.utils.db import migration_utils
 from sky.utils.kubernetes import config_map_utils

 if typing.TYPE_CHECKING:
@@ -121,7 +121,8 @@ _PROJECT_CONFIG_PATH = '.sky.yaml'

 API_SERVER_CONFIG_KEY = 'api_server_config'

-
+_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_LOCK = threading.Lock()

 Base = declarative.declarative_base()

@@ -481,7 +482,7 @@ def safe_reload_config() -> None:
     reload_config()


-def reload_config() -> None:
+def reload_config(init_db: bool = False) -> None:
     internal_config_path = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
     if internal_config_path is not None:
         # {ENV_VAR_SKYPILOT_CONFIG} is used internally.
@@ -493,7 +494,7 @@ def reload_config() -> None:
         return

     if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
-        _reload_config_as_server()
+        _reload_config_as_server(init_db=init_db)
     else:
         _reload_config_as_client()

@@ -564,7 +565,43 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
     _set_loaded_config_path(config_path)


-def _reload_config_as_server() -> None:
+def _create_table(engine: sqlalchemy.engine.Engine):
+    """Initialize the config database with migrations."""
+    migration_utils.safe_alembic_upgrade(
+        engine, migration_utils.SKYPILOT_CONFIG_DB_NAME,
+        migration_utils.SKYPILOT_CONFIG_VERSION)
+
+
+def _initialize_and_get_db() -> sqlalchemy.engine.Engine:
+    """Initialize and return the config database engine.
+
+    This function should only be called by the API Server during initialization.
+    Client-side code should never call this function.
+    """
+    assert os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None, (
+        'initialize_and_get_db() can only be called by the API Server')
+
+    global _SQLALCHEMY_ENGINE
+
+    if _SQLALCHEMY_ENGINE is not None:
+        return _SQLALCHEMY_ENGINE
+
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE is not None:
+            return _SQLALCHEMY_ENGINE
+
+        # We only store config in the DB when using Postgres,
+        # so no need to pass in db_name here.
+        engine = db_utils.get_engine(None)
+
+        # Run migrations if needed
+        _create_table(engine)
+
+        _SQLALCHEMY_ENGINE = engine
+        return _SQLALCHEMY_ENGINE
+
+
+def _reload_config_as_server(init_db: bool = False) -> None:
     # Reset the global variables, to avoid using stale values.
     _set_loaded_config(config_utils.Config())
     _set_loaded_config_path(None)
@@ -580,37 +617,24 @@ def _reload_config_as_server() -> None:
         raise ValueError(
             'If db config is specified, no other config is allowed')
     logger.debug('retrieving config from database')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        db_config = config_utils.Config(
-            yaml_utils.safe_load(row.value))
-        db_config.pop_nested(('db',), None)
-        return db_config
-    return None
-
-    db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
-    if db_config:
-        server_config = overlay_skypilot_config(server_config,
-                                                db_config)
-    # Close the engine to avoid connection leaks
-    if dispose_engine:
-        sqlalchemy_engine.dispose()
+
+    if init_db:
+        _initialize_and_get_db()
+
+    def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
+        assert _SQLALCHEMY_ENGINE is not None
+        with orm.Session(_SQLALCHEMY_ENGINE) as session:
+            row = session.query(config_yaml_table).filter_by(
+                key=key).first()
+            if row:
+                db_config = config_utils.Config(yaml_utils.safe_load(row.value))
+                db_config.pop_nested(('db',), None)
+                return db_config
+        return None
+
+    db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
+    if db_config:
+        server_config = overlay_skypilot_config(server_config, db_config)
     if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
         logger.debug(f'server config: \n'
                      f'{yaml_utils.dump_yaml_str(dict(server_config))}')
@@ -666,7 +690,7 @@ def loaded_config_path_serialized() -> Optional[str]:


 # Load on import, synchronization is guaranteed by python interpreter.
-reload_config()
+reload_config(init_db=True)


 def loaded() -> bool:
@@ -880,44 +904,32 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
     if new_db_url and new_db_url != existing_db_url:
         raise ValueError('Cannot change db url while server is running')
     if existing_db_url:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        do_update_stmt = insert_stmnt.on_conflict_do_update(
-            index_elements=[config_yaml_table.c.key],
-            set_={config_yaml_table.c.value: config_str})
-        session.execute(do_update_stmt)
-        session.commit()
-
-        logger.debug('saving api_server config to db')
-        _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
-        db_updated = True
-        # Close the engine to avoid connection leaks
-        if dispose_engine:
-            sqlalchemy_engine.dispose()
+
+        def _set_config_yaml_to_db(key: str, config: config_utils.Config):
+            # reload_config(init_db=True) is called when this module is
+            # imported, so the database engine must already be initialized.
+            assert _SQLALCHEMY_ENGINE is not None
+            config_str = yaml_utils.dump_yaml_str(dict(config))
+            with orm.Session(_SQLALCHEMY_ENGINE) as session:
+                if (_SQLALCHEMY_ENGINE.dialect.name ==
+                        db_utils.SQLAlchemyDialect.SQLITE.value):
+                    insert_func = sqlite.insert
+                elif (_SQLALCHEMY_ENGINE.dialect.name ==
+                      db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+                    insert_func = postgresql.insert
+                else:
+                    raise ValueError('Unsupported database dialect')
+                insert_stmnt = insert_func(config_yaml_table).values(
+                    key=key, value=config_str)
+                do_update_stmt = insert_stmnt.on_conflict_do_update(
+                    index_elements=[config_yaml_table.c.key],
+                    set_={config_yaml_table.c.value: config_str})
+                session.execute(do_update_stmt)
+                session.commit()
+
+        logger.debug('saving api_server config to db')
+        _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
+        db_updated = True

     if not db_updated:
         # save to the local file (PVC in Kubernetes, local file otherwise)
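The new _set_config_yaml_to_db uses a dialect-aware INSERT ... ON CONFLICT DO UPDATE so the config row is created or overwritten in a single atomic statement. A self-contained sketch of the same upsert pattern against an in-memory SQLite database; the table and key names are illustrative, not SkyPilot's schema:

import sqlalchemy
from sqlalchemy.dialects import sqlite

metadata = sqlalchemy.MetaData()
config_yaml = sqlalchemy.Table(
    'config_yaml', metadata,
    sqlalchemy.Column('key', sqlalchemy.Text, primary_key=True),
    sqlalchemy.Column('value', sqlalchemy.Text))

engine = sqlalchemy.create_engine('sqlite://')
metadata.create_all(engine)

def set_config(key: str, value: str) -> None:
    # One statement that either inserts the row or, on a primary-key
    # conflict, updates the existing value in place.
    stmt = sqlite.insert(config_yaml).values(key=key, value=value)
    stmt = stmt.on_conflict_do_update(
        index_elements=[config_yaml.c.key],
        set_={config_yaml.c.value: value})
    with engine.begin() as conn:
        conn.execute(stmt)

set_config('api_server_config', 'a: 1')
set_config('api_server_config', 'a: 2')  # overwrites, no duplicate key
with engine.connect() as conn:
    row = conn.execute(sqlalchemy.select(config_yaml)).one()
    assert row.value == 'a: 2'

For Postgres the diff swaps in sqlalchemy.dialects.postgresql.insert, which exposes the same on_conflict_do_update API; that is why the real code selects insert_func by dialect name.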
sky/ssh_node_pools/server.py
CHANGED
@@ -99,7 +99,7 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
     """Deploy SSH Node Pool using existing ssh_up functionality."""
     try:
         ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
-        executor.
+        await executor.schedule_request_async(
             request_id=request.state.request_id,
             request_name='ssh_up',
             request_body=ssh_up_body,
@@ -124,7 +124,7 @@ async def deploy_ssh_node_pool_general(
         ssh_up_body: payloads.SSHUpBody) -> Dict[str, str]:
     """Deploys all SSH Node Pools."""
     try:
-        executor.
+        await executor.schedule_request_async(
             request_id=request.state.request_id,
             request_name='ssh_up',
             request_body=ssh_up_body,
@@ -150,7 +150,7 @@ async def down_ssh_node_pool(request: fastapi.Request,
     """Cleans up a SSH Node Pools."""
     try:
         ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
-        executor.
+        await executor.schedule_request_async(
             request_id=request.state.request_id,
             request_name='ssh_down',
             request_body=ssh_up_body,
@@ -178,7 +178,7 @@ async def down_ssh_node_pool_general(
     try:
         # Set cleanup=True for down operation
         ssh_up_body.cleanup = True
-        executor.
+        await executor.schedule_request_async(
             request_id=request.state.request_id,
             request_name='ssh_down',
             request_body=ssh_up_body,
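All four handlers above switch from a synchronous executor call to awaiting schedule_request_async. Inside an async def route, a blocking call stalls the whole event loop for its duration, while an awaited coroutine lets other requests make progress. A minimal illustration of the difference; the function names and timings here are stand-ins, not SkyPilot's executor:

import asyncio
import time

async def schedule_request_async(name: str) -> None:
    # Stand-in for an async scheduler: yields control while "working".
    await asyncio.sleep(0.1)

def schedule_request_blocking(name: str) -> None:
    time.sleep(0.1)  # blocks the event loop for its full duration

async def main() -> None:
    # Awaited coroutines overlap: ~0.1s total for 10 requests.
    start = time.monotonic()
    await asyncio.gather(*(schedule_request_async(f'req{i}')
                           for i in range(10)))
    print(f'async:    {time.monotonic() - start:.2f}s')

    # Blocking calls serialize: ~1.0s, and nothing else can run meanwhile.
    start = time.monotonic()
    for i in range(10):
        schedule_request_blocking(f'req{i}')
    print(f'blocking: {time.monotonic() - start:.2f}s')

asyncio.run(main())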
sky/users/permission.py
CHANGED
@@ -14,6 +14,7 @@ from sky import models
 from sky import sky_logging
 from sky.skylet import constants
 from sky.users import rbac
+from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils.db import db_utils

@@ -254,6 +255,9 @@ class PermissionService:
         with _policy_lock():
             self._load_policy_no_lock()

+    # Right now, not a lot of users are using multiple workspaces,
+    # so 5 should be more than enough.
+    @annotations.lru_cache(scope='request', maxsize=5)
     def check_workspace_permission(self, user_id: str,
                                    workspace_name: str) -> bool:
         """Check workspace permission.