skypilot-nightly 1.0.0.dev20250609__py3-none-any.whl → 1.0.0.dev20250610__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +3 -0
- sky/authentication.py +1 -7
- sky/backends/cloud_vm_ray_backend.py +9 -20
- sky/cli.py +2 -4
- sky/client/cli.py +2 -4
- sky/client/sdk.py +49 -4
- sky/clouds/kubernetes.py +15 -24
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/4lwUJxN6KwBqUxqO1VccB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +1 -0
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +1 -0
- sky/dashboard/out/_next/static/chunks/37-d8aebf1683522a0b.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +6 -0
- sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +1 -0
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +1 -0
- sky/dashboard/out/_next/static/chunks/{121-865d2bf8a3b84c6a.js → 491.b3d264269613fe09.js} +3 -3
- sky/dashboard/out/_next/static/chunks/513.211357a2914a34b2.js +1 -0
- sky/dashboard/out/_next/static/chunks/600.9cc76ec442b22e10.js +16 -0
- sky/dashboard/out/_next/static/chunks/616-d6128fa9e7cae6e6.js +39 -0
- sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +1 -0
- sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +8 -0
- sky/dashboard/out/_next/static/chunks/804-4c9fc53aa74bc191.js +21 -0
- sky/dashboard/out/_next/static/chunks/843-6fcc4bf91ac45b39.js +11 -0
- sky/dashboard/out/_next/static/chunks/856-0776dc6ed6000c39.js +1 -0
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-a75b7712639298b7.js +1 -0
- sky/dashboard/out/_next/static/chunks/947-6620842ef80ae879.js +35 -0
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +1 -0
- sky/dashboard/out/_next/static/chunks/973-c807fc34f09c7df3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-4768de0aede04dc9.js +20 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-451a14e7e755ebbc.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-e56b17fd85d0ba58.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-fe233baf3d073491.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c8c2191328532b7d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-0574a5a4ba3cf0ac.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +18 -0
- sky/global_user_state.py +181 -74
- sky/jobs/client/sdk.py +29 -21
- sky/provision/kubernetes/constants.py +9 -0
- sky/provision/kubernetes/utils.py +106 -7
- sky/serve/client/sdk.py +56 -45
- sky/server/common.py +1 -5
- sky/server/requests/executor.py +50 -20
- sky/server/requests/payloads.py +3 -0
- sky/server/requests/process.py +69 -29
- sky/server/server.py +1 -0
- sky/server/stream_utils.py +111 -55
- sky/skylet/constants.py +1 -2
- sky/skypilot_config.py +99 -25
- sky/users/permission.py +1 -1
- sky/utils/admin_policy_utils.py +9 -3
- sky/utils/context.py +21 -1
- sky/utils/controller_utils.py +16 -1
- sky/utils/kubernetes/exec_kubeconfig_converter.py +19 -47
- {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/RECORD +85 -74
- sky/dashboard/out/_next/static/chunks/236-619ed0248fb6fdd9.js +0 -6
- sky/dashboard/out/_next/static/chunks/293-351268365226d251.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-600191c5804dcae2.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-680c19413b8f808b.js +0 -1
- sky/dashboard/out/_next/static/chunks/63-e2d7b1e75e67c713.js +0 -66
- sky/dashboard/out/_next/static/chunks/682-b60cfdacc15202e8.js +0 -6
- sky/dashboard/out/_next/static/chunks/843-16c7194621b2b512.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-affc52adf5403a3a.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-2c584e28e6b4b106.js +0 -1
- sky/dashboard/out/_next/static/chunks/973-aed916d5b02d2d63.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5f16aba5794ee8e7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-d31688d3e52736dd.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e7d8710a9b0491e5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-3c674e5d970e05cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-3aac7a015c6eede1.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-46d2e4ad6c487260.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-7013d816a2a0e76c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-f7f0c9e156d328bc.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-87e60396c376292f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-9355a0f13d1db61d.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-9a749cca1813bd27.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-8eeb628e03902f1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8fbcc5ab4af316d0.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-65d465f948974c0d.js +0 -1
- sky/dashboard/out/_next/static/xos0euNCptbGAM7_Q3Acl/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{xos0euNCptbGAM7_Q3Acl → 4lwUJxN6KwBqUxqO1VccB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/top_level.txt +0 -0
sky/server/requests/process.py
CHANGED
@@ -6,6 +6,7 @@ import threading
|
|
6
6
|
import time
|
7
7
|
from typing import Callable, Dict, Optional, Tuple
|
8
8
|
|
9
|
+
from sky import exceptions
|
9
10
|
from sky.utils import atomic
|
10
11
|
from sky.utils import subprocess_utils
|
11
12
|
|
@@ -67,14 +68,24 @@ class PoolExecutor(concurrent.futures.ProcessPoolExecutor):
|
|
67
68
|
|
68
69
|
|
69
70
|
# Define the worker function outside of the class to avoid pickling self
|
70
|
-
def _disposable_worker(fn, initializer
|
71
|
-
|
71
|
+
def _disposable_worker(fn, initializer, initargs, result_queue, args, kwargs):
|
72
|
+
"""The worker function that is used to run the task.
|
73
|
+
|
74
|
+
Args:
|
75
|
+
fn: The function to run.
|
76
|
+
initializer: The initializer function to run before running the task.
|
77
|
+
initargs: The arguments to pass to the initializer function.
|
78
|
+
result_queue: The queue to put the result and exception into.
|
79
|
+
args: The arguments to pass to the function.
|
80
|
+
kwargs: The keyword arguments to pass to the function.
|
81
|
+
"""
|
72
82
|
try:
|
73
83
|
if initializer is not None:
|
74
84
|
initializer(*initargs)
|
75
|
-
fn(*args, **kwargs)
|
85
|
+
result = fn(*args, **kwargs)
|
86
|
+
result_queue.put(result)
|
76
87
|
except BaseException as e: # pylint: disable=broad-except
|
77
|
-
|
88
|
+
result_queue.put(e)
|
78
89
|
|
79
90
|
|
80
91
|
class DisposableExecutor:
|
@@ -98,28 +109,52 @@ class DisposableExecutor:
|
|
98
109
|
self._initializer: Optional[Callable] = initializer
|
99
110
|
self._initargs: Tuple = initargs
|
100
111
|
|
101
|
-
def _monitor_worker(self, process: multiprocessing.Process
|
112
|
+
def _monitor_worker(self, process: multiprocessing.Process,
|
113
|
+
future: concurrent.futures.Future,
|
114
|
+
result_queue: multiprocessing.Queue) -> None:
|
102
115
|
"""Monitor the worker process and cleanup when it's done."""
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
116
|
+
try:
|
117
|
+
process.join()
|
118
|
+
if not future.cancelled():
|
119
|
+
try:
|
120
|
+
# Get result from the queue if process completed
|
121
|
+
if not result_queue.empty():
|
122
|
+
result = result_queue.get(block=False)
|
123
|
+
if isinstance(result, BaseException):
|
124
|
+
future.set_exception(result)
|
125
|
+
else:
|
126
|
+
future.set_result(result)
|
127
|
+
else:
|
128
|
+
# Process ended but no result
|
129
|
+
future.set_result(None)
|
130
|
+
except (multiprocessing.TimeoutError, BrokenPipeError,
|
131
|
+
EOFError) as e:
|
132
|
+
future.set_exception(e)
|
133
|
+
finally:
|
134
|
+
if process.pid:
|
135
|
+
with self._lock:
|
136
|
+
if process.pid in self.workers:
|
137
|
+
del self.workers[process.pid]
|
138
|
+
|
139
|
+
def submit(self, fn, *args, **kwargs) -> concurrent.futures.Future:
|
140
|
+
"""Submit a task for execution and return a Future."""
|
141
|
+
future: concurrent.futures.Future = concurrent.futures.Future()
|
142
|
+
|
113
143
|
if self._shutdown:
|
114
|
-
|
144
|
+
raise RuntimeError('Cannot submit task after executor is shutdown')
|
145
|
+
|
115
146
|
with self._lock:
|
116
147
|
if (self.max_workers is not None and
|
117
148
|
len(self.workers) >= self.max_workers):
|
118
|
-
|
149
|
+
raise exceptions.ExecutionPoolFullError(
|
150
|
+
'Maximum workers reached')
|
119
151
|
|
152
|
+
result_queue: multiprocessing.Queue = multiprocessing.Queue()
|
120
153
|
process = multiprocessing.Process(target=_disposable_worker,
|
121
154
|
args=(fn, self._initializer,
|
122
|
-
self._initargs,
|
155
|
+
self._initargs, result_queue,
|
156
|
+
args, kwargs))
|
157
|
+
process.daemon = True
|
123
158
|
process.start()
|
124
159
|
|
125
160
|
with self._lock:
|
@@ -128,13 +163,13 @@ class DisposableExecutor:
|
|
128
163
|
raise RuntimeError('Failed to start process')
|
129
164
|
self.workers[pid] = process
|
130
165
|
|
131
|
-
# Start monitor thread to cleanup the worker process when it's done
|
166
|
+
# Start monitor thread to cleanup the worker process when it's done
|
132
167
|
monitor_thread = threading.Thread(target=self._monitor_worker,
|
133
|
-
args=(process,),
|
168
|
+
args=(process, future, result_queue),
|
134
169
|
daemon=True)
|
135
170
|
monitor_thread.start()
|
136
171
|
|
137
|
-
return
|
172
|
+
return future
|
138
173
|
|
139
174
|
def has_idle_workers(self) -> bool:
|
140
175
|
"""Check if there are any idle workers."""
|
@@ -173,12 +208,14 @@ class BurstableExecutor:
|
|
173
208
|
self._burst_executor = DisposableExecutor(max_workers=burst_workers,
|
174
209
|
**kwargs)
|
175
210
|
|
176
|
-
def submit_until_success(self, fn, *args,
|
211
|
+
def submit_until_success(self, fn, *args,
|
212
|
+
**kwargs) -> concurrent.futures.Future:
|
177
213
|
"""Submit a task for execution until success.
|
178
214
|
|
179
215
|
Prioritizes submitting to the guaranteed pool. If no idle workers
|
180
216
|
are available in the guaranteed pool, it will submit to the burst
|
181
|
-
pool.
|
217
|
+
pool. If the burst pool is full, it will retry the whole process until
|
218
|
+
the task is submitted successfully.
|
182
219
|
TODO(aylei): this is coupled with executor.RequestWorker since we
|
183
220
|
know the worker is dedicated to request scheduling and it either
|
184
221
|
blocks on request polling or request submitting. So it is no harm
|
@@ -188,17 +225,20 @@ class BurstableExecutor:
|
|
188
225
|
|
189
226
|
while True:
|
190
227
|
if self._executor is not None and self._executor.has_idle_workers():
|
191
|
-
|
192
|
-
|
228
|
+
logger.info('Submitting to the guaranteed pool')
|
229
|
+
return self._executor.submit(fn, *args, **kwargs)
|
193
230
|
if (self._burst_executor is not None and
|
194
231
|
self._burst_executor.has_idle_workers()):
|
195
|
-
|
196
|
-
|
232
|
+
try:
|
233
|
+
fut = self._burst_executor.submit(fn, *args, **kwargs)
|
234
|
+
return fut
|
235
|
+
except exceptions.ExecutionPoolFullError:
|
236
|
+
# The burst pool is full, try the next candidate.
|
237
|
+
pass
|
197
238
|
if self._executor is not None:
|
198
239
|
# No idle workers in either pool, still queue the request
|
199
240
|
# to the guaranteed pool to keep behavior consistent.
|
200
|
-
self._executor.submit(fn, *args, **kwargs)
|
201
|
-
break
|
241
|
+
return self._executor.submit(fn, *args, **kwargs)
|
202
242
|
logger.debug('No guaranteed pool set and the burst pool is full, '
|
203
243
|
'retry later.')
|
204
244
|
time.sleep(0.1)
|
sky/server/server.py
CHANGED
sky/server/stream_utils.py
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
import asyncio
|
4
4
|
import collections
|
5
5
|
import pathlib
|
6
|
-
from typing import AsyncGenerator, Deque, Optional
|
6
|
+
from typing import AsyncGenerator, Deque, List, Optional
|
7
7
|
|
8
8
|
import aiofiles
|
9
9
|
import fastapi
|
@@ -15,6 +15,12 @@ from sky.utils import rich_utils
|
|
15
15
|
|
16
16
|
logger = sky_logging.init_logger(__name__)
|
17
17
|
|
18
|
+
# When streaming log lines, buffer the lines in memory and flush them in chunks
|
19
|
+
# to improve log tailing throughput. Buffer size is the maximum size in bytes of each
|
20
|
+
# chunk and the timeout threshold for flushing the buffer to ensure
|
21
|
+
# responsiveness.
|
22
|
+
_BUFFER_SIZE = 8 * 1024 # 8KB
|
23
|
+
_BUFFER_TIMEOUT = 0.02 # 20ms
|
18
24
|
_HEARTBEAT_INTERVAL = 30
|
19
25
|
|
20
26
|
|
@@ -36,7 +42,16 @@ async def log_streamer(request_id: Optional[str],
|
|
36
42
|
plain_logs: bool = False,
|
37
43
|
tail: Optional[int] = None,
|
38
44
|
follow: bool = True) -> AsyncGenerator[str, None]:
|
39
|
-
"""Streams the logs of a request.
|
45
|
+
"""Streams the logs of a request.
|
46
|
+
|
47
|
+
Args:
|
48
|
+
request_id: The request ID to check whether the log tailing process
|
49
|
+
should be stopped.
|
50
|
+
log_path: The path to the log file.
|
51
|
+
plain_logs: Whether to show plain logs.
|
52
|
+
tail: The number of lines to tail. If None, tail the whole file.
|
53
|
+
follow: Whether to follow the log file.
|
54
|
+
"""
|
40
55
|
|
41
56
|
if request_id is not None:
|
42
57
|
status_msg = rich_utils.EncodedStatusMessage(
|
@@ -80,65 +95,106 @@ async def log_streamer(request_id: Optional[str],
|
|
80
95
|
if show_request_waiting_spinner:
|
81
96
|
yield status_msg.stop()
|
82
97
|
|
83
|
-
# Find last n lines of the log file. Do not read the whole file into memory.
|
84
98
|
async with aiofiles.open(log_path, 'rb') as f:
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
99
|
+
async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
|
100
|
+
follow):
|
101
|
+
yield chunk
|
102
|
+
|
103
|
+
|
104
|
+
async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
|
105
|
+
request_id: Optional[str] = None,
|
106
|
+
plain_logs: bool = False,
|
107
|
+
tail: Optional[int] = None,
|
108
|
+
follow: bool = True) -> AsyncGenerator[str, None]:
|
109
|
+
"""Tail the opened log file, buffer the lines and flush in chunks."""
|
110
|
+
|
111
|
+
if tail is not None:
|
112
|
+
# Find last n lines of the log file. Do not read the whole file into
|
113
|
+
# memory.
|
114
|
+
# TODO(zhwu): this will include the control lines for rich status,
|
115
|
+
# which may not lead to exact tail lines when showing on the client
|
116
|
+
# side.
|
117
|
+
lines: Deque[str] = collections.deque(maxlen=tail)
|
118
|
+
async for line_str in _yield_log_file_with_payloads_skipped(f):
|
119
|
+
lines.append(line_str)
|
120
|
+
for line_str in lines:
|
121
|
+
yield line_str
|
96
122
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
123
|
+
last_heartbeat_time = asyncio.get_event_loop().time()
|
124
|
+
|
125
|
+
# Buffer the lines in memory and flush them in chunks to improve log
|
126
|
+
# tailing throughput.
|
127
|
+
buffer: List[str] = []
|
128
|
+
buffer_bytes = 0
|
129
|
+
last_flush_time = asyncio.get_event_loop().time()
|
130
|
+
|
131
|
+
async def flush_buffer() -> AsyncGenerator[str, None]:
|
132
|
+
nonlocal buffer, buffer_bytes, last_flush_time
|
133
|
+
if buffer:
|
134
|
+
yield ''.join(buffer)
|
135
|
+
buffer.clear()
|
136
|
+
buffer_bytes = 0
|
137
|
+
last_flush_time = asyncio.get_event_loop().time()
|
138
|
+
|
139
|
+
while True:
|
140
|
+
# Sleep 0 to yield control to allow other coroutines to run,
|
141
|
+
# while keeping the loop tight to make the log stream responsive.
|
142
|
+
await asyncio.sleep(0)
|
143
|
+
current_time = asyncio.get_event_loop().time()
|
144
|
+
# Flush the buffer when it is not empty and the buffer is full or the
|
145
|
+
# flush timeout is reached.
|
146
|
+
if buffer and (buffer_bytes >= _BUFFER_SIZE or
|
147
|
+
(current_time - last_flush_time) >= _BUFFER_TIMEOUT):
|
148
|
+
async for chunk in flush_buffer():
|
149
|
+
yield chunk
|
150
|
+
|
151
|
+
line: Optional[bytes] = await f.readline()
|
152
|
+
if not line:
|
153
|
+
if request_id is not None:
|
154
|
+
request_task = requests_lib.get_request(request_id)
|
155
|
+
if request_task.status > requests_lib.RequestStatus.RUNNING:
|
156
|
+
if (request_task.status ==
|
157
|
+
requests_lib.RequestStatus.CANCELLED):
|
158
|
+
buffer.append(
|
159
|
+
f'{request_task.name!r} request {request_id}'
|
160
|
+
' cancelled\n')
|
112
161
|
break
|
162
|
+
if not follow:
|
163
|
+
break
|
164
|
+
|
165
|
+
if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
|
166
|
+
# Currently just used to keep the connection busy, refer to
|
167
|
+
# https://github.com/skypilot-org/skypilot/issues/5750 for
|
168
|
+
# more details.
|
169
|
+
buffer.append(
|
170
|
+
message_utils.encode_payload(
|
171
|
+
rich_utils.Control.HEARTBEAT.encode('')))
|
172
|
+
last_heartbeat_time = current_time
|
173
|
+
|
174
|
+
# Sleep shortly to avoid storming the DB and CPU, this has
|
175
|
+
# little impact on the responsiveness here since we are waiting
|
176
|
+
# for a new line to come in.
|
177
|
+
await asyncio.sleep(0.1)
|
178
|
+
continue
|
113
179
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
# for a new line to come in.
|
126
|
-
await asyncio.sleep(0.1)
|
180
|
+
# Refresh the heartbeat time, this is a trivial optimization for
|
181
|
+
# performance but it helps avoid unnecessary heartbeat strings
|
182
|
+
# being printed when the client runs in an old version.
|
183
|
+
last_heartbeat_time = asyncio.get_event_loop().time()
|
184
|
+
line_str = line.decode('utf-8')
|
185
|
+
if plain_logs:
|
186
|
+
is_payload, line_str = message_utils.decode_payload(
|
187
|
+
line_str, raise_for_mismatch=False)
|
188
|
+
# TODO(aylei): implement heartbeat mechanism for plain logs,
|
189
|
+
# sending invisible characters might be okay.
|
190
|
+
if is_payload:
|
127
191
|
continue
|
192
|
+
buffer.append(line_str)
|
193
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
128
194
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
last_heartbeat_time = asyncio.get_event_loop().time()
|
133
|
-
line_str = line.decode('utf-8')
|
134
|
-
if plain_logs:
|
135
|
-
is_payload, line_str = message_utils.decode_payload(
|
136
|
-
line_str, raise_for_mismatch=False)
|
137
|
-
# TODO(aylei): implement heartbeat mechanism for plain logs,
|
138
|
-
# sending invisible characters might be okay.
|
139
|
-
if is_payload:
|
140
|
-
continue
|
141
|
-
yield line_str
|
195
|
+
# Flush remaining lines in the buffer.
|
196
|
+
async for chunk in flush_buffer():
|
197
|
+
yield chunk
|
142
198
|
|
143
199
|
|
144
200
|
def stream_response(
|
sky/skylet/constants.py
CHANGED
@@ -377,8 +377,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
|
377
377
|
]
|
378
378
|
# When overriding the SkyPilot configs on the API server with the client one,
|
379
379
|
# we skip the following keys because they are meant to be client-side configs.
|
380
|
-
SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('
|
381
|
-
('api_server',),
|
380
|
+
SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('api_server',),
|
382
381
|
('allowed_clouds',),
|
383
382
|
('workspaces',), ('db',)]
|
384
383
|
|
sky/skypilot_config.py
CHANGED
@@ -58,6 +58,11 @@ import typing
|
|
58
58
|
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
59
59
|
|
60
60
|
import filelock
|
61
|
+
import sqlalchemy
|
62
|
+
from sqlalchemy import orm
|
63
|
+
from sqlalchemy.dialects import postgresql
|
64
|
+
from sqlalchemy.dialects import sqlite
|
65
|
+
from sqlalchemy.ext import declarative
|
61
66
|
|
62
67
|
from sky import exceptions
|
63
68
|
from sky import sky_logging
|
@@ -66,6 +71,7 @@ from sky.skylet import constants
|
|
66
71
|
from sky.utils import common_utils
|
67
72
|
from sky.utils import config_utils
|
68
73
|
from sky.utils import context
|
74
|
+
from sky.utils import db_utils
|
69
75
|
from sky.utils import schemas
|
70
76
|
from sky.utils import ux_utils
|
71
77
|
from sky.utils.kubernetes import config_map_utils
|
@@ -110,6 +116,56 @@ ENV_VAR_PROJECT_CONFIG = f'{constants.SKYPILOT_ENV_VAR_PREFIX}PROJECT_CONFIG'
|
|
110
116
|
_GLOBAL_CONFIG_PATH = '~/.sky/config.yaml'
|
111
117
|
_PROJECT_CONFIG_PATH = '.sky.yaml'
|
112
118
|
|
119
|
+
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
120
|
+
API_SERVER_CONFIG_KEY = 'api_server_config'
|
121
|
+
|
122
|
+
Base = declarative.declarative_base()
|
123
|
+
|
124
|
+
config_yaml_table = sqlalchemy.Table(
|
125
|
+
'config_yaml',
|
126
|
+
Base.metadata,
|
127
|
+
sqlalchemy.Column('key', sqlalchemy.Text, primary_key=True),
|
128
|
+
sqlalchemy.Column('value', sqlalchemy.Text),
|
129
|
+
)
|
130
|
+
|
131
|
+
|
132
|
+
def create_table():
|
133
|
+
# Create tables if they don't exist
|
134
|
+
Base.metadata.create_all(bind=_SQLALCHEMY_ENGINE)
|
135
|
+
|
136
|
+
|
137
|
+
def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
|
138
|
+
assert _SQLALCHEMY_ENGINE is not None
|
139
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
140
|
+
row = session.query(config_yaml_table).filter_by(key=key).first()
|
141
|
+
if row:
|
142
|
+
db_config = config_utils.Config(yaml.safe_load(row.value))
|
143
|
+
db_config.pop_nested(('db',), None)
|
144
|
+
return db_config
|
145
|
+
return None
|
146
|
+
|
147
|
+
|
148
|
+
def _set_config_yaml_to_db(key: str, config: config_utils.Config):
|
149
|
+
assert _SQLALCHEMY_ENGINE is not None
|
150
|
+
config.pop_nested(('db',), None)
|
151
|
+
config_str = common_utils.dump_yaml_str(dict(config))
|
152
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
153
|
+
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
154
|
+
db_utils.SQLAlchemyDialect.SQLITE.value):
|
155
|
+
insert_func = sqlite.insert
|
156
|
+
elif (_SQLALCHEMY_ENGINE.dialect.name ==
|
157
|
+
db_utils.SQLAlchemyDialect.POSTGRESQL.value):
|
158
|
+
insert_func = postgresql.insert
|
159
|
+
else:
|
160
|
+
raise ValueError('Unsupported database dialect')
|
161
|
+
insert_stmnt = insert_func(config_yaml_table).values(key=key,
|
162
|
+
value=config_str)
|
163
|
+
do_update_stmt = insert_stmnt.on_conflict_do_update(
|
164
|
+
index_elements=[config_yaml_table.c.key],
|
165
|
+
set_={config_yaml_table.c.value: config_str})
|
166
|
+
session.execute(do_update_stmt)
|
167
|
+
session.commit()
|
168
|
+
|
113
169
|
|
114
170
|
class ConfigContext:
|
115
171
|
|
@@ -257,11 +313,6 @@ def _resolve_project_config_path() -> Optional[str]:
|
|
257
313
|
return None
|
258
314
|
|
259
315
|
|
260
|
-
def _get_project_config() -> config_utils.Config:
|
261
|
-
"""Returns the project config."""
|
262
|
-
return _get_config_from_path(_resolve_project_config_path())
|
263
|
-
|
264
|
-
|
265
316
|
def _resolve_server_config_path() -> Optional[str]:
|
266
317
|
# find the server config file
|
267
318
|
server_config_path = _get_config_file_path(ENV_VAR_GLOBAL_CONFIG)
|
@@ -507,26 +558,35 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
|
|
507
558
|
|
508
559
|
|
509
560
|
def _reload_config_as_server() -> None:
|
561
|
+
global _SQLALCHEMY_ENGINE
|
510
562
|
# Reset the global variables, to avoid using stale values.
|
511
563
|
_set_loaded_config(config_utils.Config())
|
512
564
|
_set_loaded_config_path(None)
|
513
565
|
|
514
|
-
overrides: List[config_utils.Config] = []
|
515
566
|
server_config_path = _resolve_server_config_path()
|
516
567
|
server_config = _get_config_from_path(server_config_path)
|
517
|
-
if server_config:
|
518
|
-
overrides.append(server_config)
|
519
568
|
|
520
|
-
# layer the configs on top of each other based on priority
|
521
|
-
overlaid_server_config: config_utils.Config = config_utils.Config()
|
522
|
-
for override in overrides:
|
523
|
-
overlaid_server_config = overlay_skypilot_config(
|
524
|
-
original_config=overlaid_server_config, override_configs=override)
|
525
569
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
526
|
-
logger.debug(
|
527
|
-
|
528
|
-
|
529
|
-
|
570
|
+
logger.debug(f'server config: \n'
|
571
|
+
f'{common_utils.dump_yaml_str(dict(server_config))}')
|
572
|
+
|
573
|
+
db_url = server_config.get_nested(('db',), None)
|
574
|
+
if db_url and len(server_config.keys()) > 1:
|
575
|
+
raise ValueError(
|
576
|
+
'if db config is specified, no other config is allowed')
|
577
|
+
|
578
|
+
if db_url:
|
579
|
+
if _SQLALCHEMY_ENGINE is None:
|
580
|
+
_SQLALCHEMY_ENGINE = sqlalchemy.create_engine(db_url)
|
581
|
+
create_table()
|
582
|
+
db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
|
583
|
+
if db_config:
|
584
|
+
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
585
|
+
logger.debug(f'Config loaded from db:\n'
|
586
|
+
f'{common_utils.dump_yaml_str(dict(db_config))}')
|
587
|
+
server_config = overlay_skypilot_config(server_config, db_config)
|
588
|
+
|
589
|
+
_set_loaded_config(server_config)
|
530
590
|
_set_loaded_config_path(server_config_path)
|
531
591
|
|
532
592
|
|
@@ -778,13 +838,27 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
|
|
778
838
|
if global_config_path is None:
|
779
839
|
global_config_path = get_user_config_path()
|
780
840
|
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
788
|
-
|
841
|
+
db_updated = False
|
842
|
+
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
843
|
+
existing_db_url = get_nested(('db',), None)
|
844
|
+
if existing_db_url:
|
845
|
+
new_db_url = config.get_nested(('db',), None)
|
846
|
+
if new_db_url and new_db_url != existing_db_url:
|
847
|
+
raise ValueError('Cannot change db url while server is running')
|
848
|
+
logger.debug('saving api_server config to db')
|
849
|
+
_set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
|
850
|
+
db_updated = True
|
851
|
+
|
852
|
+
if not db_updated:
|
853
|
+
# save to the local file (PVC in Kubernetes, local file otherwise)
|
854
|
+
common_utils.dump_yaml(global_config_path, dict(config))
|
855
|
+
|
856
|
+
if config_map_utils.is_running_in_kubernetes():
|
857
|
+
# In Kubernetes, sync the PVC config to ConfigMap for user
|
858
|
+
# convenience.
|
859
|
+
# PVC file is the source of truth, ConfigMap is just a mirror for
|
860
|
+
# easy access.
|
861
|
+
config_map_utils.patch_configmap_with_config(
|
862
|
+
config, global_config_path)
|
789
863
|
|
790
864
|
_reload_config()
|
sky/users/permission.py
CHANGED
@@ -36,7 +36,7 @@ class PermissionService:
|
|
36
36
|
with _lock:
|
37
37
|
if _enforcer_instance is None:
|
38
38
|
_enforcer_instance = self
|
39
|
-
engine = global_user_state.
|
39
|
+
engine = global_user_state.initialize_and_get_db()
|
40
40
|
adapter = sqlalchemy_adapter.Adapter(engine)
|
41
41
|
model_path = os.path.join(os.path.dirname(__file__),
|
42
42
|
'model.conf')
|
sky/utils/admin_policy_utils.py
CHANGED
@@ -55,6 +55,7 @@ def _get_policy_cls(
|
|
55
55
|
def apply_and_use_config_in_current_request(
|
56
56
|
entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
|
57
57
|
request_options: Optional[admin_policy.RequestOptions] = None,
|
58
|
+
at_client_side: bool = False,
|
58
59
|
) -> Iterator['dag_lib.Dag']:
|
59
60
|
"""Applies an admin policy and override SkyPilot config for current request
|
60
61
|
|
@@ -66,7 +67,7 @@ def apply_and_use_config_in_current_request(
|
|
66
67
|
Refer to `apply()` for more details.
|
67
68
|
"""
|
68
69
|
original_config = skypilot_config.to_dict()
|
69
|
-
dag, mutated_config = apply(entrypoint, request_options)
|
70
|
+
dag, mutated_config = apply(entrypoint, request_options, at_client_side)
|
70
71
|
if mutated_config != original_config:
|
71
72
|
with skypilot_config.replace_skypilot_config(mutated_config):
|
72
73
|
yield dag
|
@@ -77,6 +78,7 @@ def apply_and_use_config_in_current_request(
|
|
77
78
|
def apply(
|
78
79
|
entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
|
79
80
|
request_options: Optional[admin_policy.RequestOptions] = None,
|
81
|
+
at_client_side: bool = False,
|
80
82
|
) -> Tuple['dag_lib.Dag', config_utils.Config]:
|
81
83
|
"""Applies an admin policy (if registered) to a DAG or a task.
|
82
84
|
|
@@ -105,14 +107,18 @@ def apply(
|
|
105
107
|
if policy_cls is None:
|
106
108
|
return dag, skypilot_config.to_dict()
|
107
109
|
|
108
|
-
|
110
|
+
if at_client_side:
|
111
|
+
logger.info(f'Applying client admin policy: {policy}')
|
112
|
+
else:
|
113
|
+
logger.info(f'Applying server admin policy: {policy}')
|
109
114
|
config = copy.deepcopy(skypilot_config.to_dict())
|
110
115
|
mutated_dag = dag_lib.Dag()
|
111
116
|
mutated_dag.name = dag.name
|
112
117
|
|
113
118
|
mutated_config = None
|
114
119
|
for task in dag.tasks:
|
115
|
-
user_request = admin_policy.UserRequest(task, config, request_options
|
120
|
+
user_request = admin_policy.UserRequest(task, config, request_options,
|
121
|
+
at_client_side)
|
116
122
|
try:
|
117
123
|
mutated_user_request = policy_cls.validate_and_mutate(user_request)
|
118
124
|
except Exception as e: # pylint: disable=broad-except
|
sky/utils/context.py
CHANGED
@@ -4,11 +4,13 @@ import asyncio
|
|
4
4
|
from collections.abc import Mapping
|
5
5
|
from collections.abc import MutableMapping
|
6
6
|
import contextvars
|
7
|
+
import functools
|
7
8
|
import os
|
8
9
|
import pathlib
|
9
10
|
import subprocess
|
10
11
|
import sys
|
11
|
-
|
12
|
+
import typing
|
13
|
+
from typing import Any, Callable, Dict, Optional, TextIO, TypeVar
|
12
14
|
|
13
15
|
|
14
16
|
class Context(object):
|
@@ -256,6 +258,24 @@ class Popen(subprocess.Popen):
|
|
256
258
|
super().__init__(*args, env=env, **kwargs)
|
257
259
|
|
258
260
|
|
261
|
+
F = TypeVar('F', bound=Callable[..., Any])
|
262
|
+
|
263
|
+
|
264
|
+
def contextual(func: F) -> F:
|
265
|
+
"""Decorator to initialize a context before executing the function.
|
266
|
+
|
267
|
+
If a context is already initialized, this decorator will reset the context,
|
268
|
+
i.e. all contextual variables set previously will be cleared.
|
269
|
+
"""
|
270
|
+
|
271
|
+
@functools.wraps(func)
|
272
|
+
def wrapper(*args, **kwargs):
|
273
|
+
initialize()
|
274
|
+
return func(*args, **kwargs)
|
275
|
+
|
276
|
+
return typing.cast(F, wrapper)
|
277
|
+
|
278
|
+
|
259
279
|
def initialize():
|
260
280
|
"""Initialize the current SkyPilot context."""
|
261
281
|
_CONTEXT.set(Context())
|