skypilot-nightly 1.0.0.dev20250609__py3-none-any.whl → 1.0.0.dev20250610__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +3 -0
  3. sky/authentication.py +1 -7
  4. sky/backends/cloud_vm_ray_backend.py +9 -20
  5. sky/cli.py +2 -4
  6. sky/client/cli.py +2 -4
  7. sky/client/sdk.py +49 -4
  8. sky/clouds/kubernetes.py +15 -24
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/4lwUJxN6KwBqUxqO1VccB/_buildManifest.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/37-d8aebf1683522a0b.js +6 -0
  14. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/{121-865d2bf8a3b84c6a.js → 491.b3d264269613fe09.js} +3 -3
  18. sky/dashboard/out/_next/static/chunks/513.211357a2914a34b2.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/600.9cc76ec442b22e10.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/616-d6128fa9e7cae6e6.js +39 -0
  21. sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +6 -0
  23. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +8 -0
  25. sky/dashboard/out/_next/static/chunks/804-4c9fc53aa74bc191.js +21 -0
  26. sky/dashboard/out/_next/static/chunks/843-6fcc4bf91ac45b39.js +11 -0
  27. sky/dashboard/out/_next/static/chunks/856-0776dc6ed6000c39.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/938-a75b7712639298b7.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/947-6620842ef80ae879.js +35 -0
  31. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/973-c807fc34f09c7df3.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/_app-4768de0aede04dc9.js +20 -0
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +6 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-451a14e7e755ebbc.js +6 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-e56b17fd85d0ba58.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +16 -0
  41. sky/dashboard/out/_next/static/chunks/pages/jobs-fe233baf3d073491.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c8c2191328532b7d.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/webpack-0574a5a4ba3cf0ac.js +1 -0
  47. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  48. sky/dashboard/out/clusters/[cluster].html +1 -1
  49. sky/dashboard/out/clusters.html +1 -1
  50. sky/dashboard/out/config.html +1 -1
  51. sky/dashboard/out/index.html +1 -1
  52. sky/dashboard/out/infra/[context].html +1 -1
  53. sky/dashboard/out/infra.html +1 -1
  54. sky/dashboard/out/jobs/[job].html +1 -1
  55. sky/dashboard/out/jobs.html +1 -1
  56. sky/dashboard/out/users.html +1 -1
  57. sky/dashboard/out/workspace/new.html +1 -1
  58. sky/dashboard/out/workspaces/[name].html +1 -1
  59. sky/dashboard/out/workspaces.html +1 -1
  60. sky/exceptions.py +18 -0
  61. sky/global_user_state.py +181 -74
  62. sky/jobs/client/sdk.py +29 -21
  63. sky/provision/kubernetes/constants.py +9 -0
  64. sky/provision/kubernetes/utils.py +106 -7
  65. sky/serve/client/sdk.py +56 -45
  66. sky/server/common.py +1 -5
  67. sky/server/requests/executor.py +50 -20
  68. sky/server/requests/payloads.py +3 -0
  69. sky/server/requests/process.py +69 -29
  70. sky/server/server.py +1 -0
  71. sky/server/stream_utils.py +111 -55
  72. sky/skylet/constants.py +1 -2
  73. sky/skypilot_config.py +99 -25
  74. sky/users/permission.py +1 -1
  75. sky/utils/admin_policy_utils.py +9 -3
  76. sky/utils/context.py +21 -1
  77. sky/utils/controller_utils.py +16 -1
  78. sky/utils/kubernetes/exec_kubeconfig_converter.py +19 -47
  79. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/METADATA +1 -1
  80. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/RECORD +85 -74
  81. sky/dashboard/out/_next/static/chunks/236-619ed0248fb6fdd9.js +0 -6
  82. sky/dashboard/out/_next/static/chunks/293-351268365226d251.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/37-600191c5804dcae2.js +0 -6
  84. sky/dashboard/out/_next/static/chunks/470-680c19413b8f808b.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/63-e2d7b1e75e67c713.js +0 -66
  86. sky/dashboard/out/_next/static/chunks/682-b60cfdacc15202e8.js +0 -6
  87. sky/dashboard/out/_next/static/chunks/843-16c7194621b2b512.js +0 -11
  88. sky/dashboard/out/_next/static/chunks/856-affc52adf5403a3a.js +0 -1
  89. sky/dashboard/out/_next/static/chunks/969-2c584e28e6b4b106.js +0 -1
  90. sky/dashboard/out/_next/static/chunks/973-aed916d5b02d2d63.js +0 -1
  91. sky/dashboard/out/_next/static/chunks/pages/_app-5f16aba5794ee8e7.js +0 -1
  92. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-d31688d3e52736dd.js +0 -6
  93. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e7d8710a9b0491e5.js +0 -6
  94. sky/dashboard/out/_next/static/chunks/pages/clusters-3c674e5d970e05cb.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/pages/config-3aac7a015c6eede1.js +0 -6
  96. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-46d2e4ad6c487260.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/pages/infra-7013d816a2a0e76c.js +0 -1
  98. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-f7f0c9e156d328bc.js +0 -16
  99. sky/dashboard/out/_next/static/chunks/pages/jobs-87e60396c376292f.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/pages/users-9355a0f13d1db61d.js +0 -16
  101. sky/dashboard/out/_next/static/chunks/pages/workspace/new-9a749cca1813bd27.js +0 -1
  102. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-8eeb628e03902f1b.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/pages/workspaces-8fbcc5ab4af316d0.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/webpack-65d465f948974c0d.js +0 -1
  105. sky/dashboard/out/_next/static/xos0euNCptbGAM7_Q3Acl/_buildManifest.js +0 -1
  106. /sky/dashboard/out/_next/static/{xos0euNCptbGAM7_Q3Acl → 4lwUJxN6KwBqUxqO1VccB}/_ssgManifest.js +0 -0
  107. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/WHEEL +0 -0
  108. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/entry_points.txt +0 -0
  109. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/licenses/LICENSE +0 -0
  110. {skypilot_nightly-1.0.0.dev20250609.dist-info → skypilot_nightly-1.0.0.dev20250610.dist-info}/top_level.txt +0 -0
@@ -6,6 +6,7 @@ import threading
6
6
  import time
7
7
  from typing import Callable, Dict, Optional, Tuple
8
8
 
9
+ from sky import exceptions
9
10
  from sky.utils import atomic
10
11
  from sky.utils import subprocess_utils
11
12
 
@@ -67,14 +68,24 @@ class PoolExecutor(concurrent.futures.ProcessPoolExecutor):
67
68
 
68
69
 
69
70
  # Define the worker function outside of the class to avoid pickling self
70
- def _disposable_worker(fn, initializer: Optional[Callable], initargs: Tuple,
71
- args, kwargs):
71
+ def _disposable_worker(fn, initializer, initargs, result_queue, args, kwargs):
72
+ """The worker function that is used to run the task.
73
+
74
+ Args:
75
+ fn: The function to run.
76
+ initializer: The initializer function to run before running the task.
77
+ initargs: The arguments to pass to the initializer function.
78
+ result_queue: The queue to put the result and exception into.
79
+ args: The arguments to pass to the function.
80
+ kwargs: The keyword arguments to pass to the function.
81
+ """
72
82
  try:
73
83
  if initializer is not None:
74
84
  initializer(*initargs)
75
- fn(*args, **kwargs)
85
+ result = fn(*args, **kwargs)
86
+ result_queue.put(result)
76
87
  except BaseException as e: # pylint: disable=broad-except
77
- return e
88
+ result_queue.put(e)
78
89
 
79
90
 
80
91
  class DisposableExecutor:
@@ -98,28 +109,52 @@ class DisposableExecutor:
98
109
  self._initializer: Optional[Callable] = initializer
99
110
  self._initargs: Tuple = initargs
100
111
 
101
- def _monitor_worker(self, process: multiprocessing.Process) -> None:
112
+ def _monitor_worker(self, process: multiprocessing.Process,
113
+ future: concurrent.futures.Future,
114
+ result_queue: multiprocessing.Queue) -> None:
102
115
  """Monitor the worker process and cleanup when it's done."""
103
- process.join()
104
- if process.pid:
105
- with self._lock:
106
- if process.pid in self.workers:
107
- del self.workers[process.pid]
108
-
109
- # Submit is not compatible with ProcessPoolExecutor because we does not
110
- # bother to return a Future. Can be improved if needed.
111
- def submit(self, fn, *args, **kwargs) -> bool:
112
- """Submit a task for execution."""
116
+ try:
117
+ process.join()
118
+ if not future.cancelled():
119
+ try:
120
+ # Get result from the queue if process completed
121
+ if not result_queue.empty():
122
+ result = result_queue.get(block=False)
123
+ if isinstance(result, BaseException):
124
+ future.set_exception(result)
125
+ else:
126
+ future.set_result(result)
127
+ else:
128
+ # Process ended but no result
129
+ future.set_result(None)
130
+ except (multiprocessing.TimeoutError, BrokenPipeError,
131
+ EOFError) as e:
132
+ future.set_exception(e)
133
+ finally:
134
+ if process.pid:
135
+ with self._lock:
136
+ if process.pid in self.workers:
137
+ del self.workers[process.pid]
138
+
139
+ def submit(self, fn, *args, **kwargs) -> concurrent.futures.Future:
140
+ """Submit a task for execution and return a Future."""
141
+ future: concurrent.futures.Future = concurrent.futures.Future()
142
+
113
143
  if self._shutdown:
114
- return False
144
+ raise RuntimeError('Cannot submit task after executor is shutdown')
145
+
115
146
  with self._lock:
116
147
  if (self.max_workers is not None and
117
148
  len(self.workers) >= self.max_workers):
118
- return False
149
+ raise exceptions.ExecutionPoolFullError(
150
+ 'Maximum workers reached')
119
151
 
152
+ result_queue: multiprocessing.Queue = multiprocessing.Queue()
120
153
  process = multiprocessing.Process(target=_disposable_worker,
121
154
  args=(fn, self._initializer,
122
- self._initargs, args, kwargs))
155
+ self._initargs, result_queue,
156
+ args, kwargs))
157
+ process.daemon = True
123
158
  process.start()
124
159
 
125
160
  with self._lock:
@@ -128,13 +163,13 @@ class DisposableExecutor:
128
163
  raise RuntimeError('Failed to start process')
129
164
  self.workers[pid] = process
130
165
 
131
- # Start monitor thread to cleanup the worker process when it's done.
166
+ # Start monitor thread to cleanup the worker process when it's done
132
167
  monitor_thread = threading.Thread(target=self._monitor_worker,
133
- args=(process,),
168
+ args=(process, future, result_queue),
134
169
  daemon=True)
135
170
  monitor_thread.start()
136
171
 
137
- return True
172
+ return future
138
173
 
139
174
  def has_idle_workers(self) -> bool:
140
175
  """Check if there are any idle workers."""
@@ -173,12 +208,14 @@ class BurstableExecutor:
173
208
  self._burst_executor = DisposableExecutor(max_workers=burst_workers,
174
209
  **kwargs)
175
210
 
176
- def submit_until_success(self, fn, *args, **kwargs):
211
+ def submit_until_success(self, fn, *args,
212
+ **kwargs) -> concurrent.futures.Future:
177
213
  """Submit a task for execution until success.
178
214
 
179
215
  Prioritizes submitting to the guaranteed pool. If no idle workers
180
216
  are available in the guaranteed pool, it will submit to the burst
181
- pool.
217
+ pool. If the burst pool is full, it will retry the whole process until
218
+ the task is submitted successfully.
182
219
  TODO(aylei): this is coupled with executor.RequestWorker since we
183
220
  know the worker is dedicated to request scheduling and it either
184
221
  blocks on request polling or request submitting. So it is no harm
@@ -188,17 +225,20 @@ class BurstableExecutor:
188
225
 
189
226
  while True:
190
227
  if self._executor is not None and self._executor.has_idle_workers():
191
- self._executor.submit(fn, *args, **kwargs)
192
- break
228
+ logger.info('Submitting to the guaranteed pool')
229
+ return self._executor.submit(fn, *args, **kwargs)
193
230
  if (self._burst_executor is not None and
194
231
  self._burst_executor.has_idle_workers()):
195
- self._burst_executor.submit(fn, *args, **kwargs)
196
- break
232
+ try:
233
+ fut = self._burst_executor.submit(fn, *args, **kwargs)
234
+ return fut
235
+ except exceptions.ExecutionPoolFullError:
236
+ # The burst pool is full, try the next candidate.
237
+ pass
197
238
  if self._executor is not None:
198
239
  # No idle workers in either pool, still queue the request
199
240
  # to the guaranteed pool to keep behavior consistent.
200
- self._executor.submit(fn, *args, **kwargs)
201
- break
241
+ return self._executor.submit(fn, *args, **kwargs)
202
242
  logger.debug('No guaranteed pool set and the burst pool is full, '
203
243
  'retry later.')
204
244
  time.sleep(0.1)
sky/server/server.py CHANGED
@@ -693,6 +693,7 @@ async def launch(launch_body: payloads.LaunchBody,
693
693
  func=execution.launch,
694
694
  schedule_type=requests_lib.ScheduleType.LONG,
695
695
  request_cluster_name=launch_body.cluster_name,
696
+ retryable=launch_body.retry_until_up,
696
697
  )
697
698
 
698
699
 
@@ -3,7 +3,7 @@
3
3
  import asyncio
4
4
  import collections
5
5
  import pathlib
6
- from typing import AsyncGenerator, Deque, Optional
6
+ from typing import AsyncGenerator, Deque, List, Optional
7
7
 
8
8
  import aiofiles
9
9
  import fastapi
@@ -15,6 +15,12 @@ from sky.utils import rich_utils
15
15
 
16
16
  logger = sky_logging.init_logger(__name__)
17
17
 
18
+ # When streaming log lines, buffer the lines in memory and flush them in chunks
19
+ # to improve log tailing throughput. Buffer size is the max size bytes of each
20
+ # chunk and the timeout threshold for flushing the buffer to ensure
21
+ # responsiveness.
22
+ _BUFFER_SIZE = 8 * 1024 # 8KB
23
+ _BUFFER_TIMEOUT = 0.02 # 20ms
18
24
  _HEARTBEAT_INTERVAL = 30
19
25
 
20
26
 
@@ -36,7 +42,16 @@ async def log_streamer(request_id: Optional[str],
36
42
  plain_logs: bool = False,
37
43
  tail: Optional[int] = None,
38
44
  follow: bool = True) -> AsyncGenerator[str, None]:
39
- """Streams the logs of a request."""
45
+ """Streams the logs of a request.
46
+
47
+ Args:
48
+ request_id: The request ID to check whether the log tailing process
49
+ should be stopped.
50
+ log_path: The path to the log file.
51
+ plain_logs: Whether to show plain logs.
52
+ tail: The number of lines to tail. If None, tail the whole file.
53
+ follow: Whether to follow the log file.
54
+ """
40
55
 
41
56
  if request_id is not None:
42
57
  status_msg = rich_utils.EncodedStatusMessage(
@@ -80,65 +95,106 @@ async def log_streamer(request_id: Optional[str],
80
95
  if show_request_waiting_spinner:
81
96
  yield status_msg.stop()
82
97
 
83
- # Find last n lines of the log file. Do not read the whole file into memory.
84
98
  async with aiofiles.open(log_path, 'rb') as f:
85
- if tail is not None:
86
- # TODO(zhwu): this will include the control lines for rich status,
87
- # which may not lead to exact tail lines when showing on the client
88
- # side.
89
- lines: Deque[str] = collections.deque(maxlen=tail)
90
- async for line_str in _yield_log_file_with_payloads_skipped(f):
91
- lines.append(line_str)
92
- for line_str in lines:
93
- yield line_str
94
-
95
- last_heartbeat_time = asyncio.get_event_loop().time()
99
+ async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
100
+ follow):
101
+ yield chunk
102
+
103
+
104
+ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
105
+ request_id: Optional[str] = None,
106
+ plain_logs: bool = False,
107
+ tail: Optional[int] = None,
108
+ follow: bool = True) -> AsyncGenerator[str, None]:
109
+ """Tail the opened log file, buffer the lines and flush in chunks."""
110
+
111
+ if tail is not None:
112
+ # Find last n lines of the log file. Do not read the whole file into
113
+ # memory.
114
+ # TODO(zhwu): this will include the control lines for rich status,
115
+ # which may not lead to exact tail lines when showing on the client
116
+ # side.
117
+ lines: Deque[str] = collections.deque(maxlen=tail)
118
+ async for line_str in _yield_log_file_with_payloads_skipped(f):
119
+ lines.append(line_str)
120
+ for line_str in lines:
121
+ yield line_str
96
122
 
97
- while True:
98
- # Sleep 0 to yield control to allow other coroutines to run,
99
- # while keeps the loop tight to make log stream responsive.
100
- await asyncio.sleep(0)
101
- line: Optional[bytes] = await f.readline()
102
- if not line:
103
- if request_id is not None:
104
- request_task = requests_lib.get_request(request_id)
105
- if request_task.status > requests_lib.RequestStatus.RUNNING:
106
- if (request_task.status ==
107
- requests_lib.RequestStatus.CANCELLED):
108
- yield (f'{request_task.name!r} request {request_id}'
109
- ' cancelled\n')
110
- break
111
- if not follow:
123
+ last_heartbeat_time = asyncio.get_event_loop().time()
124
+
125
+ # Buffer the lines in memory and flush them in chunks to improve log
126
+ # tailing throughput.
127
+ buffer: List[str] = []
128
+ buffer_bytes = 0
129
+ last_flush_time = asyncio.get_event_loop().time()
130
+
131
+ async def flush_buffer() -> AsyncGenerator[str, None]:
132
+ nonlocal buffer, buffer_bytes, last_flush_time
133
+ if buffer:
134
+ yield ''.join(buffer)
135
+ buffer.clear()
136
+ buffer_bytes = 0
137
+ last_flush_time = asyncio.get_event_loop().time()
138
+
139
+ while True:
140
+ # Sleep 0 to yield control to allow other coroutines to run,
141
+ # while keeps the loop tight to make log stream responsive.
142
+ await asyncio.sleep(0)
143
+ current_time = asyncio.get_event_loop().time()
144
+ # Flush the buffer when it is not empty and the buffer is full or the
145
+ # flush timeout is reached.
146
+ if buffer and (buffer_bytes >= _BUFFER_SIZE or
147
+ (current_time - last_flush_time) >= _BUFFER_TIMEOUT):
148
+ async for chunk in flush_buffer():
149
+ yield chunk
150
+
151
+ line: Optional[bytes] = await f.readline()
152
+ if not line:
153
+ if request_id is not None:
154
+ request_task = requests_lib.get_request(request_id)
155
+ if request_task.status > requests_lib.RequestStatus.RUNNING:
156
+ if (request_task.status ==
157
+ requests_lib.RequestStatus.CANCELLED):
158
+ buffer.append(
159
+ f'{request_task.name!r} request {request_id}'
160
+ ' cancelled\n')
112
161
  break
162
+ if not follow:
163
+ break
164
+
165
+ if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
166
+ # Currently just used to keep the connection busy, refer to
167
+ # https://github.com/skypilot-org/skypilot/issues/5750 for
168
+ # more details.
169
+ buffer.append(
170
+ message_utils.encode_payload(
171
+ rich_utils.Control.HEARTBEAT.encode('')))
172
+ last_heartbeat_time = current_time
173
+
174
+ # Sleep shortly to avoid storming the DB and CPU, this has
175
+ # little impact on the responsivness here since we are waiting
176
+ # for a new line to come in.
177
+ await asyncio.sleep(0.1)
178
+ continue
113
179
 
114
- current_time = asyncio.get_event_loop().time()
115
- if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
116
- # Currently just used to keep the connection busy, refer to
117
- # https://github.com/skypilot-org/skypilot/issues/5750 for
118
- # more details.
119
- yield message_utils.encode_payload(
120
- rich_utils.Control.HEARTBEAT.encode(''))
121
- last_heartbeat_time = current_time
122
-
123
- # Sleep shortly to avoid storming the DB and CPU, this has
124
- # little impact on the responsivness here since we are waiting
125
- # for a new line to come in.
126
- await asyncio.sleep(0.1)
180
+ # Refresh the heartbeat time, this is a trivial optimization for
181
+ # performance but it helps avoid unnecessary heartbeat strings
182
+ # being printed when the client runs in an old version.
183
+ last_heartbeat_time = asyncio.get_event_loop().time()
184
+ line_str = line.decode('utf-8')
185
+ if plain_logs:
186
+ is_payload, line_str = message_utils.decode_payload(
187
+ line_str, raise_for_mismatch=False)
188
+ # TODO(aylei): implement heartbeat mechanism for plain logs,
189
+ # sending invisible characters might be okay.
190
+ if is_payload:
127
191
  continue
192
+ buffer.append(line_str)
193
+ buffer_bytes += len(line_str.encode('utf-8'))
128
194
 
129
- # Refresh the heartbeat time, this is a trivial optimization for
130
- # performance but it helps avoid unnecessary heartbeat strings
131
- # being printed when the client runs in an old version.
132
- last_heartbeat_time = asyncio.get_event_loop().time()
133
- line_str = line.decode('utf-8')
134
- if plain_logs:
135
- is_payload, line_str = message_utils.decode_payload(
136
- line_str, raise_for_mismatch=False)
137
- # TODO(aylei): implement heartbeat mechanism for plain logs,
138
- # sending invisible characters might be okay.
139
- if is_payload:
140
- continue
141
- yield line_str
195
+ # Flush remaining lines in the buffer.
196
+ async for chunk in flush_buffer():
197
+ yield chunk
142
198
 
143
199
 
144
200
  def stream_response(
sky/skylet/constants.py CHANGED
@@ -377,8 +377,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
377
377
  ]
378
378
  # When overriding the SkyPilot configs on the API server with the client one,
379
379
  # we skip the following keys because they are meant to be client-side configs.
380
- SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('admin_policy',),
381
- ('api_server',),
380
+ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('api_server',),
382
381
  ('allowed_clouds',),
383
382
  ('workspaces',), ('db',)]
384
383
 
sky/skypilot_config.py CHANGED
@@ -58,6 +58,11 @@ import typing
58
58
  from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
59
59
 
60
60
  import filelock
61
+ import sqlalchemy
62
+ from sqlalchemy import orm
63
+ from sqlalchemy.dialects import postgresql
64
+ from sqlalchemy.dialects import sqlite
65
+ from sqlalchemy.ext import declarative
61
66
 
62
67
  from sky import exceptions
63
68
  from sky import sky_logging
@@ -66,6 +71,7 @@ from sky.skylet import constants
66
71
  from sky.utils import common_utils
67
72
  from sky.utils import config_utils
68
73
  from sky.utils import context
74
+ from sky.utils import db_utils
69
75
  from sky.utils import schemas
70
76
  from sky.utils import ux_utils
71
77
  from sky.utils.kubernetes import config_map_utils
@@ -110,6 +116,56 @@ ENV_VAR_PROJECT_CONFIG = f'{constants.SKYPILOT_ENV_VAR_PREFIX}PROJECT_CONFIG'
110
116
  _GLOBAL_CONFIG_PATH = '~/.sky/config.yaml'
111
117
  _PROJECT_CONFIG_PATH = '.sky.yaml'
112
118
 
119
+ _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
120
+ API_SERVER_CONFIG_KEY = 'api_server_config'
121
+
122
+ Base = declarative.declarative_base()
123
+
124
+ config_yaml_table = sqlalchemy.Table(
125
+ 'config_yaml',
126
+ Base.metadata,
127
+ sqlalchemy.Column('key', sqlalchemy.Text, primary_key=True),
128
+ sqlalchemy.Column('value', sqlalchemy.Text),
129
+ )
130
+
131
+
132
+ def create_table():
133
+ # Create tables if they don't exist
134
+ Base.metadata.create_all(bind=_SQLALCHEMY_ENGINE)
135
+
136
+
137
+ def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
138
+ assert _SQLALCHEMY_ENGINE is not None
139
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
140
+ row = session.query(config_yaml_table).filter_by(key=key).first()
141
+ if row:
142
+ db_config = config_utils.Config(yaml.safe_load(row.value))
143
+ db_config.pop_nested(('db',), None)
144
+ return db_config
145
+ return None
146
+
147
+
148
+ def _set_config_yaml_to_db(key: str, config: config_utils.Config):
149
+ assert _SQLALCHEMY_ENGINE is not None
150
+ config.pop_nested(('db',), None)
151
+ config_str = common_utils.dump_yaml_str(dict(config))
152
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
153
+ if (_SQLALCHEMY_ENGINE.dialect.name ==
154
+ db_utils.SQLAlchemyDialect.SQLITE.value):
155
+ insert_func = sqlite.insert
156
+ elif (_SQLALCHEMY_ENGINE.dialect.name ==
157
+ db_utils.SQLAlchemyDialect.POSTGRESQL.value):
158
+ insert_func = postgresql.insert
159
+ else:
160
+ raise ValueError('Unsupported database dialect')
161
+ insert_stmnt = insert_func(config_yaml_table).values(key=key,
162
+ value=config_str)
163
+ do_update_stmt = insert_stmnt.on_conflict_do_update(
164
+ index_elements=[config_yaml_table.c.key],
165
+ set_={config_yaml_table.c.value: config_str})
166
+ session.execute(do_update_stmt)
167
+ session.commit()
168
+
113
169
 
114
170
  class ConfigContext:
115
171
 
@@ -257,11 +313,6 @@ def _resolve_project_config_path() -> Optional[str]:
257
313
  return None
258
314
 
259
315
 
260
- def _get_project_config() -> config_utils.Config:
261
- """Returns the project config."""
262
- return _get_config_from_path(_resolve_project_config_path())
263
-
264
-
265
316
  def _resolve_server_config_path() -> Optional[str]:
266
317
  # find the server config file
267
318
  server_config_path = _get_config_file_path(ENV_VAR_GLOBAL_CONFIG)
@@ -507,26 +558,35 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
507
558
 
508
559
 
509
560
  def _reload_config_as_server() -> None:
561
+ global _SQLALCHEMY_ENGINE
510
562
  # Reset the global variables, to avoid using stale values.
511
563
  _set_loaded_config(config_utils.Config())
512
564
  _set_loaded_config_path(None)
513
565
 
514
- overrides: List[config_utils.Config] = []
515
566
  server_config_path = _resolve_server_config_path()
516
567
  server_config = _get_config_from_path(server_config_path)
517
- if server_config:
518
- overrides.append(server_config)
519
568
 
520
- # layer the configs on top of each other based on priority
521
- overlaid_server_config: config_utils.Config = config_utils.Config()
522
- for override in overrides:
523
- overlaid_server_config = overlay_skypilot_config(
524
- original_config=overlaid_server_config, override_configs=override)
525
569
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
526
- logger.debug(
527
- f'server config: \n'
528
- f'{common_utils.dump_yaml_str(dict(overlaid_server_config))}')
529
- _set_loaded_config(overlaid_server_config)
570
+ logger.debug(f'server config: \n'
571
+ f'{common_utils.dump_yaml_str(dict(server_config))}')
572
+
573
+ db_url = server_config.get_nested(('db',), None)
574
+ if db_url and len(server_config.keys()) > 1:
575
+ raise ValueError(
576
+ 'if db config is specified, no other config is allowed')
577
+
578
+ if db_url:
579
+ if _SQLALCHEMY_ENGINE is None:
580
+ _SQLALCHEMY_ENGINE = sqlalchemy.create_engine(db_url)
581
+ create_table()
582
+ db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
583
+ if db_config:
584
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
585
+ logger.debug(f'Config loaded from db:\n'
586
+ f'{common_utils.dump_yaml_str(dict(db_config))}')
587
+ server_config = overlay_skypilot_config(server_config, db_config)
588
+
589
+ _set_loaded_config(server_config)
530
590
  _set_loaded_config_path(server_config_path)
531
591
 
532
592
 
@@ -778,13 +838,27 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
778
838
  if global_config_path is None:
779
839
  global_config_path = get_user_config_path()
780
840
 
781
- # Always save to the local file (PVC in Kubernetes, local file otherwise)
782
- common_utils.dump_yaml(global_config_path, dict(config))
783
-
784
- if config_map_utils.is_running_in_kubernetes():
785
- # In Kubernetes, sync the PVC config to ConfigMap for user convenience
786
- # PVC file is the source of truth, ConfigMap is just a mirror for easy
787
- # access
788
- config_map_utils.patch_configmap_with_config(config, global_config_path)
841
+ db_updated = False
842
+ if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
843
+ existing_db_url = get_nested(('db',), None)
844
+ if existing_db_url:
845
+ new_db_url = config.get_nested(('db',), None)
846
+ if new_db_url and new_db_url != existing_db_url:
847
+ raise ValueError('Cannot change db url while server is running')
848
+ logger.debug('saving api_server config to db')
849
+ _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
850
+ db_updated = True
851
+
852
+ if not db_updated:
853
+ # save to the local file (PVC in Kubernetes, local file otherwise)
854
+ common_utils.dump_yaml(global_config_path, dict(config))
855
+
856
+ if config_map_utils.is_running_in_kubernetes():
857
+ # In Kubernetes, sync the PVC config to ConfigMap for user
858
+ # convenience.
859
+ # PVC file is the source of truth, ConfigMap is just a mirror for
860
+ # easy access.
861
+ config_map_utils.patch_configmap_with_config(
862
+ config, global_config_path)
789
863
 
790
864
  _reload_config()
sky/users/permission.py CHANGED
@@ -36,7 +36,7 @@ class PermissionService:
36
36
  with _lock:
37
37
  if _enforcer_instance is None:
38
38
  _enforcer_instance = self
39
- engine = global_user_state.SQLALCHEMY_ENGINE
39
+ engine = global_user_state.initialize_and_get_db()
40
40
  adapter = sqlalchemy_adapter.Adapter(engine)
41
41
  model_path = os.path.join(os.path.dirname(__file__),
42
42
  'model.conf')
@@ -55,6 +55,7 @@ def _get_policy_cls(
55
55
  def apply_and_use_config_in_current_request(
56
56
  entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
57
57
  request_options: Optional[admin_policy.RequestOptions] = None,
58
+ at_client_side: bool = False,
58
59
  ) -> Iterator['dag_lib.Dag']:
59
60
  """Applies an admin policy and override SkyPilot config for current request
60
61
 
@@ -66,7 +67,7 @@ def apply_and_use_config_in_current_request(
66
67
  Refer to `apply()` for more details.
67
68
  """
68
69
  original_config = skypilot_config.to_dict()
69
- dag, mutated_config = apply(entrypoint, request_options)
70
+ dag, mutated_config = apply(entrypoint, request_options, at_client_side)
70
71
  if mutated_config != original_config:
71
72
  with skypilot_config.replace_skypilot_config(mutated_config):
72
73
  yield dag
@@ -77,6 +78,7 @@ def apply_and_use_config_in_current_request(
77
78
  def apply(
78
79
  entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
79
80
  request_options: Optional[admin_policy.RequestOptions] = None,
81
+ at_client_side: bool = False,
80
82
  ) -> Tuple['dag_lib.Dag', config_utils.Config]:
81
83
  """Applies an admin policy (if registered) to a DAG or a task.
82
84
 
@@ -105,14 +107,18 @@ def apply(
105
107
  if policy_cls is None:
106
108
  return dag, skypilot_config.to_dict()
107
109
 
108
- logger.info(f'Applying policy: {policy}')
110
+ if at_client_side:
111
+ logger.info(f'Applying client admin policy: {policy}')
112
+ else:
113
+ logger.info(f'Applying server admin policy: {policy}')
109
114
  config = copy.deepcopy(skypilot_config.to_dict())
110
115
  mutated_dag = dag_lib.Dag()
111
116
  mutated_dag.name = dag.name
112
117
 
113
118
  mutated_config = None
114
119
  for task in dag.tasks:
115
- user_request = admin_policy.UserRequest(task, config, request_options)
120
+ user_request = admin_policy.UserRequest(task, config, request_options,
121
+ at_client_side)
116
122
  try:
117
123
  mutated_user_request = policy_cls.validate_and_mutate(user_request)
118
124
  except Exception as e: # pylint: disable=broad-except
sky/utils/context.py CHANGED
@@ -4,11 +4,13 @@ import asyncio
4
4
  from collections.abc import Mapping
5
5
  from collections.abc import MutableMapping
6
6
  import contextvars
7
+ import functools
7
8
  import os
8
9
  import pathlib
9
10
  import subprocess
10
11
  import sys
11
- from typing import Dict, Optional, TextIO
12
+ import typing
13
+ from typing import Any, Callable, Dict, Optional, TextIO, TypeVar
12
14
 
13
15
 
14
16
  class Context(object):
@@ -256,6 +258,24 @@ class Popen(subprocess.Popen):
256
258
  super().__init__(*args, env=env, **kwargs)
257
259
 
258
260
 
261
+ F = TypeVar('F', bound=Callable[..., Any])
262
+
263
+
264
+ def contextual(func: F) -> F:
265
+ """Decorator to intiailize a context before executing the function.
266
+
267
+ If a context is already initialized, this decorator will reset the context,
268
+ i.e. all contextual variables set previously will be cleared.
269
+ """
270
+
271
+ @functools.wraps(func)
272
+ def wrapper(*args, **kwargs):
273
+ initialize()
274
+ return func(*args, **kwargs)
275
+
276
+ return typing.cast(F, wrapper)
277
+
278
+
259
279
  def initialize():
260
280
  """Initialize the current SkyPilot context."""
261
281
  _CONTEXT.set(Context())