skypilot-nightly 1.0.0.dev20251019__py3-none-any.whl → 1.0.0.dev20251022__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (95)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +64 -0
  3. sky/backends/backend_utils.py +11 -11
  4. sky/backends/cloud_vm_ray_backend.py +15 -4
  5. sky/client/cli/command.py +39 -10
  6. sky/client/cli/flags.py +4 -2
  7. sky/client/sdk.py +26 -3
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
  11. sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
  12. sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
  16. sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/config.html +1 -1
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/infra/[context].html +1 -1
  36. sky/dashboard/out/infra.html +1 -1
  37. sky/dashboard/out/jobs/[job].html +1 -1
  38. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  39. sky/dashboard/out/jobs.html +1 -1
  40. sky/dashboard/out/users.html +1 -1
  41. sky/dashboard/out/volumes.html +1 -1
  42. sky/dashboard/out/workspace/new.html +1 -1
  43. sky/dashboard/out/workspaces/[name].html +1 -1
  44. sky/dashboard/out/workspaces.html +1 -1
  45. sky/data/storage.py +2 -2
  46. sky/global_user_state.py +137 -37
  47. sky/jobs/constants.py +1 -1
  48. sky/jobs/server/core.py +4 -2
  49. sky/jobs/server/server.py +21 -12
  50. sky/jobs/state.py +307 -55
  51. sky/jobs/utils.py +248 -144
  52. sky/provision/kubernetes/network.py +9 -6
  53. sky/provision/provisioner.py +8 -0
  54. sky/schemas/api/responses.py +2 -0
  55. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  56. sky/serve/server/server.py +8 -7
  57. sky/server/common.py +10 -15
  58. sky/server/constants.py +1 -1
  59. sky/server/daemons.py +4 -2
  60. sky/server/requests/executor.py +30 -28
  61. sky/server/requests/payloads.py +5 -1
  62. sky/server/requests/preconditions.py +9 -4
  63. sky/server/requests/requests.py +130 -53
  64. sky/server/requests/serializers/encoders.py +3 -3
  65. sky/server/server.py +91 -58
  66. sky/server/stream_utils.py +127 -38
  67. sky/server/uvicorn.py +18 -17
  68. sky/setup_files/alembic.ini +4 -0
  69. sky/skylet/services.py +5 -5
  70. sky/skypilot_config.py +87 -75
  71. sky/ssh_node_pools/server.py +4 -4
  72. sky/users/permission.py +4 -0
  73. sky/utils/asyncio_utils.py +63 -3
  74. sky/utils/db/db_utils.py +11 -3
  75. sky/utils/db/migration_utils.py +7 -3
  76. sky/volumes/server/server.py +3 -3
  77. sky/workspaces/server.py +6 -6
  78. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +37 -37
  79. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +87 -86
  80. sky/dashboard/out/_next/static/8e35zdobdd0bK_Nkba03m/_buildManifest.js +0 -1
  81. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  82. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  83. sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  87. sky/dashboard/out/_next/static/chunks/webpack-3c431f6c9086e487.js +0 -1
  88. sky/dashboard/out/_next/static/{8e35zdobdd0bK_Nkba03m → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
  89. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
  90. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
  91. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
  92. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
  93. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
  94. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
  95. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
sky/server/stream_utils.py CHANGED
@@ -25,6 +25,17 @@ logger = sky_logging.init_logger(__name__)
25
25
  _BUFFER_SIZE = 8 * 1024 # 8KB
26
26
  _BUFFER_TIMEOUT = 0.02 # 20ms
27
27
  _HEARTBEAT_INTERVAL = 30
28
+ # If a SHORT request has been stuck in pending for
29
+ # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
30
+ _SHORT_REQUEST_SPINNER_TIMEOUT = 2
31
+ # If there is an issue during provisioning that causes the cluster to be stuck
32
+ # in INIT state, we use this timeout to break the loop and stop streaming
33
+ # provision logs.
34
+ _PROVISION_LOG_TIMEOUT = 3
35
+ # Maximum time to wait for new log files to appear when streaming worker node
36
+ # provision logs. Worker logs are created sequentially during the provisioning
37
+ # process, so we need to wait for new files to appear.
38
+ _MAX_WAIT_FOR_NEW_LOG_FILES = 3 # seconds
28
39
 
29
40
  LONG_REQUEST_POLL_INTERVAL = 1
30
41
  DEFAULT_POLL_INTERVAL = 0.1
@@ -45,7 +56,7 @@ async def _yield_log_file_with_payloads_skipped(
45
56
 
46
57
  async def log_streamer(
47
58
  request_id: Optional[str],
48
- log_path: pathlib.Path,
59
+ log_path: Optional[pathlib.Path] = None,
49
60
  plain_logs: bool = False,
50
61
  tail: Optional[int] = None,
51
62
  follow: bool = True,
@@ -57,7 +68,9 @@ async def log_streamer(
57
68
  Args:
58
69
  request_id: The request ID to check whether the log tailing process
59
70
  should be stopped.
60
- log_path: The path to the log file.
71
+ log_path: The path to the log file or directory containing the log
72
+ files. If it is a directory, all *.log files in the directory will be
73
+ streamed.
61
74
  plain_logs: Whether to show plain logs.
62
75
  tail: The number of lines to tail. If None, tail the whole file.
63
76
  follow: Whether to follow the log file.
@@ -66,17 +79,26 @@ async def log_streamer(
66
79
  """
67
80
 
68
81
  if request_id is not None:
82
+ start_time = asyncio.get_event_loop().time()
69
83
  status_msg = rich_utils.EncodedStatusMessage(
70
84
  f'[dim]Checking request: {request_id}[/dim]')
71
- request_task = await requests_lib.get_request_async(request_id)
85
+ request_task = await requests_lib.get_request_async(request_id,
86
+ fields=[
87
+ 'request_id',
88
+ 'name',
89
+ 'schedule_type',
90
+ 'status',
91
+ 'status_msg'
92
+ ])
72
93
 
73
94
  if request_task is None:
74
95
  raise fastapi.HTTPException(
75
96
  status_code=404, detail=f'Request {request_id} not found')
76
97
  request_id = request_task.request_id
77
98
 
78
- # Do not show the waiting spinner if the request is a fast, non-blocking
79
- # request.
99
+ # By default, do not show the waiting spinner for SHORT requests.
100
+ # If the request has been stuck in pending for
101
+ # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
80
102
  show_request_waiting_spinner = (not plain_logs and
81
103
  request_task.schedule_type
82
104
  == requests_lib.ScheduleType.LONG)
@@ -89,14 +111,23 @@ async def log_streamer(
89
111
  f'scheduled: {request_id}')
90
112
  req_status = request_task.status
91
113
  req_msg = request_task.status_msg
114
+ del request_task
92
115
  # Slowly back off the database polling up to every 1 second, to avoid
93
116
  # overloading the CPU and DB.
94
117
  backoff = common_utils.Backoff(initial_backoff=polling_interval,
95
118
  max_backoff_factor=10,
96
119
  multiplier=1.2)
97
120
  while req_status < requests_lib.RequestStatus.RUNNING:
121
+ current_time = asyncio.get_event_loop().time()
122
+ # Show the waiting spinner for a SHORT request if it has been stuck
123
+ # in pending for _SHORT_REQUEST_SPINNER_TIMEOUT seconds
124
+ if not show_request_waiting_spinner and (
125
+ current_time - start_time > _SHORT_REQUEST_SPINNER_TIMEOUT):
126
+ show_request_waiting_spinner = True
127
+ yield status_msg.init()
128
+ yield status_msg.start()
98
129
  if req_msg is not None:
99
- waiting_msg = request_task.status_msg
130
+ waiting_msg = req_msg
100
131
  if show_request_waiting_spinner:
101
132
  yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
102
133
  elif plain_logs and waiting_msg != last_waiting_msg:
@@ -119,11 +150,57 @@ async def log_streamer(
119
150
  if show_request_waiting_spinner:
120
151
  yield status_msg.stop()
121
152
 
122
- async with aiofiles.open(log_path, 'rb') as f:
123
- async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
124
- follow, cluster_name,
125
- polling_interval):
126
- yield chunk
153
+ if log_path is not None and log_path.is_dir():
154
+ # Track which log files we've already streamed
155
+ streamed_files = set()
156
+ no_new_files_count = 0
157
+
158
+ while True:
159
+ # Get all *.log files in the log_path
160
+ log_files = sorted(log_path.glob('*.log'))
161
+
162
+ # Filter out already streamed files
163
+ new_files = [f for f in log_files if f not in streamed_files]
164
+
165
+ if len(new_files) == 0:
166
+ if not follow:
167
+ break
168
+ # Wait a bit to see if new files appear
169
+ await asyncio.sleep(0.5)
170
+ no_new_files_count += 1
171
+ # Check if we've waited too long for new files
172
+ if no_new_files_count > _MAX_WAIT_FOR_NEW_LOG_FILES * 2:
173
+ break
174
+ continue
175
+
176
+ # Reset the no-new-files counter when we find new files
177
+ no_new_files_count = 0
178
+
179
+ for log_file_path in new_files:
180
+ # Add header before each file (similar to tail -f behavior)
181
+ header = f'\n==> {log_file_path} <==\n\n'
182
+ yield header
183
+
184
+ async with aiofiles.open(log_file_path, 'rb') as f:
185
+ async for chunk in _tail_log_file(f, request_id, plain_logs,
186
+ tail, follow,
187
+ cluster_name,
188
+ polling_interval):
189
+ yield chunk
190
+
191
+ # Mark this file as streamed
192
+ streamed_files.add(log_file_path)
193
+
194
+ # If not following, break after streaming all current files
195
+ if not follow:
196
+ break
197
+ else:
198
+ assert log_path is not None, (request_id, log_path)
199
+ async with aiofiles.open(log_path, 'rb') as f:
200
+ async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
201
+ follow, cluster_name,
202
+ polling_interval):
203
+ yield chunk
127
204
 
128
205
 
129
206
  async def _tail_log_file(
@@ -197,7 +274,7 @@ async def _tail_log_file(
197
274
  if (req_status.status ==
198
275
  requests_lib.RequestStatus.CANCELLED):
199
276
  request_task = await requests_lib.get_request_async(
200
- request_id)
277
+ request_id, fields=['name', 'should_retry'])
201
278
  if request_task.should_retry:
202
279
  buffer.append(
203
280
  message_utils.encode_payload(
@@ -206,6 +283,7 @@ async def _tail_log_file(
206
283
  buffer.append(
207
284
  f'{request_task.name!r} request {request_id}'
208
285
  ' cancelled\n')
286
+ del request_task
209
287
  break
210
288
  if not follow:
211
289
  # The below checks (cluster status, heartbeat) are not needed
@@ -213,21 +291,24 @@ async def _tail_log_file(
213
291
  break
214
292
  # Provision logs pass in cluster_name, check cluster status
215
293
  # periodically to see if provisioning is done.
216
- if cluster_name is not None and should_check_status:
217
- last_status_check_time = current_time
218
- cluster_status = await (
219
- global_user_state.get_status_from_cluster_name_async(
220
- cluster_name))
221
- if cluster_status is None:
222
- logger.debug(
223
- 'Stop tailing provision logs for cluster'
224
- f' status for cluster {cluster_name} not found')
225
- break
226
- if cluster_status != status_lib.ClusterStatus.INIT:
227
- logger.debug(f'Stop tailing provision logs for cluster'
228
- f' {cluster_name} has status {cluster_status} '
229
- '(not in INIT state)')
294
+ if cluster_name is not None:
295
+ if current_time - last_flush_time > _PROVISION_LOG_TIMEOUT:
230
296
  break
297
+ if should_check_status:
298
+ last_status_check_time = current_time
299
+ cluster_status = await (
300
+ global_user_state.get_status_from_cluster_name_async(
301
+ cluster_name))
302
+ if cluster_status is None:
303
+ logger.debug(
304
+ 'Stop tailing provision logs for cluster'
305
+ f' status for cluster {cluster_name} not found')
306
+ break
307
+ if cluster_status != status_lib.ClusterStatus.INIT:
308
+ logger.debug(
309
+ f'Stop tailing provision logs for cluster'
310
+ f' {cluster_name} has status {cluster_status} '
311
+ '(not in INIT state)')
231
312
  if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
232
313
  # Currently just used to keep the connection busy, refer to
233
314
  # https://github.com/skypilot-org/skypilot/issues/5750 for
@@ -267,28 +348,36 @@ def stream_response_for_long_request(
267
348
  request_id: str,
268
349
  logs_path: pathlib.Path,
269
350
  background_tasks: fastapi.BackgroundTasks,
351
+ kill_request_on_disconnect: bool = True,
270
352
  ) -> fastapi.responses.StreamingResponse:
271
- return stream_response(request_id,
272
- logs_path,
273
- background_tasks,
274
- polling_interval=LONG_REQUEST_POLL_INTERVAL)
353
+ """Stream the logs of a long request."""
354
+ return stream_response(
355
+ request_id,
356
+ logs_path,
357
+ background_tasks,
358
+ polling_interval=LONG_REQUEST_POLL_INTERVAL,
359
+ kill_request_on_disconnect=kill_request_on_disconnect,
360
+ )
275
361
 
276
362
 
277
363
  def stream_response(
278
364
  request_id: str,
279
365
  logs_path: pathlib.Path,
280
366
  background_tasks: fastapi.BackgroundTasks,
281
- polling_interval: float = DEFAULT_POLL_INTERVAL
367
+ polling_interval: float = DEFAULT_POLL_INTERVAL,
368
+ kill_request_on_disconnect: bool = True,
282
369
  ) -> fastapi.responses.StreamingResponse:
283
370
 
284
- async def on_disconnect():
285
- logger.info(f'User terminated the connection for request '
286
- f'{request_id}')
287
- requests_lib.kill_requests([request_id])
371
+ if kill_request_on_disconnect:
372
+
373
+ async def on_disconnect():
374
+ logger.info(f'User terminated the connection for request '
375
+ f'{request_id}')
376
+ requests_lib.kill_requests([request_id])
288
377
 
289
- # The background task will be run after returning a response.
290
- # https://fastapi.tiangolo.com/tutorial/background-tasks/
291
- background_tasks.add_task(on_disconnect)
378
+ # The background task will be run after returning a response.
379
+ # https://fastapi.tiangolo.com/tutorial/background-tasks/
380
+ background_tasks.add_task(on_disconnect)
292
381
 
293
382
  return fastapi.responses.StreamingResponse(
294
383
  log_streamer(request_id, logs_path, polling_interval=polling_interval),
sky/server/uvicorn.py CHANGED
@@ -46,11 +46,11 @@ except ValueError:
46
46
 
47
47
  # TODO(aylei): use decorator to register requests that need to be proactively
48
48
  # cancelled instead of hardcoding here.
49
- _RETRIABLE_REQUEST_NAMES = [
49
+ _RETRIABLE_REQUEST_NAMES = {
50
50
  'sky.logs',
51
51
  'sky.jobs.logs',
52
52
  'sky.serve.logs',
53
- ]
53
+ }
54
54
 
55
55
 
56
56
  def add_timestamp_prefix_for_server_logs() -> None:
@@ -151,37 +151,38 @@ class Server(uvicorn.Server):
151
151
  requests_lib.RequestStatus.PENDING,
152
152
  requests_lib.RequestStatus.RUNNING,
153
153
  ]
154
- reqs = requests_lib.get_request_tasks(
155
- req_filter=requests_lib.RequestTaskFilter(status=statuses))
156
- if not reqs:
154
+ requests = [(request_task.request_id, request_task.name)
155
+ for request_task in requests_lib.get_request_tasks(
156
+ req_filter=requests_lib.RequestTaskFilter(
157
+ status=statuses, fields=['request_id', 'name']))
158
+ ]
159
+ if not requests:
157
160
  break
158
- logger.info(f'{len(reqs)} on-going requests '
161
+ logger.info(f'{len(requests)} on-going requests '
159
162
  'found, waiting for them to finish...')
160
163
  # Proactively cancel internal requests and logs requests since
161
164
  # they can run for infinite time.
162
- internal_request_ids = [
165
+ internal_request_ids = {
163
166
  d.id for d in daemons.INTERNAL_REQUEST_DAEMONS
164
- ]
167
+ }
165
168
  if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
166
169
  logger.warning('Timeout waiting for on-going requests to '
167
170
  'finish, cancelling all on-going requests.')
168
- for req in reqs:
169
- self.interrupt_request_for_retry(req.request_id)
171
+ for request_id, _ in requests:
172
+ self.interrupt_request_for_retry(request_id)
170
173
  break
171
174
  interrupted = 0
172
- for req in reqs:
173
- if req.request_id in internal_request_ids:
174
- self.interrupt_request_for_retry(req.request_id)
175
- interrupted += 1
176
- elif req.name in _RETRIABLE_REQUEST_NAMES:
177
- self.interrupt_request_for_retry(req.request_id)
175
+ for request_id, name in requests:
176
+ if (name in _RETRIABLE_REQUEST_NAMES or
177
+ request_id in internal_request_ids):
178
+ self.interrupt_request_for_retry(request_id)
178
179
  interrupted += 1
179
180
  # TODO(aylei): interrupt pending requests to accelerate the
180
181
  # shutdown.
181
182
  # If some requests are not interrupted, wait for them to finish,
182
183
  # otherwise we just check again immediately to accelerate the
183
184
  # shutdown process.
184
- if interrupted < len(reqs):
185
+ if interrupted < len(requests):
185
186
  time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)
186
187
 
187
188
  def interrupt_request_for_retry(self, request_id: str) -> None:
sky/setup_files/alembic.ini CHANGED
@@ -98,6 +98,10 @@ version_table = alembic_version_spot_jobs_db
98
98
  version_locations = %(here)s/../schemas/db/serve_state
99
99
  version_table = alembic_version_serve_state_db
100
100
 
101
+ [sky_config_db]
102
+ version_locations = %(here)s/../schemas/db/skypilot_config
103
+ version_table = alembic_version_sky_config_db
104
+
101
105
  [post_write_hooks]
102
106
  # post_write_hooks defines scripts or Python functions that are run
103
107
  # on newly generated revision scripts. See the documentation for further
sky/skylet/services.py CHANGED
@@ -408,17 +408,17 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
408
408
  ) -> managed_jobsv1_pb2.GetJobTableResponse:
409
409
  try:
410
410
  accessible_workspaces = list(request.accessible_workspaces)
411
- job_ids = list(request.job_ids.ids) if request.job_ids else None
411
+ job_ids = (list(request.job_ids.ids)
412
+ if request.HasField('job_ids') else None)
412
413
  user_hashes: Optional[List[Optional[str]]] = None
413
- if request.user_hashes:
414
+ if request.HasField('user_hashes'):
414
415
  user_hashes = list(request.user_hashes.hashes)
415
416
  # For backwards compatibility, we show jobs that do not have a
416
417
  # user_hash. TODO: Remove before 0.12.0.
417
418
  if request.show_jobs_without_user_hash:
418
419
  user_hashes.append(None)
419
- statuses = list(
420
- request.statuses.statuses) if request.statuses else None
421
-
420
+ statuses = (list(request.statuses.statuses)
421
+ if request.HasField('statuses') else None)
422
422
  job_queue = managed_job_utils.get_managed_job_queue(
423
423
  skip_finished=request.skip_finished,
424
424
  accessible_workspaces=accessible_workspaces,
sky/skypilot_config.py CHANGED
@@ -64,7 +64,6 @@ from sqlalchemy import orm
64
64
  from sqlalchemy.dialects import postgresql
65
65
  from sqlalchemy.dialects import sqlite
66
66
  from sqlalchemy.ext import declarative
67
- from sqlalchemy.pool import NullPool
68
67
 
69
68
  from sky import exceptions
70
69
  from sky import sky_logging
@@ -77,6 +76,7 @@ from sky.utils import schemas
77
76
  from sky.utils import ux_utils
78
77
  from sky.utils import yaml_utils
79
78
  from sky.utils.db import db_utils
79
+ from sky.utils.db import migration_utils
80
80
  from sky.utils.kubernetes import config_map_utils
81
81
 
82
82
  if typing.TYPE_CHECKING:
@@ -121,7 +121,8 @@ _PROJECT_CONFIG_PATH = '.sky.yaml'
121
121
 
122
122
  API_SERVER_CONFIG_KEY = 'api_server_config'
123
123
 
124
- _DB_USE_LOCK = threading.Lock()
124
+ _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
125
+ _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
125
126
 
126
127
  Base = declarative.declarative_base()
127
128
 
@@ -481,7 +482,7 @@ def safe_reload_config() -> None:
481
482
  reload_config()
482
483
 
483
484
 
484
- def reload_config() -> None:
485
+ def reload_config(init_db: bool = False) -> None:
485
486
  internal_config_path = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
486
487
  if internal_config_path is not None:
487
488
  # {ENV_VAR_SKYPILOT_CONFIG} is used internally.
@@ -493,7 +494,7 @@ def reload_config() -> None:
493
494
  return
494
495
 
495
496
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
496
- _reload_config_as_server()
497
+ _reload_config_as_server(init_db=init_db)
497
498
  else:
498
499
  _reload_config_as_client()
499
500
 
@@ -564,7 +565,43 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
564
565
  _set_loaded_config_path(config_path)
565
566
 
566
567
 
567
- def _reload_config_as_server() -> None:
568
+ def _create_table(engine: sqlalchemy.engine.Engine):
569
+ """Initialize the config database with migrations."""
570
+ migration_utils.safe_alembic_upgrade(
571
+ engine, migration_utils.SKYPILOT_CONFIG_DB_NAME,
572
+ migration_utils.SKYPILOT_CONFIG_VERSION)
573
+
574
+
575
+ def _initialize_and_get_db() -> sqlalchemy.engine.Engine:
576
+ """Initialize and return the config database engine.
577
+
578
+ This function should only be called by the API Server during initialization.
579
+ Client-side code should never call this function.
580
+ """
581
+ assert os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None, (
582
+ 'initialize_and_get_db() can only be called by the API Server')
583
+
584
+ global _SQLALCHEMY_ENGINE
585
+
586
+ if _SQLALCHEMY_ENGINE is not None:
587
+ return _SQLALCHEMY_ENGINE
588
+
589
+ with _SQLALCHEMY_ENGINE_LOCK:
590
+ if _SQLALCHEMY_ENGINE is not None:
591
+ return _SQLALCHEMY_ENGINE
592
+
593
+ # We only store config in the DB when using Postgres,
594
+ # so no need to pass in db_name here.
595
+ engine = db_utils.get_engine(None)
596
+
597
+ # Run migrations if needed
598
+ _create_table(engine)
599
+
600
+ _SQLALCHEMY_ENGINE = engine
601
+ return _SQLALCHEMY_ENGINE
602
+
603
+
604
+ def _reload_config_as_server(init_db: bool = False) -> None:
568
605
  # Reset the global variables, to avoid using stale values.
569
606
  _set_loaded_config(config_utils.Config())
570
607
  _set_loaded_config_path(None)
@@ -580,37 +617,24 @@ def _reload_config_as_server() -> None:
580
617
  raise ValueError(
581
618
  'If db config is specified, no other config is allowed')
582
619
  logger.debug('retrieving config from database')
583
- with _DB_USE_LOCK:
584
- dispose_engine = False
585
- if db_utils.get_max_connections() == 0:
586
- dispose_engine = True
587
- sqlalchemy_engine = sqlalchemy.create_engine(db_url,
588
- poolclass=NullPool)
589
- else:
590
- sqlalchemy_engine = db_utils.get_engine('config')
591
- db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
592
- sqlalchemy_engine)
593
-
594
- def _get_config_yaml_from_db(
595
- key: str) -> Optional[config_utils.Config]:
596
- assert sqlalchemy_engine is not None
597
- with orm.Session(sqlalchemy_engine) as session:
598
- row = session.query(config_yaml_table).filter_by(
599
- key=key).first()
600
- if row:
601
- db_config = config_utils.Config(
602
- yaml_utils.safe_load(row.value))
603
- db_config.pop_nested(('db',), None)
604
- return db_config
605
- return None
606
-
607
- db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
608
- if db_config:
609
- server_config = overlay_skypilot_config(server_config,
610
- db_config)
611
- # Close the engine to avoid connection leaks
612
- if dispose_engine:
613
- sqlalchemy_engine.dispose()
620
+
621
+ if init_db:
622
+ _initialize_and_get_db()
623
+
624
+ def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
625
+ assert _SQLALCHEMY_ENGINE is not None
626
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
627
+ row = session.query(config_yaml_table).filter_by(
628
+ key=key).first()
629
+ if row:
630
+ db_config = config_utils.Config(yaml_utils.safe_load(row.value))
631
+ db_config.pop_nested(('db',), None)
632
+ return db_config
633
+ return None
634
+
635
+ db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
636
+ if db_config:
637
+ server_config = overlay_skypilot_config(server_config, db_config)
614
638
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
615
639
  logger.debug(f'server config: \n'
616
640
  f'{yaml_utils.dump_yaml_str(dict(server_config))}')
@@ -666,7 +690,7 @@ def loaded_config_path_serialized() -> Optional[str]:
666
690
 
667
691
 
668
692
  # Load on import, synchronization is guaranteed by python interpreter.
669
- reload_config()
693
+ reload_config(init_db=True)
670
694
 
671
695
 
672
696
  def loaded() -> bool:
@@ -880,44 +904,32 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
880
904
  if new_db_url and new_db_url != existing_db_url:
881
905
  raise ValueError('Cannot change db url while server is running')
882
906
  if existing_db_url:
883
- with _DB_USE_LOCK:
884
- dispose_engine = False
885
- if db_utils.get_max_connections() == 0:
886
- dispose_engine = True
887
- sqlalchemy_engine = sqlalchemy.create_engine(
888
- existing_db_url, poolclass=NullPool)
889
- else:
890
- sqlalchemy_engine = db_utils.get_engine('config')
891
- db_utils.add_all_tables_to_db_sqlalchemy(
892
- Base.metadata, sqlalchemy_engine)
893
-
894
- def _set_config_yaml_to_db(key: str,
895
- config: config_utils.Config):
896
- assert sqlalchemy_engine is not None
897
- config_str = yaml_utils.dump_yaml_str(dict(config))
898
- with orm.Session(sqlalchemy_engine) as session:
899
- if (sqlalchemy_engine.dialect.name ==
900
- db_utils.SQLAlchemyDialect.SQLITE.value):
901
- insert_func = sqlite.insert
902
- elif (sqlalchemy_engine.dialect.name ==
903
- db_utils.SQLAlchemyDialect.POSTGRESQL.value):
904
- insert_func = postgresql.insert
905
- else:
906
- raise ValueError('Unsupported database dialect')
907
- insert_stmnt = insert_func(config_yaml_table).values(
908
- key=key, value=config_str)
909
- do_update_stmt = insert_stmnt.on_conflict_do_update(
910
- index_elements=[config_yaml_table.c.key],
911
- set_={config_yaml_table.c.value: config_str})
912
- session.execute(do_update_stmt)
913
- session.commit()
914
-
915
- logger.debug('saving api_server config to db')
916
- _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
917
- db_updated = True
918
- # Close the engine to avoid connection leaks
919
- if dispose_engine:
920
- sqlalchemy_engine.dispose()
907
+
908
+ def _set_config_yaml_to_db(key: str, config: config_utils.Config):
909
+ # reload_config(init_db=True) is called when this module is
910
+ # imported, so the database engine must already be initialized.
911
+ assert _SQLALCHEMY_ENGINE is not None
912
+ config_str = yaml_utils.dump_yaml_str(dict(config))
913
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
914
+ if (_SQLALCHEMY_ENGINE.dialect.name ==
915
+ db_utils.SQLAlchemyDialect.SQLITE.value):
916
+ insert_func = sqlite.insert
917
+ elif (_SQLALCHEMY_ENGINE.dialect.name ==
918
+ db_utils.SQLAlchemyDialect.POSTGRESQL.value):
919
+ insert_func = postgresql.insert
920
+ else:
921
+ raise ValueError('Unsupported database dialect')
922
+ insert_stmnt = insert_func(config_yaml_table).values(
923
+ key=key, value=config_str)
924
+ do_update_stmt = insert_stmnt.on_conflict_do_update(
925
+ index_elements=[config_yaml_table.c.key],
926
+ set_={config_yaml_table.c.value: config_str})
927
+ session.execute(do_update_stmt)
928
+ session.commit()
929
+
930
+ logger.debug('saving api_server config to db')
931
+ _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
932
+ db_updated = True
921
933
 
922
934
  if not db_updated:
923
935
  # save to the local file (PVC in Kubernetes, local file otherwise)
sky/ssh_node_pools/server.py CHANGED
@@ -99,7 +99,7 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
99
99
  """Deploy SSH Node Pool using existing ssh_up functionality."""
100
100
  try:
101
101
  ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
102
- executor.schedule_request(
102
+ await executor.schedule_request_async(
103
103
  request_id=request.state.request_id,
104
104
  request_name='ssh_up',
105
105
  request_body=ssh_up_body,
@@ -124,7 +124,7 @@ async def deploy_ssh_node_pool_general(
124
124
  ssh_up_body: payloads.SSHUpBody) -> Dict[str, str]:
125
125
  """Deploys all SSH Node Pools."""
126
126
  try:
127
- executor.schedule_request(
127
+ await executor.schedule_request_async(
128
128
  request_id=request.state.request_id,
129
129
  request_name='ssh_up',
130
130
  request_body=ssh_up_body,
@@ -150,7 +150,7 @@ async def down_ssh_node_pool(request: fastapi.Request,
150
150
  """Cleans up a SSH Node Pools."""
151
151
  try:
152
152
  ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
153
- executor.schedule_request(
153
+ await executor.schedule_request_async(
154
154
  request_id=request.state.request_id,
155
155
  request_name='ssh_down',
156
156
  request_body=ssh_up_body,
@@ -178,7 +178,7 @@ async def down_ssh_node_pool_general(
178
178
  try:
179
179
  # Set cleanup=True for down operation
180
180
  ssh_up_body.cleanup = True
181
- executor.schedule_request(
181
+ await executor.schedule_request_async(
182
182
  request_id=request.state.request_id,
183
183
  request_name='ssh_down',
184
184
  request_body=ssh_up_body,
sky/users/permission.py CHANGED
@@ -14,6 +14,7 @@ from sky import models
14
14
  from sky import sky_logging
15
15
  from sky.skylet import constants
16
16
  from sky.users import rbac
17
+ from sky.utils import annotations
17
18
  from sky.utils import common_utils
18
19
  from sky.utils.db import db_utils
19
20
 
@@ -254,6 +255,9 @@ class PermissionService:
254
255
  with _policy_lock():
255
256
  self._load_policy_no_lock()
256
257
 
258
+ # Right now, not a lot of users are using multiple workspaces,
259
+ # so 5 should be more than enough.
260
+ @annotations.lru_cache(scope='request', maxsize=5)
257
261
  def check_workspace_permission(self, user_id: str,
258
262
  workspace_name: str) -> bool:
259
263
  """Check workspace permission.