skypilot-nightly 1.0.0.dev20251021__py3-none-any.whl → 1.0.0.dev20251022__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. See the registry's advisory page for more details.

Files changed (81)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/dashboard/out/404.html +1 -1
  4. sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
  5. sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
  6. sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
  7. sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
  8. sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
  9. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
  10. sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
  11. sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
  14. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
  16. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
  18. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
  25. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  26. sky/dashboard/out/clusters/[cluster].html +1 -1
  27. sky/dashboard/out/clusters.html +1 -1
  28. sky/dashboard/out/config.html +1 -1
  29. sky/dashboard/out/index.html +1 -1
  30. sky/dashboard/out/infra/[context].html +1 -1
  31. sky/dashboard/out/infra.html +1 -1
  32. sky/dashboard/out/jobs/[job].html +1 -1
  33. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  34. sky/dashboard/out/jobs.html +1 -1
  35. sky/dashboard/out/users.html +1 -1
  36. sky/dashboard/out/volumes.html +1 -1
  37. sky/dashboard/out/workspace/new.html +1 -1
  38. sky/dashboard/out/workspaces/[name].html +1 -1
  39. sky/dashboard/out/workspaces.html +1 -1
  40. sky/global_user_state.py +117 -17
  41. sky/jobs/constants.py +1 -1
  42. sky/jobs/server/core.py +4 -2
  43. sky/jobs/server/server.py +11 -11
  44. sky/jobs/state.py +307 -55
  45. sky/jobs/utils.py +248 -144
  46. sky/schemas/api/responses.py +2 -0
  47. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  48. sky/serve/server/server.py +7 -7
  49. sky/server/common.py +1 -13
  50. sky/server/requests/executor.py +20 -20
  51. sky/server/requests/payloads.py +3 -0
  52. sky/server/requests/requests.py +12 -19
  53. sky/server/requests/serializers/encoders.py +3 -3
  54. sky/server/server.py +34 -34
  55. sky/setup_files/alembic.ini +4 -0
  56. sky/skylet/services.py +5 -5
  57. sky/skypilot_config.py +87 -75
  58. sky/ssh_node_pools/server.py +4 -4
  59. sky/users/permission.py +4 -0
  60. sky/utils/db/db_utils.py +11 -3
  61. sky/utils/db/migration_utils.py +7 -3
  62. sky/volumes/server/server.py +3 -3
  63. sky/workspaces/server.py +6 -6
  64. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +36 -35
  65. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +73 -72
  66. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  67. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  68. sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
  69. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  70. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  71. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  72. sky/dashboard/out/_next/static/chunks/webpack-66f23594d38c7f16.js +0 -1
  73. sky/dashboard/out/_next/static/jDc1PlRsl9Cc5FQUMLBu8/_buildManifest.js +0 -1
  74. /sky/dashboard/out/_next/static/{jDc1PlRsl9Cc5FQUMLBu8 → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
  75. /sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
  76. /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
  77. /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
  78. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
  79. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
  80. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
  81. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
@@ -542,6 +542,9 @@ class JobsQueueV2Body(RequestBody):
542
542
  page: Optional[int] = None
543
543
  limit: Optional[int] = None
544
544
  statuses: Optional[List[str]] = None
545
+ # The fields to return in the response.
546
+ # Refer to the fields in the `class ManagedJobRecord` in `response.py`
547
+ fields: Optional[List[str]] = None
545
548
 
546
549
 
547
550
  class JobsCancelBody(RequestBody):
@@ -16,6 +16,7 @@ import time
16
16
  import traceback
17
17
  from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
18
18
  Tuple)
19
+ import uuid
19
20
 
20
21
  import anyio
21
22
  import colorama
@@ -293,6 +294,11 @@ class Request:
293
294
  raise
294
295
 
295
296
 
297
+ def get_new_request_id() -> str:
298
+ """Get a new request ID."""
299
+ return str(uuid.uuid4())
300
+
301
+
296
302
  def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
297
303
  """Serialize the SkyPilot API request for display purposes.
298
304
 
@@ -657,17 +663,15 @@ async def _get_request_no_lock_async(
657
663
  return Request.from_row(row)
658
664
 
659
665
 
660
- @init_db
666
+ @init_db_async
661
667
  @metrics_lib.time_me
662
- def get_latest_request_id() -> Optional[str]:
668
+ async def get_latest_request_id_async() -> Optional[str]:
663
669
  """Get the latest request ID."""
664
670
  assert _DB is not None
665
- with _DB.conn:
666
- cursor = _DB.conn.cursor()
667
- cursor.execute(f'SELECT request_id FROM {REQUEST_TABLE} '
668
- 'ORDER BY created_at DESC LIMIT 1')
669
- row = cursor.fetchone()
670
- return row[0] if row else None
671
+ async with _DB.execute_fetchall_async(
672
+ (f'SELECT request_id FROM {REQUEST_TABLE} '
673
+ 'ORDER BY created_at DESC LIMIT 1')) as rows:
674
+ return rows[0][0] if rows else None
671
675
 
672
676
 
673
677
  @init_db
@@ -725,17 +729,6 @@ async def get_request_status_async(
725
729
  return StatusWithMsg(status, status_msg)
726
730
 
727
731
 
728
- @init_db
729
- @metrics_lib.time_me
730
- def create_if_not_exists(request: Request) -> bool:
731
- """Create a SkyPilot API request if it does not exist."""
732
- with filelock.FileLock(request_lock_path(request.request_id)):
733
- if _get_request_no_lock(request.request_id) is not None:
734
- return False
735
- _add_or_update_request_no_lock(request)
736
- return True
737
-
738
-
739
732
  @init_db_async
740
733
  @metrics_lib.time_me_async
741
734
  @asyncio_utils.shield
@@ -121,7 +121,7 @@ def encode_status_kubernetes(
121
121
  encoded_cluster = dataclasses.asdict(cluster)
122
122
  encoded_cluster['status'] = encoded_cluster['status'].value
123
123
  encoded_unmanaged_clusters.append(encoded_cluster)
124
- all_jobs = [job.model_dump() for job in all_jobs]
124
+ all_jobs = [job.model_dump(by_alias=True) for job in all_jobs]
125
125
  return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context
126
126
 
127
127
 
@@ -151,9 +151,9 @@ def encode_jobs_queue_v2(
151
151
  for job in jobs:
152
152
  job['status'] = job['status'].value
153
153
  if total is None:
154
- return [job.model_dump() for job in jobs]
154
+ return [job.model_dump(by_alias=True) for job in jobs]
155
155
  return {
156
- 'jobs': [job.model_dump() for job in jobs],
156
+ 'jobs': [job.model_dump(by_alias=True) for job in jobs],
157
157
  'total': total,
158
158
  'total_no_filter': total_no_filter,
159
159
  'status_counts': status_counts
sky/server/server.py CHANGED
@@ -163,7 +163,7 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
163
163
  """Middleware to add a request ID to each request."""
164
164
 
165
165
  async def dispatch(self, request: fastapi.Request, call_next):
166
- request_id = str(uuid.uuid4())
166
+ request_id = requests_lib.get_new_request_id()
167
167
  request.state.request_id = request_id
168
168
  response = await call_next(request)
169
169
  # TODO(syang): remove X-Request-ID when v0.10.0 is released.
@@ -455,9 +455,9 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
455
455
  loop.call_at(target, tick)
456
456
 
457
457
 
458
- def schedule_on_boot_check():
458
+ async def schedule_on_boot_check_async():
459
459
  try:
460
- executor.schedule_request(
460
+ await executor.schedule_request_async(
461
461
  request_id='skypilot-server-on-boot-check',
462
462
  request_name='check',
463
463
  request_body=payloads.CheckBody(),
@@ -480,7 +480,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
480
480
  if event.should_skip():
481
481
  continue
482
482
  try:
483
- executor.schedule_request(
483
+ await executor.schedule_request_async(
484
484
  request_id=event.id,
485
485
  request_name=event.name,
486
486
  request_body=payloads.RequestBody(),
@@ -495,7 +495,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
495
495
  # Lifespan will be executed in each uvicorn worker process, we
496
496
  # can safely ignore the error if the task is already scheduled.
497
497
  logger.debug(f'Request {event.id} already exists.')
498
- schedule_on_boot_check()
498
+ await schedule_on_boot_check_async()
499
499
  asyncio.create_task(cleanup_upload_ids())
500
500
  if metrics_utils.METRICS_ENABLED:
501
501
  # Start monitoring the event loop lag in each server worker
@@ -729,7 +729,7 @@ async def token(request: fastapi.Request,
729
729
  async def check(request: fastapi.Request,
730
730
  check_body: payloads.CheckBody) -> None:
731
731
  """Checks enabled clouds."""
732
- executor.schedule_request(
732
+ await executor.schedule_request_async(
733
733
  request_id=request.state.request_id,
734
734
  request_name='check',
735
735
  request_body=check_body,
@@ -743,7 +743,7 @@ async def enabled_clouds(request: fastapi.Request,
743
743
  workspace: Optional[str] = None,
744
744
  expand: bool = False) -> None:
745
745
  """Gets enabled clouds on the server."""
746
- executor.schedule_request(
746
+ await executor.schedule_request_async(
747
747
  request_id=request.state.request_id,
748
748
  request_name='enabled_clouds',
749
749
  request_body=payloads.EnabledCloudsBody(workspace=workspace,
@@ -759,7 +759,7 @@ async def realtime_kubernetes_gpu_availability(
759
759
  realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
760
760
  ) -> None:
761
761
  """Gets real-time Kubernetes GPU availability."""
762
- executor.schedule_request(
762
+ await executor.schedule_request_async(
763
763
  request_id=request.state.request_id,
764
764
  request_name='realtime_kubernetes_gpu_availability',
765
765
  request_body=realtime_gpu_availability_body,
@@ -774,7 +774,7 @@ async def kubernetes_node_info(
774
774
  kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
775
775
  ) -> None:
776
776
  """Gets Kubernetes nodes information and hints."""
777
- executor.schedule_request(
777
+ await executor.schedule_request_async(
778
778
  request_id=request.state.request_id,
779
779
  request_name='kubernetes_node_info',
780
780
  request_body=kubernetes_node_info_body,
@@ -786,7 +786,7 @@ async def kubernetes_node_info(
786
786
  @app.get('/status_kubernetes')
787
787
  async def status_kubernetes(request: fastapi.Request) -> None:
788
788
  """Gets Kubernetes status."""
789
- executor.schedule_request(
789
+ await executor.schedule_request_async(
790
790
  request_id=request.state.request_id,
791
791
  request_name='status_kubernetes',
792
792
  request_body=payloads.RequestBody(),
@@ -800,7 +800,7 @@ async def list_accelerators(
800
800
  request: fastapi.Request,
801
801
  list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
802
802
  """Gets list of accelerators from cloud catalog."""
803
- executor.schedule_request(
803
+ await executor.schedule_request_async(
804
804
  request_id=request.state.request_id,
805
805
  request_name='list_accelerators',
806
806
  request_body=list_accelerator_counts_body,
@@ -815,7 +815,7 @@ async def list_accelerator_counts(
815
815
  list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
816
816
  ) -> None:
817
817
  """Gets list of accelerator counts from cloud catalog."""
818
- executor.schedule_request(
818
+ await executor.schedule_request_async(
819
819
  request_id=request.state.request_id,
820
820
  request_name='list_accelerator_counts',
821
821
  request_body=list_accelerator_counts_body,
@@ -872,7 +872,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
872
872
  async def optimize(optimize_body: payloads.OptimizeBody,
873
873
  request: fastapi.Request) -> None:
874
874
  """Optimizes the user's DAG."""
875
- executor.schedule_request(
875
+ await executor.schedule_request_async(
876
876
  request_id=request.state.request_id,
877
877
  request_name='optimize',
878
878
  request_body=optimize_body,
@@ -1082,7 +1082,7 @@ async def launch(launch_body: payloads.LaunchBody,
1082
1082
  """Launches a cluster or task."""
1083
1083
  request_id = request.state.request_id
1084
1084
  logger.info(f'Launching request: {request_id}')
1085
- executor.schedule_request(
1085
+ await executor.schedule_request_async(
1086
1086
  request_id,
1087
1087
  request_name='launch',
1088
1088
  request_body=launch_body,
@@ -1098,7 +1098,7 @@ async def launch(launch_body: payloads.LaunchBody,
1098
1098
  async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
1099
1099
  """Executes a task on an existing cluster."""
1100
1100
  cluster_name = exec_body.cluster_name
1101
- executor.schedule_request(
1101
+ await executor.schedule_request_async(
1102
1102
  request_id=request.state.request_id,
1103
1103
  request_name='exec',
1104
1104
  request_body=exec_body,
@@ -1116,7 +1116,7 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
1116
1116
  async def stop(request: fastapi.Request,
1117
1117
  stop_body: payloads.StopOrDownBody) -> None:
1118
1118
  """Stops a cluster."""
1119
- executor.schedule_request(
1119
+ await executor.schedule_request_async(
1120
1120
  request_id=request.state.request_id,
1121
1121
  request_name='stop',
1122
1122
  request_body=stop_body,
@@ -1136,7 +1136,7 @@ async def status(
1136
1136
  raise fastapi.HTTPException(
1137
1137
  status_code=503,
1138
1138
  detail='Server is shutting down, please try again later.')
1139
- executor.schedule_request(
1139
+ await executor.schedule_request_async(
1140
1140
  request_id=request.state.request_id,
1141
1141
  request_name='status',
1142
1142
  request_body=status_body,
@@ -1151,7 +1151,7 @@ async def status(
1151
1151
  async def endpoints(request: fastapi.Request,
1152
1152
  endpoint_body: payloads.EndpointsBody) -> None:
1153
1153
  """Gets the endpoint for a given cluster and port number (endpoint)."""
1154
- executor.schedule_request(
1154
+ await executor.schedule_request_async(
1155
1155
  request_id=request.state.request_id,
1156
1156
  request_name='endpoints',
1157
1157
  request_body=endpoint_body,
@@ -1165,7 +1165,7 @@ async def endpoints(request: fastapi.Request,
1165
1165
  async def down(request: fastapi.Request,
1166
1166
  down_body: payloads.StopOrDownBody) -> None:
1167
1167
  """Tears down a cluster."""
1168
- executor.schedule_request(
1168
+ await executor.schedule_request_async(
1169
1169
  request_id=request.state.request_id,
1170
1170
  request_name='down',
1171
1171
  request_body=down_body,
@@ -1179,7 +1179,7 @@ async def down(request: fastapi.Request,
1179
1179
  async def start(request: fastapi.Request,
1180
1180
  start_body: payloads.StartBody) -> None:
1181
1181
  """Restarts a cluster."""
1182
- executor.schedule_request(
1182
+ await executor.schedule_request_async(
1183
1183
  request_id=request.state.request_id,
1184
1184
  request_name='start',
1185
1185
  request_body=start_body,
@@ -1193,7 +1193,7 @@ async def start(request: fastapi.Request,
1193
1193
  async def autostop(request: fastapi.Request,
1194
1194
  autostop_body: payloads.AutostopBody) -> None:
1195
1195
  """Schedules an autostop/autodown for a cluster."""
1196
- executor.schedule_request(
1196
+ await executor.schedule_request_async(
1197
1197
  request_id=request.state.request_id,
1198
1198
  request_name='autostop',
1199
1199
  request_body=autostop_body,
@@ -1207,7 +1207,7 @@ async def autostop(request: fastapi.Request,
1207
1207
  async def queue(request: fastapi.Request,
1208
1208
  queue_body: payloads.QueueBody) -> None:
1209
1209
  """Gets the job queue of a cluster."""
1210
- executor.schedule_request(
1210
+ await executor.schedule_request_async(
1211
1211
  request_id=request.state.request_id,
1212
1212
  request_name='queue',
1213
1213
  request_body=queue_body,
@@ -1221,7 +1221,7 @@ async def queue(request: fastapi.Request,
1221
1221
  async def job_status(request: fastapi.Request,
1222
1222
  job_status_body: payloads.JobStatusBody) -> None:
1223
1223
  """Gets the status of a job."""
1224
- executor.schedule_request(
1224
+ await executor.schedule_request_async(
1225
1225
  request_id=request.state.request_id,
1226
1226
  request_name='job_status',
1227
1227
  request_body=job_status_body,
@@ -1235,7 +1235,7 @@ async def job_status(request: fastapi.Request,
1235
1235
  async def cancel(request: fastapi.Request,
1236
1236
  cancel_body: payloads.CancelBody) -> None:
1237
1237
  """Cancels jobs on a cluster."""
1238
- executor.schedule_request(
1238
+ await executor.schedule_request_async(
1239
1239
  request_id=request.state.request_id,
1240
1240
  request_name='cancel',
1241
1241
  request_body=cancel_body,
@@ -1255,7 +1255,7 @@ async def logs(
1255
1255
  # launch, to finish, so that a user does not need to manually pull the
1256
1256
  # request status.
1257
1257
  executor.check_request_thread_executor_available()
1258
- request_task = executor.prepare_request(
1258
+ request_task = await executor.prepare_request_async(
1259
1259
  request_id=request.state.request_id,
1260
1260
  request_name='logs',
1261
1261
  request_body=cluster_job_body,
@@ -1286,7 +1286,7 @@ async def download_logs(
1286
1286
  # We should reuse the original request body, so that the env vars, such as
1287
1287
  # user hash, are kept the same.
1288
1288
  cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
1289
- executor.schedule_request(
1289
+ await executor.schedule_request_async(
1290
1290
  request_id=request.state.request_id,
1291
1291
  request_name='download_logs',
1292
1292
  request_body=cluster_jobs_body,
@@ -1437,7 +1437,7 @@ def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
1437
1437
  async def cost_report(request: fastapi.Request,
1438
1438
  cost_report_body: payloads.CostReportBody) -> None:
1439
1439
  """Gets the cost report of a cluster."""
1440
- executor.schedule_request(
1440
+ await executor.schedule_request_async(
1441
1441
  request_id=request.state.request_id,
1442
1442
  request_name='cost_report',
1443
1443
  request_body=cost_report_body,
@@ -1449,7 +1449,7 @@ async def cost_report(request: fastapi.Request,
1449
1449
  @app.get('/storage/ls')
1450
1450
  async def storage_ls(request: fastapi.Request) -> None:
1451
1451
  """Gets the storages."""
1452
- executor.schedule_request(
1452
+ await executor.schedule_request_async(
1453
1453
  request_id=request.state.request_id,
1454
1454
  request_name='storage_ls',
1455
1455
  request_body=payloads.RequestBody(),
@@ -1462,7 +1462,7 @@ async def storage_ls(request: fastapi.Request) -> None:
1462
1462
  async def storage_delete(request: fastapi.Request,
1463
1463
  storage_body: payloads.StorageBody) -> None:
1464
1464
  """Deletes a storage."""
1465
- executor.schedule_request(
1465
+ await executor.schedule_request_async(
1466
1466
  request_id=request.state.request_id,
1467
1467
  request_name='storage_delete',
1468
1468
  request_body=storage_body,
@@ -1475,7 +1475,7 @@ async def storage_delete(request: fastapi.Request,
1475
1475
  async def local_up(request: fastapi.Request,
1476
1476
  local_up_body: payloads.LocalUpBody) -> None:
1477
1477
  """Launches a Kubernetes cluster on API server."""
1478
- executor.schedule_request(
1478
+ await executor.schedule_request_async(
1479
1479
  request_id=request.state.request_id,
1480
1480
  request_name='local_up',
1481
1481
  request_body=local_up_body,
@@ -1488,7 +1488,7 @@ async def local_up(request: fastapi.Request,
1488
1488
  async def local_down(request: fastapi.Request,
1489
1489
  local_down_body: payloads.LocalDownBody) -> None:
1490
1490
  """Tears down the Kubernetes cluster started by local_up."""
1491
- executor.schedule_request(
1491
+ await executor.schedule_request_async(
1492
1492
  request_id=request.state.request_id,
1493
1493
  request_name='local_down',
1494
1494
  request_body=local_down_body,
@@ -1566,7 +1566,7 @@ async def stream(
1566
1566
  detail='Only one of request_id and log_path can be provided')
1567
1567
 
1568
1568
  if request_id is None and log_path is None:
1569
- request_id = requests_lib.get_latest_request_id()
1569
+ request_id = await requests_lib.get_latest_request_id_async()
1570
1570
  if request_id is None:
1571
1571
  raise fastapi.HTTPException(status_code=404,
1572
1572
  detail='No request found')
@@ -1672,7 +1672,7 @@ async def stream(
1672
1672
  async def api_cancel(request: fastapi.Request,
1673
1673
  request_cancel_body: payloads.RequestCancelBody) -> None:
1674
1674
  """Cancels requests."""
1675
- executor.schedule_request(
1675
+ await executor.schedule_request_async(
1676
1676
  request_id=request.state.request_id,
1677
1677
  request_name='api_cancel',
1678
1678
  request_body=request_cancel_body,
@@ -1908,7 +1908,7 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1908
1908
  async def all_contexts(request: fastapi.Request) -> None:
1909
1909
  """Gets all Kubernetes and SSH node pool contexts."""
1910
1910
 
1911
- executor.schedule_request(
1911
+ await executor.schedule_request_async(
1912
1912
  request_id=request.state.request_id,
1913
1913
  request_name='all_contexts',
1914
1914
  request_body=payloads.RequestBody(),
@@ -98,6 +98,10 @@ version_table = alembic_version_spot_jobs_db
98
98
  version_locations = %(here)s/../schemas/db/serve_state
99
99
  version_table = alembic_version_serve_state_db
100
100
 
101
+ [sky_config_db]
102
+ version_locations = %(here)s/../schemas/db/skypilot_config
103
+ version_table = alembic_version_sky_config_db
104
+
101
105
  [post_write_hooks]
102
106
  # post_write_hooks defines scripts or Python functions that are run
103
107
  # on newly generated revision scripts. See the documentation for further
sky/skylet/services.py CHANGED
@@ -408,17 +408,17 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
408
408
  ) -> managed_jobsv1_pb2.GetJobTableResponse:
409
409
  try:
410
410
  accessible_workspaces = list(request.accessible_workspaces)
411
- job_ids = list(request.job_ids.ids) if request.job_ids else None
411
+ job_ids = (list(request.job_ids.ids)
412
+ if request.HasField('job_ids') else None)
412
413
  user_hashes: Optional[List[Optional[str]]] = None
413
- if request.user_hashes:
414
+ if request.HasField('user_hashes'):
414
415
  user_hashes = list(request.user_hashes.hashes)
415
416
  # For backwards compatibility, we show jobs that do not have a
416
417
  # user_hash. TODO: Remove before 0.12.0.
417
418
  if request.show_jobs_without_user_hash:
418
419
  user_hashes.append(None)
419
- statuses = list(
420
- request.statuses.statuses) if request.statuses else None
421
-
420
+ statuses = (list(request.statuses.statuses)
421
+ if request.HasField('statuses') else None)
422
422
  job_queue = managed_job_utils.get_managed_job_queue(
423
423
  skip_finished=request.skip_finished,
424
424
  accessible_workspaces=accessible_workspaces,
sky/skypilot_config.py CHANGED
@@ -64,7 +64,6 @@ from sqlalchemy import orm
64
64
  from sqlalchemy.dialects import postgresql
65
65
  from sqlalchemy.dialects import sqlite
66
66
  from sqlalchemy.ext import declarative
67
- from sqlalchemy.pool import NullPool
68
67
 
69
68
  from sky import exceptions
70
69
  from sky import sky_logging
@@ -77,6 +76,7 @@ from sky.utils import schemas
77
76
  from sky.utils import ux_utils
78
77
  from sky.utils import yaml_utils
79
78
  from sky.utils.db import db_utils
79
+ from sky.utils.db import migration_utils
80
80
  from sky.utils.kubernetes import config_map_utils
81
81
 
82
82
  if typing.TYPE_CHECKING:
@@ -121,7 +121,8 @@ _PROJECT_CONFIG_PATH = '.sky.yaml'
121
121
 
122
122
  API_SERVER_CONFIG_KEY = 'api_server_config'
123
123
 
124
- _DB_USE_LOCK = threading.Lock()
124
+ _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
125
+ _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
125
126
 
126
127
  Base = declarative.declarative_base()
127
128
 
@@ -481,7 +482,7 @@ def safe_reload_config() -> None:
481
482
  reload_config()
482
483
 
483
484
 
484
- def reload_config() -> None:
485
+ def reload_config(init_db: bool = False) -> None:
485
486
  internal_config_path = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
486
487
  if internal_config_path is not None:
487
488
  # {ENV_VAR_SKYPILOT_CONFIG} is used internally.
@@ -493,7 +494,7 @@ def reload_config() -> None:
493
494
  return
494
495
 
495
496
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
496
- _reload_config_as_server()
497
+ _reload_config_as_server(init_db=init_db)
497
498
  else:
498
499
  _reload_config_as_client()
499
500
 
@@ -564,7 +565,43 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
564
565
  _set_loaded_config_path(config_path)
565
566
 
566
567
 
567
- def _reload_config_as_server() -> None:
568
+ def _create_table(engine: sqlalchemy.engine.Engine):
569
+ """Initialize the config database with migrations."""
570
+ migration_utils.safe_alembic_upgrade(
571
+ engine, migration_utils.SKYPILOT_CONFIG_DB_NAME,
572
+ migration_utils.SKYPILOT_CONFIG_VERSION)
573
+
574
+
575
+ def _initialize_and_get_db() -> sqlalchemy.engine.Engine:
576
+ """Initialize and return the config database engine.
577
+
578
+ This function should only be called by the API Server during initialization.
579
+ Client-side code should never call this function.
580
+ """
581
+ assert os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None, (
582
+ 'initialize_and_get_db() can only be called by the API Server')
583
+
584
+ global _SQLALCHEMY_ENGINE
585
+
586
+ if _SQLALCHEMY_ENGINE is not None:
587
+ return _SQLALCHEMY_ENGINE
588
+
589
+ with _SQLALCHEMY_ENGINE_LOCK:
590
+ if _SQLALCHEMY_ENGINE is not None:
591
+ return _SQLALCHEMY_ENGINE
592
+
593
+ # We only store config in the DB when using Postgres,
594
+ # so no need to pass in db_name here.
595
+ engine = db_utils.get_engine(None)
596
+
597
+ # Run migrations if needed
598
+ _create_table(engine)
599
+
600
+ _SQLALCHEMY_ENGINE = engine
601
+ return _SQLALCHEMY_ENGINE
602
+
603
+
604
+ def _reload_config_as_server(init_db: bool = False) -> None:
568
605
  # Reset the global variables, to avoid using stale values.
569
606
  _set_loaded_config(config_utils.Config())
570
607
  _set_loaded_config_path(None)
@@ -580,37 +617,24 @@ def _reload_config_as_server() -> None:
580
617
  raise ValueError(
581
618
  'If db config is specified, no other config is allowed')
582
619
  logger.debug('retrieving config from database')
583
- with _DB_USE_LOCK:
584
- dispose_engine = False
585
- if db_utils.get_max_connections() == 0:
586
- dispose_engine = True
587
- sqlalchemy_engine = sqlalchemy.create_engine(db_url,
588
- poolclass=NullPool)
589
- else:
590
- sqlalchemy_engine = db_utils.get_engine('config')
591
- db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
592
- sqlalchemy_engine)
593
-
594
- def _get_config_yaml_from_db(
595
- key: str) -> Optional[config_utils.Config]:
596
- assert sqlalchemy_engine is not None
597
- with orm.Session(sqlalchemy_engine) as session:
598
- row = session.query(config_yaml_table).filter_by(
599
- key=key).first()
600
- if row:
601
- db_config = config_utils.Config(
602
- yaml_utils.safe_load(row.value))
603
- db_config.pop_nested(('db',), None)
604
- return db_config
605
- return None
606
-
607
- db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
608
- if db_config:
609
- server_config = overlay_skypilot_config(server_config,
610
- db_config)
611
- # Close the engine to avoid connection leaks
612
- if dispose_engine:
613
- sqlalchemy_engine.dispose()
620
+
621
+ if init_db:
622
+ _initialize_and_get_db()
623
+
624
+ def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
625
+ assert _SQLALCHEMY_ENGINE is not None
626
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
627
+ row = session.query(config_yaml_table).filter_by(
628
+ key=key).first()
629
+ if row:
630
+ db_config = config_utils.Config(yaml_utils.safe_load(row.value))
631
+ db_config.pop_nested(('db',), None)
632
+ return db_config
633
+ return None
634
+
635
+ db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
636
+ if db_config:
637
+ server_config = overlay_skypilot_config(server_config, db_config)
614
638
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
615
639
  logger.debug(f'server config: \n'
616
640
  f'{yaml_utils.dump_yaml_str(dict(server_config))}')
@@ -666,7 +690,7 @@ def loaded_config_path_serialized() -> Optional[str]:
666
690
 
667
691
 
668
692
  # Load on import, synchronization is guaranteed by python interpreter.
669
- reload_config()
693
+ reload_config(init_db=True)
670
694
 
671
695
 
672
696
  def loaded() -> bool:
@@ -880,44 +904,32 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
880
904
  if new_db_url and new_db_url != existing_db_url:
881
905
  raise ValueError('Cannot change db url while server is running')
882
906
  if existing_db_url:
883
- with _DB_USE_LOCK:
884
- dispose_engine = False
885
- if db_utils.get_max_connections() == 0:
886
- dispose_engine = True
887
- sqlalchemy_engine = sqlalchemy.create_engine(
888
- existing_db_url, poolclass=NullPool)
889
- else:
890
- sqlalchemy_engine = db_utils.get_engine('config')
891
- db_utils.add_all_tables_to_db_sqlalchemy(
892
- Base.metadata, sqlalchemy_engine)
893
-
894
- def _set_config_yaml_to_db(key: str,
895
- config: config_utils.Config):
896
- assert sqlalchemy_engine is not None
897
- config_str = yaml_utils.dump_yaml_str(dict(config))
898
- with orm.Session(sqlalchemy_engine) as session:
899
- if (sqlalchemy_engine.dialect.name ==
900
- db_utils.SQLAlchemyDialect.SQLITE.value):
901
- insert_func = sqlite.insert
902
- elif (sqlalchemy_engine.dialect.name ==
903
- db_utils.SQLAlchemyDialect.POSTGRESQL.value):
904
- insert_func = postgresql.insert
905
- else:
906
- raise ValueError('Unsupported database dialect')
907
- insert_stmnt = insert_func(config_yaml_table).values(
908
- key=key, value=config_str)
909
- do_update_stmt = insert_stmnt.on_conflict_do_update(
910
- index_elements=[config_yaml_table.c.key],
911
- set_={config_yaml_table.c.value: config_str})
912
- session.execute(do_update_stmt)
913
- session.commit()
914
-
915
- logger.debug('saving api_server config to db')
916
- _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
917
- db_updated = True
918
- # Close the engine to avoid connection leaks
919
- if dispose_engine:
920
- sqlalchemy_engine.dispose()
907
+
908
+ def _set_config_yaml_to_db(key: str, config: config_utils.Config):
909
+ # reload_config(init_db=True) is called when this module is
910
+ # imported, so the database engine must already be initialized.
911
+ assert _SQLALCHEMY_ENGINE is not None
912
+ config_str = yaml_utils.dump_yaml_str(dict(config))
913
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
914
+ if (_SQLALCHEMY_ENGINE.dialect.name ==
915
+ db_utils.SQLAlchemyDialect.SQLITE.value):
916
+ insert_func = sqlite.insert
917
+ elif (_SQLALCHEMY_ENGINE.dialect.name ==
918
+ db_utils.SQLAlchemyDialect.POSTGRESQL.value):
919
+ insert_func = postgresql.insert
920
+ else:
921
+ raise ValueError('Unsupported database dialect')
922
+ insert_stmnt = insert_func(config_yaml_table).values(
923
+ key=key, value=config_str)
924
+ do_update_stmt = insert_stmnt.on_conflict_do_update(
925
+ index_elements=[config_yaml_table.c.key],
926
+ set_={config_yaml_table.c.value: config_str})
927
+ session.execute(do_update_stmt)
928
+ session.commit()
929
+
930
+ logger.debug('saving api_server config to db')
931
+ _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
932
+ db_updated = True
921
933
 
922
934
  if not db_updated:
923
935
  # save to the local file (PVC in Kubernetes, local file otherwise)