skypilot-nightly 1.0.0.dev20251021-py3-none-any.whl → 1.0.0.dev20251022-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
- sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
- sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +117 -17
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +4 -2
- sky/jobs/server/server.py +11 -11
- sky/jobs/state.py +307 -55
- sky/jobs/utils.py +248 -144
- sky/schemas/api/responses.py +2 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/serve/server/server.py +7 -7
- sky/server/common.py +1 -13
- sky/server/requests/executor.py +20 -20
- sky/server/requests/payloads.py +3 -0
- sky/server/requests/requests.py +12 -19
- sky/server/requests/serializers/encoders.py +3 -3
- sky/server/server.py +34 -34
- sky/setup_files/alembic.ini +4 -0
- sky/skylet/services.py +5 -5
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +4 -4
- sky/users/permission.py +4 -0
- sky/utils/db/db_utils.py +11 -3
- sky/utils/db/migration_utils.py +7 -3
- sky/volumes/server/server.py +3 -3
- sky/workspaces/server.py +6 -6
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +36 -35
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +73 -72
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/webpack-66f23594d38c7f16.js +0 -1
- sky/dashboard/out/_next/static/jDc1PlRsl9Cc5FQUMLBu8/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{jDc1PlRsl9Cc5FQUMLBu8 → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
sky/server/requests/payloads.py
CHANGED
@@ -542,6 +542,9 @@ class JobsQueueV2Body(RequestBody):
     page: Optional[int] = None
     limit: Optional[int] = None
     statuses: Optional[List[str]] = None
+    # The fields to return in the response.
+    # Refer to the fields in the `class ManagedJobRecord` in `response.py`
+    fields: Optional[List[str]] = None


 class JobsCancelBody(RequestBody):
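The new `fields` parameter lets a client request only a projection of the job table instead of every column. A minimal sketch of building such a body (assuming the remaining `RequestBody` fields all have defaults; the names passed to `fields` are illustrative and must match attributes of `ManagedJobRecord` in `sky/schemas/api/responses.py`):

from sky.server.requests import payloads

# Request the first 50 RUNNING jobs, returning only two columns.
body = payloads.JobsQueueV2Body(
    page=1,
    limit=50,
    statuses=['RUNNING'],
    fields=['job_id', 'status'],  # illustrative field names
)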
sky/server/requests/requests.py
CHANGED
@@ -16,6 +16,7 @@ import time
 import traceback
 from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
                     Tuple)
+import uuid

 import anyio
 import colorama
@@ -293,6 +294,11 @@ class Request:
             raise


+def get_new_request_id() -> str:
+    """Get a new request ID."""
+    return str(uuid.uuid4())
+
+
 def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
     """Serialize the SkyPilot API request for display purposes.

@@ -657,17 +663,15 @@ async def _get_request_no_lock_async(
     return Request.from_row(row)


-@
+@init_db_async
 @metrics_lib.time_me
-def
+async def get_latest_request_id_async() -> Optional[str]:
     """Get the latest request ID."""
     assert _DB is not None
-    with _DB.
-
-
-
-        row = cursor.fetchone()
-        return row[0] if row else None
+    async with _DB.execute_fetchall_async(
+        (f'SELECT request_id FROM {REQUEST_TABLE} '
+         'ORDER BY created_at DESC LIMIT 1')) as rows:
+        return rows[0][0] if rows else None


 @init_db
@@ -725,17 +729,6 @@ async def get_request_status_async(
     return StatusWithMsg(status, status_msg)


-@init_db
-@metrics_lib.time_me
-def create_if_not_exists(request: Request) -> bool:
-    """Create a SkyPilot API request if it does not exist."""
-    with filelock.FileLock(request_lock_path(request.request_id)):
-        if _get_request_no_lock(request.request_id) is not None:
-            return False
-        _add_or_update_request_no_lock(request)
-        return True
-
-
 @init_db_async
 @metrics_lib.time_me_async
 @asyncio_utils.shield
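The synchronous, cursor-based `get_latest_request_id` is replaced by an async variant built on SkyPilot's internal `_DB.execute_fetchall_async` helper. A standalone sketch of the same query pattern, using aiosqlite as a stand-in for that wrapper (table name and database path are assumptions; the SQL mirrors the query above):

import asyncio

import aiosqlite

REQUEST_TABLE = 'requests'  # assumed table name


async def get_latest_request_id(db_path: str):
    # Non-blocking equivalent of cursor.execute(...) + fetchone().
    async with aiosqlite.connect(db_path) as db:
        async with db.execute(
                f'SELECT request_id FROM {REQUEST_TABLE} '
                'ORDER BY created_at DESC LIMIT 1') as cursor:
            row = await cursor.fetchone()
            return row[0] if row else None


# asyncio.run(get_latest_request_id('requests.db'))  # path illustrative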
sky/server/requests/serializers/encoders.py
CHANGED
@@ -121,7 +121,7 @@ def encode_status_kubernetes(
         encoded_cluster = dataclasses.asdict(cluster)
         encoded_cluster['status'] = encoded_cluster['status'].value
         encoded_unmanaged_clusters.append(encoded_cluster)
-    all_jobs = [job.model_dump() for job in all_jobs]
+    all_jobs = [job.model_dump(by_alias=True) for job in all_jobs]
    return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context


@@ -151,9 +151,9 @@ def encode_jobs_queue_v2(
     for job in jobs:
         job['status'] = job['status'].value
     if total is None:
-        return [job.model_dump() for job in jobs]
+        return [job.model_dump(by_alias=True) for job in jobs]
     return {
-        'jobs': [job.model_dump() for job in jobs],
+        'jobs': [job.model_dump(by_alias=True) for job in jobs],
         'total': total,
         'total_no_filter': total_no_filter,
         'status_counts': status_counts
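Passing `by_alias=True` makes pydantic serialize each job record under its declared aliases instead of the Python attribute names, matching what API clients and the dashboard now expect. A self-contained sketch of the behaviour (the alias below is hypothetical; the real aliases are defined on `ManagedJobRecord` in `sky/schemas/api/responses.py`):

from typing import Optional

from pydantic import BaseModel, Field


class ManagedJobRecord(BaseModel):
    # Hypothetical alias: stored as `job_id` on the model, exposed as `id`.
    job_id: int = Field(serialization_alias='id')
    status: Optional[str] = None


record = ManagedJobRecord(job_id=7, status='RUNNING')
print(record.model_dump())               # {'job_id': 7, 'status': 'RUNNING'}
print(record.model_dump(by_alias=True))  # {'id': 7, 'status': 'RUNNING'}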
sky/server/server.py
CHANGED
@@ -163,7 +163,7 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to add a request ID to each request."""

     async def dispatch(self, request: fastapi.Request, call_next):
-        request_id =
+        request_id = requests_lib.get_new_request_id()
         request.state.request_id = request_id
         response = await call_next(request)
         # TODO(syang): remove X-Request-ID when v0.10.0 is released.
@@ -455,9 +455,9 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
     loop.call_at(target, tick)


-def
+async def schedule_on_boot_check_async():
     try:
-        executor.
+        await executor.schedule_request_async(
             request_id='skypilot-server-on-boot-check',
             request_name='check',
             request_body=payloads.CheckBody(),
@@ -480,7 +480,7 @@ async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-name
         if event.should_skip():
             continue
         try:
-            executor.
+            await executor.schedule_request_async(
                 request_id=event.id,
                 request_name=event.name,
                 request_body=payloads.RequestBody(),
@@ -495,7 +495,7 @@ async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-name
             # Lifespan will be executed in each uvicorn worker process, we
             # can safely ignore the error if the task is already scheduled.
             logger.debug(f'Request {event.id} already exists.')
-
+    await schedule_on_boot_check_async()
     asyncio.create_task(cleanup_upload_ids())
     if metrics_utils.METRICS_ENABLED:
         # Start monitoring the event loop lag in each server worker
@@ -729,7 +729,7 @@ async def token(request: fastapi.Request,
 async def check(request: fastapi.Request,
                 check_body: payloads.CheckBody) -> None:
     """Checks enabled clouds."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='check',
         request_body=check_body,
@@ -743,7 +743,7 @@ async def enabled_clouds(request: fastapi.Request,
                          workspace: Optional[str] = None,
                          expand: bool = False) -> None:
     """Gets enabled clouds on the server."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='enabled_clouds',
         request_body=payloads.EnabledCloudsBody(workspace=workspace,
@@ -759,7 +759,7 @@ async def realtime_kubernetes_gpu_availability(
     realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
 ) -> None:
     """Gets real-time Kubernetes GPU availability."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='realtime_kubernetes_gpu_availability',
         request_body=realtime_gpu_availability_body,
@@ -774,7 +774,7 @@ async def kubernetes_node_info(
     kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
 ) -> None:
     """Gets Kubernetes nodes information and hints."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='kubernetes_node_info',
         request_body=kubernetes_node_info_body,
@@ -786,7 +786,7 @@ async def kubernetes_node_info(
 @app.get('/status_kubernetes')
 async def status_kubernetes(request: fastapi.Request) -> None:
     """Gets Kubernetes status."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='status_kubernetes',
         request_body=payloads.RequestBody(),
@@ -800,7 +800,7 @@ async def list_accelerators(
         request: fastapi.Request,
         list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
     """Gets list of accelerators from cloud catalog."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='list_accelerators',
         request_body=list_accelerator_counts_body,
@@ -815,7 +815,7 @@ async def list_accelerator_counts(
     list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
 ) -> None:
     """Gets list of accelerator counts from cloud catalog."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='list_accelerator_counts',
         request_body=list_accelerator_counts_body,
@@ -872,7 +872,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
 async def optimize(optimize_body: payloads.OptimizeBody,
                    request: fastapi.Request) -> None:
     """Optimizes the user's DAG."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='optimize',
         request_body=optimize_body,
@@ -1082,7 +1082,7 @@ async def launch(launch_body: payloads.LaunchBody,
     """Launches a cluster or task."""
     request_id = request.state.request_id
     logger.info(f'Launching request: {request_id}')
-    executor.
+    await executor.schedule_request_async(
         request_id,
         request_name='launch',
         request_body=launch_body,
@@ -1098,7 +1098,7 @@ async def launch(launch_body: payloads.LaunchBody,
 async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
     """Executes a task on an existing cluster."""
     cluster_name = exec_body.cluster_name
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='exec',
         request_body=exec_body,
@@ -1116,7 +1116,7 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
 async def stop(request: fastapi.Request,
                stop_body: payloads.StopOrDownBody) -> None:
     """Stops a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='stop',
         request_body=stop_body,
@@ -1136,7 +1136,7 @@ async def status(
         raise fastapi.HTTPException(
             status_code=503,
             detail='Server is shutting down, please try again later.')
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='status',
         request_body=status_body,
@@ -1151,7 +1151,7 @@ async def status(
 async def endpoints(request: fastapi.Request,
                     endpoint_body: payloads.EndpointsBody) -> None:
     """Gets the endpoint for a given cluster and port number (endpoint)."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='endpoints',
         request_body=endpoint_body,
@@ -1165,7 +1165,7 @@ async def endpoints(request: fastapi.Request,
 async def down(request: fastapi.Request,
                down_body: payloads.StopOrDownBody) -> None:
     """Tears down a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='down',
         request_body=down_body,
@@ -1179,7 +1179,7 @@ async def down(request: fastapi.Request,
 async def start(request: fastapi.Request,
                 start_body: payloads.StartBody) -> None:
     """Restarts a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='start',
         request_body=start_body,
@@ -1193,7 +1193,7 @@ async def start(request: fastapi.Request,
 async def autostop(request: fastapi.Request,
                    autostop_body: payloads.AutostopBody) -> None:
     """Schedules an autostop/autodown for a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='autostop',
         request_body=autostop_body,
@@ -1207,7 +1207,7 @@ async def autostop(request: fastapi.Request,
 async def queue(request: fastapi.Request,
                 queue_body: payloads.QueueBody) -> None:
     """Gets the job queue of a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='queue',
         request_body=queue_body,
@@ -1221,7 +1221,7 @@ async def queue(request: fastapi.Request,
 async def job_status(request: fastapi.Request,
                      job_status_body: payloads.JobStatusBody) -> None:
     """Gets the status of a job."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='job_status',
         request_body=job_status_body,
@@ -1235,7 +1235,7 @@ async def job_status(request: fastapi.Request,
 async def cancel(request: fastapi.Request,
                  cancel_body: payloads.CancelBody) -> None:
     """Cancels jobs on a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='cancel',
         request_body=cancel_body,
@@ -1255,7 +1255,7 @@ async def logs(
     # launch, to finish, so that a user does not need to manually pull the
     # request status.
     executor.check_request_thread_executor_available()
-    request_task = executor.
+    request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
         request_name='logs',
         request_body=cluster_job_body,
@@ -1286,7 +1286,7 @@ async def download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='download_logs',
         request_body=cluster_jobs_body,
@@ -1437,7 +1437,7 @@ def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
 async def cost_report(request: fastapi.Request,
                       cost_report_body: payloads.CostReportBody) -> None:
     """Gets the cost report of a cluster."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='cost_report',
         request_body=cost_report_body,
@@ -1449,7 +1449,7 @@ async def cost_report(request: fastapi.Request,
 @app.get('/storage/ls')
 async def storage_ls(request: fastapi.Request) -> None:
     """Gets the storages."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='storage_ls',
         request_body=payloads.RequestBody(),
@@ -1462,7 +1462,7 @@ async def storage_ls(request: fastapi.Request) -> None:
 async def storage_delete(request: fastapi.Request,
                          storage_body: payloads.StorageBody) -> None:
     """Deletes a storage."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='storage_delete',
         request_body=storage_body,
@@ -1475,7 +1475,7 @@ async def storage_delete(request: fastapi.Request,
 async def local_up(request: fastapi.Request,
                    local_up_body: payloads.LocalUpBody) -> None:
     """Launches a Kubernetes cluster on API server."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='local_up',
         request_body=local_up_body,
@@ -1488,7 +1488,7 @@ async def local_up(request: fastapi.Request,
 async def local_down(request: fastapi.Request,
                      local_down_body: payloads.LocalDownBody) -> None:
     """Tears down the Kubernetes cluster started by local_up."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='local_down',
         request_body=local_down_body,
@@ -1566,7 +1566,7 @@ async def stream(
             detail='Only one of request_id and log_path can be provided')

     if request_id is None and log_path is None:
-        request_id = requests_lib.
+        request_id = await requests_lib.get_latest_request_id_async()
         if request_id is None:
             raise fastapi.HTTPException(status_code=404,
                                         detail='No request found')
@@ -1672,7 +1672,7 @@ async def stream(
 async def api_cancel(request: fastapi.Request,
                      request_cancel_body: payloads.RequestCancelBody) -> None:
     """Cancels requests."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='api_cancel',
         request_body=request_cancel_body,
@@ -1908,7 +1908,7 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
 async def all_contexts(request: fastapi.Request) -> None:
     """Gets all Kubernetes and SSH node pool contexts."""

-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='all_contexts',
         request_body=payloads.RequestBody(),
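The middleware change at the top of this file now delegates ID generation to `requests_lib.get_new_request_id()`, the uuid4 helper added in requests.py above. A minimal sketch of the resulting middleware, with the uuid helper inlined and the response-header name chosen for illustration (the diff only shows the first three lines of `dispatch`):

import uuid

import fastapi
import starlette.middleware.base


def get_new_request_id() -> str:
    """Stand-in for requests_lib.get_new_request_id(): a random UUID4 string."""
    return str(uuid.uuid4())


class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware to add a request ID to each request."""

    async def dispatch(self, request: fastapi.Request, call_next):
        request_id = get_new_request_id()
        request.state.request_id = request_id
        response = await call_next(request)
        # Echo the ID back so callers can correlate logs and poll the request
        # later (header name illustrative).
        response.headers['X-Skypilot-Request-ID'] = request_id
        return response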
sky/setup_files/alembic.ini
CHANGED
@@ -98,6 +98,10 @@ version_table = alembic_version_spot_jobs_db
 version_locations = %(here)s/../schemas/db/serve_state
 version_table = alembic_version_serve_state_db

+[sky_config_db]
+version_locations = %(here)s/../schemas/db/skypilot_config
+version_table = alembic_version_sky_config_db
+
 [post_write_hooks]
 # post_write_hooks defines scripts or Python functions that are run
 # on newly generated revision scripts. See the documentation for further
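Each SkyPilot database gets its own ini section and version table, so migrations for one store never touch another's `alembic_version_*` bookkeeping. A sketch of driving the new section through Alembic's Python API (the .ini path is illustrative; SkyPilot itself routes this through `migration_utils.safe_alembic_upgrade`, as the skypilot_config.py diff below shows):

from alembic import command
from alembic.config import Config

# Select the [sky_config_db] section added above.
cfg = Config('sky/setup_files/alembic.ini', ini_section='sky_config_db')
command.upgrade(cfg, 'head')  # history tracked in alembic_version_sky_config_db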
sky/skylet/services.py
CHANGED
@@ -408,17 +408,17 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer):
     ) -> managed_jobsv1_pb2.GetJobTableResponse:
         try:
             accessible_workspaces = list(request.accessible_workspaces)
-            job_ids = list(request.job_ids.ids)
+            job_ids = (list(request.job_ids.ids)
+                       if request.HasField('job_ids') else None)
             user_hashes: Optional[List[Optional[str]]] = None
-            if request.user_hashes:
+            if request.HasField('user_hashes'):
                 user_hashes = list(request.user_hashes.hashes)
                 # For backwards compatibility, we show jobs that do not have a
                 # user_hash. TODO: Remove before 0.12.0.
                 if request.show_jobs_without_user_hash:
                     user_hashes.append(None)
-            statuses = list(
-
-
+            statuses = (list(request.statuses.statuses)
+                        if request.HasField('statuses') else None)
             job_queue = managed_job_utils.get_managed_job_queue(
                 skip_finished=request.skip_finished,
                 accessible_workspaces=accessible_workspaces,
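The handler now checks protobuf field presence instead of truthiness, so a request that never set a filter (no restriction) is distinguished from one that explicitly set an empty filter. A runnable sketch of the distinction, using the well-known `google.protobuf.Value` message as a stand-in for the generated `managed_jobsv1_pb2` request types:

from google.protobuf import struct_pb2

value = struct_pb2.Value()
# Reading a message field never marks it present, and its contents look
# identical to an explicitly-set empty message:
print(len(value.list_value.values))   # 0
print(value.HasField('list_value'))   # False -> treat as "no filter provided"

# Explicitly setting the submessage, even leaving it empty, marks it present:
value.list_value.SetInParent()
print(value.HasField('list_value'))   # True -> "explicit (empty) filter"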
sky/skypilot_config.py
CHANGED
@@ -64,7 +64,6 @@ from sqlalchemy import orm
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.ext import declarative
-from sqlalchemy.pool import NullPool

 from sky import exceptions
 from sky import sky_logging
@@ -77,6 +76,7 @@ from sky.utils import schemas
 from sky.utils import ux_utils
 from sky.utils import yaml_utils
 from sky.utils.db import db_utils
+from sky.utils.db import migration_utils
 from sky.utils.kubernetes import config_map_utils

 if typing.TYPE_CHECKING:
@@ -121,7 +121,8 @@ _PROJECT_CONFIG_PATH = '.sky.yaml'

 API_SERVER_CONFIG_KEY = 'api_server_config'

-
+_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_LOCK = threading.Lock()

 Base = declarative.declarative_base()

@@ -481,7 +482,7 @@ def safe_reload_config() -> None:
     reload_config()


-def reload_config() -> None:
+def reload_config(init_db: bool = False) -> None:
     internal_config_path = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
     if internal_config_path is not None:
         # {ENV_VAR_SKYPILOT_CONFIG} is used internally.
@@ -493,7 +494,7 @@ def reload_config() -> None:
         return

     if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
-        _reload_config_as_server()
+        _reload_config_as_server(init_db=init_db)
     else:
         _reload_config_as_client()

@@ -564,7 +565,43 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
     _set_loaded_config_path(config_path)


-def
+def _create_table(engine: sqlalchemy.engine.Engine):
+    """Initialize the config database with migrations."""
+    migration_utils.safe_alembic_upgrade(
+        engine, migration_utils.SKYPILOT_CONFIG_DB_NAME,
+        migration_utils.SKYPILOT_CONFIG_VERSION)
+
+
+def _initialize_and_get_db() -> sqlalchemy.engine.Engine:
+    """Initialize and return the config database engine.
+
+    This function should only be called by the API Server during initialization.
+    Client-side code should never call this function.
+    """
+    assert os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None, (
+        'initialize_and_get_db() can only be called by the API Server')
+
+    global _SQLALCHEMY_ENGINE
+
+    if _SQLALCHEMY_ENGINE is not None:
+        return _SQLALCHEMY_ENGINE
+
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE is not None:
+            return _SQLALCHEMY_ENGINE
+
+        # We only store config in the DB when using Postgres,
+        # so no need to pass in db_name here.
+        engine = db_utils.get_engine(None)
+
+        # Run migrations if needed
+        _create_table(engine)
+
+        _SQLALCHEMY_ENGINE = engine
+        return _SQLALCHEMY_ENGINE
+
+
+def _reload_config_as_server(init_db: bool = False) -> None:
     # Reset the global variables, to avoid using stale values.
     _set_loaded_config(config_utils.Config())
     _set_loaded_config_path(None)
@@ -580,37 +617,24 @@ def _reload_config_as_server() -> None:
             raise ValueError(
                 'If db config is specified, no other config is allowed')
         logger.debug('retrieving config from database')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                db_config = config_utils.Config(
-                    yaml_utils.safe_load(row.value))
-                db_config.pop_nested(('db',), None)
-                return db_config
-            return None
-
-        db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
-        if db_config:
-            server_config = overlay_skypilot_config(server_config,
-                                                    db_config)
-        # Close the engine to avoid connection leaks
-        if dispose_engine:
-            sqlalchemy_engine.dispose()
+
+        if init_db:
+            _initialize_and_get_db()
+
+        def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
+            assert _SQLALCHEMY_ENGINE is not None
+            with orm.Session(_SQLALCHEMY_ENGINE) as session:
+                row = session.query(config_yaml_table).filter_by(
+                    key=key).first()
+                if row:
+                    db_config = config_utils.Config(yaml_utils.safe_load(row.value))
+                    db_config.pop_nested(('db',), None)
+                    return db_config
+            return None
+
+        db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
+        if db_config:
+            server_config = overlay_skypilot_config(server_config, db_config)
     if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
         logger.debug(f'server config: \n'
                      f'{yaml_utils.dump_yaml_str(dict(server_config))}')
@@ -666,7 +690,7 @@ def loaded_config_path_serialized() -> Optional[str]:


 # Load on import, synchronization is guaranteed by python interpreter.
-reload_config()
+reload_config(init_db=True)


 def loaded() -> bool:
@@ -880,44 +904,32 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
     if new_db_url and new_db_url != existing_db_url:
         raise ValueError('Cannot change db url while server is running')
     if existing_db_url:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        do_update_stmt = insert_stmnt.on_conflict_do_update(
-            index_elements=[config_yaml_table.c.key],
-            set_={config_yaml_table.c.value: config_str})
-        session.execute(do_update_stmt)
-        session.commit()
-
-        logger.debug('saving api_server config to db')
-        _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
-        db_updated = True
-        # Close the engine to avoid connection leaks
-        if dispose_engine:
-            sqlalchemy_engine.dispose()
+
+        def _set_config_yaml_to_db(key: str, config: config_utils.Config):
+            # reload_config(init_db=True) is called when this module is
+            # imported, so the database engine must already be initialized.
+            assert _SQLALCHEMY_ENGINE is not None
+            config_str = yaml_utils.dump_yaml_str(dict(config))
+            with orm.Session(_SQLALCHEMY_ENGINE) as session:
+                if (_SQLALCHEMY_ENGINE.dialect.name ==
+                        db_utils.SQLAlchemyDialect.SQLITE.value):
+                    insert_func = sqlite.insert
+                elif (_SQLALCHEMY_ENGINE.dialect.name ==
+                      db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+                    insert_func = postgresql.insert
+                else:
+                    raise ValueError('Unsupported database dialect')
+                insert_stmnt = insert_func(config_yaml_table).values(
+                    key=key, value=config_str)
+                do_update_stmt = insert_stmnt.on_conflict_do_update(
+                    index_elements=[config_yaml_table.c.key],
+                    set_={config_yaml_table.c.value: config_str})
+                session.execute(do_update_stmt)
+                session.commit()
+
+        logger.debug('saving api_server config to db')
+        _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
+        db_updated = True

     if not db_updated:
         # save to the local file (PVC in Kubernetes, local file otherwise)