skypilot-nightly 1.0.0.dev20251019__py3-none-any.whl → 1.0.0.dev20251022__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/backends/backend_utils.py +11 -11
- sky/backends/cloud_vm_ray_backend.py +15 -4
- sky/client/cli/command.py +39 -10
- sky/client/cli/flags.py +4 -2
- sky/client/sdk.py +26 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
- sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
- sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +2 -2
- sky/global_user_state.py +137 -37
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +4 -2
- sky/jobs/server/server.py +21 -12
- sky/jobs/state.py +307 -55
- sky/jobs/utils.py +248 -144
- sky/provision/kubernetes/network.py +9 -6
- sky/provision/provisioner.py +8 -0
- sky/schemas/api/responses.py +2 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/serve/server/server.py +8 -7
- sky/server/common.py +10 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +4 -2
- sky/server/requests/executor.py +30 -28
- sky/server/requests/payloads.py +5 -1
- sky/server/requests/preconditions.py +9 -4
- sky/server/requests/requests.py +130 -53
- sky/server/requests/serializers/encoders.py +3 -3
- sky/server/server.py +91 -58
- sky/server/stream_utils.py +127 -38
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/skylet/services.py +5 -5
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +4 -4
- sky/users/permission.py +4 -0
- sky/utils/asyncio_utils.py +63 -3
- sky/utils/db/db_utils.py +11 -3
- sky/utils/db/migration_utils.py +7 -3
- sky/volumes/server/server.py +3 -3
- sky/workspaces/server.py +6 -6
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +37 -37
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +87 -86
- sky/dashboard/out/_next/static/8e35zdobdd0bK_Nkba03m/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/webpack-3c431f6c9086e487.js +0 -1
- /sky/dashboard/out/_next/static/{8e35zdobdd0bK_Nkba03m → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
sky/server/server.py
CHANGED
|
@@ -43,6 +43,7 @@ from sky.data import storage_utils
|
|
|
43
43
|
from sky.jobs import utils as managed_job_utils
|
|
44
44
|
from sky.jobs.server import server as jobs_rest
|
|
45
45
|
from sky.metrics import utils as metrics_utils
|
|
46
|
+
from sky.provision import metadata_utils
|
|
46
47
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
47
48
|
from sky.schemas.api import responses
|
|
48
49
|
from sky.serve.server import server as serve_rest
|
|
@@ -162,7 +163,7 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
162
163
|
"""Middleware to add a request ID to each request."""
|
|
163
164
|
|
|
164
165
|
async def dispatch(self, request: fastapi.Request, call_next):
|
|
165
|
-
request_id =
|
|
166
|
+
request_id = requests_lib.get_new_request_id()
|
|
166
167
|
request.state.request_id = request_id
|
|
167
168
|
response = await call_next(request)
|
|
168
169
|
# TODO(syang): remove X-Request-ID when v0.10.0 is released.
|
|
@@ -454,9 +455,9 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
|
|
|
454
455
|
loop.call_at(target, tick)
|
|
455
456
|
|
|
456
457
|
|
|
457
|
-
def
|
|
458
|
+
async def schedule_on_boot_check_async():
|
|
458
459
|
try:
|
|
459
|
-
executor.
|
|
460
|
+
await executor.schedule_request_async(
|
|
460
461
|
request_id='skypilot-server-on-boot-check',
|
|
461
462
|
request_name='check',
|
|
462
463
|
request_body=payloads.CheckBody(),
|
|
@@ -479,7 +480,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
479
480
|
if event.should_skip():
|
|
480
481
|
continue
|
|
481
482
|
try:
|
|
482
|
-
executor.
|
|
483
|
+
await executor.schedule_request_async(
|
|
483
484
|
request_id=event.id,
|
|
484
485
|
request_name=event.name,
|
|
485
486
|
request_body=payloads.RequestBody(),
|
|
@@ -494,7 +495,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
494
495
|
# Lifespan will be executed in each uvicorn worker process, we
|
|
495
496
|
# can safely ignore the error if the task is already scheduled.
|
|
496
497
|
logger.debug(f'Request {event.id} already exists.')
|
|
497
|
-
|
|
498
|
+
await schedule_on_boot_check_async()
|
|
498
499
|
asyncio.create_task(cleanup_upload_ids())
|
|
499
500
|
if metrics_utils.METRICS_ENABLED:
|
|
500
501
|
# Start monitoring the event loop lag in each server worker
|
|
@@ -728,7 +729,7 @@ async def token(request: fastapi.Request,
|
|
|
728
729
|
async def check(request: fastapi.Request,
|
|
729
730
|
check_body: payloads.CheckBody) -> None:
|
|
730
731
|
"""Checks enabled clouds."""
|
|
731
|
-
executor.
|
|
732
|
+
await executor.schedule_request_async(
|
|
732
733
|
request_id=request.state.request_id,
|
|
733
734
|
request_name='check',
|
|
734
735
|
request_body=check_body,
|
|
@@ -742,7 +743,7 @@ async def enabled_clouds(request: fastapi.Request,
|
|
|
742
743
|
workspace: Optional[str] = None,
|
|
743
744
|
expand: bool = False) -> None:
|
|
744
745
|
"""Gets enabled clouds on the server."""
|
|
745
|
-
executor.
|
|
746
|
+
await executor.schedule_request_async(
|
|
746
747
|
request_id=request.state.request_id,
|
|
747
748
|
request_name='enabled_clouds',
|
|
748
749
|
request_body=payloads.EnabledCloudsBody(workspace=workspace,
|
|
@@ -758,7 +759,7 @@ async def realtime_kubernetes_gpu_availability(
|
|
|
758
759
|
realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
|
|
759
760
|
) -> None:
|
|
760
761
|
"""Gets real-time Kubernetes GPU availability."""
|
|
761
|
-
executor.
|
|
762
|
+
await executor.schedule_request_async(
|
|
762
763
|
request_id=request.state.request_id,
|
|
763
764
|
request_name='realtime_kubernetes_gpu_availability',
|
|
764
765
|
request_body=realtime_gpu_availability_body,
|
|
@@ -773,7 +774,7 @@ async def kubernetes_node_info(
|
|
|
773
774
|
kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
|
|
774
775
|
) -> None:
|
|
775
776
|
"""Gets Kubernetes nodes information and hints."""
|
|
776
|
-
executor.
|
|
777
|
+
await executor.schedule_request_async(
|
|
777
778
|
request_id=request.state.request_id,
|
|
778
779
|
request_name='kubernetes_node_info',
|
|
779
780
|
request_body=kubernetes_node_info_body,
|
|
@@ -785,7 +786,7 @@ async def kubernetes_node_info(
|
|
|
785
786
|
@app.get('/status_kubernetes')
|
|
786
787
|
async def status_kubernetes(request: fastapi.Request) -> None:
|
|
787
788
|
"""Gets Kubernetes status."""
|
|
788
|
-
executor.
|
|
789
|
+
await executor.schedule_request_async(
|
|
789
790
|
request_id=request.state.request_id,
|
|
790
791
|
request_name='status_kubernetes',
|
|
791
792
|
request_body=payloads.RequestBody(),
|
|
@@ -799,7 +800,7 @@ async def list_accelerators(
|
|
|
799
800
|
request: fastapi.Request,
|
|
800
801
|
list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
|
|
801
802
|
"""Gets list of accelerators from cloud catalog."""
|
|
802
|
-
executor.
|
|
803
|
+
await executor.schedule_request_async(
|
|
803
804
|
request_id=request.state.request_id,
|
|
804
805
|
request_name='list_accelerators',
|
|
805
806
|
request_body=list_accelerator_counts_body,
|
|
@@ -814,7 +815,7 @@ async def list_accelerator_counts(
|
|
|
814
815
|
list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
|
|
815
816
|
) -> None:
|
|
816
817
|
"""Gets list of accelerator counts from cloud catalog."""
|
|
817
|
-
executor.
|
|
818
|
+
await executor.schedule_request_async(
|
|
818
819
|
request_id=request.state.request_id,
|
|
819
820
|
request_name='list_accelerator_counts',
|
|
820
821
|
request_body=list_accelerator_counts_body,
|
|
@@ -871,7 +872,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
|
|
|
871
872
|
async def optimize(optimize_body: payloads.OptimizeBody,
|
|
872
873
|
request: fastapi.Request) -> None:
|
|
873
874
|
"""Optimizes the user's DAG."""
|
|
874
|
-
executor.
|
|
875
|
+
await executor.schedule_request_async(
|
|
875
876
|
request_id=request.state.request_id,
|
|
876
877
|
request_name='optimize',
|
|
877
878
|
request_body=optimize_body,
|
|
@@ -1081,7 +1082,7 @@ async def launch(launch_body: payloads.LaunchBody,
|
|
|
1081
1082
|
"""Launches a cluster or task."""
|
|
1082
1083
|
request_id = request.state.request_id
|
|
1083
1084
|
logger.info(f'Launching request: {request_id}')
|
|
1084
|
-
executor.
|
|
1085
|
+
await executor.schedule_request_async(
|
|
1085
1086
|
request_id,
|
|
1086
1087
|
request_name='launch',
|
|
1087
1088
|
request_body=launch_body,
|
|
@@ -1097,7 +1098,7 @@ async def launch(launch_body: payloads.LaunchBody,
|
|
|
1097
1098
|
async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
|
|
1098
1099
|
"""Executes a task on an existing cluster."""
|
|
1099
1100
|
cluster_name = exec_body.cluster_name
|
|
1100
|
-
executor.
|
|
1101
|
+
await executor.schedule_request_async(
|
|
1101
1102
|
request_id=request.state.request_id,
|
|
1102
1103
|
request_name='exec',
|
|
1103
1104
|
request_body=exec_body,
|
|
@@ -1115,7 +1116,7 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
|
|
|
1115
1116
|
async def stop(request: fastapi.Request,
|
|
1116
1117
|
stop_body: payloads.StopOrDownBody) -> None:
|
|
1117
1118
|
"""Stops a cluster."""
|
|
1118
|
-
executor.
|
|
1119
|
+
await executor.schedule_request_async(
|
|
1119
1120
|
request_id=request.state.request_id,
|
|
1120
1121
|
request_name='stop',
|
|
1121
1122
|
request_body=stop_body,
|
|
@@ -1135,7 +1136,7 @@ async def status(
|
|
|
1135
1136
|
raise fastapi.HTTPException(
|
|
1136
1137
|
status_code=503,
|
|
1137
1138
|
detail='Server is shutting down, please try again later.')
|
|
1138
|
-
executor.
|
|
1139
|
+
await executor.schedule_request_async(
|
|
1139
1140
|
request_id=request.state.request_id,
|
|
1140
1141
|
request_name='status',
|
|
1141
1142
|
request_body=status_body,
|
|
@@ -1150,7 +1151,7 @@ async def status(
|
|
|
1150
1151
|
async def endpoints(request: fastapi.Request,
|
|
1151
1152
|
endpoint_body: payloads.EndpointsBody) -> None:
|
|
1152
1153
|
"""Gets the endpoint for a given cluster and port number (endpoint)."""
|
|
1153
|
-
executor.
|
|
1154
|
+
await executor.schedule_request_async(
|
|
1154
1155
|
request_id=request.state.request_id,
|
|
1155
1156
|
request_name='endpoints',
|
|
1156
1157
|
request_body=endpoint_body,
|
|
@@ -1164,7 +1165,7 @@ async def endpoints(request: fastapi.Request,
|
|
|
1164
1165
|
async def down(request: fastapi.Request,
|
|
1165
1166
|
down_body: payloads.StopOrDownBody) -> None:
|
|
1166
1167
|
"""Tears down a cluster."""
|
|
1167
|
-
executor.
|
|
1168
|
+
await executor.schedule_request_async(
|
|
1168
1169
|
request_id=request.state.request_id,
|
|
1169
1170
|
request_name='down',
|
|
1170
1171
|
request_body=down_body,
|
|
@@ -1178,7 +1179,7 @@ async def down(request: fastapi.Request,
|
|
|
1178
1179
|
async def start(request: fastapi.Request,
|
|
1179
1180
|
start_body: payloads.StartBody) -> None:
|
|
1180
1181
|
"""Restarts a cluster."""
|
|
1181
|
-
executor.
|
|
1182
|
+
await executor.schedule_request_async(
|
|
1182
1183
|
request_id=request.state.request_id,
|
|
1183
1184
|
request_name='start',
|
|
1184
1185
|
request_body=start_body,
|
|
@@ -1192,7 +1193,7 @@ async def start(request: fastapi.Request,
|
|
|
1192
1193
|
async def autostop(request: fastapi.Request,
|
|
1193
1194
|
autostop_body: payloads.AutostopBody) -> None:
|
|
1194
1195
|
"""Schedules an autostop/autodown for a cluster."""
|
|
1195
|
-
executor.
|
|
1196
|
+
await executor.schedule_request_async(
|
|
1196
1197
|
request_id=request.state.request_id,
|
|
1197
1198
|
request_name='autostop',
|
|
1198
1199
|
request_body=autostop_body,
|
|
@@ -1206,7 +1207,7 @@ async def autostop(request: fastapi.Request,
|
|
|
1206
1207
|
async def queue(request: fastapi.Request,
|
|
1207
1208
|
queue_body: payloads.QueueBody) -> None:
|
|
1208
1209
|
"""Gets the job queue of a cluster."""
|
|
1209
|
-
executor.
|
|
1210
|
+
await executor.schedule_request_async(
|
|
1210
1211
|
request_id=request.state.request_id,
|
|
1211
1212
|
request_name='queue',
|
|
1212
1213
|
request_body=queue_body,
|
|
@@ -1220,7 +1221,7 @@ async def queue(request: fastapi.Request,
|
|
|
1220
1221
|
async def job_status(request: fastapi.Request,
|
|
1221
1222
|
job_status_body: payloads.JobStatusBody) -> None:
|
|
1222
1223
|
"""Gets the status of a job."""
|
|
1223
|
-
executor.
|
|
1224
|
+
await executor.schedule_request_async(
|
|
1224
1225
|
request_id=request.state.request_id,
|
|
1225
1226
|
request_name='job_status',
|
|
1226
1227
|
request_body=job_status_body,
|
|
@@ -1234,7 +1235,7 @@ async def job_status(request: fastapi.Request,
|
|
|
1234
1235
|
async def cancel(request: fastapi.Request,
|
|
1235
1236
|
cancel_body: payloads.CancelBody) -> None:
|
|
1236
1237
|
"""Cancels jobs on a cluster."""
|
|
1237
|
-
executor.
|
|
1238
|
+
await executor.schedule_request_async(
|
|
1238
1239
|
request_id=request.state.request_id,
|
|
1239
1240
|
request_name='cancel',
|
|
1240
1241
|
request_body=cancel_body,
|
|
@@ -1254,7 +1255,7 @@ async def logs(
|
|
|
1254
1255
|
# launch, to finish, so that a user does not need to manually pull the
|
|
1255
1256
|
# request status.
|
|
1256
1257
|
executor.check_request_thread_executor_available()
|
|
1257
|
-
request_task = executor.
|
|
1258
|
+
request_task = await executor.prepare_request_async(
|
|
1258
1259
|
request_id=request.state.request_id,
|
|
1259
1260
|
request_name='logs',
|
|
1260
1261
|
request_body=cluster_job_body,
|
|
@@ -1270,6 +1271,7 @@ async def logs(
|
|
|
1270
1271
|
request_id=request.state.request_id,
|
|
1271
1272
|
logs_path=request_task.log_path,
|
|
1272
1273
|
background_tasks=background_tasks,
|
|
1274
|
+
kill_request_on_disconnect=False,
|
|
1273
1275
|
)
|
|
1274
1276
|
|
|
1275
1277
|
|
|
@@ -1284,7 +1286,7 @@ async def download_logs(
|
|
|
1284
1286
|
# We should reuse the original request body, so that the env vars, such as
|
|
1285
1287
|
# user hash, are kept the same.
|
|
1286
1288
|
cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
|
|
1287
|
-
executor.
|
|
1289
|
+
await executor.schedule_request_async(
|
|
1288
1290
|
request_id=request.state.request_id,
|
|
1289
1291
|
request_name='download_logs',
|
|
1290
1292
|
request_body=cluster_jobs_body,
|
|
@@ -1363,38 +1365,65 @@ async def download(download_body: payloads.DownloadBody,
|
|
|
1363
1365
|
|
|
1364
1366
|
# TODO(aylei): run it asynchronously after global_user_state support async op
|
|
1365
1367
|
@app.post('/provision_logs')
|
|
1366
|
-
def provision_logs(
|
|
1368
|
+
def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
|
|
1367
1369
|
follow: bool = True,
|
|
1368
1370
|
tail: int = 0) -> fastapi.responses.StreamingResponse:
|
|
1369
1371
|
"""Streams the provision.log for the latest launch request of a cluster."""
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1372
|
+
log_path = None
|
|
1373
|
+
cluster_name = provision_logs_body.cluster_name
|
|
1374
|
+
worker = provision_logs_body.worker
|
|
1375
|
+
# stream head node logs
|
|
1376
|
+
if worker is None:
|
|
1377
|
+
# Prefer clusters table first, then cluster_history as fallback.
|
|
1378
|
+
log_path_str = global_user_state.get_cluster_provision_log_path(
|
|
1379
|
+
cluster_name)
|
|
1380
|
+
if not log_path_str:
|
|
1381
|
+
log_path_str = (
|
|
1382
|
+
global_user_state.get_cluster_history_provision_log_path(
|
|
1383
|
+
cluster_name))
|
|
1384
|
+
if not log_path_str:
|
|
1385
|
+
raise fastapi.HTTPException(
|
|
1386
|
+
status_code=404,
|
|
1387
|
+
detail=('Provision log path is not recorded for this cluster. '
|
|
1388
|
+
'Please relaunch to generate provisioning logs.'))
|
|
1389
|
+
log_path = pathlib.Path(log_path_str).expanduser().resolve()
|
|
1390
|
+
if not log_path.exists():
|
|
1391
|
+
raise fastapi.HTTPException(
|
|
1392
|
+
status_code=404,
|
|
1393
|
+
detail=f'Provision log path does not exist: {str(log_path)}')
|
|
1381
1394
|
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1395
|
+
# stream worker node logs
|
|
1396
|
+
else:
|
|
1397
|
+
handle = global_user_state.get_handle_from_cluster_name(cluster_name)
|
|
1398
|
+
if handle is None:
|
|
1399
|
+
raise fastapi.HTTPException(
|
|
1400
|
+
status_code=404,
|
|
1401
|
+
detail=('Cluster handle is not recorded for this cluster. '
|
|
1402
|
+
'Please relaunch to generate provisioning logs.'))
|
|
1403
|
+
# instance_ids includes head node
|
|
1404
|
+
instance_ids = handle.instance_ids
|
|
1405
|
+
if instance_ids is None:
|
|
1406
|
+
raise fastapi.HTTPException(
|
|
1407
|
+
status_code=400,
|
|
1408
|
+
detail='Instance IDs are not recorded for this cluster. '
|
|
1409
|
+
'Please relaunch to generate provisioning logs.')
|
|
1410
|
+
if worker > len(instance_ids) - 1:
|
|
1411
|
+
raise fastapi.HTTPException(
|
|
1412
|
+
status_code=400,
|
|
1413
|
+
detail=f'Worker {worker} is out of range. '
|
|
1414
|
+
f'The cluster has {len(instance_ids)} nodes.')
|
|
1415
|
+
log_path = metadata_utils.get_instance_log_dir(
|
|
1416
|
+
handle.get_cluster_name_on_cloud(), instance_ids[worker])
|
|
1387
1417
|
|
|
1388
1418
|
# Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
|
|
1389
1419
|
effective_tail = None if tail is None or tail <= 0 else tail
|
|
1390
1420
|
|
|
1391
1421
|
return fastapi.responses.StreamingResponse(
|
|
1392
|
-
content=stream_utils.log_streamer(
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
cluster_name=cluster_body.cluster_name),
|
|
1422
|
+
content=stream_utils.log_streamer(None,
|
|
1423
|
+
log_path,
|
|
1424
|
+
tail=effective_tail,
|
|
1425
|
+
follow=follow,
|
|
1426
|
+
cluster_name=cluster_name),
|
|
1398
1427
|
media_type='text/plain',
|
|
1399
1428
|
headers={
|
|
1400
1429
|
'Cache-Control': 'no-cache, no-transform',
|
|
@@ -1408,7 +1437,7 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
|
|
|
1408
1437
|
async def cost_report(request: fastapi.Request,
|
|
1409
1438
|
cost_report_body: payloads.CostReportBody) -> None:
|
|
1410
1439
|
"""Gets the cost report of a cluster."""
|
|
1411
|
-
executor.
|
|
1440
|
+
await executor.schedule_request_async(
|
|
1412
1441
|
request_id=request.state.request_id,
|
|
1413
1442
|
request_name='cost_report',
|
|
1414
1443
|
request_body=cost_report_body,
|
|
@@ -1420,7 +1449,7 @@ async def cost_report(request: fastapi.Request,
|
|
|
1420
1449
|
@app.get('/storage/ls')
|
|
1421
1450
|
async def storage_ls(request: fastapi.Request) -> None:
|
|
1422
1451
|
"""Gets the storages."""
|
|
1423
|
-
executor.
|
|
1452
|
+
await executor.schedule_request_async(
|
|
1424
1453
|
request_id=request.state.request_id,
|
|
1425
1454
|
request_name='storage_ls',
|
|
1426
1455
|
request_body=payloads.RequestBody(),
|
|
@@ -1433,7 +1462,7 @@ async def storage_ls(request: fastapi.Request) -> None:
|
|
|
1433
1462
|
async def storage_delete(request: fastapi.Request,
|
|
1434
1463
|
storage_body: payloads.StorageBody) -> None:
|
|
1435
1464
|
"""Deletes a storage."""
|
|
1436
|
-
executor.
|
|
1465
|
+
await executor.schedule_request_async(
|
|
1437
1466
|
request_id=request.state.request_id,
|
|
1438
1467
|
request_name='storage_delete',
|
|
1439
1468
|
request_body=storage_body,
|
|
@@ -1446,7 +1475,7 @@ async def storage_delete(request: fastapi.Request,
|
|
|
1446
1475
|
async def local_up(request: fastapi.Request,
|
|
1447
1476
|
local_up_body: payloads.LocalUpBody) -> None:
|
|
1448
1477
|
"""Launches a Kubernetes cluster on API server."""
|
|
1449
|
-
executor.
|
|
1478
|
+
await executor.schedule_request_async(
|
|
1450
1479
|
request_id=request.state.request_id,
|
|
1451
1480
|
request_name='local_up',
|
|
1452
1481
|
request_body=local_up_body,
|
|
@@ -1459,7 +1488,7 @@ async def local_up(request: fastapi.Request,
|
|
|
1459
1488
|
async def local_down(request: fastapi.Request,
|
|
1460
1489
|
local_down_body: payloads.LocalDownBody) -> None:
|
|
1461
1490
|
"""Tears down the Kubernetes cluster started by local_up."""
|
|
1462
|
-
executor.
|
|
1491
|
+
await executor.schedule_request_async(
|
|
1463
1492
|
request_id=request.state.request_id,
|
|
1464
1493
|
request_name='local_down',
|
|
1465
1494
|
request_body=local_down_body,
|
|
@@ -1537,7 +1566,7 @@ async def stream(
|
|
|
1537
1566
|
detail='Only one of request_id and log_path can be provided')
|
|
1538
1567
|
|
|
1539
1568
|
if request_id is None and log_path is None:
|
|
1540
|
-
request_id = requests_lib.
|
|
1569
|
+
request_id = await requests_lib.get_latest_request_id_async()
|
|
1541
1570
|
if request_id is None:
|
|
1542
1571
|
raise fastapi.HTTPException(status_code=404,
|
|
1543
1572
|
detail='No request found')
|
|
@@ -1567,11 +1596,14 @@ async def stream(
|
|
|
1567
1596
|
polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
|
|
1568
1597
|
# Original plain text streaming logic
|
|
1569
1598
|
if request_id is not None:
|
|
1570
|
-
request_task = await requests_lib.get_request_async(
|
|
1599
|
+
request_task = await requests_lib.get_request_async(
|
|
1600
|
+
request_id, fields=['request_id', 'schedule_type'])
|
|
1571
1601
|
if request_task is None:
|
|
1572
1602
|
print(f'No task with request ID {request_id}')
|
|
1573
1603
|
raise fastapi.HTTPException(
|
|
1574
1604
|
status_code=404, detail=f'Request {request_id!r} not found')
|
|
1605
|
+
# req.log_path is derived from request_id,
|
|
1606
|
+
# so it's ok to just grab the request_id in the above query.
|
|
1575
1607
|
log_path_to_stream = request_task.log_path
|
|
1576
1608
|
if not log_path_to_stream.exists():
|
|
1577
1609
|
# The log file might be deleted by the request GC daemon but the
|
|
@@ -1581,6 +1613,7 @@ async def stream(
|
|
|
1581
1613
|
detail=f'Log of request {request_id!r} has been deleted')
|
|
1582
1614
|
if request_task.schedule_type == requests_lib.ScheduleType.LONG:
|
|
1583
1615
|
polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
|
|
1616
|
+
del request_task
|
|
1584
1617
|
else:
|
|
1585
1618
|
assert log_path is not None, (request_id, log_path)
|
|
1586
1619
|
if log_path == constants.API_SERVER_LOGS:
|
|
@@ -1639,7 +1672,7 @@ async def stream(
|
|
|
1639
1672
|
async def api_cancel(request: fastapi.Request,
|
|
1640
1673
|
request_cancel_body: payloads.RequestCancelBody) -> None:
|
|
1641
1674
|
"""Cancels requests."""
|
|
1642
|
-
executor.
|
|
1675
|
+
await executor.schedule_request_async(
|
|
1643
1676
|
request_id=request.state.request_id,
|
|
1644
1677
|
request_name='api_cancel',
|
|
1645
1678
|
request_body=request_cancel_body,
|
|
@@ -1875,7 +1908,7 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
|
|
|
1875
1908
|
async def all_contexts(request: fastapi.Request) -> None:
|
|
1876
1909
|
"""Gets all Kubernetes and SSH node pool contexts."""
|
|
1877
1910
|
|
|
1878
|
-
executor.
|
|
1911
|
+
await executor.schedule_request_async(
|
|
1879
1912
|
request_id=request.state.request_id,
|
|
1880
1913
|
request_name='all_contexts',
|
|
1881
1914
|
request_body=payloads.RequestBody(),
|