skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/backends/backend_utils.py +9 -6
- sky/backends/cloud_vm_ray_backend.py +2 -3
- sky/check.py +25 -13
- sky/client/cli/command.py +52 -24
- sky/cloud_stores.py +73 -0
- sky/clouds/aws.py +59 -11
- sky/core.py +7 -5
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +71 -2
- sky/data/storage.py +166 -9
- sky/global_user_state.py +14 -18
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +62 -67
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/scheduler.py +15 -2
- sky/jobs/server/core.py +85 -13
- sky/jobs/server/server.py +14 -13
- sky/jobs/server/utils.py +28 -10
- sky/jobs/state.py +216 -40
- sky/jobs/utils.py +65 -28
- sky/metrics/utils.py +18 -0
- sky/optimizer.py +1 -1
- sky/provision/kubernetes/instance.py +88 -19
- sky/provision/kubernetes/volume.py +2 -2
- sky/schemas/api/responses.py +3 -5
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/replica_managers.py +2 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/server.py +8 -7
- sky/server/common.py +21 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +23 -17
- sky/server/requests/executor.py +7 -3
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/request_names.py +80 -0
- sky/server/requests/requests.py +137 -102
- sky/server/requests/serializers/decoders.py +0 -6
- sky/server/requests/serializers/encoders.py +33 -6
- sky/server/server.py +105 -36
- sky/server/stream_utils.py +56 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +6 -1
- sky/skylet/events.py +7 -0
- sky/skylet/services.py +18 -7
- sky/ssh_node_pools/server.py +5 -4
- sky/task.py +14 -42
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +140 -12
- sky/users/permission.py +4 -1
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/context_utils.py +13 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/volumes/server/server.py +4 -3
- sky/workspaces/server.py +7 -6
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
- sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
- sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
- /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/server/server.py
CHANGED
|
@@ -6,6 +6,7 @@ import base64
|
|
|
6
6
|
from concurrent.futures import ThreadPoolExecutor
|
|
7
7
|
import contextlib
|
|
8
8
|
import datetime
|
|
9
|
+
from enum import IntEnum
|
|
9
10
|
import hashlib
|
|
10
11
|
import json
|
|
11
12
|
import multiprocessing
|
|
@@ -15,6 +16,7 @@ import posixpath
|
|
|
15
16
|
import re
|
|
16
17
|
import resource
|
|
17
18
|
import shutil
|
|
19
|
+
import struct
|
|
18
20
|
import sys
|
|
19
21
|
import threading
|
|
20
22
|
import traceback
|
|
@@ -25,6 +27,7 @@ import zipfile
|
|
|
25
27
|
import aiofiles
|
|
26
28
|
import anyio
|
|
27
29
|
import fastapi
|
|
30
|
+
from fastapi import responses as fastapi_responses
|
|
28
31
|
from fastapi.middleware import cors
|
|
29
32
|
import starlette.middleware.base
|
|
30
33
|
import uvloop
|
|
@@ -61,6 +64,7 @@ from sky.server.auth import oauth2_proxy
|
|
|
61
64
|
from sky.server.requests import executor
|
|
62
65
|
from sky.server.requests import payloads
|
|
63
66
|
from sky.server.requests import preconditions
|
|
67
|
+
from sky.server.requests import request_names
|
|
64
68
|
from sky.server.requests import requests as requests_lib
|
|
65
69
|
from sky.skylet import constants
|
|
66
70
|
from sky.ssh_node_pools import server as ssh_node_pools_rest
|
|
@@ -459,7 +463,7 @@ async def schedule_on_boot_check_async():
|
|
|
459
463
|
try:
|
|
460
464
|
await executor.schedule_request_async(
|
|
461
465
|
request_id='skypilot-server-on-boot-check',
|
|
462
|
-
request_name=
|
|
466
|
+
request_name=request_names.RequestName.CHECK,
|
|
463
467
|
request_body=payloads.CheckBody(),
|
|
464
468
|
func=sky_check.check,
|
|
465
469
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -731,7 +735,7 @@ async def check(request: fastapi.Request,
|
|
|
731
735
|
"""Checks enabled clouds."""
|
|
732
736
|
await executor.schedule_request_async(
|
|
733
737
|
request_id=request.state.request_id,
|
|
734
|
-
request_name=
|
|
738
|
+
request_name=request_names.RequestName.CHECK,
|
|
735
739
|
request_body=check_body,
|
|
736
740
|
func=sky_check.check,
|
|
737
741
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -745,7 +749,7 @@ async def enabled_clouds(request: fastapi.Request,
|
|
|
745
749
|
"""Gets enabled clouds on the server."""
|
|
746
750
|
await executor.schedule_request_async(
|
|
747
751
|
request_id=request.state.request_id,
|
|
748
|
-
request_name=
|
|
752
|
+
request_name=request_names.RequestName.ENABLED_CLOUDS,
|
|
749
753
|
request_body=payloads.EnabledCloudsBody(workspace=workspace,
|
|
750
754
|
expand=expand),
|
|
751
755
|
func=core.enabled_clouds,
|
|
@@ -761,7 +765,8 @@ async def realtime_kubernetes_gpu_availability(
|
|
|
761
765
|
"""Gets real-time Kubernetes GPU availability."""
|
|
762
766
|
await executor.schedule_request_async(
|
|
763
767
|
request_id=request.state.request_id,
|
|
764
|
-
request_name=
|
|
768
|
+
request_name=request_names.RequestName.
|
|
769
|
+
REALTIME_KUBERNETES_GPU_AVAILABILITY,
|
|
765
770
|
request_body=realtime_gpu_availability_body,
|
|
766
771
|
func=core.realtime_kubernetes_gpu_availability,
|
|
767
772
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -776,7 +781,7 @@ async def kubernetes_node_info(
|
|
|
776
781
|
"""Gets Kubernetes nodes information and hints."""
|
|
777
782
|
await executor.schedule_request_async(
|
|
778
783
|
request_id=request.state.request_id,
|
|
779
|
-
request_name=
|
|
784
|
+
request_name=request_names.RequestName.KUBERNETES_NODE_INFO,
|
|
780
785
|
request_body=kubernetes_node_info_body,
|
|
781
786
|
func=kubernetes_utils.get_kubernetes_node_info,
|
|
782
787
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -788,7 +793,7 @@ async def status_kubernetes(request: fastapi.Request) -> None:
|
|
|
788
793
|
"""Gets Kubernetes status."""
|
|
789
794
|
await executor.schedule_request_async(
|
|
790
795
|
request_id=request.state.request_id,
|
|
791
|
-
request_name=
|
|
796
|
+
request_name=request_names.RequestName.STATUS_KUBERNETES,
|
|
792
797
|
request_body=payloads.RequestBody(),
|
|
793
798
|
func=core.status_kubernetes,
|
|
794
799
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -802,7 +807,7 @@ async def list_accelerators(
|
|
|
802
807
|
"""Gets list of accelerators from cloud catalog."""
|
|
803
808
|
await executor.schedule_request_async(
|
|
804
809
|
request_id=request.state.request_id,
|
|
805
|
-
request_name=
|
|
810
|
+
request_name=request_names.RequestName.LIST_ACCELERATORS,
|
|
806
811
|
request_body=list_accelerator_counts_body,
|
|
807
812
|
func=catalog.list_accelerators,
|
|
808
813
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -817,7 +822,7 @@ async def list_accelerator_counts(
|
|
|
817
822
|
"""Gets list of accelerator counts from cloud catalog."""
|
|
818
823
|
await executor.schedule_request_async(
|
|
819
824
|
request_id=request.state.request_id,
|
|
820
|
-
request_name=
|
|
825
|
+
request_name=request_names.RequestName.LIST_ACCELERATOR_COUNTS,
|
|
821
826
|
request_body=list_accelerator_counts_body,
|
|
822
827
|
func=catalog.list_accelerator_counts,
|
|
823
828
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -874,7 +879,7 @@ async def optimize(optimize_body: payloads.OptimizeBody,
|
|
|
874
879
|
"""Optimizes the user's DAG."""
|
|
875
880
|
await executor.schedule_request_async(
|
|
876
881
|
request_id=request.state.request_id,
|
|
877
|
-
request_name=
|
|
882
|
+
request_name=request_names.RequestName.OPTIMIZE,
|
|
878
883
|
request_body=optimize_body,
|
|
879
884
|
ignore_return_value=True,
|
|
880
885
|
func=core.optimize,
|
|
@@ -1084,7 +1089,7 @@ async def launch(launch_body: payloads.LaunchBody,
|
|
|
1084
1089
|
logger.info(f'Launching request: {request_id}')
|
|
1085
1090
|
await executor.schedule_request_async(
|
|
1086
1091
|
request_id,
|
|
1087
|
-
request_name=
|
|
1092
|
+
request_name=request_names.RequestName.CLUSTER_LAUNCH,
|
|
1088
1093
|
request_body=launch_body,
|
|
1089
1094
|
func=execution.launch,
|
|
1090
1095
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1100,7 +1105,7 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
|
|
|
1100
1105
|
cluster_name = exec_body.cluster_name
|
|
1101
1106
|
await executor.schedule_request_async(
|
|
1102
1107
|
request_id=request.state.request_id,
|
|
1103
|
-
request_name=
|
|
1108
|
+
request_name=request_names.RequestName.CLUSTER_EXEC,
|
|
1104
1109
|
request_body=exec_body,
|
|
1105
1110
|
func=execution.exec,
|
|
1106
1111
|
precondition=preconditions.ClusterStartCompletePrecondition(
|
|
@@ -1118,7 +1123,7 @@ async def stop(request: fastapi.Request,
|
|
|
1118
1123
|
"""Stops a cluster."""
|
|
1119
1124
|
await executor.schedule_request_async(
|
|
1120
1125
|
request_id=request.state.request_id,
|
|
1121
|
-
request_name=
|
|
1126
|
+
request_name=request_names.RequestName.CLUSTER_STOP,
|
|
1122
1127
|
request_body=stop_body,
|
|
1123
1128
|
func=core.stop,
|
|
1124
1129
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1138,7 +1143,7 @@ async def status(
|
|
|
1138
1143
|
detail='Server is shutting down, please try again later.')
|
|
1139
1144
|
await executor.schedule_request_async(
|
|
1140
1145
|
request_id=request.state.request_id,
|
|
1141
|
-
request_name=
|
|
1146
|
+
request_name=request_names.RequestName.CLUSTER_STATUS,
|
|
1142
1147
|
request_body=status_body,
|
|
1143
1148
|
func=core.status,
|
|
1144
1149
|
schedule_type=(requests_lib.ScheduleType.LONG if
|
|
@@ -1153,7 +1158,7 @@ async def endpoints(request: fastapi.Request,
|
|
|
1153
1158
|
"""Gets the endpoint for a given cluster and port number (endpoint)."""
|
|
1154
1159
|
await executor.schedule_request_async(
|
|
1155
1160
|
request_id=request.state.request_id,
|
|
1156
|
-
request_name=
|
|
1161
|
+
request_name=request_names.RequestName.CLUSTER_ENDPOINTS,
|
|
1157
1162
|
request_body=endpoint_body,
|
|
1158
1163
|
func=core.endpoints,
|
|
1159
1164
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1167,7 +1172,7 @@ async def down(request: fastapi.Request,
|
|
|
1167
1172
|
"""Tears down a cluster."""
|
|
1168
1173
|
await executor.schedule_request_async(
|
|
1169
1174
|
request_id=request.state.request_id,
|
|
1170
|
-
request_name=
|
|
1175
|
+
request_name=request_names.RequestName.CLUSTER_DOWN,
|
|
1171
1176
|
request_body=down_body,
|
|
1172
1177
|
func=core.down,
|
|
1173
1178
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1181,7 +1186,7 @@ async def start(request: fastapi.Request,
|
|
|
1181
1186
|
"""Restarts a cluster."""
|
|
1182
1187
|
await executor.schedule_request_async(
|
|
1183
1188
|
request_id=request.state.request_id,
|
|
1184
|
-
request_name=
|
|
1189
|
+
request_name=request_names.RequestName.CLUSTER_START,
|
|
1185
1190
|
request_body=start_body,
|
|
1186
1191
|
func=core.start,
|
|
1187
1192
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1195,7 +1200,7 @@ async def autostop(request: fastapi.Request,
|
|
|
1195
1200
|
"""Schedules an autostop/autodown for a cluster."""
|
|
1196
1201
|
await executor.schedule_request_async(
|
|
1197
1202
|
request_id=request.state.request_id,
|
|
1198
|
-
request_name=
|
|
1203
|
+
request_name=request_names.RequestName.CLUSTER_AUTOSTOP,
|
|
1199
1204
|
request_body=autostop_body,
|
|
1200
1205
|
func=core.autostop,
|
|
1201
1206
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1209,7 +1214,7 @@ async def queue(request: fastapi.Request,
|
|
|
1209
1214
|
"""Gets the job queue of a cluster."""
|
|
1210
1215
|
await executor.schedule_request_async(
|
|
1211
1216
|
request_id=request.state.request_id,
|
|
1212
|
-
request_name=
|
|
1217
|
+
request_name=request_names.RequestName.CLUSTER_QUEUE,
|
|
1213
1218
|
request_body=queue_body,
|
|
1214
1219
|
func=core.queue,
|
|
1215
1220
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1223,7 +1228,7 @@ async def job_status(request: fastapi.Request,
|
|
|
1223
1228
|
"""Gets the status of a job."""
|
|
1224
1229
|
await executor.schedule_request_async(
|
|
1225
1230
|
request_id=request.state.request_id,
|
|
1226
|
-
request_name=
|
|
1231
|
+
request_name=request_names.RequestName.CLUSTER_JOB_STATUS,
|
|
1227
1232
|
request_body=job_status_body,
|
|
1228
1233
|
func=core.job_status,
|
|
1229
1234
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1237,7 +1242,7 @@ async def cancel(request: fastapi.Request,
|
|
|
1237
1242
|
"""Cancels jobs on a cluster."""
|
|
1238
1243
|
await executor.schedule_request_async(
|
|
1239
1244
|
request_id=request.state.request_id,
|
|
1240
|
-
request_name=
|
|
1245
|
+
request_name=request_names.RequestName.CLUSTER_JOB_CANCEL,
|
|
1241
1246
|
request_body=cancel_body,
|
|
1242
1247
|
func=core.cancel,
|
|
1243
1248
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1257,7 +1262,7 @@ async def logs(
|
|
|
1257
1262
|
executor.check_request_thread_executor_available()
|
|
1258
1263
|
request_task = await executor.prepare_request_async(
|
|
1259
1264
|
request_id=request.state.request_id,
|
|
1260
|
-
request_name=
|
|
1265
|
+
request_name=request_names.RequestName.CLUSTER_JOB_LOGS,
|
|
1261
1266
|
request_body=cluster_job_body,
|
|
1262
1267
|
func=core.tail_logs,
|
|
1263
1268
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1288,7 +1293,7 @@ async def download_logs(
|
|
|
1288
1293
|
cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
|
|
1289
1294
|
await executor.schedule_request_async(
|
|
1290
1295
|
request_id=request.state.request_id,
|
|
1291
|
-
request_name=
|
|
1296
|
+
request_name=request_names.RequestName.CLUSTER_JOB_DOWNLOAD_LOGS,
|
|
1292
1297
|
request_body=cluster_jobs_body,
|
|
1293
1298
|
func=core.download_logs,
|
|
1294
1299
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1439,7 +1444,7 @@ async def cost_report(request: fastapi.Request,
|
|
|
1439
1444
|
"""Gets the cost report of a cluster."""
|
|
1440
1445
|
await executor.schedule_request_async(
|
|
1441
1446
|
request_id=request.state.request_id,
|
|
1442
|
-
request_name=
|
|
1447
|
+
request_name=request_names.RequestName.CLUSTER_COST_REPORT,
|
|
1443
1448
|
request_body=cost_report_body,
|
|
1444
1449
|
func=core.cost_report,
|
|
1445
1450
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1451,7 +1456,7 @@ async def storage_ls(request: fastapi.Request) -> None:
|
|
|
1451
1456
|
"""Gets the storages."""
|
|
1452
1457
|
await executor.schedule_request_async(
|
|
1453
1458
|
request_id=request.state.request_id,
|
|
1454
|
-
request_name=
|
|
1459
|
+
request_name=request_names.RequestName.STORAGE_LS,
|
|
1455
1460
|
request_body=payloads.RequestBody(),
|
|
1456
1461
|
func=core.storage_ls,
|
|
1457
1462
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1464,7 +1469,7 @@ async def storage_delete(request: fastapi.Request,
|
|
|
1464
1469
|
"""Deletes a storage."""
|
|
1465
1470
|
await executor.schedule_request_async(
|
|
1466
1471
|
request_id=request.state.request_id,
|
|
1467
|
-
request_name=
|
|
1472
|
+
request_name=request_names.RequestName.STORAGE_DELETE,
|
|
1468
1473
|
request_body=storage_body,
|
|
1469
1474
|
func=core.storage_delete,
|
|
1470
1475
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1477,7 +1482,7 @@ async def local_up(request: fastapi.Request,
|
|
|
1477
1482
|
"""Launches a Kubernetes cluster on API server."""
|
|
1478
1483
|
await executor.schedule_request_async(
|
|
1479
1484
|
request_id=request.state.request_id,
|
|
1480
|
-
request_name=
|
|
1485
|
+
request_name=request_names.RequestName.LOCAL_UP,
|
|
1481
1486
|
request_body=local_up_body,
|
|
1482
1487
|
func=core.local_up,
|
|
1483
1488
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1490,7 +1495,7 @@ async def local_down(request: fastapi.Request,
|
|
|
1490
1495
|
"""Tears down the Kubernetes cluster started by local_up."""
|
|
1491
1496
|
await executor.schedule_request_async(
|
|
1492
1497
|
request_id=request.state.request_id,
|
|
1493
|
-
request_name=
|
|
1498
|
+
request_name=request_names.RequestName.LOCAL_DOWN,
|
|
1494
1499
|
request_body=local_down_body,
|
|
1495
1500
|
func=core.local_down,
|
|
1496
1501
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1512,7 +1517,7 @@ async def get_expanded_request_id(request_id: str) -> str:
|
|
|
1512
1517
|
|
|
1513
1518
|
|
|
1514
1519
|
# === API server related APIs ===
|
|
1515
|
-
@app.get('/api/get')
|
|
1520
|
+
@app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
|
|
1516
1521
|
async def api_get(request_id: str) -> payloads.RequestPayload:
|
|
1517
1522
|
"""Gets a request with a given request ID prefix."""
|
|
1518
1523
|
# Validate request_id prefix matches a single request.
|
|
@@ -1698,7 +1703,7 @@ async def api_cancel(request: fastapi.Request,
|
|
|
1698
1703
|
"""Cancels requests."""
|
|
1699
1704
|
await executor.schedule_request_async(
|
|
1700
1705
|
request_id=request.state.request_id,
|
|
1701
|
-
request_name=
|
|
1706
|
+
request_name=request_names.RequestName.API_CANCEL,
|
|
1702
1707
|
request_body=request_cancel_body,
|
|
1703
1708
|
func=requests_lib.kill_requests_with_prefix,
|
|
1704
1709
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1803,16 +1808,31 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
|
|
|
1803
1808
|
basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
|
|
1804
1809
|
'false').lower() == 'true',
|
|
1805
1810
|
user=user if user is not None else None,
|
|
1811
|
+
service_account_token_enabled=(os.environ.get(
|
|
1812
|
+
constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
|
|
1813
|
+
'false').lower() == 'true'),
|
|
1806
1814
|
)
|
|
1807
1815
|
|
|
1808
1816
|
|
|
1817
|
+
class KubernetesSSHMessageType(IntEnum):
|
|
1818
|
+
REGULAR_DATA = 0
|
|
1819
|
+
PINGPONG = 1
|
|
1820
|
+
LATENCY_MEASUREMENT = 2
|
|
1821
|
+
|
|
1822
|
+
|
|
1809
1823
|
@app.websocket('/kubernetes-pod-ssh-proxy')
|
|
1810
|
-
async def kubernetes_pod_ssh_proxy(
|
|
1811
|
-
|
|
1824
|
+
async def kubernetes_pod_ssh_proxy(
|
|
1825
|
+
websocket: fastapi.WebSocket,
|
|
1826
|
+
cluster_name: str,
|
|
1827
|
+
client_version: Optional[int] = None) -> None:
|
|
1812
1828
|
"""Proxies SSH to the Kubernetes pod with websocket."""
|
|
1813
1829
|
await websocket.accept()
|
|
1814
1830
|
logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
|
|
1815
1831
|
|
|
1832
|
+
timestamps_supported = client_version is not None and client_version > 21
|
|
1833
|
+
logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
|
|
1834
|
+
client_version = {client_version}')
|
|
1835
|
+
|
|
1816
1836
|
# Run core.status in another thread to avoid blocking the event loop.
|
|
1817
1837
|
with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
|
|
1818
1838
|
cluster_records = await context_utils.to_thread_with_executor(
|
|
@@ -1867,6 +1887,42 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
|
|
|
1867
1887
|
async def websocket_to_ssh():
|
|
1868
1888
|
try:
|
|
1869
1889
|
async for message in websocket.iter_bytes():
|
|
1890
|
+
if timestamps_supported:
|
|
1891
|
+
type_size = struct.calcsize('!B')
|
|
1892
|
+
message_type = struct.unpack('!B',
|
|
1893
|
+
message[:type_size])[0]
|
|
1894
|
+
if (message_type ==
|
|
1895
|
+
KubernetesSSHMessageType.REGULAR_DATA):
|
|
1896
|
+
# Regular data - strip type byte and forward to SSH
|
|
1897
|
+
message = message[type_size:]
|
|
1898
|
+
elif message_type == KubernetesSSHMessageType.PINGPONG:
|
|
1899
|
+
# PING message - respond with PONG (type 1)
|
|
1900
|
+
ping_id_size = struct.calcsize('!I')
|
|
1901
|
+
if len(message) != type_size + ping_id_size:
|
|
1902
|
+
raise ValueError('Invalid PING message '
|
|
1903
|
+
f'length: {len(message)}')
|
|
1904
|
+
# Return the same PING message, so that the client
|
|
1905
|
+
# can measure the latency.
|
|
1906
|
+
await websocket.send_bytes(message)
|
|
1907
|
+
continue
|
|
1908
|
+
elif (message_type ==
|
|
1909
|
+
KubernetesSSHMessageType.LATENCY_MEASUREMENT):
|
|
1910
|
+
# Latency measurement from client
|
|
1911
|
+
latency_size = struct.calcsize('!Q')
|
|
1912
|
+
if len(message) != type_size + latency_size:
|
|
1913
|
+
raise ValueError(
|
|
1914
|
+
'Invalid latency measurement '
|
|
1915
|
+
f'message length: {len(message)}')
|
|
1916
|
+
avg_latency_ms = struct.unpack(
|
|
1917
|
+
'!Q',
|
|
1918
|
+
message[type_size:type_size + latency_size])[0]
|
|
1919
|
+
latency_seconds = avg_latency_ms / 1000
|
|
1920
|
+
metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds) # pylint: disable=line-too-long
|
|
1921
|
+
continue
|
|
1922
|
+
else:
|
|
1923
|
+
# Unknown message type.
|
|
1924
|
+
raise ValueError(
|
|
1925
|
+
f'Unknown message type: {message_type}')
|
|
1870
1926
|
writer.write(message)
|
|
1871
1927
|
try:
|
|
1872
1928
|
await writer.drain()
|
|
@@ -1897,6 +1953,11 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
|
|
|
1897
1953
|
nonlocal ssh_failed
|
|
1898
1954
|
ssh_failed = True
|
|
1899
1955
|
break
|
|
1956
|
+
if timestamps_supported:
|
|
1957
|
+
# Prepend message type byte (0 = regular data)
|
|
1958
|
+
message_type_bytes = struct.pack(
|
|
1959
|
+
'!B', KubernetesSSHMessageType.REGULAR_DATA.value)
|
|
1960
|
+
data = message_type_bytes + data
|
|
1900
1961
|
await websocket.send_bytes(data)
|
|
1901
1962
|
except Exception: # pylint: disable=broad-except
|
|
1902
1963
|
pass
|
|
@@ -1936,7 +1997,7 @@ async def all_contexts(request: fastapi.Request) -> None:
|
|
|
1936
1997
|
|
|
1937
1998
|
await executor.schedule_request_async(
|
|
1938
1999
|
request_id=request.state.request_id,
|
|
1939
|
-
request_name=
|
|
2000
|
+
request_name=request_names.RequestName.ALL_CONTEXTS,
|
|
1940
2001
|
request_body=payloads.RequestBody(),
|
|
1941
2002
|
func=core.get_all_contexts,
|
|
1942
2003
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -2050,7 +2111,6 @@ if __name__ == '__main__':
|
|
|
2050
2111
|
# Serve metrics on a separate port to isolate it from the application APIs:
|
|
2051
2112
|
# metrics port will not be exposed to the public network typically.
|
|
2052
2113
|
parser.add_argument('--metrics-port', default=9090, type=int)
|
|
2053
|
-
parser.add_argument('--start-with-python', action='store_true')
|
|
2054
2114
|
cmd_args = parser.parse_args()
|
|
2055
2115
|
if cmd_args.port == cmd_args.metrics_port:
|
|
2056
2116
|
logger.error('port and metrics-port cannot be the same, exiting.')
|
|
@@ -2065,9 +2125,18 @@ if __name__ == '__main__':
|
|
|
2065
2125
|
logger.error(f'Port {cmd_args.port} is not available, exiting.')
|
|
2066
2126
|
raise RuntimeError(f'Port {cmd_args.port} is not available')
|
|
2067
2127
|
|
|
2068
|
-
|
|
2069
|
-
|
|
2070
|
-
|
|
2128
|
+
# Maybe touch the signal file on API server startup. Do it again here even
|
|
2129
|
+
# if we already touched it in the sky/server/common.py::_start_api_server.
|
|
2130
|
+
# This is because the sky/server/common.py::_start_api_server function call
|
|
2131
|
+
# is running outside the skypilot API server process tree. The process tree
|
|
2132
|
+
# starts within that function (see the `subprocess.Popen` call in
|
|
2133
|
+
# sky/server/common.py::_start_api_server). When pg is used, the
|
|
2134
|
+
# _start_api_server function will not load the config file from db, which
|
|
2135
|
+
# will ignore the consolidation mode config. Here, inside the process tree,
|
|
2136
|
+
# we already reload the config as a server (with env var _start_api_server),
|
|
2137
|
+
# so we will respect the consolidation mode config.
|
|
2138
|
+
# Refers to #7717 for more details.
|
|
2139
|
+
managed_job_utils.is_consolidation_mode(on_api_restart=True)
|
|
2071
2140
|
|
|
2072
2141
|
# Show the privacy policy if it is not already shown. We place it here so
|
|
2073
2142
|
# that it is shown only when the API server is started.
|
sky/server/stream_utils.py
CHANGED
|
@@ -25,6 +25,8 @@ logger = sky_logging.init_logger(__name__)
|
|
|
25
25
|
_BUFFER_SIZE = 8 * 1024 # 8KB
|
|
26
26
|
_BUFFER_TIMEOUT = 0.02 # 20ms
|
|
27
27
|
_HEARTBEAT_INTERVAL = 30
|
|
28
|
+
_READ_CHUNK_SIZE = 256 * 1024 # 256KB chunks for file reading
|
|
29
|
+
|
|
28
30
|
# If a SHORT request has been stuck in pending for
|
|
29
31
|
# _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
|
|
30
32
|
_SHORT_REQUEST_SPINNER_TIMEOUT = 2
|
|
@@ -235,6 +237,9 @@ async def _tail_log_file(
|
|
|
235
237
|
buffer_bytes = 0
|
|
236
238
|
last_flush_time = asyncio.get_event_loop().time()
|
|
237
239
|
|
|
240
|
+
# Read file in chunks instead of line-by-line for better performance
|
|
241
|
+
incomplete_line = b'' # Buffer for incomplete lines across chunks
|
|
242
|
+
|
|
238
243
|
async def flush_buffer() -> AsyncGenerator[str, None]:
|
|
239
244
|
nonlocal buffer, buffer_bytes, last_flush_time
|
|
240
245
|
if buffer:
|
|
@@ -255,8 +260,23 @@ async def _tail_log_file(
|
|
|
255
260
|
async for chunk in flush_buffer():
|
|
256
261
|
yield chunk
|
|
257
262
|
|
|
258
|
-
|
|
259
|
-
|
|
263
|
+
# Read file in chunks for better I/O performance
|
|
264
|
+
file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
|
|
265
|
+
if not file_chunk:
|
|
266
|
+
# Process any remaining incomplete line
|
|
267
|
+
if incomplete_line:
|
|
268
|
+
line_str = incomplete_line.decode('utf-8')
|
|
269
|
+
if plain_logs:
|
|
270
|
+
is_payload, line_str = message_utils.decode_payload(
|
|
271
|
+
line_str, raise_for_mismatch=False)
|
|
272
|
+
if not is_payload:
|
|
273
|
+
buffer.append(line_str)
|
|
274
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
|
275
|
+
else:
|
|
276
|
+
buffer.append(line_str)
|
|
277
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
|
278
|
+
incomplete_line = b''
|
|
279
|
+
|
|
260
280
|
# Avoid checking the status too frequently to avoid overloading the
|
|
261
281
|
# DB.
|
|
262
282
|
should_check_status = (current_time -
|
|
@@ -328,16 +348,39 @@ async def _tail_log_file(
|
|
|
328
348
|
# performance but it helps avoid unnecessary heartbeat strings
|
|
329
349
|
# being printed when the client runs in an old version.
|
|
330
350
|
last_heartbeat_time = asyncio.get_event_loop().time()
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
351
|
+
|
|
352
|
+
# Combine with any incomplete line from previous chunk
|
|
353
|
+
file_chunk = incomplete_line + file_chunk
|
|
354
|
+
incomplete_line = b''
|
|
355
|
+
|
|
356
|
+
# Split chunk into lines, preserving line structure
|
|
357
|
+
lines_bytes = file_chunk.split(b'\n')
|
|
358
|
+
|
|
359
|
+
# If chunk doesn't end with newline, the last element is incomplete
|
|
360
|
+
if file_chunk and not file_chunk.endswith(b'\n'):
|
|
361
|
+
incomplete_line = lines_bytes[-1]
|
|
362
|
+
lines_bytes = lines_bytes[:-1]
|
|
363
|
+
else:
|
|
364
|
+
# If ends with \n, split creates an empty last element we should
|
|
365
|
+
# ignore
|
|
366
|
+
if lines_bytes and lines_bytes[-1] == b'':
|
|
367
|
+
lines_bytes = lines_bytes[:-1]
|
|
368
|
+
|
|
369
|
+
# Process all complete lines in this chunk
|
|
370
|
+
for line_bytes in lines_bytes:
|
|
371
|
+
# Reconstruct line with newline (since split removed it)
|
|
372
|
+
line_str = line_bytes.decode('utf-8') + '\n'
|
|
373
|
+
|
|
374
|
+
if plain_logs:
|
|
375
|
+
is_payload, line_str = message_utils.decode_payload(
|
|
376
|
+
line_str, raise_for_mismatch=False)
|
|
377
|
+
# TODO(aylei): implement heartbeat mechanism for plain logs,
|
|
378
|
+
# sending invisible characters might be okay.
|
|
379
|
+
if is_payload:
|
|
380
|
+
continue
|
|
381
|
+
|
|
382
|
+
buffer.append(line_str)
|
|
383
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
|
341
384
|
|
|
342
385
|
# Flush remaining lines in the buffer.
|
|
343
386
|
async for chunk in flush_buffer():
|
|
@@ -373,7 +416,7 @@ def stream_response(
|
|
|
373
416
|
async def on_disconnect():
|
|
374
417
|
logger.info(f'User terminated the connection for request '
|
|
375
418
|
f'{request_id}')
|
|
376
|
-
requests_lib.
|
|
419
|
+
await requests_lib.kill_request_async(request_id)
|
|
377
420
|
|
|
378
421
|
# The background task will be run after returning a response.
|
|
379
422
|
# https://fastapi.tiangolo.com/tutorial/background-tasks/
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -49,6 +49,7 @@ install_requires = [
|
|
|
49
49
|
# <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
|
|
50
50
|
'pyyaml > 3.13, != 5.4.*',
|
|
51
51
|
'ijson',
|
|
52
|
+
'orjson',
|
|
52
53
|
'requests',
|
|
53
54
|
# SkyPilot inherits from uvicorn.Server to customize the behavior of
|
|
54
55
|
# uvicorn, so we need to pin uvicorn version to avoid potential break
|
|
@@ -187,6 +188,7 @@ cloud_dependencies: Dict[str, List[str]] = {
|
|
|
187
188
|
'docker': ['docker'] + local_ray,
|
|
188
189
|
'lambda': [], # No dependencies needed for lambda
|
|
189
190
|
'cloudflare': aws_dependencies,
|
|
191
|
+
'coreweave': aws_dependencies,
|
|
190
192
|
'scp': local_ray,
|
|
191
193
|
'oci': ['oci'],
|
|
192
194
|
# Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
|
sky/skylet/constants.py
CHANGED
|
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
100
100
|
# cluster yaml is updated.
|
|
101
101
|
#
|
|
102
102
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
103
|
-
SKYLET_VERSION = '
|
|
103
|
+
SKYLET_VERSION = '25'
|
|
104
104
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
105
105
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
106
106
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
|
@@ -422,6 +422,8 @@ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
|
|
|
422
422
|
# but the configs won't be applied)
|
|
423
423
|
('jobs', 'controller', 'consolidation_mode'),
|
|
424
424
|
('serve', 'controller', 'consolidation_mode'),
|
|
425
|
+
('jobs', 'controller', 'controller_logs_gc_retention_hours'),
|
|
426
|
+
('jobs', 'controller', 'task_logs_gc_retention_hours'),
|
|
425
427
|
]
|
|
426
428
|
|
|
427
429
|
# Constants for Azure blob storage
|
|
@@ -548,3 +550,6 @@ ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
|
|
|
548
550
|
|
|
549
551
|
ARM64_ARCH = 'arm64'
|
|
550
552
|
X86_64_ARCH = 'x86_64'
|
|
553
|
+
|
|
554
|
+
SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
|
|
555
|
+
f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
|
sky/skylet/events.py
CHANGED
|
@@ -326,8 +326,15 @@ class AutostopEvent(SkyletEvent):
|
|
|
326
326
|
cluster_name_on_cloud = cluster_config['cluster_name']
|
|
327
327
|
is_cluster_multinode = cluster_config['max_workers'] > 0
|
|
328
328
|
|
|
329
|
+
# Clear AWS credentials from environment to force boto3 to use IAM
|
|
330
|
+
# role attached to the instance (lowest priority in credential chain).
|
|
331
|
+
# This allows the cluster to stop/terminate itself using its IAM role.
|
|
329
332
|
os.environ.pop('AWS_ACCESS_KEY_ID', None)
|
|
330
333
|
os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
|
|
334
|
+
os.environ.pop('AWS_SESSION_TOKEN', None)
|
|
335
|
+
# Point boto3 to /dev/null to skip reading credentials from files.
|
|
336
|
+
os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
|
|
337
|
+
os.environ['AWS_CONFIG_FILE'] = '/dev/null'
|
|
331
338
|
|
|
332
339
|
# Stop the ray autoscaler to avoid scaling up, during
|
|
333
340
|
# stopping/terminating of the cluster.
|
sky/skylet/services.py
CHANGED
|
@@ -407,7 +407,9 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
|
407
407
|
context: grpc.ServicerContext
|
|
408
408
|
) -> managed_jobsv1_pb2.GetJobTableResponse:
|
|
409
409
|
try:
|
|
410
|
-
accessible_workspaces =
|
|
410
|
+
accessible_workspaces = (
|
|
411
|
+
list(request.accessible_workspaces.workspaces)
|
|
412
|
+
if request.HasField('accessible_workspaces') else None)
|
|
411
413
|
job_ids = (list(request.job_ids.ids)
|
|
412
414
|
if request.HasField('job_ids') else None)
|
|
413
415
|
user_hashes: Optional[List[Optional[str]]] = None
|
|
@@ -419,6 +421,8 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
|
419
421
|
user_hashes.append(None)
|
|
420
422
|
statuses = (list(request.statuses.statuses)
|
|
421
423
|
if request.HasField('statuses') else None)
|
|
424
|
+
fields = (list(request.fields.fields)
|
|
425
|
+
if request.HasField('fields') else None)
|
|
422
426
|
job_queue = managed_job_utils.get_managed_job_queue(
|
|
423
427
|
skip_finished=request.skip_finished,
|
|
424
428
|
accessible_workspaces=accessible_workspaces,
|
|
@@ -432,7 +436,9 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
|
432
436
|
page=request.page if request.HasField('page') else None,
|
|
433
437
|
limit=request.limit if request.HasField('limit') else None,
|
|
434
438
|
user_hashes=user_hashes,
|
|
435
|
-
statuses=statuses
|
|
439
|
+
statuses=statuses,
|
|
440
|
+
fields=fields,
|
|
441
|
+
)
|
|
436
442
|
jobs = job_queue['jobs']
|
|
437
443
|
total = job_queue['total']
|
|
438
444
|
total_no_filter = job_queue['total_no_filter']
|
|
@@ -440,7 +446,16 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
|
440
446
|
|
|
441
447
|
jobs_info = []
|
|
442
448
|
for job in jobs:
|
|
449
|
+
converted_metadata = None
|
|
450
|
+
metadata = job.get('metadata')
|
|
451
|
+
if metadata:
|
|
452
|
+
converted_metadata = {
|
|
453
|
+
k: v for k, v in metadata.items() if v is not None
|
|
454
|
+
}
|
|
443
455
|
job_info = managed_jobsv1_pb2.ManagedJobInfo(
|
|
456
|
+
# The `spot.job_id`, which can be used to identify
|
|
457
|
+
# different tasks for the same job
|
|
458
|
+
_job_id=job.get('_job_id'),
|
|
444
459
|
job_id=job.get('job_id'),
|
|
445
460
|
task_id=job.get('task_id'),
|
|
446
461
|
job_name=job.get('job_name'),
|
|
@@ -468,11 +483,7 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
|
468
483
|
end_at=job.get('end_at'),
|
|
469
484
|
user_yaml=job.get('user_yaml'),
|
|
470
485
|
entrypoint=job.get('entrypoint'),
|
|
471
|
-
metadata=
|
|
472
|
-
k: v
|
|
473
|
-
for k, v in job.get('metadata', {}).items()
|
|
474
|
-
if v is not None
|
|
475
|
-
},
|
|
486
|
+
metadata=converted_metadata,
|
|
476
487
|
pool=job.get('pool'),
|
|
477
488
|
pool_hash=job.get('pool_hash'))
|
|
478
489
|
jobs_info.append(job_info)
|