skypilot_nightly-1.0.0.dev20251029-py3-none-any.whl → skypilot_nightly-1.0.0.dev20251101-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic by the registry.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +25 -7
- sky/client/cli/command.py +47 -23
- sky/clouds/aws.py +59 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
- sky/dashboard/out/_next/static/chunks/{webpack-485984ca04e021d0.js → webpack-e38d5319cd10a3a0.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +32 -2
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +62 -67
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/scheduler.py +15 -2
- sky/jobs/server/core.py +85 -13
- sky/jobs/server/server.py +12 -11
- sky/jobs/server/utils.py +28 -10
- sky/jobs/state.py +216 -40
- sky/jobs/utils.py +60 -22
- sky/metrics/utils.py +18 -0
- sky/schemas/api/responses.py +1 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/server/server.py +8 -7
- sky/server/common.py +21 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +23 -17
- sky/server/requests/executor.py +7 -3
- sky/server/requests/request_names.py +80 -0
- sky/server/server.py +103 -35
- sky/skylet/constants.py +6 -1
- sky/skylet/events.py +7 -0
- sky/skylet/services.py +18 -7
- sky/ssh_node_pools/server.py +5 -4
- sky/task.py +4 -42
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/templates/websocket_proxy.py +140 -12
- sky/users/permission.py +4 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/resource_checker.py +4 -1
- sky/utils/schemas.py +23 -4
- sky/volumes/server/server.py +4 -3
- sky/workspaces/server.py +7 -6
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +36 -36
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +67 -62
- sky/dashboard/out/_next/static/chunks/2755.a239c652bf8684dd.js +0 -26
- /sky/dashboard/out/_next/static/{DabuSAKsc_y0wyJxpTIdQ → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{DabuSAKsc_y0wyJxpTIdQ → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/server/server.py
CHANGED

@@ -6,6 +6,7 @@ import base64
 from concurrent.futures import ThreadPoolExecutor
 import contextlib
 import datetime
+from enum import IntEnum
 import hashlib
 import json
 import multiprocessing
@@ -15,6 +16,7 @@ import posixpath
 import re
 import resource
 import shutil
+import struct
 import sys
 import threading
 import traceback
@@ -62,6 +64,7 @@ from sky.server.auth import oauth2_proxy
 from sky.server.requests import executor
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
+from sky.server.requests import request_names
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
 from sky.ssh_node_pools import server as ssh_node_pools_rest
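The hunks that follow replace inline request-name values at each call site with members of the new sky/server/requests/request_names module (added in this release at +80 lines, per the file list above). The module body is not part of this diff; the following is a hypothetical sketch of its shape, with member names taken from the call sites and everything else (the str-enum base, the string values) assumed:

```python
# Hypothetical sketch of sky/server/requests/request_names.py; only the
# member names below are confirmed by the call sites in this diff.
import enum


class RequestName(str, enum.Enum):
    """Stable identifiers for requests scheduled on the API server."""
    CHECK = 'check'
    ENABLED_CLOUDS = 'enabled_clouds'
    CLUSTER_LAUNCH = 'launch'
    CLUSTER_STATUS = 'status'
    CLUSTER_JOB_LOGS = 'logs'
    STORAGE_LS = 'storage_ls'
    API_CANCEL = 'api_cancel'
    # ...one member per scheduled request (LOCAL_UP, ALL_CONTEXTS, etc.)
```

Centralizing the names in one enum removes the possibility of typos at call sites and gives request accounting a single source of truth.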
@@ -460,7 +463,7 @@ async def schedule_on_boot_check_async():
     try:
         await executor.schedule_request_async(
             request_id='skypilot-server-on-boot-check',
-            request_name=
+            request_name=request_names.RequestName.CHECK,
             request_body=payloads.CheckBody(),
             func=sky_check.check,
             schedule_type=requests_lib.ScheduleType.SHORT,
@@ -732,7 +735,7 @@ async def check(request: fastapi.Request,
     """Checks enabled clouds."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CHECK,
         request_body=check_body,
         func=sky_check.check,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -746,7 +749,7 @@ async def enabled_clouds(request: fastapi.Request,
     """Gets enabled clouds on the server."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.ENABLED_CLOUDS,
         request_body=payloads.EnabledCloudsBody(workspace=workspace,
                                                 expand=expand),
         func=core.enabled_clouds,
@@ -762,7 +765,8 @@ async def realtime_kubernetes_gpu_availability(
     """Gets real-time Kubernetes GPU availability."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.
+        REALTIME_KUBERNETES_GPU_AVAILABILITY,
         request_body=realtime_gpu_availability_body,
         func=core.realtime_kubernetes_gpu_availability,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -777,7 +781,7 @@ async def kubernetes_node_info(
     """Gets Kubernetes nodes information and hints."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.KUBERNETES_NODE_INFO,
         request_body=kubernetes_node_info_body,
         func=kubernetes_utils.get_kubernetes_node_info,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -789,7 +793,7 @@ async def status_kubernetes(request: fastapi.Request) -> None:
     """Gets Kubernetes status."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.STATUS_KUBERNETES,
         request_body=payloads.RequestBody(),
         func=core.status_kubernetes,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -803,7 +807,7 @@ async def list_accelerators(
     """Gets list of accelerators from cloud catalog."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.LIST_ACCELERATORS,
         request_body=list_accelerator_counts_body,
         func=catalog.list_accelerators,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -818,7 +822,7 @@ async def list_accelerator_counts(
     """Gets list of accelerator counts from cloud catalog."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.LIST_ACCELERATOR_COUNTS,
         request_body=list_accelerator_counts_body,
         func=catalog.list_accelerator_counts,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -875,7 +879,7 @@ async def optimize(optimize_body: payloads.OptimizeBody,
     """Optimizes the user's DAG."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.OPTIMIZE,
         request_body=optimize_body,
         ignore_return_value=True,
         func=core.optimize,
@@ -1085,7 +1089,7 @@ async def launch(launch_body: payloads.LaunchBody,
     logger.info(f'Launching request: {request_id}')
     await executor.schedule_request_async(
         request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_LAUNCH,
         request_body=launch_body,
         func=execution.launch,
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -1101,7 +1105,7 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
     cluster_name = exec_body.cluster_name
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_EXEC,
         request_body=exec_body,
         func=execution.exec,
         precondition=preconditions.ClusterStartCompletePrecondition(
@@ -1119,7 +1123,7 @@ async def stop(request: fastapi.Request,
     """Stops a cluster."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_STOP,
         request_body=stop_body,
         func=core.stop,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1139,7 +1143,7 @@ async def status(
             detail='Server is shutting down, please try again later.')
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_STATUS,
         request_body=status_body,
         func=core.status,
         schedule_type=(requests_lib.ScheduleType.LONG if
@@ -1154,7 +1158,7 @@ async def endpoints(request: fastapi.Request,
     """Gets the endpoint for a given cluster and port number (endpoint)."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_ENDPOINTS,
         request_body=endpoint_body,
         func=core.endpoints,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1168,7 +1172,7 @@ async def down(request: fastapi.Request,
     """Tears down a cluster."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_DOWN,
         request_body=down_body,
         func=core.down,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1182,7 +1186,7 @@ async def start(request: fastapi.Request,
     """Restarts a cluster."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_START,
         request_body=start_body,
         func=core.start,
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -1196,7 +1200,7 @@ async def autostop(request: fastapi.Request,
     """Schedules an autostop/autodown for a cluster."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_AUTOSTOP,
         request_body=autostop_body,
         func=core.autostop,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1210,7 +1214,7 @@ async def queue(request: fastapi.Request,
     """Gets the job queue of a cluster."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_QUEUE,
         request_body=queue_body,
         func=core.queue,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1224,7 +1228,7 @@ async def job_status(request: fastapi.Request,
    """Gets the status of a job."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_JOB_STATUS,
         request_body=job_status_body,
         func=core.job_status,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1238,7 +1242,7 @@ async def cancel(request: fastapi.Request,
     """Cancels jobs on a cluster."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_JOB_CANCEL,
         request_body=cancel_body,
         func=core.cancel,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1258,7 +1262,7 @@ async def logs(
     executor.check_request_thread_executor_available()
     request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_JOB_LOGS,
         request_body=cluster_job_body,
         func=core.tail_logs,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1289,7 +1293,7 @@ async def download_logs(
     cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_JOB_DOWNLOAD_LOGS,
         request_body=cluster_jobs_body,
         func=core.download_logs,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1440,7 +1444,7 @@ async def cost_report(request: fastapi.Request,
     """Gets the cost report of a cluster."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.CLUSTER_COST_REPORT,
         request_body=cost_report_body,
         func=core.cost_report,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1452,7 +1456,7 @@ async def storage_ls(request: fastapi.Request) -> None:
     """Gets the storages."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.STORAGE_LS,
         request_body=payloads.RequestBody(),
         func=core.storage_ls,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1465,7 +1469,7 @@ async def storage_delete(request: fastapi.Request,
     """Deletes a storage."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.STORAGE_DELETE,
         request_body=storage_body,
         func=core.storage_delete,
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -1478,7 +1482,7 @@ async def local_up(request: fastapi.Request,
     """Launches a Kubernetes cluster on API server."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.LOCAL_UP,
         request_body=local_up_body,
         func=core.local_up,
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -1491,7 +1495,7 @@ async def local_down(request: fastapi.Request,
     """Tears down the Kubernetes cluster started by local_up."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.LOCAL_DOWN,
         request_body=local_down_body,
         func=core.local_down,
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -1699,7 +1703,7 @@ async def api_cancel(request: fastapi.Request,
     """Cancels requests."""
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.API_CANCEL,
         request_body=request_cancel_body,
         func=requests_lib.kill_requests_with_prefix,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1804,16 +1808,31 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
         basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
                                           'false').lower() == 'true',
         user=user if user is not None else None,
+        service_account_token_enabled=(os.environ.get(
+            constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
+            'false').lower() == 'true'),
     )


+class KubernetesSSHMessageType(IntEnum):
+    REGULAR_DATA = 0
+    PINGPONG = 1
+    LATENCY_MEASUREMENT = 2
+
+
 @app.websocket('/kubernetes-pod-ssh-proxy')
-async def kubernetes_pod_ssh_proxy(
-
+async def kubernetes_pod_ssh_proxy(
+        websocket: fastapi.WebSocket,
+        cluster_name: str,
+        client_version: Optional[int] = None) -> None:
     """Proxies SSH to the Kubernetes pod with websocket."""
     await websocket.accept()
     logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')

+    timestamps_supported = client_version is not None and client_version > 21
+    logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
+        client_version = {client_version}')
+
     # Run core.status in another thread to avoid blocking the event loop.
     with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
         cluster_records = await context_utils.to_thread_with_executor(
@@ -1868,6 +1887,42 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
     async def websocket_to_ssh():
         try:
             async for message in websocket.iter_bytes():
+                if timestamps_supported:
+                    type_size = struct.calcsize('!B')
+                    message_type = struct.unpack('!B',
+                                                 message[:type_size])[0]
+                    if (message_type ==
+                            KubernetesSSHMessageType.REGULAR_DATA):
+                        # Regular data - strip type byte and forward to SSH
+                        message = message[type_size:]
+                    elif message_type == KubernetesSSHMessageType.PINGPONG:
+                        # PING message - respond with PONG (type 1)
+                        ping_id_size = struct.calcsize('!I')
+                        if len(message) != type_size + ping_id_size:
+                            raise ValueError('Invalid PING message '
+                                             f'length: {len(message)}')
+                        # Return the same PING message, so that the client
+                        # can measure the latency.
+                        await websocket.send_bytes(message)
+                        continue
+                    elif (message_type ==
+                          KubernetesSSHMessageType.LATENCY_MEASUREMENT):
+                        # Latency measurement from client
+                        latency_size = struct.calcsize('!Q')
+                        if len(message) != type_size + latency_size:
+                            raise ValueError(
+                                'Invalid latency measurement '
+                                f'message length: {len(message)}')
+                        avg_latency_ms = struct.unpack(
+                            '!Q',
+                            message[type_size:type_size + latency_size])[0]
+                        latency_seconds = avg_latency_ms / 1000
+                        metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds)  # pylint: disable=line-too-long
+                        continue
+                    else:
+                        # Unknown message type.
+                        raise ValueError(
+                            f'Unknown message type: {message_type}')
                 writer.write(message)
                 try:
                     await writer.drain()
@@ -1898,6 +1953,11 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
                 nonlocal ssh_failed
                 ssh_failed = True
                 break
+            if timestamps_supported:
+                # Prepend message type byte (0 = regular data)
+                message_type_bytes = struct.pack(
+                    '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+                data = message_type_bytes + data
             await websocket.send_bytes(data)
         except Exception:  # pylint: disable=broad-except
             pass
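Taken together, the two hunks above define a small framing protocol for clients newer than protocol version 21: every websocket frame begins with a one-byte type (`!B`); PING frames append a 4-byte ping id (`!I`) and are echoed back verbatim; latency reports append an 8-byte average latency in milliseconds (`!Q`), which the server records in seconds. A sketch of the client side of this exchange, assuming only the frame layout shown above (the helper names are invented):

```python
# Client-side framing sketch; the byte layout mirrors the server hunks
# above, but these helper functions are hypothetical.
import struct

REGULAR_DATA, PINGPONG, LATENCY_MEASUREMENT = 0, 1, 2


def frame_data(payload: bytes) -> bytes:
    # Regular SSH bytes: type byte 0 + raw payload.
    return struct.pack('!B', REGULAR_DATA) + payload


def frame_ping(ping_id: int) -> bytes:
    # PING: type byte 1 + 4-byte id. The server echoes the frame back
    # unchanged, so the client can time the round trip.
    return struct.pack('!BI', PINGPONG, ping_id)


def frame_latency_report(avg_latency_ms: int) -> bytes:
    # Latency report: type byte 2 + 8-byte average in milliseconds; the
    # server divides by 1000 and observes it as a Prometheus histogram.
    return struct.pack('!BQ', LATENCY_MEASUREMENT, avg_latency_ms)
```

Note the lengths line up with the server-side checks: a PING frame is calcsize('!B') + calcsize('!I') = 5 bytes, and a latency report is 1 + 8 = 9 bytes.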
@@ -1937,7 +1997,7 @@ async def all_contexts(request: fastapi.Request) -> None:

     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.ALL_CONTEXTS,
         request_body=payloads.RequestBody(),
         func=core.get_all_contexts,
         schedule_type=requests_lib.ScheduleType.SHORT,
@@ -2051,7 +2111,6 @@ if __name__ == '__main__':
     # Serve metrics on a separate port to isolate it from the application APIs:
     # metrics port will not be exposed to the public network typically.
     parser.add_argument('--metrics-port', default=9090, type=int)
-    parser.add_argument('--start-with-python', action='store_true')
     cmd_args = parser.parse_args()
     if cmd_args.port == cmd_args.metrics_port:
         logger.error('port and metrics-port cannot be the same, exiting.')
@@ -2066,9 +2125,18 @@ if __name__ == '__main__':
         logger.error(f'Port {cmd_args.port} is not available, exiting.')
         raise RuntimeError(f'Port {cmd_args.port} is not available')

-
-
-
+    # Maybe touch the signal file on API server startup. Do it again here even
+    # if we already touched it in the sky/server/common.py::_start_api_server.
+    # This is because the sky/server/common.py::_start_api_server function call
+    # is running outside the skypilot API server process tree. The process tree
+    # starts within that function (see the `subprocess.Popen` call in
+    # sky/server/common.py::_start_api_server). When pg is used, the
+    # _start_api_server function will not load the config file from db, which
+    # will ignore the consolidation mode config. Here, inside the process tree,
+    # we already reload the config as a server (with env var _start_api_server),
+    # so we will respect the consolidation mode config.
+    # Refers to #7717 for more details.
+    managed_job_utils.is_consolidation_mode(on_api_restart=True)

     # Show the privacy policy if it is not already shown. We place it here so
     # that it is shown only when the API server is started.
sky/skylet/constants.py
CHANGED

@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '
+SKYLET_VERSION = '25'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
@@ -422,6 +422,8 @@ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
     # but the configs won't be applied)
     ('jobs', 'controller', 'consolidation_mode'),
     ('serve', 'controller', 'consolidation_mode'),
+    ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
+    ('jobs', 'controller', 'task_logs_gc_retention_hours'),
 ]

 # Constants for Azure blob storage
@@ -548,3 +550,6 @@ ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +

 ARM64_ARCH = 'arm64'
 X86_64_ARCH = 'x86_64'
+
+SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
+    f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
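The new SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR gives users an opt-out for the websocket latency measurement described in the server.py section above. Only the constant appears in this diff; how the client consults it is not shown. A hedged sketch of a plausible gate:

```python
# Sketch only: the constant is in the diff, this consumer is assumed.
import os

from sky.skylet import constants


def latency_measurement_enabled() -> bool:
    value = os.environ.get(
        constants.SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR, '')
    # Treat 'true'/'1' as "disabled".
    return value.lower() not in ('true', '1')
```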
sky/skylet/events.py
CHANGED

@@ -326,8 +326,15 @@ class AutostopEvent(SkyletEvent):
         cluster_name_on_cloud = cluster_config['cluster_name']
         is_cluster_multinode = cluster_config['max_workers'] > 0

+        # Clear AWS credentials from environment to force boto3 to use IAM
+        # role attached to the instance (lowest priority in credential chain).
+        # This allows the cluster to stop/terminate itself using its IAM role.
         os.environ.pop('AWS_ACCESS_KEY_ID', None)
         os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
+        os.environ.pop('AWS_SESSION_TOKEN', None)
+        # Point boto3 to /dev/null to skip reading credentials from files.
+        os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
+        os.environ['AWS_CONFIG_FILE'] = '/dev/null'

         # Stop the ray autoscaler to avoid scaling up, during
         # stopping/terminating of the cluster.
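boto3 resolves credentials in a fixed order: explicit parameters, environment variables, shared credential/config files, and finally instance-profile (IAM role) metadata. Popping the three environment variables and pointing both file paths at /dev/null therefore leaves only the instance's IAM role, which is exactly what a cluster stopping itself should use. A quick, hedged way to confirm the fallback on an EC2 instance:

```python
# Verification sketch (not part of the diff): after scrubbing the
# environment, boto3 should resolve instance-profile credentials.
import os

import boto3

for var in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY',
            'AWS_SESSION_TOKEN'):
    os.environ.pop(var, None)
os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
os.environ['AWS_CONFIG_FILE'] = '/dev/null'

creds = boto3.session.Session().get_credentials()
# On an instance with an attached role this is typically 'iam-role';
# elsewhere get_credentials() returns None.
print(creds.method if creds is not None else 'no credentials resolved')
```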
sky/skylet/services.py
CHANGED

@@ -407,7 +407,9 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
         context: grpc.ServicerContext
     ) -> managed_jobsv1_pb2.GetJobTableResponse:
         try:
-            accessible_workspaces =
+            accessible_workspaces = (
+                list(request.accessible_workspaces.workspaces)
+                if request.HasField('accessible_workspaces') else None)
             job_ids = (list(request.job_ids.ids)
                        if request.HasField('job_ids') else None)
             user_hashes: Optional[List[Optional[str]]] = None
@@ -419,6 +421,8 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
                     user_hashes.append(None)
             statuses = (list(request.statuses.statuses)
                         if request.HasField('statuses') else None)
+            fields = (list(request.fields.fields)
+                      if request.HasField('fields') else None)
             job_queue = managed_job_utils.get_managed_job_queue(
                 skip_finished=request.skip_finished,
                 accessible_workspaces=accessible_workspaces,
@@ -432,7 +436,9 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
                 page=request.page if request.HasField('page') else None,
                 limit=request.limit if request.HasField('limit') else None,
                 user_hashes=user_hashes,
-                statuses=statuses
+                statuses=statuses,
+                fields=fields,
+            )
             jobs = job_queue['jobs']
             total = job_queue['total']
             total_no_filter = job_queue['total_no_filter']
@@ -440,7 +446,16 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer

             jobs_info = []
             for job in jobs:
+                converted_metadata = None
+                metadata = job.get('metadata')
+                if metadata:
+                    converted_metadata = {
+                        k: v for k, v in metadata.items() if v is not None
+                    }
                 job_info = managed_jobsv1_pb2.ManagedJobInfo(
+                    # The `spot.job_id`, which can be used to identify
+                    # different tasks for the same job
+                    _job_id=job.get('_job_id'),
                     job_id=job.get('job_id'),
                     task_id=job.get('task_id'),
                     job_name=job.get('job_name'),
@@ -468,11 +483,7 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
                     end_at=job.get('end_at'),
                     user_yaml=job.get('user_yaml'),
                     entrypoint=job.get('entrypoint'),
-                    metadata={
-                        k: v
-                        for k, v in job.get('metadata', {}).items()
-                        if v is not None
-                    },
+                    metadata=converted_metadata,
                     pool=job.get('pool'),
                     pool_hash=job.get('pool_hash'))
                 jobs_info.append(job_info)
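The refactor hoists the None-filtering of job metadata out of the ManagedJobInfo constructor: protobuf map<string, string> fields cannot hold None values, and an empty or missing dict now leaves the field unset (None) rather than passing an empty map. A small illustration with made-up values:

```python
# Made-up metadata keys; shows the conversion performed before the dict
# reaches the protobuf map<string, string> field.
metadata = {'git_commit': 'abc123', 'failure_reason': None}
converted_metadata = None
if metadata:
    converted_metadata = {k: v for k, v in metadata.items() if v is not None}
assert converted_metadata == {'git_commit': 'abc123'}

# Falsy metadata stays None, leaving the proto field unset.
metadata = {}
converted_metadata = None
if metadata:
    converted_metadata = {k: v for k, v in metadata.items() if v is not None}
assert converted_metadata is None
```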
sky/ssh_node_pools/server.py
CHANGED

@@ -7,6 +7,7 @@ import fastapi
 from sky import core as sky_core
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import request_names
 from sky.server.requests import requests as requests_lib
 from sky.ssh_node_pools import core as ssh_node_pools_core
 from sky.utils import common_utils
@@ -101,7 +102,7 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
     ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
         request_body=ssh_up_body,
         func=sky_core.ssh_up,
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -126,7 +127,7 @@ async def deploy_ssh_node_pool_general(
     try:
         await executor.schedule_request_async(
             request_id=request.state.request_id,
-            request_name=
+            request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
             request_body=ssh_up_body,
             func=sky_core.ssh_up,
             schedule_type=requests_lib.ScheduleType.LONG,
@@ -152,7 +153,7 @@ async def down_ssh_node_pool(request: fastapi.Request,
     ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
         request_body=ssh_up_body,
         func=sky_core.ssh_up,  # Reuse ssh_up function with cleanup=True
         schedule_type=requests_lib.ScheduleType.LONG,
@@ -180,7 +181,7 @@ async def down_ssh_node_pool_general(
     ssh_up_body.cleanup = True
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
         request_body=ssh_up_body,
         func=sky_core.ssh_up,  # Reuse ssh_up function with cleanup=True
         schedule_type=requests_lib.ScheduleType.LONG,
sky/task.py
CHANGED

@@ -1,6 +1,5 @@
 """Task: a coarse-grained stage in an application."""
 import collections
-import inspect
 import json
 import os
 import re
@@ -29,10 +28,6 @@ from sky.utils import yaml_utils

 logger = sky_logging.init_logger(__name__)

-# A lambda generating commands (node rank_i, node addrs -> cmd_i).
-CommandGen = Callable[[int, List[str]], Optional[str]]
-CommandOrCommandGen = Union[str, CommandGen]
-
 _VALID_NAME_REGEX = '[a-zA-Z0-9]+(?:[._-]{1,2}[a-zA-Z0-9]+)*'
 _VALID_NAME_DESCR = ('ASCII characters and may contain lowercase and'
                      ' uppercase letters, digits, underscores, periods,'
@@ -236,7 +231,7 @@ class Task:
                  name: Optional[str] = None,
                  *,
                  setup: Optional[Union[str, List[str]]] = None,
-                 run: Optional[Union[
+                 run: Optional[Union[str, List[str]]] = None,
                  envs: Optional[Dict[str, str]] = None,
                  secrets: Optional[Dict[str, str]] = None,
                  workdir: Optional[Union[str, Dict[str, Any]]] = None,
@@ -349,7 +344,7 @@ class Task:
         self._volumes = volumes or {}

         # concatenate commands if given as list
-        def _concat(commands):
+        def _concat(commands: Optional[Union[str, List[str]]]) -> Optional[str]:
             if isinstance(commands, list):
                 return '\n'.join(commands)
             return commands
@@ -447,42 +442,9 @@ class Task:

     def validate_run(self):
         """Validates if the run command is valid."""
-        if
-            run_sig = inspect.signature(self.run)
-            # Check that run is a function with 2 arguments.
-            if len(run_sig.parameters) != 2:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(_RUN_FN_CHECK_FAIL_MSG.format(run_sig))
-
-            type_list = [int, List[str]]
-            # Check annotations, if exists
-            for i, param in enumerate(run_sig.parameters.values()):
-                if param.annotation != inspect.Parameter.empty:
-                    if param.annotation != type_list[i]:
-                        with ux_utils.print_exception_no_traceback():
-                            raise ValueError(
-                                _RUN_FN_CHECK_FAIL_MSG.format(run_sig))
-
-            # Check self containedness.
-            run_closure = inspect.getclosurevars(self.run)
-            if run_closure.nonlocals:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'run command generator must be self contained. '
-                        f'Found nonlocals: {run_closure.nonlocals}')
-            if run_closure.globals:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'run command generator must be self contained. '
-                        f'Found globals: {run_closure.globals}')
-            if run_closure.unbound:
-                # Do not raise an error here. Import statements, which are
-                # allowed, will be considered as unbounded.
-                pass
-        elif self.run is not None and not isinstance(self.run, str):
+        if self.run is not None and not isinstance(self.run, str):
             with ux_utils.print_exception_no_traceback():
-                raise ValueError('run must be
-                                 f'a command generator ({CommandGen}). '
+                raise ValueError('run must be a shell script (str). '
                                  f'Got {type(self.run)}')

     def expand_and_validate_file_mounts(self):
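With the command-generator path removed, `run` accepts only a shell script string or a list of command strings; the constructor's _concat helper joins a list with newlines, and validate_run now rejects anything that is not a string. The two remaining forms:

```python
import sky

# A single shell script string...
task_a = sky.Task(name='train', run='python train.py --epochs 10')

# ...or a list of commands, joined with newlines by _concat().
task_b = sky.Task(name='train',
                  run=['echo starting', 'python train.py --epochs 10'])
```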
sky/templates/kubernetes-ray.yml.j2
CHANGED

@@ -1059,7 +1059,7 @@ available_node_types:
     # Also, skip the jobs that are waiting to be scheduled as those does not have a controller process running.
     # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
     # will delete the service from the database after it is terminated so everything in the database is running.
-    ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs = state.
+    ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
     if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
         read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
     fi