skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (114)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/backends/backend_utils.py +9 -6
  5. sky/backends/cloud_vm_ray_backend.py +2 -3
  6. sky/check.py +25 -13
  7. sky/client/cli/command.py +52 -24
  8. sky/cloud_stores.py +73 -0
  9. sky/clouds/aws.py +59 -11
  10. sky/core.py +7 -5
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
  13. sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
  15. sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
  28. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  29. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  30. sky/dashboard/out/clusters/[cluster].html +1 -1
  31. sky/dashboard/out/clusters.html +1 -1
  32. sky/dashboard/out/config.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra/[context].html +1 -1
  35. sky/dashboard/out/infra.html +1 -1
  36. sky/dashboard/out/jobs/[job].html +1 -1
  37. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  38. sky/dashboard/out/jobs.html +1 -1
  39. sky/dashboard/out/users.html +1 -1
  40. sky/dashboard/out/volumes.html +1 -1
  41. sky/dashboard/out/workspace/new.html +1 -1
  42. sky/dashboard/out/workspaces/[name].html +1 -1
  43. sky/dashboard/out/workspaces.html +1 -1
  44. sky/data/data_utils.py +92 -1
  45. sky/data/mounting_utils.py +71 -2
  46. sky/data/storage.py +166 -9
  47. sky/global_user_state.py +14 -18
  48. sky/jobs/constants.py +2 -0
  49. sky/jobs/controller.py +62 -67
  50. sky/jobs/file_content_utils.py +80 -0
  51. sky/jobs/log_gc.py +201 -0
  52. sky/jobs/scheduler.py +15 -2
  53. sky/jobs/server/core.py +85 -13
  54. sky/jobs/server/server.py +14 -13
  55. sky/jobs/server/utils.py +28 -10
  56. sky/jobs/state.py +216 -40
  57. sky/jobs/utils.py +65 -28
  58. sky/metrics/utils.py +18 -0
  59. sky/optimizer.py +1 -1
  60. sky/provision/kubernetes/instance.py +88 -19
  61. sky/provision/kubernetes/volume.py +2 -2
  62. sky/schemas/api/responses.py +3 -5
  63. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  64. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  65. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  66. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  67. sky/serve/replica_managers.py +2 -2
  68. sky/serve/serve_utils.py +9 -2
  69. sky/serve/server/server.py +8 -7
  70. sky/server/common.py +21 -15
  71. sky/server/constants.py +1 -1
  72. sky/server/daemons.py +23 -17
  73. sky/server/requests/executor.py +7 -3
  74. sky/server/requests/payloads.py +2 -0
  75. sky/server/requests/request_names.py +80 -0
  76. sky/server/requests/requests.py +137 -102
  77. sky/server/requests/serializers/decoders.py +0 -6
  78. sky/server/requests/serializers/encoders.py +33 -6
  79. sky/server/server.py +105 -36
  80. sky/server/stream_utils.py +56 -13
  81. sky/setup_files/dependencies.py +2 -0
  82. sky/skylet/constants.py +6 -1
  83. sky/skylet/events.py +7 -0
  84. sky/skylet/services.py +18 -7
  85. sky/ssh_node_pools/server.py +5 -4
  86. sky/task.py +14 -42
  87. sky/templates/kubernetes-ray.yml.j2 +1 -1
  88. sky/templates/nebius-ray.yml.j2 +1 -0
  89. sky/templates/websocket_proxy.py +140 -12
  90. sky/users/permission.py +4 -1
  91. sky/utils/cli_utils/status_utils.py +8 -2
  92. sky/utils/context_utils.py +13 -1
  93. sky/utils/db/migration_utils.py +1 -1
  94. sky/utils/resource_checker.py +4 -1
  95. sky/utils/resources_utils.py +53 -29
  96. sky/utils/schemas.py +23 -4
  97. sky/volumes/server/server.py +4 -3
  98. sky/workspaces/server.py +7 -6
  99. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
  100. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
  101. sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
  102. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
  108. sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
  109. /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
  110. /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
  111. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
  112. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
  113. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
  114. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/server/server.py CHANGED
@@ -6,6 +6,7 @@ import base64
  from concurrent.futures import ThreadPoolExecutor
  import contextlib
  import datetime
+ from enum import IntEnum
  import hashlib
  import json
  import multiprocessing
@@ -15,6 +16,7 @@ import posixpath
  import re
  import resource
  import shutil
+ import struct
  import sys
  import threading
  import traceback
@@ -25,6 +27,7 @@ import zipfile
  import aiofiles
  import anyio
  import fastapi
+ from fastapi import responses as fastapi_responses
  from fastapi.middleware import cors
  import starlette.middleware.base
  import uvloop
@@ -61,6 +64,7 @@ from sky.server.auth import oauth2_proxy
  from sky.server.requests import executor
  from sky.server.requests import payloads
  from sky.server.requests import preconditions
+ from sky.server.requests import request_names
  from sky.server.requests import requests as requests_lib
  from sky.skylet import constants
  from sky.ssh_node_pools import server as ssh_node_pools_rest
@@ -459,7 +463,7 @@ async def schedule_on_boot_check_async():
  try:
  await executor.schedule_request_async(
  request_id='skypilot-server-on-boot-check',
- request_name='check',
+ request_name=request_names.RequestName.CHECK,
  request_body=payloads.CheckBody(),
  func=sky_check.check,
  schedule_type=requests_lib.ScheduleType.SHORT,
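The hard-coded request-name strings throughout server.py are replaced with members of a RequestName enum from the new sky/server/requests/request_names.py module (+80 lines, not included in this diff). A minimal sketch of what such an enum could look like is shown below; the member values are assumptions inferred from the old string literals, not the actual module contents.

    # Hypothetical sketch of sky/server/requests/request_names.py; the real
    # module is not shown in this diff, so the values here are assumptions.
    import enum


    class RequestName(str, enum.Enum):
        """Canonical names for API server requests."""
        CHECK = 'check'
        ENABLED_CLOUDS = 'enabled_clouds'
        CLUSTER_LAUNCH = 'launch'
        CLUSTER_STATUS = 'status'
        API_CANCEL = 'api_cancel'
        # ...one member per endpoint, as referenced throughout server.py.

        def __str__(self) -> str:
            return self.value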
@@ -731,7 +735,7 @@ async def check(request: fastapi.Request,
  """Checks enabled clouds."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='check',
+ request_name=request_names.RequestName.CHECK,
  request_body=check_body,
  func=sky_check.check,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -745,7 +749,7 @@ async def enabled_clouds(request: fastapi.Request,
  """Gets enabled clouds on the server."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='enabled_clouds',
+ request_name=request_names.RequestName.ENABLED_CLOUDS,
  request_body=payloads.EnabledCloudsBody(workspace=workspace,
  expand=expand),
  func=core.enabled_clouds,
@@ -761,7 +765,8 @@ async def realtime_kubernetes_gpu_availability(
  """Gets real-time Kubernetes GPU availability."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='realtime_kubernetes_gpu_availability',
+ request_name=request_names.RequestName.
+ REALTIME_KUBERNETES_GPU_AVAILABILITY,
  request_body=realtime_gpu_availability_body,
  func=core.realtime_kubernetes_gpu_availability,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -776,7 +781,7 @@ async def kubernetes_node_info(
  """Gets Kubernetes nodes information and hints."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='kubernetes_node_info',
+ request_name=request_names.RequestName.KUBERNETES_NODE_INFO,
  request_body=kubernetes_node_info_body,
  func=kubernetes_utils.get_kubernetes_node_info,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -788,7 +793,7 @@ async def status_kubernetes(request: fastapi.Request) -> None:
  """Gets Kubernetes status."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='status_kubernetes',
+ request_name=request_names.RequestName.STATUS_KUBERNETES,
  request_body=payloads.RequestBody(),
  func=core.status_kubernetes,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -802,7 +807,7 @@ async def list_accelerators(
  """Gets list of accelerators from cloud catalog."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='list_accelerators',
+ request_name=request_names.RequestName.LIST_ACCELERATORS,
  request_body=list_accelerator_counts_body,
  func=catalog.list_accelerators,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -817,7 +822,7 @@ async def list_accelerator_counts(
  """Gets list of accelerator counts from cloud catalog."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='list_accelerator_counts',
+ request_name=request_names.RequestName.LIST_ACCELERATOR_COUNTS,
  request_body=list_accelerator_counts_body,
  func=catalog.list_accelerator_counts,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -874,7 +879,7 @@ async def optimize(optimize_body: payloads.OptimizeBody,
  """Optimizes the user's DAG."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='optimize',
+ request_name=request_names.RequestName.OPTIMIZE,
  request_body=optimize_body,
  ignore_return_value=True,
  func=core.optimize,
@@ -1084,7 +1089,7 @@ async def launch(launch_body: payloads.LaunchBody,
  logger.info(f'Launching request: {request_id}')
  await executor.schedule_request_async(
  request_id,
- request_name='launch',
+ request_name=request_names.RequestName.CLUSTER_LAUNCH,
  request_body=launch_body,
  func=execution.launch,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1100,7 +1105,7 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
  cluster_name = exec_body.cluster_name
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='exec',
+ request_name=request_names.RequestName.CLUSTER_EXEC,
  request_body=exec_body,
  func=execution.exec,
  precondition=preconditions.ClusterStartCompletePrecondition(
@@ -1118,7 +1123,7 @@ async def stop(request: fastapi.Request,
  """Stops a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='stop',
+ request_name=request_names.RequestName.CLUSTER_STOP,
  request_body=stop_body,
  func=core.stop,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1138,7 +1143,7 @@ async def status(
  detail='Server is shutting down, please try again later.')
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='status',
+ request_name=request_names.RequestName.CLUSTER_STATUS,
  request_body=status_body,
  func=core.status,
  schedule_type=(requests_lib.ScheduleType.LONG if
@@ -1153,7 +1158,7 @@ async def endpoints(request: fastapi.Request,
  """Gets the endpoint for a given cluster and port number (endpoint)."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='endpoints',
+ request_name=request_names.RequestName.CLUSTER_ENDPOINTS,
  request_body=endpoint_body,
  func=core.endpoints,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1167,7 +1172,7 @@ async def down(request: fastapi.Request,
  """Tears down a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='down',
+ request_name=request_names.RequestName.CLUSTER_DOWN,
  request_body=down_body,
  func=core.down,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1181,7 +1186,7 @@ async def start(request: fastapi.Request,
  """Restarts a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='start',
+ request_name=request_names.RequestName.CLUSTER_START,
  request_body=start_body,
  func=core.start,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1195,7 +1200,7 @@ async def autostop(request: fastapi.Request,
  """Schedules an autostop/autodown for a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='autostop',
+ request_name=request_names.RequestName.CLUSTER_AUTOSTOP,
  request_body=autostop_body,
  func=core.autostop,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1209,7 +1214,7 @@ async def queue(request: fastapi.Request,
  """Gets the job queue of a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='queue',
+ request_name=request_names.RequestName.CLUSTER_QUEUE,
  request_body=queue_body,
  func=core.queue,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1223,7 +1228,7 @@ async def job_status(request: fastapi.Request,
  """Gets the status of a job."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='job_status',
+ request_name=request_names.RequestName.CLUSTER_JOB_STATUS,
  request_body=job_status_body,
  func=core.job_status,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1237,7 +1242,7 @@ async def cancel(request: fastapi.Request,
  """Cancels jobs on a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='cancel',
+ request_name=request_names.RequestName.CLUSTER_JOB_CANCEL,
  request_body=cancel_body,
  func=core.cancel,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1257,7 +1262,7 @@ async def logs(
  executor.check_request_thread_executor_available()
  request_task = await executor.prepare_request_async(
  request_id=request.state.request_id,
- request_name='logs',
+ request_name=request_names.RequestName.CLUSTER_JOB_LOGS,
  request_body=cluster_job_body,
  func=core.tail_logs,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1288,7 +1293,7 @@ async def download_logs(
  cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='download_logs',
+ request_name=request_names.RequestName.CLUSTER_JOB_DOWNLOAD_LOGS,
  request_body=cluster_jobs_body,
  func=core.download_logs,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1439,7 +1444,7 @@ async def cost_report(request: fastapi.Request,
  """Gets the cost report of a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='cost_report',
+ request_name=request_names.RequestName.CLUSTER_COST_REPORT,
  request_body=cost_report_body,
  func=core.cost_report,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1451,7 +1456,7 @@ async def storage_ls(request: fastapi.Request) -> None:
  """Gets the storages."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='storage_ls',
+ request_name=request_names.RequestName.STORAGE_LS,
  request_body=payloads.RequestBody(),
  func=core.storage_ls,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1464,7 +1469,7 @@ async def storage_delete(request: fastapi.Request,
  """Deletes a storage."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='storage_delete',
+ request_name=request_names.RequestName.STORAGE_DELETE,
  request_body=storage_body,
  func=core.storage_delete,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1477,7 +1482,7 @@ async def local_up(request: fastapi.Request,
  """Launches a Kubernetes cluster on API server."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='local_up',
+ request_name=request_names.RequestName.LOCAL_UP,
  request_body=local_up_body,
  func=core.local_up,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1490,7 +1495,7 @@ async def local_down(request: fastapi.Request,
  """Tears down the Kubernetes cluster started by local_up."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='local_down',
+ request_name=request_names.RequestName.LOCAL_DOWN,
  request_body=local_down_body,
  func=core.local_down,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1512,7 +1517,7 @@ async def get_expanded_request_id(request_id: str) -> str:


  # === API server related APIs ===
- @app.get('/api/get')
+ @app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
  async def api_get(request_id: str) -> payloads.RequestPayload:
  """Gets a request with a given request ID prefix."""
  # Validate request_id prefix matches a single request.
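The /api/get route now returns a fastapi_responses.ORJSONResponse, which serializes the payload with orjson (added to install_requires later in this diff) rather than the standard library json encoder; orjson is typically much faster for large request payloads. A minimal standalone sketch of the pattern follows; the handler body is hypothetical and unrelated to SkyPilot's actual /api/get logic.

    # Minimal sketch of a route served with ORJSONResponse.
    import fastapi
    from fastapi import responses as fastapi_responses

    app = fastapi.FastAPI()


    @app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
    async def api_get(request_id: str) -> dict:
        # FastAPI encodes the returned dict with orjson instead of json.dumps.
        return {'request_id': request_id, 'status': 'SUCCEEDED'}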
@@ -1698,7 +1703,7 @@ async def api_cancel(request: fastapi.Request,
  """Cancels requests."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='api_cancel',
+ request_name=request_names.RequestName.API_CANCEL,
  request_body=request_cancel_body,
  func=requests_lib.kill_requests_with_prefix,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1803,16 +1808,31 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
  basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
  'false').lower() == 'true',
  user=user if user is not None else None,
+ service_account_token_enabled=(os.environ.get(
+ constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
+ 'false').lower() == 'true'),
  )


+ class KubernetesSSHMessageType(IntEnum):
+ REGULAR_DATA = 0
+ PINGPONG = 1
+ LATENCY_MEASUREMENT = 2
+
+
  @app.websocket('/kubernetes-pod-ssh-proxy')
- async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
- cluster_name: str) -> None:
+ async def kubernetes_pod_ssh_proxy(
+ websocket: fastapi.WebSocket,
+ cluster_name: str,
+ client_version: Optional[int] = None) -> None:
  """Proxies SSH to the Kubernetes pod with websocket."""
  await websocket.accept()
  logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')

+ timestamps_supported = client_version is not None and client_version > 21
+ logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
+ client_version = {client_version}')
+
  # Run core.status in another thread to avoid blocking the event loop.
  with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
  cluster_records = await context_utils.to_thread_with_executor(
@@ -1867,6 +1887,42 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
  async def websocket_to_ssh():
  try:
  async for message in websocket.iter_bytes():
+ if timestamps_supported:
+ type_size = struct.calcsize('!B')
+ message_type = struct.unpack('!B',
+ message[:type_size])[0]
+ if (message_type ==
+ KubernetesSSHMessageType.REGULAR_DATA):
+ # Regular data - strip type byte and forward to SSH
+ message = message[type_size:]
+ elif message_type == KubernetesSSHMessageType.PINGPONG:
+ # PING message - respond with PONG (type 1)
+ ping_id_size = struct.calcsize('!I')
+ if len(message) != type_size + ping_id_size:
+ raise ValueError('Invalid PING message '
+ f'length: {len(message)}')
+ # Return the same PING message, so that the client
+ # can measure the latency.
+ await websocket.send_bytes(message)
+ continue
+ elif (message_type ==
+ KubernetesSSHMessageType.LATENCY_MEASUREMENT):
+ # Latency measurement from client
+ latency_size = struct.calcsize('!Q')
+ if len(message) != type_size + latency_size:
+ raise ValueError(
+ 'Invalid latency measurement '
+ f'message length: {len(message)}')
+ avg_latency_ms = struct.unpack(
+ '!Q',
+ message[type_size:type_size + latency_size])[0]
+ latency_seconds = avg_latency_ms / 1000
+ metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds) # pylint: disable=line-too-long
+ continue
+ else:
+ # Unknown message type.
+ raise ValueError(
+ f'Unknown message type: {message_type}')
  writer.write(message)
  try:
  await writer.drain()
@@ -1897,6 +1953,11 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
  nonlocal ssh_failed
  ssh_failed = True
  break
+ if timestamps_supported:
+ # Prepend message type byte (0 = regular data)
+ message_type_bytes = struct.pack(
+ '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+ data = message_type_bytes + data
  await websocket.send_bytes(data)
  except Exception: # pylint: disable=broad-except
  pass
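When the client advertises a recent enough version, every frame on the SSH proxy websocket is prefixed with a one-byte message type: 0 for regular SSH data, 1 for a ping that the server echoes back so the client can measure round-trip latency, and 2 for the client's averaged latency report (an unsigned 64-bit integer in milliseconds, observed into metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS). A rough client-side sketch of this framing, using the same struct formats as the server code above; the helper names are illustrative only.

    # Illustrative client-side framing for the kubernetes-pod-ssh-proxy
    # websocket protocol shown above. Helper names are hypothetical.
    import struct

    REGULAR_DATA = 0
    PINGPONG = 1
    LATENCY_MEASUREMENT = 2


    def frame_data(payload: bytes) -> bytes:
        """Prefix regular SSH data with the type byte (0)."""
        return struct.pack('!B', REGULAR_DATA) + payload


    def frame_ping(ping_id: int) -> bytes:
        """Build a 5-byte ping frame; the server echoes it back unchanged."""
        return struct.pack('!BI', PINGPONG, ping_id)


    def frame_latency_report(avg_latency_ms: int) -> bytes:
        """Report the measured average latency in milliseconds (9 bytes)."""
        return struct.pack('!BQ', LATENCY_MEASUREMENT, avg_latency_ms)


    def unframe(message: bytes) -> bytes:
        """Strip the type byte from a frame received from the server."""
        message_type = struct.unpack('!B', message[:1])[0]
        if message_type != REGULAR_DATA:
            raise ValueError(f'Unexpected message type: {message_type}')
        return message[1:]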
@@ -1936,7 +1997,7 @@ async def all_contexts(request: fastapi.Request) -> None:

  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='all_contexts',
+ request_name=request_names.RequestName.ALL_CONTEXTS,
  request_body=payloads.RequestBody(),
  func=core.get_all_contexts,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -2050,7 +2111,6 @@ if __name__ == '__main__':
  # Serve metrics on a separate port to isolate it from the application APIs:
  # metrics port will not be exposed to the public network typically.
  parser.add_argument('--metrics-port', default=9090, type=int)
- parser.add_argument('--start-with-python', action='store_true')
  cmd_args = parser.parse_args()
  if cmd_args.port == cmd_args.metrics_port:
  logger.error('port and metrics-port cannot be the same, exiting.')
@@ -2065,9 +2125,18 @@ if __name__ == '__main__':
  logger.error(f'Port {cmd_args.port} is not available, exiting.')
  raise RuntimeError(f'Port {cmd_args.port} is not available')

- if not cmd_args.start_with_python:
- # Maybe touch the signal file on API server startup.
- managed_job_utils.is_consolidation_mode(on_api_restart=True)
+ # Maybe touch the signal file on API server startup. Do it again here even
+ # if we already touched it in the sky/server/common.py::_start_api_server.
+ # This is because the sky/server/common.py::_start_api_server function call
+ # is running outside the skypilot API server process tree. The process tree
+ # starts within that function (see the `subprocess.Popen` call in
+ # sky/server/common.py::_start_api_server). When pg is used, the
+ # _start_api_server function will not load the config file from db, which
+ # will ignore the consolidation mode config. Here, inside the process tree,
+ # we already reload the config as a server (with env var _start_api_server),
+ # so we will respect the consolidation mode config.
+ # Refers to #7717 for more details.
+ managed_job_utils.is_consolidation_mode(on_api_restart=True)

  # Show the privacy policy if it is not already shown. We place it here so
  # that it is shown only when the API server is started.
sky/server/stream_utils.py CHANGED
@@ -25,6 +25,8 @@ logger = sky_logging.init_logger(__name__)
  _BUFFER_SIZE = 8 * 1024 # 8KB
  _BUFFER_TIMEOUT = 0.02 # 20ms
  _HEARTBEAT_INTERVAL = 30
+ _READ_CHUNK_SIZE = 256 * 1024 # 256KB chunks for file reading
+
  # If a SHORT request has been stuck in pending for
  # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
  _SHORT_REQUEST_SPINNER_TIMEOUT = 2
@@ -235,6 +237,9 @@ async def _tail_log_file(
  buffer_bytes = 0
  last_flush_time = asyncio.get_event_loop().time()

+ # Read file in chunks instead of line-by-line for better performance
+ incomplete_line = b'' # Buffer for incomplete lines across chunks
+
  async def flush_buffer() -> AsyncGenerator[str, None]:
  nonlocal buffer, buffer_bytes, last_flush_time
  if buffer:
@@ -255,8 +260,23 @@
  async for chunk in flush_buffer():
  yield chunk

- line: Optional[bytes] = await f.readline()
- if not line:
+ # Read file in chunks for better I/O performance
+ file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
+ if not file_chunk:
+ # Process any remaining incomplete line
+ if incomplete_line:
+ line_str = incomplete_line.decode('utf-8')
+ if plain_logs:
+ is_payload, line_str = message_utils.decode_payload(
+ line_str, raise_for_mismatch=False)
+ if not is_payload:
+ buffer.append(line_str)
+ buffer_bytes += len(line_str.encode('utf-8'))
+ else:
+ buffer.append(line_str)
+ buffer_bytes += len(line_str.encode('utf-8'))
+ incomplete_line = b''
+
  # Avoid checking the status too frequently to avoid overloading the
  # DB.
  should_check_status = (current_time -
@@ -328,16 +348,39 @@
  # performance but it helps avoid unnecessary heartbeat strings
  # being printed when the client runs in an old version.
  last_heartbeat_time = asyncio.get_event_loop().time()
- line_str = line.decode('utf-8')
- if plain_logs:
- is_payload, line_str = message_utils.decode_payload(
- line_str, raise_for_mismatch=False)
- # TODO(aylei): implement heartbeat mechanism for plain logs,
- # sending invisible characters might be okay.
- if is_payload:
- continue
- buffer.append(line_str)
- buffer_bytes += len(line_str.encode('utf-8'))
+
+ # Combine with any incomplete line from previous chunk
+ file_chunk = incomplete_line + file_chunk
+ incomplete_line = b''
+
+ # Split chunk into lines, preserving line structure
+ lines_bytes = file_chunk.split(b'\n')
+
+ # If chunk doesn't end with newline, the last element is incomplete
+ if file_chunk and not file_chunk.endswith(b'\n'):
+ incomplete_line = lines_bytes[-1]
+ lines_bytes = lines_bytes[:-1]
+ else:
+ # If ends with \n, split creates an empty last element we should
+ # ignore
+ if lines_bytes and lines_bytes[-1] == b'':
+ lines_bytes = lines_bytes[:-1]
+
+ # Process all complete lines in this chunk
+ for line_bytes in lines_bytes:
+ # Reconstruct line with newline (since split removed it)
+ line_str = line_bytes.decode('utf-8') + '\n'
+
+ if plain_logs:
+ is_payload, line_str = message_utils.decode_payload(
+ line_str, raise_for_mismatch=False)
+ # TODO(aylei): implement heartbeat mechanism for plain logs,
+ # sending invisible characters might be okay.
+ if is_payload:
+ continue
+
+ buffer.append(line_str)
+ buffer_bytes += len(line_str.encode('utf-8'))

  # Flush remaining lines in the buffer.
  async for chunk in flush_buffer():
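The log streamer now reads the file in 256 KB chunks instead of line by line, carrying any trailing partial line over to the next read in incomplete_line. The carry-over logic can be seen in isolation in the following sketch; the helper name is illustrative, not part of the diff.

    # Illustrative, self-contained version of the chunk-splitting logic above:
    # split a chunk into complete lines and carry any trailing partial line
    # over to the next call. The function name is hypothetical.
    from typing import List, Tuple


    def split_chunk(carry: bytes, chunk: bytes) -> Tuple[List[bytes], bytes]:
        """Returns (complete_lines, new_carry) for a freshly read chunk."""
        data = carry + chunk
        lines = data.split(b'\n')
        if data and not data.endswith(b'\n'):
            # Last element is an incomplete line; keep it for the next chunk.
            return lines[:-1], lines[-1]
        # A trailing newline leaves an empty last element after split().
        if lines and lines[-1] == b'':
            lines = lines[:-1]
        return lines, b''


    # Example: two reads that split a line across the chunk boundary.
    lines, carry = split_chunk(b'', b'first line\nsecond ')
    assert lines == [b'first line'] and carry == b'second '
    lines, carry = split_chunk(carry, b'half\n')
    assert lines == [b'second half'] and carry == b''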
@@ -373,7 +416,7 @@ def stream_response(
  async def on_disconnect():
  logger.info(f'User terminated the connection for request '
  f'{request_id}')
- requests_lib.kill_requests([request_id])
+ await requests_lib.kill_request_async(request_id)

  # The background task will be run after returning a response.
  # https://fastapi.tiangolo.com/tutorial/background-tasks/
sky/setup_files/dependencies.py CHANGED
@@ -49,6 +49,7 @@ install_requires = [
  # <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
  'pyyaml > 3.13, != 5.4.*',
  'ijson',
+ 'orjson',
  'requests',
  # SkyPilot inherits from uvicorn.Server to customize the behavior of
  # uvicorn, so we need to pin uvicorn version to avoid potential break
@@ -187,6 +188,7 @@ cloud_dependencies: Dict[str, List[str]] = {
  'docker': ['docker'] + local_ray,
  'lambda': [], # No dependencies needed for lambda
  'cloudflare': aws_dependencies,
+ 'coreweave': aws_dependencies,
  'scp': local_ray,
  'oci': ['oci'],
  # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
sky/skylet/constants.py CHANGED
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
  # cluster yaml is updated.
  #
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
- SKYLET_VERSION = '23'
+ SKYLET_VERSION = '25'
  # The version of the lib files that skylet/jobs use. Whenever there is an API
  # change for the job_lib or log_lib, we need to bump this version, so that the
  # user can be notified to update their SkyPilot version on the remote cluster.
@@ -422,6 +422,8 @@ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
  # but the configs won't be applied)
  ('jobs', 'controller', 'consolidation_mode'),
  ('serve', 'controller', 'consolidation_mode'),
+ ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
+ ('jobs', 'controller', 'task_logs_gc_retention_hours'),
  ]

  # Constants for Azure blob storage
@@ -548,3 +550,6 @@ ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +

  ARM64_ARCH = 'arm64'
  X86_64_ARCH = 'x86_64'
+
+ SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
+ f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
sky/skylet/events.py CHANGED
@@ -326,8 +326,15 @@ class AutostopEvent(SkyletEvent):
  cluster_name_on_cloud = cluster_config['cluster_name']
  is_cluster_multinode = cluster_config['max_workers'] > 0

+ # Clear AWS credentials from environment to force boto3 to use IAM
+ # role attached to the instance (lowest priority in credential chain).
+ # This allows the cluster to stop/terminate itself using its IAM role.
  os.environ.pop('AWS_ACCESS_KEY_ID', None)
  os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
+ os.environ.pop('AWS_SESSION_TOKEN', None)
+ # Point boto3 to /dev/null to skip reading credentials from files.
+ os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
+ os.environ['AWS_CONFIG_FILE'] = '/dev/null'

  # Stop the ray autoscaler to avoid scaling up, during
  # stopping/terminating of the cluster.
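The autostop event now scrubs every AWS credential source that outranks the instance profile in boto3's credential chain (environment variables, shared credentials file, config file), so that a subsequent boto3 client resolves credentials from the instance metadata service and the cluster can stop or terminate itself with its attached IAM role. A minimal sketch of the effect, assuming the code runs on an EC2 instance with such a role attached; the region is an arbitrary example.

    # Illustrative sketch: force a boto3 client to fall back to the instance
    # profile. Assumes it runs on an EC2 instance with an IAM role attached.
    import os

    import boto3

    # Remove the credential sources that outrank the instance profile.
    for var in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY',
                'AWS_SESSION_TOKEN'):
        os.environ.pop(var, None)
    # Point the file-based providers at /dev/null so they find nothing.
    os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
    os.environ['AWS_CONFIG_FILE'] = '/dev/null'

    # boto3 resolves credentials lazily, so this client now falls through to
    # the instance metadata service, i.e. the attached IAM role.
    ec2 = boto3.client('ec2', region_name='us-east-1')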
sky/skylet/services.py CHANGED
@@ -407,7 +407,9 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
  context: grpc.ServicerContext
  ) -> managed_jobsv1_pb2.GetJobTableResponse:
  try:
- accessible_workspaces = list(request.accessible_workspaces)
+ accessible_workspaces = (
+ list(request.accessible_workspaces.workspaces)
+ if request.HasField('accessible_workspaces') else None)
  job_ids = (list(request.job_ids.ids)
  if request.HasField('job_ids') else None)
  user_hashes: Optional[List[Optional[str]]] = None
@@ -419,6 +421,8 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
  user_hashes.append(None)
  statuses = (list(request.statuses.statuses)
  if request.HasField('statuses') else None)
+ fields = (list(request.fields.fields)
+ if request.HasField('fields') else None)
  job_queue = managed_job_utils.get_managed_job_queue(
  skip_finished=request.skip_finished,
  accessible_workspaces=accessible_workspaces,
@@ -432,7 +436,9 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
  page=request.page if request.HasField('page') else None,
  limit=request.limit if request.HasField('limit') else None,
  user_hashes=user_hashes,
- statuses=statuses)
+ statuses=statuses,
+ fields=fields,
+ )
  jobs = job_queue['jobs']
  total = job_queue['total']
  total_no_filter = job_queue['total_no_filter']
@@ -440,7 +446,16 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer

  jobs_info = []
  for job in jobs:
+ converted_metadata = None
+ metadata = job.get('metadata')
+ if metadata:
+ converted_metadata = {
+ k: v for k, v in metadata.items() if v is not None
+ }
  job_info = managed_jobsv1_pb2.ManagedJobInfo(
+ # The `spot.job_id`, which can be used to identify
+ # different tasks for the same job
+ _job_id=job.get('_job_id'),
  job_id=job.get('job_id'),
  task_id=job.get('task_id'),
  job_name=job.get('job_name'),
@@ -468,11 +483,7 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
  end_at=job.get('end_at'),
  user_yaml=job.get('user_yaml'),
  entrypoint=job.get('entrypoint'),
- metadata={
- k: v
- for k, v in job.get('metadata', {}).items()
- if v is not None
- },
+ metadata=converted_metadata,
  pool=job.get('pool'),
  pool_hash=job.get('pool_hash'))
  jobs_info.append(job_info)
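The gRPC handler now distinguishes "filter not set" from "filter set to an empty list" by wrapping each repeated filter in an optional message and checking HasField before converting it to a Python list (accessible_workspaces.workspaces, job_ids.ids, statuses.statuses, fields.fields). A small helper capturing that pattern might look like the following sketch; the helper itself is not part of the diff.

    # Hypothetical helper for the HasField-guarded list extraction used above.
    from typing import Any, List, Optional


    def optional_repeated(request: Any, field: str,
                          attr: str) -> Optional[List]:
        """Return the wrapped repeated field as a list, or None if unset.

        None means "no filter"; an empty list means "filter that matches
        nothing", which is why a plain repeated field is not enough.
        """
        if not request.HasField(field):
            return None
        return list(getattr(getattr(request, field), attr))


    # e.g. accessible_workspaces = optional_repeated(
    #     request, 'accessible_workspaces', 'workspaces')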