skypilot-nightly 1.0.0.dev20251029__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (68)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/client/cli/command.py +47 -23
  4. sky/clouds/aws.py +59 -11
  5. sky/dashboard/out/404.html +1 -1
  6. sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
  7. sky/dashboard/out/_next/static/chunks/{webpack-485984ca04e021d0.js → webpack-e38d5319cd10a3a0.js} +1 -1
  8. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  9. sky/dashboard/out/clusters/[cluster].html +1 -1
  10. sky/dashboard/out/clusters.html +1 -1
  11. sky/dashboard/out/config.html +1 -1
  12. sky/dashboard/out/index.html +1 -1
  13. sky/dashboard/out/infra/[context].html +1 -1
  14. sky/dashboard/out/infra.html +1 -1
  15. sky/dashboard/out/jobs/[job].html +1 -1
  16. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  17. sky/dashboard/out/jobs.html +1 -1
  18. sky/dashboard/out/users.html +1 -1
  19. sky/dashboard/out/volumes.html +1 -1
  20. sky/dashboard/out/workspace/new.html +1 -1
  21. sky/dashboard/out/workspaces/[name].html +1 -1
  22. sky/dashboard/out/workspaces.html +1 -1
  23. sky/data/mounting_utils.py +32 -2
  24. sky/jobs/constants.py +2 -0
  25. sky/jobs/controller.py +62 -67
  26. sky/jobs/file_content_utils.py +80 -0
  27. sky/jobs/log_gc.py +201 -0
  28. sky/jobs/scheduler.py +15 -2
  29. sky/jobs/server/core.py +85 -13
  30. sky/jobs/server/server.py +12 -11
  31. sky/jobs/server/utils.py +28 -10
  32. sky/jobs/state.py +216 -40
  33. sky/jobs/utils.py +60 -22
  34. sky/metrics/utils.py +18 -0
  35. sky/schemas/api/responses.py +1 -0
  36. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  37. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  38. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  39. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  40. sky/serve/server/server.py +8 -7
  41. sky/server/common.py +21 -15
  42. sky/server/constants.py +1 -1
  43. sky/server/daemons.py +23 -17
  44. sky/server/requests/executor.py +7 -3
  45. sky/server/requests/request_names.py +80 -0
  46. sky/server/server.py +103 -35
  47. sky/skylet/constants.py +6 -1
  48. sky/skylet/events.py +7 -0
  49. sky/skylet/services.py +18 -7
  50. sky/ssh_node_pools/server.py +5 -4
  51. sky/task.py +4 -42
  52. sky/templates/kubernetes-ray.yml.j2 +1 -1
  53. sky/templates/websocket_proxy.py +140 -12
  54. sky/users/permission.py +4 -1
  55. sky/utils/db/migration_utils.py +1 -1
  56. sky/utils/resource_checker.py +4 -1
  57. sky/utils/schemas.py +23 -4
  58. sky/volumes/server/server.py +4 -3
  59. sky/workspaces/server.py +7 -6
  60. {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +36 -36
  61. {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +67 -62
  62. sky/dashboard/out/_next/static/chunks/2755.a239c652bf8684dd.js +0 -26
  63. /sky/dashboard/out/_next/static/{DabuSAKsc_y0wyJxpTIdQ → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +0 -0
  64. /sky/dashboard/out/_next/static/{DabuSAKsc_y0wyJxpTIdQ → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
  65. {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
  66. {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
  67. {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
  68. {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/server/server.py CHANGED
@@ -6,6 +6,7 @@ import base64
6
6
  from concurrent.futures import ThreadPoolExecutor
7
7
  import contextlib
8
8
  import datetime
9
+ from enum import IntEnum
9
10
  import hashlib
10
11
  import json
11
12
  import multiprocessing
@@ -15,6 +16,7 @@ import posixpath
15
16
  import re
16
17
  import resource
17
18
  import shutil
19
+ import struct
18
20
  import sys
19
21
  import threading
20
22
  import traceback
@@ -62,6 +64,7 @@ from sky.server.auth import oauth2_proxy
62
64
  from sky.server.requests import executor
63
65
  from sky.server.requests import payloads
64
66
  from sky.server.requests import preconditions
67
+ from sky.server.requests import request_names
65
68
  from sky.server.requests import requests as requests_lib
66
69
  from sky.skylet import constants
67
70
  from sky.ssh_node_pools import server as ssh_node_pools_rest
@@ -460,7 +463,7 @@ async def schedule_on_boot_check_async():
460
463
  try:
461
464
  await executor.schedule_request_async(
462
465
  request_id='skypilot-server-on-boot-check',
463
- request_name='check',
466
+ request_name=request_names.RequestName.CHECK,
464
467
  request_body=payloads.CheckBody(),
465
468
  func=sky_check.check,
466
469
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -732,7 +735,7 @@ async def check(request: fastapi.Request,
732
735
  """Checks enabled clouds."""
733
736
  await executor.schedule_request_async(
734
737
  request_id=request.state.request_id,
735
- request_name='check',
738
+ request_name=request_names.RequestName.CHECK,
736
739
  request_body=check_body,
737
740
  func=sky_check.check,
738
741
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -746,7 +749,7 @@ async def enabled_clouds(request: fastapi.Request,
746
749
  """Gets enabled clouds on the server."""
747
750
  await executor.schedule_request_async(
748
751
  request_id=request.state.request_id,
749
- request_name='enabled_clouds',
752
+ request_name=request_names.RequestName.ENABLED_CLOUDS,
750
753
  request_body=payloads.EnabledCloudsBody(workspace=workspace,
751
754
  expand=expand),
752
755
  func=core.enabled_clouds,
@@ -762,7 +765,8 @@ async def realtime_kubernetes_gpu_availability(
762
765
  """Gets real-time Kubernetes GPU availability."""
763
766
  await executor.schedule_request_async(
764
767
  request_id=request.state.request_id,
765
- request_name='realtime_kubernetes_gpu_availability',
768
+ request_name=request_names.RequestName.
769
+ REALTIME_KUBERNETES_GPU_AVAILABILITY,
766
770
  request_body=realtime_gpu_availability_body,
767
771
  func=core.realtime_kubernetes_gpu_availability,
768
772
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -777,7 +781,7 @@ async def kubernetes_node_info(
777
781
  """Gets Kubernetes nodes information and hints."""
778
782
  await executor.schedule_request_async(
779
783
  request_id=request.state.request_id,
780
- request_name='kubernetes_node_info',
784
+ request_name=request_names.RequestName.KUBERNETES_NODE_INFO,
781
785
  request_body=kubernetes_node_info_body,
782
786
  func=kubernetes_utils.get_kubernetes_node_info,
783
787
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -789,7 +793,7 @@ async def status_kubernetes(request: fastapi.Request) -> None:
789
793
  """Gets Kubernetes status."""
790
794
  await executor.schedule_request_async(
791
795
  request_id=request.state.request_id,
792
- request_name='status_kubernetes',
796
+ request_name=request_names.RequestName.STATUS_KUBERNETES,
793
797
  request_body=payloads.RequestBody(),
794
798
  func=core.status_kubernetes,
795
799
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -803,7 +807,7 @@ async def list_accelerators(
803
807
  """Gets list of accelerators from cloud catalog."""
804
808
  await executor.schedule_request_async(
805
809
  request_id=request.state.request_id,
806
- request_name='list_accelerators',
810
+ request_name=request_names.RequestName.LIST_ACCELERATORS,
807
811
  request_body=list_accelerator_counts_body,
808
812
  func=catalog.list_accelerators,
809
813
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -818,7 +822,7 @@ async def list_accelerator_counts(
818
822
  """Gets list of accelerator counts from cloud catalog."""
819
823
  await executor.schedule_request_async(
820
824
  request_id=request.state.request_id,
821
- request_name='list_accelerator_counts',
825
+ request_name=request_names.RequestName.LIST_ACCELERATOR_COUNTS,
822
826
  request_body=list_accelerator_counts_body,
823
827
  func=catalog.list_accelerator_counts,
824
828
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -875,7 +879,7 @@ async def optimize(optimize_body: payloads.OptimizeBody,
875
879
  """Optimizes the user's DAG."""
876
880
  await executor.schedule_request_async(
877
881
  request_id=request.state.request_id,
878
- request_name='optimize',
882
+ request_name=request_names.RequestName.OPTIMIZE,
879
883
  request_body=optimize_body,
880
884
  ignore_return_value=True,
881
885
  func=core.optimize,
@@ -1085,7 +1089,7 @@ async def launch(launch_body: payloads.LaunchBody,
1085
1089
  logger.info(f'Launching request: {request_id}')
1086
1090
  await executor.schedule_request_async(
1087
1091
  request_id,
1088
- request_name='launch',
1092
+ request_name=request_names.RequestName.CLUSTER_LAUNCH,
1089
1093
  request_body=launch_body,
1090
1094
  func=execution.launch,
1091
1095
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1101,7 +1105,7 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
1101
1105
  cluster_name = exec_body.cluster_name
1102
1106
  await executor.schedule_request_async(
1103
1107
  request_id=request.state.request_id,
1104
- request_name='exec',
1108
+ request_name=request_names.RequestName.CLUSTER_EXEC,
1105
1109
  request_body=exec_body,
1106
1110
  func=execution.exec,
1107
1111
  precondition=preconditions.ClusterStartCompletePrecondition(
@@ -1119,7 +1123,7 @@ async def stop(request: fastapi.Request,
1119
1123
  """Stops a cluster."""
1120
1124
  await executor.schedule_request_async(
1121
1125
  request_id=request.state.request_id,
1122
- request_name='stop',
1126
+ request_name=request_names.RequestName.CLUSTER_STOP,
1123
1127
  request_body=stop_body,
1124
1128
  func=core.stop,
1125
1129
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1139,7 +1143,7 @@ async def status(
1139
1143
  detail='Server is shutting down, please try again later.')
1140
1144
  await executor.schedule_request_async(
1141
1145
  request_id=request.state.request_id,
1142
- request_name='status',
1146
+ request_name=request_names.RequestName.CLUSTER_STATUS,
1143
1147
  request_body=status_body,
1144
1148
  func=core.status,
1145
1149
  schedule_type=(requests_lib.ScheduleType.LONG if
@@ -1154,7 +1158,7 @@ async def endpoints(request: fastapi.Request,
1154
1158
  """Gets the endpoint for a given cluster and port number (endpoint)."""
1155
1159
  await executor.schedule_request_async(
1156
1160
  request_id=request.state.request_id,
1157
- request_name='endpoints',
1161
+ request_name=request_names.RequestName.CLUSTER_ENDPOINTS,
1158
1162
  request_body=endpoint_body,
1159
1163
  func=core.endpoints,
1160
1164
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1168,7 +1172,7 @@ async def down(request: fastapi.Request,
1168
1172
  """Tears down a cluster."""
1169
1173
  await executor.schedule_request_async(
1170
1174
  request_id=request.state.request_id,
1171
- request_name='down',
1175
+ request_name=request_names.RequestName.CLUSTER_DOWN,
1172
1176
  request_body=down_body,
1173
1177
  func=core.down,
1174
1178
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1182,7 +1186,7 @@ async def start(request: fastapi.Request,
1182
1186
  """Restarts a cluster."""
1183
1187
  await executor.schedule_request_async(
1184
1188
  request_id=request.state.request_id,
1185
- request_name='start',
1189
+ request_name=request_names.RequestName.CLUSTER_START,
1186
1190
  request_body=start_body,
1187
1191
  func=core.start,
1188
1192
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1196,7 +1200,7 @@ async def autostop(request: fastapi.Request,
1196
1200
  """Schedules an autostop/autodown for a cluster."""
1197
1201
  await executor.schedule_request_async(
1198
1202
  request_id=request.state.request_id,
1199
- request_name='autostop',
1203
+ request_name=request_names.RequestName.CLUSTER_AUTOSTOP,
1200
1204
  request_body=autostop_body,
1201
1205
  func=core.autostop,
1202
1206
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1210,7 +1214,7 @@ async def queue(request: fastapi.Request,
1210
1214
  """Gets the job queue of a cluster."""
1211
1215
  await executor.schedule_request_async(
1212
1216
  request_id=request.state.request_id,
1213
- request_name='queue',
1217
+ request_name=request_names.RequestName.CLUSTER_QUEUE,
1214
1218
  request_body=queue_body,
1215
1219
  func=core.queue,
1216
1220
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1224,7 +1228,7 @@ async def job_status(request: fastapi.Request,
1224
1228
  """Gets the status of a job."""
1225
1229
  await executor.schedule_request_async(
1226
1230
  request_id=request.state.request_id,
1227
- request_name='job_status',
1231
+ request_name=request_names.RequestName.CLUSTER_JOB_STATUS,
1228
1232
  request_body=job_status_body,
1229
1233
  func=core.job_status,
1230
1234
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1238,7 +1242,7 @@ async def cancel(request: fastapi.Request,
1238
1242
  """Cancels jobs on a cluster."""
1239
1243
  await executor.schedule_request_async(
1240
1244
  request_id=request.state.request_id,
1241
- request_name='cancel',
1245
+ request_name=request_names.RequestName.CLUSTER_JOB_CANCEL,
1242
1246
  request_body=cancel_body,
1243
1247
  func=core.cancel,
1244
1248
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1258,7 +1262,7 @@ async def logs(
1258
1262
  executor.check_request_thread_executor_available()
1259
1263
  request_task = await executor.prepare_request_async(
1260
1264
  request_id=request.state.request_id,
1261
- request_name='logs',
1265
+ request_name=request_names.RequestName.CLUSTER_JOB_LOGS,
1262
1266
  request_body=cluster_job_body,
1263
1267
  func=core.tail_logs,
1264
1268
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1289,7 +1293,7 @@ async def download_logs(
1289
1293
  cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
1290
1294
  await executor.schedule_request_async(
1291
1295
  request_id=request.state.request_id,
1292
- request_name='download_logs',
1296
+ request_name=request_names.RequestName.CLUSTER_JOB_DOWNLOAD_LOGS,
1293
1297
  request_body=cluster_jobs_body,
1294
1298
  func=core.download_logs,
1295
1299
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1440,7 +1444,7 @@ async def cost_report(request: fastapi.Request,
1440
1444
  """Gets the cost report of a cluster."""
1441
1445
  await executor.schedule_request_async(
1442
1446
  request_id=request.state.request_id,
1443
- request_name='cost_report',
1447
+ request_name=request_names.RequestName.CLUSTER_COST_REPORT,
1444
1448
  request_body=cost_report_body,
1445
1449
  func=core.cost_report,
1446
1450
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1452,7 +1456,7 @@ async def storage_ls(request: fastapi.Request) -> None:
1452
1456
  """Gets the storages."""
1453
1457
  await executor.schedule_request_async(
1454
1458
  request_id=request.state.request_id,
1455
- request_name='storage_ls',
1459
+ request_name=request_names.RequestName.STORAGE_LS,
1456
1460
  request_body=payloads.RequestBody(),
1457
1461
  func=core.storage_ls,
1458
1462
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1465,7 +1469,7 @@ async def storage_delete(request: fastapi.Request,
1465
1469
  """Deletes a storage."""
1466
1470
  await executor.schedule_request_async(
1467
1471
  request_id=request.state.request_id,
1468
- request_name='storage_delete',
1472
+ request_name=request_names.RequestName.STORAGE_DELETE,
1469
1473
  request_body=storage_body,
1470
1474
  func=core.storage_delete,
1471
1475
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1478,7 +1482,7 @@ async def local_up(request: fastapi.Request,
1478
1482
  """Launches a Kubernetes cluster on API server."""
1479
1483
  await executor.schedule_request_async(
1480
1484
  request_id=request.state.request_id,
1481
- request_name='local_up',
1485
+ request_name=request_names.RequestName.LOCAL_UP,
1482
1486
  request_body=local_up_body,
1483
1487
  func=core.local_up,
1484
1488
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1491,7 +1495,7 @@ async def local_down(request: fastapi.Request,
1491
1495
  """Tears down the Kubernetes cluster started by local_up."""
1492
1496
  await executor.schedule_request_async(
1493
1497
  request_id=request.state.request_id,
1494
- request_name='local_down',
1498
+ request_name=request_names.RequestName.LOCAL_DOWN,
1495
1499
  request_body=local_down_body,
1496
1500
  func=core.local_down,
1497
1501
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1699,7 +1703,7 @@ async def api_cancel(request: fastapi.Request,
1699
1703
  """Cancels requests."""
1700
1704
  await executor.schedule_request_async(
1701
1705
  request_id=request.state.request_id,
1702
- request_name='api_cancel',
1706
+ request_name=request_names.RequestName.API_CANCEL,
1703
1707
  request_body=request_cancel_body,
1704
1708
  func=requests_lib.kill_requests_with_prefix,
1705
1709
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1804,16 +1808,31 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
1804
1808
  basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
1805
1809
  'false').lower() == 'true',
1806
1810
  user=user if user is not None else None,
1811
+ service_account_token_enabled=(os.environ.get(
1812
+ constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
1813
+ 'false').lower() == 'true'),
1807
1814
  )
1808
1815
 
1809
1816
 
1817
+ class KubernetesSSHMessageType(IntEnum):
1818
+ REGULAR_DATA = 0
1819
+ PINGPONG = 1
1820
+ LATENCY_MEASUREMENT = 2
1821
+
1822
+
1810
1823
  @app.websocket('/kubernetes-pod-ssh-proxy')
1811
- async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1812
- cluster_name: str) -> None:
1824
+ async def kubernetes_pod_ssh_proxy(
1825
+ websocket: fastapi.WebSocket,
1826
+ cluster_name: str,
1827
+ client_version: Optional[int] = None) -> None:
1813
1828
  """Proxies SSH to the Kubernetes pod with websocket."""
1814
1829
  await websocket.accept()
1815
1830
  logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
1816
1831
 
1832
+ timestamps_supported = client_version is not None and client_version > 21
1833
+ logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
1834
+ client_version = {client_version}')
1835
+
1817
1836
  # Run core.status in another thread to avoid blocking the event loop.
1818
1837
  with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
1819
1838
  cluster_records = await context_utils.to_thread_with_executor(
@@ -1868,6 +1887,42 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1868
1887
  async def websocket_to_ssh():
1869
1888
  try:
1870
1889
  async for message in websocket.iter_bytes():
1890
+ if timestamps_supported:
1891
+ type_size = struct.calcsize('!B')
1892
+ message_type = struct.unpack('!B',
1893
+ message[:type_size])[0]
1894
+ if (message_type ==
1895
+ KubernetesSSHMessageType.REGULAR_DATA):
1896
+ # Regular data - strip type byte and forward to SSH
1897
+ message = message[type_size:]
1898
+ elif message_type == KubernetesSSHMessageType.PINGPONG:
1899
+ # PING message - respond with PONG (type 1)
1900
+ ping_id_size = struct.calcsize('!I')
1901
+ if len(message) != type_size + ping_id_size:
1902
+ raise ValueError('Invalid PING message '
1903
+ f'length: {len(message)}')
1904
+ # Return the same PING message, so that the client
1905
+ # can measure the latency.
1906
+ await websocket.send_bytes(message)
1907
+ continue
1908
+ elif (message_type ==
1909
+ KubernetesSSHMessageType.LATENCY_MEASUREMENT):
1910
+ # Latency measurement from client
1911
+ latency_size = struct.calcsize('!Q')
1912
+ if len(message) != type_size + latency_size:
1913
+ raise ValueError(
1914
+ 'Invalid latency measurement '
1915
+ f'message length: {len(message)}')
1916
+ avg_latency_ms = struct.unpack(
1917
+ '!Q',
1918
+ message[type_size:type_size + latency_size])[0]
1919
+ latency_seconds = avg_latency_ms / 1000
1920
+ metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds) # pylint: disable=line-too-long
1921
+ continue
1922
+ else:
1923
+ # Unknown message type.
1924
+ raise ValueError(
1925
+ f'Unknown message type: {message_type}')
1871
1926
  writer.write(message)
1872
1927
  try:
1873
1928
  await writer.drain()
@@ -1898,6 +1953,11 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1898
1953
  nonlocal ssh_failed
1899
1954
  ssh_failed = True
1900
1955
  break
1956
+ if timestamps_supported:
1957
+ # Prepend message type byte (0 = regular data)
1958
+ message_type_bytes = struct.pack(
1959
+ '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
1960
+ data = message_type_bytes + data
1901
1961
  await websocket.send_bytes(data)
1902
1962
  except Exception: # pylint: disable=broad-except
1903
1963
  pass
@@ -1937,7 +1997,7 @@ async def all_contexts(request: fastapi.Request) -> None:
1937
1997
 
1938
1998
  await executor.schedule_request_async(
1939
1999
  request_id=request.state.request_id,
1940
- request_name='all_contexts',
2000
+ request_name=request_names.RequestName.ALL_CONTEXTS,
1941
2001
  request_body=payloads.RequestBody(),
1942
2002
  func=core.get_all_contexts,
1943
2003
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -2051,7 +2111,6 @@ if __name__ == '__main__':
2051
2111
  # Serve metrics on a separate port to isolate it from the application APIs:
2052
2112
  # metrics port will not be exposed to the public network typically.
2053
2113
  parser.add_argument('--metrics-port', default=9090, type=int)
2054
- parser.add_argument('--start-with-python', action='store_true')
2055
2114
  cmd_args = parser.parse_args()
2056
2115
  if cmd_args.port == cmd_args.metrics_port:
2057
2116
  logger.error('port and metrics-port cannot be the same, exiting.')
@@ -2066,9 +2125,18 @@ if __name__ == '__main__':
2066
2125
  logger.error(f'Port {cmd_args.port} is not available, exiting.')
2067
2126
  raise RuntimeError(f'Port {cmd_args.port} is not available')
2068
2127
 
2069
- if not cmd_args.start_with_python:
2070
- # Maybe touch the signal file on API server startup.
2071
- managed_job_utils.is_consolidation_mode(on_api_restart=True)
2128
+ # Maybe touch the signal file on API server startup. Do it again here even
2129
+ # if we already touched it in the sky/server/common.py::_start_api_server.
2130
+ # This is because the sky/server/common.py::_start_api_server function call
2131
+ # is running outside the skypilot API server process tree. The process tree
2132
+ # starts within that function (see the `subprocess.Popen` call in
2133
+ # sky/server/common.py::_start_api_server). When pg is used, the
2134
+ # _start_api_server function will not load the config file from db, which
2135
+ # will ignore the consolidation mode config. Here, inside the process tree,
2136
+ # we already reload the config as a server (with env var _start_api_server),
2137
+ # so we will respect the consolidation mode config.
2138
+ # Refers to #7717 for more details.
2139
+ managed_job_utils.is_consolidation_mode(on_api_restart=True)
2072
2140
 
2073
2141
  # Show the privacy policy if it is not already shown. We place it here so
2074
2142
  # that it is shown only when the API server is started.
sky/skylet/constants.py CHANGED
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
100
100
  # cluster yaml is updated.
101
101
  #
102
102
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
103
- SKYLET_VERSION = '23'
103
+ SKYLET_VERSION = '25'
104
104
  # The version of the lib files that skylet/jobs use. Whenever there is an API
105
105
  # change for the job_lib or log_lib, we need to bump this version, so that the
106
106
  # user can be notified to update their SkyPilot version on the remote cluster.
@@ -422,6 +422,8 @@ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
422
422
  # but the configs won't be applied)
423
423
  ('jobs', 'controller', 'consolidation_mode'),
424
424
  ('serve', 'controller', 'consolidation_mode'),
425
+ ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
426
+ ('jobs', 'controller', 'task_logs_gc_retention_hours'),
425
427
  ]
426
428
 
427
429
  # Constants for Azure blob storage
@@ -548,3 +550,6 @@ ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
548
550
 
549
551
  ARM64_ARCH = 'arm64'
550
552
  X86_64_ARCH = 'x86_64'
553
+
554
+ SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
555
+ f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
sky/skylet/events.py CHANGED
@@ -326,8 +326,15 @@ class AutostopEvent(SkyletEvent):
326
326
  cluster_name_on_cloud = cluster_config['cluster_name']
327
327
  is_cluster_multinode = cluster_config['max_workers'] > 0
328
328
 
329
+ # Clear AWS credentials from environment to force boto3 to use IAM
330
+ # role attached to the instance (lowest priority in credential chain).
331
+ # This allows the cluster to stop/terminate itself using its IAM role.
329
332
  os.environ.pop('AWS_ACCESS_KEY_ID', None)
330
333
  os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
334
+ os.environ.pop('AWS_SESSION_TOKEN', None)
335
+ # Point boto3 to /dev/null to skip reading credentials from files.
336
+ os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
337
+ os.environ['AWS_CONFIG_FILE'] = '/dev/null'
331
338
 
332
339
  # Stop the ray autoscaler to avoid scaling up, during
333
340
  # stopping/terminating of the cluster.
sky/skylet/services.py CHANGED
@@ -407,7 +407,9 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
407
407
  context: grpc.ServicerContext
408
408
  ) -> managed_jobsv1_pb2.GetJobTableResponse:
409
409
  try:
410
- accessible_workspaces = list(request.accessible_workspaces)
410
+ accessible_workspaces = (
411
+ list(request.accessible_workspaces.workspaces)
412
+ if request.HasField('accessible_workspaces') else None)
411
413
  job_ids = (list(request.job_ids.ids)
412
414
  if request.HasField('job_ids') else None)
413
415
  user_hashes: Optional[List[Optional[str]]] = None
@@ -419,6 +421,8 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
419
421
  user_hashes.append(None)
420
422
  statuses = (list(request.statuses.statuses)
421
423
  if request.HasField('statuses') else None)
424
+ fields = (list(request.fields.fields)
425
+ if request.HasField('fields') else None)
422
426
  job_queue = managed_job_utils.get_managed_job_queue(
423
427
  skip_finished=request.skip_finished,
424
428
  accessible_workspaces=accessible_workspaces,
@@ -432,7 +436,9 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
432
436
  page=request.page if request.HasField('page') else None,
433
437
  limit=request.limit if request.HasField('limit') else None,
434
438
  user_hashes=user_hashes,
435
- statuses=statuses)
439
+ statuses=statuses,
440
+ fields=fields,
441
+ )
436
442
  jobs = job_queue['jobs']
437
443
  total = job_queue['total']
438
444
  total_no_filter = job_queue['total_no_filter']
@@ -440,7 +446,16 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
440
446
 
441
447
  jobs_info = []
442
448
  for job in jobs:
449
+ converted_metadata = None
450
+ metadata = job.get('metadata')
451
+ if metadata:
452
+ converted_metadata = {
453
+ k: v for k, v in metadata.items() if v is not None
454
+ }
443
455
  job_info = managed_jobsv1_pb2.ManagedJobInfo(
456
+ # The `spot.job_id`, which can be used to identify
457
+ # different tasks for the same job
458
+ _job_id=job.get('_job_id'),
444
459
  job_id=job.get('job_id'),
445
460
  task_id=job.get('task_id'),
446
461
  job_name=job.get('job_name'),
@@ -468,11 +483,7 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
468
483
  end_at=job.get('end_at'),
469
484
  user_yaml=job.get('user_yaml'),
470
485
  entrypoint=job.get('entrypoint'),
471
- metadata={
472
- k: v
473
- for k, v in job.get('metadata', {}).items()
474
- if v is not None
475
- },
486
+ metadata=converted_metadata,
476
487
  pool=job.get('pool'),
477
488
  pool_hash=job.get('pool_hash'))
478
489
  jobs_info.append(job_info)
@@ -7,6 +7,7 @@ import fastapi
7
7
  from sky import core as sky_core
8
8
  from sky.server.requests import executor
9
9
  from sky.server.requests import payloads
10
+ from sky.server.requests import request_names
10
11
  from sky.server.requests import requests as requests_lib
11
12
  from sky.ssh_node_pools import core as ssh_node_pools_core
12
13
  from sky.utils import common_utils
@@ -101,7 +102,7 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
101
102
  ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
102
103
  await executor.schedule_request_async(
103
104
  request_id=request.state.request_id,
104
- request_name='ssh_up',
105
+ request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
105
106
  request_body=ssh_up_body,
106
107
  func=sky_core.ssh_up,
107
108
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -126,7 +127,7 @@ async def deploy_ssh_node_pool_general(
126
127
  try:
127
128
  await executor.schedule_request_async(
128
129
  request_id=request.state.request_id,
129
- request_name='ssh_up',
130
+ request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
130
131
  request_body=ssh_up_body,
131
132
  func=sky_core.ssh_up,
132
133
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -152,7 +153,7 @@ async def down_ssh_node_pool(request: fastapi.Request,
152
153
  ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
153
154
  await executor.schedule_request_async(
154
155
  request_id=request.state.request_id,
155
- request_name='ssh_down',
156
+ request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
156
157
  request_body=ssh_up_body,
157
158
  func=sky_core.ssh_up, # Reuse ssh_up function with cleanup=True
158
159
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -180,7 +181,7 @@ async def down_ssh_node_pool_general(
180
181
  ssh_up_body.cleanup = True
181
182
  await executor.schedule_request_async(
182
183
  request_id=request.state.request_id,
183
- request_name='ssh_down',
184
+ request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
184
185
  request_body=ssh_up_body,
185
186
  func=sky_core.ssh_up, # Reuse ssh_up function with cleanup=True
186
187
  schedule_type=requests_lib.ScheduleType.LONG,
sky/task.py CHANGED
@@ -1,6 +1,5 @@
1
1
  """Task: a coarse-grained stage in an application."""
2
2
  import collections
3
- import inspect
4
3
  import json
5
4
  import os
6
5
  import re
@@ -29,10 +28,6 @@ from sky.utils import yaml_utils
29
28
 
30
29
  logger = sky_logging.init_logger(__name__)
31
30
 
32
- # A lambda generating commands (node rank_i, node addrs -> cmd_i).
33
- CommandGen = Callable[[int, List[str]], Optional[str]]
34
- CommandOrCommandGen = Union[str, CommandGen]
35
-
36
31
  _VALID_NAME_REGEX = '[a-zA-Z0-9]+(?:[._-]{1,2}[a-zA-Z0-9]+)*'
37
32
  _VALID_NAME_DESCR = ('ASCII characters and may contain lowercase and'
38
33
  ' uppercase letters, digits, underscores, periods,'
@@ -236,7 +231,7 @@ class Task:
236
231
  name: Optional[str] = None,
237
232
  *,
238
233
  setup: Optional[Union[str, List[str]]] = None,
239
- run: Optional[Union[CommandOrCommandGen, List[str]]] = None,
234
+ run: Optional[Union[str, List[str]]] = None,
240
235
  envs: Optional[Dict[str, str]] = None,
241
236
  secrets: Optional[Dict[str, str]] = None,
242
237
  workdir: Optional[Union[str, Dict[str, Any]]] = None,
@@ -349,7 +344,7 @@ class Task:
349
344
  self._volumes = volumes or {}
350
345
 
351
346
  # concatenate commands if given as list
352
- def _concat(commands):
347
+ def _concat(commands: Optional[Union[str, List[str]]]) -> Optional[str]:
353
348
  if isinstance(commands, list):
354
349
  return '\n'.join(commands)
355
350
  return commands
@@ -447,42 +442,9 @@ class Task:
447
442
 
448
443
  def validate_run(self):
449
444
  """Validates if the run command is valid."""
450
- if callable(self.run):
451
- run_sig = inspect.signature(self.run)
452
- # Check that run is a function with 2 arguments.
453
- if len(run_sig.parameters) != 2:
454
- with ux_utils.print_exception_no_traceback():
455
- raise ValueError(_RUN_FN_CHECK_FAIL_MSG.format(run_sig))
456
-
457
- type_list = [int, List[str]]
458
- # Check annotations, if exists
459
- for i, param in enumerate(run_sig.parameters.values()):
460
- if param.annotation != inspect.Parameter.empty:
461
- if param.annotation != type_list[i]:
462
- with ux_utils.print_exception_no_traceback():
463
- raise ValueError(
464
- _RUN_FN_CHECK_FAIL_MSG.format(run_sig))
465
-
466
- # Check self containedness.
467
- run_closure = inspect.getclosurevars(self.run)
468
- if run_closure.nonlocals:
469
- with ux_utils.print_exception_no_traceback():
470
- raise ValueError(
471
- 'run command generator must be self contained. '
472
- f'Found nonlocals: {run_closure.nonlocals}')
473
- if run_closure.globals:
474
- with ux_utils.print_exception_no_traceback():
475
- raise ValueError(
476
- 'run command generator must be self contained. '
477
- f'Found globals: {run_closure.globals}')
478
- if run_closure.unbound:
479
- # Do not raise an error here. Import statements, which are
480
- # allowed, will be considered as unbounded.
481
- pass
482
- elif self.run is not None and not isinstance(self.run, str):
445
+ if self.run is not None and not isinstance(self.run, str):
483
446
  with ux_utils.print_exception_no_traceback():
484
- raise ValueError('run must be either a shell script (str) or '
485
- f'a command generator ({CommandGen}). '
447
+ raise ValueError('run must be a shell script (str). '
486
448
  f'Got {type(self.run)}')
487
449
 
488
450
  def expand_and_validate_file_mounts(self):
@@ -1059,7 +1059,7 @@ available_node_types:
1059
1059
  # Also, skip the jobs that are waiting to be scheduled as those does not have a controller process running.
1060
1060
  # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
1061
1061
  # will delete the service from the database after it is terminated so everything in the database is running.
1062
- ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs = state.get_managed_jobs(); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
1062
+ ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
1063
1063
  if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
1064
1064
  read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
1065
1065
  fi