skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (114)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/backends/backend_utils.py +9 -6
  5. sky/backends/cloud_vm_ray_backend.py +2 -3
  6. sky/check.py +25 -13
  7. sky/client/cli/command.py +52 -24
  8. sky/cloud_stores.py +73 -0
  9. sky/clouds/aws.py +59 -11
  10. sky/core.py +7 -5
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
  13. sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
  15. sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
  28. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  29. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  30. sky/dashboard/out/clusters/[cluster].html +1 -1
  31. sky/dashboard/out/clusters.html +1 -1
  32. sky/dashboard/out/config.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra/[context].html +1 -1
  35. sky/dashboard/out/infra.html +1 -1
  36. sky/dashboard/out/jobs/[job].html +1 -1
  37. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  38. sky/dashboard/out/jobs.html +1 -1
  39. sky/dashboard/out/users.html +1 -1
  40. sky/dashboard/out/volumes.html +1 -1
  41. sky/dashboard/out/workspace/new.html +1 -1
  42. sky/dashboard/out/workspaces/[name].html +1 -1
  43. sky/dashboard/out/workspaces.html +1 -1
  44. sky/data/data_utils.py +92 -1
  45. sky/data/mounting_utils.py +71 -2
  46. sky/data/storage.py +166 -9
  47. sky/global_user_state.py +14 -18
  48. sky/jobs/constants.py +2 -0
  49. sky/jobs/controller.py +62 -67
  50. sky/jobs/file_content_utils.py +80 -0
  51. sky/jobs/log_gc.py +201 -0
  52. sky/jobs/scheduler.py +15 -2
  53. sky/jobs/server/core.py +85 -13
  54. sky/jobs/server/server.py +14 -13
  55. sky/jobs/server/utils.py +28 -10
  56. sky/jobs/state.py +216 -40
  57. sky/jobs/utils.py +65 -28
  58. sky/metrics/utils.py +18 -0
  59. sky/optimizer.py +1 -1
  60. sky/provision/kubernetes/instance.py +88 -19
  61. sky/provision/kubernetes/volume.py +2 -2
  62. sky/schemas/api/responses.py +3 -5
  63. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  64. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  65. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  66. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  67. sky/serve/replica_managers.py +2 -2
  68. sky/serve/serve_utils.py +9 -2
  69. sky/serve/server/server.py +8 -7
  70. sky/server/common.py +21 -15
  71. sky/server/constants.py +1 -1
  72. sky/server/daemons.py +23 -17
  73. sky/server/requests/executor.py +7 -3
  74. sky/server/requests/payloads.py +2 -0
  75. sky/server/requests/request_names.py +80 -0
  76. sky/server/requests/requests.py +137 -102
  77. sky/server/requests/serializers/decoders.py +0 -6
  78. sky/server/requests/serializers/encoders.py +33 -6
  79. sky/server/server.py +105 -36
  80. sky/server/stream_utils.py +56 -13
  81. sky/setup_files/dependencies.py +2 -0
  82. sky/skylet/constants.py +6 -1
  83. sky/skylet/events.py +7 -0
  84. sky/skylet/services.py +18 -7
  85. sky/ssh_node_pools/server.py +5 -4
  86. sky/task.py +14 -42
  87. sky/templates/kubernetes-ray.yml.j2 +1 -1
  88. sky/templates/nebius-ray.yml.j2 +1 -0
  89. sky/templates/websocket_proxy.py +140 -12
  90. sky/users/permission.py +4 -1
  91. sky/utils/cli_utils/status_utils.py +8 -2
  92. sky/utils/context_utils.py +13 -1
  93. sky/utils/db/migration_utils.py +1 -1
  94. sky/utils/resource_checker.py +4 -1
  95. sky/utils/resources_utils.py +53 -29
  96. sky/utils/schemas.py +23 -4
  97. sky/volumes/server/server.py +4 -3
  98. sky/workspaces/server.py +7 -6
  99. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
  100. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
  101. sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
  102. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
  108. sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
  109. /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
  110. /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
  111. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
  112. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
  113. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
  114. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/server/server.py CHANGED
@@ -6,6 +6,7 @@ import base64
  from concurrent.futures import ThreadPoolExecutor
  import contextlib
  import datetime
+ from enum import IntEnum
  import hashlib
  import json
  import multiprocessing
@@ -15,6 +16,7 @@ import posixpath
  import re
  import resource
  import shutil
+ import struct
  import sys
  import threading
  import traceback
@@ -25,6 +27,7 @@ import zipfile
  import aiofiles
  import anyio
  import fastapi
+ from fastapi import responses as fastapi_responses
  from fastapi.middleware import cors
  import starlette.middleware.base
  import uvloop
@@ -61,6 +64,7 @@ from sky.server.auth import oauth2_proxy
  from sky.server.requests import executor
  from sky.server.requests import payloads
  from sky.server.requests import preconditions
+ from sky.server.requests import request_names
  from sky.server.requests import requests as requests_lib
  from sky.skylet import constants
  from sky.ssh_node_pools import server as ssh_node_pools_rest
@@ -459,7 +463,7 @@ async def schedule_on_boot_check_async():
  try:
  await executor.schedule_request_async(
  request_id='skypilot-server-on-boot-check',
- request_name='check',
+ request_name=request_names.RequestName.CHECK,
  request_body=payloads.CheckBody(),
  func=sky_check.check,
  schedule_type=requests_lib.ScheduleType.SHORT,
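The hard-coded request-name strings throughout server.py are replaced with members of a RequestName enum from the new sky/server/requests/request_names.py module (+80 lines, not included in this diff). A minimal sketch of what such an enum could look like is shown below; the member values are assumptions inferred from the old string literals, not the actual module contents.

    # Hypothetical sketch of sky/server/requests/request_names.py; the real
    # module is not shown in this diff, so the values here are assumptions.
    import enum


    class RequestName(str, enum.Enum):
        """Canonical names for API server requests."""
        CHECK = 'check'
        ENABLED_CLOUDS = 'enabled_clouds'
        CLUSTER_LAUNCH = 'launch'
        CLUSTER_STATUS = 'status'
        API_CANCEL = 'api_cancel'
        # ...one member per endpoint, as referenced throughout server.py.

        def __str__(self) -> str:
            return self.value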
@@ -731,7 +735,7 @@ async def check(request: fastapi.Request,
  """Checks enabled clouds."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='check',
+ request_name=request_names.RequestName.CHECK,
  request_body=check_body,
  func=sky_check.check,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -745,7 +749,7 @@ async def enabled_clouds(request: fastapi.Request,
  """Gets enabled clouds on the server."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='enabled_clouds',
+ request_name=request_names.RequestName.ENABLED_CLOUDS,
  request_body=payloads.EnabledCloudsBody(workspace=workspace,
  expand=expand),
  func=core.enabled_clouds,
@@ -761,7 +765,8 @@ async def realtime_kubernetes_gpu_availability(
  """Gets real-time Kubernetes GPU availability."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='realtime_kubernetes_gpu_availability',
+ request_name=request_names.RequestName.
+ REALTIME_KUBERNETES_GPU_AVAILABILITY,
  request_body=realtime_gpu_availability_body,
  func=core.realtime_kubernetes_gpu_availability,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -776,7 +781,7 @@ async def kubernetes_node_info(
  """Gets Kubernetes nodes information and hints."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='kubernetes_node_info',
+ request_name=request_names.RequestName.KUBERNETES_NODE_INFO,
  request_body=kubernetes_node_info_body,
  func=kubernetes_utils.get_kubernetes_node_info,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -788,7 +793,7 @@ async def status_kubernetes(request: fastapi.Request) -> None:
  """Gets Kubernetes status."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='status_kubernetes',
+ request_name=request_names.RequestName.STATUS_KUBERNETES,
  request_body=payloads.RequestBody(),
  func=core.status_kubernetes,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -802,7 +807,7 @@ async def list_accelerators(
  """Gets list of accelerators from cloud catalog."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='list_accelerators',
+ request_name=request_names.RequestName.LIST_ACCELERATORS,
  request_body=list_accelerator_counts_body,
  func=catalog.list_accelerators,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -817,7 +822,7 @@ async def list_accelerator_counts(
  """Gets list of accelerator counts from cloud catalog."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='list_accelerator_counts',
+ request_name=request_names.RequestName.LIST_ACCELERATOR_COUNTS,
  request_body=list_accelerator_counts_body,
  func=catalog.list_accelerator_counts,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -874,7 +879,7 @@ async def optimize(optimize_body: payloads.OptimizeBody,
  """Optimizes the user's DAG."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='optimize',
+ request_name=request_names.RequestName.OPTIMIZE,
  request_body=optimize_body,
  ignore_return_value=True,
  func=core.optimize,
@@ -1084,7 +1089,7 @@ async def launch(launch_body: payloads.LaunchBody,
  logger.info(f'Launching request: {request_id}')
  await executor.schedule_request_async(
  request_id,
- request_name='launch',
+ request_name=request_names.RequestName.CLUSTER_LAUNCH,
  request_body=launch_body,
  func=execution.launch,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1100,7 +1105,7 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
  cluster_name = exec_body.cluster_name
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='exec',
+ request_name=request_names.RequestName.CLUSTER_EXEC,
  request_body=exec_body,
  func=execution.exec,
  precondition=preconditions.ClusterStartCompletePrecondition(
@@ -1118,7 +1123,7 @@ async def stop(request: fastapi.Request,
  """Stops a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='stop',
+ request_name=request_names.RequestName.CLUSTER_STOP,
  request_body=stop_body,
  func=core.stop,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1138,7 +1143,7 @@ async def status(
  detail='Server is shutting down, please try again later.')
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='status',
+ request_name=request_names.RequestName.CLUSTER_STATUS,
  request_body=status_body,
  func=core.status,
  schedule_type=(requests_lib.ScheduleType.LONG if
@@ -1153,7 +1158,7 @@ async def endpoints(request: fastapi.Request,
  """Gets the endpoint for a given cluster and port number (endpoint)."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='endpoints',
+ request_name=request_names.RequestName.CLUSTER_ENDPOINTS,
  request_body=endpoint_body,
  func=core.endpoints,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1167,7 +1172,7 @@ async def down(request: fastapi.Request,
  """Tears down a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='down',
+ request_name=request_names.RequestName.CLUSTER_DOWN,
  request_body=down_body,
  func=core.down,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1181,7 +1186,7 @@ async def start(request: fastapi.Request,
  """Restarts a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='start',
+ request_name=request_names.RequestName.CLUSTER_START,
  request_body=start_body,
  func=core.start,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1195,7 +1200,7 @@ async def autostop(request: fastapi.Request,
  """Schedules an autostop/autodown for a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='autostop',
+ request_name=request_names.RequestName.CLUSTER_AUTOSTOP,
  request_body=autostop_body,
  func=core.autostop,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1209,7 +1214,7 @@ async def queue(request: fastapi.Request,
  """Gets the job queue of a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='queue',
+ request_name=request_names.RequestName.CLUSTER_QUEUE,
  request_body=queue_body,
  func=core.queue,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1223,7 +1228,7 @@ async def job_status(request: fastapi.Request,
  """Gets the status of a job."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='job_status',
+ request_name=request_names.RequestName.CLUSTER_JOB_STATUS,
  request_body=job_status_body,
  func=core.job_status,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1237,7 +1242,7 @@ async def cancel(request: fastapi.Request,
  """Cancels jobs on a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='cancel',
+ request_name=request_names.RequestName.CLUSTER_JOB_CANCEL,
  request_body=cancel_body,
  func=core.cancel,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1257,7 +1262,7 @@ async def logs(
  executor.check_request_thread_executor_available()
  request_task = await executor.prepare_request_async(
  request_id=request.state.request_id,
- request_name='logs',
+ request_name=request_names.RequestName.CLUSTER_JOB_LOGS,
  request_body=cluster_job_body,
  func=core.tail_logs,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1288,7 +1293,7 @@ async def download_logs(
  cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='download_logs',
+ request_name=request_names.RequestName.CLUSTER_JOB_DOWNLOAD_LOGS,
  request_body=cluster_jobs_body,
  func=core.download_logs,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1439,7 +1444,7 @@ async def cost_report(request: fastapi.Request,
  """Gets the cost report of a cluster."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='cost_report',
+ request_name=request_names.RequestName.CLUSTER_COST_REPORT,
  request_body=cost_report_body,
  func=core.cost_report,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1451,7 +1456,7 @@ async def storage_ls(request: fastapi.Request) -> None:
  """Gets the storages."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='storage_ls',
+ request_name=request_names.RequestName.STORAGE_LS,
  request_body=payloads.RequestBody(),
  func=core.storage_ls,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1464,7 +1469,7 @@ async def storage_delete(request: fastapi.Request,
  """Deletes a storage."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='storage_delete',
+ request_name=request_names.RequestName.STORAGE_DELETE,
  request_body=storage_body,
  func=core.storage_delete,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1477,7 +1482,7 @@ async def local_up(request: fastapi.Request,
  """Launches a Kubernetes cluster on API server."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='local_up',
+ request_name=request_names.RequestName.LOCAL_UP,
  request_body=local_up_body,
  func=core.local_up,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1490,7 +1495,7 @@ async def local_down(request: fastapi.Request,
  """Tears down the Kubernetes cluster started by local_up."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='local_down',
+ request_name=request_names.RequestName.LOCAL_DOWN,
  request_body=local_down_body,
  func=core.local_down,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1512,7 +1517,7 @@ async def get_expanded_request_id(request_id: str) -> str:


  # === API server related APIs ===
- @app.get('/api/get')
+ @app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
  async def api_get(request_id: str) -> payloads.RequestPayload:
  """Gets a request with a given request ID prefix."""
  # Validate request_id prefix matches a single request.
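The /api/get route now returns a fastapi_responses.ORJSONResponse, which serializes the payload with orjson (added to install_requires later in this diff) rather than the standard library json encoder; orjson is typically much faster for large request payloads. A minimal standalone sketch of the pattern follows; the handler body is hypothetical and unrelated to SkyPilot's actual /api/get logic.

    # Minimal sketch of a route served with ORJSONResponse.
    import fastapi
    from fastapi import responses as fastapi_responses

    app = fastapi.FastAPI()


    @app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
    async def api_get(request_id: str) -> dict:
        # FastAPI encodes the returned dict with orjson instead of json.dumps.
        return {'request_id': request_id, 'status': 'SUCCEEDED'}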
@@ -1698,7 +1703,7 @@ async def api_cancel(request: fastapi.Request,
  """Cancels requests."""
  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='api_cancel',
+ request_name=request_names.RequestName.API_CANCEL,
  request_body=request_cancel_body,
  func=requests_lib.kill_requests_with_prefix,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1803,16 +1808,31 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
  basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
  'false').lower() == 'true',
  user=user if user is not None else None,
+ service_account_token_enabled=(os.environ.get(
+ constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
+ 'false').lower() == 'true'),
  )


+ class KubernetesSSHMessageType(IntEnum):
+ REGULAR_DATA = 0
+ PINGPONG = 1
+ LATENCY_MEASUREMENT = 2
+
+
  @app.websocket('/kubernetes-pod-ssh-proxy')
- async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
- cluster_name: str) -> None:
+ async def kubernetes_pod_ssh_proxy(
+ websocket: fastapi.WebSocket,
+ cluster_name: str,
+ client_version: Optional[int] = None) -> None:
  """Proxies SSH to the Kubernetes pod with websocket."""
  await websocket.accept()
  logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')

+ timestamps_supported = client_version is not None and client_version > 21
+ logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
+ client_version = {client_version}')
+
  # Run core.status in another thread to avoid blocking the event loop.
  with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
  cluster_records = await context_utils.to_thread_with_executor(
@@ -1867,6 +1887,42 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
  async def websocket_to_ssh():
  try:
  async for message in websocket.iter_bytes():
+ if timestamps_supported:
+ type_size = struct.calcsize('!B')
+ message_type = struct.unpack('!B',
+ message[:type_size])[0]
+ if (message_type ==
+ KubernetesSSHMessageType.REGULAR_DATA):
+ # Regular data - strip type byte and forward to SSH
+ message = message[type_size:]
+ elif message_type == KubernetesSSHMessageType.PINGPONG:
+ # PING message - respond with PONG (type 1)
+ ping_id_size = struct.calcsize('!I')
+ if len(message) != type_size + ping_id_size:
+ raise ValueError('Invalid PING message '
+ f'length: {len(message)}')
+ # Return the same PING message, so that the client
+ # can measure the latency.
+ await websocket.send_bytes(message)
+ continue
+ elif (message_type ==
+ KubernetesSSHMessageType.LATENCY_MEASUREMENT):
+ # Latency measurement from client
+ latency_size = struct.calcsize('!Q')
+ if len(message) != type_size + latency_size:
+ raise ValueError(
+ 'Invalid latency measurement '
+ f'message length: {len(message)}')
+ avg_latency_ms = struct.unpack(
+ '!Q',
+ message[type_size:type_size + latency_size])[0]
+ latency_seconds = avg_latency_ms / 1000
+ metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds) # pylint: disable=line-too-long
+ continue
+ else:
+ # Unknown message type.
+ raise ValueError(
+ f'Unknown message type: {message_type}')
  writer.write(message)
  try:
  await writer.drain()
@@ -1897,6 +1953,11 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
  nonlocal ssh_failed
  ssh_failed = True
  break
+ if timestamps_supported:
+ # Prepend message type byte (0 = regular data)
+ message_type_bytes = struct.pack(
+ '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+ data = message_type_bytes + data
  await websocket.send_bytes(data)
  except Exception: # pylint: disable=broad-except
  pass
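When the client advertises a recent enough version, every frame on the SSH proxy websocket is prefixed with a one-byte message type: 0 for regular SSH data, 1 for a ping that the server echoes back so the client can measure round-trip latency, and 2 for the client's averaged latency report (an unsigned 64-bit integer in milliseconds, observed into metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS). A rough client-side sketch of this framing, using the same struct formats as the server code above; the helper names are illustrative only.

    # Illustrative client-side framing for the kubernetes-pod-ssh-proxy
    # websocket protocol shown above. Helper names are hypothetical.
    import struct

    REGULAR_DATA = 0
    PINGPONG = 1
    LATENCY_MEASUREMENT = 2


    def frame_data(payload: bytes) -> bytes:
        """Prefix regular SSH data with the type byte (0)."""
        return struct.pack('!B', REGULAR_DATA) + payload


    def frame_ping(ping_id: int) -> bytes:
        """Build a 5-byte ping frame; the server echoes it back unchanged."""
        return struct.pack('!BI', PINGPONG, ping_id)


    def frame_latency_report(avg_latency_ms: int) -> bytes:
        """Report the measured average latency in milliseconds (9 bytes)."""
        return struct.pack('!BQ', LATENCY_MEASUREMENT, avg_latency_ms)


    def unframe(message: bytes) -> bytes:
        """Strip the type byte from a frame received from the server."""
        message_type = struct.unpack('!B', message[:1])[0]
        if message_type != REGULAR_DATA:
            raise ValueError(f'Unexpected message type: {message_type}')
        return message[1:]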
@@ -1936,7 +1997,7 @@ async def all_contexts(request: fastapi.Request) -> None:

  await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='all_contexts',
+ request_name=request_names.RequestName.ALL_CONTEXTS,
  request_body=payloads.RequestBody(),
  func=core.get_all_contexts,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -2050,7 +2111,6 @@ if __name__ == '__main__':
  # Serve metrics on a separate port to isolate it from the application APIs:
  # metrics port will not be exposed to the public network typically.
  parser.add_argument('--metrics-port', default=9090, type=int)
- parser.add_argument('--start-with-python', action='store_true')
  cmd_args = parser.parse_args()
  if cmd_args.port == cmd_args.metrics_port:
  logger.error('port and metrics-port cannot be the same, exiting.')
@@ -2065,9 +2125,18 @@ if __name__ == '__main__':
  logger.error(f'Port {cmd_args.port} is not available, exiting.')
  raise RuntimeError(f'Port {cmd_args.port} is not available')

- if not cmd_args.start_with_python:
- # Maybe touch the signal file on API server startup.
- managed_job_utils.is_consolidation_mode(on_api_restart=True)
+ # Maybe touch the signal file on API server startup. Do it again here even
+ # if we already touched it in the sky/server/common.py::_start_api_server.
+ # This is because the sky/server/common.py::_start_api_server function call
+ # is running outside the skypilot API server process tree. The process tree
+ # starts within that function (see the `subprocess.Popen` call in
+ # sky/server/common.py::_start_api_server). When pg is used, the
+ # _start_api_server function will not load the config file from db, which
+ # will ignore the consolidation mode config. Here, inside the process tree,
+ # we already reload the config as a server (with env var _start_api_server),
+ # so we will respect the consolidation mode config.
+ # Refers to #7717 for more details.
+ managed_job_utils.is_consolidation_mode(on_api_restart=True)

  # Show the privacy policy if it is not already shown. We place it here so
  # that it is shown only when the API server is started.
sky/server/stream_utils.py CHANGED
@@ -25,6 +25,8 @@ logger = sky_logging.init_logger(__name__)
  _BUFFER_SIZE = 8 * 1024 # 8KB
  _BUFFER_TIMEOUT = 0.02 # 20ms
  _HEARTBEAT_INTERVAL = 30
+ _READ_CHUNK_SIZE = 256 * 1024 # 256KB chunks for file reading
+
  # If a SHORT request has been stuck in pending for
  # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
  _SHORT_REQUEST_SPINNER_TIMEOUT = 2
@@ -235,6 +237,9 @@ async def _tail_log_file(
  buffer_bytes = 0
  last_flush_time = asyncio.get_event_loop().time()

+ # Read file in chunks instead of line-by-line for better performance
+ incomplete_line = b'' # Buffer for incomplete lines across chunks
+
  async def flush_buffer() -> AsyncGenerator[str, None]:
  nonlocal buffer, buffer_bytes, last_flush_time
  if buffer:
@@ -255,8 +260,23 @@
  async for chunk in flush_buffer():
  yield chunk

- line: Optional[bytes] = await f.readline()
- if not line:
+ # Read file in chunks for better I/O performance
+ file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
+ if not file_chunk:
+ # Process any remaining incomplete line
+ if incomplete_line:
+ line_str = incomplete_line.decode('utf-8')
+ if plain_logs:
+ is_payload, line_str = message_utils.decode_payload(
+ line_str, raise_for_mismatch=False)
+ if not is_payload:
+ buffer.append(line_str)
+ buffer_bytes += len(line_str.encode('utf-8'))
+ else:
+ buffer.append(line_str)
+ buffer_bytes += len(line_str.encode('utf-8'))
+ incomplete_line = b''
+
  # Avoid checking the status too frequently to avoid overloading the
  # DB.
  should_check_status = (current_time -
@@ -328,16 +348,39 @@
  # performance but it helps avoid unnecessary heartbeat strings
  # being printed when the client runs in an old version.
  last_heartbeat_time = asyncio.get_event_loop().time()
- line_str = line.decode('utf-8')
- if plain_logs:
- is_payload, line_str = message_utils.decode_payload(
- line_str, raise_for_mismatch=False)
- # TODO(aylei): implement heartbeat mechanism for plain logs,
- # sending invisible characters might be okay.
- if is_payload:
- continue
- buffer.append(line_str)
- buffer_bytes += len(line_str.encode('utf-8'))
+
+ # Combine with any incomplete line from previous chunk
+ file_chunk = incomplete_line + file_chunk
+ incomplete_line = b''
+
+ # Split chunk into lines, preserving line structure
+ lines_bytes = file_chunk.split(b'\n')
+
+ # If chunk doesn't end with newline, the last element is incomplete
+ if file_chunk and not file_chunk.endswith(b'\n'):
+ incomplete_line = lines_bytes[-1]
+ lines_bytes = lines_bytes[:-1]
+ else:
+ # If ends with \n, split creates an empty last element we should
+ # ignore
+ if lines_bytes and lines_bytes[-1] == b'':
+ lines_bytes = lines_bytes[:-1]
+
+ # Process all complete lines in this chunk
+ for line_bytes in lines_bytes:
+ # Reconstruct line with newline (since split removed it)
+ line_str = line_bytes.decode('utf-8') + '\n'
+
+ if plain_logs:
+ is_payload, line_str = message_utils.decode_payload(
+ line_str, raise_for_mismatch=False)
+ # TODO(aylei): implement heartbeat mechanism for plain logs,
+ # sending invisible characters might be okay.
+ if is_payload:
+ continue
+
+ buffer.append(line_str)
+ buffer_bytes += len(line_str.encode('utf-8'))

  # Flush remaining lines in the buffer.
  async for chunk in flush_buffer():
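The log streamer now reads the file in 256 KB chunks instead of line by line, carrying any trailing partial line over to the next read in incomplete_line. The carry-over logic can be seen in isolation in the following sketch; the helper name is illustrative, not part of the diff.

    # Illustrative, self-contained version of the chunk-splitting logic above:
    # split a chunk into complete lines and carry any trailing partial line
    # over to the next call. The function name is hypothetical.
    from typing import List, Tuple


    def split_chunk(carry: bytes, chunk: bytes) -> Tuple[List[bytes], bytes]:
        """Returns (complete_lines, new_carry) for a freshly read chunk."""
        data = carry + chunk
        lines = data.split(b'\n')
        if data and not data.endswith(b'\n'):
            # Last element is an incomplete line; keep it for the next chunk.
            return lines[:-1], lines[-1]
        # A trailing newline leaves an empty last element after split().
        if lines and lines[-1] == b'':
            lines = lines[:-1]
        return lines, b''


    # Example: two reads that split a line across the chunk boundary.
    lines, carry = split_chunk(b'', b'first line\nsecond ')
    assert lines == [b'first line'] and carry == b'second '
    lines, carry = split_chunk(carry, b'half\n')
    assert lines == [b'second half'] and carry == b''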
@@ -373,7 +416,7 @@ def stream_response(
  async def on_disconnect():
  logger.info(f'User terminated the connection for request '
  f'{request_id}')
- requests_lib.kill_requests([request_id])
+ await requests_lib.kill_request_async(request_id)

  # The background task will be run after returning a response.
  # https://fastapi.tiangolo.com/tutorial/background-tasks/
sky/setup_files/dependencies.py CHANGED
@@ -49,6 +49,7 @@ install_requires = [
  # <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
  'pyyaml > 3.13, != 5.4.*',
  'ijson',
+ 'orjson',
  'requests',
  # SkyPilot inherits from uvicorn.Server to customize the behavior of
  # uvicorn, so we need to pin uvicorn version to avoid potential break
@@ -187,6 +188,7 @@ cloud_dependencies: Dict[str, List[str]] = {
  'docker': ['docker'] + local_ray,
  'lambda': [], # No dependencies needed for lambda
  'cloudflare': aws_dependencies,
+ 'coreweave': aws_dependencies,
  'scp': local_ray,
  'oci': ['oci'],
  # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
sky/skylet/constants.py CHANGED
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
  # cluster yaml is updated.
  #
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
- SKYLET_VERSION = '23'
+ SKYLET_VERSION = '25'
  # The version of the lib files that skylet/jobs use. Whenever there is an API
  # change for the job_lib or log_lib, we need to bump this version, so that the
  # user can be notified to update their SkyPilot version on the remote cluster.
@@ -422,6 +422,8 @@ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
  # but the configs won't be applied)
  ('jobs', 'controller', 'consolidation_mode'),
  ('serve', 'controller', 'consolidation_mode'),
+ ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
+ ('jobs', 'controller', 'task_logs_gc_retention_hours'),
  ]

  # Constants for Azure blob storage
@@ -548,3 +550,6 @@ ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +

  ARM64_ARCH = 'arm64'
  X86_64_ARCH = 'x86_64'
+
+ SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
+ f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
sky/skylet/events.py CHANGED
@@ -326,8 +326,15 @@ class AutostopEvent(SkyletEvent):
  cluster_name_on_cloud = cluster_config['cluster_name']
  is_cluster_multinode = cluster_config['max_workers'] > 0

+ # Clear AWS credentials from environment to force boto3 to use IAM
+ # role attached to the instance (lowest priority in credential chain).
+ # This allows the cluster to stop/terminate itself using its IAM role.
  os.environ.pop('AWS_ACCESS_KEY_ID', None)
  os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
+ os.environ.pop('AWS_SESSION_TOKEN', None)
+ # Point boto3 to /dev/null to skip reading credentials from files.
+ os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
+ os.environ['AWS_CONFIG_FILE'] = '/dev/null'

  # Stop the ray autoscaler to avoid scaling up, during
  # stopping/terminating of the cluster.
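The autostop event now scrubs every AWS credential source that outranks the instance profile in boto3's credential chain (environment variables, shared credentials file, config file), so that a subsequent boto3 client resolves credentials from the instance metadata service and the cluster can stop or terminate itself with its attached IAM role. A minimal sketch of the effect, assuming the code runs on an EC2 instance with such a role attached; the region is an arbitrary example.

    # Illustrative sketch: force a boto3 client to fall back to the instance
    # profile. Assumes it runs on an EC2 instance with an IAM role attached.
    import os

    import boto3

    # Remove the credential sources that outrank the instance profile.
    for var in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY',
                'AWS_SESSION_TOKEN'):
        os.environ.pop(var, None)
    # Point the file-based providers at /dev/null so they find nothing.
    os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
    os.environ['AWS_CONFIG_FILE'] = '/dev/null'

    # boto3 resolves credentials lazily, so this client now falls through to
    # the instance metadata service, i.e. the attached IAM role.
    ec2 = boto3.client('ec2', region_name='us-east-1')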
sky/skylet/services.py CHANGED
@@ -407,7 +407,9 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
  context: grpc.ServicerContext
  ) -> managed_jobsv1_pb2.GetJobTableResponse:
  try:
- accessible_workspaces = list(request.accessible_workspaces)
+ accessible_workspaces = (
+ list(request.accessible_workspaces.workspaces)
+ if request.HasField('accessible_workspaces') else None)
  job_ids = (list(request.job_ids.ids)
  if request.HasField('job_ids') else None)
  user_hashes: Optional[List[Optional[str]]] = None
@@ -419,6 +421,8 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
  user_hashes.append(None)
  statuses = (list(request.statuses.statuses)
  if request.HasField('statuses') else None)
+ fields = (list(request.fields.fields)
+ if request.HasField('fields') else None)
  job_queue = managed_job_utils.get_managed_job_queue(
  skip_finished=request.skip_finished,
  accessible_workspaces=accessible_workspaces,
@@ -432,7 +436,9 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
  page=request.page if request.HasField('page') else None,
  limit=request.limit if request.HasField('limit') else None,
  user_hashes=user_hashes,
- statuses=statuses)
+ statuses=statuses,
+ fields=fields,
+ )
  jobs = job_queue['jobs']
  total = job_queue['total']
  total_no_filter = job_queue['total_no_filter']
@@ -440,7 +446,16 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer

  jobs_info = []
  for job in jobs:
+ converted_metadata = None
+ metadata = job.get('metadata')
+ if metadata:
+ converted_metadata = {
+ k: v for k, v in metadata.items() if v is not None
+ }
  job_info = managed_jobsv1_pb2.ManagedJobInfo(
+ # The `spot.job_id`, which can be used to identify
+ # different tasks for the same job
+ _job_id=job.get('_job_id'),
  job_id=job.get('job_id'),
  task_id=job.get('task_id'),
  job_name=job.get('job_name'),
@@ -468,11 +483,7 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
  end_at=job.get('end_at'),
  user_yaml=job.get('user_yaml'),
  entrypoint=job.get('entrypoint'),
- metadata={
- k: v
- for k, v in job.get('metadata', {}).items()
- if v is not None
- },
+ metadata=converted_metadata,
  pool=job.get('pool'),
  pool_hash=job.get('pool_hash'))
  jobs_info.append(job_info)
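The gRPC handler now distinguishes "filter not set" from "filter set to an empty list" by wrapping each repeated filter in an optional message and checking HasField before converting it to a Python list (accessible_workspaces.workspaces, job_ids.ids, statuses.statuses, fields.fields). A small helper capturing that pattern might look like the following sketch; the helper itself is not part of the diff.

    # Hypothetical helper for the HasField-guarded list extraction used above.
    from typing import Any, List, Optional


    def optional_repeated(request: Any, field: str,
                          attr: str) -> Optional[List]:
        """Return the wrapped repeated field as a list, or None if unset.

        None means "no filter"; an empty list means "filter that matches
        nothing", which is why a plain repeated field is not enough.
        """
        if not request.HasField(field):
            return None
        return list(getattr(getattr(request, field), attr))


    # e.g. accessible_workspaces = optional_repeated(
    #     request, 'accessible_workspaces', 'workspaces')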