skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -11,6 +11,7 @@ import fastapi
11
11
  from sky import global_user_state
12
12
  from sky import sky_logging
13
13
  from sky.server.requests import requests as requests_lib
14
+ from sky.utils import common_utils
14
15
  from sky.utils import message_utils
15
16
  from sky.utils import rich_utils
16
17
  from sky.utils import status_lib
@@ -24,7 +25,22 @@ logger = sky_logging.init_logger(__name__)
24
25
  _BUFFER_SIZE = 8 * 1024 # 8KB
25
26
  _BUFFER_TIMEOUT = 0.02 # 20ms
26
27
  _HEARTBEAT_INTERVAL = 30
27
- _CLUSTER_STATUS_INTERVAL = 1
28
+ _READ_CHUNK_SIZE = 256 * 1024 # 256KB chunks for file reading
29
+
30
+ # If a SHORT request has been stuck in pending for
31
+ # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
32
+ _SHORT_REQUEST_SPINNER_TIMEOUT = 2
33
+ # If there is an issue during provisioning that causes the cluster to be stuck
34
+ # in INIT state, we use this timeout to break the loop and stop streaming
35
+ # provision logs.
36
+ _PROVISION_LOG_TIMEOUT = 3
37
+ # Maximum time to wait for new log files to appear when streaming worker node
38
+ # provision logs. Worker logs are created sequentially during the provisioning
39
+ # process, so we need to wait for new files to appear.
40
+ _MAX_WAIT_FOR_NEW_LOG_FILES = 3 # seconds
41
+
42
+ LONG_REQUEST_POLL_INTERVAL = 1
43
+ DEFAULT_POLL_INTERVAL = 0.1
28
44
 
29
45
 
30
46
  async def _yield_log_file_with_payloads_skipped(
@@ -41,18 +57,22 @@ async def _yield_log_file_with_payloads_skipped(
41
57
 
42
58
 
43
59
  async def log_streamer(
44
- request_id: Optional[str],
45
- log_path: pathlib.Path,
46
- plain_logs: bool = False,
47
- tail: Optional[int] = None,
48
- follow: bool = True,
49
- cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
60
+ request_id: Optional[str],
61
+ log_path: Optional[pathlib.Path] = None,
62
+ plain_logs: bool = False,
63
+ tail: Optional[int] = None,
64
+ follow: bool = True,
65
+ cluster_name: Optional[str] = None,
66
+ polling_interval: float = DEFAULT_POLL_INTERVAL
67
+ ) -> AsyncGenerator[str, None]:
50
68
  """Streams the logs of a request.
51
69
 
52
70
  Args:
53
71
  request_id: The request ID to check whether the log tailing process
54
72
  should be stopped.
55
- log_path: The path to the log file.
73
+ log_path: The path to the log file or directory containing the log
74
+ files. If it is a directory, all *.log files in the directory will be
75
+ streamed.
56
76
  plain_logs: Whether to show plain logs.
57
77
  tail: The number of lines to tail. If None, tail the whole file.
58
78
  follow: Whether to follow the log file.
@@ -61,17 +81,26 @@ async def log_streamer(
61
81
  """
62
82
 
63
83
  if request_id is not None:
84
+ start_time = asyncio.get_event_loop().time()
64
85
  status_msg = rich_utils.EncodedStatusMessage(
65
86
  f'[dim]Checking request: {request_id}[/dim]')
66
- request_task = await requests_lib.get_request_async(request_id)
87
+ request_task = await requests_lib.get_request_async(request_id,
88
+ fields=[
89
+ 'request_id',
90
+ 'name',
91
+ 'schedule_type',
92
+ 'status',
93
+ 'status_msg'
94
+ ])
67
95
 
68
96
  if request_task is None:
69
97
  raise fastapi.HTTPException(
70
98
  status_code=404, detail=f'Request {request_id} not found')
71
99
  request_id = request_task.request_id
72
100
 
73
- # Do not show the waiting spinner if the request is a fast, non-blocking
74
- # request.
101
+ # By default, do not show the waiting spinner for SHORT requests.
102
+ # If the request has been stuck in pending for
103
+ # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
75
104
  show_request_waiting_spinner = (not plain_logs and
76
105
  request_task.schedule_type
77
106
  == requests_lib.ScheduleType.LONG)
@@ -84,9 +113,23 @@ async def log_streamer(
84
113
  f'scheduled: {request_id}')
85
114
  req_status = request_task.status
86
115
  req_msg = request_task.status_msg
116
+ del request_task
117
+ # Slowly back off the database polling up to every 1 second, to avoid
118
+ # overloading the CPU and DB.
119
+ backoff = common_utils.Backoff(initial_backoff=polling_interval,
120
+ max_backoff_factor=10,
121
+ multiplier=1.2)
87
122
  while req_status < requests_lib.RequestStatus.RUNNING:
123
+ current_time = asyncio.get_event_loop().time()
124
+ # Show the waiting spinner for a SHORT request if it has been stuck
125
+ # in pending for _SHORT_REQUEST_SPINNER_TIMEOUT seconds
126
+ if not show_request_waiting_spinner and (
127
+ current_time - start_time > _SHORT_REQUEST_SPINNER_TIMEOUT):
128
+ show_request_waiting_spinner = True
129
+ yield status_msg.init()
130
+ yield status_msg.start()
88
131
  if req_msg is not None:
89
- waiting_msg = request_task.status_msg
132
+ waiting_msg = req_msg
90
133
  if show_request_waiting_spinner:
91
134
  yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
92
135
  elif plain_logs and waiting_msg != last_waiting_msg:
@@ -99,7 +142,7 @@ async def log_streamer(
99
142
  # TODO(aylei): we should use a better mechanism to avoid busy
100
143
  # polling the DB, which can be a bottleneck for high-concurrency
101
144
  # requests.
102
- await asyncio.sleep(0.1)
145
+ await asyncio.sleep(backoff.current_backoff())
103
146
  status_with_msg = await requests_lib.get_request_status_async(
104
147
  request_id, include_msg=True)
105
148
  req_status = status_with_msg.status
@@ -109,19 +152,68 @@ async def log_streamer(
109
152
  if show_request_waiting_spinner:
110
153
  yield status_msg.stop()
111
154
 
112
- async with aiofiles.open(log_path, 'rb') as f:
113
- async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
114
- follow, cluster_name):
115
- yield chunk
155
+ if log_path is not None and log_path.is_dir():
156
+ # Track which log files we've already streamed
157
+ streamed_files = set()
158
+ no_new_files_count = 0
159
+
160
+ while True:
161
+ # Get all *.log files in the log_path
162
+ log_files = sorted(log_path.glob('*.log'))
163
+
164
+ # Filter out already streamed files
165
+ new_files = [f for f in log_files if f not in streamed_files]
166
+
167
+ if len(new_files) == 0:
168
+ if not follow:
169
+ break
170
+ # Wait a bit to see if new files appear
171
+ await asyncio.sleep(0.5)
172
+ no_new_files_count += 1
173
+ # Check if we've waited too long for new files
174
+ if no_new_files_count > _MAX_WAIT_FOR_NEW_LOG_FILES * 2:
175
+ break
176
+ continue
177
+
178
+ # Reset the no-new-files counter when we find new files
179
+ no_new_files_count = 0
180
+
181
+ for log_file_path in new_files:
182
+ # Add header before each file (similar to tail -f behavior)
183
+ header = f'\n==> {log_file_path} <==\n\n'
184
+ yield header
185
+
186
+ async with aiofiles.open(log_file_path, 'rb') as f:
187
+ async for chunk in _tail_log_file(f, request_id, plain_logs,
188
+ tail, follow,
189
+ cluster_name,
190
+ polling_interval):
191
+ yield chunk
192
+
193
+ # Mark this file as streamed
194
+ streamed_files.add(log_file_path)
195
+
196
+ # If not following, break after streaming all current files
197
+ if not follow:
198
+ break
199
+ else:
200
+ assert log_path is not None, (request_id, log_path)
201
+ async with aiofiles.open(log_path, 'rb') as f:
202
+ async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
203
+ follow, cluster_name,
204
+ polling_interval):
205
+ yield chunk
116
206
 
117
207
 
118
208
  async def _tail_log_file(
119
- f: aiofiles.threadpool.binary.AsyncBufferedReader,
120
- request_id: Optional[str] = None,
121
- plain_logs: bool = False,
122
- tail: Optional[int] = None,
123
- follow: bool = True,
124
- cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
209
+ f: aiofiles.threadpool.binary.AsyncBufferedReader,
210
+ request_id: Optional[str] = None,
211
+ plain_logs: bool = False,
212
+ tail: Optional[int] = None,
213
+ follow: bool = True,
214
+ cluster_name: Optional[str] = None,
215
+ polling_interval: float = DEFAULT_POLL_INTERVAL
216
+ ) -> AsyncGenerator[str, None]:
125
217
  """Tail the opened log file, buffer the lines and flush in chunks."""
126
218
 
127
219
  if tail is not None:
@@ -137,7 +229,7 @@ async def _tail_log_file(
137
229
  yield line_str
138
230
 
139
231
  last_heartbeat_time = asyncio.get_event_loop().time()
140
- last_cluster_status_check_time = asyncio.get_event_loop().time()
232
+ last_status_check_time = asyncio.get_event_loop().time()
141
233
 
142
234
  # Buffer the lines in memory and flush them in chunks to improve log
143
235
  # tailing throughput.
@@ -145,6 +237,9 @@ async def _tail_log_file(
145
237
  buffer_bytes = 0
146
238
  last_flush_time = asyncio.get_event_loop().time()
147
239
 
240
+ # Read file in chunks instead of line-by-line for better performance
241
+ incomplete_line = b'' # Buffer for incomplete lines across chunks
242
+
148
243
  async def flush_buffer() -> AsyncGenerator[str, None]:
149
244
  nonlocal buffer, buffer_bytes, last_flush_time
150
245
  if buffer:
@@ -165,16 +260,41 @@ async def _tail_log_file(
165
260
  async for chunk in flush_buffer():
166
261
  yield chunk
167
262
 
168
- line: Optional[bytes] = await f.readline()
169
- if not line:
170
- if request_id is not None:
263
+ # Read file in chunks for better I/O performance
264
+ file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
265
+ if not file_chunk:
266
+ # Process any remaining incomplete line
267
+ if incomplete_line:
268
+ line_str = incomplete_line.decode('utf-8')
269
+ if plain_logs:
270
+ is_payload, line_str = message_utils.decode_payload(
271
+ line_str, raise_for_mismatch=False)
272
+ if not is_payload:
273
+ buffer.append(line_str)
274
+ buffer_bytes += len(line_str.encode('utf-8'))
275
+ else:
276
+ buffer.append(line_str)
277
+ buffer_bytes += len(line_str.encode('utf-8'))
278
+ incomplete_line = b''
279
+
280
+ # Avoid checking the status too frequently to avoid overloading the
281
+ # DB.
282
+ should_check_status = (current_time -
283
+ last_status_check_time) >= polling_interval
284
+ if not follow:
285
+ # We will only hit this path once, but we should make sure to
286
+ # check the status so that we display the final request status
287
+ # if the request is complete.
288
+ should_check_status = True
289
+ if request_id is not None and should_check_status:
290
+ last_status_check_time = current_time
171
291
  req_status = await requests_lib.get_request_status_async(
172
292
  request_id)
173
293
  if req_status.status > requests_lib.RequestStatus.RUNNING:
174
294
  if (req_status.status ==
175
295
  requests_lib.RequestStatus.CANCELLED):
176
296
  request_task = await requests_lib.get_request_async(
177
- request_id)
297
+ request_id, fields=['name', 'should_retry'])
178
298
  if request_task.should_retry:
179
299
  buffer.append(
180
300
  message_utils.encode_payload(
@@ -183,22 +303,32 @@ async def _tail_log_file(
183
303
  buffer.append(
184
304
  f'{request_task.name!r} request {request_id}'
185
305
  ' cancelled\n')
306
+ del request_task
186
307
  break
187
308
  if not follow:
309
+ # The below checks (cluster status, heartbeat) are not needed
310
+ # for non-follow logs.
188
311
  break
189
312
  # Provision logs pass in cluster_name, check cluster status
190
- # periodically to see if provisioning is done. We only
191
- # check once a second to avoid overloading the DB.
192
- check_status = (current_time - last_cluster_status_check_time
193
- ) >= _CLUSTER_STATUS_INTERVAL
194
- if cluster_name is not None and check_status:
195
- cluster_record = await (
196
- global_user_state.get_status_from_cluster_name_async(
197
- cluster_name))
198
- if (cluster_record is None or
199
- cluster_record != status_lib.ClusterStatus.INIT):
313
+ # periodically to see if provisioning is done.
314
+ if cluster_name is not None:
315
+ if current_time - last_flush_time > _PROVISION_LOG_TIMEOUT:
200
316
  break
201
- last_cluster_status_check_time = current_time
317
+ if should_check_status:
318
+ last_status_check_time = current_time
319
+ cluster_status = await (
320
+ global_user_state.get_status_from_cluster_name_async(
321
+ cluster_name))
322
+ if cluster_status is None:
323
+ logger.debug(
324
+ 'Stop tailing provision logs for cluster'
325
+ f' status for cluster {cluster_name} not found')
326
+ break
327
+ if cluster_status != status_lib.ClusterStatus.INIT:
328
+ logger.debug(
329
+ f'Stop tailing provision logs for cluster'
330
+ f' {cluster_name} has status {cluster_status} '
331
+ '(not in INIT state)')
202
332
  if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
203
333
  # Currently just used to keep the connection busy, refer to
204
334
  # https://github.com/skypilot-org/skypilot/issues/5750 for
@@ -218,38 +348,82 @@ async def _tail_log_file(
218
348
  # performance but it helps avoid unnecessary heartbeat strings
219
349
  # being printed when the client runs in an old version.
220
350
  last_heartbeat_time = asyncio.get_event_loop().time()
221
- line_str = line.decode('utf-8')
222
- if plain_logs:
223
- is_payload, line_str = message_utils.decode_payload(
224
- line_str, raise_for_mismatch=False)
225
- # TODO(aylei): implement heartbeat mechanism for plain logs,
226
- # sending invisible characters might be okay.
227
- if is_payload:
228
- continue
229
- buffer.append(line_str)
230
- buffer_bytes += len(line_str.encode('utf-8'))
351
+
352
+ # Combine with any incomplete line from previous chunk
353
+ file_chunk = incomplete_line + file_chunk
354
+ incomplete_line = b''
355
+
356
+ # Split chunk into lines, preserving line structure
357
+ lines_bytes = file_chunk.split(b'\n')
358
+
359
+ # If chunk doesn't end with newline, the last element is incomplete
360
+ if file_chunk and not file_chunk.endswith(b'\n'):
361
+ incomplete_line = lines_bytes[-1]
362
+ lines_bytes = lines_bytes[:-1]
363
+ else:
364
+ # If ends with \n, split creates an empty last element we should
365
+ # ignore
366
+ if lines_bytes and lines_bytes[-1] == b'':
367
+ lines_bytes = lines_bytes[:-1]
368
+
369
+ # Process all complete lines in this chunk
370
+ for line_bytes in lines_bytes:
371
+ # Reconstruct line with newline (since split removed it)
372
+ line_str = line_bytes.decode('utf-8') + '\n'
373
+
374
+ if plain_logs:
375
+ is_payload, line_str = message_utils.decode_payload(
376
+ line_str, raise_for_mismatch=False)
377
+ # TODO(aylei): implement heartbeat mechanism for plain logs,
378
+ # sending invisible characters might be okay.
379
+ if is_payload:
380
+ continue
381
+
382
+ buffer.append(line_str)
383
+ buffer_bytes += len(line_str.encode('utf-8'))
231
384
 
232
385
  # Flush remaining lines in the buffer.
233
386
  async for chunk in flush_buffer():
234
387
  yield chunk
235
388
 
236
389
 
390
+ def stream_response_for_long_request(
391
+ request_id: str,
392
+ logs_path: pathlib.Path,
393
+ background_tasks: fastapi.BackgroundTasks,
394
+ kill_request_on_disconnect: bool = True,
395
+ ) -> fastapi.responses.StreamingResponse:
396
+ """Stream the logs of a long request."""
397
+ return stream_response(
398
+ request_id,
399
+ logs_path,
400
+ background_tasks,
401
+ polling_interval=LONG_REQUEST_POLL_INTERVAL,
402
+ kill_request_on_disconnect=kill_request_on_disconnect,
403
+ )
404
+
405
+
237
406
  def stream_response(
238
- request_id: str, logs_path: pathlib.Path,
239
- background_tasks: fastapi.BackgroundTasks
407
+ request_id: str,
408
+ logs_path: pathlib.Path,
409
+ background_tasks: fastapi.BackgroundTasks,
410
+ polling_interval: float = DEFAULT_POLL_INTERVAL,
411
+ kill_request_on_disconnect: bool = True,
240
412
  ) -> fastapi.responses.StreamingResponse:
241
413
 
242
- async def on_disconnect():
243
- logger.info(f'User terminated the connection for request '
244
- f'{request_id}')
245
- requests_lib.kill_requests([request_id])
414
+ if kill_request_on_disconnect:
415
+
416
+ async def on_disconnect():
417
+ logger.info(f'User terminated the connection for request '
418
+ f'{request_id}')
419
+ await requests_lib.kill_request_async(request_id)
246
420
 
247
- # The background task will be run after returning a response.
248
- # https://fastapi.tiangolo.com/tutorial/background-tasks/
249
- background_tasks.add_task(on_disconnect)
421
+ # The background task will be run after returning a response.
422
+ # https://fastapi.tiangolo.com/tutorial/background-tasks/
423
+ background_tasks.add_task(on_disconnect)
250
424
 
251
425
  return fastapi.responses.StreamingResponse(
252
- log_streamer(request_id, logs_path),
426
+ log_streamer(request_id, logs_path, polling_interval=polling_interval),
253
427
  media_type='text/plain',
254
428
  headers={
255
429
  'Cache-Control': 'no-cache, no-transform',
sky/server/uvicorn.py CHANGED
@@ -46,11 +46,11 @@ except ValueError:
46
46
 
47
47
  # TODO(aylei): use decorator to register requests that need to be proactively
48
48
  # cancelled instead of hardcoding here.
49
- _RETRIABLE_REQUEST_NAMES = [
49
+ _RETRIABLE_REQUEST_NAMES = {
50
50
  'sky.logs',
51
51
  'sky.jobs.logs',
52
52
  'sky.serve.logs',
53
- ]
53
+ }
54
54
 
55
55
 
56
56
  def add_timestamp_prefix_for_server_logs() -> None:
@@ -151,37 +151,38 @@ class Server(uvicorn.Server):
151
151
  requests_lib.RequestStatus.PENDING,
152
152
  requests_lib.RequestStatus.RUNNING,
153
153
  ]
154
- reqs = requests_lib.get_request_tasks(
155
- req_filter=requests_lib.RequestTaskFilter(status=statuses))
156
- if not reqs:
154
+ requests = [(request_task.request_id, request_task.name)
155
+ for request_task in requests_lib.get_request_tasks(
156
+ req_filter=requests_lib.RequestTaskFilter(
157
+ status=statuses, fields=['request_id', 'name']))
158
+ ]
159
+ if not requests:
157
160
  break
158
- logger.info(f'{len(reqs)} on-going requests '
161
+ logger.info(f'{len(requests)} on-going requests '
159
162
  'found, waiting for them to finish...')
160
163
  # Proactively cancel internal requests and logs requests since
161
164
  # they can run for infinite time.
162
- internal_request_ids = [
165
+ internal_request_ids = {
163
166
  d.id for d in daemons.INTERNAL_REQUEST_DAEMONS
164
- ]
167
+ }
165
168
  if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
166
169
  logger.warning('Timeout waiting for on-going requests to '
167
170
  'finish, cancelling all on-going requests.')
168
- for req in reqs:
169
- self.interrupt_request_for_retry(req.request_id)
171
+ for request_id, _ in requests:
172
+ self.interrupt_request_for_retry(request_id)
170
173
  break
171
174
  interrupted = 0
172
- for req in reqs:
173
- if req.request_id in internal_request_ids:
174
- self.interrupt_request_for_retry(req.request_id)
175
- interrupted += 1
176
- elif req.name in _RETRIABLE_REQUEST_NAMES:
177
- self.interrupt_request_for_retry(req.request_id)
175
+ for request_id, name in requests:
176
+ if (name in _RETRIABLE_REQUEST_NAMES or
177
+ request_id in internal_request_ids):
178
+ self.interrupt_request_for_retry(request_id)
178
179
  interrupted += 1
179
180
  # TODO(aylei): interrupt pending requests to accelerate the
180
181
  # shutdown.
181
182
  # If some requests are not interrupted, wait for them to finish,
182
183
  # otherwise we just check again immediately to accelerate the
183
184
  # shutdown process.
184
- if interrupted < len(reqs):
185
+ if interrupted < len(requests):
185
186
  time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)
186
187
 
187
188
  def interrupt_request_for_retry(self, request_id: str) -> None:
@@ -98,6 +98,10 @@ version_table = alembic_version_spot_jobs_db
98
98
  version_locations = %(here)s/../schemas/db/serve_state
99
99
  version_table = alembic_version_serve_state_db
100
100
 
101
+ [sky_config_db]
102
+ version_locations = %(here)s/../schemas/db/skypilot_config
103
+ version_table = alembic_version_sky_config_db
104
+
101
105
  [post_write_hooks]
102
106
  # post_write_hooks defines scripts or Python functions that are run
103
107
  # on newly generated revision scripts. See the documentation for further
@@ -49,6 +49,7 @@ install_requires = [
49
49
  # <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
50
50
  'pyyaml > 3.13, != 5.4.*',
51
51
  'ijson',
52
+ 'orjson',
52
53
  'requests',
53
54
  # SkyPilot inherits from uvicorn.Server to customize the behavior of
54
55
  # uvicorn, so we need to pin uvicorn version to avoid potential break
@@ -86,7 +87,6 @@ install_requires = [
86
87
  'types-paramiko',
87
88
  'alembic',
88
89
  'aiohttp',
89
- 'aiosqlite',
90
90
  'anyio',
91
91
  ]
92
92
 
@@ -104,6 +104,10 @@ GRPC = 'grpcio>=1.63.0'
104
104
  PROTOBUF = 'protobuf>=5.26.1, < 7.0.0'
105
105
 
106
106
  server_dependencies = [
107
+ # TODO: Some of these dependencies are also specified in install_requires,
108
+ # so they are redundant here. We should figure out if they are only needed
109
+ # on the server (should remove from install_requires), or if they are needed
110
+ # on the client (should remove from here).
107
111
  'casbin',
108
112
  'sqlalchemy_adapter',
109
113
  'passlib',
@@ -144,11 +148,19 @@ aws_dependencies = [
144
148
  'colorama < 0.4.5',
145
149
  ]
146
150
 
151
+ # Kubernetes 32.0.0 has an authentication bug:
152
+ # https://github.com/kubernetes-client/python/issues/2333
153
+ kubernetes_dependencies = [
154
+ 'kubernetes>=20.0.0,!=32.0.0',
155
+ 'websockets',
156
+ 'python-dateutil',
157
+ ]
158
+
147
159
  # azure-cli cannot be installed normally by uv, so we need to work around it in
148
160
  # a few places.
149
161
  AZURE_CLI = 'azure-cli>=2.65.0'
150
162
 
151
- extras_require: Dict[str, List[str]] = {
163
+ cloud_dependencies: Dict[str, List[str]] = {
152
164
  'aws': aws_dependencies,
153
165
  # TODO(zongheng): azure-cli is huge and takes a long time to install.
154
166
  # Tracked in: https://github.com/Azure/azure-cli/issues/7387
@@ -184,14 +196,11 @@ extras_require: Dict[str, List[str]] = {
184
196
  'docker': ['docker'] + local_ray,
185
197
  'lambda': [], # No dependencies needed for lambda
186
198
  'cloudflare': aws_dependencies,
199
+ 'coreweave': aws_dependencies + kubernetes_dependencies,
187
200
  'scp': local_ray,
188
201
  'oci': ['oci'],
189
- # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
190
- 'kubernetes': [
191
- 'kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'
192
- ],
193
- 'ssh': ['kubernetes>=20.0.0,!=32.0.0', 'websockets', 'python-dateutil'],
194
- 'remote': remote,
202
+ 'kubernetes': kubernetes_dependencies,
203
+ 'ssh': kubernetes_dependencies,
195
204
  # For the container registry auth api. Reference:
196
205
  # https://github.com/runpod/runpod-python/releases/tag/1.6.1
197
206
  # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
@@ -221,12 +230,11 @@ extras_require: Dict[str, List[str]] = {
221
230
  ] + aws_dependencies,
222
231
  'hyperbolic': [], # No dependencies needed for hyperbolic
223
232
  'seeweb': ['ecsapi>=0.2.0'],
224
- 'server': server_dependencies,
233
+ 'shadeform': [], # No dependencies needed for shadeform
225
234
  }
226
235
 
227
236
  # Calculate which clouds should be included in the [all] installation.
228
- clouds_for_all = set(extras_require)
229
- clouds_for_all.remove('remote')
237
+ clouds_for_all = set(cloud_dependencies)
230
238
 
231
239
  if sys.version_info < (3, 10):
232
240
  # Nebius needs python3.10. If python 3.9 [all] will not install nebius
@@ -241,5 +249,16 @@ if sys.version_info >= (3, 12):
241
249
  # TODO: Remove once https://github.com/vast-ai/vast-sdk/pull/6 is released
242
250
  clouds_for_all.remove('vast')
243
251
 
244
- extras_require['all'] = list(
245
- set().union(*[extras_require[cloud] for cloud in clouds_for_all]))
252
+ cloud_extras = {
253
+ cloud: dependencies + server_dependencies
254
+ for cloud, dependencies in cloud_dependencies.items()
255
+ }
256
+
257
+ extras_require: Dict[str, List[str]] = {
258
+ # Include server_dependencies with each cloud.
259
+ **cloud_extras,
260
+ 'all': list(set().union(*[cloud_extras[cloud] for cloud in clouds_for_all])
261
+ ),
262
+ 'remote': remote,
263
+ 'server': server_dependencies,
264
+ }
sky/sky_logging.py CHANGED
@@ -109,7 +109,6 @@ def _setup_logger():
109
109
  global _default_handler
110
110
  if _default_handler is None:
111
111
  _default_handler = EnvAwareHandler(sys.stdout)
112
- _default_handler.flush = sys.stdout.flush # type: ignore
113
112
  if env_options.Options.SHOW_DEBUG_INFO.get():
114
113
  _default_handler.setLevel(logging.DEBUG)
115
114
  else:
@@ -129,7 +128,6 @@ def _setup_logger():
129
128
  for logger_name in _SENSITIVE_LOGGER:
130
129
  logger = logging.getLogger(logger_name)
131
130
  handler_to_logger = EnvAwareHandler(sys.stdout, sensitive=True)
132
- handler_to_logger.flush = sys.stdout.flush # type: ignore
133
131
  logger.addHandler(handler_to_logger)
134
132
  logger.setLevel(logging.INFO)
135
133
  if _show_logging_prefix():