skypilot-nightly 1.0.0.dev20251019__py3-none-any.whl → 1.0.0.dev20251021__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of skypilot-nightly might be problematic.

Files changed (49)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +61 -0
  3. sky/backends/backend_utils.py +11 -11
  4. sky/backends/cloud_vm_ray_backend.py +15 -4
  5. sky/client/cli/command.py +39 -10
  6. sky/client/cli/flags.py +4 -2
  7. sky/client/sdk.py +26 -3
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/{webpack-3c431f6c9086e487.js → webpack-66f23594d38c7f16.js} +1 -1
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/config.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra/[context].html +1 -1
  16. sky/dashboard/out/infra.html +1 -1
  17. sky/dashboard/out/jobs/[job].html +1 -1
  18. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/storage.py +2 -2
  26. sky/global_user_state.py +20 -20
  27. sky/jobs/server/server.py +10 -1
  28. sky/provision/kubernetes/network.py +9 -6
  29. sky/provision/provisioner.py +8 -0
  30. sky/serve/server/server.py +1 -0
  31. sky/server/common.py +9 -2
  32. sky/server/constants.py +1 -1
  33. sky/server/daemons.py +4 -2
  34. sky/server/requests/executor.py +10 -8
  35. sky/server/requests/payloads.py +2 -1
  36. sky/server/requests/preconditions.py +9 -4
  37. sky/server/requests/requests.py +118 -34
  38. sky/server/server.py +57 -24
  39. sky/server/stream_utils.py +127 -38
  40. sky/server/uvicorn.py +18 -17
  41. sky/utils/asyncio_utils.py +63 -3
  42. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251021.dist-info}/METADATA +35 -36
  43. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251021.dist-info}/RECORD +49 -49
  44. /sky/dashboard/out/_next/static/{8e35zdobdd0bK_Nkba03m → jDc1PlRsl9Cc5FQUMLBu8}/_buildManifest.js +0 -0
  45. /sky/dashboard/out/_next/static/{8e35zdobdd0bK_Nkba03m → jDc1PlRsl9Cc5FQUMLBu8}/_ssgManifest.js +0 -0
  46. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251021.dist-info}/WHEEL +0 -0
  47. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251021.dist-info}/entry_points.txt +0 -0
  48. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251021.dist-info}/licenses/LICENSE +0 -0
  49. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251021.dist-info}/top_level.txt +0 -0
sky/server/requests/requests.py CHANGED
@@ -400,7 +400,8 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
         for request_task in get_request_tasks(req_filter=RequestTaskFilter(
             status=[RequestStatus.PENDING, RequestStatus.RUNNING],
             exclude_request_names=[exclude_request_name],
-            cluster_names=[cluster_name]))
+            cluster_names=[cluster_name],
+            fields=['request_id']))
     ]
     kill_requests(request_ids)
 
@@ -425,7 +426,8 @@ def kill_requests(request_ids: Optional[List[str]] = None,
             status=[RequestStatus.PENDING, RequestStatus.RUNNING],
             # Avoid cancelling the cancel request itself.
             exclude_request_names=['sky.api_cancel'],
-            user_id=user_id))
+            user_id=user_id,
+            fields=['request_id']))
     ]
     cancelled_request_ids = []
     for request_id in request_ids:
@@ -592,6 +594,18 @@ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
         _add_or_update_request_no_lock(request)
 
 
+@init_db
+@metrics_lib.time_me
+@asyncio_utils.shield
+async def update_status_async(request_id: str, status: RequestStatus) -> None:
+    """Update the status of a request"""
+    async with filelock.AsyncFileLock(request_lock_path(request_id)):
+        request = await _get_request_no_lock_async(request_id)
+        if request is not None:
+            request.status = status
+            await _add_or_update_request_no_lock_async(request)
+
+
 @init_db
 @metrics_lib.time_me
 @asyncio_utils.shield
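The new `update_status_async` applies the same read-modify-write pattern as its synchronous counterpart, but acquires `filelock.AsyncFileLock` so the event loop is not blocked while waiting on the per-request lock. A minimal standalone sketch of that pattern, with an in-memory store and a hypothetical `Record` type standing in for the requests table (the real code goes through `_get_request_no_lock_async`):

```python
import asyncio
import dataclasses

import filelock  # AsyncFileLock is available in recent filelock releases


@dataclasses.dataclass
class Record:
    request_id: str
    status: str


_STORE = {'req-1': Record('req-1', 'PENDING')}  # stand-in for the DB


async def update_status(request_id: str, status: str) -> None:
    # Read-modify-write under an async file lock, mirroring the hunk above;
    # the lock path is illustrative.
    async with filelock.AsyncFileLock(f'/tmp/{request_id}.lock'):
        record = _STORE.get(request_id)
        if record is not None:
            record.status = status  # persisted via an UPDATE in the real code


asyncio.run(update_status('req-1', 'RUNNING'))
```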
@@ -604,30 +618,42 @@ async def update_status_msg_async(request_id: str, status_msg: str) -> None:
         await _add_or_update_request_no_lock_async(request)
 
 
-_get_request_sql = (f'SELECT {", ".join(REQUEST_COLUMNS)} FROM {REQUEST_TABLE} '
-                    'WHERE request_id LIKE ?')
-
-
-def _get_request_no_lock(request_id: str) -> Optional[Request]:
+def _get_request_no_lock(
+        request_id: str,
+        fields: Optional[List[str]] = None) -> Optional[Request]:
     """Get a SkyPilot API request."""
     assert _DB is not None
+    columns_str = ', '.join(REQUEST_COLUMNS)
+    if fields:
+        columns_str = ', '.join(fields)
     with _DB.conn:
         cursor = _DB.conn.cursor()
-        cursor.execute(_get_request_sql, (request_id + '%',))
+        cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
+                        'WHERE request_id LIKE ?'), (request_id + '%',))
        row = cursor.fetchone()
        if row is None:
            return None
+        if fields:
+            row = _update_request_row_fields(row, fields)
        return Request.from_row(row)
 
 
-async def _get_request_no_lock_async(request_id: str) -> Optional[Request]:
+async def _get_request_no_lock_async(
+        request_id: str,
+        fields: Optional[List[str]] = None) -> Optional[Request]:
     """Async version of _get_request_no_lock."""
     assert _DB is not None
-    async with _DB.execute_fetchall_async(_get_request_sql,
-                                          (request_id + '%',)) as rows:
+    columns_str = ', '.join(REQUEST_COLUMNS)
+    if fields:
+        columns_str = ', '.join(fields)
+    async with _DB.execute_fetchall_async(
+            (f'SELECT {columns_str} FROM {REQUEST_TABLE} '
+             'WHERE request_id LIKE ?'), (request_id + '%',)) as rows:
         row = rows[0] if rows else None
         if row is None:
             return None
+        if fields:
+            row = _update_request_row_fields(row, fields)
         return Request.from_row(row)
 
 
@@ -646,20 +672,23 @@ def get_latest_request_id() -> Optional[str]:
 
 @init_db
 @metrics_lib.time_me
-def get_request(request_id: str) -> Optional[Request]:
+def get_request(request_id: str,
+                fields: Optional[List[str]] = None) -> Optional[Request]:
     """Get a SkyPilot API request."""
     with filelock.FileLock(request_lock_path(request_id)):
-        return _get_request_no_lock(request_id)
+        return _get_request_no_lock(request_id, fields)
 
 
 @init_db_async
 @metrics_lib.time_me_async
 @asyncio_utils.shield
-async def get_request_async(request_id: str) -> Optional[Request]:
+async def get_request_async(
+        request_id: str,
+        fields: Optional[List[str]] = None) -> Optional[Request]:
     """Async version of get_request."""
     # TODO(aylei): figure out how to remove FileLock here to avoid the overhead
     async with filelock.AsyncFileLock(request_lock_path(request_id)):
-        return await _get_request_no_lock_async(request_id)
+        return await _get_request_no_lock_async(request_id, fields)
 
 
 class StatusWithMsg(NamedTuple):
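Several hunks in this file thread an optional `fields` list through the getters so callers can SELECT only the columns they need. The narrowed row is then widened back by `_update_request_row_fields`, which this diff references but does not show. A plausible sketch, assuming `REQUEST_COLUMNS` is the ordered column list that `Request.from_row` unpacks (the column names below are illustrative, not the real schema):

```python
from typing import Any, List, Optional, Tuple

# Illustrative only; the real REQUEST_COLUMNS lives in
# sky/server/requests/requests.py and may differ.
REQUEST_COLUMNS = ['request_id', 'name', 'entrypoint', 'request_body',
                   'status', 'created_at', 'user_id', 'status_msg',
                   'should_retry', 'finished_at', 'schedule_type']


def _update_request_row_fields(row: Tuple[Any, ...],
                               fields: List[str]) -> Tuple[Optional[Any], ...]:
    """Widen a partial row (only `fields` selected) to full column order.

    Request.from_row presumably consumes one value per REQUEST_COLUMNS
    entry, so columns that were not selected are padded with None.
    """
    selected = dict(zip(fields, row))
    return tuple(selected.get(column) for column in REQUEST_COLUMNS)


# e.g. a row fetched with fields=['request_id', 'schedule_type']:
print(_update_request_row_fields(('abc123', 'LONG'),
                                 ['request_id', 'schedule_type']))
```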
@@ -896,6 +925,23 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
         request_task.set_error(e)
 
 
+@init_db_async
+@metrics_lib.time_me_async
+@asyncio_utils.shield
+async def set_request_failed_async(request_id: str, e: BaseException) -> None:
+    """Set a request to failed and populate the error message."""
+    with ux_utils.enable_traceback():
+        stacktrace = traceback.format_exc()
+        setattr(e, 'stacktrace', stacktrace)
+    async with filelock.AsyncFileLock(request_lock_path(request_id)):
+        request_task = await _get_request_no_lock_async(request_id)
+        assert request_task is not None, request_id
+        request_task.status = RequestStatus.FAILED
+        request_task.finished_at = time.time()
+        request_task.set_error(e)
+        await _add_or_update_request_no_lock_async(request_task)
+
+
 def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
     """Set a request to succeeded and populate the result."""
     with update_request(request_id) as request_task:
@@ -906,28 +952,50 @@ def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
         request_task.set_return_value(result)
 
 
-def set_request_cancelled(request_id: str) -> None:
+@init_db_async
+@metrics_lib.time_me_async
+@asyncio_utils.shield
+async def set_request_succeeded_async(request_id: str,
+                                      result: Optional[Any]) -> None:
+    """Set a request to succeeded and populate the result."""
+    async with filelock.AsyncFileLock(request_lock_path(request_id)):
+        request_task = await _get_request_no_lock_async(request_id)
+        assert request_task is not None, request_id
+        request_task.status = RequestStatus.SUCCEEDED
+        request_task.finished_at = time.time()
+        if result is not None:
+            request_task.set_return_value(result)
+        await _add_or_update_request_no_lock_async(request_task)
+
+
+@init_db_async
+@metrics_lib.time_me_async
+@asyncio_utils.shield
+async def set_request_cancelled_async(request_id: str) -> None:
     """Set a pending or running request to cancelled."""
-    with update_request(request_id) as request_task:
+    async with filelock.AsyncFileLock(request_lock_path(request_id)):
+        request_task = await _get_request_no_lock_async(request_id)
         assert request_task is not None, request_id
         # Already finished or cancelled.
         if request_task.status > RequestStatus.RUNNING:
             return
         request_task.finished_at = time.time()
         request_task.status = RequestStatus.CANCELLED
+        await _add_or_update_request_no_lock_async(request_task)
 
 
 @init_db
 @metrics_lib.time_me
-async def _delete_requests(requests: List[Request]):
+async def _delete_requests(request_ids: List[str]):
     """Clean up requests by their IDs."""
-    id_list_str = ','.join(repr(req.request_id) for req in requests)
+    id_list_str = ','.join(repr(request_id) for request_id in request_ids)
     assert _DB is not None
     await _DB.execute_and_commit_async(
         f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
 
 
-async def clean_finished_requests_with_retention(retention_seconds: int):
+async def clean_finished_requests_with_retention(retention_seconds: int,
+                                                 batch_size: int = 1000):
     """Clean up finished requests older than the retention period.
 
     This function removes old finished requests (SUCCEEDED, FAILED, CANCELLED)
@@ -936,24 +1004,40 @@ async def clean_finished_requests_with_retention(retention_seconds: int,
     Args:
         retention_seconds: Requests older than this many seconds will be
             deleted.
+        batch_size: batch delete 'batch_size' requests at a time to
+            avoid using too much memory at once and to let each
+            db query complete in a reasonable time. All stale
+            requests older than the retention period will be deleted
+            regardless of the batch size.
     """
-    reqs = await get_request_tasks_async(
-        req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
-                                     finished_before=time.time() -
-                                     retention_seconds))
-
-    futs = []
-    for req in reqs:
-        futs.append(
-            asyncio.create_task(
-                anyio.Path(req.log_path.absolute()).unlink(missing_ok=True)))
-    await asyncio.gather(*futs)
-
-    await _delete_requests(reqs)
+    total_deleted = 0
+    while True:
+        reqs = await get_request_tasks_async(
+            req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
+                                         finished_before=time.time() -
+                                         retention_seconds,
+                                         limit=batch_size,
+                                         fields=['request_id']))
+        if len(reqs) == 0:
+            break
+        futs = []
+        for req in reqs:
+            # req.log_path is derived from request_id,
+            # so it's ok to just grab the request_id in the above query.
+            futs.append(
+                asyncio.create_task(
+                    anyio.Path(
+                        req.log_path.absolute()).unlink(missing_ok=True)))
+        await asyncio.gather(*futs)
+
+        await _delete_requests([req.request_id for req in reqs])
+        total_deleted += len(reqs)
+        if len(reqs) < batch_size:
+            break
 
     # To avoid leakage of the log file, logs must be deleted before the
     # request task in the database.
-    logger.info(f'Cleaned up {len(reqs)} finished requests '
+    logger.info(f'Cleaned up {total_deleted} finished requests '
                 f'older than {retention_seconds} seconds')
 
 
sky/server/server.py CHANGED
@@ -43,6 +43,7 @@ from sky.data import storage_utils
 from sky.jobs import utils as managed_job_utils
 from sky.jobs.server import server as jobs_rest
 from sky.metrics import utils as metrics_utils
+from sky.provision import metadata_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.schemas.api import responses
 from sky.serve.server import server as serve_rest
@@ -1270,6 +1271,7 @@ async def logs(
         request_id=request.state.request_id,
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
+        kill_request_on_disconnect=False,
     )
 
 
@@ -1363,38 +1365,65 @@ async def download(download_body: payloads.DownloadBody,
 
 # TODO(aylei): run it asynchronously after global_user_state support async op
 @app.post('/provision_logs')
-def provision_logs(cluster_body: payloads.ClusterNameBody,
+def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
                    follow: bool = True,
                    tail: int = 0) -> fastapi.responses.StreamingResponse:
     """Streams the provision.log for the latest launch request of a cluster."""
-    # Prefer clusters table first, then cluster_history as fallback.
-    log_path_str = global_user_state.get_cluster_provision_log_path(
-        cluster_body.cluster_name)
-    if not log_path_str:
-        log_path_str = global_user_state.get_cluster_history_provision_log_path(
-            cluster_body.cluster_name)
-    if not log_path_str:
-        raise fastapi.HTTPException(
-            status_code=404,
-            detail=('Provision log path is not recorded for this cluster. '
-                    'Please relaunch to generate provisioning logs.'))
+    log_path = None
+    cluster_name = provision_logs_body.cluster_name
+    worker = provision_logs_body.worker
+    # stream head node logs
+    if worker is None:
+        # Prefer clusters table first, then cluster_history as fallback.
+        log_path_str = global_user_state.get_cluster_provision_log_path(
+            cluster_name)
+        if not log_path_str:
+            log_path_str = (
+                global_user_state.get_cluster_history_provision_log_path(
+                    cluster_name))
+        if not log_path_str:
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=('Provision log path is not recorded for this cluster. '
+                        'Please relaunch to generate provisioning logs.'))
+        log_path = pathlib.Path(log_path_str).expanduser().resolve()
+        if not log_path.exists():
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=f'Provision log path does not exist: {str(log_path)}')
 
-    log_path = pathlib.Path(log_path_str).expanduser().resolve()
-    if not log_path.exists():
-        raise fastapi.HTTPException(
-            status_code=404,
-            detail=f'Provision log path does not exist: {str(log_path)}')
+    # stream worker node logs
+    else:
+        handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+        if handle is None:
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=('Cluster handle is not recorded for this cluster. '
+                        'Please relaunch to generate provisioning logs.'))
+        # instance_ids includes head node
+        instance_ids = handle.instance_ids
+        if instance_ids is None:
+            raise fastapi.HTTPException(
+                status_code=400,
+                detail='Instance IDs are not recorded for this cluster. '
+                'Please relaunch to generate provisioning logs.')
+        if worker > len(instance_ids) - 1:
+            raise fastapi.HTTPException(
+                status_code=400,
+                detail=f'Worker {worker} is out of range. '
+                f'The cluster has {len(instance_ids)} nodes.')
+        log_path = metadata_utils.get_instance_log_dir(
+            handle.get_cluster_name_on_cloud(), instance_ids[worker])
 
     # Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
     effective_tail = None if tail is None or tail <= 0 else tail
 
     return fastapi.responses.StreamingResponse(
-        content=stream_utils.log_streamer(
-            None,
-            log_path,
-            tail=effective_tail,
-            follow=follow,
-            cluster_name=cluster_body.cluster_name),
+        content=stream_utils.log_streamer(None,
+                                          log_path,
+                                          tail=effective_tail,
+                                          follow=follow,
+                                          cluster_name=cluster_name),
         media_type='text/plain',
         headers={
             'Cache-Control': 'no-cache, no-transform',
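On the client side, the reworked endpoint can be exercised with a plain streaming POST. A hedged sketch: the JSON fields mirror how the handler reads `payloads.ProvisionLogsBody` (`worker` omitted or null streams the head node's provision log; otherwise it indexes into the cluster's `instance_ids`, which includes the head node), while the address and port are assumptions for a locally running API server:

```python
import requests  # third-party HTTP client; any streaming client works

# Hypothetical local API server address and example cluster name.
resp = requests.post('http://127.0.0.1:46580/provision_logs',
                     params={'follow': 'false', 'tail': 100},
                     json={'cluster_name': 'my-cluster', 'worker': 1},
                     stream=True)
resp.raise_for_status()
# The server streams text/plain chunks; print them as they arrive.
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end='')
```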
@@ -1567,11 +1596,14 @@ async def stream(
     polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
     # Original plain text streaming logic
     if request_id is not None:
-        request_task = await requests_lib.get_request_async(request_id)
+        request_task = await requests_lib.get_request_async(
+            request_id, fields=['request_id', 'schedule_type'])
         if request_task is None:
             print(f'No task with request ID {request_id}')
             raise fastapi.HTTPException(
                 status_code=404, detail=f'Request {request_id!r} not found')
+        # req.log_path is derived from request_id,
+        # so it's ok to just grab the request_id in the above query.
         log_path_to_stream = request_task.log_path
         if not log_path_to_stream.exists():
             # The log file might be deleted by the request GC daemon but the
@@ -1581,6 +1613,7 @@
                 detail=f'Log of request {request_id!r} has been deleted')
         if request_task.schedule_type == requests_lib.ScheduleType.LONG:
             polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
+        del request_task
     else:
         assert log_path is not None, (request_id, log_path)
         if log_path == constants.API_SERVER_LOGS:
sky/server/stream_utils.py CHANGED
@@ -25,6 +25,17 @@ logger = sky_logging.init_logger(__name__)
 _BUFFER_SIZE = 8 * 1024  # 8KB
 _BUFFER_TIMEOUT = 0.02  # 20ms
 _HEARTBEAT_INTERVAL = 30
+# If a SHORT request has been stuck in pending for
+# _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner.
+_SHORT_REQUEST_SPINNER_TIMEOUT = 2
+# If there is an issue during provisioning that causes the cluster to be stuck
+# in INIT state, we use this timeout to break the loop and stop streaming
+# provision logs.
+_PROVISION_LOG_TIMEOUT = 3
+# Maximum time to wait for new log files to appear when streaming worker node
+# provision logs. Worker logs are created sequentially during the provisioning
+# process, so we need to wait for new files to appear.
+_MAX_WAIT_FOR_NEW_LOG_FILES = 3  # seconds
 
 LONG_REQUEST_POLL_INTERVAL = 1
 DEFAULT_POLL_INTERVAL = 0.1
@@ -45,7 +56,7 @@ async def _yield_log_file_with_payloads_skipped(
 
 async def log_streamer(
     request_id: Optional[str],
-    log_path: pathlib.Path,
+    log_path: Optional[pathlib.Path] = None,
     plain_logs: bool = False,
     tail: Optional[int] = None,
     follow: bool = True,
@@ -57,7 +68,9 @@
     Args:
         request_id: The request ID to check whether the log tailing process
             should be stopped.
-        log_path: The path to the log file.
+        log_path: The path to the log file or directory containing the log
+            files. If it is a directory, all *.log files in the directory
+            will be streamed.
         plain_logs: Whether to show plain logs.
         tail: The number of lines to tail. If None, tail the whole file.
         follow: Whether to follow the log file.
@@ -66,17 +79,26 @@
     """
 
     if request_id is not None:
+        start_time = asyncio.get_event_loop().time()
         status_msg = rich_utils.EncodedStatusMessage(
             f'[dim]Checking request: {request_id}[/dim]')
-        request_task = await requests_lib.get_request_async(request_id)
+        request_task = await requests_lib.get_request_async(request_id,
+                                                            fields=[
+                                                                'request_id',
+                                                                'name',
+                                                                'schedule_type',
+                                                                'status',
+                                                                'status_msg'
+                                                            ])
 
         if request_task is None:
             raise fastapi.HTTPException(
                 status_code=404, detail=f'Request {request_id} not found')
         request_id = request_task.request_id
 
-        # Do not show the waiting spinner if the request is a fast, non-blocking
-        # request.
+        # By default, do not show the waiting spinner for SHORT requests.
+        # If the request has been stuck in pending for
+        # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner.
         show_request_waiting_spinner = (not plain_logs and
                                         request_task.schedule_type
                                         == requests_lib.ScheduleType.LONG)
@@ -89,14 +111,23 @@
                          f'scheduled: {request_id}')
         req_status = request_task.status
         req_msg = request_task.status_msg
+        del request_task
         # Slowly back off the database polling up to every 1 second, to avoid
         # overloading the CPU and DB.
         backoff = common_utils.Backoff(initial_backoff=polling_interval,
                                        max_backoff_factor=10,
                                        multiplier=1.2)
         while req_status < requests_lib.RequestStatus.RUNNING:
+            current_time = asyncio.get_event_loop().time()
+            # Show the waiting spinner for a SHORT request if it has been stuck
+            # in pending for _SHORT_REQUEST_SPINNER_TIMEOUT seconds.
+            if not show_request_waiting_spinner and (
+                    current_time - start_time > _SHORT_REQUEST_SPINNER_TIMEOUT):
+                show_request_waiting_spinner = True
+                yield status_msg.init()
+                yield status_msg.start()
             if req_msg is not None:
-                waiting_msg = request_task.status_msg
+                waiting_msg = req_msg
             if show_request_waiting_spinner:
                 yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
             elif plain_logs and waiting_msg != last_waiting_msg:
@@ -119,11 +150,57 @@
         if show_request_waiting_spinner:
             yield status_msg.stop()
 
-    async with aiofiles.open(log_path, 'rb') as f:
-        async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
-                                          follow, cluster_name,
-                                          polling_interval):
-            yield chunk
+    if log_path is not None and log_path.is_dir():
+        # Track which log files we've already streamed
+        streamed_files = set()
+        no_new_files_count = 0
+
+        while True:
+            # Get all *.log files in the log_path
+            log_files = sorted(log_path.glob('*.log'))
+
+            # Filter out already streamed files
+            new_files = [f for f in log_files if f not in streamed_files]
+
+            if len(new_files) == 0:
+                if not follow:
+                    break
+                # Wait a bit to see if new files appear
+                await asyncio.sleep(0.5)
+                no_new_files_count += 1
+                # Check if we've waited too long for new files
+                if no_new_files_count > _MAX_WAIT_FOR_NEW_LOG_FILES * 2:
+                    break
+                continue
+
+            # Reset the no-new-files counter when we find new files
+            no_new_files_count = 0
+
+            for log_file_path in new_files:
+                # Add header before each file (similar to tail -f behavior)
+                header = f'\n==> {log_file_path} <==\n\n'
+                yield header
+
+                async with aiofiles.open(log_file_path, 'rb') as f:
+                    async for chunk in _tail_log_file(f, request_id, plain_logs,
+                                                      tail, follow,
+                                                      cluster_name,
+                                                      polling_interval):
+                        yield chunk
+
+                # Mark this file as streamed
+                streamed_files.add(log_file_path)
+
+            # If not following, break after streaming all current files
+            if not follow:
+                break
+    else:
+        assert log_path is not None, (request_id, log_path)
+        async with aiofiles.open(log_path, 'rb') as f:
+            async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
+                                              follow, cluster_name,
+                                              polling_interval):
+                yield chunk
 
 
 async def _tail_log_file(
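Since `log_path` may now be a directory, the streamer emits each `*.log` file in sorted order with a `tail(1)`-style `==> file <==` header, briefly polling for late-arriving worker logs before giving up. A usage sketch, assuming chunks are yielded as text and using an illustrative directory path:

```python
import asyncio
import pathlib

from sky.server import stream_utils


async def dump_dir_logs(log_dir: pathlib.Path) -> None:
    # request_id=None skips the request-status checks; follow=False streams
    # every *.log currently in the directory, then stops.
    async for chunk in stream_utils.log_streamer(request_id=None,
                                                 log_path=log_dir,
                                                 follow=False):
        print(chunk, end='')


asyncio.run(dump_dir_logs(pathlib.Path('/var/tmp/provision-logs')))
```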
@@ -197,7 +274,7 @@
                     if (req_status.status ==
                             requests_lib.RequestStatus.CANCELLED):
                         request_task = await requests_lib.get_request_async(
-                            request_id)
+                            request_id, fields=['name', 'should_retry'])
                         if request_task.should_retry:
                             buffer.append(
                                 message_utils.encode_payload(
@@ -206,6 +283,7 @@
                         buffer.append(
                             f'{request_task.name!r} request {request_id}'
                             ' cancelled\n')
+                        del request_task
                         break
                 if not follow:
                     # The below checks (cluster status, heartbeat) are not needed
@@ -213,21 +291,24 @@
                     break
             # Provision logs pass in cluster_name, check cluster status
             # periodically to see if provisioning is done.
-            if cluster_name is not None and should_check_status:
-                last_status_check_time = current_time
-                cluster_status = await (
-                    global_user_state.get_status_from_cluster_name_async(
-                        cluster_name))
-                if cluster_status is None:
-                    logger.debug(
-                        'Stop tailing provision logs for cluster'
-                        f' status for cluster {cluster_name} not found')
-                    break
-                if cluster_status != status_lib.ClusterStatus.INIT:
-                    logger.debug(f'Stop tailing provision logs for cluster'
-                                 f' {cluster_name} has status {cluster_status} '
-                                 '(not in INIT state)')
+            if cluster_name is not None:
+                if current_time - last_flush_time > _PROVISION_LOG_TIMEOUT:
                     break
+                if should_check_status:
+                    last_status_check_time = current_time
+                    cluster_status = await (
+                        global_user_state.get_status_from_cluster_name_async(
+                            cluster_name))
+                    if cluster_status is None:
+                        logger.debug(
+                            'Stop tailing provision logs for cluster'
+                            f' status for cluster {cluster_name} not found')
+                        break
+                    if cluster_status != status_lib.ClusterStatus.INIT:
+                        logger.debug(
+                            f'Stop tailing provision logs for cluster'
+                            f' {cluster_name} has status {cluster_status} '
+                            '(not in INIT state)')
             if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
                 # Currently just used to keep the connection busy, refer to
                 # https://github.com/skypilot-org/skypilot/issues/5750 for
@@ -267,28 +348,36 @@ def stream_response_for_long_request(
     request_id: str,
     logs_path: pathlib.Path,
     background_tasks: fastapi.BackgroundTasks,
+    kill_request_on_disconnect: bool = True,
 ) -> fastapi.responses.StreamingResponse:
-    return stream_response(request_id,
-                           logs_path,
-                           background_tasks,
-                           polling_interval=LONG_REQUEST_POLL_INTERVAL)
+    """Stream the logs of a long request."""
+    return stream_response(
+        request_id,
+        logs_path,
+        background_tasks,
+        polling_interval=LONG_REQUEST_POLL_INTERVAL,
+        kill_request_on_disconnect=kill_request_on_disconnect,
+    )
 
 
 def stream_response(
     request_id: str,
     logs_path: pathlib.Path,
     background_tasks: fastapi.BackgroundTasks,
-    polling_interval: float = DEFAULT_POLL_INTERVAL
+    polling_interval: float = DEFAULT_POLL_INTERVAL,
+    kill_request_on_disconnect: bool = True,
 ) -> fastapi.responses.StreamingResponse:
 
-    async def on_disconnect():
-        logger.info(f'User terminated the connection for request '
-                    f'{request_id}')
-        requests_lib.kill_requests([request_id])
+    if kill_request_on_disconnect:
+
+        async def on_disconnect():
+            logger.info(f'User terminated the connection for request '
+                        f'{request_id}')
+            requests_lib.kill_requests([request_id])
 
-    # The background task will be run after returning a response.
-    # https://fastapi.tiangolo.com/tutorial/background-tasks/
-    background_tasks.add_task(on_disconnect)
+        # The background task will be run after returning a response.
+        # https://fastapi.tiangolo.com/tutorial/background-tasks/
+        background_tasks.add_task(on_disconnect)
 
     return fastapi.responses.StreamingResponse(
         log_streamer(request_id, logs_path, polling_interval=polling_interval),
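`kill_request_on_disconnect` makes the kill-on-disconnect behavior opt-out; the `logs` handler in sky/server/server.py above now passes `False`, so closing a log viewer no longer cancels the underlying request. The mechanism relies on FastAPI background tasks, which run once a streaming response finishes, including when the client disconnects early. A minimal standalone sketch with hypothetical names:

```python
import asyncio

import fastapi

app = fastapi.FastAPI()


def cancel_request(request_id: str) -> None:
    # Stand-in for requests_lib.kill_requests([request_id]).
    print(f'Connection closed; cancelling request {request_id}')


@app.get('/stream/{request_id}')
async def stream(request_id: str,
                 background_tasks: fastapi.BackgroundTasks,
                 kill_request_on_disconnect: bool = True):

    async def body():
        for i in range(5):
            yield f'line {i}\n'
            await asyncio.sleep(0.1)

    if kill_request_on_disconnect:
        # Runs after the response completes or the client disconnects;
        # FastAPI attaches the injected BackgroundTasks to the returned
        # response.
        background_tasks.add_task(cancel_request, request_id)
    return fastapi.responses.StreamingResponse(body(), media_type='text/plain')
```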