skypilot-nightly 1.0.0.dev20250926__py3-none-any.whl → 1.0.0.dev20251001__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +43 -14
- sky/backends/cloud_vm_ray_backend.py +153 -38
- sky/check.py +0 -29
- sky/client/cli/command.py +48 -26
- sky/client/cli/table_utils.py +91 -0
- sky/client/sdk.py +14 -23
- sky/client/sdk_async.py +5 -5
- sky/core.py +18 -20
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-07349868f7905d37.js → [pool]-509b2977a6373bf6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-8e64d11e58eab5cb.js → webpack-4f0c389a4ce5fd9c.js} +1 -1
- sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → m3YT2i5s6v4SsIdYc8WZa}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -0
- sky/data/storage_utils.py +1 -45
- sky/execution.py +0 -1
- sky/global_user_state.py +3 -3
- sky/jobs/client/sdk.py +3 -2
- sky/jobs/controller.py +15 -0
- sky/jobs/server/core.py +120 -28
- sky/jobs/server/server.py +1 -1
- sky/jobs/server/utils.py +65 -32
- sky/jobs/state.py +145 -3
- sky/jobs/utils.py +87 -8
- sky/provision/kubernetes/instance.py +1 -1
- sky/schemas/api/responses.py +73 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/serve/serve_utils.py +16 -0
- sky/serve/server/core.py +1 -1
- sky/serve/server/impl.py +6 -6
- sky/server/common.py +2 -1
- sky/server/requests/serializers/decoders.py +10 -6
- sky/server/requests/serializers/encoders.py +13 -8
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +2 -32
- sky/skylet/log_lib.py +211 -0
- sky/skylet/log_lib.pyi +30 -1
- sky/skylet/services.py +208 -2
- sky/skylet/skylet.py +3 -0
- sky/task.py +4 -0
- sky/utils/cluster_utils.py +23 -5
- sky/utils/command_runner.py +21 -5
- sky/utils/command_runner.pyi +11 -0
- sky/utils/volume.py +5 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/METADATA +35 -35
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/RECORD +70 -66
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
- /sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → m3YT2i5s6v4SsIdYc8WZa}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/top_level.txt +0 -0
sky/server/requests/serializers/decoders.py
CHANGED

@@ -72,7 +72,7 @@ def decode_status_kubernetes(
                         List[Dict[str, Any]], Optional[str]]
 ) -> Tuple[List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
            List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
-           List[
+           List[responses.ManagedJobRecord], Optional[str]]:
     (encoded_all_clusters, encoded_unmanaged_clusters, all_jobs,
      context) = return_value
     all_clusters = []
@@ -85,6 +85,7 @@ def decode_status_kubernetes(
         cluster['status'] = status_lib.ClusterStatus(cluster['status'])
         unmanaged_clusters.append(
             kubernetes_utils.KubernetesSkyPilotClusterInfoPayload(**cluster))
+    all_jobs = [responses.ManagedJobRecord(**job) for job in all_jobs]
     return all_clusters, unmanaged_clusters, all_jobs, context


@@ -101,11 +102,11 @@ def decode_start(return_value: str) -> 'backends.CloudVmRayResourceHandle':


 @register_decoders('queue')
-def decode_queue(return_value: List[dict],) -> List[
+def decode_queue(return_value: List[dict],) -> List[responses.ClusterJobRecord]:
     jobs = return_value
     for job in jobs:
         job['status'] = job_lib.JobStatus(job['status'])
-    return jobs
+    return [responses.ClusterJobRecord.model_validate(job) for job in jobs]


 @register_decoders('jobs.queue')
@@ -115,7 +116,7 @@ def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:


 @register_decoders('jobs.queue_v2')
-def decode_jobs_queue_v2(return_value) -> List[
+def decode_jobs_queue_v2(return_value) -> List[responses.ManagedJobRecord]:
     """Decode jobs queue response.

     Supports legacy list, or a dict {jobs, total}.
@@ -129,6 +130,7 @@ def decode_jobs_queue_v2(return_value) -> List[Dict[str, Any]]:
         jobs = return_value
     for job in jobs:
         job['status'] = managed_jobs.ManagedJobStatus(job['status'])
+    jobs = [responses.ManagedJobRecord(**job) for job in jobs]
     return jobs


@@ -181,14 +183,16 @@ def decode_list_accelerators(

 @register_decoders('storage_ls')
 def decode_storage_ls(
-        return_value: List[Dict[str, Any]]) -> List[
+        return_value: List[Dict[str, Any]]) -> List[responses.StorageRecord]:
     for storage_info in return_value:
         storage_info['status'] = status_lib.StorageStatus(
             storage_info['status'])
         storage_info['store'] = [
             storage.StoreType(store) for store in storage_info['store']
         ]
-    return
+    return [
+        responses.StorageRecord(**storage_info) for storage_info in return_value
+    ]


 @register_decoders('job_status')
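The decoder changes above all follow one pattern: the raw dicts returned by the API server are still coerced to their enum types, but are then validated into typed pydantic response models instead of being passed through as plain dicts. A minimal sketch of that pattern, using a hypothetical `JobRecord` model rather than the real classes in `sky/schemas/api/responses.py` (whose field sets are not shown in this diff):

```python
import enum
from typing import Any, Dict, List

import pydantic


class JobStatus(enum.Enum):
    PENDING = 'PENDING'
    RUNNING = 'RUNNING'
    SUCCEEDED = 'SUCCEEDED'


class JobRecord(pydantic.BaseModel):
    # Hypothetical stand-in for responses.ClusterJobRecord /
    # responses.ManagedJobRecord; the real fields live in
    # sky/schemas/api/responses.py.
    job_id: int
    status: JobStatus


def decode_queue(return_value: List[Dict[str, Any]]) -> List[JobRecord]:
    # Coerce the wire value into the enum, then validate into the model.
    for job in return_value:
        job['status'] = JobStatus(job['status'])
    return [JobRecord.model_validate(job) for job in return_value]


# Example: decode_queue([{'job_id': 1, 'status': 'RUNNING'}])
# -> [JobRecord(job_id=1, status=JobStatus.RUNNING)]
```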
sky/server/requests/serializers/encoders.py
CHANGED

@@ -92,10 +92,14 @@ def encode_start(resource_handle: 'backends.CloudVmRayResourceHandle') -> str:


 @register_encoder('queue')
-def encode_queue(
+def encode_queue(
+        jobs: List[responses.ClusterJobRecord],) -> List[Dict[str, Any]]:
+    response = []
     for job in jobs:
-
-
+        response_job = job.model_dump()
+        response_job['status'] = job['status'].value
+        response.append(response_job)
+    return response


 @register_encoder('status_kubernetes')
@@ -103,7 +107,7 @@ def encode_status_kubernetes(
     return_value: Tuple[
         List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
         List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
-        List[
+        List[responses.ManagedJobRecord], Optional[str]]
 ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]],
            Optional[str]]:
     all_clusters, unmanaged_clusters, all_jobs, context = return_value
@@ -117,6 +121,7 @@ def encode_status_kubernetes(
         encoded_cluster = dataclasses.asdict(cluster)
         encoded_cluster['status'] = encoded_cluster['status'].value
         encoded_unmanaged_clusters.append(encoded_cluster)
+    all_jobs = [job.model_dump() for job in all_jobs]
     return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context


@@ -146,9 +151,9 @@ def encode_jobs_queue_v2(
     for job in jobs:
         job['status'] = job['status'].value
     if total is None:
-        return jobs
+        return [job.model_dump() for job in jobs]
     return {
-        'jobs': jobs,
+        'jobs': [job.model_dump() for job in jobs],
         'total': total,
         'total_no_filter': total_no_filter,
         'status_counts': status_counts
@@ -199,11 +204,11 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:

 @register_encoder('storage_ls')
 def encode_storage_ls(
-        return_value: List[
+        return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
     for storage_info in return_value:
         storage_info['status'] = storage_info['status'].value
         storage_info['store'] = [store.value for store in storage_info['store']]
-    return return_value
+    return [storage_info.model_dump() for storage_info in return_value]


 @register_encoder('job_status')
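On the encoding side, the same models are turned back into JSON-serializable dicts before crossing the wire: `model_dump()` produces the dict, and enum members are replaced with their `.value`. A small sketch of that inverse direction, continuing the hypothetical `JobRecord` model from the decoder example above:

```python
from typing import Any, Dict, List


def encode_queue(jobs: List['JobRecord']) -> List[Dict[str, Any]]:
    response = []
    for job in jobs:
        # model_dump() keeps the enum member by default; replace it with its
        # string value so the payload is JSON-serializable.
        response_job = job.model_dump()
        response_job['status'] = job.status.value
        response.append(response_job)
    return response
```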
sky/skylet/constants.py
CHANGED
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '
+SKYLET_VERSION = '21'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
sky/skylet/job_lib.py
CHANGED
@@ -24,7 +24,6 @@ from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.skylet import constants
 from sky.utils import common_utils
-from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import subprocess_utils
 from sky.utils.db import db_utils
@@ -612,8 +611,8 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
     PENDING state.

     The normal job duration will use `start_at` instead of `submitted_at` (in
-    `format_job_queue()`), because the job may stay in PENDING if
-    busy.
+    `table_utils.format_job_queue()`), because the job may stay in PENDING if
+    the cluster is busy.
     """
     return message_utils.encode_payload(
         get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
@@ -941,35 +940,6 @@ def is_cluster_idle() -> bool:
     assert False, 'Should not reach here'


-def format_job_queue(jobs: List[Dict[str, Any]]):
-    """Format the job queue for display.
-
-    Usage:
-        jobs = get_job_queue()
-        print(format_job_queue(jobs))
-    """
-    job_table = log_utils.create_table([
-        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
-        'STATUS', 'LOG', 'GIT COMMIT'
-    ])
-    for job in jobs:
-        job_table.add_row([
-            job['job_id'],
-            job['job_name'],
-            job['username'],
-            log_utils.readable_time_duration(job['submitted_at']),
-            log_utils.readable_time_duration(job['start_at']),
-            log_utils.readable_time_duration(job['start_at'],
-                                             job['end_at'],
-                                             absolute=True),
-            job['resources'],
-            job['status'].colored_str(),
-            job['log_path'],
-            job.get('metadata', {}).get('git_commit', '-'),
-        ])
-    return job_table
-
-
 def dump_job_queue(user_hash: Optional[str], all_jobs: bool) -> str:
     """Get the job queue in encoded json format.

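The removed `format_job_queue` is display-only logic; per the file listing above, a new `sky/client/cli/table_utils.py` is added in this release and the docstring now points at `table_utils.format_job_queue()`, so the formatting presumably moved client-side. A rough sketch of the same table-building pattern, using prettytable directly (the column set and exact behavior here are illustrative, not the actual table_utils code):

```python
import prettytable


def format_job_queue(jobs):
    """Render a job queue as a text table (illustrative columns only)."""
    table = prettytable.PrettyTable(['ID', 'NAME', 'STATUS', 'RESOURCES'])
    for job in jobs:
        table.add_row([
            job['job_id'],
            job['job_name'],
            job['status'],
            job['resources'],
        ])
    return table
```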
sky/skylet/log_lib.py
CHANGED
@@ -8,11 +8,13 @@ import functools
 import io
 import multiprocessing.pool
 import os
+import queue as queue_lib
 import shlex
 import subprocess
 import sys
 import tempfile
 import textwrap
+import threading
 import time
 from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
                     Tuple, Union)
@@ -39,6 +41,11 @@ logger = sky_logging.init_logger(__name__)

 LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '

+# 16-64KiB seems to be the sweet spot:
+# https://github.com/grpc/grpc.github.io/issues/371
+# TODO(kevin): Benchmark this ourselves and verify.
+DEFAULT_LOG_CHUNK_SIZE = 16 * 1024  # 16KiB
+

 class _ProcessingArgs:
     """Arguments for processing logs."""
@@ -563,3 +570,207 @@ def tail_logs(job_id: Optional[int],
     except FileNotFoundError:
         print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
               f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+
+
+def tail_logs_iter(job_id: Optional[int],
+                   log_dir: Optional[str],
+                   managed_job_id: Optional[int] = None,
+                   follow: bool = True,
+                   tail: int = 0) -> Iterator[str]:
+    """Tail the logs of a job. This is mostly the same as tail_logs, but
+    returns an iterator instead of printing to stdout/stderr."""
+    if job_id is None:
+        # This only happens when job_lib.get_latest_job_id() returns None,
+        # which means no job has been submitted to this cluster. See
+        # sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
+        logger.info('Skip streaming logs as no job has been submitted.')
+        return
+    job_str = f'job {job_id}'
+    if managed_job_id is not None:
+        job_str = f'managed job {managed_job_id}'
+    if log_dir is None:
+        msg = f'{job_str.capitalize()} not found (see `sky queue`).'
+        yield msg + '\n'
+        return
+    logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
+                 f'{managed_job_id}.')
+    log_path = os.path.join(log_dir, 'run.log')
+    log_path = os.path.expanduser(log_path)
+
+    status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    # Wait for the log to be written. This is needed due to the `ray submit`
+    # will take some time to start the job and write the log.
+    retry_cnt = 0
+    while status is not None and not status.is_terminal():
+        retry_cnt += 1
+        if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
+            break
+        if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
+            err = (f'{colorama.Fore.RED}ERROR: Logs for '
+                   f'{job_str} (status: {status.value}) does not exist '
+                   f'after retrying {retry_cnt} times.'
+                   f'{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+            return
+        waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
+                   'to be written...')
+        yield waiting + '\n'
+        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
+        status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    start_stream_at = LOG_FILE_START_STREAMING_AT
+    # Explicitly declare the type to avoid mypy warning.
+    lines: Iterable[str] = []
+    if follow and status in [
+            job_lib.JobStatus.SETTING_UP,
+            job_lib.JobStatus.PENDING,
+            job_lib.JobStatus.RUNNING,
+    ]:
+        # Not using `ray job logs` because it will put progress bar in
+        # multiple lines.
+        with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
+            # Using `_follow` instead of `tail -f` to streaming the whole
+            # log and creating a new process for tail.
+            start_streaming = False
+            if tail > 0:
+                head_lines_of_log_file = _peek_head_lines(log_file)
+                lines = collections.deque(log_file, maxlen=tail)
+                start_streaming = _should_stream_the_whole_tail_lines(
+                    head_lines_of_log_file, lines, start_stream_at)
+            for line in lines:
+                if start_stream_at in line:
+                    start_streaming = True
+                if start_streaming:
+                    yield line
+            # Now, the cursor is at the end of the last lines
+            # if tail > 0
+            for line in _follow_job_logs(log_file,
+                                         job_id=job_id,
+                                         start_streaming=start_streaming,
+                                         start_streaming_at=start_stream_at):
+                yield line
+    else:
+        try:
+            start_streaming = False
+            with open(log_path, 'r', encoding='utf-8') as log_file:
+                if tail > 0:
+                    # If tail > 0, we need to read the last n lines.
+                    # We use double ended queue to rotate the last n lines.
+                    head_lines_of_log_file = _peek_head_lines(log_file)
+                    lines = collections.deque(log_file, maxlen=tail)
+                    start_streaming = _should_stream_the_whole_tail_lines(
+                        head_lines_of_log_file, lines, start_stream_at)
+                else:
+                    lines = log_file
+                for line in lines:
+                    if start_stream_at in line:
+                        start_streaming = True
+                    if start_streaming:
+                        yield line
+            status_str = status.value if status is not None else 'None'
+            # Only show "Job finished" for actually terminal states
+            if status is not None and status.is_terminal():
+                finish = ux_utils.finishing_message(
+                    f'Job finished (status: {status_str}).')
+                yield finish + '\n'
+            return
+        except FileNotFoundError:
+            err = (
+                f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
+                f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+
+
+class LogBuffer:
+    """In-memory buffer for chunking log lines for streaming."""
+
+    def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
+        """Initialize the log buffer.
+
+        Args:
+            max_chars: Maximum buffer size (in characters, not bytes) before
+                flushing. The actual amount of bytes (UTF-8 encoding)
+                could be more than this, depending on the characters,
+                i.e. ASCII characters take 1 byte, while others
+                may take 2-4 bytes. But this is fine as our default
+                chunk size is well below the default value of
+                grpc.max_receive_message_length which is 4MB.
+        """
+        self.max_chars = max_chars
+        self._buffer = io.StringIO()
+
+    def _should_flush(self) -> bool:
+        return self._buffer.tell() >= self.max_chars
+
+    def flush(self) -> str:
+        """Get the current buffered content and clear the buffer.
+
+        Returns:
+            The buffered log lines as a single string
+        """
+        if not self._buffer.tell():
+            return ''
+        chunk = self._buffer.getvalue()
+        self._buffer.truncate(0)
+        self._buffer.seek(0)
+        return chunk
+
+    def write(self, line: str) -> bool:
+        """Add a line to the buffer.
+
+        Args:
+            line: The log line to add
+
+        Returns:
+            True if buffer should be flushed after adding the line
+        """
+        self._buffer.write(line)
+        return self._should_flush()
+
+    def close(self):
+        self._buffer.close()
+
+
+def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
+                               timeout: float) -> Iterable[str]:
+    """Iterates over an iterable, writing each item to a buffer,
+    and flushing the buffer when it is full or no item is
+    yielded within the timeout duration."""
+    # TODO(kevin): Simplify this using asyncio.timeout, once we move
+    # the skylet event loop and gRPC server to asyncio.
+    # https://docs.python.org/3/library/asyncio-task.html#timeouts
+
+    queue: queue_lib.Queue = queue_lib.Queue()
+    sentinel = object()
+
+    def producer():
+        try:
+            for item in iterable:
+                queue.put(item)
+        finally:
+            queue.put(sentinel)
+
+    thread = threading.Thread(target=producer, daemon=True)
+    thread.start()
+
+    while True:
+        try:
+            item = queue.get(timeout=timeout)
+        except queue_lib.Empty:
+            out = buffer.flush()
+            if out:
+                yield out
+            continue
+
+        if item is sentinel:
+            thread.join()
+            out = buffer.flush()
+            if out:
+                yield out
+            return
+
+        if buffer.write(item):
+            out = buffer.flush()
+            if out:
+                yield out
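Taken together, the new `tail_logs_iter`, `LogBuffer`, and `buffered_iter_with_timeout` let skylet stream job logs as bounded chunks (e.g. into a gRPC streaming response) instead of writing to stdout. A minimal usage sketch, assuming a cluster job with id 1, a hypothetical `log_dir`, and a stand-in `send_chunk` transport callback:

```python
from sky.skylet import log_lib


def send_chunk(chunk: str) -> None:
    # Stand-in for the real transport, e.g. yielding into a gRPC response.
    print(chunk, end='')


buffer = log_lib.LogBuffer()  # flushes at DEFAULT_LOG_CHUNK_SIZE characters
lines = log_lib.tail_logs_iter(job_id=1,                      # assumes job 1 exists
                               log_dir='~/sky_logs/sky-cmd',  # hypothetical dir
                               follow=False)
try:
    # Chunks are emitted when the buffer fills, or when no new line arrives
    # within the timeout, so slow logs still reach the client periodically.
    for chunk in log_lib.buffered_iter_with_timeout(buffer, lines, timeout=1.0):
        send_chunk(chunk)
finally:
    buffer.close()
```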
sky/skylet/log_lib.pyi
CHANGED
@@ -4,7 +4,7 @@ overloaded type hints for run_with_log(), as we need to determine
 the return type based on the value of require_outputs.
 """
 import typing
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union

 from typing_extensions import Literal

@@ -143,3 +143,32 @@ def tail_logs(job_id: int,
               managed_job_id: Optional[int] = ...,
               follow: bool = ...) -> None:
     ...
+
+
+def tail_logs_iter(job_id: Optional[int],
+                   log_dir: Optional[str],
+                   managed_job_id: Optional[int] = ...,
+                   follow: bool = ...,
+                   tail: int = ...) -> Iterator[str]:
+    ...
+
+
+class LogBuffer:
+    max_chars: int
+
+    def __init__(self, max_chars: int = ...):
+        ...
+
+    def flush(self) -> str:
+        ...
+
+    def write(self, line: str) -> bool:
+        ...
+
+    def close(self):
+        ...
+
+
+def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
+                               timeout: float) -> Iterable[str]:
+    ...