skypilot-nightly 1.0.0.dev20250925__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +38 -14
- sky/backends/cloud_vm_ray_backend.py +151 -36
- sky/client/cli/command.py +18 -9
- sky/client/cli/table_utils.py +34 -0
- sky/client/common.py +4 -2
- sky/client/sdk.py +11 -7
- sky/client/sdk_async.py +5 -5
- sky/core.py +6 -6
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
- sky/dashboard/out/_next/static/chunks/{webpack-16ba1d7187d2e3b1.js → webpack-7340bc0f0dd8ae74.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +0 -1
- sky/global_user_state.py +57 -34
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +4 -0
- sky/jobs/server/core.py +98 -26
- sky/jobs/server/utils.py +65 -32
- sky/jobs/state.py +145 -3
- sky/jobs/utils.py +85 -7
- sky/provision/runpod/__init__.py +2 -0
- sky/schemas/api/responses.py +18 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/serve/serve_utils.py +16 -0
- sky/serve/server/core.py +1 -1
- sky/serve/server/impl.py +6 -6
- sky/server/requests/payloads.py +2 -1
- sky/server/requests/serializers/decoders.py +2 -2
- sky/server/requests/serializers/encoders.py +7 -3
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/constants.py +4 -1
- sky/skylet/events.py +42 -0
- sky/skylet/job_lib.py +2 -32
- sky/skylet/log_lib.py +211 -0
- sky/skylet/log_lib.pyi +30 -1
- sky/skylet/services.py +208 -2
- sky/skylet/skylet.py +3 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +8 -3
- sky/utils/db/db_utils.py +5 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/kubernetes_deploy_utils.py +35 -12
- sky/volumes/server/core.py +1 -0
- sky/volumes/volume.py +16 -17
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +36 -36
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +74 -69
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +0 -16
- /sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/skylet/log_lib.py
CHANGED
@@ -8,11 +8,13 @@ import functools
 import io
 import multiprocessing.pool
 import os
+import queue as queue_lib
 import shlex
 import subprocess
 import sys
 import tempfile
 import textwrap
+import threading
 import time
 from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
                     Tuple, Union)
@@ -39,6 +41,11 @@ logger = sky_logging.init_logger(__name__)
 
 LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '
 
+# 16-64KiB seems to be the sweet spot:
+# https://github.com/grpc/grpc.github.io/issues/371
+# TODO(kevin): Benchmark this ourselves and verify.
+DEFAULT_LOG_CHUNK_SIZE = 16 * 1024  # 16KiB
+
 
 class _ProcessingArgs:
     """Arguments for processing logs."""
@@ -563,3 +570,207 @@ def tail_logs(job_id: Optional[int],
     except FileNotFoundError:
         print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
               f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+
+
+def tail_logs_iter(job_id: Optional[int],
+                   log_dir: Optional[str],
+                   managed_job_id: Optional[int] = None,
+                   follow: bool = True,
+                   tail: int = 0) -> Iterator[str]:
+    """Tail the logs of a job. This is mostly the same as tail_logs, but
+    returns an iterator instead of printing to stdout/stderr."""
+    if job_id is None:
+        # This only happens when job_lib.get_latest_job_id() returns None,
+        # which means no job has been submitted to this cluster. See
+        # sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
+        logger.info('Skip streaming logs as no job has been submitted.')
+        return
+    job_str = f'job {job_id}'
+    if managed_job_id is not None:
+        job_str = f'managed job {managed_job_id}'
+    if log_dir is None:
+        msg = f'{job_str.capitalize()} not found (see `sky queue`).'
+        yield msg + '\n'
+        return
+    logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
+                 f'{managed_job_id}.')
+    log_path = os.path.join(log_dir, 'run.log')
+    log_path = os.path.expanduser(log_path)
+
+    status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    # Wait for the log to be written. This is needed due to the `ray submit`
+    # will take some time to start the job and write the log.
+    retry_cnt = 0
+    while status is not None and not status.is_terminal():
+        retry_cnt += 1
+        if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
+            break
+        if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
+            err = (f'{colorama.Fore.RED}ERROR: Logs for '
+                   f'{job_str} (status: {status.value}) does not exist '
+                   f'after retrying {retry_cnt} times.'
+                   f'{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+            return
+        waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
+                   'to be written...')
+        yield waiting + '\n'
+        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
+        status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    start_stream_at = LOG_FILE_START_STREAMING_AT
+    # Explicitly declare the type to avoid mypy warning.
+    lines: Iterable[str] = []
+    if follow and status in [
+            job_lib.JobStatus.SETTING_UP,
+            job_lib.JobStatus.PENDING,
+            job_lib.JobStatus.RUNNING,
+    ]:
+        # Not using `ray job logs` because it will put progress bar in
+        # multiple lines.
+        with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
+            # Using `_follow` instead of `tail -f` to streaming the whole
+            # log and creating a new process for tail.
+            start_streaming = False
+            if tail > 0:
+                head_lines_of_log_file = _peek_head_lines(log_file)
+                lines = collections.deque(log_file, maxlen=tail)
+                start_streaming = _should_stream_the_whole_tail_lines(
+                    head_lines_of_log_file, lines, start_stream_at)
+            for line in lines:
+                if start_stream_at in line:
+                    start_streaming = True
+                if start_streaming:
+                    yield line
+            # Now, the cursor is at the end of the last lines
+            # if tail > 0
+            for line in _follow_job_logs(log_file,
+                                         job_id=job_id,
+                                         start_streaming=start_streaming,
+                                         start_streaming_at=start_stream_at):
+                yield line
+    else:
+        try:
+            start_streaming = False
+            with open(log_path, 'r', encoding='utf-8') as log_file:
+                if tail > 0:
+                    # If tail > 0, we need to read the last n lines.
+                    # We use double ended queue to rotate the last n lines.
+                    head_lines_of_log_file = _peek_head_lines(log_file)
+                    lines = collections.deque(log_file, maxlen=tail)
+                    start_streaming = _should_stream_the_whole_tail_lines(
+                        head_lines_of_log_file, lines, start_stream_at)
+                else:
+                    lines = log_file
+                for line in lines:
+                    if start_stream_at in line:
+                        start_streaming = True
+                    if start_streaming:
+                        yield line
+            status_str = status.value if status is not None else 'None'
+            # Only show "Job finished" for actually terminal states
+            if status is not None and status.is_terminal():
+                finish = ux_utils.finishing_message(
+                    f'Job finished (status: {status_str}).')
+                yield finish + '\n'
+            return
+        except FileNotFoundError:
+            err = (
+                f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
+                f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+
+
+class LogBuffer:
+    """In-memory buffer for chunking log lines for streaming."""
+
+    def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
+        """Initialize the log buffer.
+
+        Args:
+            max_chars: Maximum buffer size (in characters, not bytes) before
+                flushing. The actual amount of bytes (UTF-8 encoding)
+                could be more than this, depending on the characters,
+                i.e. ASCII characters take 1 byte, while others
+                may take 2-4 bytes. But this is fine as our default
+                chunk size is well below the default value of
+                grpc.max_receive_message_length which is 4MB.
+        """
+        self.max_chars = max_chars
+        self._buffer = io.StringIO()
+
+    def _should_flush(self) -> bool:
+        return self._buffer.tell() >= self.max_chars
+
+    def flush(self) -> str:
+        """Get the current buffered content and clear the buffer.
+
+        Returns:
+            The buffered log lines as a single string
+        """
+        if not self._buffer.tell():
+            return ''
+        chunk = self._buffer.getvalue()
+        self._buffer.truncate(0)
+        self._buffer.seek(0)
+        return chunk
+
+    def write(self, line: str) -> bool:
+        """Add a line to the buffer.
+
+        Args:
+            line: The log line to add
+
+        Returns:
+            True if buffer should be flushed after adding the line
+        """
+        self._buffer.write(line)
+        return self._should_flush()
+
+    def close(self):
+        self._buffer.close()
+
+
+def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
+                               timeout: float) -> Iterable[str]:
+    """Iterates over an iterable, writing each item to a buffer,
+    and flushing the buffer when it is full or no item is
+    yielded within the timeout duration."""
+    # TODO(kevin): Simplify this using asyncio.timeout, once we move
+    # the skylet event loop and gRPC server to asyncio.
+    # https://docs.python.org/3/library/asyncio-task.html#timeouts
+
+    queue: queue_lib.Queue = queue_lib.Queue()
+    sentinel = object()
+
+    def producer():
+        try:
+            for item in iterable:
+                queue.put(item)
+        finally:
+            queue.put(sentinel)
+
+    thread = threading.Thread(target=producer, daemon=True)
+    thread.start()
+
+    while True:
+        try:
+            item = queue.get(timeout=timeout)
+        except queue_lib.Empty:
+            out = buffer.flush()
+            if out:
+                yield out
+            continue
+
+        if item is sentinel:
+            thread.join()
+            out = buffer.flush()
+            if out:
+                yield out
+            return
+
+        if buffer.write(item):
+            out = buffer.flush()
+            if out:
+                yield out
sky/skylet/log_lib.pyi
CHANGED
@@ -4,7 +4,7 @@ overloaded type hints for run_with_log(), as we need to determine
 the return type based on the value of require_outputs.
 """
 import typing
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union
 
 from typing_extensions import Literal
 
@@ -143,3 +143,32 @@ def tail_logs(job_id: int,
               managed_job_id: Optional[int] = ...,
               follow: bool = ...) -> None:
     ...
+
+
+def tail_logs_iter(job_id: Optional[int],
+                   log_dir: Optional[str],
+                   managed_job_id: Optional[int] = ...,
+                   follow: bool = ...,
+                   tail: int = ...) -> Iterator[str]:
+    ...
+
+
+class LogBuffer:
+    max_chars: int
+
+    def __init__(self, max_chars: int = ...):
+        ...
+
+    def flush(self) -> str:
+        ...
+
+    def write(self, line: str) -> bool:
+        ...
+
+    def close(self):
+        ...
+
+
+def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
+                               timeout: float) -> Iterable[str]:
+    ...
sky/skylet/services.py
CHANGED
@@ -1,15 +1,20 @@
 """gRPC service implementations for skylet."""
 
 import os
+from typing import List, Optional
 
 import grpc
 
+from sky import exceptions
 from sky import sky_logging
 from sky.jobs import state as managed_job_state
+from sky.jobs import utils as managed_job_utils
 from sky.schemas.generated import autostopv1_pb2
 from sky.schemas.generated import autostopv1_pb2_grpc
 from sky.schemas.generated import jobsv1_pb2
 from sky.schemas.generated import jobsv1_pb2_grpc
+from sky.schemas.generated import managed_jobsv1_pb2
+from sky.schemas.generated import managed_jobsv1_pb2_grpc
 from sky.schemas.generated import servev1_pb2
 from sky.schemas.generated import servev1_pb2_grpc
 from sky.serve import serve_rpc_utils
@@ -18,9 +23,14 @@ from sky.serve import serve_utils
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
+from sky.skylet import log_lib
 
 logger = sky_logging.init_logger(__name__)
 
+# In the worst case, flush the log buffer every 50ms,
+# to ensure responsiveness.
+DEFAULT_LOG_CHUNK_FLUSH_INTERVAL = 0.05
+
 
 class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
     """Implementation of the AutostopService gRPC service."""
@@ -275,8 +285,39 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
             self,
             request: jobsv1_pb2.TailLogsRequest,  # type: ignore[return]
             context: grpc.ServicerContext):
-
-
+        buffer = log_lib.LogBuffer()
+        try:
+            job_id = request.job_id if request.HasField(
+                'job_id') else job_lib.get_latest_job_id()
+            managed_job_id = request.managed_job_id if request.HasField(
+                'managed_job_id') else None
+            log_dir = job_lib.get_log_dir_for_job(job_id)
+            if log_dir is None:
+                run_timestamp = job_lib.get_run_timestamp(job_id)
+                log_dir = None if run_timestamp is None else os.path.join(
+                    constants.SKY_LOGS_DIRECTORY, run_timestamp)
+
+            for line in log_lib.buffered_iter_with_timeout(
+                    buffer,
+                    log_lib.tail_logs_iter(job_id, log_dir, managed_job_id,
+                                           request.follow, request.tail),
+                    DEFAULT_LOG_CHUNK_FLUSH_INTERVAL):
+                yield jobsv1_pb2.TailLogsResponse(log_line=line)
+
+            job_status = job_lib.get_status(job_id)
+            exit_code = exceptions.JobExitCode.from_job_status(job_status)
+            # Fix for dashboard: When follow=False and job is still running
+            # (NOT_FINISHED=101), exit with success (0) since fetching current
+            # logs is a successful operation.
+            # This prevents shell wrappers from printing "command terminated
+            # with exit code 101".
+            exit_code_int = 0 if not request.follow and int(
+                exit_code) == 101 else int(exit_code)
+            yield jobsv1_pb2.TailLogsResponse(exit_code=exit_code_int)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+        finally:
+            buffer.close()
 
     def GetJobStatus(  # type: ignore[return]
             self, request: jobsv1_pb2.GetJobStatusRequest,
@@ -343,3 +384,168 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
                 job_log_dirs=job_log_dirs)
         except Exception as e:  # pylint: disable=broad-except
             context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+
+class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
+                            ):
+    """Implementation of the ManagedJobsService gRPC service."""
+
+    def GetVersion(  # type: ignore[return]
+            self, request: managed_jobsv1_pb2.GetVersionRequest,
+            context: grpc.ServicerContext
+    ) -> managed_jobsv1_pb2.GetVersionResponse:
+        try:
+            return managed_jobsv1_pb2.GetVersionResponse(
+                controller_version=constants.SKYLET_VERSION)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def GetJobTable(  # type: ignore[return]
+            self, request: managed_jobsv1_pb2.GetJobTableRequest,
+            context: grpc.ServicerContext
+    ) -> managed_jobsv1_pb2.GetJobTableResponse:
+        try:
+            accessible_workspaces = list(request.accessible_workspaces)
+            job_ids = list(request.job_ids.ids) if request.job_ids else None
+            user_hashes: Optional[List[Optional[str]]] = None
+            if request.user_hashes:
+                user_hashes = list(request.user_hashes.hashes)
+                # For backwards compatibility, we show jobs that do not have a
+                # user_hash. TODO: Remove before 0.12.0.
+                if request.show_jobs_without_user_hash:
+                    user_hashes.append(None)
+            statuses = list(
+                request.statuses.statuses) if request.statuses else None
+
+            job_queue = managed_job_utils.get_managed_job_queue(
+                skip_finished=request.skip_finished,
+                accessible_workspaces=accessible_workspaces,
+                job_ids=job_ids,
+                workspace_match=request.workspace_match
+                if request.HasField('workspace_match') else None,
+                name_match=request.name_match
+                if request.HasField('name_match') else None,
+                pool_match=request.pool_match
+                if request.HasField('pool_match') else None,
+                page=request.page if request.HasField('page') else None,
+                limit=request.limit if request.HasField('limit') else None,
+                user_hashes=user_hashes,
+                statuses=statuses)
+            jobs = job_queue['jobs']
+            total = job_queue['total']
+            total_no_filter = job_queue['total_no_filter']
+            status_counts = job_queue['status_counts']
+
+            jobs_info = []
+            for job in jobs:
+                job_info = managed_jobsv1_pb2.ManagedJobInfo(
+                    job_id=job.get('job_id'),
+                    task_id=job.get('task_id'),
+                    job_name=job.get('job_name'),
+                    task_name=job.get('task_name'),
+                    job_duration=job.get('job_duration'),
+                    workspace=job.get('workspace'),
+                    status=managed_job_state.ManagedJobStatus(
+                        job.get('status')).to_protobuf(),
+                    schedule_state=managed_job_state.ManagedJobScheduleState(
+                        job.get('schedule_state')).to_protobuf(),
+                    resources=job.get('resources'),
+                    cluster_resources=job.get('cluster_resources'),
+                    cluster_resources_full=job.get('cluster_resources_full'),
+                    cloud=job.get('cloud'),
+                    region=job.get('region'),
+                    infra=job.get('infra'),
+                    accelerators=job.get('accelerators'),
+                    recovery_count=job.get('recovery_count'),
+                    details=job.get('details'),
+                    failure_reason=job.get('failure_reason'),
+                    user_name=job.get('user_name'),
+                    user_hash=job.get('user_hash'),
+                    submitted_at=job.get('submitted_at'),
+                    start_at=job.get('start_at'),
+                    end_at=job.get('end_at'),
+                    user_yaml=job.get('user_yaml'),
+                    entrypoint=job.get('entrypoint'),
+                    metadata={
+                        k: v
+                        for k, v in job.get('metadata', {}).items()
+                        if v is not None
+                    },
+                    pool=job.get('pool'),
+                    pool_hash=job.get('pool_hash'))
+                jobs_info.append(job_info)
+
+            return managed_jobsv1_pb2.GetJobTableResponse(
+                jobs=jobs_info,
+                total=total,
+                total_no_filter=total_no_filter,
+                status_counts=status_counts)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def GetAllJobIdsByName(  # type: ignore[return]
+            self, request: managed_jobsv1_pb2.GetAllJobIdsByNameRequest,
+            context: grpc.ServicerContext
+    ) -> managed_jobsv1_pb2.GetAllJobIdsByNameResponse:
+        try:
+            job_name = request.job_name if request.HasField(
+                'job_name') else None
+            job_ids = managed_job_state.get_all_job_ids_by_name(job_name)
+            return managed_jobsv1_pb2.GetAllJobIdsByNameResponse(
+                job_ids=job_ids)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def CancelJobs(  # type: ignore[return]
+            self, request: managed_jobsv1_pb2.CancelJobsRequest,
+            context: grpc.ServicerContext
+    ) -> managed_jobsv1_pb2.CancelJobsResponse:
+        try:
+            cancellation_criteria = request.WhichOneof('cancellation_criteria')
+            if cancellation_criteria is None:
+                context.abort(
+                    grpc.StatusCode.INVALID_ARGUMENT,
+                    'exactly one cancellation criteria must be specified.')
+
+            if cancellation_criteria == 'all_users':
+                user_hash = request.user_hash if request.HasField(
+                    'user_hash') else None
+                all_users = request.all_users
+                if not all_users and user_hash is None:
+                    context.abort(
+                        grpc.StatusCode.INVALID_ARGUMENT,
+                        'user_hash is required when all_users is False')
+                message = managed_job_utils.cancel_jobs_by_id(
+                    job_ids=None,
+                    all_users=all_users,
+                    current_workspace=request.current_workspace,
+                    user_hash=user_hash)
+            elif cancellation_criteria == 'job_ids':
+                job_ids = list(request.job_ids.ids)
+                message = managed_job_utils.cancel_jobs_by_id(
+                    job_ids=job_ids,
+                    current_workspace=request.current_workspace)
+            elif cancellation_criteria == 'job_name':
+                message = managed_job_utils.cancel_job_by_name(
+                    job_name=request.job_name,
+                    current_workspace=request.current_workspace)
+            elif cancellation_criteria == 'pool_name':
+                message = managed_job_utils.cancel_jobs_by_pool(
+                    pool_name=request.pool_name,
+                    current_workspace=request.current_workspace)
+            else:
+                context.abort(
+                    grpc.StatusCode.INVALID_ARGUMENT,
+                    f'invalid cancellation criteria: {cancellation_criteria}')
+            return managed_jobsv1_pb2.CancelJobsResponse(message=message)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    def StreamLogs(
+            self,
+            request: managed_jobsv1_pb2.
+            StreamLogsRequest,  # type: ignore[return]
+            context: grpc.ServicerContext):
+        # TODO(kevin): implement this
+        context.abort(grpc.StatusCode.UNIMPLEMENTED,
+                      'StreamLogs is not implemented')
sky/skylet/skylet.py
CHANGED
@@ -10,6 +10,7 @@ import sky
 from sky import sky_logging
 from sky.schemas.generated import autostopv1_pb2_grpc
 from sky.schemas.generated import jobsv1_pb2_grpc
+from sky.schemas.generated import managed_jobsv1_pb2_grpc
 from sky.schemas.generated import servev1_pb2_grpc
 from sky.skylet import constants
 from sky.skylet import events
@@ -55,6 +56,8 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
         services.JobsServiceImpl(), server)
     servev1_pb2_grpc.add_ServeServiceServicer_to_server(
         services.ServeServiceImpl(), server)
+    managed_jobsv1_pb2_grpc.add_ManagedJobsServiceServicer_to_server(
+        services.ManagedJobsServiceImpl(), server)
 
     listen_addr = f'127.0.0.1:{port}'
     server.add_insecure_port(listen_addr)
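
With the servicer registered above, the reworked JobsService.TailLogs can be consumed as a server-streaming RPC. A hypothetical consumer sketch (not part of the diff), assuming the conventionally generated JobsServiceStub; per the servicer code earlier, each streamed response carries either a chunk of log text or, as the final message, an exit code:

import grpc

from sky.schemas.generated import jobsv1_pb2
from sky.schemas.generated import jobsv1_pb2_grpc
from sky.skylet import constants

channel = grpc.insecure_channel(f'127.0.0.1:{constants.SKYLET_GRPC_PORT}')
stub = jobsv1_pb2_grpc.JobsServiceStub(channel)

# Fetch the last 100 lines of job 1 without following; job_id, follow and
# tail mirror the request fields read by the servicer.
request = jobsv1_pb2.TailLogsRequest(job_id=1, follow=False, tail=100)
exit_code = 0
for response in stub.TailLogs(request):
    if response.log_line:
        # A chunk of buffered log text (up to ~16KiB per message).
        print(response.log_line, end='')
    else:
        # The final message carries the exit code (0 here even for a
        # still-running job, per the dashboard fix noted above).
        exit_code = response.exit_code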
sky/templates/jobs-controller.yaml.j2
CHANGED
@@ -36,6 +36,9 @@ setup: |
   grep -q 'alias sky-env=' ~/.bashrc || echo 'alias sky-env="{{ sky_activate_python_env }}"' >> ~/.bashrc
   {% endif %}
 
+  # This is used by the skylet events to check if we are a jobs controller.
+  touch {{job_controller_indicator_file}}
+
 run: |
   {%- if consolidation_mode_job_id is none %}
   {{ sky_activate_python_env }}
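
The indicator file touched above is what the template comment refers to: skylet-side code (for example the new logic in sky/skylet/events.py, which is not shown in this diff) can decide whether it is running on a jobs controller by checking for the file. A purely hypothetical sketch of such a check; the path and helper name are assumptions, not the actual implementation:

import os

# Assumed value of {{job_controller_indicator_file}}; the real path is set
# by the template variable and is not shown in this diff.
JOB_CONTROLLER_INDICATOR_FILE = os.path.expanduser(
    '~/.sky/is_jobs_controller')


def is_jobs_controller() -> bool:
    # The setup step above touches the indicator file only on jobs
    # controllers, so its existence identifies the node role.
    return os.path.exists(JOB_CONTROLLER_INDICATOR_FILE)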
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -901,15 +901,20 @@ available_node_types:
 {{ conda_installation_commands }}
 {{ ray_installation_commands }}
 
-
+# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
+# unset PYTHONPATH in case the user provided docker image set it.
+VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
 # Wait for `patch` package to be installed before applying ray patches
 until dpkg -l | grep -q "^ii patch "; do
   sleep 0.1
   echo "Waiting for patch package to be installed..."
 done
 # Apply Ray patches for progress bar fix
-
-
+# set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
+# unset PYTHONPATH in case the user provided docker image set it.
+# ~/.sky/python_path is seeded by conda_installation_commands
+VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
+  $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
 }
 touch /tmp/ray_skypilot_installation_complete
 echo "=== Ray and skypilot installation completed ==="
sky/utils/db/db_utils.py
CHANGED
@@ -201,6 +201,7 @@ def add_column_to_table_alembic(
     server_default: Optional[str] = None,
     copy_from: Optional[str] = None,
     value_to_replace_existing_entries: Optional[Any] = None,
+    index: Optional[bool] = None,
 ):
     """Add a column to a table using Alembic operations.
 
@@ -215,6 +216,8 @@ def add_column_to_table_alembic(
         copy_from: Column name to copy values from (for existing rows)
         value_to_replace_existing_entries: Default value for existing NULL
             entries
+        index: If True, create an index on this column. If None, no index
+            is created.
     """
     from alembic import op  # pylint: disable=import-outside-toplevel
 
@@ -222,7 +225,8 @@ def add_column_to_table_alembic(
     # Create the column with server_default if provided
     column = sqlalchemy.Column(column_name,
                                column_type,
-                               server_default=server_default)
+                               server_default=server_default,
+                               index=index)
     op.add_column(table_name, column)
 
     # Handle data migration
sky/utils/db/migration_utils.py
CHANGED
@@ -17,7 +17,7 @@ logger = sky_logging.init_logger(__name__)
 DB_INIT_LOCK_TIMEOUT_SECONDS = 10
 
 GLOBAL_USER_STATE_DB_NAME = 'state_db'
-GLOBAL_USER_STATE_VERSION = '
+GLOBAL_USER_STATE_VERSION = '009'
 GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
 
 SPOT_JOBS_DB_NAME = 'spot_jobs_db'