PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20250513__py3-none-any.whl → 1.0.0.dev20250514__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20250513__py3-none-any.whl → 1.0.0.dev20250514__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

sky/__init__.py +2 -2
sky/backends/backend_utils.py +0 -3
sky/backends/cloud_vm_ray_backend.py +22 -10
sky/clouds/gcp.py +24 -8
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +33 -11
sky/clouds/service_catalog/gcp_catalog.py +7 -1
sky/dashboard/out/404.html +1 -1
sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
sky/dashboard/out/clusters/[cluster].html +1 -1
sky/dashboard/out/clusters.html +1 -1
sky/dashboard/out/index.html +1 -1
sky/dashboard/out/jobs/[job].html +1 -1
sky/dashboard/out/jobs.html +1 -1
sky/global_user_state.py +0 -2
sky/resources.py +4 -0
sky/server/requests/executor.py +22 -114
sky/server/requests/requests.py +0 -15
sky/server/server.py +7 -12
sky/server/uvicorn.py +2 -12
sky/sky_logging.py +2 -40
sky/skylet/log_lib.py +11 -51
sky/templates/nebius-ray.yml.j2 +3 -1
sky/utils/command_runner.py +0 -3
sky/utils/rich_utils.py +37 -81
sky/utils/subprocess_utils.py +2 -8
{skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250514.dist-info}/METADATA +1 -1
{skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250514.dist-info}/RECORD +33 -35
sky/utils/context.py +0 -264
sky/utils/context_utils.py +0 -172
/sky/dashboard/out/_next/static/{2dkponv64SfFShA8Rnw0D → tdxxQrPV6NW90a983oHXe}/_buildManifest.js +0 -0
/sky/dashboard/out/_next/static/{2dkponv64SfFShA8Rnw0D → tdxxQrPV6NW90a983oHXe}/_ssgManifest.js +0 -0
{skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250514.dist-info}/WHEEL +0 -0
{skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250514.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250514.dist-info}/licenses/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250514.dist-info}/top_level.txt +0 -0

sky/server/requests/executor.py CHANGED Viewed

@@ -18,10 +18,7 @@ The number of the workers is determined by the system resources.
 See the [README.md](../README.md) for detailed architecture of the executor.
 """
-import asyncio
 import contextlib
-import contextvars
-import functools
 import multiprocessing
 import os
 import queue as queue_lib
@@ -50,7 +47,6 @@ from sky.server.requests.queues import mp_queue
 from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
-from sky.utils import context
 from sky.utils import subprocess_utils
 from sky.utils import timeline
@@ -64,6 +60,7 @@ else:
     from typing_extensions import ParamSpec
 P = ParamSpec('P')
 logger = sky_logging.init_logger(__name__)
 # On macOS, the default start method for multiprocessing is 'fork', which
@@ -344,114 +341,6 @@ def _request_execution_wrapper(request_id: str,
             logger.info(f'Request {request_id} finished')
-async def execute_request_coroutine(request: api_requests.Request):
-    """Execute a request in current event loop.
-    Similar to _request_execution_wrapper, but executed as coroutine in current
-    event loop. This is designed for executing tasks that are not CPU
-    intensive, e.g. sky logs.
-    """
-    ctx = context.get()
-    if ctx is None:
-        raise ValueError('Context is not initialized')
-    logger.info(f'Executing request {request.request_id} in coroutine')
-    func = request.entrypoint
-    request_body = request.request_body
-    with api_requests.update_request(request.request_id) as request_task:
-        request_task.status = api_requests.RequestStatus.RUNNING
-    # Redirect stdout and stderr to the request log path.
-    original_output = ctx.redirect_log(request.log_path)
-    # Override environment variables that backs env_options.Options
-    # TODO(aylei): compared to process executor, running task in coroutine has
-    # two issues to fix:
-    # 1. skypilot config is not contextual
-    # 2. envs that read directly from os.environ are not contextual
-    ctx.override_envs(request_body.env_vars)
-    loop = asyncio.get_running_loop()
-    pyctx = contextvars.copy_context()
-    func_call = functools.partial(pyctx.run, func, **request_body.to_kwargs())
-    fut: asyncio.Future = loop.run_in_executor(None, func_call)
-    async def poll_task(request_id: str) -> bool:
-        request = api_requests.get_request(request_id)
-        if request is None:
-            raise RuntimeError('Request not found')
-        if request.status == api_requests.RequestStatus.CANCELLED:
-            ctx.cancel()
-            return True
-        if fut.done():
-            try:
-                result = await fut
-                api_requests.set_request_succeeded(request_id, result)
-            except asyncio.CancelledError:
-                # The task is cancelled by ctx.cancel(), where the status
-                # should already be set to CANCELLED.
-                pass
-            except Exception as e:  # pylint: disable=broad-except
-                ctx.redirect_log(original_output)
-                api_requests.set_request_failed(request_id, e)
-                logger.error(f'Request {request_id} failed due to '
-                             f'{common_utils.format_exception(e)}')
-            return True
-        return False
-    try:
-        while True:
-            res = await poll_task(request.request_id)
-            if res:
-                break
-            await asyncio.sleep(0.5)
-    except asyncio.CancelledError:
-        # Current coroutine is cancelled due to client disconnect, set the
-        # request status for consistency.
-        api_requests.set_request_cancelled(request.request_id)
-        pass
-    # pylint: disable=broad-except
-    except (Exception, KeyboardInterrupt, SystemExit) as e:
-        # Handle any other error
-        ctx.redirect_log(original_output)
-        ctx.cancel()
-        api_requests.set_request_failed(request.request_id, e)
-        logger.error(f'Request {request.request_id} interrupted due to '
-                     f'unhandled exception: {common_utils.format_exception(e)}')
-        raise
-def prepare_request(
-    request_id: str,
-    request_name: str,
-    request_body: payloads.RequestBody,
-    func: Callable[P, Any],
-    request_cluster_name: Optional[str] = None,
-    schedule_type: api_requests.ScheduleType = (api_requests.ScheduleType.LONG),
-    is_skypilot_system: bool = False,
-) -> api_requests.Request:
-    """Prepare a request for execution."""
-    user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
-    if is_skypilot_system:
-        user_id = server_constants.SKYPILOT_SYSTEM_USER_ID
-        global_user_state.add_or_update_user(
-            models.User(id=user_id, name=user_id))
-    request = api_requests.Request(request_id=request_id,
-                                   name=server_constants.REQUEST_NAME_PREFIX +
-                                   request_name,
-                                   entrypoint=func,
-                                   request_body=request_body,
-                                   status=api_requests.RequestStatus.PENDING,
-                                   created_at=time.time(),
-                                   schedule_type=schedule_type,
-                                   user_id=user_id,
-                                   cluster_name=request_cluster_name)
-    if not api_requests.create_if_not_exists(request):
-        raise RuntimeError(f'Request {request_id} already exists.')
-    request.log_path.touch()
-    return request
 def schedule_request(
         request_id: str,
         request_name: str,
@@ -483,8 +372,27 @@ def schedule_request(
             The precondition is waited asynchronously and does not block the
             caller.
     """
-    prepare_request(request_id, request_name, request_body, func,
-                    request_cluster_name, schedule_type, is_skypilot_system)
+    user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
+    if is_skypilot_system:
+        user_id = server_constants.SKYPILOT_SYSTEM_USER_ID
+        global_user_state.add_or_update_user(
+            models.User(id=user_id, name=user_id))
+    request = api_requests.Request(request_id=request_id,
+                                   name=server_constants.REQUEST_NAME_PREFIX +
+                                   request_name,
+                                   entrypoint=func,
+                                   request_body=request_body,
+                                   status=api_requests.RequestStatus.PENDING,
+                                   created_at=time.time(),
+                                   schedule_type=schedule_type,
+                                   user_id=user_id,
+                                   cluster_name=request_cluster_name)
+    if not api_requests.create_if_not_exists(request):
+        logger.debug(f'Request {request_id} already exists.')
+        return
+    request.log_path.touch()
     def enqueue():
         input_tuple = (request_id, ignore_return_value)

sky/server/requests/requests.py CHANGED Viewed

@@ -606,18 +606,3 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
         assert request_task is not None, request_id
         request_task.status = RequestStatus.FAILED
         request_task.set_error(e)
-def set_request_succeeded(request_id: str, result: Any) -> None:
-    """Set a request to succeeded and populate the result."""
-    with update_request(request_id) as request_task:
-        assert request_task is not None, request_id
-        request_task.status = RequestStatus.SUCCEEDED
-        request_task.set_return_value(result)
-def set_request_cancelled(request_id: str) -> None:
-    """Set a request to cancelled."""
-    with update_request(request_id) as request_task:
-        assert request_task is not None, request_id
-        request_task.status = RequestStatus.CANCELLED

sky/server/server.py CHANGED Viewed

@@ -47,7 +47,6 @@ from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
 from sky.utils import common as common_lib
 from sky.utils import common_utils
-from sky.utils import context
 from sky.utils import dag_utils
 from sky.utils import env_options
 from sky.utils import status_lib
@@ -674,28 +673,24 @@ async def logs(
     # TODO(zhwu): This should wait for the request on the cluster, e.g., async
     # launch, to finish, so that a user does not need to manually pull the
     # request status.
-    # Only initialize the context in logs handler to limit the scope of this
-    # experimental change.
-    # TODO(aylei): init in lifespan() to enable SkyPilot context in all APIs.
-    context.initialize()
-    request_task = executor.prepare_request(
+    executor.schedule_request(
         request_id=request.state.request_id,
         request_name='logs',
         request_body=cluster_job_body,
         func=core.tail_logs,
+        # TODO(aylei): We have tail logs scheduled as SHORT request, because it
+        # should be responsive. However, it can be long running if the user's
+        # job keeps running, and we should avoid it taking the SHORT worker.
         schedule_type=requests_lib.ScheduleType.SHORT,
+        request_cluster_name=cluster_job_body.cluster_name,
     )
-    task = asyncio.create_task(executor.execute_request_coroutine(request_task))
-    def cancel_task():
-        task.cancel()
+    request_task = requests_lib.get_request(request.state.request_id)
-    # Cancel the task after the request is done or client disconnects
-    background_tasks.add_task(cancel_task)
     # TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
     # the same approach as /stream.
     return stream_utils.stream_response(
-        request_id=request.state.request_id,
+        request_id=request_task.request_id,
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
     )

sky/server/uvicorn.py CHANGED Viewed

@@ -3,7 +3,6 @@
 This module is a wrapper around uvicorn to customize the behavior of the
 server.
 """
-import functools
 import os
 import threading
 from typing import Optional
@@ -11,7 +10,6 @@ from typing import Optional
 import uvicorn
 from uvicorn.supervisors import multiprocess
-from sky.utils import context_utils
 from sky.utils import subprocess_utils
@@ -23,27 +21,19 @@ def run(config: uvicorn.Config):
         # guard by an exception.
         raise ValueError('Reload is not supported yet.')
     server = uvicorn.Server(config=config)
-    run_server_process = functools.partial(_run_server_process, server)
     try:
         if config.workers is not None and config.workers > 1:
             sock = config.bind_socket()
-            SlowStartMultiprocess(config,
-                                  target=run_server_process,
+            SlowStartMultiprocess(config, target=server.run,
                                   sockets=[sock]).run()
         else:
-            run_server_process()
+            server.run()
     finally:
         # Copied from unvicorn.run()
         if config.uds and os.path.exists(config.uds):
             os.remove(config.uds)
-def _run_server_process(server: uvicorn.Server, *args, **kwargs):
-    """Run the server process with contextually aware."""
-    context_utils.hijack_sys_attrs()
-    server.run(*args, **kwargs)
 class SlowStartMultiprocess(multiprocess.Multiprocess):
     """Uvicorn Multiprocess wrapper with slow start.

sky/sky_logging.py CHANGED Viewed

@@ -10,7 +10,6 @@ import threading
 import colorama
 from sky.skylet import constants
-from sky.utils import context
 from sky.utils import env_options
 from sky.utils import rich_utils
@@ -48,43 +47,6 @@ class NewLineFormatter(logging.Formatter):
         return msg
-class EnvAwareHandler(rich_utils.RichSafeStreamHandler):
-    """A handler that awares environment variables.
-    This handler dynamically reflects the log level from environment variables.
-    """
-    def __init__(self, stream=None, level=logging.NOTSET, sensitive=False):
-        super().__init__(stream)
-        self.level = level
-        self._sensitive = sensitive
-    @property
-    def level(self):
-        # Only refresh log level if we are in a context, since the log level
-        # has already been reloaded eagerly in multi-processing. Refresh again
-        # is a no-op and can be avoided.
-        # TODO(aylei): unify the mechanism for coroutine context and
-        # multi-processing.
-        if context.get() is not None:
-            if self._sensitive:
-                # For sensitive logger, suppress debug log despite the
-                # SKYPILOT_DEBUG env var if SUPPRESS_SENSITIVE_LOG is set
-                if env_options.Options.SUPPRESS_SENSITIVE_LOG.get():
-                    return logging.INFO
-            if env_options.Options.SHOW_DEBUG_INFO.get():
-                return logging.DEBUG
-            else:
-                return self._level
-        else:
-            return self._level
-    @level.setter
-    def level(self, level):
-        # pylint: disable=protected-access
-        self._level = logging._checkLevel(level)
 _root_logger = logging.getLogger('sky')
 _default_handler = None
 _logging_config = threading.local()
@@ -105,7 +67,7 @@ def _setup_logger():
     _root_logger.setLevel(logging.DEBUG)
     global _default_handler
     if _default_handler is None:
-        _default_handler = EnvAwareHandler(sys.stdout)
+        _default_handler = rich_utils.RichSafeStreamHandler(sys.stdout)
         _default_handler.flush = sys.stdout.flush  # type: ignore
         if env_options.Options.SHOW_DEBUG_INFO.get():
             _default_handler.setLevel(logging.DEBUG)
@@ -125,7 +87,7 @@ def _setup_logger():
         # for certain loggers.
         for logger_name in _SENSITIVE_LOGGER:
             logger = logging.getLogger(logger_name)
-            handler_to_logger = EnvAwareHandler(sys.stdout, sensitive=True)
+            handler_to_logger = rich_utils.RichSafeStreamHandler(sys.stdout)
             handler_to_logger.flush = sys.stdout.flush  # type: ignore
             logger.addHandler(handler_to_logger)
             logger.setLevel(logging.INFO)

sky/skylet/log_lib.py CHANGED Viewed

@@ -4,7 +4,6 @@ This is a remote utility module that provides logging functionality.
 """
 import collections
 import copy
-import functools
 import io
 import multiprocessing.pool
 import os
@@ -22,8 +21,6 @@ import colorama
 from sky import sky_logging
 from sky.skylet import constants
 from sky.skylet import job_lib
-from sky.utils import context
-from sky.utils import context_utils
 from sky.utils import log_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
@@ -80,9 +77,6 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
     with open(args.log_path, 'a', encoding='utf-8') as fout:
         with line_processor:
             while True:
-                ctx = context.get()
-                if ctx is not None and ctx.is_canceled():
-                    return
                 line = out_io.readline()
                 if not line:
                     break
@@ -117,29 +111,30 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
     return ''.join(out)
-def process_subprocess_stream(proc, stdout_stream_handler,
-                              stderr_stream_handler) -> Tuple[str, str]:
-    """Process the stream of a process in threads, blocking."""
+def process_subprocess_stream(proc, args: _ProcessingArgs) -> Tuple[str, str]:
+    """Redirect the process's filtered stdout/stderr to both stream and file"""
     if proc.stderr is not None:
         # Asyncio does not work as the output processing can be executed in a
         # different thread.
         # selectors is possible to handle the multiplexing of stdout/stderr,
         # but it introduces buffering making the output not streaming.
         with multiprocessing.pool.ThreadPool(processes=1) as pool:
-            stderr_fut = pool.apply_async(stderr_stream_handler,
-                                          args=(proc.stderr, sys.stderr))
+            err_args = copy.copy(args)
+            err_args.line_processor = None
+            stderr_fut = pool.apply_async(_handle_io_stream,
+                                          args=(proc.stderr, sys.stderr,
+                                                err_args))
             # Do not launch a thread for stdout as the rich.status does not
             # work in a thread, which is used in
             # log_utils.RayUpLineProcessor.
-            stdout = stdout_stream_handler(proc.stdout, sys.stdout)
+            stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
             stderr = stderr_fut.get()
     else:
-        stdout = stdout_stream_handler(proc.stdout, sys.stdout)
+        stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
         stderr = ''
     return stdout, stderr
-@context_utils.cancellation_guard
 def run_with_log(
     cmd: Union[List[str], str],
     log_path: str,
@@ -181,12 +176,7 @@ def run_with_log(
     # Redirect stderr to stdout when using ray, to preserve the order of
     # stdout and stderr.
     stdout_arg = stderr_arg = None
-    ctx = context.get()
-    if process_stream or ctx is not None:
-        # Capture stdout/stderr of the subprocess if:
-        # 1. Post-processing is needed (process_stream=True)
-        # 2. Potential contextual handling is needed (ctx is not None)
-        # TODO(aylei): can we always capture the stdout/stderr?
+    if process_stream:
         stdout_arg = subprocess.PIPE
         stderr_arg = subprocess.PIPE if not with_ray else subprocess.STDOUT
     # Use stdin=subprocess.DEVNULL by default, as allowing inputs will mess up
@@ -207,8 +197,6 @@ def run_with_log(
             subprocess_utils.kill_process_daemon(proc.pid)
             stdout = ''
             stderr = ''
-            stdout_stream_handler = None
-            stderr_stream_handler = None
             if process_stream:
                 if skip_lines is None:
@@ -235,35 +223,7 @@ def run_with_log(
                     replace_crlf=with_ray,
                     streaming_prefix=streaming_prefix,
                 )
-                stdout_stream_handler = functools.partial(
-                    _handle_io_stream,
-                    args=args,
-                )
-                if proc.stderr is not None:
-                    err_args = copy.copy(args)
-                    err_args.line_processor = None
-                    stderr_stream_handler = functools.partial(
-                        _handle_io_stream,
-                        args=err_args,
-                    )
-            if ctx is not None:
-                # When runs in a coroutine, always process the subprocess
-                # stream to:
-                # 1. handle context cancellation
-                # 2. redirect subprocess stdout/stderr to the contextual
-                #    stdout/stderr of current coroutine.
-                stdout, stderr = context_utils.pipe_and_wait_process(
-                    ctx,
-                    proc,
-                    cancel_callback=subprocess_utils.kill_children_processes,
-                    stdout_stream_handler=stdout_stream_handler,
-                    stderr_stream_handler=stderr_stream_handler)
-            elif process_stream:
-                # When runs in a process, only process subprocess stream if
-                # necessary to avoid unnecessary stream handling overhead.
-                stdout, stderr = process_subprocess_stream(
-                    proc, stdout_stream_handler, stderr_stream_handler)
-            # Ensure returncode is set.
+                stdout, stderr = process_subprocess_stream(proc, args)
             proc.wait()
             if require_outputs:
                 return proc.returncode, stdout, stderr

sky/templates/nebius-ray.yml.j2 CHANGED Viewed

@@ -105,6 +105,7 @@ file_mounts: {
   "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
 {%- for remote_path, local_path in credentials.items() %}
   "{{remote_path}}": "{{local_path}}",
+  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
 {%- endfor %}
 }
@@ -120,6 +121,7 @@ initialization_commands: []
 # Increment the following for catching performance bugs easier:
 #   current num items (num SSH connections): 1
 setup_commands:
+  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
   # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
   # Create ~/.ssh/config file in case the file does not exist in the image.
   # Line 'rm ..': there is another installation of pip.
@@ -142,6 +144,6 @@ setup_commands:
     {{ ray_skypilot_installation_commands }}
     sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
     sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
-    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
     [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
     {{ ssh_max_sessions_config }}

sky/utils/command_runner.py CHANGED Viewed

@@ -11,7 +11,6 @@ from sky import sky_logging
 from sky.skylet import constants
 from sky.skylet import log_lib
 from sky.utils import common_utils
-from sky.utils import context_utils
 from sky.utils import control_master_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
@@ -575,7 +574,6 @@ class SSHCommandRunner(CommandRunner):
                                      shell=True)
     @timeline.event
-    @context_utils.cancellation_guard
     def run(
             self,
             cmd: Union[str, List[str]],
@@ -781,7 +779,6 @@ class KubernetesCommandRunner(CommandRunner):
         return kubectl_cmd
     @timeline.event
-    @context_utils.cancellation_guard
     def run(
             self,
             cmd: Union[str, List[str]],

skypilot-nightly 1.0.0.dev20250513__py3-none-any.whl → 1.0.0.dev20250514__py3-none-any.whl

skypilot-nightly 1.0.0.dev20250513__py3-none-any.whl → 1.0.0.dev20250514__py3-none-any.whl