skypilot-nightly 1.0.0.dev20250510__py3-none-any.whl → 1.0.0.dev20250513__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +3 -0
  3. sky/backends/cloud_vm_ray_backend.py +7 -0
  4. sky/cli.py +109 -109
  5. sky/client/cli.py +109 -109
  6. sky/clouds/gcp.py +35 -8
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/{C0fkLhvxyqkymoV7IeInQ → 2dkponv64SfFShA8Rnw0D}/_buildManifest.js +1 -1
  9. sky/dashboard/out/_next/static/chunks/845-0ca6f2c1ba667c3b.js +1 -0
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/index.html +1 -1
  14. sky/dashboard/out/jobs/[job].html +1 -1
  15. sky/dashboard/out/jobs.html +1 -1
  16. sky/global_user_state.py +2 -0
  17. sky/provision/docker_utils.py +4 -1
  18. sky/provision/gcp/config.py +197 -15
  19. sky/provision/gcp/constants.py +64 -0
  20. sky/provision/nebius/instance.py +3 -1
  21. sky/provision/nebius/utils.py +4 -2
  22. sky/server/requests/executor.py +114 -22
  23. sky/server/requests/requests.py +15 -0
  24. sky/server/server.py +12 -7
  25. sky/server/uvicorn.py +12 -2
  26. sky/sky_logging.py +40 -2
  27. sky/skylet/constants.py +3 -0
  28. sky/skylet/log_lib.py +51 -11
  29. sky/templates/gcp-ray.yml.j2 +11 -0
  30. sky/templates/nebius-ray.yml.j2 +4 -0
  31. sky/templates/websocket_proxy.py +29 -9
  32. sky/utils/command_runner.py +3 -0
  33. sky/utils/context.py +264 -0
  34. sky/utils/context_utils.py +172 -0
  35. sky/utils/rich_utils.py +81 -37
  36. sky/utils/schemas.py +9 -1
  37. sky/utils/subprocess_utils.py +8 -2
  38. {skypilot_nightly-1.0.0.dev20250510.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/METADATA +1 -1
  39. {skypilot_nightly-1.0.0.dev20250510.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/RECORD +44 -42
  40. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  41. /sky/dashboard/out/_next/static/{C0fkLhvxyqkymoV7IeInQ → 2dkponv64SfFShA8Rnw0D}/_ssgManifest.js +0 -0
  42. {skypilot_nightly-1.0.0.dev20250510.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/WHEEL +0 -0
  43. {skypilot_nightly-1.0.0.dev20250510.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/entry_points.txt +0 -0
  44. {skypilot_nightly-1.0.0.dev20250510.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/licenses/LICENSE +0 -0
  45. {skypilot_nightly-1.0.0.dev20250510.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/top_level.txt +0 -0
sky/server/server.py CHANGED
@@ -47,6 +47,7 @@ from sky.usage import usage_lib
47
47
  from sky.utils import admin_policy_utils
48
48
  from sky.utils import common as common_lib
49
49
  from sky.utils import common_utils
50
+ from sky.utils import context
50
51
  from sky.utils import dag_utils
51
52
  from sky.utils import env_options
52
53
  from sky.utils import status_lib
@@ -673,24 +674,28 @@ async def logs(
673
674
  # TODO(zhwu): This should wait for the request on the cluster, e.g., async
674
675
  # launch, to finish, so that a user does not need to manually pull the
675
676
  # request status.
676
- executor.schedule_request(
677
+ # Only initialize the context in logs handler to limit the scope of this
678
+ # experimental change.
679
+ # TODO(aylei): init in lifespan() to enable SkyPilot context in all APIs.
680
+ context.initialize()
681
+ request_task = executor.prepare_request(
677
682
  request_id=request.state.request_id,
678
683
  request_name='logs',
679
684
  request_body=cluster_job_body,
680
685
  func=core.tail_logs,
681
- # TODO(aylei): We have tail logs scheduled as SHORT request, because it
682
- # should be responsive. However, it can be long running if the user's
683
- # job keeps running, and we should avoid it taking the SHORT worker.
684
686
  schedule_type=requests_lib.ScheduleType.SHORT,
685
- request_cluster_name=cluster_job_body.cluster_name,
686
687
  )
688
+ task = asyncio.create_task(executor.execute_request_coroutine(request_task))
687
689
 
688
- request_task = requests_lib.get_request(request.state.request_id)
690
+ def cancel_task():
691
+ task.cancel()
689
692
 
693
+ # Cancel the task after the request is done or client disconnects
694
+ background_tasks.add_task(cancel_task)
690
695
  # TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
691
696
  # the same approach as /stream.
692
697
  return stream_utils.stream_response(
693
- request_id=request_task.request_id,
698
+ request_id=request.state.request_id,
694
699
  logs_path=request_task.log_path,
695
700
  background_tasks=background_tasks,
696
701
  )
sky/server/uvicorn.py CHANGED
@@ -3,6 +3,7 @@
3
3
  This module is a wrapper around uvicorn to customize the behavior of the
4
4
  server.
5
5
  """
6
+ import functools
6
7
  import os
7
8
  import threading
8
9
  from typing import Optional
@@ -10,6 +11,7 @@ from typing import Optional
10
11
  import uvicorn
11
12
  from uvicorn.supervisors import multiprocess
12
13
 
14
+ from sky.utils import context_utils
13
15
  from sky.utils import subprocess_utils
14
16
 
15
17
 
@@ -21,19 +23,27 @@ def run(config: uvicorn.Config):
21
23
  # guard by an exception.
22
24
  raise ValueError('Reload is not supported yet.')
23
25
  server = uvicorn.Server(config=config)
26
+ run_server_process = functools.partial(_run_server_process, server)
24
27
  try:
25
28
  if config.workers is not None and config.workers > 1:
26
29
  sock = config.bind_socket()
27
- SlowStartMultiprocess(config, target=server.run,
30
+ SlowStartMultiprocess(config,
31
+ target=run_server_process,
28
32
  sockets=[sock]).run()
29
33
  else:
30
- server.run()
34
+ run_server_process()
31
35
  finally:
32
36
  # Copied from uvicorn.run()
33
37
  if config.uds and os.path.exists(config.uds):
34
38
  os.remove(config.uds)
35
39
 
36
40
 
41
+ def _run_server_process(server: uvicorn.Server, *args, **kwargs):
42
+ """Run the server process with context awareness."""
43
+ context_utils.hijack_sys_attrs()
44
+ server.run(*args, **kwargs)
45
+
46
+
37
47
  class SlowStartMultiprocess(multiprocess.Multiprocess):
38
48
  """Uvicorn Multiprocess wrapper with slow start.
39
49
 
sky/sky_logging.py CHANGED
@@ -10,6 +10,7 @@ import threading
10
10
  import colorama
11
11
 
12
12
  from sky.skylet import constants
13
+ from sky.utils import context
13
14
  from sky.utils import env_options
14
15
  from sky.utils import rich_utils
15
16
 
@@ -47,6 +48,43 @@ class NewLineFormatter(logging.Formatter):
47
48
  return msg
48
49
 
49
50
 
51
+ class EnvAwareHandler(rich_utils.RichSafeStreamHandler):
52
+ """A handler that is aware of environment variables.
53
+
54
+ This handler dynamically reflects the log level from environment variables.
55
+ """
56
+
57
+ def __init__(self, stream=None, level=logging.NOTSET, sensitive=False):
58
+ super().__init__(stream)
59
+ self.level = level
60
+ self._sensitive = sensitive
61
+
62
+ @property
63
+ def level(self):
64
+ # Only refresh log level if we are in a context, since the log level
65
+ # has already been reloaded eagerly in multi-processing. Refresh again
66
+ # is a no-op and can be avoided.
67
+ # TODO(aylei): unify the mechanism for coroutine context and
68
+ # multi-processing.
69
+ if context.get() is not None:
70
+ if self._sensitive:
71
+ # For sensitive logger, suppress debug log despite the
72
+ # SKYPILOT_DEBUG env var if SUPPRESS_SENSITIVE_LOG is set
73
+ if env_options.Options.SUPPRESS_SENSITIVE_LOG.get():
74
+ return logging.INFO
75
+ if env_options.Options.SHOW_DEBUG_INFO.get():
76
+ return logging.DEBUG
77
+ else:
78
+ return self._level
79
+ else:
80
+ return self._level
81
+
82
+ @level.setter
83
+ def level(self, level):
84
+ # pylint: disable=protected-access
85
+ self._level = logging._checkLevel(level)
86
+
87
+
50
88
  _root_logger = logging.getLogger('sky')
51
89
  _default_handler = None
52
90
  _logging_config = threading.local()
@@ -67,7 +105,7 @@ def _setup_logger():
67
105
  _root_logger.setLevel(logging.DEBUG)
68
106
  global _default_handler
69
107
  if _default_handler is None:
70
- _default_handler = rich_utils.RichSafeStreamHandler(sys.stdout)
108
+ _default_handler = EnvAwareHandler(sys.stdout)
71
109
  _default_handler.flush = sys.stdout.flush # type: ignore
72
110
  if env_options.Options.SHOW_DEBUG_INFO.get():
73
111
  _default_handler.setLevel(logging.DEBUG)
@@ -87,7 +125,7 @@ def _setup_logger():
87
125
  # for certain loggers.
88
126
  for logger_name in _SENSITIVE_LOGGER:
89
127
  logger = logging.getLogger(logger_name)
90
- handler_to_logger = rich_utils.RichSafeStreamHandler(sys.stdout)
128
+ handler_to_logger = EnvAwareHandler(sys.stdout, sensitive=True)
91
129
  handler_to_logger.flush = sys.stdout.flush # type: ignore
92
130
  logger.addHandler(handler_to_logger)
93
131
  logger.setLevel(logging.INFO)
sky/skylet/constants.py CHANGED
@@ -370,6 +370,9 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
370
370
  ('kubernetes', 'pod_config'),
371
371
  ('kubernetes', 'provision_timeout'),
372
372
  ('gcp', 'managed_instance_group'),
373
+ ('gcp', 'enable_gvnic'),
374
+ ('gcp', 'enable_gpu_direct'),
375
+ ('gcp', 'placement_policy'),
373
376
  ]
374
377
  # When overriding the SkyPilot configs on the API server with the client one,
375
378
  # we skip the following keys because they are meant to be client-side configs.
sky/skylet/log_lib.py CHANGED
@@ -4,6 +4,7 @@ This is a remote utility module that provides logging functionality.
4
4
  """
5
5
  import collections
6
6
  import copy
7
+ import functools
7
8
  import io
8
9
  import multiprocessing.pool
9
10
  import os
@@ -21,6 +22,8 @@ import colorama
21
22
  from sky import sky_logging
22
23
  from sky.skylet import constants
23
24
  from sky.skylet import job_lib
25
+ from sky.utils import context
26
+ from sky.utils import context_utils
24
27
  from sky.utils import log_utils
25
28
  from sky.utils import subprocess_utils
26
29
  from sky.utils import ux_utils
@@ -77,6 +80,9 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
77
80
  with open(args.log_path, 'a', encoding='utf-8') as fout:
78
81
  with line_processor:
79
82
  while True:
83
+ ctx = context.get()
84
+ if ctx is not None and ctx.is_canceled():
85
+ return
80
86
  line = out_io.readline()
81
87
  if not line:
82
88
  break
@@ -111,30 +117,29 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
111
117
  return ''.join(out)
112
118
 
113
119
 
114
- def process_subprocess_stream(proc, args: _ProcessingArgs) -> Tuple[str, str]:
115
- """Redirect the process's filtered stdout/stderr to both stream and file"""
120
+ def process_subprocess_stream(proc, stdout_stream_handler,
121
+ stderr_stream_handler) -> Tuple[str, str]:
122
+ """Process the stream of a process in threads, blocking."""
116
123
  if proc.stderr is not None:
117
124
  # Asyncio does not work as the output processing can be executed in a
118
125
  # different thread.
119
126
  # selectors is possible to handle the multiplexing of stdout/stderr,
120
127
  # but it introduces buffering making the output not streaming.
121
128
  with multiprocessing.pool.ThreadPool(processes=1) as pool:
122
- err_args = copy.copy(args)
123
- err_args.line_processor = None
124
- stderr_fut = pool.apply_async(_handle_io_stream,
125
- args=(proc.stderr, sys.stderr,
126
- err_args))
129
+ stderr_fut = pool.apply_async(stderr_stream_handler,
130
+ args=(proc.stderr, sys.stderr))
127
131
  # Do not launch a thread for stdout as the rich.status does not
128
132
  # work in a thread, which is used in
129
133
  # log_utils.RayUpLineProcessor.
130
- stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
134
+ stdout = stdout_stream_handler(proc.stdout, sys.stdout)
131
135
  stderr = stderr_fut.get()
132
136
  else:
133
- stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
137
+ stdout = stdout_stream_handler(proc.stdout, sys.stdout)
134
138
  stderr = ''
135
139
  return stdout, stderr
136
140
 
137
141
 
142
+ @context_utils.cancellation_guard
138
143
  def run_with_log(
139
144
  cmd: Union[List[str], str],
140
145
  log_path: str,
@@ -176,7 +181,12 @@ def run_with_log(
176
181
  # Redirect stderr to stdout when using ray, to preserve the order of
177
182
  # stdout and stderr.
178
183
  stdout_arg = stderr_arg = None
179
- if process_stream:
184
+ ctx = context.get()
185
+ if process_stream or ctx is not None:
186
+ # Capture stdout/stderr of the subprocess if:
187
+ # 1. Post-processing is needed (process_stream=True)
188
+ # 2. Potential contextual handling is needed (ctx is not None)
189
+ # TODO(aylei): can we always capture the stdout/stderr?
180
190
  stdout_arg = subprocess.PIPE
181
191
  stderr_arg = subprocess.PIPE if not with_ray else subprocess.STDOUT
182
192
  # Use stdin=subprocess.DEVNULL by default, as allowing inputs will mess up
@@ -197,6 +207,8 @@ def run_with_log(
197
207
  subprocess_utils.kill_process_daemon(proc.pid)
198
208
  stdout = ''
199
209
  stderr = ''
210
+ stdout_stream_handler = None
211
+ stderr_stream_handler = None
200
212
 
201
213
  if process_stream:
202
214
  if skip_lines is None:
@@ -223,7 +235,35 @@ def run_with_log(
223
235
  replace_crlf=with_ray,
224
236
  streaming_prefix=streaming_prefix,
225
237
  )
226
- stdout, stderr = process_subprocess_stream(proc, args)
238
+ stdout_stream_handler = functools.partial(
239
+ _handle_io_stream,
240
+ args=args,
241
+ )
242
+ if proc.stderr is not None:
243
+ err_args = copy.copy(args)
244
+ err_args.line_processor = None
245
+ stderr_stream_handler = functools.partial(
246
+ _handle_io_stream,
247
+ args=err_args,
248
+ )
249
+ if ctx is not None:
250
+ # When runs in a coroutine, always process the subprocess
251
+ # stream to:
252
+ # 1. handle context cancellation
253
+ # 2. redirect subprocess stdout/stderr to the contextual
254
+ # stdout/stderr of current coroutine.
255
+ stdout, stderr = context_utils.pipe_and_wait_process(
256
+ ctx,
257
+ proc,
258
+ cancel_callback=subprocess_utils.kill_children_processes,
259
+ stdout_stream_handler=stdout_stream_handler,
260
+ stderr_stream_handler=stderr_stream_handler)
261
+ elif process_stream:
262
+ # When runs in a process, only process subprocess stream if
263
+ # necessary to avoid unnecessary stream handling overhead.
264
+ stdout, stderr = process_subprocess_stream(
265
+ proc, stdout_stream_handler, stderr_stream_handler)
266
+ # Ensure returncode is set.
227
267
  proc.wait()
228
268
  if require_outputs:
229
269
  return proc.returncode, stdout, stderr
@@ -69,6 +69,12 @@ provider:
69
69
  {%- if enable_gvnic %}
70
70
  enable_gvnic: {{ enable_gvnic }}
71
71
  {%- endif %}
72
+ {%- if enable_gpu_direct %}
73
+ enable_gpu_direct: {{ enable_gpu_direct }}
74
+ {%- endif %}
75
+ {%- if placement_policy %}
76
+ placement_policy: {{ placement_policy }}
77
+ {%- endif %}
72
78
 
73
79
  auth:
74
80
  ssh_user: gcpuser
@@ -148,6 +154,11 @@ available_node_types:
148
154
  - key: install-nvidia-driver
149
155
  value: "True"
150
156
  {%- endif %}
157
+ {%- if user_data is not none %}
158
+ - key: user-data
159
+ value: |-
160
+ {{ user_data | indent(10) }}
161
+ {%- endif %}
151
162
  {%- if use_spot or gpu is not none %}
152
163
  scheduling:
153
164
  {%- if use_spot %}
@@ -9,6 +9,7 @@ provider:
9
9
  type: external
10
10
  module: sky.provision.nebius
11
11
  region: "{{region}}"
12
+ use_internal_ips: {{use_internal_ips}}
12
13
 
13
14
  {%- if docker_image is not none %}
14
15
  docker:
@@ -34,6 +35,9 @@ docker:
34
35
  auth:
35
36
  ssh_user: ubuntu
36
37
  ssh_private_key: {{ssh_private_key}}
38
+ {% if ssh_proxy_command is not none %}
39
+ ssh_proxy_command: {{ssh_proxy_command}}
40
+ {% endif %}
37
41
 
38
42
  available_node_types:
39
43
  ray_head_default:
@@ -16,8 +16,11 @@ from typing import Dict
16
16
  from urllib.request import Request
17
17
 
18
18
  import websockets
19
+ from websockets.asyncio.client import ClientConnection
19
20
  from websockets.asyncio.client import connect
20
21
 
22
+ BUFFER_SIZE = 2**16 # 64KB
23
+
21
24
 
22
25
  def _get_cookie_header(url: str) -> Dict[str, str]:
23
26
  """Extract Cookie header value from a cookie jar for a specific URL"""
@@ -51,19 +54,36 @@ async def main(url: str) -> None:
51
54
  old_settings = None
52
55
 
53
56
  try:
54
- await asyncio.gather(stdin_to_websocket(websocket),
55
- websocket_to_stdout(websocket))
57
+ loop = asyncio.get_running_loop()
58
+ # Use asyncio.Stream primitives to wrap stdin and stdout, this is to
59
+ # avoid creating a new thread for each read/write operation
60
+ # excessively.
61
+ stdin_reader = asyncio.StreamReader()
62
+ protocol = asyncio.StreamReaderProtocol(stdin_reader)
63
+ await loop.connect_read_pipe(lambda: protocol, sys.stdin)
64
+ transport, protocol = await loop.connect_write_pipe(
65
+ asyncio.streams.FlowControlMixin, sys.stdout) # type: ignore
66
+ stdout_writer = asyncio.StreamWriter(transport, protocol, None,
67
+ loop)
68
+
69
+ await asyncio.gather(stdin_to_websocket(stdin_reader, websocket),
70
+ websocket_to_stdout(websocket, stdout_writer))
56
71
  finally:
57
72
  if old_settings:
58
73
  termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
59
74
  old_settings)
60
75
 
61
76
 
62
- async def stdin_to_websocket(websocket):
77
+ async def stdin_to_websocket(reader: asyncio.StreamReader,
78
+ websocket: ClientConnection):
63
79
  try:
64
80
  while True:
65
- data = await asyncio.get_event_loop().run_in_executor(
66
- None, sys.stdin.buffer.read, 1)
81
+ # Read at most BUFFER_SIZE bytes, this does not affect
82
+ # responsiveness since it will return as soon as
83
+ # there is at least one byte.
84
+ # The BUFFER_SIZE is chosen to be large enough to improve
85
+ # throughput.
86
+ data = await reader.read(BUFFER_SIZE)
67
87
  if not data:
68
88
  break
69
89
  await websocket.send(data)
@@ -73,13 +93,13 @@ async def stdin_to_websocket(websocket):
73
93
  await websocket.close()
74
94
 
75
95
 
76
- async def websocket_to_stdout(websocket):
96
+ async def websocket_to_stdout(websocket: ClientConnection,
97
+ writer: asyncio.StreamWriter):
77
98
  try:
78
99
  while True:
79
100
  message = await websocket.recv()
80
- sys.stdout.buffer.write(message)
81
- await asyncio.get_event_loop().run_in_executor(
82
- None, sys.stdout.buffer.flush)
101
+ writer.write(message)
102
+ await writer.drain()
83
103
  except websockets.exceptions.ConnectionClosed:
84
104
  print('WebSocket connection closed', file=sys.stderr)
85
105
  except Exception as e: # pylint: disable=broad-except
@@ -11,6 +11,7 @@ from sky import sky_logging
11
11
  from sky.skylet import constants
12
12
  from sky.skylet import log_lib
13
13
  from sky.utils import common_utils
14
+ from sky.utils import context_utils
14
15
  from sky.utils import control_master_utils
15
16
  from sky.utils import subprocess_utils
16
17
  from sky.utils import timeline
@@ -574,6 +575,7 @@ class SSHCommandRunner(CommandRunner):
574
575
  shell=True)
575
576
 
576
577
  @timeline.event
578
+ @context_utils.cancellation_guard
577
579
  def run(
578
580
  self,
579
581
  cmd: Union[str, List[str]],
@@ -779,6 +781,7 @@ class KubernetesCommandRunner(CommandRunner):
779
781
  return kubectl_cmd
780
782
 
781
783
  @timeline.event
784
+ @context_utils.cancellation_guard
782
785
  def run(
783
786
  self,
784
787
  cmd: Union[str, List[str]],