skypilot-nightly 1.0.0.dev20250513__py3-none-any.whl → 1.0.0.dev20250514__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +0 -3
- sky/backends/cloud_vm_ray_backend.py +22 -10
- sky/clouds/gcp.py +24 -8
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +33 -11
- sky/clouds/service_catalog/gcp_catalog.py +7 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/global_user_state.py +0 -2
- sky/resources.py +4 -0
- sky/server/requests/executor.py +22 -114
- sky/server/requests/requests.py +0 -15
- sky/server/server.py +7 -12
- sky/server/uvicorn.py +2 -12
- sky/sky_logging.py +2 -40
- sky/skylet/log_lib.py +11 -51
- sky/templates/nebius-ray.yml.j2 +3 -1
- sky/utils/command_runner.py +0 -3
- sky/utils/rich_utils.py +37 -81
- sky/utils/subprocess_utils.py +2 -8
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250514.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250514.dist-info}/RECORD +33 -35
- sky/utils/context.py +0 -264
- sky/utils/context_utils.py +0 -172
- /sky/dashboard/out/_next/static/{2dkponv64SfFShA8Rnw0D → tdxxQrPV6NW90a983oHXe}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{2dkponv64SfFShA8Rnw0D → tdxxQrPV6NW90a983oHXe}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250514.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250514.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250514.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250514.dist-info}/top_level.txt +0 -0
sky/server/requests/executor.py
CHANGED
@@ -18,10 +18,7 @@ The number of the workers is determined by the system resources.
|
|
18
18
|
|
19
19
|
See the [README.md](../README.md) for detailed architecture of the executor.
|
20
20
|
"""
|
21
|
-
import asyncio
|
22
21
|
import contextlib
|
23
|
-
import contextvars
|
24
|
-
import functools
|
25
22
|
import multiprocessing
|
26
23
|
import os
|
27
24
|
import queue as queue_lib
|
@@ -50,7 +47,6 @@ from sky.server.requests.queues import mp_queue
|
|
50
47
|
from sky.skylet import constants
|
51
48
|
from sky.utils import annotations
|
52
49
|
from sky.utils import common_utils
|
53
|
-
from sky.utils import context
|
54
50
|
from sky.utils import subprocess_utils
|
55
51
|
from sky.utils import timeline
|
56
52
|
|
@@ -64,6 +60,7 @@ else:
|
|
64
60
|
from typing_extensions import ParamSpec
|
65
61
|
|
66
62
|
P = ParamSpec('P')
|
63
|
+
|
67
64
|
logger = sky_logging.init_logger(__name__)
|
68
65
|
|
69
66
|
# On macOS, the default start method for multiprocessing is 'fork', which
|
@@ -344,114 +341,6 @@ def _request_execution_wrapper(request_id: str,
|
|
344
341
|
logger.info(f'Request {request_id} finished')
|
345
342
|
|
346
343
|
|
347
|
-
async def execute_request_coroutine(request: api_requests.Request):
|
348
|
-
"""Execute a request in current event loop.
|
349
|
-
|
350
|
-
Similar to _request_execution_wrapper, but executed as coroutine in current
|
351
|
-
event loop. This is designed for executing tasks that are not CPU
|
352
|
-
intensive, e.g. sky logs.
|
353
|
-
"""
|
354
|
-
ctx = context.get()
|
355
|
-
if ctx is None:
|
356
|
-
raise ValueError('Context is not initialized')
|
357
|
-
logger.info(f'Executing request {request.request_id} in coroutine')
|
358
|
-
func = request.entrypoint
|
359
|
-
request_body = request.request_body
|
360
|
-
with api_requests.update_request(request.request_id) as request_task:
|
361
|
-
request_task.status = api_requests.RequestStatus.RUNNING
|
362
|
-
# Redirect stdout and stderr to the request log path.
|
363
|
-
original_output = ctx.redirect_log(request.log_path)
|
364
|
-
# Override environment variables that backs env_options.Options
|
365
|
-
# TODO(aylei): compared to process executor, running task in coroutine has
|
366
|
-
# two issues to fix:
|
367
|
-
# 1. skypilot config is not contextual
|
368
|
-
# 2. envs that read directly from os.environ are not contextual
|
369
|
-
ctx.override_envs(request_body.env_vars)
|
370
|
-
loop = asyncio.get_running_loop()
|
371
|
-
pyctx = contextvars.copy_context()
|
372
|
-
func_call = functools.partial(pyctx.run, func, **request_body.to_kwargs())
|
373
|
-
fut: asyncio.Future = loop.run_in_executor(None, func_call)
|
374
|
-
|
375
|
-
async def poll_task(request_id: str) -> bool:
|
376
|
-
request = api_requests.get_request(request_id)
|
377
|
-
if request is None:
|
378
|
-
raise RuntimeError('Request not found')
|
379
|
-
|
380
|
-
if request.status == api_requests.RequestStatus.CANCELLED:
|
381
|
-
ctx.cancel()
|
382
|
-
return True
|
383
|
-
|
384
|
-
if fut.done():
|
385
|
-
try:
|
386
|
-
result = await fut
|
387
|
-
api_requests.set_request_succeeded(request_id, result)
|
388
|
-
except asyncio.CancelledError:
|
389
|
-
# The task is cancelled by ctx.cancel(), where the status
|
390
|
-
# should already be set to CANCELLED.
|
391
|
-
pass
|
392
|
-
except Exception as e: # pylint: disable=broad-except
|
393
|
-
ctx.redirect_log(original_output)
|
394
|
-
api_requests.set_request_failed(request_id, e)
|
395
|
-
logger.error(f'Request {request_id} failed due to '
|
396
|
-
f'{common_utils.format_exception(e)}')
|
397
|
-
return True
|
398
|
-
return False
|
399
|
-
|
400
|
-
try:
|
401
|
-
while True:
|
402
|
-
res = await poll_task(request.request_id)
|
403
|
-
if res:
|
404
|
-
break
|
405
|
-
await asyncio.sleep(0.5)
|
406
|
-
except asyncio.CancelledError:
|
407
|
-
# Current coroutine is cancelled due to client disconnect, set the
|
408
|
-
# request status for consistency.
|
409
|
-
api_requests.set_request_cancelled(request.request_id)
|
410
|
-
pass
|
411
|
-
# pylint: disable=broad-except
|
412
|
-
except (Exception, KeyboardInterrupt, SystemExit) as e:
|
413
|
-
# Handle any other error
|
414
|
-
ctx.redirect_log(original_output)
|
415
|
-
ctx.cancel()
|
416
|
-
api_requests.set_request_failed(request.request_id, e)
|
417
|
-
logger.error(f'Request {request.request_id} interrupted due to '
|
418
|
-
f'unhandled exception: {common_utils.format_exception(e)}')
|
419
|
-
raise
|
420
|
-
|
421
|
-
|
422
|
-
def prepare_request(
|
423
|
-
request_id: str,
|
424
|
-
request_name: str,
|
425
|
-
request_body: payloads.RequestBody,
|
426
|
-
func: Callable[P, Any],
|
427
|
-
request_cluster_name: Optional[str] = None,
|
428
|
-
schedule_type: api_requests.ScheduleType = (api_requests.ScheduleType.LONG),
|
429
|
-
is_skypilot_system: bool = False,
|
430
|
-
) -> api_requests.Request:
|
431
|
-
"""Prepare a request for execution."""
|
432
|
-
user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
|
433
|
-
if is_skypilot_system:
|
434
|
-
user_id = server_constants.SKYPILOT_SYSTEM_USER_ID
|
435
|
-
global_user_state.add_or_update_user(
|
436
|
-
models.User(id=user_id, name=user_id))
|
437
|
-
request = api_requests.Request(request_id=request_id,
|
438
|
-
name=server_constants.REQUEST_NAME_PREFIX +
|
439
|
-
request_name,
|
440
|
-
entrypoint=func,
|
441
|
-
request_body=request_body,
|
442
|
-
status=api_requests.RequestStatus.PENDING,
|
443
|
-
created_at=time.time(),
|
444
|
-
schedule_type=schedule_type,
|
445
|
-
user_id=user_id,
|
446
|
-
cluster_name=request_cluster_name)
|
447
|
-
|
448
|
-
if not api_requests.create_if_not_exists(request):
|
449
|
-
raise RuntimeError(f'Request {request_id} already exists.')
|
450
|
-
|
451
|
-
request.log_path.touch()
|
452
|
-
return request
|
453
|
-
|
454
|
-
|
455
344
|
def schedule_request(
|
456
345
|
request_id: str,
|
457
346
|
request_name: str,
|
@@ -483,8 +372,27 @@ def schedule_request(
|
|
483
372
|
The precondition is waited asynchronously and does not block the
|
484
373
|
caller.
|
485
374
|
"""
|
486
|
-
|
487
|
-
|
375
|
+
user_id = request_body.env_vars[constants.USER_ID_ENV_VAR]
|
376
|
+
if is_skypilot_system:
|
377
|
+
user_id = server_constants.SKYPILOT_SYSTEM_USER_ID
|
378
|
+
global_user_state.add_or_update_user(
|
379
|
+
models.User(id=user_id, name=user_id))
|
380
|
+
request = api_requests.Request(request_id=request_id,
|
381
|
+
name=server_constants.REQUEST_NAME_PREFIX +
|
382
|
+
request_name,
|
383
|
+
entrypoint=func,
|
384
|
+
request_body=request_body,
|
385
|
+
status=api_requests.RequestStatus.PENDING,
|
386
|
+
created_at=time.time(),
|
387
|
+
schedule_type=schedule_type,
|
388
|
+
user_id=user_id,
|
389
|
+
cluster_name=request_cluster_name)
|
390
|
+
|
391
|
+
if not api_requests.create_if_not_exists(request):
|
392
|
+
logger.debug(f'Request {request_id} already exists.')
|
393
|
+
return
|
394
|
+
|
395
|
+
request.log_path.touch()
|
488
396
|
|
489
397
|
def enqueue():
|
490
398
|
input_tuple = (request_id, ignore_return_value)
|
sky/server/requests/requests.py
CHANGED
@@ -606,18 +606,3 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
|
|
606
606
|
assert request_task is not None, request_id
|
607
607
|
request_task.status = RequestStatus.FAILED
|
608
608
|
request_task.set_error(e)
|
609
|
-
|
610
|
-
|
611
|
-
def set_request_succeeded(request_id: str, result: Any) -> None:
|
612
|
-
"""Set a request to succeeded and populate the result."""
|
613
|
-
with update_request(request_id) as request_task:
|
614
|
-
assert request_task is not None, request_id
|
615
|
-
request_task.status = RequestStatus.SUCCEEDED
|
616
|
-
request_task.set_return_value(result)
|
617
|
-
|
618
|
-
|
619
|
-
def set_request_cancelled(request_id: str) -> None:
|
620
|
-
"""Set a request to cancelled."""
|
621
|
-
with update_request(request_id) as request_task:
|
622
|
-
assert request_task is not None, request_id
|
623
|
-
request_task.status = RequestStatus.CANCELLED
|
sky/server/server.py
CHANGED
@@ -47,7 +47,6 @@ from sky.usage import usage_lib
|
|
47
47
|
from sky.utils import admin_policy_utils
|
48
48
|
from sky.utils import common as common_lib
|
49
49
|
from sky.utils import common_utils
|
50
|
-
from sky.utils import context
|
51
50
|
from sky.utils import dag_utils
|
52
51
|
from sky.utils import env_options
|
53
52
|
from sky.utils import status_lib
|
@@ -674,28 +673,24 @@ async def logs(
|
|
674
673
|
# TODO(zhwu): This should wait for the request on the cluster, e.g., async
|
675
674
|
# launch, to finish, so that a user does not need to manually pull the
|
676
675
|
# request status.
|
677
|
-
|
678
|
-
# experimental change.
|
679
|
-
# TODO(aylei): init in lifespan() to enable SkyPilot context in all APIs.
|
680
|
-
context.initialize()
|
681
|
-
request_task = executor.prepare_request(
|
676
|
+
executor.schedule_request(
|
682
677
|
request_id=request.state.request_id,
|
683
678
|
request_name='logs',
|
684
679
|
request_body=cluster_job_body,
|
685
680
|
func=core.tail_logs,
|
681
|
+
# TODO(aylei): We have tail logs scheduled as SHORT request, because it
|
682
|
+
# should be responsive. However, it can be long running if the user's
|
683
|
+
# job keeps running, and we should avoid it taking the SHORT worker.
|
686
684
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
685
|
+
request_cluster_name=cluster_job_body.cluster_name,
|
687
686
|
)
|
688
|
-
task = asyncio.create_task(executor.execute_request_coroutine(request_task))
|
689
687
|
|
690
|
-
|
691
|
-
task.cancel()
|
688
|
+
request_task = requests_lib.get_request(request.state.request_id)
|
692
689
|
|
693
|
-
# Cancel the task after the request is done or client disconnects
|
694
|
-
background_tasks.add_task(cancel_task)
|
695
690
|
# TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
|
696
691
|
# the same approach as /stream.
|
697
692
|
return stream_utils.stream_response(
|
698
|
-
request_id=
|
693
|
+
request_id=request_task.request_id,
|
699
694
|
logs_path=request_task.log_path,
|
700
695
|
background_tasks=background_tasks,
|
701
696
|
)
|
sky/server/uvicorn.py
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
This module is a wrapper around uvicorn to customize the behavior of the
|
4
4
|
server.
|
5
5
|
"""
|
6
|
-
import functools
|
7
6
|
import os
|
8
7
|
import threading
|
9
8
|
from typing import Optional
|
@@ -11,7 +10,6 @@ from typing import Optional
|
|
11
10
|
import uvicorn
|
12
11
|
from uvicorn.supervisors import multiprocess
|
13
12
|
|
14
|
-
from sky.utils import context_utils
|
15
13
|
from sky.utils import subprocess_utils
|
16
14
|
|
17
15
|
|
@@ -23,27 +21,19 @@ def run(config: uvicorn.Config):
|
|
23
21
|
# guard by an exception.
|
24
22
|
raise ValueError('Reload is not supported yet.')
|
25
23
|
server = uvicorn.Server(config=config)
|
26
|
-
run_server_process = functools.partial(_run_server_process, server)
|
27
24
|
try:
|
28
25
|
if config.workers is not None and config.workers > 1:
|
29
26
|
sock = config.bind_socket()
|
30
|
-
SlowStartMultiprocess(config,
|
31
|
-
target=run_server_process,
|
27
|
+
SlowStartMultiprocess(config, target=server.run,
|
32
28
|
sockets=[sock]).run()
|
33
29
|
else:
|
34
|
-
|
30
|
+
server.run()
|
35
31
|
finally:
|
36
32
|
# Copied from uvicorn.run()
|
37
33
|
if config.uds and os.path.exists(config.uds):
|
38
34
|
os.remove(config.uds)
|
39
35
|
|
40
36
|
|
41
|
-
def _run_server_process(server: uvicorn.Server, *args, **kwargs):
|
42
|
-
"""Run the server process in a context-aware manner."""
|
43
|
-
context_utils.hijack_sys_attrs()
|
44
|
-
server.run(*args, **kwargs)
|
45
|
-
|
46
|
-
|
47
37
|
class SlowStartMultiprocess(multiprocess.Multiprocess):
|
48
38
|
"""Uvicorn Multiprocess wrapper with slow start.
|
49
39
|
|
sky/sky_logging.py
CHANGED
@@ -10,7 +10,6 @@ import threading
|
|
10
10
|
import colorama
|
11
11
|
|
12
12
|
from sky.skylet import constants
|
13
|
-
from sky.utils import context
|
14
13
|
from sky.utils import env_options
|
15
14
|
from sky.utils import rich_utils
|
16
15
|
|
@@ -48,43 +47,6 @@ class NewLineFormatter(logging.Formatter):
|
|
48
47
|
return msg
|
49
48
|
|
50
49
|
|
51
|
-
class EnvAwareHandler(rich_utils.RichSafeStreamHandler):
|
52
|
-
"""A handler that is aware of environment variables.
|
53
|
-
|
54
|
-
This handler dynamically reflects the log level from environment variables.
|
55
|
-
"""
|
56
|
-
|
57
|
-
def __init__(self, stream=None, level=logging.NOTSET, sensitive=False):
|
58
|
-
super().__init__(stream)
|
59
|
-
self.level = level
|
60
|
-
self._sensitive = sensitive
|
61
|
-
|
62
|
-
@property
|
63
|
-
def level(self):
|
64
|
-
# Only refresh log level if we are in a context, since the log level
|
65
|
-
# has already been reloaded eagerly in multi-processing. Refresh again
|
66
|
-
# is a no-op and can be avoided.
|
67
|
-
# TODO(aylei): unify the mechanism for coroutine context and
|
68
|
-
# multi-processing.
|
69
|
-
if context.get() is not None:
|
70
|
-
if self._sensitive:
|
71
|
-
# For sensitive logger, suppress debug log despite the
|
72
|
-
# SKYPILOT_DEBUG env var if SUPPRESS_SENSITIVE_LOG is set
|
73
|
-
if env_options.Options.SUPPRESS_SENSITIVE_LOG.get():
|
74
|
-
return logging.INFO
|
75
|
-
if env_options.Options.SHOW_DEBUG_INFO.get():
|
76
|
-
return logging.DEBUG
|
77
|
-
else:
|
78
|
-
return self._level
|
79
|
-
else:
|
80
|
-
return self._level
|
81
|
-
|
82
|
-
@level.setter
|
83
|
-
def level(self, level):
|
84
|
-
# pylint: disable=protected-access
|
85
|
-
self._level = logging._checkLevel(level)
|
86
|
-
|
87
|
-
|
88
50
|
_root_logger = logging.getLogger('sky')
|
89
51
|
_default_handler = None
|
90
52
|
_logging_config = threading.local()
|
@@ -105,7 +67,7 @@ def _setup_logger():
|
|
105
67
|
_root_logger.setLevel(logging.DEBUG)
|
106
68
|
global _default_handler
|
107
69
|
if _default_handler is None:
|
108
|
-
_default_handler =
|
70
|
+
_default_handler = rich_utils.RichSafeStreamHandler(sys.stdout)
|
109
71
|
_default_handler.flush = sys.stdout.flush # type: ignore
|
110
72
|
if env_options.Options.SHOW_DEBUG_INFO.get():
|
111
73
|
_default_handler.setLevel(logging.DEBUG)
|
@@ -125,7 +87,7 @@ def _setup_logger():
|
|
125
87
|
# for certain loggers.
|
126
88
|
for logger_name in _SENSITIVE_LOGGER:
|
127
89
|
logger = logging.getLogger(logger_name)
|
128
|
-
handler_to_logger =
|
90
|
+
handler_to_logger = rich_utils.RichSafeStreamHandler(sys.stdout)
|
129
91
|
handler_to_logger.flush = sys.stdout.flush # type: ignore
|
130
92
|
logger.addHandler(handler_to_logger)
|
131
93
|
logger.setLevel(logging.INFO)
|
sky/skylet/log_lib.py
CHANGED
@@ -4,7 +4,6 @@ This is a remote utility module that provides logging functionality.
|
|
4
4
|
"""
|
5
5
|
import collections
|
6
6
|
import copy
|
7
|
-
import functools
|
8
7
|
import io
|
9
8
|
import multiprocessing.pool
|
10
9
|
import os
|
@@ -22,8 +21,6 @@ import colorama
|
|
22
21
|
from sky import sky_logging
|
23
22
|
from sky.skylet import constants
|
24
23
|
from sky.skylet import job_lib
|
25
|
-
from sky.utils import context
|
26
|
-
from sky.utils import context_utils
|
27
24
|
from sky.utils import log_utils
|
28
25
|
from sky.utils import subprocess_utils
|
29
26
|
from sky.utils import ux_utils
|
@@ -80,9 +77,6 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
|
|
80
77
|
with open(args.log_path, 'a', encoding='utf-8') as fout:
|
81
78
|
with line_processor:
|
82
79
|
while True:
|
83
|
-
ctx = context.get()
|
84
|
-
if ctx is not None and ctx.is_canceled():
|
85
|
-
return
|
86
80
|
line = out_io.readline()
|
87
81
|
if not line:
|
88
82
|
break
|
@@ -117,29 +111,30 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
|
|
117
111
|
return ''.join(out)
|
118
112
|
|
119
113
|
|
120
|
-
def process_subprocess_stream(proc,
|
121
|
-
|
122
|
-
"""Process the stream of a process in threads, blocking."""
|
114
|
+
def process_subprocess_stream(proc, args: _ProcessingArgs) -> Tuple[str, str]:
|
115
|
+
"""Redirect the process's filtered stdout/stderr to both stream and file"""
|
123
116
|
if proc.stderr is not None:
|
124
117
|
# Asyncio does not work as the output processing can be executed in a
|
125
118
|
# different thread.
|
126
119
|
# selectors is possible to handle the multiplexing of stdout/stderr,
|
127
120
|
# but it introduces buffering making the output not streaming.
|
128
121
|
with multiprocessing.pool.ThreadPool(processes=1) as pool:
|
129
|
-
|
130
|
-
|
122
|
+
err_args = copy.copy(args)
|
123
|
+
err_args.line_processor = None
|
124
|
+
stderr_fut = pool.apply_async(_handle_io_stream,
|
125
|
+
args=(proc.stderr, sys.stderr,
|
126
|
+
err_args))
|
131
127
|
# Do not launch a thread for stdout as the rich.status does not
|
132
128
|
# work in a thread, which is used in
|
133
129
|
# log_utils.RayUpLineProcessor.
|
134
|
-
stdout =
|
130
|
+
stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
|
135
131
|
stderr = stderr_fut.get()
|
136
132
|
else:
|
137
|
-
stdout =
|
133
|
+
stdout = _handle_io_stream(proc.stdout, sys.stdout, args)
|
138
134
|
stderr = ''
|
139
135
|
return stdout, stderr
|
140
136
|
|
141
137
|
|
142
|
-
@context_utils.cancellation_guard
|
143
138
|
def run_with_log(
|
144
139
|
cmd: Union[List[str], str],
|
145
140
|
log_path: str,
|
@@ -181,12 +176,7 @@ def run_with_log(
|
|
181
176
|
# Redirect stderr to stdout when using ray, to preserve the order of
|
182
177
|
# stdout and stderr.
|
183
178
|
stdout_arg = stderr_arg = None
|
184
|
-
|
185
|
-
if process_stream or ctx is not None:
|
186
|
-
# Capture stdout/stderr of the subprocess if:
|
187
|
-
# 1. Post-processing is needed (process_stream=True)
|
188
|
-
# 2. Potential contextual handling is needed (ctx is not None)
|
189
|
-
# TODO(aylei): can we always capture the stdout/stderr?
|
179
|
+
if process_stream:
|
190
180
|
stdout_arg = subprocess.PIPE
|
191
181
|
stderr_arg = subprocess.PIPE if not with_ray else subprocess.STDOUT
|
192
182
|
# Use stdin=subprocess.DEVNULL by default, as allowing inputs will mess up
|
@@ -207,8 +197,6 @@ def run_with_log(
|
|
207
197
|
subprocess_utils.kill_process_daemon(proc.pid)
|
208
198
|
stdout = ''
|
209
199
|
stderr = ''
|
210
|
-
stdout_stream_handler = None
|
211
|
-
stderr_stream_handler = None
|
212
200
|
|
213
201
|
if process_stream:
|
214
202
|
if skip_lines is None:
|
@@ -235,35 +223,7 @@ def run_with_log(
|
|
235
223
|
replace_crlf=with_ray,
|
236
224
|
streaming_prefix=streaming_prefix,
|
237
225
|
)
|
238
|
-
|
239
|
-
_handle_io_stream,
|
240
|
-
args=args,
|
241
|
-
)
|
242
|
-
if proc.stderr is not None:
|
243
|
-
err_args = copy.copy(args)
|
244
|
-
err_args.line_processor = None
|
245
|
-
stderr_stream_handler = functools.partial(
|
246
|
-
_handle_io_stream,
|
247
|
-
args=err_args,
|
248
|
-
)
|
249
|
-
if ctx is not None:
|
250
|
-
# When runs in a coroutine, always process the subprocess
|
251
|
-
# stream to:
|
252
|
-
# 1. handle context cancellation
|
253
|
-
# 2. redirect subprocess stdout/stderr to the contextual
|
254
|
-
# stdout/stderr of current coroutine.
|
255
|
-
stdout, stderr = context_utils.pipe_and_wait_process(
|
256
|
-
ctx,
|
257
|
-
proc,
|
258
|
-
cancel_callback=subprocess_utils.kill_children_processes,
|
259
|
-
stdout_stream_handler=stdout_stream_handler,
|
260
|
-
stderr_stream_handler=stderr_stream_handler)
|
261
|
-
elif process_stream:
|
262
|
-
# When runs in a process, only process subprocess stream if
|
263
|
-
# necessary to avoid unnecessary stream handling overhead.
|
264
|
-
stdout, stderr = process_subprocess_stream(
|
265
|
-
proc, stdout_stream_handler, stderr_stream_handler)
|
266
|
-
# Ensure returncode is set.
|
226
|
+
stdout, stderr = process_subprocess_stream(proc, args)
|
267
227
|
proc.wait()
|
268
228
|
if require_outputs:
|
269
229
|
return proc.returncode, stdout, stderr
|
sky/templates/nebius-ray.yml.j2
CHANGED
@@ -105,6 +105,7 @@ file_mounts: {
|
|
105
105
|
"{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
|
106
106
|
{%- for remote_path, local_path in credentials.items() %}
|
107
107
|
"{{remote_path}}": "{{local_path}}",
|
108
|
+
"~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
|
108
109
|
{%- endfor %}
|
109
110
|
}
|
110
111
|
|
@@ -120,6 +121,7 @@ initialization_commands: []
|
|
120
121
|
# Increment the following for catching performance bugs easier:
|
121
122
|
# current num items (num SSH connections): 1
|
122
123
|
setup_commands:
|
124
|
+
# Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
|
123
125
|
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
|
124
126
|
# Create ~/.ssh/config file in case the file does not exist in the image.
|
125
127
|
# Line 'rm ..': there is another installation of pip.
|
@@ -142,6 +144,6 @@ setup_commands:
|
|
142
144
|
{{ ray_skypilot_installation_commands }}
|
143
145
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
144
146
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
145
|
-
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
147
|
+
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
146
148
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
|
147
149
|
{{ ssh_max_sessions_config }}
|
sky/utils/command_runner.py
CHANGED
@@ -11,7 +11,6 @@ from sky import sky_logging
|
|
11
11
|
from sky.skylet import constants
|
12
12
|
from sky.skylet import log_lib
|
13
13
|
from sky.utils import common_utils
|
14
|
-
from sky.utils import context_utils
|
15
14
|
from sky.utils import control_master_utils
|
16
15
|
from sky.utils import subprocess_utils
|
17
16
|
from sky.utils import timeline
|
@@ -575,7 +574,6 @@ class SSHCommandRunner(CommandRunner):
|
|
575
574
|
shell=True)
|
576
575
|
|
577
576
|
@timeline.event
|
578
|
-
@context_utils.cancellation_guard
|
579
577
|
def run(
|
580
578
|
self,
|
581
579
|
cmd: Union[str, List[str]],
|
@@ -781,7 +779,6 @@ class KubernetesCommandRunner(CommandRunner):
|
|
781
779
|
return kubectl_cmd
|
782
780
|
|
783
781
|
@timeline.event
|
784
|
-
@context_utils.cancellation_guard
|
785
782
|
def run(
|
786
783
|
self,
|
787
784
|
cmd: Union[str, List[str]],
|