skypilot-nightly 1.0.0.dev20250514__py3-none-any.whl → 1.0.0.dev20250515__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend.py +3 -2
- sky/backends/backend_utils.py +19 -17
- sky/backends/cloud_vm_ray_backend.py +30 -11
- sky/clouds/aws.py +11 -9
- sky/clouds/azure.py +16 -13
- sky/clouds/cloud.py +4 -3
- sky/clouds/cudo.py +3 -2
- sky/clouds/do.py +3 -2
- sky/clouds/fluidstack.py +3 -3
- sky/clouds/gcp.py +1 -1
- sky/clouds/ibm.py +12 -10
- sky/clouds/kubernetes.py +3 -2
- sky/clouds/lambda_cloud.py +6 -6
- sky/clouds/nebius.py +6 -5
- sky/clouds/oci.py +9 -7
- sky/clouds/paperspace.py +3 -2
- sky/clouds/runpod.py +9 -9
- sky/clouds/scp.py +5 -3
- sky/clouds/vast.py +8 -7
- sky/clouds/vsphere.py +4 -2
- sky/core.py +18 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +1 -0
- sky/dashboard/out/_next/static/{tdxxQrPV6NW90a983oHXe → jFI0Y-uJZ_XDK5IGJpKFU}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/execution.py +33 -0
- sky/global_user_state.py +2 -0
- sky/jobs/recovery_strategy.py +4 -1
- sky/jobs/server/core.py +6 -12
- sky/optimizer.py +19 -13
- sky/provision/kubernetes/utils.py +26 -1
- sky/resources.py +203 -44
- sky/serve/server/core.py +0 -5
- sky/serve/spot_placer.py +3 -0
- sky/server/requests/executor.py +114 -22
- sky/server/requests/requests.py +15 -0
- sky/server/server.py +63 -20
- sky/server/uvicorn.py +12 -2
- sky/sky_logging.py +40 -2
- sky/skylet/log_lib.py +60 -11
- sky/skylet/log_lib.pyi +5 -0
- sky/task.py +8 -6
- sky/utils/cli_utils/status_utils.py +6 -5
- sky/utils/command_runner.py +3 -0
- sky/utils/context.py +264 -0
- sky/utils/context_utils.py +172 -0
- sky/utils/controller_utils.py +39 -43
- sky/utils/dag_utils.py +4 -2
- sky/utils/resources_utils.py +3 -0
- sky/utils/rich_utils.py +81 -37
- sky/utils/schemas.py +33 -24
- sky/utils/subprocess_utils.py +8 -2
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/RECORD +65 -63
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/WHEEL +1 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- /sky/dashboard/out/_next/static/{tdxxQrPV6NW90a983oHXe → jFI0Y-uJZ_XDK5IGJpKFU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/top_level.txt +0 -0
sky/utils/command_runner.py
CHANGED
@@ -11,6 +11,7 @@ from sky import sky_logging
|
|
11
11
|
from sky.skylet import constants
|
12
12
|
from sky.skylet import log_lib
|
13
13
|
from sky.utils import common_utils
|
14
|
+
from sky.utils import context_utils
|
14
15
|
from sky.utils import control_master_utils
|
15
16
|
from sky.utils import subprocess_utils
|
16
17
|
from sky.utils import timeline
|
@@ -574,6 +575,7 @@ class SSHCommandRunner(CommandRunner):
|
|
574
575
|
shell=True)
|
575
576
|
|
576
577
|
@timeline.event
|
578
|
+
@context_utils.cancellation_guard
|
577
579
|
def run(
|
578
580
|
self,
|
579
581
|
cmd: Union[str, List[str]],
|
@@ -779,6 +781,7 @@ class KubernetesCommandRunner(CommandRunner):
|
|
779
781
|
return kubectl_cmd
|
780
782
|
|
781
783
|
@timeline.event
|
784
|
+
@context_utils.cancellation_guard
|
782
785
|
def run(
|
783
786
|
self,
|
784
787
|
cmd: Union[str, List[str]],
|
sky/utils/context.py
ADDED
@@ -0,0 +1,264 @@
|
|
1
|
+
"""SkyPilot context for threads and coroutines."""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
from collections.abc import Mapping
|
5
|
+
from collections.abc import MutableMapping
|
6
|
+
import contextvars
|
7
|
+
import os
|
8
|
+
import pathlib
|
9
|
+
import subprocess
|
10
|
+
import sys
|
11
|
+
from typing import Dict, Optional, TextIO
|
12
|
+
|
13
|
+
|
14
|
+
class Context(object):
    """Per-request SkyPilot state shared across threads and coroutines.

    A Context bundles three pieces of state:
      - a cancellation flag (see `cancel()` / `is_canceled()`),
      - an optional log file that output is redirected to
        (see `redirect_log()` / `output_stream()`),
      - environment variable overrides (see `override_envs()`).

    The active Context is stored in a `contextvars.ContextVar`, so it can be
    reached from any layer of the call stack without being passed around.

    Adding a new context variable for a new feature is as simple as:
    1. Add a new instance variable to the Context class.
    2. (Optional) Add new accessor methods if the variable should be protected.

    To propagate the context to a new thread/coroutine, use
    `contextvars.copy_context()`.

    Example:
        import asyncio
        import contextvars
        import time
        from sky.utils import context

        def sync_task():
            while True:
                if context.get().is_canceled():
                    break
                time.sleep(1)

        async def fastapi_handler():
            # context.initialize() has been called in lifespan
            # asyncio.to_thread copies current context implicitly
            task = asyncio.to_thread(sync_task)
            # Or explicitly:
            # loop = asyncio.get_running_loop()
            # ctx = contextvars.copy_context()
            # task = loop.run_in_executor(None, ctx.run, sync_task)
            await asyncio.sleep(1)
            context.get().cancel()
            await task
    """

    def __init__(self):
        # No overrides, no log redirection and not cancelled to begin with.
        self.env_overrides = {}
        self._log_file = None
        self._log_file_handle = None
        self._canceled = asyncio.Event()

    def cancel(self):
        """Mark this context as cancelled."""
        self._canceled.set()

    def is_canceled(self):
        """Return True once `cancel()` has been called on this context."""
        return self._canceled.is_set()

    def redirect_log(
            self, log_file: Optional[pathlib.Path]) -> Optional[pathlib.Path]:
        """Point the output of this context at `log_file`.

        Args:
            log_file: File to append output to (opened as UTF-8). Passing
                None removes the redirection so `output_stream()` returns its
                fallback again.

        Returns:
            The previously configured log file, or None if output was not
            redirected before this call.
        """
        previous_file = self._log_file
        previous_handle = self._log_file_handle
        # Open the new target first so a failing open() leaves the current
        # redirection untouched.
        self._log_file_handle = (None if log_file is None else open(
            log_file, 'a', encoding='utf-8'))
        self._log_file = log_file
        if previous_file is not None:
            previous_handle.close()
        return previous_file

    def output_stream(self, fallback: TextIO) -> TextIO:
        """Return the redirected log handle, or `fallback` if not redirected."""
        handle = self._log_file_handle
        return fallback if handle is None else handle

    def override_envs(self, envs: Dict[str, str]):
        """Merge `envs` into the environment overrides of this context."""
        self.env_overrides.update(envs)
|
101
|
+
|
102
|
+
|
103
|
+
# Context variable holding the active SkyPilot Context. It defaults to None,
# which is what get() returns until a Context has been set.
_CONTEXT = contextvars.ContextVar('sky_context', default=None)
|
104
|
+
|
105
|
+
|
106
|
+
def get() -> Optional[Context]:
    """Return the SkyPilot context that is currently active, if any.

    When no context has been initialized the result is None. Sync code can
    use that to tell whether it runs in a cancellable context, and skip
    polling the cancellation event when it does not.
    """
    current = _CONTEXT.get()
    return current
|
114
|
+
|
115
|
+
|
116
|
+
class ContextualEnviron(MutableMapping):
    """Environment variables wrapper with contextual overrides.

    An instance of ContextualEnviron will typically be used to replace
    os.environ to make the environ access of the current process
    context-aware: reads consult the overrides of the active context first
    and fall back to the wrapped mapping; writes and deletes always go to the
    wrapped mapping.

    Behavior of spawning a subprocess:
    - The contextual overrides will not be applied to the subprocess by
      default.
    - When using env=os.environ to pass the environment variables to the
      subprocess explicitly, the subprocess will inherit the contextual
      environment variables at the time of the spawn, that is, it will not
      see the updates to the environment variables after the spawn. Also,
      os.environ of the subprocess will not be a ContextualEnviron unless
      the subprocess hijacks os.environ explicitly.
    - Optionally, context.Popen() can be used to automatically pass
      os.environ with overrides to subprocess.


    Example:
    1. Parent process:
        # Hijack os.environ to be a ContextualEnviron
        os.environ = ContextualEnviron(os.environ)
        ctx = context.get()
        ctx.override_envs({'FOO': 'BAR1'})
        proc = subprocess.Popen(..., env=os.environ)
        # Or use context.Popen instead
        # proc = context.Popen(...)
        ctx.override_envs({'FOO': 'BAR2'})
    2. Subprocess:
        assert os.environ['FOO'] == 'BAR1'
        ctx = context.get()
        # Overriding the contextual env var in the subprocess does not take
        # effect since the os.environ is not hijacked.
        ctx.override_envs({'FOO': 'BAR3'})
        assert os.environ['FOO'] == 'BAR1'
    """

    def __init__(self, environ):
        # The wrapped mapping (typically the original os.environ).
        self._environ = environ

    def __getitem__(self, key):
        """Look up `key`, preferring the active context's override."""
        ctx = get()
        if ctx is not None and key in ctx.env_overrides:
            return ctx.env_overrides[key]
        return self._environ[key]

    def __iter__(self):
        """Iterate over every visible key exactly once.

        Override keys come first, followed by the keys of the wrapped mapping
        that are not overridden.
        """
        ctx = get()
        if ctx is None:
            # This method is a generator, so the wrapped keys must be yielded.
            # Returning the underlying iterator would end the generator
            # immediately and make the environment look empty.
            yield from self._environ
            return
        overrides = ctx.env_overrides
        yield from overrides
        for key in self._environ:
            # Deduplicate the keys
            if key not in overrides:
                yield key

    def __len__(self):
        """Return the number of distinct visible keys."""
        return sum(1 for _ in self)

    def __setitem__(self, key, value):
        """Set `key` on the wrapped mapping (overrides are left untouched)."""
        return self._environ.__setitem__(key, value)

    def __delitem__(self, key):
        """Delete `key` from the wrapped mapping (overrides are untouched)."""
        return self._environ.__delitem__(key)

    def __repr__(self):
        # Mirrors the repr of the wrapped mapping; overrides are not shown.
        return self._environ.__repr__()

    def copy(self):
        """Return a copy of the wrapped mapping with the overrides applied."""
        copied = self._environ.copy()
        ctx = get()
        if ctx is not None:
            copied.update(ctx.env_overrides)
        return copied

    def setdefault(self, key, default=None):
        """Delegate to the wrapped mapping's setdefault()."""
        return self._environ.setdefault(key, default)

    def __ior__(self, other):
        """Implement `self |= other` by updating in place."""
        if not isinstance(other, Mapping):
            return NotImplemented
        self.update(other)
        return self

    def __or__(self, other):
        """Implement `self | other`; returns a plain dict."""
        if not isinstance(other, Mapping):
            return NotImplemented
        new = dict(self)
        new.update(other)
        return new

    def __ror__(self, other):
        """Implement `other | self`; returns a plain dict."""
        if not isinstance(other, Mapping):
            return NotImplemented
        new = dict(other)
        new.update(self)
        return new
|
218
|
+
|
219
|
+
|
220
|
+
class Popen(subprocess.Popen):
    """subprocess.Popen that passes os.environ when no `env` is supplied.

    If the caller omits `env` or passes None, the current `os.environ` is
    handed to the child explicitly; any other `env` value is forwarded as is.
    """

    def __init__(self, *args, **kwargs):
        if kwargs.get('env') is None:
            kwargs['env'] = os.environ
        super().__init__(*args, **kwargs)
|
227
|
+
|
228
|
+
|
229
|
+
def initialize():
    """Create a fresh Context and make it the current SkyPilot context."""
    fresh_context = Context()
    _CONTEXT.set(fresh_context)
|
232
|
+
|
233
|
+
|
234
|
+
class _ContextualStream:
    """Base class for streams that follow the active context.

    The TextIO interface is provided through __getattr__: any attribute that
    is not found on this object is looked up on the stream that is active at
    that moment (the context's redirected stream, or the original one).
    """
    # Stream to use when no context is active or output is not redirected.
    _original_stream: TextIO

    def __init__(self, original_stream: TextIO):
        self._original_stream = original_stream

    def __getattr__(self, attr: str):
        target = self._active_stream()
        return getattr(target, attr)

    def _active_stream(self) -> TextIO:
        """Pick the stream that output should currently go to."""
        ctx = get()
        fallback = self._original_stream
        return fallback if ctx is None else ctx.output_stream(fallback)
|
253
|
+
|
254
|
+
|
255
|
+
class Stdout(_ContextualStream):
    """Contextual stream that falls back to the sys.stdout seen at creation."""

    def __init__(self):
        fallback = sys.stdout
        super().__init__(fallback)
|
259
|
+
|
260
|
+
|
261
|
+
class Stderr(_ContextualStream):
    """Contextual stream that falls back to the sys.stderr seen at creation."""

    def __init__(self):
        fallback = sys.stderr
        super().__init__(fallback)
|
@@ -0,0 +1,172 @@
|
|
1
|
+
"""Utilities for SkyPilot context."""
|
2
|
+
import asyncio
|
3
|
+
import functools
|
4
|
+
import io
|
5
|
+
import multiprocessing
|
6
|
+
import os
|
7
|
+
import subprocess
|
8
|
+
import sys
|
9
|
+
import typing
|
10
|
+
from typing import Any, Callable, IO, Optional, Tuple, TypeVar
|
11
|
+
|
12
|
+
from sky import sky_logging
|
13
|
+
from sky.utils import context
|
14
|
+
from sky.utils import subprocess_utils
|
15
|
+
|
16
|
+
# Signature of a stream handler: it is called with (input stream, output
# stream) and returns a string.
StreamHandler = Callable[[IO[Any], IO[Any]], str]
|
17
|
+
|
18
|
+
|
19
|
+
# TODO(aylei): call hijack_sys_attrs() proactively in module init at server-side
|
20
|
+
# once we have context widely adopted.
|
21
|
+
def hijack_sys_attrs():
    """Replace process-wide system attributes with context-aware versions.

    Swaps sys.stdout, sys.stderr, os.environ and subprocess.Popen for their
    counterparts in sky.utils.context.

    This function should be called at the very beginning of the processes
    that might use sky.utils.context.
    """
    # Make stdout and stderr of the process context-aware. setattr is used
    # instead of plain assignment to bypass the TextIO type check.
    for stream_name, stream_cls in (('stdout', context.Stdout),
                                    ('stderr', context.Stderr)):
        setattr(sys, stream_name, stream_cls())
    # The logger must be reloaded so it picks up the new stdout and stderr.
    sky_logging.reload_logger()
    # Wrap os.environ so that environment variable reads honor the overrides
    # of the active context.
    setattr(os, 'environ', context.ContextualEnviron(os.environ))
    # Replace subprocess.Popen so that the contextual environ is passed to
    # subprocesses by default.
    setattr(subprocess, 'Popen', context.Popen)
|
39
|
+
|
40
|
+
|
41
|
+
def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
    """Copy `in_stream` to `out_stream` line by line until EOF.

    The input is decoded as UTF-8 (undecodable bytes are replaced) without
    newline translation, and the output is flushed after every line.

    Returns:
        Always an empty string; nothing is captured.
    """
    reader = io.TextIOWrapper(in_stream,
                              encoding='utf-8',
                              newline='',
                              errors='replace',
                              write_through=True)
    # readline() returns '' only at EOF, which ends the iteration.
    for line in iter(reader.readline, ''):
        out_stream.write(line)
        out_stream.flush()
    return ''
|
56
|
+
|
57
|
+
|
58
|
+
def pipe_and_wait_process(
    ctx: context.Context,
    proc: subprocess.Popen,
    poll_interval: float = 0.5,
    cancel_callback: Optional[Callable[[], None]] = None,
    stdout_stream_handler: Optional[StreamHandler] = None,
    stderr_stream_handler: Optional[StreamHandler] = None
) -> Tuple[str, str]:
    """Stream the process output and wait for it to finish or be cancelled.

    The stdout (and, if present, stderr) of `proc` are consumed by handler
    threads while this function waits for the process via `wait_process()`.

    Args:
        ctx: The context whose cancellation flag is polled and whose output
            stream the process output is forwarded to.
        proc: The process to wait for.
        poll_interval: The interval to poll the process.
        cancel_callback: The callback to call if the context is cancelled.
        stdout_stream_handler: An optional handler to handle the stdout stream,
            if None, the stdout stream will be passed through.
        stderr_stream_handler: An optional handler to handle the stderr stream,
            if None, the stderr stream will be passed through.

    Returns:
        A tuple of the values returned by the stdout and stderr handlers. The
        stderr element is '' when `proc.stderr` is None.

    Raises:
        asyncio.CancelledError: Propagated from `wait_process()` when the
            context is cancelled; the handler threads are still waited for
            before the error propagates.
    """
    # `import multiprocessing` alone does not guarantee that the `pool`
    # submodule is loaded, so import it explicitly instead of relying on some
    # other module having imported it first.
    # pylint: disable=import-outside-toplevel
    from multiprocessing import pool as mp_pool

    if stdout_stream_handler is None:
        stdout_stream_handler = passthrough_stream_handler
    if stderr_stream_handler is None:
        stderr_stream_handler = passthrough_stream_handler

    with mp_pool.ThreadPool(processes=2) as thread_pool:
        # Context will be lost in the new thread, capture current output stream
        # and pass it to the new thread directly.
        # NOTE(review): proc.stdout is handed over unconditionally; presumably
        # callers always create `proc` with a piped stdout - confirm.
        stdout_fut = thread_pool.apply_async(
            stdout_stream_handler, (proc.stdout, ctx.output_stream(sys.stdout)))
        stderr_fut = None
        if proc.stderr is not None:
            stderr_fut = thread_pool.apply_async(
                stderr_stream_handler,
                (proc.stderr, ctx.output_stream(sys.stderr)))
        try:
            wait_process(ctx,
                         proc,
                         poll_interval=poll_interval,
                         cancel_callback=cancel_callback)
        finally:
            # Wait for the stream handler threads to exit when process is done
            # or cancelled
            stdout_fut.wait()
            if stderr_fut is not None:
                stderr_fut.wait()
        stdout = stdout_fut.get()
        stderr = ''
        if stderr_fut is not None:
            stderr = stderr_fut.get()
        return stdout, stderr
|
110
|
+
|
111
|
+
|
112
|
+
def wait_process(ctx: context.Context,
                 proc: subprocess.Popen,
                 poll_interval: float = 0.5,
                 cancel_callback: Optional[Callable[[], None]] = None):
    """Block until the process exits, killing it if the context is cancelled.

    Args:
        ctx: The context whose cancellation flag is checked before each poll.
        proc: The process to wait for.
        poll_interval: The interval to poll the process.
        cancel_callback: The callback to call if the context is cancelled.

    Raises:
        asyncio.CancelledError: If the context is cancelled before the
            process exits.
    """
    exited = False
    while not exited:
        if ctx.is_canceled():
            if cancel_callback is not None:
                cancel_callback()
            # The process is killed regardless of what the caller's callback
            # did; the utility function gracefully handles the case where the
            # process is already terminated.
            subprocess_utils.kill_process_with_grace_period(proc)
            raise asyncio.CancelledError()
        try:
            proc.wait(poll_interval)
            exited = True
        except subprocess.TimeoutExpired:
            # Still running: go around and re-check the cancellation flag.
            continue
|
139
|
+
|
140
|
+
|
141
|
+
# Type variable bound to callables; lets cancellation_guard declare that it
# returns the same callable type it receives.
F = TypeVar('F', bound=Callable[..., Any])
|
142
|
+
|
143
|
+
|
144
|
+
def cancellation_guard(func: F) -> F:
    """Decorator that refuses to start a sync function in a cancelled context.

    Before each call the wrapper looks up the current context; if one exists
    and it is already cancelled, asyncio.CancelledError is raised instead of
    calling the function. Cancellation is only checked up front, not while
    the function runs.

    This basically mimics the behavior of asyncio, which checks coroutine
    cancelled in await call.

    Args:
        func: The function to be decorated.

    Returns:
        The wrapped function that checks cancellation before execution.

    Raises:
        asyncio.CancelledError: If the context is cancelled before execution.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        ctx = context.get()
        already_canceled = ctx is not None and ctx.is_canceled()
        if not already_canceled:
            return func(*args, **kwargs)
        raise asyncio.CancelledError(
            f'Function {func.__name__} cancelled before execution')

    return typing.cast(F, wrapper)
|
sky/utils/controller_utils.py
CHANGED
@@ -6,7 +6,7 @@ import getpass
|
|
6
6
|
import os
|
7
7
|
import tempfile
|
8
8
|
import typing
|
9
|
-
from typing import Any, Dict, Iterable, List, Optional, Set
|
9
|
+
from typing import Any, Dict, Iterable, List, Optional, Set
|
10
10
|
import uuid
|
11
11
|
|
12
12
|
import colorama
|
@@ -517,6 +517,30 @@ def get_controller_resources(
|
|
517
517
|
if custom_controller_resources_config is not None:
|
518
518
|
controller_resources_config_copied.update(
|
519
519
|
custom_controller_resources_config)
|
520
|
+
# Compatibility with the old way of specifying the controller autostop
|
521
|
+
# config. TODO(cooperc): Remove this before 0.12.0.
|
522
|
+
custom_controller_autostop_config = skypilot_config.get_nested(
|
523
|
+
(controller.value.controller_type, 'controller', 'autostop'), None)
|
524
|
+
if custom_controller_autostop_config is not None:
|
525
|
+
logger.warning(
|
526
|
+
f'{colorama.Fore.YELLOW}Warning: Config value '
|
527
|
+
f'`{controller.value.controller_type}.controller.autostop` '
|
528
|
+
'is deprecated. Please use '
|
529
|
+
f'`{controller.value.controller_type}.controller.resources.'
|
530
|
+
f'autostop` instead.{colorama.Style.RESET_ALL}')
|
531
|
+
# Only set the autostop config if it is not already specified.
|
532
|
+
if controller_resources_config_copied.get('autostop') is None:
|
533
|
+
controller_resources_config_copied['autostop'] = (
|
534
|
+
custom_controller_autostop_config)
|
535
|
+
else:
|
536
|
+
logger.warning(f'{colorama.Fore.YELLOW}Ignoring the old '
|
537
|
+
'config, since it is already specified in '
|
538
|
+
f'resources.{colorama.Style.RESET_ALL}')
|
539
|
+
# Set the default autostop config for the controller, if not already
|
540
|
+
# specified.
|
541
|
+
if controller_resources_config_copied.get('autostop') is None:
|
542
|
+
controller_resources_config_copied['autostop'] = (
|
543
|
+
controller.value.default_autostop_config)
|
520
544
|
|
521
545
|
try:
|
522
546
|
controller_resources = resources.Resources.from_yaml_config(
|
@@ -547,7 +571,10 @@ def get_controller_resources(
|
|
547
571
|
if controller_record is not None:
|
548
572
|
handle = controller_record.get('handle', None)
|
549
573
|
if handle is not None:
|
550
|
-
|
574
|
+
# Use the existing resources, but override the autostop config with
|
575
|
+
# the one currently specified in the config.
|
576
|
+
controller_resources_to_use = handle.launched_resources.copy(
|
577
|
+
autostop=controller_resources_config_copied.get('autostop'))
|
551
578
|
|
552
579
|
# If the controller and replicas are from the same cloud (and region/zone),
|
553
580
|
# it should provide better connectivity. We will let the controller choose
|
@@ -608,8 +635,9 @@ def get_controller_resources(
|
|
608
635
|
controller_zone = controller_resources_to_use.zone
|
609
636
|
|
610
637
|
# Filter clouds if controller_resources_to_use.cloud is specified.
|
611
|
-
filtered_clouds =
|
612
|
-
|
638
|
+
filtered_clouds: Set[str] = {controller_cloud
|
639
|
+
} if controller_cloud is not None else set(
|
640
|
+
requested_clouds_with_region_zone.keys())
|
613
641
|
|
614
642
|
# Filter regions and zones and construct the result.
|
615
643
|
result: Set[resources.Resources] = set()
|
@@ -618,15 +646,17 @@ def get_controller_resources(
|
|
618
646
|
{None: {None}})
|
619
647
|
|
620
648
|
# Filter regions if controller_resources_to_use.region is specified.
|
621
|
-
filtered_regions = ({
|
622
|
-
|
649
|
+
filtered_regions: Set[Optional[str]] = ({
|
650
|
+
controller_region
|
651
|
+
} if controller_region is not None else set(regions.keys()))
|
623
652
|
|
624
653
|
for region in filtered_regions:
|
625
654
|
zones = regions.get(region, {None})
|
626
655
|
|
627
656
|
# Filter zones if controller_resources_to_use.zone is specified.
|
628
|
-
filtered_zones = ({
|
629
|
-
|
657
|
+
filtered_zones: Set[Optional[str]] = ({
|
658
|
+
controller_zone
|
659
|
+
} if controller_zone is not None else set(zones))
|
630
660
|
|
631
661
|
# Create combinations of cloud, region, and zone.
|
632
662
|
for zone in filtered_zones:
|
@@ -641,40 +671,6 @@ def get_controller_resources(
|
|
641
671
|
return result
|
642
672
|
|
643
673
|
|
644
|
-
def get_controller_autostop_config(
|
645
|
-
controller: Controllers) -> Tuple[Optional[int], bool]:
|
646
|
-
"""Get the autostop config for the controller.
|
647
|
-
|
648
|
-
Returns:
|
649
|
-
A tuple of (idle_minutes_to_autostop, down), which correspond to the
|
650
|
-
values passed to execution.launch().
|
651
|
-
"""
|
652
|
-
controller_autostop_config_copied: Dict[str, Any] = copy.copy(
|
653
|
-
controller.value.default_autostop_config)
|
654
|
-
if skypilot_config.loaded():
|
655
|
-
custom_controller_autostop_config = skypilot_config.get_nested(
|
656
|
-
(controller.value.controller_type, 'controller', 'autostop'), None)
|
657
|
-
if custom_controller_autostop_config is False:
|
658
|
-
# Disabled with `autostop: false` in config.
|
659
|
-
# To indicate autostop is disabled, we return None for
|
660
|
-
# idle_minutes_to_autostop.
|
661
|
-
return None, False
|
662
|
-
elif custom_controller_autostop_config is True:
|
663
|
-
# Enabled with default values. There is no change in behavior, but
|
664
|
-
# this is included by for completeness, since `False` is valid.
|
665
|
-
pass
|
666
|
-
elif custom_controller_autostop_config is not None:
|
667
|
-
# We have specific config values.
|
668
|
-
# Override the controller autostop config with the ones specified in
|
669
|
-
# the config.
|
670
|
-
assert isinstance(custom_controller_autostop_config, dict)
|
671
|
-
controller_autostop_config_copied.update(
|
672
|
-
custom_controller_autostop_config)
|
673
|
-
|
674
|
-
return (controller_autostop_config_copied['idle_minutes'],
|
675
|
-
controller_autostop_config_copied['down'])
|
676
|
-
|
677
|
-
|
678
674
|
def _setup_proxy_command_on_controller(
|
679
675
|
controller_launched_cloud: 'clouds.Cloud',
|
680
676
|
user_config: Dict[str, Any]) -> config_utils.Config:
|
@@ -703,7 +699,7 @@ def _setup_proxy_command_on_controller(
|
|
703
699
|
# NOTE: suppose that we have a controller in old VPC, then user
|
704
700
|
# changes 'vpc_name' in the config and does a 'job launch' /
|
705
701
|
# 'serve up'. In general, the old controller may not successfully
|
706
|
-
# launch the job in the new VPC. This happens if the two VPCs don
|
702
|
+
# launch the job in the new VPC. This happens if the two VPCs don't
|
707
703
|
# have peering set up. Like other places in the code, we assume
|
708
704
|
# properly setting up networking is user's responsibilities.
|
709
705
|
# TODO(zongheng): consider adding a basic check that checks
|
sky/utils/dag_utils.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
"""Utilities for loading and dumping DAGs from/to YAML files."""
|
2
2
|
import copy
|
3
|
-
from typing import Any, Dict, List, Optional, Tuple
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
4
4
|
|
5
5
|
from sky import dag as dag_lib
|
6
6
|
from sky import sky_logging
|
@@ -195,7 +195,9 @@ def fill_default_config_in_dag_for_job_launch(dag: dag_lib.Dag) -> None:
|
|
195
195
|
assert default_strategy is not None
|
196
196
|
for resources in list(task_.resources):
|
197
197
|
original_job_recovery = resources.job_recovery
|
198
|
-
job_recovery = {
|
198
|
+
job_recovery: Dict[str, Optional[Union[str, int]]] = {
|
199
|
+
'strategy': default_strategy
|
200
|
+
}
|
199
201
|
if isinstance(original_job_recovery, str):
|
200
202
|
job_recovery['strategy'] = original_job_recovery
|
201
203
|
elif isinstance(original_job_recovery, dict):
|
sky/utils/resources_utils.py
CHANGED
@@ -140,10 +140,12 @@ def simplify_ports(ports: List[str]) -> List[str]:
|
|
140
140
|
def format_resource(resource: 'resources_lib.Resources',
|
141
141
|
simplify: bool = False) -> str:
|
142
142
|
if simplify:
|
143
|
+
resource = resource.assert_launchable()
|
143
144
|
cloud = resource.cloud
|
144
145
|
if resource.accelerators is None:
|
145
146
|
vcpu, _ = cloud.get_vcpus_mem_from_instance_type(
|
146
147
|
resource.instance_type)
|
148
|
+
assert vcpu is not None, 'vCPU must be specified'
|
147
149
|
hardware = f'vCPU={int(vcpu)}'
|
148
150
|
else:
|
149
151
|
hardware = f'{resource.accelerators}'
|
@@ -248,6 +250,7 @@ def make_launchables_for_valid_region_zones(
|
|
248
250
|
launchables = []
|
249
251
|
regions = launchable_resources.get_valid_regions_for_launchable()
|
250
252
|
for region in regions:
|
253
|
+
assert launchable_resources.cloud is not None, 'Cloud must be specified'
|
251
254
|
optimize_by_zone = (override_optimize_by_zone or
|
252
255
|
launchable_resources.cloud.optimize_by_zone())
|
253
256
|
# It is possible that we force the optimize_by_zone but some clouds
|