skypilot-nightly 1.0.0.dev20250514__py3-none-any.whl → 1.0.0.dev20250515__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (66)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +3 -2
  3. sky/backends/backend_utils.py +19 -17
  4. sky/backends/cloud_vm_ray_backend.py +30 -11
  5. sky/clouds/aws.py +11 -9
  6. sky/clouds/azure.py +16 -13
  7. sky/clouds/cloud.py +4 -3
  8. sky/clouds/cudo.py +3 -2
  9. sky/clouds/do.py +3 -2
  10. sky/clouds/fluidstack.py +3 -3
  11. sky/clouds/gcp.py +1 -1
  12. sky/clouds/ibm.py +12 -10
  13. sky/clouds/kubernetes.py +3 -2
  14. sky/clouds/lambda_cloud.py +6 -6
  15. sky/clouds/nebius.py +6 -5
  16. sky/clouds/oci.py +9 -7
  17. sky/clouds/paperspace.py +3 -2
  18. sky/clouds/runpod.py +9 -9
  19. sky/clouds/scp.py +5 -3
  20. sky/clouds/vast.py +8 -7
  21. sky/clouds/vsphere.py +4 -2
  22. sky/core.py +18 -12
  23. sky/dashboard/out/404.html +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +1 -0
  25. sky/dashboard/out/_next/static/{tdxxQrPV6NW90a983oHXe → jFI0Y-uJZ_XDK5IGJpKFU}/_buildManifest.js +1 -1
  26. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  27. sky/dashboard/out/clusters/[cluster].html +1 -1
  28. sky/dashboard/out/clusters.html +1 -1
  29. sky/dashboard/out/index.html +1 -1
  30. sky/dashboard/out/jobs/[job].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/execution.py +33 -0
  33. sky/global_user_state.py +2 -0
  34. sky/jobs/recovery_strategy.py +4 -1
  35. sky/jobs/server/core.py +6 -12
  36. sky/optimizer.py +19 -13
  37. sky/provision/kubernetes/utils.py +26 -1
  38. sky/resources.py +203 -44
  39. sky/serve/server/core.py +0 -5
  40. sky/serve/spot_placer.py +3 -0
  41. sky/server/requests/executor.py +114 -22
  42. sky/server/requests/requests.py +15 -0
  43. sky/server/server.py +63 -20
  44. sky/server/uvicorn.py +12 -2
  45. sky/sky_logging.py +40 -2
  46. sky/skylet/log_lib.py +60 -11
  47. sky/skylet/log_lib.pyi +5 -0
  48. sky/task.py +8 -6
  49. sky/utils/cli_utils/status_utils.py +6 -5
  50. sky/utils/command_runner.py +3 -0
  51. sky/utils/context.py +264 -0
  52. sky/utils/context_utils.py +172 -0
  53. sky/utils/controller_utils.py +39 -43
  54. sky/utils/dag_utils.py +4 -2
  55. sky/utils/resources_utils.py +3 -0
  56. sky/utils/rich_utils.py +81 -37
  57. sky/utils/schemas.py +33 -24
  58. sky/utils/subprocess_utils.py +8 -2
  59. {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/METADATA +1 -1
  60. {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/RECORD +65 -63
  61. {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/WHEEL +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  63. /sky/dashboard/out/_next/static/{tdxxQrPV6NW90a983oHXe → jFI0Y-uJZ_XDK5IGJpKFU}/_ssgManifest.js +0 -0
  64. {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/entry_points.txt +0 -0
  65. {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/licenses/LICENSE +0 -0
  66. {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/top_level.txt +0 -0
sky/utils/command_runner.py CHANGED
@@ -11,6 +11,7 @@ from sky import sky_logging
 from sky.skylet import constants
 from sky.skylet import log_lib
 from sky.utils import common_utils
+from sky.utils import context_utils
 from sky.utils import control_master_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
@@ -574,6 +575,7 @@ class SSHCommandRunner(CommandRunner):
                        shell=True)

     @timeline.event
+    @context_utils.cancellation_guard
     def run(
         self,
         cmd: Union[str, List[str]],
@@ -779,6 +781,7 @@ class KubernetesCommandRunner(CommandRunner):
         return kubectl_cmd

     @timeline.event
+    @context_utils.cancellation_guard
     def run(
         self,
         cmd: Union[str, List[str]],
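
The three additions above wire the new cancellation machinery into the command runners: both SSHCommandRunner.run and KubernetesCommandRunner.run are now guarded by context_utils.cancellation_guard, so a run that starts after its request context has been cancelled is refused immediately. A minimal sketch of the decorator's effect, using only the context and context_utils modules added in this release (see the two new files below); run_remote_command is a stand-in for illustration, not package code:

    import asyncio

    from sky.utils import context
    from sky.utils import context_utils


    @context_utils.cancellation_guard
    def run_remote_command(cmd: str) -> int:
        # Stand-in for SSHCommandRunner.run(); the real method shells out over SSH.
        print(f'would run: {cmd}')
        return 0


    context.initialize()
    run_remote_command('echo ok')      # runs normally
    context.get().cancel()
    try:
        run_remote_command('echo ok')  # refused before execution
    except asyncio.CancelledError as exc:
        print(f'cancelled: {exc}')
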
sky/utils/context.py ADDED
@@ -0,0 +1,264 @@
+"""SkyPilot context for threads and coroutines."""
+
+import asyncio
+from collections.abc import Mapping
+from collections.abc import MutableMapping
+import contextvars
+import os
+import pathlib
+import subprocess
+import sys
+from typing import Dict, Optional, TextIO
+
+
+class Context(object):
+    """SkyPilot typed context vars for threads and coroutines.
+
+    This is a wrapper around `contextvars.ContextVar` that provides a typed
+    interface for the SkyPilot specific context variables that can be accessed
+    at any layer of the call stack. ContextVar is coroutine-local; an empty
+    Context will be initialized for each coroutine when it is created.
+
+    Adding a new context variable for a new feature is as simple as:
+    1. Add a new instance variable to the Context class.
+    2. (Optional) Add new accessor methods if the variable should be protected.
+
+    To propagate the context to a new thread/coroutine, use
+    `contextvars.copy_context()`.
+
+    Example:
+        import asyncio
+        import contextvars
+        import time
+        from sky.utils import context
+
+        def sync_task():
+            while True:
+                if context.get().is_canceled():
+                    break
+                time.sleep(1)
+
+        async def fastapi_handler():
+            # context.initialize() has been called in lifespan
+            ctx = contextvars.copy_context()
+            # asyncio.to_thread copies current context implicitly
+            task = asyncio.to_thread(sync_task)
+            # Or explicitly:
+            # loop = asyncio.get_running_loop()
+            # ctx = contextvars.copy_context()
+            # task = loop.run_in_executor(None, ctx.run, sync_task)
+            await asyncio.sleep(1)
+            context.get().cancel()
+            await task
+    """
+
+    def __init__(self):
+        self._canceled = asyncio.Event()
+        self._log_file = None
+        self._log_file_handle = None
+        self.env_overrides = {}
+
+    def cancel(self):
+        """Cancel the context."""
+        self._canceled.set()
+
+    def is_canceled(self):
+        """Check if the context is canceled."""
+        return self._canceled.is_set()
+
+    def redirect_log(
+            self, log_file: Optional[pathlib.Path]) -> Optional[pathlib.Path]:
+        """Redirect the stdout and stderr of current context to a file.
+
+        Args:
+            log_file: The log file to redirect to. If None, the stdout and
+                stderr will be restored to the original streams.
+
+        Returns:
+            The old log file, or None if the stdout and stderr were not
+            redirected.
+        """
+        original_log_file = self._log_file
+        original_log_handle = self._log_file_handle
+        if log_file is None:
+            self._log_file_handle = None
+        else:
+            self._log_file_handle = open(log_file, 'a', encoding='utf-8')
+        self._log_file = log_file
+        if original_log_file is not None:
+            original_log_handle.close()
+        return original_log_file
+
+    def output_stream(self, fallback: TextIO) -> TextIO:
+        if self._log_file_handle is None:
+            return fallback
+        else:
+            return self._log_file_handle
+
+    def override_envs(self, envs: Dict[str, str]):
+        for k, v in envs.items():
+            self.env_overrides[k] = v
+
+
+_CONTEXT = contextvars.ContextVar('sky_context', default=None)
+
+
+def get() -> Optional[Context]:
+    """Get the current SkyPilot context.
+
+    If the context is not initialized, get() will return None. This helps
+    sync code to check whether it runs in a cancellable context and avoid
+    polling the cancellation event if it is not.
+    """
+    return _CONTEXT.get()
+
+
+class ContextualEnviron(MutableMapping):
+    """Environment variables wrapper with contextual overrides.
+
+    An instance of ContextualEnviron will typically be used to replace
+    os.environ to make the environ access of the current process contextually
+    aware.
+
+    Behavior of spawning a subprocess:
+    - The contextual overrides will not be applied to the subprocess by
+      default.
+    - When using env=os.environ to pass the environment variables to the
+      subprocess explicitly, the subprocess will inherit the contextual
+      environment variables at the time of the spawn, that is, it will not
+      see the updates to the environment variables after the spawn. Also,
+      os.environ of the subprocess will not be a ContextualEnviron unless
+      the subprocess hijacks os.environ explicitly.
+    - Optionally, context.Popen() can be used to automatically pass
+      os.environ with overrides to the subprocess.
+
+
+    Example:
+    1. Parent process:
+        # Hijack os.environ to be a ContextualEnviron
+        os.environ = ContextualEnviron(os.environ)
+        ctx = context.get()
+        ctx.override_envs({'FOO': 'BAR1'})
+        proc = subprocess.Popen(..., env=os.environ)
+        # Or use context.Popen instead
+        # proc = context.Popen(...)
+        ctx.override_envs({'FOO': 'BAR2'})
+    2. Subprocess:
+        assert os.environ['FOO'] == 'BAR1'
+        ctx = context.get()
+        # Overriding the contextual env var in the subprocess does not take
+        # effect since os.environ is not hijacked.
+        ctx.override_envs({'FOO': 'BAR3'})
+        assert os.environ['FOO'] == 'BAR1'
+    """
+
+    def __init__(self, environ):
+        self._environ = environ
+
+    def __getitem__(self, key):
+        ctx = get()
+        if ctx is not None:
+            if key in ctx.env_overrides:
+                return ctx.env_overrides[key]
+        return self._environ[key]
+
+    def __iter__(self):
+        ctx = get()
+        if ctx is not None:
+            for key in ctx.env_overrides:
+                yield key
+            for key in self._environ:
+                # Deduplicate the keys
+                if key not in ctx.env_overrides:
+                    yield key
+        else:
+            return self._environ.__iter__()
+
+    def __len__(self):
+        return len(dict(self))
+
+    def __setitem__(self, key, value):
+        return self._environ.__setitem__(key, value)
+
+    def __delitem__(self, key):
+        return self._environ.__delitem__(key)
+
+    def __repr__(self):
+        return self._environ.__repr__()
+
+    def copy(self):
+        copied = self._environ.copy()
+        ctx = get()
+        if ctx is not None:
+            copied.update(ctx.env_overrides)
+        return copied
+
+    def setdefault(self, key, default=None):
+        return self._environ.setdefault(key, default)
+
+    def __ior__(self, other):
+        if not isinstance(other, Mapping):
+            return NotImplemented
+        self.update(other)
+        return self
+
+    def __or__(self, other):
+        if not isinstance(other, Mapping):
+            return NotImplemented
+        new = dict(self)
+        new.update(other)
+        return new
+
+    def __ror__(self, other):
+        if not isinstance(other, Mapping):
+            return NotImplemented
+        new = dict(other)
+        new.update(self)
+        return new
+
+
+class Popen(subprocess.Popen):
+
+    def __init__(self, *args, **kwargs):
+        env = kwargs.pop('env', None)
+        if env is None:
+            env = os.environ
+        super().__init__(*args, env=env, **kwargs)
+
+
+def initialize():
+    """Initialize the current SkyPilot context."""
+    _CONTEXT.set(Context())
+
+
+class _ContextualStream:
+    """A base class for streams that are contextually aware.
+
+    This class implements the TextIO interface via __getattr__ to delegate
+    attribute access to the original or contextual stream.
+    """
+    _original_stream: TextIO
+
+    def __init__(self, original_stream: TextIO):
+        self._original_stream = original_stream
+
+    def __getattr__(self, attr: str):
+        return getattr(self._active_stream(), attr)
+
+    def _active_stream(self) -> TextIO:
+        ctx = get()
+        if ctx is None:
+            return self._original_stream
+        return ctx.output_stream(self._original_stream)
+
+
+class Stdout(_ContextualStream):
+
+    def __init__(self):
+        super().__init__(sys.stdout)
+
+
+class Stderr(_ContextualStream):
+
+    def __init__(self):
+        super().__init__(sys.stderr)
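
The Context object above bundles three per-request facilities: a cancellation flag, log redirection, and environment-variable overrides. A short sketch of how they compose, assuming the modules from this diff are installed; the log path and environment variable are hypothetical placeholders:

    import pathlib

    from sky.utils import context

    context.initialize()
    ctx = context.get()
    assert ctx is not None

    # Send this request's output to its own log file (effective once
    # sys.stdout/sys.stderr are hijacked; see hijack_sys_attrs() in
    # context_utils.py below).
    old_log = ctx.redirect_log(pathlib.Path('/tmp/request-1234.log'))  # hypothetical path

    # Per-request env override, visible through ContextualEnviron.
    ctx.override_envs({'MY_FLAG': '1'})  # hypothetical variable

    # ... request work happens here ...

    # Restore the previous log target (None here, i.e. the original streams)
    # and mark the request as cancelled.
    ctx.redirect_log(old_log)
    ctx.cancel()
    assert ctx.is_canceled()
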
sky/utils/context_utils.py ADDED
@@ -0,0 +1,172 @@
+"""Utilities for SkyPilot context."""
+import asyncio
+import functools
+import io
+import multiprocessing
+import os
+import subprocess
+import sys
+import typing
+from typing import Any, Callable, IO, Optional, Tuple, TypeVar
+
+from sky import sky_logging
+from sky.utils import context
+from sky.utils import subprocess_utils
+
+StreamHandler = Callable[[IO[Any], IO[Any]], str]
+
+
+# TODO(aylei): call hijack_sys_attrs() proactively in module init at server-side
+# once we have context widely adopted.
+def hijack_sys_attrs():
+    """Hijack system attributes to be context-aware.
+
+    This function should be called at the very beginning of the processes
+    that might use sky.utils.context.
+    """
+    # Modify stdout and stderr of the uvicorn process to be contextually aware,
+    # use setattr to bypass the TextIO type check.
+    setattr(sys, 'stdout', context.Stdout())
+    setattr(sys, 'stderr', context.Stderr())
+    # Reload logger to apply latest stdout and stderr.
+    sky_logging.reload_logger()
+    # Hijack os.environ with ContextualEnviron to make env variables
+    # contextually aware.
+    setattr(os, 'environ', context.ContextualEnviron(os.environ))
+    # Hijack subprocess.Popen to pass the contextual environ to subprocess
+    # by default.
+    setattr(subprocess, 'Popen', context.Popen)
+
+
+def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
+    """Passthrough the stream from the process to the output stream."""
+    wrapped = io.TextIOWrapper(in_stream,
+                               encoding='utf-8',
+                               newline='',
+                               errors='replace',
+                               write_through=True)
+    while True:
+        line = wrapped.readline()
+        if line:
+            out_stream.write(line)
+            out_stream.flush()
+        else:
+            break
+    return ''
+
+
+def pipe_and_wait_process(
+    ctx: context.Context,
+    proc: subprocess.Popen,
+    poll_interval: float = 0.5,
+    cancel_callback: Optional[Callable[[], None]] = None,
+    stdout_stream_handler: Optional[StreamHandler] = None,
+    stderr_stream_handler: Optional[StreamHandler] = None
+) -> Tuple[str, str]:
+    """Wait for the process to finish or cancel it if the context is cancelled.
+
+    Args:
+        proc: The process to wait for.
+        poll_interval: The interval to poll the process.
+        cancel_callback: The callback to call if the context is cancelled.
+        stdout_stream_handler: An optional handler to handle the stdout stream,
+            if None, the stdout stream will be passed through.
+        stderr_stream_handler: An optional handler to handle the stderr stream,
+            if None, the stderr stream will be passed through.
+    """
+
+    if stdout_stream_handler is None:
+        stdout_stream_handler = passthrough_stream_handler
+    if stderr_stream_handler is None:
+        stderr_stream_handler = passthrough_stream_handler
+
+    # Threads are lazily created, so no harm if stderr is None
+    with multiprocessing.pool.ThreadPool(processes=2) as pool:
+        # Context will be lost in the new thread, capture current output stream
+        # and pass it to the new thread directly.
+        stdout_fut = pool.apply_async(
+            stdout_stream_handler, (proc.stdout, ctx.output_stream(sys.stdout)))
+        stderr_fut = None
+        if proc.stderr is not None:
+            stderr_fut = pool.apply_async(
+                stderr_stream_handler,
+                (proc.stderr, ctx.output_stream(sys.stderr)))
+        try:
+            wait_process(ctx,
+                         proc,
+                         poll_interval=poll_interval,
+                         cancel_callback=cancel_callback)
+        finally:
+            # Wait for the stream handler threads to exit when process is done
+            # or cancelled
+            stdout_fut.wait()
+            if stderr_fut is not None:
+                stderr_fut.wait()
+        stdout = stdout_fut.get()
+        stderr = ''
+        if stderr_fut is not None:
+            stderr = stderr_fut.get()
+        return stdout, stderr
+
+
+def wait_process(ctx: context.Context,
+                 proc: subprocess.Popen,
+                 poll_interval: float = 0.5,
+                 cancel_callback: Optional[Callable[[], None]] = None):
+    """Wait for the process to finish or cancel it if the context is cancelled.
+
+    Args:
+        proc: The process to wait for.
+        poll_interval: The interval to poll the process.
+        cancel_callback: The callback to call if the context is cancelled.
+    """
+    while True:
+        if ctx.is_canceled():
+            if cancel_callback is not None:
+                cancel_callback()
+            # Kill the process despite the caller's callback, the utility
+            # function gracefully handles the case where the process is
+            # already terminated.
+            subprocess_utils.kill_process_with_grace_period(proc)
+            raise asyncio.CancelledError()
+        try:
+            proc.wait(poll_interval)
+        except subprocess.TimeoutExpired:
+            pass
+        else:
+            # Process exited
+            break


+F = TypeVar('F', bound=Callable[..., Any])
+
+
+def cancellation_guard(func: F) -> F:
+    """Decorator to make a synchronous function cancellable via context.
+
+    Guards the function execution by checking context.is_canceled() before
+    executing the function and raises asyncio.CancelledError if the context
+    is already cancelled.
+
+    This basically mimics the behavior of asyncio, which checks whether the
+    coroutine is cancelled at each await call.
+
+    Args:
+        func: The function to be decorated.
+
+    Returns:
+        The wrapped function that checks cancellation before execution.
+
+    Raises:
+        asyncio.CancelledError: If the context is cancelled before execution.
+    """
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        ctx = context.get()
+        if ctx is not None and ctx.is_canceled():
+            raise asyncio.CancelledError(
+                f'Function {func.__name__} cancelled before execution')
+        return func(*args, **kwargs)
+
+    return typing.cast(F, wrapper)
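
pipe_and_wait_process() above is the piece that makes subprocesses cancellable: it streams the child's stdout/stderr to the context's output streams while polling the context, and kills the child if the context is cancelled. A hedged sketch of the calling pattern; the shell command is a placeholder and assumes a POSIX shell, and with the default passthrough handlers the returned strings are empty because output goes straight to the streams:

    import subprocess

    from sky.utils import context
    from sky.utils import context_utils

    context.initialize()
    ctx = context.get()
    assert ctx is not None

    proc = subprocess.Popen(
        ['bash', '-c', 'for i in 1 2 3; do echo $i; sleep 1; done'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    # Blocks until the child exits, polling ctx every 0.5s; raises
    # asyncio.CancelledError and kills the child if ctx.cancel() is called
    # from another thread or coroutine.
    stdout, stderr = context_utils.pipe_and_wait_process(ctx, proc)
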
sky/utils/controller_utils.py CHANGED
@@ -6,7 +6,7 @@ import getpass
 import os
 import tempfile
 import typing
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Set
 import uuid

 import colorama
@@ -517,6 +517,30 @@ def get_controller_resources(
         if custom_controller_resources_config is not None:
             controller_resources_config_copied.update(
                 custom_controller_resources_config)
+        # Compatibility with the old way of specifying the controller autostop
+        # config. TODO(cooperc): Remove this before 0.12.0.
+        custom_controller_autostop_config = skypilot_config.get_nested(
+            (controller.value.controller_type, 'controller', 'autostop'), None)
+        if custom_controller_autostop_config is not None:
+            logger.warning(
+                f'{colorama.Fore.YELLOW}Warning: Config value '
+                f'`{controller.value.controller_type}.controller.autostop` '
+                'is deprecated. Please use '
+                f'`{controller.value.controller_type}.controller.resources.'
+                f'autostop` instead.{colorama.Style.RESET_ALL}')
+            # Only set the autostop config if it is not already specified.
+            if controller_resources_config_copied.get('autostop') is None:
+                controller_resources_config_copied['autostop'] = (
+                    custom_controller_autostop_config)
+            else:
+                logger.warning(f'{colorama.Fore.YELLOW}Ignoring the old '
+                               'config, since it is already specified in '
+                               f'resources.{colorama.Style.RESET_ALL}')
+    # Set the default autostop config for the controller, if not already
+    # specified.
+    if controller_resources_config_copied.get('autostop') is None:
+        controller_resources_config_copied['autostop'] = (
+            controller.value.default_autostop_config)

     try:
         controller_resources = resources.Resources.from_yaml_config(
@@ -547,7 +571,10 @@
     if controller_record is not None:
         handle = controller_record.get('handle', None)
         if handle is not None:
-            controller_resources_to_use = handle.launched_resources
+            # Use the existing resources, but override the autostop config with
+            # the one currently specified in the config.
+            controller_resources_to_use = handle.launched_resources.copy(
+                autostop=controller_resources_config_copied.get('autostop'))

     # If the controller and replicas are from the same cloud (and region/zone),
     # it should provide better connectivity. We will let the controller choose
@@ -608,8 +635,9 @@
     controller_zone = controller_resources_to_use.zone

     # Filter clouds if controller_resources_to_use.cloud is specified.
-    filtered_clouds = ({controller_cloud} if controller_cloud is not None else
-                       requested_clouds_with_region_zone.keys())
+    filtered_clouds: Set[str] = {controller_cloud
+                                } if controller_cloud is not None else set(
+                                    requested_clouds_with_region_zone.keys())

     # Filter regions and zones and construct the result.
     result: Set[resources.Resources] = set()
@@ -618,15 +646,17 @@
         {None: {None}})

     # Filter regions if controller_resources_to_use.region is specified.
-    filtered_regions = ({controller_region} if controller_region is not None
-                        else regions.keys())
+    filtered_regions: Set[Optional[str]] = ({
+        controller_region
+    } if controller_region is not None else set(regions.keys()))

     for region in filtered_regions:
         zones = regions.get(region, {None})

         # Filter zones if controller_resources_to_use.zone is specified.
-        filtered_zones = ({controller_zone}
-                          if controller_zone is not None else zones)
+        filtered_zones: Set[Optional[str]] = ({
+            controller_zone
+        } if controller_zone is not None else set(zones))

         # Create combinations of cloud, region, and zone.
         for zone in filtered_zones:
@@ -641,40 +671,6 @@
     return result


-def get_controller_autostop_config(
-        controller: Controllers) -> Tuple[Optional[int], bool]:
-    """Get the autostop config for the controller.
-
-    Returns:
-        A tuple of (idle_minutes_to_autostop, down), which correspond to the
-        values passed to execution.launch().
-    """
-    controller_autostop_config_copied: Dict[str, Any] = copy.copy(
-        controller.value.default_autostop_config)
-    if skypilot_config.loaded():
-        custom_controller_autostop_config = skypilot_config.get_nested(
-            (controller.value.controller_type, 'controller', 'autostop'), None)
-        if custom_controller_autostop_config is False:
-            # Disabled with `autostop: false` in config.
-            # To indicate autostop is disabled, we return None for
-            # idle_minutes_to_autostop.
-            return None, False
-        elif custom_controller_autostop_config is True:
-            # Enabled with default values. There is no change in behavior, but
-            # this is included by for completeness, since `False` is valid.
-            pass
-        elif custom_controller_autostop_config is not None:
-            # We have specific config values.
-            # Override the controller autostop config with the ones specified in
-            # the config.
-            assert isinstance(custom_controller_autostop_config, dict)
-            controller_autostop_config_copied.update(
-                custom_controller_autostop_config)
-
-    return (controller_autostop_config_copied['idle_minutes'],
-            controller_autostop_config_copied['down'])
-
-
 def _setup_proxy_command_on_controller(
         controller_launched_cloud: 'clouds.Cloud',
         user_config: Dict[str, Any]) -> config_utils.Config:
@@ -703,7 +699,7 @@
     # NOTE: suppose that we have a controller in old VPC, then user
     # changes 'vpc_name' in the config and does a 'job launch' /
     # 'serve up'. In general, the old controller may not successfully
-    # launch the job in the new VPC. This happens if the two VPCs dont
+    # launch the job in the new VPC. This happens if the two VPCs don't
     # have peering set up. Like other places in the code, we assume
     # properly setting up networking is user's responsibilities.
     # TODO(zongheng): consider adding a basic check that checks
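
The controller_utils.py changes above move the controller autostop settings under the controller's resources config: the old <controller_type>.controller.autostop key is still read but now emits a deprecation warning, its value is folded into resources.autostop, and the standalone get_controller_autostop_config() helper is removed. A hedged sketch of the two config shapes being migrated between, written as Python dicts; the key paths mirror the tuples passed to skypilot_config.get_nested() above, the idle_minutes/down field names come from the removed helper, and the cpus value and numbers are illustrative:

    # Old layout (deprecated in this release): autostop as a sibling of resources.
    old_config = {
        'jobs': {
            'controller': {
                'autostop': {'idle_minutes': 10, 'down': False},
                'resources': {'cpus': '4+'},
            },
        },
    }

    # New layout: autostop nested inside the controller's resources.
    new_config = {
        'jobs': {
            'controller': {
                'resources': {
                    'cpus': '4+',
                    'autostop': {'idle_minutes': 10, 'down': False},
                },
            },
        },
    }
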
sky/utils/dag_utils.py CHANGED
@@ -1,6 +1,6 @@
 """Utilities for loading and dumping DAGs from/to YAML files."""
 import copy
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union

 from sky import dag as dag_lib
 from sky import sky_logging
@@ -195,7 +195,9 @@ def fill_default_config_in_dag_for_job_launch(dag: dag_lib.Dag) -> None:
         assert default_strategy is not None
         for resources in list(task_.resources):
             original_job_recovery = resources.job_recovery
-            job_recovery = {'strategy': default_strategy}
+            job_recovery: Dict[str, Optional[Union[str, int]]] = {
+                'strategy': default_strategy
+            }
             if isinstance(original_job_recovery, str):
                 job_recovery['strategy'] = original_job_recovery
             elif isinstance(original_job_recovery, dict):
sky/utils/resources_utils.py CHANGED
@@ -140,10 +140,12 @@ def simplify_ports(ports: List[str]) -> List[str]:
 def format_resource(resource: 'resources_lib.Resources',
                     simplify: bool = False) -> str:
     if simplify:
+        resource = resource.assert_launchable()
         cloud = resource.cloud
         if resource.accelerators is None:
             vcpu, _ = cloud.get_vcpus_mem_from_instance_type(
                 resource.instance_type)
+            assert vcpu is not None, 'vCPU must be specified'
             hardware = f'vCPU={int(vcpu)}'
         else:
             hardware = f'{resource.accelerators}'
@@ -248,6 +250,7 @@ def make_launchables_for_valid_region_zones(
     launchables = []
     regions = launchable_resources.get_valid_regions_for_launchable()
     for region in regions:
+        assert launchable_resources.cloud is not None, 'Cloud must be specified'
         optimize_by_zone = (override_optimize_by_zone or
                             launchable_resources.cloud.optimize_by_zone())
         # It is possible that we force the optimize_by_zone but some clouds