skypilot-nightly 1.0.0.dev20250514__py3-none-any.whl → 1.0.0.dev20250516__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend.py +3 -2
- sky/backends/backend_utils.py +19 -17
- sky/backends/cloud_vm_ray_backend.py +30 -11
- sky/clouds/aws.py +11 -9
- sky/clouds/azure.py +16 -13
- sky/clouds/cloud.py +4 -3
- sky/clouds/cudo.py +3 -2
- sky/clouds/do.py +3 -2
- sky/clouds/fluidstack.py +3 -3
- sky/clouds/gcp.py +1 -1
- sky/clouds/ibm.py +12 -10
- sky/clouds/kubernetes.py +3 -2
- sky/clouds/lambda_cloud.py +6 -6
- sky/clouds/nebius.py +6 -5
- sky/clouds/oci.py +9 -7
- sky/clouds/paperspace.py +3 -2
- sky/clouds/runpod.py +9 -9
- sky/clouds/scp.py +5 -3
- sky/clouds/vast.py +8 -7
- sky/clouds/vsphere.py +4 -2
- sky/core.py +18 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +1 -0
- sky/dashboard/out/_next/static/{tdxxQrPV6NW90a983oHXe → y1yf6Xc0zwam5fFluIyUm}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/execution.py +33 -0
- sky/global_user_state.py +2 -0
- sky/jobs/recovery_strategy.py +4 -1
- sky/jobs/server/core.py +6 -12
- sky/optimizer.py +19 -13
- sky/provision/kubernetes/utils.py +26 -1
- sky/resources.py +203 -44
- sky/serve/server/core.py +0 -5
- sky/serve/spot_placer.py +3 -0
- sky/server/requests/executor.py +114 -22
- sky/server/requests/requests.py +15 -0
- sky/server/server.py +63 -20
- sky/server/uvicorn.py +12 -2
- sky/setup_files/dependencies.py +4 -1
- sky/sky_logging.py +40 -2
- sky/skylet/log_lib.py +60 -11
- sky/skylet/log_lib.pyi +5 -0
- sky/task.py +8 -6
- sky/utils/cli_utils/status_utils.py +6 -5
- sky/utils/command_runner.py +3 -0
- sky/utils/context.py +264 -0
- sky/utils/context_utils.py +172 -0
- sky/utils/controller_utils.py +39 -43
- sky/utils/dag_utils.py +4 -2
- sky/utils/resources_utils.py +3 -0
- sky/utils/rich_utils.py +81 -37
- sky/utils/schemas.py +33 -24
- sky/utils/subprocess_utils.py +8 -2
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250516.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250516.dist-info}/RECORD +66 -64
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250516.dist-info}/WHEEL +1 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- /sky/dashboard/out/_next/static/{tdxxQrPV6NW90a983oHXe → y1yf6Xc0zwam5fFluIyUm}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250516.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250516.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250514.dist-info → skypilot_nightly-1.0.0.dev20250516.dist-info}/top_level.txt +0 -0
sky/resources.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
"""Resources: compute requirements of Tasks."""
|
2
2
|
import dataclasses
|
3
3
|
import textwrap
|
4
|
-
|
4
|
+
import typing
|
5
|
+
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
|
5
6
|
|
6
7
|
import colorama
|
7
8
|
|
@@ -34,6 +35,50 @@ RESOURCE_CONFIG_ALIASES = {
|
|
34
35
|
}
|
35
36
|
|
36
37
|
|
38
|
+
@dataclasses.dataclass
|
39
|
+
class AutostopConfig:
|
40
|
+
"""Configuration for autostop."""
|
41
|
+
# enabled isn't present in the yaml config, but it's needed for this class
|
42
|
+
# to be complete.
|
43
|
+
enabled: bool
|
44
|
+
# If enabled is False, these values are ignored.
|
45
|
+
idle_minutes: int = 5
|
46
|
+
down: bool = False
|
47
|
+
|
48
|
+
def to_yaml_config(self) -> Union[Literal[False], Dict[str, Any]]:
|
49
|
+
if not self.enabled:
|
50
|
+
return False
|
51
|
+
return {
|
52
|
+
'idle_minutes': self.idle_minutes,
|
53
|
+
'down': self.down,
|
54
|
+
}
|
55
|
+
|
56
|
+
@classmethod
|
57
|
+
def from_yaml_config(
|
58
|
+
cls, config: Union[bool, int, Dict[str, Any], None]
|
59
|
+
) -> Optional['AutostopConfig']:
|
60
|
+
if isinstance(config, bool):
|
61
|
+
if config:
|
62
|
+
return cls(enabled=True)
|
63
|
+
else:
|
64
|
+
return cls(enabled=False)
|
65
|
+
|
66
|
+
if isinstance(config, int):
|
67
|
+
return cls(idle_minutes=config, down=False, enabled=True)
|
68
|
+
|
69
|
+
if isinstance(config, dict):
|
70
|
+
# If we have a dict, autostop is enabled. (Only way to disable is
|
71
|
+
# with `false`, a bool.)
|
72
|
+
autostop_config = cls(enabled=True)
|
73
|
+
if 'idle_minutes' in config:
|
74
|
+
autostop_config.idle_minutes = config['idle_minutes']
|
75
|
+
if 'down' in config:
|
76
|
+
autostop_config.down = config['down']
|
77
|
+
return autostop_config
|
78
|
+
|
79
|
+
return None
|
80
|
+
|
81
|
+
|
37
82
|
class Resources:
|
38
83
|
"""Resources: compute requirements of Tasks.
|
39
84
|
|
@@ -51,7 +96,7 @@ class Resources:
|
|
51
96
|
"""
|
52
97
|
# If any fields changed, increment the version. For backward compatibility,
|
53
98
|
# modify the __setstate__ method to handle the old version.
|
54
|
-
_VERSION =
|
99
|
+
_VERSION = 23
|
55
100
|
|
56
101
|
def __init__(
|
57
102
|
self,
|
@@ -59,17 +104,19 @@ class Resources:
|
|
59
104
|
instance_type: Optional[str] = None,
|
60
105
|
cpus: Union[None, int, float, str] = None,
|
61
106
|
memory: Union[None, int, float, str] = None,
|
62
|
-
accelerators: Union[None, str, Dict[str, int]] = None,
|
107
|
+
accelerators: Union[None, str, Dict[str, Union[int, float]]] = None,
|
63
108
|
accelerator_args: Optional[Dict[str, str]] = None,
|
64
109
|
use_spot: Optional[bool] = None,
|
65
|
-
job_recovery: Optional[Union[Dict[str, Union[str, int]]
|
110
|
+
job_recovery: Optional[Union[Dict[str, Optional[Union[str, int]]],
|
111
|
+
str]] = None,
|
66
112
|
region: Optional[str] = None,
|
67
113
|
zone: Optional[str] = None,
|
68
|
-
image_id: Union[Dict[str, str], str, None] = None,
|
114
|
+
image_id: Union[Dict[Optional[str], str], str, None] = None,
|
69
115
|
disk_size: Optional[int] = None,
|
70
116
|
disk_tier: Optional[Union[str, resources_utils.DiskTier]] = None,
|
71
117
|
ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
|
72
118
|
labels: Optional[Dict[str, str]] = None,
|
119
|
+
autostop: Union[bool, int, Dict[str, Any], None] = None,
|
73
120
|
# Internal use only.
|
74
121
|
# pylint: disable=invalid-name
|
75
122
|
_docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
|
@@ -152,6 +199,8 @@ class Resources:
|
|
152
199
|
instance tags. On GCP, labels map to instance labels. On
|
153
200
|
Kubernetes, labels map to pod labels. On other clouds, labels are
|
154
201
|
not supported and will be ignored.
|
202
|
+
autostop: the autostop configuration to use. For launched resources,
|
203
|
+
may or may not correspond to the actual current autostop config.
|
155
204
|
_docker_login_config: the docker configuration to use. This includes
|
156
205
|
the docker username, password, and registry server. If None, skip
|
157
206
|
docker login.
|
@@ -177,7 +226,8 @@ class Resources:
|
|
177
226
|
|
178
227
|
self._use_spot_specified = use_spot is not None
|
179
228
|
self._use_spot = use_spot if use_spot is not None else False
|
180
|
-
self._job_recovery: Optional[Dict[str, Union[str,
|
229
|
+
self._job_recovery: Optional[Dict[str, Optional[Union[str,
|
230
|
+
int]]]] = None
|
181
231
|
if job_recovery is not None:
|
182
232
|
if isinstance(job_recovery, str):
|
183
233
|
job_recovery = {'strategy': job_recovery}
|
@@ -188,7 +238,7 @@ class Resources:
|
|
188
238
|
if strategy_name == 'none':
|
189
239
|
self._job_recovery = None
|
190
240
|
else:
|
191
|
-
if strategy_name
|
241
|
+
if isinstance(strategy_name, str):
|
192
242
|
job_recovery['strategy'] = strategy_name.upper()
|
193
243
|
self._job_recovery = job_recovery
|
194
244
|
|
@@ -201,7 +251,7 @@ class Resources:
|
|
201
251
|
else:
|
202
252
|
self._disk_size = _DEFAULT_DISK_SIZE_GB
|
203
253
|
|
204
|
-
self._image_id =
|
254
|
+
self._image_id: Optional[Dict[Optional[str], str]] = None
|
205
255
|
if isinstance(image_id, str):
|
206
256
|
self._image_id = {self._region: image_id.strip()}
|
207
257
|
elif isinstance(image_id, dict):
|
@@ -209,8 +259,11 @@ class Resources:
|
|
209
259
|
self._image_id = {self._region: image_id[None].strip()}
|
210
260
|
else:
|
211
261
|
self._image_id = {
|
212
|
-
k.strip(): v.strip()
|
262
|
+
typing.cast(str, k).strip(): v.strip()
|
263
|
+
for k, v in image_id.items()
|
213
264
|
}
|
265
|
+
else:
|
266
|
+
self._image_id = image_id
|
214
267
|
self._is_image_managed = _is_image_managed
|
215
268
|
|
216
269
|
if isinstance(disk_tier, str):
|
@@ -228,7 +281,7 @@ class Resources:
|
|
228
281
|
if isinstance(ports, tuple):
|
229
282
|
ports = list(ports)
|
230
283
|
if not isinstance(ports, list):
|
231
|
-
ports = [ports]
|
284
|
+
ports = [str(ports)]
|
232
285
|
ports = resources_utils.simplify_ports(
|
233
286
|
[str(port) for port in ports])
|
234
287
|
if not ports:
|
@@ -250,11 +303,12 @@ class Resources:
|
|
250
303
|
self._requires_fuse = _requires_fuse
|
251
304
|
|
252
305
|
self._cluster_config_overrides = _cluster_config_overrides
|
253
|
-
self._cached_repr = None
|
306
|
+
self._cached_repr: Optional[str] = None
|
254
307
|
|
255
308
|
self._set_cpus(cpus)
|
256
309
|
self._set_memory(memory)
|
257
310
|
self._set_accelerators(accelerators, accelerator_args)
|
311
|
+
self._set_autostop_config(autostop)
|
258
312
|
|
259
313
|
def validate(self):
|
260
314
|
"""Validate the resources and infer the missing fields if possible."""
|
@@ -378,19 +432,19 @@ class Resources:
|
|
378
432
|
return repr_str
|
379
433
|
|
380
434
|
@property
|
381
|
-
def cloud(self):
|
435
|
+
def cloud(self) -> Optional[clouds.Cloud]:
|
382
436
|
return self._cloud
|
383
437
|
|
384
438
|
@property
|
385
|
-
def region(self):
|
439
|
+
def region(self) -> Optional[str]:
|
386
440
|
return self._region
|
387
441
|
|
388
442
|
@property
|
389
|
-
def zone(self):
|
443
|
+
def zone(self) -> Optional[str]:
|
390
444
|
return self._zone
|
391
445
|
|
392
446
|
@property
|
393
|
-
def instance_type(self):
|
447
|
+
def instance_type(self) -> Optional[str]:
|
394
448
|
return self._instance_type
|
395
449
|
|
396
450
|
@property
|
@@ -444,7 +498,7 @@ class Resources:
|
|
444
498
|
return None
|
445
499
|
|
446
500
|
@property
|
447
|
-
def accelerator_args(self) -> Optional[Dict[str,
|
501
|
+
def accelerator_args(self) -> Optional[Dict[str, Any]]:
|
448
502
|
return self._accelerator_args
|
449
503
|
|
450
504
|
@property
|
@@ -456,7 +510,7 @@ class Resources:
|
|
456
510
|
return self._use_spot_specified
|
457
511
|
|
458
512
|
@property
|
459
|
-
def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
|
513
|
+
def job_recovery(self) -> Optional[Dict[str, Optional[Union[str, int]]]]:
|
460
514
|
return self._job_recovery
|
461
515
|
|
462
516
|
@property
|
@@ -464,11 +518,11 @@ class Resources:
|
|
464
518
|
return self._disk_size
|
465
519
|
|
466
520
|
@property
|
467
|
-
def image_id(self) -> Optional[Dict[str, str]]:
|
521
|
+
def image_id(self) -> Optional[Dict[Optional[str], str]]:
|
468
522
|
return self._image_id
|
469
523
|
|
470
524
|
@property
|
471
|
-
def disk_tier(self) -> resources_utils.DiskTier:
|
525
|
+
def disk_tier(self) -> Optional[resources_utils.DiskTier]:
|
472
526
|
return self._disk_tier
|
473
527
|
|
474
528
|
@property
|
@@ -479,6 +533,16 @@ class Resources:
|
|
479
533
|
def labels(self) -> Optional[Dict[str, str]]:
|
480
534
|
return self._labels
|
481
535
|
|
536
|
+
@property
|
537
|
+
def autostop_config(self) -> Optional[AutostopConfig]:
|
538
|
+
"""The requested autostop config.
|
539
|
+
|
540
|
+
Warning: This is the autostop config that was originally used to
|
541
|
+
launch the resources. It may not correspond to the actual current
|
542
|
+
autostop config.
|
543
|
+
"""
|
544
|
+
return self._autostop_config
|
545
|
+
|
482
546
|
@property
|
483
547
|
def is_image_managed(self) -> Optional[bool]:
|
484
548
|
return self._is_image_managed
|
@@ -489,16 +553,22 @@ class Resources:
|
|
489
553
|
return False
|
490
554
|
return self._requires_fuse
|
491
555
|
|
556
|
+
def set_requires_fuse(self, value: bool) -> None:
|
557
|
+
"""Sets whether this resource requires FUSE mounting support.
|
558
|
+
|
559
|
+
Args:
|
560
|
+
value: Whether the resource requires FUSE mounting support.
|
561
|
+
"""
|
562
|
+
# TODO(zeping): This violates the immutability of Resources.
|
563
|
+
# Refactor to use Resources.copy instead.
|
564
|
+
self._requires_fuse = value
|
565
|
+
|
492
566
|
@property
|
493
567
|
def cluster_config_overrides(self) -> Dict[str, Any]:
|
494
568
|
if self._cluster_config_overrides is None:
|
495
569
|
return {}
|
496
570
|
return self._cluster_config_overrides
|
497
571
|
|
498
|
-
@requires_fuse.setter
|
499
|
-
def requires_fuse(self, value: Optional[bool]) -> None:
|
500
|
-
self._requires_fuse = value
|
501
|
-
|
502
572
|
@property
|
503
573
|
def docker_login_config(self) -> Optional[docker_utils.DockerLoginConfig]:
|
504
574
|
return self._docker_login_config
|
@@ -572,8 +642,8 @@ class Resources:
|
|
572
642
|
|
573
643
|
def _set_accelerators(
|
574
644
|
self,
|
575
|
-
accelerators: Union[None, str, Dict[str, int]],
|
576
|
-
accelerator_args: Optional[Dict[str,
|
645
|
+
accelerators: Union[None, str, Dict[str, Union[int, float]]],
|
646
|
+
accelerator_args: Optional[Dict[str, Any]],
|
577
647
|
) -> None:
|
578
648
|
"""Sets accelerators.
|
579
649
|
|
@@ -608,10 +678,11 @@ class Resources:
|
|
608
678
|
self._cloud = clouds.Kubernetes()
|
609
679
|
else:
|
610
680
|
self._cloud = clouds.GCP()
|
611
|
-
assert
|
612
|
-
|
613
|
-
|
614
|
-
|
681
|
+
assert self.cloud is not None and (
|
682
|
+
self.cloud.is_same_cloud(clouds.GCP()) or
|
683
|
+
self.cloud.is_same_cloud(clouds.Kubernetes())), (
|
684
|
+
'Cloud must be GCP or Kubernetes for TPU '
|
685
|
+
'accelerators.')
|
615
686
|
|
616
687
|
if accelerator_args is None:
|
617
688
|
accelerator_args = {}
|
@@ -645,15 +716,34 @@ class Resources:
|
|
645
716
|
'Cannot specify instance type (got '
|
646
717
|
f'{self.instance_type!r}) for TPU VM.')
|
647
718
|
|
648
|
-
self._accelerators
|
649
|
-
|
719
|
+
self._accelerators: Optional[Dict[str, Union[int,
|
720
|
+
float]]] = accelerators
|
721
|
+
self._accelerator_args: Optional[Dict[str, Any]] = accelerator_args
|
722
|
+
|
723
|
+
def _set_autostop_config(
|
724
|
+
self,
|
725
|
+
autostop: Union[bool, int, Dict[str, Any], None],
|
726
|
+
) -> None:
|
727
|
+
self._autostop_config = AutostopConfig.from_yaml_config(autostop)
|
650
728
|
|
651
729
|
def is_launchable(self) -> bool:
|
730
|
+
"""Returns whether the resource is launchable."""
|
652
731
|
return self.cloud is not None and self._instance_type is not None
|
653
732
|
|
733
|
+
def assert_launchable(self) -> 'LaunchableResources':
|
734
|
+
"""A workaround to make mypy understand that is_launchable() is true.
|
735
|
+
|
736
|
+
Note: The `cast` to `LaunchableResources` is only for static type
|
737
|
+
checking with MyPy. At runtime, the Python interpreter does not enforce
|
738
|
+
types, and the returned object will still be an instance of `Resources`.
|
739
|
+
"""
|
740
|
+
assert self.is_launchable(), self
|
741
|
+
return typing.cast(LaunchableResources, self)
|
742
|
+
|
654
743
|
def need_cleanup_after_preemption_or_failure(self) -> bool:
|
655
744
|
"""Whether a resource needs cleanup after preemption or failure."""
|
656
745
|
assert self.is_launchable(), self
|
746
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
657
747
|
return self.cloud.need_cleanup_after_preemption_or_failure(self)
|
658
748
|
|
659
749
|
def _try_canonicalize_accelerators(self) -> None:
|
@@ -710,10 +800,10 @@ class Resources:
|
|
710
800
|
else:
|
711
801
|
table = log_utils.create_table(['Cloud', 'Hint'])
|
712
802
|
table.add_row(['-----', '----'])
|
713
|
-
for
|
803
|
+
for cloud_msg, error in cloud_to_errors.items():
|
714
804
|
reason_str = '\n'.join(textwrap.wrap(
|
715
805
|
str(error), 80))
|
716
|
-
table.add_row([
|
806
|
+
table.add_row([cloud_msg, reason_str])
|
717
807
|
hint = table.get_string()
|
718
808
|
raise ValueError(
|
719
809
|
f'Invalid (region {self._region!r}, zone '
|
@@ -745,11 +835,13 @@ class Resources:
|
|
745
835
|
ssh_proxy_command dict with region names as keys).
|
746
836
|
"""
|
747
837
|
assert self.is_launchable(), self
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
838
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
839
|
+
assert self._instance_type is not None, (
|
840
|
+
'Instance type must be specified')
|
841
|
+
regions = self.cloud.regions_with_offering(self._instance_type,
|
842
|
+
self.accelerators,
|
843
|
+
self._use_spot, self._region,
|
844
|
+
self._zone)
|
753
845
|
if self._image_id is not None and None not in self._image_id:
|
754
846
|
regions = [r for r in regions if r.name in self._image_id]
|
755
847
|
|
@@ -849,6 +941,10 @@ class Resources:
|
|
849
941
|
cpus, mem = self.cloud.get_vcpus_mem_from_instance_type(
|
850
942
|
self._instance_type)
|
851
943
|
if self._cpus is not None:
|
944
|
+
assert cpus is not None, (
|
945
|
+
f'Can\'t get vCPUs from instance type: '
|
946
|
+
f'{self._instance_type}, check catalog or '
|
947
|
+
f'specify cpus directly.')
|
852
948
|
if self._cpus.endswith('+'):
|
853
949
|
if cpus < float(self._cpus[:-1]):
|
854
950
|
with ux_utils.print_exception_no_traceback():
|
@@ -863,6 +959,10 @@ class Resources:
|
|
863
959
|
f'number of vCPUs. {self.instance_type} has {cpus} '
|
864
960
|
f'vCPUs, but {self._cpus} is requested.')
|
865
961
|
if self.memory is not None:
|
962
|
+
assert mem is not None, (
|
963
|
+
f'Can\'t get memory from instance type: '
|
964
|
+
f'{self._instance_type}, check catalog or '
|
965
|
+
f'specify memory directly.')
|
866
966
|
if self.memory.endswith(('+', 'x')):
|
867
967
|
if mem < float(self.memory[:-1]):
|
868
968
|
with ux_utils.print_exception_no_traceback():
|
@@ -886,6 +986,8 @@ class Resources:
|
|
886
986
|
if self._job_recovery is None or self._job_recovery['strategy'] is None:
|
887
987
|
return
|
888
988
|
# Validate the job recovery strategy
|
989
|
+
assert isinstance(self._job_recovery['strategy'],
|
990
|
+
str), 'Job recovery strategy must be a string'
|
889
991
|
registry.JOBS_RECOVERY_STRATEGY_REGISTRY.from_str(
|
890
992
|
self._job_recovery['strategy'])
|
891
993
|
|
@@ -920,7 +1022,7 @@ class Resources:
|
|
920
1022
|
'Cloud must be specified when image_id is provided.')
|
921
1023
|
|
922
1024
|
try:
|
923
|
-
self.
|
1025
|
+
self.cloud.check_features_are_supported(
|
924
1026
|
self,
|
925
1027
|
requested_features={
|
926
1028
|
clouds.CloudImplementationFeatures.IMAGE_ID
|
@@ -943,14 +1045,14 @@ class Resources:
|
|
943
1045
|
# Check the image_id's are valid.
|
944
1046
|
for region, image_id in self._image_id.items():
|
945
1047
|
if (image_id.startswith('skypilot:') and
|
946
|
-
not self.
|
1048
|
+
not self.cloud.is_image_tag_valid(image_id, region)):
|
947
1049
|
region_str = f' ({region})' if region else ''
|
948
1050
|
with ux_utils.print_exception_no_traceback():
|
949
1051
|
raise ValueError(
|
950
1052
|
f'Image tag {image_id!r} is not valid, please make sure'
|
951
1053
|
f' the tag exists in {self._cloud}{region_str}.')
|
952
1054
|
|
953
|
-
if (self.
|
1055
|
+
if (self.cloud.is_same_cloud(clouds.AWS()) and
|
954
1056
|
not image_id.startswith('skypilot:') and region is None):
|
955
1057
|
with ux_utils.print_exception_no_traceback():
|
956
1058
|
raise ValueError(
|
@@ -1055,6 +1157,9 @@ class Resources:
|
|
1055
1157
|
"""Returns cost in USD for the runtime in seconds."""
|
1056
1158
|
hours = seconds / 3600
|
1057
1159
|
# Instance.
|
1160
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
1161
|
+
assert self._instance_type is not None, (
|
1162
|
+
'Instance type must be specified')
|
1058
1163
|
hourly_cost = self.cloud.instance_type_to_hourly_cost(
|
1059
1164
|
self._instance_type, self.use_spot, self._region, self._zone)
|
1060
1165
|
# Accelerators (if any).
|
@@ -1099,6 +1204,7 @@ class Resources:
|
|
1099
1204
|
docker_image = self.extract_docker_image()
|
1100
1205
|
|
1101
1206
|
# Cloud specific variables
|
1207
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
1102
1208
|
cloud_specific_variables = self.cloud.make_deploy_resources_variables(
|
1103
1209
|
self, cluster_name, region, zones, num_nodes, dryrun)
|
1104
1210
|
|
@@ -1153,9 +1259,12 @@ class Resources:
|
|
1153
1259
|
specific_reservations = set(
|
1154
1260
|
skypilot_config.get_nested(
|
1155
1261
|
(str(self.cloud).lower(), 'specific_reservations'), set()))
|
1262
|
+
|
1263
|
+
assert (self.cloud is not None and self.instance_type is not None and
|
1264
|
+
self.region
|
1265
|
+
is not None), ('Cloud, instance type, region must be specified')
|
1156
1266
|
return self.cloud.get_reservations_available_resources(
|
1157
|
-
self.
|
1158
|
-
specific_reservations)
|
1267
|
+
self.instance_type, self.region, self.zone, specific_reservations)
|
1159
1268
|
|
1160
1269
|
def less_demanding_than(
|
1161
1270
|
self,
|
@@ -1175,6 +1284,9 @@ class Resources:
|
|
1175
1284
|
if isinstance(other, list):
|
1176
1285
|
resources_list = [self.less_demanding_than(o) for o in other]
|
1177
1286
|
return requested_num_nodes <= sum(resources_list)
|
1287
|
+
|
1288
|
+
assert other.cloud is not None, 'Other cloud must be specified'
|
1289
|
+
|
1178
1290
|
if self.cloud is not None and not self.cloud.is_same_cloud(other.cloud):
|
1179
1291
|
return False
|
1180
1292
|
# self.cloud <= other.cloud
|
@@ -1263,6 +1375,7 @@ class Resources:
|
|
1263
1375
|
If a field in `blocked` is None, it should be considered as a wildcard
|
1264
1376
|
for that field.
|
1265
1377
|
"""
|
1378
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
1266
1379
|
is_matched = True
|
1267
1380
|
if (blocked.cloud is not None and
|
1268
1381
|
not self.cloud.is_same_cloud(blocked.cloud)):
|
@@ -1301,7 +1414,7 @@ class Resources:
|
|
1301
1414
|
use_spot = self.use_spot if self._use_spot_specified else None
|
1302
1415
|
|
1303
1416
|
current_override_configs = self._cluster_config_overrides
|
1304
|
-
if
|
1417
|
+
if current_override_configs is None:
|
1305
1418
|
current_override_configs = {}
|
1306
1419
|
new_override_configs = override.pop('_cluster_config_overrides', {})
|
1307
1420
|
overlaid_configs = skypilot_config.overlay_skypilot_config(
|
@@ -1314,6 +1427,10 @@ class Resources:
|
|
1314
1427
|
if elem is not None:
|
1315
1428
|
override_configs.set_nested(key, elem)
|
1316
1429
|
|
1430
|
+
current_autostop_config = None
|
1431
|
+
if self.autostop_config is not None:
|
1432
|
+
current_autostop_config = self.autostop_config.to_yaml_config()
|
1433
|
+
|
1317
1434
|
override_configs = dict(override_configs) if override_configs else None
|
1318
1435
|
resources = Resources(
|
1319
1436
|
cloud=override.pop('cloud', self.cloud),
|
@@ -1332,6 +1449,7 @@ class Resources:
|
|
1332
1449
|
disk_tier=override.pop('disk_tier', self.disk_tier),
|
1333
1450
|
ports=override.pop('ports', self.ports),
|
1334
1451
|
labels=override.pop('labels', self.labels),
|
1452
|
+
autostop=override.pop('autostop', current_autostop_config),
|
1335
1453
|
_docker_login_config=override.pop('_docker_login_config',
|
1336
1454
|
self._docker_login_config),
|
1337
1455
|
_docker_username_for_runpod=override.pop(
|
@@ -1529,6 +1647,7 @@ class Resources:
|
|
1529
1647
|
resources_fields['disk_tier'] = config.pop('disk_tier', None)
|
1530
1648
|
resources_fields['ports'] = config.pop('ports', None)
|
1531
1649
|
resources_fields['labels'] = config.pop('labels', None)
|
1650
|
+
resources_fields['autostop'] = config.pop('autostop', None)
|
1532
1651
|
resources_fields['_docker_login_config'] = config.pop(
|
1533
1652
|
'_docker_login_config', None)
|
1534
1653
|
resources_fields['_docker_username_for_runpod'] = config.pop(
|
@@ -1578,6 +1697,8 @@ class Resources:
|
|
1578
1697
|
config['disk_tier'] = self.disk_tier.value
|
1579
1698
|
add_if_not_none('ports', self.ports)
|
1580
1699
|
add_if_not_none('labels', self.labels)
|
1700
|
+
if self._autostop_config is not None:
|
1701
|
+
config['autostop'] = self._autostop_config.to_yaml_config()
|
1581
1702
|
if self._docker_login_config is not None:
|
1582
1703
|
config['_docker_login_config'] = dataclasses.asdict(
|
1583
1704
|
self._docker_login_config)
|
@@ -1733,4 +1854,42 @@ class Resources:
|
|
1733
1854
|
self._docker_username_for_runpod = state.pop(
|
1734
1855
|
'_docker_username_for_runpod', None)
|
1735
1856
|
|
1857
|
+
if version < 23:
|
1858
|
+
self._autostop_config = None
|
1859
|
+
|
1736
1860
|
self.__dict__.update(state)
|
1861
|
+
|
1862
|
+
|
1863
|
+
class LaunchableResources(Resources):
|
1864
|
+
"""A class representing resources that can be launched on a cloud provider.
|
1865
|
+
|
1866
|
+
This class is primarily a type hint for MyPy to indicate that an instance
|
1867
|
+
of `Resources` is launchable (i.e., `cloud` and `instance_type` are not
|
1868
|
+
None). It should not be instantiated directly.
|
1869
|
+
"""
|
1870
|
+
|
1871
|
+
def __init__(self, *args, **kwargs) -> None: # pylint: disable=super-init-not-called,unused-argument
|
1872
|
+
assert False, (
|
1873
|
+
'LaunchableResources should not be instantiated directly. '
|
1874
|
+
'It is only used for type checking by MyPy.')
|
1875
|
+
|
1876
|
+
@property
|
1877
|
+
def cloud(self) -> clouds.Cloud:
|
1878
|
+
assert self._cloud is not None, 'Cloud must be specified'
|
1879
|
+
return self._cloud
|
1880
|
+
|
1881
|
+
@property
|
1882
|
+
def instance_type(self) -> str:
|
1883
|
+
assert self._instance_type is not None, (
|
1884
|
+
'Instance type must be specified')
|
1885
|
+
return self._instance_type
|
1886
|
+
|
1887
|
+
def copy(self, **override) -> 'LaunchableResources':
|
1888
|
+
"""Ensure MyPy understands the return type is LaunchableResources.
|
1889
|
+
|
1890
|
+
This method is not expected to be called at runtime, as
|
1891
|
+
LaunchableResources should not be directly instantiated. It primarily
|
1892
|
+
serves as a type hint for static analysis.
|
1893
|
+
"""
|
1894
|
+
self.assert_launchable()
|
1895
|
+
return typing.cast(LaunchableResources, super().copy(**override))
|
sky/serve/server/core.py
CHANGED
@@ -219,17 +219,12 @@ def up(
|
|
219
219
|
# whether the service is already running. If the id is the same
|
220
220
|
# with the current job id, we know the service is up and running
|
221
221
|
# for the first time; otherwise it is a name conflict.
|
222
|
-
controller_idle_minutes_to_autostop, controller_down = (
|
223
|
-
controller_utils.get_controller_autostop_config(
|
224
|
-
controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER))
|
225
222
|
# Since the controller may be shared among multiple users, launch the
|
226
223
|
# controller with the API server's user hash.
|
227
224
|
with common.with_server_user_hash():
|
228
225
|
controller_job_id, controller_handle = execution.launch(
|
229
226
|
task=controller_task,
|
230
227
|
cluster_name=controller_name,
|
231
|
-
idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
|
232
|
-
down=controller_down,
|
233
228
|
retry_until_up=True,
|
234
229
|
_disable_controller_check=True,
|
235
230
|
)
|
sky/serve/spot_placer.py
CHANGED
@@ -46,6 +46,8 @@ class Location:
|
|
46
46
|
|
47
47
|
@classmethod
|
48
48
|
def from_resources(cls, resources: 'resources_lib.Resources') -> 'Location':
|
49
|
+
assert resources.cloud is not None, 'Cloud must be specified'
|
50
|
+
assert resources.region is not None, 'Region must be specified'
|
49
51
|
return cls(resources.cloud, resources.region, resources.zone)
|
50
52
|
|
51
53
|
def to_dict(self) -> Dict[str, Any]:
|
@@ -147,6 +149,7 @@ def _get_possible_location_from_task(task: 'task_lib.Task') -> List[Location]:
|
|
147
149
|
cloud_str = str(launchable.cloud)
|
148
150
|
region = launchable.region
|
149
151
|
zone = launchable.zone
|
152
|
+
assert region is not None, 'Region must be specified'
|
150
153
|
if (cloud_str not in location_requirements and
|
151
154
|
location_requirements):
|
152
155
|
continue
|