skypilot-nightly 1.0.0.dev20250513__py3-none-any.whl → 1.0.0.dev20250515__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend.py +3 -2
- sky/backends/backend_utils.py +16 -17
- sky/backends/cloud_vm_ray_backend.py +47 -16
- sky/clouds/aws.py +11 -9
- sky/clouds/azure.py +16 -13
- sky/clouds/cloud.py +4 -3
- sky/clouds/cudo.py +3 -2
- sky/clouds/do.py +3 -2
- sky/clouds/fluidstack.py +3 -3
- sky/clouds/gcp.py +25 -9
- sky/clouds/ibm.py +12 -10
- sky/clouds/kubernetes.py +3 -2
- sky/clouds/lambda_cloud.py +6 -6
- sky/clouds/nebius.py +6 -5
- sky/clouds/oci.py +9 -7
- sky/clouds/paperspace.py +3 -2
- sky/clouds/runpod.py +9 -9
- sky/clouds/scp.py +5 -3
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +33 -11
- sky/clouds/service_catalog/gcp_catalog.py +7 -1
- sky/clouds/vast.py +8 -7
- sky/clouds/vsphere.py +4 -2
- sky/core.py +18 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +1 -0
- sky/dashboard/out/_next/static/{2dkponv64SfFShA8Rnw0D → jFI0Y-uJZ_XDK5IGJpKFU}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/execution.py +33 -0
- sky/jobs/recovery_strategy.py +4 -1
- sky/jobs/server/core.py +6 -12
- sky/optimizer.py +19 -13
- sky/provision/kubernetes/utils.py +26 -1
- sky/resources.py +206 -43
- sky/serve/server/core.py +0 -5
- sky/serve/spot_placer.py +3 -0
- sky/server/server.py +51 -13
- sky/skylet/log_lib.py +12 -3
- sky/skylet/log_lib.pyi +5 -0
- sky/task.py +8 -6
- sky/templates/nebius-ray.yml.j2 +3 -1
- sky/utils/cli_utils/status_utils.py +6 -5
- sky/utils/controller_utils.py +39 -43
- sky/utils/dag_utils.py +4 -2
- sky/utils/resources_utils.py +3 -0
- sky/utils/schemas.py +33 -24
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/RECORD +58 -58
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/WHEEL +1 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- /sky/dashboard/out/_next/static/{2dkponv64SfFShA8Rnw0D → jFI0Y-uJZ_XDK5IGJpKFU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/top_level.txt +0 -0
sky/resources.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
"""Resources: compute requirements of Tasks."""
|
2
2
|
import dataclasses
|
3
3
|
import textwrap
|
4
|
-
|
4
|
+
import typing
|
5
|
+
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
|
5
6
|
|
6
7
|
import colorama
|
7
8
|
|
@@ -34,6 +35,50 @@ RESOURCE_CONFIG_ALIASES = {
|
|
34
35
|
}
|
35
36
|
|
36
37
|
|
38
|
+
@dataclasses.dataclass
|
39
|
+
class AutostopConfig:
|
40
|
+
"""Configuration for autostop."""
|
41
|
+
# enabled isn't present in the yaml config, but it's needed for this class
|
42
|
+
# to be complete.
|
43
|
+
enabled: bool
|
44
|
+
# If enabled is False, these values are ignored.
|
45
|
+
idle_minutes: int = 5
|
46
|
+
down: bool = False
|
47
|
+
|
48
|
+
def to_yaml_config(self) -> Union[Literal[False], Dict[str, Any]]:
|
49
|
+
if not self.enabled:
|
50
|
+
return False
|
51
|
+
return {
|
52
|
+
'idle_minutes': self.idle_minutes,
|
53
|
+
'down': self.down,
|
54
|
+
}
|
55
|
+
|
56
|
+
@classmethod
|
57
|
+
def from_yaml_config(
|
58
|
+
cls, config: Union[bool, int, Dict[str, Any], None]
|
59
|
+
) -> Optional['AutostopConfig']:
|
60
|
+
if isinstance(config, bool):
|
61
|
+
if config:
|
62
|
+
return cls(enabled=True)
|
63
|
+
else:
|
64
|
+
return cls(enabled=False)
|
65
|
+
|
66
|
+
if isinstance(config, int):
|
67
|
+
return cls(idle_minutes=config, down=False, enabled=True)
|
68
|
+
|
69
|
+
if isinstance(config, dict):
|
70
|
+
# If we have a dict, autostop is enabled. (Only way to disable is
|
71
|
+
# with `false`, a bool.)
|
72
|
+
autostop_config = cls(enabled=True)
|
73
|
+
if 'idle_minutes' in config:
|
74
|
+
autostop_config.idle_minutes = config['idle_minutes']
|
75
|
+
if 'down' in config:
|
76
|
+
autostop_config.down = config['down']
|
77
|
+
return autostop_config
|
78
|
+
|
79
|
+
return None
|
80
|
+
|
81
|
+
|
37
82
|
class Resources:
|
38
83
|
"""Resources: compute requirements of Tasks.
|
39
84
|
|
@@ -51,7 +96,7 @@ class Resources:
|
|
51
96
|
"""
|
52
97
|
# If any fields changed, increment the version. For backward compatibility,
|
53
98
|
# modify the __setstate__ method to handle the old version.
|
54
|
-
_VERSION =
|
99
|
+
_VERSION = 23
|
55
100
|
|
56
101
|
def __init__(
|
57
102
|
self,
|
@@ -59,17 +104,19 @@ class Resources:
|
|
59
104
|
instance_type: Optional[str] = None,
|
60
105
|
cpus: Union[None, int, float, str] = None,
|
61
106
|
memory: Union[None, int, float, str] = None,
|
62
|
-
accelerators: Union[None, str, Dict[str, int]] = None,
|
107
|
+
accelerators: Union[None, str, Dict[str, Union[int, float]]] = None,
|
63
108
|
accelerator_args: Optional[Dict[str, str]] = None,
|
64
109
|
use_spot: Optional[bool] = None,
|
65
|
-
job_recovery: Optional[Union[Dict[str, Union[str, int]]
|
110
|
+
job_recovery: Optional[Union[Dict[str, Optional[Union[str, int]]],
|
111
|
+
str]] = None,
|
66
112
|
region: Optional[str] = None,
|
67
113
|
zone: Optional[str] = None,
|
68
|
-
image_id: Union[Dict[str, str], str, None] = None,
|
114
|
+
image_id: Union[Dict[Optional[str], str], str, None] = None,
|
69
115
|
disk_size: Optional[int] = None,
|
70
116
|
disk_tier: Optional[Union[str, resources_utils.DiskTier]] = None,
|
71
117
|
ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
|
72
118
|
labels: Optional[Dict[str, str]] = None,
|
119
|
+
autostop: Union[bool, int, Dict[str, Any], None] = None,
|
73
120
|
# Internal use only.
|
74
121
|
# pylint: disable=invalid-name
|
75
122
|
_docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
|
@@ -152,6 +199,8 @@ class Resources:
|
|
152
199
|
instance tags. On GCP, labels map to instance labels. On
|
153
200
|
Kubernetes, labels map to pod labels. On other clouds, labels are
|
154
201
|
not supported and will be ignored.
|
202
|
+
autostop: the autostop configuration to use. For launched resources,
|
203
|
+
may or may not correspond to the actual current autostop config.
|
155
204
|
_docker_login_config: the docker configuration to use. This includes
|
156
205
|
the docker username, password, and registry server. If None, skip
|
157
206
|
docker login.
|
@@ -177,7 +226,8 @@ class Resources:
|
|
177
226
|
|
178
227
|
self._use_spot_specified = use_spot is not None
|
179
228
|
self._use_spot = use_spot if use_spot is not None else False
|
180
|
-
self._job_recovery: Optional[Dict[str, Union[str,
|
229
|
+
self._job_recovery: Optional[Dict[str, Optional[Union[str,
|
230
|
+
int]]]] = None
|
181
231
|
if job_recovery is not None:
|
182
232
|
if isinstance(job_recovery, str):
|
183
233
|
job_recovery = {'strategy': job_recovery}
|
@@ -188,7 +238,7 @@ class Resources:
|
|
188
238
|
if strategy_name == 'none':
|
189
239
|
self._job_recovery = None
|
190
240
|
else:
|
191
|
-
if strategy_name
|
241
|
+
if isinstance(strategy_name, str):
|
192
242
|
job_recovery['strategy'] = strategy_name.upper()
|
193
243
|
self._job_recovery = job_recovery
|
194
244
|
|
@@ -201,7 +251,7 @@ class Resources:
|
|
201
251
|
else:
|
202
252
|
self._disk_size = _DEFAULT_DISK_SIZE_GB
|
203
253
|
|
204
|
-
self._image_id =
|
254
|
+
self._image_id: Optional[Dict[Optional[str], str]] = None
|
205
255
|
if isinstance(image_id, str):
|
206
256
|
self._image_id = {self._region: image_id.strip()}
|
207
257
|
elif isinstance(image_id, dict):
|
@@ -209,8 +259,11 @@ class Resources:
|
|
209
259
|
self._image_id = {self._region: image_id[None].strip()}
|
210
260
|
else:
|
211
261
|
self._image_id = {
|
212
|
-
k.strip(): v.strip()
|
262
|
+
typing.cast(str, k).strip(): v.strip()
|
263
|
+
for k, v in image_id.items()
|
213
264
|
}
|
265
|
+
else:
|
266
|
+
self._image_id = image_id
|
214
267
|
self._is_image_managed = _is_image_managed
|
215
268
|
|
216
269
|
if isinstance(disk_tier, str):
|
@@ -228,7 +281,7 @@ class Resources:
|
|
228
281
|
if isinstance(ports, tuple):
|
229
282
|
ports = list(ports)
|
230
283
|
if not isinstance(ports, list):
|
231
|
-
ports = [ports]
|
284
|
+
ports = [str(ports)]
|
232
285
|
ports = resources_utils.simplify_ports(
|
233
286
|
[str(port) for port in ports])
|
234
287
|
if not ports:
|
@@ -250,11 +303,12 @@ class Resources:
|
|
250
303
|
self._requires_fuse = _requires_fuse
|
251
304
|
|
252
305
|
self._cluster_config_overrides = _cluster_config_overrides
|
253
|
-
self._cached_repr = None
|
306
|
+
self._cached_repr: Optional[str] = None
|
254
307
|
|
255
308
|
self._set_cpus(cpus)
|
256
309
|
self._set_memory(memory)
|
257
310
|
self._set_accelerators(accelerators, accelerator_args)
|
311
|
+
self._set_autostop_config(autostop)
|
258
312
|
|
259
313
|
def validate(self):
|
260
314
|
"""Validate the resources and infer the missing fields if possible."""
|
@@ -378,19 +432,19 @@ class Resources:
|
|
378
432
|
return repr_str
|
379
433
|
|
380
434
|
@property
|
381
|
-
def cloud(self):
|
435
|
+
def cloud(self) -> Optional[clouds.Cloud]:
|
382
436
|
return self._cloud
|
383
437
|
|
384
438
|
@property
|
385
|
-
def region(self):
|
439
|
+
def region(self) -> Optional[str]:
|
386
440
|
return self._region
|
387
441
|
|
388
442
|
@property
|
389
|
-
def zone(self):
|
443
|
+
def zone(self) -> Optional[str]:
|
390
444
|
return self._zone
|
391
445
|
|
392
446
|
@property
|
393
|
-
def instance_type(self):
|
447
|
+
def instance_type(self) -> Optional[str]:
|
394
448
|
return self._instance_type
|
395
449
|
|
396
450
|
@property
|
@@ -444,7 +498,7 @@ class Resources:
|
|
444
498
|
return None
|
445
499
|
|
446
500
|
@property
|
447
|
-
def accelerator_args(self) -> Optional[Dict[str,
|
501
|
+
def accelerator_args(self) -> Optional[Dict[str, Any]]:
|
448
502
|
return self._accelerator_args
|
449
503
|
|
450
504
|
@property
|
@@ -456,7 +510,7 @@ class Resources:
|
|
456
510
|
return self._use_spot_specified
|
457
511
|
|
458
512
|
@property
|
459
|
-
def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
|
513
|
+
def job_recovery(self) -> Optional[Dict[str, Optional[Union[str, int]]]]:
|
460
514
|
return self._job_recovery
|
461
515
|
|
462
516
|
@property
|
@@ -464,11 +518,11 @@ class Resources:
|
|
464
518
|
return self._disk_size
|
465
519
|
|
466
520
|
@property
|
467
|
-
def image_id(self) -> Optional[Dict[str, str]]:
|
521
|
+
def image_id(self) -> Optional[Dict[Optional[str], str]]:
|
468
522
|
return self._image_id
|
469
523
|
|
470
524
|
@property
|
471
|
-
def disk_tier(self) -> resources_utils.DiskTier:
|
525
|
+
def disk_tier(self) -> Optional[resources_utils.DiskTier]:
|
472
526
|
return self._disk_tier
|
473
527
|
|
474
528
|
@property
|
@@ -479,6 +533,16 @@ class Resources:
|
|
479
533
|
def labels(self) -> Optional[Dict[str, str]]:
|
480
534
|
return self._labels
|
481
535
|
|
536
|
+
@property
|
537
|
+
def autostop_config(self) -> Optional[AutostopConfig]:
|
538
|
+
"""The requested autostop config.
|
539
|
+
|
540
|
+
Warning: This is the autostop config that was originally used to
|
541
|
+
launch the resources. It may not correspond to the actual current
|
542
|
+
autostop config.
|
543
|
+
"""
|
544
|
+
return self._autostop_config
|
545
|
+
|
482
546
|
@property
|
483
547
|
def is_image_managed(self) -> Optional[bool]:
|
484
548
|
return self._is_image_managed
|
@@ -489,15 +553,25 @@ class Resources:
|
|
489
553
|
return False
|
490
554
|
return self._requires_fuse
|
491
555
|
|
556
|
+
def set_requires_fuse(self, value: bool) -> None:
|
557
|
+
"""Sets whether this resource requires FUSE mounting support.
|
558
|
+
|
559
|
+
Args:
|
560
|
+
value: Whether the resource requires FUSE mounting support.
|
561
|
+
"""
|
562
|
+
# TODO(zeping): This violates the immutability of Resources.
|
563
|
+
# Refactor to use Resources.copy instead.
|
564
|
+
self._requires_fuse = value
|
565
|
+
|
492
566
|
@property
|
493
567
|
def cluster_config_overrides(self) -> Dict[str, Any]:
|
494
568
|
if self._cluster_config_overrides is None:
|
495
569
|
return {}
|
496
570
|
return self._cluster_config_overrides
|
497
571
|
|
498
|
-
@
|
499
|
-
def
|
500
|
-
self.
|
572
|
+
@property
|
573
|
+
def docker_login_config(self) -> Optional[docker_utils.DockerLoginConfig]:
|
574
|
+
return self._docker_login_config
|
501
575
|
|
502
576
|
@property
|
503
577
|
def docker_username_for_runpod(self) -> Optional[str]:
|
@@ -568,8 +642,8 @@ class Resources:
|
|
568
642
|
|
569
643
|
def _set_accelerators(
|
570
644
|
self,
|
571
|
-
accelerators: Union[None, str, Dict[str, int]],
|
572
|
-
accelerator_args: Optional[Dict[str,
|
645
|
+
accelerators: Union[None, str, Dict[str, Union[int, float]]],
|
646
|
+
accelerator_args: Optional[Dict[str, Any]],
|
573
647
|
) -> None:
|
574
648
|
"""Sets accelerators.
|
575
649
|
|
@@ -604,10 +678,11 @@ class Resources:
|
|
604
678
|
self._cloud = clouds.Kubernetes()
|
605
679
|
else:
|
606
680
|
self._cloud = clouds.GCP()
|
607
|
-
assert
|
608
|
-
|
609
|
-
|
610
|
-
|
681
|
+
assert self.cloud is not None and (
|
682
|
+
self.cloud.is_same_cloud(clouds.GCP()) or
|
683
|
+
self.cloud.is_same_cloud(clouds.Kubernetes())), (
|
684
|
+
'Cloud must be GCP or Kubernetes for TPU '
|
685
|
+
'accelerators.')
|
611
686
|
|
612
687
|
if accelerator_args is None:
|
613
688
|
accelerator_args = {}
|
@@ -641,15 +716,34 @@ class Resources:
|
|
641
716
|
'Cannot specify instance type (got '
|
642
717
|
f'{self.instance_type!r}) for TPU VM.')
|
643
718
|
|
644
|
-
self._accelerators
|
645
|
-
|
719
|
+
self._accelerators: Optional[Dict[str, Union[int,
|
720
|
+
float]]] = accelerators
|
721
|
+
self._accelerator_args: Optional[Dict[str, Any]] = accelerator_args
|
722
|
+
|
723
|
+
def _set_autostop_config(
|
724
|
+
self,
|
725
|
+
autostop: Union[bool, int, Dict[str, Any], None],
|
726
|
+
) -> None:
|
727
|
+
self._autostop_config = AutostopConfig.from_yaml_config(autostop)
|
646
728
|
|
647
729
|
def is_launchable(self) -> bool:
|
730
|
+
"""Returns whether the resource is launchable."""
|
648
731
|
return self.cloud is not None and self._instance_type is not None
|
649
732
|
|
733
|
+
def assert_launchable(self) -> 'LaunchableResources':
|
734
|
+
"""A workaround to make mypy understand that is_launchable() is true.
|
735
|
+
|
736
|
+
Note: The `cast` to `LaunchableResources` is only for static type
|
737
|
+
checking with MyPy. At runtime, the Python interpreter does not enforce
|
738
|
+
types, and the returned object will still be an instance of `Resources`.
|
739
|
+
"""
|
740
|
+
assert self.is_launchable(), self
|
741
|
+
return typing.cast(LaunchableResources, self)
|
742
|
+
|
650
743
|
def need_cleanup_after_preemption_or_failure(self) -> bool:
|
651
744
|
"""Whether a resource needs cleanup after preemption or failure."""
|
652
745
|
assert self.is_launchable(), self
|
746
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
653
747
|
return self.cloud.need_cleanup_after_preemption_or_failure(self)
|
654
748
|
|
655
749
|
def _try_canonicalize_accelerators(self) -> None:
|
@@ -706,10 +800,10 @@ class Resources:
|
|
706
800
|
else:
|
707
801
|
table = log_utils.create_table(['Cloud', 'Hint'])
|
708
802
|
table.add_row(['-----', '----'])
|
709
|
-
for
|
803
|
+
for cloud_msg, error in cloud_to_errors.items():
|
710
804
|
reason_str = '\n'.join(textwrap.wrap(
|
711
805
|
str(error), 80))
|
712
|
-
table.add_row([
|
806
|
+
table.add_row([cloud_msg, reason_str])
|
713
807
|
hint = table.get_string()
|
714
808
|
raise ValueError(
|
715
809
|
f'Invalid (region {self._region!r}, zone '
|
@@ -741,11 +835,13 @@ class Resources:
|
|
741
835
|
ssh_proxy_command dict with region names as keys).
|
742
836
|
"""
|
743
837
|
assert self.is_launchable(), self
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
838
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
839
|
+
assert self._instance_type is not None, (
|
840
|
+
'Instance type must be specified')
|
841
|
+
regions = self.cloud.regions_with_offering(self._instance_type,
|
842
|
+
self.accelerators,
|
843
|
+
self._use_spot, self._region,
|
844
|
+
self._zone)
|
749
845
|
if self._image_id is not None and None not in self._image_id:
|
750
846
|
regions = [r for r in regions if r.name in self._image_id]
|
751
847
|
|
@@ -845,6 +941,10 @@ class Resources:
|
|
845
941
|
cpus, mem = self.cloud.get_vcpus_mem_from_instance_type(
|
846
942
|
self._instance_type)
|
847
943
|
if self._cpus is not None:
|
944
|
+
assert cpus is not None, (
|
945
|
+
f'Can\'t get vCPUs from instance type: '
|
946
|
+
f'{self._instance_type}, check catalog or '
|
947
|
+
f'specify cpus directly.')
|
848
948
|
if self._cpus.endswith('+'):
|
849
949
|
if cpus < float(self._cpus[:-1]):
|
850
950
|
with ux_utils.print_exception_no_traceback():
|
@@ -859,6 +959,10 @@ class Resources:
|
|
859
959
|
f'number of vCPUs. {self.instance_type} has {cpus} '
|
860
960
|
f'vCPUs, but {self._cpus} is requested.')
|
861
961
|
if self.memory is not None:
|
962
|
+
assert mem is not None, (
|
963
|
+
f'Can\'t get memory from instance type: '
|
964
|
+
f'{self._instance_type}, check catalog or '
|
965
|
+
f'specify memory directly.')
|
862
966
|
if self.memory.endswith(('+', 'x')):
|
863
967
|
if mem < float(self.memory[:-1]):
|
864
968
|
with ux_utils.print_exception_no_traceback():
|
@@ -882,6 +986,8 @@ class Resources:
|
|
882
986
|
if self._job_recovery is None or self._job_recovery['strategy'] is None:
|
883
987
|
return
|
884
988
|
# Validate the job recovery strategy
|
989
|
+
assert isinstance(self._job_recovery['strategy'],
|
990
|
+
str), 'Job recovery strategy must be a string'
|
885
991
|
registry.JOBS_RECOVERY_STRATEGY_REGISTRY.from_str(
|
886
992
|
self._job_recovery['strategy'])
|
887
993
|
|
@@ -916,7 +1022,7 @@ class Resources:
|
|
916
1022
|
'Cloud must be specified when image_id is provided.')
|
917
1023
|
|
918
1024
|
try:
|
919
|
-
self.
|
1025
|
+
self.cloud.check_features_are_supported(
|
920
1026
|
self,
|
921
1027
|
requested_features={
|
922
1028
|
clouds.CloudImplementationFeatures.IMAGE_ID
|
@@ -939,14 +1045,14 @@ class Resources:
|
|
939
1045
|
# Check the image_id's are valid.
|
940
1046
|
for region, image_id in self._image_id.items():
|
941
1047
|
if (image_id.startswith('skypilot:') and
|
942
|
-
not self.
|
1048
|
+
not self.cloud.is_image_tag_valid(image_id, region)):
|
943
1049
|
region_str = f' ({region})' if region else ''
|
944
1050
|
with ux_utils.print_exception_no_traceback():
|
945
1051
|
raise ValueError(
|
946
1052
|
f'Image tag {image_id!r} is not valid, please make sure'
|
947
1053
|
f' the tag exists in {self._cloud}{region_str}.')
|
948
1054
|
|
949
|
-
if (self.
|
1055
|
+
if (self.cloud.is_same_cloud(clouds.AWS()) and
|
950
1056
|
not image_id.startswith('skypilot:') and region is None):
|
951
1057
|
with ux_utils.print_exception_no_traceback():
|
952
1058
|
raise ValueError(
|
@@ -1051,6 +1157,9 @@ class Resources:
|
|
1051
1157
|
"""Returns cost in USD for the runtime in seconds."""
|
1052
1158
|
hours = seconds / 3600
|
1053
1159
|
# Instance.
|
1160
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
1161
|
+
assert self._instance_type is not None, (
|
1162
|
+
'Instance type must be specified')
|
1054
1163
|
hourly_cost = self.cloud.instance_type_to_hourly_cost(
|
1055
1164
|
self._instance_type, self.use_spot, self._region, self._zone)
|
1056
1165
|
# Accelerators (if any).
|
@@ -1095,6 +1204,7 @@ class Resources:
|
|
1095
1204
|
docker_image = self.extract_docker_image()
|
1096
1205
|
|
1097
1206
|
# Cloud specific variables
|
1207
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
1098
1208
|
cloud_specific_variables = self.cloud.make_deploy_resources_variables(
|
1099
1209
|
self, cluster_name, region, zones, num_nodes, dryrun)
|
1100
1210
|
|
@@ -1149,9 +1259,12 @@ class Resources:
|
|
1149
1259
|
specific_reservations = set(
|
1150
1260
|
skypilot_config.get_nested(
|
1151
1261
|
(str(self.cloud).lower(), 'specific_reservations'), set()))
|
1262
|
+
|
1263
|
+
assert (self.cloud is not None and self.instance_type is not None and
|
1264
|
+
self.region
|
1265
|
+
is not None), ('Cloud, instance type, region must be specified')
|
1152
1266
|
return self.cloud.get_reservations_available_resources(
|
1153
|
-
self.
|
1154
|
-
specific_reservations)
|
1267
|
+
self.instance_type, self.region, self.zone, specific_reservations)
|
1155
1268
|
|
1156
1269
|
def less_demanding_than(
|
1157
1270
|
self,
|
@@ -1171,6 +1284,9 @@ class Resources:
|
|
1171
1284
|
if isinstance(other, list):
|
1172
1285
|
resources_list = [self.less_demanding_than(o) for o in other]
|
1173
1286
|
return requested_num_nodes <= sum(resources_list)
|
1287
|
+
|
1288
|
+
assert other.cloud is not None, 'Other cloud must be specified'
|
1289
|
+
|
1174
1290
|
if self.cloud is not None and not self.cloud.is_same_cloud(other.cloud):
|
1175
1291
|
return False
|
1176
1292
|
# self.cloud <= other.cloud
|
@@ -1259,6 +1375,7 @@ class Resources:
|
|
1259
1375
|
If a field in `blocked` is None, it should be considered as a wildcard
|
1260
1376
|
for that field.
|
1261
1377
|
"""
|
1378
|
+
assert self.cloud is not None, 'Cloud must be specified'
|
1262
1379
|
is_matched = True
|
1263
1380
|
if (blocked.cloud is not None and
|
1264
1381
|
not self.cloud.is_same_cloud(blocked.cloud)):
|
@@ -1297,7 +1414,7 @@ class Resources:
|
|
1297
1414
|
use_spot = self.use_spot if self._use_spot_specified else None
|
1298
1415
|
|
1299
1416
|
current_override_configs = self._cluster_config_overrides
|
1300
|
-
if
|
1417
|
+
if current_override_configs is None:
|
1301
1418
|
current_override_configs = {}
|
1302
1419
|
new_override_configs = override.pop('_cluster_config_overrides', {})
|
1303
1420
|
overlaid_configs = skypilot_config.overlay_skypilot_config(
|
@@ -1310,6 +1427,10 @@ class Resources:
|
|
1310
1427
|
if elem is not None:
|
1311
1428
|
override_configs.set_nested(key, elem)
|
1312
1429
|
|
1430
|
+
current_autostop_config = None
|
1431
|
+
if self.autostop_config is not None:
|
1432
|
+
current_autostop_config = self.autostop_config.to_yaml_config()
|
1433
|
+
|
1313
1434
|
override_configs = dict(override_configs) if override_configs else None
|
1314
1435
|
resources = Resources(
|
1315
1436
|
cloud=override.pop('cloud', self.cloud),
|
@@ -1328,6 +1449,7 @@ class Resources:
|
|
1328
1449
|
disk_tier=override.pop('disk_tier', self.disk_tier),
|
1329
1450
|
ports=override.pop('ports', self.ports),
|
1330
1451
|
labels=override.pop('labels', self.labels),
|
1452
|
+
autostop=override.pop('autostop', current_autostop_config),
|
1331
1453
|
_docker_login_config=override.pop('_docker_login_config',
|
1332
1454
|
self._docker_login_config),
|
1333
1455
|
_docker_username_for_runpod=override.pop(
|
@@ -1525,6 +1647,7 @@ class Resources:
|
|
1525
1647
|
resources_fields['disk_tier'] = config.pop('disk_tier', None)
|
1526
1648
|
resources_fields['ports'] = config.pop('ports', None)
|
1527
1649
|
resources_fields['labels'] = config.pop('labels', None)
|
1650
|
+
resources_fields['autostop'] = config.pop('autostop', None)
|
1528
1651
|
resources_fields['_docker_login_config'] = config.pop(
|
1529
1652
|
'_docker_login_config', None)
|
1530
1653
|
resources_fields['_docker_username_for_runpod'] = config.pop(
|
@@ -1574,6 +1697,8 @@ class Resources:
|
|
1574
1697
|
config['disk_tier'] = self.disk_tier.value
|
1575
1698
|
add_if_not_none('ports', self.ports)
|
1576
1699
|
add_if_not_none('labels', self.labels)
|
1700
|
+
if self._autostop_config is not None:
|
1701
|
+
config['autostop'] = self._autostop_config.to_yaml_config()
|
1577
1702
|
if self._docker_login_config is not None:
|
1578
1703
|
config['_docker_login_config'] = dataclasses.asdict(
|
1579
1704
|
self._docker_login_config)
|
@@ -1729,4 +1854,42 @@ class Resources:
|
|
1729
1854
|
self._docker_username_for_runpod = state.pop(
|
1730
1855
|
'_docker_username_for_runpod', None)
|
1731
1856
|
|
1857
|
+
if version < 23:
|
1858
|
+
self._autostop_config = None
|
1859
|
+
|
1732
1860
|
self.__dict__.update(state)
|
1861
|
+
|
1862
|
+
|
1863
|
+
class LaunchableResources(Resources):
|
1864
|
+
"""A class representing resources that can be launched on a cloud provider.
|
1865
|
+
|
1866
|
+
This class is primarily a type hint for MyPy to indicate that an instance
|
1867
|
+
of `Resources` is launchable (i.e., `cloud` and `instance_type` are not
|
1868
|
+
None). It should not be instantiated directly.
|
1869
|
+
"""
|
1870
|
+
|
1871
|
+
def __init__(self, *args, **kwargs) -> None: # pylint: disable=super-init-not-called,unused-argument
|
1872
|
+
assert False, (
|
1873
|
+
'LaunchableResources should not be instantiated directly. '
|
1874
|
+
'It is only used for type checking by MyPy.')
|
1875
|
+
|
1876
|
+
@property
|
1877
|
+
def cloud(self) -> clouds.Cloud:
|
1878
|
+
assert self._cloud is not None, 'Cloud must be specified'
|
1879
|
+
return self._cloud
|
1880
|
+
|
1881
|
+
@property
|
1882
|
+
def instance_type(self) -> str:
|
1883
|
+
assert self._instance_type is not None, (
|
1884
|
+
'Instance type must be specified')
|
1885
|
+
return self._instance_type
|
1886
|
+
|
1887
|
+
def copy(self, **override) -> 'LaunchableResources':
|
1888
|
+
"""Ensure MyPy understands the return type is LaunchableResources.
|
1889
|
+
|
1890
|
+
This method is not expected to be called at runtime, as
|
1891
|
+
LaunchableResources should not be directly instantiated. It primarily
|
1892
|
+
serves as a type hint for static analysis.
|
1893
|
+
"""
|
1894
|
+
self.assert_launchable()
|
1895
|
+
return typing.cast(LaunchableResources, super().copy(**override))
|
sky/serve/server/core.py
CHANGED
@@ -219,17 +219,12 @@ def up(
|
|
219
219
|
# whether the service is already running. If the id is the same
|
220
220
|
# with the current job id, we know the service is up and running
|
221
221
|
# for the first time; otherwise it is a name conflict.
|
222
|
-
controller_idle_minutes_to_autostop, controller_down = (
|
223
|
-
controller_utils.get_controller_autostop_config(
|
224
|
-
controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER))
|
225
222
|
# Since the controller may be shared among multiple users, launch the
|
226
223
|
# controller with the API server's user hash.
|
227
224
|
with common.with_server_user_hash():
|
228
225
|
controller_job_id, controller_handle = execution.launch(
|
229
226
|
task=controller_task,
|
230
227
|
cluster_name=controller_name,
|
231
|
-
idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
|
232
|
-
down=controller_down,
|
233
228
|
retry_until_up=True,
|
234
229
|
_disable_controller_check=True,
|
235
230
|
)
|
sky/serve/spot_placer.py
CHANGED
@@ -46,6 +46,8 @@ class Location:
|
|
46
46
|
|
47
47
|
@classmethod
|
48
48
|
def from_resources(cls, resources: 'resources_lib.Resources') -> 'Location':
|
49
|
+
assert resources.cloud is not None, 'Cloud must be specified'
|
50
|
+
assert resources.region is not None, 'Region must be specified'
|
49
51
|
return cls(resources.cloud, resources.region, resources.zone)
|
50
52
|
|
51
53
|
def to_dict(self) -> Dict[str, Any]:
|
@@ -147,6 +149,7 @@ def _get_possible_location_from_task(task: 'task_lib.Task') -> List[Location]:
|
|
147
149
|
cloud_str = str(launchable.cloud)
|
148
150
|
region = launchable.region
|
149
151
|
zone = launchable.zone
|
152
|
+
assert region is not None, 'Region must be specified'
|
150
153
|
if (cloud_str not in location_requirements and
|
151
154
|
location_requirements):
|
152
155
|
continue
|
sky/server/server.py
CHANGED
@@ -9,6 +9,7 @@ import logging
|
|
9
9
|
import multiprocessing
|
10
10
|
import os
|
11
11
|
import pathlib
|
12
|
+
import posixpath
|
12
13
|
import re
|
13
14
|
import shutil
|
14
15
|
import sys
|
@@ -167,8 +168,36 @@ class InternalDashboardPrefixMiddleware(
|
|
167
168
|
return await call_next(request)
|
168
169
|
|
169
170
|
|
171
|
+
class CacheControlStaticMiddleware(starlette.middleware.base.BaseHTTPMiddleware
|
172
|
+
):
|
173
|
+
"""Middleware to add cache control headers to static files."""
|
174
|
+
|
175
|
+
async def dispatch(self, request: fastapi.Request, call_next):
|
176
|
+
if request.url.path.startswith('/dashboard/_next'):
|
177
|
+
response = await call_next(request)
|
178
|
+
response.headers['Cache-Control'] = 'max-age=3600'
|
179
|
+
return response
|
180
|
+
return await call_next(request)
|
181
|
+
|
182
|
+
|
183
|
+
class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
184
|
+
"""Middleware to check the path of requests."""
|
185
|
+
|
186
|
+
async def dispatch(self, request: fastapi.Request, call_next):
|
187
|
+
if request.url.path.startswith('/dashboard/'):
|
188
|
+
# If the requested path is not relative to the expected directory,
|
189
|
+
# then the user is attempting path traversal, so deny the request.
|
190
|
+
parent = pathlib.Path('/dashboard')
|
191
|
+
request_path = pathlib.Path(posixpath.normpath(request.url.path))
|
192
|
+
if not _is_relative_to(request_path, parent):
|
193
|
+
raise fastapi.HTTPException(status_code=403, detail='Forbidden')
|
194
|
+
return await call_next(request)
|
195
|
+
|
196
|
+
|
170
197
|
app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
|
171
198
|
app.add_middleware(InternalDashboardPrefixMiddleware)
|
199
|
+
app.add_middleware(PathCleanMiddleware)
|
200
|
+
app.add_middleware(CacheControlStaticMiddleware)
|
172
201
|
app.add_middleware(
|
173
202
|
cors.CORSMiddleware,
|
174
203
|
# TODO(zhwu): in production deployment, we should restrict the allowed
|
@@ -1130,25 +1159,28 @@ async def complete_storage_name(incomplete: str,) -> List[str]:
|
|
1130
1159
|
return global_user_state.get_storage_names_start_with(incomplete)
|
1131
1160
|
|
1132
1161
|
|
1133
|
-
|
1134
|
-
|
1135
|
-
|
1136
|
-
"""Serves static files for any unmatched routes.
|
1162
|
+
@app.get('/dashboard/{full_path:path}')
|
1163
|
+
async def serve_dashboard(full_path: str):
|
1164
|
+
"""Serves the Next.js dashboard application.
|
1137
1165
|
|
1138
|
-
|
1139
|
-
|
1140
|
-
|
1141
|
-
|
1142
|
-
|
1166
|
+
Args:
|
1167
|
+
full_path: The path requested by the client.
|
1168
|
+
e.g. /clusters, /jobs
|
1169
|
+
|
1170
|
+
Returns:
|
1171
|
+
FileResponse for static files or index.html for client-side routing.
|
1143
1172
|
|
1144
|
-
|
1173
|
+
Raises:
|
1174
|
+
HTTPException: If the path is invalid or file not found.
|
1175
|
+
"""
|
1176
|
+
# Try to serve the staticfile directly e.g. /skypilot.svg,
|
1177
|
+
# /favicon.ico, and /_next/, etc.
|
1145
1178
|
file_path = os.path.join(server_constants.DASHBOARD_DIR, full_path)
|
1146
1179
|
if os.path.isfile(file_path):
|
1147
1180
|
return fastapi.responses.FileResponse(file_path)
|
1148
1181
|
|
1149
|
-
#
|
1150
|
-
#
|
1151
|
-
# client will be redirected to the index.html.
|
1182
|
+
# Serve index.html for client-side routing
|
1183
|
+
# e.g. /clusters, /jobs
|
1152
1184
|
index_path = os.path.join(server_constants.DASHBOARD_DIR, 'index.html')
|
1153
1185
|
try:
|
1154
1186
|
with open(index_path, 'r', encoding='utf-8') as f:
|
@@ -1159,6 +1191,12 @@ async def serve_static_or_dashboard(full_path: str):
|
|
1159
1191
|
raise fastapi.HTTPException(status_code=500, detail=str(e))
|
1160
1192
|
|
1161
1193
|
|
1194
|
+
# Redirect the root path to dashboard
|
1195
|
+
@app.get('/')
|
1196
|
+
async def root():
|
1197
|
+
return fastapi.responses.RedirectResponse(url='/dashboard/')
|
1198
|
+
|
1199
|
+
|
1162
1200
|
if __name__ == '__main__':
|
1163
1201
|
import uvicorn
|
1164
1202
|
|