skypilot-nightly 1.0.0.dev20250513__py3-none-any.whl → 1.0.0.dev20250515__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +3 -2
  3. sky/backends/backend_utils.py +16 -17
  4. sky/backends/cloud_vm_ray_backend.py +47 -16
  5. sky/clouds/aws.py +11 -9
  6. sky/clouds/azure.py +16 -13
  7. sky/clouds/cloud.py +4 -3
  8. sky/clouds/cudo.py +3 -2
  9. sky/clouds/do.py +3 -2
  10. sky/clouds/fluidstack.py +3 -3
  11. sky/clouds/gcp.py +25 -9
  12. sky/clouds/ibm.py +12 -10
  13. sky/clouds/kubernetes.py +3 -2
  14. sky/clouds/lambda_cloud.py +6 -6
  15. sky/clouds/nebius.py +6 -5
  16. sky/clouds/oci.py +9 -7
  17. sky/clouds/paperspace.py +3 -2
  18. sky/clouds/runpod.py +9 -9
  19. sky/clouds/scp.py +5 -3
  20. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +33 -11
  21. sky/clouds/service_catalog/gcp_catalog.py +7 -1
  22. sky/clouds/vast.py +8 -7
  23. sky/clouds/vsphere.py +4 -2
  24. sky/core.py +18 -12
  25. sky/dashboard/out/404.html +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +1 -0
  27. sky/dashboard/out/_next/static/{2dkponv64SfFShA8Rnw0D → jFI0Y-uJZ_XDK5IGJpKFU}/_buildManifest.js +1 -1
  28. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  29. sky/dashboard/out/clusters/[cluster].html +1 -1
  30. sky/dashboard/out/clusters.html +1 -1
  31. sky/dashboard/out/index.html +1 -1
  32. sky/dashboard/out/jobs/[job].html +1 -1
  33. sky/dashboard/out/jobs.html +1 -1
  34. sky/execution.py +33 -0
  35. sky/jobs/recovery_strategy.py +4 -1
  36. sky/jobs/server/core.py +6 -12
  37. sky/optimizer.py +19 -13
  38. sky/provision/kubernetes/utils.py +26 -1
  39. sky/resources.py +206 -43
  40. sky/serve/server/core.py +0 -5
  41. sky/serve/spot_placer.py +3 -0
  42. sky/server/server.py +51 -13
  43. sky/skylet/log_lib.py +12 -3
  44. sky/skylet/log_lib.pyi +5 -0
  45. sky/task.py +8 -6
  46. sky/templates/nebius-ray.yml.j2 +3 -1
  47. sky/utils/cli_utils/status_utils.py +6 -5
  48. sky/utils/controller_utils.py +39 -43
  49. sky/utils/dag_utils.py +4 -2
  50. sky/utils/resources_utils.py +3 -0
  51. sky/utils/schemas.py +33 -24
  52. {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/METADATA +1 -1
  53. {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/RECORD +58 -58
  54. {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/WHEEL +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  56. /sky/dashboard/out/_next/static/{2dkponv64SfFShA8Rnw0D → jFI0Y-uJZ_XDK5IGJpKFU}/_ssgManifest.js +0 -0
  57. {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/entry_points.txt +0 -0
  58. {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/licenses/LICENSE +0 -0
  59. {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/top_level.txt +0 -0
sky/resources.py CHANGED
@@ -1,7 +1,8 @@
1
1
  """Resources: compute requirements of Tasks."""
2
2
  import dataclasses
3
3
  import textwrap
4
- from typing import Any, Dict, List, Optional, Set, Tuple, Union
4
+ import typing
5
+ from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
5
6
 
6
7
  import colorama
7
8
 
@@ -34,6 +35,50 @@ RESOURCE_CONFIG_ALIASES = {
34
35
  }
35
36
 
36
37
 
38
@dataclasses.dataclass
class AutostopConfig:
    """Autostop settings requested for a set of resources.

    The YAML form never spells out ``enabled``: a literal ``false`` means
    disabled, while ``true``, an integer, or a mapping all mean enabled.
    """
    # Not part of the YAML config; kept so the object is self-describing.
    enabled: bool
    # The two fields below are ignored when `enabled` is False.
    idle_minutes: int = 5
    down: bool = False

    def to_yaml_config(self) -> Union[Literal[False], Dict[str, Any]]:
        """Serialize to the YAML form.

        Returns:
            ``False`` when autostop is disabled, otherwise a mapping with
            the ``idle_minutes`` and ``down`` keys.
        """
        if self.enabled:
            return {
                'idle_minutes': self.idle_minutes,
                'down': self.down,
            }
        return False

    @classmethod
    def from_yaml_config(
        cls, config: Union[bool, int, Dict[str, Any], None]
    ) -> Optional['AutostopConfig']:
        """Build a config from the value found under ``autostop`` in YAML.

        Args:
            config: A bool (on/off with defaults), an int (idle minutes),
                a mapping with optional ``idle_minutes`` / ``down`` keys,
                or None.

        Returns:
            The parsed config, or None when ``config`` is None or of any
            other unrecognized type.
        """
        # bool must be tested first: bool is a subclass of int.
        if isinstance(config, bool):
            return cls(enabled=config)

        if isinstance(config, int):
            return cls(enabled=True, idle_minutes=config, down=False)

        if isinstance(config, dict):
            # A mapping always enables autostop; the only way to disable
            # it is the literal `false`. Keys that are absent keep the
            # dataclass defaults.
            overrides = {
                key: config[key]
                for key in ('idle_minutes', 'down')
                if key in config
            }
            return cls(enabled=True, **overrides)

        return None
80
+
81
+
37
82
  class Resources:
38
83
  """Resources: compute requirements of Tasks.
39
84
 
@@ -51,7 +96,7 @@ class Resources:
51
96
  """
52
97
  # If any fields changed, increment the version. For backward compatibility,
53
98
  # modify the __setstate__ method to handle the old version.
54
- _VERSION = 22
99
+ _VERSION = 23
55
100
 
56
101
  def __init__(
57
102
  self,
@@ -59,17 +104,19 @@ class Resources:
59
104
  instance_type: Optional[str] = None,
60
105
  cpus: Union[None, int, float, str] = None,
61
106
  memory: Union[None, int, float, str] = None,
62
- accelerators: Union[None, str, Dict[str, int]] = None,
107
+ accelerators: Union[None, str, Dict[str, Union[int, float]]] = None,
63
108
  accelerator_args: Optional[Dict[str, str]] = None,
64
109
  use_spot: Optional[bool] = None,
65
- job_recovery: Optional[Union[Dict[str, Union[str, int]], str]] = None,
110
+ job_recovery: Optional[Union[Dict[str, Optional[Union[str, int]]],
111
+ str]] = None,
66
112
  region: Optional[str] = None,
67
113
  zone: Optional[str] = None,
68
- image_id: Union[Dict[str, str], str, None] = None,
114
+ image_id: Union[Dict[Optional[str], str], str, None] = None,
69
115
  disk_size: Optional[int] = None,
70
116
  disk_tier: Optional[Union[str, resources_utils.DiskTier]] = None,
71
117
  ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
72
118
  labels: Optional[Dict[str, str]] = None,
119
+ autostop: Union[bool, int, Dict[str, Any], None] = None,
73
120
  # Internal use only.
74
121
  # pylint: disable=invalid-name
75
122
  _docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
@@ -152,6 +199,8 @@ class Resources:
152
199
  instance tags. On GCP, labels map to instance labels. On
153
200
  Kubernetes, labels map to pod labels. On other clouds, labels are
154
201
  not supported and will be ignored.
202
+ autostop: the autostop configuration to use. For launched resources,
203
+ may or may not correspond to the actual current autostop config.
155
204
  _docker_login_config: the docker configuration to use. This includes
156
205
  the docker username, password, and registry server. If None, skip
157
206
  docker login.
@@ -177,7 +226,8 @@ class Resources:
177
226
 
178
227
  self._use_spot_specified = use_spot is not None
179
228
  self._use_spot = use_spot if use_spot is not None else False
180
- self._job_recovery: Optional[Dict[str, Union[str, int]]] = None
229
+ self._job_recovery: Optional[Dict[str, Optional[Union[str,
230
+ int]]]] = None
181
231
  if job_recovery is not None:
182
232
  if isinstance(job_recovery, str):
183
233
  job_recovery = {'strategy': job_recovery}
@@ -188,7 +238,7 @@ class Resources:
188
238
  if strategy_name == 'none':
189
239
  self._job_recovery = None
190
240
  else:
191
- if strategy_name is not None:
241
+ if isinstance(strategy_name, str):
192
242
  job_recovery['strategy'] = strategy_name.upper()
193
243
  self._job_recovery = job_recovery
194
244
 
@@ -201,7 +251,7 @@ class Resources:
201
251
  else:
202
252
  self._disk_size = _DEFAULT_DISK_SIZE_GB
203
253
 
204
- self._image_id = image_id
254
+ self._image_id: Optional[Dict[Optional[str], str]] = None
205
255
  if isinstance(image_id, str):
206
256
  self._image_id = {self._region: image_id.strip()}
207
257
  elif isinstance(image_id, dict):
@@ -209,8 +259,11 @@ class Resources:
209
259
  self._image_id = {self._region: image_id[None].strip()}
210
260
  else:
211
261
  self._image_id = {
212
- k.strip(): v.strip() for k, v in image_id.items()
262
+ typing.cast(str, k).strip(): v.strip()
263
+ for k, v in image_id.items()
213
264
  }
265
+ else:
266
+ self._image_id = image_id
214
267
  self._is_image_managed = _is_image_managed
215
268
 
216
269
  if isinstance(disk_tier, str):
@@ -228,7 +281,7 @@ class Resources:
228
281
  if isinstance(ports, tuple):
229
282
  ports = list(ports)
230
283
  if not isinstance(ports, list):
231
- ports = [ports]
284
+ ports = [str(ports)]
232
285
  ports = resources_utils.simplify_ports(
233
286
  [str(port) for port in ports])
234
287
  if not ports:
@@ -250,11 +303,12 @@ class Resources:
250
303
  self._requires_fuse = _requires_fuse
251
304
 
252
305
  self._cluster_config_overrides = _cluster_config_overrides
253
- self._cached_repr = None
306
+ self._cached_repr: Optional[str] = None
254
307
 
255
308
  self._set_cpus(cpus)
256
309
  self._set_memory(memory)
257
310
  self._set_accelerators(accelerators, accelerator_args)
311
+ self._set_autostop_config(autostop)
258
312
 
259
313
  def validate(self):
260
314
  """Validate the resources and infer the missing fields if possible."""
@@ -378,19 +432,19 @@ class Resources:
378
432
  return repr_str
379
433
 
380
434
  @property
381
- def cloud(self):
435
+ def cloud(self) -> Optional[clouds.Cloud]:
382
436
  return self._cloud
383
437
 
384
438
  @property
385
- def region(self):
439
+ def region(self) -> Optional[str]:
386
440
  return self._region
387
441
 
388
442
  @property
389
- def zone(self):
443
+ def zone(self) -> Optional[str]:
390
444
  return self._zone
391
445
 
392
446
  @property
393
- def instance_type(self):
447
+ def instance_type(self) -> Optional[str]:
394
448
  return self._instance_type
395
449
 
396
450
  @property
@@ -444,7 +498,7 @@ class Resources:
444
498
  return None
445
499
 
446
500
  @property
447
- def accelerator_args(self) -> Optional[Dict[str, str]]:
501
+ def accelerator_args(self) -> Optional[Dict[str, Any]]:
448
502
  return self._accelerator_args
449
503
 
450
504
  @property
@@ -456,7 +510,7 @@ class Resources:
456
510
  return self._use_spot_specified
457
511
 
458
512
  @property
459
- def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
513
+ def job_recovery(self) -> Optional[Dict[str, Optional[Union[str, int]]]]:
460
514
  return self._job_recovery
461
515
 
462
516
  @property
@@ -464,11 +518,11 @@ class Resources:
464
518
  return self._disk_size
465
519
 
466
520
  @property
467
- def image_id(self) -> Optional[Dict[str, str]]:
521
+ def image_id(self) -> Optional[Dict[Optional[str], str]]:
468
522
  return self._image_id
469
523
 
470
524
  @property
471
- def disk_tier(self) -> resources_utils.DiskTier:
525
+ def disk_tier(self) -> Optional[resources_utils.DiskTier]:
472
526
  return self._disk_tier
473
527
 
474
528
  @property
@@ -479,6 +533,16 @@ class Resources:
479
533
  def labels(self) -> Optional[Dict[str, str]]:
480
534
  return self._labels
481
535
 
536
@property
def autostop_config(self) -> Optional[AutostopConfig]:
    """Return the autostop config requested for these resources.

    Warning: this is the config the resources were originally launched
    with; it may differ from the autostop setting currently in effect.
    """
    requested = self._autostop_config
    return requested
545
+
482
546
  @property
483
547
  def is_image_managed(self) -> Optional[bool]:
484
548
  return self._is_image_managed
@@ -489,15 +553,25 @@ class Resources:
489
553
  return False
490
554
  return self._requires_fuse
491
555
 
556
def set_requires_fuse(self, value: bool) -> None:
    """Record whether this resource needs FUSE mounting support.

    Args:
        value: True if FUSE mounting support is required.
    """
    # TODO(zeping): Mutating in place breaks the immutability of
    # Resources; refactor callers to go through Resources.copy instead.
    self._requires_fuse = value
565
+
492
566
  @property
493
567
  def cluster_config_overrides(self) -> Dict[str, Any]:
494
568
  if self._cluster_config_overrides is None:
495
569
  return {}
496
570
  return self._cluster_config_overrides
497
571
 
498
- @requires_fuse.setter
499
- def requires_fuse(self, value: Optional[bool]) -> None:
500
- self._requires_fuse = value
572
@property
def docker_login_config(self) -> Optional[docker_utils.DockerLoginConfig]:
    """Return the stored docker login config, or None if none was set."""
    login_config = self._docker_login_config
    return login_config
501
575
 
502
576
  @property
503
577
  def docker_username_for_runpod(self) -> Optional[str]:
@@ -568,8 +642,8 @@ class Resources:
568
642
 
569
643
  def _set_accelerators(
570
644
  self,
571
- accelerators: Union[None, str, Dict[str, int]],
572
- accelerator_args: Optional[Dict[str, str]],
645
+ accelerators: Union[None, str, Dict[str, Union[int, float]]],
646
+ accelerator_args: Optional[Dict[str, Any]],
573
647
  ) -> None:
574
648
  """Sets accelerators.
575
649
 
@@ -604,10 +678,11 @@ class Resources:
604
678
  self._cloud = clouds.Kubernetes()
605
679
  else:
606
680
  self._cloud = clouds.GCP()
607
- assert (self.cloud.is_same_cloud(clouds.GCP()) or
608
- self.cloud.is_same_cloud(clouds.Kubernetes())), (
609
- 'Cloud must be GCP or Kubernetes for TPU '
610
- 'accelerators.')
681
+ assert self.cloud is not None and (
682
+ self.cloud.is_same_cloud(clouds.GCP()) or
683
+ self.cloud.is_same_cloud(clouds.Kubernetes())), (
684
+ 'Cloud must be GCP or Kubernetes for TPU '
685
+ 'accelerators.')
611
686
 
612
687
  if accelerator_args is None:
613
688
  accelerator_args = {}
@@ -641,15 +716,34 @@ class Resources:
641
716
  'Cannot specify instance type (got '
642
717
  f'{self.instance_type!r}) for TPU VM.')
643
718
 
644
- self._accelerators = accelerators
645
- self._accelerator_args = accelerator_args
719
+ self._accelerators: Optional[Dict[str, Union[int,
720
+ float]]] = accelerators
721
+ self._accelerator_args: Optional[Dict[str, Any]] = accelerator_args
722
+
723
def _set_autostop_config(
    self,
    autostop: Union[bool, int, Dict[str, Any], None],
) -> None:
    """Parse the raw ``autostop`` value and store the result.

    Args:
        autostop: The value as accepted by
            ``AutostopConfig.from_yaml_config`` (bool, int, mapping or
            None).
    """
    parsed = AutostopConfig.from_yaml_config(autostop)
    self._autostop_config = parsed
646
728
 
647
729
  def is_launchable(self) -> bool:
730
+ """Returns whether the resource is launchable."""
648
731
  return self.cloud is not None and self._instance_type is not None
649
732
 
733
def assert_launchable(self) -> 'LaunchableResources':
    """Assert ``is_launchable()`` and return self typed as launchable.

    This exists so that mypy can narrow the type. The `cast` has no
    runtime effect: the returned object is this same instance, still a
    plain `Resources`.
    """
    launchable = self.is_launchable()
    assert launchable, self
    return typing.cast(LaunchableResources, self)
742
+
650
743
  def need_cleanup_after_preemption_or_failure(self) -> bool:
651
744
  """Whether a resource needs cleanup after preemption or failure."""
652
745
  assert self.is_launchable(), self
746
+ assert self.cloud is not None, 'Cloud must be specified'
653
747
  return self.cloud.need_cleanup_after_preemption_or_failure(self)
654
748
 
655
749
  def _try_canonicalize_accelerators(self) -> None:
@@ -706,10 +800,10 @@ class Resources:
706
800
  else:
707
801
  table = log_utils.create_table(['Cloud', 'Hint'])
708
802
  table.add_row(['-----', '----'])
709
- for cloud, error in cloud_to_errors.items():
803
+ for cloud_msg, error in cloud_to_errors.items():
710
804
  reason_str = '\n'.join(textwrap.wrap(
711
805
  str(error), 80))
712
- table.add_row([str(cloud), reason_str])
806
+ table.add_row([cloud_msg, reason_str])
713
807
  hint = table.get_string()
714
808
  raise ValueError(
715
809
  f'Invalid (region {self._region!r}, zone '
@@ -741,11 +835,13 @@ class Resources:
741
835
  ssh_proxy_command dict with region names as keys).
742
836
  """
743
837
  assert self.is_launchable(), self
744
-
745
- regions = self._cloud.regions_with_offering(self._instance_type,
746
- self.accelerators,
747
- self._use_spot,
748
- self._region, self._zone)
838
+ assert self.cloud is not None, 'Cloud must be specified'
839
+ assert self._instance_type is not None, (
840
+ 'Instance type must be specified')
841
+ regions = self.cloud.regions_with_offering(self._instance_type,
842
+ self.accelerators,
843
+ self._use_spot, self._region,
844
+ self._zone)
749
845
  if self._image_id is not None and None not in self._image_id:
750
846
  regions = [r for r in regions if r.name in self._image_id]
751
847
 
@@ -845,6 +941,10 @@ class Resources:
845
941
  cpus, mem = self.cloud.get_vcpus_mem_from_instance_type(
846
942
  self._instance_type)
847
943
  if self._cpus is not None:
944
+ assert cpus is not None, (
945
+ f'Can\'t get vCPUs from instance type: '
946
+ f'{self._instance_type}, check catalog or '
947
+ f'specify cpus directly.')
848
948
  if self._cpus.endswith('+'):
849
949
  if cpus < float(self._cpus[:-1]):
850
950
  with ux_utils.print_exception_no_traceback():
@@ -859,6 +959,10 @@ class Resources:
859
959
  f'number of vCPUs. {self.instance_type} has {cpus} '
860
960
  f'vCPUs, but {self._cpus} is requested.')
861
961
  if self.memory is not None:
962
+ assert mem is not None, (
963
+ f'Can\'t get memory from instance type: '
964
+ f'{self._instance_type}, check catalog or '
965
+ f'specify memory directly.')
862
966
  if self.memory.endswith(('+', 'x')):
863
967
  if mem < float(self.memory[:-1]):
864
968
  with ux_utils.print_exception_no_traceback():
@@ -882,6 +986,8 @@ class Resources:
882
986
  if self._job_recovery is None or self._job_recovery['strategy'] is None:
883
987
  return
884
988
  # Validate the job recovery strategy
989
+ assert isinstance(self._job_recovery['strategy'],
990
+ str), 'Job recovery strategy must be a string'
885
991
  registry.JOBS_RECOVERY_STRATEGY_REGISTRY.from_str(
886
992
  self._job_recovery['strategy'])
887
993
 
@@ -916,7 +1022,7 @@ class Resources:
916
1022
  'Cloud must be specified when image_id is provided.')
917
1023
 
918
1024
  try:
919
- self._cloud.check_features_are_supported(
1025
+ self.cloud.check_features_are_supported(
920
1026
  self,
921
1027
  requested_features={
922
1028
  clouds.CloudImplementationFeatures.IMAGE_ID
@@ -939,14 +1045,14 @@ class Resources:
939
1045
  # Check the image_id's are valid.
940
1046
  for region, image_id in self._image_id.items():
941
1047
  if (image_id.startswith('skypilot:') and
942
- not self._cloud.is_image_tag_valid(image_id, region)):
1048
+ not self.cloud.is_image_tag_valid(image_id, region)):
943
1049
  region_str = f' ({region})' if region else ''
944
1050
  with ux_utils.print_exception_no_traceback():
945
1051
  raise ValueError(
946
1052
  f'Image tag {image_id!r} is not valid, please make sure'
947
1053
  f' the tag exists in {self._cloud}{region_str}.')
948
1054
 
949
- if (self._cloud.is_same_cloud(clouds.AWS()) and
1055
+ if (self.cloud.is_same_cloud(clouds.AWS()) and
950
1056
  not image_id.startswith('skypilot:') and region is None):
951
1057
  with ux_utils.print_exception_no_traceback():
952
1058
  raise ValueError(
@@ -1051,6 +1157,9 @@ class Resources:
1051
1157
  """Returns cost in USD for the runtime in seconds."""
1052
1158
  hours = seconds / 3600
1053
1159
  # Instance.
1160
+ assert self.cloud is not None, 'Cloud must be specified'
1161
+ assert self._instance_type is not None, (
1162
+ 'Instance type must be specified')
1054
1163
  hourly_cost = self.cloud.instance_type_to_hourly_cost(
1055
1164
  self._instance_type, self.use_spot, self._region, self._zone)
1056
1165
  # Accelerators (if any).
@@ -1095,6 +1204,7 @@ class Resources:
1095
1204
  docker_image = self.extract_docker_image()
1096
1205
 
1097
1206
  # Cloud specific variables
1207
+ assert self.cloud is not None, 'Cloud must be specified'
1098
1208
  cloud_specific_variables = self.cloud.make_deploy_resources_variables(
1099
1209
  self, cluster_name, region, zones, num_nodes, dryrun)
1100
1210
 
@@ -1149,9 +1259,12 @@ class Resources:
1149
1259
  specific_reservations = set(
1150
1260
  skypilot_config.get_nested(
1151
1261
  (str(self.cloud).lower(), 'specific_reservations'), set()))
1262
+
1263
+ assert (self.cloud is not None and self.instance_type is not None and
1264
+ self.region
1265
+ is not None), ('Cloud, instance type, region must be specified')
1152
1266
  return self.cloud.get_reservations_available_resources(
1153
- self._instance_type, self._region, self._zone,
1154
- specific_reservations)
1267
+ self.instance_type, self.region, self.zone, specific_reservations)
1155
1268
 
1156
1269
  def less_demanding_than(
1157
1270
  self,
@@ -1171,6 +1284,9 @@ class Resources:
1171
1284
  if isinstance(other, list):
1172
1285
  resources_list = [self.less_demanding_than(o) for o in other]
1173
1286
  return requested_num_nodes <= sum(resources_list)
1287
+
1288
+ assert other.cloud is not None, 'Other cloud must be specified'
1289
+
1174
1290
  if self.cloud is not None and not self.cloud.is_same_cloud(other.cloud):
1175
1291
  return False
1176
1292
  # self.cloud <= other.cloud
@@ -1259,6 +1375,7 @@ class Resources:
1259
1375
  If a field in `blocked` is None, it should be considered as a wildcard
1260
1376
  for that field.
1261
1377
  """
1378
+ assert self.cloud is not None, 'Cloud must be specified'
1262
1379
  is_matched = True
1263
1380
  if (blocked.cloud is not None and
1264
1381
  not self.cloud.is_same_cloud(blocked.cloud)):
@@ -1297,7 +1414,7 @@ class Resources:
1297
1414
  use_spot = self.use_spot if self._use_spot_specified else None
1298
1415
 
1299
1416
  current_override_configs = self._cluster_config_overrides
1300
- if self._cluster_config_overrides is None:
1417
+ if current_override_configs is None:
1301
1418
  current_override_configs = {}
1302
1419
  new_override_configs = override.pop('_cluster_config_overrides', {})
1303
1420
  overlaid_configs = skypilot_config.overlay_skypilot_config(
@@ -1310,6 +1427,10 @@ class Resources:
1310
1427
  if elem is not None:
1311
1428
  override_configs.set_nested(key, elem)
1312
1429
 
1430
+ current_autostop_config = None
1431
+ if self.autostop_config is not None:
1432
+ current_autostop_config = self.autostop_config.to_yaml_config()
1433
+
1313
1434
  override_configs = dict(override_configs) if override_configs else None
1314
1435
  resources = Resources(
1315
1436
  cloud=override.pop('cloud', self.cloud),
@@ -1328,6 +1449,7 @@ class Resources:
1328
1449
  disk_tier=override.pop('disk_tier', self.disk_tier),
1329
1450
  ports=override.pop('ports', self.ports),
1330
1451
  labels=override.pop('labels', self.labels),
1452
+ autostop=override.pop('autostop', current_autostop_config),
1331
1453
  _docker_login_config=override.pop('_docker_login_config',
1332
1454
  self._docker_login_config),
1333
1455
  _docker_username_for_runpod=override.pop(
@@ -1525,6 +1647,7 @@ class Resources:
1525
1647
  resources_fields['disk_tier'] = config.pop('disk_tier', None)
1526
1648
  resources_fields['ports'] = config.pop('ports', None)
1527
1649
  resources_fields['labels'] = config.pop('labels', None)
1650
+ resources_fields['autostop'] = config.pop('autostop', None)
1528
1651
  resources_fields['_docker_login_config'] = config.pop(
1529
1652
  '_docker_login_config', None)
1530
1653
  resources_fields['_docker_username_for_runpod'] = config.pop(
@@ -1574,6 +1697,8 @@ class Resources:
1574
1697
  config['disk_tier'] = self.disk_tier.value
1575
1698
  add_if_not_none('ports', self.ports)
1576
1699
  add_if_not_none('labels', self.labels)
1700
+ if self._autostop_config is not None:
1701
+ config['autostop'] = self._autostop_config.to_yaml_config()
1577
1702
  if self._docker_login_config is not None:
1578
1703
  config['_docker_login_config'] = dataclasses.asdict(
1579
1704
  self._docker_login_config)
@@ -1729,4 +1854,42 @@ class Resources:
1729
1854
  self._docker_username_for_runpod = state.pop(
1730
1855
  '_docker_username_for_runpod', None)
1731
1856
 
1857
+ if version < 23:
1858
+ self._autostop_config = None
1859
+
1732
1860
  self.__dict__.update(state)
1861
+
1862
+
1863
class LaunchableResources(Resources):
    """Static-typing view of a `Resources` that is known to be launchable.

    "Launchable" means `cloud` and `instance_type` are not None. The class
    exists so that mypy can see the narrowed, non-Optional property types;
    it must never be instantiated directly.
    """

    def __init__(self, *args, **kwargs) -> None:  # pylint: disable=super-init-not-called,unused-argument
        assert False, (
            'LaunchableResources should not be instantiated directly. '
            'It is only used for type checking by MyPy.')

    @property
    def cloud(self) -> clouds.Cloud:
        """The cloud, asserted to be set."""
        cloud = self._cloud
        assert cloud is not None, 'Cloud must be specified'
        return cloud

    @property
    def instance_type(self) -> str:
        """The instance type, asserted to be set."""
        instance_type = self._instance_type
        assert instance_type is not None, (
            'Instance type must be specified')
        return instance_type

    def copy(self, **override) -> 'LaunchableResources':
        """Copy with overrides, typed as `LaunchableResources` for mypy.

        Not expected to be reached through a real `LaunchableResources`
        instance, since the class is never instantiated; it mainly serves
        static analysis.
        """
        self.assert_launchable()
        copied = super().copy(**override)
        return typing.cast(LaunchableResources, copied)
sky/serve/server/core.py CHANGED
@@ -219,17 +219,12 @@ def up(
219
219
  # whether the service is already running. If the id is the same
220
220
  # with the current job id, we know the service is up and running
221
221
  # for the first time; otherwise it is a name conflict.
222
- controller_idle_minutes_to_autostop, controller_down = (
223
- controller_utils.get_controller_autostop_config(
224
- controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER))
225
222
  # Since the controller may be shared among multiple users, launch the
226
223
  # controller with the API server's user hash.
227
224
  with common.with_server_user_hash():
228
225
  controller_job_id, controller_handle = execution.launch(
229
226
  task=controller_task,
230
227
  cluster_name=controller_name,
231
- idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
232
- down=controller_down,
233
228
  retry_until_up=True,
234
229
  _disable_controller_check=True,
235
230
  )
sky/serve/spot_placer.py CHANGED
@@ -46,6 +46,8 @@ class Location:
46
46
 
47
47
  @classmethod
48
48
  def from_resources(cls, resources: 'resources_lib.Resources') -> 'Location':
49
+ assert resources.cloud is not None, 'Cloud must be specified'
50
+ assert resources.region is not None, 'Region must be specified'
49
51
  return cls(resources.cloud, resources.region, resources.zone)
50
52
 
51
53
  def to_dict(self) -> Dict[str, Any]:
@@ -147,6 +149,7 @@ def _get_possible_location_from_task(task: 'task_lib.Task') -> List[Location]:
147
149
  cloud_str = str(launchable.cloud)
148
150
  region = launchable.region
149
151
  zone = launchable.zone
152
+ assert region is not None, 'Region must be specified'
150
153
  if (cloud_str not in location_requirements and
151
154
  location_requirements):
152
155
  continue
sky/server/server.py CHANGED
@@ -9,6 +9,7 @@ import logging
9
9
  import multiprocessing
10
10
  import os
11
11
  import pathlib
12
+ import posixpath
12
13
  import re
13
14
  import shutil
14
15
  import sys
@@ -167,8 +168,36 @@ class InternalDashboardPrefixMiddleware(
167
168
  return await call_next(request)
168
169
 
169
170
 
171
class CacheControlStaticMiddleware(
        starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware that adds a Cache-Control header to static dashboard files.

    Applies to every response whose request path starts with
    ``/dashboard/_next``; other responses are returned untouched.
    """

    async def dispatch(self, request: fastapi.Request, call_next):
        """Forward the request and tag static-asset responses as cacheable."""
        # Decide before forwarding, so the check sees the path as it was
        # when this middleware received the request.
        is_static_asset = request.url.path.startswith('/dashboard/_next')
        response = await call_next(request)
        if is_static_asset:
            response.headers['Cache-Control'] = 'max-age=3600'
        return response
181
+
182
+
183
class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware that rejects dashboard requests escaping ``/dashboard``.

    Only paths starting with ``/dashboard/`` are inspected; every other
    request is forwarded unchanged.
    """

    async def dispatch(self, request: fastapi.Request, call_next):
        """Deny path-traversal attempts, otherwise forward the request.

        Args:
            request: The incoming request.
            call_next: Callable that forwards the request down the stack.

        Returns:
            A 403 JSON response when the normalized path is not under
            ``/dashboard``; otherwise the downstream response.
        """
        if request.url.path.startswith('/dashboard/'):
            # Normalize first so that `..` segments are collapsed before
            # the containment check. `_is_relative_to` is defined elsewhere
            # in this module; presumably it mirrors `Path.is_relative_to`.
            parent = pathlib.Path('/dashboard')
            request_path = pathlib.Path(posixpath.normpath(request.url.path))
            if not _is_relative_to(request_path, parent):
                # Respond directly rather than raising HTTPException: user
                # middleware runs outside the layer that turns
                # HTTPException into a response, so raising here would
                # surface as a 500 instead of the intended 403.
                return fastapi.responses.JSONResponse(
                    status_code=403, content={'detail': 'Forbidden'})
        return await call_next(request)
195
+
196
+
170
197
  app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
171
198
  app.add_middleware(InternalDashboardPrefixMiddleware)
199
+ app.add_middleware(PathCleanMiddleware)
200
+ app.add_middleware(CacheControlStaticMiddleware)
172
201
  app.add_middleware(
173
202
  cors.CORSMiddleware,
174
203
  # TODO(zhwu): in production deployment, we should restrict the allowed
@@ -1130,25 +1159,28 @@ async def complete_storage_name(incomplete: str,) -> List[str]:
1130
1159
  return global_user_state.get_storage_names_start_with(incomplete)
1131
1160
 
1132
1161
 
1133
- # Add a route to serve static files
1134
- @app.get('/{full_path:path}')
1135
- async def serve_static_or_dashboard(full_path: str):
1136
- """Serves static files for any unmatched routes.
1162
+ @app.get('/dashboard/{full_path:path}')
1163
+ async def serve_dashboard(full_path: str):
1164
+ """Serves the Next.js dashboard application.
1137
1165
 
1138
- Handles the /dashboard prefix from Next.js configuration.
1139
- """
1140
- # Check if the path starts with 'dashboard/' and remove it if it does
1141
- if full_path.startswith('dashboard/'):
1142
- full_path = full_path[len('dashboard/'):]
1166
+ Args:
1167
+ full_path: The path requested by the client.
1168
+ e.g. /clusters, /jobs
1169
+
1170
+ Returns:
1171
+ FileResponse for static files or index.html for client-side routing.
1143
1172
 
1144
- # Try to serve the file directly from the out directory first
1173
+ Raises:
1174
+ HTTPException: If the path is invalid or file not found.
1175
+ """
1176
+ # Try to serve the staticfile directly e.g. /skypilot.svg,
1177
+ # /favicon.ico, and /_next/, etc.
1145
1178
  file_path = os.path.join(server_constants.DASHBOARD_DIR, full_path)
1146
1179
  if os.path.isfile(file_path):
1147
1180
  return fastapi.responses.FileResponse(file_path)
1148
1181
 
1149
- # If file not found, serve the index.html for client-side routing.
1150
- # For example, the non-matched arbitrary route (/ or /test) from
1151
- # client will be redirected to the index.html.
1182
+ # Serve index.html for client-side routing
1183
+ # e.g. /clusters, /jobs
1152
1184
  index_path = os.path.join(server_constants.DASHBOARD_DIR, 'index.html')
1153
1185
  try:
1154
1186
  with open(index_path, 'r', encoding='utf-8') as f:
@@ -1159,6 +1191,12 @@ async def serve_static_or_dashboard(full_path: str):
1159
1191
  raise fastapi.HTTPException(status_code=500, detail=str(e))
1160
1192
 
1161
1193
 
1194
# Send requests for the bare root path to the dashboard.
@app.get('/')
async def root():
    """Redirect ``/`` to ``/dashboard/``."""
    dashboard_url = '/dashboard/'
    return fastapi.responses.RedirectResponse(url=dashboard_url)
1198
+
1199
+
1162
1200
  if __name__ == '__main__':
1163
1201
  import uvicorn
1164
1202