skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250522__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. sky/__init__.py +2 -2
  2. sky/client/sdk.py +72 -1
  3. sky/clouds/__init__.py +2 -0
  4. sky/clouds/cloud.py +6 -0
  5. sky/clouds/gcp.py +156 -21
  6. sky/clouds/service_catalog/__init__.py +3 -0
  7. sky/clouds/service_catalog/common.py +9 -2
  8. sky/clouds/service_catalog/constants.py +1 -0
  9. sky/core.py +6 -8
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  12. sky/dashboard/out/clusters/[cluster].html +1 -1
  13. sky/dashboard/out/clusters.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra.html +1 -1
  16. sky/dashboard/out/jobs/[job].html +1 -1
  17. sky/dashboard/out/jobs.html +1 -1
  18. sky/data/storage.py +1 -0
  19. sky/execution.py +56 -7
  20. sky/jobs/server/core.py +4 -2
  21. sky/optimizer.py +6 -11
  22. sky/provision/gcp/constants.py +147 -4
  23. sky/provision/gcp/instance_utils.py +10 -0
  24. sky/provision/gcp/volume_utils.py +247 -0
  25. sky/resources.py +173 -3
  26. sky/serve/server/core.py +2 -4
  27. sky/server/common.py +46 -9
  28. sky/server/constants.py +2 -0
  29. sky/server/html/token_page.html +154 -0
  30. sky/server/requests/executor.py +3 -6
  31. sky/server/server.py +40 -8
  32. sky/skypilot_config.py +117 -31
  33. sky/task.py +24 -1
  34. sky/templates/gcp-ray.yml.j2 +44 -1
  35. sky/templates/nebius-ray.yml.j2 +0 -2
  36. sky/utils/admin_policy_utils.py +26 -22
  37. sky/utils/context.py +36 -6
  38. sky/utils/context_utils.py +15 -0
  39. sky/utils/resources_utils.py +14 -0
  40. sky/utils/schemas.py +46 -0
  41. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/METADATA +1 -1
  42. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/RECORD +48 -46
  43. /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → CzOVV6JpRQBRt5GhZuhyK}/_buildManifest.js +0 -0
  44. /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → CzOVV6JpRQBRt5GhZuhyK}/_ssgManifest.js +0 -0
  45. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/WHEEL +0 -0
  46. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/entry_points.txt +0 -0
  47. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/licenses/LICENSE +0 -0
  48. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,247 @@
1
+ """Utilities for GCP volumes."""
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from sky import clouds
5
+ from sky import exceptions
6
+ from sky import sky_logging
7
+ from sky.adaptors import gcp
8
+ from sky.provision.gcp import constants
9
+ from sky.utils import resources_utils
10
+ from sky.utils import ux_utils
11
+
12
+ logger = sky_logging.init_logger(__name__)
13
+
14
+
15
def get_data_disk_tier_mapping(
        instance_type: Optional[str],) -> Dict[resources_utils.DiskTier, str]:
    """Return the disk-tier -> GCP disk-type mapping for an instance type.

    Starts from the default persistent-disk mapping and overrides entries
    for machine series that require (or only support) Hyperdisk, following:
    https://cloud.google.com/compute/docs/disks/hyperdisks
    https://cloud.google.com/compute/docs/disks/persistent-disks

    Args:
        instance_type: GCP machine type, e.g. 'c3-standard-88'. If None,
            the default mapping is returned unchanged.

    Returns:
        Mapping from each DiskTier to the GCP disk type name to use.
    """
    # Default mapping from disk tiers to persistent-disk types.
    tier2name = {
        resources_utils.DiskTier.ULTRA: 'pd-extreme',
        resources_utils.DiskTier.HIGH: 'pd-ssd',
        resources_utils.DiskTier.MEDIUM: 'pd-balanced',
        resources_utils.DiskTier.LOW: 'pd-standard',
    }

    if instance_type is None:
        return tier2name

    series = instance_type.split('-')[0]

    def _num_cpus() -> int:
        # The series handled below use '<series>-<kind>-<vcpus>' names,
        # e.g. 'n2-standard-64', so the vCPU count is the third component.
        return int(instance_type.split('-')[2])

    def _override(ultra: Optional[str] = None,
                  high: Optional[str] = None,
                  medium: Optional[str] = None,
                  low: Optional[str] = None) -> None:
        # Overwrite only the tiers that are explicitly given.
        updates = {
            resources_utils.DiskTier.ULTRA: ultra,
            resources_utils.DiskTier.HIGH: high,
            resources_utils.DiskTier.MEDIUM: medium,
            resources_utils.DiskTier.LOW: low,
        }
        for tier, name in updates.items():
            if name is not None:
                tier2name[tier] = name

    # Remap series-specific disk types.
    if series in ['a4', 'x4']:
        _override(ultra='hyperdisk-extreme',
                  high='hyperdisk-balanced',
                  medium='hyperdisk-balanced',
                  low='hyperdisk-balanced')
    elif series in ['m4']:
        _override(ultra='hyperdisk-extreme',
                  high='hyperdisk-balanced',
                  medium='hyperdisk-balanced',
                  low='hyperdisk-balanced')
        # Smaller m4 shapes do not support hyperdisk-extreme.
        if _num_cpus() < 112:
            _override(ultra='hyperdisk-balanced')
    elif series in ['c4', 'c4a', 'c4d']:
        _override(ultra='hyperdisk-extreme',
                  high='hyperdisk-balanced',
                  medium='hyperdisk-balanced',
                  low='hyperdisk-balanced')
        if _num_cpus() < 64:
            _override(ultra='hyperdisk-balanced')
    elif series in ['a3']:
        if (instance_type.startswith('a3-ultragpu') or
                instance_type.startswith('a3-megagpu') or
                instance_type.startswith('a3-edgegpu')):
            _override(ultra='hyperdisk-extreme',
                      high='hyperdisk-balanced',
                      medium='hyperdisk-balanced',
                      low='hyperdisk-balanced')
        elif instance_type.startswith('a3-highgpu'):
            _override(low='pd-balanced')
            # Hyperdisk support differs between the a3-highgpu shapes.
            if instance_type.startswith('a3-highgpu-8g'):
                _override(ultra='hyperdisk-extreme',
                          high='hyperdisk-balanced',
                          medium='pd-ssd')
            elif instance_type.startswith('a3-highgpu-4g'):
                _override(ultra='hyperdisk-extreme')
            else:
                _override(ultra='pd-ssd')
    elif series in ['c3d']:
        _override(ultra='hyperdisk-extreme',
                  high='hyperdisk-balanced',
                  medium='pd-ssd',
                  low='pd-balanced')
        if _num_cpus() < 60:
            _override(ultra='hyperdisk-balanced')
    elif series in ['c3']:
        _override(ultra='hyperdisk-extreme',
                  high='hyperdisk-balanced',
                  medium='pd-ssd',
                  low='pd-balanced')
        if _num_cpus() < 88:
            _override(ultra='hyperdisk-balanced')
    elif series in ['n4']:
        # n4 only supports Hyperdisk Balanced.
        _override(ultra='hyperdisk-balanced',
                  high='hyperdisk-balanced',
                  medium='hyperdisk-balanced',
                  low='hyperdisk-balanced')
    elif series in ['n2d', 'n1', 't2d', 't2a', 'e2', 'c2', 'c2d', 'a2']:
        # These series do not support pd-extreme; cap ULTRA at pd-ssd.
        _override(ultra='pd-ssd')
    elif series in ['z3']:
        _override(ultra='hyperdisk-extreme', low='pd-balanced')
    elif series in ['h3']:
        _override(ultra='hyperdisk-balanced',
                  high='hyperdisk-balanced',
                  low='pd-balanced')
    elif series in ['m3']:
        _override(ultra='hyperdisk-extreme',
                  high='hyperdisk-balanced',
                  medium='pd-ssd',
                  low='pd-balanced')
        if _num_cpus() < 64:
            _override(ultra='hyperdisk-balanced')
    elif series in ['m2']:
        _override(ultra='hyperdisk-extreme', high='hyperdisk-balanced')
    elif series in ['m1']:
        _override(ultra='hyperdisk-extreme', high='hyperdisk-balanced')
        if _num_cpus() < 80:
            _override(ultra='hyperdisk-balanced')
    elif series in ['g2']:
        _override(ultra='pd-ssd', low='pd-balanced')
    elif series in ['n2']:
        cpus = _num_cpus()
        if cpus < 64:
            _override(ultra='pd-ssd')
        elif cpus >= 80:
            _override(ultra='hyperdisk-extreme')
        # 64 <= cpus < 80 keeps the default pd-extreme for ULTRA.

    return tier2name
130
+
131
+
132
def validate_instance_volumes(
    instance_type: Optional[str],
    volumes: Optional[List[Dict[str, Any]]],
) -> None:
    """Validate instance (local SSD) volume count against the machine type.

    Counts the requested volumes whose storage_type is INSTANCE and, for
    machine types listed in constants.SSD_AUTO_ATTACH_MACHINE_TYPES,
    rejects requests exceeding that machine type's limit.

    Raises:
        exceptions.ResourcesUnavailableError: if more instance volumes are
            requested than the machine type supports.
    """
    if not volumes:
        return
    if instance_type is None:
        # Without a machine type we cannot know the local-SSD limit.
        logger.warning('Instance type is not specified,'
                       ' skipping instance volume validation')
        return
    instance_volume_count = 0
    for volume in volumes:
        if volume['storage_type'] == resources_utils.StorageType.INSTANCE:
            instance_volume_count += 1
    # NOTE(review): SSD_AUTO_ATTACH_MACHINE_TYPES presumably maps machine
    # types to the number of local SSDs they come with — confirm in
    # sky/provision/gcp/constants.py.
    if (instance_type in constants.SSD_AUTO_ATTACH_MACHINE_TYPES and
            instance_volume_count >
            constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]):
        raise exceptions.ResourcesUnavailableError(
            f'The instance type {instance_type} supports'
            f' {constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]}'
            f' instance storage, but {instance_volume_count} are specified')
    # TODO(hailong):
    # check the instance storage count for the other instance types,
    # refer to https://cloud.google.com/compute/docs/disks/local-ssd
156
+
157
+
158
def translate_attach_mode(attach_mode: resources_utils.DiskAttachMode) -> str:
    """Convert a DiskAttachMode enum into the GCP API attach-mode string."""
    read_only = attach_mode == resources_utils.DiskAttachMode.READ_ONLY
    return 'READ_ONLY' if read_only else 'READ_WRITE'
162
+
163
+
164
def check_volume_name_exist_in_region(
        project_id: str, region: clouds.Region, use_mig: bool,
        volume_name: str) -> Optional[Dict[str, Any]]:
    """Check if the volume name exists and return the volume info.

    Looks for a zonal disk named ``volume_name`` in every zone of
    ``region``; if none is found, falls back to a regional disk lookup.
    The returned dict is the raw GCP disk resource, augmented with an
    'available_zones' list of zone names. Returns None when the disk does
    not exist anywhere in the region (the caller creates it in that case).

    Raises:
        ValueError: if the compute client cannot be built, or access to
            the volume is denied (HTTP 403).
    """
    logger.debug(f'Checking volume {volume_name} in region {region}')
    try:
        compute = gcp.build('compute',
                            'v1',
                            credentials=None,
                            cache_discovery=False)
    except gcp.credential_error_exception():
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Not able to build compute client') from None

    # Get all the zones in the region
    all_zones = compute.zones().list(project=project_id).execute()
    region_zones = []
    if 'items' in all_zones:
        for zone in all_zones['items']:
            # zone['region'] is a full resource URL; its last path
            # component is the region name.
            if zone['region'].split('/')[-1] == region.name:
                region_zones.append(zone['name'])
    volume_info = None
    for zone in region_zones:
        try:
            volume_info = compute.disks().get(project=project_id,
                                              zone=zone,
                                              disk=volume_name).execute()
            if volume_info is not None:
                if use_mig:
                    # With MIG, instance template will be used, in this case,
                    # the `selfLink` for zonal disk needs to be the volume name
                    # Refer to https://cloud.google.com/compute/docs/
                    # reference/rest/v1/instances/insert
                    volume_info['selfLink'] = volume_name
                volume_info['available_zones'] = [zone]
                return volume_info
        except gcp.http_error_exception() as e:
            if e.resp.status == 403:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError('Not able to access the volume '
                                     f'{volume_name!r}') from None
            if e.resp.status == 404:
                continue  # Try next zone
            # Unexpected HTTP error: propagate to the caller.
            raise

    # If not found in any zone, check region disk
    try:
        volume_info = compute.regionDisks().get(project=project_id,
                                                region=region.name,
                                                disk=volume_name).execute()
        # 'replicaZones':
        # ['https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-a',
        #  'https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-c']
        if volume_info is not None and 'replicaZones' in volume_info:
            replica_zones = [
                zone.split('/')[-1] for zone in volume_info['replicaZones']
            ]
            volume_info['available_zones'] = replica_zones
        return volume_info
    except gcp.http_error_exception() as e:
        if e.resp.status == 403:
            with ux_utils.print_exception_no_traceback():
                raise ValueError('Not able to access the volume '
                                 f'{volume_name!r}') from None
        if e.resp.status == 404:
            # Not found as a zonal or regional disk; return None so the
            # caller provisions a new disk.
            logger.warning(
                f'Volume {volume_name} is not found in region {region}.'
                f' It will be created.')
            return volume_info
        raise
234
+
235
+
236
def check_volume_zone_match(volume_name: str,
                            zones: Optional[List[clouds.Zone]],
                            available_zones: List[str]):
    """Ensure at least one requested zone can host the volume.

    Returns None when no zones are requested, or when any requested zone
    appears in ``available_zones``; otherwise raises to trigger failover.
    """
    if zones is None:
        return None
    requested_names = {zone.name for zone in zones}
    if requested_names.intersection(available_zones):
        return None
    with ux_utils.print_exception_no_traceback():
        # Return a ResourcesUnavailableError to trigger failover
        raise exceptions.ResourcesUnavailableError(
            f'Volume {volume_name} not available in zones {zones}') from None
sky/resources.py CHANGED
@@ -98,7 +98,7 @@ class Resources:
98
98
  """
99
99
  # If any fields changed, increment the version. For backward compatibility,
100
100
  # modify the __setstate__ method to handle the old version.
101
- _VERSION = 23
101
+ _VERSION = 24
102
102
 
103
103
  def __init__(
104
104
  self,
@@ -120,6 +120,7 @@ class Resources:
120
120
  ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
121
121
  labels: Optional[Dict[str, str]] = None,
122
122
  autostop: Union[bool, int, Dict[str, Any], None] = None,
123
+ volumes: Optional[List[Dict[str, Any]]] = None,
123
124
  # Internal use only.
124
125
  # pylint: disable=invalid-name
125
126
  _docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
@@ -210,6 +211,7 @@ class Resources:
210
211
  not supported and will be ignored.
211
212
  autostop: the autostop configuration to use. For launched resources,
212
213
  may or may not correspond to the actual current autostop config.
214
+ volumes: the volumes to mount on the instance.
213
215
  _docker_login_config: the docker configuration to use. This includes
214
216
  the docker username, password, and registry server. If None, skip
215
217
  docker login.
@@ -337,6 +339,7 @@ class Resources:
337
339
  self._set_memory(memory)
338
340
  self._set_accelerators(accelerators, accelerator_args)
339
341
  self._set_autostop_config(autostop)
342
+ self._set_volumes(volumes)
340
343
 
341
344
  def validate(self):
342
345
  """Validate the resources and infer the missing fields if possible."""
@@ -347,6 +350,7 @@ class Resources:
347
350
  self._try_validate_managed_job_attributes()
348
351
  self._try_validate_image_id()
349
352
  self._try_validate_disk_tier()
353
+ self._try_validate_volumes()
350
354
  self._try_validate_ports()
351
355
  self._try_validate_labels()
352
356
 
@@ -566,6 +570,10 @@ class Resources:
566
570
  def labels(self) -> Optional[Dict[str, str]]:
567
571
  return self._labels
568
572
 
573
    @property
    def volumes(self) -> Optional[List[Dict[str, Any]]]:
        """Volume configs to mount, as normalized by ``_set_volumes``."""
        return self._volumes
576
+
569
577
  @property
570
578
  def autostop_config(self) -> Optional[AutostopConfig]:
571
579
  """The requested autostop config.
@@ -759,6 +767,91 @@ class Resources:
759
767
  ) -> None:
760
768
  self._autostop_config = AutostopConfig.from_yaml_config(autostop)
761
769
 
770
+ def _set_volumes(
771
+ self,
772
+ volumes: Optional[List[Dict[str, Any]]],
773
+ ) -> None:
774
+ if not volumes:
775
+ self._volumes = None
776
+ return
777
+ valid_volumes = []
778
+ supported_tiers = [tier.value for tier in resources_utils.DiskTier]
779
+ supported_storage_types = [
780
+ storage_type.value for storage_type in resources_utils.StorageType
781
+ ]
782
+ supported_attach_modes = [
783
+ attach_mode.value for attach_mode in resources_utils.DiskAttachMode
784
+ ]
785
+ network_type = resources_utils.StorageType.NETWORK
786
+ read_write_mode = resources_utils.DiskAttachMode.READ_WRITE
787
+ for volume in volumes:
788
+ if 'path' not in volume:
789
+ with ux_utils.print_exception_no_traceback():
790
+ raise ValueError(f'Invalid volume {volume!r}. '
791
+ f'Volume must have a "path" field.')
792
+ if 'storage_type' not in volume:
793
+ volume['storage_type'] = network_type
794
+ else:
795
+ if isinstance(volume['storage_type'], str):
796
+ storage_type_str = str(volume['storage_type']).lower()
797
+ if storage_type_str not in supported_storage_types:
798
+ logger.warning(
799
+ f'Invalid storage_type {storage_type_str!r}. '
800
+ f'Set it to '
801
+ f'{network_type.value}.')
802
+ volume['storage_type'] = network_type
803
+ else:
804
+ volume['storage_type'] = resources_utils.StorageType(
805
+ storage_type_str)
806
+ if 'auto_delete' not in volume:
807
+ volume['auto_delete'] = False
808
+ if 'attach_mode' in volume:
809
+ if isinstance(volume['attach_mode'], str):
810
+ attach_mode_str = str(volume['attach_mode']).lower()
811
+ if attach_mode_str not in supported_attach_modes:
812
+ logger.warning(
813
+ f'Invalid attach_mode {attach_mode_str!r}. '
814
+ f'Set it to {read_write_mode.value}.')
815
+ volume['attach_mode'] = read_write_mode
816
+ else:
817
+ volume['attach_mode'] = resources_utils.DiskAttachMode(
818
+ attach_mode_str)
819
+ else:
820
+ volume['attach_mode'] = read_write_mode
821
+ if volume['storage_type'] == network_type:
822
+ if ('disk_size' in volume and
823
+ round(volume['disk_size']) != volume['disk_size']):
824
+ with ux_utils.print_exception_no_traceback():
825
+ raise ValueError(f'Volume size must be an integer. '
826
+ f'Got: {volume["size"]}.')
827
+ if 'name' not in volume:
828
+ with ux_utils.print_exception_no_traceback():
829
+ raise ValueError(f'Network volume {volume["path"]} '
830
+ f'must have "name" field.')
831
+ elif 'name' in volume:
832
+ logger.info(f'Volume {volume["path"]} is a local disk. '
833
+ f'The "name" field will be ignored.')
834
+ del volume['name']
835
+ if 'disk_tier' in volume:
836
+ if isinstance(volume['disk_tier'], str):
837
+ disk_tier_str = str(volume['disk_tier']).lower()
838
+ if disk_tier_str not in supported_tiers:
839
+ logger.warning(
840
+ f'Invalid disk_tier {disk_tier_str!r}. '
841
+ f'Set it to {resources_utils.DiskTier.BEST.value}.')
842
+ volume['disk_tier'] = resources_utils.DiskTier.BEST
843
+ else:
844
+ volume['disk_tier'] = resources_utils.DiskTier(
845
+ disk_tier_str)
846
+ elif volume['storage_type'] == network_type:
847
+ logger.debug(
848
+ f'No disk_tier specified for volume {volume["path"]}. '
849
+ f'Set it to {resources_utils.DiskTier.BEST.value}.')
850
+ volume['disk_tier'] = resources_utils.DiskTier.BEST
851
+
852
+ valid_volumes.append(volume)
853
+ self._volumes = valid_volumes
854
+
762
855
  def is_launchable(self) -> bool:
763
856
  """Returns whether the resource is launchable."""
764
857
  return self.cloud is not None and self._instance_type is not None
@@ -1123,6 +1216,48 @@ class Resources:
1123
1216
  f'Disk tier {self.disk_tier.value} is not supported '
1124
1217
  f'for instance type {self.instance_type}.') from None
1125
1218
 
1219
    def _try_validate_volumes(self) -> None:
        """Try to validate the volumes attribute.

        Raises:
            ValueError: if the attribute is invalid.
        """
        if self.volumes is None:
            return
        if self.cloud is None:
            with ux_utils.print_exception_no_traceback():
                raise ValueError('Cloud must be specified when '
                                 'volumes are provided.')
        # Volumes are currently a GCP-only feature.
        if not self.cloud.is_same_cloud(clouds.GCP()):
            with ux_utils.print_exception_no_traceback():
                raise ValueError(f'Volumes are only supported for GCP'
                                 f' not for {self.cloud}.')

        need_region_or_zone = False
        try:
            for volume in self.volumes:
                # A named network volume must be looked up in a concrete
                # region/zone, so one of them must be set (checked below).
                if ('name' in volume and volume['storage_type']
                        == resources_utils.StorageType.NETWORK):
                    need_region_or_zone = True
                if 'disk_tier' not in volume:
                    continue
                # TODO(hailong): check instance local SSD
                # support for instance_type.
                # Refer to https://cloud.google.com/compute/docs/disks/local-ssd#machine-series-lssd # pylint: disable=line-too-long
                self.cloud.check_disk_tier_enabled(self.instance_type,
                                                   volume['disk_tier'])
            if (need_region_or_zone and self._region is None and
                    self._zone is None):
                with ux_utils.print_exception_no_traceback():
                    raise ValueError('When specifying the volume name, please'
                                     ' also specify the region or zone.')
        except exceptions.NotSupportedError:
            # Raised by check_disk_tier_enabled; `volume` is the loop
            # variable for the offending entry.
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    f'Disk tier {volume["disk_tier"].value} is not '
                    f'supported for instance type {self.instance_type}.'
                ) from None
1260
+
1126
1261
  def _try_validate_ports(self) -> None:
1127
1262
  """Try to validate the ports attribute.
1128
1263
 
@@ -1293,9 +1428,18 @@ class Resources:
1293
1428
  skypilot_config.get_nested(
1294
1429
  (str(self.cloud).lower(), 'specific_reservations'), set()))
1295
1430
 
1431
+ if isinstance(self.cloud, clouds.DummyCloud):
1432
+ return self.cloud.get_reservations_available_resources(
1433
+ instance_type='',
1434
+ region='',
1435
+ zone=None,
1436
+ specific_reservations=specific_reservations)
1437
+
1296
1438
  assert (self.cloud is not None and self.instance_type is not None and
1297
- self.region
1298
- is not None), ('Cloud, instance type, region must be specified')
1439
+ self.region is not None), (
1440
+ f'Cloud, instance type, region must be specified. '
1441
+ f'Resources={self}, cloud={self.cloud}, '
1442
+ f'instance_type={self.instance_type}, region={self.region}')
1299
1443
  return self.cloud.get_reservations_available_resources(
1300
1444
  self.instance_type, self.region, self.zone, specific_reservations)
1301
1445
 
@@ -1483,6 +1627,7 @@ class Resources:
1483
1627
  ports=override.pop('ports', self.ports),
1484
1628
  labels=override.pop('labels', self.labels),
1485
1629
  autostop=override.pop('autostop', current_autostop_config),
1630
+ volumes=override.pop('volumes', self.volumes),
1486
1631
  infra=override.pop('infra', None),
1487
1632
  _docker_login_config=override.pop('_docker_login_config',
1488
1633
  self._docker_login_config),
@@ -1523,6 +1668,12 @@ class Resources:
1523
1668
  features.add(clouds.CloudImplementationFeatures.IMAGE_ID)
1524
1669
  if self.ports is not None:
1525
1670
  features.add(clouds.CloudImplementationFeatures.OPEN_PORTS)
1671
+ if self.volumes is not None:
1672
+ for volume in self.volumes:
1673
+ if 'disk_tier' in volume and volume[
1674
+ 'disk_tier'] != resources_utils.DiskTier.BEST:
1675
+ features.add(
1676
+ clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER)
1526
1677
  return features
1527
1678
 
1528
1679
  @staticmethod
@@ -1692,6 +1843,7 @@ class Resources:
1692
1843
  resources_fields['ports'] = config.pop('ports', None)
1693
1844
  resources_fields['labels'] = config.pop('labels', None)
1694
1845
  resources_fields['autostop'] = config.pop('autostop', None)
1846
+ resources_fields['volumes'] = config.pop('volumes', None)
1695
1847
  resources_fields['_docker_login_config'] = config.pop(
1696
1848
  '_docker_login_config', None)
1697
1849
  resources_fields['_docker_username_for_runpod'] = config.pop(
@@ -1742,6 +1894,21 @@ class Resources:
1742
1894
  config['disk_tier'] = self.disk_tier.value
1743
1895
  add_if_not_none('ports', self.ports)
1744
1896
  add_if_not_none('labels', self.labels)
1897
+ if self.volumes is not None:
1898
+ # Convert DiskTier/StorageType enum to string value for each volume
1899
+ volumes = []
1900
+ for volume in self.volumes:
1901
+ volume_copy = volume.copy()
1902
+ if 'disk_tier' in volume_copy:
1903
+ volume_copy['disk_tier'] = volume_copy['disk_tier'].value
1904
+ if 'storage_type' in volume_copy:
1905
+ volume_copy['storage_type'] = volume_copy[
1906
+ 'storage_type'].value
1907
+ if 'attach_mode' in volume_copy:
1908
+ volume_copy['attach_mode'] = volume_copy[
1909
+ 'attach_mode'].value
1910
+ volumes.append(volume_copy)
1911
+ config['volumes'] = volumes
1745
1912
  if self._autostop_config is not None:
1746
1913
  config['autostop'] = self._autostop_config.to_yaml_config()
1747
1914
  if self._docker_login_config is not None:
@@ -1902,6 +2069,9 @@ class Resources:
1902
2069
  if version < 23:
1903
2070
  self._autostop_config = None
1904
2071
 
2072
+ if version < 24:
2073
+ self._volumes = None
2074
+
1905
2075
  self.__dict__.update(state)
1906
2076
 
1907
2077
 
sky/serve/server/core.py CHANGED
@@ -141,8 +141,7 @@ def up(
141
141
  # Always apply the policy again here, even though it might have been applied
142
142
  # in the CLI. This is to ensure that we apply the policy to the final DAG
143
143
  # and get the mutated config.
144
- dag, mutated_user_config = admin_policy_utils.apply(
145
- task, use_mutated_config_in_current_request=False)
144
+ dag, mutated_user_config = admin_policy_utils.apply(task)
146
145
  task = dag.tasks[0]
147
146
 
148
147
  with rich_utils.safe_status(
@@ -352,8 +351,7 @@ def update(
352
351
  # and get the mutated config.
353
352
  # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
354
353
  # will not apply the config.
355
- dag, _ = admin_policy_utils.apply(
356
- task, use_mutated_config_in_current_request=False)
354
+ dag, _ = admin_policy_utils.apply(task)
357
355
  task = dag.tasks[0]
358
356
 
359
357
  assert task.service is not None
sky/server/common.py CHANGED
@@ -12,7 +12,7 @@ import subprocess
12
12
  import sys
13
13
  import time
14
14
  import typing
15
- from typing import Any, Dict, Optional
15
+ from typing import Any, Dict, Literal, Optional
16
16
  from urllib import parse
17
17
  import uuid
18
18
 
@@ -116,6 +116,7 @@ class ApiServerStatus(enum.Enum):
116
116
  HEALTHY = 'healthy'
117
117
  UNHEALTHY = 'unhealthy'
118
118
  VERSION_MISMATCH = 'version_mismatch'
119
+ NEEDS_AUTH = 'needs_auth'
119
120
 
120
121
 
121
122
  @dataclasses.dataclass
@@ -127,15 +128,21 @@ class ApiServerInfo:
127
128
  commit: Optional[str] = None
128
129
 
129
130
 
131
def get_api_cookie_jar_path() -> str:
    """Path of the cookie file used to authenticate to the API server.

    Honors the API cookie file environment variable when set, otherwise
    falls back to the default location.
    """
    configured = os.environ.get(server_constants.API_COOKIE_FILE_ENV_VAR)
    if configured is not None:
        return configured
    return server_constants.API_COOKIE_FILE_DEFAULT_LOCATION
134
+
135
+
130
136
def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
    """Returns the cookie jar used by the client to access the API server."""
    jar = requests.cookies.RequestsCookieJar()
    cookie_file = get_api_cookie_jar_path()
    if not cookie_file:
        return jar
    cookie_path = pathlib.Path(cookie_file).expanduser().resolve()
    if cookie_path.exists():
        # Merge cookies persisted on disk (Mozilla/Netscape format).
        on_disk = MozillaCookieJar(cookie_path)
        on_disk.load()
        jar.update(on_disk)
    return jar
140
147
 
141
148
 
@@ -196,6 +203,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
196
203
  response = requests.get(f'{server_url}/api/health',
197
204
  timeout=2.5,
198
205
  cookies=get_api_cookie_jar())
206
+ logger.debug(f'Health check status: {response.status_code}')
199
207
  if response.status_code == 200:
200
208
  try:
201
209
  result = response.json()
@@ -217,9 +225,24 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
217
225
  server_info.status = ApiServerStatus.VERSION_MISMATCH
218
226
  return server_info
219
227
  except (json.JSONDecodeError, AttributeError) as e:
228
+ # Try to check if we got redirected to a login page.
229
+ for prev_response in response.history:
230
+ logger.debug(f'Previous response: {prev_response.url}')
231
+ # Heuristic: check if the url looks like a login page or
232
+ # oauth flow.
233
+ if any(key in prev_response.url
234
+ for key in ['login', 'oauth2']):
235
+ logger.debug(
236
+ f'URL {prev_response.url} looks like '
237
+ 'a login page or oauth flow, so try to '
238
+ 'get the cookie.')
239
+ return ApiServerInfo(
240
+ status=ApiServerStatus.NEEDS_AUTH)
220
241
  logger.warning('Failed to parse API server response: '
221
242
  f'{str(e)}')
222
243
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
244
+ elif response.status_code == 401:
245
+ return ApiServerInfo(status=ApiServerStatus.NEEDS_AUTH)
223
246
  else:
224
247
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
225
248
  except requests.exceptions.Timeout:
@@ -369,7 +392,12 @@ def _start_api_server(deploy: bool = False,
369
392
  f'SkyPilot API server started. {dashboard_msg}'))
370
393
 
371
394
 
372
- def check_server_healthy(endpoint: Optional[str] = None,) -> None:
395
+ def check_server_healthy(
396
+ endpoint: Optional[str] = None
397
+ ) -> Literal[
398
+ # Use an incomplete list of Literals here to enforce raising for other
399
+ # enum values.
400
+ ApiServerStatus.HEALTHY, ApiServerStatus.NEEDS_AUTH]:
373
401
  """Check if the API server is healthy.
374
402
 
375
403
  Args:
@@ -379,6 +407,11 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
379
407
  Raises:
380
408
  RuntimeError: If the server is not healthy or the client version does
381
409
  not match the server version.
410
+
411
+ Returns:
412
+ ApiServerStatus: The status of the API server, unless the server is
413
+ unhealthy or the client version does not match the server version,
414
+ in which case an exception is raised.
382
415
  """
383
416
  endpoint = endpoint if endpoint is not None else get_server_url()
384
417
  api_server_info = get_api_server_status(endpoint)
@@ -441,6 +474,8 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
441
474
 
442
475
  hinted_for_server_install_version_mismatch = True
443
476
 
477
+ return api_server_status
478
+
444
479
 
445
480
  def _get_version_info_hint(server_info: ApiServerInfo) -> str:
446
481
  assert server_info.version is not None, 'Server version is None'
@@ -491,11 +526,13 @@ def get_skypilot_version_on_disk() -> str:
491
526
  def check_server_healthy_or_start_fn(deploy: bool = False,
492
527
  host: str = '127.0.0.1',
493
528
  foreground: bool = False):
529
+ api_server_status = None
494
530
  try:
495
- check_server_healthy()
531
+ api_server_status = check_server_healthy()
496
532
  except exceptions.ApiServerConnectionError as exc:
497
533
  endpoint = get_server_url()
498
- if not is_api_server_local():
534
+ if (not is_api_server_local() or
535
+ api_server_status == ApiServerStatus.NEEDS_AUTH):
499
536
  with ux_utils.print_exception_no_traceback():
500
537
  raise exceptions.ApiServerConnectionError(endpoint) from exc
501
538
  # Lock to prevent multiple processes from starting the server at the
sky/server/constants.py CHANGED
@@ -26,6 +26,8 @@ CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
26
26
 
27
27
  # Environment variable for a file path to the API cookie file.
28
28
  API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
29
+ # Default file if unset.
30
+ API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
29
31
 
30
32
  # The path to the dashboard build output
31
33
  DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',