skypilot-nightly 1.0.0.dev20250521-py3-none-any.whl → 1.0.0.dev20250522-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/client/sdk.py +72 -1
- sky/clouds/__init__.py +2 -0
- sky/clouds/cloud.py +6 -0
- sky/clouds/gcp.py +156 -21
- sky/clouds/service_catalog/__init__.py +3 -0
- sky/clouds/service_catalog/common.py +9 -2
- sky/clouds/service_catalog/constants.py +1 -0
- sky/core.py +6 -8
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/data/storage.py +1 -0
- sky/execution.py +56 -7
- sky/jobs/server/core.py +4 -2
- sky/optimizer.py +6 -11
- sky/provision/gcp/constants.py +147 -4
- sky/provision/gcp/instance_utils.py +10 -0
- sky/provision/gcp/volume_utils.py +247 -0
- sky/resources.py +173 -3
- sky/serve/server/core.py +2 -4
- sky/server/common.py +46 -9
- sky/server/constants.py +2 -0
- sky/server/html/token_page.html +154 -0
- sky/server/requests/executor.py +3 -6
- sky/server/server.py +40 -8
- sky/skypilot_config.py +117 -31
- sky/task.py +24 -1
- sky/templates/gcp-ray.yml.j2 +44 -1
- sky/templates/nebius-ray.yml.j2 +0 -2
- sky/utils/admin_policy_utils.py +26 -22
- sky/utils/context.py +36 -6
- sky/utils/context_utils.py +15 -0
- sky/utils/resources_utils.py +14 -0
- sky/utils/schemas.py +46 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/RECORD +48 -46
- /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → CzOVV6JpRQBRt5GhZuhyK}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → CzOVV6JpRQBRt5GhZuhyK}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,247 @@
+"""Utilities for GCP volumes."""
+from typing import Any, Dict, List, Optional
+
+from sky import clouds
+from sky import exceptions
+from sky import sky_logging
+from sky.adaptors import gcp
+from sky.provision.gcp import constants
+from sky.utils import resources_utils
+from sky.utils import ux_utils
+
+logger = sky_logging.init_logger(__name__)
+
+
+def get_data_disk_tier_mapping(
+        instance_type: Optional[str],) -> Dict[resources_utils.DiskTier, str]:
+    # Define the default mapping from disk tiers to disk types.
+    # Refer to https://cloud.google.com/compute/docs/disks/hyperdisks
+    # and https://cloud.google.com/compute/docs/disks/persistent-disks
+    tier2name = {
+        resources_utils.DiskTier.ULTRA: 'pd-extreme',
+        resources_utils.DiskTier.HIGH: 'pd-ssd',
+        resources_utils.DiskTier.MEDIUM: 'pd-balanced',
+        resources_utils.DiskTier.LOW: 'pd-standard',
+    }
+
+    if instance_type is None:
+        return tier2name
+
+    # Remap series-specific disk types.
+    series = instance_type.split('-')[0]
+
+    if series in ['a4', 'x4']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+    elif series in ['m4']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 112:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['c4', 'c4a', 'c4d']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 64:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['a3']:
+        if (instance_type.startswith('a3-ultragpu') or
+                instance_type.startswith('a3-megagpu') or
+                instance_type.startswith('a3-edgegpu')):
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+            tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+            tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+            tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+        elif instance_type.startswith('a3-highgpu'):
+            tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+            if instance_type.startswith('a3-highgpu-8g'):
+                tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+                tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+                tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
+            elif instance_type.startswith('a3-highgpu-4g'):
+                tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+            else:
+                tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
+    elif series in ['c3d']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 60:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['c3']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 88:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['n4']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+    elif series in ['n2d', 'n1', 't2d', 't2a', 'e2', 'c2', 'c2d', 'a2']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
+    elif series in ['z3']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+    elif series in ['h3']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+    elif series in ['m3']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 64:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['m2']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+    elif series in ['m1']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 80:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['g2']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+    elif series in ['n2']:
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 64:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
+        elif num_cpus >= 80:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+
+    return tier2name
+
+
+def validate_instance_volumes(
+    instance_type: Optional[str],
+    volumes: Optional[List[Dict[str, Any]]],
+) -> None:
+    if not volumes:
+        return
+    if instance_type is None:
+        logger.warning('Instance type is not specified,'
+                       ' skipping instance volume validation')
+        return
+    instance_volume_count = 0
+    for volume in volumes:
+        if volume['storage_type'] == resources_utils.StorageType.INSTANCE:
+            instance_volume_count += 1
+    if (instance_type in constants.SSD_AUTO_ATTACH_MACHINE_TYPES and
+            instance_volume_count >
+            constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]):
+        raise exceptions.ResourcesUnavailableError(
+            f'The instance type {instance_type} supports'
+            f' {constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]}'
+            f' instance storage, but {instance_volume_count} are specified')
+    # TODO(hailong):
+    # check the instance storage count for the other instance types,
+    # refer to https://cloud.google.com/compute/docs/disks/local-ssd
+
+
+def translate_attach_mode(attach_mode: resources_utils.DiskAttachMode) -> str:
+    if attach_mode == resources_utils.DiskAttachMode.READ_ONLY:
+        return 'READ_ONLY'
+    return 'READ_WRITE'
+
+
+def check_volume_name_exist_in_region(
+        project_id: str, region: clouds.Region, use_mig: bool,
+        volume_name: str) -> Optional[Dict[str, Any]]:
+    """Check if the volume name exists and return the volume info."""
+    logger.debug(f'Checking volume {volume_name} in region {region}')
+    try:
+        compute = gcp.build('compute',
+                            'v1',
+                            credentials=None,
+                            cache_discovery=False)
+    except gcp.credential_error_exception():
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Not able to build compute client') from None
+
+    # Get all the zones in the region
+    all_zones = compute.zones().list(project=project_id).execute()
+    region_zones = []
+    if 'items' in all_zones:
+        for zone in all_zones['items']:
+            if zone['region'].split('/')[-1] == region.name:
+                region_zones.append(zone['name'])
+    volume_info = None
+    for zone in region_zones:
+        try:
+            volume_info = compute.disks().get(project=project_id,
+                                              zone=zone,
+                                              disk=volume_name).execute()
+            if volume_info is not None:
+                if use_mig:
+                    # With MIG, instance template will be used, in this case,
+                    # the `selfLink` for zonal disk needs to be the volume name
+                    # Refer to https://cloud.google.com/compute/docs/
+                    # reference/rest/v1/instances/insert
+                    volume_info['selfLink'] = volume_name
+                volume_info['available_zones'] = [zone]
+                return volume_info
+        except gcp.http_error_exception() as e:
+            if e.resp.status == 403:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError('Not able to access the volume '
+                                     f'{volume_name!r}') from None
+            if e.resp.status == 404:
+                continue  # Try next zone
+            raise
+
+    # If not found in any zone, check region disk
+    try:
+        volume_info = compute.regionDisks().get(project=project_id,
+                                                region=region.name,
+                                                disk=volume_name).execute()
+        # 'replicaZones':
+        # ['https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-a',
+        #  'https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-c']
+        if volume_info is not None and 'replicaZones' in volume_info:
+            replica_zones = [
+                zone.split('/')[-1] for zone in volume_info['replicaZones']
+            ]
+            volume_info['available_zones'] = replica_zones
+        return volume_info
+    except gcp.http_error_exception() as e:
+        if e.resp.status == 403:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Not able to access the volume '
+                                 f'{volume_name!r}') from None
+        if e.resp.status == 404:
+            logger.warning(
+                f'Volume {volume_name} is not found in region {region}.'
+                f' It will be created.')
+            return volume_info
+        raise
+
+
+def check_volume_zone_match(volume_name: str,
+                            zones: Optional[List[clouds.Zone]],
+                            available_zones: List[str]):
+    if zones is None:
+        return None
+    for zone in zones:
+        if zone.name in available_zones:
+            return None
+    with ux_utils.print_exception_no_traceback():
+        # Return a ResourcesUnavailableError to trigger failover
+        raise exceptions.ResourcesUnavailableError(
+            f'Volume {volume_name} not available in zones {zones}') from None
sky/resources.py
CHANGED
@@ -98,7 +98,7 @@ class Resources:
     """
     # If any fields changed, increment the version. For backward compatibility,
     # modify the __setstate__ method to handle the old version.
-    _VERSION = 23
+    _VERSION = 24

    def __init__(
        self,
@@ -120,6 +120,7 @@ class Resources:
        ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
        labels: Optional[Dict[str, str]] = None,
        autostop: Union[bool, int, Dict[str, Any], None] = None,
+       volumes: Optional[List[Dict[str, Any]]] = None,
        # Internal use only.
        # pylint: disable=invalid-name
        _docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
@@ -210,6 +211,7 @@ class Resources:
            not supported and will be ignored.
          autostop: the autostop configuration to use. For launched resources,
            may or may not correspond to the actual current autostop config.
+         volumes: the volumes to mount on the instance.
          _docker_login_config: the docker configuration to use. This includes
            the docker username, password, and registry server. If None, skip
            docker login.
@@ -337,6 +339,7 @@ class Resources:
        self._set_memory(memory)
        self._set_accelerators(accelerators, accelerator_args)
        self._set_autostop_config(autostop)
+       self._set_volumes(volumes)

    def validate(self):
        """Validate the resources and infer the missing fields if possible."""
@@ -347,6 +350,7 @@ class Resources:
        self._try_validate_managed_job_attributes()
        self._try_validate_image_id()
        self._try_validate_disk_tier()
+       self._try_validate_volumes()
        self._try_validate_ports()
        self._try_validate_labels()

@@ -566,6 +570,10 @@ class Resources:
    def labels(self) -> Optional[Dict[str, str]]:
        return self._labels

+   @property
+   def volumes(self) -> Optional[List[Dict[str, Any]]]:
+       return self._volumes
+
    @property
    def autostop_config(self) -> Optional[AutostopConfig]:
        """The requested autostop config.
@@ -759,6 +767,91 @@ class Resources:
    ) -> None:
        self._autostop_config = AutostopConfig.from_yaml_config(autostop)

+   def _set_volumes(
+       self,
+       volumes: Optional[List[Dict[str, Any]]],
+   ) -> None:
+       if not volumes:
+           self._volumes = None
+           return
+       valid_volumes = []
+       supported_tiers = [tier.value for tier in resources_utils.DiskTier]
+       supported_storage_types = [
+           storage_type.value for storage_type in resources_utils.StorageType
+       ]
+       supported_attach_modes = [
+           attach_mode.value for attach_mode in resources_utils.DiskAttachMode
+       ]
+       network_type = resources_utils.StorageType.NETWORK
+       read_write_mode = resources_utils.DiskAttachMode.READ_WRITE
+       for volume in volumes:
+           if 'path' not in volume:
+               with ux_utils.print_exception_no_traceback():
+                   raise ValueError(f'Invalid volume {volume!r}. '
+                                    f'Volume must have a "path" field.')
+           if 'storage_type' not in volume:
+               volume['storage_type'] = network_type
+           else:
+               if isinstance(volume['storage_type'], str):
+                   storage_type_str = str(volume['storage_type']).lower()
+                   if storage_type_str not in supported_storage_types:
+                       logger.warning(
+                           f'Invalid storage_type {storage_type_str!r}. '
+                           f'Set it to '
+                           f'{network_type.value}.')
+                       volume['storage_type'] = network_type
+                   else:
+                       volume['storage_type'] = resources_utils.StorageType(
+                           storage_type_str)
+           if 'auto_delete' not in volume:
+               volume['auto_delete'] = False
+           if 'attach_mode' in volume:
+               if isinstance(volume['attach_mode'], str):
+                   attach_mode_str = str(volume['attach_mode']).lower()
+                   if attach_mode_str not in supported_attach_modes:
+                       logger.warning(
+                           f'Invalid attach_mode {attach_mode_str!r}. '
+                           f'Set it to {read_write_mode.value}.')
+                       volume['attach_mode'] = read_write_mode
+                   else:
+                       volume['attach_mode'] = resources_utils.DiskAttachMode(
+                           attach_mode_str)
+           else:
+               volume['attach_mode'] = read_write_mode
+           if volume['storage_type'] == network_type:
+               if ('disk_size' in volume and
+                       round(volume['disk_size']) != volume['disk_size']):
+                   with ux_utils.print_exception_no_traceback():
+                       raise ValueError(f'Volume size must be an integer. '
+                                        f'Got: {volume["disk_size"]}.')
+               if 'name' not in volume:
+                   with ux_utils.print_exception_no_traceback():
+                       raise ValueError(f'Network volume {volume["path"]} '
+                                        f'must have "name" field.')
+           elif 'name' in volume:
+               logger.info(f'Volume {volume["path"]} is a local disk. '
+                           f'The "name" field will be ignored.')
+               del volume['name']
+           if 'disk_tier' in volume:
+               if isinstance(volume['disk_tier'], str):
+                   disk_tier_str = str(volume['disk_tier']).lower()
+                   if disk_tier_str not in supported_tiers:
+                       logger.warning(
+                           f'Invalid disk_tier {disk_tier_str!r}. '
+                           f'Set it to {resources_utils.DiskTier.BEST.value}.')
+                       volume['disk_tier'] = resources_utils.DiskTier.BEST
+                   else:
+                       volume['disk_tier'] = resources_utils.DiskTier(
+                           disk_tier_str)
+           elif volume['storage_type'] == network_type:
+               logger.debug(
+                   f'No disk_tier specified for volume {volume["path"]}. '
+                   f'Set it to {resources_utils.DiskTier.BEST.value}.')
+               volume['disk_tier'] = resources_utils.DiskTier.BEST
+
+           valid_volumes.append(volume)
+       self._volumes = valid_volumes
+
    def is_launchable(self) -> bool:
        """Returns whether the resource is launchable."""
        return self.cloud is not None and self._instance_type is not None
@@ -1123,6 +1216,48 @@ class Resources:
                    f'Disk tier {self.disk_tier.value} is not supported '
                    f'for instance type {self.instance_type}.') from None

+   def _try_validate_volumes(self) -> None:
+       """Try to validate the volumes attribute.
+
+       Raises:
+           ValueError: if the attribute is invalid.
+       """
+       if self.volumes is None:
+           return
+       if self.cloud is None:
+           with ux_utils.print_exception_no_traceback():
+               raise ValueError('Cloud must be specified when '
+                                'volumes are provided.')
+       if not self.cloud.is_same_cloud(clouds.GCP()):
+           with ux_utils.print_exception_no_traceback():
+               raise ValueError(f'Volumes are only supported for GCP,'
+                                f' not for {self.cloud}.')
+
+       need_region_or_zone = False
+       try:
+           for volume in self.volumes:
+               if ('name' in volume and volume['storage_type']
+                       == resources_utils.StorageType.NETWORK):
+                   need_region_or_zone = True
+               if 'disk_tier' not in volume:
+                   continue
+               # TODO(hailong): check instance local SSD
+               # support for instance_type.
+               # Refer to https://cloud.google.com/compute/docs/disks/local-ssd#machine-series-lssd # pylint: disable=line-too-long
+               self.cloud.check_disk_tier_enabled(self.instance_type,
+                                                  volume['disk_tier'])
+           if (need_region_or_zone and self._region is None and
+                   self._zone is None):
+               with ux_utils.print_exception_no_traceback():
+                   raise ValueError('When specifying the volume name, please'
+                                    ' also specify the region or zone.')
+       except exceptions.NotSupportedError:
+           with ux_utils.print_exception_no_traceback():
+               raise ValueError(
+                   f'Disk tier {volume["disk_tier"].value} is not '
+                   f'supported for instance type {self.instance_type}.'
+               ) from None
+
    def _try_validate_ports(self) -> None:
        """Try to validate the ports attribute.

@@ -1293,9 +1428,18 @@ class Resources:
            skypilot_config.get_nested(
                (str(self.cloud).lower(), 'specific_reservations'), set()))

+       if isinstance(self.cloud, clouds.DummyCloud):
+           return self.cloud.get_reservations_available_resources(
+               instance_type='',
+               region='',
+               zone=None,
+               specific_reservations=specific_reservations)
+
        assert (self.cloud is not None and self.instance_type is not None and
-               self.region is not None)
-
+               self.region is not None), (
+                   f'Cloud, instance type, region must be specified. '
+                   f'Resources={self}, cloud={self.cloud}, '
+                   f'instance_type={self.instance_type}, region={self.region}')
        return self.cloud.get_reservations_available_resources(
            self.instance_type, self.region, self.zone, specific_reservations)

@@ -1483,6 +1627,7 @@ class Resources:
            ports=override.pop('ports', self.ports),
            labels=override.pop('labels', self.labels),
            autostop=override.pop('autostop', current_autostop_config),
+           volumes=override.pop('volumes', self.volumes),
            infra=override.pop('infra', None),
            _docker_login_config=override.pop('_docker_login_config',
                                              self._docker_login_config),
@@ -1523,6 +1668,12 @@ class Resources:
            features.add(clouds.CloudImplementationFeatures.IMAGE_ID)
        if self.ports is not None:
            features.add(clouds.CloudImplementationFeatures.OPEN_PORTS)
+       if self.volumes is not None:
+           for volume in self.volumes:
+               if 'disk_tier' in volume and volume[
+                       'disk_tier'] != resources_utils.DiskTier.BEST:
+                   features.add(
+                       clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER)
        return features

    @staticmethod
@@ -1692,6 +1843,7 @@ class Resources:
        resources_fields['ports'] = config.pop('ports', None)
        resources_fields['labels'] = config.pop('labels', None)
        resources_fields['autostop'] = config.pop('autostop', None)
+       resources_fields['volumes'] = config.pop('volumes', None)
        resources_fields['_docker_login_config'] = config.pop(
            '_docker_login_config', None)
        resources_fields['_docker_username_for_runpod'] = config.pop(
@@ -1742,6 +1894,21 @@ class Resources:
            config['disk_tier'] = self.disk_tier.value
        add_if_not_none('ports', self.ports)
        add_if_not_none('labels', self.labels)
+       if self.volumes is not None:
+           # Convert DiskTier/StorageType enum to string value for each volume
+           volumes = []
+           for volume in self.volumes:
+               volume_copy = volume.copy()
+               if 'disk_tier' in volume_copy:
+                   volume_copy['disk_tier'] = volume_copy['disk_tier'].value
+               if 'storage_type' in volume_copy:
+                   volume_copy['storage_type'] = volume_copy[
+                       'storage_type'].value
+               if 'attach_mode' in volume_copy:
+                   volume_copy['attach_mode'] = volume_copy[
+                       'attach_mode'].value
+               volumes.append(volume_copy)
+           config['volumes'] = volumes
        if self._autostop_config is not None:
            config['autostop'] = self._autostop_config.to_yaml_config()
        if self._docker_login_config is not None:
@@ -1902,6 +2069,9 @@ class Resources:
        if version < 23:
            self._autostop_config = None

+       if version < 24:
+           self._volumes = None
+
        self.__dict__.update(state)
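Taken together, the Resources changes add a volumes argument that _set_volumes normalizes (defaults: storage_type=network, attach_mode=read_write, auto_delete=False, and disk_tier=best for network volumes) and that _try_validate_volumes restricts to GCP, requiring a region or zone when a network volume is named. A minimal sketch of constructing such a resource, with illustrative values not taken from the diff:

import sky

resources = sky.Resources(
    cloud=sky.GCP(),
    region='us-central1',  # needed because the network volume is named
    volumes=[{
        'path': '/mnt/data',        # required for every volume
        'name': 'my-data-disk',     # required for network volumes
        'storage_type': 'network',
        'attach_mode': 'read_write',
        'disk_tier': 'best',
    }],
)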
sky/serve/server/core.py
CHANGED
@@ -141,8 +141,7 @@ def up(
    # Always apply the policy again here, even though it might have been applied
    # in the CLI. This is to ensure that we apply the policy to the final DAG
    # and get the mutated config.
-   dag, mutated_user_config = admin_policy_utils.apply(
-       task, use_mutated_config_in_current_request=False)
+   dag, mutated_user_config = admin_policy_utils.apply(task)
    task = dag.tasks[0]

    with rich_utils.safe_status(
@@ -352,8 +351,7 @@ def update(
    # and get the mutated config.
    # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
    # will not apply the config.
-   dag, _ = admin_policy_utils.apply(
-       task, use_mutated_config_in_current_request=False)
+   dag, _ = admin_policy_utils.apply(task)
    task = dag.tasks[0]

    assert task.service is not None
sky/server/common.py
CHANGED
@@ -12,7 +12,7 @@ import subprocess
 import sys
 import time
 import typing
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Literal, Optional
 from urllib import parse
 import uuid

@@ -116,6 +116,7 @@ class ApiServerStatus(enum.Enum):
    HEALTHY = 'healthy'
    UNHEALTHY = 'unhealthy'
    VERSION_MISMATCH = 'version_mismatch'
+   NEEDS_AUTH = 'needs_auth'


@dataclasses.dataclass
@@ -127,15 +128,21 @@ class ApiServerInfo:
    commit: Optional[str] = None


+def get_api_cookie_jar_path() -> str:
+   return os.environ.get(server_constants.API_COOKIE_FILE_ENV_VAR,
+                         server_constants.API_COOKIE_FILE_DEFAULT_LOCATION)
+
+
 def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
    """Returns the cookie jar used by the client to access the API server."""
-   cookie_file = os.environ.get(server_constants.API_COOKIE_FILE_ENV_VAR)
    cookie_jar = requests.cookies.RequestsCookieJar()
-
+   cookie_file = get_api_cookie_jar_path()
+   if cookie_file:
        cookie_path = pathlib.Path(cookie_file).expanduser().resolve()
-
-
-
+       if cookie_path.exists():
+           file_cookie_jar = MozillaCookieJar(cookie_path)
+           file_cookie_jar.load()
+           cookie_jar.update(file_cookie_jar)
    return cookie_jar

@@ -196,6 +203,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
        response = requests.get(f'{server_url}/api/health',
                                timeout=2.5,
                                cookies=get_api_cookie_jar())
+       logger.debug(f'Health check status: {response.status_code}')
        if response.status_code == 200:
            try:
                result = response.json()
@@ -217,9 +225,24 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
                    server_info.status = ApiServerStatus.VERSION_MISMATCH
                return server_info
            except (json.JSONDecodeError, AttributeError) as e:
+               # Try to check if we got redirected to a login page.
+               for prev_response in response.history:
+                   logger.debug(f'Previous response: {prev_response.url}')
+                   # Heuristic: check if the url looks like a login page or
+                   # oauth flow.
+                   if any(key in prev_response.url
+                          for key in ['login', 'oauth2']):
+                       logger.debug(
+                           f'URL {prev_response.url} looks like '
+                           'a login page or oauth flow, so try to '
+                           'get the cookie.')
+                       return ApiServerInfo(
+                           status=ApiServerStatus.NEEDS_AUTH)
                logger.warning('Failed to parse API server response: '
                               f'{str(e)}')
                return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
+       elif response.status_code == 401:
+           return ApiServerInfo(status=ApiServerStatus.NEEDS_AUTH)
        else:
            return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
    except requests.exceptions.Timeout:
@@ -369,7 +392,12 @@ def _start_api_server(deploy: bool = False,
            f'SkyPilot API server started. {dashboard_msg}'))


-def check_server_healthy(endpoint: Optional[str] = None,) -> None:
+def check_server_healthy(
+   endpoint: Optional[str] = None
+) -> Literal[
+       # Use an incomplete list of Literals here to enforce raising for other
+       # enum values.
+       ApiServerStatus.HEALTHY, ApiServerStatus.NEEDS_AUTH]:
    """Check if the API server is healthy.

    Args:
@@ -379,6 +407,11 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
    Raises:
        RuntimeError: If the server is not healthy or the client version does
            not match the server version.
+
+   Returns:
+       ApiServerStatus: The status of the API server, unless the server is
+           unhealthy or the client version does not match the server version,
+           in which case an exception is raised.
    """
    endpoint = endpoint if endpoint is not None else get_server_url()
    api_server_info = get_api_server_status(endpoint)
@@ -441,6 +474,8 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:

        hinted_for_server_install_version_mismatch = True

+   return api_server_status
+

def _get_version_info_hint(server_info: ApiServerInfo) -> str:
    assert server_info.version is not None, 'Server version is None'
@@ -491,11 +526,13 @@ def get_skypilot_version_on_disk() -> str:
 def check_server_healthy_or_start_fn(deploy: bool = False,
                                      host: str = '127.0.0.1',
                                      foreground: bool = False):
+   api_server_status = None
    try:
-       check_server_healthy()
+       api_server_status = check_server_healthy()
    except exceptions.ApiServerConnectionError as exc:
        endpoint = get_server_url()
-       if not is_api_server_local():
+       if (not is_api_server_local() or
+               api_server_status == ApiServerStatus.NEEDS_AUTH):
            with ux_utils.print_exception_no_traceback():
                raise exceptions.ApiServerConnectionError(endpoint) from exc
    # Lock to prevent multiple processes from starting the server at the
sky/server/constants.py
CHANGED
@@ -26,6 +26,8 @@ CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60

 # Environment variable for a file path to the API cookie file.
 API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
+# Default file if unset.
+API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'

 # The path to the dashboard build output
 DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
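With the new constant, get_api_cookie_jar_path in sky/server/common.py falls back to ~/.sky/cookies.txt whenever the environment variable is unset. A quick sketch (the override path is hypothetical):

import os

from sky.server import common as server_common
from sky.server import constants as server_constants

# Env var unset: the default '~/.sky/cookies.txt' is returned.
print(server_common.get_api_cookie_jar_path())

# Hypothetical override via the env var declared above.
os.environ[server_constants.API_COOKIE_FILE_ENV_VAR] = '/tmp/cookies.txt'
assert server_common.get_api_cookie_jar_path() == '/tmp/cookies.txt'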
|