skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +46 -16
- sky/backends/cloud_vm_ray_backend.py +16 -4
- sky/check.py +109 -44
- sky/cli.py +261 -90
- sky/client/cli.py +261 -90
- sky/client/sdk.py +122 -3
- sky/clouds/__init__.py +5 -0
- sky/clouds/aws.py +4 -2
- sky/clouds/azure.py +4 -2
- sky/clouds/cloud.py +30 -6
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +2 -1
- sky/clouds/gcp.py +160 -23
- sky/clouds/ibm.py +4 -2
- sky/clouds/kubernetes.py +66 -22
- sky/clouds/lambda_cloud.py +2 -1
- sky/clouds/nebius.py +18 -2
- sky/clouds/oci.py +4 -2
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +2 -1
- sky/clouds/service_catalog/__init__.py +3 -0
- sky/clouds/service_catalog/common.py +9 -2
- sky/clouds/service_catalog/constants.py +2 -1
- sky/clouds/service_catalog/ssh_catalog.py +167 -0
- sky/clouds/ssh.py +203 -0
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +59 -17
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/data/storage.py +1 -0
- sky/execution.py +56 -7
- sky/jobs/server/core.py +4 -2
- sky/optimizer.py +29 -15
- sky/provision/__init__.py +1 -0
- sky/provision/aws/instance.py +17 -1
- sky/provision/gcp/constants.py +147 -4
- sky/provision/gcp/instance_utils.py +10 -0
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/kubernetes/instance.py +16 -5
- sky/provision/kubernetes/utils.py +37 -19
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +14 -2
- sky/provision/ssh/__init__.py +18 -0
- sky/resources.py +177 -4
- sky/serve/server/core.py +2 -4
- sky/server/common.py +46 -9
- sky/server/constants.py +2 -0
- sky/server/html/token_page.html +154 -0
- sky/server/requests/executor.py +3 -6
- sky/server/requests/payloads.py +7 -0
- sky/server/server.py +80 -8
- sky/setup_files/dependencies.py +1 -0
- sky/skypilot_config.py +117 -31
- sky/task.py +24 -1
- sky/templates/gcp-ray.yml.j2 +44 -1
- sky/templates/nebius-ray.yml.j2 +12 -2
- sky/utils/admin_policy_utils.py +26 -22
- sky/utils/context.py +36 -6
- sky/utils/context_utils.py +15 -0
- sky/utils/infra_utils.py +21 -1
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/create_cluster.sh +1 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
- sky/utils/kubernetes/ssh-tunnel.sh +387 -0
- sky/utils/log_utils.py +214 -1
- sky/utils/resources_utils.py +14 -0
- sky/utils/schemas.py +67 -0
- sky/utils/ux_utils.py +2 -1
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +88 -81
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
@@ -1137,6 +1137,11 @@ def get_accelerator_label_key_values(
     # support pollingthe clusters for autoscaling information, such as the
     # node pools configured etc.

+    is_ssh_node_pool = context.startswith('ssh-') if context else False
+    cloud_name = 'SSH Node Pool' if is_ssh_node_pool else 'Kubernetes cluster'
+    context_display_name = context.lstrip('ssh-') if (
+        context and is_ssh_node_pool) else context
+
     autoscaler_type = get_autoscaler_type()
     if autoscaler_type is not None:
         # If autoscaler is set in config.yaml, override the label key and value
@@ -1176,13 +1181,17 @@ def get_accelerator_label_key_values(
         suffix = ''
         if env_options.Options.SHOW_DEBUG_INFO.get():
             suffix = f' Found node labels: {node_labels}'
-
-
-
-
-
-
-
+        msg = (f'Could not detect GPU labels in {cloud_name}.')
+        if not is_ssh_node_pool:
+            msg += (' Run `sky check ssh` to debug.')
+        else:
+            msg += (
+                ' If this cluster has GPUs, please ensure GPU nodes have '
+                'node labels of either of these formats: '
+                f'{supported_formats}. Please refer to '
+                'the documentation on how to set up node labels.')
+        msg += f'{suffix}'
+        raise exceptions.ResourcesUnavailableError(msg)
     else:
         # Validate the label value on all nodes labels to ensure they are
         # correctly setup and will behave as expected.
@@ -1193,7 +1202,7 @@ def get_accelerator_label_key_values(
                         value)
                 if not is_valid:
                     raise exceptions.ResourcesUnavailableError(
-                        f'Node {node_name!r} in
+                        f'Node {node_name!r} in {cloud_name} has '
                         f'invalid GPU label: {label}={value}. {reason}')
             if check_mode:
                 # If check mode is enabled and we reached so far, we can
@@ -1257,10 +1266,10 @@ def get_accelerator_label_key_values(
                     # TODO(Doyoung): Update the error message raised with the
                     # multi-host TPU support.
                     raise exceptions.ResourcesUnavailableError(
-                        'Could not find any node in the
+                        f'Could not find any node in the {cloud_name} '
                         f'with {acc_type}. Please ensure at least one node in the '
                         f'cluster has {acc_type} and node labels are setup '
-                        'correctly. Please refer to the
+                        'correctly. Please refer to the documentation for more. '
                        f'{suffix}. Note that multi-host TPU podslices are '
                         'currently not unsupported.')
                 else:
@@ -1270,15 +1279,24 @@ def get_accelerator_label_key_values(
         if env_options.Options.SHOW_DEBUG_INFO.get():
             suffix = (' Available resources on the cluster: '
                       f'{cluster_resources}')
-
-
-
-
-
-
-
-
-
+        if is_ssh_node_pool:
+            msg = (
+                f'Could not detect GPUs in SSH Node Pool '
+                f'\'{context_display_name}\'. If this cluster contains '
+                'GPUs, please ensure GPU drivers are installed on the node '
+                'and re-run '
+                f'`sky ssh up --infra {context_display_name}`. {suffix}')
+        else:
+            msg = (
+                f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
+                f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
+                ' contains GPUs, please ensure GPU drivers are installed on '
+                'the node. Check if the GPUs are setup correctly by running '
+                '`kubectl describe nodes` and looking for the '
+                f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
+                'Please refer to the documentation on how to set up GPUs.'
+                f'{suffix}')
+        raise exceptions.ResourcesUnavailableError(msg)
     assert False, 'This should not be reached'

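For readers skimming the hunks above: the new SSH Node Pool handling keys entirely off an `ssh-` prefix on the Kubernetes context name. A minimal standalone sketch of that naming logic (the helper function name is illustrative, not part of SkyPilot):

# Mirror of the naming logic added above (illustrative helper).
def describe_context(context):
    is_ssh_node_pool = context.startswith('ssh-') if context else False
    cloud_name = 'SSH Node Pool' if is_ssh_node_pool else 'Kubernetes cluster'
    display_name = context.lstrip('ssh-') if (context and
                                              is_ssh_node_pool) else context
    return cloud_name, display_name

print(describe_context('ssh-my-pool'))  # ('SSH Node Pool', 'my-pool')
print(describe_context('gke_my-ctx'))   # ('Kubernetes cluster', 'gke_my-ctx')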
sky/provision/nebius/instance.py
CHANGED
@@ -134,7 +134,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
             disk_size=config.node_config['DiskSize'],
             user_data=config.node_config['UserData'],
             associate_public_ip_address=(
-                not config.provider_config['use_internal_ips'])
+                not config.provider_config['use_internal_ips']),
+            filesystems=config.node_config.get('filesystems', []),
+        )
     except Exception as e:  # pylint: disable=broad-except
         logger.warning(f'run_instances error: {e}')
         raise
sky/provision/nebius/utils.py
CHANGED
@@ -1,6 +1,6 @@
 """Nebius library wrapper for SkyPilot."""
 import time
-from typing import Any, Dict
+from typing import Any, Dict, List
 import uuid

 from sky import sky_logging
@@ -158,7 +158,8 @@ def start(instance_id: str) -> None:

 def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
            preset: str, region: str, image_family: str, disk_size: int,
-           user_data: str, associate_public_ip_address: bool
+           user_data: str, associate_public_ip_address: bool,
+           filesystems: List[Dict[str, Any]]) -> str:
     # Each node must have a unique name to avoid conflicts between
     # multiple worker VMs. To ensure uniqueness,a UUID is appended
     # to the node name.
@@ -217,6 +218,16 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
                 f' seconds) while waiting for disk {disk_name}'
                 f' to be ready.')

+    filesystems_spec = []
+    if filesystems:
+        for fs in filesystems:
+            filesystems_spec.append(nebius.compute().AttachedFilesystemSpec(
+                mount_tag=fs['filesystem_mount_tag'],
+                attach_mode=nebius.compute().AttachedFilesystemSpec.AttachMode[
+                    fs['filesystem_attach_mode']],
+                existing_filesystem=nebius.compute().ExistingFilesystem(
+                    id=fs['filesystem_id'])))
+
     service = nebius.vpc().SubnetServiceClient(nebius.sdk())
     sub_net = service.list(nebius.vpc().ListSubnetsRequest(
         parent_id=project_id,)).wait()
@@ -237,6 +248,7 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
         cloud_init_user_data=user_data,
         resources=nebius.compute().ResourcesSpec(platform=platform,
                                                  preset=preset),
+        filesystems=filesystems_spec if filesystems_spec else None,
         network_interfaces=[
             nebius.compute().NetworkInterfaceSpec(
                 subnet_id=sub_net.items[0].metadata.id,
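The new `filesystems` plumbing above expects each entry in `config.node_config['filesystems']` to carry a filesystem id, an attach mode, and a mount tag. A rough sketch of one such entry with placeholder values (the id is hypothetical, and the attach-mode string must name a member of the Nebius SDK's `AttachedFilesystemSpec.AttachMode` enum):

# Illustrative shape of one node_config['filesystems'] entry consumed by
# launch() above; values are placeholders, not taken from the diff.
filesystem_entry = {
    'filesystem_id': 'computefilesystem-0123456789',  # existing Nebius filesystem id (placeholder)
    'filesystem_attach_mode': 'READ_WRITE',           # assumed AttachMode enum member name
    'filesystem_mount_tag': 'data',                   # tag used to mount the device inside the VM
}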
sky/provision/ssh/__init__.py
ADDED
@@ -0,0 +1,18 @@
+"""SSH provisioner for SkyPilot.
+
+This module implements the provisioner interface for SSH targets.
+It reuses most of the functionality from the Kubernetes provisioner,
+since the SSH implementation is based on Kubernetes under the hood.
+"""
+
+from sky.provision.kubernetes.config import bootstrap_instances
+from sky.provision.kubernetes.instance import get_cluster_info
+from sky.provision.kubernetes.instance import get_command_runners
+from sky.provision.kubernetes.instance import query_instances
+from sky.provision.kubernetes.instance import run_instances
+from sky.provision.kubernetes.instance import stop_instances
+from sky.provision.kubernetes.instance import terminate_instances
+from sky.provision.kubernetes.instance import wait_instances
+from sky.provision.kubernetes.network import cleanup_ports
+from sky.provision.kubernetes.network import open_ports
+from sky.provision.kubernetes.network import query_ports
sky/resources.py
CHANGED
@@ -98,7 +98,7 @@ class Resources:
     """
     # If any fields changed, increment the version. For backward compatibility,
     # modify the __setstate__ method to handle the old version.
-    _VERSION =
+    _VERSION = 24

     def __init__(
         self,
@@ -120,6 +120,7 @@ class Resources:
         ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
         labels: Optional[Dict[str, str]] = None,
         autostop: Union[bool, int, Dict[str, Any], None] = None,
+        volumes: Optional[List[Dict[str, Any]]] = None,
         # Internal use only.
         # pylint: disable=invalid-name
         _docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
@@ -210,6 +211,7 @@ class Resources:
             not supported and will be ignored.
           autostop: the autostop configuration to use. For launched resources,
             may or may not correspond to the actual current autostop config.
+          volumes: the volumes to mount on the instance.
           _docker_login_config: the docker configuration to use. This includes
             the docker username, password, and registry server. If None, skip
             docker login.
@@ -337,6 +339,7 @@ class Resources:
         self._set_memory(memory)
         self._set_accelerators(accelerators, accelerator_args)
         self._set_autostop_config(autostop)
+        self._set_volumes(volumes)

     def validate(self):
         """Validate the resources and infer the missing fields if possible."""
@@ -347,6 +350,7 @@ class Resources:
         self._try_validate_managed_job_attributes()
         self._try_validate_image_id()
         self._try_validate_disk_tier()
+        self._try_validate_volumes()
         self._try_validate_ports()
         self._try_validate_labels()

@@ -448,7 +452,10 @@ class Resources:
     def repr_with_region_zone(self) -> str:
         region_str = ''
         if self.region is not None:
-
+            region_name = self.region
+            if self.region.startswith('ssh-'):
+                region_name = self.region.lstrip('ssh-')
+            region_str = f', region={region_name}'
         zone_str = ''
         if self.zone is not None:
             zone_str = f', zone={self.zone}'
@@ -566,6 +573,10 @@ class Resources:
     def labels(self) -> Optional[Dict[str, str]]:
         return self._labels

+    @property
+    def volumes(self) -> Optional[List[Dict[str, Any]]]:
+        return self._volumes
+
     @property
     def autostop_config(self) -> Optional[AutostopConfig]:
         """The requested autostop config.
@@ -759,6 +770,91 @@ class Resources:
     ) -> None:
         self._autostop_config = AutostopConfig.from_yaml_config(autostop)

+    def _set_volumes(
+        self,
+        volumes: Optional[List[Dict[str, Any]]],
+    ) -> None:
+        if not volumes:
+            self._volumes = None
+            return
+        valid_volumes = []
+        supported_tiers = [tier.value for tier in resources_utils.DiskTier]
+        supported_storage_types = [
+            storage_type.value for storage_type in resources_utils.StorageType
+        ]
+        supported_attach_modes = [
+            attach_mode.value for attach_mode in resources_utils.DiskAttachMode
+        ]
+        network_type = resources_utils.StorageType.NETWORK
+        read_write_mode = resources_utils.DiskAttachMode.READ_WRITE
+        for volume in volumes:
+            if 'path' not in volume:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(f'Invalid volume {volume!r}. '
+                                     f'Volume must have a "path" field.')
+            if 'storage_type' not in volume:
+                volume['storage_type'] = network_type
+            else:
+                if isinstance(volume['storage_type'], str):
+                    storage_type_str = str(volume['storage_type']).lower()
+                    if storage_type_str not in supported_storage_types:
+                        logger.warning(
+                            f'Invalid storage_type {storage_type_str!r}. '
+                            f'Set it to '
+                            f'{network_type.value}.')
+                        volume['storage_type'] = network_type
+                    else:
+                        volume['storage_type'] = resources_utils.StorageType(
+                            storage_type_str)
+            if 'auto_delete' not in volume:
+                volume['auto_delete'] = False
+            if 'attach_mode' in volume:
+                if isinstance(volume['attach_mode'], str):
+                    attach_mode_str = str(volume['attach_mode']).lower()
+                    if attach_mode_str not in supported_attach_modes:
+                        logger.warning(
+                            f'Invalid attach_mode {attach_mode_str!r}. '
+                            f'Set it to {read_write_mode.value}.')
+                        volume['attach_mode'] = read_write_mode
+                    else:
+                        volume['attach_mode'] = resources_utils.DiskAttachMode(
+                            attach_mode_str)
+            else:
+                volume['attach_mode'] = read_write_mode
+            if volume['storage_type'] == network_type:
+                if ('disk_size' in volume and
+                        round(volume['disk_size']) != volume['disk_size']):
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(f'Volume size must be an integer. '
+                                         f'Got: {volume["size"]}.')
+                if 'name' not in volume:
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(f'Network volume {volume["path"]} '
+                                         f'must have "name" field.')
+            elif 'name' in volume:
+                logger.info(f'Volume {volume["path"]} is a local disk. '
+                            f'The "name" field will be ignored.')
+                del volume['name']
+            if 'disk_tier' in volume:
+                if isinstance(volume['disk_tier'], str):
+                    disk_tier_str = str(volume['disk_tier']).lower()
+                    if disk_tier_str not in supported_tiers:
+                        logger.warning(
+                            f'Invalid disk_tier {disk_tier_str!r}. '
+                            f'Set it to {resources_utils.DiskTier.BEST.value}.')
+                        volume['disk_tier'] = resources_utils.DiskTier.BEST
+                    else:
+                        volume['disk_tier'] = resources_utils.DiskTier(
+                            disk_tier_str)
+            elif volume['storage_type'] == network_type:
+                logger.debug(
+                    f'No disk_tier specified for volume {volume["path"]}. '
+                    f'Set it to {resources_utils.DiskTier.BEST.value}.')
+                volume['disk_tier'] = resources_utils.DiskTier.BEST
+
+            valid_volumes.append(volume)
+        self._volumes = valid_volumes
+
     def is_launchable(self) -> bool:
         """Returns whether the resource is launchable."""
         return self.cloud is not None and self._instance_type is not None
@@ -1123,6 +1219,48 @@ class Resources:
                     f'Disk tier {self.disk_tier.value} is not supported '
                     f'for instance type {self.instance_type}.') from None

+    def _try_validate_volumes(self) -> None:
+        """Try to validate the volumes attribute.
+
+        Raises:
+            ValueError: if the attribute is invalid.
+        """
+        if self.volumes is None:
+            return
+        if self.cloud is None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cloud must be specified when '
+                                 'volumes are provided.')
+        if not self.cloud.is_same_cloud(clouds.GCP()):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'Volumes are only supported for GCP'
+                                 f' not for {self.cloud}.')
+
+        need_region_or_zone = False
+        try:
+            for volume in self.volumes:
+                if ('name' in volume and volume['storage_type']
+                        == resources_utils.StorageType.NETWORK):
+                    need_region_or_zone = True
+                if 'disk_tier' not in volume:
+                    continue
+                # TODO(hailong): check instance local SSD
+                # support for instance_type.
+                # Refer to https://cloud.google.com/compute/docs/disks/local-ssd#machine-series-lssd # pylint: disable=line-too-long
+                self.cloud.check_disk_tier_enabled(self.instance_type,
+                                                   volume['disk_tier'])
+            if (need_region_or_zone and self._region is None and
+                    self._zone is None):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError('When specifying the volume name, please'
+                                     ' also specify the region or zone.')
+        except exceptions.NotSupportedError:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    f'Disk tier {volume["disk_tier"].value} is not '
+                    f'supported for instance type {self.instance_type}.'
+                ) from None
+
     def _try_validate_ports(self) -> None:
         """Try to validate the ports attribute.

@@ -1293,9 +1431,18 @@ class Resources:
             skypilot_config.get_nested(
                 (str(self.cloud).lower(), 'specific_reservations'), set()))

+        if isinstance(self.cloud, clouds.DummyCloud):
+            return self.cloud.get_reservations_available_resources(
+                instance_type='',
+                region='',
+                zone=None,
+                specific_reservations=specific_reservations)
+
         assert (self.cloud is not None and self.instance_type is not None and
-                self.region
-
+                self.region is not None), (
+                    f'Cloud, instance type, region must be specified. '
+                    f'Resources={self}, cloud={self.cloud}, '
+                    f'instance_type={self.instance_type}, region={self.region}')
         return self.cloud.get_reservations_available_resources(
             self.instance_type, self.region, self.zone, specific_reservations)

@@ -1483,6 +1630,7 @@ class Resources:
             ports=override.pop('ports', self.ports),
             labels=override.pop('labels', self.labels),
             autostop=override.pop('autostop', current_autostop_config),
+            volumes=override.pop('volumes', self.volumes),
             infra=override.pop('infra', None),
             _docker_login_config=override.pop('_docker_login_config',
                                               self._docker_login_config),
@@ -1523,6 +1671,12 @@ class Resources:
             features.add(clouds.CloudImplementationFeatures.IMAGE_ID)
         if self.ports is not None:
             features.add(clouds.CloudImplementationFeatures.OPEN_PORTS)
+        if self.volumes is not None:
+            for volume in self.volumes:
+                if 'disk_tier' in volume and volume[
+                        'disk_tier'] != resources_utils.DiskTier.BEST:
+                    features.add(
+                        clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER)
         return features

     @staticmethod
@@ -1692,6 +1846,7 @@ class Resources:
         resources_fields['ports'] = config.pop('ports', None)
         resources_fields['labels'] = config.pop('labels', None)
         resources_fields['autostop'] = config.pop('autostop', None)
+        resources_fields['volumes'] = config.pop('volumes', None)
         resources_fields['_docker_login_config'] = config.pop(
             '_docker_login_config', None)
         resources_fields['_docker_username_for_runpod'] = config.pop(
@@ -1742,6 +1897,21 @@ class Resources:
             config['disk_tier'] = self.disk_tier.value
         add_if_not_none('ports', self.ports)
         add_if_not_none('labels', self.labels)
+        if self.volumes is not None:
+            # Convert DiskTier/StorageType enum to string value for each volume
+            volumes = []
+            for volume in self.volumes:
+                volume_copy = volume.copy()
+                if 'disk_tier' in volume_copy:
+                    volume_copy['disk_tier'] = volume_copy['disk_tier'].value
+                if 'storage_type' in volume_copy:
+                    volume_copy['storage_type'] = volume_copy[
+                        'storage_type'].value
+                if 'attach_mode' in volume_copy:
+                    volume_copy['attach_mode'] = volume_copy[
+                        'attach_mode'].value
+                volumes.append(volume_copy)
+            config['volumes'] = volumes
         if self._autostop_config is not None:
             config['autostop'] = self._autostop_config.to_yaml_config()
         if self._docker_login_config is not None:
@@ -1902,6 +2072,9 @@ class Resources:
         if version < 23:
             self._autostop_config = None

+        if version < 24:
+            self._volumes = None
+
         self.__dict__.update(state)

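Putting the sky/resources.py changes together: `Resources` now accepts a `volumes` list of dicts, normalized by `_set_volumes` and validated by `_try_validate_volumes` above (GCP only in this release; a named network volume also requires a region or zone). A hedged sketch of what a caller might pass, with placeholder disk name, region and mount path, assuming the long-standing `cloud=`/`region=` keyword arguments and that the enum string values are 'network', 'best' and 'read_write':

import sky

resources = sky.Resources(
    cloud=sky.GCP(),        # volumes are validated for GCP only in this release
    region='us-central1',   # required here because the network volume is named
    volumes=[{
        'path': '/mnt/data',          # mount point; required for every volume
        'name': 'my-existing-disk',   # required when storage_type is 'network'
        'storage_type': 'network',    # defaults to 'network' if omitted
        'disk_tier': 'best',          # defaults to 'best' for network volumes
        'attach_mode': 'read_write',  # defaults to read-write if omitted
        'auto_delete': False,         # defaults to False
    }],
)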
sky/serve/server/core.py
CHANGED
@@ -141,8 +141,7 @@ def up(
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
-    dag, mutated_user_config = admin_policy_utils.apply(
-        task, use_mutated_config_in_current_request=False)
+    dag, mutated_user_config = admin_policy_utils.apply(task)
     task = dag.tasks[0]

     with rich_utils.safe_status(
@@ -352,8 +351,7 @@ def update(
     # and get the mutated config.
     # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
     # will not apply the config.
-    dag, _ = admin_policy_utils.apply(
-        task, use_mutated_config_in_current_request=False)
+    dag, _ = admin_policy_utils.apply(task)
     task = dag.tasks[0]

     assert task.service is not None
sky/server/common.py
CHANGED
@@ -12,7 +12,7 @@ import subprocess
 import sys
 import time
 import typing
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Literal, Optional
 from urllib import parse
 import uuid

@@ -116,6 +116,7 @@ class ApiServerStatus(enum.Enum):
     HEALTHY = 'healthy'
     UNHEALTHY = 'unhealthy'
     VERSION_MISMATCH = 'version_mismatch'
+    NEEDS_AUTH = 'needs_auth'


 @dataclasses.dataclass
@@ -127,15 +128,21 @@ class ApiServerInfo:
     commit: Optional[str] = None


+def get_api_cookie_jar_path() -> str:
+    return os.environ.get(server_constants.API_COOKIE_FILE_ENV_VAR,
+                          server_constants.API_COOKIE_FILE_DEFAULT_LOCATION)
+
+
 def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
     """Returns the cookie jar used by the client to access the API server."""
-    cookie_file = os.environ.get(server_constants.API_COOKIE_FILE_ENV_VAR)
     cookie_jar = requests.cookies.RequestsCookieJar()
-
+    cookie_file = get_api_cookie_jar_path()
+    if cookie_file:
         cookie_path = pathlib.Path(cookie_file).expanduser().resolve()
-
-
-
+        if cookie_path.exists():
+            file_cookie_jar = MozillaCookieJar(cookie_path)
+            file_cookie_jar.load()
+            cookie_jar.update(file_cookie_jar)
     return cookie_jar


@@ -196,6 +203,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
         response = requests.get(f'{server_url}/api/health',
                                 timeout=2.5,
                                 cookies=get_api_cookie_jar())
+        logger.debug(f'Health check status: {response.status_code}')
         if response.status_code == 200:
             try:
                 result = response.json()
@@ -217,9 +225,24 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
                     server_info.status = ApiServerStatus.VERSION_MISMATCH
                     return server_info
             except (json.JSONDecodeError, AttributeError) as e:
+                # Try to check if we got redirected to a login page.
+                for prev_response in response.history:
+                    logger.debug(f'Previous response: {prev_response.url}')
+                    # Heuristic: check if the url looks like a login page or
+                    # oauth flow.
+                    if any(key in prev_response.url
+                           for key in ['login', 'oauth2']):
+                        logger.debug(
+                            f'URL {prev_response.url} looks like '
+                            'a login page or oauth flow, so try to '
+                            'get the cookie.')
+                        return ApiServerInfo(
+                            status=ApiServerStatus.NEEDS_AUTH)
                 logger.warning('Failed to parse API server response: '
                                f'{str(e)}')
                 return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
+        elif response.status_code == 401:
+            return ApiServerInfo(status=ApiServerStatus.NEEDS_AUTH)
         else:
             return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
     except requests.exceptions.Timeout:
@@ -369,7 +392,12 @@ def _start_api_server(deploy: bool = False,
                 f'SkyPilot API server started. {dashboard_msg}'))


-def check_server_healthy(
+def check_server_healthy(
+    endpoint: Optional[str] = None
+) -> Literal[
+        # Use an incomplete list of Literals here to enforce raising for other
+        # enum values.
+        ApiServerStatus.HEALTHY, ApiServerStatus.NEEDS_AUTH]:
     """Check if the API server is healthy.

     Args:
@@ -379,6 +407,11 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
     Raises:
         RuntimeError: If the server is not healthy or the client version does
             not match the server version.
+
+    Returns:
+        ApiServerStatus: The status of the API server, unless the server is
+            unhealthy or the client version does not match the server version,
+            in which case an exception is raised.
     """
     endpoint = endpoint if endpoint is not None else get_server_url()
     api_server_info = get_api_server_status(endpoint)
@@ -441,6 +474,8 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:

         hinted_for_server_install_version_mismatch = True

+    return api_server_status
+

 def _get_version_info_hint(server_info: ApiServerInfo) -> str:
     assert server_info.version is not None, 'Server version is None'
@@ -491,11 +526,13 @@ def get_skypilot_version_on_disk() -> str:
 def check_server_healthy_or_start_fn(deploy: bool = False,
                                      host: str = '127.0.0.1',
                                      foreground: bool = False):
+    api_server_status = None
     try:
-        check_server_healthy()
+        api_server_status = check_server_healthy()
     except exceptions.ApiServerConnectionError as exc:
         endpoint = get_server_url()
-        if not is_api_server_local()
+        if (not is_api_server_local() or
+                api_server_status == ApiServerStatus.NEEDS_AUTH):
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.ApiServerConnectionError(endpoint) from exc
         # Lock to prevent multiple processes from starting the server at the
sky/server/constants.py
CHANGED
@@ -26,6 +26,8 @@ CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60

 # Environment variable for a file path to the API cookie file.
 API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
+# Default file if unset.
+API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'

 # The path to the dashboard build output
 DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
|