skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/cloud_vm_ray_backend.py +16 -4
  4. sky/check.py +109 -44
  5. sky/cli.py +261 -90
  6. sky/client/cli.py +261 -90
  7. sky/client/sdk.py +122 -3
  8. sky/clouds/__init__.py +5 -0
  9. sky/clouds/aws.py +4 -2
  10. sky/clouds/azure.py +4 -2
  11. sky/clouds/cloud.py +30 -6
  12. sky/clouds/cudo.py +2 -1
  13. sky/clouds/do.py +2 -1
  14. sky/clouds/fluidstack.py +2 -1
  15. sky/clouds/gcp.py +160 -23
  16. sky/clouds/ibm.py +4 -2
  17. sky/clouds/kubernetes.py +66 -22
  18. sky/clouds/lambda_cloud.py +2 -1
  19. sky/clouds/nebius.py +18 -2
  20. sky/clouds/oci.py +4 -2
  21. sky/clouds/paperspace.py +2 -1
  22. sky/clouds/runpod.py +2 -1
  23. sky/clouds/scp.py +2 -1
  24. sky/clouds/service_catalog/__init__.py +3 -0
  25. sky/clouds/service_catalog/common.py +9 -2
  26. sky/clouds/service_catalog/constants.py +2 -1
  27. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  28. sky/clouds/ssh.py +203 -0
  29. sky/clouds/vast.py +2 -1
  30. sky/clouds/vsphere.py +2 -1
  31. sky/core.py +59 -17
  32. sky/dashboard/out/404.html +1 -1
  33. sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
  35. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  36. sky/dashboard/out/clusters/[cluster].html +1 -1
  37. sky/dashboard/out/clusters.html +1 -1
  38. sky/dashboard/out/index.html +1 -1
  39. sky/dashboard/out/infra.html +1 -1
  40. sky/dashboard/out/jobs/[job].html +1 -1
  41. sky/dashboard/out/jobs.html +1 -1
  42. sky/data/storage.py +1 -0
  43. sky/execution.py +56 -7
  44. sky/jobs/server/core.py +4 -2
  45. sky/optimizer.py +29 -15
  46. sky/provision/__init__.py +1 -0
  47. sky/provision/aws/instance.py +17 -1
  48. sky/provision/gcp/constants.py +147 -4
  49. sky/provision/gcp/instance_utils.py +10 -0
  50. sky/provision/gcp/volume_utils.py +247 -0
  51. sky/provision/kubernetes/instance.py +16 -5
  52. sky/provision/kubernetes/utils.py +37 -19
  53. sky/provision/nebius/instance.py +3 -1
  54. sky/provision/nebius/utils.py +14 -2
  55. sky/provision/ssh/__init__.py +18 -0
  56. sky/resources.py +177 -4
  57. sky/serve/server/core.py +2 -4
  58. sky/server/common.py +46 -9
  59. sky/server/constants.py +2 -0
  60. sky/server/html/token_page.html +154 -0
  61. sky/server/requests/executor.py +3 -6
  62. sky/server/requests/payloads.py +7 -0
  63. sky/server/server.py +80 -8
  64. sky/setup_files/dependencies.py +1 -0
  65. sky/skypilot_config.py +117 -31
  66. sky/task.py +24 -1
  67. sky/templates/gcp-ray.yml.j2 +44 -1
  68. sky/templates/nebius-ray.yml.j2 +12 -2
  69. sky/utils/admin_policy_utils.py +26 -22
  70. sky/utils/context.py +36 -6
  71. sky/utils/context_utils.py +15 -0
  72. sky/utils/infra_utils.py +21 -1
  73. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  74. sky/utils/kubernetes/create_cluster.sh +1 -0
  75. sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
  76. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  77. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  78. sky/utils/log_utils.py +214 -1
  79. sky/utils/resources_utils.py +14 -0
  80. sky/utils/schemas.py +67 -0
  81. sky/utils/ux_utils.py +2 -1
  82. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
  83. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +88 -81
  84. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  85. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  86. /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
  87. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
  88. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
  89. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
  90. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
@@ -1137,6 +1137,11 @@ def get_accelerator_label_key_values(
1137
1137
  # support pollingthe clusters for autoscaling information, such as the
1138
1138
  # node pools configured etc.
1139
1139
 
1140
+ is_ssh_node_pool = context.startswith('ssh-') if context else False
1141
+ cloud_name = 'SSH Node Pool' if is_ssh_node_pool else 'Kubernetes cluster'
1142
+ context_display_name = context.lstrip('ssh-') if (
1143
+ context and is_ssh_node_pool) else context
1144
+
1140
1145
  autoscaler_type = get_autoscaler_type()
1141
1146
  if autoscaler_type is not None:
1142
1147
  # If autoscaler is set in config.yaml, override the label key and value
@@ -1176,13 +1181,17 @@ def get_accelerator_label_key_values(
1176
1181
  suffix = ''
1177
1182
  if env_options.Options.SHOW_DEBUG_INFO.get():
1178
1183
  suffix = f' Found node labels: {node_labels}'
1179
- raise exceptions.ResourcesUnavailableError(
1180
- 'Could not detect GPU labels in Kubernetes cluster. '
1181
- 'If this cluster has GPUs, please ensure GPU nodes have '
1182
- 'node labels of either of these formats: '
1183
- f'{supported_formats}. Please refer to '
1184
- 'the documentation on how to set up node labels.'
1185
- f'{suffix}')
1184
+ msg = (f'Could not detect GPU labels in {cloud_name}.')
1185
+ if not is_ssh_node_pool:
1186
+ msg += (' Run `sky check ssh` to debug.')
1187
+ else:
1188
+ msg += (
1189
+ ' If this cluster has GPUs, please ensure GPU nodes have '
1190
+ 'node labels of either of these formats: '
1191
+ f'{supported_formats}. Please refer to '
1192
+ 'the documentation on how to set up node labels.')
1193
+ msg += f'{suffix}'
1194
+ raise exceptions.ResourcesUnavailableError(msg)
1186
1195
  else:
1187
1196
  # Validate the label value on all nodes labels to ensure they are
1188
1197
  # correctly setup and will behave as expected.
@@ -1193,7 +1202,7 @@ def get_accelerator_label_key_values(
1193
1202
  value)
1194
1203
  if not is_valid:
1195
1204
  raise exceptions.ResourcesUnavailableError(
1196
- f'Node {node_name!r} in Kubernetes cluster has '
1205
+ f'Node {node_name!r} in {cloud_name} has '
1197
1206
  f'invalid GPU label: {label}={value}. {reason}')
1198
1207
  if check_mode:
1199
1208
  # If check mode is enabled and we reached so far, we can
@@ -1257,10 +1266,10 @@ def get_accelerator_label_key_values(
1257
1266
  # TODO(Doyoung): Update the error message raised with the
1258
1267
  # multi-host TPU support.
1259
1268
  raise exceptions.ResourcesUnavailableError(
1260
- 'Could not find any node in the Kubernetes cluster '
1269
+ f'Could not find any node in the {cloud_name} '
1261
1270
  f'with {acc_type}. Please ensure at least one node in the '
1262
1271
  f'cluster has {acc_type} and node labels are setup '
1263
- 'correctly. Please refer to the documentration for more. '
1272
+ 'correctly. Please refer to the documentation for more. '
1264
1273
  f'{suffix}. Note that multi-host TPU podslices are '
1265
1274
  'currently not unsupported.')
1266
1275
  else:
@@ -1270,15 +1279,24 @@ def get_accelerator_label_key_values(
1270
1279
  if env_options.Options.SHOW_DEBUG_INFO.get():
1271
1280
  suffix = (' Available resources on the cluster: '
1272
1281
  f'{cluster_resources}')
1273
- raise exceptions.ResourcesUnavailableError(
1274
- f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
1275
- f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
1276
- ' contains GPUs, please ensure GPU drivers are installed on '
1277
- 'the node. Check if the GPUs are setup correctly by running '
1278
- '`kubectl describe nodes` and looking for the '
1279
- f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
1280
- 'Please refer to the documentation on how to set up GPUs.'
1281
- f'{suffix}')
1282
+ if is_ssh_node_pool:
1283
+ msg = (
1284
+ f'Could not detect GPUs in SSH Node Pool '
1285
+ f'\'{context_display_name}\'. If this cluster contains '
1286
+ 'GPUs, please ensure GPU drivers are installed on the node '
1287
+ 'and re-run '
1288
+ f'`sky ssh up --infra {context_display_name}`. {suffix}')
1289
+ else:
1290
+ msg = (
1291
+ f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
1292
+ f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
1293
+ ' contains GPUs, please ensure GPU drivers are installed on '
1294
+ 'the node. Check if the GPUs are setup correctly by running '
1295
+ '`kubectl describe nodes` and looking for the '
1296
+ f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
1297
+ 'Please refer to the documentation on how to set up GPUs.'
1298
+ f'{suffix}')
1299
+ raise exceptions.ResourcesUnavailableError(msg)
1282
1300
  assert False, 'This should not be reached'
1283
1301
 
1284
1302
 
@@ -134,7 +134,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
134
134
  disk_size=config.node_config['DiskSize'],
135
135
  user_data=config.node_config['UserData'],
136
136
  associate_public_ip_address=(
137
- not config.provider_config['use_internal_ips']))
137
+ not config.provider_config['use_internal_ips']),
138
+ filesystems=config.node_config.get('filesystems', []),
139
+ )
138
140
  except Exception as e: # pylint: disable=broad-except
139
141
  logger.warning(f'run_instances error: {e}')
140
142
  raise
@@ -1,6 +1,6 @@
1
1
  """Nebius library wrapper for SkyPilot."""
2
2
  import time
3
- from typing import Any, Dict
3
+ from typing import Any, Dict, List
4
4
  import uuid
5
5
 
6
6
  from sky import sky_logging
@@ -158,7 +158,8 @@ def start(instance_id: str) -> None:
158
158
 
159
159
  def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
160
160
  preset: str, region: str, image_family: str, disk_size: int,
161
- user_data: str, associate_public_ip_address: bool) -> str:
161
+ user_data: str, associate_public_ip_address: bool,
162
+ filesystems: List[Dict[str, Any]]) -> str:
162
163
  # Each node must have a unique name to avoid conflicts between
163
164
  # multiple worker VMs. To ensure uniqueness,a UUID is appended
164
165
  # to the node name.
@@ -217,6 +218,16 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
217
218
  f' seconds) while waiting for disk {disk_name}'
218
219
  f' to be ready.')
219
220
 
221
+ filesystems_spec = []
222
+ if filesystems:
223
+ for fs in filesystems:
224
+ filesystems_spec.append(nebius.compute().AttachedFilesystemSpec(
225
+ mount_tag=fs['filesystem_mount_tag'],
226
+ attach_mode=nebius.compute().AttachedFilesystemSpec.AttachMode[
227
+ fs['filesystem_attach_mode']],
228
+ existing_filesystem=nebius.compute().ExistingFilesystem(
229
+ id=fs['filesystem_id'])))
230
+
220
231
  service = nebius.vpc().SubnetServiceClient(nebius.sdk())
221
232
  sub_net = service.list(nebius.vpc().ListSubnetsRequest(
222
233
  parent_id=project_id,)).wait()
@@ -237,6 +248,7 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
237
248
  cloud_init_user_data=user_data,
238
249
  resources=nebius.compute().ResourcesSpec(platform=platform,
239
250
  preset=preset),
251
+ filesystems=filesystems_spec if filesystems_spec else None,
240
252
  network_interfaces=[
241
253
  nebius.compute().NetworkInterfaceSpec(
242
254
  subnet_id=sub_net.items[0].metadata.id,
@@ -0,0 +1,18 @@
1
+ """SSH provisioner for SkyPilot.
2
+
3
+ This module implements the provisioner interface for SSH targets.
4
+ It reuses most of the functionality from the Kubernetes provisioner,
5
+ since the SSH implementation is based on Kubernetes under the hood.
6
+ """
7
+
8
+ from sky.provision.kubernetes.config import bootstrap_instances
9
+ from sky.provision.kubernetes.instance import get_cluster_info
10
+ from sky.provision.kubernetes.instance import get_command_runners
11
+ from sky.provision.kubernetes.instance import query_instances
12
+ from sky.provision.kubernetes.instance import run_instances
13
+ from sky.provision.kubernetes.instance import stop_instances
14
+ from sky.provision.kubernetes.instance import terminate_instances
15
+ from sky.provision.kubernetes.instance import wait_instances
16
+ from sky.provision.kubernetes.network import cleanup_ports
17
+ from sky.provision.kubernetes.network import open_ports
18
+ from sky.provision.kubernetes.network import query_ports
sky/resources.py CHANGED
@@ -98,7 +98,7 @@ class Resources:
98
98
  """
99
99
  # If any fields changed, increment the version. For backward compatibility,
100
100
  # modify the __setstate__ method to handle the old version.
101
- _VERSION = 23
101
+ _VERSION = 24
102
102
 
103
103
  def __init__(
104
104
  self,
@@ -120,6 +120,7 @@ class Resources:
120
120
  ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
121
121
  labels: Optional[Dict[str, str]] = None,
122
122
  autostop: Union[bool, int, Dict[str, Any], None] = None,
123
+ volumes: Optional[List[Dict[str, Any]]] = None,
123
124
  # Internal use only.
124
125
  # pylint: disable=invalid-name
125
126
  _docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
@@ -210,6 +211,7 @@ class Resources:
210
211
  not supported and will be ignored.
211
212
  autostop: the autostop configuration to use. For launched resources,
212
213
  may or may not correspond to the actual current autostop config.
214
+ volumes: the volumes to mount on the instance.
213
215
  _docker_login_config: the docker configuration to use. This includes
214
216
  the docker username, password, and registry server. If None, skip
215
217
  docker login.
@@ -337,6 +339,7 @@ class Resources:
337
339
  self._set_memory(memory)
338
340
  self._set_accelerators(accelerators, accelerator_args)
339
341
  self._set_autostop_config(autostop)
342
+ self._set_volumes(volumes)
340
343
 
341
344
  def validate(self):
342
345
  """Validate the resources and infer the missing fields if possible."""
@@ -347,6 +350,7 @@ class Resources:
347
350
  self._try_validate_managed_job_attributes()
348
351
  self._try_validate_image_id()
349
352
  self._try_validate_disk_tier()
353
+ self._try_validate_volumes()
350
354
  self._try_validate_ports()
351
355
  self._try_validate_labels()
352
356
 
@@ -448,7 +452,10 @@ class Resources:
448
452
  def repr_with_region_zone(self) -> str:
449
453
  region_str = ''
450
454
  if self.region is not None:
451
- region_str = f', region={self.region}'
455
+ region_name = self.region
456
+ if self.region.startswith('ssh-'):
457
+ region_name = self.region.lstrip('ssh-')
458
+ region_str = f', region={region_name}'
452
459
  zone_str = ''
453
460
  if self.zone is not None:
454
461
  zone_str = f', zone={self.zone}'
@@ -566,6 +573,10 @@ class Resources:
566
573
  def labels(self) -> Optional[Dict[str, str]]:
567
574
  return self._labels
568
575
 
576
+ @property
577
+ def volumes(self) -> Optional[List[Dict[str, Any]]]:
578
+ return self._volumes
579
+
569
580
  @property
570
581
  def autostop_config(self) -> Optional[AutostopConfig]:
571
582
  """The requested autostop config.
@@ -759,6 +770,91 @@ class Resources:
759
770
  ) -> None:
760
771
  self._autostop_config = AutostopConfig.from_yaml_config(autostop)
761
772
 
773
+ def _set_volumes(
774
+ self,
775
+ volumes: Optional[List[Dict[str, Any]]],
776
+ ) -> None:
777
+ if not volumes:
778
+ self._volumes = None
779
+ return
780
+ valid_volumes = []
781
+ supported_tiers = [tier.value for tier in resources_utils.DiskTier]
782
+ supported_storage_types = [
783
+ storage_type.value for storage_type in resources_utils.StorageType
784
+ ]
785
+ supported_attach_modes = [
786
+ attach_mode.value for attach_mode in resources_utils.DiskAttachMode
787
+ ]
788
+ network_type = resources_utils.StorageType.NETWORK
789
+ read_write_mode = resources_utils.DiskAttachMode.READ_WRITE
790
+ for volume in volumes:
791
+ if 'path' not in volume:
792
+ with ux_utils.print_exception_no_traceback():
793
+ raise ValueError(f'Invalid volume {volume!r}. '
794
+ f'Volume must have a "path" field.')
795
+ if 'storage_type' not in volume:
796
+ volume['storage_type'] = network_type
797
+ else:
798
+ if isinstance(volume['storage_type'], str):
799
+ storage_type_str = str(volume['storage_type']).lower()
800
+ if storage_type_str not in supported_storage_types:
801
+ logger.warning(
802
+ f'Invalid storage_type {storage_type_str!r}. '
803
+ f'Set it to '
804
+ f'{network_type.value}.')
805
+ volume['storage_type'] = network_type
806
+ else:
807
+ volume['storage_type'] = resources_utils.StorageType(
808
+ storage_type_str)
809
+ if 'auto_delete' not in volume:
810
+ volume['auto_delete'] = False
811
+ if 'attach_mode' in volume:
812
+ if isinstance(volume['attach_mode'], str):
813
+ attach_mode_str = str(volume['attach_mode']).lower()
814
+ if attach_mode_str not in supported_attach_modes:
815
+ logger.warning(
816
+ f'Invalid attach_mode {attach_mode_str!r}. '
817
+ f'Set it to {read_write_mode.value}.')
818
+ volume['attach_mode'] = read_write_mode
819
+ else:
820
+ volume['attach_mode'] = resources_utils.DiskAttachMode(
821
+ attach_mode_str)
822
+ else:
823
+ volume['attach_mode'] = read_write_mode
824
+ if volume['storage_type'] == network_type:
825
+ if ('disk_size' in volume and
826
+ round(volume['disk_size']) != volume['disk_size']):
827
+ with ux_utils.print_exception_no_traceback():
828
+ raise ValueError(f'Volume size must be an integer. '
829
+ f'Got: {volume["size"]}.')
830
+ if 'name' not in volume:
831
+ with ux_utils.print_exception_no_traceback():
832
+ raise ValueError(f'Network volume {volume["path"]} '
833
+ f'must have "name" field.')
834
+ elif 'name' in volume:
835
+ logger.info(f'Volume {volume["path"]} is a local disk. '
836
+ f'The "name" field will be ignored.')
837
+ del volume['name']
838
+ if 'disk_tier' in volume:
839
+ if isinstance(volume['disk_tier'], str):
840
+ disk_tier_str = str(volume['disk_tier']).lower()
841
+ if disk_tier_str not in supported_tiers:
842
+ logger.warning(
843
+ f'Invalid disk_tier {disk_tier_str!r}. '
844
+ f'Set it to {resources_utils.DiskTier.BEST.value}.')
845
+ volume['disk_tier'] = resources_utils.DiskTier.BEST
846
+ else:
847
+ volume['disk_tier'] = resources_utils.DiskTier(
848
+ disk_tier_str)
849
+ elif volume['storage_type'] == network_type:
850
+ logger.debug(
851
+ f'No disk_tier specified for volume {volume["path"]}. '
852
+ f'Set it to {resources_utils.DiskTier.BEST.value}.')
853
+ volume['disk_tier'] = resources_utils.DiskTier.BEST
854
+
855
+ valid_volumes.append(volume)
856
+ self._volumes = valid_volumes
857
+
762
858
  def is_launchable(self) -> bool:
763
859
  """Returns whether the resource is launchable."""
764
860
  return self.cloud is not None and self._instance_type is not None
@@ -1123,6 +1219,48 @@ class Resources:
1123
1219
  f'Disk tier {self.disk_tier.value} is not supported '
1124
1220
  f'for instance type {self.instance_type}.') from None
1125
1221
 
1222
+ def _try_validate_volumes(self) -> None:
1223
+ """Try to validate the volumes attribute.
1224
+
1225
+ Raises:
1226
+ ValueError: if the attribute is invalid.
1227
+ """
1228
+ if self.volumes is None:
1229
+ return
1230
+ if self.cloud is None:
1231
+ with ux_utils.print_exception_no_traceback():
1232
+ raise ValueError('Cloud must be specified when '
1233
+ 'volumes are provided.')
1234
+ if not self.cloud.is_same_cloud(clouds.GCP()):
1235
+ with ux_utils.print_exception_no_traceback():
1236
+ raise ValueError(f'Volumes are only supported for GCP'
1237
+ f' not for {self.cloud}.')
1238
+
1239
+ need_region_or_zone = False
1240
+ try:
1241
+ for volume in self.volumes:
1242
+ if ('name' in volume and volume['storage_type']
1243
+ == resources_utils.StorageType.NETWORK):
1244
+ need_region_or_zone = True
1245
+ if 'disk_tier' not in volume:
1246
+ continue
1247
+ # TODO(hailong): check instance local SSD
1248
+ # support for instance_type.
1249
+ # Refer to https://cloud.google.com/compute/docs/disks/local-ssd#machine-series-lssd # pylint: disable=line-too-long
1250
+ self.cloud.check_disk_tier_enabled(self.instance_type,
1251
+ volume['disk_tier'])
1252
+ if (need_region_or_zone and self._region is None and
1253
+ self._zone is None):
1254
+ with ux_utils.print_exception_no_traceback():
1255
+ raise ValueError('When specifying the volume name, please'
1256
+ ' also specify the region or zone.')
1257
+ except exceptions.NotSupportedError:
1258
+ with ux_utils.print_exception_no_traceback():
1259
+ raise ValueError(
1260
+ f'Disk tier {volume["disk_tier"].value} is not '
1261
+ f'supported for instance type {self.instance_type}.'
1262
+ ) from None
1263
+
1126
1264
  def _try_validate_ports(self) -> None:
1127
1265
  """Try to validate the ports attribute.
1128
1266
 
@@ -1293,9 +1431,18 @@ class Resources:
1293
1431
  skypilot_config.get_nested(
1294
1432
  (str(self.cloud).lower(), 'specific_reservations'), set()))
1295
1433
 
1434
+ if isinstance(self.cloud, clouds.DummyCloud):
1435
+ return self.cloud.get_reservations_available_resources(
1436
+ instance_type='',
1437
+ region='',
1438
+ zone=None,
1439
+ specific_reservations=specific_reservations)
1440
+
1296
1441
  assert (self.cloud is not None and self.instance_type is not None and
1297
- self.region
1298
- is not None), ('Cloud, instance type, region must be specified')
1442
+ self.region is not None), (
1443
+ f'Cloud, instance type, region must be specified. '
1444
+ f'Resources={self}, cloud={self.cloud}, '
1445
+ f'instance_type={self.instance_type}, region={self.region}')
1299
1446
  return self.cloud.get_reservations_available_resources(
1300
1447
  self.instance_type, self.region, self.zone, specific_reservations)
1301
1448
 
@@ -1483,6 +1630,7 @@ class Resources:
1483
1630
  ports=override.pop('ports', self.ports),
1484
1631
  labels=override.pop('labels', self.labels),
1485
1632
  autostop=override.pop('autostop', current_autostop_config),
1633
+ volumes=override.pop('volumes', self.volumes),
1486
1634
  infra=override.pop('infra', None),
1487
1635
  _docker_login_config=override.pop('_docker_login_config',
1488
1636
  self._docker_login_config),
@@ -1523,6 +1671,12 @@ class Resources:
1523
1671
  features.add(clouds.CloudImplementationFeatures.IMAGE_ID)
1524
1672
  if self.ports is not None:
1525
1673
  features.add(clouds.CloudImplementationFeatures.OPEN_PORTS)
1674
+ if self.volumes is not None:
1675
+ for volume in self.volumes:
1676
+ if 'disk_tier' in volume and volume[
1677
+ 'disk_tier'] != resources_utils.DiskTier.BEST:
1678
+ features.add(
1679
+ clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER)
1526
1680
  return features
1527
1681
 
1528
1682
  @staticmethod
@@ -1692,6 +1846,7 @@ class Resources:
1692
1846
  resources_fields['ports'] = config.pop('ports', None)
1693
1847
  resources_fields['labels'] = config.pop('labels', None)
1694
1848
  resources_fields['autostop'] = config.pop('autostop', None)
1849
+ resources_fields['volumes'] = config.pop('volumes', None)
1695
1850
  resources_fields['_docker_login_config'] = config.pop(
1696
1851
  '_docker_login_config', None)
1697
1852
  resources_fields['_docker_username_for_runpod'] = config.pop(
@@ -1742,6 +1897,21 @@ class Resources:
1742
1897
  config['disk_tier'] = self.disk_tier.value
1743
1898
  add_if_not_none('ports', self.ports)
1744
1899
  add_if_not_none('labels', self.labels)
1900
+ if self.volumes is not None:
1901
+ # Convert DiskTier/StorageType enum to string value for each volume
1902
+ volumes = []
1903
+ for volume in self.volumes:
1904
+ volume_copy = volume.copy()
1905
+ if 'disk_tier' in volume_copy:
1906
+ volume_copy['disk_tier'] = volume_copy['disk_tier'].value
1907
+ if 'storage_type' in volume_copy:
1908
+ volume_copy['storage_type'] = volume_copy[
1909
+ 'storage_type'].value
1910
+ if 'attach_mode' in volume_copy:
1911
+ volume_copy['attach_mode'] = volume_copy[
1912
+ 'attach_mode'].value
1913
+ volumes.append(volume_copy)
1914
+ config['volumes'] = volumes
1745
1915
  if self._autostop_config is not None:
1746
1916
  config['autostop'] = self._autostop_config.to_yaml_config()
1747
1917
  if self._docker_login_config is not None:
@@ -1902,6 +2072,9 @@ class Resources:
1902
2072
  if version < 23:
1903
2073
  self._autostop_config = None
1904
2074
 
2075
+ if version < 24:
2076
+ self._volumes = None
2077
+
1905
2078
  self.__dict__.update(state)
1906
2079
 
1907
2080
 
sky/serve/server/core.py CHANGED
@@ -141,8 +141,7 @@ def up(
141
141
  # Always apply the policy again here, even though it might have been applied
142
142
  # in the CLI. This is to ensure that we apply the policy to the final DAG
143
143
  # and get the mutated config.
144
- dag, mutated_user_config = admin_policy_utils.apply(
145
- task, use_mutated_config_in_current_request=False)
144
+ dag, mutated_user_config = admin_policy_utils.apply(task)
146
145
  task = dag.tasks[0]
147
146
 
148
147
  with rich_utils.safe_status(
@@ -352,8 +351,7 @@ def update(
352
351
  # and get the mutated config.
353
352
  # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
354
353
  # will not apply the config.
355
- dag, _ = admin_policy_utils.apply(
356
- task, use_mutated_config_in_current_request=False)
354
+ dag, _ = admin_policy_utils.apply(task)
357
355
  task = dag.tasks[0]
358
356
 
359
357
  assert task.service is not None
sky/server/common.py CHANGED
@@ -12,7 +12,7 @@ import subprocess
12
12
  import sys
13
13
  import time
14
14
  import typing
15
- from typing import Any, Dict, Optional
15
+ from typing import Any, Dict, Literal, Optional
16
16
  from urllib import parse
17
17
  import uuid
18
18
 
@@ -116,6 +116,7 @@ class ApiServerStatus(enum.Enum):
116
116
  HEALTHY = 'healthy'
117
117
  UNHEALTHY = 'unhealthy'
118
118
  VERSION_MISMATCH = 'version_mismatch'
119
+ NEEDS_AUTH = 'needs_auth'
119
120
 
120
121
 
121
122
  @dataclasses.dataclass
@@ -127,15 +128,21 @@ class ApiServerInfo:
127
128
  commit: Optional[str] = None
128
129
 
129
130
 
131
+ def get_api_cookie_jar_path() -> str:
132
+ return os.environ.get(server_constants.API_COOKIE_FILE_ENV_VAR,
133
+ server_constants.API_COOKIE_FILE_DEFAULT_LOCATION)
134
+
135
+
130
136
  def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
131
137
  """Returns the cookie jar used by the client to access the API server."""
132
- cookie_file = os.environ.get(server_constants.API_COOKIE_FILE_ENV_VAR)
133
138
  cookie_jar = requests.cookies.RequestsCookieJar()
134
- if cookie_file and os.path.exists(cookie_file):
139
+ cookie_file = get_api_cookie_jar_path()
140
+ if cookie_file:
135
141
  cookie_path = pathlib.Path(cookie_file).expanduser().resolve()
136
- file_cookie_jar = MozillaCookieJar(cookie_path)
137
- file_cookie_jar.load()
138
- cookie_jar.update(file_cookie_jar)
142
+ if cookie_path.exists():
143
+ file_cookie_jar = MozillaCookieJar(cookie_path)
144
+ file_cookie_jar.load()
145
+ cookie_jar.update(file_cookie_jar)
139
146
  return cookie_jar
140
147
 
141
148
 
@@ -196,6 +203,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
196
203
  response = requests.get(f'{server_url}/api/health',
197
204
  timeout=2.5,
198
205
  cookies=get_api_cookie_jar())
206
+ logger.debug(f'Health check status: {response.status_code}')
199
207
  if response.status_code == 200:
200
208
  try:
201
209
  result = response.json()
@@ -217,9 +225,24 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
217
225
  server_info.status = ApiServerStatus.VERSION_MISMATCH
218
226
  return server_info
219
227
  except (json.JSONDecodeError, AttributeError) as e:
228
+ # Try to check if we got redirected to a login page.
229
+ for prev_response in response.history:
230
+ logger.debug(f'Previous response: {prev_response.url}')
231
+ # Heuristic: check if the url looks like a login page or
232
+ # oauth flow.
233
+ if any(key in prev_response.url
234
+ for key in ['login', 'oauth2']):
235
+ logger.debug(
236
+ f'URL {prev_response.url} looks like '
237
+ 'a login page or oauth flow, so try to '
238
+ 'get the cookie.')
239
+ return ApiServerInfo(
240
+ status=ApiServerStatus.NEEDS_AUTH)
220
241
  logger.warning('Failed to parse API server response: '
221
242
  f'{str(e)}')
222
243
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
244
+ elif response.status_code == 401:
245
+ return ApiServerInfo(status=ApiServerStatus.NEEDS_AUTH)
223
246
  else:
224
247
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
225
248
  except requests.exceptions.Timeout:
@@ -369,7 +392,12 @@ def _start_api_server(deploy: bool = False,
369
392
  f'SkyPilot API server started. {dashboard_msg}'))
370
393
 
371
394
 
372
- def check_server_healthy(endpoint: Optional[str] = None,) -> None:
395
+ def check_server_healthy(
396
+ endpoint: Optional[str] = None
397
+ ) -> Literal[
398
+ # Use an incomplete list of Literals here to enforce raising for other
399
+ # enum values.
400
+ ApiServerStatus.HEALTHY, ApiServerStatus.NEEDS_AUTH]:
373
401
  """Check if the API server is healthy.
374
402
 
375
403
  Args:
@@ -379,6 +407,11 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
379
407
  Raises:
380
408
  RuntimeError: If the server is not healthy or the client version does
381
409
  not match the server version.
410
+
411
+ Returns:
412
+ ApiServerStatus: The status of the API server, unless the server is
413
+ unhealthy or the client version does not match the server version,
414
+ in which case an exception is raised.
382
415
  """
383
416
  endpoint = endpoint if endpoint is not None else get_server_url()
384
417
  api_server_info = get_api_server_status(endpoint)
@@ -441,6 +474,8 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
441
474
 
442
475
  hinted_for_server_install_version_mismatch = True
443
476
 
477
+ return api_server_status
478
+
444
479
 
445
480
  def _get_version_info_hint(server_info: ApiServerInfo) -> str:
446
481
  assert server_info.version is not None, 'Server version is None'
@@ -491,11 +526,13 @@ def get_skypilot_version_on_disk() -> str:
491
526
  def check_server_healthy_or_start_fn(deploy: bool = False,
492
527
  host: str = '127.0.0.1',
493
528
  foreground: bool = False):
529
+ api_server_status = None
494
530
  try:
495
- check_server_healthy()
531
+ api_server_status = check_server_healthy()
496
532
  except exceptions.ApiServerConnectionError as exc:
497
533
  endpoint = get_server_url()
498
- if not is_api_server_local():
534
+ if (not is_api_server_local() or
535
+ api_server_status == ApiServerStatus.NEEDS_AUTH):
499
536
  with ux_utils.print_exception_no_traceback():
500
537
  raise exceptions.ApiServerConnectionError(endpoint) from exc
501
538
  # Lock to prevent multiple processes from starting the server at the
sky/server/constants.py CHANGED
@@ -26,6 +26,8 @@ CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
26
26
 
27
27
  # Environment variable for a file path to the API cookie file.
28
28
  API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
29
+ # Default file if unset.
30
+ API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
29
31
 
30
32
  # The path to the dashboard build output
31
33
  DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',