skypilot-nightly 1.0.0.dev20250520__py3-none-any.whl → 1.0.0.dev20250522__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +56 -37
  4. sky/check.py +3 -3
  5. sky/cli.py +89 -16
  6. sky/client/cli.py +89 -16
  7. sky/client/sdk.py +92 -4
  8. sky/clouds/__init__.py +2 -0
  9. sky/clouds/cloud.py +6 -0
  10. sky/clouds/gcp.py +156 -21
  11. sky/clouds/service_catalog/__init__.py +3 -0
  12. sky/clouds/service_catalog/common.py +9 -2
  13. sky/clouds/service_catalog/constants.py +1 -0
  14. sky/core.py +6 -8
  15. sky/dashboard/out/404.html +1 -1
  16. sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/{678-206dddca808e6d16.js → 582-683f4f27b81996dc.js} +2 -2
  22. sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +1 -0
  29. sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +3 -0
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra.html +1 -0
  35. sky/dashboard/out/jobs/[job].html +1 -1
  36. sky/dashboard/out/jobs.html +1 -1
  37. sky/data/storage.py +1 -0
  38. sky/execution.py +57 -8
  39. sky/jobs/server/core.py +5 -3
  40. sky/jobs/utils.py +38 -7
  41. sky/optimizer.py +41 -39
  42. sky/provision/gcp/constants.py +147 -4
  43. sky/provision/gcp/instance_utils.py +10 -0
  44. sky/provision/gcp/volume_utils.py +247 -0
  45. sky/provision/provisioner.py +16 -7
  46. sky/resources.py +233 -18
  47. sky/serve/serve_utils.py +5 -13
  48. sky/serve/server/core.py +2 -4
  49. sky/server/common.py +60 -14
  50. sky/server/constants.py +2 -0
  51. sky/server/html/token_page.html +154 -0
  52. sky/server/requests/executor.py +3 -6
  53. sky/server/requests/payloads.py +3 -3
  54. sky/server/server.py +40 -8
  55. sky/skypilot_config.py +117 -31
  56. sky/task.py +24 -1
  57. sky/templates/gcp-ray.yml.j2 +44 -1
  58. sky/templates/nebius-ray.yml.j2 +0 -2
  59. sky/utils/admin_policy_utils.py +26 -22
  60. sky/utils/cli_utils/status_utils.py +95 -56
  61. sky/utils/common_utils.py +35 -2
  62. sky/utils/context.py +36 -6
  63. sky/utils/context_utils.py +15 -0
  64. sky/utils/infra_utils.py +175 -0
  65. sky/utils/resources_utils.py +55 -21
  66. sky/utils/schemas.py +111 -5
  67. {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/METADATA +1 -1
  68. {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/RECORD +73 -68
  69. {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/WHEEL +1 -1
  70. sky/dashboard/out/_next/static/8hlc2dkbIDDBOkxtEW7X6/_buildManifest.js +0 -1
  71. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  72. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  73. sky/dashboard/out/_next/static/chunks/845-0ca6f2c1ba667c3b.js +0 -1
  74. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  75. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  76. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  78. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  79. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  80. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  81. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  82. /sky/dashboard/out/_next/static/{8hlc2dkbIDDBOkxtEW7X6 → CzOVV6JpRQBRt5GhZuhyK}/_ssgManifest.js +0 -0
  83. {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/entry_points.txt +0 -0
  84. {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/licenses/LICENSE +0 -0
  85. {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/top_level.txt +0 -0
sky/provision/provisioner.py CHANGED
@@ -17,6 +17,7 @@ from sky import clouds
17
17
  from sky import exceptions
18
18
  from sky import provision
19
19
  from sky import sky_logging
20
+ from sky import skypilot_config
20
21
  from sky.adaptors import aws
21
22
  from sky.backends import backend_utils
22
23
  from sky.provision import common as provision_common
@@ -228,9 +229,9 @@ def _ssh_probe_command(ip: str,
228
229
  ssh_port: int,
229
230
  ssh_user: str,
230
231
  ssh_private_key: str,
232
+ ssh_probe_timeout: int,
231
233
  ssh_proxy_command: Optional[str] = None) -> List[str]:
232
- # NOTE: Ray uses 'uptime' command and 10s timeout, we use the same
233
- # setting here.
234
+ # NOTE: Ray uses 'uptime' command, we use the same setting here.
234
235
  command = [
235
236
  'ssh',
236
237
  '-T',
@@ -244,7 +245,7 @@ def _ssh_probe_command(ip: str,
244
245
  '-o',
245
246
  'PasswordAuthentication=no',
246
247
  '-o',
247
- 'ConnectTimeout=10s',
248
+ f'ConnectTimeout={ssh_probe_timeout}s',
248
249
  '-o',
249
250
  f'UserKnownHostsFile={os.devnull}',
250
251
  '-o',
@@ -277,6 +278,7 @@ def _wait_ssh_connection_direct(ip: str,
277
278
  ssh_port: int,
278
279
  ssh_user: str,
279
280
  ssh_private_key: str,
281
+ ssh_probe_timeout: int,
280
282
  ssh_control_name: Optional[str] = None,
281
283
  ssh_proxy_command: Optional[str] = None,
282
284
  **kwargs) -> Tuple[bool, str]:
@@ -305,6 +307,7 @@ def _wait_ssh_connection_direct(ip: str,
305
307
  if success:
306
308
  return _wait_ssh_connection_indirect(ip, ssh_port, ssh_user,
307
309
  ssh_private_key,
310
+ ssh_probe_timeout,
308
311
  ssh_control_name,
309
312
  ssh_proxy_command)
310
313
  except socket.timeout: # this is the most expected exception
@@ -312,7 +315,7 @@ def _wait_ssh_connection_direct(ip: str,
312
315
  except Exception as e: # pylint: disable=broad-except
313
316
  stderr = f'Error: {common_utils.format_exception(e)}'
314
317
  command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
315
- ssh_proxy_command)
318
+ ssh_probe_timeout, ssh_proxy_command)
316
319
  logger.debug(f'Waiting for SSH to {ip}. Try: '
317
320
  f'{_shlex_join(command)}. '
318
321
  f'{stderr}')
@@ -323,6 +326,7 @@ def _wait_ssh_connection_indirect(ip: str,
323
326
  ssh_port: int,
324
327
  ssh_user: str,
325
328
  ssh_private_key: str,
329
+ ssh_probe_timeout: int,
326
330
  ssh_control_name: Optional[str] = None,
327
331
  ssh_proxy_command: Optional[str] = None,
328
332
  **kwargs) -> Tuple[bool, str]:
@@ -333,14 +337,14 @@ def _wait_ssh_connection_indirect(ip: str,
333
337
  """
334
338
  del ssh_control_name, kwargs # unused
335
339
  command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
336
- ssh_proxy_command)
340
+ ssh_probe_timeout, ssh_proxy_command)
337
341
  message = f'Waiting for SSH using command: {_shlex_join(command)}'
338
342
  logger.debug(message)
339
343
  try:
340
344
  proc = subprocess.run(command,
341
345
  shell=False,
342
346
  check=False,
343
- timeout=10,
347
+ timeout=ssh_probe_timeout,
344
348
  stdout=subprocess.DEVNULL,
345
349
  stderr=subprocess.PIPE)
346
350
  if proc.returncode != 0:
@@ -383,8 +387,13 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
383
387
  def _retry_ssh_thread(ip_ssh_port: Tuple[str, int]):
384
388
  ip, ssh_port = ip_ssh_port
385
389
  success = False
390
+ ssh_probe_timeout = skypilot_config.get_nested(
391
+ ('provision', 'ssh_timeout'), 10)
386
392
  while not success:
387
- success, stderr = waiter(ip, ssh_port, **ssh_credentials)
393
+ success, stderr = waiter(ip,
394
+ ssh_port,
395
+ **ssh_credentials,
396
+ ssh_probe_timeout=ssh_probe_timeout)
388
397
  if not success and time.time() - start > timeout:
389
398
  with ux_utils.print_exception_no_traceback():
390
399
  raise RuntimeError(
sky/resources.py CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
6
6
 
7
7
  import colorama
8
8
 
9
+ import sky
9
10
  from sky import check as sky_check
10
11
  from sky import clouds
11
12
  from sky import exceptions
@@ -20,6 +21,7 @@ from sky.utils import accelerator_registry
20
21
  from sky.utils import annotations
21
22
  from sky.utils import common_utils
22
23
  from sky.utils import config_utils
24
+ from sky.utils import infra_utils
23
25
  from sky.utils import log_utils
24
26
  from sky.utils import registry
25
27
  from sky.utils import resources_utils
@@ -96,7 +98,7 @@ class Resources:
96
98
  """
97
99
  # If any fields changed, increment the version. For backward compatibility,
98
100
  # modify the __setstate__ method to handle the old version.
99
- _VERSION = 23
101
+ _VERSION = 24
100
102
 
101
103
  def __init__(
102
104
  self,
@@ -106,6 +108,7 @@ class Resources:
106
108
  memory: Union[None, int, float, str] = None,
107
109
  accelerators: Union[None, str, Dict[str, Union[int, float]]] = None,
108
110
  accelerator_args: Optional[Dict[str, str]] = None,
111
+ infra: Optional[str] = None,
109
112
  use_spot: Optional[bool] = None,
110
113
  job_recovery: Optional[Union[Dict[str, Optional[Union[str, int]]],
111
114
  str]] = None,
@@ -117,6 +120,7 @@ class Resources:
117
120
  ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
118
121
  labels: Optional[Dict[str, str]] = None,
119
122
  autostop: Union[bool, int, Dict[str, Any], None] = None,
123
+ volumes: Optional[List[Dict[str, Any]]] = None,
120
124
  # Internal use only.
121
125
  # pylint: disable=invalid-name
122
126
  _docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
@@ -134,9 +138,9 @@ class Resources:
134
138
  .. code-block:: python
135
139
 
136
140
  # Fully specified cloud and instance type (is_launchable() is True).
137
- sky.Resources(clouds.AWS(), 'p3.2xlarge')
138
- sky.Resources(clouds.GCP(), 'n1-standard-16')
139
- sky.Resources(clouds.GCP(), 'n1-standard-8', 'V100')
141
+ sky.Resources(infra='aws', instance_type='p3.2xlarge')
142
+ sky.Resources(infra='k8s/my-cluster-ctx', accelerators='V100')
143
+ sky.Resources(infra='gcp/us-central1', accelerators='V100')
140
144
 
141
145
  # Specifying required resources; the system decides the
142
146
  # cloud/instance type. The below are equivalent:
@@ -145,8 +149,9 @@ class Resources:
145
149
  sky.Resources(accelerators={'V100': 1})
146
150
  sky.Resources(cpus='2+', memory='16+', accelerators='V100')
147
151
 
152
+
148
153
  Args:
149
- cloud: the cloud to use.
154
+ cloud: the cloud to use. Deprecated. Use `infra` instead.
150
155
  instance_type: the instance type to use.
151
156
  cpus: the number of CPUs required for the task.
152
157
  If a str, must be a string of the form ``'2'`` or ``'2+'``, where
@@ -160,6 +165,11 @@ class Resources:
160
165
  dict of the form ``{'V100': 2}`` or ``{'tpu-v2-8': 1}``.
161
166
  accelerator_args: accelerator-specific arguments. For example,
162
167
  ``{'tpu_vm': True, 'runtime_version': 'tpu-vm-base'}`` for TPUs.
168
+ infra: a string specifying the infrastructure to use, in the format
169
+ of "cloud/region" or "cloud/region/zone". For example,
170
+ `aws/us-east-1` or `k8s/my-cluster-ctx`. This is an alternative to
171
+ specifying cloud, region, and zone separately. If provided, it
172
+ takes precedence over cloud, region, and zone parameters.
163
173
  use_spot: whether to use spot instances. If None, defaults to
164
174
  False.
165
175
  job_recovery: the job recovery strategy to use for the managed
@@ -172,8 +182,8 @@ class Resources:
172
182
  - max_restarts_on_errors: the max number of restarts on user code
173
183
  errors.
174
184
 
175
- region: the region to use.
176
- zone: the zone to use.
185
+ region: the region to use. Deprecated. Use `infra` instead.
186
+ zone: the zone to use. Deprecated. Use `infra` instead.
177
187
  image_id: the image ID to use. If a str, must be a string
178
188
  of the image id from the cloud, such as AWS:
179
189
  ``'ami-1234567890abcdef0'``, GCP:
@@ -201,6 +211,7 @@ class Resources:
201
211
  not supported and will be ignored.
202
212
  autostop: the autostop configuration to use. For launched resources,
203
213
  may or may not correspond to the actual current autostop config.
214
+ volumes: the volumes to mount on the instance.
204
215
  _docker_login_config: the docker configuration to use. This includes
205
216
  the docker username, password, and registry server. If None, skip
206
217
  docker login.
@@ -218,6 +229,25 @@ class Resources:
218
229
  exceptions.NoCloudAccessError: if no public cloud is enabled.
219
230
  """
220
231
  self._version = self._VERSION
232
+
233
+ if infra is not None and (cloud is not None or region is not None or
234
+ zone is not None):
235
+ with ux_utils.print_exception_no_traceback():
236
+ raise ValueError('Cannot specify both `infra` and `cloud`, '
237
+ '`region`, or `zone` parameters. '
238
+ f'Got: infra={infra}, cloud={cloud}, '
239
+ f'region={region}, zone={zone}')
240
+
241
+ # Infra is user facing, and cloud, region, zone in parameters are for
242
+ # backward compatibility. Internally, we keep using cloud, region, zone
243
+ # for simplicity.
244
+ if infra is not None:
245
+ infra_info = infra_utils.InfraInfo.from_str(infra)
246
+ # Infra takes precedence over individually specified parameters
247
+ cloud = sky.CLOUD_REGISTRY.from_str(infra_info.cloud)
248
+ region = infra_info.region
249
+ zone = infra_info.zone
250
+
221
251
  self._cloud = cloud
222
252
  self._region: Optional[str] = region
223
253
  self._zone: Optional[str] = zone
@@ -309,6 +339,7 @@ class Resources:
309
339
  self._set_memory(memory)
310
340
  self._set_accelerators(accelerators, accelerator_args)
311
341
  self._set_autostop_config(autostop)
342
+ self._set_volumes(volumes)
312
343
 
313
344
  def validate(self):
314
345
  """Validate the resources and infer the missing fields if possible."""
@@ -319,6 +350,7 @@ class Resources:
319
350
  self._try_validate_managed_job_attributes()
320
351
  self._try_validate_image_id()
321
352
  self._try_validate_disk_tier()
353
+ self._try_validate_volumes()
322
354
  self._try_validate_ports()
323
355
  self._try_validate_labels()
324
356
 
@@ -431,6 +463,11 @@ class Resources:
431
463
  repr_str += f'{region_str}{zone_str}'
432
464
  return repr_str
433
465
 
466
+ @property
467
+ def infra(self) -> infra_utils.InfraInfo:
468
+ cloud = str(self.cloud) if self.cloud is not None else None
469
+ return infra_utils.InfraInfo(cloud, self.region, self.zone)
470
+
434
471
  @property
435
472
  def cloud(self) -> Optional[clouds.Cloud]:
436
473
  return self._cloud
@@ -486,9 +523,9 @@ class Resources:
486
523
  def accelerators(self) -> Optional[Dict[str, Union[int, float]]]:
487
524
  """Returns the accelerators field directly or by inferring.
488
525
 
489
- For example, Resources(AWS, 'p3.2xlarge') has its accelerators field
490
- set to None, but this function will infer {'V100': 1} from the instance
491
- type.
526
+ For example, Resources(infra='aws', instance_type='p3.2xlarge') has its
527
+ accelerators field set to None, but this function will infer {'V100': 1}
528
+ from the instance type.
492
529
  """
493
530
  if self._accelerators is not None:
494
531
  return self._accelerators
@@ -533,6 +570,10 @@ class Resources:
533
570
  def labels(self) -> Optional[Dict[str, str]]:
534
571
  return self._labels
535
572
 
573
+ @property
574
+ def volumes(self) -> Optional[List[Dict[str, Any]]]:
575
+ return self._volumes
576
+
536
577
  @property
537
578
  def autostop_config(self) -> Optional[AutostopConfig]:
538
579
  """The requested autostop config.
@@ -726,6 +767,91 @@ class Resources:
726
767
  ) -> None:
727
768
  self._autostop_config = AutostopConfig.from_yaml_config(autostop)
728
769
 
770
+ def _set_volumes(
771
+ self,
772
+ volumes: Optional[List[Dict[str, Any]]],
773
+ ) -> None:
774
+ if not volumes:
775
+ self._volumes = None
776
+ return
777
+ valid_volumes = []
778
+ supported_tiers = [tier.value for tier in resources_utils.DiskTier]
779
+ supported_storage_types = [
780
+ storage_type.value for storage_type in resources_utils.StorageType
781
+ ]
782
+ supported_attach_modes = [
783
+ attach_mode.value for attach_mode in resources_utils.DiskAttachMode
784
+ ]
785
+ network_type = resources_utils.StorageType.NETWORK
786
+ read_write_mode = resources_utils.DiskAttachMode.READ_WRITE
787
+ for volume in volumes:
788
+ if 'path' not in volume:
789
+ with ux_utils.print_exception_no_traceback():
790
+ raise ValueError(f'Invalid volume {volume!r}. '
791
+ f'Volume must have a "path" field.')
792
+ if 'storage_type' not in volume:
793
+ volume['storage_type'] = network_type
794
+ else:
795
+ if isinstance(volume['storage_type'], str):
796
+ storage_type_str = str(volume['storage_type']).lower()
797
+ if storage_type_str not in supported_storage_types:
798
+ logger.warning(
799
+ f'Invalid storage_type {storage_type_str!r}. '
800
+ f'Set it to '
801
+ f'{network_type.value}.')
802
+ volume['storage_type'] = network_type
803
+ else:
804
+ volume['storage_type'] = resources_utils.StorageType(
805
+ storage_type_str)
806
+ if 'auto_delete' not in volume:
807
+ volume['auto_delete'] = False
808
+ if 'attach_mode' in volume:
809
+ if isinstance(volume['attach_mode'], str):
810
+ attach_mode_str = str(volume['attach_mode']).lower()
811
+ if attach_mode_str not in supported_attach_modes:
812
+ logger.warning(
813
+ f'Invalid attach_mode {attach_mode_str!r}. '
814
+ f'Set it to {read_write_mode.value}.')
815
+ volume['attach_mode'] = read_write_mode
816
+ else:
817
+ volume['attach_mode'] = resources_utils.DiskAttachMode(
818
+ attach_mode_str)
819
+ else:
820
+ volume['attach_mode'] = read_write_mode
821
+ if volume['storage_type'] == network_type:
822
+ if ('disk_size' in volume and
823
+ round(volume['disk_size']) != volume['disk_size']):
824
+ with ux_utils.print_exception_no_traceback():
825
+ raise ValueError(f'Volume size must be an integer. '
826
+ f'Got: {volume["size"]}.')
827
+ if 'name' not in volume:
828
+ with ux_utils.print_exception_no_traceback():
829
+ raise ValueError(f'Network volume {volume["path"]} '
830
+ f'must have "name" field.')
831
+ elif 'name' in volume:
832
+ logger.info(f'Volume {volume["path"]} is a local disk. '
833
+ f'The "name" field will be ignored.')
834
+ del volume['name']
835
+ if 'disk_tier' in volume:
836
+ if isinstance(volume['disk_tier'], str):
837
+ disk_tier_str = str(volume['disk_tier']).lower()
838
+ if disk_tier_str not in supported_tiers:
839
+ logger.warning(
840
+ f'Invalid disk_tier {disk_tier_str!r}. '
841
+ f'Set it to {resources_utils.DiskTier.BEST.value}.')
842
+ volume['disk_tier'] = resources_utils.DiskTier.BEST
843
+ else:
844
+ volume['disk_tier'] = resources_utils.DiskTier(
845
+ disk_tier_str)
846
+ elif volume['storage_type'] == network_type:
847
+ logger.debug(
848
+ f'No disk_tier specified for volume {volume["path"]}. '
849
+ f'Set it to {resources_utils.DiskTier.BEST.value}.')
850
+ volume['disk_tier'] = resources_utils.DiskTier.BEST
851
+
852
+ valid_volumes.append(volume)
853
+ self._volumes = valid_volumes
854
+
729
855
  def is_launchable(self) -> bool:
730
856
  """Returns whether the resource is launchable."""
731
857
  return self.cloud is not None and self._instance_type is not None
@@ -1090,6 +1216,48 @@ class Resources:
1090
1216
  f'Disk tier {self.disk_tier.value} is not supported '
1091
1217
  f'for instance type {self.instance_type}.') from None
1092
1218
 
1219
+ def _try_validate_volumes(self) -> None:
1220
+ """Try to validate the volumes attribute.
1221
+
1222
+ Raises:
1223
+ ValueError: if the attribute is invalid.
1224
+ """
1225
+ if self.volumes is None:
1226
+ return
1227
+ if self.cloud is None:
1228
+ with ux_utils.print_exception_no_traceback():
1229
+ raise ValueError('Cloud must be specified when '
1230
+ 'volumes are provided.')
1231
+ if not self.cloud.is_same_cloud(clouds.GCP()):
1232
+ with ux_utils.print_exception_no_traceback():
1233
+ raise ValueError(f'Volumes are only supported for GCP'
1234
+ f' not for {self.cloud}.')
1235
+
1236
+ need_region_or_zone = False
1237
+ try:
1238
+ for volume in self.volumes:
1239
+ if ('name' in volume and volume['storage_type']
1240
+ == resources_utils.StorageType.NETWORK):
1241
+ need_region_or_zone = True
1242
+ if 'disk_tier' not in volume:
1243
+ continue
1244
+ # TODO(hailong): check instance local SSD
1245
+ # support for instance_type.
1246
+ # Refer to https://cloud.google.com/compute/docs/disks/local-ssd#machine-series-lssd # pylint: disable=line-too-long
1247
+ self.cloud.check_disk_tier_enabled(self.instance_type,
1248
+ volume['disk_tier'])
1249
+ if (need_region_or_zone and self._region is None and
1250
+ self._zone is None):
1251
+ with ux_utils.print_exception_no_traceback():
1252
+ raise ValueError('When specifying the volume name, please'
1253
+ ' also specify the region or zone.')
1254
+ except exceptions.NotSupportedError:
1255
+ with ux_utils.print_exception_no_traceback():
1256
+ raise ValueError(
1257
+ f'Disk tier {volume["disk_tier"].value} is not '
1258
+ f'supported for instance type {self.instance_type}.'
1259
+ ) from None
1260
+
1093
1261
  def _try_validate_ports(self) -> None:
1094
1262
  """Try to validate the ports attribute.
1095
1263
 
@@ -1260,9 +1428,18 @@ class Resources:
1260
1428
  skypilot_config.get_nested(
1261
1429
  (str(self.cloud).lower(), 'specific_reservations'), set()))
1262
1430
 
1431
+ if isinstance(self.cloud, clouds.DummyCloud):
1432
+ return self.cloud.get_reservations_available_resources(
1433
+ instance_type='',
1434
+ region='',
1435
+ zone=None,
1436
+ specific_reservations=specific_reservations)
1437
+
1263
1438
  assert (self.cloud is not None and self.instance_type is not None and
1264
- self.region
1265
- is not None), ('Cloud, instance type, region must be specified')
1439
+ self.region is not None), (
1440
+ f'Cloud, instance type, region must be specified. '
1441
+ f'Resources={self}, cloud={self.cloud}, '
1442
+ f'instance_type={self.instance_type}, region={self.region}')
1266
1443
  return self.cloud.get_reservations_available_resources(
1267
1444
  self.instance_type, self.region, self.zone, specific_reservations)
1268
1445
 
@@ -1450,6 +1627,8 @@ class Resources:
1450
1627
  ports=override.pop('ports', self.ports),
1451
1628
  labels=override.pop('labels', self.labels),
1452
1629
  autostop=override.pop('autostop', current_autostop_config),
1630
+ volumes=override.pop('volumes', self.volumes),
1631
+ infra=override.pop('infra', None),
1453
1632
  _docker_login_config=override.pop('_docker_login_config',
1454
1633
  self._docker_login_config),
1455
1634
  _docker_username_for_runpod=override.pop(
@@ -1489,6 +1668,12 @@ class Resources:
1489
1668
  features.add(clouds.CloudImplementationFeatures.IMAGE_ID)
1490
1669
  if self.ports is not None:
1491
1670
  features.add(clouds.CloudImplementationFeatures.OPEN_PORTS)
1671
+ if self.volumes is not None:
1672
+ for volume in self.volumes:
1673
+ if 'disk_tier' in volume and volume[
1674
+ 'disk_tier'] != resources_utils.DiskTier.BEST:
1675
+ features.add(
1676
+ clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER)
1492
1677
  return features
1493
1678
 
1494
1679
  @staticmethod
@@ -1621,9 +1806,21 @@ class Resources:
1621
1806
  @classmethod
1622
1807
  def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
1623
1808
 
1624
- resources_fields = {}
1809
+ resources_fields: Dict[str, Any] = {}
1810
+
1811
+ # Extract infra field if present
1812
+ infra = config.pop('infra', None)
1813
+ resources_fields['infra'] = infra
1814
+
1815
+ # Keep backward compatibility with cloud, region, zone
1816
+ # Note: if both `infra` and any of `cloud`, `region`, `zone` are
1817
+ # specified, it will raise an error during the Resources.__init__
1818
+ # validation.
1625
1819
  resources_fields['cloud'] = registry.CLOUD_REGISTRY.from_str(
1626
1820
  config.pop('cloud', None))
1821
+ resources_fields['region'] = config.pop('region', None)
1822
+ resources_fields['zone'] = config.pop('zone', None)
1823
+
1627
1824
  resources_fields['instance_type'] = config.pop('instance_type', None)
1628
1825
  resources_fields['cpus'] = config.pop('cpus', None)
1629
1826
  resources_fields['memory'] = config.pop('memory', None)
@@ -1641,13 +1838,12 @@ class Resources:
1641
1838
  # exclusive by the schema validation.
1642
1839
  resources_fields['job_recovery'] = config.pop('job_recovery', None)
1643
1840
  resources_fields['disk_size'] = config.pop('disk_size', None)
1644
- resources_fields['region'] = config.pop('region', None)
1645
- resources_fields['zone'] = config.pop('zone', None)
1646
1841
  resources_fields['image_id'] = config.pop('image_id', None)
1647
1842
  resources_fields['disk_tier'] = config.pop('disk_tier', None)
1648
1843
  resources_fields['ports'] = config.pop('ports', None)
1649
1844
  resources_fields['labels'] = config.pop('labels', None)
1650
1845
  resources_fields['autostop'] = config.pop('autostop', None)
1846
+ resources_fields['volumes'] = config.pop('volumes', None)
1651
1847
  resources_fields['_docker_login_config'] = config.pop(
1652
1848
  '_docker_login_config', None)
1653
1849
  resources_fields['_docker_username_for_runpod'] = config.pop(
@@ -1679,7 +1875,10 @@ class Resources:
1679
1875
  if value is not None and value != 'None':
1680
1876
  config[key] = value
1681
1877
 
1682
- add_if_not_none('cloud', str(self.cloud))
1878
+ # Construct infra field if cloud is set
1879
+ infra = self.infra.to_str()
1880
+ add_if_not_none('infra', infra)
1881
+
1683
1882
  add_if_not_none('instance_type', self.instance_type)
1684
1883
  add_if_not_none('cpus', self._cpus)
1685
1884
  add_if_not_none('memory', self.memory)
@@ -1690,13 +1889,26 @@ class Resources:
1690
1889
  add_if_not_none('use_spot', self.use_spot)
1691
1890
  add_if_not_none('job_recovery', self.job_recovery)
1692
1891
  add_if_not_none('disk_size', self.disk_size)
1693
- add_if_not_none('region', self.region)
1694
- add_if_not_none('zone', self.zone)
1695
1892
  add_if_not_none('image_id', self.image_id)
1696
1893
  if self.disk_tier is not None:
1697
1894
  config['disk_tier'] = self.disk_tier.value
1698
1895
  add_if_not_none('ports', self.ports)
1699
1896
  add_if_not_none('labels', self.labels)
1897
+ if self.volumes is not None:
1898
+ # Convert DiskTier/StorageType enum to string value for each volume
1899
+ volumes = []
1900
+ for volume in self.volumes:
1901
+ volume_copy = volume.copy()
1902
+ if 'disk_tier' in volume_copy:
1903
+ volume_copy['disk_tier'] = volume_copy['disk_tier'].value
1904
+ if 'storage_type' in volume_copy:
1905
+ volume_copy['storage_type'] = volume_copy[
1906
+ 'storage_type'].value
1907
+ if 'attach_mode' in volume_copy:
1908
+ volume_copy['attach_mode'] = volume_copy[
1909
+ 'attach_mode'].value
1910
+ volumes.append(volume_copy)
1911
+ config['volumes'] = volumes
1700
1912
  if self._autostop_config is not None:
1701
1913
  config['autostop'] = self._autostop_config.to_yaml_config()
1702
1914
  if self._docker_login_config is not None:
@@ -1857,6 +2069,9 @@ class Resources:
1857
2069
  if version < 23:
1858
2070
  self._autostop_config = None
1859
2071
 
2072
+ if version < 24:
2073
+ self._volumes = None
2074
+
1860
2075
  self.__dict__.update(state)
1861
2076
 
1862
2077
 
sky/serve/serve_utils.py CHANGED
@@ -1027,11 +1027,9 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
1027
1027
  return 'No existing replicas.'
1028
1028
 
1029
1029
  replica_columns = [
1030
- 'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT', 'LAUNCHED', 'RESOURCES',
1031
- 'STATUS', 'REGION'
1030
+ 'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT', 'LAUNCHED', 'INFRA',
1031
+ 'RESOURCES', 'STATUS'
1032
1032
  ]
1033
- if show_all:
1034
- replica_columns.append('ZONE')
1035
1033
  replica_table = log_utils.create_table(replica_columns)
1036
1034
 
1037
1035
  truncate_hint = ''
@@ -1047,21 +1045,17 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
1047
1045
  version = (record['version'] if 'version' in record else '-')
1048
1046
  replica_endpoint = endpoint if endpoint else '-'
1049
1047
  launched_at = log_utils.readable_time_duration(record['launched_at'])
1048
+ infra = '-'
1050
1049
  resources_str = '-'
1051
1050
  replica_status = record['status']
1052
1051
  status_str = replica_status.colored_str()
1053
- region = '-'
1054
- zone = '-'
1055
1052
 
1056
1053
  replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
1057
1054
  'handle']
1058
1055
  if replica_handle is not None:
1056
+ infra = replica_handle.launched_resources.infra.formatted_str()
1059
1057
  resources_str = resources_utils.get_readable_resources_repr(
1060
1058
  replica_handle, simplify=not show_all)
1061
- if replica_handle.launched_resources.region is not None:
1062
- region = replica_handle.launched_resources.region
1063
- if replica_handle.launched_resources.zone is not None:
1064
- zone = replica_handle.launched_resources.zone
1065
1059
 
1066
1060
  replica_values = [
1067
1061
  service_name,
@@ -1069,12 +1063,10 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
1069
1063
  version,
1070
1064
  replica_endpoint,
1071
1065
  launched_at,
1066
+ infra,
1072
1067
  resources_str,
1073
1068
  status_str,
1074
- region,
1075
1069
  ]
1076
- if show_all:
1077
- replica_values.append(zone)
1078
1070
  replica_table.add_row(replica_values)
1079
1071
 
1080
1072
  return f'{replica_table}{truncate_hint}'
sky/serve/server/core.py CHANGED
@@ -141,8 +141,7 @@ def up(
141
141
  # Always apply the policy again here, even though it might have been applied
142
142
  # in the CLI. This is to ensure that we apply the policy to the final DAG
143
143
  # and get the mutated config.
144
- dag, mutated_user_config = admin_policy_utils.apply(
145
- task, use_mutated_config_in_current_request=False)
144
+ dag, mutated_user_config = admin_policy_utils.apply(task)
146
145
  task = dag.tasks[0]
147
146
 
148
147
  with rich_utils.safe_status(
@@ -352,8 +351,7 @@ def update(
352
351
  # and get the mutated config.
353
352
  # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
354
353
  # will not apply the config.
355
- dag, _ = admin_policy_utils.apply(
356
- task, use_mutated_config_in_current_request=False)
354
+ dag, _ = admin_policy_utils.apply(task)
357
355
  task = dag.tasks[0]
358
356
 
359
357
  assert task.service is not None