skypilot-nightly 1.0.0.dev20250520__py3-none-any.whl → 1.0.0.dev20250522__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +56 -37
- sky/check.py +3 -3
- sky/cli.py +89 -16
- sky/client/cli.py +89 -16
- sky/client/sdk.py +92 -4
- sky/clouds/__init__.py +2 -0
- sky/clouds/cloud.py +6 -0
- sky/clouds/gcp.py +156 -21
- sky/clouds/service_catalog/__init__.py +3 -0
- sky/clouds/service_catalog/common.py +9 -2
- sky/clouds/service_catalog/constants.py +1 -0
- sky/core.py +6 -8
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +6 -0
- sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +6 -0
- sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +1 -0
- sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +1 -0
- sky/dashboard/out/_next/static/chunks/{678-206dddca808e6d16.js → 582-683f4f27b81996dc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +1 -0
- sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/data/storage.py +1 -0
- sky/execution.py +57 -8
- sky/jobs/server/core.py +5 -3
- sky/jobs/utils.py +38 -7
- sky/optimizer.py +41 -39
- sky/provision/gcp/constants.py +147 -4
- sky/provision/gcp/instance_utils.py +10 -0
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/provisioner.py +16 -7
- sky/resources.py +233 -18
- sky/serve/serve_utils.py +5 -13
- sky/serve/server/core.py +2 -4
- sky/server/common.py +60 -14
- sky/server/constants.py +2 -0
- sky/server/html/token_page.html +154 -0
- sky/server/requests/executor.py +3 -6
- sky/server/requests/payloads.py +3 -3
- sky/server/server.py +40 -8
- sky/skypilot_config.py +117 -31
- sky/task.py +24 -1
- sky/templates/gcp-ray.yml.j2 +44 -1
- sky/templates/nebius-ray.yml.j2 +0 -2
- sky/utils/admin_policy_utils.py +26 -22
- sky/utils/cli_utils/status_utils.py +95 -56
- sky/utils/common_utils.py +35 -2
- sky/utils/context.py +36 -6
- sky/utils/context_utils.py +15 -0
- sky/utils/infra_utils.py +175 -0
- sky/utils/resources_utils.py +55 -21
- sky/utils/schemas.py +111 -5
- {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/RECORD +73 -68
- {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/WHEEL +1 -1
- sky/dashboard/out/_next/static/8hlc2dkbIDDBOkxtEW7X6/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/845-0ca6f2c1ba667c3b.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- /sky/dashboard/out/_next/static/{8hlc2dkbIDDBOkxtEW7X6 → CzOVV6JpRQBRt5GhZuhyK}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/top_level.txt +0 -0
sky/provision/provisioner.py
CHANGED
@@ -17,6 +17,7 @@ from sky import clouds
 from sky import exceptions
 from sky import provision
 from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import aws
 from sky.backends import backend_utils
 from sky.provision import common as provision_common
@@ -228,9 +229,9 @@ def _ssh_probe_command(ip: str,
                        ssh_port: int,
                        ssh_user: str,
                        ssh_private_key: str,
+                       ssh_probe_timeout: int,
                        ssh_proxy_command: Optional[str] = None) -> List[str]:
-    # NOTE: Ray uses 'uptime' command
-    # setting here.
+    # NOTE: Ray uses 'uptime' command, we use the same setting here.
     command = [
         'ssh',
         '-T',
@@ -244,7 +245,7 @@ def _ssh_probe_command(ip: str,
         '-o',
         'PasswordAuthentication=no',
         '-o',
-        'ConnectTimeout=
+        f'ConnectTimeout={ssh_probe_timeout}s',
         '-o',
         f'UserKnownHostsFile={os.devnull}',
         '-o',
@@ -277,6 +278,7 @@ def _wait_ssh_connection_direct(ip: str,
                                 ssh_port: int,
                                 ssh_user: str,
                                 ssh_private_key: str,
+                                ssh_probe_timeout: int,
                                 ssh_control_name: Optional[str] = None,
                                 ssh_proxy_command: Optional[str] = None,
                                 **kwargs) -> Tuple[bool, str]:
@@ -305,6 +307,7 @@ def _wait_ssh_connection_direct(ip: str,
         if success:
             return _wait_ssh_connection_indirect(ip, ssh_port, ssh_user,
                                                  ssh_private_key,
+                                                 ssh_probe_timeout,
                                                  ssh_control_name,
                                                  ssh_proxy_command)
     except socket.timeout:  # this is the most expected exception
@@ -312,7 +315,7 @@ def _wait_ssh_connection_direct(ip: str,
     except Exception as e:  # pylint: disable=broad-except
         stderr = f'Error: {common_utils.format_exception(e)}'
     command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
-                                 ssh_proxy_command)
+                                 ssh_probe_timeout, ssh_proxy_command)
     logger.debug(f'Waiting for SSH to {ip}. Try: '
                  f'{_shlex_join(command)}. '
                  f'{stderr}')
@@ -323,6 +326,7 @@ def _wait_ssh_connection_indirect(ip: str,
                                   ssh_port: int,
                                   ssh_user: str,
                                   ssh_private_key: str,
+                                  ssh_probe_timeout: int,
                                   ssh_control_name: Optional[str] = None,
                                   ssh_proxy_command: Optional[str] = None,
                                   **kwargs) -> Tuple[bool, str]:
@@ -333,14 +337,14 @@ def _wait_ssh_connection_indirect(ip: str,
     """
     del ssh_control_name, kwargs  # unused
     command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
-                                 ssh_proxy_command)
+                                 ssh_probe_timeout, ssh_proxy_command)
     message = f'Waiting for SSH using command: {_shlex_join(command)}'
     logger.debug(message)
     try:
         proc = subprocess.run(command,
                               shell=False,
                               check=False,
-                              timeout=
+                              timeout=ssh_probe_timeout,
                               stdout=subprocess.DEVNULL,
                               stderr=subprocess.PIPE)
         if proc.returncode != 0:
@@ -383,8 +387,13 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
     def _retry_ssh_thread(ip_ssh_port: Tuple[str, int]):
         ip, ssh_port = ip_ssh_port
         success = False
+        ssh_probe_timeout = skypilot_config.get_nested(
+            ('provision', 'ssh_timeout'), 10)
         while not success:
-            success, stderr = waiter(ip,
+            success, stderr = waiter(ip,
+                                     ssh_port,
+                                     **ssh_credentials,
+                                     ssh_probe_timeout=ssh_probe_timeout)
             if not success and time.time() - start > timeout:
                 with ux_utils.print_exception_no_traceback():
                     raise RuntimeError(
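Note on the change above: the per-attempt SSH probe timeout, previously hardcoded, is now read from the nested user config key ('provision', 'ssh_timeout') with a 10-second default and threaded through _ssh_probe_command and both waiters. Below is a minimal, hedged sketch of the resulting retry loop; the prober/get_config callables and the function name are illustrative stand-ins, not the actual SkyPilot internals.

import time
from typing import Any, Callable, Dict, Tuple


def retry_ssh_probe(prober: Callable[..., Tuple[bool, str]],
                    get_config: Callable[[Tuple[str, ...], int], int],
                    ip: str,
                    ssh_port: int,
                    ssh_credentials: Dict[str, Any],
                    timeout: float = 600.0) -> None:
    """Retries the SSH probe until it succeeds or `timeout` seconds elapse."""
    # Mirrors: skypilot_config.get_nested(('provision', 'ssh_timeout'), 10)
    ssh_probe_timeout = get_config(('provision', 'ssh_timeout'), 10)
    start = time.time()
    success = False
    while not success:
        # Each probe attempt now uses the configured per-attempt timeout.
        success, stderr = prober(ip,
                                 ssh_port,
                                 **ssh_credentials,
                                 ssh_probe_timeout=ssh_probe_timeout)
        if not success and time.time() - start > timeout:
            raise RuntimeError(f'Failed to SSH to {ip}: {stderr}')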
sky/resources.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union

 import colorama

+import sky
 from sky import check as sky_check
 from sky import clouds
 from sky import exceptions
@@ -20,6 +21,7 @@ from sky.utils import accelerator_registry
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
+from sky.utils import infra_utils
 from sky.utils import log_utils
 from sky.utils import registry
 from sky.utils import resources_utils
@@ -96,7 +98,7 @@ class Resources:
     """
     # If any fields changed, increment the version. For backward compatibility,
     # modify the __setstate__ method to handle the old version.
-    _VERSION =
+    _VERSION = 24

     def __init__(
         self,
@@ -106,6 +108,7 @@ class Resources:
         memory: Union[None, int, float, str] = None,
         accelerators: Union[None, str, Dict[str, Union[int, float]]] = None,
         accelerator_args: Optional[Dict[str, str]] = None,
+        infra: Optional[str] = None,
         use_spot: Optional[bool] = None,
         job_recovery: Optional[Union[Dict[str, Optional[Union[str, int]]],
                                      str]] = None,
@@ -117,6 +120,7 @@ class Resources:
         ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
         labels: Optional[Dict[str, str]] = None,
         autostop: Union[bool, int, Dict[str, Any], None] = None,
+        volumes: Optional[List[Dict[str, Any]]] = None,
         # Internal use only.
         # pylint: disable=invalid-name
         _docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
@@ -134,9 +138,9 @@ class Resources:
        .. code-block:: python

          # Fully specified cloud and instance type (is_launchable() is True).
-          sky.Resources(
-          sky.Resources(
-          sky.Resources(
+          sky.Resources(infra='aws', instance_type='p3.2xlarge')
+          sky.Resources(infra='k8s/my-cluster-ctx', accelerators='V100')
+          sky.Resources(infra='gcp/us-central1', accelerators='V100')

          # Specifying required resources; the system decides the
          # cloud/instance type. The below are equivalent:
@@ -145,8 +149,9 @@ class Resources:
          sky.Resources(accelerators={'V100': 1})
          sky.Resources(cpus='2+', memory='16+', accelerators='V100')

+
        Args:
-          cloud: the cloud to use.
+          cloud: the cloud to use. Deprecated. Use `infra` instead.
          instance_type: the instance type to use.
          cpus: the number of CPUs required for the task.
            If a str, must be a string of the form ``'2'`` or ``'2+'``, where
@@ -160,6 +165,11 @@ class Resources:
            dict of the form ``{'V100': 2}`` or ``{'tpu-v2-8': 1}``.
          accelerator_args: accelerator-specific arguments. For example,
            ``{'tpu_vm': True, 'runtime_version': 'tpu-vm-base'}`` for TPUs.
+          infra: a string specifying the infrastructure to use, in the format
+            of "cloud/region" or "cloud/region/zone". For example,
+            `aws/us-east-1` or `k8s/my-cluster-ctx`. This is an alternative to
+            specifying cloud, region, and zone separately. If provided, it
+            takes precedence over cloud, region, and zone parameters.
          use_spot: whether to use spot instances. If None, defaults to
            False.
          job_recovery: the job recovery strategy to use for the managed
@@ -172,8 +182,8 @@ class Resources:
            - max_restarts_on_errors: the max number of restarts on user code
              errors.

-          region: the region to use.
-          zone: the zone to use.
+          region: the region to use. Deprecated. Use `infra` instead.
+          zone: the zone to use. Deprecated. Use `infra` instead.
          image_id: the image ID to use. If a str, must be a string
            of the image id from the cloud, such as AWS:
            ``'ami-1234567890abcdef0'``, GCP:
@@ -201,6 +211,7 @@ class Resources:
            not supported and will be ignored.
          autostop: the autostop configuration to use. For launched resources,
            may or may not correspond to the actual current autostop config.
+          volumes: the volumes to mount on the instance.
          _docker_login_config: the docker configuration to use. This includes
            the docker username, password, and registry server. If None, skip
            docker login.
@@ -218,6 +229,25 @@ class Resources:
          exceptions.NoCloudAccessError: if no public cloud is enabled.
        """
        self._version = self._VERSION
+
+        if infra is not None and (cloud is not None or region is not None or
+                                  zone is not None):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify both `infra` and `cloud`, '
+                                 '`region`, or `zone` parameters. '
+                                 f'Got: infra={infra}, cloud={cloud}, '
+                                 f'region={region}, zone={zone}')
+
+        # Infra is user facing, and cloud, region, zone in parameters are for
+        # backward compatibility. Internally, we keep using cloud, region, zone
+        # for simplicity.
+        if infra is not None:
+            infra_info = infra_utils.InfraInfo.from_str(infra)
+            # Infra takes precedence over individually specified parameters
+            cloud = sky.CLOUD_REGISTRY.from_str(infra_info.cloud)
+            region = infra_info.region
+            zone = infra_info.zone
+
        self._cloud = cloud
        self._region: Optional[str] = region
        self._zone: Optional[str] = zone
@@ -309,6 +339,7 @@ class Resources:
         self._set_memory(memory)
         self._set_accelerators(accelerators, accelerator_args)
         self._set_autostop_config(autostop)
+        self._set_volumes(volumes)

     def validate(self):
         """Validate the resources and infer the missing fields if possible."""
@@ -319,6 +350,7 @@ class Resources:
         self._try_validate_managed_job_attributes()
         self._try_validate_image_id()
         self._try_validate_disk_tier()
+        self._try_validate_volumes()
         self._try_validate_ports()
         self._try_validate_labels()

@@ -431,6 +463,11 @@ class Resources:
         repr_str += f'{region_str}{zone_str}'
         return repr_str

+    @property
+    def infra(self) -> infra_utils.InfraInfo:
+        cloud = str(self.cloud) if self.cloud is not None else None
+        return infra_utils.InfraInfo(cloud, self.region, self.zone)
+
     @property
     def cloud(self) -> Optional[clouds.Cloud]:
         return self._cloud
@@ -486,9 +523,9 @@ class Resources:
     def accelerators(self) -> Optional[Dict[str, Union[int, float]]]:
         """Returns the accelerators field directly or by inferring.

-        For example, Resources(
-        set to None, but this function will infer {'V100': 1}
-        type.
+        For example, Resources(infra='aws', instance_type='p3.2xlarge') has its
+        accelerators field set to None, but this function will infer {'V100': 1}
+        from the instance type.
         """
         if self._accelerators is not None:
             return self._accelerators
@@ -533,6 +570,10 @@ class Resources:
     def labels(self) -> Optional[Dict[str, str]]:
         return self._labels

+    @property
+    def volumes(self) -> Optional[List[Dict[str, Any]]]:
+        return self._volumes
+
     @property
     def autostop_config(self) -> Optional[AutostopConfig]:
         """The requested autostop config.
@@ -726,6 +767,91 @@ class Resources:
     ) -> None:
         self._autostop_config = AutostopConfig.from_yaml_config(autostop)

+    def _set_volumes(
+        self,
+        volumes: Optional[List[Dict[str, Any]]],
+    ) -> None:
+        if not volumes:
+            self._volumes = None
+            return
+        valid_volumes = []
+        supported_tiers = [tier.value for tier in resources_utils.DiskTier]
+        supported_storage_types = [
+            storage_type.value for storage_type in resources_utils.StorageType
+        ]
+        supported_attach_modes = [
+            attach_mode.value for attach_mode in resources_utils.DiskAttachMode
+        ]
+        network_type = resources_utils.StorageType.NETWORK
+        read_write_mode = resources_utils.DiskAttachMode.READ_WRITE
+        for volume in volumes:
+            if 'path' not in volume:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(f'Invalid volume {volume!r}. '
+                                     f'Volume must have a "path" field.')
+            if 'storage_type' not in volume:
+                volume['storage_type'] = network_type
+            else:
+                if isinstance(volume['storage_type'], str):
+                    storage_type_str = str(volume['storage_type']).lower()
+                    if storage_type_str not in supported_storage_types:
+                        logger.warning(
+                            f'Invalid storage_type {storage_type_str!r}. '
+                            f'Set it to '
+                            f'{network_type.value}.')
+                        volume['storage_type'] = network_type
+                    else:
+                        volume['storage_type'] = resources_utils.StorageType(
+                            storage_type_str)
+            if 'auto_delete' not in volume:
+                volume['auto_delete'] = False
+            if 'attach_mode' in volume:
+                if isinstance(volume['attach_mode'], str):
+                    attach_mode_str = str(volume['attach_mode']).lower()
+                    if attach_mode_str not in supported_attach_modes:
+                        logger.warning(
+                            f'Invalid attach_mode {attach_mode_str!r}. '
+                            f'Set it to {read_write_mode.value}.')
+                        volume['attach_mode'] = read_write_mode
+                    else:
+                        volume['attach_mode'] = resources_utils.DiskAttachMode(
+                            attach_mode_str)
+            else:
+                volume['attach_mode'] = read_write_mode
+            if volume['storage_type'] == network_type:
+                if ('disk_size' in volume and
+                        round(volume['disk_size']) != volume['disk_size']):
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(f'Volume size must be an integer. '
+                                         f'Got: {volume["size"]}.')
+                if 'name' not in volume:
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(f'Network volume {volume["path"]} '
+                                         f'must have "name" field.')
+            elif 'name' in volume:
+                logger.info(f'Volume {volume["path"]} is a local disk. '
+                            f'The "name" field will be ignored.')
+                del volume['name']
+            if 'disk_tier' in volume:
+                if isinstance(volume['disk_tier'], str):
+                    disk_tier_str = str(volume['disk_tier']).lower()
+                    if disk_tier_str not in supported_tiers:
+                        logger.warning(
+                            f'Invalid disk_tier {disk_tier_str!r}. '
+                            f'Set it to {resources_utils.DiskTier.BEST.value}.')
+                        volume['disk_tier'] = resources_utils.DiskTier.BEST
+                    else:
+                        volume['disk_tier'] = resources_utils.DiskTier(
+                            disk_tier_str)
+            elif volume['storage_type'] == network_type:
+                logger.debug(
+                    f'No disk_tier specified for volume {volume["path"]}. '
+                    f'Set it to {resources_utils.DiskTier.BEST.value}.')
+                volume['disk_tier'] = resources_utils.DiskTier.BEST
+
+            valid_volumes.append(volume)
+        self._volumes = valid_volumes
+
     def is_launchable(self) -> bool:
         """Returns whether the resource is launchable."""
         return self.cloud is not None and self._instance_type is not None
@@ -1090,6 +1216,48 @@ class Resources:
                         f'Disk tier {self.disk_tier.value} is not supported '
                         f'for instance type {self.instance_type}.') from None

+    def _try_validate_volumes(self) -> None:
+        """Try to validate the volumes attribute.
+
+        Raises:
+            ValueError: if the attribute is invalid.
+        """
+        if self.volumes is None:
+            return
+        if self.cloud is None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cloud must be specified when '
+                                 'volumes are provided.')
+        if not self.cloud.is_same_cloud(clouds.GCP()):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'Volumes are only supported for GCP'
+                                 f' not for {self.cloud}.')
+
+        need_region_or_zone = False
+        try:
+            for volume in self.volumes:
+                if ('name' in volume and volume['storage_type']
+                        == resources_utils.StorageType.NETWORK):
+                    need_region_or_zone = True
+                if 'disk_tier' not in volume:
+                    continue
+                # TODO(hailong): check instance local SSD
+                # support for instance_type.
+                # Refer to https://cloud.google.com/compute/docs/disks/local-ssd#machine-series-lssd # pylint: disable=line-too-long
+                self.cloud.check_disk_tier_enabled(self.instance_type,
+                                                   volume['disk_tier'])
+            if (need_region_or_zone and self._region is None and
+                    self._zone is None):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError('When specifying the volume name, please'
+                                     ' also specify the region or zone.')
+        except exceptions.NotSupportedError:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    f'Disk tier {volume["disk_tier"].value} is not '
+                    f'supported for instance type {self.instance_type}.'
+                ) from None
+
     def _try_validate_ports(self) -> None:
         """Try to validate the ports attribute.

@@ -1260,9 +1428,18 @@ class Resources:
             skypilot_config.get_nested(
                 (str(self.cloud).lower(), 'specific_reservations'), set()))

+        if isinstance(self.cloud, clouds.DummyCloud):
+            return self.cloud.get_reservations_available_resources(
+                instance_type='',
+                region='',
+                zone=None,
+                specific_reservations=specific_reservations)
+
         assert (self.cloud is not None and self.instance_type is not None and
-                self.region
-
+                self.region is not None), (
+                    f'Cloud, instance type, region must be specified. '
+                    f'Resources={self}, cloud={self.cloud}, '
+                    f'instance_type={self.instance_type}, region={self.region}')
         return self.cloud.get_reservations_available_resources(
             self.instance_type, self.region, self.zone, specific_reservations)

@@ -1450,6 +1627,8 @@ class Resources:
             ports=override.pop('ports', self.ports),
             labels=override.pop('labels', self.labels),
             autostop=override.pop('autostop', current_autostop_config),
+            volumes=override.pop('volumes', self.volumes),
+            infra=override.pop('infra', None),
             _docker_login_config=override.pop('_docker_login_config',
                                               self._docker_login_config),
             _docker_username_for_runpod=override.pop(
@@ -1489,6 +1668,12 @@ class Resources:
             features.add(clouds.CloudImplementationFeatures.IMAGE_ID)
         if self.ports is not None:
             features.add(clouds.CloudImplementationFeatures.OPEN_PORTS)
+        if self.volumes is not None:
+            for volume in self.volumes:
+                if 'disk_tier' in volume and volume[
+                        'disk_tier'] != resources_utils.DiskTier.BEST:
+                    features.add(
+                        clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER)
         return features

     @staticmethod
@@ -1621,9 +1806,21 @@ class Resources:
     @classmethod
     def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':

-        resources_fields = {}
+        resources_fields: Dict[str, Any] = {}
+
+        # Extract infra field if present
+        infra = config.pop('infra', None)
+        resources_fields['infra'] = infra
+
+        # Keep backward compatibility with cloud, region, zone
+        # Note: if both `infra` and any of `cloud`, `region`, `zone` are
+        # specified, it will raise an error during the Resources.__init__
+        # validation.
         resources_fields['cloud'] = registry.CLOUD_REGISTRY.from_str(
             config.pop('cloud', None))
+        resources_fields['region'] = config.pop('region', None)
+        resources_fields['zone'] = config.pop('zone', None)
+
         resources_fields['instance_type'] = config.pop('instance_type', None)
         resources_fields['cpus'] = config.pop('cpus', None)
         resources_fields['memory'] = config.pop('memory', None)
@@ -1641,13 +1838,12 @@ class Resources:
         # exclusive by the schema validation.
         resources_fields['job_recovery'] = config.pop('job_recovery', None)
         resources_fields['disk_size'] = config.pop('disk_size', None)
-        resources_fields['region'] = config.pop('region', None)
-        resources_fields['zone'] = config.pop('zone', None)
         resources_fields['image_id'] = config.pop('image_id', None)
         resources_fields['disk_tier'] = config.pop('disk_tier', None)
         resources_fields['ports'] = config.pop('ports', None)
         resources_fields['labels'] = config.pop('labels', None)
         resources_fields['autostop'] = config.pop('autostop', None)
+        resources_fields['volumes'] = config.pop('volumes', None)
         resources_fields['_docker_login_config'] = config.pop(
             '_docker_login_config', None)
         resources_fields['_docker_username_for_runpod'] = config.pop(
@@ -1679,7 +1875,10 @@ class Resources:
             if value is not None and value != 'None':
                 config[key] = value

-
+        # Construct infra field if cloud is set
+        infra = self.infra.to_str()
+        add_if_not_none('infra', infra)
+
         add_if_not_none('instance_type', self.instance_type)
         add_if_not_none('cpus', self._cpus)
         add_if_not_none('memory', self.memory)
@@ -1690,13 +1889,26 @@ class Resources:
         add_if_not_none('use_spot', self.use_spot)
         add_if_not_none('job_recovery', self.job_recovery)
         add_if_not_none('disk_size', self.disk_size)
-        add_if_not_none('region', self.region)
-        add_if_not_none('zone', self.zone)
         add_if_not_none('image_id', self.image_id)
         if self.disk_tier is not None:
             config['disk_tier'] = self.disk_tier.value
         add_if_not_none('ports', self.ports)
         add_if_not_none('labels', self.labels)
+        if self.volumes is not None:
+            # Convert DiskTier/StorageType enum to string value for each volume
+            volumes = []
+            for volume in self.volumes:
+                volume_copy = volume.copy()
+                if 'disk_tier' in volume_copy:
+                    volume_copy['disk_tier'] = volume_copy['disk_tier'].value
+                if 'storage_type' in volume_copy:
+                    volume_copy['storage_type'] = volume_copy[
+                        'storage_type'].value
+                if 'attach_mode' in volume_copy:
+                    volume_copy['attach_mode'] = volume_copy[
+                        'attach_mode'].value
+                volumes.append(volume_copy)
+            config['volumes'] = volumes
         if self._autostop_config is not None:
             config['autostop'] = self._autostop_config.to_yaml_config()
         if self._docker_login_config is not None:
@@ -1857,6 +2069,9 @@ class Resources:
         if version < 23:
             self._autostop_config = None

+        if version < 24:
+            self._volumes = None
+
         self.__dict__.update(state)

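Note on the change above: Resources gains an `infra` argument ("cloud", "cloud/region", or "cloud/region/zone", taking precedence over the now-deprecated cloud/region/zone arguments) and a `volumes` argument that is currently validated for GCP only. The snippet below is an illustrative construction under the new API; the infra strings come from the updated docstring, while the lowercase string values for storage_type/attach_mode ('network', 'read_write') and the volume name are assumptions about the accepted spellings.

import sky

# `infra` replaces separate cloud/region/zone arguments.
r1 = sky.Resources(infra='aws', instance_type='p3.2xlarge')
r2 = sky.Resources(infra='gcp/us-central1', accelerators='V100')

# Volumes: every entry needs a 'path'; a network volume also needs a 'name',
# and when a name is given, a region or zone must be set as well.
r3 = sky.Resources(
    infra='gcp/us-central1',
    accelerators='V100',
    volumes=[{
        'path': '/mnt/data',
        'name': 'my-pd-volume',        # hypothetical volume name
        'storage_type': 'network',
        'disk_tier': 'best',
        'attach_mode': 'read_write',
        'auto_delete': False,
    }],
)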
sky/serve/serve_utils.py
CHANGED
@@ -1027,11 +1027,9 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
         return 'No existing replicas.'

     replica_columns = [
-        'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT', 'LAUNCHED', '
-        '
+        'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT', 'LAUNCHED', 'INFRA',
+        'RESOURCES', 'STATUS'
     ]
-    if show_all:
-        replica_columns.append('ZONE')
     replica_table = log_utils.create_table(replica_columns)

     truncate_hint = ''
@@ -1047,21 +1045,17 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
         version = (record['version'] if 'version' in record else '-')
         replica_endpoint = endpoint if endpoint else '-'
         launched_at = log_utils.readable_time_duration(record['launched_at'])
+        infra = '-'
         resources_str = '-'
         replica_status = record['status']
         status_str = replica_status.colored_str()
-        region = '-'
-        zone = '-'

         replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
             'handle']
         if replica_handle is not None:
+            infra = replica_handle.launched_resources.infra.formatted_str()
             resources_str = resources_utils.get_readable_resources_repr(
                 replica_handle, simplify=not show_all)
-            if replica_handle.launched_resources.region is not None:
-                region = replica_handle.launched_resources.region
-            if replica_handle.launched_resources.zone is not None:
-                zone = replica_handle.launched_resources.zone

         replica_values = [
             service_name,
@@ -1069,12 +1063,10 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
             version,
             replica_endpoint,
             launched_at,
+            infra,
             resources_str,
             status_str,
-            region,
         ]
-        if show_all:
-            replica_values.append(zone)
         replica_table.add_row(replica_values)

     return f'{replica_table}{truncate_hint}'
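Note on the change above: the replica table drops the REGION column (and the ZONE column previously added only when show_all is set) in favor of a single INFRA column built from launched_resources.infra.formatted_str(). Below is a hedged sketch of the idea, combining cloud/region/zone into one value using the 'cloud/region/zone' format documented for `infra` in sky/resources.py; the real InfraInfo.formatted_str() may render it differently.

from typing import Optional


def format_infra(cloud: Optional[str], region: Optional[str],
                 zone: Optional[str]) -> str:
    # Join whichever parts are known; fall back to '-' like the table does.
    parts = [p for p in (cloud, region, zone) if p is not None]
    return '/'.join(parts) if parts else '-'


assert format_infra('gcp', 'us-central1', None) == 'gcp/us-central1'
assert format_infra(None, None, None) == '-'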
sky/serve/server/core.py
CHANGED
@@ -141,8 +141,7 @@ def up(
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
-    dag, mutated_user_config = admin_policy_utils.apply(
-        task, use_mutated_config_in_current_request=False)
+    dag, mutated_user_config = admin_policy_utils.apply(task)
     task = dag.tasks[0]

     with rich_utils.safe_status(
@@ -352,8 +351,7 @@ def update(
     # and get the mutated config.
     # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
     # will not apply the config.
-    dag, _ = admin_policy_utils.apply(
-        task, use_mutated_config_in_current_request=False)
+    dag, _ = admin_policy_utils.apply(task)
     task = dag.tasks[0]

     assert task.service is not None