skypilot-nightly 1.0.0.dev20250513__py3-none-any.whl → 1.0.0.dev20250515__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend.py +3 -2
- sky/backends/backend_utils.py +16 -17
- sky/backends/cloud_vm_ray_backend.py +47 -16
- sky/clouds/aws.py +11 -9
- sky/clouds/azure.py +16 -13
- sky/clouds/cloud.py +4 -3
- sky/clouds/cudo.py +3 -2
- sky/clouds/do.py +3 -2
- sky/clouds/fluidstack.py +3 -3
- sky/clouds/gcp.py +25 -9
- sky/clouds/ibm.py +12 -10
- sky/clouds/kubernetes.py +3 -2
- sky/clouds/lambda_cloud.py +6 -6
- sky/clouds/nebius.py +6 -5
- sky/clouds/oci.py +9 -7
- sky/clouds/paperspace.py +3 -2
- sky/clouds/runpod.py +9 -9
- sky/clouds/scp.py +5 -3
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +33 -11
- sky/clouds/service_catalog/gcp_catalog.py +7 -1
- sky/clouds/vast.py +8 -7
- sky/clouds/vsphere.py +4 -2
- sky/core.py +18 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +1 -0
- sky/dashboard/out/_next/static/{2dkponv64SfFShA8Rnw0D → jFI0Y-uJZ_XDK5IGJpKFU}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/execution.py +33 -0
- sky/jobs/recovery_strategy.py +4 -1
- sky/jobs/server/core.py +6 -12
- sky/optimizer.py +19 -13
- sky/provision/kubernetes/utils.py +26 -1
- sky/resources.py +206 -43
- sky/serve/server/core.py +0 -5
- sky/serve/spot_placer.py +3 -0
- sky/server/server.py +51 -13
- sky/skylet/log_lib.py +12 -3
- sky/skylet/log_lib.pyi +5 -0
- sky/task.py +8 -6
- sky/templates/nebius-ray.yml.j2 +3 -1
- sky/utils/cli_utils/status_utils.py +6 -5
- sky/utils/controller_utils.py +39 -43
- sky/utils/dag_utils.py +4 -2
- sky/utils/resources_utils.py +3 -0
- sky/utils/schemas.py +33 -24
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/RECORD +58 -58
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/WHEEL +1 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- /sky/dashboard/out/_next/static/{2dkponv64SfFShA8Rnw0D → jFI0Y-uJZ_XDK5IGJpKFU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250513.dist-info → skypilot_nightly-1.0.0.dev20250515.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'c7068a2c749d03a06ecc8940d34a4911ac9391bc'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250513'
+__version__ = '1.0.0.dev20250515'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
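Both hunks are routine release stamping: the commit placeholder and the version string are rewritten when the wheel is built. A minimal sketch of that pattern, with a hypothetical placeholder token and helper (the real `_get_git_commit()` is not shown in this diff, so the git fallback below is an assumption):

```python
import subprocess

_COMMIT_SHA = '{{SKYPILOT_COMMIT_SHA}}'  # rewritten at wheel-build time


def get_commit() -> str:
    # Stamped build: the placeholder was replaced with a real SHA.
    if '{{' not in _COMMIT_SHA:
        return _COMMIT_SHA
    # Source checkout: fall back to asking git directly.
    try:
        return subprocess.check_output(['git', 'rev-parse', 'HEAD'],
                                       stderr=subprocess.DEVNULL,
                                       text=True).strip()
    except (subprocess.CalledProcessError, FileNotFoundError):
        return 'unknown'
```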
sky/backends/backend.py
CHANGED
@@ -37,8 +37,9 @@ class Backend(Generic[_ResourceHandleType]):
     ResourceHandle = ResourceHandle  # pylint: disable=invalid-name
 
     # --- APIs ---
-    def check_resources_fit_cluster(
-
+    def check_resources_fit_cluster(
+            self, handle: _ResourceHandleType,
+            task: 'task_lib.Task') -> Optional['resources.Resources']:
         """Check whether resources of the task are satisfied by cluster."""
         raise NotImplementedError
 
sky/backends/backend_utils.py
CHANGED
@@ -606,7 +606,7 @@ def write_cluster_config(
     # other cases, we exclude the cloud from credential file uploads after
     # running required checks.
     assert cluster_name is not None
-    excluded_clouds = set()
+    excluded_clouds: Set[clouds.Cloud] = set()
     remote_identity_config = skypilot_config.get_nested(
         (str(cloud).lower(), 'remote_identity'), None)
     remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
@@ -1557,7 +1557,8 @@ def check_owner_identity(cluster_name: str) -> None:
     if not isinstance(handle, backends.CloudVmRayResourceHandle):
         return
 
-
+    launched_resources = handle.launched_resources.assert_launchable()
+    cloud = launched_resources.cloud
     user_identities = cloud.get_user_identities()
     owner_identity = record['owner']
     if user_identities is None:
@@ -1721,12 +1722,12 @@ def check_can_clone_disk_and_override_task(
             'a new target cluster name.')
 
     new_task_resources = []
-
+    launched_resources = handle.launched_resources.assert_launchable()
+    original_cloud = launched_resources.cloud
     original_cloud.check_features_are_supported(
-
+        launched_resources,
         {clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER})
 
-    assert original_cloud is not None, handle.launched_resources
     has_override = False
     has_disk_size_met = False
     has_cloud_met = False
@@ -1740,7 +1741,7 @@ def check_can_clone_disk_and_override_task(
             continue
         has_cloud_met = True
 
-        override_param = {}
+        override_param: Dict[str, Any] = {}
        if task_resources.cloud is None:
            override_param['cloud'] = original_cloud
        if task_resources.region is None:
@@ -1934,8 +1935,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         return global_user_state.get_cluster_from_name(cluster_name)
 
     # All cases below are transitioning the cluster to non-UP states.
-
-    if (not node_statuses and
+    launched_resources = handle.launched_resources.assert_launchable()
+    if (not node_statuses and launched_resources.cloud.STATUS_VERSION >=
             clouds.StatusVersion.SKYPILOT):
         # Note: launched_at is set during sky launch, even on an existing
         # cluster. This will catch the case where the cluster was terminated on
@@ -2470,7 +2471,7 @@ def is_controller_accessible(
             need_connection_check):
         # Check ssh connection if (1) controller is in INIT state, or (2) we failed to fetch the
         # status, both of which can happen when controller's status lock is held by another `sky jobs launch` or
-        # `sky serve up`. If we have
+        # `sky serve up`. If we have controller's head_ip available and it is ssh-reachable,
         # we can allow access to the controller.
         ssh_credentials = ssh_credential_from_yaml(handle.cluster_yaml,
                                                    handle.docker_user,
@@ -2968,7 +2969,7 @@ def get_endpoints(cluster: str,
                          f'for cluster {cluster!r} with backend '
                          f'{get_backend_from_handle(handle).NAME}.')
 
-    launched_resources = handle.launched_resources
+    launched_resources = handle.launched_resources.assert_launchable()
     cloud = launched_resources.cloud
     try:
         cloud.check_features_are_supported(
@@ -2985,11 +2986,11 @@ def get_endpoints(cluster: str,
         head_ip=handle.head_ip,
         provider_config=config['provider'])
 
+    launched_resources = handle.launched_resources.assert_launchable()
     # Validation before returning the endpoints
     if port is not None:
         # If the requested endpoint was not to be exposed
-        port_set = resources_utils.port_ranges_to_set(
-            handle.launched_resources.ports)
+        port_set = resources_utils.port_ranges_to_set(launched_resources.ports)
         if port not in port_set:
             logger.warning(f'Port {port} is not exposed on '
                            f'cluster {cluster!r}.')
@@ -2998,8 +2999,7 @@ def get_endpoints(cluster: str,
         if port not in port_details:
             error_msg = (f'Port {port} not exposed yet. '
                          f'{_ENDPOINTS_RETRY_MESSAGE} ')
-            if
-                    clouds.Kubernetes()):
+            if launched_resources.cloud.is_same_cloud(clouds.Kubernetes()):
                 # Add Kubernetes specific debugging info
                 error_msg += (kubernetes_utils.get_endpoint_debug_message())
             logger.warning(error_msg)
@@ -3008,7 +3008,7 @@ def get_endpoints(cluster: str,
     else:
         if not port_details:
             # If cluster had no ports to be exposed
-            if
+            if launched_resources.ports is None:
                 logger.warning(f'Cluster {cluster!r} does not have any '
                                'ports to be exposed.')
                 return {}
@@ -3017,8 +3017,7 @@ def get_endpoints(cluster: str,
         else:
             error_msg = (f'No endpoints exposed yet. '
                          f'{_ENDPOINTS_RETRY_MESSAGE} ')
-            if
-                    clouds.Kubernetes()):
+            if launched_resources.cloud.is_same_cloud(clouds.Kubernetes()):
                 # Add Kubernetes specific debugging info
                 error_msg += \
                     kubernetes_utils.get_endpoint_debug_message()
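The dominant pattern in this file, and in most hunks below, is swapping bare `handle.launched_resources` access for `handle.launched_resources.assert_launchable()`: one call replaces scattered `assert x is not None` checks and also narrows the static type. A minimal sketch of the idea, assuming a `LaunchableResources` subtype in the spirit of this release's `sky/resources.py` rework (+206 -43); the classes below are illustrative, not SkyPilot's exact code:

```python
import typing
from typing import Optional


class Resources:
    """Toy stand-in: fields stay Optional until the optimizer fills them."""

    def __init__(self,
                 cloud: Optional[str] = None,
                 instance_type: Optional[str] = None) -> None:
        self.cloud = cloud
        self.instance_type = instance_type

    def assert_launchable(self) -> 'LaunchableResources':
        # One runtime check instead of an assert at every use site; the
        # returned type tells the checker the fields are set from here on.
        assert self.cloud is not None, self
        assert self.instance_type is not None, self
        return typing.cast('LaunchableResources', self)


class LaunchableResources(Resources):
    """Same object at runtime; only the declared field types are narrowed."""
    cloud: str
    instance_type: str
```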
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -303,8 +303,6 @@ class RayCodeGen:
             from sky.skylet import autostop_lib
             from sky.skylet import constants
             from sky.skylet import job_lib
-            from sky.utils import context
-            from sky.utils import context_utils
             from sky.utils import log_utils
             from sky.utils import subprocess_utils
 
@@ -367,6 +365,7 @@ class RayCodeGen:
             # by ray.remote. This should be removed once we have a better way to
             # specify dependencies for ray.
             inspect.getsource(log_lib._ProcessingArgs),  # pylint: disable=protected-access
+            inspect.getsource(log_lib._get_context),  # pylint: disable=protected-access
             inspect.getsource(log_lib._handle_io_stream),  # pylint: disable=protected-access
             inspect.getsource(log_lib.process_subprocess_stream),
             inspect.getsource(log_lib.run_with_log),
@@ -1359,6 +1358,8 @@ class RetryingVmProvisioner(object):
         # Get previous cluster status
         cluster_exists = prev_cluster_status is not None
 
+        to_provision = to_provision.assert_launchable()
+
         assert to_provision.region is not None, (
             to_provision, 'region should have been set by the optimizer.')
         region = clouds.Region(to_provision.region)
@@ -2034,6 +2035,7 @@ class RetryingVmProvisioner(object):
                     f' that never expire or a service account.\033[0m')
                 logger.warning(warnings)
 
+        to_provision = to_provision.assert_launchable()
         # Retrying launchable resources.
         while True:
             try:
@@ -2163,9 +2165,10 @@ class RetryingVmProvisioner(object):
                 raise exceptions.ResourcesUnavailableError(
                     _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
                     failover_history=failover_history)
-
+            best_resources = task.best_resources
             assert task in self._dag.tasks, 'Internal logic error.'
-            assert
+            assert best_resources is not None, task
+            to_provision = best_resources
         return config_dict
 
 
@@ -2431,19 +2434,21 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             self.cluster_yaml, self.docker_user, self.ssh_user)
         if avoid_ssh_control:
             ssh_credentials.pop('ssh_control_name', None)
+
+        launched_resources = self.launched_resources.assert_launchable()
         updated_to_skypilot_provisioner_after_provisioned = (
-
+            launched_resources.cloud.PROVISIONER_VERSION >=
             clouds.ProvisionerVersion.SKYPILOT and
             self.cached_external_ips is not None and
             self.cached_cluster_info is None)
         if updated_to_skypilot_provisioner_after_provisioned:
             logger.debug(
-                f'{
+                f'{launched_resources.cloud} has been updated to the new '
                 f'provisioner after cluster {self.cluster_name} was '
                 f'provisioned. Cached IPs are used for connecting to the '
                 'cluster.')
         if (clouds.ProvisionerVersion.RAY_PROVISIONER_SKYPILOT_TERMINATOR >=
-
+                launched_resources.cloud.PROVISIONER_VERSION or
                 updated_to_skypilot_provisioner_after_provisioned):
             ip_list = (self.cached_external_ips
                        if force_cached else self.external_ips())
@@ -3147,8 +3152,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             resources_utils.port_ranges_to_set(current_ports) -
             resources_utils.port_ranges_to_set(prev_ports))
         if open_new_ports:
-
-            if not (cloud.OPEN_PORTS_VERSION <=
+            launched_resources = handle.launched_resources.assert_launchable()
+            if not (launched_resources.cloud.OPEN_PORTS_VERSION <=
                     clouds.OpenPortsVersion.LAUNCH_ONLY):
                 with rich_utils.safe_status(
                         ux_utils.spinner_message(
@@ -3252,9 +3257,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         TODO: Delete COPY storage_mounts in task.sync_storage_mounts(), and
         assert here that all storage_mounts are MOUNT mode.
         """
+        launched_resources = handle.launched_resources.assert_launchable()
         with rich_utils.safe_status(ux_utils.spinner_message('Syncing files')):
             controller_utils.replace_skypilot_config_path_in_file_mounts(
-
+                launched_resources.cloud, all_file_mounts)
             self._execute_file_mounts(handle, all_file_mounts)
             self._execute_storage_mounts(handle, storage_mounts)
             self._set_storage_mounts_metadata(handle.cluster_name,
@@ -4154,7 +4160,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         log_path = os.path.join(os.path.expanduser(self.log_dir),
                                 'teardown.log')
         log_abs_path = os.path.abspath(log_path)
-
+        launched_resources = handle.launched_resources.assert_launchable()
+        cloud = launched_resources.cloud
         config = common_utils.read_yaml(handle.cluster_yaml)
         cluster_name = handle.cluster_name
         cluster_name_on_cloud = handle.cluster_name_on_cloud
@@ -4366,10 +4373,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # the right resource to provision the cluster.
         if handle.cluster_yaml is not None:
             try:
-
+                launched_resources = (
+                    handle.launched_resources.assert_launchable())
+                cloud = launched_resources.cloud
                 config = common_utils.read_yaml(handle.cluster_yaml)
                 cloud.check_features_are_supported(
-
+                    launched_resources,
                     {clouds.CloudImplementationFeatures.OPEN_PORTS})
                 provision_lib.cleanup_ports(repr(cloud),
                                             cluster_name_on_cloud,
@@ -4495,6 +4504,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         # down = False is the default, but warn the user in case
         # they have explicitly specified it.
+        # TODO(cooperc): Fix for new autostop stuff.
         config_override_down = skypilot_config.get_nested(
             (controller.value.controller_type, 'controller',
              'autostop', 'down'), None)
@@ -4664,6 +4674,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         handle_before_refresh = record['handle']
         status_before_refresh = record['status']
 
+        handle: Optional[CloudVmRayResourceHandle]
         prev_cluster_status, handle = (status_before_refresh,
                                        handle_before_refresh)
 
@@ -4684,7 +4695,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             prev_cluster_status = None
             handle = None
         # We should check the cluster_ever_up after refresh, because if the
-        # cluster is terminated (through console or auto-
+        # cluster is terminated (through console or auto-down), the record will
         # become None and the cluster_ever_up should be considered as False.
         cluster_ever_up = record is not None and record['cluster_ever_up']
         prev_config_hash = record['config_hash'] if record is not None else None
@@ -4697,16 +4708,21 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self.check_resources_fit_cluster(handle, task)
             # Use the existing cluster.
             assert handle.launched_resources is not None, (cluster_name, handle)
+            # Take a random resource in order to get resource info that applies
+            # to all resources.
+            one_task_resource = list(task.resources)[0]
             # Assume resources share the same ports.
             for resource in task.resources:
-                assert resource.ports ==
+                assert resource.ports == one_task_resource.ports
             requested_ports_set = resources_utils.port_ranges_to_set(
-
+                one_task_resource.ports)
             current_ports_set = resources_utils.port_ranges_to_set(
                 handle.launched_resources.ports)
             all_ports = resources_utils.port_set_to_ranges(current_ports_set |
                                                            requested_ports_set)
             to_provision = handle.launched_resources
+            assert to_provision is not None
+            to_provision = to_provision.assert_launchable()
             if (to_provision.cloud.OPEN_PORTS_VERSION <=
                     clouds.OpenPortsVersion.LAUNCH_ONLY):
                 if not requested_ports_set <= current_ports_set:
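The hunk above merges the ports already open on the cluster with those the new task requests, then checks whether a `LAUNCH_ONLY` cloud can satisfy the request without reprovisioning. A self-contained sketch of that set arithmetic, using a simplified stand-in for the `resources_utils` helpers (real signatures may differ):

```python
from typing import List, Optional, Set


def port_ranges_to_set(ports: Optional[List[str]]) -> Set[int]:
    """Expand entries like '8080' or '9000-9002' into a set of ints."""
    result: Set[int] = set()
    for entry in ports or []:
        if '-' in entry:
            lo, hi = entry.split('-')
            result.update(range(int(lo), int(hi) + 1))
        else:
            result.add(int(entry))
    return result


requested = port_ranges_to_set(['8080', '9000-9002'])
current = port_ranges_to_set(['8080'])
# The cluster must end up with the union of old and new ports open.
all_ports = sorted(current | requested)
print(all_ports)  # [8080, 9000, 9001, 9002]
# LAUNCH_ONLY clouds cannot open ports after launch, hence the check that
# requested <= current before reusing the existing cluster.
print(requested <= current)  # False -> would need ports opened at launch
```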
@@ -4720,6 +4736,21 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         'a new cluster with the desired ports open.')
                 if all_ports:
                     to_provision = to_provision.copy(ports=all_ports)
+            # Docker login should always be the same for all resources, since
+            # it's set from envs.
+            for resource in task.resources:
+                assert (resource.docker_login_config ==
+                        one_task_resource.docker_login_config), (
+                            resource.docker_login_config,
+                            one_task_resource.docker_login_config)
+            # If we have docker login config in the new task, override the
+            # existing resources to pick up new credentials. This allows the
+            # user to specify new or fixed credentials if the existing
+            # credentials are not working. If we don't do this, the credentials
+            # from the existing resources will always be reused.
+            if one_task_resource.docker_login_config is not None:
+                to_provision = to_provision.copy(
+                    _docker_login_config=one_task_resource.docker_login_config)
             return RetryingVmProvisioner.ToProvisionConfig(
                 cluster_name,
                 to_provision,
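The new docker-login block relies on `Resources.copy()` returning a fresh object with the given fields overridden. A toy illustration of that copy-with-overrides idiom (field names here are stand-ins, not SkyPilot's exact API):

```python
import dataclasses
from typing import Dict, Optional


@dataclasses.dataclass(frozen=True)
class ToyResources:
    ports: Optional[str] = None
    docker_login_config: Optional[Dict[str, str]] = None

    def copy(self, **overrides) -> 'ToyResources':
        # dataclasses.replace builds a new frozen instance with overrides.
        return dataclasses.replace(self, **overrides)


existing = ToyResources(ports='8080')
new_creds = {'username': 'me', 'password': '***', 'server': 'ghcr.io'}
# The new task supplies credentials: override so the relaunch picks them up
# instead of silently reusing the stale ones cached on the handle.
to_provision = existing.copy(docker_login_config=new_creds)
assert to_provision.docker_login_config == new_creds
assert existing.docker_login_config is None  # original is untouched
```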
sky/clouds/aws.py
CHANGED
@@ -440,18 +440,19 @@ class AWS(clouds.Cloud):
         region_name = region.name
         zone_names = [zone.name for zone in zones]
 
-
-        #
-        acc_dict = self.get_accelerators_from_instance_type(
+        resources = resources.assert_launchable()
+        # resources.accelerators is cleared but .instance_type encodes the info.
+        acc_dict = self.get_accelerators_from_instance_type(
+            resources.instance_type)
         custom_resources = resources_utils.make_ray_custom_resources_str(
             acc_dict)
 
-        if
+        if resources.extract_docker_image() is not None:
             image_id_to_use = None
         else:
-            image_id_to_use =
+            image_id_to_use = resources.image_id
         image_id = self._get_image_id(image_id_to_use, region_name,
-
+                                      resources.instance_type)
 
         disk_encrypted = skypilot_config.get_nested(('aws', 'disk_encrypted'),
                                                     False)
@@ -483,17 +484,17 @@ class AWS(clouds.Cloud):
             'in `~/.sky/config.yaml`.')
 
         return {
-            'instance_type':
+            'instance_type': resources.instance_type,
             'custom_resources': custom_resources,
             'disk_encrypted': disk_encrypted,
-            'use_spot':
+            'use_spot': resources.use_spot,
             'region': region_name,
             'zones': ','.join(zone_names),
             'image_id': image_id,
             'security_group': security_group,
             'security_group_managed_by_skypilot':
                 str(security_group != user_security_group).lower(),
-            **AWS._get_disk_specs(
+            **AWS._get_disk_specs(resources.disk_tier)
         }
 
     def _get_feasible_launchable_resources(
@@ -971,6 +972,7 @@ class AWS(clouds.Cloud):
             botocore.exceptions.ClientError: error in Boto3 client request.
         """
 
+        resources = resources.assert_launchable()
         instance_type = resources.instance_type
         region = resources.region
         use_spot = resources.use_spot
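These provider hunks all re-derive accelerator info from the instance type (the comment notes `resources.accelerators` is cleared once the instance type encodes it) and serialize it into a Ray custom-resources string. A toy version under an assumed one-entry catalog; the compact-JSON encoding matches what the `do.py` hunk below inlines:

```python
import json
from typing import Dict, Optional

# Assumed catalog entry, for illustration only.
_TOY_CATALOG: Dict[str, Dict[str, int]] = {
    'p4d.24xlarge': {'A100': 8},
}


def get_accelerators_from_instance_type(
        instance_type: str) -> Optional[Dict[str, int]]:
    return _TOY_CATALOG.get(instance_type)


def make_ray_custom_resources_str(
        acc_dict: Optional[Dict[str, int]]) -> Optional[str]:
    if acc_dict is None:
        return None
    return json.dumps(acc_dict, separators=(',', ':'))


print(make_ray_custom_resources_str(
    get_accelerators_from_instance_type('p4d.24xlarge')))  # {"A100":8}
```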
sky/clouds/azure.py
CHANGED
@@ -325,9 +325,10 @@ class Azure(clouds.Cloud):
 
         region_name = region.name
 
-
-        #
-        acc_dict = self.get_accelerators_from_instance_type(
+        resources = resources.assert_launchable()
+        # resources.accelerators is cleared but .instance_type encodes the info.
+        acc_dict = self.get_accelerators_from_instance_type(
+            resources.instance_type)
         acc_count = None
         if acc_dict is not None:
             acc_count = str(sum(acc_dict.values()))
@@ -339,8 +340,9 @@ class Azure(clouds.Cloud):
             # pylint: disable=import-outside-toplevel
             from sky.clouds.service_catalog import azure_catalog
             gen_version = azure_catalog.get_gen_version_from_instance_type(
-
-            image_id = self._get_default_image_tag(gen_version,
+                resources.instance_type)
+            image_id = self._get_default_image_tag(gen_version,
+                                                   resources.instance_type)
         else:
             if None in resources.image_id:
                 image_id = resources.image_id[None]
@@ -407,18 +409,19 @@ class Azure(clouds.Cloud):
             """).split('\n')
 
         def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
-            if (
-
-                return
+            if (resources.disk_tier is not None and
+                    resources.disk_tier != resources_utils.DiskTier.BEST):
+                return resources.disk_tier
             # Failover disk tier from high to low. Default disk tier
             # (Premium_LRS, medium) only support s-series instance types,
             # so we failover to lower tiers for non-s-series.
             all_tiers = list(reversed(resources_utils.DiskTier))
             start_index = all_tiers.index(
-                Azure._translate_disk_tier(
+                Azure._translate_disk_tier(resources.disk_tier))
             while start_index < len(all_tiers):
                 disk_tier = all_tiers[start_index]
-                ok, _ = Azure.check_disk_tier(
+                ok, _ = Azure.check_disk_tier(resources.instance_type,
+                                              disk_tier)
                 if ok:
                     return disk_tier
                 start_index += 1
@@ -426,11 +429,11 @@ class Azure(clouds.Cloud):
 
         disk_tier = _failover_disk_tier()
 
-        resources_vars = {
-            'instance_type':
+        resources_vars: Dict[str, Any] = {
+            'instance_type': resources.instance_type,
             'custom_resources': custom_resources,
             'num_gpus': acc_count,
-            'use_spot':
+            'use_spot': resources.use_spot,
             'region': region_name,
             # Azure does not support specific zones.
             'zones': None,
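`_failover_disk_tier` walks the tiers from the requested one downward until it finds one the instance type supports. A condensed, runnable sketch of that loop; `check_disk_tier` below is a stand-in (the real one returns an `(ok, reason)` tuple and consults instance-type capabilities):

```python
import enum
from typing import Optional


class DiskTier(enum.Enum):
    LOW = 0
    MEDIUM = 1
    HIGH = 2
    ULTRA = 3


def check_disk_tier(instance_type: str, tier: DiskTier) -> bool:
    # Assumed rule for illustration: this instance supports MEDIUM and below.
    del instance_type
    return tier.value <= DiskTier.MEDIUM.value


def failover_disk_tier(instance_type: str,
                       requested: DiskTier) -> Optional[DiskTier]:
    all_tiers = list(reversed(DiskTier))  # ULTRA -> LOW
    start_index = all_tiers.index(requested)
    for tier in all_tiers[start_index:]:
        if check_disk_tier(instance_type, tier):
            return tier  # first supported tier at or below the request
    return None


print(failover_disk_tier('Standard_D4_v5', DiskTier.ULTRA))  # DiskTier.MEDIUM
```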
sky/clouds/cloud.py
CHANGED
@@ -11,7 +11,8 @@ import collections
 import enum
 import math
 import typing
-from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple,
+from typing import (Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple,
+                    Union)
 
 from typing_extensions import assert_never
 
@@ -302,7 +303,7 @@ class Cloud:
         zones: Optional[List['Zone']],
         num_nodes: int,
         dryrun: bool = False,
-    ) -> Dict[str,
+    ) -> Dict[str, Any]:
         """Converts planned sky.Resources to cloud-specific resource variables.
 
         These variables are used to fill the node type section (instance type,
@@ -721,7 +722,7 @@ class Cloud:
         Raises:
             ResourcesMismatchError: If the accelerator is not supported.
         """
-
+        resources = resources.assert_launchable()
 
         def _equal_accelerators(
             acc_requested: Optional[Dict[str, Union[int, float]]],
sky/clouds/cudo.py
CHANGED
@@ -201,8 +201,9 @@ class Cudo(clouds.Cloud):
         dryrun: bool = False,
     ) -> Dict[str, Optional[str]]:
         del zones, cluster_name  # unused
-
-        acc_dict = self.get_accelerators_from_instance_type(
+        resources = resources.assert_launchable()
+        acc_dict = self.get_accelerators_from_instance_type(
+            resources.instance_type)
         custom_resources = resources_utils.make_ray_custom_resources_str(
             acc_dict)
 
sky/clouds/do.py
CHANGED
@@ -181,8 +181,9 @@ class DO(clouds.Cloud):
         dryrun: bool = False) -> Dict[str, Optional[str]]:
         del zones, dryrun, cluster_name
 
-
-        acc_dict = self.get_accelerators_from_instance_type(
+        resources = resources.assert_launchable()
+        acc_dict = self.get_accelerators_from_instance_type(
+            resources.instance_type)
         if acc_dict is not None:
             custom_resources = json.dumps(acc_dict, separators=(',', ':'))
         else:
sky/clouds/fluidstack.py
CHANGED
@@ -188,9 +188,9 @@ class Fluidstack(clouds.Cloud):
     ) -> Dict[str, Optional[str]]:
 
         assert zones is None, 'FluidStack does not support zones.'
-
-
-
+        resources = resources.assert_launchable()
+        acc_dict = self.get_accelerators_from_instance_type(
+            resources.instance_type)
         custom_resources = resources_utils.make_ray_custom_resources_str(
             acc_dict)
 
sky/clouds/gcp.py
CHANGED
@@ -521,11 +521,13 @@ class GCP(clouds.Cloud):
         else:
             # Convert to GCP names:
             # https://cloud.google.com/compute/docs/gpus
-            if acc in ('A100-80GB', 'L4'):
+            if acc in ('A100-80GB', 'L4', 'B200'):
                 # A100-80GB and L4 have a different name pattern.
                 resources_vars['gpu'] = f'nvidia-{acc.lower()}'
             elif acc in ('H100', 'H100-MEGA'):
                 resources_vars['gpu'] = f'nvidia-{acc.lower()}-80gb'
+            elif acc in ('H200',):
+                resources_vars['gpu'] = f'nvidia-{acc.lower()}-141gb'
             else:
                 resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
                     acc.lower())
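The accelerator-to-GCP-name mapping this hunk extends, condensed into a standalone helper for reference (a sketch; SkyPilot builds the string inline as above):

```python
def gcp_gpu_name(acc: str) -> str:
    # B200, like A100-80GB and L4, uses the plain `nvidia-<name>` pattern.
    if acc in ('A100-80GB', 'L4', 'B200'):
        return f'nvidia-{acc.lower()}'
    if acc in ('H100', 'H100-MEGA'):
        return f'nvidia-{acc.lower()}-80gb'
    if acc in ('H200',):
        return f'nvidia-{acc.lower()}-141gb'
    return f'nvidia-tesla-{acc.lower()}'


assert gcp_gpu_name('B200') == 'nvidia-b200'
assert gcp_gpu_name('H200') == 'nvidia-h200-141gb'
assert gcp_gpu_name('T4') == 'nvidia-tesla-t4'
```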
@@ -1037,7 +1039,7 @@ class GCP(clouds.Cloud):
     @staticmethod
     def _check_instance_type_accelerators_combination(
             resources: 'resources.Resources') -> None:
-
+        resources = resources.assert_launchable()
         service_catalog.check_accelerator_attachable_to_host(
             resources.instance_type, resources.accelerators, resources.zone,
             'gcp')
@@ -1059,15 +1061,24 @@ class GCP(clouds.Cloud):
             raise exceptions.NotSupportedError(msg)
 
     @classmethod
-    def _get_disk_type(
-
-
-
-
+    def _get_disk_type(
+        cls,
+        instance_type: Optional[str],
+        disk_tier: Optional[resources_utils.DiskTier],
+    ) -> str:
+
+        def _propagate_disk_type(
+                lowest: Optional[str] = None,
+                highest: Optional[str] = None,
+                # pylint: disable=redefined-builtin
+                all: Optional[str] = None) -> None:
             if lowest is not None:
                 tier2name[resources_utils.DiskTier.LOW] = lowest
             if highest is not None:
                 tier2name[resources_utils.DiskTier.ULTRA] = highest
+            if all is not None:
+                for tier in tier2name:
+                    tier2name[tier] = all
 
         tier = cls._translate_disk_tier(disk_tier)
 
@@ -1081,7 +1092,8 @@ class GCP(clouds.Cloud):
 
         # Remap series-specific disk types.
         # Reference: https://github.com/skypilot-org/skypilot/issues/4705
-
+        assert instance_type is not None, (instance_type, disk_tier)
+        series = instance_type.split('-')[0]
 
         # General handling of unsupported disk types
         if series in ['n1', 'a2', 'g2']:
@@ -1092,6 +1104,9 @@ class GCP(clouds.Cloud):
             # These series don't support pd-standard, use pd-balanced for LOW.
             _propagate_disk_type(
                 lowest=tier2name[resources_utils.DiskTier.MEDIUM])
+        if instance_type.startswith('a3-ultragpu'):
+            # a3-ultragpu instances only support hyperdisk-balanced.
+            _propagate_disk_type(all='hyperdisk-balanced')
 
         # Series specific handling
         if series == 'n2':
@@ -1114,7 +1129,8 @@ class GCP(clouds.Cloud):
         specs: Dict[str, Any] = {
             'disk_tier': cls._get_disk_type(instance_type, disk_tier)
         }
-        if disk_tier == resources_utils.DiskTier.ULTRA
+        if (disk_tier == resources_utils.DiskTier.ULTRA and
+                specs['disk_tier'] == 'pd-extreme'):
             # Only pd-extreme supports custom iops.
             # see https://cloud.google.com/compute/docs/disks#disk-types
             specs['disk_iops'] = 20000
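The new `all=` parameter on `_propagate_disk_type` collapses every tier to a single disk type, which the a3-ultragpu branch uses since those instances only take hyperdisk-balanced. A runnable sketch with an illustrative `tier2name` table (the default values below are assumptions; the real table lives inside `GCP._get_disk_type`):

```python
import enum


class DiskTier(enum.Enum):
    LOW = 'low'
    MEDIUM = 'medium'
    HIGH = 'high'
    ULTRA = 'ultra'


# Illustrative defaults only.
tier2name = {
    DiskTier.LOW: 'pd-standard',
    DiskTier.MEDIUM: 'pd-balanced',
    DiskTier.HIGH: 'pd-ssd',
    DiskTier.ULTRA: 'pd-extreme',
}


def propagate_disk_type(lowest=None, highest=None, all=None):  # pylint: disable=redefined-builtin
    if lowest is not None:
        tier2name[DiskTier.LOW] = lowest
    if highest is not None:
        tier2name[DiskTier.ULTRA] = highest
    if all is not None:
        for tier in tier2name:
            tier2name[tier] = all


# a3-ultragpu instances only support hyperdisk-balanced:
propagate_disk_type(all='hyperdisk-balanced')
assert all(name == 'hyperdisk-balanced' for name in tier2name.values())
```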
sky/clouds/ibm.py
CHANGED
@@ -175,7 +175,7 @@ class IBM(clouds.Cloud):
         zones: Optional[List['clouds.Zone']],
         num_nodes: int,
         dryrun: bool = False,
-    ) -> Dict[str,
+    ) -> Dict[str, Any]:
         """Converts planned sky.Resources to cloud-specific resource variables.
 
         These variables are used to fill the node type section (instance type,
@@ -204,30 +204,32 @@ class IBM(clouds.Cloud):
         # clouds implementing 'zones_provision_loop()'
         zone_names = [zone.name for zone in zones]  # type: ignore[union-attr]
 
-
-        assert not
+        resources = resources.assert_launchable()
+        assert not resources.use_spot, \
             'IBM does not currently support spot instances in this framework'
 
-        acc_dict = self.get_accelerators_from_instance_type(
+        acc_dict = self.get_accelerators_from_instance_type(
+            resources.instance_type)
         custom_resources = resources_utils.make_ray_custom_resources_str(
             acc_dict)
 
-        instance_resources = _get_profile_resources(
+        instance_resources = _get_profile_resources(resources.instance_type)
 
         worker_instance_type = get_cred_file_field('worker_instance_type',
-
+                                                   resources.instance_type)
         worker_instance_resources = _get_profile_resources(worker_instance_type)
         # r.image_id: {clouds.Region:image_id} - property of Resources class
-        image_id =
-            region.name] if
+        image_id = resources.image_id[
+            region.name] if resources.image_id else self.get_default_image(
+                region_name)
 
         return {
-            'instance_type':
+            'instance_type': resources.instance_type,
             'instance_resources': instance_resources,
             'worker_instance_type': worker_instance_type,
             'worker_instance_resources': worker_instance_resources,
             'custom_resources': custom_resources,
-            'use_spot':
+            'use_spot': resources.use_spot,
             'region': region_name,
             'zones': ','.join(zone_names),
             'image_id': image_id,