skypilot-nightly 1.0.0.dev20241203__py3-none-any.whl → 1.0.0.dev20241205__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/backends/backend.py +42 -15
- sky/backends/backend_utils.py +143 -9
- sky/backends/cloud_vm_ray_backend.py +103 -25
- sky/backends/local_docker_backend.py +11 -7
- sky/cli.py +11 -2
- sky/clouds/service_catalog/common.py +2 -2
- sky/core.py +25 -18
- sky/exceptions.py +7 -0
- sky/execution.py +30 -11
- sky/global_user_state.py +23 -10
- sky/jobs/controller.py +28 -8
- sky/jobs/core.py +61 -35
- sky/jobs/recovery_strategy.py +2 -1
- sky/jobs/state.py +33 -1
- sky/jobs/utils.py +16 -2
- sky/setup_files/dependencies.py +141 -0
- sky/setup_files/setup.py +12 -124
- sky/skylet/constants.py +36 -11
- sky/skylet/log_lib.py +3 -1
- sky/skylet/log_lib.pyi +3 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -2
- sky/utils/common_utils.py +19 -0
- sky/utils/controller_utils.py +60 -98
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/METADATA +3 -2
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/RECORD +30 -29
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '6e5083293f0d9a9d069d51274c57f0e59e47e5ce'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20241203'
|
38
|
+
__version__ = '1.0.0.dev20241205'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
@@ -105,6 +105,7 @@ from sky.data import StorageMode
|
|
105
105
|
from sky.data import StoreType
|
106
106
|
from sky.execution import exec # pylint: disable=redefined-builtin
|
107
107
|
from sky.execution import launch
|
108
|
+
from sky.jobs import ManagedJobStatus
|
108
109
|
# TODO (zhwu): These imports are for backward compatibility, and spot APIs
|
109
110
|
# should be called with `sky.spot.xxx` instead. Remove in release 0.8.0
|
110
111
|
from sky.jobs.core import spot_cancel
|
@@ -163,6 +164,7 @@ __all__ = [
|
|
163
164
|
'StoreType',
|
164
165
|
'ClusterStatus',
|
165
166
|
'JobStatus',
|
167
|
+
'ManagedJobStatus',
|
166
168
|
# APIs
|
167
169
|
'Dag',
|
168
170
|
'Task',
|
sky/backends/backend.py
CHANGED
@@ -45,20 +45,45 @@ class Backend(Generic[_ResourceHandleType]):
|
|
45
45
|
@timeline.event
|
46
46
|
@usage_lib.messages.usage.update_runtime('provision')
|
47
47
|
def provision(
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
48
|
+
self,
|
49
|
+
task: 'task_lib.Task',
|
50
|
+
to_provision: Optional['resources.Resources'],
|
51
|
+
dryrun: bool,
|
52
|
+
stream_logs: bool,
|
53
|
+
cluster_name: Optional[str] = None,
|
54
|
+
retry_until_up: bool = False,
|
55
|
+
skip_unnecessary_provisioning: bool = False,
|
56
|
+
) -> Optional[_ResourceHandleType]:
|
57
|
+
"""Provisions resources for the given task.
|
58
|
+
|
59
|
+
Args:
|
60
|
+
task: The task to provision resources for.
|
61
|
+
to_provision: Resource config to provision. Should only be None if
|
62
|
+
cluster_name refers to an existing cluster, whose resources will
|
63
|
+
be used.
|
64
|
+
dryrun: If True, don't actually provision anything.
|
65
|
+
stream_logs: If True, stream additional logs to console.
|
66
|
+
cluster_name: Name of the cluster to provision. If None, a name will
|
67
|
+
be auto-generated. If the name refers to an existing cluster,
|
68
|
+
the existing cluster will be reused and re-provisioned.
|
69
|
+
retry_until_up: If True, retry provisioning until resources are
|
70
|
+
successfully launched.
|
71
|
+
skip_unnecessary_provisioning: If True, compare the cluster config to
|
72
|
+
the existing cluster_name's config. Skip provisioning if no
|
73
|
+
updates are needed for the existing cluster.
|
74
|
+
|
75
|
+
Returns:
|
76
|
+
A ResourceHandle object for the provisioned resources, or None if
|
77
|
+
dryrun is True.
|
78
|
+
"""
|
55
79
|
if cluster_name is None:
|
56
80
|
cluster_name = sky.backends.backend_utils.generate_cluster_name()
|
57
81
|
usage_lib.record_cluster_name_for_current_operation(cluster_name)
|
58
82
|
usage_lib.messages.usage.update_actual_task(task)
|
59
83
|
with rich_utils.safe_status(ux_utils.spinner_message('Launching')):
|
60
84
|
return self._provision(task, to_provision, dryrun, stream_logs,
|
61
|
-
cluster_name, retry_until_up
|
85
|
+
cluster_name, retry_until_up,
|
86
|
+
skip_unnecessary_provisioning)
|
62
87
|
|
63
88
|
@timeline.event
|
64
89
|
@usage_lib.messages.usage.update_runtime('sync_workdir')
|
@@ -126,13 +151,15 @@ class Backend(Generic[_ResourceHandleType]):
|
|
126
151
|
|
127
152
|
# --- Implementations of the APIs ---
|
128
153
|
def _provision(
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
154
|
+
self,
|
155
|
+
task: 'task_lib.Task',
|
156
|
+
to_provision: Optional['resources.Resources'],
|
157
|
+
dryrun: bool,
|
158
|
+
stream_logs: bool,
|
159
|
+
cluster_name: str,
|
160
|
+
retry_until_up: bool = False,
|
161
|
+
skip_unnecessary_provisioning: bool = False,
|
162
|
+
) -> Optional[_ResourceHandleType]:
|
136
163
|
raise NotImplementedError
|
137
164
|
|
138
165
|
def _sync_workdir(self, handle: _ResourceHandleType, workdir: Path) -> None:
|
sky/backends/backend_utils.py
CHANGED
@@ -3,6 +3,7 @@ from datetime import datetime
|
|
3
3
|
import enum
|
4
4
|
import fnmatch
|
5
5
|
import functools
|
6
|
+
import hashlib
|
6
7
|
import os
|
7
8
|
import pathlib
|
8
9
|
import pprint
|
@@ -644,11 +645,17 @@ def write_cluster_config(
|
|
644
645
|
keep_launch_fields_in_existing_config: bool = True) -> Dict[str, str]:
|
645
646
|
"""Fills in cluster configuration templates and writes them out.
|
646
647
|
|
647
|
-
Returns:
|
648
|
-
|
649
|
-
- 'ray'
|
650
|
-
- '
|
651
|
-
- '
|
648
|
+
Returns:
|
649
|
+
Dict with the following keys:
|
650
|
+
- 'ray': Path to the generated Ray yaml config file
|
651
|
+
- 'cluster_name': Name of the cluster
|
652
|
+
- 'cluster_name_on_cloud': Name of the cluster as it appears in the
|
653
|
+
cloud provider
|
654
|
+
- 'config_hash': Hash of the cluster config and file mounts contents.
|
655
|
+
Can be missing if we unexpectedly failed to calculate the hash for
|
656
|
+
some reason. In that case we will continue without the optimization to
|
657
|
+
skip provisioning.
|
658
|
+
|
652
659
|
Raises:
|
653
660
|
exceptions.ResourcesUnavailableError: if the region/zones requested does
|
654
661
|
not appear in the catalog, or an ssh_proxy_command is specified but
|
@@ -903,6 +910,12 @@ def write_cluster_config(
|
|
903
910
|
if dryrun:
|
904
911
|
# If dryrun, return the unfinished tmp yaml path.
|
905
912
|
config_dict['ray'] = tmp_yaml_path
|
913
|
+
try:
|
914
|
+
config_dict['config_hash'] = _deterministic_cluster_yaml_hash(
|
915
|
+
tmp_yaml_path)
|
916
|
+
except Exception as e: # pylint: disable=broad-except
|
917
|
+
logger.warning(f'Failed to calculate config_hash: {e}')
|
918
|
+
logger.debug('Full exception:', exc_info=e)
|
906
919
|
return config_dict
|
907
920
|
_add_auth_to_cluster_config(cloud, tmp_yaml_path)
|
908
921
|
|
@@ -925,6 +938,17 @@ def write_cluster_config(
|
|
925
938
|
yaml_config = common_utils.read_yaml(tmp_yaml_path)
|
926
939
|
config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
|
927
940
|
|
941
|
+
# Make sure to do this before we optimize file mounts. Optimization is
|
942
|
+
# non-deterministic, but everything else before this point should be
|
943
|
+
# deterministic.
|
944
|
+
try:
|
945
|
+
config_dict['config_hash'] = _deterministic_cluster_yaml_hash(
|
946
|
+
tmp_yaml_path)
|
947
|
+
except Exception as e: # pylint: disable=broad-except
|
948
|
+
logger.warning('Failed to calculate config_hash: '
|
949
|
+
f'{common_utils.format_exception(e)}')
|
950
|
+
logger.debug('Full exception:', exc_info=e)
|
951
|
+
|
928
952
|
# Optimization: copy the contents of source files in file_mounts to a
|
929
953
|
# special dir, and upload that as the only file_mount instead. Delay
|
930
954
|
# calling this optimization until now, when all source files have been
|
@@ -1033,6 +1057,115 @@ def _count_healthy_nodes_from_ray(output: str,
|
|
1033
1057
|
return ready_head, ready_workers
|
1034
1058
|
|
1035
1059
|
|
1060
|
+
@timeline.event
|
1061
|
+
def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
|
1062
|
+
"""Hash the cluster yaml and contents of file mounts to a unique string.
|
1063
|
+
|
1064
|
+
Two invocations of this function should return the same string if and only
|
1065
|
+
if the contents of the yaml are the same and the file contents of all the
|
1066
|
+
file_mounts specified in the yaml are the same.
|
1067
|
+
|
1068
|
+
Limitations:
|
1069
|
+
- This function can be expensive if the file mounts are large. (E.g. a few
|
1070
|
+
seconds for ~1GB.) This should be okay since we expect that the
|
1071
|
+
file_mounts in the cluster yaml (the wheel and cloud credentials) will be
|
1072
|
+
small.
|
1073
|
+
- Symbolic links are not explicitly handled. Some symbolic link changes may
|
1074
|
+
not be detected.
|
1075
|
+
|
1076
|
+
Implementation: We create a byte sequence that captures the state of the
|
1077
|
+
yaml file and all the files in the file mounts, then hash the byte sequence.
|
1078
|
+
|
1079
|
+
The format of the byte sequence is:
|
1080
|
+
32 bytes - sha256 hash of the yaml file
|
1081
|
+
for each file mount:
|
1082
|
+
file mount remote destination (UTF-8), \0
|
1083
|
+
if the file mount source is a file:
|
1084
|
+
'file' encoded to UTF-8
|
1085
|
+
32 byte sha256 hash of the file contents
|
1086
|
+
if the file mount source is a directory:
|
1087
|
+
'dir' encoded to UTF-8
|
1088
|
+
for each directory and subdirectory within the file mount (starting from
|
1089
|
+
the root and descending recursively):
|
1090
|
+
name of the directory (UTF-8), \0
|
1091
|
+
name of each subdirectory within the directory (UTF-8) terminated by \0
|
1092
|
+
\0
|
1093
|
+
for each file in the directory:
|
1094
|
+
name of the file (UTF-8), \0
|
1095
|
+
32 bytes - sha256 hash of the file contents
|
1096
|
+
\0
|
1097
|
+
if the file mount source is something else or does not exist, nothing
|
1098
|
+
\0\0
|
1099
|
+
|
1100
|
+
Rather than constructing the whole byte sequence, which may be quite large,
|
1101
|
+
we construct it incrementally by using hash.update() to add new bytes.
|
1102
|
+
"""
|
1103
|
+
|
1104
|
+
def _hash_file(path: str) -> bytes:
|
1105
|
+
return common_utils.hash_file(path, 'sha256').digest()
|
1106
|
+
|
1107
|
+
config_hash = hashlib.sha256()
|
1108
|
+
|
1109
|
+
config_hash.update(_hash_file(yaml_path))
|
1110
|
+
|
1111
|
+
yaml_config = common_utils.read_yaml(yaml_path)
|
1112
|
+
file_mounts = yaml_config.get('file_mounts', {})
|
1113
|
+
# Remove the empty ('' -> '') file mount entry added by the newline.
|
1114
|
+
if '' in file_mounts:
|
1115
|
+
assert file_mounts[''] == '', file_mounts['']
|
1116
|
+
file_mounts.pop('')
|
1117
|
+
|
1118
|
+
for dst, src in sorted(file_mounts.items()):
|
1119
|
+
expanded_src = os.path.expanduser(src)
|
1120
|
+
config_hash.update(dst.encode('utf-8') + b'\0')
|
1121
|
+
|
1122
|
+
# If the file mount source is a symlink, this should be true. In that
|
1123
|
+
# case we hash the contents of the symlink destination.
|
1124
|
+
if os.path.isfile(expanded_src):
|
1125
|
+
config_hash.update('file'.encode('utf-8'))
|
1126
|
+
config_hash.update(_hash_file(expanded_src))
|
1127
|
+
|
1128
|
+
# This can also be a symlink to a directory. os.walk will treat it as a
|
1129
|
+
# normal directory and list the contents of the symlink destination.
|
1130
|
+
elif os.path.isdir(expanded_src):
|
1131
|
+
config_hash.update('dir'.encode('utf-8'))
|
1132
|
+
|
1133
|
+
# Aside from expanded_src, os.walk will list symlinks to directories
|
1134
|
+
# but will not recurse into them.
|
1135
|
+
for (dirpath, dirnames, filenames) in os.walk(expanded_src):
|
1136
|
+
config_hash.update(dirpath.encode('utf-8') + b'\0')
|
1137
|
+
|
1138
|
+
# Note: inplace sort will also affect the traversal order of
|
1139
|
+
# os.walk. We need it so that the os.walk order is
|
1140
|
+
# deterministic.
|
1141
|
+
dirnames.sort()
|
1142
|
+
# This includes symlinks to directories. os.walk will recurse
|
1143
|
+
# into all the directories but not the symlinks. We don't hash
|
1144
|
+
# the link destination, so if a symlink to a directory changes,
|
1145
|
+
# we won't notice.
|
1146
|
+
for dirname in dirnames:
|
1147
|
+
config_hash.update(dirname.encode('utf-8') + b'\0')
|
1148
|
+
config_hash.update(b'\0')
|
1149
|
+
|
1150
|
+
filenames.sort()
|
1151
|
+
# This includes symlinks to files. We could hash the symlink
|
1152
|
+
# destination itself but instead just hash the destination
|
1153
|
+
# contents.
|
1154
|
+
for filename in filenames:
|
1155
|
+
config_hash.update(filename.encode('utf-8') + b'\0')
|
1156
|
+
config_hash.update(
|
1157
|
+
_hash_file(os.path.join(dirpath, filename)))
|
1158
|
+
config_hash.update(b'\0')
|
1159
|
+
|
1160
|
+
else:
|
1161
|
+
logger.debug(
|
1162
|
+
f'Unexpected file_mount that is not a file or dir: {src}')
|
1163
|
+
|
1164
|
+
config_hash.update(b'\0\0')
|
1165
|
+
|
1166
|
+
return config_hash.hexdigest()
|
1167
|
+
|
1168
|
+
|
1036
1169
|
def get_docker_user(ip: str, cluster_config_file: str) -> str:
|
1037
1170
|
"""Find docker container username."""
|
1038
1171
|
ssh_credentials = ssh_credential_from_yaml(cluster_config_file)
|
@@ -1612,14 +1745,14 @@ def check_can_clone_disk_and_override_task(
|
|
1612
1745
|
The task to use and the resource handle of the source cluster.
|
1613
1746
|
|
1614
1747
|
Raises:
|
1615
|
-
|
1748
|
+
exceptions.ClusterDoesNotExist: If the source cluster does not exist.
|
1616
1749
|
exceptions.NotSupportedError: If the source cluster is not valid or the
|
1617
1750
|
task is not compatible to clone disk from the source cluster.
|
1618
1751
|
"""
|
1619
1752
|
source_cluster_status, handle = refresh_cluster_status_handle(cluster_name)
|
1620
1753
|
if source_cluster_status is None:
|
1621
1754
|
with ux_utils.print_exception_no_traceback():
|
1622
|
-
raise
|
1755
|
+
raise exceptions.ClusterDoesNotExist(
|
1623
1756
|
f'Cannot find cluster {cluster_name!r} to clone disk from.')
|
1624
1757
|
|
1625
1758
|
if not isinstance(handle, backends.CloudVmRayResourceHandle):
|
@@ -2136,7 +2269,7 @@ def check_cluster_available(
|
|
2136
2269
|
"""Check if the cluster is available.
|
2137
2270
|
|
2138
2271
|
Raises:
|
2139
|
-
|
2272
|
+
exceptions.ClusterDoesNotExist: if the cluster does not exist.
|
2140
2273
|
exceptions.ClusterNotUpError: if the cluster is not UP.
|
2141
2274
|
exceptions.NotSupportedError: if the cluster is not based on
|
2142
2275
|
CloudVmRayBackend.
|
@@ -2201,7 +2334,8 @@ def check_cluster_available(
|
|
2201
2334
|
error_msg += message
|
2202
2335
|
|
2203
2336
|
with ux_utils.print_exception_no_traceback():
|
2204
|
-
raise
|
2337
|
+
raise exceptions.ClusterDoesNotExist(
|
2338
|
+
f'{colorama.Fore.YELLOW}{error_msg}{reset}')
|
2205
2339
|
assert cluster_status is not None, 'handle is not None but status is None'
|
2206
2340
|
backend = get_backend_from_handle(handle)
|
2207
2341
|
if check_cloud_vm_ray_backend and not isinstance(
|
@@ -301,6 +301,8 @@ class RayCodeGen:
|
|
301
301
|
)
|
302
302
|
def get_or_fail(futures, pg) -> List[int]:
|
303
303
|
\"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
|
304
|
+
if not futures:
|
305
|
+
return []
|
304
306
|
returncodes = [1] * len(futures)
|
305
307
|
# Wait for 1 task to be ready.
|
306
308
|
ready = []
|
@@ -1153,6 +1155,7 @@ class RetryingVmProvisioner(object):
|
|
1153
1155
|
prev_cluster_status: Optional[status_lib.ClusterStatus],
|
1154
1156
|
prev_handle: Optional['CloudVmRayResourceHandle'],
|
1155
1157
|
prev_cluster_ever_up: bool,
|
1158
|
+
prev_config_hash: Optional[str],
|
1156
1159
|
) -> None:
|
1157
1160
|
assert cluster_name is not None, 'cluster_name must be specified.'
|
1158
1161
|
self.cluster_name = cluster_name
|
@@ -1161,6 +1164,7 @@ class RetryingVmProvisioner(object):
|
|
1161
1164
|
self.prev_cluster_status = prev_cluster_status
|
1162
1165
|
self.prev_handle = prev_handle
|
1163
1166
|
self.prev_cluster_ever_up = prev_cluster_ever_up
|
1167
|
+
self.prev_config_hash = prev_config_hash
|
1164
1168
|
|
1165
1169
|
def __init__(self,
|
1166
1170
|
log_dir: str,
|
@@ -1322,8 +1326,21 @@ class RetryingVmProvisioner(object):
|
|
1322
1326
|
prev_cluster_status: Optional[status_lib.ClusterStatus],
|
1323
1327
|
prev_handle: Optional['CloudVmRayResourceHandle'],
|
1324
1328
|
prev_cluster_ever_up: bool,
|
1329
|
+
skip_if_config_hash_matches: Optional[str],
|
1325
1330
|
) -> Dict[str, Any]:
|
1326
|
-
"""The provision retry loop.
|
1331
|
+
"""The provision retry loop.
|
1332
|
+
|
1333
|
+
Returns a config_dict with the following fields:
|
1334
|
+
All fields from backend_utils.write_cluster_config(). See its
|
1335
|
+
docstring.
|
1336
|
+
- 'provisioning_skipped': True if provisioning was short-circuited
|
1337
|
+
by skip_if_config_hash_matches, False otherwise.
|
1338
|
+
- 'handle': The provisioned cluster handle.
|
1339
|
+
- 'provision_record': (Only if using the new skypilot provisioner) The
|
1340
|
+
record returned by provisioner.bulk_provision().
|
1341
|
+
- 'resources_vars': (Only if using the new skypilot provisioner) The
|
1342
|
+
resources variables given by make_deploy_resources_variables().
|
1343
|
+
"""
|
1327
1344
|
# Get log_path name
|
1328
1345
|
log_path = os.path.join(self.log_dir, 'provision.log')
|
1329
1346
|
log_abs_path = os.path.abspath(log_path)
|
@@ -1432,8 +1449,18 @@ class RetryingVmProvisioner(object):
|
|
1432
1449
|
raise exceptions.ResourcesUnavailableError(
|
1433
1450
|
f'Failed to provision on cloud {to_provision.cloud} due to '
|
1434
1451
|
f'invalid cloud config: {common_utils.format_exception(e)}')
|
1452
|
+
|
1453
|
+
if ('config_hash' in config_dict and
|
1454
|
+
skip_if_config_hash_matches == config_dict['config_hash']):
|
1455
|
+
logger.debug('Skipping provisioning of cluster with matching '
|
1456
|
+
'config hash.')
|
1457
|
+
config_dict['provisioning_skipped'] = True
|
1458
|
+
return config_dict
|
1459
|
+
config_dict['provisioning_skipped'] = False
|
1460
|
+
|
1435
1461
|
if dryrun:
|
1436
1462
|
return config_dict
|
1463
|
+
|
1437
1464
|
cluster_config_file = config_dict['ray']
|
1438
1465
|
|
1439
1466
|
launched_resources = to_provision.copy(region=region.name)
|
@@ -1945,8 +1972,13 @@ class RetryingVmProvisioner(object):
|
|
1945
1972
|
to_provision_config: ToProvisionConfig,
|
1946
1973
|
dryrun: bool,
|
1947
1974
|
stream_logs: bool,
|
1975
|
+
skip_unnecessary_provisioning: bool,
|
1948
1976
|
) -> Dict[str, Any]:
|
1949
|
-
"""Provision with retries for all launchable resources.
|
1977
|
+
"""Provision with retries for all launchable resources.
|
1978
|
+
|
1979
|
+
Returns the config_dict from _retry_zones() - see its docstring for
|
1980
|
+
details.
|
1981
|
+
"""
|
1950
1982
|
cluster_name = to_provision_config.cluster_name
|
1951
1983
|
to_provision = to_provision_config.resources
|
1952
1984
|
num_nodes = to_provision_config.num_nodes
|
@@ -1955,6 +1987,8 @@ class RetryingVmProvisioner(object):
|
|
1955
1987
|
prev_cluster_ever_up = to_provision_config.prev_cluster_ever_up
|
1956
1988
|
launchable_retries_disabled = (self._dag is None or
|
1957
1989
|
self._optimize_target is None)
|
1990
|
+
skip_if_config_hash_matches = (to_provision_config.prev_config_hash if
|
1991
|
+
skip_unnecessary_provisioning else None)
|
1958
1992
|
|
1959
1993
|
failover_history: List[Exception] = list()
|
1960
1994
|
|
@@ -1994,7 +2028,8 @@ class RetryingVmProvisioner(object):
|
|
1994
2028
|
cloud_user_identity=cloud_user,
|
1995
2029
|
prev_cluster_status=prev_cluster_status,
|
1996
2030
|
prev_handle=prev_handle,
|
1997
|
-
prev_cluster_ever_up=prev_cluster_ever_up
|
2031
|
+
prev_cluster_ever_up=prev_cluster_ever_up,
|
2032
|
+
skip_if_config_hash_matches=skip_if_config_hash_matches)
|
1998
2033
|
if dryrun:
|
1999
2034
|
return config_dict
|
2000
2035
|
except (exceptions.InvalidClusterNameError,
|
@@ -2695,14 +2730,21 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2695
2730
|
return valid_resource
|
2696
2731
|
|
2697
2732
|
def _provision(
|
2698
|
-
|
2699
|
-
|
2700
|
-
|
2701
|
-
|
2702
|
-
|
2703
|
-
|
2704
|
-
|
2705
|
-
|
2733
|
+
self,
|
2734
|
+
task: task_lib.Task,
|
2735
|
+
to_provision: Optional[resources_lib.Resources],
|
2736
|
+
dryrun: bool,
|
2737
|
+
stream_logs: bool,
|
2738
|
+
cluster_name: str,
|
2739
|
+
retry_until_up: bool = False,
|
2740
|
+
skip_unnecessary_provisioning: bool = False,
|
2741
|
+
) -> Optional[CloudVmRayResourceHandle]:
|
2742
|
+
"""Provisions the cluster, or re-provisions an existing cluster.
|
2743
|
+
|
2744
|
+
Use the SKYPILOT provisioner if it's supported by the cloud, otherwise
|
2745
|
+
use 'ray up'.
|
2746
|
+
|
2747
|
+
See also docstring for Backend.provision().
|
2706
2748
|
|
2707
2749
|
Raises:
|
2708
2750
|
exceptions.ClusterOwnerIdentityMismatchError: if the cluster
|
@@ -2787,7 +2829,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2787
2829
|
rich_utils.force_update_status(
|
2788
2830
|
ux_utils.spinner_message('Launching', log_path))
|
2789
2831
|
config_dict = retry_provisioner.provision_with_retries(
|
2790
|
-
task, to_provision_config, dryrun, stream_logs
|
2832
|
+
task, to_provision_config, dryrun, stream_logs,
|
2833
|
+
skip_unnecessary_provisioning)
|
2791
2834
|
break
|
2792
2835
|
except exceptions.ResourcesUnavailableError as e:
|
2793
2836
|
# Do not remove the stopped cluster from the global state
|
@@ -2837,11 +2880,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2837
2880
|
record = global_user_state.get_cluster_from_name(cluster_name)
|
2838
2881
|
return record['handle'] if record is not None else None
|
2839
2882
|
|
2883
|
+
if config_dict['provisioning_skipped']:
|
2884
|
+
# Skip further provisioning.
|
2885
|
+
# In this case, we won't have certain fields in the config_dict
|
2886
|
+
# ('handle', 'provision_record', 'resources_vars')
|
2887
|
+
# We need to return the handle - but it should be the existing
|
2888
|
+
# handle for the cluster.
|
2889
|
+
record = global_user_state.get_cluster_from_name(cluster_name)
|
2890
|
+
assert record is not None and record['handle'] is not None, (
|
2891
|
+
cluster_name, record)
|
2892
|
+
return record['handle']
|
2893
|
+
|
2840
2894
|
if 'provision_record' in config_dict:
|
2841
2895
|
# New provisioner is used here.
|
2842
2896
|
handle = config_dict['handle']
|
2843
2897
|
provision_record = config_dict['provision_record']
|
2844
2898
|
resources_vars = config_dict['resources_vars']
|
2899
|
+
config_hash = config_dict.get('config_hash', None)
|
2845
2900
|
|
2846
2901
|
# Setup SkyPilot runtime after the cluster is provisioned
|
2847
2902
|
# 1. Wait for SSH to be ready.
|
@@ -2876,7 +2931,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2876
2931
|
self._update_after_cluster_provisioned(
|
2877
2932
|
handle, to_provision_config.prev_handle, task,
|
2878
2933
|
prev_cluster_status, handle.external_ips(),
|
2879
|
-
handle.external_ssh_ports(), lock_path)
|
2934
|
+
handle.external_ssh_ports(), lock_path, config_hash)
|
2880
2935
|
return handle
|
2881
2936
|
|
2882
2937
|
cluster_config_file = config_dict['ray']
|
@@ -2948,7 +3003,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2948
3003
|
|
2949
3004
|
self._update_after_cluster_provisioned(
|
2950
3005
|
handle, to_provision_config.prev_handle, task,
|
2951
|
-
prev_cluster_status, ip_list, ssh_port_list, lock_path
|
3006
|
+
prev_cluster_status, ip_list, ssh_port_list, lock_path,
|
3007
|
+
config_hash)
|
2952
3008
|
return handle
|
2953
3009
|
|
2954
3010
|
def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
|
@@ -2966,8 +3022,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2966
3022
|
prev_handle: Optional[CloudVmRayResourceHandle],
|
2967
3023
|
task: task_lib.Task,
|
2968
3024
|
prev_cluster_status: Optional[status_lib.ClusterStatus],
|
2969
|
-
ip_list: List[str], ssh_port_list: List[int],
|
2970
|
-
|
3025
|
+
ip_list: List[str], ssh_port_list: List[int], lock_path: str,
|
3026
|
+
config_hash: str) -> None:
|
2971
3027
|
usage_lib.messages.usage.update_cluster_resources(
|
2972
3028
|
handle.launched_nodes, handle.launched_resources)
|
2973
3029
|
usage_lib.messages.usage.update_final_cluster_status(
|
@@ -3027,6 +3083,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3027
3083
|
handle,
|
3028
3084
|
set(task.resources),
|
3029
3085
|
ready=True,
|
3086
|
+
config_hash=config_hash,
|
3030
3087
|
)
|
3031
3088
|
usage_lib.messages.usage.update_final_cluster_status(
|
3032
3089
|
status_lib.ClusterStatus.UP)
|
@@ -3460,15 +3517,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3460
3517
|
Returns:
|
3461
3518
|
Job id if the task is submitted to the cluster, None otherwise.
|
3462
3519
|
"""
|
3463
|
-
if task.run is None:
|
3520
|
+
if task.run is None and self._setup_cmd is None:
|
3521
|
+
# This message is fine without mentioning setup, as there are three
|
3522
|
+
# cases when run section is empty:
|
3523
|
+
# 1. setup specified, no --detach-setup: setup is executed and this
|
3524
|
+
# message is fine for saying no run command specified.
|
3525
|
+
# 2. setup specified, with --detach-setup: setup is executed in
|
3526
|
+
# detached mode and this message will not be shown.
|
3527
|
+
# 3. no setup specified: this message is fine as a user is likely
|
3528
|
+
# creating a cluster only, and ok with the empty run command.
|
3464
3529
|
logger.info('Run commands not specified or empty.')
|
3465
3530
|
return None
|
3466
|
-
|
3467
|
-
|
3468
|
-
|
3469
|
-
|
3470
|
-
|
3471
|
-
|
3531
|
+
if task.run is None:
|
3532
|
+
# If the task has no run command, we still need to execute the
|
3533
|
+
# generated ray driver program to run the setup command in detached
|
3534
|
+
# mode.
|
3535
|
+
# In this case, we reset the resources for the task, so that the
|
3536
|
+
# detached setup does not need to wait for the task resources to be
|
3537
|
+
# ready (which is not used for setup anyway).
|
3538
|
+
valid_resource = sky.Resources()
|
3539
|
+
else:
|
3540
|
+
# Check the task resources vs the cluster resources. Since
|
3541
|
+
# `sky exec` will not run the provision and _check_existing_cluster,
|
3542
|
+
# we need to check ports here since sky.exec shouldn't change
|
3543
|
+
# resources.
|
3544
|
+
valid_resource = self.check_resources_fit_cluster(handle,
|
3545
|
+
task,
|
3546
|
+
check_ports=True)
|
3472
3547
|
task_copy = copy.copy(task)
|
3473
3548
|
# Handle multiple resources exec case.
|
3474
3549
|
task_copy.set_resources(valid_resource)
|
@@ -4328,6 +4403,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4328
4403
|
# cluster is terminated (through console or auto-down), the record will
|
4329
4404
|
# become None and the cluster_ever_up should be considered as False.
|
4330
4405
|
cluster_ever_up = record is not None and record['cluster_ever_up']
|
4406
|
+
prev_config_hash = record['config_hash'] if record is not None else None
|
4331
4407
|
logger.debug(f'cluster_ever_up: {cluster_ever_up}')
|
4332
4408
|
logger.debug(f'record: {record}')
|
4333
4409
|
|
@@ -4366,7 +4442,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4366
4442
|
handle.launched_nodes,
|
4367
4443
|
prev_cluster_status=prev_cluster_status,
|
4368
4444
|
prev_handle=handle,
|
4369
|
-
prev_cluster_ever_up=cluster_ever_up
|
4445
|
+
prev_cluster_ever_up=cluster_ever_up,
|
4446
|
+
prev_config_hash=prev_config_hash)
|
4370
4447
|
usage_lib.messages.usage.set_new_cluster()
|
4371
4448
|
# Use the task_cloud, because the cloud in `to_provision` can be changed
|
4372
4449
|
# later during the retry.
|
@@ -4407,7 +4484,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4407
4484
|
task.num_nodes,
|
4408
4485
|
prev_cluster_status=None,
|
4409
4486
|
prev_handle=None,
|
4410
|
-
prev_cluster_ever_up=False
|
4487
|
+
prev_cluster_ever_up=False,
|
4488
|
+
prev_config_hash=prev_config_hash)
|
4411
4489
|
|
4412
4490
|
def _execute_file_mounts(self, handle: CloudVmRayResourceHandle,
|
4413
4491
|
file_mounts: Optional[Dict[Path, Path]]):
|
@@ -131,13 +131,14 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
|
|
131
131
|
pass
|
132
132
|
|
133
133
|
def _provision(
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
134
|
+
self,
|
135
|
+
task: 'task_lib.Task',
|
136
|
+
to_provision: Optional['resources.Resources'],
|
137
|
+
dryrun: bool,
|
138
|
+
stream_logs: bool,
|
139
|
+
cluster_name: str,
|
140
|
+
retry_until_up: bool = False,
|
141
|
+
skip_unnecessary_provisioning: bool = False,
|
141
142
|
) -> Optional[LocalDockerResourceHandle]:
|
142
143
|
"""Builds docker image for the task and returns cluster name as handle.
|
143
144
|
|
@@ -153,6 +154,9 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
|
|
153
154
|
logger.warning(
|
154
155
|
f'Retrying until up is not supported in backend: {self.NAME}. '
|
155
156
|
'Ignored the flag.')
|
157
|
+
if skip_unnecessary_provisioning:
|
158
|
+
logger.warning(f'skip_unnecessary_provisioning is not supported in '
|
159
|
+
f'backend: {self.NAME}. Ignored the flag.')
|
156
160
|
if stream_logs:
|
157
161
|
logger.info(
|
158
162
|
'Streaming build logs is not supported in LocalDockerBackend. '
|
sky/cli.py
CHANGED
@@ -3914,16 +3914,25 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
|
|
3914
3914
|
default=False,
|
3915
3915
|
help=('Show the controller logs of this job; useful for debugging '
|
3916
3916
|
'launching/recoveries, etc.'))
|
3917
|
+
@click.option(
|
3918
|
+
'--refresh',
|
3919
|
+
'-r',
|
3920
|
+
default=False,
|
3921
|
+
is_flag=True,
|
3922
|
+
required=False,
|
3923
|
+
help='Query the latest job logs, restarting the jobs controller if stopped.'
|
3924
|
+
)
|
3917
3925
|
@click.argument('job_id', required=False, type=int)
|
3918
3926
|
@usage_lib.entrypoint
|
3919
3927
|
def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
|
3920
|
-
controller: bool):
|
3928
|
+
controller: bool, refresh: bool):
|
3921
3929
|
"""Tail the log of a managed job."""
|
3922
3930
|
try:
|
3923
3931
|
managed_jobs.tail_logs(name=name,
|
3924
3932
|
job_id=job_id,
|
3925
3933
|
follow=follow,
|
3926
|
-
controller=controller
|
3934
|
+
controller=controller,
|
3935
|
+
refresh=refresh)
|
3927
3936
|
except exceptions.ClusterNotUpError:
|
3928
3937
|
with ux_utils.print_exception_no_traceback():
|
3929
3938
|
raise
|
@@ -15,6 +15,7 @@ from sky.adaptors import common as adaptors_common
|
|
15
15
|
from sky.clouds import cloud as cloud_lib
|
16
16
|
from sky.clouds import cloud_registry
|
17
17
|
from sky.clouds.service_catalog import constants
|
18
|
+
from sky.utils import common_utils
|
18
19
|
from sky.utils import rich_utils
|
19
20
|
from sky.utils import ux_utils
|
20
21
|
|
@@ -69,8 +70,7 @@ def is_catalog_modified(filename: str) -> bool:
|
|
69
70
|
meta_path = os.path.join(_ABSOLUTE_VERSIONED_CATALOG_DIR, '.meta', filename)
|
70
71
|
md5_filepath = meta_path + '.md5'
|
71
72
|
if os.path.exists(md5_filepath):
|
72
|
-
|
73
|
-
file_md5 = hashlib.md5(f.read()).hexdigest()
|
73
|
+
file_md5 = common_utils.hash_file(catalog_path, 'md5').hexdigest()
|
74
74
|
with open(md5_filepath, 'r', encoding='utf-8') as f:
|
75
75
|
last_md5 = f.read()
|
76
76
|
return file_md5 != last_md5
|