skypilot-nightly 1.0.0.dev20241203__py3-none-any.whl → 1.0.0.dev20241205__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = '747382a7cf75c691c91846efe708ce25b2a3aeb8'
+ _SKYPILOT_COMMIT_SHA = '6e5083293f0d9a9d069d51274c57f0e59e47e5ce'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20241203'
+ __version__ = '1.0.0.dev20241205'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -105,6 +105,7 @@ from sky.data import StorageMode
 from sky.data import StoreType
 from sky.execution import exec # pylint: disable=redefined-builtin
 from sky.execution import launch
+ from sky.jobs import ManagedJobStatus
 # TODO (zhwu): These imports are for backward compatibility, and spot APIs
 # should be called with `sky.spot.xxx` instead. Remove in release 0.8.0
 from sky.jobs.core import spot_cancel
@@ -163,6 +164,7 @@ __all__ = [
 'StoreType',
 'ClusterStatus',
 'JobStatus',
+ 'ManagedJobStatus',
 # APIs
 'Dag',
 'Task',
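
Note: the new top-level export means ManagedJobStatus can now be imported directly from sky. A minimal sketch of how a caller might use it; the specific enum members referenced (RUNNING, SUCCEEDED) are assumptions based on typical managed-job states, not confirmed by this diff:

    import sky

    # Hypothetical usage of the newly exported enum; member names are assumed.
    status = sky.ManagedJobStatus.RUNNING
    print(status)
    print(status == sky.ManagedJobStatus.SUCCEEDED)  # False for a running job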
sky/backends/backend.py CHANGED
@@ -45,20 +45,45 @@ class Backend(Generic[_ResourceHandleType]):
 @timeline.event
 @usage_lib.messages.usage.update_runtime('provision')
 def provision(
- self,
- task: 'task_lib.Task',
- to_provision: Optional['resources.Resources'],
- dryrun: bool,
- stream_logs: bool,
- cluster_name: Optional[str] = None,
- retry_until_up: bool = False) -> Optional[_ResourceHandleType]:
+ self,
+ task: 'task_lib.Task',
+ to_provision: Optional['resources.Resources'],
+ dryrun: bool,
+ stream_logs: bool,
+ cluster_name: Optional[str] = None,
+ retry_until_up: bool = False,
+ skip_unnecessary_provisioning: bool = False,
+ ) -> Optional[_ResourceHandleType]:
+ """Provisions resources for the given task.
+
+ Args:
+ task: The task to provision resources for.
+ to_provision: Resource config to provision. Should only be None if
+ cluster_name refers to an existing cluster, whose resources will
+ be used.
+ dryrun: If True, don't actually provision anything.
+ stream_logs: If True, stream additional logs to console.
+ cluster_name: Name of the cluster to provision. If None, a name will
+ be auto-generated. If the name refers to an existing cluster,
+ the existing cluster will be reused and re-provisioned.
+ retry_until_up: If True, retry provisioning until resources are
+ successfully launched.
+ skip_unnecessary_provisioning: If True, compare the cluster config to
+ the existing cluster_name's config. Skip provisioning if no
+ updates are needed for the existing cluster.
+
+ Returns:
+ A ResourceHandle object for the provisioned resources, or None if
+ dryrun is True.
+ """
 if cluster_name is None:
 cluster_name = sky.backends.backend_utils.generate_cluster_name()
 usage_lib.record_cluster_name_for_current_operation(cluster_name)
 usage_lib.messages.usage.update_actual_task(task)
 with rich_utils.safe_status(ux_utils.spinner_message('Launching')):
 return self._provision(task, to_provision, dryrun, stream_logs,
- cluster_name, retry_until_up)
+ cluster_name, retry_until_up,
+ skip_unnecessary_provisioning)
 
 @timeline.event
 @usage_lib.messages.usage.update_runtime('sync_workdir')
@@ -126,13 +151,15 @@ class Backend(Generic[_ResourceHandleType]):
 
 # --- Implementations of the APIs ---
 def _provision(
- self,
- task: 'task_lib.Task',
- to_provision: Optional['resources.Resources'],
- dryrun: bool,
- stream_logs: bool,
- cluster_name: str,
- retry_until_up: bool = False) -> Optional[_ResourceHandleType]:
+ self,
+ task: 'task_lib.Task',
+ to_provision: Optional['resources.Resources'],
+ dryrun: bool,
+ stream_logs: bool,
+ cluster_name: str,
+ retry_until_up: bool = False,
+ skip_unnecessary_provisioning: bool = False,
+ ) -> Optional[_ResourceHandleType]:
 raise NotImplementedError
 
 def _sync_workdir(self, handle: _ResourceHandleType, workdir: Path) -> None:
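
Note: with the widened signature, a caller holding a Backend instance and a task could presumably request the new skip behavior as below. This is a sketch only; the variable names and the cluster name are placeholders, not values from this diff.

    # Sketch: invoking provision() with the new flag. 'backend' and 'task' are
    # assumed to be an existing Backend subclass instance and a sky Task.
    handle = backend.provision(
        task,
        to_provision=None,  # None reuses the existing cluster's resources
        dryrun=False,
        stream_logs=True,
        cluster_name='my-cluster',  # placeholder name
        retry_until_up=False,
        skip_unnecessary_provisioning=True,  # skip if the config hash matches
    )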
sky/backends/backend_utils.py CHANGED
@@ -3,6 +3,7 @@ from datetime import datetime
 import enum
 import fnmatch
 import functools
+ import hashlib
 import os
 import pathlib
 import pprint
@@ -644,11 +645,17 @@ def write_cluster_config(
 keep_launch_fields_in_existing_config: bool = True) -> Dict[str, str]:
 """Fills in cluster configuration templates and writes them out.
 
- Returns: {provisioner: path to yaml, the provisioning spec}.
- 'provisioner' can be
- - 'ray'
- - 'tpu-create-script' (if TPU is requested)
- - 'tpu-delete-script' (if TPU is requested)
+ Returns:
+ Dict with the following keys:
+ - 'ray': Path to the generated Ray yaml config file
+ - 'cluster_name': Name of the cluster
+ - 'cluster_name_on_cloud': Name of the cluster as it appears in the
+ cloud provider
+ - 'config_hash': Hash of the cluster config and file mounts contents.
+ Can be missing if we unexpectedly failed to calculate the hash for
+ some reason. In that case we will continue without the optimization to
+ skip provisioning.
+
 Raises:
 exceptions.ResourcesUnavailableError: if the region/zones requested does
 not appear in the catalog, or an ssh_proxy_command is specified but
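
Note: because 'config_hash' may be absent when hashing fails, callers should treat it as optional. A minimal, self-contained sketch of that lookup; the dict literal below is a placeholder standing in for write_cluster_config()'s output, not real values:

    # Placeholder return value standing in for write_cluster_config()'s output.
    config_dict = {
        'ray': '/tmp/cluster.yaml',
        'cluster_name': 'my-cluster',
        'cluster_name_on_cloud': 'my-cluster-1234',
        # 'config_hash' may be missing if hashing failed unexpectedly.
    }
    config_hash = config_dict.get('config_hash')
    if config_hash is None:
        print('No config hash available; provisioning cannot be skipped.')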
@@ -903,6 +910,12 @@ def write_cluster_config(
 if dryrun:
 # If dryrun, return the unfinished tmp yaml path.
 config_dict['ray'] = tmp_yaml_path
+ try:
+ config_dict['config_hash'] = _deterministic_cluster_yaml_hash(
+ tmp_yaml_path)
+ except Exception as e: # pylint: disable=broad-except
+ logger.warning(f'Failed to calculate config_hash: {e}')
+ logger.debug('Full exception:', exc_info=e)
 return config_dict
 _add_auth_to_cluster_config(cloud, tmp_yaml_path)
 
@@ -925,6 +938,17 @@ def write_cluster_config(
 yaml_config = common_utils.read_yaml(tmp_yaml_path)
 config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
 
+ # Make sure to do this before we optimize file mounts. Optimization is
+ # non-deterministic, but everything else before this point should be
+ # deterministic.
+ try:
+ config_dict['config_hash'] = _deterministic_cluster_yaml_hash(
+ tmp_yaml_path)
+ except Exception as e: # pylint: disable=broad-except
+ logger.warning('Failed to calculate config_hash: '
+ f'{common_utils.format_exception(e)}')
+ logger.debug('Full exception:', exc_info=e)
+
 # Optimization: copy the contents of source files in file_mounts to a
 # special dir, and upload that as the only file_mount instead. Delay
 # calling this optimization until now, when all source files have been
@@ -1033,6 +1057,115 @@ def _count_healthy_nodes_from_ray(output: str,
 return ready_head, ready_workers
 
 
+ @timeline.event
+ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
+ """Hash the cluster yaml and contents of file mounts to a unique string.
+
+ Two invocations of this function should return the same string if and only
+ if the contents of the yaml are the same and the file contents of all the
+ file_mounts specified in the yaml are the same.
+
+ Limitations:
+ - This function can be expensive if the file mounts are large. (E.g. a few
+ seconds for ~1GB.) This should be okay since we expect that the
+ file_mounts in the cluster yaml (the wheel and cloud credentials) will be
+ small.
+ - Symbolic links are not explicitly handled. Some symbolic link changes may
+ not be detected.
+
+ Implementation: We create a byte sequence that captures the state of the
+ yaml file and all the files in the file mounts, then hash the byte sequence.
+
+ The format of the byte sequence is:
+ 32 bytes - sha256 hash of the yaml file
+ for each file mount:
+ file mount remote destination (UTF-8), \0
+ if the file mount source is a file:
+ 'file' encoded to UTF-8
+ 32 byte sha256 hash of the file contents
+ if the file mount source is a directory:
+ 'dir' encoded to UTF-8
+ for each directory and subdirectory within the file mount (starting from
+ the root and descending recursively):
+ name of the directory (UTF-8), \0
+ name of each subdirectory within the directory (UTF-8) terminated by \0
+ \0
+ for each file in the directory:
+ name of the file (UTF-8), \0
+ 32 bytes - sha256 hash of the file contents
+ \0
+ if the file mount source is something else or does not exist, nothing
+ \0\0
+
+ Rather than constructing the whole byte sequence, which may be quite large,
+ we construct it incrementally by using hash.update() to add new bytes.
+ """
+
+ def _hash_file(path: str) -> bytes:
+ return common_utils.hash_file(path, 'sha256').digest()
+
+ config_hash = hashlib.sha256()
+
+ config_hash.update(_hash_file(yaml_path))
+
+ yaml_config = common_utils.read_yaml(yaml_path)
+ file_mounts = yaml_config.get('file_mounts', {})
+ # Remove the file mounts added by the newline.
+ if '' in file_mounts:
+ assert file_mounts[''] == '', file_mounts['']
+ file_mounts.pop('')
+
+ for dst, src in sorted(file_mounts.items()):
+ expanded_src = os.path.expanduser(src)
+ config_hash.update(dst.encode('utf-8') + b'\0')
+
+ # If the file mount source is a symlink, this should be true. In that
+ # case we hash the contents of the symlink destination.
+ if os.path.isfile(expanded_src):
+ config_hash.update('file'.encode('utf-8'))
+ config_hash.update(_hash_file(expanded_src))
+
+ # This can also be a symlink to a directory. os.walk will treat it as a
+ # normal directory and list the contents of the symlink destination.
+ elif os.path.isdir(expanded_src):
+ config_hash.update('dir'.encode('utf-8'))
+
+ # Aside from expanded_src, os.walk will list symlinks to directories
+ # but will not recurse into them.
+ for (dirpath, dirnames, filenames) in os.walk(expanded_src):
+ config_hash.update(dirpath.encode('utf-8') + b'\0')
+
+ # Note: inplace sort will also affect the traversal order of
+ # os.walk. We need it so that the os.walk order is
+ # deterministic.
+ dirnames.sort()
+ # This includes symlinks to directories. os.walk will recurse
+ # into all the directories but not the symlinks. We don't hash
+ # the link destination, so if a symlink to a directory changes,
+ # we won't notice.
+ for dirname in dirnames:
+ config_hash.update(dirname.encode('utf-8') + b'\0')
+ config_hash.update(b'\0')
+
+ filenames.sort()
+ # This includes symlinks to files. We could hash the symlink
+ # destination itself but instead just hash the destination
+ # contents.
+ for filename in filenames:
+ config_hash.update(filename.encode('utf-8') + b'\0')
+ config_hash.update(
+ _hash_file(os.path.join(dirpath, filename)))
+ config_hash.update(b'\0')
+
+ else:
+ logger.debug(
+ f'Unexpected file_mount that is not a file or dir: {src}')
+
+ config_hash.update(b'\0\0')
+
+ return config_hash.hexdigest()
+
+
 def get_docker_user(ip: str, cluster_config_file: str) -> str:
 """Find docker container username."""
 ssh_credentials = ssh_credential_from_yaml(cluster_config_file)
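
Note: the helper common_utils.hash_file(path, 'sha256') used above is not shown in this diff; presumably it wraps incremental hashlib hashing so large files never need to be read into memory at once. A rough, self-contained sketch of such a helper (the chunk size is an assumption):

    import hashlib

    def hash_file(path: str, algorithm: str = 'sha256') -> 'hashlib._Hash':
        # Incrementally feed the file to the hash object, chunk by chunk.
        hasher = hashlib.new(algorithm)
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b''):
                hasher.update(chunk)
        return hasher

    # .digest() gives the raw 32 bytes used in the byte sequence above;
    # .hexdigest() gives the printable form returned by the overall function.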
@@ -1612,14 +1745,14 @@ def check_can_clone_disk_and_override_task(
 The task to use and the resource handle of the source cluster.
 
 Raises:
- ValueError: If the source cluster does not exist.
+ exceptions.ClusterDoesNotExist: If the source cluster does not exist.
 exceptions.NotSupportedError: If the source cluster is not valid or the
 task is not compatible to clone disk from the source cluster.
 """
 source_cluster_status, handle = refresh_cluster_status_handle(cluster_name)
 if source_cluster_status is None:
 with ux_utils.print_exception_no_traceback():
- raise ValueError(
+ raise exceptions.ClusterDoesNotExist(
 f'Cannot find cluster {cluster_name!r} to clone disk from.')
 
 if not isinstance(handle, backends.CloudVmRayResourceHandle):
@@ -2136,7 +2269,7 @@ def check_cluster_available(
 """Check if the cluster is available.
 
 Raises:
- ValueError: if the cluster does not exist.
+ exceptions.ClusterDoesNotExist: if the cluster does not exist.
 exceptions.ClusterNotUpError: if the cluster is not UP.
 exceptions.NotSupportedError: if the cluster is not based on
 CloudVmRayBackend.
@@ -2201,7 +2334,8 @@ def check_cluster_available(
 error_msg += message
 
 with ux_utils.print_exception_no_traceback():
- raise ValueError(f'{colorama.Fore.YELLOW}{error_msg}{reset}')
+ raise exceptions.ClusterDoesNotExist(
+ f'{colorama.Fore.YELLOW}{error_msg}{reset}')
 assert cluster_status is not None, 'handle is not None but status is None'
 backend = get_backend_from_handle(handle)
 if check_cloud_vm_ray_backend and not isinstance(
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -301,6 +301,8 @@ class RayCodeGen:
 )
 def get_or_fail(futures, pg) -> List[int]:
 \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
+ if not futures:
+ return []
 returncodes = [1] * len(futures)
 # Wait for 1 task to be ready.
 ready = []
@@ -1153,6 +1155,7 @@ class RetryingVmProvisioner(object):
 prev_cluster_status: Optional[status_lib.ClusterStatus],
 prev_handle: Optional['CloudVmRayResourceHandle'],
 prev_cluster_ever_up: bool,
+ prev_config_hash: Optional[str],
 ) -> None:
 assert cluster_name is not None, 'cluster_name must be specified.'
 self.cluster_name = cluster_name
@@ -1161,6 +1164,7 @@ class RetryingVmProvisioner(object):
 self.prev_cluster_status = prev_cluster_status
 self.prev_handle = prev_handle
 self.prev_cluster_ever_up = prev_cluster_ever_up
+ self.prev_config_hash = prev_config_hash
 
 def __init__(self,
 log_dir: str,
@@ -1322,8 +1326,21 @@ class RetryingVmProvisioner(object):
 prev_cluster_status: Optional[status_lib.ClusterStatus],
 prev_handle: Optional['CloudVmRayResourceHandle'],
 prev_cluster_ever_up: bool,
+ skip_if_config_hash_matches: Optional[str],
 ) -> Dict[str, Any]:
- """The provision retry loop."""
+ """The provision retry loop.
+
+ Returns a config_dict with the following fields:
+ All fields from backend_utils.write_cluster_config(). See its
+ docstring.
+ - 'provisioning_skipped': True if provisioning was short-circuited
+ by skip_if_config_hash_matches, False otherwise.
+ - 'handle': The provisioned cluster handle.
+ - 'provision_record': (Only if using the new skypilot provisioner) The
+ record returned by provisioner.bulk_provision().
+ - 'resources_vars': (Only if using the new skypilot provisioner) The
+ resources variables given by make_deploy_resources_variables().
+ """
 # Get log_path name
 log_path = os.path.join(self.log_dir, 'provision.log')
 log_abs_path = os.path.abspath(log_path)
@@ -1432,8 +1449,18 @@ class RetryingVmProvisioner(object):
 raise exceptions.ResourcesUnavailableError(
 f'Failed to provision on cloud {to_provision.cloud} due to '
 f'invalid cloud config: {common_utils.format_exception(e)}')
+
+ if ('config_hash' in config_dict and
+ skip_if_config_hash_matches == config_dict['config_hash']):
+ logger.debug('Skipping provisioning of cluster with matching '
+ 'config hash.')
+ config_dict['provisioning_skipped'] = True
+ return config_dict
+ config_dict['provisioning_skipped'] = False
+
 if dryrun:
 return config_dict
+
 cluster_config_file = config_dict['ray']
 
 launched_resources = to_provision.copy(region=region.name)
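
Note: the skip decision above reduces to a pure comparison, illustrated in isolation below. This is a sketch; should_skip is a hypothetical helper, not part of the codebase.

    def should_skip(config_dict, prev_config_hash):
        # Skip only when a fresh hash was computed and it matches the hash
        # recorded for the existing cluster.
        return ('config_hash' in config_dict and
                prev_config_hash == config_dict['config_hash'])

    print(should_skip({'config_hash': 'abc123'}, 'abc123'))  # True -> skip
    print(should_skip({'config_hash': 'abc123'}, 'def456'))  # False -> provision
    print(should_skip({}, 'abc123'))                         # False: no hash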
@@ -1945,8 +1972,13 @@ class RetryingVmProvisioner(object):
 to_provision_config: ToProvisionConfig,
 dryrun: bool,
 stream_logs: bool,
+ skip_unnecessary_provisioning: bool,
 ) -> Dict[str, Any]:
- """Provision with retries for all launchable resources."""
+ """Provision with retries for all launchable resources.
+
+ Returns the config_dict from _retry_zones() - see its docstring for
+ details.
+ """
 cluster_name = to_provision_config.cluster_name
 to_provision = to_provision_config.resources
 num_nodes = to_provision_config.num_nodes
@@ -1955,6 +1987,8 @@ class RetryingVmProvisioner(object):
 prev_cluster_ever_up = to_provision_config.prev_cluster_ever_up
 launchable_retries_disabled = (self._dag is None or
 self._optimize_target is None)
+ skip_if_config_hash_matches = (to_provision_config.prev_config_hash if
+ skip_unnecessary_provisioning else None)
 
 failover_history: List[Exception] = list()
 
@@ -1994,7 +2028,8 @@ class RetryingVmProvisioner(object):
 cloud_user_identity=cloud_user,
 prev_cluster_status=prev_cluster_status,
 prev_handle=prev_handle,
- prev_cluster_ever_up=prev_cluster_ever_up)
+ prev_cluster_ever_up=prev_cluster_ever_up,
+ skip_if_config_hash_matches=skip_if_config_hash_matches)
 if dryrun:
 return config_dict
 except (exceptions.InvalidClusterNameError,
@@ -2695,14 +2730,21 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 return valid_resource
 
 def _provision(
- self,
- task: task_lib.Task,
- to_provision: Optional[resources_lib.Resources],
- dryrun: bool,
- stream_logs: bool,
- cluster_name: str,
- retry_until_up: bool = False) -> Optional[CloudVmRayResourceHandle]:
- """Provisions using 'ray up'.
+ self,
+ task: task_lib.Task,
+ to_provision: Optional[resources_lib.Resources],
+ dryrun: bool,
+ stream_logs: bool,
+ cluster_name: str,
+ retry_until_up: bool = False,
+ skip_unnecessary_provisioning: bool = False,
+ ) -> Optional[CloudVmRayResourceHandle]:
+ """Provisions the cluster, or re-provisions an existing cluster.
+
+ Use the SKYPILOT provisioner if it's supported by the cloud, otherwise
+ use 'ray up'.
+
+ See also docstring for Backend.provision().
 
 Raises:
 exceptions.ClusterOwnerIdentityMismatchError: if the cluster
@@ -2787,7 +2829,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 rich_utils.force_update_status(
 ux_utils.spinner_message('Launching', log_path))
 config_dict = retry_provisioner.provision_with_retries(
- task, to_provision_config, dryrun, stream_logs)
+ task, to_provision_config, dryrun, stream_logs,
+ skip_unnecessary_provisioning)
 break
 except exceptions.ResourcesUnavailableError as e:
 # Do not remove the stopped cluster from the global state
@@ -2837,11 +2880,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 record = global_user_state.get_cluster_from_name(cluster_name)
 return record['handle'] if record is not None else None
 
+ if config_dict['provisioning_skipped']:
+ # Skip further provisioning.
+ # In this case, we won't have certain fields in the config_dict
+ # ('handle', 'provision_record', 'resources_vars')
+ # We need to return the handle - but it should be the existing
+ # handle for the cluster.
+ record = global_user_state.get_cluster_from_name(cluster_name)
+ assert record is not None and record['handle'] is not None, (
+ cluster_name, record)
+ return record['handle']
+
 if 'provision_record' in config_dict:
 # New provisioner is used here.
 handle = config_dict['handle']
 provision_record = config_dict['provision_record']
 resources_vars = config_dict['resources_vars']
+ config_hash = config_dict.get('config_hash', None)
 
 # Setup SkyPilot runtime after the cluster is provisioned
 # 1. Wait for SSH to be ready.
@@ -2876,7 +2931,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 self._update_after_cluster_provisioned(
 handle, to_provision_config.prev_handle, task,
 prev_cluster_status, handle.external_ips(),
- handle.external_ssh_ports(), lock_path)
+ handle.external_ssh_ports(), lock_path, config_hash)
 return handle
 
 cluster_config_file = config_dict['ray']
@@ -2948,7 +3003,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
 self._update_after_cluster_provisioned(
 handle, to_provision_config.prev_handle, task,
- prev_cluster_status, ip_list, ssh_port_list, lock_path)
+ prev_cluster_status, ip_list, ssh_port_list, lock_path,
+ config_hash)
 return handle
 
 def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -2966,8 +3022,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 prev_handle: Optional[CloudVmRayResourceHandle],
 task: task_lib.Task,
 prev_cluster_status: Optional[status_lib.ClusterStatus],
- ip_list: List[str], ssh_port_list: List[int],
- lock_path: str) -> None:
+ ip_list: List[str], ssh_port_list: List[int], lock_path: str,
+ config_hash: str) -> None:
 usage_lib.messages.usage.update_cluster_resources(
 handle.launched_nodes, handle.launched_resources)
 usage_lib.messages.usage.update_final_cluster_status(
@@ -3027,6 +3083,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 handle,
 set(task.resources),
 ready=True,
+ config_hash=config_hash,
 )
 usage_lib.messages.usage.update_final_cluster_status(
 status_lib.ClusterStatus.UP)
@@ -3460,15 +3517,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 Returns:
 Job id if the task is submitted to the cluster, None otherwise.
 """
- if task.run is None:
+ if task.run is None and self._setup_cmd is None:
+ # This message is fine without mentioning setup, as there are three
+ # cases when run section is empty:
+ # 1. setup specified, no --detach-setup: setup is executed and this
+ # message is fine for saying no run command specified.
+ # 2. setup specified, with --detach-setup: setup is executed in
+ # detached mode and this message will not be shown.
+ # 3. no setup specified: this message is fine as a user is likely
+ # creating a cluster only, and ok with the empty run command.
 logger.info('Run commands not specified or empty.')
 return None
- # Check the task resources vs the cluster resources. Since `sky exec`
- # will not run the provision and _check_existing_cluster
- # We need to check ports here since sky.exec shouldn't change resources
- valid_resource = self.check_resources_fit_cluster(handle,
- task,
- check_ports=True)
+ if task.run is None:
+ # If the task has no run command, we still need to execute the
+ # generated ray driver program to run the setup command in detached
+ # mode.
+ # In this case, we reset the resources for the task, so that the
+ # detached setup does not need to wait for the task resources to be
+ # ready (which is not used for setup anyway).
+ valid_resource = sky.Resources()
+ else:
+ # Check the task resources vs the cluster resources. Since
+ # `sky exec` will not run the provision and _check_existing_cluster
+ # We need to check ports here since sky.exec shouldn't change
+ # resources.
+ valid_resource = self.check_resources_fit_cluster(handle,
+ task,
+ check_ports=True)
 task_copy = copy.copy(task)
 # Handle multiple resources exec case.
 task_copy.set_resources(valid_resource)
@@ -4328,6 +4403,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 # cluster is terminated (through console or auto-down), the record will
 # become None and the cluster_ever_up should be considered as False.
 cluster_ever_up = record is not None and record['cluster_ever_up']
+ prev_config_hash = record['config_hash'] if record is not None else None
 logger.debug(f'cluster_ever_up: {cluster_ever_up}')
 logger.debug(f'record: {record}')
 
@@ -4366,7 +4442,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 handle.launched_nodes,
 prev_cluster_status=prev_cluster_status,
 prev_handle=handle,
- prev_cluster_ever_up=cluster_ever_up)
+ prev_cluster_ever_up=cluster_ever_up,
+ prev_config_hash=prev_config_hash)
 usage_lib.messages.usage.set_new_cluster()
 # Use the task_cloud, because the cloud in `to_provision` can be changed
 # later during the retry.
@@ -4407,7 +4484,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 task.num_nodes,
 prev_cluster_status=None,
 prev_handle=None,
- prev_cluster_ever_up=False)
+ prev_cluster_ever_up=False,
+ prev_config_hash=prev_config_hash)
 
 def _execute_file_mounts(self, handle: CloudVmRayResourceHandle,
 file_mounts: Optional[Dict[Path, Path]]):
sky/backends/local_docker_backend.py CHANGED
@@ -131,13 +131,14 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
 pass
 
 def _provision(
- self,
- task: 'task_lib.Task',
- to_provision: Optional['resources.Resources'],
- dryrun: bool,
- stream_logs: bool,
- cluster_name: str,
- retry_until_up: bool = False
+ self,
+ task: 'task_lib.Task',
+ to_provision: Optional['resources.Resources'],
+ dryrun: bool,
+ stream_logs: bool,
+ cluster_name: str,
+ retry_until_up: bool = False,
+ skip_unnecessary_provisioning: bool = False,
 ) -> Optional[LocalDockerResourceHandle]:
 """Builds docker image for the task and returns cluster name as handle.
 
@@ -153,6 +154,9 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
 logger.warning(
 f'Retrying until up is not supported in backend: {self.NAME}. '
 'Ignored the flag.')
+ if skip_unnecessary_provisioning:
+ logger.warning(f'skip_unnecessary_provisioning is not supported in '
+ f'backend: {self.NAME}. Ignored the flag.')
 if stream_logs:
 logger.info(
 'Streaming build logs is not supported in LocalDockerBackend. '
sky/cli.py CHANGED
@@ -3914,16 +3914,25 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
 default=False,
 help=('Show the controller logs of this job; useful for debugging '
 'launching/recoveries, etc.'))
+ @click.option(
+ '--refresh',
+ '-r',
+ default=False,
+ is_flag=True,
+ required=False,
+ help='Query the latest job logs, restarting the jobs controller if stopped.'
+ )
 @click.argument('job_id', required=False, type=int)
 @usage_lib.entrypoint
 def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
- controller: bool):
+ controller: bool, refresh: bool):
 """Tail the log of a managed job."""
 try:
 managed_jobs.tail_logs(name=name,
 job_id=job_id,
 follow=follow,
- controller=controller)
+ controller=controller,
+ refresh=refresh)
 except exceptions.ClusterNotUpError:
 with ux_utils.print_exception_no_traceback():
 raise
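
Note: the new keyword is passed straight through to managed_jobs.tail_logs; from the CLI, the flag would presumably be used as `sky jobs logs --refresh <job-id>`. A sketch of the library call; the job id is a placeholder and the import path is assumed from context, not shown in this diff:

    from sky import jobs as managed_jobs  # import path assumed

    # Tail logs for job 42 (placeholder id), restarting a stopped controller.
    managed_jobs.tail_logs(name=None,
                           job_id=42,
                           follow=True,
                           controller=False,
                           refresh=True)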
sky/clouds/service_catalog/common.py CHANGED
@@ -15,6 +15,7 @@ from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as cloud_lib
 from sky.clouds import cloud_registry
 from sky.clouds.service_catalog import constants
+ from sky.utils import common_utils
 from sky.utils import rich_utils
 from sky.utils import ux_utils
 
@@ -69,8 +70,7 @@ def is_catalog_modified(filename: str) -> bool:
 meta_path = os.path.join(_ABSOLUTE_VERSIONED_CATALOG_DIR, '.meta', filename)
 md5_filepath = meta_path + '.md5'
 if os.path.exists(md5_filepath):
- with open(catalog_path, 'rb') as f:
- file_md5 = hashlib.md5(f.read()).hexdigest()
+ file_md5 = common_utils.hash_file(catalog_path, 'md5').hexdigest()
 with open(md5_filepath, 'r', encoding='utf-8') as f:
 last_md5 = f.read()
 return file_md5 != last_md5