PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20241203__py3-none-any.whl → 1.0.0.dev20241205__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20241203py3-none-any.whl → 1.0.0.dev20241205py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

sky/__init__.py +4 -2
sky/backends/backend.py +42 -15
sky/backends/backend_utils.py +143 -9
sky/backends/cloud_vm_ray_backend.py +103 -25
sky/backends/local_docker_backend.py +11 -7
sky/cli.py +11 -2
sky/clouds/service_catalog/common.py +2 -2
sky/core.py +25 -18
sky/exceptions.py +7 -0
sky/execution.py +30 -11
sky/global_user_state.py +23 -10
sky/jobs/controller.py +28 -8
sky/jobs/core.py +61 -35
sky/jobs/recovery_strategy.py +2 -1
sky/jobs/state.py +33 -1
sky/jobs/utils.py +16 -2
sky/setup_files/dependencies.py +141 -0
sky/setup_files/setup.py +12 -124
sky/skylet/constants.py +36 -11
sky/skylet/log_lib.py +3 -1
sky/skylet/log_lib.pyi +3 -0
sky/templates/kubernetes-ray.yml.j2 +4 -2
sky/utils/common_utils.py +19 -0
sky/utils/controller_utils.py +60 -98
{skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/METADATA +3 -2
{skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/RECORD +30 -29
{skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/WHEEL +0 -0
{skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/top_level.txt +0 -0

sky/__init__.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '747382a7cf75c691c91846efe708ce25b2a3aeb8'
+_SKYPILOT_COMMIT_SHA = '6e5083293f0d9a9d069d51274c57f0e59e47e5ce'
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20241203'
+__version__ = '1.0.0.dev20241205'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -105,6 +105,7 @@ from sky.data import StorageMode
 from sky.data import StoreType
 from sky.execution import exec  # pylint: disable=redefined-builtin
 from sky.execution import launch
+from sky.jobs import ManagedJobStatus
 # TODO (zhwu): These imports are for backward compatibility, and spot APIs
 # should be called with `sky.spot.xxx` instead. Remove in release 0.8.0
 from sky.jobs.core import spot_cancel
@@ -163,6 +164,7 @@ __all__ = [
     'StoreType',
     'ClusterStatus',
     'JobStatus',
+    'ManagedJobStatus',
     # APIs
     'Dag',
     'Task',

sky/backends/backend.py CHANGED Viewed

@@ -45,20 +45,45 @@ class Backend(Generic[_ResourceHandleType]):
     @timeline.event
     @usage_lib.messages.usage.update_runtime('provision')
     def provision(
-            self,
-            task: 'task_lib.Task',
-            to_provision: Optional['resources.Resources'],
-            dryrun: bool,
-            stream_logs: bool,
-            cluster_name: Optional[str] = None,
-            retry_until_up: bool = False) -> Optional[_ResourceHandleType]:
+        self,
+        task: 'task_lib.Task',
+        to_provision: Optional['resources.Resources'],
+        dryrun: bool,
+        stream_logs: bool,
+        cluster_name: Optional[str] = None,
+        retry_until_up: bool = False,
+        skip_unnecessary_provisioning: bool = False,
+    ) -> Optional[_ResourceHandleType]:
+        """Provisions resources for the given task.
+        Args:
+            task: The task to provision resources for.
+            to_provision: Resource config to provision. Should only be None if
+                cluster_name refers to an existing cluster, whose resources will
+                be used.
+            dryrun: If True, don't actually provision anything.
+            stream_logs: If True, stream additional logs to console.
+            cluster_name: Name of the cluster to provision. If None, a name will
+                be auto-generated. If the name refers to an existing cluster,
+                the existing cluster will be reused and re-provisioned.
+            retry_until_up: If True, retry provisioning until resources are
+                successfully launched.
+            skip_if_no_cluster_updates: If True, compare the cluster config to
+                the existing cluster_name's config. Skip provisioning if no
+                updates are needed for the existing cluster.
+        Returns:
+            A ResourceHandle object for the provisioned resources, or None if
+            dryrun is True.
+        """
         if cluster_name is None:
             cluster_name = sky.backends.backend_utils.generate_cluster_name()
         usage_lib.record_cluster_name_for_current_operation(cluster_name)
         usage_lib.messages.usage.update_actual_task(task)
         with rich_utils.safe_status(ux_utils.spinner_message('Launching')):
             return self._provision(task, to_provision, dryrun, stream_logs,
-                                   cluster_name, retry_until_up)
+                                   cluster_name, retry_until_up,
+                                   skip_unnecessary_provisioning)
     @timeline.event
     @usage_lib.messages.usage.update_runtime('sync_workdir')
@@ -126,13 +151,15 @@ class Backend(Generic[_ResourceHandleType]):
     # --- Implementations of the APIs ---
     def _provision(
-            self,
-            task: 'task_lib.Task',
-            to_provision: Optional['resources.Resources'],
-            dryrun: bool,
-            stream_logs: bool,
-            cluster_name: str,
-            retry_until_up: bool = False) -> Optional[_ResourceHandleType]:
+        self,
+        task: 'task_lib.Task',
+        to_provision: Optional['resources.Resources'],
+        dryrun: bool,
+        stream_logs: bool,
+        cluster_name: str,
+        retry_until_up: bool = False,
+        skip_unnecessary_provisioning: bool = False,
+    ) -> Optional[_ResourceHandleType]:
         raise NotImplementedError
     def _sync_workdir(self, handle: _ResourceHandleType, workdir: Path) -> None:

sky/backends/backend_utils.py CHANGED Viewed

@@ -3,6 +3,7 @@ from datetime import datetime
 import enum
 import fnmatch
 import functools
+import hashlib
 import os
 import pathlib
 import pprint
@@ -644,11 +645,17 @@ def write_cluster_config(
         keep_launch_fields_in_existing_config: bool = True) -> Dict[str, str]:
     """Fills in cluster configuration templates and writes them out.
-    Returns: {provisioner: path to yaml, the provisioning spec}.
-      'provisioner' can be
-        - 'ray'
-        - 'tpu-create-script' (if TPU is requested)
-        - 'tpu-delete-script' (if TPU is requested)
+    Returns:
+        Dict with the following keys:
+        - 'ray': Path to the generated Ray yaml config file
+        - 'cluster_name': Name of the cluster
+        - 'cluster_name_on_cloud': Name of the cluster as it appears in the
+          cloud provider
+        - 'config_hash': Hash of the cluster config and file mounts contents.
+          Can be missing if we unexpectedly failed to calculate the hash for
+          some reason. In that case we will continue without the optimization to
+          skip provisioning.
     Raises:
         exceptions.ResourcesUnavailableError: if the region/zones requested does
             not appear in the catalog, or an ssh_proxy_command is specified but
@@ -903,6 +910,12 @@ def write_cluster_config(
     if dryrun:
         # If dryrun, return the unfinished tmp yaml path.
         config_dict['ray'] = tmp_yaml_path
+        try:
+            config_dict['config_hash'] = _deterministic_cluster_yaml_hash(
+                tmp_yaml_path)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning(f'Failed to calculate config_hash: {e}')
+            logger.debug('Full exception:', exc_info=e)
         return config_dict
     _add_auth_to_cluster_config(cloud, tmp_yaml_path)
@@ -925,6 +938,17 @@ def write_cluster_config(
     yaml_config = common_utils.read_yaml(tmp_yaml_path)
     config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
+    # Make sure to do this before we optimize file mounts. Optimization is
+    # non-deterministic, but everything else before this point should be
+    # deterministic.
+    try:
+        config_dict['config_hash'] = _deterministic_cluster_yaml_hash(
+            tmp_yaml_path)
+    except Exception as e:  # pylint: disable=broad-except
+        logger.warning('Failed to calculate config_hash: '
+                       f'{common_utils.format_exception(e)}')
+        logger.debug('Full exception:', exc_info=e)
     # Optimization: copy the contents of source files in file_mounts to a
     # special dir, and upload that as the only file_mount instead. Delay
     # calling this optimization until now, when all source files have been
@@ -1033,6 +1057,115 @@ def _count_healthy_nodes_from_ray(output: str,
     return ready_head, ready_workers
+@timeline.event
+def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
+    """Hash the cluster yaml and contents of file mounts to a unique string.
+    Two invocations of this function should return the same string if and only
+    if the contents of the yaml are the same and the file contents of all the
+    file_mounts specified in the yaml are the same.
+    Limitations:
+    - This function can be expensive if the file mounts are large. (E.g. a few
+      seconds for ~1GB.) This should be okay since we expect that the
+      file_mounts in the cluster yaml (the wheel and cloud credentials) will be
+      small.
+    - Symbolic links are not explicitly handled. Some symbolic link changes may
+      not be detected.
+    Implementation: We create a byte sequence that captures the state of the
+    yaml file and all the files in the file mounts, then hash the byte sequence.
+    The format of the byte sequence is:
+    32 bytes - sha256 hash of the yaml file
+    for each file mount:
+      file mount remote destination (UTF-8), \0
+      if the file mount source is a file:
+        'file' encoded to UTF-8
+        32 byte sha256 hash of the file contents
+      if the file mount source is a directory:
+        'dir' encoded to UTF-8
+        for each directory and subdirectory withinin the file mount (starting from
+            the root and descending recursively):
+          name of the directory (UTF-8), \0
+          name of each subdirectory within the directory (UTF-8) terminated by \0
+          \0
+          for each file in the directory:
+            name of the file (UTF-8), \0
+            32 bytes - sha256 hash of the file contents
+          \0
+      if the file mount source is something else or does not exist, nothing
+      \0\0
+    Rather than constructing the whole byte sequence, which may be quite large,
+    we construct it incrementally by using hash.update() to add new bytes.
+    """
+    def _hash_file(path: str) -> bytes:
+        return common_utils.hash_file(path, 'sha256').digest()
+    config_hash = hashlib.sha256()
+    config_hash.update(_hash_file(yaml_path))
+    yaml_config = common_utils.read_yaml(yaml_path)
+    file_mounts = yaml_config.get('file_mounts', {})
+    # Remove the file mounts added by the newline.
+    if '' in file_mounts:
+        assert file_mounts[''] == '', file_mounts['']
+        file_mounts.pop('')
+    for dst, src in sorted(file_mounts.items()):
+        expanded_src = os.path.expanduser(src)
+        config_hash.update(dst.encode('utf-8') + b'\0')
+        # If the file mount source is a symlink, this should be true. In that
+        # case we hash the contents of the symlink destination.
+        if os.path.isfile(expanded_src):
+            config_hash.update('file'.encode('utf-8'))
+            config_hash.update(_hash_file(expanded_src))
+        # This can also be a symlink to a directory. os.walk will treat it as a
+        # normal directory and list the contents of the symlink destination.
+        elif os.path.isdir(expanded_src):
+            config_hash.update('dir'.encode('utf-8'))
+            # Aside from expanded_src, os.walk will list symlinks to directories
+            # but will not recurse into them.
+            for (dirpath, dirnames, filenames) in os.walk(expanded_src):
+                config_hash.update(dirpath.encode('utf-8') + b'\0')
+                # Note: inplace sort will also affect the traversal order of
+                # os.walk. We need it so that the os.walk order is
+                # deterministic.
+                dirnames.sort()
+                # This includes symlinks to directories. os.walk will recurse
+                # into all the directories but not the symlinks. We don't hash
+                # the link destination, so if a symlink to a directory changes,
+                # we won't notice.
+                for dirname in dirnames:
+                    config_hash.update(dirname.encode('utf-8') + b'\0')
+                config_hash.update(b'\0')
+                filenames.sort()
+                # This includes symlinks to files. We could hash the symlink
+                # destination itself but instead just hash the destination
+                # contents.
+                for filename in filenames:
+                    config_hash.update(filename.encode('utf-8') + b'\0')
+                    config_hash.update(
+                        _hash_file(os.path.join(dirpath, filename)))
+                config_hash.update(b'\0')
+        else:
+            logger.debug(
+                f'Unexpected file_mount that is not a file or dir: {src}')
+        config_hash.update(b'\0\0')
+    return config_hash.hexdigest()
 def get_docker_user(ip: str, cluster_config_file: str) -> str:
     """Find docker container username."""
     ssh_credentials = ssh_credential_from_yaml(cluster_config_file)
@@ -1612,14 +1745,14 @@ def check_can_clone_disk_and_override_task(
         The task to use and the resource handle of the source cluster.
     Raises:
-        ValueError: If the source cluster does not exist.
+        exceptions.ClusterDoesNotExist: If the source cluster does not exist.
         exceptions.NotSupportedError: If the source cluster is not valid or the
             task is not compatible to clone disk from the source cluster.
     """
     source_cluster_status, handle = refresh_cluster_status_handle(cluster_name)
     if source_cluster_status is None:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(
+            raise exceptions.ClusterDoesNotExist(
                 f'Cannot find cluster {cluster_name!r} to clone disk from.')
     if not isinstance(handle, backends.CloudVmRayResourceHandle):
@@ -2136,7 +2269,7 @@ def check_cluster_available(
     """Check if the cluster is available.
     Raises:
-        ValueError: if the cluster does not exist.
+        exceptions.ClusterDoesNotExist: if the cluster does not exist.
         exceptions.ClusterNotUpError: if the cluster is not UP.
         exceptions.NotSupportedError: if the cluster is not based on
           CloudVmRayBackend.
@@ -2201,7 +2334,8 @@ def check_cluster_available(
             error_msg += message
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'{colorama.Fore.YELLOW}{error_msg}{reset}')
+            raise exceptions.ClusterDoesNotExist(
+                f'{colorama.Fore.YELLOW}{error_msg}{reset}')
     assert cluster_status is not None, 'handle is not None but status is None'
     backend = get_backend_from_handle(handle)
     if check_cloud_vm_ray_backend and not isinstance(

sky/backends/cloud_vm_ray_backend.py CHANGED Viewed

@@ -301,6 +301,8 @@ class RayCodeGen:
             )
             def get_or_fail(futures, pg) -> List[int]:
                 \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
+                if not futures:
+                    return []
                 returncodes = [1] * len(futures)
                 # Wait for 1 task to be ready.
                 ready = []
@@ -1153,6 +1155,7 @@ class RetryingVmProvisioner(object):
             prev_cluster_status: Optional[status_lib.ClusterStatus],
             prev_handle: Optional['CloudVmRayResourceHandle'],
             prev_cluster_ever_up: bool,
+            prev_config_hash: Optional[str],
         ) -> None:
             assert cluster_name is not None, 'cluster_name must be specified.'
             self.cluster_name = cluster_name
@@ -1161,6 +1164,7 @@ class RetryingVmProvisioner(object):
             self.prev_cluster_status = prev_cluster_status
             self.prev_handle = prev_handle
             self.prev_cluster_ever_up = prev_cluster_ever_up
+            self.prev_config_hash = prev_config_hash
     def __init__(self,
                  log_dir: str,
@@ -1322,8 +1326,21 @@ class RetryingVmProvisioner(object):
         prev_cluster_status: Optional[status_lib.ClusterStatus],
         prev_handle: Optional['CloudVmRayResourceHandle'],
         prev_cluster_ever_up: bool,
+        skip_if_config_hash_matches: Optional[str],
     ) -> Dict[str, Any]:
-        """The provision retry loop."""
+        """The provision retry loop.
+        Returns a config_dict with the following fields:
+        All fields from backend_utils.write_cluster_config(). See its
+          docstring.
+        - 'provisioning_skipped': True if provisioning was short-circuited
+          by skip_if_config_hash_matches, False otherwise.
+        - 'handle': The provisioned cluster handle.
+        - 'provision_record': (Only if using the new skypilot provisioner) The
+          record returned by provisioner.bulk_provision().
+        - 'resources_vars': (Only if using the new skypilot provisioner) The
+          resources variables given by make_deploy_resources_variables().
+        """
         # Get log_path name
         log_path = os.path.join(self.log_dir, 'provision.log')
         log_abs_path = os.path.abspath(log_path)
@@ -1432,8 +1449,18 @@ class RetryingVmProvisioner(object):
                 raise exceptions.ResourcesUnavailableError(
                     f'Failed to provision on cloud {to_provision.cloud} due to '
                     f'invalid cloud config: {common_utils.format_exception(e)}')
+            if ('config_hash' in config_dict and
+                    skip_if_config_hash_matches == config_dict['config_hash']):
+                logger.debug('Skipping provisioning of cluster with matching '
+                             'config hash.')
+                config_dict['provisioning_skipped'] = True
+                return config_dict
+            config_dict['provisioning_skipped'] = False
             if dryrun:
                 return config_dict
             cluster_config_file = config_dict['ray']
             launched_resources = to_provision.copy(region=region.name)
@@ -1945,8 +1972,13 @@ class RetryingVmProvisioner(object):
         to_provision_config: ToProvisionConfig,
         dryrun: bool,
         stream_logs: bool,
+        skip_unnecessary_provisioning: bool,
     ) -> Dict[str, Any]:
-        """Provision with retries for all launchable resources."""
+        """Provision with retries for all launchable resources.
+        Returns the config_dict from _retry_zones() - see its docstring for
+        details.
+        """
         cluster_name = to_provision_config.cluster_name
         to_provision = to_provision_config.resources
         num_nodes = to_provision_config.num_nodes
@@ -1955,6 +1987,8 @@ class RetryingVmProvisioner(object):
         prev_cluster_ever_up = to_provision_config.prev_cluster_ever_up
         launchable_retries_disabled = (self._dag is None or
                                        self._optimize_target is None)
+        skip_if_config_hash_matches = (to_provision_config.prev_config_hash if
+                                       skip_unnecessary_provisioning else None)
         failover_history: List[Exception] = list()
@@ -1994,7 +2028,8 @@ class RetryingVmProvisioner(object):
                     cloud_user_identity=cloud_user,
                     prev_cluster_status=prev_cluster_status,
                     prev_handle=prev_handle,
-                    prev_cluster_ever_up=prev_cluster_ever_up)
+                    prev_cluster_ever_up=prev_cluster_ever_up,
+                    skip_if_config_hash_matches=skip_if_config_hash_matches)
                 if dryrun:
                     return config_dict
             except (exceptions.InvalidClusterNameError,
@@ -2695,14 +2730,21 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         return valid_resource
     def _provision(
-            self,
-            task: task_lib.Task,
-            to_provision: Optional[resources_lib.Resources],
-            dryrun: bool,
-            stream_logs: bool,
-            cluster_name: str,
-            retry_until_up: bool = False) -> Optional[CloudVmRayResourceHandle]:
-        """Provisions using 'ray up'.
+        self,
+        task: task_lib.Task,
+        to_provision: Optional[resources_lib.Resources],
+        dryrun: bool,
+        stream_logs: bool,
+        cluster_name: str,
+        retry_until_up: bool = False,
+        skip_unnecessary_provisioning: bool = False,
+    ) -> Optional[CloudVmRayResourceHandle]:
+        """Provisions the cluster, or re-provisions an existing cluster.
+        Use the SKYPILOT provisioner if it's supported by the cloud, otherwise
+        use 'ray up'.
+        See also docstring for Backend.provision().
         Raises:
             exceptions.ClusterOwnerIdentityMismatchError: if the cluster
@@ -2787,7 +2829,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     rich_utils.force_update_status(
                         ux_utils.spinner_message('Launching', log_path))
                     config_dict = retry_provisioner.provision_with_retries(
-                        task, to_provision_config, dryrun, stream_logs)
+                        task, to_provision_config, dryrun, stream_logs,
+                        skip_unnecessary_provisioning)
                     break
                 except exceptions.ResourcesUnavailableError as e:
                     # Do not remove the stopped cluster from the global state
@@ -2837,11 +2880,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 record = global_user_state.get_cluster_from_name(cluster_name)
                 return record['handle'] if record is not None else None
+            if config_dict['provisioning_skipped']:
+                # Skip further provisioning.
+                # In this case, we won't have certain fields in the config_dict
+                # ('handle', 'provision_record', 'resources_vars')
+                # We need to return the handle - but it should be the existing
+                # handle for the cluster.
+                record = global_user_state.get_cluster_from_name(cluster_name)
+                assert record is not None and record['handle'] is not None, (
+                    cluster_name, record)
+                return record['handle']
             if 'provision_record' in config_dict:
                 # New provisioner is used here.
                 handle = config_dict['handle']
                 provision_record = config_dict['provision_record']
                 resources_vars = config_dict['resources_vars']
+                config_hash = config_dict.get('config_hash', None)
                 # Setup SkyPilot runtime after the cluster is provisioned
                 # 1. Wait for SSH to be ready.
@@ -2876,7 +2931,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 self._update_after_cluster_provisioned(
                     handle, to_provision_config.prev_handle, task,
                     prev_cluster_status, handle.external_ips(),
-                    handle.external_ssh_ports(), lock_path)
+                    handle.external_ssh_ports(), lock_path, config_hash)
                 return handle
             cluster_config_file = config_dict['ray']
@@ -2948,7 +3003,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self._update_after_cluster_provisioned(
                 handle, to_provision_config.prev_handle, task,
-                prev_cluster_status, ip_list, ssh_port_list, lock_path)
+                prev_cluster_status, ip_list, ssh_port_list, lock_path,
+                config_hash)
             return handle
     def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -2966,8 +3022,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             prev_handle: Optional[CloudVmRayResourceHandle],
             task: task_lib.Task,
             prev_cluster_status: Optional[status_lib.ClusterStatus],
-            ip_list: List[str], ssh_port_list: List[int],
-            lock_path: str) -> None:
+            ip_list: List[str], ssh_port_list: List[int], lock_path: str,
+            config_hash: str) -> None:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, handle.launched_resources)
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3027,6 +3083,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 handle,
                 set(task.resources),
                 ready=True,
+                config_hash=config_hash,
             )
             usage_lib.messages.usage.update_final_cluster_status(
                 status_lib.ClusterStatus.UP)
@@ -3460,15 +3517,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             Job id if the task is submitted to the cluster, None otherwise.
         """
-        if task.run is None:
+        if task.run is None and self._setup_cmd is None:
+            # This message is fine without mentioning setup, as there are three
+            # cases when run section is empty:
+            # 1. setup specified, no --detach-setup: setup is executed and this
+            #    message is fine for saying no run command specified.
+            # 2. setup specified, with --detach-setup: setup is executed in
+            #    detached mode and this message will not be shown.
+            # 3. no setup specified: this message is fine as a user is likely
+            #    creating a cluster only, and ok with the empty run command.
             logger.info('Run commands not specified or empty.')
             return None
-        # Check the task resources vs the cluster resources. Since `sky exec`
-        # will not run the provision and _check_existing_cluster
-        # We need to check ports here since sky.exec shouldn't change resources
-        valid_resource = self.check_resources_fit_cluster(handle,
-                                                          task,
-                                                          check_ports=True)
+        if task.run is None:
+            # If the task has no run command, we still need to execute the
+            # generated ray driver program to run the setup command in detached
+            # mode.
+            # In this case, we reset the resources for the task, so that the
+            # detached setup does not need to wait for the task resources to be
+            # ready (which is not used for setup anyway).
+            valid_resource = sky.Resources()
+        else:
+            # Check the task resources vs the cluster resources. Since
+            # `sky exec` will not run the provision and _check_existing_cluster
+            # We need to check ports here since sky.exec shouldn't change
+            # resources.
+            valid_resource = self.check_resources_fit_cluster(handle,
+                                                              task,
+                                                              check_ports=True)
         task_copy = copy.copy(task)
         # Handle multiple resources exec case.
         task_copy.set_resources(valid_resource)
@@ -4328,6 +4403,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # cluster is terminated (through console or auto-dwon), the record will
         # become None and the cluster_ever_up should be considered as False.
         cluster_ever_up = record is not None and record['cluster_ever_up']
+        prev_config_hash = record['config_hash'] if record is not None else None
         logger.debug(f'cluster_ever_up: {cluster_ever_up}')
         logger.debug(f'record: {record}')
@@ -4366,7 +4442,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 handle.launched_nodes,
                 prev_cluster_status=prev_cluster_status,
                 prev_handle=handle,
-                prev_cluster_ever_up=cluster_ever_up)
+                prev_cluster_ever_up=cluster_ever_up,
+                prev_config_hash=prev_config_hash)
         usage_lib.messages.usage.set_new_cluster()
         # Use the task_cloud, because the cloud in `to_provision` can be changed
         # later during the retry.
@@ -4407,7 +4484,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             task.num_nodes,
             prev_cluster_status=None,
             prev_handle=None,
-            prev_cluster_ever_up=False)
+            prev_cluster_ever_up=False,
+            prev_config_hash=prev_config_hash)
     def _execute_file_mounts(self, handle: CloudVmRayResourceHandle,
                              file_mounts: Optional[Dict[Path, Path]]):

sky/backends/local_docker_backend.py CHANGED Viewed

@@ -131,13 +131,14 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
         pass
     def _provision(
-            self,
-            task: 'task_lib.Task',
-            to_provision: Optional['resources.Resources'],
-            dryrun: bool,
-            stream_logs: bool,
-            cluster_name: str,
-            retry_until_up: bool = False
+        self,
+        task: 'task_lib.Task',
+        to_provision: Optional['resources.Resources'],
+        dryrun: bool,
+        stream_logs: bool,
+        cluster_name: str,
+        retry_until_up: bool = False,
+        skip_unnecessary_provisioning: bool = False,
     ) -> Optional[LocalDockerResourceHandle]:
         """Builds docker image for the task and returns cluster name as handle.
@@ -153,6 +154,9 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
             logger.warning(
                 f'Retrying until up is not supported in backend: {self.NAME}. '
                 'Ignored the flag.')
+        if skip_unnecessary_provisioning:
+            logger.warning(f'skip_unnecessary_provisioning is not supported in '
+                           f'backend: {self.NAME}. Ignored the flag.')
         if stream_logs:
             logger.info(
                 'Streaming build logs is not supported in LocalDockerBackend. '

sky/cli.py CHANGED Viewed

@@ -3914,16 +3914,25 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
     default=False,
     help=('Show the controller logs of this job; useful for debugging '
           'launching/recoveries, etc.'))
+@click.option(
+    '--refresh',
+    '-r',
+    default=False,
+    is_flag=True,
+    required=False,
+    help='Query the latest job logs, restarting the jobs controller if stopped.'
+)
 @click.argument('job_id', required=False, type=int)
 @usage_lib.entrypoint
 def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
-              controller: bool):
+              controller: bool, refresh: bool):
     """Tail the log of a managed job."""
     try:
         managed_jobs.tail_logs(name=name,
                                job_id=job_id,
                                follow=follow,
-                               controller=controller)
+                               controller=controller,
+                               refresh=refresh)
     except exceptions.ClusterNotUpError:
         with ux_utils.print_exception_no_traceback():
             raise

sky/clouds/service_catalog/common.py CHANGED Viewed

@@ -15,6 +15,7 @@ from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as cloud_lib
 from sky.clouds import cloud_registry
 from sky.clouds.service_catalog import constants
+from sky.utils import common_utils
 from sky.utils import rich_utils
 from sky.utils import ux_utils
@@ -69,8 +70,7 @@ def is_catalog_modified(filename: str) -> bool:
     meta_path = os.path.join(_ABSOLUTE_VERSIONED_CATALOG_DIR, '.meta', filename)
     md5_filepath = meta_path + '.md5'
     if os.path.exists(md5_filepath):
-        with open(catalog_path, 'rb') as f:
-            file_md5 = hashlib.md5(f.read()).hexdigest()
+        file_md5 = common_utils.hash_file(catalog_path, 'md5').hexdigest()
         with open(md5_filepath, 'r', encoding='utf-8') as f:
             last_md5 = f.read()
         return file_md5 != last_md5

skypilot-nightly 1.0.0.dev20241203__py3-none-any.whl → 1.0.0.dev20241205__py3-none-any.whl

skypilot-nightly 1.0.0.dev20241203py3-none-any.whl → 1.0.0.dev20241205py3-none-any.whl