skypilot-nightly 1.0.0.dev20241028__py3-none-any.whl → 1.0.0.dev20241030__py3-none-any.whl

This diff shows the changes between two package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
Files changed (33)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/azure.py +3 -0
  3. sky/backends/backend_utils.py +10 -133
  4. sky/backends/cloud_vm_ray_backend.py +17 -105
  5. sky/clouds/azure.py +10 -1
  6. sky/execution.py +5 -4
  7. sky/jobs/controller.py +38 -22
  8. sky/jobs/recovery_strategy.py +30 -5
  9. sky/jobs/state.py +33 -5
  10. sky/jobs/utils.py +28 -4
  11. sky/optimizer.py +11 -7
  12. sky/provision/azure/azure-config-template.json +7 -1
  13. sky/provision/azure/config.py +65 -45
  14. sky/provision/azure/instance.py +275 -70
  15. sky/provision/constants.py +7 -0
  16. sky/provision/gcp/instance.py +0 -7
  17. sky/resources.py +25 -8
  18. sky/serve/core.py +0 -2
  19. sky/serve/serve_state.py +3 -7
  20. sky/serve/serve_utils.py +2 -14
  21. sky/serve/service_spec.py +0 -28
  22. sky/setup_files/setup.py +4 -3
  23. sky/skylet/job_lib.py +37 -53
  24. sky/skylet/log_lib.py +5 -14
  25. sky/templates/azure-ray.yml.j2 +1 -0
  26. sky/utils/dag_utils.py +14 -4
  27. sky/utils/schemas.py +25 -15
  28. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/METADATA +13 -11
  29. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/RECORD +33 -33
  30. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/WHEEL +1 -1
  31. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/LICENSE +0 -0
  32. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/entry_points.txt +0 -0
  33. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request
 
  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = 'c0c17483d1f692ad639144050f5f6fa0966e47a5'
+ _SKYPILOT_COMMIT_SHA = '9d50f192b262d5f6cc74b5b6644f3a9e3ea31f2f'
 
 
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20241028'
+ __version__ = '1.0.0.dev20241030'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/adaptors/azure.py CHANGED
@@ -131,6 +131,9 @@ def get_client(name: str,
  from azure.mgmt import authorization
  return authorization.AuthorizationManagementClient(
  credential, subscription_id)
+ elif name == 'msi':
+ from azure.mgmt import msi
+ return msi.ManagedServiceIdentityClient(credential, subscription_id)
  elif name == 'graph':
  import msgraph
  return msgraph.GraphServiceClient(credential)
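
Note: the 'msi' branch added above extends the adaptor's lazy-import client factory with a Managed Service Identity client. The sketch below illustrates the same deferred-import dispatch in isolation; the helper name and the explicit credential/subscription_id parameters are assumptions for the example, not SkyPilot's actual get_client signature.

    def _get_azure_client(name: str, credential, subscription_id: str):
        # Deferred imports keep the adaptor cheap to import; each Azure SDK
        # submodule is only loaded when its client is first requested.
        if name == 'authorization':
            from azure.mgmt import authorization
            return authorization.AuthorizationManagementClient(
                credential, subscription_id)
        if name == 'msi':
            from azure.mgmt import msi  # client added in this release
            return msi.ManagedServiceIdentityClient(credential,
                                                    subscription_id)
        raise ValueError(f'Unsupported Azure client: {name!r}')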
sky/backends/backend_utils.py CHANGED
@@ -401,6 +401,8 @@ class SSHConfigHelper(object):
 
  ssh_conf_path = '~/.ssh/config'
  ssh_conf_lock_path = os.path.expanduser('~/.sky/ssh_config.lock')
+ ssh_conf_per_cluster_lock_path = os.path.expanduser(
+ '~/.sky/ssh_config_{}.lock')
  ssh_cluster_path = SKY_USER_FILE_PATH + '/ssh/{}'
 
  @classmethod
@@ -486,12 +488,6 @@ class SSHConfigHelper(object):
 
  config_path = os.path.expanduser(cls.ssh_conf_path)
 
- # For backward compatibility: before #2706, we wrote the config of SkyPilot clusters
- # directly in ~/.ssh/config. For these clusters, we remove the config in ~/.ssh/config
- # and write/overwrite the config in ~/.sky/ssh/<cluster_name> instead.
- cls._remove_stale_cluster_config_for_backward_compatibility(
- cluster_name, ip, auth_config, docker_user)
-
  if not os.path.exists(config_path):
  config = ['\n']
  with open(config_path,
@@ -560,139 +556,20 @@ class SSHConfigHelper(object):
  f.write(codegen)
 
  @classmethod
- def _remove_stale_cluster_config_for_backward_compatibility(
- cls,
- cluster_name: str,
- ip: str,
- auth_config: Dict[str, str],
- docker_user: Optional[str] = None,
- ):
- """Remove authentication information for cluster from local SSH config.
-
- If no existing host matching the provided specification is found, then
- nothing is removed.
-
- Args:
- ip: Head node's IP address.
- auth_config: read_yaml(handle.cluster_yaml)['auth']
- docker_user: If not None, use this user to ssh into the docker
- """
- username = auth_config['ssh_user']
- config_path = os.path.expanduser(cls.ssh_conf_path)
- cluster_config_path = os.path.expanduser(
- cls.ssh_cluster_path.format(cluster_name))
- if not os.path.exists(config_path):
- return
-
- with open(config_path, 'r', encoding='utf-8') as f:
- config = f.readlines()
-
- start_line_idx = None
-
- # Scan the config for the cluster name.
- for i, line in enumerate(config):
- next_line = config[i + 1] if i + 1 < len(config) else ''
- if docker_user is None:
- found = (line.strip() == f'HostName {ip}' and
- next_line.strip() == f'User {username}')
- else:
- found = (line.strip() == 'HostName localhost' and
- next_line.strip() == f'User {docker_user}')
- if found:
- # Find the line starting with ProxyCommand and contains the ip
- found = False
- for idx in range(i, len(config)):
- # Stop if we reach an empty line, which means a new host
- if not config[idx].strip():
- break
- if config[idx].strip().startswith('ProxyCommand'):
- proxy_command_line = config[idx].strip()
- if proxy_command_line.endswith(f'@{ip}'):
- found = True
- break
- if found:
- start_line_idx = i - 1
- break
-
- if start_line_idx is not None:
- # Scan for end of previous config.
- cursor = start_line_idx
- while cursor > 0 and len(config[cursor].strip()) > 0:
- cursor -= 1
- prev_end_line_idx = cursor
-
- # Scan for end of the cluster config.
- end_line_idx = None
- cursor = start_line_idx + 1
- start_line_idx -= 1 # remove auto-generated comment
- while cursor < len(config):
- if config[cursor].strip().startswith(
- '# ') or config[cursor].strip().startswith('Host '):
- end_line_idx = cursor
- break
- cursor += 1
-
- # Remove sky-generated config and update the file.
- config[prev_end_line_idx:end_line_idx] = [
- '\n'
- ] if end_line_idx is not None else []
- with open(config_path, 'w', encoding='utf-8') as f:
- f.write(''.join(config).strip())
- f.write('\n' * 2)
-
- # Delete include statement if it exists in the config.
- sky_autogen_comment = ('# Added by sky (use `sky stop/down '
- f'{cluster_name}` to remove)')
- with open(config_path, 'r', encoding='utf-8') as f:
- config = f.readlines()
-
- for i, line in enumerate(config):
- config_str = line.strip()
- if f'Include {cluster_config_path}' in config_str:
- with open(config_path, 'w', encoding='utf-8') as f:
- if i < len(config) - 1 and config[i + 1] == '\n':
- del config[i + 1]
- # Delete Include string
- del config[i]
- # Delete Sky Autogen Comment
- if i > 0 and sky_autogen_comment in config[i - 1].strip():
- del config[i - 1]
- f.write(''.join(config))
- break
- if 'Host' in config_str:
- break
-
- @classmethod
- # TODO: We can remove this after 0.6.0 and have a lock only per cluster.
- @timeline.FileLockEvent(ssh_conf_lock_path)
- def remove_cluster(
- cls,
- cluster_name: str,
- ip: str,
- auth_config: Dict[str, str],
- docker_user: Optional[str] = None,
- ):
+ def remove_cluster(cls, cluster_name: str):
  """Remove authentication information for cluster from ~/.sky/ssh/<cluster_name>.
 
- For backward compatibility also remove the config from ~/.ssh/config if it exists.
-
  If no existing host matching the provided specification is found, then
  nothing is removed.
 
  Args:
- ip: Head node's IP address.
- auth_config: read_yaml(handle.cluster_yaml)['auth']
- docker_user: If not None, use this user to ssh into the docker
+ cluster_name: Cluster name.
  """
- cluster_config_path = os.path.expanduser(
- cls.ssh_cluster_path.format(cluster_name))
- common_utils.remove_file_if_exists(cluster_config_path)
-
- # Ensures backward compatibility: before #2706, we wrote the config of SkyPilot clusters
- # directly in ~/.ssh/config. For these clusters, we should clean up the config.
- # TODO: Remove this after 0.6.0
- cls._remove_stale_cluster_config_for_backward_compatibility(
- cluster_name, ip, auth_config, docker_user)
+ with timeline.FileLockEvent(
+ cls.ssh_conf_per_cluster_lock_path.format(cluster_name)):
+ cluster_config_path = os.path.expanduser(
+ cls.ssh_cluster_path.format(cluster_name))
+ common_utils.remove_file_if_exists(cluster_config_path)
 
 
  def _replace_yaml_dicts(
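
Note: remove_cluster now needs only the cluster name; it deletes the per-cluster file under ~/.sky/ssh/ while holding a per-cluster lock, so teardowns of different clusters no longer contend on the single global ssh_config.lock. Below is a minimal sketch of that per-resource locking pattern using the filelock package (used elsewhere in this diff); the function is illustrative, not the actual SkyPilot implementation.

    import contextlib
    import os

    import filelock

    _LOCK_PATH = os.path.expanduser('~/.sky/ssh_config_{}.lock')
    _CLUSTER_SSH_CONFIG = os.path.expanduser('~/.sky/ssh/{}')


    def remove_cluster_ssh_config(cluster_name: str) -> None:
        # One lock file per cluster: removing cluster A's SSH config never
        # blocks on a concurrent removal of cluster B's.
        with filelock.FileLock(_LOCK_PATH.format(cluster_name)):
            with contextlib.suppress(FileNotFoundError):
                os.remove(_CLUSTER_SSH_CONFIG.format(cluster_name))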
@@ -867,7 +744,7 @@ def write_cluster_config(
  labels = skypilot_config.get_nested((str(cloud).lower(), 'labels'), {})
  # Deprecated: instance_tags have been replaced by labels. For backward
  # compatibility, we support them and the schema allows them only if
- # `labels` are not specified. This should be removed after 0.7.0.
+ # `labels` are not specified. This should be removed after 0.8.0.
  labels = skypilot_config.get_nested((str(cloud).lower(), 'instance_tags'),
  labels)
  # labels is a dict, which is guaranteed by the type check in
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -2118,13 +2118,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  stable_internal_external_ips: Optional[List[Tuple[str,
  str]]] = None,
  stable_ssh_ports: Optional[List[int]] = None,
- cluster_info: Optional[provision_common.ClusterInfo] = None,
- # The following 2 fields are deprecated. SkyPilot new provisioner
- # API handles the TPU node creation/deletion.
- # Backward compatibility for TPU nodes created before #2943.
- # TODO (zhwu): Remove this after 0.6.0.
- tpu_create_script: Optional[str] = None,
- tpu_delete_script: Optional[str] = None) -> None:
+ cluster_info: Optional[provision_common.ClusterInfo] = None
+ ) -> None:
  self._version = self._VERSION
  self.cluster_name = cluster_name
  self.cluster_name_on_cloud = cluster_name_on_cloud
@@ -2139,12 +2134,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  self.launched_nodes = launched_nodes
  self.launched_resources = launched_resources
  self.docker_user: Optional[str] = None
- # Deprecated. SkyPilot new provisioner API handles the TPU node
- # creation/deletion.
- # Backward compatibility for TPU nodes created before #2943.
- # TODO (zhwu): Remove this after 0.6.0.
- self.tpu_create_script = tpu_create_script
- self.tpu_delete_script = tpu_delete_script
 
  def __repr__(self):
  return (f'ResourceHandle('
@@ -2160,10 +2149,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  f'\n\tlaunched_resources={self.launched_nodes}x '
  f'{self.launched_resources}, '
  f'\n\tdocker_user={self.docker_user},'
- f'\n\tssh_user={self.ssh_user},'
- # TODO (zhwu): Remove this after 0.6.0.
- f'\n\ttpu_create_script={self.tpu_create_script}, '
- f'\n\ttpu_delete_script={self.tpu_delete_script})')
+ f'\n\tssh_user={self.ssh_user}')
 
  def get_cluster_name(self):
  return self.cluster_name
@@ -2176,26 +2162,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  return common_utils.read_yaml(self.cluster_yaml).get(
  'provider', {}).get('use_internal_ips', False)
 
- def _update_cluster_region(self):
- """Update the region in handle.launched_resources.
-
- This is for backward compatibility to handle the clusters launched
- long before. We should remove this after 0.6.0.
- """
- if self.launched_resources.region is not None:
- return
-
- config = common_utils.read_yaml(self.cluster_yaml)
- provider = config['provider']
- cloud = self.launched_resources.cloud
- if cloud.is_same_cloud(clouds.Azure()):
- region = provider['location']
- elif cloud.is_same_cloud(clouds.GCP()) or cloud.is_same_cloud(
- clouds.AWS()):
- region = provider['region']
-
- self.launched_resources = self.launched_resources.copy(region=region)
-
  def update_ssh_ports(self, max_attempts: int = 1) -> None:
  """Fetches and sets the SSH ports for the cluster nodes.
 
@@ -2567,8 +2533,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  if version < 4:
  self.update_ssh_ports()
 
- self._update_cluster_region()
-
  if version < 8:
  try:
  self._update_cluster_info()
@@ -2649,8 +2613,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  if record is not None:
  usage_lib.messages.usage.update_cluster_status(record['status'])
 
- # Backward compatibility: the old launched_resources without region info
- # was handled by ResourceHandle._update_cluster_region.
  assert launched_resources.region is not None, handle
 
  mismatch_str = (f'To fix: specify a new cluster name, or down the '
@@ -3213,9 +3175,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
  if returncode == 255:
  is_message_too_long = False
- with open(setup_log_path, 'r', encoding='utf-8') as f:
- if 'too long' in f.read():
- is_message_too_long = True
+ try:
+ with open(os.path.expanduser(setup_log_path),
+ 'r',
+ encoding='utf-8') as f:
+ if 'too long' in f.read():
+ is_message_too_long = True
+ except Exception as e: # pylint: disable=broad-except
+ # We don't crash the setup if we cannot read the log file.
+ # Instead, we should retry the setup with dumping the script
+ # to a file to be safe.
+ logger.debug('Failed to read setup log file '
+ f'{setup_log_path}: {e}')
+ is_message_too_long = True
 
  if is_message_too_long:
  # If the setup script is too long, we retry it with dumping
@@ -3585,9 +3557,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
 
  try:
- # TODO(mraheja): remove pylint disabling when filelock
- # version updated
- # pylint: disable=abstract-class-instantiated
  with filelock.FileLock(
  lock_path,
  backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
@@ -4096,55 +4065,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  * Removing ssh configs for the cluster;
  * Updating the local state of the cluster;
  * Removing the terminated cluster's scripts and ray yaml files.
-
- Raises:
- RuntimeError: If it fails to delete the TPU.
  """
- log_path = os.path.join(os.path.expanduser(self.log_dir),
- 'teardown.log')
- log_abs_path = os.path.abspath(log_path)
  cluster_name_on_cloud = handle.cluster_name_on_cloud
 
- # Backward compatibility for TPU nodes created before #2943. Any TPU
- # node launched before that PR have the delete script generated (and do
- # not have the tpu_node config set in its cluster yaml), so we have to
- # call the deletion script to clean up the TPU node.
- # For TPU nodes launched after the PR, deletion is done in SkyPilot's
- # new GCP provisioner API.
- # TODO (zhwu): Remove this after 0.6.0.
- if (handle.tpu_delete_script is not None and
- os.path.exists(handle.tpu_delete_script)):
- # Only call the deletion script if the cluster config does not
- # contain TPU node config. Otherwise, the deletion should
- # already be handled by the new provisioner.
- config = common_utils.read_yaml(handle.cluster_yaml)
- tpu_node_config = config['provider'].get('tpu_node')
- if tpu_node_config is None:
- with rich_utils.safe_status(
- ux_utils.spinner_message('Terminating TPU')):
- tpu_rc, tpu_stdout, tpu_stderr = log_lib.run_with_log(
- ['bash', handle.tpu_delete_script],
- log_abs_path,
- stream_logs=False,
- require_outputs=True)
- if tpu_rc != 0:
- if _TPU_NOT_FOUND_ERROR in tpu_stderr:
- logger.info('TPU not found. '
- 'It should have been deleted already.')
- elif purge:
- logger.warning(
- _TEARDOWN_PURGE_WARNING.format(
- reason='stopping/terminating TPU',
- details=tpu_stderr))
- else:
- raise RuntimeError(
- _TEARDOWN_FAILURE_MESSAGE.format(
- extra_reason='It is caused by TPU failure.',
- cluster_name=common_utils.cluster_name_in_hint(
- handle.cluster_name, cluster_name_on_cloud),
- stdout=tpu_stdout,
- stderr=tpu_stderr))
-
  if (terminate and handle.launched_resources.is_image_managed is True):
  # Delete the image when terminating a "cloned" cluster, i.e.,
  # whose image is created by SkyPilot (--clone-disk-from)
@@ -4189,11 +4112,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # The cluster file must exist because the cluster_yaml will only
  # be removed after the cluster entry in the database is removed.
  config = common_utils.read_yaml(handle.cluster_yaml)
- auth_config = config['auth']
- backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name,
- handle.head_ip,
- auth_config,
- handle.docker_user)
+ backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name)
 
  global_user_state.remove_cluster(handle.cluster_name,
  terminate=terminate)
@@ -4202,13 +4121,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # This function could be directly called from status refresh,
  # where we need to cleanup the cluster profile.
  metadata_utils.remove_cluster_metadata(handle.cluster_name)
- # Clean up TPU creation/deletion scripts
- # Backward compatibility for TPU nodes created before #2943.
- # TODO (zhwu): Remove this after 0.6.0.
- if handle.tpu_delete_script is not None:
- assert handle.tpu_create_script is not None
- common_utils.remove_file_if_exists(handle.tpu_create_script)
- common_utils.remove_file_if_exists(handle.tpu_delete_script)
 
  # Clean up generated config
  # No try-except is needed since Ray will fail to teardown the
sky/clouds/azure.py CHANGED
@@ -12,6 +12,7 @@ import colorama
  from sky import clouds
  from sky import exceptions
  from sky import sky_logging
+ from sky import skypilot_config
  from sky.adaptors import azure
  from sky.clouds import service_catalog
  from sky.clouds.utils import azure_utils
@@ -353,6 +354,13 @@ class Azure(clouds.Cloud):
  need_nvidia_driver_extension = (acc_dict is not None and
  'A10' in acc_dict)
 
+ # Determine resource group for deploying the instance.
+ resource_group_name = skypilot_config.get_nested(
+ ('azure', 'resource_group_vm'), None)
+ use_external_resource_group = resource_group_name is not None
+ if resource_group_name is None:
+ resource_group_name = f'{cluster_name.name_on_cloud}-{region_name}'
+
  # Setup commands to eliminate the banner and restart sshd.
  # This script will modify /etc/ssh/sshd_config and add a bash script
  # into .bashrc. The bash script will restart sshd if it has not been
@@ -409,7 +417,8 @@ class Azure(clouds.Cloud):
  'disk_tier': Azure._get_disk_type(disk_tier),
  'cloud_init_setup_commands': cloud_init_setup_commands,
  'azure_subscription_id': self.get_project_id(dryrun),
- 'resource_group': f'{cluster_name.name_on_cloud}-{region_name}',
+ 'resource_group': resource_group_name,
+ 'use_external_resource_group': use_external_resource_group,
  }
 
  # Setting disk performance tier for high disk tier.
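
Note: together, the two sky/clouds/azure.py hunks above let users deploy into a pre-existing Azure resource group via the new ('azure', 'resource_group_vm') config key; when it is unset, the per-cluster '<cluster>-<region>' group name is still derived as before. A small sketch of the selection logic follows (the helper name is hypothetical; the key path and fallback format string come from the hunk):

    from typing import Optional, Tuple


    def select_resource_group(configured: Optional[str],
                              cluster_name_on_cloud: str,
                              region_name: str) -> Tuple[str, bool]:
        # Returns (resource_group_name, use_external_resource_group).
        # A user-configured group is treated as "external" (pre-existing);
        # otherwise a per-cluster group name is derived.
        if configured is not None:
            return configured, True
        return f'{cluster_name_on_cloud}-{region_name}', False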
sky/execution.py CHANGED
@@ -171,10 +171,11 @@ def _execute(
  task = dag.tasks[0]
 
  if any(r.job_recovery is not None for r in task.resources):
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- 'Job recovery is specified in the task. To launch a '
- 'managed job, please use: sky jobs launch')
+ logger.warning(
+ f'{colorama.Style.DIM}The task has `job_recovery` specified, '
+ 'but is launched as an unmanaged job. It will be ignored.'
+ 'To enable job recovery, use managed jobs: sky jobs launch.'
+ f'{colorama.Style.RESET_ALL}')
 
  cluster_exists = False
  if cluster_name is not None:
sky/jobs/controller.py CHANGED
@@ -160,6 +160,11 @@ class JobsController:
  if task_id == 0:
  submitted_at = backend_utils.get_timestamp_from_run_timestamp(
  self._backend.run_timestamp)
+ assert task.name is not None, task
+ cluster_name = managed_job_utils.generate_managed_job_cluster_name(
+ task.name, self._job_id)
+ self._strategy_executor = recovery_strategy.StrategyExecutor.make(
+ cluster_name, self._backend, task, self._retry_until_up)
  managed_job_state.set_submitted(
  self._job_id,
  task_id,
@@ -167,15 +172,14 @@ class JobsController:
  submitted_at,
  resources_str=backend_utils.get_task_resources_str(
  task, is_managed_job=True),
+ specs={
+ 'max_restarts_on_errors':
+ self._strategy_executor.max_restarts_on_errors
+ },
  callback_func=callback_func)
  logger.info(
  f'Submitted managed job {self._job_id} (task: {task_id}, name: '
  f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
- assert task.name is not None, task
- cluster_name = managed_job_utils.generate_managed_job_cluster_name(
- task.name, self._job_id)
- self._strategy_executor = recovery_strategy.StrategyExecutor.make(
- cluster_name, self._backend, task, self._retry_until_up)
 
  logger.info('Started monitoring.')
  managed_job_state.set_starting(job_id=self._job_id,
@@ -283,23 +287,35 @@ class JobsController:
  failure_reason = (
  'To see the details, run: '
  f'sky jobs logs --controller {self._job_id}')
-
- managed_job_state.set_failed(
- self._job_id,
- task_id,
- failure_type=managed_job_status,
- failure_reason=failure_reason,
- end_time=end_time,
- callback_func=callback_func)
- return False
- # Although the cluster is healthy, we fail to access the
- # job status. Try to recover the job (will not restart the
- # cluster, if the cluster is healthy).
- assert job_status is None, job_status
- logger.info('Failed to fetch the job status while the '
- 'cluster is healthy. Try to recover the job '
- '(the cluster will not be restarted).')
-
+ should_restart_on_failure = (
+ self._strategy_executor.should_restart_on_failure())
+ if should_restart_on_failure:
+ max_restarts = (
+ self._strategy_executor.max_restarts_on_errors)
+ logger.info(
+ f'User program crashed '
+ f'({managed_job_status.value}). '
+ f'Retry the job as max_restarts_on_errors is '
+ f'set to {max_restarts}. '
+ f'[{self._strategy_executor.restart_cnt_on_failure}'
+ f'/{max_restarts}]')
+ else:
+ managed_job_state.set_failed(
+ self._job_id,
+ task_id,
+ failure_type=managed_job_status,
+ failure_reason=failure_reason,
+ end_time=end_time,
+ callback_func=callback_func)
+ return False
+ else:
+ # Although the cluster is healthy, we fail to access the
+ # job status. Try to recover the job (will not restart the
+ # cluster, if the cluster is healthy).
+ assert job_status is None, job_status
+ logger.info('Failed to fetch the job status while the '
+ 'cluster is healthy. Try to recover the job '
+ '(the cluster will not be restarted).')
  # When the handle is None, the cluster should be cleaned up already.
  if handle is not None:
  resources = handle.launched_resources
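
Note: the controller hunk above gates job failure on a restart budget. When the user program crashes, the job is only marked FAILED after should_restart_on_failure() reports that the max_restarts_on_errors budget is exhausted; otherwise the controller logs the retry count and falls through to the normal recovery path. A condensed, hypothetical sketch of that control flow (simplified names, not the actual controller code):

    def handle_user_program_failure(executor, set_failed, recover) -> bool:
        # Returns False when the managed job is finished (marked FAILED).
        if executor.should_restart_on_failure():
            # Still within the restart budget: re-run the task via the usual
            # recovery path instead of terminating the managed job.
            recover()
            return True
        set_failed()
        return False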
sky/jobs/recovery_strategy.py CHANGED
@@ -66,7 +66,8 @@ class StrategyExecutor:
  RETRY_INIT_GAP_SECONDS = 60
 
  def __init__(self, cluster_name: str, backend: 'backends.Backend',
- task: 'task_lib.Task', retry_until_up: bool) -> None:
+ task: 'task_lib.Task', retry_until_up: bool,
+ max_restarts_on_errors: int) -> None:
  """Initialize the strategy executor.
 
  Args:
@@ -82,6 +83,8 @@ class StrategyExecutor:
  self.cluster_name = cluster_name
  self.backend = backend
  self.retry_until_up = retry_until_up
+ self.max_restarts_on_errors = max_restarts_on_errors
+ self.restart_cnt_on_failure = 0
 
  def __init_subclass__(cls, name: str, default: bool = False):
  RECOVERY_STRATEGIES[name] = cls
@@ -109,8 +112,17 @@ class StrategyExecutor:
  # set the new_task_resources to be the same type (list or set) as the
  # original task.resources
  task.set_resources(type(task.resources)(new_resources_list))
- return RECOVERY_STRATEGIES[job_recovery](cluster_name, backend, task,
- retry_until_up)
+ if isinstance(job_recovery, dict):
+ job_recovery_name = job_recovery.pop('strategy',
+ DEFAULT_RECOVERY_STRATEGY)
+ max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
+ 0)
+ else:
+ job_recovery_name = job_recovery
+ max_restarts_on_errors = 0
+ return RECOVERY_STRATEGIES[job_recovery_name](cluster_name, backend,
+ task, retry_until_up,
+ max_restarts_on_errors)
 
  def launch(self) -> float:
  """Launch the cluster for the first time.
@@ -368,6 +380,17 @@ class StrategyExecutor:
  f'{gap_seconds:.1f} seconds.')
  time.sleep(gap_seconds)
 
+ def should_restart_on_failure(self) -> bool:
+ """Increments counter & checks if job should be restarted on a failure.
+
+ Returns:
+ True if the job should be restarted, otherwise False.
+ """
+ self.restart_cnt_on_failure += 1
+ if self.restart_cnt_on_failure > self.max_restarts_on_errors:
+ return False
+ return True
+
 
  class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
  default=False):
@@ -376,8 +399,10 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
  _MAX_RETRY_CNT = 240 # Retry for 4 hours.
 
  def __init__(self, cluster_name: str, backend: 'backends.Backend',
- task: 'task_lib.Task', retry_until_up: bool) -> None:
- super().__init__(cluster_name, backend, task, retry_until_up)
+ task: 'task_lib.Task', retry_until_up: bool,
+ max_restarts_on_errors: int) -> None:
+ super().__init__(cluster_name, backend, task, retry_until_up,
+ max_restarts_on_errors)
  # Note down the cloud/region of the launched cluster, so that we can
  # first retry in the same cloud/region. (Inside recover() we may not
  # rely on cluster handle, as it can be None if the cluster is
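
Note: with the recovery_strategy.py changes above, job_recovery may be a bare strategy name or a dict carrying 'strategy' and 'max_restarts_on_errors', and every StrategyExecutor now tracks a restart counter. The self-contained sketch below re-implements only the counter semantics of should_restart_on_failure to show the budget behavior; it is illustrative, not SkyPilot code.

    class RestartBudget:
        """Illustrative re-implementation of the counter added above."""

        def __init__(self, max_restarts_on_errors: int) -> None:
            self.max_restarts_on_errors = max_restarts_on_errors
            self.restart_cnt_on_failure = 0

        def should_restart_on_failure(self) -> bool:
            # Each user-program failure consumes one unit of the budget.
            self.restart_cnt_on_failure += 1
            return self.restart_cnt_on_failure <= self.max_restarts_on_errors


    budget = RestartBudget(max_restarts_on_errors=2)
    print([budget.should_restart_on_failure() for _ in range(3)])
    # [True, True, False]: the third failure exhausts the budget, so the
    # controller would then mark the managed job as FAILED.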