skypilot-nightly 1.0.0.dev20241011__py3-none-any.whl → 1.0.0.dev20241013__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -1
- sky/adaptors/common.py +6 -2
- sky/backends/backend.py +9 -4
- sky/backends/backend_utils.py +13 -16
- sky/backends/cloud_vm_ray_backend.py +207 -161
- sky/backends/local_docker_backend.py +3 -1
- sky/benchmark/benchmark_utils.py +5 -4
- sky/cli.py +128 -31
- sky/clouds/service_catalog/aws_catalog.py +6 -7
- sky/clouds/service_catalog/common.py +4 -3
- sky/clouds/service_catalog/cudo_catalog.py +11 -1
- sky/core.py +4 -2
- sky/data/storage.py +44 -32
- sky/data/storage_utils.py +12 -7
- sky/exceptions.py +5 -0
- sky/execution.py +10 -24
- sky/jobs/__init__.py +2 -0
- sky/jobs/core.py +87 -7
- sky/jobs/utils.py +35 -19
- sky/optimizer.py +50 -37
- sky/provision/aws/config.py +15 -6
- sky/provision/azure/config.py +14 -3
- sky/provision/azure/instance.py +15 -9
- sky/provision/kubernetes/instance.py +3 -1
- sky/provision/kubernetes/utils.py +25 -0
- sky/provision/provisioner.py +63 -74
- sky/serve/core.py +42 -40
- sky/sky_logging.py +9 -5
- sky/skylet/log_lib.py +5 -4
- sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
- sky/utils/cli_utils/status_utils.py +168 -21
- sky/utils/command_runner.py +11 -11
- sky/utils/common_utils.py +22 -5
- sky/utils/controller_utils.py +78 -29
- sky/utils/env_options.py +22 -7
- sky/utils/log_utils.py +39 -24
- sky/utils/resources_utils.py +23 -0
- sky/utils/rich_utils.py +55 -5
- sky/utils/ux_utils.py +63 -4
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/RECORD +46 -46
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/top_level.txt +0 -0
sky/data/storage_utils.py
CHANGED
@@ -12,7 +12,6 @@ from sky import sky_logging
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import log_utils
-from sky.utils.cli_utils import status_utils
 
 logger = sky_logging.init_logger(__name__)
 
@@ -22,6 +21,8 @@ _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG = (
     'to the cloud storage for {path!r}'
     'due to the following error: {error_msg!r}')
 
+_LAST_USE_TRUNC_LENGTH = 25
+
 
 def format_storage_table(storages: List[Dict[str, Any]],
                          show_all: bool = False) -> str:
@@ -46,8 +47,8 @@ def format_storage_table(storages: List[Dict[str, Any]],
         if show_all:
             command = row['last_use']
         else:
-            command =
-
+            command = common_utils.truncate_long_string(row['last_use'],
+                                                        _LAST_USE_TRUNC_LENGTH)
         storage_table.add_row([
             # NAME
             row['name'],
@@ -212,9 +213,13 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
     skyignore_path = os.path.join(expand_src_dir_path,
                                   constants.SKY_IGNORE_FILE)
     if os.path.exists(skyignore_path):
-        logger.info(f'
-                    f'
+        logger.info(f' {colorama.Style.DIM}'
+                    f'Excluded files to sync to cluster based on '
+                    f'{constants.SKY_IGNORE_FILE}.'
+                    f'{colorama.Style.RESET_ALL}')
         return get_excluded_files_from_skyignore(src_dir_path)
-    logger.info(f'
-                f'
+    logger.info(f' {colorama.Style.DIM}'
+                f'Excluded files to sync to cluster based on '
+                f'{constants.GIT_IGNORE_FILE}.'
+                f'{colorama.Style.RESET_ALL}')
     return get_excluded_files_from_gitignore(src_dir_path)
sky/exceptions.py
CHANGED
@@ -291,3 +291,8 @@ class PortDoesNotExistError(Exception):
 class UserRequestRejectedByPolicy(Exception):
     """Raised when a user request is rejected by an admin policy."""
     pass
+
+
+class NoClusterLaunchedError(Exception):
+    """No cluster launched, so cleanup can be skipped during failover."""
+    pass
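Context for the new exception: the `sky/provision/aws/config.py` hunk further down raises it when the security group cannot even be created, i.e. before any instance exists. A minimal sketch of the intended failover pattern follows; `launch_on_cloud` and `cleanup` are hypothetical placeholders for illustration, not SkyPilot APIs:

```python
from sky import exceptions


def launch_with_failover(clouds, launch_on_cloud, cleanup):
    """Try each cloud in turn; skip cleanup when nothing was launched.

    `clouds`, `launch_on_cloud`, and `cleanup` are placeholders used only
    for this sketch -- they are not SkyPilot APIs.
    """
    for cloud in clouds:
        try:
            return launch_on_cloud(cloud)
        except exceptions.NoClusterLaunchedError:
            # Provisioning failed before any instance was created (e.g. the
            # security group could not be created), so there is nothing to
            # tear down on this cloud; move straight to the next one.
            continue
        except Exception:  # pylint: disable=broad-except
            # Something may have been partially created; clean it up before
            # failing over to the next cloud.
            cleanup(cloud)
            continue
    raise RuntimeError('All clouds failed to launch the cluster.')
```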
sky/execution.py
CHANGED
@@ -3,7 +3,6 @@
 See `Stage` for a Task's life cycle.
 """
 import enum
-import os
 from typing import List, Optional, Tuple, Union
 
 import colorama
@@ -20,10 +19,8 @@ from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
 from sky.utils import controller_utils
 from sky.utils import dag_utils
-from sky.utils import env_options
 from sky.utils import resources_utils
 from sky.utils import rich_utils
-from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
 
@@ -293,11 +290,17 @@ def _execute(
         logger.info('Dryrun finished.')
         return None, None
 
- (old lines 296-298 not shown in this diff view)
+    do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
+                  task.workdir is not None)
+    do_file_mounts = (Stage.SYNC_FILE_MOUNTS in stages and not dryrun and
+                      task.file_mounts is not None)
+    if do_workdir or do_file_mounts:
+        logger.info(ux_utils.starting_message('Mounting files.'))
 
-    if
+    if do_workdir:
+        backend.sync_workdir(handle, task.workdir)
+
+    if do_file_mounts:
         backend.sync_file_mounts(handle, task.file_mounts,
                                  task.storage_mounts)
 
@@ -330,23 +333,6 @@ def _execute(
             backend.teardown_ephemeral_storage(task)
             backend.teardown(handle, terminate=True)
     finally:
-        controller = controller_utils.Controllers.from_name(cluster_name)
-        if controller is None and not _is_launched_by_sky_serve_controller:
-            # UX: print live clusters to make users aware (to save costs).
-            #
-            # Don't print if this job is launched by the jobs controller,
-            # because managed jobs are serverless, there can be many of them,
-            # and users tend to continuously monitor managed jobs using `sky
-            # job queue`. Also don't print if this job is a skyserve controller
-            # job or launched by a skyserve controller job, because the
-            # redirect for this subprocess.run won't success and it will
-            # pollute the controller logs.
-            #
-            # Disable the usage collection for this status command.
-            env = dict(os.environ,
-                       **{env_options.Options.DISABLE_LOGGING.value: '1'})
-            subprocess_utils.run(
-                'sky status --no-show-managed-jobs --no-show-services', env=env)
         print()
         print('\x1b[?25h', end='')  # Show cursor.
     return job_id, handle
sky/jobs/__init__.py
CHANGED
@@ -8,6 +8,7 @@ from sky.jobs.constants import JOBS_TASK_YAML_PREFIX
 from sky.jobs.core import cancel
 from sky.jobs.core import launch
 from sky.jobs.core import queue
+from sky.jobs.core import queue_from_kubernetes_pod
 from sky.jobs.core import tail_logs
 from sky.jobs.recovery_strategy import DEFAULT_RECOVERY_STRATEGY
 from sky.jobs.recovery_strategy import RECOVERY_STRATEGIES
@@ -34,6 +35,7 @@ __all__ = [
     'cancel',
     'launch',
     'queue',
+    'queue_from_kubernetes_pod',
     'tail_logs',
     # utils
     'ManagedJobCodeGen',
sky/jobs/core.py
CHANGED
@@ -9,6 +9,7 @@ import colorama
 import sky
 from sky import backends
 from sky import exceptions
+from sky import provision as provision_lib
 from sky import sky_logging
 from sky import status_lib
 from sky import task as task_lib
@@ -16,6 +17,7 @@ from sky.backends import backend_utils
 from sky.clouds.service_catalog import common as service_catalog_common
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import utils as managed_job_utils
+from sky.provision import common
 from sky.skylet import constants as skylet_constants
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
@@ -77,9 +79,11 @@ def launch(
 
     dag_utils.fill_default_config_in_dag_for_job_launch(dag)
 
- (old lines 80-82 not shown in this diff view)
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Initializing managed job')):
+        for task_ in dag.tasks:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task_, path='jobs')
 
     with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
                                      mode='w') as f:
@@ -127,7 +131,6 @@ def launch(
             f'{colorama.Fore.YELLOW}'
             f'Launching managed job {dag.name!r} from jobs controller...'
             f'{colorama.Style.RESET_ALL}')
-        sky_logging.print('Launching jobs controller...')
         sky.launch(task=controller_task,
                    stream_logs=stream_logs,
                    cluster_name=controller_name,
@@ -138,6 +141,82 @@ def launch(
                    _disable_controller_check=True)
 
 
+def queue_from_kubernetes_pod(
+        pod_name: str,
+        context: Optional[str] = None,
+        skip_finished: bool = False) -> List[Dict[str, Any]]:
+    """Gets the jobs queue from a specific controller pod.
+
+    Args:
+        pod_name (str): The name of the controller pod to query for jobs.
+        context (Optional[str]): The Kubernetes context to use. If None, the
+            current context is used.
+        skip_finished (bool): If True, does not return finished jobs.
+
+    Returns:
+        [
+            {
+                'job_id': int,
+                'job_name': str,
+                'resources': str,
+                'submitted_at': (float) timestamp of submission,
+                'end_at': (float) timestamp of end,
+                'duration': (float) duration in seconds,
+                'recovery_count': (int) Number of retries,
+                'status': (sky.jobs.ManagedJobStatus) of the job,
+                'cluster_resources': (str) resources of the cluster,
+                'region': (str) region of the cluster,
+            }
+        ]
+
+    Raises:
+        RuntimeError: If there's an error fetching the managed jobs.
+    """
+    # Create dummy cluster info to get the command runner.
+    provider_config = {'context': context}
+    instances = {
+        pod_name: [
+            common.InstanceInfo(instance_id=pod_name,
+                                internal_ip='',
+                                external_ip='',
+                                tags={})
+        ]
+    }  # Internal IP is not required for Kubernetes
+    cluster_info = common.ClusterInfo(provider_name='kubernetes',
+                                      head_instance_id=pod_name,
+                                      provider_config=provider_config,
+                                      instances=instances)
+    managed_jobs_runner = provision_lib.get_command_runners(
+        'kubernetes', cluster_info)[0]
+
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table()
+    returncode, job_table_payload, stderr = managed_jobs_runner.run(
+        code,
+        require_outputs=True,
+        separate_stderr=True,
+        stream_logs=False,
+    )
+    try:
+        subprocess_utils.handle_returncode(returncode,
+                                           code,
+                                           'Failed to fetch managed jobs',
+                                           job_table_payload + stderr,
+                                           stream_logs=False)
+    except exceptions.CommandError as e:
+        raise RuntimeError(str(e)) from e
+
+    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    if skip_finished:
+        # Filter out the finished jobs. If a multi-task job is partially
+        # finished, we will include all its tasks.
+        non_finished_tasks = list(
+            filter(lambda job: not job['status'].is_terminal(), jobs))
+        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+        jobs = list(
+            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+    return jobs
+
+
 @usage_lib.entrypoint
 def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -184,11 +263,12 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
             f'{colorama.Style.RESET_ALL}')
 
         rich_utils.force_update_status(
-            '
-
+            ux_utils.spinner_message('Checking managed jobs - restarting '
+                                     'controller'))
         handle = sky.start(jobs_controller_type.value.cluster_name)
         controller_status = status_lib.ClusterStatus.UP
-        rich_utils.force_update_status(
+        rich_utils.force_update_status(
+            ux_utils.spinner_message('Checking managed jobs'))
 
     assert handle is not None, (controller_status, refresh)
 
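Because `queue_from_kubernetes_pod` is also re-exported from `sky.jobs` (see the `sky/jobs/__init__.py` hunk above), it can be called directly against a jobs-controller pod. A usage sketch; the pod name and kubeconfig context below are placeholders:

```python
from sky import jobs as managed_jobs

# Placeholder pod/context names -- substitute the actual jobs-controller
# pod running in your Kubernetes cluster.
job_rows = managed_jobs.queue_from_kubernetes_pod(
    pod_name='sky-jobs-controller-abcd1234-head',
    context='my-k8s-context',   # None -> use the current kubeconfig context
    skip_finished=True)         # drop jobs whose status is terminal

for job in job_rows:
    # Field names follow the docstring in the diff above.
    print(job['job_id'], job['job_name'], job['status'])
```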
sky/jobs/utils.py
CHANGED
@@ -34,6 +34,7 @@ from sky.utils import common_utils
 from sky.utils import log_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
+from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import sky
@@ -57,11 +58,13 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
 
 _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
 
-_JOB_WAITING_STATUS_MESSAGE = (
-
+_JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
+    'Waiting for task to start[/]'
+    '{status_str}. It may take a few minutes.\n'
+    ' [dim]View controller logs: sky jobs logs --controller {job_id}')
 _JOB_CANCELLED_MESSAGE = (
-    '
-    '
+    ux_utils.spinner_message('Waiting for task status to be updated.') +
+    ' It may take a minute.')
 
 # The maximum time to wait for the managed job status to transition to terminal
 # state, after the job finished. This is a safeguard to avoid the case where
@@ -290,8 +293,8 @@ def cancel_job_by_name(job_name: str) -> str:
 def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
     """Stream logs by job id."""
     controller_status = job_lib.get_status(job_id)
-    status_msg = (
-
+    status_msg = ux_utils.spinner_message(
+        'Waiting for controller process to be RUNNING') + '{status_str}'
     status_display = rich_utils.safe_status(status_msg.format(status_str=''))
     num_tasks = managed_job_state.get_num_tasks(job_id)
 
@@ -310,7 +313,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
             time.sleep(_LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS)
             controller_status = job_lib.get_status(job_id)
 
-        msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='')
+        msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
         status_display.update(msg)
         prev_msg = msg
         managed_job_status = managed_job_state.get_status(job_id)
@@ -356,7 +359,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                 logger.debug(
                     f'INFO: The log is not ready yet{status_str}. '
                     f'Waiting for {JOB_STATUS_CHECK_GAP_SECONDS} seconds.')
-                msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str=status_str
+                msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str=status_str,
+                                                         job_id=job_id)
                 if msg != prev_msg:
                     status_display.update(msg)
                     prev_msg = msg
@@ -444,8 +448,9 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
         managed_job_status = managed_job_state.get_status(job_id)
         assert managed_job_status is not None, job_id
 
-    logger.info(
-
+    logger.info(
+        ux_utils.finishing_message(f'Managed job finished: {job_id} '
+                                   f'(status: {managed_job_status.value}).'))
     return ''
 
 
@@ -599,11 +604,20 @@ def format_job_table(
         a list of "rows" (each of which is a list of str).
     """
     jobs = collections.defaultdict(list)
+    # Check if the tasks have user information.
+    tasks_have_user = any([task.get('user') for task in tasks])
+    if max_jobs and tasks_have_user:
+        raise ValueError('max_jobs is not supported when tasks have user info.')
+
+    def get_hash(task):
+        if tasks_have_user:
+            return (task['user'], task['job_id'])
+        return task['job_id']
+
     for task in tasks:
         # The tasks within the same job_id are already sorted
         # by the task_id.
-        jobs[task
-    jobs = dict(jobs)
+        jobs[get_hash(task)].append(task)
 
     status_counts: Dict[str, int] = collections.defaultdict(int)
     for job_tasks in jobs.values():
@@ -611,17 +625,14 @@
         if not managed_job_status.is_terminal():
             status_counts[managed_job_status.value] += 1
 
-    if max_jobs is not None:
-        job_ids = sorted(jobs.keys(), reverse=True)
-        job_ids = job_ids[:max_jobs]
-        jobs = {job_id: jobs[job_id] for job_id in job_ids}
-
     columns = [
         'ID', 'TASK', 'NAME', 'RESOURCES', 'SUBMITTED', 'TOT. DURATION',
         'JOB DURATION', '#RECOVERIES', 'STATUS'
    ]
     if show_all:
         columns += ['STARTED', 'CLUSTER', 'REGION', 'FAILURE']
+    if tasks_have_user:
+        columns.insert(0, 'USER')
     job_table = log_utils.create_table(columns)
 
     status_counts: Dict[str, int] = collections.defaultdict(int)
@@ -636,9 +647,9 @@
         for task in all_tasks:
             # The tasks within the same job_id are already sorted
             # by the task_id.
-            jobs[task
+            jobs[get_hash(task)].append(task)
 
-    for
+    for job_hash, job_tasks in jobs.items():
         if len(job_tasks) > 1:
             # Aggregate the tasks into a new row in the table.
             job_name = job_tasks[0]['job_name']
@@ -674,6 +685,7 @@
             if not managed_job_status.is_terminal():
                 status_str += f' (task: {current_task_id})'
 
+            job_id = job_hash[1] if tasks_have_user else job_hash
             job_values = [
                 job_id,
                 '',
@@ -692,6 +704,8 @@
                 '-',
                 failure_reason if failure_reason is not None else '-',
             ])
+            if tasks_have_user:
+                job_values.insert(0, job_tasks[0].get('user', '-'))
            job_table.add_row(job_values)
 
         for task in job_tasks:
@@ -724,6 +738,8 @@
                     task['failure_reason']
                     if task['failure_reason'] is not None else '-',
                 ])
+            if tasks_have_user:
+                values.insert(0, task.get('user', '-'))
             job_table.add_row(values)
 
         if len(job_tasks) > 1:
sky/optimizer.py
CHANGED
@@ -123,22 +123,23 @@ class Optimizer:
                 for a task.
             exceptions.NoCloudAccessError: if no public clouds are enabled.
         """
- (old lines 126-141 not shown in this diff view)
+        with rich_utils.safe_status(ux_utils.spinner_message('Optimizing')):
+            _check_specified_clouds(dag)
+
+            # This function is effectful: mutates every node in 'dag' by setting
+            # node.best_resources if it is None.
+            Optimizer._add_dummy_source_sink_nodes(dag)
+            try:
+                unused_best_plan = Optimizer._optimize_dag(
+                    dag=dag,
+                    minimize_cost=minimize == OptimizeTarget.COST,
+                    blocked_resources=blocked_resources,
+                    quiet=quiet)
+            finally:
+                # Make sure to remove the dummy source/sink nodes, even if the
+                # optimization fails.
+                Optimizer._remove_dummy_source_sink_nodes(dag)
+            return dag
 
     @staticmethod
     def _add_dummy_source_sink_nodes(dag: 'dag_lib.Dag'):
@@ -259,6 +260,9 @@
             launchable_resources: Dict[resources_lib.Resources,
                                        List[resources_lib.Resources]]
     ) -> Dict[resources_lib.Resources, int]:
+        if not resources_utils.need_to_query_reservations():
+            return {}
+
         num_available_reserved_nodes_per_resource = {}
 
         def get_reservations_available_resources(
@@ -269,7 +273,7 @@
         launchable_resources_list: List[resources_lib.Resources] = sum(
             launchable_resources.values(), [])
         with rich_utils.safe_status(
-            '
+                ux_utils.spinner_message('Checking reserved resources')):
             subprocess_utils.run_in_parallel(
                 get_reservations_available_resources,
                 launchable_resources_list)
@@ -337,8 +341,8 @@
                 if minimize_cost:
                     cost_per_node = resources.get_cost(estimated_runtime)
                     num_available_reserved_nodes = (
-                        num_available_reserved_nodes_per_resource
-
+                        num_available_reserved_nodes_per_resource.get(
+                            resources, 0))
 
                     # We consider the cost of the unused reservation
                     # resources to be 0 since we are already paying for
@@ -384,10 +388,14 @@
                     fuzzy_candidates_str = (
                         f'\nTry one of these offered accelerators: {cyan}'
                         f'{fuzzy_candidates}{reset}')
+                node_resources_reprs = ', '.join(f'{node.num_nodes}x ' +
+                                                 r.repr_with_region_zone
+                                                 for r in node.resources)
                 error_msg = (
                     f'{source_hint.capitalize()} does not contain any '
-                    f'instances satisfying the request
-                    f'
+                    f'instances satisfying the request: '
+                    f'{node_resources_reprs}.'
+                    f'\nTo fix: relax or change the '
                    f'resource requirements.{fuzzy_candidates_str}\n\n'
                     f'Hint: {bold}sky show-gpus{reset} '
                     'to list available accelerators.\n'
@@ -716,7 +724,6 @@
             node_to_cost_map: _TaskToCostMap,
             minimize_cost: bool,
     ):
-        logger.info('== Optimizer ==')
         ordered_node_to_cost_map = collections.OrderedDict()
         ordered_best_plan = collections.OrderedDict()
         for node in topo_order:
@@ -738,15 +745,18 @@
                 node.get_inputs() is None and node.get_outputs() is None):
             print_hourly_cost = True
 
-        if
- (old lines 742-749 not shown in this diff view)
+        if not env_options.Options.MINIMIZE_LOGGING.get():
+            if print_hourly_cost:
+                logger.info(
+                    f'{colorama.Style.BRIGHT}Estimated cost: '
+                    f'{colorama.Style.RESET_ALL}${total_cost:.1f} / hour\n')
+            else:
+                logger.info(
+                    f'{colorama.Style.BRIGHT}Estimated total runtime: '
+                    f'{colorama.Style.RESET_ALL}{total_time / 3600:.1f} '
+                    'hours\n'
+                    f'{colorama.Style.BRIGHT}Estimated total cost: '
+                    f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
 
     def _get_resources_element_list(
             resources: 'resources_lib.Resources') -> List[str]:
@@ -845,7 +855,7 @@
         best_plan_table = _create_table(['TASK', '#NODES'] +
                                         resource_fields)
         best_plan_table.add_rows(best_plan_rows)
-        logger.info(f'{best_plan_table}
+        logger.info(f'{best_plan_table}')
 
         # Print the egress plan if any data egress is scheduled.
         Optimizer._print_egress_plan(graph, best_plan, minimize_cost)
@@ -864,6 +874,10 @@
         }
         task_str = (f'for task {task.name!r} ' if num_tasks > 1 else '')
         plural = 's' if task.num_nodes > 1 else ''
+        if num_tasks > 1:
+            # Add a new line for better readability, when there are multiple
+            # tasks.
+            logger.info('')
         logger.info(
             f'{colorama.Style.BRIGHT}Considered resources {task_str}'
             f'({task.num_nodes} node{plural}):'
@@ -934,7 +948,7 @@
 
         table = _create_table(field_names)
         table.add_rows(rows)
-        logger.info(f'{table}
+        logger.info(f'{table}')
 
         # Warning message for using disk_tier=ultra
         # TODO(yi): Consider price of disks in optimizer and
@@ -965,10 +979,10 @@
                     f'Multiple {cloud} instances satisfy '
                     f'{acc_name}:{int(acc_count)}. '
                     f'The cheapest {candidate_list[0]!r} is considered '
-                    f'among:\n{instance_list}
+                    f'among:\n{instance_list}.')
             if is_multi_instances:
                 logger.info(
-                    f'To list more details, run
+                    f'To list more details, run: sky show-gpus {acc_name}\n')
 
     @staticmethod
     def _optimize_dag(
@@ -1101,8 +1115,7 @@
         Optimizer.print_optimized_plan(graph, topo_order, best_plan,
                                        total_time, total_cost,
                                        node_to_cost_map, minimize_cost)
-
-            Optimizer._print_candidates(local_node_to_candidate_map)
+        Optimizer._print_candidates(local_node_to_candidate_map)
         return best_plan
 
 
sky/provision/aws/config.py
CHANGED
@@ -16,10 +16,12 @@ from typing import Any, Dict, List, Optional, Set, Tuple
 
 import colorama
 
+from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import aws
 from sky.provision import common
 from sky.provision.aws import utils
+from sky.utils import common_utils
 
 logger = sky_logging.init_logger(__name__)
 
@@ -535,12 +537,19 @@ def _get_or_create_vpc_security_group(ec2, vpc_id: str,
     if vpc_id in vpc_to_existing_sg:
         return vpc_to_existing_sg[vpc_id]
 
- (old lines 538-543 not shown in this diff view)
+    try:
+        # create a new security group
+        ec2.meta.client.create_security_group(
+            Description='Auto-created security group for Ray workers',
+            GroupName=expected_sg_name,
+            VpcId=vpc_id,
+        )
+    except ec2.meta.client.exceptions.ClientError as e:
+        message = ('Failed to create security group. Error: '
+                   f'{common_utils.format_exception(e)}')
+        logger.warning(message)
+        raise exceptions.NoClusterLaunchedError(message) from e
+
     security_group = _get_security_groups_from_vpc_ids(ec2, [vpc_id],
                                                        [expected_sg_name])
 