skypilot-nightly 1.0.0.dev20241011__py3-none-any.whl → 1.0.0.dev20241013__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/azure.py +3 -1
  3. sky/adaptors/common.py +6 -2
  4. sky/backends/backend.py +9 -4
  5. sky/backends/backend_utils.py +13 -16
  6. sky/backends/cloud_vm_ray_backend.py +207 -161
  7. sky/backends/local_docker_backend.py +3 -1
  8. sky/benchmark/benchmark_utils.py +5 -4
  9. sky/cli.py +128 -31
  10. sky/clouds/service_catalog/aws_catalog.py +6 -7
  11. sky/clouds/service_catalog/common.py +4 -3
  12. sky/clouds/service_catalog/cudo_catalog.py +11 -1
  13. sky/core.py +4 -2
  14. sky/data/storage.py +44 -32
  15. sky/data/storage_utils.py +12 -7
  16. sky/exceptions.py +5 -0
  17. sky/execution.py +10 -24
  18. sky/jobs/__init__.py +2 -0
  19. sky/jobs/core.py +87 -7
  20. sky/jobs/utils.py +35 -19
  21. sky/optimizer.py +50 -37
  22. sky/provision/aws/config.py +15 -6
  23. sky/provision/azure/config.py +14 -3
  24. sky/provision/azure/instance.py +15 -9
  25. sky/provision/kubernetes/instance.py +3 -1
  26. sky/provision/kubernetes/utils.py +25 -0
  27. sky/provision/provisioner.py +63 -74
  28. sky/serve/core.py +42 -40
  29. sky/sky_logging.py +9 -5
  30. sky/skylet/log_lib.py +5 -4
  31. sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
  32. sky/utils/cli_utils/status_utils.py +168 -21
  33. sky/utils/command_runner.py +11 -11
  34. sky/utils/common_utils.py +22 -5
  35. sky/utils/controller_utils.py +78 -29
  36. sky/utils/env_options.py +22 -7
  37. sky/utils/log_utils.py +39 -24
  38. sky/utils/resources_utils.py +23 -0
  39. sky/utils/rich_utils.py +55 -5
  40. sky/utils/ux_utils.py +63 -4
  41. {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/METADATA +1 -1
  42. {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/RECORD +46 -46
  43. {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/LICENSE +0 -0
  44. {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/WHEEL +0 -0
  45. {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/entry_points.txt +0 -0
  46. {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/top_level.txt +0 -0
sky/data/storage_utils.py CHANGED
@@ -12,7 +12,6 @@ from sky import sky_logging
12
12
  from sky.skylet import constants
13
13
  from sky.utils import common_utils
14
14
  from sky.utils import log_utils
15
- from sky.utils.cli_utils import status_utils
16
15
 
17
16
  logger = sky_logging.init_logger(__name__)
18
17
 
@@ -22,6 +21,8 @@ _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG = (
22
21
  'to the cloud storage for {path!r}'
23
22
  'due to the following error: {error_msg!r}')
24
23
 
24
+ _LAST_USE_TRUNC_LENGTH = 25
25
+
25
26
 
26
27
  def format_storage_table(storages: List[Dict[str, Any]],
27
28
  show_all: bool = False) -> str:
@@ -46,8 +47,8 @@ def format_storage_table(storages: List[Dict[str, Any]],
46
47
  if show_all:
47
48
  command = row['last_use']
48
49
  else:
49
- command = status_utils.truncate_long_string(
50
- row['last_use'], status_utils.COMMAND_TRUNC_LENGTH)
50
+ command = common_utils.truncate_long_string(row['last_use'],
51
+ _LAST_USE_TRUNC_LENGTH)
51
52
  storage_table.add_row([
52
53
  # NAME
53
54
  row['name'],
@@ -212,9 +213,13 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
212
213
  skyignore_path = os.path.join(expand_src_dir_path,
213
214
  constants.SKY_IGNORE_FILE)
214
215
  if os.path.exists(skyignore_path):
215
- logger.info(f'Exclude files to sync to cluster based on '
216
- f'{constants.SKY_IGNORE_FILE}.')
216
+ logger.info(f' {colorama.Style.DIM}'
217
+ f'Excluded files to sync to cluster based on '
218
+ f'{constants.SKY_IGNORE_FILE}.'
219
+ f'{colorama.Style.RESET_ALL}')
217
220
  return get_excluded_files_from_skyignore(src_dir_path)
218
- logger.info(f'Exclude files to sync to cluster based on '
219
- f'{constants.GIT_IGNORE_FILE}.')
221
+ logger.info(f' {colorama.Style.DIM}'
222
+ f'Excluded files to sync to cluster based on '
223
+ f'{constants.GIT_IGNORE_FILE}.'
224
+ f'{colorama.Style.RESET_ALL}')
220
225
  return get_excluded_files_from_gitignore(src_dir_path)
sky/exceptions.py CHANGED
@@ -291,3 +291,8 @@ class PortDoesNotExistError(Exception):
291
291
  class UserRequestRejectedByPolicy(Exception):
292
292
  """Raised when a user request is rejected by an admin policy."""
293
293
  pass
294
+
295
+
296
+ class NoClusterLaunchedError(Exception):
297
+ """No cluster launched, so cleanup can be skipped during failover."""
298
+ pass
sky/execution.py CHANGED
@@ -3,7 +3,6 @@
3
3
  See `Stage` for a Task's life cycle.
4
4
  """
5
5
  import enum
6
- import os
7
6
  from typing import List, Optional, Tuple, Union
8
7
 
9
8
  import colorama
@@ -20,10 +19,8 @@ from sky.usage import usage_lib
20
19
  from sky.utils import admin_policy_utils
21
20
  from sky.utils import controller_utils
22
21
  from sky.utils import dag_utils
23
- from sky.utils import env_options
24
22
  from sky.utils import resources_utils
25
23
  from sky.utils import rich_utils
26
- from sky.utils import subprocess_utils
27
24
  from sky.utils import timeline
28
25
  from sky.utils import ux_utils
29
26
 
@@ -293,11 +290,17 @@ def _execute(
293
290
  logger.info('Dryrun finished.')
294
291
  return None, None
295
292
 
296
- if Stage.SYNC_WORKDIR in stages and not dryrun:
297
- if task.workdir is not None:
298
- backend.sync_workdir(handle, task.workdir)
293
+ do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
294
+ task.workdir is not None)
295
+ do_file_mounts = (Stage.SYNC_FILE_MOUNTS in stages and not dryrun and
296
+ task.file_mounts is not None)
297
+ if do_workdir or do_file_mounts:
298
+ logger.info(ux_utils.starting_message('Mounting files.'))
299
299
 
300
- if Stage.SYNC_FILE_MOUNTS in stages and not dryrun:
300
+ if do_workdir:
301
+ backend.sync_workdir(handle, task.workdir)
302
+
303
+ if do_file_mounts:
301
304
  backend.sync_file_mounts(handle, task.file_mounts,
302
305
  task.storage_mounts)
303
306
 
@@ -330,23 +333,6 @@ def _execute(
330
333
  backend.teardown_ephemeral_storage(task)
331
334
  backend.teardown(handle, terminate=True)
332
335
  finally:
333
- controller = controller_utils.Controllers.from_name(cluster_name)
334
- if controller is None and not _is_launched_by_sky_serve_controller:
335
- # UX: print live clusters to make users aware (to save costs).
336
- #
337
- # Don't print if this job is launched by the jobs controller,
338
- # because managed jobs are serverless, there can be many of them,
339
- # and users tend to continuously monitor managed jobs using `sky
340
- # job queue`. Also don't print if this job is a skyserve controller
341
- # job or launched by a skyserve controller job, because the
342
- # redirect for this subprocess.run won't success and it will
343
- # pollute the controller logs.
344
- #
345
- # Disable the usage collection for this status command.
346
- env = dict(os.environ,
347
- **{env_options.Options.DISABLE_LOGGING.value: '1'})
348
- subprocess_utils.run(
349
- 'sky status --no-show-managed-jobs --no-show-services', env=env)
350
336
  print()
351
337
  print('\x1b[?25h', end='') # Show cursor.
352
338
  return job_id, handle
sky/jobs/__init__.py CHANGED
@@ -8,6 +8,7 @@ from sky.jobs.constants import JOBS_TASK_YAML_PREFIX
8
8
  from sky.jobs.core import cancel
9
9
  from sky.jobs.core import launch
10
10
  from sky.jobs.core import queue
11
+ from sky.jobs.core import queue_from_kubernetes_pod
11
12
  from sky.jobs.core import tail_logs
12
13
  from sky.jobs.recovery_strategy import DEFAULT_RECOVERY_STRATEGY
13
14
  from sky.jobs.recovery_strategy import RECOVERY_STRATEGIES
@@ -34,6 +35,7 @@ __all__ = [
34
35
  'cancel',
35
36
  'launch',
36
37
  'queue',
38
+ 'queue_from_kubernetes_pod',
37
39
  'tail_logs',
38
40
  # utils
39
41
  'ManagedJobCodeGen',
sky/jobs/core.py CHANGED
@@ -9,6 +9,7 @@ import colorama
9
9
  import sky
10
10
  from sky import backends
11
11
  from sky import exceptions
12
+ from sky import provision as provision_lib
12
13
  from sky import sky_logging
13
14
  from sky import status_lib
14
15
  from sky import task as task_lib
@@ -16,6 +17,7 @@ from sky.backends import backend_utils
16
17
  from sky.clouds.service_catalog import common as service_catalog_common
17
18
  from sky.jobs import constants as managed_job_constants
18
19
  from sky.jobs import utils as managed_job_utils
20
+ from sky.provision import common
19
21
  from sky.skylet import constants as skylet_constants
20
22
  from sky.usage import usage_lib
21
23
  from sky.utils import admin_policy_utils
@@ -77,9 +79,11 @@ def launch(
77
79
 
78
80
  dag_utils.fill_default_config_in_dag_for_job_launch(dag)
79
81
 
80
- for task_ in dag.tasks:
81
- controller_utils.maybe_translate_local_file_mounts_and_sync_up(
82
- task_, path='jobs')
82
+ with rich_utils.safe_status(
83
+ ux_utils.spinner_message('Initializing managed job')):
84
+ for task_ in dag.tasks:
85
+ controller_utils.maybe_translate_local_file_mounts_and_sync_up(
86
+ task_, path='jobs')
83
87
 
84
88
  with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
85
89
  mode='w') as f:
@@ -127,7 +131,6 @@ def launch(
127
131
  f'{colorama.Fore.YELLOW}'
128
132
  f'Launching managed job {dag.name!r} from jobs controller...'
129
133
  f'{colorama.Style.RESET_ALL}')
130
- sky_logging.print('Launching jobs controller...')
131
134
  sky.launch(task=controller_task,
132
135
  stream_logs=stream_logs,
133
136
  cluster_name=controller_name,
@@ -138,6 +141,82 @@ def launch(
138
141
  _disable_controller_check=True)
139
142
 
140
143
 
144
+ def queue_from_kubernetes_pod(
145
+ pod_name: str,
146
+ context: Optional[str] = None,
147
+ skip_finished: bool = False) -> List[Dict[str, Any]]:
148
+ """Gets the jobs queue from a specific controller pod.
149
+
150
+ Args:
151
+ pod_name (str): The name of the controller pod to query for jobs.
152
+ context (Optional[str]): The Kubernetes context to use. If None, the
153
+ current context is used.
154
+ skip_finished (bool): If True, does not return finished jobs.
155
+
156
+ Returns:
157
+ [
158
+ {
159
+ 'job_id': int,
160
+ 'job_name': str,
161
+ 'resources': str,
162
+ 'submitted_at': (float) timestamp of submission,
163
+ 'end_at': (float) timestamp of end,
164
+ 'duration': (float) duration in seconds,
165
+ 'recovery_count': (int) Number of retries,
166
+ 'status': (sky.jobs.ManagedJobStatus) of the job,
167
+ 'cluster_resources': (str) resources of the cluster,
168
+ 'region': (str) region of the cluster,
169
+ }
170
+ ]
171
+
172
+ Raises:
173
+ RuntimeError: If there's an error fetching the managed jobs.
174
+ """
175
+ # Create dummy cluster info to get the command runner.
176
+ provider_config = {'context': context}
177
+ instances = {
178
+ pod_name: [
179
+ common.InstanceInfo(instance_id=pod_name,
180
+ internal_ip='',
181
+ external_ip='',
182
+ tags={})
183
+ ]
184
+ } # Internal IP is not required for Kubernetes
185
+ cluster_info = common.ClusterInfo(provider_name='kubernetes',
186
+ head_instance_id=pod_name,
187
+ provider_config=provider_config,
188
+ instances=instances)
189
+ managed_jobs_runner = provision_lib.get_command_runners(
190
+ 'kubernetes', cluster_info)[0]
191
+
192
+ code = managed_job_utils.ManagedJobCodeGen.get_job_table()
193
+ returncode, job_table_payload, stderr = managed_jobs_runner.run(
194
+ code,
195
+ require_outputs=True,
196
+ separate_stderr=True,
197
+ stream_logs=False,
198
+ )
199
+ try:
200
+ subprocess_utils.handle_returncode(returncode,
201
+ code,
202
+ 'Failed to fetch managed jobs',
203
+ job_table_payload + stderr,
204
+ stream_logs=False)
205
+ except exceptions.CommandError as e:
206
+ raise RuntimeError(str(e)) from e
207
+
208
+ jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
209
+ if skip_finished:
210
+ # Filter out the finished jobs. If a multi-task job is partially
211
+ # finished, we will include all its tasks.
212
+ non_finished_tasks = list(
213
+ filter(lambda job: not job['status'].is_terminal(), jobs))
214
+ non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
215
+ jobs = list(
216
+ filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
217
+ return jobs
218
+
219
+
141
220
  @usage_lib.entrypoint
142
221
  def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
143
222
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -184,11 +263,12 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
184
263
  f'{colorama.Style.RESET_ALL}')
185
264
 
186
265
  rich_utils.force_update_status(
187
- '[cyan] Checking managed jobs - restarting '
188
- 'controller[/]')
266
+ ux_utils.spinner_message('Checking managed jobs - restarting '
267
+ 'controller'))
189
268
  handle = sky.start(jobs_controller_type.value.cluster_name)
190
269
  controller_status = status_lib.ClusterStatus.UP
191
- rich_utils.force_update_status('[cyan] Checking managed jobs[/]')
270
+ rich_utils.force_update_status(
271
+ ux_utils.spinner_message('Checking managed jobs'))
192
272
 
193
273
  assert handle is not None, (controller_status, refresh)
194
274
 
sky/jobs/utils.py CHANGED
@@ -34,6 +34,7 @@ from sky.utils import common_utils
34
34
  from sky.utils import log_utils
35
35
  from sky.utils import rich_utils
36
36
  from sky.utils import subprocess_utils
37
+ from sky.utils import ux_utils
37
38
 
38
39
  if typing.TYPE_CHECKING:
39
40
  import sky
@@ -57,11 +58,13 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
57
58
 
58
59
  _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
59
60
 
60
- _JOB_WAITING_STATUS_MESSAGE = ('[bold cyan]Waiting for the task to start'
61
- '{status_str}.[/] It may take a few minutes.')
61
+ _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
62
+ 'Waiting for task to start[/]'
63
+ '{status_str}. It may take a few minutes.\n'
64
+ ' [dim]View controller logs: sky jobs logs --controller {job_id}')
62
65
  _JOB_CANCELLED_MESSAGE = (
63
- '[bold cyan]Waiting for the task status to be updated.'
64
- '[/] It may take a minute.')
66
+ ux_utils.spinner_message('Waiting for task status to be updated.') +
67
+ ' It may take a minute.')
65
68
 
66
69
  # The maximum time to wait for the managed job status to transition to terminal
67
70
  # state, after the job finished. This is a safeguard to avoid the case where
@@ -290,8 +293,8 @@ def cancel_job_by_name(job_name: str) -> str:
290
293
  def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
291
294
  """Stream logs by job id."""
292
295
  controller_status = job_lib.get_status(job_id)
293
- status_msg = ('[bold cyan]Waiting for controller process to be RUNNING'
294
- '{status_str}[/].')
296
+ status_msg = ux_utils.spinner_message(
297
+ 'Waiting for controller process to be RUNNING') + '{status_str}'
295
298
  status_display = rich_utils.safe_status(status_msg.format(status_str=''))
296
299
  num_tasks = managed_job_state.get_num_tasks(job_id)
297
300
 
@@ -310,7 +313,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
310
313
  time.sleep(_LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS)
311
314
  controller_status = job_lib.get_status(job_id)
312
315
 
313
- msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='')
316
+ msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
314
317
  status_display.update(msg)
315
318
  prev_msg = msg
316
319
  managed_job_status = managed_job_state.get_status(job_id)
@@ -356,7 +359,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
356
359
  logger.debug(
357
360
  f'INFO: The log is not ready yet{status_str}. '
358
361
  f'Waiting for {JOB_STATUS_CHECK_GAP_SECONDS} seconds.')
359
- msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str=status_str)
362
+ msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str=status_str,
363
+ job_id=job_id)
360
364
  if msg != prev_msg:
361
365
  status_display.update(msg)
362
366
  prev_msg = msg
@@ -444,8 +448,9 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
444
448
  managed_job_status = managed_job_state.get_status(job_id)
445
449
  assert managed_job_status is not None, job_id
446
450
 
447
- logger.info(f'Logs finished for job {job_id} '
448
- f'(status: {managed_job_status.value}).')
451
+ logger.info(
452
+ ux_utils.finishing_message(f'Managed job finished: {job_id} '
453
+ f'(status: {managed_job_status.value}).'))
449
454
  return ''
450
455
 
451
456
 
@@ -599,11 +604,20 @@ def format_job_table(
599
604
  a list of "rows" (each of which is a list of str).
600
605
  """
601
606
  jobs = collections.defaultdict(list)
607
+ # Check if the tasks have user information.
608
+ tasks_have_user = any([task.get('user') for task in tasks])
609
+ if max_jobs and tasks_have_user:
610
+ raise ValueError('max_jobs is not supported when tasks have user info.')
611
+
612
+ def get_hash(task):
613
+ if tasks_have_user:
614
+ return (task['user'], task['job_id'])
615
+ return task['job_id']
616
+
602
617
  for task in tasks:
603
618
  # The tasks within the same job_id are already sorted
604
619
  # by the task_id.
605
- jobs[task['job_id']].append(task)
606
- jobs = dict(jobs)
620
+ jobs[get_hash(task)].append(task)
607
621
 
608
622
  status_counts: Dict[str, int] = collections.defaultdict(int)
609
623
  for job_tasks in jobs.values():
@@ -611,17 +625,14 @@ def format_job_table(
611
625
  if not managed_job_status.is_terminal():
612
626
  status_counts[managed_job_status.value] += 1
613
627
 
614
- if max_jobs is not None:
615
- job_ids = sorted(jobs.keys(), reverse=True)
616
- job_ids = job_ids[:max_jobs]
617
- jobs = {job_id: jobs[job_id] for job_id in job_ids}
618
-
619
628
  columns = [
620
629
  'ID', 'TASK', 'NAME', 'RESOURCES', 'SUBMITTED', 'TOT. DURATION',
621
630
  'JOB DURATION', '#RECOVERIES', 'STATUS'
622
631
  ]
623
632
  if show_all:
624
633
  columns += ['STARTED', 'CLUSTER', 'REGION', 'FAILURE']
634
+ if tasks_have_user:
635
+ columns.insert(0, 'USER')
625
636
  job_table = log_utils.create_table(columns)
626
637
 
627
638
  status_counts: Dict[str, int] = collections.defaultdict(int)
@@ -636,9 +647,9 @@ def format_job_table(
636
647
  for task in all_tasks:
637
648
  # The tasks within the same job_id are already sorted
638
649
  # by the task_id.
639
- jobs[task['job_id']].append(task)
650
+ jobs[get_hash(task)].append(task)
640
651
 
641
- for job_id, job_tasks in jobs.items():
652
+ for job_hash, job_tasks in jobs.items():
642
653
  if len(job_tasks) > 1:
643
654
  # Aggregate the tasks into a new row in the table.
644
655
  job_name = job_tasks[0]['job_name']
@@ -674,6 +685,7 @@ def format_job_table(
674
685
  if not managed_job_status.is_terminal():
675
686
  status_str += f' (task: {current_task_id})'
676
687
 
688
+ job_id = job_hash[1] if tasks_have_user else job_hash
677
689
  job_values = [
678
690
  job_id,
679
691
  '',
@@ -692,6 +704,8 @@ def format_job_table(
692
704
  '-',
693
705
  failure_reason if failure_reason is not None else '-',
694
706
  ])
707
+ if tasks_have_user:
708
+ job_values.insert(0, job_tasks[0].get('user', '-'))
695
709
  job_table.add_row(job_values)
696
710
 
697
711
  for task in job_tasks:
@@ -724,6 +738,8 @@ def format_job_table(
724
738
  task['failure_reason']
725
739
  if task['failure_reason'] is not None else '-',
726
740
  ])
741
+ if tasks_have_user:
742
+ values.insert(0, task.get('user', '-'))
727
743
  job_table.add_row(values)
728
744
 
729
745
  if len(job_tasks) > 1:
sky/optimizer.py CHANGED
@@ -123,22 +123,23 @@ class Optimizer:
123
123
  for a task.
124
124
  exceptions.NoCloudAccessError: if no public clouds are enabled.
125
125
  """
126
- _check_specified_clouds(dag)
127
-
128
- # This function is effectful: mutates every node in 'dag' by setting
129
- # node.best_resources if it is None.
130
- Optimizer._add_dummy_source_sink_nodes(dag)
131
- try:
132
- unused_best_plan = Optimizer._optimize_dag(
133
- dag=dag,
134
- minimize_cost=minimize == OptimizeTarget.COST,
135
- blocked_resources=blocked_resources,
136
- quiet=quiet)
137
- finally:
138
- # Make sure to remove the dummy source/sink nodes, even if the
139
- # optimization fails.
140
- Optimizer._remove_dummy_source_sink_nodes(dag)
141
- return dag
126
+ with rich_utils.safe_status(ux_utils.spinner_message('Optimizing')):
127
+ _check_specified_clouds(dag)
128
+
129
+ # This function is effectful: mutates every node in 'dag' by setting
130
+ # node.best_resources if it is None.
131
+ Optimizer._add_dummy_source_sink_nodes(dag)
132
+ try:
133
+ unused_best_plan = Optimizer._optimize_dag(
134
+ dag=dag,
135
+ minimize_cost=minimize == OptimizeTarget.COST,
136
+ blocked_resources=blocked_resources,
137
+ quiet=quiet)
138
+ finally:
139
+ # Make sure to remove the dummy source/sink nodes, even if the
140
+ # optimization fails.
141
+ Optimizer._remove_dummy_source_sink_nodes(dag)
142
+ return dag
142
143
 
143
144
  @staticmethod
144
145
  def _add_dummy_source_sink_nodes(dag: 'dag_lib.Dag'):
@@ -259,6 +260,9 @@ class Optimizer:
259
260
  launchable_resources: Dict[resources_lib.Resources,
260
261
  List[resources_lib.Resources]]
261
262
  ) -> Dict[resources_lib.Resources, int]:
263
+ if not resources_utils.need_to_query_reservations():
264
+ return {}
265
+
262
266
  num_available_reserved_nodes_per_resource = {}
263
267
 
264
268
  def get_reservations_available_resources(
@@ -269,7 +273,7 @@ class Optimizer:
269
273
  launchable_resources_list: List[resources_lib.Resources] = sum(
270
274
  launchable_resources.values(), [])
271
275
  with rich_utils.safe_status(
272
- '[cyan]Checking reserved resources...[/]'):
276
+ ux_utils.spinner_message('Checking reserved resources')):
273
277
  subprocess_utils.run_in_parallel(
274
278
  get_reservations_available_resources,
275
279
  launchable_resources_list)
@@ -337,8 +341,8 @@ class Optimizer:
337
341
  if minimize_cost:
338
342
  cost_per_node = resources.get_cost(estimated_runtime)
339
343
  num_available_reserved_nodes = (
340
- num_available_reserved_nodes_per_resource[resources]
341
- )
344
+ num_available_reserved_nodes_per_resource.get(
345
+ resources, 0))
342
346
 
343
347
  # We consider the cost of the unused reservation
344
348
  # resources to be 0 since we are already paying for
@@ -384,10 +388,14 @@ class Optimizer:
384
388
  fuzzy_candidates_str = (
385
389
  f'\nTry one of these offered accelerators: {cyan}'
386
390
  f'{fuzzy_candidates}{reset}')
391
+ node_resources_reprs = ', '.join(f'{node.num_nodes}x ' +
392
+ r.repr_with_region_zone
393
+ for r in node.resources)
387
394
  error_msg = (
388
395
  f'{source_hint.capitalize()} does not contain any '
389
- f'instances satisfying the request:\n{node}.'
390
- f'\n\nTo fix: relax or change the '
396
+ f'instances satisfying the request: '
397
+ f'{node_resources_reprs}.'
398
+ f'\nTo fix: relax or change the '
391
399
  f'resource requirements.{fuzzy_candidates_str}\n\n'
392
400
  f'Hint: {bold}sky show-gpus{reset} '
393
401
  'to list available accelerators.\n'
@@ -716,7 +724,6 @@ class Optimizer:
716
724
  node_to_cost_map: _TaskToCostMap,
717
725
  minimize_cost: bool,
718
726
  ):
719
- logger.info('== Optimizer ==')
720
727
  ordered_node_to_cost_map = collections.OrderedDict()
721
728
  ordered_best_plan = collections.OrderedDict()
722
729
  for node in topo_order:
@@ -738,15 +745,18 @@ class Optimizer:
738
745
  node.get_inputs() is None and node.get_outputs() is None):
739
746
  print_hourly_cost = True
740
747
 
741
- if print_hourly_cost:
742
- logger.info(f'{colorama.Style.BRIGHT}Estimated cost: '
743
- f'{colorama.Style.RESET_ALL}${total_cost:.1f} / hour\n')
744
- else:
745
- logger.info(f'{colorama.Style.BRIGHT}Estimated total runtime: '
746
- f'{colorama.Style.RESET_ALL}{total_time / 3600:.1f} '
747
- 'hours\n'
748
- f'{colorama.Style.BRIGHT}Estimated total cost: '
749
- f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
748
+ if not env_options.Options.MINIMIZE_LOGGING.get():
749
+ if print_hourly_cost:
750
+ logger.info(
751
+ f'{colorama.Style.BRIGHT}Estimated cost: '
752
+ f'{colorama.Style.RESET_ALL}${total_cost:.1f} / hour\n')
753
+ else:
754
+ logger.info(
755
+ f'{colorama.Style.BRIGHT}Estimated total runtime: '
756
+ f'{colorama.Style.RESET_ALL}{total_time / 3600:.1f} '
757
+ 'hours\n'
758
+ f'{colorama.Style.BRIGHT}Estimated total cost: '
759
+ f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
750
760
 
751
761
  def _get_resources_element_list(
752
762
  resources: 'resources_lib.Resources') -> List[str]:
@@ -845,7 +855,7 @@ class Optimizer:
845
855
  best_plan_table = _create_table(['TASK', '#NODES'] +
846
856
  resource_fields)
847
857
  best_plan_table.add_rows(best_plan_rows)
848
- logger.info(f'{best_plan_table}\n')
858
+ logger.info(f'{best_plan_table}')
849
859
 
850
860
  # Print the egress plan if any data egress is scheduled.
851
861
  Optimizer._print_egress_plan(graph, best_plan, minimize_cost)
@@ -864,6 +874,10 @@ class Optimizer:
864
874
  }
865
875
  task_str = (f'for task {task.name!r} ' if num_tasks > 1 else '')
866
876
  plural = 's' if task.num_nodes > 1 else ''
877
+ if num_tasks > 1:
878
+ # Add a new line for better readability, when there are multiple
879
+ # tasks.
880
+ logger.info('')
867
881
  logger.info(
868
882
  f'{colorama.Style.BRIGHT}Considered resources {task_str}'
869
883
  f'({task.num_nodes} node{plural}):'
@@ -934,7 +948,7 @@ class Optimizer:
934
948
 
935
949
  table = _create_table(field_names)
936
950
  table.add_rows(rows)
937
- logger.info(f'{table}\n')
951
+ logger.info(f'{table}')
938
952
 
939
953
  # Warning message for using disk_tier=ultra
940
954
  # TODO(yi): Consider price of disks in optimizer and
@@ -965,10 +979,10 @@ class Optimizer:
965
979
  f'Multiple {cloud} instances satisfy '
966
980
  f'{acc_name}:{int(acc_count)}. '
967
981
  f'The cheapest {candidate_list[0]!r} is considered '
968
- f'among:\n{instance_list}.\n')
982
+ f'among:\n{instance_list}.')
969
983
  if is_multi_instances:
970
984
  logger.info(
971
- f'To list more details, run \'sky show-gpus {acc_name}\'.')
985
+ f'To list more details, run: sky show-gpus {acc_name}\n')
972
986
 
973
987
  @staticmethod
974
988
  def _optimize_dag(
@@ -1101,8 +1115,7 @@ class Optimizer:
1101
1115
  Optimizer.print_optimized_plan(graph, topo_order, best_plan,
1102
1116
  total_time, total_cost,
1103
1117
  node_to_cost_map, minimize_cost)
1104
- if not env_options.Options.MINIMIZE_LOGGING.get():
1105
- Optimizer._print_candidates(local_node_to_candidate_map)
1118
+ Optimizer._print_candidates(local_node_to_candidate_map)
1106
1119
  return best_plan
1107
1120
 
1108
1121
 
@@ -16,10 +16,12 @@ from typing import Any, Dict, List, Optional, Set, Tuple
16
16
 
17
17
  import colorama
18
18
 
19
+ from sky import exceptions
19
20
  from sky import sky_logging
20
21
  from sky.adaptors import aws
21
22
  from sky.provision import common
22
23
  from sky.provision.aws import utils
24
+ from sky.utils import common_utils
23
25
 
24
26
  logger = sky_logging.init_logger(__name__)
25
27
 
@@ -535,12 +537,19 @@ def _get_or_create_vpc_security_group(ec2, vpc_id: str,
535
537
  if vpc_id in vpc_to_existing_sg:
536
538
  return vpc_to_existing_sg[vpc_id]
537
539
 
538
- # create a new security group
539
- ec2.meta.client.create_security_group(
540
- Description='Auto-created security group for Ray workers',
541
- GroupName=expected_sg_name,
542
- VpcId=vpc_id,
543
- )
540
+ try:
541
+ # create a new security group
542
+ ec2.meta.client.create_security_group(
543
+ Description='Auto-created security group for Ray workers',
544
+ GroupName=expected_sg_name,
545
+ VpcId=vpc_id,
546
+ )
547
+ except ec2.meta.client.exceptions.ClientError as e:
548
+ message = ('Failed to create security group. Error: '
549
+ f'{common_utils.format_exception(e)}')
550
+ logger.warning(message)
551
+ raise exceptions.NoClusterLaunchedError(message) from e
552
+
544
553
  security_group = _get_security_groups_from_vpc_ids(ec2, [vpc_id],
545
554
  [expected_sg_name])
546
555