skypilot-nightly 1.0.0.dev20241012__py3-none-any.whl → 1.0.0.dev20241014__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/azure.py +3 -1
  3. sky/adaptors/common.py +6 -2
  4. sky/backends/backend.py +9 -4
  5. sky/backends/backend_utils.py +13 -16
  6. sky/backends/cloud_vm_ray_backend.py +207 -161
  7. sky/backends/local_docker_backend.py +3 -1
  8. sky/benchmark/benchmark_utils.py +5 -4
  9. sky/cli.py +36 -28
  10. sky/clouds/oci.py +17 -2
  11. sky/clouds/service_catalog/aws_catalog.py +6 -7
  12. sky/clouds/service_catalog/common.py +4 -3
  13. sky/clouds/service_catalog/cudo_catalog.py +11 -1
  14. sky/core.py +4 -2
  15. sky/data/storage.py +44 -32
  16. sky/data/storage_utils.py +8 -4
  17. sky/exceptions.py +5 -0
  18. sky/execution.py +10 -24
  19. sky/jobs/core.py +9 -7
  20. sky/jobs/utils.py +15 -10
  21. sky/optimizer.py +50 -37
  22. sky/provision/aws/config.py +15 -6
  23. sky/provision/azure/config.py +14 -3
  24. sky/provision/azure/instance.py +15 -9
  25. sky/provision/kubernetes/instance.py +3 -1
  26. sky/provision/provisioner.py +63 -74
  27. sky/serve/core.py +42 -40
  28. sky/sky_logging.py +9 -5
  29. sky/skylet/job_lib.py +15 -0
  30. sky/skylet/log_lib.py +5 -4
  31. sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
  32. sky/utils/command_runner.py +11 -11
  33. sky/utils/common_utils.py +2 -5
  34. sky/utils/controller_utils.py +78 -29
  35. sky/utils/env_options.py +22 -7
  36. sky/utils/log_utils.py +39 -24
  37. sky/utils/resources_utils.py +23 -0
  38. sky/utils/rich_utils.py +55 -5
  39. sky/utils/ux_utils.py +63 -4
  40. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/METADATA +1 -1
  41. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/RECORD +45 -45
  42. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/LICENSE +0 -0
  43. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/WHEEL +0 -0
  44. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/entry_points.txt +0 -0
  45. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/top_level.txt +0 -0
sky/jobs/core.py CHANGED
@@ -79,9 +79,11 @@ def launch(
 
     dag_utils.fill_default_config_in_dag_for_job_launch(dag)
 
-    for task_ in dag.tasks:
-        controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-            task_, path='jobs')
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Initializing managed job')):
+        for task_ in dag.tasks:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task_, path='jobs')
 
     with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
                                      mode='w') as f:
@@ -129,7 +131,6 @@ def launch(
             f'{colorama.Fore.YELLOW}'
             f'Launching managed job {dag.name!r} from jobs controller...'
             f'{colorama.Style.RESET_ALL}')
-        sky_logging.print('Launching jobs controller...')
         sky.launch(task=controller_task,
                    stream_logs=stream_logs,
                    cluster_name=controller_name,
@@ -262,11 +263,12 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
                 f'{colorama.Style.RESET_ALL}')
 
             rich_utils.force_update_status(
-                '[cyan] Checking managed jobs - restarting '
-                'controller[/]')
+                ux_utils.spinner_message('Checking managed jobs - restarting '
+                                         'controller'))
             handle = sky.start(jobs_controller_type.value.cluster_name)
             controller_status = status_lib.ClusterStatus.UP
-            rich_utils.force_update_status('[cyan] Checking managed jobs[/]')
+            rich_utils.force_update_status(
+                ux_utils.spinner_message('Checking managed jobs'))
 
     assert handle is not None, (controller_status, refresh)
 
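Note: the first hunk above moves the per-task file-mount translation under a single spinner instead of plain log lines. A minimal sketch of that pattern, assuming only the rich_utils.safe_status context manager and ux_utils.spinner_message helper that appear in the hunk (prepare_tasks and translate_and_sync below are hypothetical stand-ins):

    from sky.utils import rich_utils
    from sky.utils import ux_utils

    def prepare_tasks(tasks):  # hypothetical wrapper, not part of this diff
        # One spinner covers the whole preparation phase; per-task progress
        # no longer prints separate log lines.
        with rich_utils.safe_status(
                ux_utils.spinner_message('Initializing managed job')):
            for task in tasks:
                translate_and_sync(task)  # hypothetical stand-in for the real call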
 
sky/jobs/utils.py CHANGED
@@ -34,6 +34,7 @@ from sky.utils import common_utils
 from sky.utils import log_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
+from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import sky
@@ -57,11 +58,13 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
 
 _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
 
-_JOB_WAITING_STATUS_MESSAGE = ('[bold cyan]Waiting for the task to start'
-                               '{status_str}.[/] It may take a few minutes.')
+_JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
+    'Waiting for task to start[/]'
+    '{status_str}. It may take a few minutes.\n'
+    ' [dim]View controller logs: sky jobs logs --controller {job_id}')
 _JOB_CANCELLED_MESSAGE = (
-    '[bold cyan]Waiting for the task status to be updated.'
-    '[/] It may take a minute.')
+    ux_utils.spinner_message('Waiting for task status to be updated.') +
+    ' It may take a minute.')
 
 # The maximum time to wait for the managed job status to transition to terminal
 # state, after the job finished. This is a safeguard to avoid the case where
@@ -290,8 +293,8 @@ def cancel_job_by_name(job_name: str) -> str:
 def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
     """Stream logs by job id."""
     controller_status = job_lib.get_status(job_id)
-    status_msg = ('[bold cyan]Waiting for controller process to be RUNNING'
-                  '{status_str}[/].')
+    status_msg = ux_utils.spinner_message(
+        'Waiting for controller process to be RUNNING') + '{status_str}'
     status_display = rich_utils.safe_status(status_msg.format(status_str=''))
     num_tasks = managed_job_state.get_num_tasks(job_id)
 
@@ -310,7 +313,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
             time.sleep(_LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS)
             controller_status = job_lib.get_status(job_id)
 
-        msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='')
+        msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
         status_display.update(msg)
         prev_msg = msg
         managed_job_status = managed_job_state.get_status(job_id)
@@ -356,7 +359,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                 logger.debug(
                     f'INFO: The log is not ready yet{status_str}. '
                     f'Waiting for {JOB_STATUS_CHECK_GAP_SECONDS} seconds.')
-                msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str=status_str)
+                msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str=status_str,
+                                                         job_id=job_id)
                 if msg != prev_msg:
                     status_display.update(msg)
                     prev_msg = msg
@@ -444,8 +448,9 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
         managed_job_status = managed_job_state.get_status(job_id)
         assert managed_job_status is not None, job_id
 
-    logger.info(f'Logs finished for job {job_id} '
-                f'(status: {managed_job_status.value}).')
+    logger.info(
+        ux_utils.finishing_message(f'Managed job finished: {job_id} '
+                                   f'(status: {managed_job_status.value}).'))
     return ''
 
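Note: _JOB_WAITING_STATUS_MESSAGE now carries two placeholders, so every .format() call in the hunks above supplies both. A short sketch of the implied call shape (values are illustrative):

    # Both fields must be passed; str.format raises KeyError if the template
    # references {job_id} and the call omits it.
    msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
    status_display.update(msg)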
 
sky/optimizer.py CHANGED
@@ -123,22 +123,23 @@ class Optimizer:
             for a task.
             exceptions.NoCloudAccessError: if no public clouds are enabled.
         """
-        _check_specified_clouds(dag)
-
-        # This function is effectful: mutates every node in 'dag' by setting
-        # node.best_resources if it is None.
-        Optimizer._add_dummy_source_sink_nodes(dag)
-        try:
-            unused_best_plan = Optimizer._optimize_dag(
-                dag=dag,
-                minimize_cost=minimize == OptimizeTarget.COST,
-                blocked_resources=blocked_resources,
-                quiet=quiet)
-        finally:
-            # Make sure to remove the dummy source/sink nodes, even if the
-            # optimization fails.
-            Optimizer._remove_dummy_source_sink_nodes(dag)
-        return dag
+        with rich_utils.safe_status(ux_utils.spinner_message('Optimizing')):
+            _check_specified_clouds(dag)
+
+            # This function is effectful: mutates every node in 'dag' by setting
+            # node.best_resources if it is None.
+            Optimizer._add_dummy_source_sink_nodes(dag)
+            try:
+                unused_best_plan = Optimizer._optimize_dag(
+                    dag=dag,
+                    minimize_cost=minimize == OptimizeTarget.COST,
+                    blocked_resources=blocked_resources,
+                    quiet=quiet)
+            finally:
+                # Make sure to remove the dummy source/sink nodes, even if the
+                # optimization fails.
+                Optimizer._remove_dummy_source_sink_nodes(dag)
+            return dag
 
     @staticmethod
     def _add_dummy_source_sink_nodes(dag: 'dag_lib.Dag'):
@@ -259,6 +260,9 @@ class Optimizer:
         launchable_resources: Dict[resources_lib.Resources,
                                    List[resources_lib.Resources]]
     ) -> Dict[resources_lib.Resources, int]:
+        if not resources_utils.need_to_query_reservations():
+            return {}
+
         num_available_reserved_nodes_per_resource = {}
 
         def get_reservations_available_resources(
@@ -269,7 +273,7 @@ class Optimizer:
         launchable_resources_list: List[resources_lib.Resources] = sum(
             launchable_resources.values(), [])
         with rich_utils.safe_status(
-                '[cyan]Checking reserved resources...[/]'):
+                ux_utils.spinner_message('Checking reserved resources')):
             subprocess_utils.run_in_parallel(
                 get_reservations_available_resources,
                 launchable_resources_list)
@@ -337,8 +341,8 @@ class Optimizer:
                 if minimize_cost:
                     cost_per_node = resources.get_cost(estimated_runtime)
                     num_available_reserved_nodes = (
-                        num_available_reserved_nodes_per_resource[resources]
-                    )
+                        num_available_reserved_nodes_per_resource.get(
+                            resources, 0))
 
                     # We consider the cost of the unused reservation
                     # resources to be 0 since we are already paying for
@@ -384,10 +388,14 @@ class Optimizer:
                 fuzzy_candidates_str = (
                     f'\nTry one of these offered accelerators: {cyan}'
                     f'{fuzzy_candidates}{reset}')
+                node_resources_reprs = ', '.join(f'{node.num_nodes}x ' +
+                                                 r.repr_with_region_zone
+                                                 for r in node.resources)
                 error_msg = (
                     f'{source_hint.capitalize()} does not contain any '
-                    f'instances satisfying the request:\n{node}.'
-                    f'\n\nTo fix: relax or change the '
+                    f'instances satisfying the request: '
+                    f'{node_resources_reprs}.'
+                    f'\nTo fix: relax or change the '
                     f'resource requirements.{fuzzy_candidates_str}\n\n'
                     f'Hint: {bold}sky show-gpus{reset} '
                     'to list available accelerators.\n'
@@ -716,7 +724,6 @@ class Optimizer:
             node_to_cost_map: _TaskToCostMap,
             minimize_cost: bool,
     ):
-        logger.info('== Optimizer ==')
         ordered_node_to_cost_map = collections.OrderedDict()
         ordered_best_plan = collections.OrderedDict()
         for node in topo_order:
@@ -738,15 +745,18 @@ class Optimizer:
                     node.get_inputs() is None and node.get_outputs() is None):
                 print_hourly_cost = True
 
-        if print_hourly_cost:
-            logger.info(f'{colorama.Style.BRIGHT}Estimated cost: '
-                        f'{colorama.Style.RESET_ALL}${total_cost:.1f} / hour\n')
-        else:
-            logger.info(f'{colorama.Style.BRIGHT}Estimated total runtime: '
-                        f'{colorama.Style.RESET_ALL}{total_time / 3600:.1f} '
-                        'hours\n'
-                        f'{colorama.Style.BRIGHT}Estimated total cost: '
-                        f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
+        if not env_options.Options.MINIMIZE_LOGGING.get():
+            if print_hourly_cost:
+                logger.info(
+                    f'{colorama.Style.BRIGHT}Estimated cost: '
+                    f'{colorama.Style.RESET_ALL}${total_cost:.1f} / hour\n')
+            else:
+                logger.info(
+                    f'{colorama.Style.BRIGHT}Estimated total runtime: '
+                    f'{colorama.Style.RESET_ALL}{total_time / 3600:.1f} '
+                    'hours\n'
+                    f'{colorama.Style.BRIGHT}Estimated total cost: '
+                    f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
 
     def _get_resources_element_list(
             resources: 'resources_lib.Resources') -> List[str]:
@@ -845,7 +855,7 @@ class Optimizer:
             best_plan_table = _create_table(['TASK', '#NODES'] +
                                             resource_fields)
             best_plan_table.add_rows(best_plan_rows)
-            logger.info(f'{best_plan_table}\n')
+            logger.info(f'{best_plan_table}')
 
         # Print the egress plan if any data egress is scheduled.
         Optimizer._print_egress_plan(graph, best_plan, minimize_cost)
@@ -864,6 +874,10 @@ class Optimizer:
             }
             task_str = (f'for task {task.name!r} ' if num_tasks > 1 else '')
             plural = 's' if task.num_nodes > 1 else ''
+            if num_tasks > 1:
+                # Add a new line for better readability, when there are multiple
+                # tasks.
+                logger.info('')
             logger.info(
                 f'{colorama.Style.BRIGHT}Considered resources {task_str}'
                 f'({task.num_nodes} node{plural}):'
@@ -934,7 +948,7 @@ class Optimizer:
 
             table = _create_table(field_names)
             table.add_rows(rows)
-            logger.info(f'{table}\n')
+            logger.info(f'{table}')
 
             # Warning message for using disk_tier=ultra
             # TODO(yi): Consider price of disks in optimizer and
@@ -965,10 +979,10 @@ class Optimizer:
                     f'Multiple {cloud} instances satisfy '
                     f'{acc_name}:{int(acc_count)}. '
                     f'The cheapest {candidate_list[0]!r} is considered '
-                    f'among:\n{instance_list}.\n')
+                    f'among:\n{instance_list}.')
             if is_multi_instances:
                 logger.info(
-                    f'To list more details, run \'sky show-gpus {acc_name}\'.')
+                    f'To list more details, run: sky show-gpus {acc_name}\n')
 
     @staticmethod
     def _optimize_dag(
@@ -1101,8 +1115,7 @@ class Optimizer:
         Optimizer.print_optimized_plan(graph, topo_order, best_plan,
                                        total_time, total_cost,
                                        node_to_cost_map, minimize_cost)
-        if not env_options.Options.MINIMIZE_LOGGING.get():
-            Optimizer._print_candidates(local_node_to_candidate_map)
+        Optimizer._print_candidates(local_node_to_candidate_map)
         return best_plan
 
 
sky/provision/aws/config.py CHANGED
@@ -16,10 +16,12 @@ from typing import Any, Dict, List, Optional, Set, Tuple
 
 import colorama
 
+from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import aws
 from sky.provision import common
 from sky.provision.aws import utils
+from sky.utils import common_utils
 
 logger = sky_logging.init_logger(__name__)
 
@@ -535,12 +537,19 @@ def _get_or_create_vpc_security_group(ec2, vpc_id: str,
     if vpc_id in vpc_to_existing_sg:
         return vpc_to_existing_sg[vpc_id]
 
-    # create a new security group
-    ec2.meta.client.create_security_group(
-        Description='Auto-created security group for Ray workers',
-        GroupName=expected_sg_name,
-        VpcId=vpc_id,
-    )
+    try:
+        # create a new security group
+        ec2.meta.client.create_security_group(
+            Description='Auto-created security group for Ray workers',
+            GroupName=expected_sg_name,
+            VpcId=vpc_id,
+        )
+    except ec2.meta.client.exceptions.ClientError as e:
+        message = ('Failed to create security group. Error: '
+                   f'{common_utils.format_exception(e)}')
+        logger.warning(message)
+        raise exceptions.NoClusterLaunchedError(message) from e
+
     security_group = _get_security_groups_from_vpc_ids(ec2, [vpc_id],
                                                        [expected_sg_name])
 
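Note: the hunk above converts a raw boto3 ClientError into exceptions.NoClusterLaunchedError, which bulk_provision (further below) re-raises without attempting teardown. A minimal sketch connecting the two sides, assuming only the exception class shown in the diff (both functions below are hypothetical):

    from sky import exceptions

    def create_security_group_or_raise(ec2, name, vpc_id):  # hypothetical
        try:
            ec2.meta.client.create_security_group(
                Description='Auto-created security group for Ray workers',
                GroupName=name, VpcId=vpc_id)
        except ec2.meta.client.exceptions.ClientError as e:
            # Typed error: nothing was launched, so callers can skip cleanup.
            raise exceptions.NoClusterLaunchedError(str(e)) from e

    def launch_or_cleanup(provision_fn, teardown_fn):  # hypothetical caller
        try:
            return provision_fn()
        except exceptions.NoClusterLaunchedError:
            raise  # skip teardown: the cluster never started launching
        except Exception:
            teardown_fn()
            raise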
 
sky/provision/azure/config.py CHANGED
@@ -5,16 +5,18 @@ a cluster to be launched.
 """
 import hashlib
 import json
-import logging
 from pathlib import Path
 import random
 import time
 from typing import Any, Callable
 
+from sky import exceptions
+from sky import sky_logging
 from sky.adaptors import azure
 from sky.provision import common
+from sky.utils import common_utils
 
-logger = logging.getLogger(__name__)
+logger = sky_logging.init_logger(__name__)
 
 UNIQUE_ID_LEN = 4
 _DEPLOYMENT_NAME = 'skypilot-config'
@@ -92,10 +94,19 @@ def bootstrap_instances(
                 retry += 1
                 continue
             raise
+        except azure.exceptions().ClientAuthenticationError as e:
+            message = (
+                'Failed to authenticate with Azure. Please check your Azure '
+                f'credentials. Error: {common_utils.format_exception(e)}'
+            ).replace('\n', ' ')
+            logger.error(message)
+            raise exceptions.NoClusterLaunchedError(message) from e
     else:
-        raise TimeoutError(
+        message = (
             f'Timed out waiting for resource group {resource_group} to be '
             'deleted.')
+        logger.error(message)
+        raise TimeoutError(message)
 
     # load the template file
     current_path = Path(__file__).parent
sky/provision/azure/instance.py CHANGED
@@ -441,15 +441,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     if to_start_count > 0:
         resource_client = azure.get_client('resource', subscription_id)
         logger.debug(f'run_instances: Creating {to_start_count} instances.')
-        created_instances = _create_instances(
-            compute_client=compute_client,
-            resource_client=resource_client,
-            cluster_name_on_cloud=cluster_name_on_cloud,
-            resource_group=resource_group,
-            provider_config=provider_config,
-            node_config=config.node_config,
-            tags=tags,
-            count=to_start_count)
+        try:
+            created_instances = _create_instances(
+                compute_client=compute_client,
+                resource_client=resource_client,
+                cluster_name_on_cloud=cluster_name_on_cloud,
+                resource_group=resource_group,
+                provider_config=provider_config,
+                node_config=config.node_config,
+                tags=tags,
+                count=to_start_count)
+        except Exception as e:
+            err_message = common_utils.format_exception(
+                e, use_bracket=True).replace('\n', ' ')
+            logger.error(f'Failed to create instances: {err_message}')
+            raise
         created_instance_ids = [inst.name for inst in created_instances]
 
     non_running_instance_statuses = list(
sky/provision/kubernetes/instance.py CHANGED
@@ -632,7 +632,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     try:
         return _create_pods(region, cluster_name_on_cloud, config)
     except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
-        logger.warning(f'run_instances: Error occurred when creating pods: {e}')
+        e_msg = common_utils.format_exception(e).replace('\n', ' ')
+        logger.warning('run_instances: Error occurred when creating pods: '
+                       f'{e_msg}')
         raise
 
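Note: this hunk, like the Azure one above, flattens multi-line SDK errors before logging them. A small sketch of that normalization, assuming common_utils.format_exception as used in these diffs (log_one_line is hypothetical):

    from sky.utils import common_utils

    def log_one_line(logger, prefix, exc):  # hypothetical helper
        # Keep each failure on a single, greppable log line.
        msg = common_utils.format_exception(
            exc, use_bracket=True).replace('\n', ' ')
        logger.warning(f'{prefix}: {msg}')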
 
sky/provision/provisioner.py CHANGED
@@ -14,6 +14,7 @@ import colorama
 
 import sky
 from sky import clouds
+from sky import exceptions
 from sky import provision
 from sky import sky_logging
 from sky import status_lib
@@ -42,76 +43,50 @@ _TITLE = '\n\n' + '=' * 20 + ' {} ' + '=' * 20 + '\n'
 def _bulk_provision(
     cloud: clouds.Cloud,
     region: clouds.Region,
-    zones: Optional[List[clouds.Zone]],
     cluster_name: resources_utils.ClusterName,
     bootstrap_config: provision_common.ProvisionConfig,
 ) -> provision_common.ProvisionRecord:
     provider_name = repr(cloud)
     region_name = region.name
 
-    style = colorama.Style
-
-    if not zones:
-        # For Azure, zones is always an empty list.
-        zone_str = 'all zones'
-    else:
-        zone_str = ','.join(z.name for z in zones)
-
-    if isinstance(cloud, clouds.Kubernetes):
-        # Omit the region name for Kubernetes.
-        logger.info(f'{style.BRIGHT}Launching on {cloud}{style.RESET_ALL} '
-                    f'{cluster_name!r}.')
-    else:
-        logger.info(f'{style.BRIGHT}Launching on {cloud} '
-                    f'{region_name}{style.RESET_ALL} ({zone_str})')
-
     start = time.time()
-    with rich_utils.safe_status('[bold cyan]Launching[/]') as status:
+    # TODO(suquark): Should we cache the bootstrapped result?
+    # Currently it is not necessary as bootstrapping takes
+    # only ~3s, caching it seems over-engineering and could
+    # cause other issues like the cache is not synced
+    # with the cloud configuration.
+    config = provision.bootstrap_instances(provider_name, region_name,
+                                           cluster_name.name_on_cloud,
+                                           bootstrap_config)
+
+    provision_record = provision.run_instances(provider_name,
+                                               region_name,
+                                               cluster_name.name_on_cloud,
+                                               config=config)
+
+    backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=3)
+    logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
+    rich_utils.force_update_status(
+        ux_utils.spinner_message('Launching - Checking instance status',
                                 str(provision_logging.config.log_path)))
+    # AWS would take a very short time (<<1s) updating the state of the
+    # instance.
+    time.sleep(1)
+    for retry_cnt in range(_MAX_RETRY):
         try:
-            # TODO(suquark): Should we cache the bootstrapped result?
-            # Currently it is not necessary as bootstrapping takes
-            # only ~3s, caching it seems over-engineering and could
-            # cause other issues like the cache is not synced
-            # with the cloud configuration.
-            config = provision.bootstrap_instances(provider_name, region_name,
-                                                   cluster_name.name_on_cloud,
-                                                   bootstrap_config)
-        except Exception as e:
-            logger.error(f'{colorama.Fore.YELLOW}Failed to configure '
-                         f'{cluster_name!r} on {cloud} {region} ({zone_str}) '
-                         'with the following error:'
-                         f'{colorama.Style.RESET_ALL}\n'
-                         f'{common_utils.format_exception(e)}')
-            raise
-
-        provision_record = provision.run_instances(provider_name,
-                                                   region_name,
-                                                   cluster_name.name_on_cloud,
-                                                   config=config)
-
-        backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=3)
-        logger.debug(
-            f'\nWaiting for instances of {cluster_name!r} to be ready...')
-        status.update('[bold cyan]Launching - Checking instance status[/]')
-        # AWS would take a very short time (<<1s) updating the state of the
-        # instance.
-        time.sleep(1)
-        for retry_cnt in range(_MAX_RETRY):
-            try:
-                provision.wait_instances(provider_name,
-                                         region_name,
-                                         cluster_name.name_on_cloud,
-                                         state=status_lib.ClusterStatus.UP)
-                break
-            except (aws.botocore_exceptions().WaiterError, RuntimeError):
-                time.sleep(backoff.current_backoff())
-        else:
-            raise RuntimeError(
-                f'Failed to wait for instances of {cluster_name!r} to be '
-                f'ready on the cloud provider after max retries {_MAX_RETRY}.')
-        logger.debug(
-            f'Instances of {cluster_name!r} are ready after {retry_cnt} '
-            'retries.')
+            provision.wait_instances(provider_name,
+                                     region_name,
+                                     cluster_name.name_on_cloud,
+                                     state=status_lib.ClusterStatus.UP)
+            break
+        except (aws.botocore_exceptions().WaiterError, RuntimeError):
+            time.sleep(backoff.current_backoff())
+    else:
+        raise RuntimeError(
+            f'Failed to wait for instances of {cluster_name!r} to be '
+            f'ready on the cloud provider after max retries {_MAX_RETRY}.')
+    logger.debug(f'Instances of {cluster_name!r} are ready after {retry_cnt} '
                 'retries.')
 
     logger.debug(
         f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
@@ -162,8 +137,11 @@ def bulk_provision(
         logger.debug(
             'Provision config:\n'
             f'{json.dumps(dataclasses.asdict(bootstrap_config), indent=2)}')
-        return _bulk_provision(cloud, region, zones, cluster_name,
+        return _bulk_provision(cloud, region, cluster_name,
                                bootstrap_config)
+    except exceptions.NoClusterLaunchedError:
+        # Skip the teardown if the cluster was never launched.
+        raise
     except Exception:  # pylint: disable=broad-except
         zone_str = 'all zones'
         if zones:
@@ -440,23 +418,30 @@ def _post_provision_setup(
     # We don't set docker_user here, as we are configuring the VM itself.
     ssh_credentials = backend_utils.ssh_credential_from_yaml(
         cluster_yaml, ssh_user=cluster_info.ssh_user)
+    docker_config = config_from_yaml.get('docker', {})
 
     with rich_utils.safe_status(
-            '[bold cyan]Launching - Waiting for SSH access[/]') as status:
+            ux_utils.spinner_message(
+                'Launching - Waiting for SSH access',
+                provision_logging.config.log_path)) as status:
 
         logger.debug(
            f'\nWaiting for SSH to be available for {cluster_name!r} ...')
         wait_for_ssh(cluster_info, ssh_credentials)
-        logger.debug(f'SSH Conection ready for {cluster_name!r}')
+        logger.debug(f'SSH Connection ready for {cluster_name!r}')
+        vm_str = 'Instance' if cloud_name.lower() != 'kubernetes' else 'Pod'
         plural = '' if len(cluster_info.instances) == 1 else 's'
-        logger.info(f'{colorama.Fore.GREEN}Successfully provisioned '
-                    f'or found existing instance{plural}.'
-                    f'{colorama.Style.RESET_ALL}')
+        verb = 'is' if len(cluster_info.instances) == 1 else 'are'
+        indent_str = (ux_utils.INDENT_SYMBOL
+                      if docker_config else ux_utils.INDENT_LAST_SYMBOL)
+        logger.info(f'{indent_str}{colorama.Style.DIM}{vm_str}{plural} {verb} '
+                    f'up.{colorama.Style.RESET_ALL}')
 
-        docker_config = config_from_yaml.get('docker', {})
         if docker_config:
             status.update(
-                '[bold cyan]Launching - Initializing docker container[/]')
+                ux_utils.spinner_message(
+                    'Launching - Initializing docker container',
+                    provision_logging.config.log_path))
             docker_user = instance_setup.initialize_docker(
                 cluster_name.name_on_cloud,
                 docker_config=docker_config,
@@ -470,6 +455,8 @@ def _post_provision_setup(
             cluster_info.docker_user = docker_user
             ssh_credentials['docker_user'] = docker_user
             logger.debug(f'Docker user: {docker_user}')
+            logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
+                        f'Docker container is up.{colorama.Style.RESET_ALL}')
 
         # We mount the metadata with sky wheel for speedup.
         # NOTE: currently we mount all credentials for all nodes, because
@@ -482,8 +469,9 @@ def _post_provision_setup(
         # for later.
         file_mounts = config_from_yaml.get('file_mounts', {})
 
-        runtime_preparation_str = ('[bold cyan]Preparing SkyPilot '
-                                   'runtime ({step}/3 - {step_name})')
+        runtime_preparation_str = (ux_utils.spinner_message(
+            'Preparing SkyPilot runtime ({step}/3 - {step_name})',
+            provision_logging.config.log_path))
         status.update(
             runtime_preparation_str.format(step=1, step_name='initializing'))
         instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
@@ -551,8 +539,9 @@ def _post_provision_setup(
         instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
                                                  cluster_info, ssh_credentials)
 
-    logger.info(f'{colorama.Fore.GREEN}Successfully provisioned cluster: '
-                f'{cluster_name}{colorama.Style.RESET_ALL}')
+    logger.info(
+        ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
+                                   provision_logging.config.log_path))
     return cluster_info
 
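Note: the wait loop above now lives outside the old safe_status block and relies on Python's for/else to fail after _MAX_RETRY attempts. A compact sketch of that pattern, assuming common_utils.Backoff behaves as used in the hunk (MAX_RETRY, wait_until_ready, and check_ready are hypothetical stand-ins):

    import time

    from sky.utils import common_utils

    MAX_RETRY = 6  # illustrative value; the real constant lives in provisioner.py

    def wait_until_ready(check_ready) -> None:  # hypothetical helper
        backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=3)
        for _ in range(MAX_RETRY):
            try:
                check_ready()  # e.g. provision.wait_instances(...)
                break          # ready: skip the for/else failure branch
            except RuntimeError:
                time.sleep(backoff.current_backoff())
        else:
            # Reached only if the loop exhausted MAX_RETRY without break-ing.
            raise RuntimeError(
                f'Instances not ready after {MAX_RETRY} retries.')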