skypilot-nightly 1.0.0.dev20241012__py3-none-any.whl → 1.0.0.dev20241013__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (43)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/azure.py +3 -1
  3. sky/adaptors/common.py +6 -2
  4. sky/backends/backend.py +9 -4
  5. sky/backends/backend_utils.py +13 -16
  6. sky/backends/cloud_vm_ray_backend.py +207 -161
  7. sky/backends/local_docker_backend.py +3 -1
  8. sky/benchmark/benchmark_utils.py +5 -4
  9. sky/cli.py +36 -28
  10. sky/clouds/service_catalog/aws_catalog.py +6 -7
  11. sky/clouds/service_catalog/common.py +4 -3
  12. sky/clouds/service_catalog/cudo_catalog.py +11 -1
  13. sky/core.py +4 -2
  14. sky/data/storage.py +44 -32
  15. sky/data/storage_utils.py +8 -4
  16. sky/exceptions.py +5 -0
  17. sky/execution.py +10 -24
  18. sky/jobs/core.py +9 -7
  19. sky/jobs/utils.py +15 -10
  20. sky/optimizer.py +50 -37
  21. sky/provision/aws/config.py +15 -6
  22. sky/provision/azure/config.py +14 -3
  23. sky/provision/azure/instance.py +15 -9
  24. sky/provision/kubernetes/instance.py +3 -1
  25. sky/provision/provisioner.py +63 -74
  26. sky/serve/core.py +42 -40
  27. sky/sky_logging.py +9 -5
  28. sky/skylet/log_lib.py +5 -4
  29. sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
  30. sky/utils/command_runner.py +11 -11
  31. sky/utils/common_utils.py +2 -5
  32. sky/utils/controller_utils.py +78 -29
  33. sky/utils/env_options.py +22 -7
  34. sky/utils/log_utils.py +39 -24
  35. sky/utils/resources_utils.py +23 -0
  36. sky/utils/rich_utils.py +55 -5
  37. sky/utils/ux_utils.py +63 -4
  38. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/METADATA +1 -1
  39. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/RECORD +43 -43
  40. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/LICENSE +0 -0
  41. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/WHEEL +0 -0
  42. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/entry_points.txt +0 -0
  43. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/top_level.txt +0 -0
@@ -122,9 +122,6 @@ _RSYNC_NOT_FOUND_MESSAGE = (
 
  _TPU_NOT_FOUND_ERROR = 'ERROR: (gcloud.compute.tpus.delete) NOT_FOUND'
 
- _CTRL_C_TIP_MESSAGE = ('INFO: Tip: use Ctrl-C to exit log streaming '
- '(task will not be killed).')
-
  _MAX_RAY_UP_RETRY = 5
 
  # Number of retries for getting zones.
@@ -405,22 +402,35 @@ class RayCodeGen:
  **gpu_dict,
  })
 
+ streaming_message = (
+ f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
+ f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
+ f'be killed){colorama.Style.RESET_ALL}')
  self._code += [
  textwrap.dedent(f"""\
  pg = ray_util.placement_group({json.dumps(bundles)}, 'STRICT_SPREAD')
  plural = 's' if {num_nodes} > 1 else ''
  node_str = f'{num_nodes} node{{plural}}'
 
- message = {_CTRL_C_TIP_MESSAGE!r} + '\\n'
- message += f'INFO: Waiting for task resources on {{node_str}}. This will block if the cluster is full.'
- print(message,
- flush=True)
+ # We have this `INFO: Tip:` message only for backward
+ # compatibility, because if a cluster has the old SkyPilot version,
+ # it relies on this message to start log streaming.
+ # This message will be skipped for new clusters, because we use
+ # start_streaming_at for the `Waiting for task resources on`
+ # message.
+ # TODO: Remove this message in v0.9.0.
+ message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}INFO: '
+ 'Tip: use Ctrl-C to exit log streaming, not kill '
+ 'the job.{colorama.Style.RESET_ALL}\\n')
+ message += ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
+ 'Waiting for task resources on '
+ f'{{node_str}}.{colorama.Style.RESET_ALL}')
+ print(message, flush=True)
  # FIXME: This will print the error message from autoscaler if
  # it is waiting for other task to finish. We should hide the
  # error message.
  ray.get(pg.ready())
- print('INFO: All task resources reserved.',
- flush=True)
+ print({streaming_message!r}, flush=True)
  """)
  ]
 
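Note — an illustrative aside, not part of the diff: the comment block above depends on a marker-based log-streaming convention. The client begins echoing job output only once a known sentinel line appears; older clients key off the legacy 'INFO: Tip:' message, while newer ones use the 'Waiting for task resources on' line selected via start_streaming_at. A minimal Python sketch of that convention is below; follow_from_marker and the log file name are hypothetical, not SkyPilot's API.

    from typing import Iterable, Iterator

    def follow_from_marker(lines: Iterable[str], marker: str) -> Iterator[str]:
        """Yield lines starting from the first line that contains `marker`."""
        started = False
        for line in lines:
            if not started and marker in line:
                started = True
            if started:
                yield line

    # An old client keys off the legacy tip message; a new one could key off
    # the 'Waiting for task resources on' line instead.
    legacy_marker = 'INFO: Tip: use Ctrl-C to exit log streaming'
    with open('job_output.log') as f:  # hypothetical captured driver output
        for line in follow_from_marker(f, legacy_marker):
            print(line, end='')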
@@ -496,7 +506,6 @@ class RayCodeGen:
  )).remote()
  for i in range(pg.bundle_count)
  ])
- print('INFO: Reserved IPs:', gang_scheduling_id_to_ip)
 
  cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
  job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
@@ -743,15 +752,14 @@ class FailoverCloudErrorHandlerV1:
  region: 'clouds.Region',
  zones: Optional[List['clouds.Zone']], stdout: str,
  stderr: str):
- del zones # Unused.
+ del region, zones # Unused.
  errors = FailoverCloudErrorHandlerV1._handle_errors(
  stdout,
  stderr,
  is_error_str_known=lambda x: 'LambdaCloudError:' in x.strip())
- logger.warning(f'Got error(s) in {region.name}:')
- messages = '\n\t'.join(errors)
+ messages = '\n '.join(errors)
  style = colorama.Style
- logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
+ logger.warning(f' {style.DIM}{messages}{style.RESET_ALL}')
  _add_to_blocked_resources(blocked_resources,
  launchable_resources.copy(zone=None))
 
@@ -926,6 +934,10 @@ class FailoverCloudErrorHandlerV2:
  _add_to_blocked_resources(
  blocked_resources,
  resources_lib.Resources(cloud=clouds.Azure()))
+ elif 'ClientAuthenticationError' in str(err):
+ _add_to_blocked_resources(
+ blocked_resources,
+ resources_lib.Resources(cloud=clouds.Azure()))
  else:
  _add_to_blocked_resources(blocked_resources,
  launchable_resources.copy(zone=None))
@@ -1224,9 +1236,10 @@ class RetryingVmProvisioner(object):
 
  if prev_cluster_status != status_lib.ClusterStatus.UP:
  logger.info(
- f'Cluster {cluster_name!r} (status: '
- f'{prev_cluster_status.value}) was previously launched '
- f'in {cloud} {region.name}. Relaunching in that region.')
+ f'{colorama.Style.DIM}Cluster {cluster_name!r} (status: '
+ f'{prev_cluster_status.value}) was previously in '
+ f'{cloud} ({region.name}). Restarting.'
+ f'{colorama.Style.RESET_ALL}')
  yield zones
 
  # If it reaches here: the cluster status in the database gets
@@ -1303,17 +1316,14 @@ class RetryingVmProvisioner(object):
  prev_cluster_ever_up: bool,
  ) -> Dict[str, Any]:
  """The provision retry loop."""
- style = colorama.Style
- fore = colorama.Fore
  # Get log_path name
  log_path = os.path.join(self.log_dir, 'provision.log')
  log_abs_path = os.path.abspath(log_path)
  if not dryrun:
  os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
  os.system(f'touch {log_path}')
- tail_cmd = f'tail -n100 -f {log_path}'
- logger.info('To view detailed progress: '
- f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
+ rich_utils.force_update_status(
+ ux_utils.spinner_message('Launching', log_path))
 
  # Get previous cluster status
  cluster_exists = prev_cluster_status is not None
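Note — an illustrative aside, not part of the diff: this hunk, and several below, replace the printed 'tail -n100 -f <log>' hint with a single persistent spinner whose text is rewritten in place as provisioning moves between phases. Based only on the calls visible in this diff, the consumer-side pattern looks roughly like the sketch below; the import paths match the files listed above, but the exact signatures are assumptions and do_work is a placeholder.

    import time

    from sky.utils import rich_utils, ux_utils  # modules touched by this release

    def do_work() -> None:
        time.sleep(1)  # stand-in for the actual provisioning work

    log_path = 'sky_logs/provision.log'
    # One spinner for the whole operation; the message embeds a log-file hint.
    with rich_utils.safe_status(ux_utils.spinner_message('Launching', log_path)):
        do_work()
        # A later phase updates the same spinner text in place.
        rich_utils.force_update_status(
            ux_utils.spinner_message('Looking for resources', log_path))
        do_work()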
@@ -1481,6 +1491,23 @@ class RetryingVmProvisioner(object):
  if to_provision.cloud.OPEN_PORTS_VERSION <=
  clouds.OpenPortsVersion.LAUNCH_ONLY else None)
  try:
+ controller = controller_utils.Controllers.from_name(
+ cluster_name)
+ controller_str = ('' if controller is None else
+ f' {controller.value.name}')
+ if isinstance(to_provision.cloud, clouds.Kubernetes):
+ # Omit the region name for Kubernetes.
+ logger.info(
+ ux_utils.starting_message(
+ f'Launching{controller_str} on '
+ f'{to_provision.cloud}.'))
+ else:
+ logger.info(
+ ux_utils.starting_message(
+ f'Launching{controller_str} on '
+ f'{to_provision.cloud} '
+ f'{region.name}{colorama.Style.RESET_ALL}'
+ f'{zone_str}.'))
  provision_record = provisioner.bulk_provision(
  to_provision.cloud,
  region,
@@ -1528,6 +1555,7 @@ class RetryingVmProvisioner(object):
  'region_name': region.name,
  'zone_str': zone_str,
  }
+
  status, stdout, stderr, head_internal_ip, head_external_ip = (
  self._gang_schedule_ray_up(to_provision.cloud,
  cluster_config_file, handle,
@@ -1566,9 +1594,9 @@ class RetryingVmProvisioner(object):
  self._ensure_cluster_ray_started(handle, log_abs_path)
 
  config_dict['handle'] = handle
- plural = '' if num_nodes == 1 else 's'
- logger.info(f'{fore.GREEN}Successfully provisioned or found'
- f' existing VM{plural}.{style.RESET_ALL}')
+ logger.info(
+ ux_utils.finishing_message(
+ f'Cluster launched: {cluster_name!r}.', log_path))
  return config_dict
 
  # The cluster is not ready. We must perform error recording and/or
@@ -1633,17 +1661,15 @@ class RetryingVmProvisioner(object):
 
  if to_provision.zone is not None:
  message = (
- f'Failed to acquire resources in {to_provision.zone}. '
- 'Try changing resource requirements or use another zone.')
+ f'Failed to acquire resources in {to_provision.zone} for '
+ f'{requested_resources}. ')
  elif to_provision.region is not None:
  # For public clouds, provision.region is always set.
  message = ('Failed to acquire resources in all zones in '
- f'{to_provision.region}. Try changing resource '
- 'requirements or use another region.')
+ f'{to_provision.region} for {requested_resources}. ')
  else:
- message = (f'Failed to acquire resources in {to_provision.cloud}. '
- 'Try changing resource requirements or use another '
- 'cloud provider.')
+ message = (f'Failed to acquire resources in {to_provision.cloud} '
+ f'for {requested_resources}. ')
  # Do not failover to other locations if the cluster was ever up, since
  # the user can have some data on the cluster.
  raise exceptions.ResourcesUnavailableError(
@@ -1694,7 +1720,7 @@ class RetryingVmProvisioner(object):
  log_abs_path,
  stream_logs=False,
  start_streaming_at='Shared connection to',
- line_processor=log_utils.RayUpLineProcessor(),
+ line_processor=log_utils.RayUpLineProcessor(log_abs_path),
  # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
  # time during 'ray up' if insufficient capacity occurs.
  env=dict(
@@ -1714,13 +1740,14 @@ class RetryingVmProvisioner(object):
 
  region_name = logging_info['region_name']
  zone_str = logging_info['zone_str']
- style = colorama.Style
  if isinstance(to_provision_cloud, clouds.Kubernetes):
- logger.info(f'{style.BRIGHT}Launching on {to_provision_cloud} '
- f'{style.RESET_ALL}')
+ logger.info(
+ ux_utils.starting_message(
+ f'Launching on {to_provision_cloud}.'))
  else:
- logger.info(f'{style.BRIGHT}Launching on {to_provision_cloud} '
- f'{region_name}{style.RESET_ALL}{zone_str}')
+ logger.info(
+ ux_utils.starting_message(f'Launching on {to_provision_cloud} '
+ f'{region_name}{zone_str}.'))
  start = time.time()
 
  # Edge case: /tmp/ray does not exist, so autoscaler can't create/store
@@ -1822,11 +1849,6 @@ class RetryingVmProvisioner(object):
  head_internal_ip, head_external_ip)
 
  # All code below is handling num_nodes > 1.
- provision_str = ('Successfully provisioned or found existing head '
- 'instance.')
- logger.info(f'{style.BRIGHT}{provision_str} '
- f'Waiting for workers.{style.RESET_ALL}')
-
  # FIXME(zongheng): the below requires ray processes are up on head. To
  # repro it failing: launch a 2-node cluster, log into head and ray
  # stop, then launch again.
@@ -2006,13 +2028,6 @@ class RetryingVmProvisioner(object):
  # Provisioning succeeded.
  break
 
- if to_provision.zone is None:
- region_or_zone_str = str(to_provision.region)
- else:
- region_or_zone_str = str(to_provision.zone)
- logger.warning(f'\n{style.BRIGHT}Provision failed for {num_nodes}x '
- f'{to_provision} in {region_or_zone_str}. '
- f'Trying other locations (if any).{style.RESET_ALL}')
  if prev_cluster_status is None:
  # Add failed resources to the blocklist, only when it
  # is in fallback mode.
@@ -2027,8 +2042,10 @@ class RetryingVmProvisioner(object):
  ), prev_cluster_status
  assert global_user_state.get_handle_from_cluster_name(
  cluster_name) is None, cluster_name
- logger.info('Retrying provisioning with requested resources '
- f'{task.num_nodes}x {task.resources}')
+ logger.info(
+ ux_utils.retry_message(
+ f'Retrying provisioning with requested resources: '
+ f'{task.num_nodes}x {task.resources}'))
  # Retry with the current, potentially "smaller" resources:
  # to_provision == the current new resources (e.g., V100:1),
  # which may be "smaller" than the original (V100:8).
@@ -2038,6 +2055,12 @@ class RetryingVmProvisioner(object):
  prev_cluster_status = None
  prev_handle = None
 
+ retry_message = ux_utils.retry_message(
+ 'Trying other potential resources.')
+ logger.warning(f'\n{retry_message}')
+ log_path = os.path.join(self.log_dir, 'provision.log')
+ rich_utils.force_update_status(
+ ux_utils.spinner_message('Looking for resources', log_path))
  # Set to None so that sky.optimize() will assign a new one
  # (otherwise will skip re-optimizing this task).
  # TODO: set all remaining tasks' best_resources to None.
@@ -2781,6 +2804,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  local_wheel_path,
  wheel_hash,
  blocked_resources=task.blocked_resources)
+ log_path = os.path.join(self.log_dir, 'provision.log')
+ rich_utils.force_update_status(
+ ux_utils.spinner_message('Launching', log_path))
  config_dict = retry_provisioner.provision_with_retries(
  task, to_provision_config, dryrun, stream_logs)
  break
@@ -2796,27 +2822,34 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  usage_lib.messages.usage.update_final_cluster_status(
  None)
  error_message = (
- 'Failed to provision all possible launchable '
- 'resources.'
- f' Relax the task\'s resource requirements: '
+ f'{colorama.Fore.RED}Failed to provision all '
+ f'possible launchable resources.'
+ f'{colorama.Style.RESET_ALL}'
+ ' Relax the task\'s resource requirements: '
  f'{task.num_nodes}x {list(task.resources)[0]}')
+
+ log_path = retry_provisioner.log_dir + '/provision.log'
  if retry_until_up:
  logger.error(error_message)
  # Sleep and retry.
  gap_seconds = backoff.current_backoff()
  plural = 's' if attempt_cnt > 1 else ''
- logger.info(
- f'{colorama.Style.BRIGHT}=== Retry until up ==='
- f'{colorama.Style.RESET_ALL}\n'
- f'Retrying provisioning after {gap_seconds:.0f}s '
- '(backoff with random jittering). '
- f'Already tried {attempt_cnt} attempt{plural}.')
+ retry_message = ux_utils.retry_message(
+ f'Retry after {gap_seconds:.0f}s '
+ f'({attempt_cnt} attempt{plural}). ')
+ logger.info(f'\n{retry_message} '
+ f'{ux_utils.log_path_hint(log_path)}'
+ f'{colorama.Style.RESET_ALL}')
  attempt_cnt += 1
  time.sleep(gap_seconds)
  continue
+ logger.error(
+ f'{colorama.Fore.RED}⨯{colorama.Style.RESET_ALL} '
+ 'Failed to provision resources. '
+ f'{ux_utils.log_path_hint(log_path)}')
  error_message += (
- '\nTo keep retrying until the cluster is up, use the '
- '`--retry-until-up` flag.')
+ '\nTo keep retrying until the cluster is up, use '
+ 'the `--retry-until-up` flag.')
  with ux_utils.print_exception_no_traceback():
  raise exceptions.ResourcesUnavailableError(
  error_message,
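Note — an illustrative aside, not part of the diff: the retry-until-up path above sleeps for backoff.current_backoff() seconds between attempts, which the removed message described as exponential backoff with random jittering. A self-contained sketch of that retry shape follows; backoff_delays and try_provision are stand-ins, not SkyPilot's Backoff class or provisioning API.

    import random
    import time
    from typing import Iterator

    def backoff_delays(initial: float = 1.0, factor: float = 2.0,
                       max_delay: float = 300.0) -> Iterator[float]:
        """Yield exponentially growing delays with +/-50% random jitter."""
        delay = initial
        while True:
            yield min(max_delay, delay) * random.uniform(0.5, 1.5)
            delay = min(max_delay, delay * factor)

    def try_provision() -> bool:
        return random.random() < 0.3  # stand-in for one provisioning attempt

    for attempt_cnt, gap_seconds in enumerate(backoff_delays(), start=1):
        if try_provision() or attempt_cnt >= 5:  # cap attempts for the sketch
            break
        plural = 's' if attempt_cnt > 1 else ''
        print(f'Retry after {gap_seconds:.0f}s ({attempt_cnt} attempt{plural}).')
        time.sleep(gap_seconds)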
@@ -2927,7 +2960,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # and restarted if necessary.
  logger.debug('Checking if skylet is running on the head node.')
  with rich_utils.safe_status(
- '[bold cyan]Preparing SkyPilot runtime'):
+ ux_utils.spinner_message('Preparing SkyPilot runtime')):
  # We need to source bashrc for skylet to make sure the autostop
  # event can access the path to the cloud CLIs.
  self.run_on_head(handle,
@@ -2970,7 +3003,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  cmd = job_lib.JobLibCodeGen.update_status()
  logger.debug('Update job queue on remote cluster.')
  with rich_utils.safe_status(
- '[bold cyan]Preparing SkyPilot runtime'):
+ ux_utils.spinner_message('Preparing SkyPilot runtime')):
  returncode, _, stderr = self.run_on_head(handle,
  cmd,
  require_outputs=True)
@@ -3005,7 +3038,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  if not (cloud.OPEN_PORTS_VERSION <=
  clouds.OpenPortsVersion.LAUNCH_ONLY):
  with rich_utils.safe_status(
- '[bold cyan]Launching - Opening new ports'):
+ ux_utils.spinner_message(
+ 'Launching - Opening new ports')):
  self._open_ports(handle)
 
  with timeline.Event('backend.provision.post_process'):
@@ -3054,7 +3088,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  dir_size = backend_utils.path_size_megabytes(full_workdir)
  if dir_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
  logger.warning(
- f'{fore.YELLOW}The size of workdir {workdir!r} '
+ f' {fore.YELLOW}The size of workdir {workdir!r} '
  f'is {dir_size} MB. Try to keep workdir small or use '
  '.skyignore to exclude large files, as large sizes will slow '
  f'down rsync.{style.RESET_ALL}')
@@ -3076,17 +3110,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  num_nodes = handle.launched_nodes
  plural = 's' if num_nodes > 1 else ''
  logger.info(
- f'{fore.CYAN}Syncing workdir (to {num_nodes} node{plural}): '
- f'{style.BRIGHT}{workdir}{style.RESET_ALL}'
- f' -> '
- f'{style.BRIGHT}{SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
+ f' {style.DIM}Syncing workdir (to {num_nodes} node{plural}): '
+ f'{workdir} -> {SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
  os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
  os.system(f'touch {log_path}')
- tail_cmd = f'tail -n100 -f {log_path}'
- logger.info('To view detailed progress: '
- f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
- with rich_utils.safe_status('[bold cyan]Syncing[/]'):
+ with rich_utils.safe_status(
+ ux_utils.spinner_message('Syncing workdir', log_path)):
  subprocess_utils.run_in_parallel(_sync_workdir_node, runners)
+ logger.info(ux_utils.finishing_message('Workdir synced.', log_path))
 
  def _sync_file_mounts(
  self,
@@ -3095,17 +3126,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  storage_mounts: Optional[Dict[Path, storage_lib.Storage]],
  ) -> None:
  """Mounts all user files to the remote nodes."""
- controller_utils.replace_skypilot_config_path_in_file_mounts(
- handle.launched_resources.cloud, all_file_mounts)
- self._execute_file_mounts(handle, all_file_mounts)
- self._execute_storage_mounts(handle, storage_mounts)
- self._set_storage_mounts_metadata(handle.cluster_name, storage_mounts)
+ with rich_utils.safe_status(ux_utils.spinner_message('Syncing files')):
+ controller_utils.replace_skypilot_config_path_in_file_mounts(
+ handle.launched_resources.cloud, all_file_mounts)
+ self._execute_file_mounts(handle, all_file_mounts)
+ self._execute_storage_mounts(handle, storage_mounts)
+ self._set_storage_mounts_metadata(handle.cluster_name,
+ storage_mounts)
 
  def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
  detach_setup: bool) -> None:
  start = time.time()
- style = colorama.Style
- fore = colorama.Fore
 
  if task.setup is None:
  return
@@ -3161,7 +3192,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # and source ~/.bashrc in the setup_cmd.
  # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long
  # bash: no job control in this shell
- skip_lines=3)
+ skip_num_lines=3)
  return returncode
 
  returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
@@ -3212,23 +3243,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
  num_nodes = len(runners)
  plural = 's' if num_nodes > 1 else ''
+ node_str = f'{num_nodes} VM{plural}'
+ if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
+ node_str = f'{num_nodes} pod{plural}'
+ controller = controller_utils.Controllers.from_name(handle.cluster_name)
+ if controller is not None:
+ node_str = controller.value.name
  if not detach_setup:
- logger.info(f'{fore.CYAN}Running setup on {num_nodes} node{plural}.'
- f'{style.RESET_ALL}')
+ logger.info(
+ ux_utils.starting_message(f'Running setup on {node_str}.'))
  # TODO(zhwu): run_in_parallel uses multi-thread to run the commands,
  # which can cause the program waiting for all the threads to finish,
  # even if some of them raise exceptions. We should replace it with
  # multi-process.
+ rich_utils.stop_safe_status()
  subprocess_utils.run_in_parallel(_setup_node, range(num_nodes))
 
  if detach_setup:
  # Only set this when setup needs to be run outside the self._setup()
  # as part of a job (--detach-setup).
  self._setup_cmd = setup_cmd
+ logger.info(ux_utils.finishing_message('Setup completed.'))
  return
- logger.info(f'{fore.GREEN}Setup completed.{style.RESET_ALL}')
  end = time.time()
  logger.debug(f'Setup took {end - start} seconds.')
+ setup_log_path = os.path.join(self.log_dir, 'setup-*.log')
+ logger.info(
+ ux_utils.finishing_message('Setup completed.', setup_log_path))
 
  def _exec_code_on_head(
  self,
@@ -3240,7 +3281,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  ) -> None:
  """Executes generated code on the head node."""
  style = colorama.Style
- fore = colorama.Fore
 
  script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
  remote_log_dir = self.log_dir
@@ -3330,9 +3370,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  f'Failed to submit job {job_id}.',
  stderr=stdout + stderr)
 
- logger.info('Job submitted with Job ID: '
- f'{style.BRIGHT}{job_id}{style.RESET_ALL}')
-
+ controller = controller_utils.Controllers.from_name(handle.cluster_name)
+ if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
+ logger.info(ux_utils.starting_message('Service registered.'))
+ else:
+ logger.info(
+ ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
+ rich_utils.stop_safe_status()
  try:
  if not detach_run:
  if (handle.cluster_name in controller_utils.Controllers.
@@ -3347,35 +3391,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  controller = controller_utils.Controllers.from_name(name)
  if controller == controller_utils.Controllers.JOBS_CONTROLLER:
  logger.info(
- f'{fore.CYAN}Managed Job ID: '
+ f'\n📋 Useful Commands'
+ f'\nManaged Job ID: '
  f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
- '\nTo cancel the job:\t\t'
- f'{backend_utils.BOLD}sky jobs cancel {job_id}'
- f'{backend_utils.RESET_BOLD}'
- '\nTo stream job logs:\t\t'
- f'{backend_utils.BOLD}sky jobs logs {job_id}'
- f'{backend_utils.RESET_BOLD}'
- f'\nTo stream controller logs:\t'
- f'{backend_utils.BOLD}sky jobs logs --controller {job_id}'
- f'{backend_utils.RESET_BOLD}'
- '\nTo view all managed jobs:\t'
- f'{backend_utils.BOLD}sky jobs queue'
- f'{backend_utils.RESET_BOLD}'
- '\nTo view managed job dashboard:\t'
- f'{backend_utils.BOLD}sky jobs dashboard'
- f'{backend_utils.RESET_BOLD}')
+ f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t\t'
+ f'{ux_utils.BOLD}sky jobs cancel {job_id}'
+ f'{ux_utils.RESET_BOLD}'
+ f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t\t'
+ f'{ux_utils.BOLD}sky jobs logs {job_id}'
+ f'{ux_utils.RESET_BOLD}'
+ f'\n{ux_utils.INDENT_SYMBOL}To stream controller logs:\t\t'
+ f'{ux_utils.BOLD}sky jobs logs --controller {job_id}'
+ f'{ux_utils.RESET_BOLD}'
+ f'\n{ux_utils.INDENT_SYMBOL}To view all managed jobs:\t\t'
+ f'{ux_utils.BOLD}sky jobs queue'
+ f'{ux_utils.RESET_BOLD}'
+ f'\n{ux_utils.INDENT_LAST_SYMBOL}To view managed job '
+ f'dashboard:\t{ux_utils.BOLD}sky jobs dashboard'
+ f'{ux_utils.RESET_BOLD}')
  elif controller is None:
- logger.info(f'{fore.CYAN}Job ID: '
- f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
- '\nTo cancel the job:\t'
- f'{backend_utils.BOLD}sky cancel {name} {job_id}'
- f'{backend_utils.RESET_BOLD}'
- '\nTo stream job logs:\t'
- f'{backend_utils.BOLD}sky logs {name} {job_id}'
- f'{backend_utils.RESET_BOLD}'
- '\nTo view the job queue:\t'
- f'{backend_utils.BOLD}sky queue {name}'
- f'{backend_utils.RESET_BOLD}')
+ logger.info(f'\n📋 Useful Commands'
+ f'\nJob ID: {job_id}'
+ f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t'
+ f'{ux_utils.BOLD}sky cancel {name} {job_id}'
+ f'{ux_utils.RESET_BOLD}'
+ f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t'
+ f'{ux_utils.BOLD}sky logs {name} {job_id}'
+ f'{ux_utils.RESET_BOLD}'
+ f'\n{ux_utils.INDENT_LAST_SYMBOL}To view job '
+ 'queue:\t\t'
+ f'{ux_utils.BOLD}sky queue {name}'
+ f'{ux_utils.RESET_BOLD}')
 
  def _add_job(self, handle: CloudVmRayResourceHandle,
  job_name: Optional[str], resources_str: str) -> int:
@@ -3452,27 +3498,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
  def _post_execute(self, handle: CloudVmRayResourceHandle,
  down: bool) -> None:
- fore = colorama.Fore
- style = colorama.Style
  name = handle.cluster_name
  controller = controller_utils.Controllers.from_name(name)
- if controller is not None or down:
+ if controller is not None:
  return
- stop_str = ('\nTo stop the cluster:'
- f'\t{backend_utils.BOLD}sky stop {name}'
- f'{backend_utils.RESET_BOLD}')
- logger.info(f'\n{fore.CYAN}Cluster name: '
- f'{style.BRIGHT}{name}{style.RESET_ALL}'
- '\nTo log into the head VM:\t'
- f'{backend_utils.BOLD}ssh {name}'
- f'{backend_utils.RESET_BOLD}'
- '\nTo submit a job:'
- f'\t\t{backend_utils.BOLD}sky exec {name} yaml_file'
- f'{backend_utils.RESET_BOLD}'
- f'{stop_str}'
- '\nTo teardown the cluster:'
- f'\t{backend_utils.BOLD}sky down {name}'
- f'{backend_utils.RESET_BOLD}')
+ logger.info(f'\nCluster name: {name}'
+ f'\n{ux_utils.INDENT_SYMBOL}To log into the head VM:\t'
+ f'{ux_utils.BOLD}ssh {name}'
+ f'{ux_utils.RESET_BOLD}'
+ f'\n{ux_utils.INDENT_SYMBOL}To submit a job:'
+ f'\t\t{ux_utils.BOLD}sky exec {name} yaml_file'
+ f'{ux_utils.RESET_BOLD}'
+ f'\n{ux_utils.INDENT_SYMBOL}To stop the cluster:'
+ f'\t{ux_utils.BOLD}sky stop {name}'
+ f'{ux_utils.RESET_BOLD}'
+ f'\n{ux_utils.INDENT_LAST_SYMBOL}To teardown the cluster:'
+ f'\t{ux_utils.BOLD}sky down {name}'
+ f'{ux_utils.RESET_BOLD}')
  if (gcp_utils.is_tpu(handle.launched_resources) and
  not gcp_utils.is_tpu_vm(handle.launched_resources)):
  logger.info('Tip: `sky down` will delete launched TPU(s) too.')
@@ -3808,11 +3850,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  Raises:
  RuntimeError: If the cluster fails to be terminated/stopped.
  """
+ cluster_status_fetched = False
  if refresh_cluster_status:
- prev_cluster_status, _ = (
- backend_utils.refresh_cluster_status_handle(
- handle.cluster_name, acquire_per_cluster_status_lock=False))
- else:
+ try:
+ prev_cluster_status, _ = (
+ backend_utils.refresh_cluster_status_handle(
+ handle.cluster_name,
+ acquire_per_cluster_status_lock=False))
+ cluster_status_fetched = True
+ except exceptions.ClusterStatusFetchingError:
+ logger.warning(
+ 'Failed to fetch cluster status for '
+ f'{handle.cluster_name!r}. Assuming the cluster is still '
+ 'up.')
+ if not cluster_status_fetched:
  record = global_user_state.get_cluster_from_name(
  handle.cluster_name)
  prev_cluster_status = record[
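Note — an illustrative aside, not part of the diff: teardown now degrades gracefully when the live status query fails. It attempts a refresh via backend_utils.refresh_cluster_status_handle and, on exceptions.ClusterStatusFetchingError, falls back to the status cached in the local state database, assuming the cluster is still up. A self-contained sketch of that fallback shape follows; the helper names and stub behavior are illustrative, not SkyPilot's API.

    class ClusterStatusFetchingError(Exception):
        """Stand-in for sky.exceptions.ClusterStatusFetchingError."""

    def refresh_status(cluster_name: str) -> str:
        # Simulate a failed cloud query so the fallback path runs.
        raise ClusterStatusFetchingError('cloud API unreachable')

    def load_cached_record(cluster_name: str) -> dict:
        return {'status': 'UP'}  # stand-in for the local state-database lookup

    def get_status_with_fallback(cluster_name: str) -> str:
        """Prefer a live refresh; fall back to the cached record on fetch errors."""
        try:
            return refresh_status(cluster_name)
        except ClusterStatusFetchingError:
            print(f'Failed to fetch cluster status for {cluster_name!r}. '
                  'Assuming the cluster is still up.')
            return load_cached_record(cluster_name)['status']

    print(get_status_with_fallback('my-cluster'))  # prints the warning, then UP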
@@ -3972,8 +4023,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  f.flush()
 
  teardown_verb = 'Terminating' if terminate else 'Stopping'
- with rich_utils.safe_status(f'[bold cyan]{teardown_verb} '
- f'[green]{cluster_name}'):
+ with rich_utils.safe_status(
+ ux_utils.spinner_message(
+ f'{teardown_verb}: {cluster_name}', log_path)):
  # FIXME(zongheng): support retries. This call can fail for
  # example due to GCP returning list requests per limit
  # exceeded.
@@ -4053,7 +4105,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  config = common_utils.read_yaml(handle.cluster_yaml)
  tpu_node_config = config['provider'].get('tpu_node')
  if tpu_node_config is None:
- with rich_utils.safe_status('[bold cyan]Terminating TPU...'):
+ with rich_utils.safe_status(
+ ux_utils.spinner_message('Terminating TPU')):
  tpu_rc, tpu_stdout, tpu_stderr = log_lib.run_with_log(
  ['bash', handle.tpu_delete_script],
  log_abs_path,
@@ -4425,13 +4478,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  to_provision = handle_before_refresh.launched_resources
  self.check_resources_fit_cluster(handle_before_refresh, task)
 
- logger.info(
- f'{colorama.Fore.CYAN}Creating a new cluster: {cluster_name!r} '
- f'[{task.num_nodes}x {to_provision}].'
- f'{colorama.Style.RESET_ALL}\n'
- 'Tip: to reuse an existing cluster, '
- 'specify --cluster (-c). '
- 'Run `sky status` to see existing clusters.')
  return RetryingVmProvisioner.ToProvisionConfig(
  cluster_name,
  to_provision,
@@ -4454,7 +4500,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  symlink_commands = []
  fore = colorama.Fore
  style = colorama.Style
- logger.info(f'{fore.CYAN}Processing file mounts.{style.RESET_ALL}')
  start = time.time()
  runners = handle.get_command_runners()
  log_path = os.path.join(self.log_dir, 'file_mounts.log')
@@ -4468,20 +4513,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  src_size = backend_utils.path_size_megabytes(full_src)
  if src_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
  logger.warning(
- f'{fore.YELLOW}The size of file mount src {src!r} '
+ f' {fore.YELLOW}The size of file mount src {src!r} '
  f'is {src_size} MB. Try to keep src small or use '
  '.skyignore to exclude large files, as large sizes '
  f'will slow down rsync. {style.RESET_ALL}')
  if os.path.islink(full_src):
  logger.warning(
- f'{fore.YELLOW}Source path {src!r} is a symlink. '
+ f' {fore.YELLOW}Source path {src!r} is a symlink. '
  f'Symlink contents are not uploaded.{style.RESET_ALL}')
 
  os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
  os.system(f'touch {log_path}')
- tail_cmd = f'tail -n100 -f {log_path}'
- logger.info('To view detailed progress: '
- f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
+
+ rich_utils.force_update_status(
+ ux_utils.spinner_message('Syncing file mounts', log_path))
 
  for dst, src in file_mounts.items():
  # TODO: room for improvement. Here there are many moving parts
@@ -4576,6 +4621,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  subprocess_utils.run_in_parallel(_symlink_node, runners)
  end = time.time()
  logger.debug(f'File mount sync took {end - start} seconds.')
+ logger.info(ux_utils.finishing_message('Files synced.', log_path))
 
  def _execute_storage_mounts(
  self, handle: CloudVmRayResourceHandle,
@@ -4599,16 +4645,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # Handle cases when there aren't any Storages with MOUNT mode.
  if not storage_mounts:
  return
-
- fore = colorama.Fore
- style = colorama.Style
- plural = 's' if len(storage_mounts) > 1 else ''
- logger.info(f'{fore.CYAN}Processing {len(storage_mounts)} '
- f'storage mount{plural}.{style.RESET_ALL}')
  start = time.time()
  runners = handle.get_command_runners()
  log_path = os.path.join(self.log_dir, 'storage_mounts.log')
 
+ plural = 's' if len(storage_mounts) > 1 else ''
+ rich_utils.force_update_status(
+ ux_utils.spinner_message(
+ f'Mounting {len(storage_mounts)} storage{plural}', log_path))
+
  for dst, storage_obj in storage_mounts.items():
  if not os.path.isabs(dst) and not dst.startswith('~/'):
  dst = f'{SKY_REMOTE_WORKDIR}/{dst}'
@@ -4662,6 +4707,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
  end = time.time()
  logger.debug(f'Storage mount sync took {end - start} seconds.')
+ logger.info(ux_utils.finishing_message('Storage mounted.', log_path))
 
  def _set_storage_mounts_metadata(
  self, cluster_name: str,