skypilot-nightly 1.0.0.dev20241012__py3-none-any.whl → 1.0.0.dev20241014__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/azure.py +3 -1
  3. sky/adaptors/common.py +6 -2
  4. sky/backends/backend.py +9 -4
  5. sky/backends/backend_utils.py +13 -16
  6. sky/backends/cloud_vm_ray_backend.py +207 -161
  7. sky/backends/local_docker_backend.py +3 -1
  8. sky/benchmark/benchmark_utils.py +5 -4
  9. sky/cli.py +36 -28
  10. sky/clouds/oci.py +17 -2
  11. sky/clouds/service_catalog/aws_catalog.py +6 -7
  12. sky/clouds/service_catalog/common.py +4 -3
  13. sky/clouds/service_catalog/cudo_catalog.py +11 -1
  14. sky/core.py +4 -2
  15. sky/data/storage.py +44 -32
  16. sky/data/storage_utils.py +8 -4
  17. sky/exceptions.py +5 -0
  18. sky/execution.py +10 -24
  19. sky/jobs/core.py +9 -7
  20. sky/jobs/utils.py +15 -10
  21. sky/optimizer.py +50 -37
  22. sky/provision/aws/config.py +15 -6
  23. sky/provision/azure/config.py +14 -3
  24. sky/provision/azure/instance.py +15 -9
  25. sky/provision/kubernetes/instance.py +3 -1
  26. sky/provision/provisioner.py +63 -74
  27. sky/serve/core.py +42 -40
  28. sky/sky_logging.py +9 -5
  29. sky/skylet/job_lib.py +15 -0
  30. sky/skylet/log_lib.py +5 -4
  31. sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
  32. sky/utils/command_runner.py +11 -11
  33. sky/utils/common_utils.py +2 -5
  34. sky/utils/controller_utils.py +78 -29
  35. sky/utils/env_options.py +22 -7
  36. sky/utils/log_utils.py +39 -24
  37. sky/utils/resources_utils.py +23 -0
  38. sky/utils/rich_utils.py +55 -5
  39. sky/utils/ux_utils.py +63 -4
  40. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/METADATA +1 -1
  41. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/RECORD +45 -45
  42. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/LICENSE +0 -0
  43. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/WHEEL +0 -0
  44. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/entry_points.txt +0 -0
  45. {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/top_level.txt +0 -0
@@ -14,6 +14,7 @@ from sky.backends import backend_utils
14
14
  from sky.backends import docker_utils
15
15
  from sky.data import storage as storage_lib
16
16
  from sky.utils import rich_utils
17
+ from sky.utils import ux_utils
17
18
 
18
19
  if typing.TYPE_CHECKING:
19
20
  from sky import resources
@@ -159,7 +160,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
159
160
  handle = LocalDockerResourceHandle(cluster_name)
160
161
  logger.info(f'Building docker image for task {task.name}. '
161
162
  'This might take some time.')
162
- with rich_utils.safe_status('[bold cyan]Building Docker image[/]'):
163
+ with rich_utils.safe_status(
164
+ ux_utils.spinner_message('Building Docker image')):
163
165
  image_tag, metadata = docker_utils.build_dockerimage_from_task(task)
164
166
  self.images[handle] = (image_tag, metadata)
165
167
  logger.info(f'Image {image_tag} built.')
@@ -595,7 +595,8 @@ def update_benchmark_state(benchmark: str) -> None:
595
595
  remote_dir = os.path.join(bucket_name, benchmark)
596
596
  local_dir = os.path.join(_SKY_LOCAL_BENCHMARK_DIR, benchmark)
597
597
  os.makedirs(local_dir, exist_ok=True)
598
- with rich_utils.safe_status('[bold cyan]Downloading benchmark logs[/]'):
598
+ with rich_utils.safe_status(
599
+ ux_utils.spinner_message('Downloading benchmark logs')):
599
600
  _download_remote_dir(remote_dir, local_dir, bucket_type)
600
601
 
601
602
  # Update the benchmark results in parallel.
@@ -604,9 +605,9 @@ def update_benchmark_state(benchmark: str) -> None:
604
605
  progress = rich_progress.Progress(transient=True,
605
606
  redirect_stdout=False,
606
607
  redirect_stderr=False)
607
- task = progress.add_task(
608
- f'[bold cyan]Processing {num_candidates} benchmark result{plural}[/]',
609
- total=num_candidates)
608
+ task = progress.add_task(ux_utils.spinner_message(
609
+ f'Processing {num_candidates} benchmark result{plural}'),
610
+ total=num_candidates)
610
611
 
611
612
  def _update_with_progress_bar(arg: Any) -> None:
612
613
  message = _update_benchmark_result(arg)
sky/cli.py CHANGED
@@ -1814,7 +1814,8 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
1814
1814
  if show_managed_jobs:
1815
1815
  click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1816
1816
  f'Managed jobs{colorama.Style.RESET_ALL}')
1817
- with rich_utils.safe_status('[cyan]Checking managed jobs[/]'):
1817
+ with rich_utils.safe_status(
1818
+ ux_utils.spinner_message('Checking managed jobs')):
1818
1819
  managed_jobs_query_interrupted, result = _try_get_future_result(
1819
1820
  managed_jobs_future)
1820
1821
  if managed_jobs_query_interrupted:
@@ -1855,7 +1856,8 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
1855
1856
  # The pool is terminated, so we cannot run the service query.
1856
1857
  msg = 'KeyboardInterrupt'
1857
1858
  else:
1858
- with rich_utils.safe_status('[cyan]Checking services[/]'):
1859
+ with rich_utils.safe_status(
1860
+ ux_utils.spinner_message('Checking services')):
1859
1861
  interrupted, result = _try_get_future_result(
1860
1862
  services_future)
1861
1863
  if interrupted:
@@ -2551,8 +2553,8 @@ def start(
2551
2553
  'is currently not supported.\n'
2552
2554
  'Please start the former independently.')
2553
2555
  if controllers:
2554
- bold = backend_utils.BOLD
2555
- reset_bold = backend_utils.RESET_BOLD
2556
+ bold = ux_utils.BOLD
2557
+ reset_bold = ux_utils.RESET_BOLD
2556
2558
  if len(controllers) != 1:
2557
2559
  raise click.UsageError(
2558
2560
  'Starting multiple controllers is currently not supported.\n'
@@ -2673,7 +2675,7 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str):
2673
2675
  assert controller is not None, controller_name
2674
2676
 
2675
2677
  with rich_utils.safe_status(
2676
- '[bold cyan]Checking for in-progress managed jobs[/]'):
2678
+ ux_utils.spinner_message('Checking for in-progress managed jobs')):
2677
2679
  try:
2678
2680
  managed_jobs_ = managed_jobs.queue(refresh=False,
2679
2681
  skip_finished=True)
@@ -2725,7 +2727,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
2725
2727
  """
2726
2728
  controller = controller_utils.Controllers.from_name(controller_name)
2727
2729
  assert controller is not None, controller_name
2728
- with rich_utils.safe_status('[bold cyan]Checking for live services[/]'):
2730
+ with rich_utils.safe_status(
2731
+ ux_utils.spinner_message('Checking for live services')):
2729
2732
  try:
2730
2733
  services = serve_lib.status()
2731
2734
  except exceptions.ClusterNotUpError as e:
@@ -2909,9 +2912,9 @@ def _down_or_stop_clusters(
2909
2912
  progress = rich_progress.Progress(transient=True,
2910
2913
  redirect_stdout=False,
2911
2914
  redirect_stderr=False)
2912
- task = progress.add_task(
2913
- f'[bold cyan]{operation} {len(clusters)} cluster{plural}[/]',
2914
- total=len(clusters))
2915
+ task = progress.add_task(ux_utils.spinner_message(
2916
+ f'{operation} {len(clusters)} cluster{plural}'),
2917
+ total=len(clusters))
2915
2918
 
2916
2919
  def _down_or_stop(name: str):
2917
2920
  success_progress = False
@@ -3680,7 +3683,7 @@ def jobs_launch(
3680
3683
  dag_utils.fill_default_config_in_dag_for_job_launch(dag)
3681
3684
 
3682
3685
  click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
3683
- fg='yellow')
3686
+ fg='cyan')
3684
3687
  dag = sky.optimize(dag)
3685
3688
 
3686
3689
  if not yes:
@@ -3774,7 +3777,8 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
3774
3777
 
3775
3778
  """
3776
3779
  click.secho('Fetching managed job statuses...', fg='yellow')
3777
- with rich_utils.safe_status('[cyan]Checking managed jobs[/]'):
3780
+ with rich_utils.safe_status(
3781
+ ux_utils.spinner_message('Checking managed jobs')):
3778
3782
  _, msg = _get_managed_jobs(refresh=refresh,
3779
3783
  skip_finished=skip_finished,
3780
3784
  show_all=all,
@@ -3825,10 +3829,12 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
3825
3829
  # Cancel managed jobs with IDs 1, 2, 3
3826
3830
  $ sky jobs cancel 1 2 3
3827
3831
  """
3828
- backend_utils.is_controller_accessible(
3829
- controller=controller_utils.Controllers.JOBS_CONTROLLER,
3830
- stopped_message='All managed jobs should have finished.',
3831
- exit_if_not_accessible=True)
3832
+ with rich_utils.safe_status(
3833
+ ux_utils.spinner_message('Checking managed jobs')):
3834
+ backend_utils.is_controller_accessible(
3835
+ controller=controller_utils.Controllers.JOBS_CONTROLLER,
3836
+ stopped_message='All managed jobs should have finished.',
3837
+ exit_if_not_accessible=True)
3832
3838
 
3833
3839
  job_id_str = ','.join(map(str, job_ids))
3834
3840
  if sum([len(job_ids) > 0, name is not None, all]) != 1:
@@ -4390,7 +4396,7 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
4390
4396
  sky serve status my-service
4391
4397
  """
4392
4398
  # This won't pollute the output of --endpoint.
4393
- with rich_utils.safe_status('[cyan]Checking services[/]'):
4399
+ with rich_utils.safe_status(ux_utils.spinner_message('Checking services')):
4394
4400
  _, msg = _get_services(service_names,
4395
4401
  show_all=all,
4396
4402
  show_endpoint=endpoint,
@@ -4814,11 +4820,11 @@ def benchmark_launch(
4814
4820
  f'\n{colorama.Fore.CYAN}Benchmark name: '
4815
4821
  f'{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}'
4816
4822
  '\nTo see the benchmark results: '
4817
- f'{backend_utils.BOLD}sky bench show '
4818
- f'{benchmark}{backend_utils.RESET_BOLD}'
4823
+ f'{ux_utils.BOLD}sky bench show '
4824
+ f'{benchmark}{ux_utils.RESET_BOLD}'
4819
4825
  '\nTo teardown the clusters: '
4820
- f'{backend_utils.BOLD}sky bench down '
4821
- f'{benchmark}{backend_utils.RESET_BOLD}')
4826
+ f'{ux_utils.BOLD}sky bench down '
4827
+ f'{benchmark}{ux_utils.RESET_BOLD}')
4822
4828
  subprocess_utils.run('sky bench ls')
4823
4829
  else:
4824
4830
  logger.error('No benchmarking clusters are created.')
@@ -5109,9 +5115,9 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
5109
5115
  progress = rich_progress.Progress(transient=True,
5110
5116
  redirect_stdout=False,
5111
5117
  redirect_stderr=False)
5112
- task = progress.add_task(
5113
- f'[bold cyan]Deleting {len(to_delete)} benchmark{plural}: ',
5114
- total=len(to_delete))
5118
+ task = progress.add_task(ux_utils.spinner_message(
5119
+ f'Deleting {len(to_delete)} benchmark{plural}'),
5120
+ total=len(to_delete))
5115
5121
 
5116
5122
  def _delete_benchmark(benchmark: str) -> None:
5117
5123
  clusters = benchmark_state.get_benchmark_clusters(benchmark)
@@ -5126,8 +5132,8 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
5126
5132
  message = (f'{colorama.Fore.YELLOW}Benchmark {benchmark} '
5127
5133
  f'has {num_clusters} un-terminated cluster{plural}. '
5128
5134
  f'Terminate the cluster{plural} with '
5129
- f'{backend_utils.BOLD} sky bench down {benchmark} '
5130
- f'{backend_utils.RESET_BOLD} '
5135
+ f'{ux_utils.BOLD} sky bench down {benchmark} '
5136
+ f'{ux_utils.RESET_BOLD} '
5131
5137
  'before deleting the benchmark report.')
5132
5138
  success = False
5133
5139
  else:
@@ -5228,7 +5234,7 @@ def _deploy_local_cluster(gpus: bool):
5228
5234
  f'Full log: {log_path}'
5229
5235
  f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
5230
5236
  # Run sky check
5231
- with rich_utils.safe_status('[bold cyan]Running sky check...'):
5237
+ with rich_utils.safe_status(ux_utils.spinner_message('Running sky check')):
5232
5238
  sky_check.check(clouds=['kubernetes'], quiet=True)
5233
5239
  if cluster_created:
5234
5240
  # Prepare completion message which shows CPU and GPU count
@@ -5425,7 +5431,8 @@ def local_down():
5425
5431
  'local_down.log')
5426
5432
  tail_cmd = 'tail -n100 -f ' + log_path
5427
5433
 
5428
- with rich_utils.safe_status('[bold cyan]Removing local cluster...'):
5434
+ with rich_utils.safe_status(
5435
+ ux_utils.spinner_message('Removing local cluster')):
5429
5436
  style = colorama.Style
5430
5437
  click.echo('To view detailed progress: '
5431
5438
  f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
@@ -5448,7 +5455,8 @@ def local_down():
5448
5455
  f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
5449
5456
  if cluster_removed:
5450
5457
  # Run sky check
5451
- with rich_utils.safe_status('[bold cyan]Running sky check...'):
5458
+ with rich_utils.safe_status(
5459
+ ux_utils.spinner_message('Running sky check')):
5452
5460
  sky_check.check(clouds=['kubernetes'], quiet=True)
5453
5461
  click.echo(
5454
5462
  f'{colorama.Fore.GREEN}Local cluster removed.{style.RESET_ALL}')
sky/clouds/oci.py CHANGED
@@ -4,6 +4,19 @@ History:
4
4
  - Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
5
5
  - Hysun He (hysun.he@oracle.com) @ May 4, 2023: Support use the default
6
6
  image_id (configurable) if no image_id specified in the task yaml.
7
+ - Hysun He (hysun.he@oracle.com) @ Oct 12, 2024:
8
+ get_credential_file_mounts(): bug fix for sky config
9
+ file path resolution (by os.path.expanduser) when constructing the file
10
+ mounts. This bug will cause the created worker nodes located in different
11
+ compartment and VCN than the head node if the user specifies compartment_id
12
+ in the sky config file, because the ~/.sky/config is not sync-ed to the
13
+ remote machine.
14
+ The workaround is to set the sky config file path using ENV before running
15
+ the sky launch: export SKYPILOT_CONFIG=/home/ubuntu/.sky/config.yaml
16
+ - Hysun He (hysun.he@oracle.com) @ Oct 12, 2024:
17
+ make_deploy_resources_variables(): Bug fix for specifying the image_id as
18
+ the ocid of the image in the task.yaml file, in this case the image_id
19
+ for the node config should be set to the ocid instead of a dict.
7
20
  """
8
21
  import json
9
22
  import logging
@@ -211,7 +224,9 @@ class OCI(clouds.Cloud):
211
224
  listing_id = image_cols[1]
212
225
  res_ver = image_cols[2]
213
226
  else:
214
- image_id = resources.image_id
227
+ # Oct.12,2024 by HysunHe: Bug fix - resources.image_id is a
228
+ # dict. The image_id here should be the ocid format.
229
+ image_id = image_str
215
230
  listing_id = None
216
231
  res_ver = None
217
232
 
@@ -447,7 +462,7 @@ class OCI(clouds.Cloud):
447
462
  credential_files = [oci_cfg_file, api_key_file]
448
463
 
449
464
  # Sky config file is optional
450
- if os.path.exists(sky_cfg_file):
465
+ if os.path.exists(os.path.expanduser(sky_cfg_file)):
451
466
  credential_files.append(sky_cfg_file)
452
467
 
453
468
  file_mounts = {
@@ -10,8 +10,6 @@ import threading
10
10
  import typing
11
11
  from typing import Dict, List, Optional, Tuple
12
12
 
13
- import colorama
14
-
15
13
  from sky import exceptions
16
14
  from sky import sky_logging
17
15
  from sky.adaptors import common as adaptors_common
@@ -21,6 +19,8 @@ from sky.clouds.service_catalog import config
21
19
  from sky.clouds.service_catalog.data_fetchers import fetch_aws
22
20
  from sky.utils import common_utils
23
21
  from sky.utils import resources_utils
22
+ from sky.utils import rich_utils
23
+ from sky.utils import ux_utils
24
24
 
25
25
  if typing.TYPE_CHECKING:
26
26
  import pandas as pd
@@ -82,11 +82,10 @@ def _get_az_mappings(aws_user_hash: str) -> Optional['pd.DataFrame']:
82
82
  az_mappings = None
83
83
  if aws_user_hash != 'default':
84
84
  # Fetch az mapping from AWS.
85
- print(
86
- f'\r{colorama.Style.DIM}AWS: Fetching availability zones '
87
- f'mapping...{colorama.Style.RESET_ALL}',
88
- end='')
89
- az_mappings = fetch_aws.fetch_availability_zone_mappings()
85
+ with rich_utils.safe_status(
86
+ ux_utils.spinner_message('AWS: Fetching availability '
87
+ 'zones mapping')):
88
+ az_mappings = fetch_aws.fetch_availability_zone_mappings()
90
89
  else:
91
90
  return None
92
91
  az_mappings.to_csv(az_mapping_path, index=False)
@@ -198,9 +198,10 @@ def read_catalog(filename: str,
198
198
  if pull_frequency_hours is not None:
199
199
  update_frequency_str = (
200
200
  f' (every {pull_frequency_hours} hours)')
201
- with rich_utils.safe_status((f'Updating {cloud} catalog: '
202
- f'{filename}'
203
- f'{update_frequency_str}')):
201
+ with rich_utils.safe_status(
202
+ ux_utils.spinner_message(
203
+ f'Updating {cloud} catalog: {filename}') +
204
+ f'{update_frequency_str}'):
204
205
  try:
205
206
  r = requests.get(url)
206
207
  r.raise_for_status()
@@ -14,6 +14,9 @@ _PULL_FREQUENCY_HOURS = 1
14
14
  _df = common.read_catalog(cudo_mt.VMS_CSV,
15
15
  pull_frequency_hours=_PULL_FREQUENCY_HOURS)
16
16
 
17
+ _DEFAULT_NUM_VCPUS = 8
18
+ _DEFAULT_MEMORY_CPU_RATIO = 2
19
+
17
20
 
18
21
  def instance_type_exists(instance_type: str) -> bool:
19
22
  return common.instance_type_exists_impl(_df, instance_type)
@@ -52,7 +55,14 @@ def get_default_instance_type(cpus: Optional[str] = None,
52
55
  del disk_tier
53
56
  # NOTE: After expanding catalog to multiple entries, you may
54
57
  # want to specify a default instance type or family.
55
- return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
58
+ if cpus is None and memory is None:
59
+ cpus = f'{_DEFAULT_NUM_VCPUS}+'
60
+
61
+ memory_gb_or_ratio = memory
62
+ if memory is None:
63
+ memory_gb_or_ratio = f'{_DEFAULT_MEMORY_CPU_RATIO}x'
64
+ return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
65
+ memory_gb_or_ratio)
56
66
 
57
67
 
58
68
  def get_accelerators_from_instance_type(
sky/core.py CHANGED
@@ -21,6 +21,7 @@ from sky.usage import usage_lib
21
21
  from sky.utils import controller_utils
22
22
  from sky.utils import rich_utils
23
23
  from sky.utils import subprocess_utils
24
+ from sky.utils import ux_utils
24
25
 
25
26
  if typing.TYPE_CHECKING:
26
27
  from sky import resources as resources_lib
@@ -127,8 +128,9 @@ def endpoints(cluster: str,
127
128
  RuntimeError: if the cluster has no ports to be exposed or no endpoints
128
129
  are exposed yet.
129
130
  """
130
- with rich_utils.safe_status('[bold cyan]Fetching endpoints for cluster '
131
- f'{cluster}...[/]'):
131
+ with rich_utils.safe_status(
132
+ ux_utils.spinner_message(
133
+ f'Fetching endpoints for cluster {cluster}')):
132
134
  return backend_utils.get_endpoints(cluster=cluster, port=port)
133
135
 
134
136
 
sky/data/storage.py CHANGED
@@ -1317,8 +1317,8 @@ class S3Store(AbstractStore):
1317
1317
  source_message = source_path_list[0]
1318
1318
 
1319
1319
  with rich_utils.safe_status(
1320
- f'[bold cyan]Syncing '
1321
- f'[green]{source_message}[/] to [green]s3://{self.name}/[/]'):
1320
+ ux_utils.spinner_message(f'Syncing {source_message} -> '
1321
+ f's3://{self.name}/')):
1322
1322
  data_utils.parallel_upload(
1323
1323
  source_path_list,
1324
1324
  get_file_sync_command,
@@ -1445,7 +1445,8 @@ class S3Store(AbstractStore):
1445
1445
  }
1446
1446
  s3_client.create_bucket(**create_bucket_config)
1447
1447
  logger.info(
1448
- f'Created S3 bucket {bucket_name!r} in {region or "us-east-1"}')
1448
+ f' {colorama.Style.DIM}Created S3 bucket {bucket_name!r} in '
1449
+ f'{region or "us-east-1"}{colorama.Style.RESET_ALL}')
1449
1450
 
1450
1451
  # Add AWS tags configured in config.yaml to the bucket.
1451
1452
  # This is useful for cost tracking and external cleanup.
@@ -1486,7 +1487,8 @@ class S3Store(AbstractStore):
1486
1487
  remove_command = f'aws s3 rb s3://{bucket_name} --force'
1487
1488
  try:
1488
1489
  with rich_utils.safe_status(
1489
- f'[bold cyan]Deleting S3 bucket {bucket_name}[/]'):
1490
+ ux_utils.spinner_message(
1491
+ f'Deleting S3 bucket [green]{bucket_name}')):
1490
1492
  subprocess.check_output(remove_command.split(' '),
1491
1493
  stderr=subprocess.STDOUT)
1492
1494
  except subprocess.CalledProcessError as e:
@@ -1726,8 +1728,8 @@ class GcsStore(AbstractStore):
1726
1728
  f'cp -e -n -r -I gs://{self.name}')
1727
1729
 
1728
1730
  with rich_utils.safe_status(
1729
- f'[bold cyan]Syncing '
1730
- f'[green]{source_message}[/] to [green]gs://{self.name}/[/]'):
1731
+ ux_utils.spinner_message(f'Syncing {source_message} -> '
1732
+ f'gs://{self.name}/')):
1731
1733
  data_utils.run_upload_cli(sync_command,
1732
1734
  self._ACCESS_DENIED_MESSAGE,
1733
1735
  bucket_name=self.name)
@@ -1781,8 +1783,8 @@ class GcsStore(AbstractStore):
1781
1783
  source_message = source_path_list[0]
1782
1784
 
1783
1785
  with rich_utils.safe_status(
1784
- f'[bold cyan]Syncing '
1785
- f'[green]{source_message}[/] to [green]gs://{self.name}/[/]'):
1786
+ ux_utils.spinner_message(f'Syncing {source_message} -> '
1787
+ f'gs://{self.name}/')):
1786
1788
  data_utils.parallel_upload(
1787
1789
  source_path_list,
1788
1790
  get_file_sync_command,
@@ -1904,8 +1906,9 @@ class GcsStore(AbstractStore):
1904
1906
  f'Attempted to create a bucket {self.name} but failed.'
1905
1907
  ) from e
1906
1908
  logger.info(
1907
- f'Created GCS bucket {new_bucket.name} in {new_bucket.location} '
1908
- f'with storage class {new_bucket.storage_class}')
1909
+ f' {colorama.Style.DIM}Created GCS bucket {new_bucket.name!r} in '
1910
+ f'{new_bucket.location} with storage class '
1911
+ f'{new_bucket.storage_class}{colorama.Style.RESET_ALL}')
1909
1912
  return new_bucket
1910
1913
 
1911
1914
  def _delete_gcs_bucket(self, bucket_name: str) -> bool:
@@ -1919,7 +1922,8 @@ class GcsStore(AbstractStore):
1919
1922
  """
1920
1923
 
1921
1924
  with rich_utils.safe_status(
1922
- f'[bold cyan]Deleting GCS bucket {bucket_name}[/]'):
1925
+ ux_utils.spinner_message(
1926
+ f'Deleting GCS bucket [green]{bucket_name}')):
1923
1927
  try:
1924
1928
  self.client.get_bucket(bucket_name)
1925
1929
  except gcp.forbidden_exception() as e:
@@ -2306,11 +2310,12 @@ class AzureBlobStore(AbstractStore):
2306
2310
  resource_group_name)
2307
2311
  except azure.exceptions().ResourceNotFoundError:
2308
2312
  with rich_utils.safe_status(
2309
- '[bold cyan]Setting up resource group: '
2310
- f'{resource_group_name}'):
2313
+ ux_utils.spinner_message(
2314
+ f'Setting up resource group: '
2315
+ f'{resource_group_name}')):
2311
2316
  self.resource_client.resource_groups.create_or_update(
2312
2317
  resource_group_name, {'location': self.region})
2313
- logger.info('Created Azure resource group '
2318
+ logger.info(' Created Azure resource group '
2314
2319
  f'{resource_group_name!r}.')
2315
2320
  # check if the storage account name already exists under the
2316
2321
  # given resource group name.
@@ -2319,13 +2324,14 @@ class AzureBlobStore(AbstractStore):
2319
2324
  resource_group_name, storage_account_name)
2320
2325
  except azure.exceptions().ResourceNotFoundError:
2321
2326
  with rich_utils.safe_status(
2322
- '[bold cyan]Setting up storage account: '
2323
- f'{storage_account_name}'):
2327
+ ux_utils.spinner_message(
2328
+ f'Setting up storage account: '
2329
+ f'{storage_account_name}')):
2324
2330
  self._create_storage_account(resource_group_name,
2325
2331
  storage_account_name)
2326
2332
  # wait until new resource creation propagates to Azure.
2327
2333
  time.sleep(1)
2328
- logger.info('Created Azure storage account '
2334
+ logger.info(' Created Azure storage account '
2329
2335
  f'{storage_account_name!r}.')
2330
2336
 
2331
2337
  return storage_account_name, resource_group_name
@@ -2514,9 +2520,9 @@ class AzureBlobStore(AbstractStore):
2514
2520
  container_endpoint = data_utils.AZURE_CONTAINER_URL.format(
2515
2521
  storage_account_name=self.storage_account_name,
2516
2522
  container_name=self.name)
2517
- with rich_utils.safe_status(f'[bold cyan]Syncing '
2518
- f'[green]{source_message}[/] to '
2519
- f'[green]{container_endpoint}/[/]'):
2523
+ with rich_utils.safe_status(
2524
+ ux_utils.spinner_message(
2525
+ f'Syncing {source_message} -> {container_endpoint}/')):
2520
2526
  data_utils.parallel_upload(
2521
2527
  source_path_list,
2522
2528
  get_file_sync_command,
@@ -2665,9 +2671,10 @@ class AzureBlobStore(AbstractStore):
2665
2671
  self.storage_account_name,
2666
2672
  container_name,
2667
2673
  blob_container={})
2668
- logger.info('Created AZ Container '
2674
+ logger.info(f' {colorama.Style.DIM}Created AZ Container '
2669
2675
  f'{container_name!r} in {self.region!r} under storage '
2670
- f'account {self.storage_account_name!r}.')
2676
+ f'account {self.storage_account_name!r}.'
2677
+ f'{colorama.Style.RESET_ALL}')
2671
2678
  except azure.exceptions().ResourceExistsError as e:
2672
2679
  if 'container is being deleted' in e.error.message:
2673
2680
  with ux_utils.print_exception_no_traceback():
@@ -2700,7 +2707,8 @@ class AzureBlobStore(AbstractStore):
2700
2707
  """
2701
2708
  try:
2702
2709
  with rich_utils.safe_status(
2703
- f'[bold cyan]Deleting Azure container {container_name}[/]'):
2710
+ ux_utils.spinner_message(
2711
+ f'Deleting Azure container {container_name}')):
2704
2712
  # Check for the existence of the container before deletion.
2705
2713
  self.storage_client.blob_containers.get(
2706
2714
  self.resource_group_name,
@@ -2916,8 +2924,8 @@ class R2Store(AbstractStore):
2916
2924
  source_message = source_path_list[0]
2917
2925
 
2918
2926
  with rich_utils.safe_status(
2919
- f'[bold cyan]Syncing '
2920
- f'[green]{source_message}[/] to [green]r2://{self.name}/[/]'):
2927
+ ux_utils.spinner_message(
2928
+ f'Syncing {source_message} -> r2://{self.name}/')):
2921
2929
  data_utils.parallel_upload(
2922
2930
  source_path_list,
2923
2931
  get_file_sync_command,
@@ -3055,7 +3063,9 @@ class R2Store(AbstractStore):
3055
3063
  location = {'LocationConstraint': region}
3056
3064
  r2_client.create_bucket(Bucket=bucket_name,
3057
3065
  CreateBucketConfiguration=location)
3058
- logger.info(f'Created R2 bucket {bucket_name} in {region}')
3066
+ logger.info(f' {colorama.Style.DIM}Created R2 bucket '
3067
+ f'{bucket_name!r} in {region}'
3068
+ f'{colorama.Style.RESET_ALL}')
3059
3069
  except aws.botocore_exceptions().ClientError as e:
3060
3070
  with ux_utils.print_exception_no_traceback():
3061
3071
  raise exceptions.StorageBucketCreateError(
@@ -3087,7 +3097,8 @@ class R2Store(AbstractStore):
3087
3097
  f'--profile={cloudflare.R2_PROFILE_NAME}')
3088
3098
  try:
3089
3099
  with rich_utils.safe_status(
3090
- f'[bold cyan]Deleting R2 bucket {bucket_name}[/]'):
3100
+ ux_utils.spinner_message(
3101
+ f'Deleting R2 bucket {bucket_name}')):
3091
3102
  subprocess.check_output(remove_command,
3092
3103
  stderr=subprocess.STDOUT,
3093
3104
  shell=True)
@@ -3354,9 +3365,8 @@ class IBMCosStore(AbstractStore):
3354
3365
  source_message = source_path_list[0]
3355
3366
 
3356
3367
  with rich_utils.safe_status(
3357
- f'[bold cyan]Syncing '
3358
- f'[green]{source_message}[/] to '
3359
- f'[green]cos://{self.region}/{self.name}/[/]'):
3368
+ ux_utils.spinner_message(f'Syncing {source_message} -> '
3369
+ f'cos://{self.region}/{self.name}/')):
3360
3370
  data_utils.parallel_upload(
3361
3371
  source_path_list,
3362
3372
  get_file_sync_command,
@@ -3490,8 +3500,10 @@ class IBMCosStore(AbstractStore):
3490
3500
  CreateBucketConfiguration={
3491
3501
  'LocationConstraint': f'{region}-smart'
3492
3502
  })
3493
- logger.info(f'Created IBM COS bucket {bucket_name} in {region} '
3494
- f'with storage class smart tier')
3503
+ logger.info(f' {colorama.Style.DIM}Created IBM COS bucket '
3504
+ f'{bucket_name!r} in {region} '
3505
+ 'with storage class smart tier'
3506
+ f'{colorama.Style.RESET_ALL}')
3495
3507
  self.bucket = self.s3_resource.Bucket(bucket_name)
3496
3508
 
3497
3509
  except ibm.ibm_botocore.exceptions.ClientError as e: # type: ignore[union-attr] # pylint: disable=line-too-long
sky/data/storage_utils.py CHANGED
@@ -213,9 +213,13 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
213
213
  skyignore_path = os.path.join(expand_src_dir_path,
214
214
  constants.SKY_IGNORE_FILE)
215
215
  if os.path.exists(skyignore_path):
216
- logger.info(f'Exclude files to sync to cluster based on '
217
- f'{constants.SKY_IGNORE_FILE}.')
216
+ logger.info(f' {colorama.Style.DIM}'
217
+ f'Excluded files to sync to cluster based on '
218
+ f'{constants.SKY_IGNORE_FILE}.'
219
+ f'{colorama.Style.RESET_ALL}')
218
220
  return get_excluded_files_from_skyignore(src_dir_path)
219
- logger.info(f'Exclude files to sync to cluster based on '
220
- f'{constants.GIT_IGNORE_FILE}.')
221
+ logger.info(f' {colorama.Style.DIM}'
222
+ f'Excluded files to sync to cluster based on '
223
+ f'{constants.GIT_IGNORE_FILE}.'
224
+ f'{colorama.Style.RESET_ALL}')
221
225
  return get_excluded_files_from_gitignore(src_dir_path)
sky/exceptions.py CHANGED
@@ -291,3 +291,8 @@ class PortDoesNotExistError(Exception):
291
291
  class UserRequestRejectedByPolicy(Exception):
292
292
  """Raised when a user request is rejected by an admin policy."""
293
293
  pass
294
+
295
+
296
+ class NoClusterLaunchedError(Exception):
297
+ """No cluster launched, so cleanup can be skipped during failover."""
298
+ pass
sky/execution.py CHANGED
@@ -3,7 +3,6 @@
3
3
  See `Stage` for a Task's life cycle.
4
4
  """
5
5
  import enum
6
- import os
7
6
  from typing import List, Optional, Tuple, Union
8
7
 
9
8
  import colorama
@@ -20,10 +19,8 @@ from sky.usage import usage_lib
20
19
  from sky.utils import admin_policy_utils
21
20
  from sky.utils import controller_utils
22
21
  from sky.utils import dag_utils
23
- from sky.utils import env_options
24
22
  from sky.utils import resources_utils
25
23
  from sky.utils import rich_utils
26
- from sky.utils import subprocess_utils
27
24
  from sky.utils import timeline
28
25
  from sky.utils import ux_utils
29
26
 
@@ -293,11 +290,17 @@ def _execute(
293
290
  logger.info('Dryrun finished.')
294
291
  return None, None
295
292
 
296
- if Stage.SYNC_WORKDIR in stages and not dryrun:
297
- if task.workdir is not None:
298
- backend.sync_workdir(handle, task.workdir)
293
+ do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
294
+ task.workdir is not None)
295
+ do_file_mounts = (Stage.SYNC_FILE_MOUNTS in stages and not dryrun and
296
+ task.file_mounts is not None)
297
+ if do_workdir or do_file_mounts:
298
+ logger.info(ux_utils.starting_message('Mounting files.'))
299
299
 
300
- if Stage.SYNC_FILE_MOUNTS in stages and not dryrun:
300
+ if do_workdir:
301
+ backend.sync_workdir(handle, task.workdir)
302
+
303
+ if do_file_mounts:
301
304
  backend.sync_file_mounts(handle, task.file_mounts,
302
305
  task.storage_mounts)
303
306
 
@@ -330,23 +333,6 @@ def _execute(
330
333
  backend.teardown_ephemeral_storage(task)
331
334
  backend.teardown(handle, terminate=True)
332
335
  finally:
333
- controller = controller_utils.Controllers.from_name(cluster_name)
334
- if controller is None and not _is_launched_by_sky_serve_controller:
335
- # UX: print live clusters to make users aware (to save costs).
336
- #
337
- # Don't print if this job is launched by the jobs controller,
338
- # because managed jobs are serverless, there can be many of them,
339
- # and users tend to continuously monitor managed jobs using `sky
340
- # job queue`. Also don't print if this job is a skyserve controller
341
- # job or launched by a skyserve controller job, because the
342
- # redirect for this subprocess.run won't success and it will
343
- # pollute the controller logs.
344
- #
345
- # Disable the usage collection for this status command.
346
- env = dict(os.environ,
347
- **{env_options.Options.DISABLE_LOGGING.value: '1'})
348
- subprocess_utils.run(
349
- 'sky status --no-show-managed-jobs --no-show-services', env=env)
350
336
  print()
351
337
  print('\x1b[?25h', end='') # Show cursor.
352
338
  return job_id, handle