skypilot-nightly 1.0.0.dev20241012__py3-none-any.whl → 1.0.0.dev20241013__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -1
- sky/adaptors/common.py +6 -2
- sky/backends/backend.py +9 -4
- sky/backends/backend_utils.py +13 -16
- sky/backends/cloud_vm_ray_backend.py +207 -161
- sky/backends/local_docker_backend.py +3 -1
- sky/benchmark/benchmark_utils.py +5 -4
- sky/cli.py +36 -28
- sky/clouds/service_catalog/aws_catalog.py +6 -7
- sky/clouds/service_catalog/common.py +4 -3
- sky/clouds/service_catalog/cudo_catalog.py +11 -1
- sky/core.py +4 -2
- sky/data/storage.py +44 -32
- sky/data/storage_utils.py +8 -4
- sky/exceptions.py +5 -0
- sky/execution.py +10 -24
- sky/jobs/core.py +9 -7
- sky/jobs/utils.py +15 -10
- sky/optimizer.py +50 -37
- sky/provision/aws/config.py +15 -6
- sky/provision/azure/config.py +14 -3
- sky/provision/azure/instance.py +15 -9
- sky/provision/kubernetes/instance.py +3 -1
- sky/provision/provisioner.py +63 -74
- sky/serve/core.py +42 -40
- sky/sky_logging.py +9 -5
- sky/skylet/log_lib.py +5 -4
- sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
- sky/utils/command_runner.py +11 -11
- sky/utils/common_utils.py +2 -5
- sky/utils/controller_utils.py +78 -29
- sky/utils/env_options.py +22 -7
- sky/utils/log_utils.py +39 -24
- sky/utils/resources_utils.py +23 -0
- sky/utils/rich_utils.py +55 -5
- sky/utils/ux_utils.py +63 -4
- {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/RECORD +43 -43
- {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/top_level.txt +0 -0
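Most of the changes below follow a single UX refactor: spinner text is now built with ux_utils.spinner_message() and passed to rich_utils.safe_status(), replacing hand-formatted rich-markup strings. A minimal sketch of the new pattern, assuming a skypilot-nightly install where both helpers exist (as the hunks below show):

    from sky.utils import rich_utils
    from sky.utils import ux_utils

    def check_jobs() -> None:
        # spinner_message() applies the standardized spinner styling;
        # safe_status() is a spinner context intended to degrade gracefully
        # when stdout is not an interactive console.
        with rich_utils.safe_status(
                ux_utils.spinner_message('Checking managed jobs')):
            pass  # long-running work would run here, under the spinner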
sky/backends/local_docker_backend.py
CHANGED
@@ -14,6 +14,7 @@ from sky.backends import backend_utils
 from sky.backends import docker_utils
 from sky.data import storage as storage_lib
 from sky.utils import rich_utils
+from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     from sky import resources
@@ -159,7 +160,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
         handle = LocalDockerResourceHandle(cluster_name)
         logger.info(f'Building docker image for task {task.name}. '
                     'This might take some time.')
-        with rich_utils.safe_status(…):
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Building Docker image')):
             image_tag, metadata = docker_utils.build_dockerimage_from_task(task)
             self.images[handle] = (image_tag, metadata)
             logger.info(f'Image {image_tag} built.')
sky/benchmark/benchmark_utils.py
CHANGED
@@ -595,7 +595,8 @@ def update_benchmark_state(benchmark: str) -> None:
     remote_dir = os.path.join(bucket_name, benchmark)
     local_dir = os.path.join(_SKY_LOCAL_BENCHMARK_DIR, benchmark)
     os.makedirs(local_dir, exist_ok=True)
-    with rich_utils.safe_status(…):
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Downloading benchmark logs')):
         _download_remote_dir(remote_dir, local_dir, bucket_type)
 
     # Update the benchmark results in parallel.
@@ -604,9 +605,9 @@ def update_benchmark_state(benchmark: str) -> None:
     progress = rich_progress.Progress(transient=True,
                                       redirect_stdout=False,
                                       redirect_stderr=False)
-    task = progress.add_task(
-        f'…
-        …
+    task = progress.add_task(ux_utils.spinner_message(
+        f'Processing {num_candidates} benchmark result{plural}'),
+                             total=num_candidates)
 
     def _update_with_progress_bar(arg: Any) -> None:
         message = _update_benchmark_result(arg)
sky/cli.py
CHANGED
@@ -1814,7 +1814,8 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
     if show_managed_jobs:
         click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                    f'Managed jobs{colorama.Style.RESET_ALL}')
-        with rich_utils.safe_status(…):
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Checking managed jobs')):
             managed_jobs_query_interrupted, result = _try_get_future_result(
                 managed_jobs_future)
             if managed_jobs_query_interrupted:
@@ -1855,7 +1856,8 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
             # The pool is terminated, so we cannot run the service query.
             msg = 'KeyboardInterrupt'
         else:
-            with rich_utils.safe_status(…):
+            with rich_utils.safe_status(
+                    ux_utils.spinner_message('Checking services')):
                 interrupted, result = _try_get_future_result(
                     services_future)
             if interrupted:
@@ -2551,8 +2553,8 @@ def start(
             'is currently not supported.\n'
             'Please start the former independently.')
     if controllers:
-        bold = …
-        reset_bold = …
+        bold = ux_utils.BOLD
+        reset_bold = ux_utils.RESET_BOLD
         if len(controllers) != 1:
             raise click.UsageError(
                 'Starting multiple controllers is currently not supported.\n'
@@ -2673,7 +2675,7 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str):
     assert controller is not None, controller_name
 
     with rich_utils.safe_status(
-            '…
+            ux_utils.spinner_message('Checking for in-progress managed jobs')):
         try:
             managed_jobs_ = managed_jobs.queue(refresh=False,
                                                skip_finished=True)
@@ -2725,7 +2727,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
     """
     controller = controller_utils.Controllers.from_name(controller_name)
     assert controller is not None, controller_name
-    with rich_utils.safe_status(…):
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Checking for live services')):
         try:
             services = serve_lib.status()
         except exceptions.ClusterNotUpError as e:
@@ -2909,9 +2912,9 @@ def _down_or_stop_clusters(
    progress = rich_progress.Progress(transient=True,
                                      redirect_stdout=False,
                                      redirect_stderr=False)
-    task = progress.add_task(
-        f'…
-        …
+    task = progress.add_task(ux_utils.spinner_message(
+        f'{operation} {len(clusters)} cluster{plural}'),
+                             total=len(clusters))
 
    def _down_or_stop(name: str):
        success_progress = False
@@ -3680,7 +3683,7 @@ def jobs_launch(
    dag_utils.fill_default_config_in_dag_for_job_launch(dag)
 
    click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
-               fg='…
+               fg='cyan')
    dag = sky.optimize(dag)
 
    if not yes:
@@ -3774,7 +3777,8 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
 
    """
    click.secho('Fetching managed job statuses...', fg='yellow')
-    with rich_utils.safe_status(…):
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Checking managed jobs')):
        _, msg = _get_managed_jobs(refresh=refresh,
                                   skip_finished=skip_finished,
                                   show_all=all,
@@ -3825,10 +3829,12 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
    # Cancel managed jobs with IDs 1, 2, 3
    $ sky jobs cancel 1 2 3
    """
-    …
-    …
-    …
-    …
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Checking managed jobs')):
+        backend_utils.is_controller_accessible(
+            controller=controller_utils.Controllers.JOBS_CONTROLLER,
+            stopped_message='All managed jobs should have finished.',
+            exit_if_not_accessible=True)
 
    job_id_str = ','.join(map(str, job_ids))
    if sum([len(job_ids) > 0, name is not None, all]) != 1:
@@ -4390,7 +4396,7 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
    sky serve status my-service
    """
    # This won't pollute the output of --endpoint.
-    with rich_utils.safe_status('…
+    with rich_utils.safe_status(ux_utils.spinner_message('Checking services')):
        _, msg = _get_services(service_names,
                               show_all=all,
                               show_endpoint=endpoint,
@@ -4814,11 +4820,11 @@ def benchmark_launch(
                    f'\n{colorama.Fore.CYAN}Benchmark name: '
                    f'{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}'
                    '\nTo see the benchmark results: '
-                   f'{…
-                   f'{benchmark}{…
+                   f'{ux_utils.BOLD}sky bench show '
+                   f'{benchmark}{ux_utils.RESET_BOLD}'
                    '\nTo teardown the clusters: '
-                   f'{…
-                   f'{benchmark}{…
+                   f'{ux_utils.BOLD}sky bench down '
+                   f'{benchmark}{ux_utils.RESET_BOLD}')
        subprocess_utils.run('sky bench ls')
    else:
        logger.error('No benchmarking clusters are created.')
@@ -5109,9 +5115,9 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
    progress = rich_progress.Progress(transient=True,
                                      redirect_stdout=False,
                                      redirect_stderr=False)
-    task = progress.add_task(
-        f'…
-        …
+    task = progress.add_task(ux_utils.spinner_message(
+        f'Deleting {len(to_delete)} benchmark{plural}'),
+                             total=len(to_delete))
 
    def _delete_benchmark(benchmark: str) -> None:
        clusters = benchmark_state.get_benchmark_clusters(benchmark)
@@ -5126,8 +5132,8 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
            message = (f'{colorama.Fore.YELLOW}Benchmark {benchmark} '
                       f'has {num_clusters} un-terminated cluster{plural}. '
                       f'Terminate the cluster{plural} with '
-                      f'{…
-                      f'{…
+                      f'{ux_utils.BOLD} sky bench down {benchmark} '
+                      f'{ux_utils.RESET_BOLD} '
                       'before deleting the benchmark report.')
            success = False
        else:
@@ -5228,7 +5234,7 @@ def _deploy_local_cluster(gpus: bool):
                f'Full log: {log_path}'
                f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
    # Run sky check
-    with rich_utils.safe_status('…
+    with rich_utils.safe_status(ux_utils.spinner_message('Running sky check')):
        sky_check.check(clouds=['kubernetes'], quiet=True)
    if cluster_created:
        # Prepare completion message which shows CPU and GPU count
@@ -5425,7 +5431,8 @@ def local_down():
                            'local_down.log')
    tail_cmd = 'tail -n100 -f ' + log_path
 
-    with rich_utils.safe_status(…):
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Removing local cluster')):
        style = colorama.Style
        click.echo('To view detailed progress: '
                   f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
@@ -5448,7 +5455,8 @@ def local_down():
                     f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
    if cluster_removed:
        # Run sky check
-        with rich_utils.safe_status(…):
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Running sky check')):
            sky_check.check(clouds=['kubernetes'], quiet=True)
        click.echo(
            f'{colorama.Fore.GREEN}Local cluster removed.{style.RESET_ALL}')
sky/clouds/service_catalog/aws_catalog.py
CHANGED
@@ -10,8 +10,6 @@ import threading
 import typing
 from typing import Dict, List, Optional, Tuple
 
-import colorama
-
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
@@ -21,6 +19,8 @@ from sky.clouds.service_catalog import config
 from sky.clouds.service_catalog.data_fetchers import fetch_aws
 from sky.utils import common_utils
 from sky.utils import resources_utils
+from sky.utils import rich_utils
+from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import pandas as pd
@@ -82,11 +82,10 @@ def _get_az_mappings(aws_user_hash: str) -> Optional['pd.DataFrame']:
     az_mappings = None
     if aws_user_hash != 'default':
         # Fetch az mapping from AWS.
-        …
-        …
-        …
-        …
-        az_mappings = fetch_aws.fetch_availability_zone_mappings()
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('AWS: Fetching availability '
+                                         'zones mapping')):
+            az_mappings = fetch_aws.fetch_availability_zone_mappings()
     else:
         return None
     az_mappings.to_csv(az_mapping_path, index=False)
sky/clouds/service_catalog/common.py
CHANGED
@@ -198,9 +198,10 @@ def read_catalog(filename: str,
             if pull_frequency_hours is not None:
                 update_frequency_str = (
                     f' (every {pull_frequency_hours} hours)')
-            with rich_utils.safe_status(
-                    …
-                    …
+            with rich_utils.safe_status(
+                    ux_utils.spinner_message(
+                        f'Updating {cloud} catalog: {filename}') +
+                    f'{update_frequency_str}'):
                 try:
                     r = requests.get(url)
                     r.raise_for_status()
sky/clouds/service_catalog/cudo_catalog.py
CHANGED
@@ -14,6 +14,9 @@ _PULL_FREQUENCY_HOURS = 1
 _df = common.read_catalog(cudo_mt.VMS_CSV,
                           pull_frequency_hours=_PULL_FREQUENCY_HOURS)
 
+_DEFAULT_NUM_VCPUS = 8
+_DEFAULT_MEMORY_CPU_RATIO = 2
+
 
 def instance_type_exists(instance_type: str) -> bool:
     return common.instance_type_exists_impl(_df, instance_type)
@@ -52,7 +55,14 @@ def get_default_instance_type(cpus: Optional[str] = None,
     del disk_tier
     # NOTE: After expanding catalog to multiple entries, you may
     # want to specify a default instance type or family.
-    …
+    if cpus is None and memory is None:
+        cpus = f'{_DEFAULT_NUM_VCPUS}+'
+
+    memory_gb_or_ratio = memory
+    if memory is None:
+        memory_gb_or_ratio = f'{_DEFAULT_MEMORY_CPU_RATIO}x'
+    return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
+                                                      memory_gb_or_ratio)
 
 
 def get_accelerators_from_instance_type(
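A note on the Cudo defaults introduced above: a cpus string like '8+' requests at least that many vCPUs, and a memory string ending in 'x' is read as a ratio (memory in GB = ratio × vCPU count) rather than an absolute size. A standalone sketch of the same resolution step, using a hypothetical resolve_defaults() helper (the real lookup is done by common.get_instance_type_for_cpus_mem_impl):

    from typing import Optional, Tuple

    _DEFAULT_NUM_VCPUS = 8
    _DEFAULT_MEMORY_CPU_RATIO = 2

    def resolve_defaults(cpus: Optional[str],
                         memory: Optional[str]) -> Tuple[Optional[str], str]:
        # Neither requested: default to 'at least 8 vCPUs'.
        if cpus is None and memory is None:
            cpus = f'{_DEFAULT_NUM_VCPUS}+'
        # No explicit memory: use a 2x memory-per-vCPU ratio.
        memory_gb_or_ratio = memory
        if memory is None:
            memory_gb_or_ratio = f'{_DEFAULT_MEMORY_CPU_RATIO}x'
        return cpus, memory_gb_or_ratio

    assert resolve_defaults(None, None) == ('8+', '2x')
    assert resolve_defaults('4', '16') == ('4', '16')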
sky/core.py
CHANGED
@@ -21,6 +21,7 @@ from sky.usage import usage_lib
 from sky.utils import controller_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
+from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     from sky import resources as resources_lib
@@ -127,8 +128,9 @@ def endpoints(cluster: str,
         RuntimeError: if the cluster has no ports to be exposed or no endpoints
             are exposed yet.
     """
-    with rich_utils.safe_status(
-            …
+    with rich_utils.safe_status(
+            ux_utils.spinner_message(
+                f'Fetching endpoints for cluster {cluster}')):
         return backend_utils.get_endpoints(cluster=cluster, port=port)
 
 
sky/data/storage.py
CHANGED
@@ -1317,8 +1317,8 @@ class S3Store(AbstractStore):
         source_message = source_path_list[0]
 
         with rich_utils.safe_status(
-                f'…
-                …
+                ux_utils.spinner_message(f'Syncing {source_message} -> '
+                                         f's3://{self.name}/')):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
@@ -1445,7 +1445,8 @@ class S3Store(AbstractStore):
                 }
             s3_client.create_bucket(**create_bucket_config)
             logger.info(
-                f'Created S3 bucket {bucket_name!r} in …
+                f' {colorama.Style.DIM}Created S3 bucket {bucket_name!r} in '
+                f'{region or "us-east-1"}{colorama.Style.RESET_ALL}')
 
             # Add AWS tags configured in config.yaml to the bucket.
             # This is useful for cost tracking and external cleanup.
@@ -1486,7 +1487,8 @@ class S3Store(AbstractStore):
         remove_command = f'aws s3 rb s3://{bucket_name} --force'
         try:
             with rich_utils.safe_status(
-                    …
+                    ux_utils.spinner_message(
+                        f'Deleting S3 bucket [green]{bucket_name}')):
                 subprocess.check_output(remove_command.split(' '),
                                         stderr=subprocess.STDOUT)
         except subprocess.CalledProcessError as e:
@@ -1726,8 +1728,8 @@ class GcsStore(AbstractStore):
                         f'cp -e -n -r -I gs://{self.name}')
 
         with rich_utils.safe_status(
-                f'…
-                …
+                ux_utils.spinner_message(f'Syncing {source_message} -> '
+                                         f'gs://{self.name}/')):
             data_utils.run_upload_cli(sync_command,
                                       self._ACCESS_DENIED_MESSAGE,
                                       bucket_name=self.name)
@@ -1781,8 +1783,8 @@ class GcsStore(AbstractStore):
         source_message = source_path_list[0]
 
         with rich_utils.safe_status(
-                f'…
-                …
+                ux_utils.spinner_message(f'Syncing {source_message} -> '
+                                         f'gs://{self.name}/')):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
@@ -1904,8 +1906,9 @@ class GcsStore(AbstractStore):
                 f'Attempted to create a bucket {self.name} but failed.'
             ) from e
         logger.info(
-            f'Created GCS bucket {new_bucket.name} in …
-            f'with storage class …
+            f' {colorama.Style.DIM}Created GCS bucket {new_bucket.name!r} in '
+            f'{new_bucket.location} with storage class '
+            f'{new_bucket.storage_class}{colorama.Style.RESET_ALL}')
         return new_bucket
 
     def _delete_gcs_bucket(self, bucket_name: str) -> bool:
@@ -1919,7 +1922,8 @@ class GcsStore(AbstractStore):
         """
 
         with rich_utils.safe_status(
-                …
+                ux_utils.spinner_message(
+                    f'Deleting GCS bucket [green]{bucket_name}')):
             try:
                 self.client.get_bucket(bucket_name)
             except gcp.forbidden_exception() as e:
@@ -2306,11 +2310,12 @@ class AzureBlobStore(AbstractStore):
                     resource_group_name)
             except azure.exceptions().ResourceNotFoundError:
                 with rich_utils.safe_status(
-                        …
-                        …
+                        ux_utils.spinner_message(
+                            f'Setting up resource group: '
+                            f'{resource_group_name}')):
                     self.resource_client.resource_groups.create_or_update(
                         resource_group_name, {'location': self.region})
-                logger.info('Created Azure resource group '
+                logger.info(' Created Azure resource group '
                             f'{resource_group_name!r}.')
             # check if the storage account name already exists under the
             # given resource group name.
@@ -2319,13 +2324,14 @@ class AzureBlobStore(AbstractStore):
                     resource_group_name, storage_account_name)
             except azure.exceptions().ResourceNotFoundError:
                 with rich_utils.safe_status(
-                        …
-                        …
+                        ux_utils.spinner_message(
+                            f'Setting up storage account: '
+                            f'{storage_account_name}')):
                     self._create_storage_account(resource_group_name,
                                                  storage_account_name)
                     # wait until new resource creation propagates to Azure.
                     time.sleep(1)
-                logger.info('Created Azure storage account '
+                logger.info(' Created Azure storage account '
                             f'{storage_account_name!r}.')
 
         return storage_account_name, resource_group_name
@@ -2514,9 +2520,9 @@ class AzureBlobStore(AbstractStore):
         container_endpoint = data_utils.AZURE_CONTAINER_URL.format(
             storage_account_name=self.storage_account_name,
             container_name=self.name)
-        with rich_utils.safe_status(
-                …
-                …
+        with rich_utils.safe_status(
+                ux_utils.spinner_message(
+                    f'Syncing {source_message} -> {container_endpoint}/')):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
@@ -2665,9 +2671,10 @@ class AzureBlobStore(AbstractStore):
                 self.storage_account_name,
                 container_name,
                 blob_container={})
-            logger.info('Created AZ Container '
+            logger.info(f' {colorama.Style.DIM}Created AZ Container '
                         f'{container_name!r} in {self.region!r} under storage '
-                        f'account {self.storage_account_name!r}.')
+                        f'account {self.storage_account_name!r}.'
+                        f'{colorama.Style.RESET_ALL}')
         except azure.exceptions().ResourceExistsError as e:
             if 'container is being deleted' in e.error.message:
                 with ux_utils.print_exception_no_traceback():
@@ -2700,7 +2707,8 @@ class AzureBlobStore(AbstractStore):
         """
         try:
             with rich_utils.safe_status(
-                    …
+                    ux_utils.spinner_message(
+                        f'Deleting Azure container {container_name}')):
                 # Check for the existance of the container before deletion.
                 self.storage_client.blob_containers.get(
                     self.resource_group_name,
@@ -2916,8 +2924,8 @@ class R2Store(AbstractStore):
         source_message = source_path_list[0]
 
         with rich_utils.safe_status(
-                …
-                …
+                ux_utils.spinner_message(
+                    f'Syncing {source_message} -> r2://{self.name}/')):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
@@ -3055,7 +3063,9 @@ class R2Store(AbstractStore):
             location = {'LocationConstraint': region}
             r2_client.create_bucket(Bucket=bucket_name,
                                     CreateBucketConfiguration=location)
-            logger.info(f'Created R2 bucket …
+            logger.info(f' {colorama.Style.DIM}Created R2 bucket '
+                        f'{bucket_name!r} in {region}'
+                        f'{colorama.Style.RESET_ALL}')
         except aws.botocore_exceptions().ClientError as e:
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.StorageBucketCreateError(
@@ -3087,7 +3097,8 @@ class R2Store(AbstractStore):
                           f'--profile={cloudflare.R2_PROFILE_NAME}')
         try:
             with rich_utils.safe_status(
-                    …
+                    ux_utils.spinner_message(
+                        f'Deleting R2 bucket {bucket_name}')):
                 subprocess.check_output(remove_command,
                                         stderr=subprocess.STDOUT,
                                         shell=True)
@@ -3354,9 +3365,8 @@ class IBMCosStore(AbstractStore):
         source_message = source_path_list[0]
 
         with rich_utils.safe_status(
-                f'…
-                …
-                f'[green]cos://{self.region}/{self.name}/[/]'):
+                ux_utils.spinner_message(f'Syncing {source_message} -> '
+                                         f'cos://{self.region}/{self.name}/')):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
@@ -3490,8 +3500,10 @@ class IBMCosStore(AbstractStore):
                 CreateBucketConfiguration={
                     'LocationConstraint': f'{region}-smart'
                 })
-            logger.info(f'Created IBM COS bucket …
-                        f'…
+            logger.info(f' {colorama.Style.DIM}Created IBM COS bucket '
+                        f'{bucket_name!r} in {region} '
+                        'with storage class smart tier'
+                        f'{colorama.Style.RESET_ALL}')
             self.bucket = self.s3_resource.Bucket(bucket_name)
 
         except ibm.ibm_botocore.exceptions.ClientError as e:  # type: ignore[union-attr] # pylint: disable=line-too-long
sky/data/storage_utils.py
CHANGED
@@ -213,9 +213,13 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
     skyignore_path = os.path.join(expand_src_dir_path,
                                   constants.SKY_IGNORE_FILE)
     if os.path.exists(skyignore_path):
-        logger.info(f'…
-                    f'…
+        logger.info(f' {colorama.Style.DIM}'
+                    f'Excluded files to sync to cluster based on '
+                    f'{constants.SKY_IGNORE_FILE}.'
+                    f'{colorama.Style.RESET_ALL}')
         return get_excluded_files_from_skyignore(src_dir_path)
-    logger.info(f'…
-                f'…
+    logger.info(f' {colorama.Style.DIM}'
+                f'Excluded files to sync to cluster based on '
+                f'{constants.GIT_IGNORE_FILE}.'
+                f'{colorama.Style.RESET_ALL}')
     return get_excluded_files_from_gitignore(src_dir_path)
sky/exceptions.py
CHANGED
@@ -291,3 +291,8 @@ class PortDoesNotExistError(Exception):
 class UserRequestRejectedByPolicy(Exception):
     """Raised when a user request is rejected by an admin policy."""
     pass
+
+
+class NoClusterLaunchedError(Exception):
+    """No cluster launched, so cleanup can be skipped during failover."""
+    pass
sky/execution.py
CHANGED
@@ -3,7 +3,6 @@
 See `Stage` for a Task's life cycle.
 """
 import enum
-import os
 from typing import List, Optional, Tuple, Union
 
 import colorama
@@ -20,10 +19,8 @@ from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
 from sky.utils import controller_utils
 from sky.utils import dag_utils
-from sky.utils import env_options
 from sky.utils import resources_utils
 from sky.utils import rich_utils
-from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
 
@@ -293,11 +290,17 @@ def _execute(
         logger.info('Dryrun finished.')
         return None, None
 
-    …
-    …
-    …
+    do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
+                  task.workdir is not None)
+    do_file_mounts = (Stage.SYNC_FILE_MOUNTS in stages and not dryrun and
+                      task.file_mounts is not None)
+    if do_workdir or do_file_mounts:
+        logger.info(ux_utils.starting_message('Mounting files.'))
 
-    if …
+    if do_workdir:
+        backend.sync_workdir(handle, task.workdir)
+
+    if do_file_mounts:
         backend.sync_file_mounts(handle, task.file_mounts,
                                  task.storage_mounts)
 
@@ -330,23 +333,6 @@ def _execute(
         backend.teardown_ephemeral_storage(task)
         backend.teardown(handle, terminate=True)
     finally:
-        controller = controller_utils.Controllers.from_name(cluster_name)
-        if controller is None and not _is_launched_by_sky_serve_controller:
-            # UX: print live clusters to make users aware (to save costs).
-            #
-            # Don't print if this job is launched by the jobs controller,
-            # because managed jobs are serverless, there can be many of them,
-            # and users tend to continuously monitor managed jobs using `sky
-            # job queue`. Also don't print if this job is a skyserve controller
-            # job or launched by a skyserve controller job, because the
-            # redirect for this subprocess.run won't success and it will
-            # pollute the controller logs.
-            #
-            # Disable the usage collection for this status command.
-            env = dict(os.environ,
-                       **{env_options.Options.DISABLE_LOGGING.value: '1'})
-            subprocess_utils.run(
-                'sky status --no-show-managed-jobs --no-show-services', env=env)
         print()
         print('\x1b[?25h', end='')  # Show cursor.
         return job_id, handle
sky/jobs/core.py
CHANGED
@@ -79,9 +79,11 @@ def launch(
 
     dag_utils.fill_default_config_in_dag_for_job_launch(dag)
 
-    …
-    …
-    …
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Initializing managed job')):
+        for task_ in dag.tasks:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task_, path='jobs')
 
     with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
                                      mode='w') as f:
@@ -129,7 +131,6 @@ def launch(
             f'{colorama.Fore.YELLOW}'
             f'Launching managed job {dag.name!r} from jobs controller...'
             f'{colorama.Style.RESET_ALL}')
-        sky_logging.print('Launching jobs controller...')
         sky.launch(task=controller_task,
                    stream_logs=stream_logs,
                    cluster_name=controller_name,
@@ -262,11 +263,12 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
                 f'{colorama.Style.RESET_ALL}')
 
             rich_utils.force_update_status(
-                '…
-                …
+                ux_utils.spinner_message('Checking managed jobs - restarting '
+                                         'controller'))
             handle = sky.start(jobs_controller_type.value.cluster_name)
             controller_status = status_lib.ClusterStatus.UP
-            rich_utils.force_update_status(…
+            rich_utils.force_update_status(
+                ux_utils.spinner_message('Checking managed jobs'))
 
             assert handle is not None, (controller_status, refresh)