skypilot-nightly 1.0.0.dev20241011__py3-none-any.whl → 1.0.0.dev20241013__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -1
- sky/adaptors/common.py +6 -2
- sky/backends/backend.py +9 -4
- sky/backends/backend_utils.py +13 -16
- sky/backends/cloud_vm_ray_backend.py +207 -161
- sky/backends/local_docker_backend.py +3 -1
- sky/benchmark/benchmark_utils.py +5 -4
- sky/cli.py +128 -31
- sky/clouds/service_catalog/aws_catalog.py +6 -7
- sky/clouds/service_catalog/common.py +4 -3
- sky/clouds/service_catalog/cudo_catalog.py +11 -1
- sky/core.py +4 -2
- sky/data/storage.py +44 -32
- sky/data/storage_utils.py +12 -7
- sky/exceptions.py +5 -0
- sky/execution.py +10 -24
- sky/jobs/__init__.py +2 -0
- sky/jobs/core.py +87 -7
- sky/jobs/utils.py +35 -19
- sky/optimizer.py +50 -37
- sky/provision/aws/config.py +15 -6
- sky/provision/azure/config.py +14 -3
- sky/provision/azure/instance.py +15 -9
- sky/provision/kubernetes/instance.py +3 -1
- sky/provision/kubernetes/utils.py +25 -0
- sky/provision/provisioner.py +63 -74
- sky/serve/core.py +42 -40
- sky/sky_logging.py +9 -5
- sky/skylet/log_lib.py +5 -4
- sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
- sky/utils/cli_utils/status_utils.py +168 -21
- sky/utils/command_runner.py +11 -11
- sky/utils/common_utils.py +22 -5
- sky/utils/controller_utils.py +78 -29
- sky/utils/env_options.py +22 -7
- sky/utils/log_utils.py +39 -24
- sky/utils/resources_utils.py +23 -0
- sky/utils/rich_utils.py +55 -5
- sky/utils/ux_utils.py +63 -4
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/RECORD +46 -46
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/top_level.txt +0 -0
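The bulk of this release is a UX refresh: call sites that passed hand-written rich-markup strings to `rich_utils.safe_status(...)` now build the spinner text through a new `ux_utils.spinner_message(...)` helper (see sky/utils/ux_utils.py, +63 -4 above), and `ux_utils.BOLD`/`ux_utils.RESET_BOLD` replace inline style constants. A minimal sketch of the pattern, assuming `spinner_message` simply centralizes the '[bold cyan]...[/]' markup that the old call sites spelled out by hand (the real helper may do more, e.g. append a log-path hint):

    # Hedged sketch of the spinner pattern adopted throughout this diff.
    # spinner_message here is a stand-in; the actual helper lives in
    # sky/utils/ux_utils.py and its exact behavior is not shown in this diff.
    import time

    from rich.console import Console

    def spinner_message(message: str) -> str:
        # Assumed behavior: one place to style all spinner text.
        return f'[bold cyan]{message}[/]'

    console = Console()
    with console.status(spinner_message('Checking managed jobs')):
        time.sleep(1)  # a long-running query would run here

With the helper in place, changing the spinner style later means touching one function instead of every `safe_status` call site.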
sky/backends/local_docker_backend.py
CHANGED
@@ -14,6 +14,7 @@ from sky.backends import backend_utils
 from sky.backends import docker_utils
 from sky.data import storage as storage_lib
 from sky.utils import rich_utils
+from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     from sky import resources
@@ -159,7 +160,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
         handle = LocalDockerResourceHandle(cluster_name)
         logger.info(f'Building docker image for task {task.name}. '
                     'This might take some time.')
-        with rich_utils.safe_status(
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Building Docker image')):
             image_tag, metadata = docker_utils.build_dockerimage_from_task(task)
         self.images[handle] = (image_tag, metadata)
         logger.info(f'Image {image_tag} built.')
sky/benchmark/benchmark_utils.py
CHANGED
@@ -595,7 +595,8 @@ def update_benchmark_state(benchmark: str) -> None:
     remote_dir = os.path.join(bucket_name, benchmark)
     local_dir = os.path.join(_SKY_LOCAL_BENCHMARK_DIR, benchmark)
     os.makedirs(local_dir, exist_ok=True)
-    with rich_utils.safe_status(
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Downloading benchmark logs')):
         _download_remote_dir(remote_dir, local_dir, bucket_type)
 
     # Update the benchmark results in parallel.
@@ -604,9 +605,9 @@ def update_benchmark_state(benchmark: str) -> None:
     progress = rich_progress.Progress(transient=True,
                                       redirect_stdout=False,
                                       redirect_stderr=False)
-    task = progress.add_task(
-        f'
-
+    task = progress.add_task(ux_utils.spinner_message(
+        f'Processing {num_candidates} benchmark result{plural}'),
+                             total=num_candidates)
 
     def _update_with_progress_bar(arg: Any) -> None:
         message = _update_benchmark_result(arg)
sky/cli.py
CHANGED
@@ -1458,6 +1458,79 @@ def _get_services(service_names: Optional[List[str]],
     return num_services, msg
 
 
+def _status_kubernetes(show_all: bool):
+    """Show all SkyPilot resources in the current Kubernetes context.
+
+    Args:
+        show_all (bool): Show all job information (e.g., start time, failures).
+    """
+    context = kubernetes_utils.get_current_kube_config_context_name()
+    try:
+        pods = kubernetes_utils.get_skypilot_pods(context)
+    except exceptions.ResourcesUnavailableError as e:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Failed to get SkyPilot pods from '
+                             f'Kubernetes: {str(e)}') from e
+    all_clusters, jobs_controllers, serve_controllers = (
+        status_utils.process_skypilot_pods(pods, context))
+    all_jobs = []
+    with rich_utils.safe_status(
+            '[bold cyan]Checking in-progress managed jobs[/]') as spinner:
+        for i, (_, job_controller_info) in enumerate(jobs_controllers.items()):
+            user = job_controller_info['user']
+            pod = job_controller_info['pods'][0]
+            status_message = ('[bold cyan]Checking managed jobs controller')
+            if len(jobs_controllers) > 1:
+                status_message += f's ({i+1}/{len(jobs_controllers)})'
+            spinner.update(f'{status_message}[/]')
+            try:
+                job_list = managed_jobs.queue_from_kubernetes_pod(
+                    pod.metadata.name)
+            except RuntimeError as e:
+                logger.warning('Failed to get managed jobs from controller '
+                               f'{pod.metadata.name}: {str(e)}')
+                job_list = []
+            # Add user field to jobs
+            for job in job_list:
+                job['user'] = user
+            all_jobs.extend(job_list)
+    # Reconcile cluster state between managed jobs and clusters:
+    # To maintain a clear separation between regular SkyPilot clusters
+    # and those from managed jobs, we need to exclude the latter from
+    # the main cluster list.
+    # We do this by reconstructing managed job cluster names from each
+    # job's name and ID. We then use this set to filter out managed
+    # clusters from the main cluster list. This is necessary because there
+    # are no identifiers distinguishing clusters from managed jobs from
+    # regular clusters.
+    managed_job_cluster_names = set()
+    for job in all_jobs:
+        # Managed job cluster name is <job_name>-<job_id>
+        managed_cluster_name = f'{job["job_name"]}-{job["job_id"]}'
+        managed_job_cluster_names.add(managed_cluster_name)
+    unmanaged_clusters = [
+        c for c in all_clusters
+        if c['cluster_name'] not in managed_job_cluster_names
+    ]
+    click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+               f'Kubernetes cluster state (context: {context})'
+               f'{colorama.Style.RESET_ALL}')
+    status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
+                                                      show_all)
+    if all_jobs:
+        click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                   f'Managed jobs'
+                   f'{colorama.Style.RESET_ALL}')
+        msg = managed_jobs.format_job_table(all_jobs, show_all=show_all)
+        click.echo(msg)
+    if serve_controllers:
+        # TODO: Parse serve controllers and show services separately.
+        # Currently we show a hint that services are shown as clusters.
+        click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
+                   'shown in the "SkyPilot clusters" section.'
+                   f'{colorama.Style.RESET_ALL}')
+
+
 @cli.command()
 @click.option('--all',
               '-a',
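The reconciliation step in `_status_kubernetes` above leans on a naming convention rather than an explicit label: a managed job's cluster is named `<job_name>-<job_id>`. A standalone illustration of that filter, with made-up job and cluster records (only the naming convention comes from the code above):

    # Made-up sample data; the set-based filter mirrors _status_kubernetes.
    jobs = [{'job_name': 'train', 'job_id': 7}, {'job_name': 'eval', 'job_id': 8}]
    clusters = [{'cluster_name': 'train-7'}, {'cluster_name': 'dev-box'}]

    managed = {f'{j["job_name"]}-{j["job_id"]}' for j in jobs}
    unmanaged = [c for c in clusters if c['cluster_name'] not in managed]
    assert [c['cluster_name'] for c in unmanaged] == ['dev-box']

As the comments in the hunk note, this is fragile by design: nothing else distinguishes a managed-job cluster from a regular one.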
@@ -1503,6 +1576,14 @@ def _get_services(service_names: Optional[List[str]],
               is_flag=True,
               required=False,
               help='Also show sky serve services, if any.')
+@click.option(
+    '--kubernetes',
+    '--k8s',
+    default=False,
+    is_flag=True,
+    required=False,
+    help='[Experimental] Show all SkyPilot resources (including from other '
+    'users) in the current Kubernetes context.')
 @click.argument('clusters',
                 required=False,
                 type=str,
@@ -1512,7 +1593,7 @@ def _get_services(service_names: Optional[List[str]],
 # pylint: disable=redefined-builtin
 def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
            endpoint: Optional[int], show_managed_jobs: bool,
-           show_services: bool, clusters: List[str]):
+           show_services: bool, kubernetes: bool, clusters: List[str]):
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Show clusters.
 
@@ -1571,6 +1652,9 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
     or for autostop-enabled clusters, use ``--refresh`` to query the latest
     cluster statuses from the cloud providers.
     """
+    if kubernetes:
+        _status_kubernetes(all)
+        return
     # Using a pool with 2 worker to run the managed job query and sky serve
     # service query in parallel to speed up. The pool provides a AsyncResult
     # object that can be used as a future.
@@ -1730,7 +1814,8 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
     if show_managed_jobs:
         click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                    f'Managed jobs{colorama.Style.RESET_ALL}')
-        with rich_utils.safe_status(
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Checking managed jobs')):
             managed_jobs_query_interrupted, result = _try_get_future_result(
                 managed_jobs_future)
             if managed_jobs_query_interrupted:
@@ -1771,7 +1856,8 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
             # The pool is terminated, so we cannot run the service query.
             msg = 'KeyboardInterrupt'
         else:
-            with rich_utils.safe_status(
+            with rich_utils.safe_status(
+                    ux_utils.spinner_message('Checking services')):
                 interrupted, result = _try_get_future_result(
                     services_future)
                 if interrupted:
@@ -2467,8 +2553,8 @@ def start(
             'is currently not supported.\n'
             'Please start the former independently.')
     if controllers:
-        bold =
-        reset_bold =
+        bold = ux_utils.BOLD
+        reset_bold = ux_utils.RESET_BOLD
         if len(controllers) != 1:
             raise click.UsageError(
                 'Starting multiple controllers is currently not supported.\n'
@@ -2589,7 +2675,7 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str):
     assert controller is not None, controller_name
 
     with rich_utils.safe_status(
-            '
+            ux_utils.spinner_message('Checking for in-progress managed jobs')):
         try:
             managed_jobs_ = managed_jobs.queue(refresh=False,
                                                skip_finished=True)
@@ -2641,7 +2727,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
     """
     controller = controller_utils.Controllers.from_name(controller_name)
     assert controller is not None, controller_name
-    with rich_utils.safe_status(
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Checking for live services')):
         try:
             services = serve_lib.status()
         except exceptions.ClusterNotUpError as e:
@@ -2825,9 +2912,9 @@ def _down_or_stop_clusters(
     progress = rich_progress.Progress(transient=True,
                                       redirect_stdout=False,
                                       redirect_stderr=False)
-    task = progress.add_task(
-        f'
-
+    task = progress.add_task(ux_utils.spinner_message(
+        f'{operation} {len(clusters)} cluster{plural}'),
+                             total=len(clusters))
 
     def _down_or_stop(name: str):
         success_progress = False
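Besides `safe_status`, `spinner_message` now also supplies the description for `rich.progress` tasks (in the hunk above and in the two benchmark hunks elsewhere in this diff). A self-contained sketch of the same shape; the inline `spinner_message` stand-in assumes the helper returns a rich-markup string:

    import time

    from rich import progress as rich_progress

    def spinner_message(message: str) -> str:  # stand-in for the ux_utils helper
        return f'[bold cyan]{message}[/]'

    clusters = ['a', 'b', 'c']  # hypothetical cluster names
    bar = rich_progress.Progress(transient=True,
                                 redirect_stdout=False,
                                 redirect_stderr=False)
    task = bar.add_task(spinner_message(f'Stopping {len(clusters)} clusters'),
                        total=len(clusters))
    with bar:
        for _ in clusters:
            time.sleep(0.1)  # per-cluster teardown would run here
            bar.update(task, advance=1)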
@@ -3113,7 +3200,12 @@ def show_gpus(
     print_section_titles = False
     # If cloud is kubernetes, we want to show real-time capacity
     if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes):
-
+        if region:
+            context = region
+        else:
+            # If region is not specified, we use the current context
+            context = (
+                kubernetes_utils.get_current_kube_config_context_name())
         try:
             # If --cloud kubernetes is not specified, we want to catch
             # the case where no GPUs are available on the cluster and
@@ -3128,7 +3220,7 @@ def show_gpus(
         else:
             print_section_titles = True
             yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                   f'Kubernetes GPUs (
+                   f'Kubernetes GPUs (context: {context})'
                    f'{colorama.Style.RESET_ALL}\n')
             yield from k8s_realtime_table.get_string()
             k8s_node_table = _get_kubernetes_node_info_table(context)
@@ -3591,7 +3683,7 @@ def jobs_launch(
     dag_utils.fill_default_config_in_dag_for_job_launch(dag)
 
     click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
-                fg='
+                fg='cyan')
     dag = sky.optimize(dag)
 
     if not yes:
@@ -3685,7 +3777,8 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
 
     """
     click.secho('Fetching managed job statuses...', fg='yellow')
-    with rich_utils.safe_status(
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Checking managed jobs')):
         _, msg = _get_managed_jobs(refresh=refresh,
                                    skip_finished=skip_finished,
                                    show_all=all,
@@ -3736,10 +3829,12 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
     # Cancel managed jobs with IDs 1, 2, 3
     $ sky jobs cancel 1 2 3
     """
-
-
-
-
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Checking managed jobs')):
+        backend_utils.is_controller_accessible(
+            controller=controller_utils.Controllers.JOBS_CONTROLLER,
+            stopped_message='All managed jobs should have finished.',
+            exit_if_not_accessible=True)
 
     job_id_str = ','.join(map(str, job_ids))
     if sum([len(job_ids) > 0, name is not None, all]) != 1:
@@ -4301,7 +4396,7 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
     sky serve status my-service
     """
     # This won't pollute the output of --endpoint.
-    with rich_utils.safe_status('
+    with rich_utils.safe_status(ux_utils.spinner_message('Checking services')):
         _, msg = _get_services(service_names,
                                show_all=all,
                                show_endpoint=endpoint,
@@ -4725,11 +4820,11 @@ def benchmark_launch(
             f'\n{colorama.Fore.CYAN}Benchmark name: '
             f'{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}'
             '\nTo see the benchmark results: '
-            f'{
-            f'{benchmark}{
+            f'{ux_utils.BOLD}sky bench show '
+            f'{benchmark}{ux_utils.RESET_BOLD}'
             '\nTo teardown the clusters: '
-            f'{
-            f'{benchmark}{
+            f'{ux_utils.BOLD}sky bench down '
+            f'{benchmark}{ux_utils.RESET_BOLD}')
         subprocess_utils.run('sky bench ls')
     else:
         logger.error('No benchmarking clusters are created.')
@@ -5020,9 +5115,9 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
     progress = rich_progress.Progress(transient=True,
                                       redirect_stdout=False,
                                       redirect_stderr=False)
-    task = progress.add_task(
-        f'
-
+    task = progress.add_task(ux_utils.spinner_message(
+        f'Deleting {len(to_delete)} benchmark{plural}'),
+                             total=len(to_delete))
 
     def _delete_benchmark(benchmark: str) -> None:
         clusters = benchmark_state.get_benchmark_clusters(benchmark)
@@ -5037,8 +5132,8 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
         message = (f'{colorama.Fore.YELLOW}Benchmark {benchmark} '
                    f'has {num_clusters} un-terminated cluster{plural}. '
                    f'Terminate the cluster{plural} with '
-                   f'{
-                   f'{
+                   f'{ux_utils.BOLD} sky bench down {benchmark} '
+                   f'{ux_utils.RESET_BOLD} '
                    'before deleting the benchmark report.')
         success = False
     else:
@@ -5139,7 +5234,7 @@ def _deploy_local_cluster(gpus: bool):
             f'Full log: {log_path}'
             f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
     # Run sky check
-    with rich_utils.safe_status('
+    with rich_utils.safe_status(ux_utils.spinner_message('Running sky check')):
         sky_check.check(clouds=['kubernetes'], quiet=True)
     if cluster_created:
         # Prepare completion message which shows CPU and GPU count
@@ -5336,7 +5431,8 @@ def local_down():
                             'local_down.log')
     tail_cmd = 'tail -n100 -f ' + log_path
 
-    with rich_utils.safe_status(
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Removing local cluster')):
         style = colorama.Style
         click.echo('To view detailed progress: '
                    f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
@@ -5359,7 +5455,8 @@ def local_down():
             f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
     if cluster_removed:
         # Run sky check
-        with rich_utils.safe_status(
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Running sky check')):
             sky_check.check(clouds=['kubernetes'], quiet=True)
         click.echo(
             f'{colorama.Fore.GREEN}Local cluster removed.{style.RESET_ALL}')
sky/clouds/service_catalog/aws_catalog.py
CHANGED
@@ -10,8 +10,6 @@ import threading
 import typing
 from typing import Dict, List, Optional, Tuple
 
-import colorama
-
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
@@ -21,6 +19,8 @@ from sky.clouds.service_catalog import config
 from sky.clouds.service_catalog.data_fetchers import fetch_aws
 from sky.utils import common_utils
 from sky.utils import resources_utils
+from sky.utils import rich_utils
+from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import pandas as pd
@@ -82,11 +82,10 @@ def _get_az_mappings(aws_user_hash: str) -> Optional['pd.DataFrame']:
     az_mappings = None
     if aws_user_hash != 'default':
         # Fetch az mapping from AWS.
-
-
-
-
-        az_mappings = fetch_aws.fetch_availability_zone_mappings()
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('AWS: Fetching availability '
+                                         'zones mapping')):
+            az_mappings = fetch_aws.fetch_availability_zone_mappings()
     else:
         return None
     az_mappings.to_csv(az_mapping_path, index=False)
sky/clouds/service_catalog/common.py
CHANGED
@@ -198,9 +198,10 @@ def read_catalog(filename: str,
             if pull_frequency_hours is not None:
                 update_frequency_str = (
                     f' (every {pull_frequency_hours} hours)')
-            with rich_utils.safe_status(
-
-
+            with rich_utils.safe_status(
+                    ux_utils.spinner_message(
+                        f'Updating {cloud} catalog: {filename}') +
+                    f'{update_frequency_str}'):
                 try:
                     r = requests.get(url)
                     r.raise_for_status()
sky/clouds/service_catalog/cudo_catalog.py
CHANGED
@@ -14,6 +14,9 @@ _PULL_FREQUENCY_HOURS = 1
 _df = common.read_catalog(cudo_mt.VMS_CSV,
                           pull_frequency_hours=_PULL_FREQUENCY_HOURS)
 
+_DEFAULT_NUM_VCPUS = 8
+_DEFAULT_MEMORY_CPU_RATIO = 2
+
 
 def instance_type_exists(instance_type: str) -> bool:
     return common.instance_type_exists_impl(_df, instance_type)
@@ -52,7 +55,14 @@ def get_default_instance_type(cpus: Optional[str] = None,
     del disk_tier
     # NOTE: After expanding catalog to multiple entries, you may
     # want to specify a default instance type or family.
-
+    if cpus is None and memory is None:
+        cpus = f'{_DEFAULT_NUM_VCPUS}+'
+
+    memory_gb_or_ratio = memory
+    if memory is None:
+        memory_gb_or_ratio = f'{_DEFAULT_MEMORY_CPU_RATIO}x'
+    return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
+                                                      memory_gb_or_ratio)
 
 
 def get_accelerators_from_instance_type(
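The default strings above follow SkyPilot's resource-spec syntax: a trailing `+` means "at least this many vCPUs", and a trailing `x` means "memory as a multiple of the vCPU count", so the Cudo defaults request 8+ vCPUs with 16+ GB of memory. A toy illustration of how such strings decode; `_parse` is hypothetical, not the actual logic in `get_instance_type_for_cpus_mem_impl`:

    def _parse(cpus: str, memory: str) -> tuple:
        # '8+' -> at least 8 vCPUs
        # '2x' -> memory = 2x the vCPU count (otherwise a plain GB figure)
        min_cpus = int(cpus.rstrip('+'))
        if memory.endswith('x'):
            min_mem_gb = min_cpus * float(memory.rstrip('x'))
        else:
            min_mem_gb = float(memory)
        return min_cpus, min_mem_gb

    assert _parse('8+', '2x') == (8, 16.0)  # the defaults introduced above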
sky/core.py
CHANGED
@@ -21,6 +21,7 @@ from sky.usage import usage_lib
 from sky.utils import controller_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
+from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     from sky import resources as resources_lib
@@ -127,8 +128,9 @@ def endpoints(cluster: str,
         RuntimeError: if the cluster has no ports to be exposed or no endpoints
             are exposed yet.
     """
-    with rich_utils.safe_status(
-
+    with rich_utils.safe_status(
+            ux_utils.spinner_message(
+                f'Fetching endpoints for cluster {cluster}')):
         return backend_utils.get_endpoints(cluster=cluster, port=port)
 
 
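For context, the wrapped call above sits in the public `sky.core.endpoints` API. A hedged usage sketch; the cluster name is made up, and per the docstring this raises RuntimeError if the cluster exposes no ports yet:

    import sky.core

    # 'my-cluster' is hypothetical; returns a mapping of port -> endpoint.
    endpoint_map = sky.core.endpoints(cluster='my-cluster', port=8080)
    print(endpoint_map)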
sky/data/storage.py
CHANGED
@@ -1317,8 +1317,8 @@ class S3Store(AbstractStore):
             source_message = source_path_list[0]
 
         with rich_utils.safe_status(
-                f'
-
+                ux_utils.spinner_message(f'Syncing {source_message} -> '
+                                         f's3://{self.name}/')):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
@@ -1445,7 +1445,8 @@ class S3Store(AbstractStore):
             }
             s3_client.create_bucket(**create_bucket_config)
             logger.info(
-                f'Created S3 bucket {bucket_name!r} in
+                f' {colorama.Style.DIM}Created S3 bucket {bucket_name!r} in '
+                f'{region or "us-east-1"}{colorama.Style.RESET_ALL}')
 
             # Add AWS tags configured in config.yaml to the bucket.
             # This is useful for cost tracking and external cleanup.
@@ -1486,7 +1487,8 @@ class S3Store(AbstractStore):
         remove_command = f'aws s3 rb s3://{bucket_name} --force'
         try:
             with rich_utils.safe_status(
-
+                    ux_utils.spinner_message(
+                        f'Deleting S3 bucket [green]{bucket_name}')):
                 subprocess.check_output(remove_command.split(' '),
                                         stderr=subprocess.STDOUT)
         except subprocess.CalledProcessError as e:
@@ -1726,8 +1728,8 @@ class GcsStore(AbstractStore):
                         f'cp -e -n -r -I gs://{self.name}')
 
         with rich_utils.safe_status(
-                f'
-
+                ux_utils.spinner_message(f'Syncing {source_message} -> '
+                                         f'gs://{self.name}/')):
             data_utils.run_upload_cli(sync_command,
                                       self._ACCESS_DENIED_MESSAGE,
                                       bucket_name=self.name)
@@ -1781,8 +1783,8 @@ class GcsStore(AbstractStore):
             source_message = source_path_list[0]
 
         with rich_utils.safe_status(
-                f'
-
+                ux_utils.spinner_message(f'Syncing {source_message} -> '
+                                         f'gs://{self.name}/')):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
@@ -1904,8 +1906,9 @@ class GcsStore(AbstractStore):
                 f'Attempted to create a bucket {self.name} but failed.'
             ) from e
         logger.info(
-            f'Created GCS bucket {new_bucket.name} in
-            f'with storage class
+            f' {colorama.Style.DIM}Created GCS bucket {new_bucket.name!r} in '
+            f'{new_bucket.location} with storage class '
+            f'{new_bucket.storage_class}{colorama.Style.RESET_ALL}')
         return new_bucket
 
     def _delete_gcs_bucket(self, bucket_name: str) -> bool:
@@ -1919,7 +1922,8 @@ class GcsStore(AbstractStore):
         """
 
        with rich_utils.safe_status(
-
+                ux_utils.spinner_message(
+                    f'Deleting GCS bucket [green]{bucket_name}')):
            try:
                self.client.get_bucket(bucket_name)
            except gcp.forbidden_exception() as e:
@@ -2306,11 +2310,12 @@ class AzureBlobStore(AbstractStore):
                     resource_group_name)
             except azure.exceptions().ResourceNotFoundError:
                 with rich_utils.safe_status(
-
-
+                        ux_utils.spinner_message(
+                            f'Setting up resource group: '
+                            f'{resource_group_name}')):
                     self.resource_client.resource_groups.create_or_update(
                         resource_group_name, {'location': self.region})
-                logger.info('Created Azure resource group '
+                logger.info(' Created Azure resource group '
                             f'{resource_group_name!r}.')
             # check if the storage account name already exists under the
             # given resource group name.
@@ -2319,13 +2324,14 @@ class AzureBlobStore(AbstractStore):
                     resource_group_name, storage_account_name)
             except azure.exceptions().ResourceNotFoundError:
                 with rich_utils.safe_status(
-
-
+                        ux_utils.spinner_message(
+                            f'Setting up storage account: '
+                            f'{storage_account_name}')):
                     self._create_storage_account(resource_group_name,
                                                  storage_account_name)
                     # wait until new resource creation propagates to Azure.
                     time.sleep(1)
-                logger.info('Created Azure storage account '
+                logger.info(' Created Azure storage account '
                             f'{storage_account_name!r}.')
 
         return storage_account_name, resource_group_name
@@ -2514,9 +2520,9 @@ class AzureBlobStore(AbstractStore):
         container_endpoint = data_utils.AZURE_CONTAINER_URL.format(
             storage_account_name=self.storage_account_name,
             container_name=self.name)
-        with rich_utils.safe_status(
-
-
+        with rich_utils.safe_status(
+                ux_utils.spinner_message(
+                    f'Syncing {source_message} -> {container_endpoint}/')):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
@@ -2665,9 +2671,10 @@ class AzureBlobStore(AbstractStore):
                 self.storage_account_name,
                 container_name,
                 blob_container={})
-            logger.info('Created AZ Container '
+            logger.info(f' {colorama.Style.DIM}Created AZ Container '
                         f'{container_name!r} in {self.region!r} under storage '
-                        f'account {self.storage_account_name!r}.'
+                        f'account {self.storage_account_name!r}.'
+                        f'{colorama.Style.RESET_ALL}')
         except azure.exceptions().ResourceExistsError as e:
             if 'container is being deleted' in e.error.message:
                 with ux_utils.print_exception_no_traceback():
@@ -2700,7 +2707,8 @@ class AzureBlobStore(AbstractStore):
         """
         try:
             with rich_utils.safe_status(
-
+                    ux_utils.spinner_message(
+                        f'Deleting Azure container {container_name}')):
                 # Check for the existance of the container before deletion.
                 self.storage_client.blob_containers.get(
                     self.resource_group_name,
@@ -2916,8 +2924,8 @@ class R2Store(AbstractStore):
             source_message = source_path_list[0]
 
         with rich_utils.safe_status(
-
-
+                ux_utils.spinner_message(
+                    f'Syncing {source_message} -> r2://{self.name}/')):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
@@ -3055,7 +3063,9 @@ class R2Store(AbstractStore):
                 location = {'LocationConstraint': region}
                 r2_client.create_bucket(Bucket=bucket_name,
                                         CreateBucketConfiguration=location)
-                logger.info(f'Created R2 bucket
+                logger.info(f' {colorama.Style.DIM}Created R2 bucket '
+                            f'{bucket_name!r} in {region}'
+                            f'{colorama.Style.RESET_ALL}')
         except aws.botocore_exceptions().ClientError as e:
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.StorageBucketCreateError(
@@ -3087,7 +3097,8 @@ class R2Store(AbstractStore):
                           f'--profile={cloudflare.R2_PROFILE_NAME}')
         try:
             with rich_utils.safe_status(
-
+                    ux_utils.spinner_message(
+                        f'Deleting R2 bucket {bucket_name}')):
                 subprocess.check_output(remove_command,
                                         stderr=subprocess.STDOUT,
                                         shell=True)
@@ -3354,9 +3365,8 @@ class IBMCosStore(AbstractStore):
             source_message = source_path_list[0]
 
         with rich_utils.safe_status(
-                f'
-
-                f'[green]cos://{self.region}/{self.name}/[/]'):
+                ux_utils.spinner_message(f'Syncing {source_message} -> '
+                                         f'cos://{self.region}/{self.name}/')):
             data_utils.parallel_upload(
                 source_path_list,
                 get_file_sync_command,
@@ -3490,8 +3500,10 @@ class IBMCosStore(AbstractStore):
                 CreateBucketConfiguration={
                     'LocationConstraint': f'{region}-smart'
                 })
-            logger.info(f'Created IBM COS bucket
-                        f'
+            logger.info(f' {colorama.Style.DIM}Created IBM COS bucket '
+                        f'{bucket_name!r} in {region} '
+                        'with storage class smart tier'
+                        f'{colorama.Style.RESET_ALL}')
             self.bucket = self.s3_resource.Bucket(bucket_name)
 
         except ibm.ibm_botocore.exceptions.ClientError as e:  # type: ignore[union-attr]  # pylint: disable=line-too-long
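A pattern worth noting across the storage hunks above: every "Created ... bucket" log line is now indented and wrapped in `colorama.Style.DIM`, so bucket-creation details render as de-emphasized sub-steps under the active spinner. A minimal sketch of that style (bucket name and region are made up):

    import colorama

    bucket_name, region = 'my-bucket', 'us-east-1'  # hypothetical values
    print(f' {colorama.Style.DIM}Created S3 bucket {bucket_name!r} in '
          f'{region}{colorama.Style.RESET_ALL}')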
|