skypilot-nightly 1.0.0.dev20241012__py3-none-any.whl → 1.0.0.dev20241014__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -1
- sky/adaptors/common.py +6 -2
- sky/backends/backend.py +9 -4
- sky/backends/backend_utils.py +13 -16
- sky/backends/cloud_vm_ray_backend.py +207 -161
- sky/backends/local_docker_backend.py +3 -1
- sky/benchmark/benchmark_utils.py +5 -4
- sky/cli.py +36 -28
- sky/clouds/oci.py +17 -2
- sky/clouds/service_catalog/aws_catalog.py +6 -7
- sky/clouds/service_catalog/common.py +4 -3
- sky/clouds/service_catalog/cudo_catalog.py +11 -1
- sky/core.py +4 -2
- sky/data/storage.py +44 -32
- sky/data/storage_utils.py +8 -4
- sky/exceptions.py +5 -0
- sky/execution.py +10 -24
- sky/jobs/core.py +9 -7
- sky/jobs/utils.py +15 -10
- sky/optimizer.py +50 -37
- sky/provision/aws/config.py +15 -6
- sky/provision/azure/config.py +14 -3
- sky/provision/azure/instance.py +15 -9
- sky/provision/kubernetes/instance.py +3 -1
- sky/provision/provisioner.py +63 -74
- sky/serve/core.py +42 -40
- sky/sky_logging.py +9 -5
- sky/skylet/job_lib.py +15 -0
- sky/skylet/log_lib.py +5 -4
- sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
- sky/utils/command_runner.py +11 -11
- sky/utils/common_utils.py +2 -5
- sky/utils/controller_utils.py +78 -29
- sky/utils/env_options.py +22 -7
- sky/utils/log_utils.py +39 -24
- sky/utils/resources_utils.py +23 -0
- sky/utils/rich_utils.py +55 -5
- sky/utils/ux_utils.py +63 -4
- {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/RECORD +45 -45
- {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241012.dist-info → skypilot_nightly-1.0.0.dev20241014.dist-info}/top_level.txt +0 -0
sky/jobs/core.py
CHANGED
@@ -79,9 +79,11 @@ def launch(
 
     dag_utils.fill_default_config_in_dag_for_job_launch(dag)
 
-    … (3 removed lines not captured)
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Initializing managed job')):
+        for task_ in dag.tasks:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task_, path='jobs')
 
     with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
                                      mode='w') as f:
@@ -129,7 +131,6 @@ def launch(
         f'{colorama.Fore.YELLOW}'
         f'Launching managed job {dag.name!r} from jobs controller...'
         f'{colorama.Style.RESET_ALL}')
-    sky_logging.print('Launching jobs controller...')
     sky.launch(task=controller_task,
                stream_logs=stream_logs,
                cluster_name=controller_name,
@@ -262,11 +263,12 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
                     f'{colorama.Style.RESET_ALL}')
 
         rich_utils.force_update_status(
-            … (2 removed lines not captured)
+            ux_utils.spinner_message('Checking managed jobs - restarting '
+                                     'controller'))
         handle = sky.start(jobs_controller_type.value.cluster_name)
         controller_status = status_lib.ClusterStatus.UP
-        rich_utils.force_update_status(
+        rich_utils.force_update_status(
+            ux_utils.spinner_message('Checking managed jobs'))
 
         assert handle is not None, (controller_status, refresh)
 
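Note: the hunks above are representative of a release-wide UX change — raw Rich markup strings such as '[bold cyan]...[/]' are replaced by helpers from sky.utils.ux_utils. A minimal sketch of the resulting call pattern, inferred only from the calls visible in this diff rather than from a documented public API; do_setup_work() is a hypothetical placeholder for the wrapped body:

    from sky.utils import rich_utils
    from sky.utils import ux_utils

    def do_setup_work():
        """Hypothetical stand-in for the work done under the spinner."""

    # Open a spinner with a consistently formatted message for the whole block.
    with rich_utils.safe_status(
            ux_utils.spinner_message('Initializing managed job')):
        do_setup_work()
        # Refresh the same spinner in place as the operation progresses.
        rich_utils.force_update_status(
            ux_utils.spinner_message('Checking managed jobs'))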
sky/jobs/utils.py
CHANGED
@@ -34,6 +34,7 @@ from sky.utils import common_utils
 from sky.utils import log_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
+from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import sky
@@ -57,11 +58,13 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
 
 _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
 
-_JOB_WAITING_STATUS_MESSAGE = (
-… (1 removed line not captured)
+_JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
+    'Waiting for task to start[/]'
+    '{status_str}. It may take a few minutes.\n'
+    ' [dim]View controller logs: sky jobs logs --controller {job_id}')
 _JOB_CANCELLED_MESSAGE = (
-… (2 removed lines not captured)
+    ux_utils.spinner_message('Waiting for task status to be updated.') +
+    ' It may take a minute.')
 
 # The maximum time to wait for the managed job status to transition to terminal
 # state, after the job finished. This is a safeguard to avoid the case where
@@ -290,8 +293,8 @@ def cancel_job_by_name(job_name: str) -> str:
 def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
     """Stream logs by job id."""
     controller_status = job_lib.get_status(job_id)
-    status_msg = (
-… (1 removed line not captured)
+    status_msg = ux_utils.spinner_message(
+        'Waiting for controller process to be RUNNING') + '{status_str}'
     status_display = rich_utils.safe_status(status_msg.format(status_str=''))
     num_tasks = managed_job_state.get_num_tasks(job_id)
 
@@ -310,7 +313,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
             time.sleep(_LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS)
             controller_status = job_lib.get_status(job_id)
 
-        msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='')
+        msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
         status_display.update(msg)
         prev_msg = msg
         managed_job_status = managed_job_state.get_status(job_id)
@@ -356,7 +359,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                 logger.debug(
                     f'INFO: The log is not ready yet{status_str}. '
                     f'Waiting for {JOB_STATUS_CHECK_GAP_SECONDS} seconds.')
-                msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str=status_str)
+                msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str=status_str,
+                                                         job_id=job_id)
                 if msg != prev_msg:
                     status_display.update(msg)
                     prev_msg = msg
@@ -444,8 +448,9 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
     managed_job_status = managed_job_state.get_status(job_id)
     assert managed_job_status is not None, job_id
 
-    logger.info(
-… (1 removed line not captured)
+    logger.info(
+        ux_utils.finishing_message(f'Managed job finished: {job_id} '
+                                   f'(status: {managed_job_status.value}).'))
     return ''
 
 
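Because _JOB_WAITING_STATUS_MESSAGE now embeds both a {status_str} and a {job_id} placeholder, every .format() call site in the hunks above had to be updated to pass job_id as well; str.format raises KeyError for any named placeholder it is not given. A small standalone illustration (the template text is abbreviated and illustrative, not the real message):

    # Two named placeholders, mirroring the updated template.
    template = ('Waiting for task to start{status_str}. '
                'View controller logs: sky jobs logs --controller {job_id}')

    # Both placeholders must be supplied, even when status_str is empty.
    print(template.format(status_str='', job_id=42))
    print(template.format(status_str=' (status: PENDING)', job_id=42))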
sky/optimizer.py
CHANGED
@@ -123,22 +123,23 @@ class Optimizer:
                 for a task.
             exceptions.NoCloudAccessError: if no public clouds are enabled.
         """
-        … (16 removed lines not captured)
+        with rich_utils.safe_status(ux_utils.spinner_message('Optimizing')):
+            _check_specified_clouds(dag)
+
+            # This function is effectful: mutates every node in 'dag' by setting
+            # node.best_resources if it is None.
+            Optimizer._add_dummy_source_sink_nodes(dag)
+            try:
+                unused_best_plan = Optimizer._optimize_dag(
+                    dag=dag,
+                    minimize_cost=minimize == OptimizeTarget.COST,
+                    blocked_resources=blocked_resources,
+                    quiet=quiet)
+            finally:
+                # Make sure to remove the dummy source/sink nodes, even if the
+                # optimization fails.
+                Optimizer._remove_dummy_source_sink_nodes(dag)
+            return dag
 
     @staticmethod
     def _add_dummy_source_sink_nodes(dag: 'dag_lib.Dag'):
@@ -259,6 +260,9 @@ class Optimizer:
             launchable_resources: Dict[resources_lib.Resources,
                                        List[resources_lib.Resources]]
     ) -> Dict[resources_lib.Resources, int]:
+        if not resources_utils.need_to_query_reservations():
+            return {}
+
         num_available_reserved_nodes_per_resource = {}
 
         def get_reservations_available_resources(
@@ -269,7 +273,7 @@ class Optimizer:
         launchable_resources_list: List[resources_lib.Resources] = sum(
             launchable_resources.values(), [])
         with rich_utils.safe_status(
-            … (1 removed line not captured)
+                ux_utils.spinner_message('Checking reserved resources')):
             subprocess_utils.run_in_parallel(
                 get_reservations_available_resources,
                 launchable_resources_list)
@@ -337,8 +341,8 @@ class Optimizer:
                 if minimize_cost:
                     cost_per_node = resources.get_cost(estimated_runtime)
                     num_available_reserved_nodes = (
-                        num_available_reserved_nodes_per_resource
-                        … (1 removed line not captured)
+                        num_available_reserved_nodes_per_resource.get(
+                            resources, 0))
 
                     # We consider the cost of the unused reservation
                     # resources to be 0 since we are already paying for
@@ -384,10 +388,14 @@ class Optimizer:
                 fuzzy_candidates_str = (
                     f'\nTry one of these offered accelerators: {cyan}'
                     f'{fuzzy_candidates}{reset}')
+                node_resources_reprs = ', '.join(f'{node.num_nodes}x ' +
+                                                 r.repr_with_region_zone
+                                                 for r in node.resources)
                 error_msg = (
                     f'{source_hint.capitalize()} does not contain any '
-                    f'instances satisfying the request
-                    f'
+                    f'instances satisfying the request: '
+                    f'{node_resources_reprs}.'
+                    f'\nTo fix: relax or change the '
                     f'resource requirements.{fuzzy_candidates_str}\n\n'
                     f'Hint: {bold}sky show-gpus{reset} '
                     'to list available accelerators.\n'
@@ -716,7 +724,6 @@ class Optimizer:
         node_to_cost_map: _TaskToCostMap,
         minimize_cost: bool,
     ):
-        logger.info('== Optimizer ==')
         ordered_node_to_cost_map = collections.OrderedDict()
         ordered_best_plan = collections.OrderedDict()
         for node in topo_order:
@@ -738,15 +745,18 @@ class Optimizer:
                 node.get_inputs() is None and node.get_outputs() is None):
             print_hourly_cost = True
 
-        if
-        … (8 removed lines not captured)
+        if not env_options.Options.MINIMIZE_LOGGING.get():
+            if print_hourly_cost:
+                logger.info(
+                    f'{colorama.Style.BRIGHT}Estimated cost: '
+                    f'{colorama.Style.RESET_ALL}${total_cost:.1f} / hour\n')
+            else:
+                logger.info(
+                    f'{colorama.Style.BRIGHT}Estimated total runtime: '
+                    f'{colorama.Style.RESET_ALL}{total_time / 3600:.1f} '
+                    'hours\n'
+                    f'{colorama.Style.BRIGHT}Estimated total cost: '
+                    f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
 
         def _get_resources_element_list(
                 resources: 'resources_lib.Resources') -> List[str]:
@@ -845,7 +855,7 @@ class Optimizer:
         best_plan_table = _create_table(['TASK', '#NODES'] +
                                         resource_fields)
         best_plan_table.add_rows(best_plan_rows)
-        logger.info(f'{best_plan_table}
+        logger.info(f'{best_plan_table}')
 
         # Print the egress plan if any data egress is scheduled.
         Optimizer._print_egress_plan(graph, best_plan, minimize_cost)
@@ -864,6 +874,10 @@ class Optimizer:
         }
         task_str = (f'for task {task.name!r} ' if num_tasks > 1 else '')
         plural = 's' if task.num_nodes > 1 else ''
+        if num_tasks > 1:
+            # Add a new line for better readability, when there are multiple
+            # tasks.
+            logger.info('')
         logger.info(
             f'{colorama.Style.BRIGHT}Considered resources {task_str}'
             f'({task.num_nodes} node{plural}):'
@@ -934,7 +948,7 @@ class Optimizer:
 
         table = _create_table(field_names)
         table.add_rows(rows)
-        logger.info(f'{table}
+        logger.info(f'{table}')
 
         # Warning message for using disk_tier=ultra
         # TODO(yi): Consider price of disks in optimizer and
@@ -965,10 +979,10 @@ class Optimizer:
                         f'Multiple {cloud} instances satisfy '
                         f'{acc_name}:{int(acc_count)}. '
                         f'The cheapest {candidate_list[0]!r} is considered '
-                        f'among:\n{instance_list}
+                        f'among:\n{instance_list}.')
                     if is_multi_instances:
                         logger.info(
-                            f'To list more details, run
+                            f'To list more details, run: sky show-gpus {acc_name}\n')
 
     @staticmethod
     def _optimize_dag(
@@ -1101,8 +1115,7 @@ class Optimizer:
             Optimizer.print_optimized_plan(graph, topo_order, best_plan,
                                            total_time, total_cost,
                                            node_to_cost_map, minimize_cost)
-            … (1 removed line not captured)
-                Optimizer._print_candidates(local_node_to_candidate_map)
+            Optimizer._print_candidates(local_node_to_candidate_map)
         return best_plan
 
 
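Two of the optimizer hunks change how reserved capacity is counted: the whole query is skipped when no enabled cloud is configured to use reservations, and the per-resource lookup now falls back to 0 via dict.get() instead of indexing. A self-contained sketch of that control flow (function names and data here are illustrative stand-ins, not the SkyPilot API):

    from typing import Dict, List

    def count_reserved_nodes(resources_list: List[str],
                             reservations_enabled: bool) -> Dict[str, int]:
        # Short-circuit: skip the (potentially slow) per-cloud queries entirely
        # when no cloud is configured with reservations, mirroring the new
        # need_to_query_reservations() guard.
        if not reservations_enabled:
            return {}
        # Stand-in for the parallel reservation queries the real code runs
        # via subprocess_utils.run_in_parallel.
        fake_reservations = {'a100-8x': 2}
        return {res: fake_reservations.get(res, 0) for res in resources_list}

    counts = count_reserved_nodes(['a100-8x', 't4-1x'], reservations_enabled=True)
    # Consumers use .get(resources, 0) so resources without a reservation
    # entry contribute zero reserved nodes instead of raising KeyError.
    print(counts.get('t4-1x', 0))  # -> 0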
sky/provision/aws/config.py
CHANGED
@@ -16,10 +16,12 @@ from typing import Any, Dict, List, Optional, Set, Tuple
 
 import colorama
 
+from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import aws
 from sky.provision import common
 from sky.provision.aws import utils
+from sky.utils import common_utils
 
 logger = sky_logging.init_logger(__name__)
 
@@ -535,12 +537,19 @@ def _get_or_create_vpc_security_group(ec2, vpc_id: str,
     if vpc_id in vpc_to_existing_sg:
         return vpc_to_existing_sg[vpc_id]
 
-    … (6 removed lines not captured)
+    try:
+        # create a new security group
+        ec2.meta.client.create_security_group(
+            Description='Auto-created security group for Ray workers',
+            GroupName=expected_sg_name,
+            VpcId=vpc_id,
+        )
+    except ec2.meta.client.exceptions.ClientError as e:
+        message = ('Failed to create security group. Error: '
+                   f'{common_utils.format_exception(e)}')
+        logger.warning(message)
+        raise exceptions.NoClusterLaunchedError(message) from e
+
 
     security_group = _get_security_groups_from_vpc_ids(ec2, [vpc_id],
                                                        [expected_sg_name])
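The new try/except around create_security_group converts a low-level boto3 ClientError into SkyPilot's NoClusterLaunchedError, which the provisioner change later in this diff uses to skip teardown when nothing was actually created. A hedged sketch of the same wrap-and-translate pattern outside SkyPilot (the exception class here is a placeholder, not sky.exceptions):

    import boto3
    from botocore.exceptions import ClientError


    class NoClusterLaunchedError(Exception):
        """Placeholder for sky.exceptions.NoClusterLaunchedError."""


    def create_security_group(vpc_id: str, sg_name: str) -> None:
        ec2 = boto3.resource('ec2')
        try:
            ec2.meta.client.create_security_group(
                Description='Auto-created security group for Ray workers',
                GroupName=sg_name,
                VpcId=vpc_id)
        except ClientError as e:
            # Translate the provider error into a domain exception so callers
            # can tell "nothing was launched" apart from mid-launch failures.
            raise NoClusterLaunchedError(
                f'Failed to create security group: {e}') from e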
sky/provision/azure/config.py
CHANGED
@@ -5,16 +5,18 @@ a cluster to be launched.
 """
 import hashlib
 import json
-import logging
 from pathlib import Path
 import random
 import time
 from typing import Any, Callable
 
+from sky import exceptions
+from sky import sky_logging
 from sky.adaptors import azure
 from sky.provision import common
+from sky.utils import common_utils
 
-logger =
+logger = sky_logging.init_logger(__name__)
 
 UNIQUE_ID_LEN = 4
 _DEPLOYMENT_NAME = 'skypilot-config'
@@ -92,10 +94,19 @@ def bootstrap_instances(
                 retry += 1
                 continue
             raise
+        except azure.exceptions().ClientAuthenticationError as e:
+            message = (
+                'Failed to authenticate with Azure. Please check your Azure '
+                f'credentials. Error: {common_utils.format_exception(e)}'
+            ).replace('\n', ' ')
+            logger.error(message)
+            raise exceptions.NoClusterLaunchedError(message) from e
     else:
-        … (1 removed line not captured)
+        message = (
             f'Timed out waiting for resource group {resource_group} to be '
             'deleted.')
+        logger.error(message)
+        raise TimeoutError(message)
 
     # load the template file
     current_path = Path(__file__).parent
sky/provision/azure/instance.py
CHANGED
@@ -441,15 +441,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     if to_start_count > 0:
         resource_client = azure.get_client('resource', subscription_id)
         logger.debug(f'run_instances: Creating {to_start_count} instances.')
-        … (9 removed lines not captured)
+        try:
+            created_instances = _create_instances(
+                compute_client=compute_client,
+                resource_client=resource_client,
+                cluster_name_on_cloud=cluster_name_on_cloud,
+                resource_group=resource_group,
+                provider_config=provider_config,
+                node_config=config.node_config,
+                tags=tags,
+                count=to_start_count)
+        except Exception as e:
+            err_message = common_utils.format_exception(
+                e, use_bracket=True).replace('\n', ' ')
+            logger.error(f'Failed to create instances: {err_message}')
+            raise
         created_instance_ids = [inst.name for inst in created_instances]
 
         non_running_instance_statuses = list(
sky/provision/kubernetes/instance.py
CHANGED
@@ -632,7 +632,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
     try:
         return _create_pods(region, cluster_name_on_cloud, config)
     except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
-        … (1 removed line not captured)
+        e_msg = common_utils.format_exception(e).replace('\n', ' ')
+        logger.warning('run_instances: Error occurred when creating pods: '
+                       f'{e_msg}')
         raise
 
 
sky/provision/provisioner.py
CHANGED
@@ -14,6 +14,7 @@ import colorama
 
 import sky
 from sky import clouds
+from sky import exceptions
 from sky import provision
 from sky import sky_logging
 from sky import status_lib
@@ -42,76 +43,50 @@ _TITLE = '\n\n' + '=' * 20 + ' {} ' + '=' * 20 + '\n'
 
 def _bulk_provision(
     cloud: clouds.Cloud,
     region: clouds.Region,
-    zones: Optional[List[clouds.Zone]],
     cluster_name: resources_utils.ClusterName,
     bootstrap_config: provision_common.ProvisionConfig,
 ) -> provision_common.ProvisionRecord:
     provider_name = repr(cloud)
     region_name = region.name
 
-    style = colorama.Style
-
-    if not zones:
-        # For Azure, zones is always an empty list.
-        zone_str = 'all zones'
-    else:
-        zone_str = ','.join(z.name for z in zones)
-
-    if isinstance(cloud, clouds.Kubernetes):
-        # Omit the region name for Kubernetes.
-        logger.info(f'{style.BRIGHT}Launching on {cloud}{style.RESET_ALL} '
-                    f'{cluster_name!r}.')
-    else:
-        logger.info(f'{style.BRIGHT}Launching on {cloud} '
-                    f'{region_name}{style.RESET_ALL} ({zone_str})')
-
     start = time.time()
-
+    # TODO(suquark): Should we cache the bootstrapped result?
+    # Currently it is not necessary as bootstrapping takes
+    # only ~3s, caching it seems over-engineering and could
+    # cause other issues like the cache is not synced
+    # with the cloud configuration.
+    config = provision.bootstrap_instances(provider_name, region_name,
+                                           cluster_name.name_on_cloud,
+                                           bootstrap_config)
+
+    provision_record = provision.run_instances(provider_name,
+                                               region_name,
+                                               cluster_name.name_on_cloud,
+                                               config=config)
+
+    backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=3)
+    logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
+    rich_utils.force_update_status(
+        ux_utils.spinner_message('Launching - Checking instance status',
+                                 str(provision_logging.config.log_path)))
+    # AWS would take a very short time (<<1s) updating the state of the
+    # instance.
+    time.sleep(1)
+    for retry_cnt in range(_MAX_RETRY):
         try:
-            … (13 removed lines not captured)
-                f'{common_utils.format_exception(e)}')
-            raise
-
-    provision_record = provision.run_instances(provider_name,
-                                               region_name,
-                                               cluster_name.name_on_cloud,
-                                               config=config)
-
-    backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=3)
-    logger.debug(
-        f'\nWaiting for instances of {cluster_name!r} to be ready...')
-    status.update('[bold cyan]Launching - Checking instance status[/]')
-    # AWS would take a very short time (<<1s) updating the state of the
-    # instance.
-    time.sleep(1)
-    for retry_cnt in range(_MAX_RETRY):
-        try:
-            provision.wait_instances(provider_name,
-                                     region_name,
-                                     cluster_name.name_on_cloud,
-                                     state=status_lib.ClusterStatus.UP)
-            break
-        except (aws.botocore_exceptions().WaiterError, RuntimeError):
-            time.sleep(backoff.current_backoff())
-    else:
-        raise RuntimeError(
-            f'Failed to wait for instances of {cluster_name!r} to be '
-            f'ready on the cloud provider after max retries {_MAX_RETRY}.')
-    logger.debug(
-        f'Instances of {cluster_name!r} are ready after {retry_cnt} '
-        'retries.')
+            provision.wait_instances(provider_name,
+                                     region_name,
+                                     cluster_name.name_on_cloud,
+                                     state=status_lib.ClusterStatus.UP)
+            break
+        except (aws.botocore_exceptions().WaiterError, RuntimeError):
+            time.sleep(backoff.current_backoff())
+    else:
+        raise RuntimeError(
+            f'Failed to wait for instances of {cluster_name!r} to be '
+            f'ready on the cloud provider after max retries {_MAX_RETRY}.')
+    logger.debug(f'Instances of {cluster_name!r} are ready after {retry_cnt} '
+                 'retries.')
 
     logger.debug(
         f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
@@ -162,8 +137,11 @@ def bulk_provision(
         logger.debug(
             'Provision config:\n'
             f'{json.dumps(dataclasses.asdict(bootstrap_config), indent=2)}')
-        return _bulk_provision(cloud, region,
+        return _bulk_provision(cloud, region, cluster_name,
                                bootstrap_config)
+    except exceptions.NoClusterLaunchedError:
+        # Skip the teardown if the cluster was never launched.
+        raise
     except Exception:  # pylint: disable=broad-except
         zone_str = 'all zones'
         if zones:
@@ -440,23 +418,30 @@ def _post_provision_setup(
     # We don't set docker_user here, as we are configuring the VM itself.
     ssh_credentials = backend_utils.ssh_credential_from_yaml(
         cluster_yaml, ssh_user=cluster_info.ssh_user)
+    docker_config = config_from_yaml.get('docker', {})
 
     with rich_utils.safe_status(
-        … (1 removed line not captured)
+            ux_utils.spinner_message(
+                'Launching - Waiting for SSH access',
+                provision_logging.config.log_path)) as status:
 
         logger.debug(
             f'\nWaiting for SSH to be available for {cluster_name!r} ...')
         wait_for_ssh(cluster_info, ssh_credentials)
-        logger.debug(f'SSH
+        logger.debug(f'SSH Connection ready for {cluster_name!r}')
+        vm_str = 'Instance' if cloud_name.lower() != 'kubernetes' else 'Pod'
         plural = '' if len(cluster_info.instances) == 1 else 's'
-        … (3 removed lines not captured)
+        verb = 'is' if len(cluster_info.instances) == 1 else 'are'
+        indent_str = (ux_utils.INDENT_SYMBOL
+                      if docker_config else ux_utils.INDENT_LAST_SYMBOL)
+        logger.info(f'{indent_str}{colorama.Style.DIM}{vm_str}{plural} {verb} '
+                    f'up.{colorama.Style.RESET_ALL}')
 
-        docker_config = config_from_yaml.get('docker', {})
         if docker_config:
             status.update(
-                … (1 removed line not captured)
+                ux_utils.spinner_message(
+                    'Launching - Initializing docker container',
+                    provision_logging.config.log_path))
             docker_user = instance_setup.initialize_docker(
                 cluster_name.name_on_cloud,
                 docker_config=docker_config,
@@ -470,6 +455,8 @@ def _post_provision_setup(
             cluster_info.docker_user = docker_user
             ssh_credentials['docker_user'] = docker_user
             logger.debug(f'Docker user: {docker_user}')
+            logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
+                        f'Docker container is up.{colorama.Style.RESET_ALL}')
 
         # We mount the metadata with sky wheel for speedup.
         # NOTE: currently we mount all credentials for all nodes, because
@@ -482,8 +469,9 @@ def _post_provision_setup(
         # for later.
         file_mounts = config_from_yaml.get('file_mounts', {})
 
-        runtime_preparation_str = (
-        … (1 removed line not captured)
+        runtime_preparation_str = (ux_utils.spinner_message(
+            'Preparing SkyPilot runtime ({step}/3 - {step_name})',
+            provision_logging.config.log_path))
         status.update(
             runtime_preparation_str.format(step=1, step_name='initializing'))
         instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
@@ -551,8 +539,9 @@ def _post_provision_setup(
     instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
                                              cluster_info, ssh_credentials)
 
-    logger.info(
-    … (1 removed line not captured)
+    logger.info(
+        ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
+                                   provision_logging.config.log_path))
     return cluster_info
 
 
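The bulk_provision hunk inserts a dedicated except exceptions.NoClusterLaunchedError: raise ahead of the broad except Exception handler that normally triggers teardown; except clauses are matched in order, so the specific one must come first. A minimal sketch of why the ordering matters (provision_fn and teardown_fn are stand-ins, not SkyPilot calls):

    class NoClusterLaunchedError(Exception):
        """Placeholder for sky.exceptions.NoClusterLaunchedError."""


    def bulk_provision_sketch(provision_fn, teardown_fn):
        try:
            return provision_fn()
        except NoClusterLaunchedError:
            # Nothing was created on the cloud, so there is nothing to tear
            # down; re-raise without running the cleanup below.
            raise
        except Exception:
            # Any other failure may have left partial resources behind.
            teardown_fn()
            raise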