skypilot-nightly 1.0.0.dev20241011__py3-none-any.whl → 1.0.0.dev20241013__py3-none-any.whl
This diff shows the differences between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -1
- sky/adaptors/common.py +6 -2
- sky/backends/backend.py +9 -4
- sky/backends/backend_utils.py +13 -16
- sky/backends/cloud_vm_ray_backend.py +207 -161
- sky/backends/local_docker_backend.py +3 -1
- sky/benchmark/benchmark_utils.py +5 -4
- sky/cli.py +128 -31
- sky/clouds/service_catalog/aws_catalog.py +6 -7
- sky/clouds/service_catalog/common.py +4 -3
- sky/clouds/service_catalog/cudo_catalog.py +11 -1
- sky/core.py +4 -2
- sky/data/storage.py +44 -32
- sky/data/storage_utils.py +12 -7
- sky/exceptions.py +5 -0
- sky/execution.py +10 -24
- sky/jobs/__init__.py +2 -0
- sky/jobs/core.py +87 -7
- sky/jobs/utils.py +35 -19
- sky/optimizer.py +50 -37
- sky/provision/aws/config.py +15 -6
- sky/provision/azure/config.py +14 -3
- sky/provision/azure/instance.py +15 -9
- sky/provision/kubernetes/instance.py +3 -1
- sky/provision/kubernetes/utils.py +25 -0
- sky/provision/provisioner.py +63 -74
- sky/serve/core.py +42 -40
- sky/sky_logging.py +9 -5
- sky/skylet/log_lib.py +5 -4
- sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
- sky/utils/cli_utils/status_utils.py +168 -21
- sky/utils/command_runner.py +11 -11
- sky/utils/common_utils.py +22 -5
- sky/utils/controller_utils.py +78 -29
- sky/utils/env_options.py +22 -7
- sky/utils/log_utils.py +39 -24
- sky/utils/resources_utils.py +23 -0
- sky/utils/rich_utils.py +55 -5
- sky/utils/ux_utils.py +63 -4
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/RECORD +46 -46
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/top_level.txt +0 -0
@@ -122,9 +122,6 @@ _RSYNC_NOT_FOUND_MESSAGE = (
|
|
122
122
|
|
123
123
|
_TPU_NOT_FOUND_ERROR = 'ERROR: (gcloud.compute.tpus.delete) NOT_FOUND'
|
124
124
|
|
125
|
-
_CTRL_C_TIP_MESSAGE = ('INFO: Tip: use Ctrl-C to exit log streaming '
|
126
|
-
'(task will not be killed).')
|
127
|
-
|
128
125
|
_MAX_RAY_UP_RETRY = 5
|
129
126
|
|
130
127
|
# Number of retries for getting zones.
|
@@ -405,22 +402,35 @@ class RayCodeGen:
|
|
405
402
|
**gpu_dict,
|
406
403
|
})
|
407
404
|
|
405
|
+
streaming_message = (
|
406
|
+
f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
|
407
|
+
f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
|
408
|
+
f'be killed){colorama.Style.RESET_ALL}')
|
408
409
|
self._code += [
|
409
410
|
textwrap.dedent(f"""\
|
410
411
|
pg = ray_util.placement_group({json.dumps(bundles)}, 'STRICT_SPREAD')
|
411
412
|
plural = 's' if {num_nodes} > 1 else ''
|
412
413
|
node_str = f'{num_nodes} node{{plural}}'
|
413
414
|
|
414
|
-
message
|
415
|
-
|
416
|
-
|
417
|
-
|
415
|
+
# We have this `INFO: Tip:` message only for backward
|
416
|
+
# compatibility, because if a cluster has the old SkyPilot version,
|
417
|
+
# it relies on this message to start log streaming.
|
418
|
+
# This message will be skipped for new clusters, because we use
|
419
|
+
# start_streaming_at for the `Waiting for task resources on`
|
420
|
+
# message.
|
421
|
+
# TODO: Remove this message in v0.9.0.
|
422
|
+
message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}INFO: '
|
423
|
+
'Tip: use Ctrl-C to exit log streaming, not kill '
|
424
|
+
'the job.{colorama.Style.RESET_ALL}\\n')
|
425
|
+
message += ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
|
426
|
+
'Waiting for task resources on '
|
427
|
+
f'{{node_str}}.{colorama.Style.RESET_ALL}')
|
428
|
+
print(message, flush=True)
|
418
429
|
# FIXME: This will print the error message from autoscaler if
|
419
430
|
# it is waiting for other task to finish. We should hide the
|
420
431
|
# error message.
|
421
432
|
ray.get(pg.ready())
|
422
|
-
print(
|
423
|
-
flush=True)
|
433
|
+
print({streaming_message!r}, flush=True)
|
424
434
|
""")
|
425
435
|
]
|
426
436
|
|
@@ -496,7 +506,6 @@ class RayCodeGen:
|
|
496
506
|
)).remote()
|
497
507
|
for i in range(pg.bundle_count)
|
498
508
|
])
|
499
|
-
print('INFO: Reserved IPs:', gang_scheduling_id_to_ip)
|
500
509
|
|
501
510
|
cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
|
502
511
|
job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
|
@@ -743,15 +752,14 @@ class FailoverCloudErrorHandlerV1:
|
|
743
752
|
region: 'clouds.Region',
|
744
753
|
zones: Optional[List['clouds.Zone']], stdout: str,
|
745
754
|
stderr: str):
|
746
|
-
del zones # Unused.
|
755
|
+
del region, zones # Unused.
|
747
756
|
errors = FailoverCloudErrorHandlerV1._handle_errors(
|
748
757
|
stdout,
|
749
758
|
stderr,
|
750
759
|
is_error_str_known=lambda x: 'LambdaCloudError:' in x.strip())
|
751
|
-
|
752
|
-
messages = '\n\t'.join(errors)
|
760
|
+
messages = '\n '.join(errors)
|
753
761
|
style = colorama.Style
|
754
|
-
logger.warning(f'{style.DIM}
|
762
|
+
logger.warning(f' {style.DIM}{messages}{style.RESET_ALL}')
|
755
763
|
_add_to_blocked_resources(blocked_resources,
|
756
764
|
launchable_resources.copy(zone=None))
|
757
765
|
|
@@ -926,6 +934,10 @@ class FailoverCloudErrorHandlerV2:
|
|
926
934
|
_add_to_blocked_resources(
|
927
935
|
blocked_resources,
|
928
936
|
resources_lib.Resources(cloud=clouds.Azure()))
|
937
|
+
elif 'ClientAuthenticationError' in str(err):
|
938
|
+
_add_to_blocked_resources(
|
939
|
+
blocked_resources,
|
940
|
+
resources_lib.Resources(cloud=clouds.Azure()))
|
929
941
|
else:
|
930
942
|
_add_to_blocked_resources(blocked_resources,
|
931
943
|
launchable_resources.copy(zone=None))
|
@@ -1224,9 +1236,10 @@ class RetryingVmProvisioner(object):
|
|
1224
1236
|
|
1225
1237
|
if prev_cluster_status != status_lib.ClusterStatus.UP:
|
1226
1238
|
logger.info(
|
1227
|
-
f'Cluster {cluster_name!r} (status: '
|
1228
|
-
f'{prev_cluster_status.value}) was previously
|
1229
|
-
f'
|
1239
|
+
f'{colorama.Style.DIM}Cluster {cluster_name!r} (status: '
|
1240
|
+
f'{prev_cluster_status.value}) was previously in '
|
1241
|
+
f'{cloud} ({region.name}). Restarting.'
|
1242
|
+
f'{colorama.Style.RESET_ALL}')
|
1230
1243
|
yield zones
|
1231
1244
|
|
1232
1245
|
# If it reaches here: the cluster status in the database gets
|
@@ -1303,17 +1316,14 @@ class RetryingVmProvisioner(object):
|
|
1303
1316
|
prev_cluster_ever_up: bool,
|
1304
1317
|
) -> Dict[str, Any]:
|
1305
1318
|
"""The provision retry loop."""
|
1306
|
-
style = colorama.Style
|
1307
|
-
fore = colorama.Fore
|
1308
1319
|
# Get log_path name
|
1309
1320
|
log_path = os.path.join(self.log_dir, 'provision.log')
|
1310
1321
|
log_abs_path = os.path.abspath(log_path)
|
1311
1322
|
if not dryrun:
|
1312
1323
|
os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
|
1313
1324
|
os.system(f'touch {log_path}')
|
1314
|
-
|
1315
|
-
|
1316
|
-
f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
|
1325
|
+
rich_utils.force_update_status(
|
1326
|
+
ux_utils.spinner_message('Launching', log_path))
|
1317
1327
|
|
1318
1328
|
# Get previous cluster status
|
1319
1329
|
cluster_exists = prev_cluster_status is not None
|
@@ -1481,6 +1491,23 @@ class RetryingVmProvisioner(object):
|
|
1481
1491
|
if to_provision.cloud.OPEN_PORTS_VERSION <=
|
1482
1492
|
clouds.OpenPortsVersion.LAUNCH_ONLY else None)
|
1483
1493
|
try:
|
1494
|
+
controller = controller_utils.Controllers.from_name(
|
1495
|
+
cluster_name)
|
1496
|
+
controller_str = ('' if controller is None else
|
1497
|
+
f' {controller.value.name}')
|
1498
|
+
if isinstance(to_provision.cloud, clouds.Kubernetes):
|
1499
|
+
# Omit the region name for Kubernetes.
|
1500
|
+
logger.info(
|
1501
|
+
ux_utils.starting_message(
|
1502
|
+
f'Launching{controller_str} on '
|
1503
|
+
f'{to_provision.cloud}.'))
|
1504
|
+
else:
|
1505
|
+
logger.info(
|
1506
|
+
ux_utils.starting_message(
|
1507
|
+
f'Launching{controller_str} on '
|
1508
|
+
f'{to_provision.cloud} '
|
1509
|
+
f'{region.name}{colorama.Style.RESET_ALL}'
|
1510
|
+
f'{zone_str}.'))
|
1484
1511
|
provision_record = provisioner.bulk_provision(
|
1485
1512
|
to_provision.cloud,
|
1486
1513
|
region,
|
@@ -1528,6 +1555,7 @@ class RetryingVmProvisioner(object):
|
|
1528
1555
|
'region_name': region.name,
|
1529
1556
|
'zone_str': zone_str,
|
1530
1557
|
}
|
1558
|
+
|
1531
1559
|
status, stdout, stderr, head_internal_ip, head_external_ip = (
|
1532
1560
|
self._gang_schedule_ray_up(to_provision.cloud,
|
1533
1561
|
cluster_config_file, handle,
|
@@ -1566,9 +1594,9 @@ class RetryingVmProvisioner(object):
|
|
1566
1594
|
self._ensure_cluster_ray_started(handle, log_abs_path)
|
1567
1595
|
|
1568
1596
|
config_dict['handle'] = handle
|
1569
|
-
|
1570
|
-
|
1571
|
-
|
1597
|
+
logger.info(
|
1598
|
+
ux_utils.finishing_message(
|
1599
|
+
f'Cluster launched: {cluster_name!r}.', log_path))
|
1572
1600
|
return config_dict
|
1573
1601
|
|
1574
1602
|
# The cluster is not ready. We must perform error recording and/or
|
@@ -1633,17 +1661,15 @@ class RetryingVmProvisioner(object):
|
|
1633
1661
|
|
1634
1662
|
if to_provision.zone is not None:
|
1635
1663
|
message = (
|
1636
|
-
f'Failed to acquire resources in {to_provision.zone}
|
1637
|
-
'
|
1664
|
+
f'Failed to acquire resources in {to_provision.zone} for '
|
1665
|
+
f'{requested_resources}. ')
|
1638
1666
|
elif to_provision.region is not None:
|
1639
1667
|
# For public clouds, provision.region is always set.
|
1640
1668
|
message = ('Failed to acquire resources in all zones in '
|
1641
|
-
f'{to_provision.region}
|
1642
|
-
'requirements or use another region.')
|
1669
|
+
f'{to_provision.region} for {requested_resources}. ')
|
1643
1670
|
else:
|
1644
|
-
message = (f'Failed to acquire resources in {to_provision.cloud}
|
1645
|
-
'
|
1646
|
-
'cloud provider.')
|
1671
|
+
message = (f'Failed to acquire resources in {to_provision.cloud} '
|
1672
|
+
f'for {requested_resources}. ')
|
1647
1673
|
# Do not failover to other locations if the cluster was ever up, since
|
1648
1674
|
# the user can have some data on the cluster.
|
1649
1675
|
raise exceptions.ResourcesUnavailableError(
|
@@ -1694,7 +1720,7 @@ class RetryingVmProvisioner(object):
|
|
1694
1720
|
log_abs_path,
|
1695
1721
|
stream_logs=False,
|
1696
1722
|
start_streaming_at='Shared connection to',
|
1697
|
-
line_processor=log_utils.RayUpLineProcessor(),
|
1723
|
+
line_processor=log_utils.RayUpLineProcessor(log_abs_path),
|
1698
1724
|
# Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
|
1699
1725
|
# time during 'ray up' if insufficient capacity occurs.
|
1700
1726
|
env=dict(
|
@@ -1714,13 +1740,14 @@ class RetryingVmProvisioner(object):
|
|
1714
1740
|
|
1715
1741
|
region_name = logging_info['region_name']
|
1716
1742
|
zone_str = logging_info['zone_str']
|
1717
|
-
style = colorama.Style
|
1718
1743
|
if isinstance(to_provision_cloud, clouds.Kubernetes):
|
1719
|
-
logger.info(
|
1720
|
-
|
1744
|
+
logger.info(
|
1745
|
+
ux_utils.starting_message(
|
1746
|
+
f'Launching on {to_provision_cloud}.'))
|
1721
1747
|
else:
|
1722
|
-
logger.info(
|
1723
|
-
|
1748
|
+
logger.info(
|
1749
|
+
ux_utils.starting_message(f'Launching on {to_provision_cloud} '
|
1750
|
+
f'{region_name}{zone_str}.'))
|
1724
1751
|
start = time.time()
|
1725
1752
|
|
1726
1753
|
# Edge case: /tmp/ray does not exist, so autoscaler can't create/store
|
@@ -1822,11 +1849,6 @@ class RetryingVmProvisioner(object):
|
|
1822
1849
|
head_internal_ip, head_external_ip)
|
1823
1850
|
|
1824
1851
|
# All code below is handling num_nodes > 1.
|
1825
|
-
provision_str = ('Successfully provisioned or found existing head '
|
1826
|
-
'instance.')
|
1827
|
-
logger.info(f'{style.BRIGHT}{provision_str} '
|
1828
|
-
f'Waiting for workers.{style.RESET_ALL}')
|
1829
|
-
|
1830
1852
|
# FIXME(zongheng): the below requires ray processes are up on head. To
|
1831
1853
|
# repro it failing: launch a 2-node cluster, log into head and ray
|
1832
1854
|
# stop, then launch again.
|
@@ -2006,13 +2028,6 @@ class RetryingVmProvisioner(object):
|
|
2006
2028
|
# Provisioning succeeded.
|
2007
2029
|
break
|
2008
2030
|
|
2009
|
-
if to_provision.zone is None:
|
2010
|
-
region_or_zone_str = str(to_provision.region)
|
2011
|
-
else:
|
2012
|
-
region_or_zone_str = str(to_provision.zone)
|
2013
|
-
logger.warning(f'\n{style.BRIGHT}Provision failed for {num_nodes}x '
|
2014
|
-
f'{to_provision} in {region_or_zone_str}. '
|
2015
|
-
f'Trying other locations (if any).{style.RESET_ALL}')
|
2016
2031
|
if prev_cluster_status is None:
|
2017
2032
|
# Add failed resources to the blocklist, only when it
|
2018
2033
|
# is in fallback mode.
|
@@ -2027,8 +2042,10 @@ class RetryingVmProvisioner(object):
|
|
2027
2042
|
), prev_cluster_status
|
2028
2043
|
assert global_user_state.get_handle_from_cluster_name(
|
2029
2044
|
cluster_name) is None, cluster_name
|
2030
|
-
logger.info(
|
2031
|
-
|
2045
|
+
logger.info(
|
2046
|
+
ux_utils.retry_message(
|
2047
|
+
f'Retrying provisioning with requested resources: '
|
2048
|
+
f'{task.num_nodes}x {task.resources}'))
|
2032
2049
|
# Retry with the current, potentially "smaller" resources:
|
2033
2050
|
# to_provision == the current new resources (e.g., V100:1),
|
2034
2051
|
# which may be "smaller" than the original (V100:8).
|
@@ -2038,6 +2055,12 @@ class RetryingVmProvisioner(object):
|
|
2038
2055
|
prev_cluster_status = None
|
2039
2056
|
prev_handle = None
|
2040
2057
|
|
2058
|
+
retry_message = ux_utils.retry_message(
|
2059
|
+
'Trying other potential resources.')
|
2060
|
+
logger.warning(f'\n{retry_message}')
|
2061
|
+
log_path = os.path.join(self.log_dir, 'provision.log')
|
2062
|
+
rich_utils.force_update_status(
|
2063
|
+
ux_utils.spinner_message('Looking for resources', log_path))
|
2041
2064
|
# Set to None so that sky.optimize() will assign a new one
|
2042
2065
|
# (otherwise will skip re-optimizing this task).
|
2043
2066
|
# TODO: set all remaining tasks' best_resources to None.
|
@@ -2781,6 +2804,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2781
2804
|
local_wheel_path,
|
2782
2805
|
wheel_hash,
|
2783
2806
|
blocked_resources=task.blocked_resources)
|
2807
|
+
log_path = os.path.join(self.log_dir, 'provision.log')
|
2808
|
+
rich_utils.force_update_status(
|
2809
|
+
ux_utils.spinner_message('Launching', log_path))
|
2784
2810
|
config_dict = retry_provisioner.provision_with_retries(
|
2785
2811
|
task, to_provision_config, dryrun, stream_logs)
|
2786
2812
|
break
|
@@ -2796,27 +2822,34 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2796
2822
|
usage_lib.messages.usage.update_final_cluster_status(
|
2797
2823
|
None)
|
2798
2824
|
error_message = (
|
2799
|
-
'Failed to provision all
|
2800
|
-
'resources.'
|
2801
|
-
f'
|
2825
|
+
f'{colorama.Fore.RED}Failed to provision all '
|
2826
|
+
f'possible launchable resources.'
|
2827
|
+
f'{colorama.Style.RESET_ALL}'
|
2828
|
+
' Relax the task\'s resource requirements: '
|
2802
2829
|
f'{task.num_nodes}x {list(task.resources)[0]}')
|
2830
|
+
|
2831
|
+
log_path = retry_provisioner.log_dir + '/provision.log'
|
2803
2832
|
if retry_until_up:
|
2804
2833
|
logger.error(error_message)
|
2805
2834
|
# Sleep and retry.
|
2806
2835
|
gap_seconds = backoff.current_backoff()
|
2807
2836
|
plural = 's' if attempt_cnt > 1 else ''
|
2808
|
-
|
2809
|
-
f'
|
2810
|
-
f'{
|
2811
|
-
|
2812
|
-
|
2813
|
-
|
2837
|
+
retry_message = ux_utils.retry_message(
|
2838
|
+
f'Retry after {gap_seconds:.0f}s '
|
2839
|
+
f'({attempt_cnt} attempt{plural}). ')
|
2840
|
+
logger.info(f'\n{retry_message} '
|
2841
|
+
f'{ux_utils.log_path_hint(log_path)}'
|
2842
|
+
f'{colorama.Style.RESET_ALL}')
|
2814
2843
|
attempt_cnt += 1
|
2815
2844
|
time.sleep(gap_seconds)
|
2816
2845
|
continue
|
2846
|
+
logger.error(
|
2847
|
+
f'{colorama.Fore.RED}⨯{colorama.Style.RESET_ALL} '
|
2848
|
+
'Failed to provision resources. '
|
2849
|
+
f'{ux_utils.log_path_hint(log_path)}')
|
2817
2850
|
error_message += (
|
2818
|
-
'\nTo keep retrying until the cluster is up, use
|
2819
|
-
'`--retry-until-up` flag.')
|
2851
|
+
'\nTo keep retrying until the cluster is up, use '
|
2852
|
+
'the `--retry-until-up` flag.')
|
2820
2853
|
with ux_utils.print_exception_no_traceback():
|
2821
2854
|
raise exceptions.ResourcesUnavailableError(
|
2822
2855
|
error_message,
|
@@ -2927,7 +2960,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2927
2960
|
# and restarted if necessary.
|
2928
2961
|
logger.debug('Checking if skylet is running on the head node.')
|
2929
2962
|
with rich_utils.safe_status(
|
2930
|
-
'
|
2963
|
+
ux_utils.spinner_message('Preparing SkyPilot runtime')):
|
2931
2964
|
# We need to source bashrc for skylet to make sure the autostop
|
2932
2965
|
# event can access the path to the cloud CLIs.
|
2933
2966
|
self.run_on_head(handle,
|
@@ -2970,7 +3003,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2970
3003
|
cmd = job_lib.JobLibCodeGen.update_status()
|
2971
3004
|
logger.debug('Update job queue on remote cluster.')
|
2972
3005
|
with rich_utils.safe_status(
|
2973
|
-
'
|
3006
|
+
ux_utils.spinner_message('Preparing SkyPilot runtime')):
|
2974
3007
|
returncode, _, stderr = self.run_on_head(handle,
|
2975
3008
|
cmd,
|
2976
3009
|
require_outputs=True)
|
@@ -3005,7 +3038,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3005
3038
|
if not (cloud.OPEN_PORTS_VERSION <=
|
3006
3039
|
clouds.OpenPortsVersion.LAUNCH_ONLY):
|
3007
3040
|
with rich_utils.safe_status(
|
3008
|
-
|
3041
|
+
ux_utils.spinner_message(
|
3042
|
+
'Launching - Opening new ports')):
|
3009
3043
|
self._open_ports(handle)
|
3010
3044
|
|
3011
3045
|
with timeline.Event('backend.provision.post_process'):
|
@@ -3054,7 +3088,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3054
3088
|
dir_size = backend_utils.path_size_megabytes(full_workdir)
|
3055
3089
|
if dir_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
|
3056
3090
|
logger.warning(
|
3057
|
-
f'{fore.YELLOW}The size of workdir {workdir!r} '
|
3091
|
+
f' {fore.YELLOW}The size of workdir {workdir!r} '
|
3058
3092
|
f'is {dir_size} MB. Try to keep workdir small or use '
|
3059
3093
|
'.skyignore to exclude large files, as large sizes will slow '
|
3060
3094
|
f'down rsync.{style.RESET_ALL}')
|
@@ -3076,17 +3110,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3076
3110
|
num_nodes = handle.launched_nodes
|
3077
3111
|
plural = 's' if num_nodes > 1 else ''
|
3078
3112
|
logger.info(
|
3079
|
-
f'{
|
3080
|
-
f'{
|
3081
|
-
f' -> '
|
3082
|
-
f'{style.BRIGHT}{SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
|
3113
|
+
f' {style.DIM}Syncing workdir (to {num_nodes} node{plural}): '
|
3114
|
+
f'{workdir} -> {SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
|
3083
3115
|
os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
|
3084
3116
|
os.system(f'touch {log_path}')
|
3085
|
-
|
3086
|
-
|
3087
|
-
f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
|
3088
|
-
with rich_utils.safe_status('[bold cyan]Syncing[/]'):
|
3117
|
+
with rich_utils.safe_status(
|
3118
|
+
ux_utils.spinner_message('Syncing workdir', log_path)):
|
3089
3119
|
subprocess_utils.run_in_parallel(_sync_workdir_node, runners)
|
3120
|
+
logger.info(ux_utils.finishing_message('Workdir synced.', log_path))
|
3090
3121
|
|
3091
3122
|
def _sync_file_mounts(
|
3092
3123
|
self,
|
@@ -3095,17 +3126,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3095
3126
|
storage_mounts: Optional[Dict[Path, storage_lib.Storage]],
|
3096
3127
|
) -> None:
|
3097
3128
|
"""Mounts all user files to the remote nodes."""
|
3098
|
-
|
3099
|
-
|
3100
|
-
|
3101
|
-
|
3102
|
-
|
3129
|
+
with rich_utils.safe_status(ux_utils.spinner_message('Syncing files')):
|
3130
|
+
controller_utils.replace_skypilot_config_path_in_file_mounts(
|
3131
|
+
handle.launched_resources.cloud, all_file_mounts)
|
3132
|
+
self._execute_file_mounts(handle, all_file_mounts)
|
3133
|
+
self._execute_storage_mounts(handle, storage_mounts)
|
3134
|
+
self._set_storage_mounts_metadata(handle.cluster_name,
|
3135
|
+
storage_mounts)
|
3103
3136
|
|
3104
3137
|
def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
|
3105
3138
|
detach_setup: bool) -> None:
|
3106
3139
|
start = time.time()
|
3107
|
-
style = colorama.Style
|
3108
|
-
fore = colorama.Fore
|
3109
3140
|
|
3110
3141
|
if task.setup is None:
|
3111
3142
|
return
|
@@ -3161,7 +3192,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3161
3192
|
# and source ~/.bashrc in the setup_cmd.
|
3162
3193
|
# bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long
|
3163
3194
|
# bash: no job control in this shell
|
3164
|
-
|
3195
|
+
skip_num_lines=3)
|
3165
3196
|
return returncode
|
3166
3197
|
|
3167
3198
|
returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
|
@@ -3212,23 +3243,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3212
3243
|
|
3213
3244
|
num_nodes = len(runners)
|
3214
3245
|
plural = 's' if num_nodes > 1 else ''
|
3246
|
+
node_str = f'{num_nodes} VM{plural}'
|
3247
|
+
if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
|
3248
|
+
node_str = f'{num_nodes} pod{plural}'
|
3249
|
+
controller = controller_utils.Controllers.from_name(handle.cluster_name)
|
3250
|
+
if controller is not None:
|
3251
|
+
node_str = controller.value.name
|
3215
3252
|
if not detach_setup:
|
3216
|
-
logger.info(
|
3217
|
-
|
3253
|
+
logger.info(
|
3254
|
+
ux_utils.starting_message(f'Running setup on {node_str}.'))
|
3218
3255
|
# TODO(zhwu): run_in_parallel uses multi-thread to run the commands,
|
3219
3256
|
# which can cause the program waiting for all the threads to finish,
|
3220
3257
|
# even if some of them raise exceptions. We should replace it with
|
3221
3258
|
# multi-process.
|
3259
|
+
rich_utils.stop_safe_status()
|
3222
3260
|
subprocess_utils.run_in_parallel(_setup_node, range(num_nodes))
|
3223
3261
|
|
3224
3262
|
if detach_setup:
|
3225
3263
|
# Only set this when setup needs to be run outside the self._setup()
|
3226
3264
|
# as part of a job (--detach-setup).
|
3227
3265
|
self._setup_cmd = setup_cmd
|
3266
|
+
logger.info(ux_utils.finishing_message('Setup completed.'))
|
3228
3267
|
return
|
3229
|
-
logger.info(f'{fore.GREEN}Setup completed.{style.RESET_ALL}')
|
3230
3268
|
end = time.time()
|
3231
3269
|
logger.debug(f'Setup took {end - start} seconds.')
|
3270
|
+
setup_log_path = os.path.join(self.log_dir, 'setup-*.log')
|
3271
|
+
logger.info(
|
3272
|
+
ux_utils.finishing_message('Setup completed.', setup_log_path))
|
3232
3273
|
|
3233
3274
|
def _exec_code_on_head(
|
3234
3275
|
self,
|
@@ -3240,7 +3281,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3240
3281
|
) -> None:
|
3241
3282
|
"""Executes generated code on the head node."""
|
3242
3283
|
style = colorama.Style
|
3243
|
-
fore = colorama.Fore
|
3244
3284
|
|
3245
3285
|
script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
|
3246
3286
|
remote_log_dir = self.log_dir
|
@@ -3330,9 +3370,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3330
3370
|
f'Failed to submit job {job_id}.',
|
3331
3371
|
stderr=stdout + stderr)
|
3332
3372
|
|
3333
|
-
|
3334
|
-
|
3335
|
-
|
3373
|
+
controller = controller_utils.Controllers.from_name(handle.cluster_name)
|
3374
|
+
if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
|
3375
|
+
logger.info(ux_utils.starting_message('Service registered.'))
|
3376
|
+
else:
|
3377
|
+
logger.info(
|
3378
|
+
ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
|
3379
|
+
rich_utils.stop_safe_status()
|
3336
3380
|
try:
|
3337
3381
|
if not detach_run:
|
3338
3382
|
if (handle.cluster_name in controller_utils.Controllers.
|
@@ -3347,35 +3391,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3347
3391
|
controller = controller_utils.Controllers.from_name(name)
|
3348
3392
|
if controller == controller_utils.Controllers.JOBS_CONTROLLER:
|
3349
3393
|
logger.info(
|
3350
|
-
f'
|
3394
|
+
f'\n📋 Useful Commands'
|
3395
|
+
f'\nManaged Job ID: '
|
3351
3396
|
f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
|
3352
|
-
'\
|
3353
|
-
f'{
|
3354
|
-
f'{
|
3355
|
-
'\
|
3356
|
-
f'{
|
3357
|
-
f'{
|
3358
|
-
f'\
|
3359
|
-
f'{
|
3360
|
-
f'{
|
3361
|
-
'\
|
3362
|
-
f'{
|
3363
|
-
f'{
|
3364
|
-
'\
|
3365
|
-
f'{
|
3366
|
-
f'{
|
3397
|
+
f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t\t'
|
3398
|
+
f'{ux_utils.BOLD}sky jobs cancel {job_id}'
|
3399
|
+
f'{ux_utils.RESET_BOLD}'
|
3400
|
+
f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t\t'
|
3401
|
+
f'{ux_utils.BOLD}sky jobs logs {job_id}'
|
3402
|
+
f'{ux_utils.RESET_BOLD}'
|
3403
|
+
f'\n{ux_utils.INDENT_SYMBOL}To stream controller logs:\t\t'
|
3404
|
+
f'{ux_utils.BOLD}sky jobs logs --controller {job_id}'
|
3405
|
+
f'{ux_utils.RESET_BOLD}'
|
3406
|
+
f'\n{ux_utils.INDENT_SYMBOL}To view all managed jobs:\t\t'
|
3407
|
+
f'{ux_utils.BOLD}sky jobs queue'
|
3408
|
+
f'{ux_utils.RESET_BOLD}'
|
3409
|
+
f'\n{ux_utils.INDENT_LAST_SYMBOL}To view managed job '
|
3410
|
+
f'dashboard:\t{ux_utils.BOLD}sky jobs dashboard'
|
3411
|
+
f'{ux_utils.RESET_BOLD}')
|
3367
3412
|
elif controller is None:
|
3368
|
-
logger.info(f'
|
3369
|
-
f'{
|
3370
|
-
'\
|
3371
|
-
f'{
|
3372
|
-
f'{
|
3373
|
-
'\
|
3374
|
-
f'{
|
3375
|
-
f'{
|
3376
|
-
'\
|
3377
|
-
|
3378
|
-
f'{
|
3413
|
+
logger.info(f'\n📋 Useful Commands'
|
3414
|
+
f'\nJob ID: {job_id}'
|
3415
|
+
f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t'
|
3416
|
+
f'{ux_utils.BOLD}sky cancel {name} {job_id}'
|
3417
|
+
f'{ux_utils.RESET_BOLD}'
|
3418
|
+
f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t'
|
3419
|
+
f'{ux_utils.BOLD}sky logs {name} {job_id}'
|
3420
|
+
f'{ux_utils.RESET_BOLD}'
|
3421
|
+
f'\n{ux_utils.INDENT_LAST_SYMBOL}To view job '
|
3422
|
+
'queue:\t\t'
|
3423
|
+
f'{ux_utils.BOLD}sky queue {name}'
|
3424
|
+
f'{ux_utils.RESET_BOLD}')
|
3379
3425
|
|
3380
3426
|
def _add_job(self, handle: CloudVmRayResourceHandle,
|
3381
3427
|
job_name: Optional[str], resources_str: str) -> int:
|
@@ -3452,27 +3498,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3452
3498
|
|
3453
3499
|
def _post_execute(self, handle: CloudVmRayResourceHandle,
|
3454
3500
|
down: bool) -> None:
|
3455
|
-
fore = colorama.Fore
|
3456
|
-
style = colorama.Style
|
3457
3501
|
name = handle.cluster_name
|
3458
3502
|
controller = controller_utils.Controllers.from_name(name)
|
3459
|
-
if controller is not None
|
3503
|
+
if controller is not None:
|
3460
3504
|
return
|
3461
|
-
|
3462
|
-
f'\
|
3463
|
-
f'{
|
3464
|
-
|
3465
|
-
f'{
|
3466
|
-
'\
|
3467
|
-
f'{
|
3468
|
-
f'{
|
3469
|
-
'\
|
3470
|
-
f'
|
3471
|
-
f'{
|
3472
|
-
f'{
|
3473
|
-
'
|
3474
|
-
f'\t{backend_utils.BOLD}sky down {name}'
|
3475
|
-
f'{backend_utils.RESET_BOLD}')
|
3505
|
+
logger.info(f'\nCluster name: {name}'
|
3506
|
+
f'\n{ux_utils.INDENT_SYMBOL}To log into the head VM:\t'
|
3507
|
+
f'{ux_utils.BOLD}ssh {name}'
|
3508
|
+
f'{ux_utils.RESET_BOLD}'
|
3509
|
+
f'\n{ux_utils.INDENT_SYMBOL}To submit a job:'
|
3510
|
+
f'\t\t{ux_utils.BOLD}sky exec {name} yaml_file'
|
3511
|
+
f'{ux_utils.RESET_BOLD}'
|
3512
|
+
f'\n{ux_utils.INDENT_SYMBOL}To stop the cluster:'
|
3513
|
+
f'\t{ux_utils.BOLD}sky stop {name}'
|
3514
|
+
f'{ux_utils.RESET_BOLD}'
|
3515
|
+
f'\n{ux_utils.INDENT_LAST_SYMBOL}To teardown the cluster:'
|
3516
|
+
f'\t{ux_utils.BOLD}sky down {name}'
|
3517
|
+
f'{ux_utils.RESET_BOLD}')
|
3476
3518
|
if (gcp_utils.is_tpu(handle.launched_resources) and
|
3477
3519
|
not gcp_utils.is_tpu_vm(handle.launched_resources)):
|
3478
3520
|
logger.info('Tip: `sky down` will delete launched TPU(s) too.')
|
@@ -3808,11 +3850,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3808
3850
|
Raises:
|
3809
3851
|
RuntimeError: If the cluster fails to be terminated/stopped.
|
3810
3852
|
"""
|
3853
|
+
cluster_status_fetched = False
|
3811
3854
|
if refresh_cluster_status:
|
3812
|
-
|
3813
|
-
|
3814
|
-
|
3815
|
-
|
3855
|
+
try:
|
3856
|
+
prev_cluster_status, _ = (
|
3857
|
+
backend_utils.refresh_cluster_status_handle(
|
3858
|
+
handle.cluster_name,
|
3859
|
+
acquire_per_cluster_status_lock=False))
|
3860
|
+
cluster_status_fetched = True
|
3861
|
+
except exceptions.ClusterStatusFetchingError:
|
3862
|
+
logger.warning(
|
3863
|
+
'Failed to fetch cluster status for '
|
3864
|
+
f'{handle.cluster_name!r}. Assuming the cluster is still '
|
3865
|
+
'up.')
|
3866
|
+
if not cluster_status_fetched:
|
3816
3867
|
record = global_user_state.get_cluster_from_name(
|
3817
3868
|
handle.cluster_name)
|
3818
3869
|
prev_cluster_status = record[
|
@@ -3972,8 +4023,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3972
4023
|
f.flush()
|
3973
4024
|
|
3974
4025
|
teardown_verb = 'Terminating' if terminate else 'Stopping'
|
3975
|
-
with rich_utils.safe_status(
|
3976
|
-
|
4026
|
+
with rich_utils.safe_status(
|
4027
|
+
ux_utils.spinner_message(
|
4028
|
+
f'{teardown_verb}: {cluster_name}', log_path)):
|
3977
4029
|
# FIXME(zongheng): support retries. This call can fail for
|
3978
4030
|
# example due to GCP returning list requests per limit
|
3979
4031
|
# exceeded.
|
@@ -4053,7 +4105,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4053
4105
|
config = common_utils.read_yaml(handle.cluster_yaml)
|
4054
4106
|
tpu_node_config = config['provider'].get('tpu_node')
|
4055
4107
|
if tpu_node_config is None:
|
4056
|
-
with rich_utils.safe_status(
|
4108
|
+
with rich_utils.safe_status(
|
4109
|
+
ux_utils.spinner_message('Terminating TPU')):
|
4057
4110
|
tpu_rc, tpu_stdout, tpu_stderr = log_lib.run_with_log(
|
4058
4111
|
['bash', handle.tpu_delete_script],
|
4059
4112
|
log_abs_path,
|
@@ -4425,13 +4478,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4425
4478
|
to_provision = handle_before_refresh.launched_resources
|
4426
4479
|
self.check_resources_fit_cluster(handle_before_refresh, task)
|
4427
4480
|
|
4428
|
-
logger.info(
|
4429
|
-
f'{colorama.Fore.CYAN}Creating a new cluster: {cluster_name!r} '
|
4430
|
-
f'[{task.num_nodes}x {to_provision}].'
|
4431
|
-
f'{colorama.Style.RESET_ALL}\n'
|
4432
|
-
'Tip: to reuse an existing cluster, '
|
4433
|
-
'specify --cluster (-c). '
|
4434
|
-
'Run `sky status` to see existing clusters.')
|
4435
4481
|
return RetryingVmProvisioner.ToProvisionConfig(
|
4436
4482
|
cluster_name,
|
4437
4483
|
to_provision,
|
@@ -4454,7 +4500,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4454
4500
|
symlink_commands = []
|
4455
4501
|
fore = colorama.Fore
|
4456
4502
|
style = colorama.Style
|
4457
|
-
logger.info(f'{fore.CYAN}Processing file mounts.{style.RESET_ALL}')
|
4458
4503
|
start = time.time()
|
4459
4504
|
runners = handle.get_command_runners()
|
4460
4505
|
log_path = os.path.join(self.log_dir, 'file_mounts.log')
|
@@ -4468,20 +4513,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4468
4513
|
src_size = backend_utils.path_size_megabytes(full_src)
|
4469
4514
|
if src_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
|
4470
4515
|
logger.warning(
|
4471
|
-
f'{fore.YELLOW}The size of file mount src {src!r} '
|
4516
|
+
f' {fore.YELLOW}The size of file mount src {src!r} '
|
4472
4517
|
f'is {src_size} MB. Try to keep src small or use '
|
4473
4518
|
'.skyignore to exclude large files, as large sizes '
|
4474
4519
|
f'will slow down rsync. {style.RESET_ALL}')
|
4475
4520
|
if os.path.islink(full_src):
|
4476
4521
|
logger.warning(
|
4477
|
-
f'{fore.YELLOW}Source path {src!r} is a symlink. '
|
4522
|
+
f' {fore.YELLOW}Source path {src!r} is a symlink. '
|
4478
4523
|
f'Symlink contents are not uploaded.{style.RESET_ALL}')
|
4479
4524
|
|
4480
4525
|
os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
|
4481
4526
|
os.system(f'touch {log_path}')
|
4482
|
-
|
4483
|
-
|
4484
|
-
|
4527
|
+
|
4528
|
+
rich_utils.force_update_status(
|
4529
|
+
ux_utils.spinner_message('Syncing file mounts', log_path))
|
4485
4530
|
|
4486
4531
|
for dst, src in file_mounts.items():
|
4487
4532
|
# TODO: room for improvement. Here there are many moving parts
|
@@ -4576,6 +4621,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4576
4621
|
subprocess_utils.run_in_parallel(_symlink_node, runners)
|
4577
4622
|
end = time.time()
|
4578
4623
|
logger.debug(f'File mount sync took {end - start} seconds.')
|
4624
|
+
logger.info(ux_utils.finishing_message('Files synced.', log_path))
|
4579
4625
|
|
4580
4626
|
def _execute_storage_mounts(
|
4581
4627
|
self, handle: CloudVmRayResourceHandle,
|
@@ -4599,16 +4645,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4599
4645
|
# Handle cases when there aren't any Storages with MOUNT mode.
|
4600
4646
|
if not storage_mounts:
|
4601
4647
|
return
|
4602
|
-
|
4603
|
-
fore = colorama.Fore
|
4604
|
-
style = colorama.Style
|
4605
|
-
plural = 's' if len(storage_mounts) > 1 else ''
|
4606
|
-
logger.info(f'{fore.CYAN}Processing {len(storage_mounts)} '
|
4607
|
-
f'storage mount{plural}.{style.RESET_ALL}')
|
4608
4648
|
start = time.time()
|
4609
4649
|
runners = handle.get_command_runners()
|
4610
4650
|
log_path = os.path.join(self.log_dir, 'storage_mounts.log')
|
4611
4651
|
|
4652
|
+
plural = 's' if len(storage_mounts) > 1 else ''
|
4653
|
+
rich_utils.force_update_status(
|
4654
|
+
ux_utils.spinner_message(
|
4655
|
+
f'Mounting {len(storage_mounts)} storage{plural}', log_path))
|
4656
|
+
|
4612
4657
|
for dst, storage_obj in storage_mounts.items():
|
4613
4658
|
if not os.path.isabs(dst) and not dst.startswith('~/'):
|
4614
4659
|
dst = f'{SKY_REMOTE_WORKDIR}/{dst}'
|
@@ -4662,6 +4707,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4662
4707
|
|
4663
4708
|
end = time.time()
|
4664
4709
|
logger.debug(f'Storage mount sync took {end - start} seconds.')
|
4710
|
+
logger.info(ux_utils.finishing_message('Storage mounted.', log_path))
|
4665
4711
|
|
4666
4712
|
def _set_storage_mounts_metadata(
|
4667
4713
|
self, cluster_name: str,
|