skypilot-nightly 1.0.0.dev20250216__py3-none-any.whl → 1.0.0.dev20250218__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +48 -22
- sky/adaptors/aws.py +2 -1
- sky/adaptors/azure.py +4 -4
- sky/adaptors/cloudflare.py +4 -4
- sky/adaptors/kubernetes.py +8 -8
- sky/authentication.py +42 -45
- sky/backends/backend.py +2 -2
- sky/backends/backend_utils.py +108 -221
- sky/backends/cloud_vm_ray_backend.py +283 -282
- sky/benchmark/benchmark_utils.py +6 -2
- sky/check.py +40 -28
- sky/cli.py +1213 -1116
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5644 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1757 -0
- sky/cloud_stores.py +12 -6
- sky/clouds/__init__.py +0 -2
- sky/clouds/aws.py +20 -13
- sky/clouds/azure.py +5 -3
- sky/clouds/cloud.py +1 -1
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +7 -3
- sky/clouds/fluidstack.py +3 -2
- sky/clouds/gcp.py +10 -8
- sky/clouds/ibm.py +8 -7
- sky/clouds/kubernetes.py +7 -6
- sky/clouds/lambda_cloud.py +8 -7
- sky/clouds/oci.py +4 -3
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +8 -7
- sky/clouds/service_catalog/__init__.py +3 -3
- sky/clouds/service_catalog/aws_catalog.py +7 -1
- sky/clouds/service_catalog/common.py +4 -2
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/clouds/utils/oci_utils.py +1 -1
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +263 -99
- sky/dag.py +4 -0
- sky/data/mounting_utils.py +2 -1
- sky/data/storage.py +97 -35
- sky/data/storage_utils.py +69 -9
- sky/exceptions.py +138 -5
- sky/execution.py +47 -50
- sky/global_user_state.py +105 -22
- sky/jobs/__init__.py +12 -14
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +296 -0
- sky/jobs/constants.py +30 -1
- sky/jobs/controller.py +12 -6
- sky/jobs/dashboard/dashboard.py +2 -6
- sky/jobs/recovery_strategy.py +22 -29
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/{core.py → server/core.py} +101 -34
- sky/jobs/server/dashboard_utils.py +64 -0
- sky/jobs/server/server.py +182 -0
- sky/jobs/utils.py +32 -23
- sky/models.py +27 -0
- sky/optimizer.py +22 -22
- sky/provision/__init__.py +6 -3
- sky/provision/aws/config.py +2 -2
- sky/provision/aws/instance.py +1 -1
- sky/provision/azure/instance.py +1 -1
- sky/provision/cudo/instance.py +1 -1
- sky/provision/do/instance.py +1 -1
- sky/provision/do/utils.py +0 -5
- sky/provision/fluidstack/fluidstack_utils.py +4 -3
- sky/provision/fluidstack/instance.py +4 -2
- sky/provision/gcp/instance.py +1 -1
- sky/provision/instance_setup.py +2 -2
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +1 -1
- sky/provision/kubernetes/utils.py +67 -76
- sky/provision/lambda_cloud/instance.py +3 -15
- sky/provision/logging.py +1 -1
- sky/provision/oci/instance.py +7 -4
- sky/provision/paperspace/instance.py +1 -1
- sky/provision/provisioner.py +3 -2
- sky/provision/runpod/instance.py +1 -1
- sky/provision/vast/instance.py +1 -1
- sky/provision/vsphere/instance.py +2 -11
- sky/resources.py +63 -47
- sky/serve/__init__.py +6 -10
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +3 -0
- sky/serve/replica_managers.py +10 -10
- sky/serve/serve_utils.py +56 -36
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +37 -17
- sky/serve/server/server.py +117 -0
- sky/serve/service.py +8 -1
- sky/server/__init__.py +1 -0
- sky/server/common.py +442 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +462 -0
- sky/server/requests/payloads.py +481 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1095 -0
- sky/server/stream_utils.py +144 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +12 -4
- sky/setup_files/setup.py +1 -1
- sky/sky_logging.py +9 -13
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +46 -12
- sky/skylet/events.py +5 -6
- sky/skylet/job_lib.py +78 -66
- sky/skylet/log_lib.py +17 -11
- sky/skypilot_config.py +79 -94
- sky/task.py +119 -73
- sky/templates/aws-ray.yml.j2 +4 -4
- sky/templates/azure-ray.yml.j2 +3 -2
- sky/templates/cudo-ray.yml.j2 +3 -2
- sky/templates/fluidstack-ray.yml.j2 +3 -2
- sky/templates/gcp-ray.yml.j2 +3 -2
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +1 -12
- sky/templates/kubernetes-ray.yml.j2 +3 -2
- sky/templates/lambda-ray.yml.j2 +3 -2
- sky/templates/oci-ray.yml.j2 +3 -2
- sky/templates/paperspace-ray.yml.j2 +3 -2
- sky/templates/runpod-ray.yml.j2 +3 -2
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vsphere-ray.yml.j2 +4 -2
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +8 -0
- sky/usage/usage_lib.py +45 -11
- sky/utils/accelerator_registry.py +33 -53
- sky/utils/admin_policy_utils.py +2 -1
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +33 -3
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +69 -14
- sky/utils/common.py +74 -0
- sky/utils/common_utils.py +133 -93
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +2 -3
- sky/utils/controller_utils.py +133 -147
- sky/utils/dag_utils.py +72 -24
- sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/log_utils.py +83 -23
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +2 -2
- sky/utils/rich_utils.py +213 -34
- sky/utils/schemas.py +19 -2
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +51 -35
- sky/utils/timeline.py +7 -2
- sky/utils/ux_utils.py +95 -25
- {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/METADATA +8 -3
- {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/RECORD +170 -132
- sky/clouds/cloud_registry.py +0 -76
- sky/utils/cluster_yaml_utils.py +0 -24
- {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/top_level.txt +0 -0
--- sky/backends/cloud_vm_ray_backend.py (1.0.0.dev20250216)
+++ sky/backends/cloud_vm_ray_backend.py (1.0.0.dev20250218)
@@ -1,8 +1,6 @@
 """Backend: runs on cloud virtual machines, managed by Ray."""
 import copy
 import enum
-import functools
-import getpass
 import inspect
 import json
 import math
@@ -37,7 +35,6 @@ from sky import optimizer
 from sky import provision as provision_lib
 from sky import resources as resources_lib
 from sky import sky_logging
-from sky import status_lib
 from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.backends import wheel_utils
@@ -45,24 +42,30 @@ from sky.clouds import service_catalog
 from sky.clouds.utils import gcp_utils
 from sky.data import data_utils
 from sky.data import storage as storage_lib
-from sky.jobs import constants as managed_jobs_constants
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
 from sky.provision import provisioner
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.server.requests import requests as requests_lib
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
 from sky.usage import usage_lib
 from sky.utils import accelerator_registry
+from sky.utils import annotations
+from sky.utils import cluster_utils
 from sky.utils import command_runner
+from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import log_utils
+from sky.utils import message_utils
+from sky.utils import registry
 from sky.utils import resources_utils
 from sky.utils import rich_utils
+from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
@@ -152,9 +155,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # If the command is too long, we instead write it to a file, rsync and execute
 # it.
 #
-# We use
+# We use 100KB as a threshold to be safe for other arguments that
 # might be added during ssh.
-_MAX_INLINE_SCRIPT_LENGTH =
+_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
 
 _RESOURCES_UNAVAILABLE_LOG = (
     'Reasons for provision failures (for details, please check the log above):')
@@ -194,7 +197,7 @@ def _get_cluster_config_template(cloud):
 
 
 def write_ray_up_script_with_patched_launch_hash_fn(
-        cluster_config_path: str,
+        cluster_config_path: Optional[str],
         ray_up_kwargs: Dict[str, bool],
 ) -> str:
     """Writes a Python script that runs `ray up` with our launch hash func.
@@ -1181,7 +1184,7 @@ class RetryingVmProvisioner(object):
     def __init__(self,
                  log_dir: str,
                  dag: 'dag.Dag',
-                 optimize_target: '
+                 optimize_target: 'common.OptimizeTarget',
                  requested_features: Set[clouds.CloudImplementationFeatures],
                  local_wheel_path: pathlib.Path,
                  wheel_hash: str,
@@ -1554,6 +1557,7 @@ class RetryingVmProvisioner(object):
                 f'{to_provision.cloud} '
                 f'{region.name}{colorama.Style.RESET_ALL}'
                 f'{zone_str}.'))
+            assert handle.cluster_yaml is not None
             provision_record = provisioner.bulk_provision(
                 to_provision.cloud,
                 region,
@@ -1586,7 +1590,9 @@ class RetryingVmProvisioner(object):
                 # cluster does not exist. Also we are fast at
                 # cleaning up clusters now if there is no existing node..
                 CloudVmRayBackend().post_teardown_cleanup(
-                    handle,
+                    handle,
+                    terminate=not prev_cluster_ever_up,
+                    remove_from_db=False)
                 # TODO(suquark): other clouds may have different zone
                 # blocking strategy. See '_update_blocklist_on_error'
                 # for details.
@@ -1703,7 +1709,8 @@ class RetryingVmProvisioner(object):
             # autoscaler proceeds to setup commands, which may fail:
             # ERR updater.py:138 -- New status: update-failed
             CloudVmRayBackend().teardown_no_lock(handle,
-                                                 terminate=terminate_or_stop
+                                                 terminate=terminate_or_stop,
+                                                 remove_from_db=False)
 
         if to_provision.zone is not None:
             message = (
@@ -2130,7 +2137,7 @@ class RetryingVmProvisioner(object):
             # TODO: set all remaining tasks' best_resources to None.
             task.best_resources = None
             try:
-                self._dag =
+                self._dag = optimizer.Optimizer.optimize(
                     self._dag,
                     minimize=self._optimize_target,
                     blocked_resources=self._blocked_resources)
@@ -2176,14 +2183,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     """
     # Bump if any fields get added/removed/changed, and add backward
     # compaitibility logic in __setstate__.
-    _VERSION =
+    _VERSION = 10
 
     def __init__(
             self,
             *,
             cluster_name: str,
             cluster_name_on_cloud: str,
-            cluster_yaml: str,
+            cluster_yaml: Optional[str],
             launched_nodes: int,
             launched_resources: resources_lib.Resources,
             stable_internal_external_ips: Optional[List[Tuple[str,
@@ -2196,7 +2203,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.cluster_name_on_cloud = cluster_name_on_cloud
         # Replace the home directory with ~ for better robustness across systems
         # with different home directories.
-        if cluster_yaml.startswith(
+        if cluster_yaml is not None and cluster_yaml.startswith(
+                os.path.expanduser('~')):
             cluster_yaml = cluster_yaml.replace(os.path.expanduser('~'), '~', 1)
         self._cluster_yaml = cluster_yaml
         # List of (internal_ip, feasible_ip) tuples for all the nodes in the
@@ -2403,7 +2411,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             internal_external_ips[1:], key=lambda x: x[1])
         self.stable_internal_external_ips = stable_internal_external_ips
 
-    @
+    @annotations.lru_cache(scope='global')
     @timeline.event
     def get_command_runners(self,
                             force_cached: bool = False,
@@ -2520,9 +2528,15 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.docker_user = docker_user
 
     @property
-    def cluster_yaml(self):
+    def cluster_yaml(self) -> Optional[str]:
+        if self._cluster_yaml is None:
+            return None
         return os.path.expanduser(self._cluster_yaml)
 
+    @cluster_yaml.setter
+    def cluster_yaml(self, value: Optional[str]):
+        self._cluster_yaml = value
+
     @property
     def ssh_user(self):
         if self.cached_cluster_info is not None:
@@ -2594,6 +2608,22 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 state['launched_resources'] = launched_resources.copy(
                     region=context)
 
+        if version < 10:
+            # In #4660, we keep the cluster entry in the database even when it
+            # is in the transition from one region to another during the
+            # failover. We allow `handle.cluster_yaml` to be None to indicate
+            # that the cluster yaml is intentionally removed. Before that PR,
+            # the `handle.cluster_yaml` is always not None, even if it is
+            # intentionally removed.
+            #
+            # For backward compatibility, we set the `_cluster_yaml` to None
+            # if the file does not exist, assuming all the removal of the
+            # _cluster_yaml for existing clusters
+            # are intentional by SkyPilot.
+            if state['_cluster_yaml'] is not None and not os.path.exists(
+                    os.path.expanduser(state['_cluster_yaml'])):
+                state['_cluster_yaml'] = None
+
         self.__dict__.update(state)
 
         # Because the update_cluster_ips and update_ssh_ports
@@ -2618,6 +2648,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 pass
 
 
+@registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     """Backend: runs on cloud virtual machines, managed by Ray.
 
@@ -2647,7 +2678,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         # Command for running the setup script. It is only set when the
         # setup needs to be run outside the self._setup() and as part of
-        # a job (
+        # a job (detach_setup, default).
         self._setup_cmd = None
 
     # --- Implementation of Backend APIs ---
@@ -2656,7 +2687,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._dag = kwargs.pop('dag', self._dag)
         self._optimize_target = kwargs.pop(
             'optimize_target',
-            self._optimize_target) or
+            self._optimize_target) or common.OptimizeTarget.COST
         self._requested_features = kwargs.pop('requested_features',
                                               self._requested_features)
         assert not kwargs, f'Unexpected kwargs: {kwargs}'
@@ -2872,21 +2903,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     skip_unnecessary_provisioning)
                 break
             except exceptions.ResourcesUnavailableError as e:
-
-
+                log_path = retry_provisioner.log_dir + '/provision.log'
+                error_message = (
+                    f'{colorama.Fore.RED}Failed to provision all '
+                    f'possible launchable resources.'
+                    f'{colorama.Style.RESET_ALL}'
+                    ' Relax the task\'s resource requirements: '
+                    f'{task.num_nodes}x {list(task.resources)[0]}')
                 if e.no_failover:
                     error_message = str(e)
-                else:
-                    usage_lib.messages.usage.update_final_cluster_status(
-                        None)
-                    error_message = (
-                        f'{colorama.Fore.RED}Failed to provision all '
-                        f'possible launchable resources.'
-                        f'{colorama.Style.RESET_ALL}'
-                        ' Relax the task\'s resource requirements: '
-                        f'{task.num_nodes}x {list(task.resources)[0]}')
 
-                log_path = retry_provisioner.log_dir + '/provision.log'
                 if retry_until_up:
                     logger.error(error_message)
                     # Sleep and retry.
@@ -2901,6 +2927,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     attempt_cnt += 1
                     time.sleep(gap_seconds)
                     continue
+                # Clean up the cluster's entry in `sky status`.
+                # Do not remove the stopped cluster from the global state
+                # if failed to start.
+                if not e.no_failover:
+                    global_user_state.remove_cluster(cluster_name,
+                                                     terminate=True)
+                    usage_lib.messages.usage.update_final_cluster_status(
+                        None)
                 logger.error(
                     ux_utils.error_message(
                         'Failed to provision resources. '
@@ -2966,8 +3000,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
             self._update_after_cluster_provisioned(
                 handle, to_provision_config.prev_handle, task,
-                prev_cluster_status,
-                handle.external_ssh_ports(), lock_path, config_hash)
+                prev_cluster_status, lock_path, config_hash)
             return handle
 
         cluster_config_file = config_dict['ray']
@@ -3039,8 +3072,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         self._update_after_cluster_provisioned(
             handle, to_provision_config.prev_handle, task,
-            prev_cluster_status,
-            config_hash)
+            prev_cluster_status, lock_path, config_hash)
         return handle
 
     def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3058,8 +3090,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             prev_handle: Optional[CloudVmRayResourceHandle],
             task: task_lib.Task,
             prev_cluster_status: Optional[status_lib.ClusterStatus],
-
-            config_hash: str) -> None:
+            lock_path: str, config_hash: str) -> None:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, handle.launched_resources)
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3123,15 +3154,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             )
             usage_lib.messages.usage.update_final_cluster_status(
                 status_lib.ClusterStatus.UP)
+        # We still add the cluster to ssh config file on API server, this
+        # is helpful for people trying to use `sky launch`'ed cluster for
+        # ssh proxy jump.
         auth_config = backend_utils.ssh_credential_from_yaml(
             handle.cluster_yaml,
             ssh_user=handle.ssh_user,
             docker_user=handle.docker_user)
-
-
-
-
-            handle.ssh_user)
+        cluster_utils.SSHConfigHelper.add_cluster(
+            handle.cluster_name, handle.cached_external_ips, auth_config,
+            handle.cached_external_ssh_ports, handle.docker_user,
+            handle.ssh_user)
 
         common_utils.remove_file_if_exists(lock_path)
 
@@ -3192,7 +3225,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 ux_utils.spinner_message('Syncing workdir', log_path)):
             subprocess_utils.run_in_parallel(_sync_workdir_node, runners,
                                              num_threads)
-        logger.info(ux_utils.finishing_message('
+        logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
 
     def _sync_file_mounts(
         self,
@@ -3346,9 +3379,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         if detach_setup:
             # Only set this when setup needs to be run outside the self._setup()
-            # as part of a job (
+            # as part of a job (detach_setup, default).
             self._setup_cmd = setup_cmd
-            logger.info(ux_utils.finishing_message('Setup
+            logger.info(ux_utils.finishing_message('Setup detached.'))
             return
         end = time.time()
         logger.debug(f'Setup took {end - start} seconds.')
@@ -3365,9 +3398,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             managed_job_dag: Optional['dag.Dag'] = None,
     ) -> None:
         """Executes generated code on the head node."""
-        style = colorama.Style
-        fore = colorama.Fore
-
         script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
         remote_log_dir = self.log_dir
         remote_log_path = os.path.join(remote_log_dir, 'run.log')
@@ -3457,58 +3487,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         logger.info(
             ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
         rich_utils.stop_safe_status()
-
-        if
-
-
-
-
-
-
-                self.tail_logs(handle, job_id)
-            finally:
-                name = handle.cluster_name
-                controller = controller_utils.Controllers.from_name(name)
-                if controller == controller_utils.Controllers.JOBS_CONTROLLER:
-                    logger.info(
-                        f'\n{fore.CYAN}Managed Job ID: '
-                        f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
-                        f'\n📋 Useful Commands'
-                        f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t\t'
-                        f'{ux_utils.BOLD}sky jobs cancel {job_id}'
-                        f'{ux_utils.RESET_BOLD}'
-                        f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t\t'
-                        f'{ux_utils.BOLD}sky jobs logs {job_id}'
-                        f'{ux_utils.RESET_BOLD}'
-                        f'\n{ux_utils.INDENT_SYMBOL}To stream controller logs:\t\t'
-                        f'{ux_utils.BOLD}sky jobs logs --controller {job_id}'
-                        f'{ux_utils.RESET_BOLD}'
-                        f'\n{ux_utils.INDENT_SYMBOL}To view all managed jobs:\t\t'
-                        f'{ux_utils.BOLD}sky jobs queue'
-                        f'{ux_utils.RESET_BOLD}'
-                        f'\n{ux_utils.INDENT_LAST_SYMBOL}To view managed job '
-                        f'dashboard:\t{ux_utils.BOLD}sky jobs dashboard'
-                        f'{ux_utils.RESET_BOLD}')
-                elif controller is None:
-                    logger.info(f'\n{fore.CYAN}Job ID: '
-                                f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
-                                f'\n📋 Useful Commands'
-                                f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t'
-                                f'{ux_utils.BOLD}sky cancel {name} {job_id}'
-                                f'{ux_utils.RESET_BOLD}'
-                                f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t'
-                                f'{ux_utils.BOLD}sky logs {name} {job_id}'
-                                f'{ux_utils.RESET_BOLD}'
-                                f'\n{ux_utils.INDENT_LAST_SYMBOL}To view job '
-                                'queue:\t\t'
-                                f'{ux_utils.BOLD}sky queue {name}'
-                                f'{ux_utils.RESET_BOLD}')
+        if not detach_run:
+            if (handle.cluster_name == controller_utils.Controllers.
+                    JOBS_CONTROLLER.value.cluster_name):
+                self.tail_managed_job_logs(handle, job_id)
+            else:
+                # Sky logs. Not using subprocess.run since it will make the
+                # ssh keep connected after ctrl-c.
+                self.tail_logs(handle, job_id)
 
     def _add_job(self, handle: CloudVmRayResourceHandle,
                  job_name: Optional[str], resources_str: str) -> int:
-
-
-
+        code = job_lib.JobLibCodeGen.add_job(
+            job_name=job_name,
+            username=common_utils.get_user_hash(),
+            run_timestamp=self.run_timestamp,
+            resources_str=resources_str)
         returncode, job_id_str, stderr = self.run_on_head(handle,
                                                           code,
                                                           stream_logs=False,
@@ -3548,13 +3542,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             Job id if the task is submitted to the cluster, None otherwise.
         """
         if task.run is None and self._setup_cmd is None:
-            # This message is fine without mentioning setup, as there are
+            # This message is fine without mentioning setup, as there are two
             # cases when run section is empty:
-            # 1. setup specified
-            #    message
-            # 2. setup specified
-            #    detached mode and this message will not be shown.
-            # 3. no setup specified: this message is fine as a user is likely
+            # 1. setup specified: setup is executed in detached mode and this
+            #    message will not be shown.
+            # 2. no setup specified: this message is fine as a user is likely
             #    creating a cluster only, and ok with the empty run command.
             logger.info('Run commands not specified or empty.')
             return None
@@ -3601,26 +3593,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
     def _post_execute(self, handle: CloudVmRayResourceHandle,
                       down: bool) -> None:
-
-
-
-            return
-        logger.info(f'\nCluster name: {name}'
-                    f'\n{ux_utils.INDENT_SYMBOL}To log into the head VM:\t'
-                    f'{ux_utils.BOLD}ssh {name}'
-                    f'{ux_utils.RESET_BOLD}'
-                    f'\n{ux_utils.INDENT_SYMBOL}To submit a job:'
-                    f'\t\t{ux_utils.BOLD}sky exec {name} yaml_file'
-                    f'{ux_utils.RESET_BOLD}'
-                    f'\n{ux_utils.INDENT_SYMBOL}To stop the cluster:'
-                    f'\t{ux_utils.BOLD}sky stop {name}'
-                    f'{ux_utils.RESET_BOLD}'
-                    f'\n{ux_utils.INDENT_LAST_SYMBOL}To teardown the cluster:'
-                    f'\t{ux_utils.BOLD}sky down {name}'
-                    f'{ux_utils.RESET_BOLD}')
-        if (gcp_utils.is_tpu(handle.launched_resources) and
-                not gcp_utils.is_tpu_vm(handle.launched_resources)):
-            logger.info('Tip: `sky down` will delete launched TPU(s) too.')
+        """Post-execute cleanup."""
+        del handle, down  # Unused.
+        # All logic is handled in previous stages, no-op.
 
     def _teardown_ephemeral_storage(self, task: task_lib.Task) -> None:
         storage_mounts = task.storage_mounts
@@ -3668,30 +3643,47 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 is_identity_mismatch_and_purge = True
             else:
                 raise
-
         lock_path = os.path.expanduser(
             backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Retry in case new cluster operation comes in and holds the lock
+        # right after the lock is removed.
+        n_attempts = 2
+        while True:
+            n_attempts -= 1
+            # In case other running cluster operations are still holding the
+            # lock.
+            common_utils.remove_file_if_exists(lock_path)
+            # We have to kill the cluster requests, because `down` and `stop`
+            # should be higher priority than the cluster requests, and we should
+            # release the lock from other requests.
+            exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
+            requests_lib.kill_cluster_requests(handle.cluster_name,
+                                               exclude_request_to_kill)
+            try:
+                with filelock.FileLock(
+                        lock_path,
+                        backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
+                    self.teardown_no_lock(
+                        handle,
+                        terminate,
+                        purge,
+                        # When --purge is set and we already see an ID mismatch
+                        # error, we skip the refresh codepath. This is because
+                        # refresh checks current user identity can throw
+                        # ClusterOwnerIdentityMismatchError. The argument/flag
+                        # `purge` should bypass such ID mismatch errors.
+                        refresh_cluster_status=(
+                            not is_identity_mismatch_and_purge))
+                if terminate:
+                    common_utils.remove_file_if_exists(lock_path)
+                break
+            except filelock.Timeout as e:
+                logger.debug(f'Failed to acquire lock for {cluster_name}, '
+                             f'retrying...')
+                if n_attempts <= 0:
+                    raise RuntimeError(
+                        f'Cluster {cluster_name!r} is locked by {lock_path}. '
+                        'Check to see if it is still being launched') from e
 
     # --- CloudVMRayBackend Specific APIs ---
 
@@ -3715,24 +3707,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def cancel_jobs(self,
                     handle: CloudVmRayResourceHandle,
                     jobs: Optional[List[int]],
-                    cancel_all: bool = False
+                    cancel_all: bool = False,
+                    user_hash: Optional[str] = None) -> None:
         """Cancels jobs.
 
-
-
-        Args:
-            handle: The cluster handle.
-            jobs: Job IDs to cancel. (See `cancel_all` for special semantics.)
-            cancel_all: Whether to cancel all jobs. If True, asserts `jobs` is
-                set to None. If False and `jobs` is None, cancel the latest
-                running job.
+        See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
         """
-
-        assert jobs is None, (
-            'If cancel_all=True, usage is to set jobs=None')
-        code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all)
-
-        # All error messages should have been redirected to stdout.
+        code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
         returncode, stdout, _ = self.run_on_head(handle,
                                                  code,
                                                  stream_logs=False,
@@ -3741,13 +3722,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             returncode, code,
             f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
 
-        cancelled_ids =
+        cancelled_ids = message_utils.decode_payload(stdout)
         if cancelled_ids:
            logger.info(
                f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
         else:
-            logger.info(
-                'No jobs cancelled. They may already be in terminal states.')
+            logger.info('No jobs cancelled. They may be in terminal states.')
 
     def sync_down_logs(
         self,
@@ -3768,7 +3748,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                                 separate_stderr=True)
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to sync logs.', stderr)
-        run_timestamps =
+        run_timestamps = message_utils.decode_payload(run_timestamps)
         if not run_timestamps:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching log directories found'
@@ -3782,16 +3762,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             for run_timestamp in run_timestamps
         ]
         local_log_dirs = [
-            os.path.
+            os.path.join(local_dir, run_timestamp)
             for run_timestamp in run_timestamps
         ]
 
-        style = colorama.Style
-        fore = colorama.Fore
-        for job_id, log_dir in zip(job_ids, local_log_dirs):
-            logger.info(f'{fore.CYAN}Job {job_id} logs: {log_dir}'
-                        f'{style.RESET_ALL}')
-
         runners = handle.get_command_runners()
 
         def _rsync_down(args) -> None:
@@ -3802,13 +3776,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             """
             (runner, local_log_dir, remote_log_dir) = args
             try:
-                os.makedirs(local_log_dir, exist_ok=True)
+                os.makedirs(os.path.expanduser(local_log_dir), exist_ok=True)
                 runner.rsync(
                     # Require a `/` at the end to make sure the parent dir
                     # are not created locally. We do not add additional '*' as
                     # kubernetes's rsync does not work with an ending '*'.
                     source=f'{remote_log_dir}/',
-                    target=local_log_dir,
+                    target=os.path.expanduser(local_log_dir),
                     up=False,
                     stream_logs=False,
                 )
@@ -3864,10 +3838,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # Allocate a pseudo-terminal to disable output buffering.
                 # Otherwise, there may be 5 minutes delay in logging.
                 ssh_mode=command_runner.SshMode.INTERACTIVE,
-                # Disable stdin to avoid ray outputs mess up the terminal with
-                # misaligned output in multithreading/multiprocessing.
-                # Refer to: https://github.com/ray-project/ray/blob/d462172be7c5779abf37609aed08af112a533e1e/python/ray/autoscaler/_private/subprocess_output_util.py#L264 # pylint: disable=line-too-long
-                stdin=subprocess.DEVNULL,
             )
         except SystemExit as e:
             returncode = e.code
@@ -3897,7 +3867,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             stream_logs=True,
             process_stream=False,
             ssh_mode=command_runner.SshMode.INTERACTIVE,
-            stdin=subprocess.DEVNULL,
         )
 
     def sync_down_managed_job_logs(
@@ -3936,7 +3905,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to sync down logs.',
                                            stderr)
-        job_ids =
+        job_ids = message_utils.decode_payload(job_ids)
         if not job_ids:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching job found'
@@ -3947,9 +3916,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             if job_name is not None:
                 name_str = ('Multiple jobs IDs found under the name '
                             f'{job_name}. ')
+            controller_str = ' (controller)' if controller else ''
             logger.info(f'{colorama.Fore.YELLOW}'
                         f'{name_str}'
-                        'Downloading the latest job logs.'
+                        f'Downloading the latest job logs{controller_str}.'
                         f'{colorama.Style.RESET_ALL}')
         # list should aready be in descending order
         job_id = job_ids[0]
@@ -3967,7 +3937,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to sync logs.', stderr)
         # returns with a dict of {job_id: run_timestamp}
-        run_timestamps =
+        run_timestamps = message_utils.decode_payload(run_timestamps)
         if not run_timestamps:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching log directories found'
@@ -3978,15 +3948,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_id = list(run_timestamps.keys())[0]
         local_log_dir = ''
         if controller:  # download controller logs
-            remote_log = os.path.join(
-
-
-
-
-
-            logger.
-
-
+            remote_log = os.path.join(managed_jobs.JOBS_CONTROLLER_LOGS_DIR,
+                                      f'{job_id}.log')
+            local_log_dir = os.path.join(local_dir, run_timestamp)
+            os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
+                        exist_ok=True)
+
+            logger.debug(f'{colorama.Fore.CYAN}'
+                         f'Job {job_id} local logs: {local_log_dir}'
+                         f'{colorama.Style.RESET_ALL}')
 
             runners = handle.get_command_runners()
 
@@ -3998,7 +3968,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 """
                 (runner, local_log_dir, remote_log) = args
                 try:
-                    os.makedirs(local_log_dir,
+                    os.makedirs(os.path.expanduser(local_log_dir),
+                                exist_ok=True)
                     runner.rsync(
                         source=remote_log,
                         target=f'{local_log_dir}/controller.log',
@@ -4019,9 +3990,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             ]
             subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
         else:  # download job logs
-            local_log_dir = os.path.
-
-            os.makedirs(os.path.dirname(local_log_dir),
+            local_log_dir = os.path.join(local_dir, 'managed_jobs',
+                                         run_timestamp)
+            os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
+                        exist_ok=True)
             log_file = os.path.join(local_log_dir, 'run.log')
 
             code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
@@ -4040,16 +4012,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self.run_on_head(
                 handle,
                 code,
-                log_path=log_file,
+                log_path=os.path.expanduser(log_file),
                 stream_logs=False,
                 process_stream=False,
                 ssh_mode=command_runner.SshMode.INTERACTIVE,
-                stdin=subprocess.DEVNULL,
             )
 
-            logger.
-
-
+            logger.debug(f'{colorama.Fore.CYAN}'
+                         f'Job {job_id} logs: {local_log_dir}'
+                         f'{colorama.Style.RESET_ALL}')
         return {str(job_id): local_log_dir}
 
     def teardown_no_lock(self,
@@ -4057,7 +4028,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                          terminate: bool,
                          purge: bool = False,
                          post_teardown_cleanup: bool = True,
-                         refresh_cluster_status: bool = True
+                         refresh_cluster_status: bool = True,
+                         remove_from_db: bool = True) -> None:
         """Teardown the cluster without acquiring the cluster status lock.
 
         NOTE: This method should not be called without holding the cluster
@@ -4069,6 +4041,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Raises:
             RuntimeError: If the cluster fails to be terminated/stopped.
         """
+        exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
+        # We have to kill the cluster requests again within the lock, because
+        # any pending requests on the same cluster should be cancelled after
+        # the cluster is terminated/stopped. Otherwise, it will be quite
+        # confusing to see the cluster restarted immediately after it is
+        # terminated/stopped, when there is a pending launch request.
+        requests_lib.kill_cluster_requests(handle.cluster_name,
+                                           exclude_request_to_kill)
         cluster_status_fetched = False
         if refresh_cluster_status:
             try:
@@ -4096,6 +4076,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     f'Cluster {handle.cluster_name!r} is already terminated. '
                     'Skipped.')
                 return
+
+        if handle.cluster_yaml is None:
+            logger.warning(f'Cluster {handle.cluster_name!r} has no '
+                           f'provision yaml so it '
+                           'has not been provisioned. Skipped.')
+            global_user_state.remove_cluster(handle.cluster_name,
+                                             terminate=terminate)
+            return
         log_path = os.path.join(os.path.expanduser(self.log_dir),
                                 'teardown.log')
         log_abs_path = os.path.abspath(log_path)
@@ -4150,7 +4138,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 raise
 
             if post_teardown_cleanup:
-                self.post_teardown_cleanup(handle, terminate, purge
+                self.post_teardown_cleanup(handle, terminate, purge,
+                                           remove_from_db)
             return
 
         if (isinstance(cloud, clouds.IBM) and terminate and
@@ -4271,7 +4260,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def post_teardown_cleanup(self,
                               handle: CloudVmRayResourceHandle,
                               terminate: bool,
-                              purge: bool = False
+                              purge: bool = False,
+                              remove_from_db: bool = True) -> None:
         """Cleanup local configs/caches and delete TPUs after teardown.
 
         This method will handle the following cleanup steps:
@@ -4302,96 +4292,100 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'remove it manually to avoid image leakage. Details: '
                     f'{common_utils.format_exception(e, use_bracket=True)}')
         if terminate:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # This function could be directly called from status refresh,
+            # where we need to cleanup the cluster profile.
+            metadata_utils.remove_cluster_metadata(handle.cluster_name)
+            # The cluster yaml does not exist when skypilot has not found
+            # the right resource to provision the cluster.
+            if handle.cluster_yaml is not None:
+                try:
+                    cloud = handle.launched_resources.cloud
+                    config = common_utils.read_yaml(handle.cluster_yaml)
+                    cloud.check_features_are_supported(
+                        handle.launched_resources,
+                        {clouds.CloudImplementationFeatures.OPEN_PORTS})
+                    provision_lib.cleanup_ports(repr(cloud),
+                                                cluster_name_on_cloud,
+                                                handle.launched_resources.ports,
+                                                config['provider'])
+                    self.remove_cluster_config(handle)
+                except exceptions.NotSupportedError:
+                    pass
+                except exceptions.PortDoesNotExistError:
+                    logger.debug('Ports do not exist. Skipping cleanup.')
+                except Exception as e:  # pylint: disable=broad-except
+                    if purge:
+                        msg = common_utils.format_exception(e, use_bracket=True)
+                        logger.warning(
+                            f'Failed to cleanup ports. Skipping since purge is '
+                            f'set. Details: {msg}')
+                    else:
+                        raise
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        sky.utils.cluster_utils.SSHConfigHelper.remove_cluster(
+            handle.cluster_name)
+
+        def _detect_abnormal_non_terminated_nodes(
+                handle: CloudVmRayResourceHandle) -> None:
+            # Confirm that instances have actually transitioned state before
+            # updating the state database. We do this immediately before
+            # removing the state from the database, so that we can guarantee
+            # that this is always called before the state is removed. We
+            # considered running this check as part of
+            # provisioner.teardown_cluster or provision.terminate_instances, but
+            # it would open the door to code paths that successfully call this
+            # function but do not first call teardown_cluster or
+            # terminate_instances. See
+            # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
+            attempts = 0
+            while True:
+                config = common_utils.read_yaml(handle.cluster_yaml)
+
+                logger.debug(f'instance statuses attempt {attempts + 1}')
                 node_status_dict = provision_lib.query_instances(
                     repr(cloud),
                     cluster_name_on_cloud,
                     config['provider'],
                     non_terminated_only=False)
-            except Exception as e:  # pylint: disable=broad-except
-                if purge:
-                    logger.warning(
-                        f'Failed to query instances. Skipping since purge is '
-                        f'set. Details: '
-                        f'{common_utils.format_exception(e, use_bracket=True)}')
-                    break
-                raise
 
-
-
-
-
-
-
-
-
-
-
-
+                unexpected_node_state: Optional[Tuple[str, str]] = None
+                for node_id, node_status in node_status_dict.items():
+                    logger.debug(f'{node_id} status: {node_status}')
+                    # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
+                    # between "stopping/stopped" and "terminating/terminated",
+                    # so we allow for either status instead of casing on
+                    # `terminate`.
+                    if node_status not in [
+                            None, status_lib.ClusterStatus.STOPPED
+                    ]:
+                        unexpected_node_state = (node_id, node_status)
+                        break
 
-
-                if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
-                    time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
-                else:
-                    (node_id, node_status) = unexpected_node_state
-                    if purge:
-                        logger.warning(f'Instance {node_id} in unexpected '
-                                       f'state {node_status}. Skipping since purge '
-                                       'is set.')
+                if unexpected_node_state is None:
                     break
-                    raise RuntimeError(f'Instance {node_id} in unexpected '
-                                       f'state {node_status}.')
 
-
-
+                attempts += 1
+                if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
+                    time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
+                else:
+                    (node_id, node_status) = unexpected_node_state
+                    raise RuntimeError(f'Instance {node_id} in unexpected '
+                                       f'state {node_status}.')
 
-
-
-
-
+        # If cluster_yaml is None, the cluster should ensured to be terminated,
+        # so we don't need to do the double check.
+        if handle.cluster_yaml is not None:
+            _detect_abnormal_non_terminated_nodes(handle)
 
-
-
-
-
+        if not terminate or remove_from_db:
+            global_user_state.remove_cluster(handle.cluster_name,
+                                             terminate=terminate)
+
+    def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
+        """Remove the YAML config of a cluster."""
+        handle.cluster_yaml = None
+        global_user_state.update_cluster_handle(handle.cluster_name, handle)
+        common_utils.remove_file_if_exists(handle.cluster_yaml)
 
     def set_autostop(self,
                      handle: CloudVmRayResourceHandle,
@@ -4468,7 +4462,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                                      stream_logs=stream_logs)
 
         if returncode == 0:
-            return
+            return message_utils.decode_payload(stdout)
         logger.debug('Failed to check if cluster is autostopping with '
                      f'{returncode}: {stdout+stderr}\n'
                      f'Command: {code}')
@@ -4707,7 +4701,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             if not data_utils.is_cloud_store_url(src):
                 full_src = os.path.abspath(os.path.expanduser(src))
                 # Checked during Task.set_file_mounts().
-                assert os.path.exists(
+                assert os.path.exists(
+                    full_src), f'{full_src} does not exist. {file_mounts}'
                 src_size = backend_utils.path_size_megabytes(full_src)
                 if src_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
                     logger.warning(
@@ -4822,7 +4817,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                          num_threads)
         end = time.time()
         logger.debug(f'File mount sync took {end - start} seconds.')
-        logger.info(ux_utils.finishing_message('
+        logger.info(ux_utils.finishing_message('Synced file_mounts.', log_path))
 
     def _execute_storage_mounts(
             self, handle: CloudVmRayResourceHandle,
@@ -4858,6 +4853,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 f'Mounting {len(storage_mounts)} storage{plural}', log_path))
 
         for dst, storage_obj in storage_mounts.items():
+            storage_obj.construct()
             if not os.path.isabs(dst) and not dst.startswith('~/'):
                 dst = f'{SKY_REMOTE_WORKDIR}/{dst}'
             # Raised when the bucket is externall removed before re-mounting
@@ -4871,6 +4867,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'successfully without mounting the bucket.')
             # Get the first store and use it to mount
             store = list(storage_obj.stores.values())[0]
+            assert store is not None, storage_obj
            mount_cmd = store.mount_command(dst)
            src_print = (storage_obj.source
                         if storage_obj.source else storage_obj.name)
@@ -4925,6 +4922,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             return
         storage_mounts_metadata = {}
         for dst, storage_obj in storage_mounts.items():
+            if storage_obj.mode != storage_lib.StorageMode.MOUNT:
+                # Skip non-mount storage objects, as there is no need to
+                # reconstruct them during cluster restart.
+                continue
             storage_mounts_metadata[dst] = storage_obj.handle
         lock_path = (
             backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))