skypilot-nightly 1.0.0.dev20250827__py3-none-any.whl → 1.0.0.dev20250829__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +11 -10
- sky/authentication.py +1 -1
- sky/backends/backend.py +3 -5
- sky/backends/backend_utils.py +140 -52
- sky/backends/cloud_vm_ray_backend.py +30 -25
- sky/backends/local_docker_backend.py +3 -8
- sky/backends/wheel_utils.py +35 -8
- sky/client/cli/command.py +41 -9
- sky/client/sdk.py +23 -8
- sky/client/sdk_async.py +6 -2
- sky/clouds/aws.py +118 -1
- sky/core.py +1 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +82 -22
- sky/jobs/client/sdk.py +5 -2
- sky/jobs/recovery_strategy.py +9 -4
- sky/jobs/server/server.py +2 -1
- sky/logs/agent.py +2 -2
- sky/logs/aws.py +6 -3
- sky/provision/aws/config.py +78 -3
- sky/provision/aws/instance.py +45 -6
- sky/provision/do/utils.py +2 -1
- sky/provision/kubernetes/instance.py +55 -11
- sky/provision/kubernetes/utils.py +11 -2
- sky/provision/nebius/utils.py +36 -2
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/serve/client/impl.py +5 -4
- sky/serve/replica_managers.py +4 -3
- sky/serve/serve_utils.py +2 -2
- sky/serve/server/impl.py +3 -2
- sky/serve/server/server.py +2 -1
- sky/server/auth/oauth2_proxy.py +10 -4
- sky/server/common.py +4 -4
- sky/server/daemons.py +16 -5
- sky/server/requests/executor.py +5 -3
- sky/server/requests/payloads.py +3 -1
- sky/server/requests/preconditions.py +3 -2
- sky/server/requests/requests.py +121 -19
- sky/server/server.py +85 -60
- sky/server/stream_utils.py +7 -5
- sky/setup_files/dependencies.py +6 -1
- sky/sky_logging.py +28 -0
- sky/skylet/constants.py +6 -0
- sky/skylet/events.py +2 -3
- sky/skypilot_config.py +10 -10
- sky/task.py +1 -1
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +4 -8
- sky/usage/usage_lib.py +3 -2
- sky/utils/annotations.py +8 -2
- sky/utils/cluster_utils.py +3 -3
- sky/utils/common_utils.py +0 -72
- sky/utils/controller_utils.py +4 -3
- sky/utils/dag_utils.py +4 -4
- sky/utils/db/db_utils.py +11 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/config_map_utils.py +3 -3
- sky/utils/kubernetes_enums.py +1 -0
- sky/utils/lock_events.py +94 -0
- sky/utils/schemas.py +3 -0
- sky/utils/timeline.py +24 -93
- sky/utils/yaml_utils.py +77 -10
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/METADATA +8 -2
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/RECORD +86 -84
- /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '27f74c78af59ef98180b59a30c43410e46e3ce37'


 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250827'
+__version__ = '1.0.0.dev20250829'
 __root_dir__ = directory_utils.get_sky_dir()

sky/admin_policy.py
CHANGED
@@ -13,6 +13,7 @@ from sky.adaptors import common as adaptors_common
 from sky.utils import common_utils
 from sky.utils import config_utils
 from sky.utils import ux_utils
+from sky.utils import yaml_utils

 if typing.TYPE_CHECKING:
     import requests
@@ -80,9 +81,9 @@ class UserRequest:

     def encode(self) -> str:
         return _UserRequestBody(
-            task=
-            skypilot_config=
-
+            task=yaml_utils.dump_yaml_str(self.task.to_yaml_config()),
+            skypilot_config=yaml_utils.dump_yaml_str(dict(
+                self.skypilot_config)),
             request_options=self.request_options,
             at_client_side=self.at_client_side,
         ).model_dump_json()
@@ -92,9 +93,9 @@ class UserRequest:
         user_request_body = _UserRequestBody.model_validate_json(body)
         return cls(
             task=sky.Task.from_yaml_config(
-
+                yaml_utils.read_yaml_all_str(user_request_body.task)[0]),
             skypilot_config=config_utils.Config.from_dict(
-
+                yaml_utils.read_yaml_all_str(
                     user_request_body.skypilot_config)[0]),
             request_options=user_request_body.request_options,
             at_client_side=user_request_body.at_client_side,
@@ -116,9 +117,9 @@ class MutatedUserRequest:

     def encode(self) -> str:
         return _MutatedUserRequestBody(
-            task=
-            skypilot_config=
-
+            task=yaml_utils.dump_yaml_str(self.task.to_yaml_config()),
+            skypilot_config=yaml_utils.dump_yaml_str(dict(
+                self.skypilot_config),)).model_dump_json()

     @classmethod
     def decode(cls, mutated_user_request_body: str,
@@ -126,14 +127,14 @@ class MutatedUserRequest:
         mutated_user_request_body = _MutatedUserRequestBody.model_validate_json(
             mutated_user_request_body)
         task = sky.Task.from_yaml_config(
-
+            yaml_utils.read_yaml_all_str(mutated_user_request_body.task)[0])
         # Some internal Task fields are not serialized. We need to manually
         # restore them from the original request.
         task.managed_job_dag = original_request.task.managed_job_dag
         task.service_name = original_request.task.service_name
         return cls(task=task,
                    skypilot_config=config_utils.Config.from_dict(
-
+                       yaml_utils.read_yaml_all_str(
                            mutated_user_request_body.skypilot_config)[0],))

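The edits in this file, and in the backend files below, move YAML serialization onto the new sky.utils.yaml_utils helpers (yaml_utils.py gains 77 lines in this release while common_utils.py loses 72, suggesting the helpers moved there). As a rough sketch of the round trip that encode()/decode() rely on, using only the two helpers visible in this diff; the example config dict is made up, not taken from the package:

# Round-trip sketch based on the helpers shown above; the config is
# illustrative only.
from sky.utils import yaml_utils

config = {'name': 'example-task', 'resources': {'cpus': 4}}

# Encode path: dict -> YAML string.
encoded = yaml_utils.dump_yaml_str(config)

# Decode path: YAML string -> list of documents; the code above always
# takes the first document with [0].
decoded = yaml_utils.read_yaml_all_str(encoded)[0]
assert decoded == config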
sky/authentication.py
CHANGED
@@ -198,7 +198,7 @@ def configure_ssh_info(config: Dict[str, Any]) -> Dict[str, Any]:
     _, public_key_path = get_or_generate_keys()
     with open(public_key_path, 'r', encoding='utf-8') as f:
         public_key = f.read().strip()
-    config_str =
+    config_str = yaml_utils.dump_yaml_str(config)
     config_str = config_str.replace('skypilot:ssh_user',
                                     config['auth']['ssh_user'])
     config_str = config_str.replace('skypilot:ssh_public_key_content',
sky/backends/backend.py
CHANGED
@@ -147,9 +147,8 @@ class Backend(Generic[_ResourceHandleType]):
     def teardown(self,
                  handle: _ResourceHandleType,
                  terminate: bool,
-                 purge: bool = False
-
-        self._teardown(handle, terminate, purge, explicitly_requested)
+                 purge: bool = False) -> None:
+        self._teardown(handle, terminate, purge)

     def register_info(self, **kwargs) -> None:
         """Register backend-specific information."""
@@ -201,6 +200,5 @@ class Backend(Generic[_ResourceHandleType]):
     def _teardown(self,
                   handle: _ResourceHandleType,
                   terminate: bool,
-                  purge: bool = False
-                  explicitly_requested: bool = False):
+                  purge: bool = False):
         raise NotImplementedError
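Across the backends, the explicitly_requested flag is removed from the teardown path, so teardown() now forwards only handle, terminate, and purge to _teardown(). A minimal sketch of the resulting contract for a subclass; MyBackend and MyHandle are hypothetical names, not part of the package:

# Hypothetical subclass illustrating the simplified _teardown() signature
# after this change; MyHandle/MyBackend are made-up names.
class MyHandle:
    cluster_name = 'example-cluster'


class MyBackend:

    def teardown(self, handle: MyHandle, terminate: bool,
                 purge: bool = False) -> None:
        # The base class now forwards exactly these three arguments.
        self._teardown(handle, terminate, purge)

    def _teardown(self, handle: MyHandle, terminate: bool,
                  purge: bool = False) -> None:
        # A real backend would stop (terminate=False) or delete
        # (terminate=True) the cluster's resources here; purge asks it to
        # proceed despite errors, e.g. the identity-mismatch case mentioned
        # in the cloud_vm_ray_backend hunks below.
        print(f'Tearing down {handle.cluster_name} '
              f'(terminate={terminate}, purge={purge})')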
sky/backends/backend_utils.py
CHANGED
@@ -241,7 +241,7 @@ def _optimize_file_mounts(tmp_yaml_path: str) -> None:
         subprocess.CalledProcessError: If the file mounts are failed to be
             copied.
     """
-    yaml_config =
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)

     file_mounts = yaml_config.get('file_mounts', {})
     # Remove the file mounts added by the newline.
@@ -325,7 +325,7 @@ def _optimize_file_mounts(tmp_yaml_path: str) -> None:
         shell=True,
         check=True)

-
+    yaml_utils.dump_yaml(tmp_yaml_path, yaml_config)


 def path_size_megabytes(path: str) -> int:
@@ -510,7 +510,7 @@ def _replace_yaml_dicts(
         for key in exclude_restore_key_name[:-1]:
             curr = curr[key]
         curr[exclude_restore_key_name[-1]] = value
-    return
+    return yaml_utils.dump_yaml_str(new_config)


 def get_expirable_clouds(
@@ -937,7 +937,7 @@ def write_cluster_config(
             tmp_yaml_path,
             cluster_config_overrides=cluster_config_overrides,
             context=region.name)
-        yaml_obj =
+        yaml_obj = yaml_utils.read_yaml(tmp_yaml_path)
         pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
             'ray_head_default']['node_config']

@@ -976,7 +976,7 @@ def write_cluster_config(
     # Read the cluster name from the tmp yaml file, to take the backward
     # compatbility restortion above into account.
     # TODO: remove this after 2 minor releases, 0.10.0.
-    yaml_config =
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
     config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']

     # Make sure to do this before we optimize file mounts. Optimization is
@@ -1022,7 +1022,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):

     This function's output removes comments included in the jinja2 template.
     """
-    config =
+    config = yaml_utils.read_yaml(tmp_yaml_path)
     # Check the availability of the cloud type.
     if isinstance(cloud, (
             clouds.AWS,
@@ -1054,7 +1054,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
         config = auth.setup_hyperbolic_authentication(config)
     else:
         assert False, cloud
-
+    yaml_utils.dump_yaml(tmp_yaml_path, config)


 def get_timestamp_from_run_timestamp(run_timestamp: str) -> float:
@@ -1156,7 +1156,7 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
     """

     # Load the yaml contents so that we can directly remove keys.
-    yaml_config =
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
     for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
         dict_to_remove_from = yaml_config
         found_key = True
@@ -1175,7 +1175,7 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
     config_hash = hashlib.sha256()

     yaml_hash = hashlib.sha256(
-
+        yaml_utils.dump_yaml_str(yaml_config).encode('utf-8'))
     config_hash.update(yaml_hash.digest())

     file_mounts = yaml_config.get('file_mounts', {})
@@ -1409,6 +1409,62 @@ def ssh_credential_from_yaml(
     return credentials


+def ssh_credentials_from_handles(
+    handles: List['cloud_vm_ray_backend.CloudVmRayResourceHandle'],
+) -> List[Dict[str, Any]]:
+    """Returns ssh_user, ssh_private_key and ssh_control name.
+    """
+    non_empty_cluster_yaml_paths = [
+        handle.cluster_yaml
+        for handle in handles
+        if handle.cluster_yaml is not None
+    ]
+    cluster_yaml_dicts = global_user_state.get_cluster_yaml_dict_multiple(
+        non_empty_cluster_yaml_paths)
+    cluster_yaml_dicts_to_index = {
+        cluster_yaml_path: cluster_yaml_dict
+        for cluster_yaml_path, cluster_yaml_dict in zip(
+            non_empty_cluster_yaml_paths, cluster_yaml_dicts)
+    }
+
+    credentials_to_return: List[Dict[str, Any]] = []
+    for handle in handles:
+        if handle.cluster_yaml is None:
+            credentials_to_return.append(dict())
+            continue
+        ssh_user = handle.ssh_user
+        docker_user = handle.docker_user
+        config = cluster_yaml_dicts_to_index[handle.cluster_yaml]
+        auth_section = config['auth']
+        if ssh_user is None:
+            ssh_user = auth_section['ssh_user'].strip()
+        ssh_private_key_path = auth_section.get('ssh_private_key')
+        ssh_control_name = config.get('cluster_name', '__default__')
+        ssh_proxy_command = auth_section.get('ssh_proxy_command')
+
+        # Update the ssh_user placeholder in proxy command, if required
+        if (ssh_proxy_command is not None and
+                constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
+            ssh_proxy_command = ssh_proxy_command.replace(
+                constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
+        credentials = {
+            'ssh_user': ssh_user,
+            'ssh_private_key': ssh_private_key_path,
+            'ssh_control_name': ssh_control_name,
+            'ssh_proxy_command': ssh_proxy_command,
+        }
+        if docker_user is not None:
+            credentials['docker_user'] = docker_user
+        ssh_provider_module = config['provider']['module']
+        # If we are running ssh command on kubernetes node.
+        if 'kubernetes' in ssh_provider_module:
+            credentials['disable_control_master'] = True
+        credentials_to_return.append(credentials)
+
+    return credentials_to_return
+
+
 def parallel_data_transfer_to_nodes(
         runners: List[command_runner.CommandRunner],
         source: Optional[str],
@@ -2027,9 +2083,7 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             'Cluster has no YAML file. Removing the cluster from cache.',
             global_user_state.ClusterEventType.STATUS_CHANGE,
             nop_if_duplicate=True)
-        global_user_state.remove_cluster(cluster_name,
-                                         terminate=True,
-                                         remove_events=True)
+        global_user_state.remove_cluster(cluster_name, terminate=True)
         logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
                      'Removing the cluster from cache.')
         return None
@@ -2058,7 +2112,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
                                f'{output}\n', stderr)
         return (*_count_healthy_nodes_from_ray(output), output, stderr)

+    ray_status_details: Optional[str] = None
+
     def run_ray_status_to_check_ray_cluster_healthy() -> bool:
+        nonlocal ray_status_details
         try:
             # NOTE: fetching the IPs is very slow as it calls into
             # `ray get head-ip/worker-ips`. Using cached IPs is safe because
@@ -2136,19 +2193,25 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
                     # showing up
                     time.sleep(1)

+            ray_status_details = (
+                f'{ready_head + ready_workers}/{total_nodes} ready')
             raise RuntimeError(
                 f'Refreshing status ({cluster_name!r}): ray status not showing '
                 f'all nodes ({ready_head + ready_workers}/'
                 f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')

         except exceptions.FetchClusterInfoError:
+            ray_status_details = 'failed to get IPs'
             logger.debug(
                 f'Refreshing status ({cluster_name!r}) failed to get IPs.')
         except RuntimeError as e:
+            if ray_status_details is None:
+                ray_status_details = str(e)
             logger.debug(common_utils.format_exception(e))
         except Exception as e:  # pylint: disable=broad-except
             # This can be raised by `external_ssh_ports()`, due to the
             # underlying call to kubernetes API.
+            ray_status_details = str(e)
             logger.debug(f'Refreshing status ({cluster_name!r}) failed: ',
                          exc_info=e)
             return False
@@ -2261,6 +2324,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
     # (2) Otherwise, we will reset the autostop setting, unless the cluster is
     # autostopping/autodowning.
     some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
+    # If all nodes are up and ray cluster is health, we would have returned
+    # earlier. So if all_nodes_up is True and we are here, it means the ray
+    # cluster must have been unhealthy.
+    ray_cluster_unhealthy = all_nodes_up
     some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
                                  for status in node_statuses)
     is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
@@ -2271,8 +2338,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:

         if some_nodes_terminated:
             init_reason = 'one or more nodes terminated'
+        elif ray_cluster_unhealthy:
+            init_reason = f'ray cluster is unhealthy ({ray_status_details})'
         elif some_nodes_not_stopped:
-            init_reason = 'some
+            init_reason = 'some but not all nodes are stopped'
         logger.debug('The cluster is abnormal. Setting to INIT status. '
                      f'node_statuses: {node_statuses}')
         if record['autostop'] >= 0:
@@ -2367,7 +2436,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             # Some status reason clears after a certain time (e.g. k8s events
             # are only stored for an hour by default), so it is possible that
             # the previous event has a status reason, but now it does not.
-            init_reason_regex = f'^Cluster is abnormal because
+            init_reason_regex = (f'^Cluster is abnormal because '
+                                 f'{re.escape(init_reason)}.*')
             log_message = f'Cluster is abnormal because {init_reason}'
             if status_reason:
                 log_message += f' ({status_reason})'
@@ -2387,10 +2457,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         return global_user_state.get_cluster_from_name(cluster_name)
     # Now is_abnormal is False: either node_statuses is empty or all nodes are
     # STOPPED.
+    verb = 'terminated' if to_terminate else 'stopped'
     backend = backends.CloudVmRayBackend()
     global_user_state.add_cluster_event(
-        cluster_name,
-
+        cluster_name,
+        None,
+        f'All nodes {verb}, cleaning up the cluster.',
+        global_user_state.ClusterEventType.STATUS_CHANGE,
+        # This won't do anything for a terminated cluster, but it's needed for a
+        # stopped cluster.
+        nop_if_duplicate=True,
+    )
     backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
     return global_user_state.get_cluster_from_name(cluster_name)

@@ -2918,44 +2995,57 @@ def get_clusters(
         logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
         records = new_records

-    def
-
+    def _update_records_with_credentials_and_resources_str(
+            records: List[Optional[Dict[str, Any]]]) -> None:
         """Add the credentials to the record.

         This is useful for the client side to setup the ssh config of the
         cluster.
         """
-
-
-
-
-
-
-        handle
-
-
-
-
-
-
-
+        records_with_handle = []
+
+        # only act on records that have a handle
+        for record in records:
+            if record is None:
+                continue
+            handle = record['handle']
+            if handle is None:
+                continue
+            record[
+                'resources_str'] = resources_utils.get_readable_resources_repr(
+                    handle, simplify=True)
+            record[
+                'resources_str_full'] = resources_utils.get_readable_resources_repr(
+                    handle, simplify=False)
+            records_with_handle.append(record)
+        if len(records_with_handle) == 0:
             return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        handles = [record['handle'] for record in records_with_handle]
+        credentials = ssh_credentials_from_handles(handles)
+        cached_private_keys: Dict[str, str] = {}
+        for record, credential in zip(records_with_handle, credentials):
+            if not credential:
+                continue
+            ssh_private_key_path = credential.get('ssh_private_key', None)
+            if ssh_private_key_path is not None:
+                expanded_private_key_path = os.path.expanduser(
+                    ssh_private_key_path)
+                if not os.path.exists(expanded_private_key_path):
+                    auth.create_ssh_key_files_from_db(ssh_private_key_path)
+            else:
+                private_key_path, _ = auth.get_or_generate_keys()
+                expanded_private_key_path = os.path.expanduser(private_key_path)
+            if expanded_private_key_path in cached_private_keys:
+                credential['ssh_private_key_content'] = cached_private_keys[
+                    expanded_private_key_path]
+            else:
+                with open(expanded_private_key_path, 'r',
+                          encoding='utf-8') as f:
+                    credential['ssh_private_key_content'] = f.read()
+                cached_private_keys[expanded_private_key_path] = credential[
+                    'ssh_private_key_content']
+            record['credentials'] = credential

     def _update_records_with_resources(
             records: List[Optional[Dict[str, Any]]]) -> None:
@@ -2982,9 +3072,7 @@ def get_clusters(
             if handle.launched_resources.accelerators else None)

     # Add auth_config to the records
-
-    _update_record_with_credentials_and_resources_str(record)
-
+    _update_records_with_credentials_and_resources_str(records)
     if refresh == common.StatusRefreshMode.NONE:
         # Add resources to the records
         _update_records_with_resources(records)
@@ -3024,7 +3112,7 @@ def get_clusters(
                 cluster_name,
                 force_refresh_statuses=force_refresh_statuses,
                 acquire_per_cluster_status_lock=True)
-
+            _update_records_with_credentials_and_resources_str([record])
         except (exceptions.ClusterStatusFetchingError,
                 exceptions.CloudUserIdentityError,
                 exceptions.ClusterOwnerIdentityMismatchError) as e:
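The get_clusters() changes above swap a per-record credential helper for a batched one: the new ssh_credentials_from_handles() reads all cluster YAMLs with one global_user_state call and returns one credentials dict per handle, in order, with an empty dict for handles that have no cluster YAML. A rough usage sketch of the batched shape; the records list is illustrative and error handling is omitted:

# Sketch of consuming the batched helper added above. `records` stands in
# for the cluster records that get_clusters() builds.
from sky.backends import backend_utils


def attach_credentials(records):
    # Keep only records that actually have a resource handle.
    records_with_handle = [
        r for r in records if r is not None and r.get('handle') is not None
    ]
    handles = [r['handle'] for r in records_with_handle]
    # One call covers every handle; handles without a cluster YAML come
    # back as empty dicts.
    credentials = backend_utils.ssh_credentials_from_handles(handles)
    for record, credential in zip(records_with_handle, credentials):
        if credential:
            record['credentials'] = credential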
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -65,6 +65,7 @@ from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import directory_utils
 from sky.utils import env_options
+from sky.utils import lock_events
 from sky.utils import locks
 from sky.utils import log_utils
 from sky.utils import message_utils
@@ -1972,7 +1973,7 @@ class RetryingVmProvisioner(object):
             ray_config = global_user_state.get_cluster_yaml_dict(
                 cluster_config_file)
             ray_config['upscaling_speed'] = 0
-
+            yaml_utils.dump_yaml(cluster_config_file, ray_config)
             start = time.time()
             returncode, stdout, stderr = ray_up()
             logger.debug(
@@ -2498,7 +2499,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.stable_internal_external_ips = stable_internal_external_ips

     @context_utils.cancellation_guard
-
+    # we expect different request to be acting on different clusters
+    # (= different handles) so we have no real expectation of cache hit
+    # across requests.
+    # Do not change this cache to global scope
+    # without understanding https://github.com/skypilot-org/skypilot/pull/6908
+    @annotations.lru_cache(scope='request', maxsize=10)
     @timeline.event
     def get_command_runners(self,
                             force_cached: bool = False,
@@ -2854,7 +2860,12 @@ class LocalResourcesHandle(CloudVmRayResourceHandle):
         self.is_grpc_enabled = False

     @context_utils.cancellation_guard
-
+    # we expect different request to be acting on different clusters
+    # (= different handles) so we have no real expectation of cache hit
+    # across requests.
+    # Do not change this cache to global scope
+    # without understanding https://github.com/skypilot-org/skypilot/pull/6908
+    @annotations.lru_cache(scope='request', maxsize=10)
     @timeline.event
     def get_command_runners(self,
                             force_cached: bool = False,
@@ -3112,7 +3123,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         retry_until_up: bool = False,
         skip_unnecessary_provisioning: bool = False,
     ) -> Tuple[Optional[CloudVmRayResourceHandle], bool]:
-        with
+        with lock_events.DistributedLockEvent(lock_id, _CLUSTER_LOCK_TIMEOUT):
+            # Reset spinner message to remove any mention of being blocked
+            # by other requests.
+            rich_utils.force_update_status(
+                ux_utils.spinner_message('Launching'))
+
             # Try to launch the exiting cluster first. If no existing
             # cluster, this function will create a to_provision_config
             # with required resources.
@@ -3208,8 +3224,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     global_user_state.ClusterEventType.STATUS_CHANGE,
                     nop_if_duplicate=True)
                 global_user_state.remove_cluster(cluster_name,
-                                                 terminate=True
-                                                 remove_events=False)
+                                                 terminate=True)
                 usage_lib.messages.usage.update_final_cluster_status(
                     None)
                 logger.error(
@@ -4011,8 +4026,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _teardown(self,
                   handle: CloudVmRayResourceHandle,
                   terminate: bool,
-                  purge: bool = False
-                  explicitly_requested: bool = False):
+                  purge: bool = False):
         """Tear down or stop the cluster.

         Args:
@@ -4087,8 +4101,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         # ClusterOwnerIdentityMismatchError. The argument/flag
                         # `purge` should bypass such ID mismatch errors.
                         refresh_cluster_status=(
-                            not is_identity_mismatch_and_purge)
-                            explicitly_requested=explicitly_requested)
+                            not is_identity_mismatch_and_purge))
                     if terminate:
                         lock.force_unlock()
                     break
@@ -4477,8 +4490,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                           purge: bool = False,
                           post_teardown_cleanup: bool = True,
                           refresh_cluster_status: bool = True,
-                          remove_from_db: bool = True
-                          explicitly_requested: bool = False) -> None:
+                          remove_from_db: bool = True) -> None:
         """Teardown the cluster without acquiring the cluster status lock.

         NOTE: This method should not be called without holding the cluster
@@ -4542,8 +4554,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                            f'provision yaml so it '
                            'has not been provisioned. Skipped.')
             global_user_state.remove_cluster(handle.cluster_name,
-                                             terminate=terminate
-                                             remove_events=False)
+                                             terminate=terminate)
             return
         log_path = os.path.join(os.path.expanduser(self.log_dir),
                                 'teardown.log')
@@ -4600,12 +4611,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 raise

         if post_teardown_cleanup:
-            self.post_teardown_cleanup(
-
-                terminate,
-                purge,
-                remove_from_db,
-                explicitly_requested=explicitly_requested)
+            self.post_teardown_cleanup(handle, terminate, purge,
+                                       remove_from_db)
             return

         if (isinstance(cloud, clouds.IBM) and terminate and
@@ -4649,7 +4656,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                          prefix='sky_',
                                          delete=False,
                                          suffix='.yml') as f:
-
+            yaml_utils.dump_yaml(f.name, config)
             f.flush()

             teardown_verb = 'Terminating' if terminate else 'Stopping'
@@ -4705,8 +4712,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                               terminate: bool,
                               purge: bool = False,
                               remove_from_db: bool = True,
-                              failover: bool = False
-                              explicitly_requested: bool = False) -> None:
+                              failover: bool = False) -> None:
         """Cleanup local configs/caches and delete TPUs after teardown.

         This method will handle the following cleanup steps:
@@ -4884,8 +4890,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

         if not terminate or remove_from_db:
             global_user_state.remove_cluster(handle.cluster_name,
-                                             terminate=terminate
-                                             remove_events=explicitly_requested)
+                                             terminate=terminate)

     def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
         """Remove the YAML config of a cluster."""
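Two patterns recur in the cloud_vm_ray_backend hunks above: get_command_runners() gains a request-scoped LRU cache, and cluster provisioning is wrapped in a lock_events.DistributedLockEvent context manager from the new sky/utils/lock_events.py. A hedged sketch of the locking pattern as it appears above; the lock id and timeout here are placeholder assumptions, the real code passes its own lock_id and _CLUSTER_LOCK_TIMEOUT:

# Sketch of the per-cluster provisioning lock shown above; lock naming and
# timeout are placeholder values, not taken from the package.
from sky.utils import lock_events

LOCK_TIMEOUT_SECONDS = 20  # placeholder value


def provision_with_lock(cluster_name: str) -> None:
    lock_id = f'cluster-{cluster_name}'  # placeholder naming scheme
    # Used as a context manager: the body runs only once the lock is held,
    # which is why the code above resets the spinner message ("blocked by
    # other requests") right after entering the block.
    with lock_events.DistributedLockEvent(lock_id, LOCK_TIMEOUT_SECONDS):
        # ... provisioning work runs while the lock is held ...
        pass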
sky/backends/local_docker_backend.py
CHANGED
@@ -256,9 +256,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
                 logger.error(
                     'Unable to run container - nvidia runtime for docker not '
                     'found. Have you installed nvidia-docker on your machine?')
-                global_user_state.remove_cluster(cluster_name,
-                                                 terminate=True,
-                                                 remove_events=False)
+                global_user_state.remove_cluster(cluster_name, terminate=True)
                 raise e
         self.containers[handle] = container
         logger.info(
@@ -325,8 +323,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
     def _teardown(self,
                   handle: LocalDockerResourceHandle,
                   terminate: bool,
-                  purge: bool = False
-                  explicitly_requested: bool = False):
+                  purge: bool = False):
         """Teardown kills the container."""
         del purge  # Unused.
         if not terminate:
@@ -342,9 +339,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
             container.remove(force=True)
         cluster_name = handle.get_cluster_name()

-        global_user_state.remove_cluster(cluster_name,
-                                         terminate=True,
-                                         remove_events=explicitly_requested)
+        global_user_state.remove_cluster(cluster_name, terminate=True)

     # --- Utilities ---
