skypilot-nightly 1.0.0.dev20250826__py3-none-any.whl → 1.0.0.dev20250828__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/admin_policy.py +11 -10
- sky/authentication.py +4 -10
- sky/backends/backend.py +3 -5
- sky/backends/backend_utils.py +41 -56
- sky/backends/cloud_vm_ray_backend.py +13 -24
- sky/backends/local_docker_backend.py +3 -8
- sky/client/cli/command.py +43 -10
- sky/client/common.py +41 -14
- sky/client/sdk.py +24 -9
- sky/client/sdk_async.py +6 -2
- sky/clouds/aws.py +1 -1
- sky/clouds/cloud.py +15 -0
- sky/clouds/kubernetes.py +27 -0
- sky/clouds/ssh.py +2 -3
- sky/core.py +1 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-6e76f636a048e145.js → webpack-6dae1cd599a34def.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +127 -23
- sky/jobs/client/sdk.py +5 -2
- sky/jobs/recovery_strategy.py +9 -4
- sky/logs/agent.py +2 -2
- sky/logs/aws.py +6 -3
- sky/provision/do/utils.py +2 -1
- sky/provision/kubernetes/config.py +2 -8
- sky/provision/kubernetes/instance.py +58 -8
- sky/provision/kubernetes/network_utils.py +3 -4
- sky/provision/kubernetes/utils.py +8 -7
- sky/provision/nebius/utils.py +51 -9
- sky/provision/vsphere/vsphere_utils.py +2 -8
- sky/schemas/api/responses.py +7 -0
- sky/serve/client/impl.py +5 -4
- sky/serve/replica_managers.py +4 -3
- sky/serve/serve_utils.py +4 -4
- sky/serve/server/impl.py +3 -2
- sky/serve/service_spec.py +2 -8
- sky/server/auth/authn.py +4 -0
- sky/server/auth/oauth2_proxy.py +10 -4
- sky/server/common.py +10 -3
- sky/server/daemons.py +10 -5
- sky/server/requests/executor.py +6 -1
- sky/server/requests/requests.py +21 -0
- sky/server/server.py +34 -33
- sky/server/uvicorn.py +33 -0
- sky/setup_files/dependencies.py +1 -0
- sky/sky_logging.py +4 -1
- sky/skylet/events.py +4 -5
- sky/skypilot_config.py +14 -12
- sky/ssh_node_pools/core.py +3 -1
- sky/task.py +4 -10
- sky/templates/nebius-ray.yml.j2 +4 -8
- sky/usage/usage_lib.py +3 -2
- sky/users/server.py +6 -6
- sky/utils/common_utils.py +0 -71
- sky/utils/controller_utils.py +4 -3
- sky/utils/dag_utils.py +4 -4
- sky/utils/kubernetes/config_map_utils.py +3 -3
- sky/utils/schemas.py +3 -0
- sky/utils/yaml_utils.py +102 -0
- sky/volumes/volume.py +8 -3
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/RECORD +83 -82
- /sky/dashboard/out/_next/static/{TPMkEeuj85tHTmIW7Gu3S → 9DW6d9jaP2kZt0NcgIfFa}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{TPMkEeuj85tHTmIW7Gu3S → 9DW6d9jaP2kZt0NcgIfFa}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250826.dist-info → skypilot_nightly-1.0.0.dev20250828.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'ff93214498e29e0aa9a73868b73613535f96b8a3'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250826'
+__version__ = '1.0.0.dev20250828'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
sky/admin_policy.py
CHANGED
@@ -13,6 +13,7 @@ from sky.adaptors import common as adaptors_common
 from sky.utils import common_utils
 from sky.utils import config_utils
 from sky.utils import ux_utils
+from sky.utils import yaml_utils
 
 if typing.TYPE_CHECKING:
     import requests
@@ -80,9 +81,9 @@ class UserRequest:
 
     def encode(self) -> str:
         return _UserRequestBody(
-            task=
-            skypilot_config=
-
+            task=yaml_utils.dump_yaml_str(self.task.to_yaml_config()),
+            skypilot_config=yaml_utils.dump_yaml_str(dict(
+                self.skypilot_config)),
             request_options=self.request_options,
             at_client_side=self.at_client_side,
         ).model_dump_json()
@@ -92,9 +93,9 @@ class UserRequest:
         user_request_body = _UserRequestBody.model_validate_json(body)
         return cls(
             task=sky.Task.from_yaml_config(
-
+                yaml_utils.read_yaml_all_str(user_request_body.task)[0]),
             skypilot_config=config_utils.Config.from_dict(
-
+                yaml_utils.read_yaml_all_str(
                     user_request_body.skypilot_config)[0]),
             request_options=user_request_body.request_options,
             at_client_side=user_request_body.at_client_side,
@@ -116,9 +117,9 @@ class MutatedUserRequest:
 
     def encode(self) -> str:
         return _MutatedUserRequestBody(
-            task=
-            skypilot_config=
-
+            task=yaml_utils.dump_yaml_str(self.task.to_yaml_config()),
+            skypilot_config=yaml_utils.dump_yaml_str(dict(
+                self.skypilot_config),)).model_dump_json()
 
     @classmethod
     def decode(cls, mutated_user_request_body: str,
@@ -126,14 +127,14 @@ class MutatedUserRequest:
         mutated_user_request_body = _MutatedUserRequestBody.model_validate_json(
             mutated_user_request_body)
         task = sky.Task.from_yaml_config(
-
+            yaml_utils.read_yaml_all_str(mutated_user_request_body.task)[0])
         # Some internal Task fields are not serialized. We need to manually
         # restore them from the original request.
         task.managed_job_dag = original_request.task.managed_job_dag
         task.service_name = original_request.task.service_name
         return cls(task=task,
                    skypilot_config=config_utils.Config.from_dict(
-
+                       yaml_utils.read_yaml_all_str(
                            mutated_user_request_body.skypilot_config)[0],))
 
 
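Note: this hunk (and many below) migrates direct PyYAML calls to the new sky/utils/yaml_utils.py helpers added in this release. Below is a minimal, hedged sketch of the encode/decode round trip that UserRequest now relies on. The helper names come from the diff; the sample config, the assumption that the helpers behave like PyYAML's safe_dump/safe_load_all, and the round-trip assertion are illustrative only, not code from the release.

# Hedged sketch: round-trip a config dict through the yaml_utils helpers
# referenced in the hunks above (assumed to wrap PyYAML safe dump/load).
from sky.utils import yaml_utils

task_config = {'name': 'demo', 'run': 'echo hello'}  # stand-in for task.to_yaml_config()

# encode(): serialize the config to a YAML string for the request body.
task_str = yaml_utils.dump_yaml_str(task_config)

# decode(): parse the YAML string back; [0] selects the first document,
# mirroring yaml_utils.read_yaml_all_str(...)[0] in the diff.
restored = yaml_utils.read_yaml_all_str(task_str)[0]
assert restored == task_config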
sky/authentication.py
CHANGED
@@ -25,7 +25,6 @@ import re
 import socket
 import subprocess
 import sys
-import typing
 from typing import Any, Dict, Optional, Tuple
 import uuid
 
@@ -37,7 +36,6 @@ from sky import exceptions
 from sky import global_user_state
 from sky import sky_logging
 from sky import skypilot_config
-from sky.adaptors import common as adaptors_common
 from sky.adaptors import gcp
 from sky.adaptors import ibm
 from sky.adaptors import kubernetes
@@ -51,6 +49,7 @@ from sky.utils import config_utils
 from sky.utils import kubernetes_enums
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
+from sky.utils import yaml_utils
 
 logger = sky_logging.init_logger(__name__)
 
@@ -67,11 +66,6 @@ MAX_TRIALS = 64
 # the former dir is empheral.
 _SSH_KEY_PATH_PREFIX = '~/.sky/clients/{user_hash}/ssh'
 
-if typing.TYPE_CHECKING:
-    import yaml
-else:
-    yaml = adaptors_common.LazyImport('yaml')
-
 
 def get_ssh_key_and_lock_path(
         user_hash: Optional[str] = None) -> Tuple[str, str, str]:
@@ -204,12 +198,12 @@ def configure_ssh_info(config: Dict[str, Any]) -> Dict[str, Any]:
     _, public_key_path = get_or_generate_keys()
     with open(public_key_path, 'r', encoding='utf-8') as f:
         public_key = f.read().strip()
-    config_str =
+    config_str = yaml_utils.dump_yaml_str(config)
     config_str = config_str.replace('skypilot:ssh_user',
                                     config['auth']['ssh_user'])
     config_str = config_str.replace('skypilot:ssh_public_key_content',
                                     public_key)
-    config =
+    config = yaml_utils.safe_load(config_str)
     return config
 
 
@@ -289,7 +283,7 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
     os_login_username = None
     if proc.returncode == 0:
         try:
-            profile =
+            profile = yaml_utils.safe_load(proc.stdout)
             username = profile['posixAccounts'][0]['username']
             if username:
                 os_login_username = username
sky/backends/backend.py
CHANGED
@@ -147,9 +147,8 @@ class Backend(Generic[_ResourceHandleType]):
     def teardown(self,
                  handle: _ResourceHandleType,
                  terminate: bool,
-                 purge: bool = False
-
-        self._teardown(handle, terminate, purge, explicitly_requested)
+                 purge: bool = False) -> None:
+        self._teardown(handle, terminate, purge)
 
     def register_info(self, **kwargs) -> None:
         """Register backend-specific information."""
@@ -201,6 +200,5 @@ class Backend(Generic[_ResourceHandleType]):
     def _teardown(self,
                   handle: _ResourceHandleType,
                   terminate: bool,
-                  purge: bool = False
-                  explicitly_requested: bool = False):
+                  purge: bool = False):
         raise NotImplementedError
sky/backends/backend_utils.py
CHANGED
@@ -60,6 +60,7 @@ from sky.utils import subprocess_utils
 from sky.utils import tempstore
 from sky.utils import timeline
 from sky.utils import ux_utils
+from sky.utils import yaml_utils
 from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
@@ -240,7 +241,7 @@ def _optimize_file_mounts(tmp_yaml_path: str) -> None:
         subprocess.CalledProcessError: If the file mounts are failed to be
             copied.
     """
-    yaml_config =
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
 
     file_mounts = yaml_config.get('file_mounts', {})
     # Remove the file mounts added by the newline.
@@ -324,7 +325,7 @@ def _optimize_file_mounts(tmp_yaml_path: str) -> None:
         shell=True,
         check=True)
 
-
+    yaml_utils.dump_yaml(tmp_yaml_path, yaml_config)
 
 
 def path_size_megabytes(path: str) -> int:
@@ -484,8 +485,8 @@ def _replace_yaml_dicts(
             if key in old_block:
                 _restore_block(value, old_block[key])
 
-    new_config =
-    old_config =
+    new_config = yaml_utils.safe_load(new_yaml)
+    old_config = yaml_utils.safe_load(old_yaml)
     excluded_results = {}
     # Find all key values excluded from restore
     for exclude_restore_key_name_list in restore_key_names_exceptions:
@@ -509,7 +510,7 @@ def _replace_yaml_dicts(
             for key in exclude_restore_key_name[:-1]:
                 curr = curr[key]
             curr[exclude_restore_key_name[-1]] = value
-    return
+    return yaml_utils.dump_yaml_str(new_config)
 
 
 def get_expirable_clouds(
@@ -936,7 +937,7 @@ def write_cluster_config(
             tmp_yaml_path,
             cluster_config_overrides=cluster_config_overrides,
             context=region.name)
-        yaml_obj =
+        yaml_obj = yaml_utils.read_yaml(tmp_yaml_path)
         pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
             'ray_head_default']['node_config']
 
@@ -975,7 +976,7 @@ def write_cluster_config(
     # Read the cluster name from the tmp yaml file, to take the backward
     # compatbility restortion above into account.
     # TODO: remove this after 2 minor releases, 0.10.0.
-    yaml_config =
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
     config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
 
     # Make sure to do this before we optimize file mounts. Optimization is
@@ -1021,7 +1022,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
 
     This function's output removes comments included in the jinja2 template.
     """
-    config =
+    config = yaml_utils.read_yaml(tmp_yaml_path)
     # Check the availability of the cloud type.
     if isinstance(cloud, (
             clouds.AWS,
@@ -1053,7 +1054,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
         config = auth.setup_hyperbolic_authentication(config)
     else:
         assert False, cloud
-
+    yaml_utils.dump_yaml(tmp_yaml_path, config)
 
 
 def get_timestamp_from_run_timestamp(run_timestamp: str) -> float:
@@ -1155,7 +1156,7 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
     """
 
     # Load the yaml contents so that we can directly remove keys.
-    yaml_config =
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
     for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
         dict_to_remove_from = yaml_config
         found_key = True
@@ -1174,7 +1175,7 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
     config_hash = hashlib.sha256()
 
     yaml_hash = hashlib.sha256(
-
+        yaml_utils.dump_yaml_str(yaml_config).encode('utf-8'))
     config_hash.update(yaml_hash.digest())
 
     file_mounts = yaml_config.get('file_mounts', {})
@@ -2026,9 +2027,7 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             'Cluster has no YAML file. Removing the cluster from cache.',
             global_user_state.ClusterEventType.STATUS_CHANGE,
             nop_if_duplicate=True)
-        global_user_state.remove_cluster(cluster_name,
-                                         terminate=True,
-                                         remove_events=True)
+        global_user_state.remove_cluster(cluster_name, terminate=True)
         logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
                      'Removing the cluster from cache.')
         return None
@@ -2366,7 +2365,7 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
        # Some status reason clears after a certain time (e.g. k8s events
        # are only stored for an hour by default), so it is possible that
        # the previous event has a status reason, but now it does not.
-        init_reason_regex = f'^Cluster is abnormal because {init_reason}
+        init_reason_regex = f'^Cluster is abnormal because {init_reason}.*'
        log_message = f'Cluster is abnormal because {init_reason}'
        if status_reason:
            log_message += f' ({status_reason})'
@@ -2881,38 +2880,42 @@ def get_clusters(
        A list of cluster records. If the cluster does not exist or has been
            terminated, the record will be omitted from the returned list.
    """
-    records = global_user_state.get_clusters()
-    current_user = common_utils.get_current_user()
-
-    # Filter out clusters created by the controller.
-    if (not env_options.Options.SHOW_DEBUG_INFO.get() and
-            not _include_is_managed):
-        records = [
-            record for record in records if not record.get('is_managed', False)
-        ]
 
-
+    exclude_managed_clusters = False
+    if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
+        exclude_managed_clusters = True
+    user_hashes_filter = None
     if not all_users:
-
-            record for record in records
-            if record['user_hash'] == current_user.id
-        ]
-
+        user_hashes_filter = {common_utils.get_current_user().id}
     accessible_workspaces = workspaces_core.get_workspaces()
 
-
-
-
-
-        if cluster_workspace in accessible_workspaces:
-            workspace_filtered_records.append(record)
-
-    records = workspace_filtered_records
+    records = global_user_state.get_clusters(
+        exclude_managed_clusters=exclude_managed_clusters,
+        user_hashes_filter=user_hashes_filter,
+        workspaces_filter=accessible_workspaces)
 
     yellow = colorama.Fore.YELLOW
     bright = colorama.Style.BRIGHT
     reset = colorama.Style.RESET_ALL
 
+    if cluster_names is not None:
+        if isinstance(cluster_names, str):
+            cluster_names = [cluster_names]
+        cluster_names = _get_glob_clusters(cluster_names, silent=True)
+        new_records = []
+        not_exist_cluster_names = []
+        for cluster_name in cluster_names:
+            for record in records:
+                if record['name'] == cluster_name:
+                    new_records.append(record)
+                    break
+            else:
+                not_exist_cluster_names.append(cluster_name)
+        if not_exist_cluster_names:
+            clusters_str = ', '.join(not_exist_cluster_names)
+            logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
+        records = new_records
+
     def _update_record_with_credentials_and_resources_str(
             record: Optional[Dict[str, Any]]) -> None:
         """Add the credentials to the record.
@@ -2952,24 +2955,6 @@ def get_clusters(
                 credentials['ssh_private_key_content'] = f.read()
         record['credentials'] = credentials
 
-    if cluster_names is not None:
-        if isinstance(cluster_names, str):
-            cluster_names = [cluster_names]
-        cluster_names = _get_glob_clusters(cluster_names, silent=True)
-        new_records = []
-        not_exist_cluster_names = []
-        for cluster_name in cluster_names:
-            for record in records:
-                if record['name'] == cluster_name:
-                    new_records.append(record)
-                    break
-            else:
-                not_exist_cluster_names.append(cluster_name)
-        if not_exist_cluster_names:
-            clusters_str = ', '.join(not_exist_cluster_names)
-            logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
-        records = new_records
-
     def _update_records_with_resources(
             records: List[Optional[Dict[str, Any]]]) -> None:
         """Add the resources to the record."""
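Note: get_clusters() now pushes managed-cluster, user, and workspace filtering down into global_user_state.get_clusters() instead of filtering the records in Python. A hedged sketch of the new call shape follows; the keyword arguments come from the hunks above, while the surrounding values are illustrative and assume the same semantics.

# Hedged sketch, not code from the release.
from sky import global_user_state
from sky.utils import common_utils
from sky.workspaces import core as workspaces_core

records = global_user_state.get_clusters(
    # Skip controller-managed clusters unless debug output was requested.
    exclude_managed_clusters=True,
    # Restrict to the current user; pass None to return all users' clusters.
    user_hashes_filter={common_utils.get_current_user().id},
    # Only include clusters in workspaces the caller can access.
    workspaces_filter=workspaces_core.get_workspaces())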
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -22,7 +22,6 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
 
 import colorama
 import psutil
-import yaml
 
 from sky import backends
 from sky import catalog
@@ -77,6 +76,7 @@ from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
 from sky.utils import volume as volume_lib
+from sky.utils import yaml_utils
 
 if typing.TYPE_CHECKING:
     import grpc
@@ -1972,7 +1972,7 @@ class RetryingVmProvisioner(object):
             ray_config = global_user_state.get_cluster_yaml_dict(
                 cluster_config_file)
             ray_config['upscaling_speed'] = 0
-
+            yaml_utils.dump_yaml(cluster_config_file, ray_config)
             start = time.time()
             returncode, stdout, stderr = ray_up()
             logger.debug(
@@ -2351,7 +2351,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             # If the cluster yaml is not available,
             # we skip updating the cluster info.
             return
-        config =
+        config = yaml_utils.safe_load(yaml_str)
         try:
             cluster_info = provision_lib.get_cluster_info(
                 provider_name,
@@ -3208,8 +3208,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 global_user_state.ClusterEventType.STATUS_CHANGE,
                 nop_if_duplicate=True)
             global_user_state.remove_cluster(cluster_name,
-                                             terminate=True
-                                             remove_events=False)
+                                             terminate=True)
             usage_lib.messages.usage.update_final_cluster_status(
                 None)
             logger.error(
@@ -4011,8 +4010,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _teardown(self,
                   handle: CloudVmRayResourceHandle,
                   terminate: bool,
-                  purge: bool = False
-                  explicitly_requested: bool = False):
+                  purge: bool = False):
         """Tear down or stop the cluster.
 
         Args:
@@ -4087,8 +4085,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         # ClusterOwnerIdentityMismatchError. The argument/flag
                         # `purge` should bypass such ID mismatch errors.
                         refresh_cluster_status=(
-                            not is_identity_mismatch_and_purge)
-                        explicitly_requested=explicitly_requested)
+                            not is_identity_mismatch_and_purge))
                 if terminate:
                     lock.force_unlock()
                 break
@@ -4477,8 +4474,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                           purge: bool = False,
                           post_teardown_cleanup: bool = True,
                           refresh_cluster_status: bool = True,
-                          remove_from_db: bool = True
-                          explicitly_requested: bool = False) -> None:
+                          remove_from_db: bool = True) -> None:
         """Teardown the cluster without acquiring the cluster status lock.
 
         NOTE: This method should not be called without holding the cluster
@@ -4542,8 +4538,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                            f'provision yaml so it '
                            'has not been provisioned. Skipped.')
            global_user_state.remove_cluster(handle.cluster_name,
-                                             terminate=terminate
-                                             remove_events=False)
+                                             terminate=terminate)
            return
        log_path = os.path.join(os.path.expanduser(self.log_dir),
                                'teardown.log')
@@ -4600,12 +4595,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                raise
 
        if post_teardown_cleanup:
-            self.post_teardown_cleanup(
-
-                terminate,
-                purge,
-                remove_from_db,
-                explicitly_requested=explicitly_requested)
+            self.post_teardown_cleanup(handle, terminate, purge,
+                                       remove_from_db)
            return
 
        if (isinstance(cloud, clouds.IBM) and terminate and
@@ -4649,7 +4640,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                prefix='sky_',
                delete=False,
                suffix='.yml') as f:
-
+            yaml_utils.dump_yaml(f.name, config)
            f.flush()
 
        teardown_verb = 'Terminating' if terminate else 'Stopping'
@@ -4705,8 +4696,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                              terminate: bool,
                              purge: bool = False,
                              remove_from_db: bool = True,
-                              failover: bool = False
-                              explicitly_requested: bool = False) -> None:
+                              failover: bool = False) -> None:
        """Cleanup local configs/caches and delete TPUs after teardown.
 
        This method will handle the following cleanup steps:
@@ -4884,8 +4874,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
        if not terminate or remove_from_db:
            global_user_state.remove_cluster(handle.cluster_name,
-                                             terminate=terminate
-                                             remove_events=explicitly_requested)
+                                             terminate=terminate)
 
    def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
        """Remove the YAML config of a cluster."""
sky/backends/local_docker_backend.py
CHANGED
@@ -256,9 +256,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
                logger.error(
                    'Unable to run container - nvidia runtime for docker not '
                    'found. Have you installed nvidia-docker on your machine?')
-                global_user_state.remove_cluster(cluster_name,
-                                                 terminate=True,
-                                                 remove_events=False)
+                global_user_state.remove_cluster(cluster_name, terminate=True)
                raise e
        self.containers[handle] = container
        logger.info(
@@ -325,8 +323,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
    def _teardown(self,
                  handle: LocalDockerResourceHandle,
                  terminate: bool,
-                  purge: bool = False
-                  explicitly_requested: bool = False):
+                  purge: bool = False):
        """Teardown kills the container."""
        del purge  # Unused.
        if not terminate:
@@ -342,9 +339,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
            container.remove(force=True)
        cluster_name = handle.get_cluster_name()
 
-        global_user_state.remove_cluster(cluster_name,
-                                         terminate=True,
-                                         remove_events=explicitly_requested)
+        global_user_state.remove_cluster(cluster_name, terminate=True)
 
        # --- Utilities ---
 
sky/client/cli/command.py
CHANGED
@@ -89,6 +89,7 @@ from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
+from sky.utils import yaml_utils
 from sky.utils.cli_utils import status_utils
 from sky.volumes import utils as volumes_utils
 from sky.volumes.client import sdk as volumes_sdk
@@ -286,9 +287,10 @@ def _complete_cluster_name(ctx: click.Context, param: click.Parameter,
     del ctx, param  # Unused.
     # TODO(zhwu): we send requests to API server for completion, which can cause
     # large latency. We should investigate caching mechanism if needed.
-    response =
-
+    response = server_common.make_authenticated_request(
+        'GET',
         f'/api/completion/cluster_name?incomplete={incomplete}',
+        retry=False,
         timeout=2.0,
     )
     response.raise_for_status()
@@ -299,9 +301,10 @@ def _complete_storage_name(ctx: click.Context, param: click.Parameter,
                            incomplete: str) -> List[str]:
     """Handle shell completion for storage names."""
     del ctx, param  # Unused.
-    response =
-
+    response = server_common.make_authenticated_request(
+        'GET',
         f'/api/completion/storage_name?incomplete={incomplete}',
+        retry=False,
         timeout=2.0,
     )
     response.raise_for_status()
@@ -312,15 +315,34 @@ def _complete_volume_name(ctx: click.Context, param: click.Parameter,
                           incomplete: str) -> List[str]:
     """Handle shell completion for volume names."""
     del ctx, param  # Unused.
-    response =
-
+    response = server_common.make_authenticated_request(
+        'GET',
         f'/api/completion/volume_name?incomplete={incomplete}',
+        retry=False,
         timeout=2.0,
     )
     response.raise_for_status()
     return response.json()
 
 
+def _complete_api_request(ctx: click.Context, param: click.Parameter,
+                          incomplete: str) -> List[str]:
+    """Handle shell completion for API requests."""
+    del ctx, param  # Unused.
+    response = server_common.make_authenticated_request(
+        'GET',
+        f'/api/completion/api_request?incomplete={incomplete}',
+        retry=False,
+        timeout=2.0,
+    )
+    try:
+        response.raise_for_status()
+    except requests_lib.exceptions.HTTPError:
+        # Server may be outdated/missing this API. Silently skip.
+        return []
+    return response.json()
+
+
 def _complete_file_name(ctx: click.Context, param: click.Parameter,
                         incomplete: str) -> List[str]:
     """Handle shell completion for file names.
@@ -589,7 +611,7 @@ def _check_yaml_only(
     try:
         with open(entrypoint, 'r', encoding='utf-8') as f:
             try:
-                config = list(
+                config = list(yaml_utils.safe_load_all(f))
                 if config:
                     # FIXME(zongheng): in a chain DAG YAML it only returns the
                     # first section. OK for downstream but is weird.
@@ -6017,7 +6039,10 @@ def api_stop():
 
 @api.command('logs', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
-@click.argument('request_id',
+@click.argument('request_id',
+                required=False,
+                type=str,
+                **_get_shell_complete_args(_complete_api_request))
 @click.option('--server-logs',
               is_flag=True,
               default=False,
@@ -6061,7 +6086,11 @@ def api_logs(request_id: Optional[str], server_logs: bool,
 
 @api.command('cancel', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
-@click.argument('request_ids',
+@click.argument('request_ids',
+                required=False,
+                type=str,
+                nargs=-1,
+                **_get_shell_complete_args(_complete_api_request))
 @flags.all_option('Cancel all your requests.')
 @flags.all_users_option('Cancel all requests from all users.')
 @usage_lib.entrypoint
@@ -6093,7 +6122,11 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
 
 @api.command('status', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
-@click.argument('request_ids',
+@click.argument('request_ids',
+                required=False,
+                type=str,
+                nargs=-1,
+                **_get_shell_complete_args(_complete_api_request))
 @click.option('--all-status',
               '-a',
               is_flag=True,