skypilot-nightly 1.0.0.dev20250528-py3-none-any.whl → 1.0.0.dev20250530-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +99 -16
- sky/authentication.py +54 -7
- sky/backends/backend_utils.py +35 -22
- sky/backends/cloud_vm_ray_backend.py +30 -15
- sky/check.py +1 -1
- sky/cli.py +20 -8
- sky/client/cli.py +20 -8
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +60 -10
- sky/clouds/nebius.py +55 -14
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +3 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/Q32Bxr2Pby5tFDW-y5TNg/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-ca00738e2f58ea65.js +6 -0
- sky/dashboard/out/_next/static/chunks/37-64efcd0e9c54bff6.js +6 -0
- sky/dashboard/out/_next/static/chunks/{173-7db8607cefc20f70.js → 614-3d29f98e0634b179.js} +2 -2
- sky/dashboard/out/_next/static/chunks/682-f3f1443ed2fba42f.js +6 -0
- sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +1 -0
- sky/dashboard/out/_next/static/chunks/843-786c36624d5ff61f.js +11 -0
- sky/dashboard/out/_next/static/chunks/856-02e34c9fc5945066.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-42d3656aba9d2e78.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-20835df7b0c4599c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-943992b84fd6f4ee.js → clusters-f37ff20f0af29aae.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-41738d1896fc02fe.js → config-3c6a2dabf56e8cd6.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-342bc15bb78ab2e5.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-7b4b8e7fa9fa0827.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-258decb65e95f520.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a4efc09e61988f8d.js → jobs-78a6c5ba3e24c0cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-b2634885d67c49a6.js → users-89f9212b81d8897e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/{new-579b3203c7c19d84.js → new-198b6e00d7d724c5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-9388e38fac73ee8f.js → [name]-2ce792183b03c341.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-17d41826537196e7.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-f27c9a32aa3d9c6d.js +1 -0
- sky/dashboard/out/_next/static/css/5411b9fb0a783c1c.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +11 -1
- sky/global_user_state.py +149 -1
- sky/jobs/client/sdk.py +1 -0
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +3 -5
- sky/jobs/recovery_strategy.py +148 -102
- sky/jobs/scheduler.py +23 -8
- sky/jobs/server/core.py +16 -0
- sky/jobs/state.py +153 -39
- sky/jobs/utils.py +33 -5
- sky/provision/kubernetes/utils.py +2 -1
- sky/provision/provisioner.py +15 -10
- sky/resources.py +16 -1
- sky/serve/controller.py +10 -7
- sky/serve/replica_managers.py +22 -18
- sky/serve/service.py +5 -4
- sky/server/common.py +11 -4
- sky/server/html/token_page.html +32 -6
- sky/server/server.py +3 -1
- sky/server/stream_utils.py +21 -0
- sky/setup_files/dependencies.py +7 -1
- sky/skylet/constants.py +1 -1
- sky/task.py +26 -0
- sky/templates/jobs-controller.yaml.j2 +2 -1
- sky/templates/kubernetes-ray.yml.j2 +19 -1
- sky/utils/common_utils.py +66 -0
- sky/utils/rich_utils.py +5 -0
- sky/utils/schemas.py +32 -1
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/RECORD +84 -81
- sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +0 -6
- sky/dashboard/out/_next/static/chunks/320-afea3ddcc5bd1c6c.js +0 -6
- sky/dashboard/out/_next/static/chunks/578-9146658cead92981.js +0 -6
- sky/dashboard/out/_next/static/chunks/843-256ec920f6d5f41f.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-62b87c68917b08ed.js +0 -1
- sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-159bffb2fa34ed54.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9506c00257d10dbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/workspaces-610c49ae3619ee85.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +0 -1
- sky/dashboard/out/_next/static/css/ffd1cd601648c303.css +0 -3
- /sky/dashboard/out/_next/static/{Mx1iAbDQn1jMHh3UHmK3R → Q32Bxr2Pby5tFDW-y5TNg}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-a631df412d8172de.js → _app-f19ea34b91c33950.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '86ea13eb24c6357ea885539fd3196d88295f5943'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250528'
+__version__ = '1.0.0.dev20250530'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/adaptors/nebius.py
CHANGED
@@ -1,17 +1,44 @@
 """Nebius cloud adaptor."""
 import os
 import threading
+from typing import List, Optional
 
+from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import common
 from sky.utils import annotations
 from sky.utils import ux_utils
 
-
-
-
-
-
-
+logger = sky_logging.init_logger(__name__)
+
+
+def tenant_id_path() -> str:
+    return '~/.nebius/NEBIUS_TENANT_ID.txt'
+
+
+def iam_token_path() -> str:
+    return '~/.nebius/NEBIUS_IAM_TOKEN.txt'
+
+
+def credentials_path() -> str:
+    workspace_path = skypilot_config.get_workspace_cloud('nebius').get(
+        'credentials_file_path', None)
+    if workspace_path is not None:
+        return workspace_path
+    return _get_default_credentials_path()
+
+
+def _get_workspace_credentials_path() -> Optional[str]:
+    """Get credentials path if explicitly set in workspace config."""
+    workspace_cred_path = skypilot_config.get_workspace_cloud('nebius').get(
+        'credentials_file_path', None)
+    return workspace_cred_path
+
+
+def _get_default_credentials_path() -> str:
+    """Get the default credentials path."""
+    return '~/.nebius/credentials.json'
+
 
 DEFAULT_REGION = 'eu-north1'
 
@@ -74,39 +101,77 @@ def vpc():
     return vpc_v1
 
 
-@annotations.lru_cache(scope='request')
 def get_iam_token():
     try:
-        with open(os.path.expanduser(
+        with open(os.path.expanduser(iam_token_path()),
                   encoding='utf-8') as file:
             return file.read().strip()
     except FileNotFoundError:
         return None
 
 
-@annotations.lru_cache(scope='request')
 def is_token_or_cred_file_exist():
-    return (os.path.exists(os.path.expanduser(
-            os.path.exists(os.path.expanduser(
+    return (os.path.exists(os.path.expanduser(iam_token_path())) or
+            os.path.exists(os.path.expanduser(credentials_path())))
 
 
-@annotations.lru_cache(scope='request')
 def get_tenant_id():
+    tenant_id_in_ws_config = skypilot_config.get_workspace_cloud('nebius').get(
+        'tenant_id', None)
+    if tenant_id_in_ws_config is not None:
+        return tenant_id_in_ws_config
+    tenant_id_in_config = skypilot_config.get_nested(('nebius', 'tenant_id'),
+                                                     None)
+    if tenant_id_in_config is not None:
+        return tenant_id_in_config
     try:
-        with open(os.path.expanduser(
+        with open(os.path.expanduser(tenant_id_path()),
                   encoding='utf-8') as file:
             return file.read().strip()
     except FileNotFoundError:
        return None
 
 
-@annotations.lru_cache(scope='request')
 def sdk():
+    """Create the Nebius SDK with the correct credentials.
+
+    The order of priority is:
+    1. Credentials file specified in workspace config, if set
+    2. IAM token file, if set
+    3. Default credentials path
+    """
+    # 1. Check if credentials path is set in workspace config (highest priority)
+    workspace_cred_path = _get_workspace_credentials_path()
+    if workspace_cred_path is not None:
+        # Check if token is also available and warn
+        token = get_iam_token()
+        if token is not None:
+            logger.warning(
+                f'Both workspace credentials file ({workspace_cred_path}) and '
+                f'IAM token file ({iam_token_path()}) are available. Using '
+                'workspace credentials file.')
+        return _sdk(None, workspace_cred_path)
+
+    # 2. Check for IAM token file (second priority)
     token = get_iam_token()
+    if token is not None:
+        return _sdk(token, None)
+
+    # 3. Fall back to default credentials path (lowest priority)
+    default_cred_path = _get_default_credentials_path()
+    return _sdk(None, default_cred_path)
+
+
+@annotations.lru_cache(scope='request')
+def _sdk(token: Optional[str], cred_path: Optional[str]):
+    # Exactly one of token or cred_path must be provided
+    assert (token is None) != (cred_path is None), (token, cred_path)
     if token is not None:
         return nebius.sdk.SDK(credentials=token)
-
-
+    if cred_path is not None:
+        return nebius.sdk.SDK(
+            credentials_file_name=os.path.expanduser(cred_path))
+    raise ValueError('Either token or credentials file path must be provided')
 
 
 def get_nebius_credentials(boto3_session):
@@ -184,3 +249,21 @@ def botocore_exceptions():
     # pylint: disable=import-outside-toplevel
     from botocore import exceptions
     return exceptions
+
+
+def get_credential_file_paths() -> List[str]:
+    """Get the list of credential file paths based on current configuration."""
+    paths = {
+        # Always include tenant ID and IAM token paths
+        tenant_id_path(),
+        iam_token_path(),
+    }
+
+    # Add workspace-specific credentials path if set
+    workspace_cred_path = _get_workspace_credentials_path()
+    if workspace_cred_path is not None:
+        paths.add(workspace_cred_path)
+    # Always add default path in case it's needed for fallback
+    paths.add(_get_default_credentials_path())
+
+    return list(paths)
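For readers skimming the diff: the reworked sdk() resolves credentials in a fixed priority order (workspace-configured credentials file, then the IAM token file, then the default ~/.nebius/credentials.json). Below is a minimal, self-contained sketch of that resolution order only; the function and argument names are illustrative stand-ins, not SkyPilot or Nebius SDK APIs.

import os
from typing import Dict, Optional


def resolve_nebius_credentials(
        workspace_cred_path: Optional[str],
        iam_token: Optional[str],
        default_cred_path: str = '~/.nebius/credentials.json') -> Dict[str, str]:
    """Pick exactly one credential source, mirroring the priority described above."""
    # 1. A workspace-configured credentials file wins, even when a token exists.
    if workspace_cred_path is not None:
        return {'credentials_file_name': os.path.expanduser(workspace_cred_path)}
    # 2. Otherwise use the IAM token, if one was read from disk.
    if iam_token is not None:
        return {'credentials': iam_token}
    # 3. Fall back to the default credentials file.
    return {'credentials_file_name': os.path.expanduser(default_cred_path)}


# Both sources present: the workspace file takes precedence (the real adaptor
# also logs a warning in this case).
assert 'credentials_file_name' in resolve_nebius_credentials(
    '~/creds/nebius.json', 'my-iam-token')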
sky/authentication.py
CHANGED
@@ -26,7 +26,7 @@ import socket
 import subprocess
 import sys
 import typing
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Optional, Tuple
 import uuid
 
 import colorama
@@ -34,6 +34,7 @@ import filelock
 
 from sky import clouds
 from sky import exceptions
+from sky import global_user_state
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
@@ -72,8 +73,10 @@ else:
     yaml = adaptors_common.LazyImport('yaml')
 
 
-def get_ssh_key_and_lock_path(
-
+def get_ssh_key_and_lock_path(
+        user_hash: Optional[str] = None) -> Tuple[str, str, str]:
+    if user_hash is None:
+        user_hash = common_utils.get_user_hash()
     user_ssh_key_prefix = _SSH_KEY_PATH_PREFIX.format(user_hash=user_hash)
 
     os.makedirs(os.path.expanduser(user_ssh_key_prefix),
@@ -129,9 +132,12 @@ def _save_key_pair(private_key_path: str, public_key_path: str,
               opener=functools.partial(os.open, mode=0o644)) as f:
         f.write(public_key)
 
+    global_user_state.set_ssh_keys(common_utils.get_user_hash(), public_key,
+                                   private_key)
+
 
 def get_or_generate_keys() -> Tuple[str, str]:
-    """Returns the
+    """Returns the absolute private and public key paths."""
     private_key_path, public_key_path, lock_path = get_ssh_key_and_lock_path()
     private_key_path = os.path.expanduser(private_key_path)
     public_key_path = os.path.expanduser(public_key_path)
@@ -144,15 +150,56 @@ def get_or_generate_keys() -> Tuple[str, str]:
     os.makedirs(lock_dir, exist_ok=True, mode=0o700)
     with filelock.FileLock(lock_path, timeout=10):
         if not os.path.exists(private_key_path):
-
-
-
+            ssh_public_key, ssh_private_key, exists = (
+                global_user_state.get_ssh_keys(common_utils.get_user_hash()))
+            if not exists:
+                ssh_public_key, ssh_private_key = _generate_rsa_key_pair()
+            _save_key_pair(private_key_path, public_key_path, ssh_private_key,
+                           ssh_public_key)
        assert os.path.exists(public_key_path), (
            'Private key found, but associated public key '
            f'{public_key_path} does not exist.')
    return private_key_path, public_key_path
 
 
+def create_ssh_key_files_from_db(private_key_path: Optional[str] = None):
+    if private_key_path is None:
+        user_hash = common_utils.get_user_hash()
+    else:
+        # Assume private key path is in the format of
+        # ~/.sky/clients/<user_hash>/ssh/sky-key
+        separated_path = os.path.normpath(private_key_path).split(os.path.sep)
+        assert separated_path[-1] == 'sky-key'
+        assert separated_path[-2] == 'ssh'
+        user_hash = separated_path[-3]
+
+    private_key_path_generated, public_key_path, lock_path = (
+        get_ssh_key_and_lock_path(user_hash))
+    assert private_key_path == os.path.expanduser(private_key_path_generated), (
+        f'Private key path {private_key_path} does not '
+        f'match the generated path {private_key_path_generated}')
+    private_key_path = os.path.expanduser(private_key_path)
+    public_key_path = os.path.expanduser(public_key_path)
+    lock_path = os.path.expanduser(lock_path)
+
+    lock_dir = os.path.dirname(lock_path)
+    # We should have the folder ~/.sky/generated/ssh to have 0o700 permission,
+    # as the ssh configs will be written to this folder as well in
+    # backend_utils.SSHConfigHelper
+    os.makedirs(lock_dir, exist_ok=True, mode=0o700)
+    with filelock.FileLock(lock_path, timeout=10):
+        if not os.path.exists(private_key_path):
+            ssh_public_key, ssh_private_key, exists = (
+                global_user_state.get_ssh_keys(user_hash))
+            if not exists:
+                raise RuntimeError(f'SSH keys not found for user {user_hash}')
+            _save_key_pair(private_key_path, public_key_path, ssh_private_key,
+                           ssh_public_key)
+        assert os.path.exists(public_key_path), (
+            'Private key found, but associated public key '
+            f'{public_key_path} does not exist.')
+
+
 def configure_ssh_info(config: Dict[str, Any]) -> Dict[str, Any]:
     _, public_key_path = get_or_generate_keys()
     with open(public_key_path, 'r', encoding='utf-8') as f:
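The net effect of the authentication changes is that the generated SSH key pair is also persisted in the global user state, so a client on a fresh machine can re-materialize the key files instead of generating a new pair. A rough sketch of that generate-once / restore-elsewhere pattern follows, using an in-memory dict as a stand-in for the database table; the names here are hypothetical and this is not the SkyPilot implementation.

from typing import Dict, Tuple

# Stand-in for the user-state table: user_hash -> (public_key, private_key).
_KEY_STORE: Dict[str, Tuple[str, str]] = {}


def get_or_generate_keys(user_hash: str) -> Tuple[str, str]:
    """Return the stored key pair, generating and persisting it only once."""
    if user_hash not in _KEY_STORE:
        # Placeholder strings; the real code generates an RSA key pair here.
        _KEY_STORE[user_hash] = (f'ssh-rsa PUB-{user_hash}', f'PRIV-{user_hash}')
    return _KEY_STORE[user_hash]


def create_key_files_from_store(user_hash: str) -> Tuple[str, str]:
    """Restore keys on a host whose local key files are missing."""
    if user_hash not in _KEY_STORE:
        raise RuntimeError(f'SSH keys not found for user {user_hash}')
    return _KEY_STORE[user_hash]


public, private = get_or_generate_keys('abc123')
assert create_key_files_from_store('abc123') == (public, private)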
sky/backends/backend_utils.py
CHANGED
@@ -213,7 +213,7 @@ def _get_yaml_path_from_cluster_name(cluster_name: str,
 # Add retry for the file mounts optimization, as the underlying cp command may
 # experience transient errors, #4758.
 @common_utils.retry
-def _optimize_file_mounts(
+def _optimize_file_mounts(tmp_yaml_path: str) -> None:
     """Optimize file mounts in the given ray yaml file.
 
     Runtime files handling:
@@ -227,7 +227,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
         subprocess.CalledProcessError: If the file mounts are failed to be
             copied.
     """
-    yaml_config = common_utils.read_yaml(
+    yaml_config = common_utils.read_yaml(tmp_yaml_path)
 
     file_mounts = yaml_config.get('file_mounts', {})
     # Remove the file mounts added by the newline.
@@ -311,7 +311,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
         shell=True,
         check=True)
 
-    common_utils.dump_yaml(
+    common_utils.dump_yaml(tmp_yaml_path, yaml_config)
 
 
 def path_size_megabytes(path: str) -> int:
@@ -842,9 +842,8 @@ def write_cluster_config(
     _add_auth_to_cluster_config(cloud, tmp_yaml_path)
 
     # Restore the old yaml content for backward compatibility.
-
-
-            old_yaml_content = f.read()
+    old_yaml_content = global_user_state.get_cluster_yaml_str(yaml_path)
+    if old_yaml_content is not None and keep_launch_fields_in_existing_config:
         with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
             new_yaml_content = f.read()
         restored_yaml_content = _replace_yaml_dicts(
@@ -881,18 +880,29 @@ def write_cluster_config(
     # compatibility should go before this call.
     _optimize_file_mounts(tmp_yaml_path)
 
-    #
-
-
+    # commit the final yaml to the database
+    global_user_state.set_cluster_yaml(
+        cluster_name,
+        open(tmp_yaml_path, 'r', encoding='utf-8').read())
+
+    usage_lib.messages.usage.update_ray_yaml(tmp_yaml_path)
+
+    # Remove the tmp file.
+    if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+        debug_yaml_path = yaml_path + '.debug'
+        os.rename(tmp_yaml_path, debug_yaml_path)
+    else:
+        os.remove(tmp_yaml_path)
+
     return config_dict
 
 
-def _add_auth_to_cluster_config(cloud: clouds.Cloud,
+def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
     """Adds SSH key info to the cluster config.
 
     This function's output removes comments included in the jinja2 template.
     """
-    config = common_utils.read_yaml(
+    config = common_utils.read_yaml(tmp_yaml_path)
     # Check the availability of the cloud type.
     if isinstance(cloud, (
             clouds.AWS,
@@ -922,7 +932,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
         config = auth.setup_fluidstack_authentication(config)
     else:
         assert False, cloud
-    common_utils.dump_yaml(
+    common_utils.dump_yaml(tmp_yaml_path, config)
 
 
 def get_timestamp_from_run_timestamp(run_timestamp: str) -> float:
@@ -980,7 +990,7 @@ def _count_healthy_nodes_from_ray(output: str,
 
 
 @timeline.event
-def _deterministic_cluster_yaml_hash(
+def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
     """Hash the cluster yaml and contents of file mounts to a unique string.
 
     Two invocations of this function should return the same string if and only
@@ -1024,7 +1034,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
     """
 
     # Load the yaml contents so that we can directly remove keys.
-    yaml_config = common_utils.read_yaml(
+    yaml_config = common_utils.read_yaml(tmp_yaml_path)
     for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
         dict_to_remove_from = yaml_config
         found_key = True
@@ -1053,7 +1063,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
         file_mounts.pop('')
 
     for dst, src in sorted(file_mounts.items()):
-        if src ==
+        if src == tmp_yaml_path:
             # Skip the yaml file itself. We have already hashed a modified
             # version of it. The file may include fields we don't want to hash.
             continue
@@ -1148,7 +1158,7 @@ def wait_until_ray_cluster_ready(
         logger.error(common_utils.format_exception(e))
         return False, None  # failed
 
-    config =
+    config = global_user_state.get_cluster_yaml_dict(cluster_config_file)
 
     docker_user = None
     if 'docker' in config:
@@ -1248,11 +1258,11 @@ def ssh_credential_from_yaml(
     """
     if cluster_yaml is None:
         return dict()
-    config =
+    config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
     auth_section = config['auth']
     if ssh_user is None:
         ssh_user = auth_section['ssh_user'].strip()
-
+    ssh_private_key_path = auth_section.get('ssh_private_key')
     ssh_control_name = config.get('cluster_name', '__default__')
     ssh_proxy_command = auth_section.get('ssh_proxy_command')
 
@@ -1261,9 +1271,10 @@ def ssh_credential_from_yaml(
             constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
         ssh_proxy_command = ssh_proxy_command.replace(
             constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
     credentials = {
         'ssh_user': ssh_user,
-        'ssh_private_key':
+        'ssh_private_key': ssh_private_key_path,
         'ssh_control_name': ssh_control_name,
         'ssh_proxy_command': ssh_proxy_command,
     }
@@ -1436,7 +1447,7 @@ def get_node_ips(cluster_yaml: str,
         exceptions.FetchClusterInfoError: if we failed to get the IPs. e.reason is
             HEAD or WORKER.
     """
-    ray_config =
+    ray_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
     # Use the new provisioner for AWS.
     provider_name = cluster_utils.get_provider_name(ray_config)
     cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
@@ -1652,7 +1663,7 @@ def _query_cluster_status_via_cloud_api(
         # Use region and zone from the cluster config, instead of the
         # handle.launched_resources, because the latter may not be set
         # correctly yet.
-        ray_config =
+        ray_config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
         provider_config = ray_config['provider']
 
     # Query the cloud provider.
@@ -2599,6 +2610,8 @@ def get_clusters(
            return
        ssh_private_key_path = credentials.get('ssh_private_key', None)
        if ssh_private_key_path is not None:
+            if not os.path.exists(os.path.expanduser(ssh_private_key_path)):
+                auth.create_ssh_key_files_from_db(ssh_private_key_path)
            with open(os.path.expanduser(ssh_private_key_path),
                      'r',
                      encoding='utf-8') as f:
@@ -2999,7 +3012,7 @@ def get_endpoints(cluster: str,
            raise ValueError('Querying endpoints is not supported '
                             f'for {cluster!r} on {cloud}.') from None
 
-    config =
+    config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
    port_details = provision_lib.query_ports(repr(cloud),
                                             handle.cluster_name_on_cloud,
                                             handle.launched_resources.ports,
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -1235,7 +1235,8 @@ class RetryingVmProvisioner(object):
             assert isinstance(handle, CloudVmRayResourceHandle), (
                 'handle should be CloudVmRayResourceHandle (found: '
                 f'{type(handle)}) {cluster_name!r}')
-            config =
+            config = global_user_state.get_cluster_yaml_dict(
+                handle.cluster_yaml)
             # This is for the case when the zone field is not set in the
             # launched resources in a previous launch (e.g., ctrl-c during
             # launch and multi-node cluster before PR #1700).
@@ -1935,7 +1936,8 @@ class RetryingVmProvisioner(object):
         # ready to ensure cluster will not scale up after preemption (spot).
         # Skip for non-spot as this takes extra time to provision (~1min).
         if use_spot:
-            ray_config =
+            ray_config = global_user_state.get_cluster_yaml_dict(
+                cluster_config_file)
             ray_config['upscaling_speed'] = 0
             common_utils.dump_yaml(cluster_config_file, ray_config)
             start = time.time()
@@ -2270,7 +2272,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         # Directly load the `use_internal_ips` flag from the cluster yaml
         # instead of `skypilot_config` as the latter can be changed after the
         # cluster is UP.
-        return
+        return global_user_state.get_cluster_yaml_dict(self.cluster_yaml).get(
             'provider', {}).get('use_internal_ips', False)
 
     def update_ssh_ports(self, max_attempts: int = 1) -> None:
@@ -2299,7 +2301,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             # It is possible that the cluster yaml is not available when
             # the handle is unpickled for service replicas from the
             # controller with older version.
-            config =
+            config = global_user_state.get_cluster_yaml_dict(
+                self.cluster_yaml)
             try:
                 cluster_info = provision_lib.get_cluster_info(
                     provider_name,
@@ -2634,7 +2637,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         # pylint: disable=import-outside-toplevel
         launched_resources = state['launched_resources']
         if isinstance(launched_resources.cloud, clouds.Kubernetes):
-            yaml_config =
+            yaml_config = global_user_state.get_cluster_yaml_dict(
                 os.path.expanduser(state['_cluster_yaml']))
             context = kubernetes_utils.get_context_from_config(
                 yaml_config['provider'])
@@ -3044,8 +3047,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
            ssh_port_list = handle.external_ssh_ports()
            assert ip_list is not None, handle
            assert ssh_port_list is not None, handle
-
-
+            config = global_user_state.get_cluster_yaml_dict(
+                cluster_config_file)
            if 'docker' in config:
                handle.setup_docker_user(cluster_config_file)
 
@@ -3113,7 +3116,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         cloud = handle.launched_resources.cloud
         logger.debug(
             f'Opening ports {handle.launched_resources.ports} for {cloud}')
-        config =
+        config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
         provider_config = config['provider']
         provision_lib.open_ports(repr(cloud), handle.cluster_name_on_cloud,
                                  handle.launched_resources.ports,
@@ -3178,6 +3181,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'Launching - Opening new ports')):
                 self._open_ports(handle)
 
+        # Capture task YAML and command
+        task_config = None
+        if task is not None:
+            task_config = task.to_yaml_config()
+
         with timeline.Event('backend.provision.post_process'):
             global_user_state.add_or_update_cluster(
                 handle.cluster_name,
@@ -3185,6 +3193,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 set(task.resources),
                 ready=True,
                 config_hash=config_hash,
+                task_config=task_config,
             )
             usage_lib.messages.usage.update_final_cluster_status(
                 status_lib.ClusterStatus.UP)
@@ -3507,9 +3516,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # Add the managed job to job queue database.
             managed_job_codegen = managed_jobs.ManagedJobCodeGen()
             managed_job_code = managed_job_codegen.set_pending(
-                job_id,
+                job_id,
+                managed_job_dag,
                 skypilot_config.get_active_workspace(
-                    force_user_workspace=True)
+                    force_user_workspace=True),
+                entrypoint=common_utils.get_current_command())
             # Set the managed job to PENDING state to make sure that this
             # managed job appears in the `sky jobs queue`, even if it needs
             # to wait to be submitted.
@@ -4195,7 +4206,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         log_abs_path = os.path.abspath(log_path)
         launched_resources = handle.launched_resources.assert_launchable()
         cloud = launched_resources.cloud
-        config =
+        config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
         cluster_name = handle.cluster_name
         cluster_name_on_cloud = handle.cluster_name_on_cloud
 
@@ -4255,7 +4266,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             from sky.adaptors import ibm
             from sky.skylet.providers.ibm.vpc_provider import IBMVPCProvider
 
-            config_provider =
+            config_provider = global_user_state.get_cluster_yaml_dict(
                 handle.cluster_yaml)['provider']
             region = config_provider['region']
             search_client = ibm.search_client()
@@ -4409,7 +4420,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 launched_resources = (
                     handle.launched_resources.assert_launchable())
                 cloud = launched_resources.cloud
-                config =
+                config = global_user_state.get_cluster_yaml_dict(
+                    handle.cluster_yaml)
                 cloud.check_features_are_supported(
                     launched_resources,
                     {clouds.CloudImplementationFeatures.OPEN_PORTS})
@@ -4448,7 +4460,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
         attempts = 0
         while True:
-            config =
+            config = global_user_state.get_cluster_yaml_dict(
+                handle.cluster_yaml)
 
             logger.debug(f'instance statuses attempt {attempts + 1}')
             node_status_dict = provision_lib.query_instances(
@@ -4504,9 +4517,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
     def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
         """Remove the YAML config of a cluster."""
+        cluster_yaml_path = handle.cluster_yaml
         handle.cluster_yaml = None
         global_user_state.update_cluster_handle(handle.cluster_name, handle)
-
+        global_user_state.remove_cluster_yaml(handle.cluster_name)
+        common_utils.remove_file_if_exists(cluster_yaml_path)
 
     def set_autostop(self,
                      handle: CloudVmRayResourceHandle,
sky/check.py
CHANGED
@@ -257,7 +257,7 @@ def check_capabilities(
         # Check all workspaces
         workspaces_to_check = available_workspaces
 
-    hide_per_cloud_details_flag = (not verbose and
+    hide_per_cloud_details_flag = (not verbose and clouds is None and
                                    len(workspaces_to_check) > 1)
 
     for ws_name in workspaces_to_check:
|