skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -4
- sky/backends/backend_utils.py +7 -0
- sky/backends/cloud_vm_ray_backend.py +91 -96
- sky/cli.py +5 -6311
- sky/client/cli.py +66 -639
- sky/client/sdk.py +22 -2
- sky/clouds/kubernetes.py +8 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +50 -11
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/logs/__init__.py +17 -0
- sky/logs/agent.py +73 -0
- sky/logs/gcp.py +91 -0
- sky/models.py +1 -0
- sky/provision/__init__.py +1 -0
- sky/provision/instance_setup.py +35 -0
- sky/provision/provisioner.py +11 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/server/common.py +21 -9
- sky/server/requests/payloads.py +19 -1
- sky/server/server.py +121 -29
- sky/setup_files/dependencies.py +11 -1
- sky/skylet/constants.py +48 -1
- sky/skylet/job_lib.py +83 -19
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +60 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +47 -34
- sky/users/rbac.py +10 -1
- sky/users/server.py +274 -9
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +83 -5
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
- sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/users/server.py
CHANGED
@@ -1,19 +1,30 @@
|
|
1
1
|
"""REST API for workspace management."""
|
2
2
|
|
3
|
-
|
3
|
+
import contextlib
|
4
|
+
import hashlib
|
5
|
+
import os
|
6
|
+
from typing import Any, Dict, Generator, List
|
4
7
|
|
5
8
|
import fastapi
|
9
|
+
import filelock
|
10
|
+
from passlib.hash import apr_md5_crypt
|
6
11
|
|
7
12
|
from sky import global_user_state
|
13
|
+
from sky import models
|
8
14
|
from sky import sky_logging
|
9
15
|
from sky.server.requests import payloads
|
10
16
|
from sky.skylet import constants
|
11
17
|
from sky.users import permission
|
12
18
|
from sky.users import rbac
|
13
19
|
from sky.utils import common
|
20
|
+
from sky.utils import common_utils
|
14
21
|
|
15
22
|
logger = sky_logging.init_logger(__name__)
|
16
23
|
|
24
|
+
# Filelocks for the user management.
|
25
|
+
USER_LOCK_PATH = os.path.expanduser('~/.sky/.{user_id}.lock')
|
26
|
+
USER_LOCK_TIMEOUT_SECONDS = 20
|
27
|
+
|
17
28
|
router = fastapi.APIRouter()
|
18
29
|
|
19
30
|
|
@@ -39,29 +50,283 @@ async def get_current_user_role(request: fastapi.Request):
|
|
39
50
|
# hash for the request without 'X-Auth-Request-Email' header?
|
40
51
|
auth_user = request.state.auth_user
|
41
52
|
if auth_user is None:
|
42
|
-
return {'name': '', 'role': rbac.RoleName.ADMIN.value}
|
53
|
+
return {'id': '', 'name': '', 'role': rbac.RoleName.ADMIN.value}
|
43
54
|
user_roles = permission.permission_service.get_user_roles(auth_user.id)
|
44
|
-
return {
|
55
|
+
return {
|
56
|
+
'id': auth_user.id,
|
57
|
+
'name': auth_user.name,
|
58
|
+
'role': user_roles[0] if user_roles else ''
|
59
|
+
}
|
60
|
+
|
61
|
+
|
62
|
+
@router.post('/create')
async def user_create(user_create_body: payloads.UserCreateBody) -> None:
    """Creates a new user with a hashed password and an RBAC role.

    The user id is the MD5 hex digest of the username, truncated to
    ``common_utils.USER_HASH_LENGTH`` characters. The password is stored as
    an ``apr_md5_crypt`` hash. When no role is given, the default role from
    ``rbac.get_default_role()`` is assigned.

    Args:
        user_create_body: Request payload carrying ``username``,
            ``password`` and an optional ``role``.

    Raises:
        fastapi.HTTPException: 400 if the username or password is missing,
            if the role is not a supported role, or if a user with the
            same name already exists.
    """
    username = user_create_body.username
    password = user_create_body.password
    role = user_create_body.role

    if not (username and password):
        raise fastapi.HTTPException(status_code=400,
                                    detail='Username and password are required')

    if role:
        if role not in rbac.get_supported_roles():
            raise fastapi.HTTPException(status_code=400,
                                        detail=f'Invalid role: {role}')
    else:
        role = rbac.get_default_role()

    # Hash the password and derive the user id before taking the lock, so
    # the lock is held only for the state updates.
    password_hash = apr_md5_crypt.hash(password)
    name_digest = hashlib.md5(username.encode()).hexdigest()
    user_hash = name_digest[:common_utils.USER_HASH_LENGTH]

    with _user_lock(user_hash):
        # The existence check happens under the lock so that the check and
        # the insert are not interleaved with another request for this id.
        existing_user = global_user_state.get_user_by_name(username)
        if existing_user:
            raise fastapi.HTTPException(
                status_code=400, detail=f'User {username!r} already exists')
        new_user = models.User(id=user_hash,
                               name=username,
                               password=password_hash)
        global_user_state.add_or_update_user(new_user)
        permission.permission_service.update_role(user_hash, role)
|
45
90
|
|
46
91
|
|
47
92
|
@router.post('/update')
async def user_update(request: fastapi.Request,
                      user_update_body: payloads.UserUpdateBody) -> None:
    """Updates a user's role and/or password.

    A role change is only applied when the requested role differs from the
    target user's current first role (or the target has no role yet).

    Authorization, applied only when ``request.state.auth_user`` is set:
      * a caller without any role is rejected;
      * a non-admin caller may not change roles, and may only change their
        own password.

    Args:
        request: Incoming request; ``request.state.auth_user`` identifies
            the caller (may be ``None``).
        user_update_body: Payload carrying ``user_id`` and the optional
            ``role`` and ``password`` to set.

    Raises:
        fastapi.HTTPException: 400 for an unsupported role, an unknown
            user, or an attempt to modify an internal API server user;
            403 when the caller is not permitted to make the change.
    """
    user_id = user_update_body.user_id
    role = user_update_body.role
    password = user_update_body.password

    supported_roles = rbac.get_supported_roles()
    if role and role not in supported_roles:
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'Invalid role: {role}')

    # Only treat the role as changing if it differs from what the target
    # user currently has.
    target_user_roles = permission.permission_service.get_user_roles(user_id)
    need_update_role = bool(role) and (not target_user_roles or
                                       role != target_user_roles[0])

    current_user = request.state.auth_user
    if current_user is not None:
        caller_roles = permission.permission_service.get_user_roles(
            current_user.id)
        if not caller_roles:
            raise fastapi.HTTPException(status_code=403, detail='Invalid user')
        caller_is_admin = caller_roles[0] == rbac.RoleName.ADMIN.value
        if not caller_is_admin:
            if need_update_role:
                raise fastapi.HTTPException(
                    status_code=403, detail='Only admin can update user role')
            if password and user_id != current_user.id:
                raise fastapi.HTTPException(
                    status_code=403,
                    detail='Only admin can update password for other users')

    user_info = global_user_state.get_user(user_id)
    if user_info is None:
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'User {user_id} does not exist')

    # Disallow updating the internal users.
    role_protected_ids = (common.SERVER_ID, constants.SKYPILOT_SYSTEM_USER_ID)
    if need_update_role and user_info.id in role_protected_ids:
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'Cannot update role for internal '
                                    f'API server user {user_info.name}')
    if password and user_info.id == constants.SKYPILOT_SYSTEM_USER_ID:
        raise fastapi.HTTPException(
            status_code=400,
            detail=f'Cannot update password for internal '
            f'API server user {user_info.name}')

    with _user_lock(user_info.id):
        if password:
            updated_user = models.User(id=user_info.id,
                                       name=user_info.name,
                                       password=apr_md5_crypt.hash(password))
            global_user_state.add_or_update_user(updated_user)
        if need_update_role:
            # Update user role in casbin policy
            permission.permission_service.update_role(user_info.id, role)
|
147
|
+
|
148
|
+
|
149
|
+
@router.post('/delete')
async def user_delete(user_delete_body: payloads.UserDeleteBody) -> None:
    """Deletes a user and the user's entry in the permission service.

    Args:
        user_delete_body: Payload carrying the ``user_id`` to delete.

    Raises:
        fastapi.HTTPException: 400 if the user does not exist or is one of
            the internal API server users.
    """
    user_id = user_delete_body.user_id
    user_info = global_user_state.get_user(user_id)
    if user_info is None:
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'User {user_id} does not exist')

    # Disallow deleting the internal users.
    internal_user_ids = (common.SERVER_ID, constants.SKYPILOT_SYSTEM_USER_ID)
    if user_info.id in internal_user_ids:
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'Cannot delete internal '
                                    f'API server user {user_info.name}')

    # Remove the user record first, then the permission entry, both under
    # the per-user lock.
    with _user_lock(user_id):
        global_user_state.delete_user(user_id)
        permission.permission_service.delete_user(user_id)
|
165
|
+
|
166
|
+
|
167
|
+
@router.post('/import')
async def user_import(
        user_import_body: payloads.UserImportBody) -> Dict[str, Any]:
    """Import users from CSV content.

    The first line is a header row and must contain the columns
    ``username``, ``password`` and ``role`` (matched case-insensitively).
    Fields are split on plain commas; quoting is not interpreted. Every
    data row must have as many fields as the header. Blank lines are
    skipped.

    Per row:
      * ``username`` and ``password`` must be non-empty;
      * an empty or unsupported ``role`` falls back to
        ``rbac.get_default_role()``;
      * a password starting with ``$apr1$`` is treated as an existing APR1
        hash and stored as-is; anything else is hashed with
        ``apr_md5_crypt``;
      * a user whose name already exists is reported as an error and left
        untouched.

    Args:
        user_import_body: Payload carrying the raw ``csv_content`` string.

    Returns:
        A dict with ``success_count``, ``error_count``,
        ``total_processed`` (rows that passed parsing), ``parse_errors``
        and ``creation_errors``.

    Raises:
        fastapi.HTTPException: 400 if the content is empty, has no data
            row, lacks a required column, or contains no valid row.
    """
    csv_content = user_import_body.csv_content

    if not csv_content:
        raise fastapi.HTTPException(status_code=400,
                                    detail='CSV content is required')

    # Parse CSV content
    lines = csv_content.strip().split('\n')
    if len(lines) < 2:
        raise fastapi.HTTPException(
            status_code=400,
            detail='CSV must have at least a header row and one data row')

    # Parse headers
    headers = [h.strip().lower() for h in lines[0].split(',')]
    required_headers = ['username', 'password', 'role']

    # Check if all required headers are present
    missing_headers = [
        header for header in required_headers if header not in headers
    ]
    if missing_headers:
        raise fastapi.HTTPException(
            status_code=400,
            detail=f'Missing required columns: {", ".join(missing_headers)}')

    # Look the role configuration up once instead of once per row.
    supported_roles = rbac.get_supported_roles()
    default_role = rbac.get_default_role()

    # Parse user data
    users_to_create = []
    parse_errors = []

    # Line numbers in error messages are 1-based and count the header row.
    for i, line in enumerate(lines[1:], start=2):
        if not line.strip():
            continue  # Skip empty lines

        values = [v.strip() for v in line.split(',')]
        if len(values) != len(headers):
            parse_errors.append(f'Line {i}: Invalid number of columns')
            continue

        user_data = dict(zip(headers, values))

        # Validate required fields
        if not user_data.get('username') or not user_data.get('password'):
            parse_errors.append(f'Line {i}: Username and password are required')
            continue

        # Empty or unsupported roles fall back to the default role rather
        # than rejecting the row.
        role = user_data.get('role', '').lower()
        if not role or role not in supported_roles:
            role = default_role

        users_to_create.append({
            'username': user_data['username'],
            'password': user_data['password'],
            'role': role
        })

    if not users_to_create and parse_errors:
        raise fastapi.HTTPException(
            status_code=400,
            detail=f'No valid users found. Errors: {"; ".join(parse_errors)}')

    # Create users
    success_count = 0
    error_count = 0
    creation_errors = []

    for user_data in users_to_create:
        try:
            username = user_data['username']
            password = user_data['password']
            role = user_data['role']

            user_hash = hashlib.md5(
                username.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]

            with _user_lock(user_hash):
                # Check for an existing user while holding the per-user
                # lock (as user_create does), so a concurrent request for
                # the same username cannot slip in between the check and
                # the insert and get overwritten.
                if global_user_state.get_user_by_name(username):
                    error_count += 1
                    creation_errors.append(f'{username}: User already exists')
                    continue

                # Check if password is already hashed (APR1 hash)
                if password.startswith('$apr1$'):
                    # Password is already hashed, use it directly
                    password_hash = password
                else:
                    # Password is plain text, hash it
                    password_hash = apr_md5_crypt.hash(password)

                global_user_state.add_or_update_user(
                    models.User(id=user_hash,
                                name=username,
                                password=password_hash))
                permission.permission_service.update_role(user_hash, role)

            success_count += 1

        except Exception as e:  # pylint: disable=broad-except
            # Best effort: one failing row must not abort the whole import.
            error_count += 1
            creation_errors.append(f'{user_data["username"]}: {str(e)}')

    return {
        'success_count': success_count,
        'error_count': error_count,
        'total_processed': len(users_to_create),
        'parse_errors': parse_errors,
        'creation_errors': creation_errors
    }
|
283
|
+
|
284
|
+
|
285
|
+
@router.get('/export')
async def user_export() -> Dict[str, Any]:
    """Export all users as CSV content.

    Each row is ``username,password,role``, where ``password`` is whatever
    is stored on the user record. A user without any role is exported with
    the default role. Missing (falsy) fields are written as empty strings.

    Returns:
        A dict with ``csv_content`` (header plus one line per user, joined
        by newlines) and ``user_count``.

    Raises:
        fastapi.HTTPException: 500 if anything fails while exporting.
    """
    try:
        all_users = global_user_state.get_all_users()

        rows = ['username,password,role']  # Header
        for user in all_users:
            roles = permission.permission_service.get_user_roles(user.id)
            role = roles[0] if roles else rbac.get_default_role()
            # Avoid exporting `None` values: falsy fields become ''.
            fields = (user.name or '', user.password or '', role or '')
            rows.append(','.join(fields))

        return {'csv_content': '\n'.join(rows), 'user_count': len(all_users)}

    except Exception as e:  # pylint: disable=broad-except
        raise fastapi.HTTPException(status_code=500,
                                    detail=f'Failed to export users: {str(e)}')
|
318
|
+
|
65
319
|
|
66
|
-
|
67
|
-
|
320
|
+
@contextlib.contextmanager
def _user_lock(user_id: str) -> Generator[None, None, None]:
    """Context manager holding the per-user file lock.

    The lock file path is ``USER_LOCK_PATH`` formatted with ``user_id``;
    acquisition waits at most ``USER_LOCK_TIMEOUT_SECONDS``.

    Args:
        user_id: Id of the user whose state is about to be modified.

    Raises:
        RuntimeError: If the lock cannot be acquired within the timeout.
    """
    lock_path = USER_LOCK_PATH.format(user_id=user_id)
    lock = filelock.FileLock(lock_path, USER_LOCK_TIMEOUT_SECONDS)
    # Only the acquisition is guarded: a `filelock.Timeout` raised by the
    # caller's body (e.g. from some other lock) must not be reported as a
    # failure to acquire this lock.
    try:
        lock.acquire()
    except filelock.Timeout as e:
        raise RuntimeError(f'Failed to update user due to a timeout '
                           f'when trying to acquire the lock at '
                           f'{lock_path}. '
                           'Please try again or manually remove the lock '
                           f'file if you believe it is stale.') from e
    try:
        yield
    finally:
        lock.release()
|
sky/utils/command_runner.py
CHANGED
@@ -561,7 +561,7 @@ class SSHCommandRunner(CommandRunner):
|
|
561
561
|
if self.ssh_control_name is not None:
|
562
562
|
control_path = _ssh_control_path(self.ssh_control_name)
|
563
563
|
if control_path is not None:
|
564
|
-
# Suppress the `Exit request sent.` output for this
|
564
|
+
# Suppress the `Exit request sent.` output for this command
|
565
565
|
# which would interrupt the CLI spinner.
|
566
566
|
cmd = (f'ssh -O exit -S {control_path}/%C '
|
567
567
|
f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')
|
sky/utils/common_utils.py
CHANGED
@@ -343,30 +343,31 @@ def get_pretty_entrypoint_cmd() -> str:
|
|
343
343
|
# things like 'examples/app.py'.
|
344
344
|
argv[0] = basename
|
345
345
|
|
346
|
-
# Redact sensitive
|
347
|
-
argv =
|
346
|
+
# Redact sensitive values from secrets arguments
|
347
|
+
argv = _redact_secrets_values(argv)
|
348
348
|
|
349
349
|
return ' '.join(argv)
|
350
350
|
|
351
351
|
|
352
|
-
def
|
353
|
-
"""Redact sensitive values from --
|
352
|
+
def _redact_secrets_values(argv: List[str]) -> List[str]:
|
353
|
+
"""Redact sensitive values from --secret arguments.
|
354
354
|
|
355
355
|
Args:
|
356
356
|
argv: Command line arguments
|
357
357
|
|
358
358
|
Returns:
|
359
|
-
Modified argv with redacted --
|
359
|
+
Modified argv with redacted --secret values, or original argv if any
|
360
|
+
error
|
360
361
|
|
361
362
|
Examples:
|
362
|
-
['sky', 'launch', '--
|
363
|
-
['sky', 'launch', '--
|
363
|
+
['sky', 'launch', '--secret', 'HF_TOKEN=secret'] ->
|
364
|
+
['sky', 'launch', '--secret', 'HF_TOKEN=<redacted>']
|
364
365
|
|
365
|
-
['sky', 'launch', '--
|
366
|
-
['sky', 'launch', '--
|
366
|
+
['sky', 'launch', '--secret=HF_TOKEN=secret'] ->
|
367
|
+
['sky', 'launch', '--secret=HF_TOKEN=<redacted>']
|
367
368
|
|
368
|
-
['sky', 'launch', '--
|
369
|
-
['sky', 'launch', '--
|
369
|
+
['sky', 'launch', '--secret', 'HF_TOKEN'] ->
|
370
|
+
['sky', 'launch', '--secret', 'HF_TOKEN'] (no change)
|
370
371
|
"""
|
371
372
|
try:
|
372
373
|
if not argv:
|
@@ -384,7 +385,7 @@ def _redact_env_values(argv: List[str]) -> List[str]:
|
|
384
385
|
i += 1
|
385
386
|
continue
|
386
387
|
|
387
|
-
if arg == '--
|
388
|
+
if arg == '--secret' and i + 1 < len(argv):
|
388
389
|
result.append(arg)
|
389
390
|
next_arg = argv[i + 1]
|
390
391
|
# Ensure next_arg is a string and handle redaction safely
|
@@ -395,9 +396,10 @@ def _redact_env_values(argv: List[str]) -> List[str]:
|
|
395
396
|
else:
|
396
397
|
result.append(next_arg)
|
397
398
|
i += 2
|
398
|
-
elif arg.startswith('--
|
399
|
+
elif arg.startswith('--secret='):
|
399
400
|
# Redact only if there's a value after the key
|
400
|
-
redacted = re.sub(r'^(--
|
401
|
+
redacted = re.sub(r'^(--secret=[^=]+)=.*', r'\1=<redacted>',
|
402
|
+
arg)
|
401
403
|
result.append(redacted)
|
402
404
|
i += 1
|
403
405
|
else:
|
sky/utils/context.py
CHANGED
@@ -262,7 +262,7 @@ F = TypeVar('F', bound=Callable[..., Any])
|
|
262
262
|
|
263
263
|
|
264
264
|
def contextual(func: F) -> F:
|
265
|
-
"""Decorator to
|
265
|
+
"""Decorator to initialize a context before executing the function.
|
266
266
|
|
267
267
|
If a context is already initialized, this decorator will reset the context,
|
268
268
|
i.e. all contextual variables set previously will be cleared.
|
sky/utils/controller_utils.py
CHANGED
@@ -254,6 +254,13 @@ def _get_cloud_dependencies_installation_commands(
|
|
254
254
|
sky_check.get_cached_enabled_clouds_or_refresh(
|
255
255
|
sky_cloud.CloudCapability.STORAGE))
|
256
256
|
enabled_clouds = enabled_compute_clouds.union(enabled_storage_clouds)
|
257
|
+
enabled_k8s_and_ssh = [
|
258
|
+
repr(cloud)
|
259
|
+
for cloud in enabled_clouds
|
260
|
+
if isinstance(cloud, clouds.Kubernetes)
|
261
|
+
]
|
262
|
+
k8s_and_ssh_label = ' and '.join(sorted(enabled_k8s_and_ssh))
|
263
|
+
k8s_dependencies_installed = False
|
257
264
|
|
258
265
|
for cloud in enabled_clouds:
|
259
266
|
cloud_python_dependencies: List[str] = copy.deepcopy(
|
@@ -295,10 +302,11 @@ def _get_cloud_dependencies_installation_commands(
|
|
295
302
|
'--endpoint api.nebius.cloud '
|
296
303
|
'--service-account-file $HOME/.nebius/credentials.json '
|
297
304
|
'&> /dev/null || echo "Unable to create Nebius profile."')
|
298
|
-
elif isinstance(cloud, clouds.Kubernetes)
|
305
|
+
elif (isinstance(cloud, clouds.Kubernetes) and
|
306
|
+
not k8s_dependencies_installed):
|
299
307
|
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
300
308
|
commands.append(
|
301
|
-
f'echo -en "\\r{step_prefix}
|
309
|
+
f'echo -en "\\r{step_prefix}{k8s_and_ssh_label}{empty_str}" && '
|
302
310
|
# Install k8s + skypilot dependencies
|
303
311
|
'sudo bash -c "if '
|
304
312
|
'! command -v curl &> /dev/null || '
|
@@ -321,6 +329,7 @@ def _get_cloud_dependencies_installation_commands(
|
|
321
329
|
'kubectl /usr/local/bin/kubectl)) && '
|
322
330
|
f'echo -e \'#!/bin/bash\\nexport PATH="{kubernetes_constants.SKY_K8S_EXEC_AUTH_PATH}"\\nexec "$@"\' | sudo tee /usr/local/bin/{kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER} > /dev/null && ' # pylint: disable=line-too-long
|
323
331
|
f'sudo chmod +x /usr/local/bin/{kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER}') # pylint: disable=line-too-long
|
332
|
+
k8s_dependencies_installed = True
|
324
333
|
elif isinstance(cloud, clouds.Cudo):
|
325
334
|
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
|
326
335
|
commands.append(
|
@@ -422,7 +431,7 @@ def download_and_stream_latest_job_log(
|
|
422
431
|
return None
|
423
432
|
|
424
433
|
log_dir = list(log_dirs.values())[0]
|
425
|
-
log_file = os.path.join(log_dir, 'run.log')
|
434
|
+
log_file = os.path.expanduser(os.path.join(log_dir, 'run.log'))
|
426
435
|
|
427
436
|
# Print the logs to the console.
|
428
437
|
# TODO(zhwu): refactor this into log_utils, along with the refactoring for
|
sky/utils/dag_utils.py
CHANGED
@@ -66,7 +66,9 @@ def convert_entrypoint_to_dag(entrypoint: Any) -> 'dag_lib.Dag':
|
|
66
66
|
|
67
67
|
def _load_chain_dag(
|
68
68
|
configs: List[Dict[str, Any]],
|
69
|
-
env_overrides: Optional[List[Tuple[str, str]]] = None
|
69
|
+
env_overrides: Optional[List[Tuple[str, str]]] = None,
|
70
|
+
secrets_overrides: Optional[List[Tuple[str,
|
71
|
+
str]]] = None) -> dag_lib.Dag:
|
70
72
|
"""Loads a chain DAG from a list of YAML configs."""
|
71
73
|
dag_name = None
|
72
74
|
if set(configs[0].keys()) == {'name'}:
|
@@ -84,7 +86,8 @@ def _load_chain_dag(
|
|
84
86
|
for task_config in configs:
|
85
87
|
if task_config is None:
|
86
88
|
continue
|
87
|
-
task = task_lib.Task.from_yaml_config(task_config, env_overrides
|
89
|
+
task = task_lib.Task.from_yaml_config(task_config, env_overrides,
|
90
|
+
secrets_overrides)
|
88
91
|
if current_task is not None:
|
89
92
|
current_task >> task # pylint: disable=pointless-statement
|
90
93
|
current_task = task
|
@@ -95,6 +98,7 @@ def _load_chain_dag(
|
|
95
98
|
def load_chain_dag_from_yaml(
|
96
99
|
path: str,
|
97
100
|
env_overrides: Optional[List[Tuple[str, str]]] = None,
|
101
|
+
secret_overrides: Optional[List[Tuple[str, str]]] = None,
|
98
102
|
) -> dag_lib.Dag:
|
99
103
|
"""Loads a chain DAG from a YAML file.
|
100
104
|
|
@@ -105,17 +109,22 @@ def load_chain_dag_from_yaml(
|
|
105
109
|
the task's 'envs' section. If it is a chain dag, the envs will be updated
|
106
110
|
for all tasks in the chain.
|
107
111
|
|
112
|
+
'secrets_overrides' is a list of (key, value) pairs that will be used to
|
113
|
+
update the task's 'secrets' section. If it is a chain dag, the secrets will
|
114
|
+
be updated for all tasks in the chain.
|
115
|
+
|
108
116
|
Returns:
|
109
117
|
A chain Dag with 1 or more tasks (an empty entrypoint would create a
|
110
118
|
trivial task).
|
111
119
|
"""
|
112
120
|
configs = common_utils.read_yaml_all(path)
|
113
|
-
return _load_chain_dag(configs, env_overrides)
|
121
|
+
return _load_chain_dag(configs, env_overrides, secret_overrides)
|
114
122
|
|
115
123
|
|
116
124
|
def load_chain_dag_from_yaml_str(
|
117
125
|
yaml_str: str,
|
118
126
|
env_overrides: Optional[List[Tuple[str, str]]] = None,
|
127
|
+
secrets_overrides: Optional[List[Tuple[str, str]]] = None,
|
119
128
|
) -> dag_lib.Dag:
|
120
129
|
"""Loads a chain DAG from a YAML string.
|
121
130
|
|
@@ -126,12 +135,16 @@ def load_chain_dag_from_yaml_str(
|
|
126
135
|
the task's 'envs' section. If it is a chain dag, the envs will be updated
|
127
136
|
for all tasks in the chain.
|
128
137
|
|
138
|
+
'secrets_overrides' is a list of (key, value) pairs that will be used to
|
139
|
+
update the task's 'secrets' section. If it is a chain dag, the secrets will
|
140
|
+
be updated for all tasks in the chain.
|
141
|
+
|
129
142
|
Returns:
|
130
143
|
A chain Dag with 1 or more tasks (an empty entrypoint would create a
|
131
144
|
trivial task).
|
132
145
|
"""
|
133
146
|
configs = common_utils.read_yaml_all_str(yaml_str)
|
134
|
-
return _load_chain_dag(configs, env_overrides)
|
147
|
+
return _load_chain_dag(configs, env_overrides, secrets_overrides)
|
135
148
|
|
136
149
|
|
137
150
|
def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag) -> str:
|
@@ -207,9 +207,16 @@ def prepare_hosts_info(cluster_name: str,
|
|
207
207
|
|
208
208
|
# Get cluster-level defaults
|
209
209
|
cluster_user = cluster_config.get('user', '')
|
210
|
-
cluster_identity_file =
|
210
|
+
cluster_identity_file = os.path.expanduser(
|
211
|
+
cluster_config.get('identity_file', ''))
|
211
212
|
cluster_password = cluster_config.get('password', '')
|
212
213
|
|
214
|
+
# Check if cluster identity file exists
|
215
|
+
if cluster_identity_file and not os.path.isfile(cluster_identity_file):
|
216
|
+
with ux_utils.print_exception_no_traceback():
|
217
|
+
raise ValueError(
|
218
|
+
f'SSH Identity File Missing: {cluster_identity_file}')
|
219
|
+
|
213
220
|
hosts_info = []
|
214
221
|
for host in cluster_config['hosts']:
|
215
222
|
# Host can be a string (IP or SSH config hostname) or a dict
|
@@ -239,10 +246,16 @@ def prepare_hosts_info(cluster_name: str,
|
|
239
246
|
# Use host-specific values or fall back to cluster defaults
|
240
247
|
host_user = '' if is_ssh_config_host else host.get(
|
241
248
|
'user', cluster_user)
|
242
|
-
host_identity_file =
|
243
|
-
'
|
249
|
+
host_identity_file = os.path.expanduser(
|
250
|
+
'' if is_ssh_config_host else host.
|
251
|
+
get('identity_file', cluster_identity_file))
|
244
252
|
host_password = host.get('password', cluster_password)
|
245
253
|
|
254
|
+
if host_identity_file and not os.path.isfile(host_identity_file):
|
255
|
+
with ux_utils.print_exception_no_traceback():
|
256
|
+
raise ValueError(
|
257
|
+
f'SSH Identity File Missing: {host_identity_file}')
|
258
|
+
|
246
259
|
hosts_info.append({
|
247
260
|
'ip': host['ip'],
|
248
261
|
'user': host_user,
|
@@ -836,10 +849,6 @@ def deploy_cluster(head_node,
|
|
836
849
|
|
837
850
|
Returns: List of unsuccessful worker nodes.
|
838
851
|
"""
|
839
|
-
# Ensure SSH key is expanded for paths with ~ (home directory)
|
840
|
-
if ssh_key:
|
841
|
-
ssh_key = os.path.expanduser(ssh_key)
|
842
|
-
|
843
852
|
history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
|
844
853
|
f'{context_name}-history.yaml')
|
845
854
|
cert_file_path = os.path.join(NODE_POOLS_INFO_DIR,
|
@@ -1091,7 +1100,7 @@ def deploy_cluster(head_node,
|
|
1091
1100
|
f'Skipping...{NC}')
|
1092
1101
|
return node, True, False
|
1093
1102
|
worker_user = worker_hosts[i]['user']
|
1094
|
-
worker_key =
|
1103
|
+
worker_key = worker_hosts[i]['identity_file']
|
1095
1104
|
worker_password = worker_hosts[i]['password']
|
1096
1105
|
worker_askpass = create_askpass_script(worker_password)
|
1097
1106
|
worker_config = worker_use_ssh_config[i]
|