skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sky/__init__.py +2 -4
  2. sky/backends/backend_utils.py +7 -0
  3. sky/backends/cloud_vm_ray_backend.py +91 -96
  4. sky/cli.py +5 -6311
  5. sky/client/cli.py +66 -639
  6. sky/client/sdk.py +22 -2
  7. sky/clouds/kubernetes.py +8 -0
  8. sky/clouds/scp.py +7 -26
  9. sky/clouds/utils/scp_utils.py +177 -124
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
  14. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  16. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
  18. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
  19. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  20. sky/dashboard/out/clusters/[cluster].html +1 -1
  21. sky/dashboard/out/clusters.html +1 -1
  22. sky/dashboard/out/config.html +1 -1
  23. sky/dashboard/out/index.html +1 -1
  24. sky/dashboard/out/infra/[context].html +1 -1
  25. sky/dashboard/out/infra.html +1 -1
  26. sky/dashboard/out/jobs/[job].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/global_user_state.py +50 -11
  33. sky/jobs/controller.py +98 -31
  34. sky/jobs/scheduler.py +37 -29
  35. sky/jobs/server/core.py +36 -3
  36. sky/jobs/state.py +69 -9
  37. sky/jobs/utils.py +11 -0
  38. sky/logs/__init__.py +17 -0
  39. sky/logs/agent.py +73 -0
  40. sky/logs/gcp.py +91 -0
  41. sky/models.py +1 -0
  42. sky/provision/__init__.py +1 -0
  43. sky/provision/instance_setup.py +35 -0
  44. sky/provision/provisioner.py +11 -0
  45. sky/provision/scp/__init__.py +15 -0
  46. sky/provision/scp/config.py +93 -0
  47. sky/provision/scp/instance.py +528 -0
  48. sky/resources.py +164 -29
  49. sky/server/common.py +21 -9
  50. sky/server/requests/payloads.py +19 -1
  51. sky/server/server.py +121 -29
  52. sky/setup_files/dependencies.py +11 -1
  53. sky/skylet/constants.py +48 -1
  54. sky/skylet/job_lib.py +83 -19
  55. sky/task.py +171 -21
  56. sky/templates/kubernetes-ray.yml.j2 +60 -4
  57. sky/templates/scp-ray.yml.j2 +3 -50
  58. sky/users/permission.py +47 -34
  59. sky/users/rbac.py +10 -1
  60. sky/users/server.py +274 -9
  61. sky/utils/command_runner.py +1 -1
  62. sky/utils/common_utils.py +16 -14
  63. sky/utils/context.py +1 -1
  64. sky/utils/controller_utils.py +12 -3
  65. sky/utils/dag_utils.py +17 -4
  66. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  67. sky/utils/schemas.py +83 -5
  68. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
  69. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
  70. sky/benchmark/__init__.py +0 -0
  71. sky/benchmark/benchmark_state.py +0 -295
  72. sky/benchmark/benchmark_utils.py +0 -641
  73. sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
  74. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  75. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  76. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
  78. sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
  79. sky/skylet/providers/scp/__init__.py +0 -2
  80. sky/skylet/providers/scp/config.py +0 -149
  81. sky/skylet/providers/scp/node_provider.py +0 -578
  82. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
  83. /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
  84. /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
  85. /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
  86. /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
  87. /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
  88. /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
  89. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
  90. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
  91. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
  92. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
sky/users/server.py CHANGED
@@ -1,19 +1,30 @@
1
1
  """REST API for workspace management."""
2
2
 
3
- from typing import Any, Dict, List
3
+ import contextlib
4
+ import hashlib
5
+ import os
6
+ from typing import Any, Dict, Generator, List
4
7
 
5
8
  import fastapi
9
+ import filelock
10
+ from passlib.hash import apr_md5_crypt
6
11
 
7
12
  from sky import global_user_state
13
+ from sky import models
8
14
  from sky import sky_logging
9
15
  from sky.server.requests import payloads
10
16
  from sky.skylet import constants
11
17
  from sky.users import permission
12
18
  from sky.users import rbac
13
19
  from sky.utils import common
20
+ from sky.utils import common_utils
14
21
 
15
22
  logger = sky_logging.init_logger(__name__)
16
23
 
24
+ # Filelocks for the user management.
25
+ USER_LOCK_PATH = os.path.expanduser('~/.sky/.{user_id}.lock')
26
+ USER_LOCK_TIMEOUT_SECONDS = 20
27
+
17
28
  router = fastapi.APIRouter()
18
29
 
19
30
 
@@ -39,29 +50,283 @@ async def get_current_user_role(request: fastapi.Request):
39
50
  # hash for the request without 'X-Auth-Request-Email' header?
40
51
  auth_user = request.state.auth_user
41
52
  if auth_user is None:
42
- return {'name': '', 'role': rbac.RoleName.ADMIN.value}
53
+ return {'id': '', 'name': '', 'role': rbac.RoleName.ADMIN.value}
43
54
  user_roles = permission.permission_service.get_user_roles(auth_user.id)
44
- return {'name': auth_user.name, 'role': user_roles[0] if user_roles else ''}
55
+ return {
56
+ 'id': auth_user.id,
57
+ 'name': auth_user.name,
58
+ 'role': user_roles[0] if user_roles else ''
59
+ }
60
+
61
+
62
+ @router.post('/create')
63
+ async def user_create(user_create_body: payloads.UserCreateBody) -> None:
64
+ username = user_create_body.username
65
+ password = user_create_body.password
66
+ role = user_create_body.role
67
+
68
+ if not username or not password:
69
+ raise fastapi.HTTPException(status_code=400,
70
+ detail='Username and password are required')
71
+ if role and role not in rbac.get_supported_roles():
72
+ raise fastapi.HTTPException(status_code=400,
73
+ detail=f'Invalid role: {role}')
74
+
75
+ if not role:
76
+ role = rbac.get_default_role()
77
+
78
+ # Create user
79
+ password_hash = apr_md5_crypt.hash(password)
80
+ user_hash = hashlib.md5(
81
+ username.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]
82
+ with _user_lock(user_hash):
83
+ # Check if user already exists
84
+ if global_user_state.get_user_by_name(username):
85
+ raise fastapi.HTTPException(
86
+ status_code=400, detail=f'User {username!r} already exists')
87
+ global_user_state.add_or_update_user(
88
+ models.User(id=user_hash, name=username, password=password_hash))
89
+ permission.permission_service.update_role(user_hash, role)
45
90
 
46
91
 
47
92
  @router.post('/update')
48
- async def user_update(user_update_body: payloads.UserUpdateBody) -> None:
93
+ async def user_update(request: fastapi.Request,
94
+ user_update_body: payloads.UserUpdateBody) -> None:
49
95
  """Updates the user role."""
50
96
  user_id = user_update_body.user_id
51
97
  role = user_update_body.role
98
+ password = user_update_body.password
52
99
  supported_roles = rbac.get_supported_roles()
53
- if role not in supported_roles:
100
+ if role and role not in supported_roles:
54
101
  raise fastapi.HTTPException(status_code=400,
55
102
  detail=f'Invalid role: {role}')
103
+ target_user_roles = permission.permission_service.get_user_roles(user_id)
104
+ need_update_role = role and (not target_user_roles or
105
+ (role != target_user_roles[0]))
106
+ current_user = request.state.auth_user
107
+ if current_user is not None:
108
+ current_user_roles = permission.permission_service.get_user_roles(
109
+ current_user.id)
110
+ if not current_user_roles:
111
+ raise fastapi.HTTPException(status_code=403, detail='Invalid user')
112
+ if current_user_roles[0] != rbac.RoleName.ADMIN.value:
113
+ if need_update_role:
114
+ raise fastapi.HTTPException(
115
+ status_code=403, detail='Only admin can update user role')
116
+ if password and user_id != current_user.id:
117
+ raise fastapi.HTTPException(
118
+ status_code=403,
119
+ detail='Only admin can update password for other users')
56
120
  user_info = global_user_state.get_user(user_id)
57
121
  if user_info is None:
58
122
  raise fastapi.HTTPException(status_code=400,
59
123
  detail=f'User {user_id} does not exist')
60
- # Disallow updating roles for the internal users.
61
- if user_info.id in [common.SERVER_ID, constants.SKYPILOT_SYSTEM_USER_ID]:
124
+ # Disallow updating the internal users.
125
+ if need_update_role and user_info.id in [
126
+ common.SERVER_ID, constants.SKYPILOT_SYSTEM_USER_ID
127
+ ]:
62
128
  raise fastapi.HTTPException(status_code=400,
63
129
  detail=f'Cannot update role for internal '
64
130
  f'API server user {user_info.name}')
131
+ if password and user_info.id == constants.SKYPILOT_SYSTEM_USER_ID:
132
+ raise fastapi.HTTPException(
133
+ status_code=400,
134
+ detail=f'Cannot update password for internal '
135
+ f'API server user {user_info.name}')
136
+
137
+ with _user_lock(user_info.id):
138
+ if password:
139
+ password_hash = apr_md5_crypt.hash(password)
140
+ global_user_state.add_or_update_user(
141
+ models.User(id=user_info.id,
142
+ name=user_info.name,
143
+ password=password_hash))
144
+ if role and need_update_role:
145
+ # Update user role in casbin policy
146
+ permission.permission_service.update_role(user_info.id, role)
147
+
148
+
149
+ @router.post('/delete')
150
+ async def user_delete(user_delete_body: payloads.UserDeleteBody) -> None:
151
+ user_id = user_delete_body.user_id
152
+
153
+ user_info = global_user_state.get_user(user_id)
154
+ if user_info is None:
155
+ raise fastapi.HTTPException(status_code=400,
156
+ detail=f'User {user_id} does not exist')
157
+ # Disallow deleting the internal users.
158
+ if user_info.id in [common.SERVER_ID, constants.SKYPILOT_SYSTEM_USER_ID]:
159
+ raise fastapi.HTTPException(status_code=400,
160
+ detail=f'Cannot delete internal '
161
+ f'API server user {user_info.name}')
162
+ with _user_lock(user_id):
163
+ global_user_state.delete_user(user_id)
164
+ permission.permission_service.delete_user(user_id)
165
+
166
+
167
+ @router.post('/import')
168
+ async def user_import(
169
+ user_import_body: payloads.UserImportBody) -> Dict[str, Any]:
170
+ """Import users from CSV content."""
171
+ csv_content = user_import_body.csv_content
172
+
173
+ if not csv_content:
174
+ raise fastapi.HTTPException(status_code=400,
175
+ detail='CSV content is required')
176
+
177
+ # Parse CSV content
178
+ lines = csv_content.strip().split('\n')
179
+ if len(lines) < 2:
180
+ raise fastapi.HTTPException(
181
+ status_code=400,
182
+ detail='CSV must have at least a header row and one data row')
183
+
184
+ # Parse headers
185
+ headers = [h.strip().lower() for h in lines[0].split(',')]
186
+ required_headers = ['username', 'password', 'role']
187
+
188
+ # Check if all required headers are present
189
+ missing_headers = [
190
+ header for header in required_headers if header not in headers
191
+ ]
192
+ if missing_headers:
193
+ raise fastapi.HTTPException(
194
+ status_code=400,
195
+ detail=f'Missing required columns: {", ".join(missing_headers)}')
196
+
197
+ # Parse user data
198
+ users_to_create = []
199
+ parse_errors = []
200
+
201
+ for i, line in enumerate(lines[1:], start=2):
202
+ if not line.strip():
203
+ continue # Skip empty lines
204
+
205
+ values = [v.strip() for v in line.split(',')]
206
+ if len(values) != len(headers):
207
+ parse_errors.append(f'Line {i}: Invalid number of columns')
208
+ continue
209
+
210
+ user_data = dict(zip(headers, values))
211
+
212
+ # Validate required fields
213
+ if not user_data.get('username') or not user_data.get('password'):
214
+ parse_errors.append(f'Line {i}: Username and password are required')
215
+ continue
216
+
217
+ # Validate role
218
+ role = user_data.get('role', '').lower()
219
+ if role and role not in rbac.get_supported_roles():
220
+ role = rbac.get_default_role() # Default to default role if invalid
221
+ elif not role:
222
+ role = rbac.get_default_role()
223
+
224
+ users_to_create.append({
225
+ 'username': user_data['username'],
226
+ 'password': user_data['password'],
227
+ 'role': role
228
+ })
229
+
230
+ if not users_to_create and parse_errors:
231
+ raise fastapi.HTTPException(
232
+ status_code=400,
233
+ detail=f'No valid users found. Errors: {"; ".join(parse_errors)}')
234
+
235
+ # Create users
236
+ success_count = 0
237
+ error_count = 0
238
+ creation_errors = []
239
+
240
+ for user_data in users_to_create:
241
+ try:
242
+ username = user_data['username']
243
+ password = user_data['password']
244
+ role = user_data['role']
245
+
246
+ # Check if user already exists
247
+ if global_user_state.get_user_by_name(username):
248
+ error_count += 1
249
+ creation_errors.append(f'{username}: User already exists')
250
+ continue
251
+
252
+ # Check if password is already hashed (APR1 hash)
253
+ if password.startswith('$apr1$'):
254
+ # Password is already hashed, use it directly
255
+ password_hash = password
256
+ else:
257
+ # Password is plain text, hash it
258
+ password_hash = apr_md5_crypt.hash(password)
259
+
260
+ user_hash = hashlib.md5(
261
+ username.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]
262
+
263
+ with _user_lock(user_hash):
264
+ global_user_state.add_or_update_user(
265
+ models.User(id=user_hash,
266
+ name=username,
267
+ password=password_hash))
268
+ permission.permission_service.update_role(user_hash, role)
269
+
270
+ success_count += 1
271
+
272
+ except Exception as e: # pylint: disable=broad-except
273
+ error_count += 1
274
+ creation_errors.append(f'{user_data["username"]}: {str(e)}')
275
+
276
+ return {
277
+ 'success_count': success_count,
278
+ 'error_count': error_count,
279
+ 'total_processed': len(users_to_create),
280
+ 'parse_errors': parse_errors,
281
+ 'creation_errors': creation_errors
282
+ }
283
+
284
+
285
+ @router.get('/export')
286
+ async def user_export() -> Dict[str, Any]:
287
+ """Export all users as CSV content."""
288
+ try:
289
+ # Get all users
290
+ user_list = global_user_state.get_all_users()
291
+
292
+ # Create CSV content
293
+ csv_lines = ['username,password,role'] # Header
294
+
295
+ for user in user_list:
296
+ # Get user role
297
+ user_roles = permission.permission_service.get_user_roles(user.id)
298
+ role = user_roles[0] if user_roles else rbac.get_default_role()
299
+ # Avoid exporting `None` values
300
+ line = ''
301
+ if user.name:
302
+ line += user.name
303
+ line += ','
304
+ if user.password:
305
+ line += user.password
306
+ line += ','
307
+ if role:
308
+ line += role
309
+ csv_lines.append(line)
310
+
311
+ csv_content = '\n'.join(csv_lines)
312
+
313
+ return {'csv_content': csv_content, 'user_count': len(user_list)}
314
+
315
+ except Exception as e:
316
+ raise fastapi.HTTPException(status_code=500,
317
+ detail=f'Failed to export users: {str(e)}')
318
+
65
319
 
66
- # Update user role in casbin policy
67
- permission.permission_service.update_role(user_info.id, role)
320
+ @contextlib.contextmanager
321
+ def _user_lock(user_id: str) -> Generator[None, None, None]:
322
+ """Context manager for user lock."""
323
+ try:
324
+ with filelock.FileLock(USER_LOCK_PATH.format(user_id=user_id),
325
+ USER_LOCK_TIMEOUT_SECONDS):
326
+ yield
327
+ except filelock.Timeout as e:
328
+ raise RuntimeError(f'Failed to update user due to a timeout '
329
+ f'when trying to acquire the lock at '
330
+ f'{USER_LOCK_PATH.format(user_id=user_id)}. '
331
+ 'Please try again or manually remove the lock '
332
+ f'file if you believe it is stale.') from e
sky/utils/command_runner.py CHANGED
@@ -561,7 +561,7 @@ class SSHCommandRunner(CommandRunner):
561
561
  if self.ssh_control_name is not None:
562
562
  control_path = _ssh_control_path(self.ssh_control_name)
563
563
  if control_path is not None:
564
- # Suppress the `Exit request sent.` output for this comamnd
564
+ # Suppress the `Exit request sent.` output for this command
565
565
  # which would interrupt the CLI spinner.
566
566
  cmd = (f'ssh -O exit -S {control_path}/%C '
567
567
  f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')
sky/utils/common_utils.py CHANGED
@@ -343,30 +343,31 @@ def get_pretty_entrypoint_cmd() -> str:
343
343
  # things like 'examples/app.py'.
344
344
  argv[0] = basename
345
345
 
346
- # Redact sensitive environment variable values
347
- argv = _redact_env_values(argv)
346
+ # Redact sensitive values from secrets arguments
347
+ argv = _redact_secrets_values(argv)
348
348
 
349
349
  return ' '.join(argv)
350
350
 
351
351
 
352
- def _redact_env_values(argv: List[str]) -> List[str]:
353
- """Redact sensitive values from --env arguments.
352
+ def _redact_secrets_values(argv: List[str]) -> List[str]:
353
+ """Redact sensitive values from --secret arguments.
354
354
 
355
355
  Args:
356
356
  argv: Command line arguments
357
357
 
358
358
  Returns:
359
- Modified argv with redacted --env values, or original argv if any error
359
+ Modified argv with redacted --secret values, or original argv if any
360
+ error
360
361
 
361
362
  Examples:
362
- ['sky', 'launch', '--env', 'HF_TOKEN=secret'] ->
363
- ['sky', 'launch', '--env', 'HF_TOKEN=<redacted>']
363
+ ['sky', 'launch', '--secret', 'HF_TOKEN=secret'] ->
364
+ ['sky', 'launch', '--secret', 'HF_TOKEN=<redacted>']
364
365
 
365
- ['sky', 'launch', '--env=HF_TOKEN=secret'] ->
366
- ['sky', 'launch', '--env=HF_TOKEN=<redacted>']
366
+ ['sky', 'launch', '--secret=HF_TOKEN=secret'] ->
367
+ ['sky', 'launch', '--secret=HF_TOKEN=<redacted>']
367
368
 
368
- ['sky', 'launch', '--env', 'HF_TOKEN'] ->
369
- ['sky', 'launch', '--env', 'HF_TOKEN'] (no change)
369
+ ['sky', 'launch', '--secret', 'HF_TOKEN'] ->
370
+ ['sky', 'launch', '--secret', 'HF_TOKEN'] (no change)
370
371
  """
371
372
  try:
372
373
  if not argv:
@@ -384,7 +385,7 @@ def _redact_env_values(argv: List[str]) -> List[str]:
384
385
  i += 1
385
386
  continue
386
387
 
387
- if arg == '--env' and i + 1 < len(argv):
388
+ if arg == '--secret' and i + 1 < len(argv):
388
389
  result.append(arg)
389
390
  next_arg = argv[i + 1]
390
391
  # Ensure next_arg is a string and handle redaction safely
@@ -395,9 +396,10 @@ def _redact_env_values(argv: List[str]) -> List[str]:
395
396
  else:
396
397
  result.append(next_arg)
397
398
  i += 2
398
- elif arg.startswith('--env='):
399
+ elif arg.startswith('--secret='):
399
400
  # Redact only if there's a value after the key
400
- redacted = re.sub(r'^(--env=[^=]+)=.*', r'\1=<redacted>', arg)
401
+ redacted = re.sub(r'^(--secret=[^=]+)=.*', r'\1=<redacted>',
402
+ arg)
401
403
  result.append(redacted)
402
404
  i += 1
403
405
  else:
sky/utils/context.py CHANGED
@@ -262,7 +262,7 @@ F = TypeVar('F', bound=Callable[..., Any])
262
262
 
263
263
 
264
264
  def contextual(func: F) -> F:
265
- """Decorator to intiailize a context before executing the function.
265
+ """Decorator to initialize a context before executing the function.
266
266
 
267
267
  If a context is already initialized, this decorator will reset the context,
268
268
  i.e. all contextual variables set previously will be cleared.
sky/utils/controller_utils.py CHANGED
@@ -254,6 +254,13 @@ def _get_cloud_dependencies_installation_commands(
254
254
  sky_check.get_cached_enabled_clouds_or_refresh(
255
255
  sky_cloud.CloudCapability.STORAGE))
256
256
  enabled_clouds = enabled_compute_clouds.union(enabled_storage_clouds)
257
+ enabled_k8s_and_ssh = [
258
+ repr(cloud)
259
+ for cloud in enabled_clouds
260
+ if isinstance(cloud, clouds.Kubernetes)
261
+ ]
262
+ k8s_and_ssh_label = ' and '.join(sorted(enabled_k8s_and_ssh))
263
+ k8s_dependencies_installed = False
257
264
 
258
265
  for cloud in enabled_clouds:
259
266
  cloud_python_dependencies: List[str] = copy.deepcopy(
@@ -295,10 +302,11 @@ def _get_cloud_dependencies_installation_commands(
295
302
  '--endpoint api.nebius.cloud '
296
303
  '--service-account-file $HOME/.nebius/credentials.json '
297
304
  '&> /dev/null || echo "Unable to create Nebius profile."')
298
- elif isinstance(cloud, clouds.Kubernetes):
305
+ elif (isinstance(cloud, clouds.Kubernetes) and
306
+ not k8s_dependencies_installed):
299
307
  step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
300
308
  commands.append(
301
- f'echo -en "\\r{step_prefix}Kubernetes{empty_str}" && '
309
+ f'echo -en "\\r{step_prefix}{k8s_and_ssh_label}{empty_str}" && '
302
310
  # Install k8s + skypilot dependencies
303
311
  'sudo bash -c "if '
304
312
  '! command -v curl &> /dev/null || '
@@ -321,6 +329,7 @@ def _get_cloud_dependencies_installation_commands(
321
329
  'kubectl /usr/local/bin/kubectl)) && '
322
330
  f'echo -e \'#!/bin/bash\\nexport PATH="{kubernetes_constants.SKY_K8S_EXEC_AUTH_PATH}"\\nexec "$@"\' | sudo tee /usr/local/bin/{kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER} > /dev/null && ' # pylint: disable=line-too-long
323
331
  f'sudo chmod +x /usr/local/bin/{kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER}') # pylint: disable=line-too-long
332
+ k8s_dependencies_installed = True
324
333
  elif isinstance(cloud, clouds.Cudo):
325
334
  step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
326
335
  commands.append(
@@ -422,7 +431,7 @@ def download_and_stream_latest_job_log(
422
431
  return None
423
432
 
424
433
  log_dir = list(log_dirs.values())[0]
425
- log_file = os.path.join(log_dir, 'run.log')
434
+ log_file = os.path.expanduser(os.path.join(log_dir, 'run.log'))
426
435
 
427
436
  # Print the logs to the console.
428
437
  # TODO(zhwu): refactor this into log_utils, along with the refactoring for
sky/utils/dag_utils.py CHANGED
@@ -66,7 +66,9 @@ def convert_entrypoint_to_dag(entrypoint: Any) -> 'dag_lib.Dag':
66
66
 
67
67
  def _load_chain_dag(
68
68
  configs: List[Dict[str, Any]],
69
- env_overrides: Optional[List[Tuple[str, str]]] = None) -> dag_lib.Dag:
69
+ env_overrides: Optional[List[Tuple[str, str]]] = None,
70
+ secrets_overrides: Optional[List[Tuple[str,
71
+ str]]] = None) -> dag_lib.Dag:
70
72
  """Loads a chain DAG from a list of YAML configs."""
71
73
  dag_name = None
72
74
  if set(configs[0].keys()) == {'name'}:
@@ -84,7 +86,8 @@ def _load_chain_dag(
84
86
  for task_config in configs:
85
87
  if task_config is None:
86
88
  continue
87
- task = task_lib.Task.from_yaml_config(task_config, env_overrides)
89
+ task = task_lib.Task.from_yaml_config(task_config, env_overrides,
90
+ secrets_overrides)
88
91
  if current_task is not None:
89
92
  current_task >> task # pylint: disable=pointless-statement
90
93
  current_task = task
@@ -95,6 +98,7 @@ def _load_chain_dag(
95
98
  def load_chain_dag_from_yaml(
96
99
  path: str,
97
100
  env_overrides: Optional[List[Tuple[str, str]]] = None,
101
+ secret_overrides: Optional[List[Tuple[str, str]]] = None,
98
102
  ) -> dag_lib.Dag:
99
103
  """Loads a chain DAG from a YAML file.
100
104
 
@@ -105,17 +109,22 @@ def load_chain_dag_from_yaml(
105
109
  the task's 'envs' section. If it is a chain dag, the envs will be updated
106
110
  for all tasks in the chain.
107
111
 
112
+ 'secrets_overrides' is a list of (key, value) pairs that will be used to
113
+ update the task's 'secrets' section. If it is a chain dag, the secrets will
114
+ be updated for all tasks in the chain.
115
+
108
116
  Returns:
109
117
  A chain Dag with 1 or more tasks (an empty entrypoint would create a
110
118
  trivial task).
111
119
  """
112
120
  configs = common_utils.read_yaml_all(path)
113
- return _load_chain_dag(configs, env_overrides)
121
+ return _load_chain_dag(configs, env_overrides, secret_overrides)
114
122
 
115
123
 
116
124
  def load_chain_dag_from_yaml_str(
117
125
  yaml_str: str,
118
126
  env_overrides: Optional[List[Tuple[str, str]]] = None,
127
+ secrets_overrides: Optional[List[Tuple[str, str]]] = None,
119
128
  ) -> dag_lib.Dag:
120
129
  """Loads a chain DAG from a YAML string.
121
130
 
@@ -126,12 +135,16 @@ def load_chain_dag_from_yaml_str(
126
135
  the task's 'envs' section. If it is a chain dag, the envs will be updated
127
136
  for all tasks in the chain.
128
137
 
138
+ 'secrets_overrides' is a list of (key, value) pairs that will be used to
139
+ update the task's 'secrets' section. If it is a chain dag, the secrets will
140
+ be updated for all tasks in the chain.
141
+
129
142
  Returns:
130
143
  A chain Dag with 1 or more tasks (an empty entrypoint would create a
131
144
  trivial task).
132
145
  """
133
146
  configs = common_utils.read_yaml_all_str(yaml_str)
134
- return _load_chain_dag(configs, env_overrides)
147
+ return _load_chain_dag(configs, env_overrides, secrets_overrides)
135
148
 
136
149
 
137
150
  def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag) -> str:
sky/utils/kubernetes/deploy_remote_cluster.py CHANGED
@@ -207,9 +207,16 @@ def prepare_hosts_info(cluster_name: str,
207
207
 
208
208
  # Get cluster-level defaults
209
209
  cluster_user = cluster_config.get('user', '')
210
- cluster_identity_file = cluster_config.get('identity_file', '')
210
+ cluster_identity_file = os.path.expanduser(
211
+ cluster_config.get('identity_file', ''))
211
212
  cluster_password = cluster_config.get('password', '')
212
213
 
214
+ # Check if cluster identity file exists
215
+ if cluster_identity_file and not os.path.isfile(cluster_identity_file):
216
+ with ux_utils.print_exception_no_traceback():
217
+ raise ValueError(
218
+ f'SSH Identity File Missing: {cluster_identity_file}')
219
+
213
220
  hosts_info = []
214
221
  for host in cluster_config['hosts']:
215
222
  # Host can be a string (IP or SSH config hostname) or a dict
@@ -239,10 +246,16 @@ def prepare_hosts_info(cluster_name: str,
239
246
  # Use host-specific values or fall back to cluster defaults
240
247
  host_user = '' if is_ssh_config_host else host.get(
241
248
  'user', cluster_user)
242
- host_identity_file = '' if is_ssh_config_host else host.get(
243
- 'identity_file', cluster_identity_file)
249
+ host_identity_file = os.path.expanduser(
250
+ '' if is_ssh_config_host else host.
251
+ get('identity_file', cluster_identity_file))
244
252
  host_password = host.get('password', cluster_password)
245
253
 
254
+ if host_identity_file and not os.path.isfile(host_identity_file):
255
+ with ux_utils.print_exception_no_traceback():
256
+ raise ValueError(
257
+ f'SSH Identity File Missing: {host_identity_file}')
258
+
246
259
  hosts_info.append({
247
260
  'ip': host['ip'],
248
261
  'user': host_user,
@@ -836,10 +849,6 @@ def deploy_cluster(head_node,
836
849
 
837
850
  Returns: List of unsuccessful worker nodes.
838
851
  """
839
- # Ensure SSH key is expanded for paths with ~ (home directory)
840
- if ssh_key:
841
- ssh_key = os.path.expanduser(ssh_key)
842
-
843
852
  history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
844
853
  f'{context_name}-history.yaml')
845
854
  cert_file_path = os.path.join(NODE_POOLS_INFO_DIR,
@@ -1091,7 +1100,7 @@ def deploy_cluster(head_node,
1091
1100
  f'Skipping...{NC}')
1092
1101
  return node, True, False
1093
1102
  worker_user = worker_hosts[i]['user']
1094
- worker_key = os.path.expanduser(worker_hosts[i]['identity_file'])
1103
+ worker_key = worker_hosts[i]['identity_file']
1095
1104
  worker_password = worker_hosts[i]['password']
1096
1105
  worker_askpass = create_askpass_script(worker_password)
1097
1106
  worker_config = worker_use_ssh_config[i]