skypilot-nightly 1.0.0.dev20250523__py3-none-any.whl → 1.0.0.dev20250526__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +62 -45
  3. sky/backends/cloud_vm_ray_backend.py +3 -1
  4. sky/check.py +335 -170
  5. sky/cli.py +56 -13
  6. sky/client/cli.py +56 -13
  7. sky/client/sdk.py +54 -10
  8. sky/clouds/gcp.py +19 -3
  9. sky/core.py +5 -2
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/7GEgRyZKRaSnYZCV1Jwol/_buildManifest.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/25-062253ea41fb8eec.js +6 -0
  13. sky/dashboard/out/_next/static/chunks/480-5a0de8b6570ea105.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
  15. sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
  16. sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
  17. sky/dashboard/out/_next/static/chunks/578-d351125af46c293f.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/734-a6e01d7f98904741.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/938-59956af3950b02ed.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/pages/_app-96a715a6fb01e228.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-3b5aad09a25f64b7.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
  25. sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/infra-abb7d744ecf15109.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-48dc8d67d4b60be1.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
  29. sky/dashboard/out/_next/static/chunks/pages/users-b8acf6e6735323a2.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/pages/workspace/new-bbf436f41381e169.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7733c960685b4385.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/pages/workspaces-5ed48b3201b998c8.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
  34. sky/dashboard/out/_next/static/css/28558d57108b05ae.css +3 -0
  35. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  36. sky/dashboard/out/clusters/[cluster].html +1 -1
  37. sky/dashboard/out/clusters.html +1 -1
  38. sky/dashboard/out/index.html +1 -1
  39. sky/dashboard/out/infra.html +1 -1
  40. sky/dashboard/out/jobs/[job].html +1 -1
  41. sky/dashboard/out/jobs.html +1 -1
  42. sky/dashboard/out/users.html +1 -0
  43. sky/dashboard/out/workspace/new.html +1 -0
  44. sky/dashboard/out/workspaces/[name].html +1 -0
  45. sky/dashboard/out/workspaces.html +1 -0
  46. sky/data/storage.py +1 -1
  47. sky/global_user_state.py +606 -543
  48. sky/jobs/constants.py +1 -1
  49. sky/jobs/server/core.py +72 -56
  50. sky/jobs/state.py +26 -5
  51. sky/jobs/utils.py +65 -13
  52. sky/optimizer.py +6 -3
  53. sky/provision/fluidstack/instance.py +1 -0
  54. sky/serve/server/core.py +9 -6
  55. sky/server/html/token_page.html +6 -1
  56. sky/server/requests/executor.py +1 -0
  57. sky/server/requests/payloads.py +28 -0
  58. sky/server/server.py +59 -5
  59. sky/setup_files/dependencies.py +1 -0
  60. sky/skylet/constants.py +4 -1
  61. sky/skypilot_config.py +107 -11
  62. sky/utils/cli_utils/status_utils.py +18 -8
  63. sky/utils/db_utils.py +53 -0
  64. sky/utils/kubernetes/config_map_utils.py +133 -0
  65. sky/utils/kubernetes/deploy_remote_cluster.py +166 -147
  66. sky/utils/kubernetes/kubernetes_deploy_utils.py +49 -5
  67. sky/utils/kubernetes/ssh-tunnel.sh +20 -28
  68. sky/utils/log_utils.py +4 -0
  69. sky/utils/schemas.py +54 -0
  70. sky/workspaces/__init__.py +0 -0
  71. sky/workspaces/core.py +295 -0
  72. sky/workspaces/server.py +62 -0
  73. {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/METADATA +2 -1
  74. {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/RECORD +79 -63
  75. sky/dashboard/out/_next/static/ECKwDNS9v9y3_IKFZ2lpp/_buildManifest.js +0 -1
  76. sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
  77. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  78. sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
  79. sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
  80. sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
  81. sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
  82. sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +0 -1
  87. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
  88. sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
  89. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  90. sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
  91. /sky/dashboard/out/_next/static/{ECKwDNS9v9y3_IKFZ2lpp → 7GEgRyZKRaSnYZCV1Jwol}/_ssgManifest.js +0 -0
  92. {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/WHEEL +0 -0
  93. {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/entry_points.txt +0 -0
  94. {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/licenses/LICENSE +0 -0
  95. {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/top_level.txt +0 -0
sky/server/server.py CHANGED
@@ -6,6 +6,7 @@ import base64
6
6
  import contextlib
7
7
  import dataclasses
8
8
  import datetime
9
+ import hashlib
9
10
  import json
10
11
  import logging
11
12
  import multiprocessing
@@ -31,6 +32,7 @@ from sky import core
31
32
  from sky import exceptions
32
33
  from sky import execution
33
34
  from sky import global_user_state
35
+ from sky import models
34
36
  from sky import sky_logging
35
37
  from sky.clouds import service_catalog
36
38
  from sky.data import storage_utils
@@ -56,6 +58,7 @@ from sky.utils import dag_utils
56
58
  from sky.utils import env_options
57
59
  from sky.utils import status_lib
58
60
  from sky.utils import subprocess_utils
61
+ from sky.workspaces import server as workspaces_rest
59
62
 
60
63
  # pylint: disable=ungrouped-imports
61
64
  if sys.version_info >= (3, 10):
@@ -110,6 +113,38 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
110
113
  return response
111
114
 
112
115
 
116
+ def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
117
+ if 'X-Auth-Request-Email' not in request.headers:
118
+ return None
119
+ user_name = request.headers['X-Auth-Request-Email']
120
+ user_hash = hashlib.md5(
121
+ user_name.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]
122
+ return models.User(id=user_hash, name=user_name)
123
+
124
+
125
+ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
126
+ """Middleware to handle auth proxy."""
127
+
128
+ async def dispatch(self, request: fastapi.Request, call_next):
129
+ auth_user = _get_auth_user_header(request)
130
+ body = await request.body()
131
+ if auth_user and body:
132
+ try:
133
+ original_json = await request.json()
134
+ except json.JSONDecodeError as e:
135
+ logger.error(f'Error parsing request JSON: {e}')
136
+ else:
137
+ logger.debug(f'Overriding user for {request.state.request_id}: '
138
+ f'{auth_user.name}, {auth_user.id}')
139
+ if 'env_vars' in original_json:
140
+ original_json['env_vars'][
141
+ constants.USER_ID_ENV_VAR] = auth_user.id
142
+ original_json['env_vars'][
143
+ constants.USER_ENV_VAR] = auth_user.name
144
+ request._body = json.dumps(original_json).encode('utf-8') # pylint: disable=protected-access
145
+ return await call_next(request)
146
+
147
+
113
148
  # Default expiration time for upload ids before cleanup.
114
149
  _DEFAULT_UPLOAD_EXPIRATION_TIME = datetime.timedelta(hours=1)
115
150
  # Key: (upload_id, user_hash), Value: the time when the upload id needs to be
@@ -216,15 +251,29 @@ app.add_middleware(
216
251
  allow_headers=['*'],
217
252
  # TODO(syang): remove X-Request-ID when v0.10.0 is released.
218
253
  expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
254
+ app.add_middleware(AuthProxyMiddleware)
219
255
  app.add_middleware(RequestIDMiddleware)
220
256
  app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
221
257
  app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
258
+ app.include_router(workspaces_rest.router,
259
+ prefix='/workspaces',
260
+ tags=['workspaces'])
222
261
 
223
262
 
224
263
  @app.get('/token')
225
264
  async def token(request: fastapi.Request) -> fastapi.responses.HTMLResponse:
265
+ # If we have auth info, save this user to the database.
266
+ user = _get_auth_user_header(request)
267
+ if user is not None:
268
+ global_user_state.add_or_update_user(user)
269
+
270
+ token_data = {
271
+ 'v': 1, # Token version number, bump for backwards incompatible.
272
+ 'user': user.id if user is not None else None,
273
+ 'cookies': request.cookies,
274
+ }
226
275
  # Use base64 encoding to avoid having to escape anything in the HTML.
227
- json_bytes = json.dumps(request.cookies).encode('utf-8')
276
+ json_bytes = json.dumps(token_data).encode('utf-8')
228
277
  base64_str = base64.b64encode(json_bytes).decode('utf-8')
229
278
 
230
279
  html_dir = pathlib.Path(__file__).parent / 'html'
@@ -236,8 +285,10 @@ async def token(request: fastapi.Request) -> fastapi.responses.HTMLResponse:
236
285
  raise fastapi.HTTPException(
237
286
  status_code=500, detail='Token page template not found.') from e
238
287
 
288
+ user_info_string = f'Logged in as {user.name}' if user is not None else ''
239
289
  html_content = html_content.replace(
240
- 'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER', base64_str)
290
+ 'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER',
291
+ base64_str).replace('USER_PLACEHOLDER', user_info_string)
241
292
 
242
293
  return fastapi.responses.HTMLResponse(
243
294
  content=html_content,
@@ -263,12 +314,13 @@ async def check(request: fastapi.Request,
263
314
 
264
315
 
265
316
  @app.get('/enabled_clouds')
266
- async def enabled_clouds(request: fastapi.Request) -> None:
317
+ async def enabled_clouds(request: fastapi.Request,
318
+ workspace: Optional[str] = None) -> None:
267
319
  """Gets enabled clouds on the server."""
268
320
  executor.schedule_request(
269
321
  request_id=request.state.request_id,
270
322
  request_name='enabled_clouds',
271
- request_body=payloads.RequestBody(),
323
+ request_body=payloads.EnabledCloudsBody(workspace=workspace),
272
324
  func=core.enabled_clouds,
273
325
  schedule_type=requests_lib.ScheduleType.SHORT,
274
326
  )
@@ -1113,7 +1165,7 @@ async def api_status(
1113
1165
 
1114
1166
 
1115
1167
  @app.get('/api/health')
1116
- async def health() -> Dict[str, str]:
1168
+ async def health(request: fastapi.Request) -> Dict[str, Any]:
1117
1169
  """Checks the health of the API server.
1118
1170
 
1119
1171
  Returns:
@@ -1125,12 +1177,14 @@ async def health() -> Dict[str, str]:
1125
1177
  disk, which can be used to warn about restarting the API server
1126
1178
  - commit: str; The commit hash of SkyPilot used for API server.
1127
1179
  """
1180
+ user = _get_auth_user_header(request)
1128
1181
  return {
1129
1182
  'status': common.ApiServerStatus.HEALTHY.value,
1130
1183
  'api_version': server_constants.API_VERSION,
1131
1184
  'version': sky.__version__,
1132
1185
  'version_on_disk': common.get_skypilot_version_on_disk(),
1133
1186
  'commit': sky.__commit__,
1187
+ 'user': user.to_dict() if user is not None else None,
1134
1188
  }
1135
1189
 
1136
1190
 
@@ -56,6 +56,7 @@ install_requires = [
56
56
  'aiofiles',
57
57
  'httpx',
58
58
  'setproctitle',
59
+ 'sqlalchemy',
59
60
  ]
60
61
 
61
62
  local_ray = [
sky/skylet/constants.py CHANGED
@@ -378,7 +378,8 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
378
378
  # we skip the following keys because they are meant to be client-side configs.
379
379
  SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('admin_policy',),
380
380
  ('api_server',),
381
- ('allowed_clouds',)]
381
+ ('allowed_clouds',),
382
+ ('workspaces',)]
382
383
 
383
384
  # Constants for Azure blob storage
384
385
  WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
@@ -405,3 +406,5 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
405
406
 
406
407
  # Environment variable that is set to 'true' if this is a skypilot server.
407
408
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
409
+
410
+ SKYPILOT_DEFAULT_WORKSPACE = 'default'
sky/skypilot_config.py CHANGED
@@ -57,6 +57,8 @@ import threading
57
57
  import typing
58
58
  from typing import Any, Dict, Iterator, List, Optional, Tuple
59
59
 
60
+ import filelock
61
+
60
62
  from sky import exceptions
61
63
  from sky import sky_logging
62
64
  from sky.adaptors import common as adaptors_common
@@ -66,6 +68,7 @@ from sky.utils import config_utils
66
68
  from sky.utils import context
67
69
  from sky.utils import schemas
68
70
  from sky.utils import ux_utils
71
+ from sky.utils.kubernetes import config_map_utils
69
72
 
70
73
  if typing.TYPE_CHECKING:
71
74
  import yaml
@@ -120,8 +123,17 @@ class ConfigContext:
120
123
 
121
124
 
122
125
  # The global loaded config.
126
+ _active_workspace_context = threading.local()
123
127
  _global_config_context = ConfigContext()
124
- _reload_config_lock = threading.Lock()
128
+
129
+ SKYPILOT_CONFIG_LOCK_PATH = '~/.sky/locks/.skypilot_config.lock'
130
+
131
+
132
+ def get_skypilot_config_lock_path() -> str:
133
+ """Get the path for the SkyPilot config lock file."""
134
+ lock_path = os.path.expanduser(SKYPILOT_CONFIG_LOCK_PATH)
135
+ os.makedirs(os.path.dirname(lock_path), exist_ok=True)
136
+ return lock_path
125
137
 
126
138
 
127
139
  def _get_config_context() -> ConfigContext:
@@ -194,8 +206,7 @@ def get_user_config() -> config_utils.Config:
194
206
 
195
207
  # load the user config file
196
208
  if os.path.exists(user_config_path):
197
- user_config = parse_config_file(user_config_path)
198
- _validate_config(user_config, user_config_path)
209
+ user_config = parse_and_validate_config_file(user_config_path)
199
210
  else:
200
211
  user_config = config_utils.Config()
201
212
  return user_config
@@ -223,8 +234,7 @@ def _get_project_config() -> config_utils.Config:
223
234
 
224
235
  # load the project config file
225
236
  if os.path.exists(project_config_path):
226
- project_config = parse_config_file(project_config_path)
227
- _validate_config(project_config, project_config_path)
237
+ project_config = parse_and_validate_config_file(project_config_path)
228
238
  else:
229
239
  project_config = config_utils.Config()
230
240
  return project_config
@@ -252,8 +262,7 @@ def get_server_config() -> config_utils.Config:
252
262
 
253
263
  # load the server config file
254
264
  if os.path.exists(server_config_path):
255
- server_config = parse_config_file(server_config_path)
256
- _validate_config(server_config, server_config_path)
265
+ server_config = parse_and_validate_config_file(server_config_path)
257
266
  else:
258
267
  server_config = config_utils.Config()
259
268
  return server_config
@@ -287,6 +296,60 @@ def get_nested(keys: Tuple[str, ...],
287
296
  disallowed_override_keys=None)
288
297
 
289
298
 
299
+ def get_workspace_cloud(cloud: str,
300
+ workspace: Optional[str] = None) -> config_utils.Config:
301
+ """Returns the workspace config."""
302
+ if workspace is None:
303
+ workspace = get_active_workspace()
304
+ clouds = get_nested(keys=(
305
+ 'workspaces',
306
+ workspace,
307
+ ), default_value=None)
308
+ if clouds is None:
309
+ return config_utils.Config()
310
+ return clouds.get(cloud.lower(), config_utils.Config())
311
+
312
+
313
+ @contextlib.contextmanager
314
+ def local_active_workspace_ctx(workspace: str) -> Iterator[None]:
315
+ """Temporarily set the active workspace IN CURRENT THREAD.
316
+
317
+ Note: having this function thread-local is error-prone, as wrapping some
318
+ operations with this will not have the underlying threads to get the
319
+ correct active workspace. However, we cannot make it global either, as
320
+ backend_utils.refresh_cluster_status() will be called in multiple threads,
321
+ and they may have different active workspaces for different threads.
322
+
323
+ # TODO(zhwu): make this function global by default and able to be set
324
+ # it to thread-local with an argument.
325
+
326
+ Args:
327
+ workspace: The workspace to set as active.
328
+
329
+ Raises:
330
+ RuntimeError: If called from a non-main thread.
331
+ """
332
+ original_workspace = get_active_workspace()
333
+ if original_workspace == workspace:
334
+ # No change, do nothing.
335
+ yield
336
+ return
337
+ _active_workspace_context.workspace = workspace
338
+ logger.debug(f'Set context workspace: {workspace}')
339
+ yield
340
+ logger.debug(f'Reset context workspace: {original_workspace}')
341
+ _active_workspace_context.workspace = original_workspace
342
+
343
+
344
+ def get_active_workspace(force_user_workspace: bool = False) -> str:
345
+ context_workspace = getattr(_active_workspace_context, 'workspace', None)
346
+ if not force_user_workspace and context_workspace is not None:
347
+ logger.debug(f'Get context workspace: {context_workspace}')
348
+ return context_workspace
349
+ return get_nested(keys=('active_workspace',),
350
+ default_value=constants.SKYPILOT_DEFAULT_WORKSPACE)
351
+
352
+
290
353
  def set_nested(keys: Tuple[str, ...], value: Any) -> Dict[str, Any]:
291
354
  """Returns a deep-copied config with the nested key set to value.
292
355
 
@@ -336,7 +399,7 @@ def overlay_skypilot_config(
336
399
 
337
400
  def safe_reload_config() -> None:
338
401
  """Reloads the config, safe to be called concurrently."""
339
- with _reload_config_lock:
402
+ with filelock.FileLock(get_skypilot_config_lock_path()):
340
403
  _reload_config()
341
404
 
342
405
 
@@ -357,7 +420,7 @@ def _reload_config() -> None:
357
420
  _reload_config_as_client()
358
421
 
359
422
 
360
- def parse_config_file(config_path: str) -> config_utils.Config:
423
+ def parse_and_validate_config_file(config_path: str) -> config_utils.Config:
361
424
  config = config_utils.Config()
362
425
  try:
363
426
  config_dict = common_utils.read_yaml(config_path)
@@ -413,7 +476,7 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
413
476
  'exist. Please double check the path or unset the env var: '
414
477
  f'unset {ENV_VAR_SKYPILOT_CONFIG}')
415
478
  logger.debug(f'Using config path: {config_path}')
416
- _set_loaded_config(parse_config_file(config_path))
479
+ _set_loaded_config(parse_and_validate_config_file(config_path))
417
480
  _set_loaded_config_path(config_path)
418
481
 
419
482
 
@@ -512,6 +575,19 @@ def override_skypilot_config(
512
575
  override_configs=dict(override_configs),
513
576
  allowed_override_keys=None,
514
577
  disallowed_override_keys=constants.SKIPPED_CLIENT_OVERRIDE_KEYS)
578
+ workspace = config.get_nested(
579
+ keys=('active_workspace',),
580
+ default_value=constants.SKYPILOT_DEFAULT_WORKSPACE)
581
+ if (workspace != constants.SKYPILOT_DEFAULT_WORKSPACE and workspace
582
+ not in get_nested(keys=('workspaces',), default_value={})):
583
+ raise ValueError(f'Workspace {workspace} does not exist. '
584
+ 'Use `sky check` to see if it is defined on the API '
585
+ 'server and try again.')
586
+ # Initialize the active workspace context to the workspace specified, so
587
+ # that a new request is not affected by the previous request's workspace.
588
+ global _active_workspace_context
589
+ _active_workspace_context = threading.local()
590
+
515
591
  try:
516
592
  common_utils.validate_schema(
517
593
  config,
@@ -592,7 +668,7 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
592
668
  'Cannot use multiple --config flags with a config file.')
593
669
  config_source = maybe_config_path
594
670
  # cli_config is a path to a config file
595
- parsed_config = parse_config_file(maybe_config_path)
671
+ parsed_config = parse_and_validate_config_file(maybe_config_path)
596
672
  else: # cli_config is a comma-separated list of key-value pairs
597
673
  parsed_config = _parse_dotlist(cli_config)
598
674
  _validate_config(parsed_config, config_source)
@@ -623,3 +699,23 @@ def apply_cli_config(cli_config: Optional[List[str]]) -> Dict[str, Any]:
623
699
  overlay_skypilot_config(original_config=_get_loaded_config(),
624
700
  override_configs=parsed_config))
625
701
  return parsed_config
702
+
703
+
704
+ def update_config_no_lock(config: config_utils.Config) -> None:
705
+ """Dumps the new config to a file and syncs to ConfigMap if in Kubernetes.
706
+
707
+ Args:
708
+ config: The config to save and sync.
709
+ """
710
+ global_config_path = os.path.expanduser(get_user_config_path())
711
+
712
+ # Always save to the local file (PVC in Kubernetes, local file otherwise)
713
+ common_utils.dump_yaml(global_config_path, dict(config))
714
+
715
+ if config_map_utils.is_running_in_kubernetes():
716
+ # In Kubernetes, sync the PVC config to ConfigMap for user convenience
717
+ # PVC file is the source of truth, ConfigMap is just a mirror for easy
718
+ # access
719
+ config_map_utils.patch_configmap_with_config(config, global_config_path)
720
+
721
+ _reload_config()
@@ -48,7 +48,8 @@ class StatusColumn:
48
48
  def show_status_table(cluster_records: List[_ClusterRecord],
49
49
  show_all: bool,
50
50
  show_user: bool,
51
- query_clusters: Optional[List[str]] = None) -> int:
51
+ query_clusters: Optional[List[str]] = None,
52
+ show_workspaces: bool = False) -> int:
52
53
  """Compute cluster table values and display.
53
54
 
54
55
  Returns:
@@ -56,7 +57,6 @@ def show_status_table(cluster_records: List[_ClusterRecord],
56
57
  STOPPED.
57
58
  """
58
59
  # TODO(zhwu): Update the information for autostop clusters.
59
-
60
60
  status_columns = [
61
61
  StatusColumn('NAME', _get_name),
62
62
  ]
@@ -66,6 +66,9 @@ def show_status_table(cluster_records: List[_ClusterRecord],
66
66
  StatusColumn('USER_ID', _get_user_hash, show_by_default=False))
67
67
 
68
68
  status_columns += [
69
+ StatusColumn('WORKSPACE',
70
+ _get_workspace,
71
+ show_by_default=show_workspaces),
69
72
  StatusColumn('INFRA', _get_infra, truncate=not show_all),
70
73
  StatusColumn('RESOURCES', _get_resources, truncate=not show_all),
71
74
  StatusColumn('STATUS', _get_status_colored),
@@ -106,12 +109,13 @@ def show_status_table(cluster_records: List[_ClusterRecord],
106
109
  for cluster in query_clusters
107
110
  if cluster not in cluster_names
108
111
  ]
109
- cluster_str = 'Cluster'
110
- if len(not_found_clusters) > 1:
111
- cluster_str += 's'
112
- cluster_str += ' '
113
- cluster_str += ', '.join(not_found_clusters)
114
- click.echo(f'{cluster_str} not found.')
112
+ if not_found_clusters:
113
+ cluster_str = 'Cluster'
114
+ if len(not_found_clusters) > 1:
115
+ cluster_str += 's'
116
+ cluster_str += ' '
117
+ cluster_str += ', '.join(not_found_clusters)
118
+ click.echo(f'{cluster_str} not found.')
115
119
  elif not cluster_records:
116
120
  click.echo('No existing clusters.')
117
121
  return num_pending_autostop
@@ -243,6 +247,12 @@ def _get_status(cluster_record: _ClusterRecord,
243
247
  return cluster_record['status']
244
248
 
245
249
 
250
+ def _get_workspace(cluster_record: _ClusterRecord,
251
+ truncate: bool = True) -> str:
252
+ del truncate
253
+ return cluster_record['workspace']
254
+
255
+
246
256
  def _get_status_colored(cluster_record: _ClusterRecord,
247
257
  truncate: bool = True) -> str:
248
258
  del truncate
sky/utils/db_utils.py CHANGED
@@ -1,9 +1,17 @@
1
1
  """Utils for sky databases."""
2
2
  import contextlib
3
+ import enum
3
4
  import sqlite3
4
5
  import threading
6
+ import typing
5
7
  from typing import Any, Callable, Optional
6
8
 
9
+ import sqlalchemy
10
+ from sqlalchemy import exc as sqlalchemy_exc
11
+
12
+ if typing.TYPE_CHECKING:
13
+ from sqlalchemy.orm import Session
14
+
7
15
  # This parameter (passed to sqlite3.connect) controls how long we will wait to
8
16
  # obtains a database lock (not necessarily during connection, but whenever it is
9
17
  # needed). It is not a connection timeout.
@@ -21,6 +29,11 @@ from typing import Any, Callable, Optional
21
29
  _DB_TIMEOUT_S = 60
22
30
 
23
31
 
32
+ class SQLAlchemyDialect(enum.Enum):
33
+ SQLITE = 'sqlite'
34
+ POSTGRESQL = 'postgresql'
35
+
36
+
24
37
  @contextlib.contextmanager
25
38
  def safe_cursor(db_path: str):
26
39
  """A newly created, auto-committing, auto-closing cursor."""
@@ -71,6 +84,46 @@ def add_column_to_table(
71
84
  conn.commit()
72
85
 
73
86
 
87
+ def add_column_to_table_sqlalchemy(
88
+ session: 'Session',
89
+ table_name: str,
90
+ column_name: str,
91
+ column_type: str,
92
+ copy_from: Optional[str] = None,
93
+ value_to_replace_existing_entries: Optional[Any] = None,
94
+ ):
95
+ """Add a column to a table."""
96
+ dialect = session.bind.dialect
97
+ if dialect.name == SQLAlchemyDialect.SQLITE.value:
98
+ try:
99
+ session.execute(
100
+ sqlalchemy.text(f'ALTER TABLE {table_name} '
101
+ f'ADD COLUMN {column_name} {column_type}'))
102
+ if copy_from is not None:
103
+ session.execute(
104
+ sqlalchemy.text(f'UPDATE {table_name} '
105
+ f'SET {column_name} = {copy_from}'))
106
+ if value_to_replace_existing_entries is not None:
107
+ session.execute(
108
+ sqlalchemy.text(f'UPDATE {table_name} '
109
+ f'SET {column_name} = :replacement_value '
110
+ f'WHERE {column_name} IS NULL'),
111
+ {'replacement_value': value_to_replace_existing_entries})
112
+ except sqlalchemy_exc.OperationalError as e:
113
+ if 'duplicate column name' in str(e):
114
+ pass
115
+ else:
116
+ raise
117
+ elif dialect.name == SQLAlchemyDialect.POSTGRESQL.value:
118
+ # TODO(syang) support postgres dialect
119
+ session.rollback()
120
+ raise ValueError('Unsupported database dialect')
121
+ else:
122
+ session.rollback()
123
+ raise ValueError('Unsupported database dialect')
124
+ session.commit()
125
+
126
+
74
127
  def rename_column(
75
128
  cursor: 'sqlite3.Cursor',
76
129
  conn: 'sqlite3.Connection',
@@ -0,0 +1,133 @@
1
+ """Utilities for Kubernetes ConfigMap operations in SkyPilot."""
2
+ import os
3
+
4
+ from sky import sky_logging
5
+ from sky import skypilot_config
6
+ from sky.adaptors import kubernetes
7
+ from sky.utils import common_utils
8
+
9
+ logger = sky_logging.init_logger(__name__)
10
+
11
+ # Kubernetes ConfigMap sync constants
12
+ _KUBE_SERVICE_ACCOUNT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount'
13
+ _CONFIGMAP_SYNC_TIMEOUT = 10 # seconds
14
+
15
+
16
+ def is_running_in_kubernetes() -> bool:
17
+ """Check if we're running inside a Kubernetes pod."""
18
+ return os.path.exists(f'{_KUBE_SERVICE_ACCOUNT_PATH}/token')
19
+
20
+
21
+ def _get_kubernetes_namespace() -> str:
22
+ """Get the current Kubernetes namespace from the service account."""
23
+ try:
24
+ namespace_file = f'{_KUBE_SERVICE_ACCOUNT_PATH}/namespace'
25
+ if os.path.exists(namespace_file):
26
+ with open(namespace_file, encoding='utf-8') as f:
27
+ return f.read().strip()
28
+ except (OSError, IOError):
29
+ pass
30
+ return 'default'
31
+
32
+
33
+ def _get_configmap_name() -> str:
34
+ """Get the ConfigMap name for the SkyPilot config."""
35
+ release_name = (os.getenv('HELM_RELEASE_NAME') or
36
+ os.getenv('SKYPILOT_RELEASE_NAME') or 'skypilot')
37
+ return f'{release_name}-config'
38
+
39
+
40
+ def initialize_configmap_sync_on_startup(config_file_path: str) -> None:
41
+ """Initialize ConfigMap sync on API server startup.
42
+
43
+ This syncs existing PVC config to ConfigMap if ConfigMap doesn't exist.
44
+ This handles the upgrade scenario where an existing deployment has
45
+ workspace configs on PVC but no ConfigMap exists.
46
+
47
+ Args:
48
+ config_file_path: Path to the config file to sync.
49
+ """
50
+ config_file_path = os.path.expanduser(config_file_path)
51
+ if not is_running_in_kubernetes() or not os.path.exists(config_file_path):
52
+ return
53
+
54
+ try:
55
+ namespace = _get_kubernetes_namespace()
56
+ configmap_name = _get_configmap_name()
57
+
58
+ # Check if ConfigMap exists
59
+ try:
60
+ kubernetes.core_api().read_namespaced_config_map(
61
+ name=configmap_name, namespace=namespace)
62
+ # ConfigMap exists, don't overwrite it
63
+ logger.debug(f'ConfigMap {configmap_name} already exists')
64
+ return
65
+ except kubernetes.kubernetes.client.rest.ApiException as e:
66
+ if e.status != 404:
67
+ raise
68
+ # ConfigMap doesn't exist, create it
69
+
70
+ current_config = skypilot_config.parse_and_validate_config_file(
71
+ config_file_path)
72
+ config_yaml = common_utils.dump_yaml_str(dict(current_config))
73
+
74
+ configmap_body = {
75
+ 'apiVersion': 'v1',
76
+ 'kind': 'ConfigMap',
77
+ 'metadata': {
78
+ 'name': configmap_name,
79
+ 'namespace': namespace,
80
+ 'labels': {
81
+ 'app.kubernetes.io/name': 'skypilot',
82
+ 'app.kubernetes.io/component': 'config'
83
+ }
84
+ },
85
+ 'data': {
86
+ 'config.yaml': config_yaml
87
+ }
88
+ }
89
+
90
+ kubernetes.core_api().create_namespaced_config_map(
91
+ namespace=namespace,
92
+ body=configmap_body,
93
+ _request_timeout=_CONFIGMAP_SYNC_TIMEOUT)
94
+
95
+ logger.info(f'Synced PVC config to new ConfigMap {configmap_name}')
96
+
97
+ except Exception as e: # pylint: disable=broad-except
98
+ logger.warning(f'Failed to initialize ConfigMap sync: {e}')
99
+
100
+
101
+ def patch_configmap_with_config(config, config_file_path: str) -> None:
102
+ """Patch the Kubernetes ConfigMap with the updated config.
103
+
104
+ Args:
105
+ config: The updated config to sync to the ConfigMap.
106
+ config_file_path: Path to the config file for fallback sync.
107
+ """
108
+ if not is_running_in_kubernetes():
109
+ return
110
+
111
+ try:
112
+ namespace = _get_kubernetes_namespace()
113
+ configmap_name = _get_configmap_name()
114
+ config_yaml = common_utils.dump_yaml_str(dict(config))
115
+ patch_body = {'data': {'config.yaml': config_yaml}}
116
+
117
+ try:
118
+ kubernetes.core_api().patch_namespaced_config_map(
119
+ name=configmap_name,
120
+ namespace=namespace,
121
+ body=patch_body,
122
+ _request_timeout=_CONFIGMAP_SYNC_TIMEOUT)
123
+ logger.debug(f'Synced config to ConfigMap {configmap_name}')
124
+ except kubernetes.kubernetes.client.rest.ApiException as e:
125
+ if e.status == 404:
126
+ # ConfigMap doesn't exist, create it
127
+ logger.info(f'ConfigMap {configmap_name} not found, creating')
128
+ initialize_configmap_sync_on_startup(config_file_path)
129
+ else:
130
+ raise
131
+
132
+ except Exception as e: # pylint: disable=broad-except
133
+ logger.warning(f'Failed to sync config to ConfigMap: {e}')