skypilot-nightly 1.0.0.dev20250522__py3-none-any.whl → 1.0.0.dev20250524__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/backend_utils.py +62 -45
  4. sky/backends/cloud_vm_ray_backend.py +19 -5
  5. sky/check.py +398 -171
  6. sky/cli.py +302 -98
  7. sky/client/cli.py +302 -98
  8. sky/client/sdk.py +104 -12
  9. sky/clouds/__init__.py +3 -0
  10. sky/clouds/aws.py +4 -2
  11. sky/clouds/azure.py +4 -2
  12. sky/clouds/cloud.py +24 -6
  13. sky/clouds/cudo.py +2 -1
  14. sky/clouds/do.py +2 -1
  15. sky/clouds/fluidstack.py +2 -1
  16. sky/clouds/gcp.py +23 -5
  17. sky/clouds/ibm.py +4 -2
  18. sky/clouds/kubernetes.py +66 -22
  19. sky/clouds/lambda_cloud.py +2 -1
  20. sky/clouds/nebius.py +18 -2
  21. sky/clouds/oci.py +4 -2
  22. sky/clouds/paperspace.py +2 -1
  23. sky/clouds/runpod.py +2 -1
  24. sky/clouds/scp.py +2 -1
  25. sky/clouds/service_catalog/constants.py +1 -1
  26. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  27. sky/clouds/ssh.py +203 -0
  28. sky/clouds/vast.py +2 -1
  29. sky/clouds/vsphere.py +2 -1
  30. sky/core.py +58 -11
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
  35. sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
  37. sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/infra-e690d864aa00e2ea.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-db6558a5ec687011.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
  49. sky/dashboard/out/_next/static/chunks/pages/users-2d319455c3f1c3e2.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
  52. sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +3 -0
  53. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  54. sky/dashboard/out/clusters/[cluster].html +1 -1
  55. sky/dashboard/out/clusters.html +1 -1
  56. sky/dashboard/out/index.html +1 -1
  57. sky/dashboard/out/infra.html +1 -1
  58. sky/dashboard/out/jobs/[job].html +1 -1
  59. sky/dashboard/out/jobs.html +1 -1
  60. sky/dashboard/out/users.html +1 -0
  61. sky/dashboard/out/workspaces.html +1 -0
  62. sky/data/storage.py +1 -1
  63. sky/global_user_state.py +42 -19
  64. sky/jobs/constants.py +1 -1
  65. sky/jobs/server/core.py +72 -56
  66. sky/jobs/state.py +26 -5
  67. sky/jobs/utils.py +65 -13
  68. sky/optimizer.py +29 -7
  69. sky/provision/__init__.py +1 -0
  70. sky/provision/aws/instance.py +17 -1
  71. sky/provision/fluidstack/instance.py +1 -0
  72. sky/provision/kubernetes/instance.py +16 -5
  73. sky/provision/kubernetes/utils.py +37 -19
  74. sky/provision/nebius/instance.py +3 -1
  75. sky/provision/nebius/utils.py +14 -2
  76. sky/provision/ssh/__init__.py +18 -0
  77. sky/resources.py +4 -1
  78. sky/serve/server/core.py +9 -6
  79. sky/server/html/token_page.html +6 -1
  80. sky/server/requests/executor.py +1 -0
  81. sky/server/requests/payloads.py +18 -0
  82. sky/server/server.py +108 -5
  83. sky/setup_files/dependencies.py +1 -0
  84. sky/skylet/constants.py +4 -1
  85. sky/skypilot_config.py +83 -9
  86. sky/templates/nebius-ray.yml.j2 +12 -0
  87. sky/utils/cli_utils/status_utils.py +18 -8
  88. sky/utils/infra_utils.py +21 -1
  89. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  90. sky/utils/kubernetes/create_cluster.sh +1 -0
  91. sky/utils/kubernetes/deploy_remote_cluster.py +1440 -0
  92. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  93. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  94. sky/utils/log_utils.py +218 -1
  95. sky/utils/schemas.py +75 -0
  96. sky/utils/ux_utils.py +2 -1
  97. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/METADATA +6 -1
  98. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/RECORD +103 -91
  99. sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
  101. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  102. sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
  103. sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
  106. sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
  109. sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  114. sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
  115. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  116. /sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → aHej19bZyl4hoHgrzPCn7}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/top_level.txt +0 -0
sky/server/server.py CHANGED
@@ -6,6 +6,7 @@ import base64
6
6
  import contextlib
7
7
  import dataclasses
8
8
  import datetime
9
+ import hashlib
9
10
  import json
10
11
  import logging
11
12
  import multiprocessing
@@ -31,7 +32,9 @@ from sky import core
31
32
  from sky import exceptions
32
33
  from sky import execution
33
34
  from sky import global_user_state
35
+ from sky import models
34
36
  from sky import sky_logging
37
+ from sky import skypilot_config
35
38
  from sky.clouds import service_catalog
36
39
  from sky.data import storage_utils
37
40
  from sky.jobs.server import server as jobs_rest
@@ -110,6 +113,38 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
110
113
  return response
111
114
 
112
115
 
116
+ def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
117
+ if 'X-Auth-Request-Email' not in request.headers:
118
+ return None
119
+ user_name = request.headers['X-Auth-Request-Email']
120
+ user_hash = hashlib.md5(
121
+ user_name.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]
122
+ return models.User(id=user_hash, name=user_name)
123
+
124
+
125
+ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
126
+ """Middleware to handle auth proxy."""
127
+
128
+ async def dispatch(self, request: fastapi.Request, call_next):
129
+ auth_user = _get_auth_user_header(request)
130
+ body = await request.body()
131
+ if auth_user and body:
132
+ try:
133
+ original_json = await request.json()
134
+ except json.JSONDecodeError as e:
135
+ logger.error(f'Error parsing request JSON: {e}')
136
+ else:
137
+ logger.debug(f'Overriding user for {request.state.request_id}: '
138
+ f'{auth_user.name}, {auth_user.id}')
139
+ if 'env_vars' in original_json:
140
+ original_json['env_vars'][
141
+ constants.USER_ID_ENV_VAR] = auth_user.id
142
+ original_json['env_vars'][
143
+ constants.USER_ENV_VAR] = auth_user.name
144
+ request._body = json.dumps(original_json).encode('utf-8') # pylint: disable=protected-access
145
+ return await call_next(request)
146
+
147
+
113
148
  # Default expiration time for upload ids before cleanup.
114
149
  _DEFAULT_UPLOAD_EXPIRATION_TIME = datetime.timedelta(hours=1)
115
150
  # Key: (upload_id, user_hash), Value: the time when the upload id needs to be
@@ -216,6 +251,7 @@ app.add_middleware(
216
251
  allow_headers=['*'],
217
252
  # TODO(syang): remove X-Request-ID when v0.10.0 is released.
218
253
  expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
254
+ app.add_middleware(AuthProxyMiddleware)
219
255
  app.add_middleware(RequestIDMiddleware)
220
256
  app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
221
257
  app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
@@ -223,8 +259,18 @@ app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
223
259
 
224
260
  @app.get('/token')
225
261
  async def token(request: fastapi.Request) -> fastapi.responses.HTMLResponse:
262
+ # If we have auth info, save this user to the database.
263
+ user = _get_auth_user_header(request)
264
+ if user is not None:
265
+ global_user_state.add_or_update_user(user)
266
+
267
+ token_data = {
268
+ 'v': 1, # Token version number, bump for backwards incompatible.
269
+ 'user': user.id if user is not None else None,
270
+ 'cookies': request.cookies,
271
+ }
226
272
  # Use base64 encoding to avoid having to escape anything in the HTML.
227
- json_bytes = json.dumps(request.cookies).encode('utf-8')
273
+ json_bytes = json.dumps(token_data).encode('utf-8')
228
274
  base64_str = base64.b64encode(json_bytes).decode('utf-8')
229
275
 
230
276
  html_dir = pathlib.Path(__file__).parent / 'html'
@@ -236,8 +282,10 @@ async def token(request: fastapi.Request) -> fastapi.responses.HTMLResponse:
236
282
  raise fastapi.HTTPException(
237
283
  status_code=500, detail='Token page template not found.') from e
238
284
 
285
+ user_info_string = f'Logged in as {user.name}' if user is not None else ''
239
286
  html_content = html_content.replace(
240
- 'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER', base64_str)
287
+ 'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER',
288
+ base64_str).replace('USER_PLACEHOLDER', user_info_string)
241
289
 
242
290
  return fastapi.responses.HTMLResponse(
243
291
  content=html_content,
@@ -263,17 +311,30 @@ async def check(request: fastapi.Request,
263
311
 
264
312
 
265
313
  @app.get('/enabled_clouds')
266
- async def enabled_clouds(request: fastapi.Request) -> None:
314
+ async def enabled_clouds(request: fastapi.Request,
315
+ workspace: Optional[str] = None) -> None:
267
316
  """Gets enabled clouds on the server."""
268
317
  executor.schedule_request(
269
318
  request_id=request.state.request_id,
270
319
  request_name='enabled_clouds',
271
- request_body=payloads.RequestBody(),
320
+ request_body=payloads.EnabledCloudsBody(workspace=workspace),
272
321
  func=core.enabled_clouds,
273
322
  schedule_type=requests_lib.ScheduleType.SHORT,
274
323
  )
275
324
 
276
325
 
326
+ @app.get('/workspaces')
327
+ async def get_workspace_config(request: fastapi.Request) -> None:
328
+ """Gets workspace config on the server."""
329
+ executor.schedule_request(
330
+ request_id=request.state.request_id,
331
+ request_name='workspaces',
332
+ request_body=payloads.RequestBody(),
333
+ func=skypilot_config.get_workspaces,
334
+ schedule_type=requests_lib.ScheduleType.SHORT,
335
+ )
336
+
337
+
277
338
  @app.post('/realtime_kubernetes_gpu_availability')
278
339
  async def realtime_kubernetes_gpu_availability(
279
340
  request: fastapi.Request,
@@ -909,6 +970,33 @@ async def local_down(request: fastapi.Request) -> None:
909
970
  )
910
971
 
911
972
 
973
+ @app.post('/ssh_up')
974
+ async def ssh_up(request: fastapi.Request,
975
+ ssh_up_body: payloads.SSHUpBody) -> None:
976
+ """Deploys a Kubernetes cluster on SSH targets."""
977
+ executor.schedule_request(
978
+ request_id=request.state.request_id,
979
+ request_name='ssh_up',
980
+ request_body=ssh_up_body,
981
+ func=core.ssh_up,
982
+ schedule_type=requests_lib.ScheduleType.LONG,
983
+ )
984
+
985
+
986
+ @app.post('/ssh_down')
987
+ async def ssh_down(request: fastapi.Request,
988
+ ssh_up_body: payloads.SSHUpBody) -> None:
989
+ """Tears down a Kubernetes cluster on SSH targets."""
990
+ # We still call ssh_up but with cleanup=True
991
+ executor.schedule_request(
992
+ request_id=request.state.request_id,
993
+ request_name='ssh_down',
994
+ request_body=ssh_up_body,
995
+ func=core.ssh_up, # Reuse ssh_up function with cleanup=True
996
+ schedule_type=requests_lib.ScheduleType.LONG,
997
+ )
998
+
999
+
912
1000
  # === API server related APIs ===
913
1001
  @app.get('/api/get')
914
1002
  async def api_get(request_id: str) -> requests_lib.RequestPayload:
@@ -1086,7 +1174,7 @@ async def api_status(
1086
1174
 
1087
1175
 
1088
1176
  @app.get('/api/health')
1089
- async def health() -> Dict[str, str]:
1177
+ async def health(request: fastapi.Request) -> Dict[str, Any]:
1090
1178
  """Checks the health of the API server.
1091
1179
 
1092
1180
  Returns:
@@ -1098,12 +1186,14 @@ async def health() -> Dict[str, str]:
1098
1186
  disk, which can be used to warn about restarting the API server
1099
1187
  - commit: str; The commit hash of SkyPilot used for API server.
1100
1188
  """
1189
+ user = _get_auth_user_header(request)
1101
1190
  return {
1102
1191
  'status': common.ApiServerStatus.HEALTHY.value,
1103
1192
  'api_version': server_constants.API_VERSION,
1104
1193
  'version': sky.__version__,
1105
1194
  'version_on_disk': common.get_skypilot_version_on_disk(),
1106
1195
  'commit': sky.__commit__,
1196
+ 'user': user.to_dict() if user is not None else None,
1107
1197
  }
1108
1198
 
1109
1199
 
@@ -1185,6 +1275,19 @@ async def kubernetes_pod_ssh_proxy(
1185
1275
  proc.terminate()
1186
1276
 
1187
1277
 
1278
+ @app.get('/all_contexts')
1279
+ async def all_contexts(request: fastapi.Request) -> None:
1280
+ """Gets all Kubernetes and SSH node pool contexts."""
1281
+
1282
+ executor.schedule_request(
1283
+ request_id=request.state.request_id,
1284
+ request_name='all_contexts',
1285
+ request_body=payloads.RequestBody(),
1286
+ func=core.get_all_contexts,
1287
+ schedule_type=requests_lib.ScheduleType.SHORT,
1288
+ )
1289
+
1290
+
1188
1291
  # === Internal APIs ===
1189
1292
  @app.get('/api/completion/cluster_name')
1190
1293
  async def complete_cluster_name(incomplete: str,) -> List[str]:
@@ -130,6 +130,7 @@ extras_require: Dict[str, List[str]] = {
130
130
  'oci': ['oci'] + local_ray,
131
131
  # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
132
132
  'kubernetes': ['kubernetes>=20.0.0,!=32.0.0', 'websockets'],
133
+ 'ssh': ['kubernetes>=20.0.0,!=32.0.0', 'websockets'],
133
134
  'remote': remote,
134
135
  # For the container registry auth api. Reference:
135
136
  # https://github.com/runpod/runpod-python/releases/tag/1.6.1
sky/skylet/constants.py CHANGED
@@ -378,7 +378,8 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
378
378
  # we skip the following keys because they are meant to be client-side configs.
379
379
  SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('admin_policy',),
380
380
  ('api_server',),
381
- ('allowed_clouds',)]
381
+ ('allowed_clouds',),
382
+ ('workspaces',)]
382
383
 
383
384
  # Constants for Azure blob storage
384
385
  WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
@@ -405,3 +406,5 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
405
406
 
406
407
  # Environment variable that is set to 'true' if this is a skypilot server.
407
408
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
409
+
410
+ SKYPILOT_DEFAULT_WORKSPACE = 'default'
sky/skypilot_config.py CHANGED
@@ -123,6 +123,8 @@ class ConfigContext:
123
123
  _global_config_context = ConfigContext()
124
124
  _reload_config_lock = threading.Lock()
125
125
 
126
+ _active_workspace_context = threading.local()
127
+
126
128
 
127
129
  def _get_config_context() -> ConfigContext:
128
130
  """Get config context for current context.
@@ -194,8 +196,7 @@ def get_user_config() -> config_utils.Config:
194
196
 
195
197
  # load the user config file
196
198
  if os.path.exists(user_config_path):
197
- user_config = parse_config_file(user_config_path)
198
- _validate_config(user_config, user_config_path)
199
+ user_config = parse_and_validate_config_file(user_config_path)
199
200
  else:
200
201
  user_config = config_utils.Config()
201
202
  return user_config
@@ -223,8 +224,7 @@ def _get_project_config() -> config_utils.Config:
223
224
 
224
225
  # load the project config file
225
226
  if os.path.exists(project_config_path):
226
- project_config = parse_config_file(project_config_path)
227
- _validate_config(project_config, project_config_path)
227
+ project_config = parse_and_validate_config_file(project_config_path)
228
228
  else:
229
229
  project_config = config_utils.Config()
230
230
  return project_config
@@ -252,8 +252,7 @@ def get_server_config() -> config_utils.Config:
252
252
 
253
253
  # load the server config file
254
254
  if os.path.exists(server_config_path):
255
- server_config = parse_config_file(server_config_path)
256
- _validate_config(server_config, server_config_path)
255
+ server_config = parse_and_validate_config_file(server_config_path)
257
256
  else:
258
257
  server_config = config_utils.Config()
259
258
  return server_config
@@ -287,6 +286,60 @@ def get_nested(keys: Tuple[str, ...],
287
286
  disallowed_override_keys=None)
288
287
 
289
288
 
289
+ def get_workspace_cloud(cloud: str,
290
+ workspace: Optional[str] = None) -> config_utils.Config:
291
+ """Returns the workspace config."""
292
+ if workspace is None:
293
+ workspace = get_active_workspace()
294
+ clouds = get_nested(keys=(
295
+ 'workspaces',
296
+ workspace,
297
+ ), default_value=None)
298
+ if clouds is None:
299
+ return config_utils.Config()
300
+ return clouds.get(cloud.lower(), config_utils.Config())
301
+
302
+
303
+ @contextlib.contextmanager
304
+ def local_active_workspace_ctx(workspace: str) -> Iterator[None]:
305
+ """Temporarily set the active workspace IN CURRENT THREAD.
306
+
307
+ Note: having this function thread-local is error-prone, as wrapping some
308
+ operations with this will not have the underlying threads to get the
309
+ correct active workspace. However, we cannot make it global either, as
310
+ backend_utils.refresh_cluster_status() will be called in multiple threads,
311
+ and they may have different active workspaces for different threads.
312
+
313
+ # TODO(zhwu): make this function global by default and able to be set
314
+ # it to thread-local with an argument.
315
+
316
+ Args:
317
+ workspace: The workspace to set as active.
318
+
319
+ Raises:
320
+ RuntimeError: If called from a non-main thread.
321
+ """
322
+ original_workspace = get_active_workspace()
323
+ if original_workspace == workspace:
324
+ # No change, do nothing.
325
+ yield
326
+ return
327
+ _active_workspace_context.workspace = workspace
328
+ logger.debug(f'Set context workspace: {workspace}')
329
+ yield
330
+ logger.debug(f'Reset context workspace: {original_workspace}')
331
+ _active_workspace_context.workspace = original_workspace
332
+
333
+
334
+ def get_active_workspace(force_user_workspace: bool = False) -> str:
335
+ context_workspace = getattr(_active_workspace_context, 'workspace', None)
336
+ if not force_user_workspace and context_workspace is not None:
337
+ logger.debug(f'Get context workspace: {context_workspace}')
338
+ return context_workspace
339
+ return get_nested(keys=('active_workspace',),
340
+ default_value=constants.SKYPILOT_DEFAULT_WORKSPACE)
341
+
342
+
290
343
  def set_nested(keys: Tuple[str, ...], value: Any) -> Dict[str, Any]:
291
344
  """Returns a deep-copied config with the nested key set to value.
292
345
 
@@ -357,7 +410,7 @@ def _reload_config() -> None:
357
410
  _reload_config_as_client()
358
411
 
359
412
 
360
- def parse_config_file(config_path: str) -> config_utils.Config:
413
+ def parse_and_validate_config_file(config_path: str) -> config_utils.Config:
361
414
  config = config_utils.Config()
362
415
  try:
363
416
  config_dict = common_utils.read_yaml(config_path)
@@ -413,7 +466,7 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
413
466
  'exist. Please double check the path or unset the env var: '
414
467
  f'unset {ENV_VAR_SKYPILOT_CONFIG}')
415
468
  logger.debug(f'Using config path: {config_path}')
416
- _set_loaded_config(parse_config_file(config_path))
469
+ _set_loaded_config(parse_and_validate_config_file(config_path))
417
470
  _set_loaded_config_path(config_path)
418
471
 
419
472
 
@@ -512,6 +565,19 @@ def override_skypilot_config(
512
565
  override_configs=dict(override_configs),
513
566
  allowed_override_keys=None,
514
567
  disallowed_override_keys=constants.SKIPPED_CLIENT_OVERRIDE_KEYS)
568
+ workspace = config.get_nested(
569
+ keys=('active_workspace',),
570
+ default_value=constants.SKYPILOT_DEFAULT_WORKSPACE)
571
+ if (workspace != constants.SKYPILOT_DEFAULT_WORKSPACE and workspace
572
+ not in get_nested(keys=('workspaces',), default_value={})):
573
+ raise ValueError(f'Workspace {workspace} does not exist. '
574
+ 'Use `sky check` to see if it is defined on the API '
575
+ 'server and try again.')
576
+ # Initialize the active workspace context to the workspace specified, so
577
+ # that a new request is not affected by the previous request's workspace.
578
+ global _active_workspace_context
579
+ _active_workspace_context = threading.local()
580
+
515
581
  try:
516
582
  common_utils.validate_schema(
517
583
  config,
@@ -592,7 +658,7 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
592
658
  'Cannot use multiple --config flags with a config file.')
593
659
  config_source = maybe_config_path
594
660
  # cli_config is a path to a config file
595
- parsed_config = parse_config_file(maybe_config_path)
661
+ parsed_config = parse_and_validate_config_file(maybe_config_path)
596
662
  else: # cli_config is a comma-separated list of key-value pairs
597
663
  parsed_config = _parse_dotlist(cli_config)
598
664
  _validate_config(parsed_config, config_source)
@@ -623,3 +689,11 @@ def apply_cli_config(cli_config: Optional[List[str]]) -> Dict[str, Any]:
623
689
  overlay_skypilot_config(original_config=_get_loaded_config(),
624
690
  override_configs=parsed_config))
625
691
  return parsed_config
692
+
693
+
694
+ def get_workspaces() -> Dict[str, Any]:
695
+ """Returns the workspace config."""
696
+ workspaces = get_nested(('workspaces',), default_value={})
697
+ if constants.SKYPILOT_DEFAULT_WORKSPACE not in workspaces:
698
+ workspaces[constants.SKYPILOT_DEFAULT_WORKSPACE] = {}
699
+ return workspaces
@@ -46,6 +46,13 @@ available_node_types:
46
46
  InstanceType: {{instance_type}}
47
47
  ImageId: {{image_id}}
48
48
  DiskSize: {{disk_size}}
49
+ filesystems:
50
+ {%- for fs in filesystems %}
51
+ - filesystem_id: {{ fs.filesystem_id }}
52
+ filesystem_mount_tag: {{ fs.filesystem_mount_tag }}
53
+ filesystem_attach_mode: {{ fs.filesystem_attach_mode }}
54
+ filesystem_mount_path: {{ fs.filesystem_mount_path }}
55
+ {%- endfor %}
49
56
  UserData: |
50
57
  runcmd:
51
58
  - sudo sed -i 's/^#\?AllowTcpForwarding.*/AllowTcpForwarding yes/' /etc/ssh/sshd_config
@@ -130,6 +137,11 @@ setup_commands:
130
137
  - {%- for initial_setup_command in initial_setup_commands %}
131
138
  {{ initial_setup_command }}
132
139
  {%- endfor %}
140
+ {%- for fs in filesystems %}
141
+ sudo mkdir {{ fs.filesystem_mount_path }};
142
+ sudo mount -t virtiofs {{ fs.filesystem_mount_tag }} {{ fs.filesystem_mount_path }};
143
+ sudo chmod a+w {{ fs.filesystem_mount_path }};
144
+ {%- endfor %}
133
145
  sudo systemctl stop unattended-upgrades || true;
134
146
  sudo systemctl disable unattended-upgrades || true;
135
147
  sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
@@ -48,7 +48,8 @@ class StatusColumn:
48
48
  def show_status_table(cluster_records: List[_ClusterRecord],
49
49
  show_all: bool,
50
50
  show_user: bool,
51
- query_clusters: Optional[List[str]] = None) -> int:
51
+ query_clusters: Optional[List[str]] = None,
52
+ show_workspaces: bool = False) -> int:
52
53
  """Compute cluster table values and display.
53
54
 
54
55
  Returns:
@@ -56,7 +57,6 @@ def show_status_table(cluster_records: List[_ClusterRecord],
56
57
  STOPPED.
57
58
  """
58
59
  # TODO(zhwu): Update the information for autostop clusters.
59
-
60
60
  status_columns = [
61
61
  StatusColumn('NAME', _get_name),
62
62
  ]
@@ -66,6 +66,9 @@ def show_status_table(cluster_records: List[_ClusterRecord],
66
66
  StatusColumn('USER_ID', _get_user_hash, show_by_default=False))
67
67
 
68
68
  status_columns += [
69
+ StatusColumn('WORKSPACE',
70
+ _get_workspace,
71
+ show_by_default=show_workspaces),
69
72
  StatusColumn('INFRA', _get_infra, truncate=not show_all),
70
73
  StatusColumn('RESOURCES', _get_resources, truncate=not show_all),
71
74
  StatusColumn('STATUS', _get_status_colored),
@@ -106,12 +109,13 @@ def show_status_table(cluster_records: List[_ClusterRecord],
106
109
  for cluster in query_clusters
107
110
  if cluster not in cluster_names
108
111
  ]
109
- cluster_str = 'Cluster'
110
- if len(not_found_clusters) > 1:
111
- cluster_str += 's'
112
- cluster_str += ' '
113
- cluster_str += ', '.join(not_found_clusters)
114
- click.echo(f'{cluster_str} not found.')
112
+ if not_found_clusters:
113
+ cluster_str = 'Cluster'
114
+ if len(not_found_clusters) > 1:
115
+ cluster_str += 's'
116
+ cluster_str += ' '
117
+ cluster_str += ', '.join(not_found_clusters)
118
+ click.echo(f'{cluster_str} not found.')
115
119
  elif not cluster_records:
116
120
  click.echo('No existing clusters.')
117
121
  return num_pending_autostop
@@ -243,6 +247,12 @@ def _get_status(cluster_record: _ClusterRecord,
243
247
  return cluster_record['status']
244
248
 
245
249
 
250
+ def _get_workspace(cluster_record: _ClusterRecord,
251
+ truncate: bool = True) -> str:
252
+ del truncate
253
+ return cluster_record['workspace']
254
+
255
+
246
256
  def _get_status_colored(cluster_record: _ClusterRecord,
247
257
  truncate: bool = True) -> str:
248
258
  del truncate
sky/utils/infra_utils.py CHANGED
@@ -86,6 +86,16 @@ class InfraInfo:
86
86
  cloud_name = 'kubernetes' # Normalize k8s to kubernetes
87
87
  region = '/'.join(parts[1:]) if len(parts) >= 2 else None
88
88
  zone = None
89
+ elif cloud_name == 'ssh':
90
+ # For SSH, the entire string after "ssh/" is the
91
+ # node pool name. We prepend 'ssh-' for the internal implementation
92
+ # which reuses the context name.
93
+ # TODO(romilb): This is a workaround while we use the global
94
+ # kubeconfig to store the ssh contexts.
95
+ region = '/'.join(parts[1:]) if len(parts) >= 2 else None
96
+ if region:
97
+ region = f'ssh-{region}'
98
+ zone = None
89
99
  else:
90
100
  # For non-Kubernetes clouds, continue with regular parsing
91
101
  # but be careful to only split into max 3 parts
@@ -133,6 +143,12 @@ class InfraInfo:
133
143
  if zone is None:
134
144
  zone = '*'
135
145
 
146
+ # If the cloud is ssh, we remove the ssh- prefix from the region
147
+ # TODO(romilb): This is a workaround while we use the global
148
+ # kubeconfig to store the ssh contexts.
149
+ if region and region.startswith('ssh-'):
150
+ region = region[4:]
151
+
136
152
  # Build the parts list and filter out trailing wildcards
137
153
  parts = [cloud.lower(), region, zone]
138
154
  while parts and parts[-1] == '*':
@@ -160,7 +176,11 @@ class InfraInfo:
160
176
  if self.zone is not None and self.zone != '*':
161
177
  region_or_zone = self.zone
162
178
  elif self.region is not None and self.region != '*':
163
- region_or_zone = self.region
179
+ # If using region, we remove the ssh- prefix if it exists for SSH
180
+ # Node Pools.
181
+ # TODO(romilb): This is a workaround while we use the global
182
+ # kubeconfig to store the ssh contexts.
183
+ region_or_zone = self.region.lstrip('ssh-')
164
184
 
165
185
  if region_or_zone is not None and truncate:
166
186
  region_or_zone = common_utils.truncate_long_string(
@@ -0,0 +1,62 @@
1
+ #!/bin/bash
2
+ # cleanup-tunnel.sh - Script to clean up SSH tunnels for a Kubernetes context
3
+
4
+ # Usage: cleanup-tunnel.sh CONTEXT_NAME
5
+
6
+ CONTEXT="${1:-default}"
7
+ TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
8
+ PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
9
+ LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
10
+ LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"
11
+
12
+ # Get the port from kubeconfig if available
13
+ KUBE_PORT=$(kubectl config view --minify --context="$CONTEXT" -o jsonpath='{.clusters[0].cluster.server}' 2>/dev/null | grep -o ":[0-9]\+" | tr -d ":" || echo "")
14
+
15
+ if [[ -z "$KUBE_PORT" ]]; then
16
+ # Default to 6443 if we can't determine the port
17
+ KUBE_PORT=6443
18
+ echo "$(date): Could not determine port from kubeconfig, using default port $KUBE_PORT" >> "$LOG_FILE"
19
+ else
20
+ echo "$(date): Found port $KUBE_PORT in kubeconfig for context $CONTEXT" >> "$LOG_FILE"
21
+ fi
22
+
23
+ # Check if PID file exists
24
+ if [[ -f "$PID_FILE" ]]; then
25
+ OLD_PID=$(cat "$PID_FILE")
26
+
27
+ # Log the cleanup attempt
28
+ echo "$(date): Attempting to clean up tunnel for context $CONTEXT (PID: $OLD_PID, Port: $KUBE_PORT)" >> "$LOG_FILE"
29
+
30
+ # Try to kill the process
31
+ if kill -0 "$OLD_PID" 2>/dev/null; then
32
+ # Process exists, kill it
33
+ kill "$OLD_PID" 2>/dev/null
34
+
35
+ # Wait a moment and check if it's really gone
36
+ sleep 1
37
+ if kill -0 "$OLD_PID" 2>/dev/null; then
38
+ # Still running, force kill
39
+ kill -9 "$OLD_PID" 2>/dev/null
40
+ echo "$(date): Forcefully terminated tunnel process $OLD_PID" >> "$LOG_FILE"
41
+ else
42
+ echo "$(date): Successfully terminated tunnel process $OLD_PID" >> "$LOG_FILE"
43
+ fi
44
+ else
45
+ echo "$(date): No running process found with PID $OLD_PID" >> "$LOG_FILE"
46
+ fi
47
+
48
+ # Remove PID file
49
+ rm -f "$PID_FILE"
50
+ else
51
+ echo "$(date): No PID file found for context $CONTEXT. Nothing to clean up." >> "$LOG_FILE"
52
+ fi
53
+
54
+ # Clean up lock file if it exists
55
+ rm -f "$LOCK_FILE"
56
+
57
+ # Check if port is still in use
58
+ if nc -z localhost "$KUBE_PORT" 2>/dev/null; then
59
+ echo "$(date): Warning: Port $KUBE_PORT is still in use after cleanup. Another process might be using it." >> "$LOG_FILE"
60
+ fi
61
+
62
+ echo "$(date): Cleanup complete for context $CONTEXT" >> "$LOG_FILE"
@@ -85,6 +85,7 @@ fi
85
85
  if kind get clusters | grep -q skypilot; then
86
86
  echo "Local cluster already exists. Exiting."
87
87
  # Switch context to the local cluster
88
+ kind export kubeconfig --name skypilot
88
89
  kubectl config use-context kind-skypilot
89
90
  exit 100
90
91
  fi