skypilot-nightly 1.0.0.dev20250522__py3-none-any.whl → 1.0.0.dev20250524__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +46 -16
- sky/backends/backend_utils.py +62 -45
- sky/backends/cloud_vm_ray_backend.py +19 -5
- sky/check.py +398 -171
- sky/cli.py +302 -98
- sky/client/cli.py +302 -98
- sky/client/sdk.py +104 -12
- sky/clouds/__init__.py +3 -0
- sky/clouds/aws.py +4 -2
- sky/clouds/azure.py +4 -2
- sky/clouds/cloud.py +24 -6
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +2 -1
- sky/clouds/gcp.py +23 -5
- sky/clouds/ibm.py +4 -2
- sky/clouds/kubernetes.py +66 -22
- sky/clouds/lambda_cloud.py +2 -1
- sky/clouds/nebius.py +18 -2
- sky/clouds/oci.py +4 -2
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +2 -1
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/ssh_catalog.py +167 -0
- sky/clouds/ssh.py +203 -0
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +58 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +1 -0
- sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
- sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
- sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
- sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +6 -0
- sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +1 -0
- sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +1 -0
- sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-e690d864aa00e2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-db6558a5ec687011.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2d319455c3f1c3e2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
- sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/storage.py +1 -1
- sky/global_user_state.py +42 -19
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +72 -56
- sky/jobs/state.py +26 -5
- sky/jobs/utils.py +65 -13
- sky/optimizer.py +29 -7
- sky/provision/__init__.py +1 -0
- sky/provision/aws/instance.py +17 -1
- sky/provision/fluidstack/instance.py +1 -0
- sky/provision/kubernetes/instance.py +16 -5
- sky/provision/kubernetes/utils.py +37 -19
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +14 -2
- sky/provision/ssh/__init__.py +18 -0
- sky/resources.py +4 -1
- sky/serve/server/core.py +9 -6
- sky/server/html/token_page.html +6 -1
- sky/server/requests/executor.py +1 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/server.py +108 -5
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/constants.py +4 -1
- sky/skypilot_config.py +83 -9
- sky/templates/nebius-ray.yml.j2 +12 -0
- sky/utils/cli_utils/status_utils.py +18 -8
- sky/utils/infra_utils.py +21 -1
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/create_cluster.sh +1 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +1440 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
- sky/utils/kubernetes/ssh-tunnel.sh +387 -0
- sky/utils/log_utils.py +218 -1
- sky/utils/schemas.py +75 -0
- sky/utils/ux_utils.py +2 -1
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/METADATA +6 -1
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/RECORD +103 -91
- sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
- sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
- sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
- sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
- sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- /sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → aHej19bZyl4hoHgrzPCn7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/top_level.txt +0 -0
sky/server/server.py
CHANGED
@@ -6,6 +6,7 @@ import base64
|
|
6
6
|
import contextlib
|
7
7
|
import dataclasses
|
8
8
|
import datetime
|
9
|
+
import hashlib
|
9
10
|
import json
|
10
11
|
import logging
|
11
12
|
import multiprocessing
|
@@ -31,7 +32,9 @@ from sky import core
|
|
31
32
|
from sky import exceptions
|
32
33
|
from sky import execution
|
33
34
|
from sky import global_user_state
|
35
|
+
from sky import models
|
34
36
|
from sky import sky_logging
|
37
|
+
from sky import skypilot_config
|
35
38
|
from sky.clouds import service_catalog
|
36
39
|
from sky.data import storage_utils
|
37
40
|
from sky.jobs.server import server as jobs_rest
|
@@ -110,6 +113,38 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
110
113
|
return response
|
111
114
|
|
112
115
|
|
116
|
+
def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
    """Build a user from the `X-Auth-Request-Email` request header.

    Args:
        request: The incoming request whose headers are inspected.

    Returns:
        None when the header is absent. Otherwise a `models.User` whose name
        is the header value and whose id is the MD5 hex digest of that value,
        truncated to `common_utils.USER_HASH_LENGTH` characters.
    """
    headers = request.headers
    if 'X-Auth-Request-Email' in headers:
        email = headers['X-Auth-Request-Email']
        digest = hashlib.md5(email.encode()).hexdigest()
        return models.User(id=digest[:common_utils.USER_HASH_LENGTH],
                           name=email)
    return None
|
123
|
+
|
124
|
+
|
125
|
+
class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
    """Middleware to handle auth proxy.

    When the request carries the auth header read by
    `_get_auth_user_header` and its body is a JSON object with an `env_vars`
    mapping, the user id and user name entries of `env_vars` are overwritten
    with the authenticated user before the request is passed on. Any other
    request is forwarded unchanged.
    """

    async def dispatch(self, request: fastapi.Request, call_next):
        """Override the user env vars in the body, then call the next handler.

        Args:
            request: The incoming request.
            call_next: Callable that forwards the request down the stack.

        Returns:
            The response produced by `call_next`.
        """
        auth_user = _get_auth_user_header(request)
        body = await request.body()
        if auth_user and body:
            try:
                original_json = await request.json()
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                # A body that is not JSON -- including a binary payload that
                # cannot even be decoded as text -- is forwarded untouched
                # instead of failing the whole request.
                logger.error(f'Error parsing request JSON: {e}')
            else:
                logger.debug(f'Overriding user for {request.state.request_id}: '
                             f'{auth_user.name}, {auth_user.id}')
                # Valid JSON is not necessarily an object, and `env_vars` is
                # not necessarily a mapping; only patch when both hold.
                env_vars = (original_json.get('env_vars') if isinstance(
                    original_json, dict) else None)
                if isinstance(env_vars, dict):
                    env_vars[constants.USER_ID_ENV_VAR] = auth_user.id
                    env_vars[constants.USER_ENV_VAR] = auth_user.name
                    # Re-serialize only when something was changed, so an
                    # unmodified body keeps its original bytes.
                    request._body = json.dumps(original_json).encode('utf-8')  # pylint: disable=protected-access
        return await call_next(request)
|
146
|
+
|
147
|
+
|
113
148
|
# Default expiration time for upload ids before cleanup.
|
114
149
|
_DEFAULT_UPLOAD_EXPIRATION_TIME = datetime.timedelta(hours=1)
|
115
150
|
# Key: (upload_id, user_hash), Value: the time when the upload id needs to be
|
@@ -216,6 +251,7 @@ app.add_middleware(
|
|
216
251
|
allow_headers=['*'],
|
217
252
|
# TODO(syang): remove X-Request-ID when v0.10.0 is released.
|
218
253
|
expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
|
254
|
+
app.add_middleware(AuthProxyMiddleware)
|
219
255
|
app.add_middleware(RequestIDMiddleware)
|
220
256
|
app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
|
221
257
|
app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
|
@@ -223,8 +259,18 @@ app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
|
|
223
259
|
|
224
260
|
@app.get('/token')
|
225
261
|
async def token(request: fastapi.Request) -> fastapi.responses.HTMLResponse:
|
262
|
+
# If we have auth info, save this user to the database.
|
263
|
+
user = _get_auth_user_header(request)
|
264
|
+
if user is not None:
|
265
|
+
global_user_state.add_or_update_user(user)
|
266
|
+
|
267
|
+
token_data = {
|
268
|
+
'v': 1, # Token version number, bump for backwards incompatible.
|
269
|
+
'user': user.id if user is not None else None,
|
270
|
+
'cookies': request.cookies,
|
271
|
+
}
|
226
272
|
# Use base64 encoding to avoid having to escape anything in the HTML.
|
227
|
-
json_bytes = json.dumps(
|
273
|
+
json_bytes = json.dumps(token_data).encode('utf-8')
|
228
274
|
base64_str = base64.b64encode(json_bytes).decode('utf-8')
|
229
275
|
|
230
276
|
html_dir = pathlib.Path(__file__).parent / 'html'
|
@@ -236,8 +282,10 @@ async def token(request: fastapi.Request) -> fastapi.responses.HTMLResponse:
|
|
236
282
|
raise fastapi.HTTPException(
|
237
283
|
status_code=500, detail='Token page template not found.') from e
|
238
284
|
|
285
|
+
user_info_string = f'Logged in as {user.name}' if user is not None else ''
|
239
286
|
html_content = html_content.replace(
|
240
|
-
'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER',
|
287
|
+
'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER',
|
288
|
+
base64_str).replace('USER_PLACEHOLDER', user_info_string)
|
241
289
|
|
242
290
|
return fastapi.responses.HTMLResponse(
|
243
291
|
content=html_content,
|
@@ -263,17 +311,30 @@ async def check(request: fastapi.Request,
|
|
263
311
|
|
264
312
|
|
265
313
|
@app.get('/enabled_clouds')
|
266
|
-
async def enabled_clouds(request: fastapi.Request
|
314
|
+
async def enabled_clouds(request: fastapi.Request,
|
315
|
+
workspace: Optional[str] = None) -> None:
|
267
316
|
"""Gets enabled clouds on the server."""
|
268
317
|
executor.schedule_request(
|
269
318
|
request_id=request.state.request_id,
|
270
319
|
request_name='enabled_clouds',
|
271
|
-
request_body=payloads.
|
320
|
+
request_body=payloads.EnabledCloudsBody(workspace=workspace),
|
272
321
|
func=core.enabled_clouds,
|
273
322
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
274
323
|
)
|
275
324
|
|
276
325
|
|
326
|
+
@app.get('/workspaces')
async def get_workspace_config(request: fastapi.Request) -> None:
    """Gets workspace config on the server.

    Schedules `skypilot_config.get_workspaces` as a SHORT request under the
    request id attached to `request.state`; the handler itself returns
    nothing.
    """
    request_id = request.state.request_id
    empty_body = payloads.RequestBody()
    executor.schedule_request(
        request_id=request_id,
        request_name='workspaces',
        request_body=empty_body,
        func=skypilot_config.get_workspaces,
        schedule_type=requests_lib.ScheduleType.SHORT,
    )
|
336
|
+
|
337
|
+
|
277
338
|
@app.post('/realtime_kubernetes_gpu_availability')
|
278
339
|
async def realtime_kubernetes_gpu_availability(
|
279
340
|
request: fastapi.Request,
|
@@ -909,6 +970,33 @@ async def local_down(request: fastapi.Request) -> None:
|
|
909
970
|
)
|
910
971
|
|
911
972
|
|
973
|
+
@app.post('/ssh_up')
async def ssh_up(request: fastapi.Request,
                 ssh_up_body: payloads.SSHUpBody) -> None:
    """Deploys a Kubernetes cluster on SSH targets.

    Schedules `core.ssh_up` as a LONG request, passing the posted body
    through unchanged; the handler itself returns nothing.
    """
    request_id = request.state.request_id
    executor.schedule_request(
        request_id=request_id,
        request_name='ssh_up',
        request_body=ssh_up_body,
        func=core.ssh_up,
        schedule_type=requests_lib.ScheduleType.LONG,
    )
|
984
|
+
|
985
|
+
|
986
|
+
@app.post('/ssh_down')
async def ssh_down(request: fastapi.Request,
                   ssh_up_body: payloads.SSHUpBody) -> None:
    """Tears down a Kubernetes cluster on SSH targets.

    Schedules a LONG request named 'ssh_down' that runs the same function as
    the deploy endpoint, `core.ssh_up`, with the posted body.
    """
    # Teardown reuses ssh_up and is meant to run it with cleanup=True.
    # NOTE(review): nothing in this handler sets cleanup; presumably the
    # client sends it in the body -- confirm against SSHUpBody and the SDK.
    request_id = request.state.request_id
    executor.schedule_request(
        request_id=request_id,
        request_name='ssh_down',
        request_body=ssh_up_body,
        func=core.ssh_up,
        schedule_type=requests_lib.ScheduleType.LONG,
    )
|
998
|
+
|
999
|
+
|
912
1000
|
# === API server related APIs ===
|
913
1001
|
@app.get('/api/get')
|
914
1002
|
async def api_get(request_id: str) -> requests_lib.RequestPayload:
|
@@ -1086,7 +1174,7 @@ async def api_status(
|
|
1086
1174
|
|
1087
1175
|
|
1088
1176
|
@app.get('/api/health')
|
1089
|
-
async def health() -> Dict[str,
|
1177
|
+
async def health(request: fastapi.Request) -> Dict[str, Any]:
|
1090
1178
|
"""Checks the health of the API server.
|
1091
1179
|
|
1092
1180
|
Returns:
|
@@ -1098,12 +1186,14 @@ async def health() -> Dict[str, str]:
|
|
1098
1186
|
disk, which can be used to warn about restarting the API server
|
1099
1187
|
- commit: str; The commit hash of SkyPilot used for API server.
|
1100
1188
|
"""
|
1189
|
+
user = _get_auth_user_header(request)
|
1101
1190
|
return {
|
1102
1191
|
'status': common.ApiServerStatus.HEALTHY.value,
|
1103
1192
|
'api_version': server_constants.API_VERSION,
|
1104
1193
|
'version': sky.__version__,
|
1105
1194
|
'version_on_disk': common.get_skypilot_version_on_disk(),
|
1106
1195
|
'commit': sky.__commit__,
|
1196
|
+
'user': user.to_dict() if user is not None else None,
|
1107
1197
|
}
|
1108
1198
|
|
1109
1199
|
|
@@ -1185,6 +1275,19 @@ async def kubernetes_pod_ssh_proxy(
|
|
1185
1275
|
proc.terminate()
|
1186
1276
|
|
1187
1277
|
|
1278
|
+
@app.get('/all_contexts')
async def all_contexts(request: fastapi.Request) -> None:
    """Gets all Kubernetes and SSH node pool contexts.

    Schedules `core.get_all_contexts` as a SHORT request with an empty
    request body; the handler itself returns nothing.
    """
    request_id = request.state.request_id
    empty_body = payloads.RequestBody()
    executor.schedule_request(
        request_id=request_id,
        request_name='all_contexts',
        request_body=empty_body,
        func=core.get_all_contexts,
        schedule_type=requests_lib.ScheduleType.SHORT,
    )
|
1289
|
+
|
1290
|
+
|
1188
1291
|
# === Internal APIs ===
|
1189
1292
|
@app.get('/api/completion/cluster_name')
|
1190
1293
|
async def complete_cluster_name(incomplete: str,) -> List[str]:
|
sky/setup_files/dependencies.py
CHANGED
@@ -130,6 +130,7 @@ extras_require: Dict[str, List[str]] = {
|
|
130
130
|
'oci': ['oci'] + local_ray,
|
131
131
|
# Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
|
132
132
|
'kubernetes': ['kubernetes>=20.0.0,!=32.0.0', 'websockets'],
|
133
|
+
'ssh': ['kubernetes>=20.0.0,!=32.0.0', 'websockets'],
|
133
134
|
'remote': remote,
|
134
135
|
# For the container registry auth api. Reference:
|
135
136
|
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
|
sky/skylet/constants.py
CHANGED
@@ -378,7 +378,8 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
|
378
378
|
# we skip the following keys because they are meant to be client-side configs.
|
379
379
|
SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('admin_policy',),
|
380
380
|
('api_server',),
|
381
|
-
('allowed_clouds',)
|
381
|
+
('allowed_clouds',),
|
382
|
+
('workspaces',)]
|
382
383
|
|
383
384
|
# Constants for Azure blob storage
|
384
385
|
WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
|
@@ -405,3 +406,5 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
|
|
405
406
|
|
406
407
|
# Environment variable that is set to 'true' if this is a skypilot server.
|
407
408
|
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
409
|
+
|
410
|
+
SKYPILOT_DEFAULT_WORKSPACE = 'default'
|
sky/skypilot_config.py
CHANGED
@@ -123,6 +123,8 @@ class ConfigContext:
|
|
123
123
|
_global_config_context = ConfigContext()
|
124
124
|
_reload_config_lock = threading.Lock()
|
125
125
|
|
126
|
+
_active_workspace_context = threading.local()
|
127
|
+
|
126
128
|
|
127
129
|
def _get_config_context() -> ConfigContext:
|
128
130
|
"""Get config context for current context.
|
@@ -194,8 +196,7 @@ def get_user_config() -> config_utils.Config:
|
|
194
196
|
|
195
197
|
# load the user config file
|
196
198
|
if os.path.exists(user_config_path):
|
197
|
-
user_config =
|
198
|
-
_validate_config(user_config, user_config_path)
|
199
|
+
user_config = parse_and_validate_config_file(user_config_path)
|
199
200
|
else:
|
200
201
|
user_config = config_utils.Config()
|
201
202
|
return user_config
|
@@ -223,8 +224,7 @@ def _get_project_config() -> config_utils.Config:
|
|
223
224
|
|
224
225
|
# load the project config file
|
225
226
|
if os.path.exists(project_config_path):
|
226
|
-
project_config =
|
227
|
-
_validate_config(project_config, project_config_path)
|
227
|
+
project_config = parse_and_validate_config_file(project_config_path)
|
228
228
|
else:
|
229
229
|
project_config = config_utils.Config()
|
230
230
|
return project_config
|
@@ -252,8 +252,7 @@ def get_server_config() -> config_utils.Config:
|
|
252
252
|
|
253
253
|
# load the server config file
|
254
254
|
if os.path.exists(server_config_path):
|
255
|
-
server_config =
|
256
|
-
_validate_config(server_config, server_config_path)
|
255
|
+
server_config = parse_and_validate_config_file(server_config_path)
|
257
256
|
else:
|
258
257
|
server_config = config_utils.Config()
|
259
258
|
return server_config
|
@@ -287,6 +286,60 @@ def get_nested(keys: Tuple[str, ...],
|
|
287
286
|
disallowed_override_keys=None)
|
288
287
|
|
289
288
|
|
289
|
+
def get_workspace_cloud(cloud: str,
                        workspace: Optional[str] = None) -> config_utils.Config:
    """Returns the config of one cloud within a workspace.

    Args:
        cloud: Cloud name; it is lower-cased before the lookup.
        workspace: Workspace to read from. When None, the workspace returned
            by `get_active_workspace()` is used.

    Returns:
        The entry stored under `workspaces.<workspace>.<cloud>`, or an empty
        `config_utils.Config` when the workspace or the cloud has no entry.
    """
    target_workspace = (get_active_workspace()
                        if workspace is None else workspace)
    workspace_config = get_nested(keys=('workspaces', target_workspace),
                                  default_value=None)
    if workspace_config is not None:
        return workspace_config.get(cloud.lower(), config_utils.Config())
    return config_utils.Config()
|
301
|
+
|
302
|
+
|
303
|
+
@contextlib.contextmanager
def local_active_workspace_ctx(workspace: str) -> Iterator[None]:
    """Temporarily set the active workspace IN CURRENT THREAD.

    Note: having this function thread-local is error-prone, as wrapping some
    operations with this will not have the underlying threads to get the
    correct active workspace. However, we cannot make it global either, as
    backend_utils.refresh_cluster_status() will be called in multiple threads,
    and they may have different active workspaces for different threads.

    # TODO(zhwu): make this function global by default and able to be set
    # it to thread-local with an argument.

    The previous workspace is restored when the `with` block exits, whether
    it exits normally or by raising. This function raises nothing itself.

    Args:
        workspace: The workspace to set as active.
    """
    original_workspace = get_active_workspace()
    if original_workspace == workspace:
        # No change, do nothing.
        yield
        return
    _active_workspace_context.workspace = workspace
    logger.debug(f'Set context workspace: {workspace}')
    try:
        yield
    finally:
        # Without this, an exception in the wrapped block would leave the
        # thread pinned to `workspace` for everything it runs afterwards.
        logger.debug(f'Reset context workspace: {original_workspace}')
        _active_workspace_context.workspace = original_workspace
|
332
|
+
|
333
|
+
|
334
|
+
def get_active_workspace(force_user_workspace: bool = False) -> str:
    """Returns the name of the active workspace.

    Args:
        force_user_workspace: When True, ignore any workspace set on the
            thread-local context and read the `active_workspace` config key.

    Returns:
        The thread-local context workspace when one is set and
        `force_user_workspace` is False; otherwise the `active_workspace`
        config value, defaulting to `constants.SKYPILOT_DEFAULT_WORKSPACE`.
    """
    context_workspace = getattr(_active_workspace_context, 'workspace', None)
    if force_user_workspace or context_workspace is None:
        return get_nested(keys=('active_workspace',),
                          default_value=constants.SKYPILOT_DEFAULT_WORKSPACE)
    logger.debug(f'Get context workspace: {context_workspace}')
    return context_workspace
|
341
|
+
|
342
|
+
|
290
343
|
def set_nested(keys: Tuple[str, ...], value: Any) -> Dict[str, Any]:
|
291
344
|
"""Returns a deep-copied config with the nested key set to value.
|
292
345
|
|
@@ -357,7 +410,7 @@ def _reload_config() -> None:
|
|
357
410
|
_reload_config_as_client()
|
358
411
|
|
359
412
|
|
360
|
-
def
|
413
|
+
def parse_and_validate_config_file(config_path: str) -> config_utils.Config:
|
361
414
|
config = config_utils.Config()
|
362
415
|
try:
|
363
416
|
config_dict = common_utils.read_yaml(config_path)
|
@@ -413,7 +466,7 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
|
|
413
466
|
'exist. Please double check the path or unset the env var: '
|
414
467
|
f'unset {ENV_VAR_SKYPILOT_CONFIG}')
|
415
468
|
logger.debug(f'Using config path: {config_path}')
|
416
|
-
_set_loaded_config(
|
469
|
+
_set_loaded_config(parse_and_validate_config_file(config_path))
|
417
470
|
_set_loaded_config_path(config_path)
|
418
471
|
|
419
472
|
|
@@ -512,6 +565,19 @@ def override_skypilot_config(
|
|
512
565
|
override_configs=dict(override_configs),
|
513
566
|
allowed_override_keys=None,
|
514
567
|
disallowed_override_keys=constants.SKIPPED_CLIENT_OVERRIDE_KEYS)
|
568
|
+
workspace = config.get_nested(
|
569
|
+
keys=('active_workspace',),
|
570
|
+
default_value=constants.SKYPILOT_DEFAULT_WORKSPACE)
|
571
|
+
if (workspace != constants.SKYPILOT_DEFAULT_WORKSPACE and workspace
|
572
|
+
not in get_nested(keys=('workspaces',), default_value={})):
|
573
|
+
raise ValueError(f'Workspace {workspace} does not exist. '
|
574
|
+
'Use `sky check` to see if it is defined on the API '
|
575
|
+
'server and try again.')
|
576
|
+
# Initialize the active workspace context to the workspace specified, so
|
577
|
+
# that a new request is not affected by the previous request's workspace.
|
578
|
+
global _active_workspace_context
|
579
|
+
_active_workspace_context = threading.local()
|
580
|
+
|
515
581
|
try:
|
516
582
|
common_utils.validate_schema(
|
517
583
|
config,
|
@@ -592,7 +658,7 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
|
|
592
658
|
'Cannot use multiple --config flags with a config file.')
|
593
659
|
config_source = maybe_config_path
|
594
660
|
# cli_config is a path to a config file
|
595
|
-
parsed_config =
|
661
|
+
parsed_config = parse_and_validate_config_file(maybe_config_path)
|
596
662
|
else: # cli_config is a comma-separated list of key-value pairs
|
597
663
|
parsed_config = _parse_dotlist(cli_config)
|
598
664
|
_validate_config(parsed_config, config_source)
|
@@ -623,3 +689,11 @@ def apply_cli_config(cli_config: Optional[List[str]]) -> Dict[str, Any]:
|
|
623
689
|
overlay_skypilot_config(original_config=_get_loaded_config(),
|
624
690
|
override_configs=parsed_config))
|
625
691
|
return parsed_config
|
692
|
+
|
693
|
+
|
694
|
+
def get_workspaces() -> Dict[str, Any]:
    """Returns the workspace config.

    Returns:
        A new dict holding the entries of the `workspaces` config key, plus
        an empty entry for `constants.SKYPILOT_DEFAULT_WORKSPACE` when the
        config does not define one.
    """
    # Copy before adding the default entry so the object handed back by
    # get_nested is never modified (it may be part of the loaded config --
    # not verifiable from here).
    workspaces = dict(get_nested(('workspaces',), default_value={}))
    workspaces.setdefault(constants.SKYPILOT_DEFAULT_WORKSPACE, {})
    return workspaces
|
sky/templates/nebius-ray.yml.j2
CHANGED
@@ -46,6 +46,13 @@ available_node_types:
|
|
46
46
|
InstanceType: {{instance_type}}
|
47
47
|
ImageId: {{image_id}}
|
48
48
|
DiskSize: {{disk_size}}
|
49
|
+
filesystems:
|
50
|
+
{%- for fs in filesystems %}
|
51
|
+
- filesystem_id: {{ fs.filesystem_id }}
|
52
|
+
filesystem_mount_tag: {{ fs.filesystem_mount_tag }}
|
53
|
+
filesystem_attach_mode: {{ fs.filesystem_attach_mode }}
|
54
|
+
filesystem_mount_path: {{ fs.filesystem_mount_path }}
|
55
|
+
{%- endfor %}
|
49
56
|
UserData: |
|
50
57
|
runcmd:
|
51
58
|
- sudo sed -i 's/^#\?AllowTcpForwarding.*/AllowTcpForwarding yes/' /etc/ssh/sshd_config
|
@@ -130,6 +137,11 @@ setup_commands:
|
|
130
137
|
- {%- for initial_setup_command in initial_setup_commands %}
|
131
138
|
{{ initial_setup_command }}
|
132
139
|
{%- endfor %}
|
140
|
+
{%- for fs in filesystems %}
|
141
|
+
sudo mkdir {{ fs.filesystem_mount_path }};
|
142
|
+
sudo mount -t virtiofs {{ fs.filesystem_mount_tag }} {{ fs.filesystem_mount_path }};
|
143
|
+
sudo chmod a+w {{ fs.filesystem_mount_path }};
|
144
|
+
{%- endfor %}
|
133
145
|
sudo systemctl stop unattended-upgrades || true;
|
134
146
|
sudo systemctl disable unattended-upgrades || true;
|
135
147
|
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
|
sky/utils/cli_utils/status_utils.py
CHANGED
@@ -48,7 +48,8 @@ class StatusColumn:
|
|
48
48
|
def show_status_table(cluster_records: List[_ClusterRecord],
|
49
49
|
show_all: bool,
|
50
50
|
show_user: bool,
|
51
|
-
query_clusters: Optional[List[str]] = None
|
51
|
+
query_clusters: Optional[List[str]] = None,
|
52
|
+
show_workspaces: bool = False) -> int:
|
52
53
|
"""Compute cluster table values and display.
|
53
54
|
|
54
55
|
Returns:
|
@@ -56,7 +57,6 @@ def show_status_table(cluster_records: List[_ClusterRecord],
|
|
56
57
|
STOPPED.
|
57
58
|
"""
|
58
59
|
# TODO(zhwu): Update the information for autostop clusters.
|
59
|
-
|
60
60
|
status_columns = [
|
61
61
|
StatusColumn('NAME', _get_name),
|
62
62
|
]
|
@@ -66,6 +66,9 @@ def show_status_table(cluster_records: List[_ClusterRecord],
|
|
66
66
|
StatusColumn('USER_ID', _get_user_hash, show_by_default=False))
|
67
67
|
|
68
68
|
status_columns += [
|
69
|
+
StatusColumn('WORKSPACE',
|
70
|
+
_get_workspace,
|
71
|
+
show_by_default=show_workspaces),
|
69
72
|
StatusColumn('INFRA', _get_infra, truncate=not show_all),
|
70
73
|
StatusColumn('RESOURCES', _get_resources, truncate=not show_all),
|
71
74
|
StatusColumn('STATUS', _get_status_colored),
|
@@ -106,12 +109,13 @@ def show_status_table(cluster_records: List[_ClusterRecord],
|
|
106
109
|
for cluster in query_clusters
|
107
110
|
if cluster not in cluster_names
|
108
111
|
]
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
112
|
+
if not_found_clusters:
|
113
|
+
cluster_str = 'Cluster'
|
114
|
+
if len(not_found_clusters) > 1:
|
115
|
+
cluster_str += 's'
|
116
|
+
cluster_str += ' '
|
117
|
+
cluster_str += ', '.join(not_found_clusters)
|
118
|
+
click.echo(f'{cluster_str} not found.')
|
115
119
|
elif not cluster_records:
|
116
120
|
click.echo('No existing clusters.')
|
117
121
|
return num_pending_autostop
|
@@ -243,6 +247,12 @@ def _get_status(cluster_record: _ClusterRecord,
|
|
243
247
|
return cluster_record['status']
|
244
248
|
|
245
249
|
|
250
|
+
def _get_workspace(cluster_record: _ClusterRecord,
                   truncate: bool = True) -> str:
    """Returns the 'workspace' field of a cluster record.

    Args:
        cluster_record: The record to read.
        truncate: Accepted but not used by this function.
    """
    del truncate
    workspace = cluster_record['workspace']
    return workspace
|
254
|
+
|
255
|
+
|
246
256
|
def _get_status_colored(cluster_record: _ClusterRecord,
|
247
257
|
truncate: bool = True) -> str:
|
248
258
|
del truncate
|
sky/utils/infra_utils.py
CHANGED
@@ -86,6 +86,16 @@ class InfraInfo:
|
|
86
86
|
cloud_name = 'kubernetes' # Normalize k8s to kubernetes
|
87
87
|
region = '/'.join(parts[1:]) if len(parts) >= 2 else None
|
88
88
|
zone = None
|
89
|
+
elif cloud_name == 'ssh':
|
90
|
+
# For SSH, the entire string after "ssh/" is the
|
91
|
+
# node pool name. We prepend 'ssh-' for the internal implementation
|
92
|
+
# which reuses the context name.
|
93
|
+
# TODO(romilb): This is a workaround while we use the global
|
94
|
+
# kubeconfig to store the ssh contexts.
|
95
|
+
region = '/'.join(parts[1:]) if len(parts) >= 2 else None
|
96
|
+
if region:
|
97
|
+
region = f'ssh-{region}'
|
98
|
+
zone = None
|
89
99
|
else:
|
90
100
|
# For non-Kubernetes clouds, continue with regular parsing
|
91
101
|
# but be careful to only split into max 3 parts
|
@@ -133,6 +143,12 @@ class InfraInfo:
|
|
133
143
|
if zone is None:
|
134
144
|
zone = '*'
|
135
145
|
|
146
|
+
# If the cloud is ssh, we remove the ssh- prefix from the region
|
147
|
+
# TODO(romilb): This is a workaround while we use the global
|
148
|
+
# kubeconfig to store the ssh contexts.
|
149
|
+
if region and region.startswith('ssh-'):
|
150
|
+
region = region[4:]
|
151
|
+
|
136
152
|
# Build the parts list and filter out trailing wildcards
|
137
153
|
parts = [cloud.lower(), region, zone]
|
138
154
|
while parts and parts[-1] == '*':
|
@@ -160,7 +176,11 @@ class InfraInfo:
|
|
160
176
|
if self.zone is not None and self.zone != '*':
|
161
177
|
region_or_zone = self.zone
|
162
178
|
elif self.region is not None and self.region != '*':
|
163
|
-
|
179
|
+
# If using region, we remove the ssh- prefix if it exists for SSH
|
180
|
+
# Node Pools.
|
181
|
+
# TODO(romilb): This is a workaround while we use the global
|
182
|
+
# kubeconfig to store the ssh contexts.
|
183
|
+
# str.lstrip('ssh-') strips any leading run of the characters 's', 'h' and
# '-', not the literal prefix (e.g. 'ssh-shared' would become 'ared'), so the
# prefix is removed explicitly instead.
region_or_zone = (self.region[len('ssh-'):]
                  if self.region.startswith('ssh-') else self.region)
|
164
184
|
|
165
185
|
if region_or_zone is not None and truncate:
|
166
186
|
region_or_zone = common_utils.truncate_long_string(
|
sky/utils/kubernetes/cleanup-tunnel.sh
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/bin/bash
# cleanup-tunnel.sh - Script to clean up SSH tunnels for a Kubernetes context
#
# Usage: cleanup-tunnel.sh CONTEXT_NAME
#
# Terminates the tunnel process recorded in the context's PID file (SIGTERM,
# then SIGKILL if it is still alive one second later), removes the PID and
# lock files, and appends a timestamped record of each step to the context's
# log file. CONTEXT_NAME defaults to "default".

CONTEXT="${1:-default}"
TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"

# Every step below appends to LOG_FILE; create its directory first so the
# redirections cannot fail when no tunnel has been set up on this machine yet.
mkdir -p "$TUNNEL_DIR"

# Append one timestamped line to the log file.
log() {
    echo "$(date): $*" >> "$LOG_FILE"
}

# Get the port from kubeconfig if available. Only the last ":<digits>" group
# is kept, so a server URL containing more than one such group (for example
# an IPv6 literal host) still yields a single port.
KUBE_PORT=$(kubectl config view --minify --context="$CONTEXT" -o jsonpath='{.clusters[0].cluster.server}' 2>/dev/null | grep -o ":[0-9]\+" | tail -n 1 | tr -d ":")

if [[ -z "$KUBE_PORT" ]]; then
    # Default to 6443 if we can't determine the port
    KUBE_PORT=6443
    log "Could not determine port from kubeconfig, using default port $KUBE_PORT"
else
    log "Found port $KUBE_PORT in kubeconfig for context $CONTEXT"
fi

# Check if PID file exists
if [[ -f "$PID_FILE" ]]; then
    OLD_PID=$(cat "$PID_FILE")

    # Log the cleanup attempt
    log "Attempting to clean up tunnel for context $CONTEXT (PID: $OLD_PID, Port: $KUBE_PORT)"

    # Only signal a strictly positive integer PID: `kill` treats 0 as "the
    # whole current process group", so the contents of a corrupt or empty
    # PID file must never reach it.
    if [[ "$OLD_PID" =~ ^[1-9][0-9]*$ ]] && kill -0 "$OLD_PID" 2>/dev/null; then
        # Process exists, kill it
        kill "$OLD_PID" 2>/dev/null

        # Wait a moment and check if it's really gone
        sleep 1
        if kill -0 "$OLD_PID" 2>/dev/null; then
            # Still running, force kill
            kill -9 "$OLD_PID" 2>/dev/null
            log "Forcefully terminated tunnel process $OLD_PID"
        else
            log "Successfully terminated tunnel process $OLD_PID"
        fi
    else
        log "No running process found with PID $OLD_PID"
    fi

    # Remove PID file
    rm -f "$PID_FILE"
else
    log "No PID file found for context $CONTEXT. Nothing to clean up."
fi

# Clean up lock file if it exists
rm -f "$LOCK_FILE"

# Check if port is still in use
if nc -z localhost "$KUBE_PORT" 2>/dev/null; then
    log "Warning: Port $KUBE_PORT is still in use after cleanup. Another process might be using it."
fi

log "Cleanup complete for context $CONTEXT"
|