skypilot-nightly 1.0.0.dev20250524__py3-none-any.whl → 1.0.0.dev20250527__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/check.py +32 -6
- sky/cli.py +17 -24
- sky/client/cli.py +17 -24
- sky/client/sdk.py +5 -2
- sky/clouds/cloud.py +2 -2
- sky/clouds/kubernetes.py +10 -5
- sky/clouds/service_catalog/kubernetes_catalog.py +4 -0
- sky/clouds/ssh.py +24 -8
- sky/core.py +20 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/D5bjIfl4Ob3SV3LJz3CO0/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-e220ba0c35bf089e.js +6 -0
- sky/dashboard/out/_next/static/chunks/{498-d7722313e5e5b4e6.js → 320-afea3ddcc5bd1c6c.js} +1 -16
- sky/dashboard/out/_next/static/chunks/470-1d784f5c8750744a.js +1 -0
- sky/dashboard/out/_next/static/chunks/578-24f35aa98d38d638.js +6 -0
- sky/dashboard/out/_next/static/chunks/627-31b701e69f52db0c.js +1 -0
- sky/dashboard/out/_next/static/chunks/843-e35d71cf1c7f706e.js +11 -0
- sky/dashboard/out/_next/static/chunks/990-f85643b521f7ca65.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-3985f074c163a856.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-339b59921ccfe266.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e23fcddf60578a0d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e6d1ec6e1ac5b29.js → clusters-8afda8efa5b74997.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-72b8c6c2edfd0e39.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-1521baab6992916b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4d913940b4fa6f5a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ff7e8e377d02b651.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-9900af52acf8648d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-63763ffa3edb4508.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-3ede7a13caf23375.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-72330c4d0fc9a4a2.js +1 -0
- sky/dashboard/out/_next/static/css/6a1c0d711a4bdaf1.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +592 -552
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +33 -3
- sky/server/requests/serializers/decoders.py +0 -11
- sky/server/server.py +23 -22
- sky/setup_files/dependencies.py +1 -0
- sky/skypilot_config.py +35 -9
- sky/utils/db_utils.py +53 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +20 -4
- sky/utils/kubernetes/exec_kubeconfig_converter.py +19 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +49 -5
- sky/utils/kubernetes/ssh-tunnel.sh +20 -28
- sky/utils/schemas.py +57 -5
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +431 -0
- sky/workspaces/server.py +87 -0
- {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250527.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250527.dist-info}/RECORD +69 -57
- {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250527.dist-info}/WHEEL +1 -1
- sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +0 -6
- sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/infra-e690d864aa00e2ea.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-db6558a5ec687011.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/users-2d319455c3f1c3e2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +0 -1
- sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +0 -3
- /sky/dashboard/out/_next/static/{aHej19bZyl4hoHgrzPCn7 → D5bjIfl4Ob3SV3LJz3CO0}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{573-f17bd89d9f9118b3.js → 573-82bd40a37af834f1.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250527.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250527.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250527.dist-info}/top_level.txt +0 -0
sky/server/constants.py
CHANGED
@@ -7,7 +7,7 @@ from sky.skylet import constants
|
|
7
7
|
# API server version, whenever there is a change in API server that requires a
|
8
8
|
# restart of the local API server or error out when the client does not match
|
9
9
|
# the server version.
|
10
|
-
API_VERSION = '
|
10
|
+
API_VERSION = '6'
|
11
11
|
|
12
12
|
# Prefix for API request names.
|
13
13
|
REQUEST_NAME_PREFIX = 'sky.'
|
sky/server/requests/payloads.py
CHANGED
@@ -72,6 +72,8 @@ def request_body_env_vars() -> dict:
|
|
72
72
|
|
73
73
|
def get_override_skypilot_config_from_client() -> Dict[str, Any]:
|
74
74
|
"""Returns the override configs from the client."""
|
75
|
+
if annotations.is_on_api_server:
|
76
|
+
return {}
|
75
77
|
config = skypilot_config.to_dict()
|
76
78
|
# Remove the API server config, as we should not specify the SkyPilot
|
77
79
|
# server endpoint on the server side. This avoids the warning at
|
@@ -134,6 +136,12 @@ class CheckBody(RequestBody):
|
|
134
136
|
workspace: Optional[str] = None
|
135
137
|
|
136
138
|
|
139
|
+
class EnabledCloudsBody(RequestBody):
|
140
|
+
"""The request body for the enabled clouds endpoint."""
|
141
|
+
workspace: Optional[str] = None
|
142
|
+
expand: bool = False
|
143
|
+
|
144
|
+
|
137
145
|
class DagRequestBody(RequestBody):
|
138
146
|
"""Request body base class for endpoints with a dag."""
|
139
147
|
dag: str
|
@@ -533,6 +541,28 @@ class UploadZipFileResponse(pydantic.BaseModel):
|
|
533
541
|
missing_chunks: Optional[List[str]] = None
|
534
542
|
|
535
543
|
|
536
|
-
class
|
537
|
-
"""The request body for
|
538
|
-
|
544
|
+
class UpdateWorkspaceBody(RequestBody):
|
545
|
+
"""The request body for updating a specific workspace configuration."""
|
546
|
+
workspace_name: str = '' # Will be set from path parameter
|
547
|
+
config: Dict[str, Any]
|
548
|
+
|
549
|
+
|
550
|
+
class CreateWorkspaceBody(RequestBody):
|
551
|
+
"""The request body for creating a new workspace."""
|
552
|
+
workspace_name: str = '' # Will be set from path parameter
|
553
|
+
config: Dict[str, Any]
|
554
|
+
|
555
|
+
|
556
|
+
class DeleteWorkspaceBody(RequestBody):
|
557
|
+
"""The request body for deleting a workspace."""
|
558
|
+
workspace_name: str
|
559
|
+
|
560
|
+
|
561
|
+
class UpdateConfigBody(RequestBody):
|
562
|
+
"""The request body for updating the entire SkyPilot configuration."""
|
563
|
+
config: Dict[str, Any]
|
564
|
+
|
565
|
+
|
566
|
+
class GetConfigBody(RequestBody):
|
567
|
+
"""The request body for getting the entire SkyPilot configuration."""
|
568
|
+
pass
|
@@ -12,7 +12,6 @@ from sky.provision.kubernetes import utils as kubernetes_utils
|
|
12
12
|
from sky.serve import serve_state
|
13
13
|
from sky.server import constants as server_constants
|
14
14
|
from sky.skylet import job_lib
|
15
|
-
from sky.utils import registry
|
16
15
|
from sky.utils import status_lib
|
17
16
|
|
18
17
|
if typing.TYPE_CHECKING:
|
@@ -135,16 +134,6 @@ def decode_cost_report(
|
|
135
134
|
return return_value
|
136
135
|
|
137
136
|
|
138
|
-
@register_decoders('enabled_clouds')
|
139
|
-
def decode_enabled_clouds(return_value: List[str]) -> List['clouds.Cloud']:
|
140
|
-
clouds = []
|
141
|
-
for cloud_name in return_value:
|
142
|
-
cloud = registry.CLOUD_REGISTRY.from_str(cloud_name)
|
143
|
-
assert cloud is not None, return_value
|
144
|
-
clouds.append(cloud)
|
145
|
-
return clouds
|
146
|
-
|
147
|
-
|
148
137
|
@register_decoders('list_accelerators')
|
149
138
|
def decode_list_accelerators(
|
150
139
|
return_value: Dict[str, List[List[Any]]]
|
sky/server/server.py
CHANGED
@@ -34,7 +34,6 @@ from sky import execution
|
|
34
34
|
from sky import global_user_state
|
35
35
|
from sky import models
|
36
36
|
from sky import sky_logging
|
37
|
-
from sky import skypilot_config
|
38
37
|
from sky.clouds import service_catalog
|
39
38
|
from sky.data import storage_utils
|
40
39
|
from sky.jobs.server import server as jobs_rest
|
@@ -59,6 +58,7 @@ from sky.utils import dag_utils
|
|
59
58
|
from sky.utils import env_options
|
60
59
|
from sky.utils import status_lib
|
61
60
|
from sky.utils import subprocess_utils
|
61
|
+
from sky.workspaces import server as workspaces_rest
|
62
62
|
|
63
63
|
# pylint: disable=ungrouped-imports
|
64
64
|
if sys.version_info >= (3, 10):
|
@@ -127,6 +127,11 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
127
127
|
|
128
128
|
async def dispatch(self, request: fastapi.Request, call_next):
|
129
129
|
auth_user = _get_auth_user_header(request)
|
130
|
+
|
131
|
+
# Add user to database if auth_user is present
|
132
|
+
if auth_user is not None:
|
133
|
+
global_user_state.add_or_update_user(auth_user)
|
134
|
+
|
130
135
|
body = await request.body()
|
131
136
|
if auth_user and body:
|
132
137
|
try:
|
@@ -137,10 +142,16 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
137
142
|
logger.debug(f'Overriding user for {request.state.request_id}: '
|
138
143
|
f'{auth_user.name}, {auth_user.id}')
|
139
144
|
if 'env_vars' in original_json:
|
140
|
-
original_json
|
141
|
-
|
142
|
-
|
143
|
-
|
145
|
+
if isinstance(original_json.get('env_vars'), dict):
|
146
|
+
original_json['env_vars'][
|
147
|
+
constants.USER_ID_ENV_VAR] = auth_user.id
|
148
|
+
original_json['env_vars'][
|
149
|
+
constants.USER_ENV_VAR] = auth_user.name
|
150
|
+
else:
|
151
|
+
logger.warning(
|
152
|
+
f'"env_vars" in request body is not a dictionary '
|
153
|
+
f'for request {request.state.request_id}. '
|
154
|
+
'Skipping user info injection into body.')
|
144
155
|
request._body = json.dumps(original_json).encode('utf-8') # pylint: disable=protected-access
|
145
156
|
return await call_next(request)
|
146
157
|
|
@@ -255,14 +266,14 @@ app.add_middleware(AuthProxyMiddleware)
|
|
255
266
|
app.add_middleware(RequestIDMiddleware)
|
256
267
|
app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
|
257
268
|
app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
|
269
|
+
app.include_router(workspaces_rest.router,
|
270
|
+
prefix='/workspaces',
|
271
|
+
tags=['workspaces'])
|
258
272
|
|
259
273
|
|
260
274
|
@app.get('/token')
|
261
275
|
async def token(request: fastapi.Request) -> fastapi.responses.HTMLResponse:
|
262
|
-
# If we have auth info, save this user to the database.
|
263
276
|
user = _get_auth_user_header(request)
|
264
|
-
if user is not None:
|
265
|
-
global_user_state.add_or_update_user(user)
|
266
277
|
|
267
278
|
token_data = {
|
268
279
|
'v': 1, # Token version number, bump for backwards incompatible.
|
@@ -312,29 +323,19 @@ async def check(request: fastapi.Request,
|
|
312
323
|
|
313
324
|
@app.get('/enabled_clouds')
|
314
325
|
async def enabled_clouds(request: fastapi.Request,
|
315
|
-
workspace: Optional[str] = None
|
326
|
+
workspace: Optional[str] = None,
|
327
|
+
expand: bool = False) -> None:
|
316
328
|
"""Gets enabled clouds on the server."""
|
317
329
|
executor.schedule_request(
|
318
330
|
request_id=request.state.request_id,
|
319
331
|
request_name='enabled_clouds',
|
320
|
-
request_body=payloads.EnabledCloudsBody(workspace=workspace
|
332
|
+
request_body=payloads.EnabledCloudsBody(workspace=workspace,
|
333
|
+
expand=expand),
|
321
334
|
func=core.enabled_clouds,
|
322
335
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
323
336
|
)
|
324
337
|
|
325
338
|
|
326
|
-
@app.get('/workspaces')
|
327
|
-
async def get_workspace_config(request: fastapi.Request) -> None:
|
328
|
-
"""Gets workspace config on the server."""
|
329
|
-
executor.schedule_request(
|
330
|
-
request_id=request.state.request_id,
|
331
|
-
request_name='workspaces',
|
332
|
-
request_body=payloads.RequestBody(),
|
333
|
-
func=skypilot_config.get_workspaces,
|
334
|
-
schedule_type=requests_lib.ScheduleType.SHORT,
|
335
|
-
)
|
336
|
-
|
337
|
-
|
338
339
|
@app.post('/realtime_kubernetes_gpu_availability')
|
339
340
|
async def realtime_kubernetes_gpu_availability(
|
340
341
|
request: fastapi.Request,
|
sky/setup_files/dependencies.py
CHANGED
sky/skypilot_config.py
CHANGED
@@ -57,6 +57,8 @@ import threading
|
|
57
57
|
import typing
|
58
58
|
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
59
59
|
|
60
|
+
import filelock
|
61
|
+
|
60
62
|
from sky import exceptions
|
61
63
|
from sky import sky_logging
|
62
64
|
from sky.adaptors import common as adaptors_common
|
@@ -66,6 +68,7 @@ from sky.utils import config_utils
|
|
66
68
|
from sky.utils import context
|
67
69
|
from sky.utils import schemas
|
68
70
|
from sky.utils import ux_utils
|
71
|
+
from sky.utils.kubernetes import config_map_utils
|
69
72
|
|
70
73
|
if typing.TYPE_CHECKING:
|
71
74
|
import yaml
|
@@ -120,10 +123,17 @@ class ConfigContext:
|
|
120
123
|
|
121
124
|
|
122
125
|
# The global loaded config.
|
126
|
+
_active_workspace_context = threading.local()
|
123
127
|
_global_config_context = ConfigContext()
|
124
|
-
_reload_config_lock = threading.Lock()
|
125
128
|
|
126
|
-
|
129
|
+
SKYPILOT_CONFIG_LOCK_PATH = '~/.sky/locks/.skypilot_config.lock'
|
130
|
+
|
131
|
+
|
132
|
+
def get_skypilot_config_lock_path() -> str:
|
133
|
+
"""Get the path for the SkyPilot config lock file."""
|
134
|
+
lock_path = os.path.expanduser(SKYPILOT_CONFIG_LOCK_PATH)
|
135
|
+
os.makedirs(os.path.dirname(lock_path), exist_ok=True)
|
136
|
+
return lock_path
|
127
137
|
|
128
138
|
|
129
139
|
def _get_config_context() -> ConfigContext:
|
@@ -289,6 +299,10 @@ def get_nested(keys: Tuple[str, ...],
|
|
289
299
|
def get_workspace_cloud(cloud: str,
|
290
300
|
workspace: Optional[str] = None) -> config_utils.Config:
|
291
301
|
"""Returns the workspace config."""
|
302
|
+
# TODO(zhwu): Instead of just returning the workspace specific config, we
|
303
|
+
# should return the config that already merges the global config, so that
|
304
|
+
# the caller does not need to manually merge the global config with
|
305
|
+
# the workspace specific config.
|
292
306
|
if workspace is None:
|
293
307
|
workspace = get_active_workspace()
|
294
308
|
clouds = get_nested(keys=(
|
@@ -389,7 +403,7 @@ def overlay_skypilot_config(
|
|
389
403
|
|
390
404
|
def safe_reload_config() -> None:
|
391
405
|
"""Reloads the config, safe to be called concurrently."""
|
392
|
-
with
|
406
|
+
with filelock.FileLock(get_skypilot_config_lock_path()):
|
393
407
|
_reload_config()
|
394
408
|
|
395
409
|
|
@@ -691,9 +705,21 @@ def apply_cli_config(cli_config: Optional[List[str]]) -> Dict[str, Any]:
|
|
691
705
|
return parsed_config
|
692
706
|
|
693
707
|
|
694
|
-
def
|
695
|
-
"""
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
708
|
+
def update_config_no_lock(config: config_utils.Config) -> None:
|
709
|
+
"""Dumps the new config to a file and syncs to ConfigMap if in Kubernetes.
|
710
|
+
|
711
|
+
Args:
|
712
|
+
config: The config to save and sync.
|
713
|
+
"""
|
714
|
+
global_config_path = os.path.expanduser(get_user_config_path())
|
715
|
+
|
716
|
+
# Always save to the local file (PVC in Kubernetes, local file otherwise)
|
717
|
+
common_utils.dump_yaml(global_config_path, dict(config))
|
718
|
+
|
719
|
+
if config_map_utils.is_running_in_kubernetes():
|
720
|
+
# In Kubernetes, sync the PVC config to ConfigMap for user convenience
|
721
|
+
# PVC file is the source of truth, ConfigMap is just a mirror for easy
|
722
|
+
# access
|
723
|
+
config_map_utils.patch_configmap_with_config(config, global_config_path)
|
724
|
+
|
725
|
+
_reload_config()
|
sky/utils/db_utils.py
CHANGED
@@ -1,9 +1,17 @@
|
|
1
1
|
"""Utils for sky databases."""
|
2
2
|
import contextlib
|
3
|
+
import enum
|
3
4
|
import sqlite3
|
4
5
|
import threading
|
6
|
+
import typing
|
5
7
|
from typing import Any, Callable, Optional
|
6
8
|
|
9
|
+
import sqlalchemy
|
10
|
+
from sqlalchemy import exc as sqlalchemy_exc
|
11
|
+
|
12
|
+
if typing.TYPE_CHECKING:
|
13
|
+
from sqlalchemy.orm import Session
|
14
|
+
|
7
15
|
# This parameter (passed to sqlite3.connect) controls how long we will wait to
|
8
16
|
# obtains a database lock (not necessarily during connection, but whenever it is
|
9
17
|
# needed). It is not a connection timeout.
|
@@ -21,6 +29,11 @@ from typing import Any, Callable, Optional
|
|
21
29
|
_DB_TIMEOUT_S = 60
|
22
30
|
|
23
31
|
|
32
|
+
class SQLAlchemyDialect(enum.Enum):
|
33
|
+
SQLITE = 'sqlite'
|
34
|
+
POSTGRESQL = 'postgresql'
|
35
|
+
|
36
|
+
|
24
37
|
@contextlib.contextmanager
|
25
38
|
def safe_cursor(db_path: str):
|
26
39
|
"""A newly created, auto-committing, auto-closing cursor."""
|
@@ -71,6 +84,46 @@ def add_column_to_table(
|
|
71
84
|
conn.commit()
|
72
85
|
|
73
86
|
|
87
|
+
def add_column_to_table_sqlalchemy(
|
88
|
+
session: 'Session',
|
89
|
+
table_name: str,
|
90
|
+
column_name: str,
|
91
|
+
column_type: str,
|
92
|
+
copy_from: Optional[str] = None,
|
93
|
+
value_to_replace_existing_entries: Optional[Any] = None,
|
94
|
+
):
|
95
|
+
"""Add a column to a table."""
|
96
|
+
dialect = session.bind.dialect
|
97
|
+
if dialect.name == SQLAlchemyDialect.SQLITE.value:
|
98
|
+
try:
|
99
|
+
session.execute(
|
100
|
+
sqlalchemy.text(f'ALTER TABLE {table_name} '
|
101
|
+
f'ADD COLUMN {column_name} {column_type}'))
|
102
|
+
if copy_from is not None:
|
103
|
+
session.execute(
|
104
|
+
sqlalchemy.text(f'UPDATE {table_name} '
|
105
|
+
f'SET {column_name} = {copy_from}'))
|
106
|
+
if value_to_replace_existing_entries is not None:
|
107
|
+
session.execute(
|
108
|
+
sqlalchemy.text(f'UPDATE {table_name} '
|
109
|
+
f'SET {column_name} = :replacement_value '
|
110
|
+
f'WHERE {column_name} IS NULL'),
|
111
|
+
{'replacement_value': value_to_replace_existing_entries})
|
112
|
+
except sqlalchemy_exc.OperationalError as e:
|
113
|
+
if 'duplicate column name' in str(e):
|
114
|
+
pass
|
115
|
+
else:
|
116
|
+
raise
|
117
|
+
elif dialect.name == SQLAlchemyDialect.POSTGRESQL.value:
|
118
|
+
# TODO(syang) support postgres dialect
|
119
|
+
session.rollback()
|
120
|
+
raise ValueError('Unsupported database dialect')
|
121
|
+
else:
|
122
|
+
session.rollback()
|
123
|
+
raise ValueError('Unsupported database dialect')
|
124
|
+
session.commit()
|
125
|
+
|
126
|
+
|
74
127
|
def rename_column(
|
75
128
|
cursor: 'sqlite3.Cursor',
|
76
129
|
conn: 'sqlite3.Connection',
|
@@ -0,0 +1,133 @@
|
|
1
|
+
"""Utilities for Kubernetes ConfigMap operations in SkyPilot."""
|
2
|
+
import os
|
3
|
+
|
4
|
+
from sky import sky_logging
|
5
|
+
from sky import skypilot_config
|
6
|
+
from sky.adaptors import kubernetes
|
7
|
+
from sky.utils import common_utils
|
8
|
+
|
9
|
+
logger = sky_logging.init_logger(__name__)
|
10
|
+
|
11
|
+
# Kubernetes ConfigMap sync constants
|
12
|
+
_KUBE_SERVICE_ACCOUNT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount'
|
13
|
+
_CONFIGMAP_SYNC_TIMEOUT = 10 # seconds
|
14
|
+
|
15
|
+
|
16
|
+
def is_running_in_kubernetes() -> bool:
|
17
|
+
"""Check if we're running inside a Kubernetes pod."""
|
18
|
+
return os.path.exists(f'{_KUBE_SERVICE_ACCOUNT_PATH}/token')
|
19
|
+
|
20
|
+
|
21
|
+
def _get_kubernetes_namespace() -> str:
|
22
|
+
"""Get the current Kubernetes namespace from the service account."""
|
23
|
+
try:
|
24
|
+
namespace_file = f'{_KUBE_SERVICE_ACCOUNT_PATH}/namespace'
|
25
|
+
if os.path.exists(namespace_file):
|
26
|
+
with open(namespace_file, encoding='utf-8') as f:
|
27
|
+
return f.read().strip()
|
28
|
+
except (OSError, IOError):
|
29
|
+
pass
|
30
|
+
return 'default'
|
31
|
+
|
32
|
+
|
33
|
+
def _get_configmap_name() -> str:
|
34
|
+
"""Get the ConfigMap name for the SkyPilot config."""
|
35
|
+
release_name = (os.getenv('HELM_RELEASE_NAME') or
|
36
|
+
os.getenv('SKYPILOT_RELEASE_NAME') or 'skypilot')
|
37
|
+
return f'{release_name}-config'
|
38
|
+
|
39
|
+
|
40
|
+
def initialize_configmap_sync_on_startup(config_file_path: str) -> None:
|
41
|
+
"""Initialize ConfigMap sync on API server startup.
|
42
|
+
|
43
|
+
This syncs existing PVC config to ConfigMap if ConfigMap doesn't exist.
|
44
|
+
This handles the upgrade scenario where an existing deployment has
|
45
|
+
workspace configs on PVC but no ConfigMap exists.
|
46
|
+
|
47
|
+
Args:
|
48
|
+
config_file_path: Path to the config file to sync.
|
49
|
+
"""
|
50
|
+
config_file_path = os.path.expanduser(config_file_path)
|
51
|
+
if not is_running_in_kubernetes() or not os.path.exists(config_file_path):
|
52
|
+
return
|
53
|
+
|
54
|
+
try:
|
55
|
+
namespace = _get_kubernetes_namespace()
|
56
|
+
configmap_name = _get_configmap_name()
|
57
|
+
|
58
|
+
# Check if ConfigMap exists
|
59
|
+
try:
|
60
|
+
kubernetes.core_api().read_namespaced_config_map(
|
61
|
+
name=configmap_name, namespace=namespace)
|
62
|
+
# ConfigMap exists, don't overwrite it
|
63
|
+
logger.debug(f'ConfigMap {configmap_name} already exists')
|
64
|
+
return
|
65
|
+
except kubernetes.kubernetes.client.rest.ApiException as e:
|
66
|
+
if e.status != 404:
|
67
|
+
raise
|
68
|
+
# ConfigMap doesn't exist, create it
|
69
|
+
|
70
|
+
current_config = skypilot_config.parse_and_validate_config_file(
|
71
|
+
config_file_path)
|
72
|
+
config_yaml = common_utils.dump_yaml_str(dict(current_config))
|
73
|
+
|
74
|
+
configmap_body = {
|
75
|
+
'apiVersion': 'v1',
|
76
|
+
'kind': 'ConfigMap',
|
77
|
+
'metadata': {
|
78
|
+
'name': configmap_name,
|
79
|
+
'namespace': namespace,
|
80
|
+
'labels': {
|
81
|
+
'app.kubernetes.io/name': 'skypilot',
|
82
|
+
'app.kubernetes.io/component': 'config'
|
83
|
+
}
|
84
|
+
},
|
85
|
+
'data': {
|
86
|
+
'config.yaml': config_yaml
|
87
|
+
}
|
88
|
+
}
|
89
|
+
|
90
|
+
kubernetes.core_api().create_namespaced_config_map(
|
91
|
+
namespace=namespace,
|
92
|
+
body=configmap_body,
|
93
|
+
_request_timeout=_CONFIGMAP_SYNC_TIMEOUT)
|
94
|
+
|
95
|
+
logger.info(f'Synced PVC config to new ConfigMap {configmap_name}')
|
96
|
+
|
97
|
+
except Exception as e: # pylint: disable=broad-except
|
98
|
+
logger.warning(f'Failed to initialize ConfigMap sync: {e}')
|
99
|
+
|
100
|
+
|
101
|
+
def patch_configmap_with_config(config, config_file_path: str) -> None:
|
102
|
+
"""Patch the Kubernetes ConfigMap with the updated config.
|
103
|
+
|
104
|
+
Args:
|
105
|
+
config: The updated config to sync to the ConfigMap.
|
106
|
+
config_file_path: Path to the config file for fallback sync.
|
107
|
+
"""
|
108
|
+
if not is_running_in_kubernetes():
|
109
|
+
return
|
110
|
+
|
111
|
+
try:
|
112
|
+
namespace = _get_kubernetes_namespace()
|
113
|
+
configmap_name = _get_configmap_name()
|
114
|
+
config_yaml = common_utils.dump_yaml_str(dict(config))
|
115
|
+
patch_body = {'data': {'config.yaml': config_yaml}}
|
116
|
+
|
117
|
+
try:
|
118
|
+
kubernetes.core_api().patch_namespaced_config_map(
|
119
|
+
name=configmap_name,
|
120
|
+
namespace=namespace,
|
121
|
+
body=patch_body,
|
122
|
+
_request_timeout=_CONFIGMAP_SYNC_TIMEOUT)
|
123
|
+
logger.debug(f'Synced config to ConfigMap {configmap_name}')
|
124
|
+
except kubernetes.kubernetes.client.rest.ApiException as e:
|
125
|
+
if e.status == 404:
|
126
|
+
# ConfigMap doesn't exist, create it
|
127
|
+
logger.info(f'ConfigMap {configmap_name} not found, creating')
|
128
|
+
initialize_configmap_sync_on_startup(config_file_path)
|
129
|
+
else:
|
130
|
+
raise
|
131
|
+
|
132
|
+
except Exception as e: # pylint: disable=broad-except
|
133
|
+
logger.warning(f'Failed to sync config to ConfigMap: {e}')
|
@@ -624,6 +624,9 @@ def main():
|
|
624
624
|
kubeconfig_path = os.path.expanduser(args.kubeconfig_path)
|
625
625
|
global_use_ssh_config = args.use_ssh_config
|
626
626
|
|
627
|
+
failed_clusters = []
|
628
|
+
successful_clusters = []
|
629
|
+
|
627
630
|
# Print cleanup mode marker if applicable
|
628
631
|
if args.cleanup:
|
629
632
|
print('SKYPILOT_CLEANUP_MODE: Cleanup mode activated')
|
@@ -793,10 +796,21 @@ def main():
|
|
793
796
|
print(
|
794
797
|
f'{GREEN}==== Completed deployment for cluster: {cluster_name} ====${NC}'
|
795
798
|
)
|
799
|
+
successful_clusters.append(cluster_name)
|
796
800
|
except Exception as e: # pylint: disable=broad-except
|
801
|
+
reason = str(e)
|
802
|
+
failed_clusters.append((cluster_name, reason))
|
797
803
|
print(
|
798
|
-
f'{RED}Error deploying SSH Node Pool {cluster_name}: {
|
799
|
-
)
|
804
|
+
f'{RED}Error deploying SSH Node Pool {cluster_name}: {reason}{NC}'
|
805
|
+
) # Print for internal logging
|
806
|
+
|
807
|
+
if failed_clusters:
|
808
|
+
action = 'clean' if args.cleanup else 'deploy'
|
809
|
+
msg = f'{GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {NC}'
|
810
|
+
msg += f'{RED}Failed to {action} {len(failed_clusters)} cluster(s): {NC}'
|
811
|
+
for cluster_name, reason in failed_clusters:
|
812
|
+
msg += f'\n {cluster_name}: {reason}'
|
813
|
+
raise RuntimeError(msg)
|
800
814
|
|
801
815
|
|
802
816
|
def deploy_cluster(head_node,
|
@@ -847,8 +861,10 @@ def deploy_cluster(head_node,
|
|
847
861
|
print_output=True)
|
848
862
|
if result is None:
|
849
863
|
with ux_utils.print_exception_no_traceback():
|
850
|
-
raise RuntimeError(
|
851
|
-
|
864
|
+
raise RuntimeError(
|
865
|
+
f'Failed to SSH to head node ({head_node}). '
|
866
|
+
f'Please check the SSH configuration and logs for more details.'
|
867
|
+
)
|
852
868
|
|
853
869
|
# Checking history
|
854
870
|
history_exists = (history_worker_nodes is not None and
|
@@ -5,6 +5,9 @@ the 'command' field in the exec configuration, leaving only the executable name.
|
|
5
5
|
This is useful when moving between different environments where auth plugin
|
6
6
|
executables might be installed in different locations.
|
7
7
|
|
8
|
+
For Nebius kubeconfigs, it also changes the --profile argument to 'sky' to
|
9
|
+
ensure compatibility with SkyPilot's expected profile configuration.
|
10
|
+
|
8
11
|
It assumes the target environment has the auth executable available in PATH.
|
9
12
|
If not, you'll need to update your environment container to include the auth
|
10
13
|
executable in PATH.
|
@@ -21,6 +24,8 @@ import yaml
|
|
21
24
|
def strip_auth_plugin_paths(kubeconfig_path: str, output_path: str):
|
22
25
|
"""Strip path information from exec plugin commands in a kubeconfig file.
|
23
26
|
|
27
|
+
For Nebius kubeconfigs, also changes the --profile argument to 'sky'.
|
28
|
+
|
24
29
|
Args:
|
25
30
|
kubeconfig_path (str): Path to the input kubeconfig file
|
26
31
|
output_path (str): Path where the modified kubeconfig will be saved
|
@@ -40,6 +45,20 @@ def strip_auth_plugin_paths(kubeconfig_path: str, output_path: str):
|
|
40
45
|
exec_info['command'] = executable
|
41
46
|
updated = True
|
42
47
|
|
48
|
+
# Handle Nebius kubeconfigs: change --profile to 'sky'
|
49
|
+
if executable == 'nebius' or current_command == 'nebius':
|
50
|
+
args = exec_info.get('args', [])
|
51
|
+
if args and '--profile' in args:
|
52
|
+
try:
|
53
|
+
profile_index = args.index('--profile')
|
54
|
+
if profile_index + 1 < len(args):
|
55
|
+
old_profile = args[profile_index + 1]
|
56
|
+
if old_profile != 'sky':
|
57
|
+
args[profile_index + 1] = 'sky'
|
58
|
+
updated = True
|
59
|
+
except ValueError:
|
60
|
+
pass # --profile not found in args
|
61
|
+
|
43
62
|
if updated:
|
44
63
|
with open(output_path, 'w', encoding='utf-8') as file:
|
45
64
|
yaml.safe_dump(config, file)
|
@@ -26,6 +26,48 @@ logger = sky_logging.init_logger(__name__)
|
|
26
26
|
DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
|
27
27
|
|
28
28
|
|
29
|
+
def check_ssh_cluster_dependencies(
|
30
|
+
raise_error: bool = True) -> Optional[List[str]]:
|
31
|
+
"""Checks if the dependencies for ssh cluster are installed.
|
32
|
+
|
33
|
+
Args:
|
34
|
+
raise_error: set to true when the dependency needs to be present.
|
35
|
+
set to false for `sky check`, where reason strings are compiled
|
36
|
+
at the end.
|
37
|
+
|
38
|
+
Returns: the reasons list if there are missing dependencies.
|
39
|
+
"""
|
40
|
+
# error message
|
41
|
+
jq_message = ('`jq` is required to setup ssh cluster.')
|
42
|
+
|
43
|
+
# save
|
44
|
+
reasons = []
|
45
|
+
required_binaries = []
|
46
|
+
|
47
|
+
# Ensure jq is installed
|
48
|
+
try:
|
49
|
+
subprocess.run(['jq', '--version'],
|
50
|
+
stdout=subprocess.DEVNULL,
|
51
|
+
stderr=subprocess.DEVNULL,
|
52
|
+
check=True)
|
53
|
+
except (FileNotFoundError, subprocess.CalledProcessError):
|
54
|
+
required_binaries.append('jq')
|
55
|
+
reasons.append(jq_message)
|
56
|
+
|
57
|
+
if required_binaries:
|
58
|
+
reasons.extend([
|
59
|
+
'On Debian/Ubuntu, install the missing dependenc(ies) with:',
|
60
|
+
f' $ sudo apt install {" ".join(required_binaries)}',
|
61
|
+
'On MacOS, install with: ',
|
62
|
+
f' $ brew install {" ".join(required_binaries)}',
|
63
|
+
])
|
64
|
+
if raise_error:
|
65
|
+
with ux_utils.print_exception_no_traceback():
|
66
|
+
raise RuntimeError('\n'.join(reasons))
|
67
|
+
return reasons
|
68
|
+
return None
|
69
|
+
|
70
|
+
|
29
71
|
def deploy_ssh_cluster(cleanup: bool = False,
|
30
72
|
infra: Optional[str] = None,
|
31
73
|
kubeconfig_path: Optional[str] = None):
|
@@ -41,6 +83,8 @@ def deploy_ssh_cluster(cleanup: bool = False,
|
|
41
83
|
kubeconfig_path: Path to save the Kubernetes configuration file.
|
42
84
|
If None, the default ~/.kube/config will be used.
|
43
85
|
"""
|
86
|
+
check_ssh_cluster_dependencies()
|
87
|
+
|
44
88
|
# Prepare command to call deploy_remote_cluster.py script
|
45
89
|
# TODO(romilb): We should move this to a native python method/class call
|
46
90
|
# instead of invoking a script with subprocess.
|
@@ -81,9 +125,9 @@ def deploy_ssh_cluster(cleanup: bool = False,
|
|
81
125
|
cmd=deploy_command,
|
82
126
|
log_path=log_path,
|
83
127
|
require_outputs=True,
|
84
|
-
stream_logs=False,
|
128
|
+
stream_logs=False,
|
85
129
|
line_processor=log_utils.SkySSHUpLineProcessor(log_path=log_path,
|
86
|
-
is_local=
|
130
|
+
is_local=False),
|
87
131
|
cwd=cwd,
|
88
132
|
env=env)
|
89
133
|
|
@@ -91,9 +135,9 @@ def deploy_ssh_cluster(cleanup: bool = False,
|
|
91
135
|
success = True
|
92
136
|
else:
|
93
137
|
with ux_utils.print_exception_no_traceback():
|
94
|
-
log_hint = ux_utils.log_path_hint(log_path, is_local=
|
95
|
-
raise RuntimeError('Failed to deploy SkyPilot on
|
96
|
-
f'
|
138
|
+
log_hint = ux_utils.log_path_hint(log_path, is_local=False)
|
139
|
+
raise RuntimeError('Failed to deploy SkyPilot on some Node Pools. '
|
140
|
+
f'{log_hint}'
|
97
141
|
f'\nError: {stderr}')
|
98
142
|
|
99
143
|
if success:
|