skypilot-nightly 1.0.0.dev20250524__py3-none-any.whl → 1.0.0.dev20250526__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/check.py +4 -1
- sky/cli.py +13 -3
- sky/client/cli.py +13 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/7GEgRyZKRaSnYZCV1Jwol/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/25-062253ea41fb8eec.js +6 -0
- sky/dashboard/out/_next/static/chunks/480-5a0de8b6570ea105.js +1 -0
- sky/dashboard/out/_next/static/chunks/578-d351125af46c293f.js +6 -0
- sky/dashboard/out/_next/static/chunks/734-a6e01d7f98904741.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-59956af3950b02ed.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-96a715a6fb01e228.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{infra-e690d864aa00e2ea.js → infra-abb7d744ecf15109.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-bbf436f41381e169.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7733c960685b4385.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5ed48b3201b998c8.js +1 -0
- sky/dashboard/out/_next/static/css/28558d57108b05ae.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +592 -552
- sky/server/requests/payloads.py +17 -0
- sky/server/server.py +4 -13
- sky/setup_files/dependencies.py +1 -0
- sky/skypilot_config.py +31 -9
- sky/utils/db_utils.py +53 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +20 -4
- sky/utils/kubernetes/kubernetes_deploy_utils.py +49 -5
- sky/utils/kubernetes/ssh-tunnel.sh +20 -28
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +295 -0
- sky/workspaces/server.py +62 -0
- {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/RECORD +51 -42
- sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +0 -6
- sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +0 -1
- sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +0 -3
- /sky/dashboard/out/_next/static/{aHej19bZyl4hoHgrzPCn7 → 7GEgRyZKRaSnYZCV1Jwol}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-37c042a356f8e608.js → [job]-3b5aad09a25f64b7.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-db6558a5ec687011.js → [job]-48dc8d67d4b60be1.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{users-2d319455c3f1c3e2.js → users-b8acf6e6735323a2.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/top_level.txt +0 -0
sky/server/requests/payloads.py
CHANGED
@@ -536,3 +536,20 @@ class UploadZipFileResponse(pydantic.BaseModel):
|
|
536
536
|
class EnabledCloudsBody(RequestBody):
|
537
537
|
"""The request body for the enabled clouds endpoint."""
|
538
538
|
workspace: Optional[str] = None
|
539
|
+
|
540
|
+
|
541
|
+
class UpdateWorkspaceBody(RequestBody):
    """Payload for changing the configuration of one existing workspace."""
    # Defaults to '' because the server fills it in from the URL path
    # parameter rather than requiring it in the JSON body.
    workspace_name: str = ''
    # New configuration for the workspace, as a free-form mapping.
    config: Dict[str, Any]
|
545
|
+
|
546
|
+
|
547
|
+
class CreateWorkspaceBody(RequestBody):
    """Payload for registering a brand-new workspace."""
    # Defaults to '' because the server fills it in from the URL path
    # parameter rather than requiring it in the JSON body.
    workspace_name: str = ''
    # Initial configuration for the workspace, as a free-form mapping.
    config: Dict[str, Any]
|
551
|
+
|
552
|
+
|
553
|
+
class DeleteWorkspaceBody(RequestBody):
    """Payload for removing a workspace."""
    # Name of the workspace to delete; required (no default).
    workspace_name: str
|
sky/server/server.py
CHANGED
@@ -34,7 +34,6 @@ from sky import execution
|
|
34
34
|
from sky import global_user_state
|
35
35
|
from sky import models
|
36
36
|
from sky import sky_logging
|
37
|
-
from sky import skypilot_config
|
38
37
|
from sky.clouds import service_catalog
|
39
38
|
from sky.data import storage_utils
|
40
39
|
from sky.jobs.server import server as jobs_rest
|
@@ -59,6 +58,7 @@ from sky.utils import dag_utils
|
|
59
58
|
from sky.utils import env_options
|
60
59
|
from sky.utils import status_lib
|
61
60
|
from sky.utils import subprocess_utils
|
61
|
+
from sky.workspaces import server as workspaces_rest
|
62
62
|
|
63
63
|
# pylint: disable=ungrouped-imports
|
64
64
|
if sys.version_info >= (3, 10):
|
@@ -255,6 +255,9 @@ app.add_middleware(AuthProxyMiddleware)
|
|
255
255
|
app.add_middleware(RequestIDMiddleware)
|
256
256
|
app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
|
257
257
|
app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
|
258
|
+
app.include_router(workspaces_rest.router,
|
259
|
+
prefix='/workspaces',
|
260
|
+
tags=['workspaces'])
|
258
261
|
|
259
262
|
|
260
263
|
@app.get('/token')
|
@@ -323,18 +326,6 @@ async def enabled_clouds(request: fastapi.Request,
|
|
323
326
|
)
|
324
327
|
|
325
328
|
|
326
|
-
@app.get('/workspaces')
|
327
|
-
async def get_workspace_config(request: fastapi.Request) -> None:
|
328
|
-
"""Gets workspace config on the server."""
|
329
|
-
executor.schedule_request(
|
330
|
-
request_id=request.state.request_id,
|
331
|
-
request_name='workspaces',
|
332
|
-
request_body=payloads.RequestBody(),
|
333
|
-
func=skypilot_config.get_workspaces,
|
334
|
-
schedule_type=requests_lib.ScheduleType.SHORT,
|
335
|
-
)
|
336
|
-
|
337
|
-
|
338
329
|
@app.post('/realtime_kubernetes_gpu_availability')
|
339
330
|
async def realtime_kubernetes_gpu_availability(
|
340
331
|
request: fastapi.Request,
|
sky/setup_files/dependencies.py
CHANGED
sky/skypilot_config.py
CHANGED
@@ -57,6 +57,8 @@ import threading
|
|
57
57
|
import typing
|
58
58
|
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
59
59
|
|
60
|
+
import filelock
|
61
|
+
|
60
62
|
from sky import exceptions
|
61
63
|
from sky import sky_logging
|
62
64
|
from sky.adaptors import common as adaptors_common
|
@@ -66,6 +68,7 @@ from sky.utils import config_utils
|
|
66
68
|
from sky.utils import context
|
67
69
|
from sky.utils import schemas
|
68
70
|
from sky.utils import ux_utils
|
71
|
+
from sky.utils.kubernetes import config_map_utils
|
69
72
|
|
70
73
|
if typing.TYPE_CHECKING:
|
71
74
|
import yaml
|
@@ -120,10 +123,17 @@ class ConfigContext:
|
|
120
123
|
|
121
124
|
|
122
125
|
# The global loaded config.
|
126
|
+
_active_workspace_context = threading.local()
|
123
127
|
_global_config_context = ConfigContext()
|
124
|
-
_reload_config_lock = threading.Lock()
|
125
128
|
|
126
|
-
|
129
|
+
SKYPILOT_CONFIG_LOCK_PATH = '~/.sky/locks/.skypilot_config.lock'


def get_skypilot_config_lock_path() -> str:
    """Return the expanded lock-file path for the SkyPilot config.

    The parent directory of the lock file is created when it does not exist
    yet; the lock file itself is not created here.

    Returns:
        ``SKYPILOT_CONFIG_LOCK_PATH`` with ``~`` expanded.
    """
    expanded = os.path.expanduser(SKYPILOT_CONFIG_LOCK_PATH)
    parent_dir, _ = os.path.split(expanded)
    os.makedirs(parent_dir, exist_ok=True)
    return expanded
|
127
137
|
|
128
138
|
|
129
139
|
def _get_config_context() -> ConfigContext:
|
@@ -389,7 +399,7 @@ def overlay_skypilot_config(
|
|
389
399
|
|
390
400
|
def safe_reload_config() -> None:
|
391
401
|
"""Reloads the config, safe to be called concurrently."""
|
392
|
-
with
|
402
|
+
with filelock.FileLock(get_skypilot_config_lock_path()):
|
393
403
|
_reload_config()
|
394
404
|
|
395
405
|
|
@@ -691,9 +701,21 @@ def apply_cli_config(cli_config: Optional[List[str]]) -> Dict[str, Any]:
|
|
691
701
|
return parsed_config
|
692
702
|
|
693
703
|
|
694
|
-
def
|
695
|
-
"""
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
704
|
+
def update_config_no_lock(config: config_utils.Config) -> None:
    """Write ``config`` to the user config file, mirror it, and reload.

    Steps, in order:
      1. Dump ``config`` as YAML to the expanded user config path.
      2. When running inside Kubernetes, push the same config to the
         ConfigMap.
      3. Reload the in-process config.

    This function acquires no lock itself; presumably callers are expected
    to hold the config lock -- confirm at call sites.

    Args:
        config: The config to save and sync.
    """
    config_file = os.path.expanduser(get_user_config_path())

    # The on-disk file (a PVC in Kubernetes, a plain local file otherwise)
    # is always written and is the source of truth.
    common_utils.dump_yaml(config_file, dict(config))

    running_in_k8s = config_map_utils.is_running_in_kubernetes()
    if running_in_k8s:
        # The ConfigMap is only a convenience mirror of the PVC file so that
        # users can inspect the config easily.
        config_map_utils.patch_configmap_with_config(config, config_file)

    _reload_config()
|
sky/utils/db_utils.py
CHANGED
@@ -1,9 +1,17 @@
|
|
1
1
|
"""Utils for sky databases."""
|
2
2
|
import contextlib
|
3
|
+
import enum
|
3
4
|
import sqlite3
|
4
5
|
import threading
|
6
|
+
import typing
|
5
7
|
from typing import Any, Callable, Optional
|
6
8
|
|
9
|
+
import sqlalchemy
|
10
|
+
from sqlalchemy import exc as sqlalchemy_exc
|
11
|
+
|
12
|
+
if typing.TYPE_CHECKING:
|
13
|
+
from sqlalchemy.orm import Session
|
14
|
+
|
7
15
|
# This parameter (passed to sqlite3.connect) controls how long we will wait to
|
8
16
|
# obtains a database lock (not necessarily during connection, but whenever it is
|
9
17
|
# needed). It is not a connection timeout.
|
@@ -21,6 +29,11 @@ from typing import Any, Callable, Optional
|
|
21
29
|
_DB_TIMEOUT_S = 60
|
22
30
|
|
23
31
|
|
32
|
+
class SQLAlchemyDialect(enum.Enum):
    """Dialect names that are compared against a SQLAlchemy ``dialect.name``."""
    SQLITE = 'sqlite'
    POSTGRESQL = 'postgresql'
|
35
|
+
|
36
|
+
|
24
37
|
@contextlib.contextmanager
|
25
38
|
def safe_cursor(db_path: str):
|
26
39
|
"""A newly created, auto-committing, auto-closing cursor."""
|
@@ -71,6 +84,46 @@ def add_column_to_table(
|
|
71
84
|
conn.commit()
|
72
85
|
|
73
86
|
|
87
|
+
def add_column_to_table_sqlalchemy(
    session: 'Session',
    table_name: str,
    column_name: str,
    column_type: str,
    copy_from: Optional[str] = None,
    value_to_replace_existing_entries: Optional[Any] = None,
):
    """Add a column to a table through a SQLAlchemy session (SQLite only).

    Table and column names are interpolated directly into the SQL text, so
    they must come from trusted code, not from user input. Only the
    replacement value is passed as a bound parameter.

    Args:
        session: Session whose bound engine determines the dialect.
        table_name: Table to alter.
        column_name: Name of the column to add.
        column_type: SQL type (and any constraints) for the new column.
        copy_from: If given, an existing column whose values are copied into
            the new column for every row.
        value_to_replace_existing_entries: If given, the value written into
            the new column for rows where it is still NULL.

    Raises:
        ValueError: If the session's dialect is not SQLite. The session is
            rolled back first.
        sqlalchemy.exc.OperationalError: For any operational error other
            than the column already existing.
    """
    dialect_name = session.bind.dialect.name
    if dialect_name != SQLAlchemyDialect.SQLITE.value:
        # TODO(syang) support postgres dialect
        session.rollback()
        raise ValueError('Unsupported database dialect')

    add_column_stmt = sqlalchemy.text(f'ALTER TABLE {table_name} '
                                      f'ADD COLUMN {column_name} {column_type}')
    try:
        session.execute(add_column_stmt)
        if copy_from is not None:
            copy_stmt = sqlalchemy.text(f'UPDATE {table_name} '
                                        f'SET {column_name} = {copy_from}')
            session.execute(copy_stmt)
        if value_to_replace_existing_entries is not None:
            backfill_stmt = sqlalchemy.text(
                f'UPDATE {table_name} '
                f'SET {column_name} = :replacement_value '
                f'WHERE {column_name} IS NULL')
            session.execute(
                backfill_stmt,
                {'replacement_value': value_to_replace_existing_entries})
    except sqlalchemy_exc.OperationalError as e:
        # An already-existing column is tolerated; note that the copy and
        # backfill statements are skipped in that case.
        if 'duplicate column name' not in str(e):
            raise
    session.commit()
|
125
|
+
|
126
|
+
|
74
127
|
def rename_column(
|
75
128
|
cursor: 'sqlite3.Cursor',
|
76
129
|
conn: 'sqlite3.Connection',
|
@@ -0,0 +1,133 @@
|
|
1
|
+
"""Utilities for Kubernetes ConfigMap operations in SkyPilot."""
|
2
|
+
import os
|
3
|
+
|
4
|
+
from sky import sky_logging
|
5
|
+
from sky import skypilot_config
|
6
|
+
from sky.adaptors import kubernetes
|
7
|
+
from sky.utils import common_utils
|
8
|
+
|
9
|
+
logger = sky_logging.init_logger(__name__)
|
10
|
+
|
11
|
+
# Kubernetes ConfigMap sync constants
_KUBE_SERVICE_ACCOUNT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount'
_CONFIGMAP_SYNC_TIMEOUT = 10  # seconds


def is_running_in_kubernetes() -> bool:
    """Return whether this process appears to run inside a Kubernetes pod.

    The signal used is the presence of the service-account token file under
    ``_KUBE_SERVICE_ACCOUNT_PATH``.
    """
    token_file = _KUBE_SERVICE_ACCOUNT_PATH + '/token'
    return os.path.exists(token_file)
|
19
|
+
|
20
|
+
|
21
|
+
def _get_kubernetes_namespace() -> str:
    """Get the current Kubernetes namespace from the service account.

    Reads the ``namespace`` file under ``_KUBE_SERVICE_ACCOUNT_PATH``.

    Returns:
        The stripped contents of the namespace file, or ``'default'`` when
        the file is missing, unreadable, or contains only whitespace.
    """
    namespace_file = f'{_KUBE_SERVICE_ACCOUNT_PATH}/namespace'
    try:
        with open(namespace_file, encoding='utf-8') as f:
            namespace = f.read().strip()
    except OSError:
        # Covers a missing file as well as permission and other I/O errors
        # (IOError is just an alias of OSError in Python 3).
        return 'default'
    # A blank file would otherwise produce an empty namespace name; use the
    # same fallback as for a missing file.
    return namespace or 'default'
|
31
|
+
|
32
|
+
|
33
|
+
def _get_configmap_name() -> str:
    """Build the name of the ConfigMap that mirrors the SkyPilot config.

    The release name is taken from the first non-empty value among the
    ``HELM_RELEASE_NAME`` and ``SKYPILOT_RELEASE_NAME`` environment
    variables, falling back to ``'skypilot'``; ``-config`` is appended.
    """
    env_candidates = ('HELM_RELEASE_NAME', 'SKYPILOT_RELEASE_NAME')
    release_name = next(
        (value for value in map(os.getenv, env_candidates) if value),
        'skypilot')
    return f'{release_name}-config'
|
38
|
+
|
39
|
+
|
40
|
+
def initialize_configmap_sync_on_startup(config_file_path: str) -> None:
    """Initialize ConfigMap sync on API server startup.

    This syncs existing PVC config to ConfigMap if ConfigMap doesn't exist.
    This handles the upgrade scenario where an existing deployment has
    workspace configs on PVC but no ConfigMap exists.

    The operation is best-effort: it is a no-op outside Kubernetes or when
    the config file does not exist, an existing ConfigMap is never
    overwritten, and any failure is logged as a warning instead of raised.

    Args:
        config_file_path: Path to the config file to sync (``~`` is
            expanded).
    """
    config_file_path = os.path.expanduser(config_file_path)
    if not is_running_in_kubernetes() or not os.path.exists(config_file_path):
        return

    try:
        namespace = _get_kubernetes_namespace()
        configmap_name = _get_configmap_name()

        # Check if ConfigMap exists. The lookup is bounded by the same
        # timeout as the create/patch calls so that an unresponsive API
        # server cannot stall the caller indefinitely.
        try:
            kubernetes.core_api().read_namespaced_config_map(
                name=configmap_name,
                namespace=namespace,
                _request_timeout=_CONFIGMAP_SYNC_TIMEOUT)
            # ConfigMap exists, don't overwrite it
            logger.debug(f'ConfigMap {configmap_name} already exists')
            return
        except kubernetes.kubernetes.client.rest.ApiException as e:
            if e.status != 404:
                raise
            # ConfigMap doesn't exist, create it

        current_config = skypilot_config.parse_and_validate_config_file(
            config_file_path)
        config_yaml = common_utils.dump_yaml_str(dict(current_config))

        configmap_body = {
            'apiVersion': 'v1',
            'kind': 'ConfigMap',
            'metadata': {
                'name': configmap_name,
                'namespace': namespace,
                'labels': {
                    'app.kubernetes.io/name': 'skypilot',
                    'app.kubernetes.io/component': 'config'
                }
            },
            'data': {
                'config.yaml': config_yaml
            }
        }

        kubernetes.core_api().create_namespaced_config_map(
            namespace=namespace,
            body=configmap_body,
            _request_timeout=_CONFIGMAP_SYNC_TIMEOUT)

        logger.info(f'Synced PVC config to new ConfigMap {configmap_name}')

    except Exception as e:  # pylint: disable=broad-except
        # Deliberately best-effort: the ConfigMap is only a mirror.
        logger.warning(f'Failed to initialize ConfigMap sync: {e}')
|
99
|
+
|
100
|
+
|
101
|
+
def patch_configmap_with_config(config, config_file_path: str) -> None:
    """Mirror an updated config into the Kubernetes ConfigMap.

    Does nothing outside Kubernetes. The sync is best-effort: failures are
    logged as warnings and never raised to the caller.

    Args:
        config: The updated config to sync to the ConfigMap.
        config_file_path: Path to the config file; used to create the
            ConfigMap from the file when the ConfigMap does not exist yet.
    """
    if not is_running_in_kubernetes():
        return

    try:
        target_namespace = _get_kubernetes_namespace()
        target_name = _get_configmap_name()
        serialized = common_utils.dump_yaml_str(dict(config))

        try:
            kubernetes.core_api().patch_namespaced_config_map(
                name=target_name,
                namespace=target_namespace,
                body={'data': {
                    'config.yaml': serialized
                }},
                _request_timeout=_CONFIGMAP_SYNC_TIMEOUT)
        except kubernetes.kubernetes.client.rest.ApiException as e:
            if e.status != 404:
                # Any other API failure is reported by the outer handler.
                raise
            # ConfigMap doesn't exist, create it
            logger.info(f'ConfigMap {target_name} not found, creating')
            initialize_configmap_sync_on_startup(config_file_path)
        else:
            logger.debug(f'Synced config to ConfigMap {target_name}')

    except Exception as e:  # pylint: disable=broad-except
        logger.warning(f'Failed to sync config to ConfigMap: {e}')
|
@@ -624,6 +624,9 @@ def main():
|
|
624
624
|
kubeconfig_path = os.path.expanduser(args.kubeconfig_path)
|
625
625
|
global_use_ssh_config = args.use_ssh_config
|
626
626
|
|
627
|
+
failed_clusters = []
|
628
|
+
successful_clusters = []
|
629
|
+
|
627
630
|
# Print cleanup mode marker if applicable
|
628
631
|
if args.cleanup:
|
629
632
|
print('SKYPILOT_CLEANUP_MODE: Cleanup mode activated')
|
@@ -793,10 +796,21 @@ def main():
|
|
793
796
|
print(
|
794
797
|
f'{GREEN}==== Completed deployment for cluster: {cluster_name} ====${NC}'
|
795
798
|
)
|
799
|
+
successful_clusters.append(cluster_name)
|
796
800
|
except Exception as e: # pylint: disable=broad-except
|
801
|
+
reason = str(e)
|
802
|
+
failed_clusters.append((cluster_name, reason))
|
797
803
|
print(
|
798
|
-
f'{RED}Error deploying SSH Node Pool {cluster_name}: {
|
799
|
-
)
|
804
|
+
f'{RED}Error deploying SSH Node Pool {cluster_name}: {reason}{NC}'
|
805
|
+
) # Print for internal logging
|
806
|
+
|
807
|
+
if failed_clusters:
|
808
|
+
action = 'clean' if args.cleanup else 'deploy'
|
809
|
+
msg = f'{GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {NC}'
|
810
|
+
msg += f'{RED}Failed to {action} {len(failed_clusters)} cluster(s): {NC}'
|
811
|
+
for cluster_name, reason in failed_clusters:
|
812
|
+
msg += f'\n {cluster_name}: {reason}'
|
813
|
+
raise RuntimeError(msg)
|
800
814
|
|
801
815
|
|
802
816
|
def deploy_cluster(head_node,
|
@@ -847,8 +861,10 @@ def deploy_cluster(head_node,
|
|
847
861
|
print_output=True)
|
848
862
|
if result is None:
|
849
863
|
with ux_utils.print_exception_no_traceback():
|
850
|
-
raise RuntimeError(
|
851
|
-
|
864
|
+
raise RuntimeError(
|
865
|
+
f'Failed to SSH to head node ({head_node}). '
|
866
|
+
f'Please check the SSH configuration and logs for more details.'
|
867
|
+
)
|
852
868
|
|
853
869
|
# Checking history
|
854
870
|
history_exists = (history_worker_nodes is not None and
|
@@ -26,6 +26,48 @@ logger = sky_logging.init_logger(__name__)
|
|
26
26
|
DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
|
27
27
|
|
28
28
|
|
29
|
+
def check_ssh_cluster_dependencies(
        raise_error: bool = True) -> Optional[List[str]]:
    """Checks if the dependencies for ssh cluster are installed.

    Currently the only dependency checked is the ``jq`` binary, probed by
    running ``jq --version``.

    Args:
        raise_error: set to true when the dependency needs to be present.
            set to false for `sky check`, where reason strings are compiled
            at the end.

    Returns:
        The list of reason strings if there are missing dependencies and
        ``raise_error`` is false; ``None`` when nothing is missing.

    Raises:
        RuntimeError: If a dependency is missing and ``raise_error`` is
            true. The message is the reasons joined by newlines.
    """
    jq_message = '`jq` is required to setup ssh cluster.'

    reasons = []
    required_binaries = []

    # Ensure jq is installed
    try:
        subprocess.run(['jq', '--version'],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       check=True)
    except (OSError, subprocess.CalledProcessError):
        # OSError (not just FileNotFoundError) so that a `jq` that exists
        # but cannot be executed -- e.g. missing execute permission or a
        # bad binary format -- is reported as a missing dependency instead
        # of surfacing as an unrelated traceback.
        required_binaries.append('jq')
        reasons.append(jq_message)

    if not required_binaries:
        return None

    missing = ' '.join(required_binaries)
    reasons.extend([
        'On Debian/Ubuntu, install the missing dependenc(ies) with:',
        f' $ sudo apt install {missing}',
        'On MacOS, install with: ',
        f' $ brew install {missing}',
    ])
    if raise_error:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError('\n'.join(reasons))
    return reasons
|
69
|
+
|
70
|
+
|
29
71
|
def deploy_ssh_cluster(cleanup: bool = False,
|
30
72
|
infra: Optional[str] = None,
|
31
73
|
kubeconfig_path: Optional[str] = None):
|
@@ -41,6 +83,8 @@ def deploy_ssh_cluster(cleanup: bool = False,
|
|
41
83
|
kubeconfig_path: Path to save the Kubernetes configuration file.
|
42
84
|
If None, the default ~/.kube/config will be used.
|
43
85
|
"""
|
86
|
+
check_ssh_cluster_dependencies()
|
87
|
+
|
44
88
|
# Prepare command to call deploy_remote_cluster.py script
|
45
89
|
# TODO(romilb): We should move this to a native python method/class call
|
46
90
|
# instead of invoking a script with subprocess.
|
@@ -81,9 +125,9 @@ def deploy_ssh_cluster(cleanup: bool = False,
|
|
81
125
|
cmd=deploy_command,
|
82
126
|
log_path=log_path,
|
83
127
|
require_outputs=True,
|
84
|
-
stream_logs=False,
|
128
|
+
stream_logs=False,
|
85
129
|
line_processor=log_utils.SkySSHUpLineProcessor(log_path=log_path,
|
86
|
-
is_local=
|
130
|
+
is_local=False),
|
87
131
|
cwd=cwd,
|
88
132
|
env=env)
|
89
133
|
|
@@ -91,9 +135,9 @@ def deploy_ssh_cluster(cleanup: bool = False,
|
|
91
135
|
success = True
|
92
136
|
else:
|
93
137
|
with ux_utils.print_exception_no_traceback():
|
94
|
-
log_hint = ux_utils.log_path_hint(log_path, is_local=
|
95
|
-
raise RuntimeError('Failed to deploy SkyPilot on
|
96
|
-
f'
|
138
|
+
log_hint = ux_utils.log_path_hint(log_path, is_local=False)
|
139
|
+
raise RuntimeError('Failed to deploy SkyPilot on some Node Pools. '
|
140
|
+
f'{log_hint}'
|
97
141
|
f'\nError: {stderr}')
|
98
142
|
|
99
143
|
if success:
|
@@ -188,14 +188,17 @@ generate_credentials_json() {
|
|
188
188
|
debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"
|
189
189
|
|
190
190
|
# Check if we can create proper JSON with `jq`
|
191
|
-
if command -v jq &>/dev/null; then
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
191
|
+
if ! command -v jq &>/dev/null; then
|
192
|
+
echo "jq is not installed. Please install jq to use this script." >&2
|
193
|
+
exit 1
|
194
|
+
fi
|
195
|
+
debug_log "Using jq for JSON formatting"
|
196
|
+
|
197
|
+
# Create a temporary file for the JSON output to avoid shell escaping issues
|
198
|
+
local TEMP_JSON_FILE=$(mktemp)
|
199
|
+
|
200
|
+
# Write the JSON to the temporary file using jq for proper JSON formatting
|
201
|
+
cat > "$TEMP_JSON_FILE" << EOL
|
199
202
|
{
|
200
203
|
"apiVersion": "client.authentication.k8s.io/v1beta1",
|
201
204
|
"kind": "ExecCredential",
|
@@ -207,25 +210,14 @@ generate_credentials_json() {
|
|
207
210
|
}
|
208
211
|
EOL
|
209
212
|
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
else
|
219
|
-
debug_log "jq is not available, using simpler formatting method"
|
220
|
-
|
221
|
-
# Alternative approach: encode with base64 and use the token field instead
|
222
|
-
# This works because kubectl will decode token data properly
|
223
|
-
local combined_data=$(echo -n "${client_cert_data}:${client_key_data}" | base64 | tr -d '\n')
|
224
|
-
|
225
|
-
echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"$combined_data\",\"expirationTimestamp\":\"$expiration_time\"}}"
|
226
|
-
|
227
|
-
debug_log "Sent certificate data as encoded token instead of direct certificate fields"
|
228
|
-
fi
|
213
|
+
# Read the JSON from the file
|
214
|
+
local json_response=$(cat "$TEMP_JSON_FILE")
|
215
|
+
|
216
|
+
# Clean up
|
217
|
+
rm -f "$TEMP_JSON_FILE"
|
218
|
+
|
219
|
+
# Output the JSON
|
220
|
+
echo "$json_response"
|
229
221
|
else
|
230
222
|
# Fallback to token-based credential for tunnel-only authentication
|
231
223
|
echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
|
@@ -384,4 +376,4 @@ fi
|
|
384
376
|
|
385
377
|
# Return valid credential format with certificates if available
|
386
378
|
generate_credentials_json
|
387
|
-
exit 0
|
379
|
+
exit 0
|
File without changes
|