skypilot-nightly 1.0.0.dev20250526__py3-none-any.whl → 1.0.0.dev20250528__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +13 -1
- sky/backends/cloud_vm_ray_backend.py +2 -2
- sky/check.py +32 -6
- sky/cli.py +5 -22
- sky/client/cli.py +5 -22
- sky/client/sdk.py +5 -2
- sky/clouds/cloud.py +2 -2
- sky/clouds/kubernetes.py +12 -7
- sky/clouds/service_catalog/kubernetes_catalog.py +4 -0
- sky/clouds/ssh.py +24 -8
- sky/core.py +20 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/121-8f55ee3fa6301784.js +20 -0
- sky/dashboard/out/_next/static/chunks/{573-f17bd89d9f9118b3.js → 173-7db8607cefc20f70.js} +5 -5
- sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +6 -0
- sky/dashboard/out/_next/static/chunks/293-351268365226d251.js +1 -0
- sky/dashboard/out/_next/static/chunks/{498-d7722313e5e5b4e6.js → 320-afea3ddcc5bd1c6c.js} +1 -16
- sky/dashboard/out/_next/static/chunks/470-4d003c441839094d.js +1 -0
- sky/dashboard/out/_next/static/chunks/578-9146658cead92981.js +6 -0
- sky/dashboard/out/_next/static/chunks/843-256ec920f6d5f41f.js +11 -0
- sky/dashboard/out/_next/static/chunks/856-62b87c68917b08ed.js +1 -0
- sky/dashboard/out/_next/static/chunks/973-1a09cac61cfcc1e1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-159bffb2fa34ed54.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9506c00257d10dbd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e6d1ec6e1ac5b29.js → clusters-943992b84fd6f4ee.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-41738d1896fc02fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-a4efc09e61988f8d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-b2634885d67c49a6.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/{new-bbf436f41381e169.js → new-579b3203c7c19d84.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-7733c960685b4385.js → [name]-9388e38fac73ee8f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-610c49ae3619ee85.js +1 -0
- sky/dashboard/out/_next/static/css/ffd1cd601648c303.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +181 -134
- sky/provision/kubernetes/utils.py +4 -4
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +18 -5
- sky/server/requests/serializers/decoders.py +0 -11
- sky/server/server.py +25 -14
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/constants.py +4 -0
- sky/skypilot_config.py +4 -0
- sky/utils/db_utils.py +34 -46
- sky/utils/kubernetes/exec_kubeconfig_converter.py +19 -0
- sky/utils/schemas.py +57 -5
- sky/utils/subprocess_utils.py +2 -3
- sky/workspaces/core.py +186 -50
- sky/workspaces/server.py +25 -0
- {skypilot_nightly-1.0.0.dev20250526.dist-info → skypilot_nightly-1.0.0.dev20250528.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250526.dist-info → skypilot_nightly-1.0.0.dev20250528.dist-info}/RECORD +71 -67
- {skypilot_nightly-1.0.0.dev20250526.dist-info → skypilot_nightly-1.0.0.dev20250528.dist-info}/WHEEL +1 -1
- sky/dashboard/out/_next/static/7GEgRyZKRaSnYZCV1Jwol/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/25-062253ea41fb8eec.js +0 -6
- sky/dashboard/out/_next/static/chunks/480-5a0de8b6570ea105.js +0 -1
- sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +0 -15
- sky/dashboard/out/_next/static/chunks/578-d351125af46c293f.js +0 -6
- sky/dashboard/out/_next/static/chunks/734-a6e01d7f98904741.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-59956af3950b02ed.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-3b5aad09a25f64b7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/infra-abb7d744ecf15109.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-48dc8d67d4b60be1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/users-b8acf6e6735323a2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5ed48b3201b998c8.js +0 -1
- sky/dashboard/out/_next/static/css/28558d57108b05ae.css +0 -3
- /sky/dashboard/out/_next/static/{7GEgRyZKRaSnYZCV1Jwol → Mx1iAbDQn1jMHh3UHmK3R}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-96a715a6fb01e228.js → _app-a631df412d8172de.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250526.dist-info → skypilot_nightly-1.0.0.dev20250528.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250526.dist-info → skypilot_nightly-1.0.0.dev20250528.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250526.dist-info → skypilot_nightly-1.0.0.dev20250528.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,6 @@ from sky.provision.kubernetes import utils as kubernetes_utils
|
|
12
12
|
from sky.serve import serve_state
|
13
13
|
from sky.server import constants as server_constants
|
14
14
|
from sky.skylet import job_lib
|
15
|
-
from sky.utils import registry
|
16
15
|
from sky.utils import status_lib
|
17
16
|
|
18
17
|
if typing.TYPE_CHECKING:
|
@@ -135,16 +134,6 @@ def decode_cost_report(
|
|
135
134
|
return return_value
|
136
135
|
|
137
136
|
|
138
|
-
@register_decoders('enabled_clouds')
|
139
|
-
def decode_enabled_clouds(return_value: List[str]) -> List['clouds.Cloud']:
|
140
|
-
clouds = []
|
141
|
-
for cloud_name in return_value:
|
142
|
-
cloud = registry.CLOUD_REGISTRY.from_str(cloud_name)
|
143
|
-
assert cloud is not None, return_value
|
144
|
-
clouds.append(cloud)
|
145
|
-
return clouds
|
146
|
-
|
147
|
-
|
148
137
|
@register_decoders('list_accelerators')
|
149
138
|
def decode_list_accelerators(
|
150
139
|
return_value: Dict[str, List[List[Any]]]
|
sky/server/server.py
CHANGED
@@ -127,6 +127,11 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
127
127
|
|
128
128
|
async def dispatch(self, request: fastapi.Request, call_next):
|
129
129
|
auth_user = _get_auth_user_header(request)
|
130
|
+
|
131
|
+
# Add user to database if auth_user is present
|
132
|
+
if auth_user is not None:
|
133
|
+
global_user_state.add_or_update_user(auth_user)
|
134
|
+
|
130
135
|
body = await request.body()
|
131
136
|
if auth_user and body:
|
132
137
|
try:
|
@@ -137,10 +142,16 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
137
142
|
logger.debug(f'Overriding user for {request.state.request_id}: '
|
138
143
|
f'{auth_user.name}, {auth_user.id}')
|
139
144
|
if 'env_vars' in original_json:
|
140
|
-
original_json
|
141
|
-
|
142
|
-
|
143
|
-
|
145
|
+
if isinstance(original_json.get('env_vars'), dict):
|
146
|
+
original_json['env_vars'][
|
147
|
+
constants.USER_ID_ENV_VAR] = auth_user.id
|
148
|
+
original_json['env_vars'][
|
149
|
+
constants.USER_ENV_VAR] = auth_user.name
|
150
|
+
else:
|
151
|
+
logger.warning(
|
152
|
+
f'"env_vars" in request body is not a dictionary '
|
153
|
+
f'for request {request.state.request_id}. '
|
154
|
+
'Skipping user info injection into body.')
|
144
155
|
request._body = json.dumps(original_json).encode('utf-8') # pylint: disable=protected-access
|
145
156
|
return await call_next(request)
|
146
157
|
|
@@ -262,10 +273,7 @@ app.include_router(workspaces_rest.router,
|
|
262
273
|
|
263
274
|
@app.get('/token')
|
264
275
|
async def token(request: fastapi.Request) -> fastapi.responses.HTMLResponse:
|
265
|
-
# If we have auth info, save this user to the database.
|
266
276
|
user = _get_auth_user_header(request)
|
267
|
-
if user is not None:
|
268
|
-
global_user_state.add_or_update_user(user)
|
269
277
|
|
270
278
|
token_data = {
|
271
279
|
'v': 1, # Token version number, bump for backwards incompatible.
|
@@ -315,12 +323,14 @@ async def check(request: fastapi.Request,
|
|
315
323
|
|
316
324
|
@app.get('/enabled_clouds')
|
317
325
|
async def enabled_clouds(request: fastapi.Request,
|
318
|
-
workspace: Optional[str] = None
|
326
|
+
workspace: Optional[str] = None,
|
327
|
+
expand: bool = False) -> None:
|
319
328
|
"""Gets enabled clouds on the server."""
|
320
329
|
executor.schedule_request(
|
321
330
|
request_id=request.state.request_id,
|
322
331
|
request_name='enabled_clouds',
|
323
|
-
request_body=payloads.EnabledCloudsBody(workspace=workspace
|
332
|
+
request_body=payloads.EnabledCloudsBody(workspace=workspace,
|
333
|
+
expand=expand),
|
324
334
|
func=core.enabled_clouds,
|
325
335
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
326
336
|
)
|
@@ -411,6 +421,10 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
|
|
411
421
|
logger.debug(f'Validating tasks: {validate_body.dag}')
|
412
422
|
|
413
423
|
context.initialize()
|
424
|
+
ctx = context.get()
|
425
|
+
assert ctx is not None
|
426
|
+
# TODO(aylei): generalize this to all requests without a db record.
|
427
|
+
ctx.override_envs(validate_body.env_vars)
|
414
428
|
|
415
429
|
def validate_dag(dag: dag_utils.dag_lib.Dag):
|
416
430
|
# TODO: Admin policy may contain arbitrary code, which may be expensive
|
@@ -1189,13 +1203,10 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
|
|
1189
1203
|
|
1190
1204
|
|
1191
1205
|
@app.websocket('/kubernetes-pod-ssh-proxy')
|
1192
|
-
async def kubernetes_pod_ssh_proxy(
|
1193
|
-
|
1194
|
-
cluster_name_body: payloads.ClusterNameBody = fastapi.Depends()
|
1195
|
-
) -> None:
|
1206
|
+
async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
|
1207
|
+
cluster_name: str) -> None:
|
1196
1208
|
"""Proxies SSH to the Kubernetes pod with websocket."""
|
1197
1209
|
await websocket.accept()
|
1198
|
-
cluster_name = cluster_name_body.cluster_name
|
1199
1210
|
logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
|
1200
1211
|
|
1201
1212
|
cluster_records = core.status(cluster_name, all_users=True)
|
sky/setup_files/dependencies.py
CHANGED
sky/skylet/constants.py
CHANGED
@@ -408,3 +408,7 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
|
|
408
408
|
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
409
409
|
|
410
410
|
SKYPILOT_DEFAULT_WORKSPACE = 'default'
|
411
|
+
|
412
|
+
# Experimental - may be deprecated in the future without notice.
|
413
|
+
SKYPILOT_API_SERVER_DB_URL_ENV_VAR: str = (
|
414
|
+
f'{SKYPILOT_ENV_VAR_PREFIX}API_SERVER_DB_URL')
|
sky/skypilot_config.py
CHANGED
@@ -299,6 +299,10 @@ def get_nested(keys: Tuple[str, ...],
|
|
299
299
|
def get_workspace_cloud(cloud: str,
|
300
300
|
workspace: Optional[str] = None) -> config_utils.Config:
|
301
301
|
"""Returns the workspace config."""
|
302
|
+
# TODO(zhwu): Instead of just returning the workspace specific config, we
|
303
|
+
# should return the config that already merges the global config, so that
|
304
|
+
# the caller does not need to manually merge the global config with
|
305
|
+
# the workspace specific config.
|
302
306
|
if workspace is None:
|
303
307
|
workspace = get_active_workspace()
|
304
308
|
clouds = get_nested(keys=(
|
sky/utils/db_utils.py
CHANGED
@@ -88,60 +88,48 @@ def add_column_to_table_sqlalchemy(
|
|
88
88
|
session: 'Session',
|
89
89
|
table_name: str,
|
90
90
|
column_name: str,
|
91
|
-
column_type:
|
91
|
+
column_type: sqlalchemy.types.TypeEngine,
|
92
|
+
default_statement: Optional[str] = None,
|
92
93
|
copy_from: Optional[str] = None,
|
93
94
|
value_to_replace_existing_entries: Optional[Any] = None,
|
94
95
|
):
|
95
96
|
"""Add a column to a table."""
|
96
|
-
|
97
|
-
|
98
|
-
|
97
|
+
# column type may be different for different dialects.
|
98
|
+
# for example, sqlite uses BLOB for LargeBinary
|
99
|
+
# while postgres uses BYTEA.
|
100
|
+
column_type_str = column_type.compile(dialect=session.bind.dialect)
|
101
|
+
default_statement_str = (f' {default_statement}'
|
102
|
+
if default_statement is not None else '')
|
103
|
+
try:
|
104
|
+
session.execute(
|
105
|
+
sqlalchemy.text(f'ALTER TABLE {table_name} '
|
106
|
+
f'ADD COLUMN {column_name} {column_type_str}'
|
107
|
+
f'{default_statement_str}'))
|
108
|
+
if copy_from is not None:
|
99
109
|
session.execute(
|
100
|
-
sqlalchemy.text(f'
|
101
|
-
f'
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
raise ValueError('Unsupported database dialect')
|
121
|
-
else:
|
122
|
-
session.rollback()
|
123
|
-
raise ValueError('Unsupported database dialect')
|
110
|
+
sqlalchemy.text(f'UPDATE {table_name} '
|
111
|
+
f'SET {column_name} = {copy_from}'))
|
112
|
+
if value_to_replace_existing_entries is not None:
|
113
|
+
session.execute(
|
114
|
+
sqlalchemy.text(f'UPDATE {table_name} '
|
115
|
+
f'SET {column_name} = :replacement_value '
|
116
|
+
f'WHERE {column_name} IS NULL'),
|
117
|
+
{'replacement_value': value_to_replace_existing_entries})
|
118
|
+
#sqlite
|
119
|
+
except sqlalchemy_exc.OperationalError as e:
|
120
|
+
if 'duplicate column name' in str(e):
|
121
|
+
pass
|
122
|
+
else:
|
123
|
+
raise
|
124
|
+
#postgressql
|
125
|
+
except sqlalchemy_exc.ProgrammingError as e:
|
126
|
+
if 'already exists' in str(e):
|
127
|
+
pass
|
128
|
+
else:
|
129
|
+
raise
|
124
130
|
session.commit()
|
125
131
|
|
126
132
|
|
127
|
-
def rename_column(
|
128
|
-
cursor: 'sqlite3.Cursor',
|
129
|
-
conn: 'sqlite3.Connection',
|
130
|
-
table_name: str,
|
131
|
-
old_name: str,
|
132
|
-
new_name: str,
|
133
|
-
):
|
134
|
-
"""Rename a column in a table."""
|
135
|
-
# NOTE: This only works for sqlite3 >= 3.25.0. Be careful to use this.
|
136
|
-
|
137
|
-
for row in cursor.execute(f'PRAGMA table_info({table_name})'):
|
138
|
-
if row[1] == old_name:
|
139
|
-
cursor.execute(f'ALTER TABLE {table_name} '
|
140
|
-
f'RENAME COLUMN {old_name} to {new_name}')
|
141
|
-
break
|
142
|
-
conn.commit()
|
143
|
-
|
144
|
-
|
145
133
|
class SQLiteConn(threading.local):
|
146
134
|
"""Thread-local connection to the sqlite3 database."""
|
147
135
|
|
@@ -5,6 +5,9 @@ the 'command' field in the exec configuration, leaving only the executable name.
|
|
5
5
|
This is useful when moving between different environments where auth plugin
|
6
6
|
executables might be installed in different locations.
|
7
7
|
|
8
|
+
For Nebius kubeconfigs, it also changes the --profile argument to 'sky' to
|
9
|
+
ensure compatibility with SkyPilot's expected profile configuration.
|
10
|
+
|
8
11
|
It assumes the target environment has the auth executable available in PATH.
|
9
12
|
If not, you'll need to update your environment container to include the auth
|
10
13
|
executable in PATH.
|
@@ -21,6 +24,8 @@ import yaml
|
|
21
24
|
def strip_auth_plugin_paths(kubeconfig_path: str, output_path: str):
|
22
25
|
"""Strip path information from exec plugin commands in a kubeconfig file.
|
23
26
|
|
27
|
+
For Nebius kubeconfigs, also changes the --profile argument to 'sky'.
|
28
|
+
|
24
29
|
Args:
|
25
30
|
kubeconfig_path (str): Path to the input kubeconfig file
|
26
31
|
output_path (str): Path where the modified kubeconfig will be saved
|
@@ -40,6 +45,20 @@ def strip_auth_plugin_paths(kubeconfig_path: str, output_path: str):
|
|
40
45
|
exec_info['command'] = executable
|
41
46
|
updated = True
|
42
47
|
|
48
|
+
# Handle Nebius kubeconfigs: change --profile to 'sky'
|
49
|
+
if executable == 'nebius' or current_command == 'nebius':
|
50
|
+
args = exec_info.get('args', [])
|
51
|
+
if args and '--profile' in args:
|
52
|
+
try:
|
53
|
+
profile_index = args.index('--profile')
|
54
|
+
if profile_index + 1 < len(args):
|
55
|
+
old_profile = args[profile_index + 1]
|
56
|
+
if old_profile != 'sky':
|
57
|
+
args[profile_index + 1] = 'sky'
|
58
|
+
updated = True
|
59
|
+
except ValueError:
|
60
|
+
pass # --profile not found in args
|
61
|
+
|
43
62
|
if updated:
|
44
63
|
with open(output_path, 'w', encoding='utf-8') as file:
|
45
64
|
yaml.safe_dump(config, file)
|
sky/utils/schemas.py
CHANGED
@@ -1044,6 +1044,25 @@ def get_config_schema():
|
|
1044
1044
|
},
|
1045
1045
|
}
|
1046
1046
|
},
|
1047
|
+
'ssh': {
|
1048
|
+
'type': 'object',
|
1049
|
+
'required': [],
|
1050
|
+
'additionalProperties': False,
|
1051
|
+
'properties': {
|
1052
|
+
'allowed_node_pools': {
|
1053
|
+
'type': 'array',
|
1054
|
+
'items': {
|
1055
|
+
'type': 'string',
|
1056
|
+
},
|
1057
|
+
},
|
1058
|
+
'pod_config': {
|
1059
|
+
'type': 'object',
|
1060
|
+
'required': [],
|
1061
|
+
# Allow arbitrary keys since validating pod spec is hard
|
1062
|
+
'additionalProperties': True,
|
1063
|
+
},
|
1064
|
+
}
|
1065
|
+
},
|
1047
1066
|
'oci': {
|
1048
1067
|
'type': 'object',
|
1049
1068
|
'required': [],
|
@@ -1177,12 +1196,13 @@ def get_config_schema():
|
|
1177
1196
|
|
1178
1197
|
allowed_workspace_cloud_names = list(
|
1179
1198
|
service_catalog.ALL_CLOUDS) + ['cloudflare']
|
1180
|
-
# Create pattern for
|
1181
|
-
|
1199
|
+
# Create pattern for not supported clouds, i.e.
|
1200
|
+
# all clouds except gcp, kubernetes, ssh
|
1201
|
+
not_supported_clouds = [
|
1182
1202
|
cloud for cloud in allowed_workspace_cloud_names
|
1183
|
-
if cloud.lower()
|
1203
|
+
if cloud.lower() not in ['gcp', 'kubernetes', 'ssh']
|
1184
1204
|
]
|
1185
|
-
|
1205
|
+
not_supported_cloud_regex = '|'.join(not_supported_clouds)
|
1186
1206
|
workspaces_schema = {
|
1187
1207
|
'type': 'object',
|
1188
1208
|
'required': [],
|
@@ -1192,7 +1212,7 @@ def get_config_schema():
|
|
1192
1212
|
'additionalProperties': False,
|
1193
1213
|
'patternProperties': {
|
1194
1214
|
# Pattern for non-GCP clouds - only allows 'disabled' property
|
1195
|
-
f'^({
|
1215
|
+
f'^({not_supported_cloud_regex})$': {
|
1196
1216
|
'type': 'object',
|
1197
1217
|
'additionalProperties': False,
|
1198
1218
|
'properties': {
|
@@ -1217,6 +1237,38 @@ def get_config_schema():
|
|
1217
1237
|
},
|
1218
1238
|
'additionalProperties': False,
|
1219
1239
|
},
|
1240
|
+
'ssh': {
|
1241
|
+
'type': 'object',
|
1242
|
+
'required': [],
|
1243
|
+
'properties': {
|
1244
|
+
'allowed_node_pools': {
|
1245
|
+
'type': 'array',
|
1246
|
+
'items': {
|
1247
|
+
'type': 'string',
|
1248
|
+
},
|
1249
|
+
},
|
1250
|
+
'disabled': {
|
1251
|
+
'type': 'boolean'
|
1252
|
+
},
|
1253
|
+
},
|
1254
|
+
'additionalProperties': False,
|
1255
|
+
},
|
1256
|
+
'kubernetes': {
|
1257
|
+
'type': 'object',
|
1258
|
+
'required': [],
|
1259
|
+
'properties': {
|
1260
|
+
'allowed_contexts': {
|
1261
|
+
'type': 'array',
|
1262
|
+
'items': {
|
1263
|
+
'type': 'string',
|
1264
|
+
},
|
1265
|
+
},
|
1266
|
+
'disabled': {
|
1267
|
+
'type': 'boolean'
|
1268
|
+
},
|
1269
|
+
},
|
1270
|
+
'additionalProperties': False,
|
1271
|
+
},
|
1220
1272
|
},
|
1221
1273
|
},
|
1222
1274
|
}
|
sky/utils/subprocess_utils.py
CHANGED
@@ -246,11 +246,10 @@ def kill_process_with_grace_period(proc: GenericProcess,
|
|
246
246
|
# The child process may have already been terminated.
|
247
247
|
return
|
248
248
|
except psutil.TimeoutExpired:
|
249
|
-
# Pass to finally to force kill the process.
|
250
|
-
pass
|
251
|
-
finally:
|
252
249
|
logger.debug(f'Process {proc.pid} did not terminate after '
|
253
250
|
f'{grace_period} seconds')
|
251
|
+
# Continue to finally to force kill the process.
|
252
|
+
finally:
|
254
253
|
# Attempt to force kill if the normal termination fails
|
255
254
|
if not force:
|
256
255
|
logger.debug(f'Force killing process {proc.pid}')
|
sky/workspaces/core.py
CHANGED
@@ -13,6 +13,7 @@ from sky import skypilot_config
|
|
13
13
|
from sky.skylet import constants
|
14
14
|
from sky.usage import usage_lib
|
15
15
|
from sky.utils import common_utils
|
16
|
+
from sky.utils import config_utils
|
16
17
|
from sky.utils import schemas
|
17
18
|
|
18
19
|
logger = sky_logging.init_logger(__name__)
|
@@ -88,70 +89,103 @@ def _check_workspace_has_no_active_resources(workspace_name: str,
|
|
88
89
|
Raises:
|
89
90
|
ValueError: If the workspace has active clusters or managed jobs.
|
90
91
|
"""
|
92
|
+
_check_workspaces_have_no_active_resources([(workspace_name, operation)])
|
91
93
|
|
92
|
-
def check_clusters():
|
93
|
-
# Check for active clusters
|
94
|
-
all_clusters = global_user_state.get_clusters()
|
95
|
-
workspace_clusters = [
|
96
|
-
cluster for cluster in all_clusters
|
97
|
-
if (cluster.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
|
98
|
-
== workspace_name)
|
99
|
-
]
|
100
|
-
return workspace_clusters
|
101
94
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
from sky.jobs.server import core as managed_jobs_core
|
95
|
+
def _check_workspaces_have_no_active_resources(
|
96
|
+
workspace_operations: list) -> None:
|
97
|
+
"""Check if workspaces have active clusters or managed jobs.
|
106
98
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
managed_jobs = managed_jobs_core.queue(refresh=False,
|
111
|
-
skip_finished=True,
|
112
|
-
all_users=True)
|
113
|
-
|
114
|
-
workspace_active_jobs = [
|
115
|
-
job for job in managed_jobs
|
116
|
-
if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) ==
|
117
|
-
workspace_name
|
118
|
-
]
|
99
|
+
Args:
|
100
|
+
workspace_operations: List of tuples (workspace_name, operation) where
|
101
|
+
operation is 'update' or 'delete'.
|
119
102
|
|
120
|
-
|
103
|
+
Raises:
|
104
|
+
ValueError: If any workspace has active clusters or managed jobs.
|
105
|
+
The error message will include all workspaces with issues.
|
106
|
+
"""
|
107
|
+
if not workspace_operations:
|
108
|
+
return
|
109
|
+
|
110
|
+
def get_all_clusters():
|
111
|
+
return global_user_state.get_clusters()
|
121
112
|
|
113
|
+
def get_all_managed_jobs():
|
114
|
+
# pylint: disable=import-outside-toplevel
|
115
|
+
from sky.jobs.server import core as managed_jobs_core
|
116
|
+
try:
|
117
|
+
return managed_jobs_core.queue(refresh=False,
|
118
|
+
skip_finished=True,
|
119
|
+
all_users=True)
|
122
120
|
except exceptions.ClusterNotUpError:
|
123
|
-
# If we can't check managed jobs (e.g., controller not running),
|
124
|
-
# log a warning but don't fail the operation
|
125
121
|
logger.warning('All jobs should be finished in workspace.')
|
126
122
|
return []
|
127
123
|
|
128
|
-
#
|
124
|
+
# Fetch both clusters and jobs in parallel
|
129
125
|
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
130
|
-
|
131
|
-
jobs_future = executor.submit(
|
126
|
+
clusters_future = executor.submit(get_all_clusters)
|
127
|
+
jobs_future = executor.submit(get_all_managed_jobs)
|
128
|
+
|
129
|
+
all_clusters = clusters_future.result()
|
130
|
+
all_managed_jobs = jobs_future.result()
|
132
131
|
|
133
|
-
|
134
|
-
|
135
|
-
|
132
|
+
# Collect all error messages instead of raising immediately
|
133
|
+
error_messages = []
|
134
|
+
|
135
|
+
# Check each workspace against the fetched data
|
136
|
+
for workspace_name, operation in workspace_operations:
|
137
|
+
# Filter clusters for this workspace
|
138
|
+
workspace_clusters = [
|
139
|
+
cluster for cluster in all_clusters
|
140
|
+
if (cluster.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
|
141
|
+
== workspace_name)
|
142
|
+
]
|
136
143
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
144
|
+
# Filter managed jobs for this workspace
|
145
|
+
workspace_active_jobs = [
|
146
|
+
job for job in all_managed_jobs
|
147
|
+
if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) ==
|
148
|
+
workspace_name
|
141
149
|
]
|
142
|
-
cluster_list = ', '.join(active_cluster_names)
|
143
|
-
raise ValueError(
|
144
|
-
f'Cannot {operation} workspace {workspace_name!r} because it has '
|
145
|
-
f'{len(workspace_clusters)} active cluster(s): {cluster_list}. '
|
146
|
-
f'Please terminate these clusters first.')
|
147
150
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
151
|
+
# Collect error messages for this workspace
|
152
|
+
workspace_errors = []
|
153
|
+
|
154
|
+
if workspace_clusters:
|
155
|
+
active_cluster_names = [
|
156
|
+
cluster['name'] for cluster in workspace_clusters
|
157
|
+
]
|
158
|
+
cluster_list = ', '.join(active_cluster_names)
|
159
|
+
workspace_errors.append(
|
160
|
+
f'{len(workspace_clusters)} active cluster(s): {cluster_list}')
|
161
|
+
|
162
|
+
if workspace_active_jobs:
|
163
|
+
job_names = [job['job_id'] for job in workspace_active_jobs]
|
164
|
+
job_list = ', '.join(job_names)
|
165
|
+
workspace_errors.append(
|
166
|
+
f'{len(workspace_active_jobs)} active managed job(s): '
|
167
|
+
f'{job_list}')
|
168
|
+
|
169
|
+
# If this workspace has issues, add to overall error messages
|
170
|
+
if workspace_errors:
|
171
|
+
workspace_error_summary = ' and '.join(workspace_errors)
|
172
|
+
error_messages.append(
|
173
|
+
f'Cannot {operation} workspace {workspace_name!r} because it '
|
174
|
+
f'has {workspace_error_summary}.')
|
175
|
+
|
176
|
+
# If we collected any errors, raise them all together
|
177
|
+
if error_messages:
|
178
|
+
if len(error_messages) == 1:
|
179
|
+
# Single workspace error
|
180
|
+
full_message = error_messages[
|
181
|
+
0] + ' Please terminate these resources first.'
|
182
|
+
else:
|
183
|
+
# Multiple workspace errors
|
184
|
+
full_message = (f'Cannot proceed due to active resources in '
|
185
|
+
f'{len(error_messages)} workspace(s):\n' +
|
186
|
+
'\n'.join(f'• {msg}' for msg in error_messages) +
|
187
|
+
'\nPlease terminate these resources first.')
|
188
|
+
raise ValueError(full_message)
|
155
189
|
|
156
190
|
|
157
191
|
def _validate_workspace_config(workspace_name: str,
|
@@ -293,3 +327,105 @@ def delete_workspace(workspace_name: str) -> Dict[str, Any]:
|
|
293
327
|
|
294
328
|
# Use the internal helper function to save
|
295
329
|
return _update_workspaces_config(delete_workspace_fn)
|
330
|
+
|
331
|
+
|
332
|
+
# =========================
|
333
|
+
# = Config Management =
|
334
|
+
# =========================
|
335
|
+
|
336
|
+
|
337
|
+
@usage_lib.entrypoint
|
338
|
+
def get_config() -> Dict[str, Any]:
|
339
|
+
"""Returns the entire SkyPilot configuration.
|
340
|
+
|
341
|
+
Returns:
|
342
|
+
The complete SkyPilot configuration as a dictionary.
|
343
|
+
"""
|
344
|
+
return skypilot_config.to_dict()
|
345
|
+
|
346
|
+
|
347
|
+
@usage_lib.entrypoint
|
348
|
+
def update_config(config: Dict[str, Any]) -> Dict[str, Any]:
|
349
|
+
"""Updates the entire SkyPilot configuration.
|
350
|
+
|
351
|
+
Args:
|
352
|
+
config: The new configuration to save.
|
353
|
+
|
354
|
+
Returns:
|
355
|
+
The updated configuration.
|
356
|
+
|
357
|
+
Raises:
|
358
|
+
ValueError: If the configuration is invalid, or if there are
|
359
|
+
active clusters or managed jobs in workspaces being modified.
|
360
|
+
FileNotFoundError: If the config file cannot be found.
|
361
|
+
PermissionError: If the config file cannot be written.
|
362
|
+
"""
|
363
|
+
# Validate the configuration using the schema
|
364
|
+
try:
|
365
|
+
common_utils.validate_schema(config, schemas.get_config_schema(),
|
366
|
+
'Invalid SkyPilot configuration: ')
|
367
|
+
except exceptions.InvalidSkyPilotConfigError as e:
|
368
|
+
raise ValueError(str(e)) from e
|
369
|
+
|
370
|
+
# Check for API server changes and validate them
|
371
|
+
current_config = skypilot_config.to_dict()
|
372
|
+
|
373
|
+
current_endpoint = current_config.get('api_server', {}).get('endpoint')
|
374
|
+
new_endpoint = config.get('api_server', {}).get('endpoint')
|
375
|
+
if current_endpoint != new_endpoint:
|
376
|
+
raise ValueError('API server endpoint should not be changed to avoid '
|
377
|
+
'unexpected behavior.')
|
378
|
+
|
379
|
+
# Check for workspace changes and validate them
|
380
|
+
current_workspaces = current_config.get('workspaces', {})
|
381
|
+
new_workspaces = config.get('workspaces', {})
|
382
|
+
|
383
|
+
# Collect all workspaces that need to be checked for active resources
|
384
|
+
workspaces_to_check = []
|
385
|
+
|
386
|
+
# Check each workspace that is being modified
|
387
|
+
for workspace_name, new_workspace_config in new_workspaces.items():
|
388
|
+
current_workspace_config = current_workspaces.get(workspace_name, {})
|
389
|
+
|
390
|
+
# If workspace configuration is changing, validate and mark for checking
|
391
|
+
if current_workspace_config != new_workspace_config:
|
392
|
+
_validate_workspace_config(workspace_name, new_workspace_config)
|
393
|
+
workspaces_to_check.append((workspace_name, 'update'))
|
394
|
+
|
395
|
+
# Check for workspace deletions
|
396
|
+
for workspace_name in current_workspaces:
|
397
|
+
if workspace_name not in new_workspaces:
|
398
|
+
# Workspace is being deleted
|
399
|
+
if workspace_name == constants.SKYPILOT_DEFAULT_WORKSPACE:
|
400
|
+
raise ValueError(f'Cannot delete the default workspace '
|
401
|
+
f'{constants.SKYPILOT_DEFAULT_WORKSPACE!r}.')
|
402
|
+
workspaces_to_check.append((workspace_name, 'delete'))
|
403
|
+
|
404
|
+
# Check all workspaces for active resources in one efficient call
|
405
|
+
_check_workspaces_have_no_active_resources(workspaces_to_check)
|
406
|
+
|
407
|
+
# Use file locking to prevent race conditions
|
408
|
+
lock_path = skypilot_config.get_skypilot_config_lock_path()
|
409
|
+
try:
|
410
|
+
with filelock.FileLock(lock_path,
|
411
|
+
_WORKSPACE_CONFIG_LOCK_TIMEOUT_SECONDS):
|
412
|
+
# Convert to config_utils.Config and save
|
413
|
+
config_obj = config_utils.Config.from_dict(config)
|
414
|
+
skypilot_config.update_config_no_lock(config_obj)
|
415
|
+
except filelock.Timeout as e:
|
416
|
+
raise RuntimeError(
|
417
|
+
f'Failed to update configuration due to a timeout '
|
418
|
+
f'when trying to acquire the lock at {lock_path}. This may '
|
419
|
+
'indicate another SkyPilot process is currently updating the '
|
420
|
+
'configuration. Please try again or manually remove the lock '
|
421
|
+
f'file if you believe it is stale.') from e
|
422
|
+
|
423
|
+
# Validate the configuration by running sky check
|
424
|
+
try:
|
425
|
+
sky_check.check(quiet=True)
|
426
|
+
except Exception as e: # pylint: disable=broad-except
|
427
|
+
logger.warning(f'Configuration saved but '
|
428
|
+
f'validation check failed: {e}')
|
429
|
+
# Don't fail the update if the check fails, just warn
|
430
|
+
|
431
|
+
return config
|