skypilot-nightly 1.0.0.dev20250627__py3-none-any.whl → 1.0.0.dev20250628__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +7 -0
- sky/adaptors/nebius.py +2 -2
- sky/authentication.py +12 -5
- sky/backends/backend_utils.py +92 -26
- sky/check.py +5 -2
- sky/client/cli/command.py +38 -6
- sky/client/sdk.py +217 -167
- sky/client/service_account_auth.py +47 -0
- sky/clouds/aws.py +10 -4
- sky/clouds/azure.py +5 -2
- sky/clouds/cloud.py +5 -2
- sky/clouds/gcp.py +31 -18
- sky/clouds/kubernetes.py +54 -34
- sky/clouds/nebius.py +8 -2
- sky/clouds/ssh.py +5 -2
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +22 -7
- sky/clouds/utils/oci_utils.py +62 -14
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{HudU4f4Xsy-cP51JvXSZ- → ZYLkkWSYZjJhLVsObh20y}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/43-f38a531f6692f281.js +1 -0
- sky/dashboard/out/_next/static/chunks/601-111d06d9ded11d00.js +1 -0
- sky/dashboard/out/_next/static/chunks/{616-d6128fa9e7cae6e6.js → 616-50a620ac4a23deb4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/691.fd9292250ab089af.js +21 -0
- sky/dashboard/out/_next/static/chunks/{785.dc2686c3c1235554.js → 785.3446c12ffdf3d188.js} +1 -1
- sky/dashboard/out/_next/static/chunks/871-e547295e7e21399c.js +6 -0
- sky/dashboard/out/_next/static/chunks/937.72796f7afe54075b.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-0a770415b5ce4649.js +1 -0
- sky/dashboard/out/_next/static/chunks/982.d7bd80ed18cad4cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-21080826c6095f21.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-77d4816945b04793.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-f119a5630a1efd61.js → clusters-65b2c90320b8afb8.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-64bdc0b2d3a44709.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/{jobs-0a5695ff3075d94a.js → jobs-df7407b5e37d3750.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-4978cbb093e141e7.js → users-d7684eaa04c4f58f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-cb7e720b739de53a.js → [name]-04e1b3ad4207b1e9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-50e230828730cfb3.js → workspaces-c470366a6179f16e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-08fdb9e6070127fc.js → webpack-75a3310ef922a299.js} +1 -1
- sky/dashboard/out/_next/static/css/605ac87514049058.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +8 -3
- sky/global_user_state.py +257 -9
- sky/jobs/client/sdk.py +20 -25
- sky/models.py +16 -0
- sky/provision/kubernetes/config.py +1 -1
- sky/provision/kubernetes/instance.py +7 -4
- sky/provision/kubernetes/network.py +15 -9
- sky/provision/kubernetes/network_utils.py +42 -23
- sky/provision/kubernetes/utils.py +73 -35
- sky/provision/nebius/utils.py +10 -4
- sky/resources.py +10 -4
- sky/serve/client/sdk.py +28 -34
- sky/server/common.py +51 -3
- sky/server/constants.py +3 -0
- sky/server/requests/executor.py +4 -0
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +19 -0
- sky/server/rest.py +6 -15
- sky/server/server.py +121 -6
- sky/skylet/constants.py +6 -0
- sky/skypilot_config.py +32 -4
- sky/users/permission.py +29 -0
- sky/users/server.py +384 -5
- sky/users/token_service.py +196 -0
- sky/utils/common_utils.py +4 -5
- sky/utils/config_utils.py +41 -0
- sky/utils/controller_utils.py +5 -1
- sky/utils/resource_checker.py +153 -0
- sky/utils/resources_utils.py +12 -4
- sky/utils/schemas.py +87 -60
- sky/utils/subprocess_utils.py +2 -6
- sky/workspaces/core.py +9 -117
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250628.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250628.dist-info}/RECORD +94 -91
- sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +0 -1
- sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +0 -16
- sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +0 -6
- sky/dashboard/out/_next/static/chunks/937.3759f538f11a0953.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +0 -1
- sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +0 -1
- sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +0 -16
- sky/dashboard/out/_next/static/css/52082cf558ec9705.css +0 -3
- /sky/dashboard/out/_next/static/{HudU4f4Xsy-cP51JvXSZ- → ZYLkkWSYZjJhLVsObh20y}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-9a3ce3170d2edcec.js → _app-050a9e637b057b24.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250628.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250628.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250628.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250627.dist-info → skypilot_nightly-1.0.0.dev20250628.dist-info}/top_level.txt +0 -0
sky/utils/config_utils.py
CHANGED
@@ -226,3 +226,44 @@ def merge_k8s_configs(
             base_config[key].extend(value)
         else:
             base_config[key] = value
+
+
+def get_cloud_config_value_from_dict(
+        dict_config: Dict[str, Any],
+        cloud: str,
+        keys: Tuple[str, ...],
+        region: Optional[str] = None,
+        default_value: Optional[Any] = None,
+        override_configs: Optional[Dict[str, Any]] = None) -> Any:
+    """Returns the nested key value by reading from config
+    Order to get the property_name value:
+    1. if region is specified,
+       try to get the value from <cloud>/<region_key>/<region>/keys
+    2. if no region or no override,
+       try to get it at the cloud level <cloud>/keys
+    3. if not found at cloud level,
+       return either default_value if specified or None
+    """
+    input_config = Config(dict_config)
+    region_key = None
+    if cloud == 'kubernetes':
+        region_key = 'context_configs'
+
+    per_context_config = None
+    if region is not None and region_key is not None:
+        per_context_config = input_config.get_nested(
+            keys=(cloud, region_key, region) + keys,
+            default_value=None,
+            override_configs=override_configs)
+    # if no override found for specified region
+    general_config = input_config.get_nested(keys=(cloud,) + keys,
+                                             default_value=default_value,
+                                             override_configs=override_configs)
+
+    if (cloud == 'kubernetes' and isinstance(general_config, dict) and
+            isinstance(per_context_config, dict)):
+        merge_k8s_configs(general_config, per_context_config)
+        return general_config
+    else:
+        return (general_config
+                if per_context_config is None else per_context_config)
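A minimal usage sketch of the new helper, assuming this wheel's sky.utils.config_utils module; the config dict and context name below are hypothetical:

    from sky.utils import config_utils

    # Hypothetical config: a cloud-level default plus a per-context override.
    cfg = {
        'kubernetes': {
            'provision_timeout': 10,
            'context_configs': {
                'my-context': {  # hypothetical kubernetes context name
                    'provision_timeout': 60,
                },
            },
        },
    }

    # No region given: resolves the cloud-level value -> 10.
    config_utils.get_cloud_config_value_from_dict(
        dict_config=cfg, cloud='kubernetes', keys=('provision_timeout',))

    # region (context) given: the per-context override wins -> 60.
    config_utils.get_cloud_config_value_from_dict(
        dict_config=cfg, cloud='kubernetes', region='my-context',
        keys=('provision_timeout',))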
sky/utils/controller_utils.py
CHANGED
@@ -733,7 +733,11 @@ def _setup_proxy_command_on_controller(
     config = config_utils.Config.from_dict(user_config)
     proxy_command_key = (str(controller_launched_cloud).lower(),
                          'ssh_proxy_command')
-    ssh_proxy_command = config.get_nested(proxy_command_key, None)
+    ssh_proxy_command = skypilot_config.get_effective_region_config(
+        cloud=str(controller_launched_cloud).lower(),
+        region=None,
+        keys=('ssh_proxy_command',),
+        default_value=None)
     if isinstance(ssh_proxy_command, str):
         config.set_nested(proxy_command_key, None)
     elif isinstance(ssh_proxy_command, dict):
sky/utils/resource_checker.py
ADDED
@@ -0,0 +1,153 @@
+"""Resource checking utilities for finding active clusters and managed jobs."""
+
+import concurrent.futures
+from typing import Any, Callable, Dict, List, Tuple
+
+from sky import exceptions
+from sky import global_user_state
+from sky import sky_logging
+from sky.skylet import constants
+
+logger = sky_logging.init_logger(__name__)
+
+
+def check_no_active_resources_for_users(
+        user_operations: List[Tuple[str, str]]) -> None:
+    """Check if users have active clusters or managed jobs.
+
+    Args:
+        user_operations: List of tuples (user_id, operation) where
+            operation is 'update' or 'delete'.
+
+    Raises:
+        ValueError: If any user has active clusters or managed jobs.
+            The error message will include all users with issues.
+    """
+    if not user_operations:
+        return
+
+    def filter_by_user(user_id: str):
+        return lambda resource: resource.get('user_hash') == user_id
+
+    _check_active_resources(user_operations, filter_by_user, 'user')
+
+
+def check_no_active_resources_for_workspaces(
+        workspace_operations: List[Tuple[str, str]]) -> None:
+    """Check if workspaces have active clusters or managed jobs.
+
+    Args:
+        workspace_operations: List of tuples (workspace_name, operation) where
+            operation is 'update' or 'delete'.
+
+    Raises:
+        ValueError: If any workspace has active clusters or managed jobs.
+            The error message will include all workspaces with issues.
+    """
+    if not workspace_operations:
+        return
+
+    def filter_by_workspace(workspace_name: str):
+        return lambda resource: (resource.get(
+            'workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) == workspace_name
+                                )
+
+    _check_active_resources(workspace_operations, filter_by_workspace,
+                            'workspace')
+
+
+def _check_active_resources(resource_operations: List[Tuple[str, str]],
+                            filter_factory: Callable[[str],
+                                                     Callable[[Dict[str, Any]],
+                                                              bool]],
+                            resource_type: str) -> None:
+    """Check if resource entities have active clusters or managed jobs.
+
+    Args:
+        resource_operations: List of tuples (resource_name, operation) where
+            operation is 'update' or 'delete'.
+        filter_factory: Function that takes a resource_name and returns a filter
+            function for clusters/jobs.
+        resource_type: Type of resource being checked ('user' or 'workspace').
+
+    Raises:
+        ValueError: If any resource has active clusters or managed jobs.
+    """
+
+    def get_all_clusters():
+        return global_user_state.get_clusters()
+
+    def get_all_managed_jobs():
+        # pylint: disable=import-outside-toplevel
+        from sky.jobs.server import core as managed_jobs_core
+        try:
+            return managed_jobs_core.queue(refresh=False,
+                                           skip_finished=True,
+                                           all_users=True)
+        except exceptions.ClusterNotUpError:
+            logger.warning('All jobs should be finished.')
+            return []
+
+    # Fetch both clusters and jobs in parallel
+    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+        clusters_future = executor.submit(get_all_clusters)
+        jobs_future = executor.submit(get_all_managed_jobs)
+
+        all_clusters = clusters_future.result()
+        all_managed_jobs = jobs_future.result()
+
+    # Collect all error messages instead of raising immediately
+    error_messages = []
+
+    # Check each resource against the fetched data
+    for resource_name, operation in resource_operations:
+        resource_filter = filter_factory(resource_name)
+
+        # Filter clusters for this resource
+        resource_clusters = [
+            cluster for cluster in all_clusters if resource_filter(cluster)
+        ]
+
+        # Filter managed jobs for this resource
+        resource_active_jobs = [
+            job for job in all_managed_jobs if resource_filter(job)
+        ]
+
+        # Collect error messages for this resource
+        resource_errors = []
+
+        if resource_clusters:
+            active_cluster_names = [
+                cluster['name'] for cluster in resource_clusters
+            ]
+            cluster_list = ', '.join(active_cluster_names)
+            resource_errors.append(
+                f'{len(resource_clusters)} active cluster(s): {cluster_list}')
+
+        if resource_active_jobs:
+            job_names = [str(job['job_id']) for job in resource_active_jobs]
+            job_list = ', '.join(job_names)
+            resource_errors.append(
+                f'{len(resource_active_jobs)} active managed job(s): '
+                f'{job_list}')
+
+        # If this resource has issues, add to overall error messages
+        if resource_errors:
+            resource_error_summary = ' and '.join(resource_errors)
+            error_messages.append(
+                f'Cannot {operation} {resource_type} {resource_name!r} '
+                f'because it has {resource_error_summary}.')
+
+    # If we collected any errors, raise them all together
+    if error_messages:
+        if len(error_messages) == 1:
+            # Single resource error
+            full_message = error_messages[
+                0] + ' Please terminate these resources first.'
+        else:
+            # Multiple resource errors
+            full_message = (f'Cannot proceed due to active resources in '
+                            f'{len(error_messages)} {resource_type}(s):\n' +
+                            '\n'.join(f'• {msg}' for msg in error_messages) +
+                            '\nPlease terminate these resources first.')
+        raise ValueError(full_message)
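A minimal sketch of the intended call pattern for the new module; the user hashes below are hypothetical. The helper raises one ValueError that aggregates every entity still owning clusters or jobs:

    from sky.utils import resource_checker

    try:
        resource_checker.check_no_active_resources_for_users([
            ('user-abc123', 'delete'),  # hypothetical user hash
            ('user-def456', 'update'),
        ])
    except ValueError as e:
        # Single aggregated message covering all offending users.
        print(e)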
sky/utils/resources_utils.py
CHANGED
@@ -273,10 +273,18 @@ def need_to_query_reservations() -> bool:
         clouds that do not use reservations.
     """
     for cloud_str in registry.CLOUD_REGISTRY.keys():
-        cloud_specific_reservations = skypilot_config.get_nested(
-            (cloud_str, 'specific_reservations'), None)
-        cloud_prioritize_reservations = skypilot_config.get_nested(
-            (cloud_str, 'prioritize_reservations'), False)
+        cloud_specific_reservations = (
+            skypilot_config.get_effective_region_config(
+                cloud=cloud_str,
+                region=None,
+                keys=('specific_reservations',),
+                default_value=None))
+        cloud_prioritize_reservations = (
+            skypilot_config.get_effective_region_config(
+                cloud=cloud_str,
+                region=None,
+                keys=('prioritize_reservations',),
+                default_value=False))
         if (cloud_specific_reservations is not None or
                 cloud_prioritize_reservations):
            return True
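Both lookups now go through skypilot_config.get_effective_region_config. A sketch of the call shape, based only on the call sites in this diff:

    from sky import skypilot_config

    # region=None skips per-region/per-context overrides and resolves the
    # cloud-level value, falling back to default_value when unset.
    prioritize = skypilot_config.get_effective_region_config(
        cloud='gcp',
        region=None,
        keys=('prioritize_reservations',),
        default_value=False)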
sky/utils/schemas.py
CHANGED
@@ -7,6 +7,7 @@ import enum
 from typing import Any, Dict, List, Tuple
 
 from sky.skylet import constants
+from sky.utils import kubernetes_enums
 
 
 def _check_not_both_fields_present(field1: str, field2: str):
@@ -1018,10 +1019,73 @@ _REMOTE_IDENTITY_SCHEMA_KUBERNETES = {
     },
 }
 
+_CONTEXT_CONFIG_SCHEMA_KUBERNETES = {
+    'networking': {
+        'type': 'string',
+        'case_insensitive_enum': [
+            type.value for type in kubernetes_enums.KubernetesNetworkingMode
+        ],
+    },
+    'ports': {
+        'type': 'string',
+        'case_insensitive_enum': [
+            type.value for type in kubernetes_enums.KubernetesPortMode
+        ],
+    },
+    'pod_config': {
+        'type': 'object',
+        'required': [],
+        # Allow arbitrary keys since validating pod spec is hard
+        'additionalProperties': True,
+    },
+    'custom_metadata': {
+        'type': 'object',
+        'required': [],
+        # Allow arbitrary keys since validating metadata is hard
+        'additionalProperties': True,
+        # Disallow 'name' and 'namespace' keys in this dict
+        'not': {
+            'anyOf': [{
+                'required': ['name']
+            }, {
+                'required': ['namespace']
+            }]
+        },
+    },
+    'provision_timeout': {
+        'type': 'integer',
+    },
+    'autoscaler': {
+        'type': 'string',
+        'case_insensitive_enum': [
+            type.value for type in kubernetes_enums.KubernetesAutoscalerType
+        ],
+    },
+    'high_availability': {
+        'type': 'object',
+        'required': [],
+        'additionalProperties': False,
+        'properties': {
+            'storage_class_name': {
+                'type': 'string',
+            }
+        },
+    },
+    'kueue': {
+        'type': 'object',
+        'required': [],
+        'additionalProperties': False,
+        'properties': {
+            'local_queue_name': {
+                'type': 'string',
+            },
+        },
+    },
+}
+
 
 def get_config_schema():
     # pylint: disable=import-outside-toplevel
-    from sky.utils import kubernetes_enums
 
     resources_schema = {
         k: v
@@ -1178,70 +1242,21 @@ def get_config_schema():
                     'type': 'string',
                 },
             },
-            'networking': {
-                'type': 'string',
-                'case_insensitive_enum': [
-                    type.value
-                    for type in kubernetes_enums.KubernetesNetworkingMode
-                ]
-            },
-            'ports': {
-                'type': 'string',
-                'case_insensitive_enum': [
-                    type.value
-                    for type in kubernetes_enums.KubernetesPortMode
-                ]
-            },
-            'pod_config': {
+            'context_configs': {
                 'type': 'object',
                 'required': [],
-                # Allow arbitrary keys since validating pod spec is hard
-                'additionalProperties': True,
-            },
-            'custom_metadata': {
-                'type': 'object',
-                'required': [],
-                # Allow arbitrary keys since validating metadata is hard
-                'additionalProperties': True,
-                # Disallow 'name' and 'namespace' keys in this dict
-                'not': {
-                    'anyOf': [{
-                        'required': ['name']
-                    }, {
-                        'required': ['namespace']
-                    }]
-                }
-            },
-            'provision_timeout': {
-                'type': 'integer',
-            },
-            'autoscaler': {
-                'type': 'string',
-                'case_insensitive_enum': [
-                    type.value
-                    for type in kubernetes_enums.KubernetesAutoscalerType
-                ]
-            },
-            'high_availability': {
-                'type': 'object',
-                'required': [],
-                'additionalProperties': False,
-                'properties': {
-                    'storage_class_name': {
-                        'type': 'string',
-                    }
-                }
-            },
-            'kueue': {
-                'type': 'object',
-                'required': [],
-                'additionalProperties': False,
-                'properties': {
-                    'local_queue_name': {
-                        'type': 'string',
+                'properties': {},
+                # Properties are kubernetes context names.
+                'additionalProperties': {
+                    'type': 'object',
+                    'required': [],
+                    'additionalProperties': False,
+                    'properties': {
+                        **_CONTEXT_CONFIG_SCHEMA_KUBERNETES,
                     },
                 },
             },
+            **_CONTEXT_CONFIG_SCHEMA_KUBERNETES,
         }
     },
     'ssh': {
@@ -1400,6 +1415,18 @@ def get_config_schema():
             # Apply validation for URL
             'pattern': r'^https?://.*$',
         },
+        'service_account_token': {
+            'anyOf': [
+                {
+                    'type': 'string',
+                    # Validate that token starts with sky_ prefix
+                    'pattern': r'^sky_.+$',
+                },
+                {
+                    'type': 'null',
+                }
+            ]
+        },
     }
 }
 
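To illustrate the relocated schema: the same Kubernetes keys remain valid at the cloud level and may now also be repeated under context_configs, keyed by kubernetes context name. A hedged sketch of a parsed config dict the validator would accept (context name and values hypothetical):

    config = {
        'kubernetes': {
            # Cloud-level default, drawn from _CONTEXT_CONFIG_SCHEMA_KUBERNETES.
            'provision_timeout': 10,
            'context_configs': {
                # Keys are kubernetes context names.
                'gke_my-project_us-central1_my-cluster': {
                    'provision_timeout': 60,
                    'ports': 'ingress',
                },
            },
        },
    }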
sky/utils/subprocess_utils.py
CHANGED
@@ -6,6 +6,7 @@ import random
 import resource
 import shlex
 import subprocess
+import sys
 import threading
 import time
 import typing
@@ -16,7 +17,6 @@ import colorama
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
-from sky.skylet import constants
 from sky.skylet import log_lib
 from sky.utils import common_utils
 from sky.utils import timeline
@@ -322,12 +322,8 @@ def kill_process_daemon(process_pid: int) -> None:
     daemon_script = os.path.join(
         os.path.dirname(os.path.abspath(log_lib.__file__)),
         'subprocess_daemon.py')
-    python_path = subprocess.check_output(constants.SKY_GET_PYTHON_PATH_CMD,
-                                          shell=True,
-                                          stderr=subprocess.DEVNULL,
-                                          encoding='utf-8').strip()
     daemon_cmd = [
-        python_path,
+        sys.executable,
         daemon_script,
         '--parent-pid',
         str(parent_pid),
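The daemon spawn no longer shells out to discover SkyPilot's Python path; it reuses the interpreter running the current process. A one-line sketch of the new command construction (script path and PID hypothetical):

    import sys

    # sys.executable is the absolute path of the running interpreter, so no
    # subprocess round-trip is needed to locate it.
    daemon_cmd = [sys.executable, '/path/to/subprocess_daemon.py',
                  '--parent-pid', '1234']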
sky/workspaces/core.py
CHANGED
@@ -1,13 +1,11 @@
 """Workspace management core."""
 
-import concurrent.futures
-from typing import Any, Callable, Dict, List
+from typing import Any, Callable, Dict, List, Tuple
 
 import filelock
 
 from sky import check as sky_check
 from sky import exceptions
-from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
@@ -17,6 +15,7 @@ from sky.users import permission
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
+from sky.utils import resource_checker
 from sky.utils import schemas
 from sky.workspaces import utils as workspaces_utils
 
@@ -79,116 +78,6 @@ def _update_workspaces_config(
             f'file if you believe it is stale.') from e
 
 
-def _check_workspace_has_no_active_resources(workspace_name: str,
-                                             operation: str) -> None:
-    """Check if a workspace has active clusters or managed jobs.
-
-    Args:
-        workspace_name: The name of the workspace to check.
-        operation: The operation being performed ('update' or 'delete').
-
-    Raises:
-        ValueError: If the workspace has active clusters or managed jobs.
-    """
-    _check_workspaces_have_no_active_resources([(workspace_name, operation)])
-
-
-def _check_workspaces_have_no_active_resources(
-        workspace_operations: list) -> None:
-    """Check if workspaces have active clusters or managed jobs.
-
-    Args:
-        workspace_operations: List of tuples (workspace_name, operation) where
-            operation is 'update' or 'delete'.
-
-    Raises:
-        ValueError: If any workspace has active clusters or managed jobs.
-            The error message will include all workspaces with issues.
-    """
-    if not workspace_operations:
-        return
-
-    def get_all_clusters():
-        return global_user_state.get_clusters()
-
-    def get_all_managed_jobs():
-        # pylint: disable=import-outside-toplevel
-        from sky.jobs.server import core as managed_jobs_core
-        try:
-            return managed_jobs_core.queue(refresh=False,
-                                           skip_finished=True,
-                                           all_users=True)
-        except exceptions.ClusterNotUpError:
-            logger.warning('All jobs should be finished in workspace.')
-            return []
-
-    # Fetch both clusters and jobs in parallel
-    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
-        clusters_future = executor.submit(get_all_clusters)
-        jobs_future = executor.submit(get_all_managed_jobs)
-
-        all_clusters = clusters_future.result()
-        all_managed_jobs = jobs_future.result()
-
-    # Collect all error messages instead of raising immediately
-    error_messages = []
-
-    # Check each workspace against the fetched data
-    for workspace_name, operation in workspace_operations:
-        # Filter clusters for this workspace
-        workspace_clusters = [
-            cluster for cluster in all_clusters
-            if (cluster.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
-                == workspace_name)
-        ]
-
-        # Filter managed jobs for this workspace
-        workspace_active_jobs = [
-            job for job in all_managed_jobs
-            if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) ==
-            workspace_name
-        ]
-
-        # Collect error messages for this workspace
-        workspace_errors = []
-
-        if workspace_clusters:
-            active_cluster_names = [
-                cluster['name'] for cluster in workspace_clusters
-            ]
-            cluster_list = ', '.join(active_cluster_names)
-            workspace_errors.append(
-                f'{len(workspace_clusters)} active cluster(s): {cluster_list}')
-
-        if workspace_active_jobs:
-            job_names = [str(job['job_id']) for job in workspace_active_jobs]
-            job_list = ', '.join(job_names)
-            workspace_errors.append(
-                f'{len(workspace_active_jobs)} active managed job(s): '
-                f'{job_list}')
-
-        # If this workspace has issues, add to overall error messages
-        if workspace_errors:
-            workspace_error_summary = ' and '.join(workspace_errors)
-            error_messages.append(
-                f'Cannot {operation} workspace {workspace_name!r} because it '
-                f'has {workspace_error_summary}.')
-
-    # If we collected any errors, raise them all together
-    if error_messages:
-        if len(error_messages) == 1:
-            # Single workspace error
-            full_message = error_messages[
-                0] + ' Please terminate these resources first.'
-        else:
-            # Multiple workspace errors
-            full_message = (f'Cannot proceed due to active resources in '
-                            f'{len(error_messages)} workspace(s):\n' +
-                            '\n'.join(f'• {msg}' for msg in error_messages) +
-                            '\nPlease terminate these resources first.')
-        raise ValueError(full_message)
-
-
 def _validate_workspace_config(workspace_name: str,
                                workspace_config: Dict[str, Any]) -> None:
     """Validate the workspace configuration.
@@ -229,7 +118,8 @@ def update_workspace(workspace_name: str, config: Dict[str,
     # Check for active clusters and managed jobs in the workspace
     # TODO(zhwu): we should allow the edits that only contain changes to
     # allowed_users or private.
-    _check_workspace_has_no_active_resources(workspace_name, 'update')
+    resource_checker.check_no_active_resources_for_workspaces([(workspace_name,
+                                                                'update')])
 
     def update_workspace_fn(workspaces: Dict[str, Any]) -> None:
         """Function to update workspace inside the lock."""
@@ -327,7 +217,8 @@ def delete_workspace(workspace_name: str) -> Dict[str, Any]:
         raise ValueError(f'Workspace {workspace_name!r} does not exist.')
 
     # Check for active clusters and managed jobs in the workspace
-    _check_workspace_has_no_active_resources(workspace_name, 'delete')
+    resource_checker.check_no_active_resources_for_workspaces([(workspace_name,
+                                                                'delete')])
 
     def delete_workspace_fn(workspaces: Dict[str, Any]) -> None:
         """Function to delete workspace inside the lock."""
@@ -396,7 +287,7 @@ def update_config(config: Dict[str, Any]) -> Dict[str, Any]:
     new_workspaces = config.get('workspaces', {})
 
     # Collect all workspaces that need to be checked for active resources
-    workspaces_to_check = []
+    workspaces_to_check: List[Tuple[str, str]] = []
     workspaces_to_check_policy: Dict[str, Dict[str, List[str]]] = {
         'add': {},
         'update': {},
@@ -430,7 +321,8 @@ def update_config(config: Dict[str, Any]) -> Dict[str, Any]:
            workspaces_to_check_policy['delete'][workspace_name] = ['*']
 
     # Check all workspaces for active resources in one efficient call
-    _check_workspaces_have_no_active_resources(workspaces_to_check)
+    resource_checker.check_no_active_resources_for_workspaces(
+        workspaces_to_check)
 
     # Use file locking to prevent race conditions
     lock_path = skypilot_config.get_skypilot_config_lock_path()