skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250812__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/backends/backend_utils.py +37 -6
- sky/backends/cloud_vm_ray_backend.py +41 -6
- sky/client/cli/command.py +22 -2
- sky/core.py +5 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Fuy7OzApYTUMz2QgoP7dP}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-7fd0cf9dbecff10f.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +15 -0
- sky/global_user_state.py +102 -0
- sky/jobs/recovery_strategy.py +3 -0
- sky/jobs/server/core.py +4 -0
- sky/jobs/utils.py +9 -2
- sky/provision/__init__.py +3 -2
- sky/provision/aws/instance.py +5 -4
- sky/provision/azure/instance.py +5 -4
- sky/provision/cudo/instance.py +5 -4
- sky/provision/do/instance.py +5 -4
- sky/provision/fluidstack/instance.py +5 -4
- sky/provision/gcp/instance.py +5 -4
- sky/provision/hyperbolic/instance.py +5 -4
- sky/provision/kubernetes/instance.py +36 -6
- sky/provision/lambda_cloud/instance.py +5 -4
- sky/provision/nebius/instance.py +5 -4
- sky/provision/oci/instance.py +5 -4
- sky/provision/paperspace/instance.py +5 -4
- sky/provision/provisioner.py +6 -0
- sky/provision/runpod/instance.py +5 -4
- sky/provision/scp/instance.py +5 -5
- sky/provision/vast/instance.py +5 -5
- sky/provision/vsphere/instance.py +5 -4
- sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/serve_state/001_initial_schema.py +1 -1
- sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
- sky/serve/serve_utils.py +37 -3
- sky/skypilot_config.py +4 -4
- sky/users/permission.py +1 -1
- sky/utils/cli_utils/status_utils.py +9 -0
- sky/utils/db/db_utils.py +22 -1
- sky/utils/db/migration_utils.py +1 -1
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/RECORD +67 -66
- sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
- /sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Fuy7OzApYTUMz2QgoP7dP}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/top_level.txt +0 -0
sky/provision/gcp/instance.py
CHANGED
@@ -4,7 +4,7 @@ import copy
 from multiprocessing import pool
 import re
 import time
-from typing import Any, Callable, Dict, Iterable, List, Optional, Type
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
 
 from sky import sky_logging
 from sky.adaptors import gcp
@@ -61,7 +61,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     zone = provider_config['availability_zone']
@@ -84,7 +84,8 @@ def query_instances(
     )
 
     raw_statuses = {}
-    statuses
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for inst_id, instance in instances.items():
         raw_status = instance[handler.STATUS_FIELD]
         raw_statuses[inst_id] = raw_status
@@ -98,7 +99,7 @@ def query_instances(
         status = None
         if non_terminated_only and status is None:
             continue
-        statuses[inst_id] = status
+        statuses[inst_id] = (status, None)
 
     # GCP does not clean up preempted TPU VMs. We remove it ourselves.
     if handler == instance_utils.GCPTPUVMInstance:

sky/provision/hyperbolic/instance.py
CHANGED
@@ -1,6 +1,6 @@
 """Hyperbolic instance provisioning."""
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
 from sky.provision import common
@@ -307,7 +307,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[dict] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional['status_lib.ClusterStatus']]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """Returns the status of the specified instances for Hyperbolic."""
     del provider_config  # unused
     # Fetch all instances for this cluster
@@ -319,7 +319,8 @@ def query_instances(
         # No instances found: return empty dict to indicate fully deleted
         return {}
 
-    statuses: Dict[str, Optional['status_lib.ClusterStatus']
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for instance_id, instance in instances.items():
         try:
             raw_status = instance.get('status', 'unknown').lower()
@@ -328,7 +329,7 @@ def query_instances(
             status = hyperbolic_status.to_cluster_status()
             if non_terminated_only and status is None:
                 continue
-            statuses[instance_id] = status
+            statuses[instance_id] = (status, None)
         except utils.HyperbolicError as e:
             logger.warning(
                 f'Failed to parse status for instance {instance_id}: {e}')

sky/provision/kubernetes/instance.py
CHANGED
@@ -2,7 +2,7 @@
 import copy
 import json
 import time
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 from sky import exceptions
 from sky import sky_logging
@@ -1248,15 +1248,37 @@ def get_cluster_info(
         provider_config=provider_config)
 
 
+def _get_pod_termination_reason(pod: Any) -> str:
+    reasons = []
+    if pod.status.container_statuses:
+        for container_status in pod.status.container_statuses:
+            terminated = container_status.state.terminated
+            if terminated:
+                exit_code = terminated.exit_code
+                reason = terminated.reason
+                if exit_code == 0:
+                    # skip exit 0 (non-failed) just for sanity
+                    continue
+                if reason is None:
+                    # just in-case reason is None, have default for debugging
+                    reason = f'exit({exit_code})'
+                reasons.append(reason)
+                # TODO (kyuds): later, if needed, query `last_state` too.
+
+    # Normally we will have a single container per pod for skypilot
+    # but doing this just in-case there are multiple containers.
+    return ' | '.join(reasons)
+
+
 def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     status_map = {
         'Pending': status_lib.ClusterStatus.INIT,
         'Running': status_lib.ClusterStatus.UP,
-        'Failed':
+        'Failed': status_lib.ClusterStatus.INIT,
         'Unknown': None,
         'Succeeded': None,
         'Terminating': None,
@@ -1298,12 +1320,20 @@ def query_instances(
                 f'status: {common_utils.format_exception(e)}')
 
     # Check if the pods are running or pending
-    cluster_status
+    cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                                    Optional[str]]] = {}
     for pod in pods:
-
+        phase = pod.status.phase
+        pod_status = status_map[phase]
         if non_terminated_only and pod_status is None:
             continue
-
+        reason = None
+        if phase == 'Failed':
+            reason = _get_pod_termination_reason(pod)
+            logger.debug(f'Pod Status Reason(s): {reason}')
+        pod_name = pod.metadata.name
+        reason = f'{pod_name}: {reason}' if reason is not None else None
+        cluster_status[pod_name] = (pod_status, reason)
     return cluster_status

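Taken together, the provider diffs above and below change the contract of query_instances(): each instance ID now maps to a (status, reason) pair instead of a bare status, and only the Kubernetes backend currently fills in a non-None reason (the failed pod's termination reason). A minimal sketch of how a caller might consume the new shape; the helper name and the use of Any in place of status_lib.ClusterStatus are illustrative assumptions, not code from the package:

    from typing import Any, Dict, Optional, Tuple

    # Shape returned by the updated query_instances() implementations:
    # instance id -> (cluster status, optional human-readable failure reason).
    QueryResult = Dict[str, Tuple[Optional[Any], Optional[str]]]


    def summarize_query_result(result: QueryResult) -> None:
        """Hypothetical helper: print each status plus any provider-supplied reason."""
        for instance_id, (status, reason) in result.items():
            line = f'{instance_id}: {status}'
            if reason is not None:
                line += f' ({reason})'
            print(line)

For the non-Kubernetes providers the second element is always None, so callers only need to unpack the tuple.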
sky/provision/lambda_cloud/instance.py
CHANGED
@@ -1,7 +1,7 @@
 """Lambda Cloud instance provisioning."""
 
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
 from sky.provision import common
@@ -229,7 +229,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
@@ -240,12 +240,13 @@ def query_instances(
         'unhealthy': status_lib.ClusterStatus.INIT,
         'terminating': None,
     }
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for instance_id, instance in instances.items():
         status = status_map.get(instance['status'])
         if non_terminated_only and status is None:
             continue
-        statuses[instance_id] = status
+        statuses[instance_id] = (status, None)
     return statuses

sky/provision/nebius/instance.py
CHANGED
@@ -1,6 +1,6 @@
 """Nebius instance provisioning."""
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
 from sky.provision import common
@@ -250,7 +250,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(provider_config['region'],
@@ -263,12 +263,13 @@ def query_instances(
         'STOPPING': status_lib.ClusterStatus.STOPPED,
         'DELETING': status_lib.ClusterStatus.STOPPED,
     }
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for inst_id, inst in instances.items():
         status = status_map[inst['status']]
         if non_terminated_only and status is None:
             continue
-        statuses[inst_id] = status
+        statuses[inst_id] = (status, None)
     return statuses

sky/provision/oci/instance.py
CHANGED
@@ -10,7 +10,7 @@ import copy
 from datetime import datetime
 import time
 import typing
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import exceptions
 from sky import sky_logging
@@ -35,7 +35,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional['status_lib.ClusterStatus']]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """Query instances.
 
     Returns a dictionary of instance IDs and status.
@@ -47,7 +47,8 @@ def query_instances(
     region = provider_config['region']
 
     status_map = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY
-    statuses: Dict[str, Optional['status_lib.ClusterStatus']
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
 
     instances = _get_filtered_nodes(region, filters)
@@ -56,7 +57,7 @@ def query_instances(
         sky_status = status_map[vm_status]
         if non_terminated_only and sky_status is None:
             continue
-        statuses[node['inst_id']] = sky_status
+        statuses[node['inst_id']] = (sky_status, None)
 
     return statuses

sky/provision/paperspace/instance.py
CHANGED
@@ -1,7 +1,7 @@
 """Paperspace instance provisioning."""
 
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
 from sky.provision import common
@@ -280,7 +280,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
     del non_terminated_only
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
@@ -297,10 +297,11 @@ def query_instances(
         'ready': status_lib.ClusterStatus.UP,
         'off': status_lib.ClusterStatus.STOPPED,
     }
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for inst_id, inst in instances.items():
         status = status_map[inst['state']]
-        statuses[inst_id] = status
+        statuses[inst_id] = (status, None)
     return statuses

sky/provision/provisioner.py
CHANGED
@@ -100,6 +100,12 @@ def _bulk_provision(
         f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
         f'seconds.')
 
+    # Add cluster event for provisioning completion.
+    global_user_state.add_cluster_event(
+        str(cluster_name), status_lib.ClusterStatus.INIT,
+        f'Instances launched on {cloud.display_name()} in {region}',
+        global_user_state.ClusterEventType.STATUS_CHANGE)
+
     return provision_record

sky/provision/runpod/instance.py
CHANGED
@@ -1,6 +1,6 @@
 """RunPod instance provisioning."""
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
 from sky.provision import common
@@ -204,7 +204,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
@@ -215,12 +215,13 @@ def query_instances(
         'PAUSED': status_lib.ClusterStatus.INIT,
         'RUNNING': status_lib.ClusterStatus.UP,
     }
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for inst_id, inst in instances.items():
         status = status_map[inst['status']]
         if non_terminated_only and status is None:
             continue
-        statuses[inst_id] = status
+        statuses[inst_id] = (status, None)
     return statuses

sky/provision/scp/instance.py
CHANGED
@@ -4,7 +4,7 @@ import logging
 import random
 import string
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky.clouds.utils import scp_utils
 from sky.provision import common
@@ -430,8 +430,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
-
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
@@ -447,12 +446,13 @@ def query_instances(
         'TERMINATED': None,
     }
 
-    statuses
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for instance in instances:
         status = status_map[instance['virtualServerState']]
         if non_terminated_only and status is None:
             continue
-        statuses[instance['virtualServerId']] = status
+        statuses[instance['virtualServerId']] = (status, None)
     return statuses

sky/provision/vast/instance.py
CHANGED
@@ -1,6 +1,6 @@
 """Vast instance provisioning."""
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
 from sky.provision import common
@@ -219,9 +219,8 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
     # "running", "frozen", "stopped", "unknown", "loading"
@@ -231,12 +230,13 @@ def query_instances(
         'STOPPED': status_lib.ClusterStatus.STOPPED,
         'RUNNING': status_lib.ClusterStatus.UP,
     }
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for inst_id, inst in instances.items():
         status = status_map[inst['status']]
         if non_terminated_only and status is None:
             continue
-        statuses[inst_id] = status
+        statuses[inst_id] = (status, None)
     return statuses

sky/provision/vsphere/instance.py
CHANGED
@@ -1,7 +1,7 @@
 """Vsphere instance provisioning."""
 import json
 import typing
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
@@ -396,7 +396,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
     logger.info('New provision of Vsphere: query_instances().')
     assert provider_config is not None, cluster_name_on_cloud
@@ -413,12 +413,13 @@ def query_instances(
         'suspended': None,
     }
 
-    status
+    status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                            Optional[str]]] = {}
     for inst in instances:
         stat = status_map[inst.runtime.powerState]
         if non_terminated_only and stat is None:
             continue
-        status[inst.summary.config.instanceUuid] = stat
+        status[inst.summary.config.instanceUuid] = (stat, None)
     vc_object.disconnect()
     return status

sky/schemas/db/global_user_state/001_initial_schema.py
CHANGED
@@ -22,7 +22,7 @@ depends_on = None
 def upgrade():
     with op.get_context().autocommit_block():
         # Create any missing tables with current schema first
-        db_utils.
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
 
         # Add all missing columns to clusters table
         # This allows each column addition to fail independently without rolling

sky/schemas/db/global_user_state/005_cluster_event.py
ADDED
@@ -0,0 +1,32 @@
+"""Columns for whether the cluster is managed.
+
+Revision ID: 005
+Revises: 004
+Create Date: 2025-08-08
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+
+from sky.global_user_state import Base
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '005'
+down_revision: Union[str, Sequence[str], None] = '004'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add new table for cluster events."""
+    with op.get_context().autocommit_block():
+        # Add new table for cluster events.
+        db_utils.add_table_to_db_sqlalchemy(Base.metadata, op.get_bind(),
+                                            'cluster_events')
+
+
+def downgrade():
+    pass

sky/schemas/db/serve_state/001_initial_schema.py
CHANGED
@@ -26,7 +26,7 @@ def upgrade():
     """Create initial schema and add all backwards compatibility columns"""
     with op.get_context().autocommit_block():
         # Create all tables with their current schema
-        db_utils.
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
 
         # Add backwards compatibility columns using helper function that matches
         # original add_column_to_table_sqlalchemy behavior exactly

sky/schemas/db/spot_jobs/001_initial_schema.py
CHANGED
@@ -26,7 +26,7 @@ def upgrade():
     """Create initial schema and add all backwards compatibility columns"""
     with op.get_context().autocommit_block():
         # Create all tables with their current schema
-        db_utils.
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
 
         # Add backwards compatibility columns using helper function that matches
        # original add_column_to_table_sqlalchemy behavior exactly

sky/serve/serve_utils.py
CHANGED
@@ -37,6 +37,7 @@ from sky.skylet import job_lib
 from sky.utils import annotations
 from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import resources_utils
@@ -259,14 +260,47 @@ def get_service_filelock_path(pool: str) -> str:
     return str(path)
 
 
+def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
+                                        pool: bool) -> None:
+    """Validate the consolidation mode config."""
+    # Check whether the consolidation mode config is changed.
+    controller = controller_utils.get_controller_for_pool(pool).value
+    if current_is_consolidation_mode:
+        controller_cn = controller.cluster_name
+        if global_user_state.get_cluster_from_name(controller_cn) is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode for '
+                    f'{controller.controller_type} is enabled, but the '
+                    f'controller cluster {controller_cn} is still running. '
+                    'Please terminate the controller cluster first.'
+                    f'{colorama.Style.RESET_ALL}')
+    else:
+        noun = 'pool' if pool else 'service'
+        all_services = [
+            svc for svc in serve_state.get_services() if svc['pool'] == pool
+        ]
+        if all_services:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode for '
+                    f'{controller.controller_type} is disabled, but there are '
+                    f'still {len(all_services)} {noun}s running. Please '
+                    f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
+
+
 @annotations.lru_cache(scope='request', maxsize=1)
 def is_consolidation_mode(pool: bool = False) -> bool:
     # Use jobs config for pool consolidation mode.
-
+    controller = controller_utils.get_controller_for_pool(pool).value
     consolidation_mode = skypilot_config.get_nested(
-        (controller_type, 'controller', 'consolidation_mode'),
+        (controller.controller_type, 'controller', 'consolidation_mode'),
         default_value=False)
-    #
+    # We should only do this check on API server, as the controller will not
+    # have related config and will always seemingly disabled for consolidation
+    # mode. Check #6611 for more details.
+    if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+        _validate_consolidation_mode_config(consolidation_mode, pool)
     return consolidation_mode

sky/skypilot_config.py
CHANGED
@@ -575,8 +575,8 @@ def _reload_config_as_server() -> None:
     with _DB_USE_LOCK:
         sqlalchemy_engine = sqlalchemy.create_engine(db_url,
                                                      poolclass=NullPool)
-        db_utils.
-
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
+                                                 sqlalchemy_engine)
 
 def _get_config_yaml_from_db(
         key: str) -> Optional[config_utils.Config]:
@@ -867,8 +867,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
     with _DB_USE_LOCK:
         sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
                                                      poolclass=NullPool)
-        db_utils.
-
+        db_utils.add_all_tables_to_db_sqlalchemy(
+            Base.metadata, sqlalchemy_engine)
 
 def _set_config_yaml_to_db(key: str,
                            config: config_utils.Config):

sky/users/permission.py
CHANGED
@@ -44,7 +44,7 @@ class PermissionService:
         if _enforcer_instance is None:
             _enforcer_instance = self
             engine = global_user_state.initialize_and_get_db()
-            db_utils.
+            db_utils.add_all_tables_to_db_sqlalchemy(
                 sqlalchemy_adapter.Base.metadata, engine)
             adapter = sqlalchemy_adapter.Adapter(engine)
             model_path = os.path.join(os.path.dirname(__file__),

sky/utils/cli_utils/status_utils.py
CHANGED
@@ -81,6 +81,7 @@ def show_status_table(cluster_records: List[_ClusterRecord],
                      _get_command,
                      truncate=not show_all,
                      show_by_default=False),
+        StatusColumn('LAST_EVENT', _get_last_event, show_by_default=False),
     ]
 
     columns = []
@@ -314,6 +315,14 @@ def _get_head_ip(cluster_record: _ClusterRecord, truncate: bool = True) -> str:
     return handle.head_ip
 
 
+def _get_last_event(cluster_record: _ClusterRecord,
+                    truncate: bool = True) -> str:
+    del truncate
+    if cluster_record.get('last_event', None) is None:
+        return 'No recorded events.'
+    return cluster_record['last_event']
+
+
 def _is_pending_autostop(cluster_record: _ClusterRecord) -> bool:
     # autostop < 0 means nothing scheduled.
     return cluster_record['autostop'] >= 0 and _get_status(

sky/utils/db/db_utils.py
CHANGED
@@ -87,7 +87,7 @@ def add_column_to_table(
     conn.commit()
 
 
-def
+def add_all_tables_to_db_sqlalchemy(
     metadata: sqlalchemy.MetaData,
     engine: sqlalchemy.Engine,
 ):
@@ -103,6 +103,27 @@ def add_tables_to_db_sqlalchemy(
         raise
 
 
+def add_table_to_db_sqlalchemy(
+    metadata: sqlalchemy.MetaData,
+    engine: sqlalchemy.Engine,
+    table_name: str,
+):
+    """Add a specific table to the database."""
+    try:
+        table = metadata.tables[table_name]
+    except KeyError as e:
+        raise e
+
+    try:
+        table.create(bind=engine, checkfirst=True)
+    except (sqlalchemy_exc.OperationalError,
+            sqlalchemy_exc.ProgrammingError) as e:
+        if 'already exists' in str(e):
+            pass
+        else:
+            raise
+
+
 def add_column_to_table_sqlalchemy(
     session: 'Session',
     table_name: str,

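The new add_table_to_db_sqlalchemy() helper creates a single named table from an existing SQLAlchemy metadata object and tolerates "already exists" races; the 005 migration above uses it to create the cluster_events table. A self-contained sketch of the same create-if-missing pattern against an in-memory SQLite engine; the column list here is purely illustrative and is not the package's cluster_events schema:

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    # Illustrative table only; the real cluster_events definition lives in
    # sky/global_user_state.py and is not reproduced here.
    sqlalchemy.Table(
        'cluster_events', metadata,
        sqlalchemy.Column('cluster_hash', sqlalchemy.Text),
        sqlalchemy.Column('timestamp', sqlalchemy.Integer),
        sqlalchemy.Column('reason', sqlalchemy.Text),
    )

    engine = sqlalchemy.create_engine('sqlite://')
    # Equivalent of db_utils.add_table_to_db_sqlalchemy(metadata, engine,
    # 'cluster_events'): look the table up in the metadata and create it only
    # if it does not already exist.
    metadata.tables['cluster_events'].create(bind=engine, checkfirst=True)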
sky/utils/db/migration_utils.py
CHANGED
@@ -19,7 +19,7 @@ logger = sky_logging.init_logger(__name__)
 DB_INIT_LOCK_TIMEOUT_SECONDS = 10
 
 GLOBAL_USER_STATE_DB_NAME = 'state_db'
-GLOBAL_USER_STATE_VERSION = '
+GLOBAL_USER_STATE_VERSION = '005'
 GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
 
 SPOT_JOBS_DB_NAME = 'spot_jobs_db'
|