skypilot-nightly 1.0.0.dev20250807__py3-none-any.whl → 1.0.0.dev20250812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/backends/backend_utils.py +57 -7
- sky/backends/cloud_vm_ray_backend.py +50 -8
- sky/client/cli/command.py +60 -26
- sky/client/sdk.py +132 -65
- sky/client/sdk_async.py +1 -1
- sky/core.py +10 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6601-3e21152fe16da09c.js → 6601-06114c982db410b6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
- sky/dashboard/out/_next/static/chunks/{8969-318c3dca725e8e5d.js → 8969-c9686994ddafcf01.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-1e6de35d15a8d432.js → _app-491a4d699d95e808.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +21 -4
- sky/global_user_state.py +110 -1
- sky/jobs/client/sdk.py +27 -20
- sky/jobs/controller.py +2 -1
- sky/jobs/recovery_strategy.py +3 -0
- sky/jobs/server/core.py +4 -0
- sky/jobs/utils.py +9 -2
- sky/provision/__init__.py +3 -2
- sky/provision/aws/instance.py +5 -4
- sky/provision/azure/instance.py +5 -4
- sky/provision/cudo/instance.py +5 -4
- sky/provision/do/instance.py +5 -4
- sky/provision/fluidstack/instance.py +5 -4
- sky/provision/gcp/instance.py +5 -4
- sky/provision/hyperbolic/instance.py +5 -4
- sky/provision/kubernetes/instance.py +36 -6
- sky/provision/lambda_cloud/instance.py +5 -4
- sky/provision/nebius/instance.py +5 -4
- sky/provision/oci/instance.py +5 -4
- sky/provision/paperspace/instance.py +5 -4
- sky/provision/provisioner.py +6 -0
- sky/provision/runpod/instance.py +5 -4
- sky/provision/scp/instance.py +5 -5
- sky/provision/vast/instance.py +5 -5
- sky/provision/vsphere/instance.py +5 -4
- sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
- sky/serve/client/impl.py +11 -8
- sky/serve/client/sdk.py +7 -7
- sky/serve/serve_state.py +437 -340
- sky/serve/serve_utils.py +37 -3
- sky/serve/server/impl.py +2 -2
- sky/server/common.py +12 -8
- sky/server/constants.py +1 -1
- sky/setup_files/alembic.ini +4 -0
- sky/skypilot_config.py +4 -4
- sky/users/permission.py +1 -1
- sky/utils/cli_utils/status_utils.py +10 -1
- sky/utils/db/db_utils.py +53 -1
- sky/utils/db/migration_utils.py +5 -1
- sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
- sky/utils/resource_checker.py +162 -21
- sky/volumes/client/sdk.py +4 -4
- sky/workspaces/core.py +210 -6
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/RECORD +87 -83
- sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +0 -1
- /sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
|
@@ -141,7 +141,7 @@ def _validate_consolidation_mode_config(
|
|
|
141
141
|
if global_user_state.get_cluster_from_name(controller_cn) is not None:
|
|
142
142
|
with ux_utils.print_exception_no_traceback():
|
|
143
143
|
raise exceptions.InconsistentConsolidationModeError(
|
|
144
|
-
f'{colorama.Fore.RED}Consolidation mode is '
|
|
144
|
+
f'{colorama.Fore.RED}Consolidation mode for jobs is '
|
|
145
145
|
f'enabled, but the controller cluster '
|
|
146
146
|
f'{controller_cn} is still running. Please '
|
|
147
147
|
'terminate the controller cluster first.'
|
|
@@ -179,7 +179,11 @@ def _validate_consolidation_mode_config(
|
|
|
179
179
|
def is_consolidation_mode() -> bool:
|
|
180
180
|
consolidation_mode = skypilot_config.get_nested(
|
|
181
181
|
('jobs', 'controller', 'consolidation_mode'), default_value=False)
|
|
182
|
-
|
|
182
|
+
# We should only do this check on API server, as the controller will not
|
|
183
|
+
# have related config and will always seemingly be disabled for consolidation
|
|
184
|
+
# mode. Check #6611 for more details.
|
|
185
|
+
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
186
|
+
_validate_consolidation_mode_config(consolidation_mode)
|
|
183
187
|
return consolidation_mode
|
|
184
188
|
|
|
185
189
|
|
|
@@ -333,6 +337,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
|
|
|
333
337
|
if handle is not None:
|
|
334
338
|
try:
|
|
335
339
|
if pool is None:
|
|
340
|
+
global_user_state.add_cluster_event(
|
|
341
|
+
cluster_name, None, 'Cluster was cleaned up.',
|
|
342
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
336
343
|
terminate_cluster(cluster_name)
|
|
337
344
|
except Exception as e: # pylint: disable=broad-except
|
|
338
345
|
error_msg = (
|
sky/provision/__init__.py
CHANGED
|
@@ -76,10 +76,11 @@ def query_instances(
|
|
|
76
76
|
cluster_name_on_cloud: str,
|
|
77
77
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
78
78
|
non_terminated_only: bool = True,
|
|
79
|
-
) -> Dict[str, Optional['status_lib.ClusterStatus']]:
|
|
79
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
80
80
|
"""Query instances.
|
|
81
81
|
|
|
82
|
-
Returns a dictionary of instance IDs and status
|
|
82
|
+
Returns a dictionary of instance IDs and a tuple of (status, reason for
|
|
83
|
+
being in status if any).
|
|
83
84
|
|
|
84
85
|
A None status means the instance is marked as "terminated"
|
|
85
86
|
or "terminating".
|
sky/provision/aws/instance.py
CHANGED
|
@@ -10,7 +10,7 @@ from multiprocessing import pool
|
|
|
10
10
|
import re
|
|
11
11
|
import time
|
|
12
12
|
import typing
|
|
13
|
-
from typing import Any, Callable, Dict, List, Optional, Set, TypeVar
|
|
13
|
+
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar
|
|
14
14
|
|
|
15
15
|
from sky import sky_logging
|
|
16
16
|
from sky.adaptors import aws
|
|
@@ -588,7 +588,7 @@ def query_instances(
|
|
|
588
588
|
cluster_name_on_cloud: str,
|
|
589
589
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
590
590
|
non_terminated_only: bool = True,
|
|
591
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
591
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
592
592
|
"""See sky/provision/__init__.py"""
|
|
593
593
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
594
594
|
region = provider_config['region']
|
|
@@ -608,12 +608,13 @@ def query_instances(
|
|
|
608
608
|
'shutting-down': None,
|
|
609
609
|
'terminated': None,
|
|
610
610
|
}
|
|
611
|
-
statuses
|
|
611
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
612
|
+
Optional[str]]] = {}
|
|
612
613
|
for inst in instances:
|
|
613
614
|
status = status_map[inst.state['Name']]
|
|
614
615
|
if non_terminated_only and status is None:
|
|
615
616
|
continue
|
|
616
|
-
statuses[inst.id] = status
|
|
617
|
+
statuses[inst.id] = (status, None)
|
|
617
618
|
return statuses
|
|
618
619
|
|
|
619
620
|
|
sky/provision/azure/instance.py
CHANGED
|
@@ -955,7 +955,7 @@ def query_instances(
|
|
|
955
955
|
cluster_name_on_cloud: str,
|
|
956
956
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
957
957
|
non_terminated_only: bool = True,
|
|
958
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
958
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
959
959
|
"""See sky/provision/__init__.py"""
|
|
960
960
|
assert provider_config is not None, cluster_name_on_cloud
|
|
961
961
|
|
|
@@ -964,7 +964,8 @@ def query_instances(
|
|
|
964
964
|
filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
|
|
965
965
|
compute_client = azure.get_client('compute', subscription_id)
|
|
966
966
|
nodes = _filter_instances(compute_client, resource_group, filters)
|
|
967
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
967
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
968
|
+
Optional[str]]] = {}
|
|
968
969
|
|
|
969
970
|
def _fetch_and_map_status(node, resource_group: str) -> None:
|
|
970
971
|
compute_client = azure.get_client('compute', subscription_id)
|
|
@@ -972,8 +973,8 @@ def query_instances(
|
|
|
972
973
|
|
|
973
974
|
if status is None and non_terminated_only:
|
|
974
975
|
return
|
|
975
|
-
statuses[node.name] = (None if status is None else
|
|
976
|
-
|
|
976
|
+
statuses[node.name] = ((None if status is None else
|
|
977
|
+
status.to_cluster_status()), None)
|
|
977
978
|
|
|
978
979
|
with pool.ThreadPool() as p:
|
|
979
980
|
p.starmap(_fetch_and_map_status,
|
sky/provision/cudo/instance.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Cudo Compute instance provisioning."""
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.provision import common
|
|
@@ -194,7 +194,7 @@ def query_instances(
|
|
|
194
194
|
cluster_name_on_cloud: str,
|
|
195
195
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
196
196
|
non_terminated_only: bool = True,
|
|
197
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
197
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
198
198
|
"""See sky/provision/__init__.py"""
|
|
199
199
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
200
200
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
@@ -210,12 +210,13 @@ def query_instances(
|
|
|
210
210
|
'done': status_lib.ClusterStatus.STOPPED,
|
|
211
211
|
'poff': status_lib.ClusterStatus.STOPPED,
|
|
212
212
|
}
|
|
213
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
213
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
214
|
+
Optional[str]]] = {}
|
|
214
215
|
for inst_id, inst in instances.items():
|
|
215
216
|
status = status_map[inst['status']]
|
|
216
217
|
if non_terminated_only and status is None:
|
|
217
218
|
continue
|
|
218
|
-
statuses[inst_id] = status
|
|
219
|
+
statuses[inst_id] = (status, None)
|
|
219
220
|
return statuses
|
|
220
221
|
|
|
221
222
|
|
sky/provision/do/instance.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""DigitalOcean instance provisioning."""
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
import uuid
|
|
6
6
|
|
|
7
7
|
from sky import sky_logging
|
|
@@ -245,7 +245,7 @@ def query_instances(
|
|
|
245
245
|
cluster_name_on_cloud: str,
|
|
246
246
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
247
247
|
non_terminated_only: bool = True,
|
|
248
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
248
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
249
249
|
"""See sky/provision/__init__.py"""
|
|
250
250
|
# terminated instances are not retrieved by the
|
|
251
251
|
# API making `non_terminated_only` argument moot.
|
|
@@ -260,10 +260,11 @@ def query_instances(
|
|
|
260
260
|
'active': status_lib.ClusterStatus.UP,
|
|
261
261
|
'off': status_lib.ClusterStatus.STOPPED,
|
|
262
262
|
}
|
|
263
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
263
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
264
|
+
Optional[str]]] = {}
|
|
264
265
|
for instance_meta in instances.values():
|
|
265
266
|
status = status_map[instance_meta['status']]
|
|
266
|
-
statuses[instance_meta['name']] = status
|
|
267
|
+
statuses[instance_meta['name']] = (status, None)
|
|
267
268
|
return statuses
|
|
268
269
|
|
|
269
270
|
|
sky/provision/fluidstack/instance.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""FluidStack instance provisioning."""
|
|
2
2
|
import os
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from sky import authentication as auth
|
|
7
7
|
from sky import exceptions
|
|
@@ -290,7 +290,7 @@ def query_instances(
|
|
|
290
290
|
cluster_name_on_cloud: str,
|
|
291
291
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
292
292
|
non_terminated_only: bool = True,
|
|
293
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
293
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
294
294
|
"""See sky/provision/__init__.py"""
|
|
295
295
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
296
296
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
@@ -302,7 +302,8 @@ def query_instances(
|
|
|
302
302
|
'failed': status_lib.ClusterStatus.INIT,
|
|
303
303
|
'terminated': None,
|
|
304
304
|
}
|
|
305
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
305
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
306
|
+
Optional[str]]] = {}
|
|
306
307
|
for inst_id, inst in instances.items():
|
|
307
308
|
if inst['status'] not in status_map:
|
|
308
309
|
with ux_utils.print_exception_no_traceback():
|
|
@@ -311,7 +312,7 @@ def query_instances(
|
|
|
311
312
|
status = status_map.get(inst['status'], None)
|
|
312
313
|
if non_terminated_only and status is None:
|
|
313
314
|
continue
|
|
314
|
-
statuses[inst_id] = status
|
|
315
|
+
statuses[inst_id] = (status, None)
|
|
315
316
|
return statuses
|
|
316
317
|
|
|
317
318
|
|
sky/provision/gcp/instance.py
CHANGED
|
@@ -4,7 +4,7 @@ import copy
|
|
|
4
4
|
from multiprocessing import pool
|
|
5
5
|
import re
|
|
6
6
|
import time
|
|
7
|
-
from typing import Any, Callable, Dict, Iterable, List, Optional, Type
|
|
7
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
|
|
8
8
|
|
|
9
9
|
from sky import sky_logging
|
|
10
10
|
from sky.adaptors import gcp
|
|
@@ -61,7 +61,7 @@ def query_instances(
|
|
|
61
61
|
cluster_name_on_cloud: str,
|
|
62
62
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
63
63
|
non_terminated_only: bool = True,
|
|
64
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
64
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
65
65
|
"""See sky/provision/__init__.py"""
|
|
66
66
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
67
67
|
zone = provider_config['availability_zone']
|
|
@@ -84,7 +84,8 @@ def query_instances(
|
|
|
84
84
|
)
|
|
85
85
|
|
|
86
86
|
raw_statuses = {}
|
|
87
|
-
statuses
|
|
87
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
88
|
+
Optional[str]]] = {}
|
|
88
89
|
for inst_id, instance in instances.items():
|
|
89
90
|
raw_status = instance[handler.STATUS_FIELD]
|
|
90
91
|
raw_statuses[inst_id] = raw_status
|
|
@@ -98,7 +99,7 @@ def query_instances(
|
|
|
98
99
|
status = None
|
|
99
100
|
if non_terminated_only and status is None:
|
|
100
101
|
continue
|
|
101
|
-
statuses[inst_id] = status
|
|
102
|
+
statuses[inst_id] = (status, None)
|
|
102
103
|
|
|
103
104
|
# GCP does not clean up preempted TPU VMs. We remove it ourselves.
|
|
104
105
|
if handler == instance_utils.GCPTPUVMInstance:
|
sky/provision/hyperbolic/instance.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Hyperbolic instance provisioning."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.provision import common
|
|
@@ -307,7 +307,7 @@ def query_instances(
|
|
|
307
307
|
cluster_name_on_cloud: str,
|
|
308
308
|
provider_config: Optional[dict] = None,
|
|
309
309
|
non_terminated_only: bool = True,
|
|
310
|
-
) -> Dict[str, Optional['status_lib.ClusterStatus']]:
|
|
310
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
311
311
|
"""Returns the status of the specified instances for Hyperbolic."""
|
|
312
312
|
del provider_config # unused
|
|
313
313
|
# Fetch all instances for this cluster
|
|
@@ -319,7 +319,8 @@ def query_instances(
|
|
|
319
319
|
# No instances found: return empty dict to indicate fully deleted
|
|
320
320
|
return {}
|
|
321
321
|
|
|
322
|
-
statuses: Dict[str, Optional['status_lib.ClusterStatus']
|
|
322
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
323
|
+
Optional[str]]] = {}
|
|
323
324
|
for instance_id, instance in instances.items():
|
|
324
325
|
try:
|
|
325
326
|
raw_status = instance.get('status', 'unknown').lower()
|
|
@@ -328,7 +329,7 @@ def query_instances(
|
|
|
328
329
|
status = hyperbolic_status.to_cluster_status()
|
|
329
330
|
if non_terminated_only and status is None:
|
|
330
331
|
continue
|
|
331
|
-
statuses[instance_id] = status
|
|
332
|
+
statuses[instance_id] = (status, None)
|
|
332
333
|
except utils.HyperbolicError as e:
|
|
333
334
|
logger.warning(
|
|
334
335
|
f'Failed to parse status for instance {instance_id}: {e}')
|
sky/provision/kubernetes/instance.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import copy
|
|
3
3
|
import json
|
|
4
4
|
import time
|
|
5
|
-
from typing import Any, Callable, Dict, List, Optional, Union
|
|
5
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
6
6
|
|
|
7
7
|
from sky import exceptions
|
|
8
8
|
from sky import sky_logging
|
|
@@ -1248,15 +1248,37 @@ def get_cluster_info(
|
|
|
1248
1248
|
provider_config=provider_config)
|
|
1249
1249
|
|
|
1250
1250
|
|
|
1251
|
+
def _get_pod_termination_reason(pod: Any) -> str:
|
|
1252
|
+
reasons = []
|
|
1253
|
+
if pod.status.container_statuses:
|
|
1254
|
+
for container_status in pod.status.container_statuses:
|
|
1255
|
+
terminated = container_status.state.terminated
|
|
1256
|
+
if terminated:
|
|
1257
|
+
exit_code = terminated.exit_code
|
|
1258
|
+
reason = terminated.reason
|
|
1259
|
+
if exit_code == 0:
|
|
1260
|
+
# skip exit 0 (non-failed) just for sanity
|
|
1261
|
+
continue
|
|
1262
|
+
if reason is None:
|
|
1263
|
+
# just in-case reason is None, have default for debugging
|
|
1264
|
+
reason = f'exit({exit_code})'
|
|
1265
|
+
reasons.append(reason)
|
|
1266
|
+
# TODO (kyuds): later, if needed, query `last_state` too.
|
|
1267
|
+
|
|
1268
|
+
# Normally we will have a single container per pod for skypilot
|
|
1269
|
+
# but doing this just in-case there are multiple containers.
|
|
1270
|
+
return ' | '.join(reasons)
|
|
1271
|
+
|
|
1272
|
+
|
|
1251
1273
|
def query_instances(
|
|
1252
1274
|
cluster_name_on_cloud: str,
|
|
1253
1275
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
1254
1276
|
non_terminated_only: bool = True
|
|
1255
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
1277
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
1256
1278
|
status_map = {
|
|
1257
1279
|
'Pending': status_lib.ClusterStatus.INIT,
|
|
1258
1280
|
'Running': status_lib.ClusterStatus.UP,
|
|
1259
|
-
'Failed':
|
|
1281
|
+
'Failed': status_lib.ClusterStatus.INIT,
|
|
1260
1282
|
'Unknown': None,
|
|
1261
1283
|
'Succeeded': None,
|
|
1262
1284
|
'Terminating': None,
|
|
@@ -1298,12 +1320,20 @@ def query_instances(
|
|
|
1298
1320
|
f'status: {common_utils.format_exception(e)}')
|
|
1299
1321
|
|
|
1300
1322
|
# Check if the pods are running or pending
|
|
1301
|
-
cluster_status
|
|
1323
|
+
cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
1324
|
+
Optional[str]]] = {}
|
|
1302
1325
|
for pod in pods:
|
|
1303
|
-
|
|
1326
|
+
phase = pod.status.phase
|
|
1327
|
+
pod_status = status_map[phase]
|
|
1304
1328
|
if non_terminated_only and pod_status is None:
|
|
1305
1329
|
continue
|
|
1306
|
-
|
|
1330
|
+
reason = None
|
|
1331
|
+
if phase == 'Failed':
|
|
1332
|
+
reason = _get_pod_termination_reason(pod)
|
|
1333
|
+
logger.debug(f'Pod Status Reason(s): {reason}')
|
|
1334
|
+
pod_name = pod.metadata.name
|
|
1335
|
+
reason = f'{pod_name}: {reason}' if reason is not None else None
|
|
1336
|
+
cluster_status[pod_name] = (pod_status, reason)
|
|
1307
1337
|
return cluster_status
|
|
1308
1338
|
|
|
1309
1339
|
|
sky/provision/lambda_cloud/instance.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Lambda Cloud instance provisioning."""
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.provision import common
|
|
@@ -229,7 +229,7 @@ def query_instances(
|
|
|
229
229
|
cluster_name_on_cloud: str,
|
|
230
230
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
231
231
|
non_terminated_only: bool = True,
|
|
232
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
232
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
233
233
|
"""See sky/provision/__init__.py"""
|
|
234
234
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
235
235
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
@@ -240,12 +240,13 @@ def query_instances(
|
|
|
240
240
|
'unhealthy': status_lib.ClusterStatus.INIT,
|
|
241
241
|
'terminating': None,
|
|
242
242
|
}
|
|
243
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
243
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
244
|
+
Optional[str]]] = {}
|
|
244
245
|
for instance_id, instance in instances.items():
|
|
245
246
|
status = status_map.get(instance['status'])
|
|
246
247
|
if non_terminated_only and status is None:
|
|
247
248
|
continue
|
|
248
|
-
statuses[instance_id] = status
|
|
249
|
+
statuses[instance_id] = (status, None)
|
|
249
250
|
return statuses
|
|
250
251
|
|
|
251
252
|
|
sky/provision/nebius/instance.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Nebius instance provisioning."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.provision import common
|
|
@@ -250,7 +250,7 @@ def query_instances(
|
|
|
250
250
|
cluster_name_on_cloud: str,
|
|
251
251
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
252
252
|
non_terminated_only: bool = True,
|
|
253
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
253
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
254
254
|
"""See sky/provision/__init__.py"""
|
|
255
255
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
256
256
|
instances = _filter_instances(provider_config['region'],
|
|
@@ -263,12 +263,13 @@ def query_instances(
|
|
|
263
263
|
'STOPPING': status_lib.ClusterStatus.STOPPED,
|
|
264
264
|
'DELETING': status_lib.ClusterStatus.STOPPED,
|
|
265
265
|
}
|
|
266
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
266
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
267
|
+
Optional[str]]] = {}
|
|
267
268
|
for inst_id, inst in instances.items():
|
|
268
269
|
status = status_map[inst['status']]
|
|
269
270
|
if non_terminated_only and status is None:
|
|
270
271
|
continue
|
|
271
|
-
statuses[inst_id] = status
|
|
272
|
+
statuses[inst_id] = (status, None)
|
|
272
273
|
return statuses
|
|
273
274
|
|
|
274
275
|
|
sky/provision/oci/instance.py
CHANGED
|
@@ -10,7 +10,7 @@ import copy
|
|
|
10
10
|
from datetime import datetime
|
|
11
11
|
import time
|
|
12
12
|
import typing
|
|
13
|
-
from typing import Any, Dict, List, Optional
|
|
13
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
14
14
|
|
|
15
15
|
from sky import exceptions
|
|
16
16
|
from sky import sky_logging
|
|
@@ -35,7 +35,7 @@ def query_instances(
|
|
|
35
35
|
cluster_name_on_cloud: str,
|
|
36
36
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
37
37
|
non_terminated_only: bool = True,
|
|
38
|
-
) -> Dict[str, Optional['status_lib.ClusterStatus']]:
|
|
38
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
39
39
|
"""Query instances.
|
|
40
40
|
|
|
41
41
|
Returns a dictionary of instance IDs and status.
|
|
@@ -47,7 +47,8 @@ def query_instances(
|
|
|
47
47
|
region = provider_config['region']
|
|
48
48
|
|
|
49
49
|
status_map = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY
|
|
50
|
-
statuses: Dict[str, Optional['status_lib.ClusterStatus']
|
|
50
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
51
|
+
Optional[str]]] = {}
|
|
51
52
|
filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
|
|
52
53
|
|
|
53
54
|
instances = _get_filtered_nodes(region, filters)
|
|
@@ -56,7 +57,7 @@ def query_instances(
|
|
|
56
57
|
sky_status = status_map[vm_status]
|
|
57
58
|
if non_terminated_only and sky_status is None:
|
|
58
59
|
continue
|
|
59
|
-
statuses[node['inst_id']] = sky_status
|
|
60
|
+
statuses[node['inst_id']] = (sky_status, None)
|
|
60
61
|
|
|
61
62
|
return statuses
|
|
62
63
|
|
sky/provision/paperspace/instance.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Paperspace instance provisioning."""
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.provision import common
|
|
@@ -280,7 +280,7 @@ def query_instances(
|
|
|
280
280
|
cluster_name_on_cloud: str,
|
|
281
281
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
282
282
|
non_terminated_only: bool = True,
|
|
283
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
283
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
284
284
|
"""See sky/provision/__init__.py"""
|
|
285
285
|
del non_terminated_only
|
|
286
286
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
@@ -297,10 +297,11 @@ def query_instances(
|
|
|
297
297
|
'ready': status_lib.ClusterStatus.UP,
|
|
298
298
|
'off': status_lib.ClusterStatus.STOPPED,
|
|
299
299
|
}
|
|
300
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
300
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
301
|
+
Optional[str]]] = {}
|
|
301
302
|
for inst_id, inst in instances.items():
|
|
302
303
|
status = status_map[inst['state']]
|
|
303
|
-
statuses[inst_id] = status
|
|
304
|
+
statuses[inst_id] = (status, None)
|
|
304
305
|
return statuses
|
|
305
306
|
|
|
306
307
|
|
sky/provision/provisioner.py
CHANGED
|
@@ -100,6 +100,12 @@ def _bulk_provision(
|
|
|
100
100
|
f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
|
|
101
101
|
f'seconds.')
|
|
102
102
|
|
|
103
|
+
# Add cluster event for provisioning completion.
|
|
104
|
+
global_user_state.add_cluster_event(
|
|
105
|
+
str(cluster_name), status_lib.ClusterStatus.INIT,
|
|
106
|
+
f'Instances launched on {cloud.display_name()} in {region}',
|
|
107
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
108
|
+
|
|
103
109
|
return provision_record
|
|
104
110
|
|
|
105
111
|
|
sky/provision/runpod/instance.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""RunPod instance provisioning."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.provision import common
|
|
@@ -204,7 +204,7 @@ def query_instances(
|
|
|
204
204
|
cluster_name_on_cloud: str,
|
|
205
205
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
206
206
|
non_terminated_only: bool = True,
|
|
207
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
207
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
208
208
|
"""See sky/provision/__init__.py"""
|
|
209
209
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
210
210
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
@@ -215,12 +215,13 @@ def query_instances(
|
|
|
215
215
|
'PAUSED': status_lib.ClusterStatus.INIT,
|
|
216
216
|
'RUNNING': status_lib.ClusterStatus.UP,
|
|
217
217
|
}
|
|
218
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
218
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
219
|
+
Optional[str]]] = {}
|
|
219
220
|
for inst_id, inst in instances.items():
|
|
220
221
|
status = status_map[inst['status']]
|
|
221
222
|
if non_terminated_only and status is None:
|
|
222
223
|
continue
|
|
223
|
-
statuses[inst_id] = status
|
|
224
|
+
statuses[inst_id] = (status, None)
|
|
224
225
|
return statuses
|
|
225
226
|
|
|
226
227
|
|
sky/provision/scp/instance.py
CHANGED
|
@@ -4,7 +4,7 @@ import logging
|
|
|
4
4
|
import random
|
|
5
5
|
import string
|
|
6
6
|
import time
|
|
7
|
-
from typing import Any, Dict, List, Optional
|
|
7
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
8
8
|
|
|
9
9
|
from sky.clouds.utils import scp_utils
|
|
10
10
|
from sky.provision import common
|
|
@@ -430,8 +430,7 @@ def query_instances(
|
|
|
430
430
|
cluster_name_on_cloud: str,
|
|
431
431
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
432
432
|
non_terminated_only: bool = True,
|
|
433
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
434
|
-
|
|
433
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
435
434
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
436
435
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
437
436
|
|
|
@@ -447,12 +446,13 @@ def query_instances(
|
|
|
447
446
|
'TERMINATED': None,
|
|
448
447
|
}
|
|
449
448
|
|
|
450
|
-
statuses
|
|
449
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
450
|
+
Optional[str]]] = {}
|
|
451
451
|
for instance in instances:
|
|
452
452
|
status = status_map[instance['virtualServerState']]
|
|
453
453
|
if non_terminated_only and status is None:
|
|
454
454
|
continue
|
|
455
|
-
statuses[instance['virtualServerId']] = status
|
|
455
|
+
statuses[instance['virtualServerId']] = (status, None)
|
|
456
456
|
return statuses
|
|
457
457
|
|
|
458
458
|
|
sky/provision/vast/instance.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Vast instance provisioning."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.provision import common
|
|
@@ -219,9 +219,8 @@ def query_instances(
|
|
|
219
219
|
cluster_name_on_cloud: str,
|
|
220
220
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
221
221
|
non_terminated_only: bool = True,
|
|
222
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
222
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
223
223
|
"""See sky/provision/__init__.py"""
|
|
224
|
-
|
|
225
224
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
226
225
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
227
226
|
# "running", "frozen", "stopped", "unknown", "loading"
|
|
@@ -231,12 +230,13 @@ def query_instances(
|
|
|
231
230
|
'STOPPED': status_lib.ClusterStatus.STOPPED,
|
|
232
231
|
'RUNNING': status_lib.ClusterStatus.UP,
|
|
233
232
|
}
|
|
234
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
233
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
234
|
+
Optional[str]]] = {}
|
|
235
235
|
for inst_id, inst in instances.items():
|
|
236
236
|
status = status_map[inst['status']]
|
|
237
237
|
if non_terminated_only and status is None:
|
|
238
238
|
continue
|
|
239
|
-
statuses[inst_id] = status
|
|
239
|
+
statuses[inst_id] = (status, None)
|
|
240
240
|
return statuses
|
|
241
241
|
|
|
242
242
|
|
sky/provision/vsphere/instance.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Vsphere instance provisioning."""
|
|
2
2
|
import json
|
|
3
3
|
import typing
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.adaptors import common as adaptors_common
|
|
@@ -396,7 +396,7 @@ def query_instances(
|
|
|
396
396
|
cluster_name_on_cloud: str,
|
|
397
397
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
398
398
|
non_terminated_only: bool = True,
|
|
399
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
399
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
400
400
|
"""See sky/provision/__init__.py"""
|
|
401
401
|
logger.info('New provision of Vsphere: query_instances().')
|
|
402
402
|
assert provider_config is not None, cluster_name_on_cloud
|
|
@@ -413,12 +413,13 @@ def query_instances(
|
|
|
413
413
|
'suspended': None,
|
|
414
414
|
}
|
|
415
415
|
|
|
416
|
-
status
|
|
416
|
+
status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
417
|
+
Optional[str]]] = {}
|
|
417
418
|
for inst in instances:
|
|
418
419
|
stat = status_map[inst.runtime.powerState]
|
|
419
420
|
if non_terminated_only and stat is None:
|
|
420
421
|
continue
|
|
421
|
-
status[inst.summary.config.instanceUuid] = stat
|
|
422
|
+
status[inst.summary.config.instanceUuid] = (stat, None)
|
|
422
423
|
vc_object.disconnect()
|
|
423
424
|
return status
|
|
424
425
|
|
sky/schemas/db/global_user_state/001_initial_schema.py
CHANGED
|
@@ -22,7 +22,7 @@ depends_on = None
|
|
|
22
22
|
def upgrade():
|
|
23
23
|
with op.get_context().autocommit_block():
|
|
24
24
|
# Create any missing tables with current schema first
|
|
25
|
-
db_utils.
|
|
25
|
+
db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
|
|
26
26
|
|
|
27
27
|
# Add all missing columns to clusters table
|
|
28
28
|
# This allows each column addition to fail independently without rolling
|