skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/backends/backend_utils.py +102 -8
- sky/backends/cloud_vm_ray_backend.py +197 -31
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +60 -77
- sky/client/common.py +1 -1
- sky/client/sdk.py +19 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +14 -0
- sky/core.py +5 -0
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-00c0a51d21157453.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +15 -0
- sky/global_user_state.py +160 -2
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +6 -3
- sky/jobs/scheduler.py +23 -68
- sky/jobs/server/core.py +22 -12
- sky/jobs/state.py +6 -2
- sky/jobs/utils.py +17 -2
- sky/provision/__init__.py +4 -2
- sky/provision/aws/config.py +9 -0
- sky/provision/aws/instance.py +41 -17
- sky/provision/azure/instance.py +7 -4
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +7 -4
- sky/provision/do/instance.py +7 -4
- sky/provision/fluidstack/instance.py +7 -4
- sky/provision/gcp/instance.py +7 -4
- sky/provision/hyperbolic/instance.py +7 -5
- sky/provision/kubernetes/instance.py +169 -6
- sky/provision/lambda_cloud/instance.py +7 -4
- sky/provision/nebius/instance.py +7 -4
- sky/provision/oci/instance.py +7 -4
- sky/provision/paperspace/instance.py +7 -5
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +6 -0
- sky/provision/runpod/instance.py +7 -4
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +7 -5
- sky/provision/vast/instance.py +7 -5
- sky/provision/vsphere/instance.py +7 -4
- sky/resources.py +1 -2
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/serve_state/001_initial_schema.py +1 -1
- sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +15 -16
- sky/serve/serve_state.py +10 -0
- sky/serve/serve_utils.py +58 -23
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +31 -16
- sky/server/server.py +20 -14
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/skypilot_config.py +4 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/users/permission.py +1 -1
- sky/utils/cli_utils/status_utils.py +9 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +73 -0
- sky/utils/db/db_utils.py +39 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/schemas.py +3 -0
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +117 -108
- sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
- /sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/provision/gcp/instance.py
CHANGED
|
@@ -4,7 +4,7 @@ import copy
|
|
|
4
4
|
from multiprocessing import pool
|
|
5
5
|
import re
|
|
6
6
|
import time
|
|
7
|
-
from typing import Any, Callable, Dict, Iterable, List, Optional, Type
|
|
7
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
|
|
8
8
|
|
|
9
9
|
from sky import sky_logging
|
|
10
10
|
from sky.adaptors import gcp
|
|
@@ -58,11 +58,13 @@ def _filter_instances(
|
|
|
58
58
|
# for terminated instances, if they have already been fully deleted.
|
|
59
59
|
@common_utils.retry
|
|
60
60
|
def query_instances(
|
|
61
|
+
cluster_name: str,
|
|
61
62
|
cluster_name_on_cloud: str,
|
|
62
63
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
63
64
|
non_terminated_only: bool = True,
|
|
64
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
65
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
65
66
|
"""See sky/provision/__init__.py"""
|
|
67
|
+
del cluster_name # unused
|
|
66
68
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
67
69
|
zone = provider_config['availability_zone']
|
|
68
70
|
project_id = provider_config['project_id']
|
|
@@ -84,7 +86,8 @@ def query_instances(
|
|
|
84
86
|
)
|
|
85
87
|
|
|
86
88
|
raw_statuses = {}
|
|
87
|
-
statuses = {}
|
|
89
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
90
|
+
Optional[str]]] = {}
|
|
88
91
|
for inst_id, instance in instances.items():
|
|
89
92
|
raw_status = instance[handler.STATUS_FIELD]
|
|
90
93
|
raw_statuses[inst_id] = raw_status
|
|
@@ -98,7 +101,7 @@ def query_instances(
|
|
|
98
101
|
status = None
|
|
99
102
|
if non_terminated_only and status is None:
|
|
100
103
|
continue
|
|
101
|
-
statuses[inst_id] = status
|
|
104
|
+
statuses[inst_id] = (status, None)
|
|
102
105
|
|
|
103
106
|
# GCP does not clean up preempted TPU VMs. We remove it ourselves.
|
|
104
107
|
if handler == instance_utils.GCPTPUVMInstance:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Hyperbolic instance provisioning."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.provision import common
|
|
@@ -304,12 +304,13 @@ def get_cluster_info(
|
|
|
304
304
|
|
|
305
305
|
|
|
306
306
|
def query_instances(
|
|
307
|
+
cluster_name: str,
|
|
307
308
|
cluster_name_on_cloud: str,
|
|
308
309
|
provider_config: Optional[dict] = None,
|
|
309
310
|
non_terminated_only: bool = True,
|
|
310
|
-
) -> Dict[str, Optional['status_lib.ClusterStatus']]:
|
|
311
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
311
312
|
"""Returns the status of the specified instances for Hyperbolic."""
|
|
312
|
-
del provider_config # unused
|
|
313
|
+
del cluster_name, provider_config # unused
|
|
313
314
|
# Fetch all instances for this cluster
|
|
314
315
|
instances = utils.list_instances(
|
|
315
316
|
metadata={'skypilot': {
|
|
@@ -319,7 +320,8 @@ def query_instances(
|
|
|
319
320
|
# No instances found: return empty dict to indicate fully deleted
|
|
320
321
|
return {}
|
|
321
322
|
|
|
322
|
-
statuses: Dict[str, Optional['status_lib.ClusterStatus']] = {}
|
|
323
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
324
|
+
Optional[str]]] = {}
|
|
323
325
|
for instance_id, instance in instances.items():
|
|
324
326
|
try:
|
|
325
327
|
raw_status = instance.get('status', 'unknown').lower()
|
|
@@ -328,7 +330,7 @@ def query_instances(
|
|
|
328
330
|
status = hyperbolic_status.to_cluster_status()
|
|
329
331
|
if non_terminated_only and status is None:
|
|
330
332
|
continue
|
|
331
|
-
statuses[instance_id] = status
|
|
333
|
+
statuses[instance_id] = (status, None)
|
|
332
334
|
except utils.HyperbolicError as e:
|
|
333
335
|
logger.warning(
|
|
334
336
|
f'Failed to parse status for instance {instance_id}: {e}')
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
"""Kubernetes instance provisioning."""
|
|
2
2
|
import copy
|
|
3
3
|
import json
|
|
4
|
+
import re
|
|
4
5
|
import time
|
|
5
|
-
from typing import Any, Callable, Dict, List, Optional, Union
|
|
6
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
6
7
|
|
|
7
8
|
from sky import exceptions
|
|
9
|
+
from sky import global_user_state
|
|
8
10
|
from sky import sky_logging
|
|
9
11
|
from sky import skypilot_config
|
|
10
12
|
from sky.adaptors import kubernetes
|
|
@@ -24,6 +26,7 @@ from sky.utils import status_lib
|
|
|
24
26
|
from sky.utils import subprocess_utils
|
|
25
27
|
from sky.utils import timeline
|
|
26
28
|
from sky.utils import ux_utils
|
|
29
|
+
from sky.utils.db import db_utils
|
|
27
30
|
|
|
28
31
|
POLL_INTERVAL = 2
|
|
29
32
|
_TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
|
|
@@ -1248,15 +1251,146 @@ def get_cluster_info(
|
|
|
1248
1251
|
provider_config=provider_config)
|
|
1249
1252
|
|
|
1250
1253
|
|
|
1254
|
+
def _get_pod_termination_reason(pod: Any) -> str:
|
|
1255
|
+
reasons = []
|
|
1256
|
+
if pod.status.container_statuses:
|
|
1257
|
+
for container_status in pod.status.container_statuses:
|
|
1258
|
+
terminated = container_status.state.terminated
|
|
1259
|
+
if terminated:
|
|
1260
|
+
exit_code = terminated.exit_code
|
|
1261
|
+
reason = terminated.reason
|
|
1262
|
+
if exit_code == 0:
|
|
1263
|
+
# skip exit 0 (non-failed) just for sanity
|
|
1264
|
+
continue
|
|
1265
|
+
if reason is None:
|
|
1266
|
+
# just in-case reason is None, have default for debugging
|
|
1267
|
+
reason = f'exit({exit_code})'
|
|
1268
|
+
reasons.append(reason)
|
|
1269
|
+
# TODO (kyuds): later, if needed, query `last_state` too.
|
|
1270
|
+
|
|
1271
|
+
# Normally we will have a single container per pod for skypilot
|
|
1272
|
+
# but doing this just in-case there are multiple containers.
|
|
1273
|
+
return ' | '.join(reasons)
|
|
1274
|
+
|
|
1275
|
+
|
|
1276
|
+
def _get_pod_missing_reason(context: Optional[str], namespace: str,
|
|
1277
|
+
cluster_name: str, pod_name: str) -> Optional[str]:
|
|
1278
|
+
logger.debug(f'Analyzing events for pod {pod_name}')
|
|
1279
|
+
pod_field_selector = (
|
|
1280
|
+
f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
|
|
1281
|
+
pod_events = kubernetes.core_api(context).list_namespaced_event(
|
|
1282
|
+
namespace,
|
|
1283
|
+
field_selector=pod_field_selector,
|
|
1284
|
+
_request_timeout=kubernetes.API_TIMEOUT).items
|
|
1285
|
+
pod_events = sorted(
|
|
1286
|
+
pod_events,
|
|
1287
|
+
key=lambda event: event.metadata.creation_timestamp,
|
|
1288
|
+
# latest event appears first
|
|
1289
|
+
reverse=True)
|
|
1290
|
+
last_scheduled_node = None
|
|
1291
|
+
insert_new_pod_event = True
|
|
1292
|
+
new_event_inserted = False
|
|
1293
|
+
for event in pod_events:
|
|
1294
|
+
if event.reason == 'Scheduled':
|
|
1295
|
+
pattern = r'Successfully assigned (\S+) to (\S+)'
|
|
1296
|
+
match = re.search(pattern, event.message)
|
|
1297
|
+
if match:
|
|
1298
|
+
scheduled_node = match.group(2)
|
|
1299
|
+
last_scheduled_node = scheduled_node
|
|
1300
|
+
if insert_new_pod_event:
|
|
1301
|
+
# Try inserting the latest events first. If the event is a
|
|
1302
|
+
# duplicate, it means the event (and any previous events) have
|
|
1303
|
+
# already been inserted - so do not insert further events.
|
|
1304
|
+
try:
|
|
1305
|
+
global_user_state.add_cluster_event(
|
|
1306
|
+
cluster_name,
|
|
1307
|
+
None, f'[kubernetes pod {pod_name}] '
|
|
1308
|
+
f'{event.reason} {event.message}',
|
|
1309
|
+
global_user_state.ClusterEventType.DEBUG,
|
|
1310
|
+
transitioned_at=int(
|
|
1311
|
+
event.metadata.creation_timestamp.timestamp()),
|
|
1312
|
+
expose_duplicate_error=True)
|
|
1313
|
+
except db_utils.UniqueConstraintViolationError:
|
|
1314
|
+
insert_new_pod_event = False
|
|
1315
|
+
else:
|
|
1316
|
+
new_event_inserted = True
|
|
1317
|
+
|
|
1318
|
+
if last_scheduled_node is not None:
|
|
1319
|
+
node_field_selector = ('involvedObject.kind=Node,'
|
|
1320
|
+
f'involvedObject.name={last_scheduled_node}')
|
|
1321
|
+
node_events = kubernetes.core_api(context).list_namespaced_event(
|
|
1322
|
+
namespace,
|
|
1323
|
+
field_selector=node_field_selector,
|
|
1324
|
+
_request_timeout=kubernetes.API_TIMEOUT).items
|
|
1325
|
+
node_events = sorted(
|
|
1326
|
+
node_events,
|
|
1327
|
+
key=lambda event: event.metadata.creation_timestamp,
|
|
1328
|
+
# latest event appears first
|
|
1329
|
+
reverse=True)
|
|
1330
|
+
insert_new_node_event = True
|
|
1331
|
+
for event in node_events:
|
|
1332
|
+
if insert_new_node_event:
|
|
1333
|
+
# Try inserting the latest events first. If the event is a
|
|
1334
|
+
# duplicate, it means the event (and any previous events) have
|
|
1335
|
+
# already been inserted - so do not insert further events.
|
|
1336
|
+
try:
|
|
1337
|
+
global_user_state.add_cluster_event(
|
|
1338
|
+
cluster_name,
|
|
1339
|
+
None, f'[kubernetes node {last_scheduled_node}] '
|
|
1340
|
+
f'{event.reason} {event.message}',
|
|
1341
|
+
global_user_state.ClusterEventType.DEBUG,
|
|
1342
|
+
transitioned_at=int(
|
|
1343
|
+
event.metadata.creation_timestamp.timestamp()),
|
|
1344
|
+
expose_duplicate_error=True)
|
|
1345
|
+
except db_utils.UniqueConstraintViolationError:
|
|
1346
|
+
insert_new_node_event = False
|
|
1347
|
+
else:
|
|
1348
|
+
new_event_inserted = True
|
|
1349
|
+
|
|
1350
|
+
if not new_event_inserted:
|
|
1351
|
+
# If new event is not inserted, there is no useful information to
|
|
1352
|
+
# return. Return None.
|
|
1353
|
+
return None
|
|
1354
|
+
|
|
1355
|
+
# Analyze the events for failure
|
|
1356
|
+
failure_reason = None
|
|
1357
|
+
failure_decisiveness = 0
|
|
1358
|
+
|
|
1359
|
+
def _record_failure_reason(reason: str, decisiveness: int):
|
|
1360
|
+
nonlocal failure_reason, failure_decisiveness
|
|
1361
|
+
if decisiveness > failure_decisiveness:
|
|
1362
|
+
failure_reason = reason
|
|
1363
|
+
failure_decisiveness = decisiveness
|
|
1364
|
+
|
|
1365
|
+
cluster_events = global_user_state.get_cluster_events(
|
|
1366
|
+
cluster_name, None, global_user_state.ClusterEventType.DEBUG)
|
|
1367
|
+
for event in cluster_events:
|
|
1368
|
+
if event.startswith('[kubernetes pod'):
|
|
1369
|
+
event = event.split(']')[1].strip()
|
|
1370
|
+
elif event.startswith('[kubernetes node'):
|
|
1371
|
+
event = event.split(']')[1].strip()
|
|
1372
|
+
|
|
1373
|
+
if event.startswith('NodeNotReady '):
|
|
1374
|
+
_record_failure_reason(event[len('NodeNotReady '):], 1)
|
|
1375
|
+
elif event.startswith('TaintManagerEviction '):
|
|
1376
|
+
# usually the event message for TaintManagerEviction is not useful
|
|
1377
|
+
# so we record a more generic message.
|
|
1378
|
+
_record_failure_reason('pod was evicted by taint manager', 2)
|
|
1379
|
+
elif event.startswith('DeletingNode '):
|
|
1380
|
+
_record_failure_reason(event[len('DeletingNode '):], 3)
|
|
1381
|
+
return failure_reason
|
|
1382
|
+
|
|
1383
|
+
|
|
1251
1384
|
def query_instances(
|
|
1385
|
+
cluster_name: str,
|
|
1252
1386
|
cluster_name_on_cloud: str,
|
|
1253
1387
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
1254
1388
|
non_terminated_only: bool = True
|
|
1255
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
1389
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
1256
1390
|
status_map = {
|
|
1257
1391
|
'Pending': status_lib.ClusterStatus.INIT,
|
|
1258
1392
|
'Running': status_lib.ClusterStatus.UP,
|
|
1259
|
-
'Failed':
|
|
1393
|
+
'Failed': status_lib.ClusterStatus.INIT,
|
|
1260
1394
|
'Unknown': None,
|
|
1261
1395
|
'Succeeded': None,
|
|
1262
1396
|
'Terminating': None,
|
|
@@ -1298,12 +1432,41 @@ def query_instances(
|
|
|
1298
1432
|
f'status: {common_utils.format_exception(e)}')
|
|
1299
1433
|
|
|
1300
1434
|
# Check if the pods are running or pending
|
|
1301
|
-
cluster_status = {}
|
|
1435
|
+
cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
1436
|
+
Optional[str]]] = {}
|
|
1302
1437
|
for pod in pods:
|
|
1303
|
-
|
|
1438
|
+
phase = pod.status.phase
|
|
1439
|
+
pod_status = status_map[phase]
|
|
1304
1440
|
if non_terminated_only and pod_status is None:
|
|
1305
1441
|
continue
|
|
1306
|
-
|
|
1442
|
+
reason = None
|
|
1443
|
+
if phase == 'Failed':
|
|
1444
|
+
reason = _get_pod_termination_reason(pod)
|
|
1445
|
+
logger.debug(f'Pod Status Reason(s): {reason}')
|
|
1446
|
+
pod_name = pod.metadata.name
|
|
1447
|
+
reason = f'{pod_name}: {reason}' if reason is not None else None
|
|
1448
|
+
cluster_status[pod_name] = (pod_status, reason)
|
|
1449
|
+
|
|
1450
|
+
# Find the list of pod names that should be there
|
|
1451
|
+
# from k8s services. Filter duplicates as -ssh service
|
|
1452
|
+
# creates a duplicate entry.
|
|
1453
|
+
target_pod_names = list(
|
|
1454
|
+
set([
|
|
1455
|
+
service['spec']['selector']['component']
|
|
1456
|
+
for service in provider_config.get('services', [])
|
|
1457
|
+
]))
|
|
1458
|
+
|
|
1459
|
+
for target_pod_name in target_pod_names:
|
|
1460
|
+
if target_pod_name not in cluster_status:
|
|
1461
|
+
# If the pod is not in the cluster_status, it means it's not
|
|
1462
|
+
# running.
|
|
1463
|
+
# Analyze what happened to the pod based on events.
|
|
1464
|
+
reason = _get_pod_missing_reason(context, namespace, cluster_name,
|
|
1465
|
+
target_pod_name)
|
|
1466
|
+
reason = (f'{target_pod_name}: {reason}'
|
|
1467
|
+
if reason is not None else None)
|
|
1468
|
+
cluster_status[target_pod_name] = (None, reason)
|
|
1469
|
+
|
|
1307
1470
|
return cluster_status
|
|
1308
1471
|
|
|
1309
1472
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Lambda Cloud instance provisioning."""
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.provision import common
|
|
@@ -226,11 +226,13 @@ def get_cluster_info(
|
|
|
226
226
|
|
|
227
227
|
|
|
228
228
|
def query_instances(
|
|
229
|
+
cluster_name: str,
|
|
229
230
|
cluster_name_on_cloud: str,
|
|
230
231
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
231
232
|
non_terminated_only: bool = True,
|
|
232
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
233
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
233
234
|
"""See sky/provision/__init__.py"""
|
|
235
|
+
del cluster_name # unused
|
|
234
236
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
235
237
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
236
238
|
|
|
@@ -240,12 +242,13 @@ def query_instances(
|
|
|
240
242
|
'unhealthy': status_lib.ClusterStatus.INIT,
|
|
241
243
|
'terminating': None,
|
|
242
244
|
}
|
|
243
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
|
|
245
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
246
|
+
Optional[str]]] = {}
|
|
244
247
|
for instance_id, instance in instances.items():
|
|
245
248
|
status = status_map.get(instance['status'])
|
|
246
249
|
if non_terminated_only and status is None:
|
|
247
250
|
continue
|
|
248
|
-
statuses[instance_id] = status
|
|
251
|
+
statuses[instance_id] = (status, None)
|
|
249
252
|
return statuses
|
|
250
253
|
|
|
251
254
|
|
sky/provision/nebius/instance.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Nebius instance provisioning."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.provision import common
|
|
@@ -247,11 +247,13 @@ def get_cluster_info(
|
|
|
247
247
|
|
|
248
248
|
|
|
249
249
|
def query_instances(
|
|
250
|
+
cluster_name: str,
|
|
250
251
|
cluster_name_on_cloud: str,
|
|
251
252
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
252
253
|
non_terminated_only: bool = True,
|
|
253
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
254
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
254
255
|
"""See sky/provision/__init__.py"""
|
|
256
|
+
del cluster_name # unused
|
|
255
257
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
256
258
|
instances = _filter_instances(provider_config['region'],
|
|
257
259
|
cluster_name_on_cloud, None)
|
|
@@ -263,12 +265,13 @@ def query_instances(
|
|
|
263
265
|
'STOPPING': status_lib.ClusterStatus.STOPPED,
|
|
264
266
|
'DELETING': status_lib.ClusterStatus.STOPPED,
|
|
265
267
|
}
|
|
266
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
|
|
268
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
269
|
+
Optional[str]]] = {}
|
|
267
270
|
for inst_id, inst in instances.items():
|
|
268
271
|
status = status_map[inst['status']]
|
|
269
272
|
if non_terminated_only and status is None:
|
|
270
273
|
continue
|
|
271
|
-
statuses[inst_id] = status
|
|
274
|
+
statuses[inst_id] = (status, None)
|
|
272
275
|
return statuses
|
|
273
276
|
|
|
274
277
|
|
sky/provision/oci/instance.py
CHANGED
|
@@ -10,7 +10,7 @@ import copy
|
|
|
10
10
|
from datetime import datetime
|
|
11
11
|
import time
|
|
12
12
|
import typing
|
|
13
|
-
from typing import Any, Dict, List, Optional
|
|
13
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
14
14
|
|
|
15
15
|
from sky import exceptions
|
|
16
16
|
from sky import sky_logging
|
|
@@ -32,10 +32,11 @@ logger = sky_logging.init_logger(__name__)
|
|
|
32
32
|
@query_utils.debug_enabled(logger)
|
|
33
33
|
@common_utils.retry
|
|
34
34
|
def query_instances(
|
|
35
|
+
cluster_name: str,
|
|
35
36
|
cluster_name_on_cloud: str,
|
|
36
37
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
37
38
|
non_terminated_only: bool = True,
|
|
38
|
-
) -> Dict[str, Optional['status_lib.ClusterStatus']]:
|
|
39
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
39
40
|
"""Query instances.
|
|
40
41
|
|
|
41
42
|
Returns a dictionary of instance IDs and status.
|
|
@@ -43,11 +44,13 @@ def query_instances(
|
|
|
43
44
|
A None status means the instance is marked as "terminated"
|
|
44
45
|
or "terminating".
|
|
45
46
|
"""
|
|
47
|
+
del cluster_name # unused
|
|
46
48
|
assert provider_config is not None, cluster_name_on_cloud
|
|
47
49
|
region = provider_config['region']
|
|
48
50
|
|
|
49
51
|
status_map = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY
|
|
50
|
-
statuses: Dict[str, Optional['status_lib.ClusterStatus']] = {}
|
|
52
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
53
|
+
Optional[str]]] = {}
|
|
51
54
|
filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
|
|
52
55
|
|
|
53
56
|
instances = _get_filtered_nodes(region, filters)
|
|
@@ -56,7 +59,7 @@ def query_instances(
|
|
|
56
59
|
sky_status = status_map[vm_status]
|
|
57
60
|
if non_terminated_only and sky_status is None:
|
|
58
61
|
continue
|
|
59
|
-
statuses[node['inst_id']] = sky_status
|
|
62
|
+
statuses[node['inst_id']] = (sky_status, None)
|
|
60
63
|
|
|
61
64
|
return statuses
|
|
62
65
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Paperspace instance provisioning."""
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.provision import common
|
|
@@ -277,12 +277,13 @@ def get_cluster_info(
|
|
|
277
277
|
|
|
278
278
|
|
|
279
279
|
def query_instances(
|
|
280
|
+
cluster_name: str,
|
|
280
281
|
cluster_name_on_cloud: str,
|
|
281
282
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
282
283
|
non_terminated_only: bool = True,
|
|
283
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
284
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
284
285
|
"""See sky/provision/__init__.py"""
|
|
285
|
-
del non_terminated_only
|
|
286
|
+
del cluster_name, non_terminated_only #unused
|
|
286
287
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
287
288
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
288
289
|
|
|
@@ -297,10 +298,11 @@ def query_instances(
|
|
|
297
298
|
'ready': status_lib.ClusterStatus.UP,
|
|
298
299
|
'off': status_lib.ClusterStatus.STOPPED,
|
|
299
300
|
}
|
|
300
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
|
|
301
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
302
|
+
Optional[str]]] = {}
|
|
301
303
|
for inst_id, inst in instances.items():
|
|
302
304
|
status = status_map[inst['state']]
|
|
303
|
-
statuses[inst_id] = status
|
|
305
|
+
statuses[inst_id] = (status, None)
|
|
304
306
|
return statuses
|
|
305
307
|
|
|
306
308
|
|
|
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
8
8
|
|
|
9
9
|
from sky import sky_logging
|
|
10
10
|
from sky.adaptors import common as adaptors_common
|
|
11
|
-
|
|
11
|
+
from sky.provision.paperspace import constants
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
|
|
14
14
|
if typing.TYPE_CHECKING:
|
sky/provision/provisioner.py
CHANGED
|
@@ -100,6 +100,12 @@ def _bulk_provision(
|
|
|
100
100
|
f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
|
|
101
101
|
f'seconds.')
|
|
102
102
|
|
|
103
|
+
# Add cluster event for provisioning completion.
|
|
104
|
+
global_user_state.add_cluster_event(
|
|
105
|
+
str(cluster_name), status_lib.ClusterStatus.INIT,
|
|
106
|
+
f'Instances launched on {cloud.display_name()} in {region}',
|
|
107
|
+
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
108
|
+
|
|
103
109
|
return provision_record
|
|
104
110
|
|
|
105
111
|
|
sky/provision/runpod/instance.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""RunPod instance provisioning."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.provision import common
|
|
@@ -201,11 +201,13 @@ def get_cluster_info(
|
|
|
201
201
|
|
|
202
202
|
|
|
203
203
|
def query_instances(
|
|
204
|
+
cluster_name: str,
|
|
204
205
|
cluster_name_on_cloud: str,
|
|
205
206
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
206
207
|
non_terminated_only: bool = True,
|
|
207
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
208
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
208
209
|
"""See sky/provision/__init__.py"""
|
|
210
|
+
del cluster_name # unused
|
|
209
211
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
210
212
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
211
213
|
|
|
@@ -215,12 +217,13 @@ def query_instances(
|
|
|
215
217
|
'PAUSED': status_lib.ClusterStatus.INIT,
|
|
216
218
|
'RUNNING': status_lib.ClusterStatus.UP,
|
|
217
219
|
}
|
|
218
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
|
|
220
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
221
|
+
Optional[str]]] = {}
|
|
219
222
|
for inst_id, inst in instances.items():
|
|
220
223
|
status = status_map[inst['status']]
|
|
221
224
|
if non_terminated_only and status is None:
|
|
222
225
|
continue
|
|
223
|
-
statuses[inst_id] = status
|
|
226
|
+
statuses[inst_id] = (status, None)
|
|
224
227
|
return statuses
|
|
225
228
|
|
|
226
229
|
|
sky/provision/runpod/utils.py
CHANGED
|
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
|
7
7
|
from sky import sky_logging
|
|
8
8
|
from sky.adaptors import runpod
|
|
9
9
|
from sky.provision import docker_utils
|
|
10
|
-
|
|
10
|
+
from sky.provision.runpod.api import commands as runpod_commands
|
|
11
11
|
from sky.skylet import constants
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
|
sky/provision/scp/instance.py
CHANGED
|
@@ -4,7 +4,7 @@ import logging
|
|
|
4
4
|
import random
|
|
5
5
|
import string
|
|
6
6
|
import time
|
|
7
|
-
from typing import Any, Dict, List, Optional
|
|
7
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
8
8
|
|
|
9
9
|
from sky.clouds.utils import scp_utils
|
|
10
10
|
from sky.provision import common
|
|
@@ -427,11 +427,12 @@ def terminate_instances(
|
|
|
427
427
|
|
|
428
428
|
|
|
429
429
|
def query_instances(
|
|
430
|
+
cluster_name: str,
|
|
430
431
|
cluster_name_on_cloud: str,
|
|
431
432
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
432
433
|
non_terminated_only: bool = True,
|
|
433
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
434
|
-
|
|
434
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
435
|
+
del cluster_name # unused
|
|
435
436
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
436
437
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
437
438
|
|
|
@@ -447,12 +448,13 @@ def query_instances(
|
|
|
447
448
|
'TERMINATED': None,
|
|
448
449
|
}
|
|
449
450
|
|
|
450
|
-
statuses = {}
|
|
451
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
452
|
+
Optional[str]]] = {}
|
|
451
453
|
for instance in instances:
|
|
452
454
|
status = status_map[instance['virtualServerState']]
|
|
453
455
|
if non_terminated_only and status is None:
|
|
454
456
|
continue
|
|
455
|
-
statuses[instance['virtualServerId']] = status
|
|
457
|
+
statuses[instance['virtualServerId']] = (status, None)
|
|
456
458
|
return statuses
|
|
457
459
|
|
|
458
460
|
|
sky/provision/vast/instance.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Vast instance provisioning."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.provision import common
|
|
@@ -216,12 +216,13 @@ def open_ports(
|
|
|
216
216
|
|
|
217
217
|
|
|
218
218
|
def query_instances(
|
|
219
|
+
cluster_name: str,
|
|
219
220
|
cluster_name_on_cloud: str,
|
|
220
221
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
221
222
|
non_terminated_only: bool = True,
|
|
222
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
223
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
223
224
|
"""See sky/provision/__init__.py"""
|
|
224
|
-
|
|
225
|
+
del cluster_name # unused
|
|
225
226
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
226
227
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
227
228
|
# "running", "frozen", "stopped", "unknown", "loading"
|
|
@@ -231,12 +232,13 @@ def query_instances(
|
|
|
231
232
|
'STOPPED': status_lib.ClusterStatus.STOPPED,
|
|
232
233
|
'RUNNING': status_lib.ClusterStatus.UP,
|
|
233
234
|
}
|
|
234
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
|
|
235
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
236
|
+
Optional[str]]] = {}
|
|
235
237
|
for inst_id, inst in instances.items():
|
|
236
238
|
status = status_map[inst['status']]
|
|
237
239
|
if non_terminated_only and status is None:
|
|
238
240
|
continue
|
|
239
|
-
statuses[inst_id] = status
|
|
241
|
+
statuses[inst_id] = (status, None)
|
|
240
242
|
return statuses
|
|
241
243
|
|
|
242
244
|
|
|
sky/provision/vsphere/instance.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Vsphere instance provisioning."""
|
|
2
2
|
import json
|
|
3
3
|
import typing
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.adaptors import common as adaptors_common
|
|
@@ -393,11 +393,13 @@ def _get_cluster_name_filter(cluster_name_on_cloud):
|
|
|
393
393
|
|
|
394
394
|
|
|
395
395
|
def query_instances(
|
|
396
|
+
cluster_name: str,
|
|
396
397
|
cluster_name_on_cloud: str,
|
|
397
398
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
398
399
|
non_terminated_only: bool = True,
|
|
399
|
-
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
|
|
400
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
400
401
|
"""See sky/provision/__init__.py"""
|
|
402
|
+
del cluster_name # unused
|
|
401
403
|
logger.info('New provision of Vsphere: query_instances().')
|
|
402
404
|
assert provider_config is not None, cluster_name_on_cloud
|
|
403
405
|
region = provider_config['region']
|
|
@@ -413,12 +415,13 @@ def query_instances(
|
|
|
413
415
|
'suspended': None,
|
|
414
416
|
}
|
|
415
417
|
|
|
416
|
-
status = {}
|
|
418
|
+
status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
419
|
+
Optional[str]]] = {}
|
|
417
420
|
for inst in instances:
|
|
418
421
|
stat = status_map[inst.runtime.powerState]
|
|
419
422
|
if non_terminated_only and stat is None:
|
|
420
423
|
continue
|
|
421
|
-
status[inst.summary.config.instanceUuid] = stat
|
|
424
|
+
status[inst.summary.config.instanceUuid] = (stat, None)
|
|
422
425
|
vc_object.disconnect()
|
|
423
426
|
return status
|
|
424
427
|
|
sky/resources.py
CHANGED
|
@@ -8,7 +8,6 @@ from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
|
|
|
8
8
|
|
|
9
9
|
import colorama
|
|
10
10
|
|
|
11
|
-
import sky
|
|
12
11
|
from sky import catalog
|
|
13
12
|
from sky import check as sky_check
|
|
14
13
|
from sky import clouds
|
|
@@ -288,7 +287,7 @@ class Resources:
|
|
|
288
287
|
if infra is not None:
|
|
289
288
|
infra_info = infra_utils.InfraInfo.from_str(infra)
|
|
290
289
|
# Infra takes precedence over individually specified parameters
|
|
291
|
-
cloud = sky.CLOUD_REGISTRY.from_str(infra_info.cloud)
|
|
290
|
+
cloud = registry.CLOUD_REGISTRY.from_str(infra_info.cloud)
|
|
292
291
|
region = infra_info.region
|
|
293
292
|
zone = infra_info.zone
|
|
294
293
|
|