skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/backends/backend_utils.py +69 -6
- sky/backends/cloud_vm_ray_backend.py +156 -25
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +40 -77
- sky/client/common.py +1 -1
- sky/client/sdk.py +19 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +14 -0
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-7fd0cf9dbecff10f.js → webpack-00c0a51d21157453.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +63 -7
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +23 -68
- sky/jobs/server/core.py +18 -12
- sky/jobs/state.py +6 -2
- sky/jobs/utils.py +8 -0
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +9 -0
- sky/provision/aws/instance.py +36 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +1 -2
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +15 -16
- sky/serve/serve_state.py +10 -0
- sky/serve/serve_utils.py +21 -20
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +31 -16
- sky/server/server.py +20 -14
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +73 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/schemas.py +3 -0
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +102 -94
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/provision/aws/instance.py
CHANGED
|
@@ -527,6 +527,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
527
527
|
to_start_count,
|
|
528
528
|
associate_public_ip_address=(
|
|
529
529
|
not config.provider_config['use_internal_ips']))
|
|
530
|
+
|
|
530
531
|
created_instances.extend(created_remaining_instances)
|
|
531
532
|
created_instances.sort(key=lambda x: x.id)
|
|
532
533
|
|
|
@@ -585,11 +586,13 @@ def _filter_instances(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
585
586
|
# stop() and terminate() for example already implicitly assume non-terminated.
|
|
586
587
|
@common_utils.retry
|
|
587
588
|
def query_instances(
|
|
589
|
+
cluster_name: str,
|
|
588
590
|
cluster_name_on_cloud: str,
|
|
589
591
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
590
592
|
non_terminated_only: bool = True,
|
|
591
593
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
592
594
|
"""See sky/provision/__init__.py"""
|
|
595
|
+
del cluster_name # unused
|
|
593
596
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
594
597
|
region = provider_config['region']
|
|
595
598
|
ec2 = _default_ec2_resource(region)
|
|
@@ -682,19 +685,39 @@ def terminate_instances(
|
|
|
682
685
|
filters,
|
|
683
686
|
included_instances=None,
|
|
684
687
|
excluded_instances=None)
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
#
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
688
|
+
default_sg = _get_sg_from_name(ec2, aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
|
|
689
|
+
if sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
|
|
690
|
+
# Case 1: The default SG is used, we don't need to ensure instances are
|
|
691
|
+
# terminated.
|
|
692
|
+
instances.terminate()
|
|
693
|
+
elif not managed_by_skypilot:
|
|
694
|
+
# Case 2: We are not managing the non-default sg. We don't need to
|
|
695
|
+
# ensure instances are terminated.
|
|
696
|
+
instances.terminate()
|
|
697
|
+
elif (managed_by_skypilot and default_sg is not None):
|
|
698
|
+
# Case 3: We are managing the non-default sg. The default SG exists
|
|
699
|
+
# so we can move the instances to the default SG and terminate them
|
|
700
|
+
# without blocking.
|
|
701
|
+
|
|
702
|
+
# Make this multithreaded: modify all instances' SGs in parallel.
|
|
703
|
+
def modify_instance_sg(instance):
|
|
704
|
+
instance.modify_attribute(Groups=[default_sg.id])
|
|
705
|
+
logger.debug(f'Instance {instance.id} modified to use default SG:'
|
|
706
|
+
f'{default_sg.id} for quick deletion.')
|
|
707
|
+
|
|
708
|
+
with pool.ThreadPool() as thread_pool:
|
|
709
|
+
thread_pool.map(modify_instance_sg, instances)
|
|
710
|
+
thread_pool.close()
|
|
711
|
+
thread_pool.join()
|
|
712
|
+
|
|
713
|
+
instances.terminate()
|
|
714
|
+
else:
|
|
715
|
+
# Case 4: We are managing the non-default sg. The default SG does not
|
|
716
|
+
# exist. We must block on instance termination.
|
|
717
|
+
instances.terminate()
|
|
718
|
+
for instance in instances:
|
|
719
|
+
instance.wait_until_terminated()
|
|
720
|
+
|
|
698
721
|
# TODO(suquark): Currently, the implementation of GCP and Azure will
|
|
699
722
|
# wait until the cluster is fully terminated, while other clouds just
|
|
700
723
|
# trigger the termination process (via http call) and then return.
|
sky/provision/azure/instance.py
CHANGED
|
@@ -952,11 +952,13 @@ def delete_vm_and_attached_resources(subscription_id: str, resource_group: str,
|
|
|
952
952
|
|
|
953
953
|
@common_utils.retry
|
|
954
954
|
def query_instances(
|
|
955
|
+
cluster_name: str,
|
|
955
956
|
cluster_name_on_cloud: str,
|
|
956
957
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
957
958
|
non_terminated_only: bool = True,
|
|
958
959
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
959
960
|
"""See sky/provision/__init__.py"""
|
|
961
|
+
del cluster_name # unused
|
|
960
962
|
assert provider_config is not None, cluster_name_on_cloud
|
|
961
963
|
|
|
962
964
|
subscription_id = provider_config['subscription_id']
|
sky/provision/cudo/instance.py
CHANGED
|
@@ -191,11 +191,13 @@ def get_cluster_info(
|
|
|
191
191
|
|
|
192
192
|
|
|
193
193
|
def query_instances(
|
|
194
|
+
cluster_name: str,
|
|
194
195
|
cluster_name_on_cloud: str,
|
|
195
196
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
196
197
|
non_terminated_only: bool = True,
|
|
197
198
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
198
199
|
"""See sky/provision/__init__.py"""
|
|
200
|
+
del cluster_name # unused
|
|
199
201
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
200
202
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
201
203
|
|
sky/provision/do/instance.py
CHANGED
|
@@ -242,11 +242,13 @@ def get_cluster_info(
|
|
|
242
242
|
|
|
243
243
|
|
|
244
244
|
def query_instances(
|
|
245
|
+
cluster_name: str,
|
|
245
246
|
cluster_name_on_cloud: str,
|
|
246
247
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
247
248
|
non_terminated_only: bool = True,
|
|
248
249
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
249
250
|
"""See sky/provision/__init__.py"""
|
|
251
|
+
del cluster_name # unused
|
|
250
252
|
# terminated instances are not retrieved by the
|
|
251
253
|
# API making `non_terminated_only` argument moot.
|
|
252
254
|
del non_terminated_only
|
|
@@ -287,11 +287,13 @@ def get_cluster_info(
|
|
|
287
287
|
|
|
288
288
|
|
|
289
289
|
def query_instances(
|
|
290
|
+
cluster_name: str,
|
|
290
291
|
cluster_name_on_cloud: str,
|
|
291
292
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
292
293
|
non_terminated_only: bool = True,
|
|
293
294
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
294
295
|
"""See sky/provision/__init__.py"""
|
|
296
|
+
del cluster_name # unused
|
|
295
297
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
296
298
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
297
299
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
sky/provision/gcp/instance.py
CHANGED
|
@@ -58,11 +58,13 @@ def _filter_instances(
|
|
|
58
58
|
# for terminated instances, if they have already been fully deleted.
|
|
59
59
|
@common_utils.retry
|
|
60
60
|
def query_instances(
|
|
61
|
+
cluster_name: str,
|
|
61
62
|
cluster_name_on_cloud: str,
|
|
62
63
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
63
64
|
non_terminated_only: bool = True,
|
|
64
65
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
65
66
|
"""See sky/provision/__init__.py"""
|
|
67
|
+
del cluster_name # unused
|
|
66
68
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
67
69
|
zone = provider_config['availability_zone']
|
|
68
70
|
project_id = provider_config['project_id']
|
|
@@ -304,12 +304,13 @@ def get_cluster_info(
|
|
|
304
304
|
|
|
305
305
|
|
|
306
306
|
def query_instances(
|
|
307
|
+
cluster_name: str,
|
|
307
308
|
cluster_name_on_cloud: str,
|
|
308
309
|
provider_config: Optional[dict] = None,
|
|
309
310
|
non_terminated_only: bool = True,
|
|
310
311
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
311
312
|
"""Returns the status of the specified instances for Hyperbolic."""
|
|
312
|
-
del provider_config # unused
|
|
313
|
+
del cluster_name, provider_config # unused
|
|
313
314
|
# Fetch all instances for this cluster
|
|
314
315
|
instances = utils.list_instances(
|
|
315
316
|
metadata={'skypilot': {
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
"""Kubernetes instance provisioning."""
|
|
2
2
|
import copy
|
|
3
3
|
import json
|
|
4
|
+
import re
|
|
4
5
|
import time
|
|
5
6
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
6
7
|
|
|
7
8
|
from sky import exceptions
|
|
9
|
+
from sky import global_user_state
|
|
8
10
|
from sky import sky_logging
|
|
9
11
|
from sky import skypilot_config
|
|
10
12
|
from sky.adaptors import kubernetes
|
|
@@ -24,6 +26,7 @@ from sky.utils import status_lib
|
|
|
24
26
|
from sky.utils import subprocess_utils
|
|
25
27
|
from sky.utils import timeline
|
|
26
28
|
from sky.utils import ux_utils
|
|
29
|
+
from sky.utils.db import db_utils
|
|
27
30
|
|
|
28
31
|
POLL_INTERVAL = 2
|
|
29
32
|
_TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minute
|
|
@@ -1270,7 +1273,116 @@ def _get_pod_termination_reason(pod: Any) -> str:
|
|
|
1270
1273
|
return ' | '.join(reasons)
|
|
1271
1274
|
|
|
1272
1275
|
|
|
1276
|
+
def _get_pod_missing_reason(context: Optional[str], namespace: str,
|
|
1277
|
+
cluster_name: str, pod_name: str) -> Optional[str]:
|
|
1278
|
+
logger.debug(f'Analyzing events for pod {pod_name}')
|
|
1279
|
+
pod_field_selector = (
|
|
1280
|
+
f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
|
|
1281
|
+
pod_events = kubernetes.core_api(context).list_namespaced_event(
|
|
1282
|
+
namespace,
|
|
1283
|
+
field_selector=pod_field_selector,
|
|
1284
|
+
_request_timeout=kubernetes.API_TIMEOUT).items
|
|
1285
|
+
pod_events = sorted(
|
|
1286
|
+
pod_events,
|
|
1287
|
+
key=lambda event: event.metadata.creation_timestamp,
|
|
1288
|
+
# latest event appears first
|
|
1289
|
+
reverse=True)
|
|
1290
|
+
last_scheduled_node = None
|
|
1291
|
+
insert_new_pod_event = True
|
|
1292
|
+
new_event_inserted = False
|
|
1293
|
+
for event in pod_events:
|
|
1294
|
+
if event.reason == 'Scheduled':
|
|
1295
|
+
pattern = r'Successfully assigned (\S+) to (\S+)'
|
|
1296
|
+
match = re.search(pattern, event.message)
|
|
1297
|
+
if match:
|
|
1298
|
+
scheduled_node = match.group(2)
|
|
1299
|
+
last_scheduled_node = scheduled_node
|
|
1300
|
+
if insert_new_pod_event:
|
|
1301
|
+
# Try inserting the latest events first. If the event is a
|
|
1302
|
+
# duplicate, it means the event (and any previous events) have
|
|
1303
|
+
# already been inserted - so do not insert further events.
|
|
1304
|
+
try:
|
|
1305
|
+
global_user_state.add_cluster_event(
|
|
1306
|
+
cluster_name,
|
|
1307
|
+
None, f'[kubernetes pod {pod_name}] '
|
|
1308
|
+
f'{event.reason} {event.message}',
|
|
1309
|
+
global_user_state.ClusterEventType.DEBUG,
|
|
1310
|
+
transitioned_at=int(
|
|
1311
|
+
event.metadata.creation_timestamp.timestamp()),
|
|
1312
|
+
expose_duplicate_error=True)
|
|
1313
|
+
except db_utils.UniqueConstraintViolationError:
|
|
1314
|
+
insert_new_pod_event = False
|
|
1315
|
+
else:
|
|
1316
|
+
new_event_inserted = True
|
|
1317
|
+
|
|
1318
|
+
if last_scheduled_node is not None:
|
|
1319
|
+
node_field_selector = ('involvedObject.kind=Node,'
|
|
1320
|
+
f'involvedObject.name={last_scheduled_node}')
|
|
1321
|
+
node_events = kubernetes.core_api(context).list_namespaced_event(
|
|
1322
|
+
namespace,
|
|
1323
|
+
field_selector=node_field_selector,
|
|
1324
|
+
_request_timeout=kubernetes.API_TIMEOUT).items
|
|
1325
|
+
node_events = sorted(
|
|
1326
|
+
node_events,
|
|
1327
|
+
key=lambda event: event.metadata.creation_timestamp,
|
|
1328
|
+
# latest event appears first
|
|
1329
|
+
reverse=True)
|
|
1330
|
+
insert_new_node_event = True
|
|
1331
|
+
for event in node_events:
|
|
1332
|
+
if insert_new_node_event:
|
|
1333
|
+
# Try inserting the latest events first. If the event is a
|
|
1334
|
+
# duplicate, it means the event (and any previous events) have
|
|
1335
|
+
# already been inserted - so do not insert further events.
|
|
1336
|
+
try:
|
|
1337
|
+
global_user_state.add_cluster_event(
|
|
1338
|
+
cluster_name,
|
|
1339
|
+
None, f'[kubernetes node {last_scheduled_node}] '
|
|
1340
|
+
f'{event.reason} {event.message}',
|
|
1341
|
+
global_user_state.ClusterEventType.DEBUG,
|
|
1342
|
+
transitioned_at=int(
|
|
1343
|
+
event.metadata.creation_timestamp.timestamp()),
|
|
1344
|
+
expose_duplicate_error=True)
|
|
1345
|
+
except db_utils.UniqueConstraintViolationError:
|
|
1346
|
+
insert_new_node_event = False
|
|
1347
|
+
else:
|
|
1348
|
+
new_event_inserted = True
|
|
1349
|
+
|
|
1350
|
+
if not new_event_inserted:
|
|
1351
|
+
# If new event is not inserted, there is no useful information to
|
|
1352
|
+
# return. Return None.
|
|
1353
|
+
return None
|
|
1354
|
+
|
|
1355
|
+
# Analyze the events for failure
|
|
1356
|
+
failure_reason = None
|
|
1357
|
+
failure_decisiveness = 0
|
|
1358
|
+
|
|
1359
|
+
def _record_failure_reason(reason: str, decisiveness: int):
|
|
1360
|
+
nonlocal failure_reason, failure_decisiveness
|
|
1361
|
+
if decisiveness > failure_decisiveness:
|
|
1362
|
+
failure_reason = reason
|
|
1363
|
+
failure_decisiveness = decisiveness
|
|
1364
|
+
|
|
1365
|
+
cluster_events = global_user_state.get_cluster_events(
|
|
1366
|
+
cluster_name, None, global_user_state.ClusterEventType.DEBUG)
|
|
1367
|
+
for event in cluster_events:
|
|
1368
|
+
if event.startswith('[kubernetes pod'):
|
|
1369
|
+
event = event.split(']')[1].strip()
|
|
1370
|
+
elif event.startswith('[kubernetes node'):
|
|
1371
|
+
event = event.split(']')[1].strip()
|
|
1372
|
+
|
|
1373
|
+
if event.startswith('NodeNotReady '):
|
|
1374
|
+
_record_failure_reason(event[len('NodeNotReady '):], 1)
|
|
1375
|
+
elif event.startswith('TaintManagerEviction '):
|
|
1376
|
+
# usually the event message for TaintManagerEviction is not useful
|
|
1377
|
+
# so we record a more generic message.
|
|
1378
|
+
_record_failure_reason('pod was evicted by taint manager', 2)
|
|
1379
|
+
elif event.startswith('DeletingNode '):
|
|
1380
|
+
_record_failure_reason(event[len('DeletingNode '):], 3)
|
|
1381
|
+
return failure_reason
|
|
1382
|
+
|
|
1383
|
+
|
|
1273
1384
|
def query_instances(
|
|
1385
|
+
cluster_name: str,
|
|
1274
1386
|
cluster_name_on_cloud: str,
|
|
1275
1387
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
1276
1388
|
non_terminated_only: bool = True
|
|
@@ -1334,6 +1446,27 @@ def query_instances(
|
|
|
1334
1446
|
pod_name = pod.metadata.name
|
|
1335
1447
|
reason = f'{pod_name}: {reason}' if reason is not None else None
|
|
1336
1448
|
cluster_status[pod_name] = (pod_status, reason)
|
|
1449
|
+
|
|
1450
|
+
# Find the list of pod names that should be there
|
|
1451
|
+
# from k8s services. Filter duplicates as -ssh service
|
|
1452
|
+
# creates a duplicate entry.
|
|
1453
|
+
target_pod_names = list(
|
|
1454
|
+
set([
|
|
1455
|
+
service['spec']['selector']['component']
|
|
1456
|
+
for service in provider_config.get('services', [])
|
|
1457
|
+
]))
|
|
1458
|
+
|
|
1459
|
+
for target_pod_name in target_pod_names:
|
|
1460
|
+
if target_pod_name not in cluster_status:
|
|
1461
|
+
# If the pod is not in the cluster_status, it means it's not
|
|
1462
|
+
# running.
|
|
1463
|
+
# Analyze what happened to the pod based on events.
|
|
1464
|
+
reason = _get_pod_missing_reason(context, namespace, cluster_name,
|
|
1465
|
+
target_pod_name)
|
|
1466
|
+
reason = (f'{target_pod_name}: {reason}'
|
|
1467
|
+
if reason is not None else None)
|
|
1468
|
+
cluster_status[target_pod_name] = (None, reason)
|
|
1469
|
+
|
|
1337
1470
|
return cluster_status
|
|
1338
1471
|
|
|
1339
1472
|
|
|
@@ -226,11 +226,13 @@ def get_cluster_info(
|
|
|
226
226
|
|
|
227
227
|
|
|
228
228
|
def query_instances(
|
|
229
|
+
cluster_name: str,
|
|
229
230
|
cluster_name_on_cloud: str,
|
|
230
231
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
231
232
|
non_terminated_only: bool = True,
|
|
232
233
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
233
234
|
"""See sky/provision/__init__.py"""
|
|
235
|
+
del cluster_name # unused
|
|
234
236
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
235
237
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
236
238
|
|
sky/provision/nebius/instance.py
CHANGED
|
@@ -247,11 +247,13 @@ def get_cluster_info(
|
|
|
247
247
|
|
|
248
248
|
|
|
249
249
|
def query_instances(
|
|
250
|
+
cluster_name: str,
|
|
250
251
|
cluster_name_on_cloud: str,
|
|
251
252
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
252
253
|
non_terminated_only: bool = True,
|
|
253
254
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
254
255
|
"""See sky/provision/__init__.py"""
|
|
256
|
+
del cluster_name # unused
|
|
255
257
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
256
258
|
instances = _filter_instances(provider_config['region'],
|
|
257
259
|
cluster_name_on_cloud, None)
|
sky/provision/oci/instance.py
CHANGED
|
@@ -32,6 +32,7 @@ logger = sky_logging.init_logger(__name__)
|
|
|
32
32
|
@query_utils.debug_enabled(logger)
|
|
33
33
|
@common_utils.retry
|
|
34
34
|
def query_instances(
|
|
35
|
+
cluster_name: str,
|
|
35
36
|
cluster_name_on_cloud: str,
|
|
36
37
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
37
38
|
non_terminated_only: bool = True,
|
|
@@ -43,6 +44,7 @@ def query_instances(
|
|
|
43
44
|
A None status means the instance is marked as "terminated"
|
|
44
45
|
or "terminating".
|
|
45
46
|
"""
|
|
47
|
+
del cluster_name # unused
|
|
46
48
|
assert provider_config is not None, cluster_name_on_cloud
|
|
47
49
|
region = provider_config['region']
|
|
48
50
|
|
|
@@ -277,12 +277,13 @@ def get_cluster_info(
|
|
|
277
277
|
|
|
278
278
|
|
|
279
279
|
def query_instances(
|
|
280
|
+
cluster_name: str,
|
|
280
281
|
cluster_name_on_cloud: str,
|
|
281
282
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
282
283
|
non_terminated_only: bool = True,
|
|
283
284
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
284
285
|
"""See sky/provision/__init__.py"""
|
|
285
|
-
del non_terminated_only
|
|
286
|
+
del cluster_name, non_terminated_only # unused
|
|
286
287
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
287
288
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
288
289
|
|
|
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
8
8
|
|
|
9
9
|
from sky import sky_logging
|
|
10
10
|
from sky.adaptors import common as adaptors_common
|
|
11
|
-
|
|
11
|
+
from sky.provision.paperspace import constants
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
|
|
14
14
|
if typing.TYPE_CHECKING:
|
sky/provision/runpod/instance.py
CHANGED
|
@@ -201,11 +201,13 @@ def get_cluster_info(
|
|
|
201
201
|
|
|
202
202
|
|
|
203
203
|
def query_instances(
|
|
204
|
+
cluster_name: str,
|
|
204
205
|
cluster_name_on_cloud: str,
|
|
205
206
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
206
207
|
non_terminated_only: bool = True,
|
|
207
208
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
208
209
|
"""See sky/provision/__init__.py"""
|
|
210
|
+
del cluster_name # unused
|
|
209
211
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
210
212
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
211
213
|
|
sky/provision/runpod/utils.py
CHANGED
|
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
|
7
7
|
from sky import sky_logging
|
|
8
8
|
from sky.adaptors import runpod
|
|
9
9
|
from sky.provision import docker_utils
|
|
10
|
-
|
|
10
|
+
from sky.provision.runpod.api import commands as runpod_commands
|
|
11
11
|
from sky.skylet import constants
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
|
sky/provision/scp/instance.py
CHANGED
|
@@ -427,10 +427,12 @@ def terminate_instances(
|
|
|
427
427
|
|
|
428
428
|
|
|
429
429
|
def query_instances(
|
|
430
|
+
cluster_name: str,
|
|
430
431
|
cluster_name_on_cloud: str,
|
|
431
432
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
432
433
|
non_terminated_only: bool = True,
|
|
433
434
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
435
|
+
del cluster_name # unused
|
|
434
436
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
435
437
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
436
438
|
|
sky/provision/vast/instance.py
CHANGED
|
@@ -216,11 +216,13 @@ def open_ports(
|
|
|
216
216
|
|
|
217
217
|
|
|
218
218
|
def query_instances(
|
|
219
|
+
cluster_name: str,
|
|
219
220
|
cluster_name_on_cloud: str,
|
|
220
221
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
221
222
|
non_terminated_only: bool = True,
|
|
222
223
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
223
224
|
"""See sky/provision/__init__.py"""
|
|
225
|
+
del cluster_name # unused
|
|
224
226
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
225
227
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
226
228
|
# "running", "frozen", "stopped", "unknown", "loading"
|
|
@@ -393,11 +393,13 @@ def _get_cluster_name_filter(cluster_name_on_cloud):
|
|
|
393
393
|
|
|
394
394
|
|
|
395
395
|
def query_instances(
|
|
396
|
+
cluster_name: str,
|
|
396
397
|
cluster_name_on_cloud: str,
|
|
397
398
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
398
399
|
non_terminated_only: bool = True,
|
|
399
400
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
400
401
|
"""See sky/provision/__init__.py"""
|
|
402
|
+
del cluster_name # unused
|
|
401
403
|
logger.info('New provision of Vsphere: query_instances().')
|
|
402
404
|
assert provider_config is not None, cluster_name_on_cloud
|
|
403
405
|
region = provider_config['region']
|
sky/resources.py
CHANGED
|
@@ -8,7 +8,6 @@ from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
|
|
|
8
8
|
|
|
9
9
|
import colorama
|
|
10
10
|
|
|
11
|
-
import sky
|
|
12
11
|
from sky import catalog
|
|
13
12
|
from sky import check as sky_check
|
|
14
13
|
from sky import clouds
|
|
@@ -288,7 +287,7 @@ class Resources:
|
|
|
288
287
|
if infra is not None:
|
|
289
288
|
infra_info = infra_utils.InfraInfo.from_str(infra)
|
|
290
289
|
# Infra takes precedence over individually specified parameters
|
|
291
|
-
cloud =
|
|
290
|
+
cloud = registry.CLOUD_REGISTRY.from_str(infra_info.cloud)
|
|
292
291
|
region = infra_info.region
|
|
293
292
|
zone = infra_info.zone
|
|
294
293
|
|
sky/schemas/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Responses for the API server."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import pydantic
|
|
6
|
+
|
|
7
|
+
from sky import models
|
|
8
|
+
from sky.server import common
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ResponseBaseModel(pydantic.BaseModel):
|
|
12
|
+
"""A pydantic model that acts like a dict.
|
|
13
|
+
|
|
14
|
+
Supports the following syntax:
|
|
15
|
+
class SampleResponse(ResponseBaseModel):
|
|
16
|
+
field: str
|
|
17
|
+
|
|
18
|
+
response = SampleResponse(field='value')
|
|
19
|
+
print(response['field']) # prints 'value'
|
|
20
|
+
response['field'] = 'value2'
|
|
21
|
+
print(response['field']) # prints 'value2'
|
|
22
|
+
print('field' in response) # prints True
|
|
23
|
+
|
|
24
|
+
This model exists for backwards compatibility with the
|
|
25
|
+
old SDK that used to return a dict.
|
|
26
|
+
|
|
27
|
+
The backward compatibility may be removed
|
|
28
|
+
in the future.
|
|
29
|
+
"""
|
|
30
|
+
# Ignore extra fields in the request body, which is useful for backward
|
|
31
|
+
# compatibility. The difference with `allow` is that `ignore` will not
|
|
32
|
+
# include the unknown fields when dumping the model, i.e., we can add new
|
|
33
|
+
# fields to the request body without breaking the existing old API server
|
|
34
|
+
# where the handler function does not accept the new field in function
|
|
35
|
+
# signature.
|
|
36
|
+
model_config = pydantic.ConfigDict(extra='ignore')
|
|
37
|
+
|
|
38
|
+
# backward compatibility with dict
|
|
39
|
+
# TODO(syang): remove this in v0.13.0
|
|
40
|
+
def __getitem__(self, key):
|
|
41
|
+
try:
|
|
42
|
+
return getattr(self, key)
|
|
43
|
+
except AttributeError as e:
|
|
44
|
+
raise KeyError(key) from e
|
|
45
|
+
|
|
46
|
+
def __setitem__(self, key, value):
|
|
47
|
+
setattr(self, key, value)
|
|
48
|
+
|
|
49
|
+
def __contains__(self, key):
|
|
50
|
+
return hasattr(self, key)
|
|
51
|
+
|
|
52
|
+
def keys(self):
|
|
53
|
+
return self.model_dump().keys()
|
|
54
|
+
|
|
55
|
+
def values(self):
|
|
56
|
+
return self.model_dump().values()
|
|
57
|
+
|
|
58
|
+
def items(self):
|
|
59
|
+
return self.model_dump().items()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class APIHealthResponse(ResponseBaseModel):
|
|
63
|
+
"""Response for the API health endpoint."""
|
|
64
|
+
status: common.ApiServerStatus
|
|
65
|
+
api_version: str = ''
|
|
66
|
+
version: str = ''
|
|
67
|
+
version_on_disk: str = ''
|
|
68
|
+
commit: str = ''
|
|
69
|
+
basic_auth_enabled: bool = False
|
|
70
|
+
user: Optional[models.User] = None
|
|
File without changes
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
|
3
|
+
# source: sky/schemas/generated/autostopv1.proto
|
|
4
|
+
# Protobuf Python Version: 5.26.1
|
|
5
|
+
"""Generated protocol buffer code."""
|
|
6
|
+
from google.protobuf import descriptor as _descriptor
|
|
7
|
+
from google.protobuf import descriptor_pool as _descriptor_pool
|
|
8
|
+
from google.protobuf import symbol_database as _symbol_database
|
|
9
|
+
from google.protobuf.internal import builder as _builder
|
|
10
|
+
# @@protoc_insertion_point(imports)
|
|
11
|
+
|
|
12
|
+
_sym_db = _symbol_database.Default()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n&sky/schemas/generated/autostopv1.proto\x12\x0b\x61utostop.v1\"y\n\x12SetAutostopRequest\x12\x14\n\x0cidle_minutes\x18\x01 \x01(\x05\x12\x0f\n\x07\x62\x61\x63kend\x18\x02 \x01(\t\x12.\n\x08wait_for\x18\x03 \x01(\x0e\x32\x1c.autostop.v1.AutostopWaitFor\x12\x0c\n\x04\x64own\x18\x04 \x01(\x08\"\x15\n\x13SetAutostopResponse\"\x17\n\x15IsAutostoppingRequest\"1\n\x16IsAutostoppingResponse\x12\x17\n\x0fis_autostopping\x18\x01 \x01(\x08*\x90\x01\n\x0f\x41utostopWaitFor\x12!\n\x1d\x41UTOSTOP_WAIT_FOR_UNSPECIFIED\x10\x00\x12\"\n\x1e\x41UTOSTOP_WAIT_FOR_JOBS_AND_SSH\x10\x01\x12\x1a\n\x16\x41UTOSTOP_WAIT_FOR_JOBS\x10\x02\x12\x1a\n\x16\x41UTOSTOP_WAIT_FOR_NONE\x10\x03\x32\xbe\x01\n\x0f\x41utostopService\x12P\n\x0bSetAutostop\x12\x1f.autostop.v1.SetAutostopRequest\x1a .autostop.v1.SetAutostopResponse\x12Y\n\x0eIsAutostopping\x12\".autostop.v1.IsAutostoppingRequest\x1a#.autostop.v1.IsAutostoppingResponseb\x06proto3')
|
|
18
|
+
|
|
19
|
+
_globals = globals()
|
|
20
|
+
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
|
21
|
+
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'sky.schemas.generated.autostopv1_pb2', _globals)
|
|
22
|
+
if not _descriptor._USE_C_DESCRIPTORS:
|
|
23
|
+
DESCRIPTOR._loaded_options = None
|
|
24
|
+
_globals['_AUTOSTOPWAITFOR']._serialized_start=278
|
|
25
|
+
_globals['_AUTOSTOPWAITFOR']._serialized_end=422
|
|
26
|
+
_globals['_SETAUTOSTOPREQUEST']._serialized_start=55
|
|
27
|
+
_globals['_SETAUTOSTOPREQUEST']._serialized_end=176
|
|
28
|
+
_globals['_SETAUTOSTOPRESPONSE']._serialized_start=178
|
|
29
|
+
_globals['_SETAUTOSTOPRESPONSE']._serialized_end=199
|
|
30
|
+
_globals['_ISAUTOSTOPPINGREQUEST']._serialized_start=201
|
|
31
|
+
_globals['_ISAUTOSTOPPINGREQUEST']._serialized_end=224
|
|
32
|
+
_globals['_ISAUTOSTOPPINGRESPONSE']._serialized_start=226
|
|
33
|
+
_globals['_ISAUTOSTOPPINGRESPONSE']._serialized_end=275
|
|
34
|
+
_globals['_AUTOSTOPSERVICE']._serialized_start=425
|
|
35
|
+
_globals['_AUTOSTOPSERVICE']._serialized_end=615
|
|
36
|
+
# @@protoc_insertion_point(module_scope)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper
|
|
2
|
+
from google.protobuf import descriptor as _descriptor
|
|
3
|
+
from google.protobuf import message as _message
|
|
4
|
+
from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union
|
|
5
|
+
|
|
6
|
+
DESCRIPTOR: _descriptor.FileDescriptor
|
|
7
|
+
|
|
8
|
+
class AutostopWaitFor(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
|
|
9
|
+
__slots__ = ()
|
|
10
|
+
AUTOSTOP_WAIT_FOR_UNSPECIFIED: _ClassVar[AutostopWaitFor]
|
|
11
|
+
AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: _ClassVar[AutostopWaitFor]
|
|
12
|
+
AUTOSTOP_WAIT_FOR_JOBS: _ClassVar[AutostopWaitFor]
|
|
13
|
+
AUTOSTOP_WAIT_FOR_NONE: _ClassVar[AutostopWaitFor]
|
|
14
|
+
AUTOSTOP_WAIT_FOR_UNSPECIFIED: AutostopWaitFor
|
|
15
|
+
AUTOSTOP_WAIT_FOR_JOBS_AND_SSH: AutostopWaitFor
|
|
16
|
+
AUTOSTOP_WAIT_FOR_JOBS: AutostopWaitFor
|
|
17
|
+
AUTOSTOP_WAIT_FOR_NONE: AutostopWaitFor
|
|
18
|
+
|
|
19
|
+
class SetAutostopRequest(_message.Message):
|
|
20
|
+
__slots__ = ("idle_minutes", "backend", "wait_for", "down")
|
|
21
|
+
IDLE_MINUTES_FIELD_NUMBER: _ClassVar[int]
|
|
22
|
+
BACKEND_FIELD_NUMBER: _ClassVar[int]
|
|
23
|
+
WAIT_FOR_FIELD_NUMBER: _ClassVar[int]
|
|
24
|
+
DOWN_FIELD_NUMBER: _ClassVar[int]
|
|
25
|
+
idle_minutes: int
|
|
26
|
+
backend: str
|
|
27
|
+
wait_for: AutostopWaitFor
|
|
28
|
+
down: bool
|
|
29
|
+
def __init__(self, idle_minutes: _Optional[int] = ..., backend: _Optional[str] = ..., wait_for: _Optional[_Union[AutostopWaitFor, str]] = ..., down: bool = ...) -> None: ...
|
|
30
|
+
|
|
31
|
+
class SetAutostopResponse(_message.Message):
|
|
32
|
+
__slots__ = ()
|
|
33
|
+
def __init__(self) -> None: ...
|
|
34
|
+
|
|
35
|
+
class IsAutostoppingRequest(_message.Message):
|
|
36
|
+
__slots__ = ()
|
|
37
|
+
def __init__(self) -> None: ...
|
|
38
|
+
|
|
39
|
+
class IsAutostoppingResponse(_message.Message):
|
|
40
|
+
__slots__ = ("is_autostopping",)
|
|
41
|
+
IS_AUTOSTOPPING_FIELD_NUMBER: _ClassVar[int]
|
|
42
|
+
is_autostopping: bool
|
|
43
|
+
def __init__(self, is_autostopping: bool = ...) -> None: ...
|