skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/backends/backend_utils.py +69 -6
- sky/backends/cloud_vm_ray_backend.py +156 -25
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +40 -77
- sky/client/common.py +1 -1
- sky/client/sdk.py +19 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +14 -0
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-7fd0cf9dbecff10f.js → webpack-00c0a51d21157453.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +63 -7
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +23 -68
- sky/jobs/server/core.py +18 -12
- sky/jobs/state.py +6 -2
- sky/jobs/utils.py +8 -0
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +9 -0
- sky/provision/aws/instance.py +36 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +1 -2
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +15 -16
- sky/serve/serve_state.py +10 -0
- sky/serve/serve_utils.py +21 -20
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +31 -16
- sky/server/server.py +20 -14
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +73 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/schemas.py +3 -0
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +102 -94
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '58649973a7c706775528a419f46ae024e59f4603'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250814'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -98,6 +98,7 @@ from sky.client.sdk import cancel
 from sky.client.sdk import cost_report
 from sky.client.sdk import down
 from sky.client.sdk import download_logs
+from sky.client.sdk import endpoints
 from sky.client.sdk import exec  # pylint: disable=redefined-builtin
 from sky.client.sdk import get
 from sky.client.sdk import job_status
@@ -194,6 +195,7 @@ __all__ = [
     'down',
     'autostop',
     'cost_report',
+    'endpoints',
     # core APIs Job Management
     'queue',
     'cancel',
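Note: `endpoints` joins the client SDK re-exports above. A minimal usage sketch, assuming it follows the same asynchronous request pattern as the other re-exported SDK calls (the call returns a request ID that `sky.get` resolves); the cluster name is hypothetical:

    import sky

    # Hypothetical cluster name; sky.endpoints() returns a request ID and
    # sky.get() resolves it to the cluster's exposed endpoints.
    request_id = sky.endpoints('my-cluster')
    cluster_endpoints = sky.get(request_id)
    print(cluster_endpoints)
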
sky/backends/backend_utils.py
CHANGED
@@ -13,11 +13,13 @@ import sys
 import tempfile
 import time
 import typing
-from typing import Any, Dict, List, Optional, Sequence, Set, Tuple,
+from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
+                    TypeVar, Union)
 import uuid
 
 import colorama
 from packaging import version
+import psutil
 from typing_extensions import Literal
 
 import sky
@@ -61,6 +63,7 @@ from sky.utils import ux_utils
 from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
+    import grpc
     import requests
     from requests import adapters
     from requests.packages.urllib3.util import retry as retry_lib
@@ -79,6 +82,8 @@ else:
     adapters = adaptors_common.LazyImport('requests.adapters')
     retry_lib = adaptors_common.LazyImport(
        'requests.packages.urllib3.util.retry')
+    # To avoid requiring grpcio to be installed on the client side.
+    grpc = adaptors_common.LazyImport('grpc')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -1781,6 +1786,7 @@ def _query_cluster_status_via_cloud_api(
         exceptions.ClusterStatusFetchingError: the cluster status cannot be
             fetched from the cloud provider.
     """
+    cluster_name = handle.cluster_name
    cluster_name_on_cloud = handle.cluster_name_on_cloud
    cluster_name_in_hint = common_utils.cluster_name_in_hint(
        handle.cluster_name, cluster_name_on_cloud)
@@ -1798,7 +1804,8 @@ def _query_cluster_status_via_cloud_api(
     cloud_name = repr(handle.launched_resources.cloud)
     try:
         node_status_dict = provision_lib.query_instances(
-            cloud_name, cluster_name_on_cloud,
+            cloud_name, cluster_name, cluster_name_on_cloud,
+            provider_config)
         logger.debug(f'Querying {cloud_name} cluster '
                      f'{cluster_name_in_hint} '
                      f'status:\n{pprint.pformat(node_status_dict)}')
@@ -2227,9 +2234,9 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         [status[1] for status in node_statuses if status[1] is not None])
 
     if some_nodes_terminated:
-        init_reason =
+        init_reason = 'one or more nodes terminated'
     elif some_nodes_not_stopped:
-        init_reason =
+        init_reason = 'some nodes are up and some nodes are stopped'
     logger.debug('The cluster is abnormal. Setting to INIT status. '
                  f'node_statuses: {node_statuses}')
     if record['autostop'] >= 0:
@@ -2313,12 +2320,22 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
     # represent that the cluster is partially preempted.
     # TODO(zhwu): the definition of INIT should be audited/changed.
     # Adding a new status UNHEALTHY for abnormal status can be a choice.
+    init_reason_regex = None
+    if not status_reason:
+        # If there is not a status reason, don't re-add (and overwrite) the
+        # event if there is already an event with the same reason which may
+        # have a status reason.
+        # Some status reason clears after a certain time (e.g. k8s events
+        # are only stored for an hour by default), so it is possible that
+        # the previous event has a status reason, but now it does not.
+        init_reason_regex = f'^Cluster is abnormal because {init_reason} .*'
     global_user_state.add_cluster_event(
         cluster_name,
         status_lib.ClusterStatus.INIT,
-        f'Cluster is abnormal because {init_reason}. Transitioned to INIT.',
+        f'Cluster is abnormal because {init_reason} ({status_reason}). Transitioned to INIT.',
         global_user_state.ClusterEventType.STATUS_CHANGE,
-        nop_if_duplicate=True
+        nop_if_duplicate=True,
+        duplicate_regex=init_reason_regex)
     global_user_state.add_or_update_cluster(cluster_name,
                                             handle,
                                             requested_resources=None,
@@ -3361,3 +3378,49 @@ def cluster_file_mounts_lock_id(cluster_name: str) -> str:
 def workspace_lock_id(workspace_name: str) -> str:
     """Get the lock ID for workspace operations."""
     return f'{workspace_name}_workspace'
+
+
+T = TypeVar('T')
+
+
+def invoke_skylet_with_retries(
+        handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
+        func: Callable[..., T]) -> T:
+    """Generic helper for making Skylet gRPC requests.
+
+    This method handles the common pattern of:
+    1. Try the gRPC request
+    2. If SSH tunnel is closed, recreate it and retry
+    """
+    max_attempts = 3
+    backoff = common_utils.Backoff(initial_backoff=0.5)
+    last_exception: Optional[Exception] = None
+
+    for _ in range(max_attempts):
+        try:
+            return func()
+        except grpc.RpcError as e:
+            last_exception = e
+            if e.code() == grpc.StatusCode.INTERNAL:
+                with ux_utils.print_exception_no_traceback():
+                    raise exceptions.SkyletInternalError(e.details())
+            elif e.code() == grpc.StatusCode.UNAVAILABLE:
+                recreate_tunnel = True
+                try:
+                    if handle.skylet_ssh_tunnel is not None:
+                        proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
+                        if proc.is_running(
+                        ) and proc.status() != psutil.STATUS_ZOMBIE:
+                            recreate_tunnel = False
+                except psutil.NoSuchProcess:
+                    pass
+
+                if recreate_tunnel:
+                    handle.open_and_update_skylet_tunnel()
+
+                time.sleep(backoff.current_backoff())
+            else:
+                raise e
+
+    raise RuntimeError(f'Failed to invoke Skylet after {max_attempts} attempts'
+                      ) from last_exception
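Note: `invoke_skylet_with_retries` defines the retry contract for Skylet gRPC calls: INTERNAL errors surface as `exceptions.SkyletInternalError`, UNAVAILABLE errors trigger a tunnel health check (recreating the SSH tunnel only if its process has died) followed by exponential backoff, and any other status propagates. A sketch of a call site, assuming an existing `CloudVmRayResourceHandle` named `handle`; `SkyletClient` and `get_grpc_channel` come from the cloud_vm_ray_backend.py diff that follows:

    from sky.backends import backend_utils
    from sky.backends.cloud_vm_ray_backend import SkyletClient
    from sky.schemas.generated import autostopv1_pb2

    # `handle` is assumed to be a CloudVmRayResourceHandle for a live cluster.
    # The lambda is re-invoked on each retry, so a fresh channel is created
    # after the SSH tunnel is recreated.
    response = backend_utils.invoke_skylet_with_retries(
        handle,
        lambda: SkyletClient(handle.get_grpc_channel()).is_autostopping(
            autostopv1_pb2.IsAutostoppingRequest()))
    print(response.is_autostopping)
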
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -1,5 +1,6 @@
 """Backend: runs on cloud virtual machines, managed by Ray."""
 import copy
+import dataclasses
 import enum
 import inspect
 import json
@@ -20,6 +21,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
                     Union)
 
 import colorama
+import psutil
 import yaml
 
 import sky
@@ -37,6 +39,7 @@ from sky import resources as resources_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky import task as task_lib
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
 from sky.backends import wheel_utils
 from sky.clouds import cloud as sky_cloud
@@ -76,7 +79,18 @@ from sky.utils import ux_utils
 from sky.utils import volume as volume_lib
 
 if typing.TYPE_CHECKING:
+    import grpc
+
     from sky import dag
+    from sky.schemas.generated import autostopv1_pb2
+    from sky.schemas.generated import autostopv1_pb2_grpc
+else:
+    # To avoid requiring grpcio to be installed on the client side.
+    grpc = adaptors_common.LazyImport('grpc')
+    autostopv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.autostopv1_pb2')
+    autostopv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.autostopv1_pb2_grpc')
 
 Path = str
 
@@ -2206,6 +2220,12 @@ class RetryingVmProvisioner(object):
         return config_dict
 
 
+@dataclasses.dataclass
+class SSHTunnelInfo:
+    port: int
+    pid: int
+
+
 class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     """A pickle-able handle to a cluster created by CloudVmRayBackend.
 
@@ -2225,10 +2245,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     - (optional) Launched resources
     - (optional) Docker user name
     - (optional) If TPU(s) are managed, a path to a deletion script.
+    - (optional) Skylet SSH tunnel info.
     """
     # Bump if any fields get added/removed/changed, and add backward
     # compaitibility logic in __setstate__.
-    _VERSION =
+    _VERSION = 11
 
     def __init__(
             self,
@@ -2261,6 +2282,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.launched_nodes = launched_nodes
         self.launched_resources = launched_resources
         self.docker_user: Optional[str] = None
+        self.is_grpc_enabled = True
+        self.skylet_ssh_tunnel: Optional[SSHTunnelInfo] = None
 
     def __repr__(self):
         return (f'ResourceHandle('
@@ -2276,7 +2299,9 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'\n\tlaunched_resources={self.launched_nodes}x '
                 f'{self.launched_resources}, '
                 f'\n\tdocker_user={self.docker_user},'
-                f'\n\tssh_user={self.ssh_user}'
+                f'\n\tssh_user={self.ssh_user},'
+                f'\n\tis_grpc_enabled={self.is_grpc_enabled},'
+                f'\n\tskylet_ssh_tunnel={self.skylet_ssh_tunnel}')
 
     def get_cluster_name(self):
         return self.cluster_name
@@ -2600,6 +2625,66 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             cluster_config_file)
         self.docker_user = docker_user
 
+    def get_grpc_channel(self) -> 'grpc.Channel':
+        if self.skylet_ssh_tunnel is None:
+            self.open_and_update_skylet_tunnel()
+        assert self.skylet_ssh_tunnel is not None
+        return grpc.insecure_channel(f'localhost:{self.skylet_ssh_tunnel.port}')
+
+    def _cleanup_ssh_tunnel(self, tunnel_info: SSHTunnelInfo) -> None:
+        """Clean up an SSH tunnel by terminating the process."""
+        try:
+            proc = psutil.Process(tunnel_info.pid)
+            if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
+                logger.debug(
+                    f'Terminating SSH tunnel process {tunnel_info.pid}')
+                proc.terminate()
+                try:
+                    proc.wait(timeout=3)
+                except psutil.TimeoutExpired:
+                    proc.kill()
+                    proc.wait(timeout=1)
+        except psutil.NoSuchProcess:
+            pass
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning(
+                f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
+
+    def open_and_update_skylet_tunnel(self) -> None:
+        """Opens an SSH tunnel to the Skylet on the head node,
+        updates the cluster handle, and persists it to the database."""
+        local_port = common_utils.find_free_port(10000)
+        runners = self.get_command_runners()
+        head_runner = runners[0]
+        if isinstance(head_runner, command_runner.SSHCommandRunner):
+            # Disabling ControlMaster makes things easier to reason about
+            # with respect to resource management/ownership,
+            # as killing the process will close the tunnel too.
+            head_runner.disable_control_master = True
+
+        cmd = head_runner.port_forward_command([(local_port,
+                                                 constants.SKYLET_GRPC_PORT)])
+        ssh_tunnel_proc = subprocess.Popen(cmd)
+        tunnel_info = SSHTunnelInfo(port=local_port, pid=ssh_tunnel_proc.pid)
+        try:
+            grpc.channel_ready_future(
+                grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
+                    timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
+            # Clean up existing tunnel before setting up the new one.
+            if self.skylet_ssh_tunnel is not None:
+                self._cleanup_ssh_tunnel(self.skylet_ssh_tunnel)
+            self.skylet_ssh_tunnel = tunnel_info
+            global_user_state.update_cluster_handle(self.cluster_name, self)
+        except grpc.FutureTimeoutError as e:
+            self._cleanup_ssh_tunnel(tunnel_info)
+            logger.warning(
+                f'Skylet gRPC channel for cluster {self.cluster_name} not '
+                f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
+            raise e
+        except Exception as e:
+            self._cleanup_ssh_tunnel(tunnel_info)
+            raise e
+
     @property
     def cluster_yaml(self) -> Optional[str]:
         if self._cluster_yaml is None:
@@ -2697,6 +2782,10 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 os.path.expanduser(state['_cluster_yaml'])):
             state['_cluster_yaml'] = None
 
+        if version < 11:
+            state['is_grpc_enabled'] = False
+            state['skylet_ssh_tunnel'] = None
+
         self.__dict__.update(state)
 
         # Because the update_cluster_ips and update_ssh_ports
@@ -2736,6 +2825,27 @@ class LocalResourcesHandle(CloudVmRayResourceHandle):
         return [command_runner.LocalProcessCommandRunner()]
 
 
+class SkyletClient:
+    """The client to interact with a remote cluster through Skylet."""
+
+    def __init__(self, channel: 'grpc.Channel'):
+        self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
+
+    def set_autostop(
+        self,
+        request: 'autostopv1_pb2.SetAutostopRequest',
+        timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'autostopv1_pb2.SetAutostopResponse':
+        return self._autostop_stub.SetAutostop(request, timeout=timeout)
+
+    def is_autostopping(
+        self,
+        request: 'autostopv1_pb2.IsAutostoppingRequest',
+        timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'autostopv1_pb2.IsAutostoppingResponse':
+        return self._autostop_stub.IsAutostopping(request, timeout=timeout)
+
+
 @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     """Backend: runs on cloud virtual machines, managed by Ray.
@@ -4659,6 +4769,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             logger.debug(f'instance statuses attempt {attempts + 1}')
             node_status_dict = provision_lib.query_instances(
                 repr(cloud),
+                handle.cluster_name,
                 cluster_name_on_cloud,
                 config['provider'],
                 non_terminated_only=False)
@@ -4768,17 +4879,30 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Check if we're stopping spot
         assert (handle.launched_resources is not None and
                 handle.launched_resources.cloud is not None), handle
-
-
-
-
-
-
-
-
-
-
-
+        if handle.is_grpc_enabled:
+            request = autostopv1_pb2.SetAutostopRequest(
+                idle_minutes=idle_minutes_to_autostop,
+                backend=self.NAME,
+                wait_for=wait_for.to_protobuf() if wait_for is not None else
+                autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
+                down=down,
+            )
+            backend_utils.invoke_skylet_with_retries(
+                handle, lambda: SkyletClient(handle.get_grpc_channel()).
+                set_autostop(request))
+        else:
+            logger.info(
+                'Using legacy remote execution for set_autostop on '
+                'cluster %s.', handle.cluster_name)
+            code = autostop_lib.AutostopCodeGen.set_autostop(
+                idle_minutes_to_autostop, self.NAME, wait_for, down)
+            returncode, _, stderr = self.run_on_head(
+                handle, code, require_outputs=True, stream_logs=stream_logs)
+            subprocess_utils.handle_returncode(returncode,
+                                               code,
+                                               'Failed to set autostop',
+                                               stderr=stderr,
+                                               stream_logs=stream_logs)
         global_user_state.set_cluster_autostop_value(
             handle.cluster_name, idle_minutes_to_autostop, down)
 
@@ -4803,18 +4927,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # The head node of the cluster is not UP or in an abnormal state.
             # We cannot check if the cluster is autostopping.
             return False
-
-
-
-
-
-
-
-
-
-
-
-
+        if handle.is_grpc_enabled:
+            request = autostopv1_pb2.IsAutostoppingRequest()
+            response = backend_utils.invoke_skylet_with_retries(
+                handle, lambda: SkyletClient(handle.get_grpc_channel()).
+                is_autostopping(request))
+            return response.is_autostopping
+        else:
+            logger.info(
+                'Using legacy remote execution for is_autostopping on '
+                'cluster %s.', handle.cluster_name)
+            code = autostop_lib.AutostopCodeGen.is_autostopping()
+            returncode, stdout, stderr = self.run_on_head(
+                handle, code, require_outputs=True, stream_logs=stream_logs)
+            if returncode == 0:
+                return message_utils.decode_payload(stdout)
+            logger.debug('Failed to check if cluster is autostopping with '
+                         f'{returncode}: {stdout+stderr}\n'
+                         f'Command: {code}')
+            return False
 
     # TODO(zhwu): Refactor this to a CommandRunner class, so different backends
     # can support its own command runner.
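Note: both `set_autostop` and `is_autostopping` now branch on `handle.is_grpc_enabled`, preferring the Skylet gRPC service and falling back to the legacy `AutostopCodeGen` remote execution for pre-upgrade handles (deserialized with `_VERSION < 11`). A sketch of the request construction on the gRPC path, mirroring the hunk above; the idle-minutes value is illustrative:

    from sky.schemas.generated import autostopv1_pb2

    # 'cloudvmray' is the backend name registered above; 30 is an
    # illustrative idle timeout in minutes.
    request = autostopv1_pb2.SetAutostopRequest(
        idle_minutes=30,
        backend='cloudvmray',
        wait_for=autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
        down=False,
    )
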
sky/catalog/cudo_catalog.py
CHANGED
@@ -4,7 +4,7 @@ import typing
 from typing import Dict, List, Optional, Tuple, Union
 
 from sky.catalog import common
-
+from sky.provision.cudo import cudo_machine_type as cudo_mt
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
sky/catalog/data_fetchers/fetch_nebius.py
CHANGED
@@ -22,6 +22,8 @@ TIMEOUT = 10
 PARENT_ID_TEMPLATE = 'project-{}public-images'
 ACCELERATOR_MANUFACTURER = 'NVIDIA'
 
+VRAM = {'L40S': 49152, 'H100': 81920, 'H200': 144384, 'B200': 184320}
+
 
 @dataclass
 class PresetInfo:
@@ -196,17 +198,18 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
             key=lambda x:
             (bool(x.gpu), x.region, x.platform_name, x.vcpu)):
         gpu_info = ''
-        if preset.gpu > 0:
+        if preset.gpu > 0 and preset.accelerator_name:
             gpu_info_dict = {
                 'Gpus': [{
                     'Name': preset.accelerator_name,
                     'Manufacturer': preset.accelerator_manufacturer,
                     'Count': preset.gpu,
                     'MemoryInfo': {
-                        'SizeInMiB': preset.
+                        'SizeInMiB': VRAM.get(preset.accelerator_name, 0)
                     },
                 }],
-                'TotalGpuMemoryInMiB': preset.
+                'TotalGpuMemoryInMiB': VRAM.get(preset.accelerator_name, 0)
+                                       * preset.gpu,
             }
             gpu_info = json.dumps(gpu_info_dict).replace('"', '\'')