skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff covers the publicly released contents of the two package versions as they appear in their public registry.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +74 -7
- sky/backends/cloud_vm_ray_backend.py +169 -29
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +62 -85
- sky/client/common.py +1 -1
- sky/client/sdk.py +69 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +15 -5
- sky/clouds/nebius.py +3 -1
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +13 -10
- sky/global_user_state.py +191 -8
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +35 -87
- sky/jobs/server/core.py +82 -22
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +7 -5
- sky/jobs/utils.py +167 -8
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +25 -0
- sky/provision/aws/instance.py +37 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/nebius/utils.py +101 -86
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +13 -8
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +6 -7
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +138 -117
- sky/serve/serve_state.py +42 -0
- sky/serve/serve_utils.py +58 -36
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +82 -33
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +64 -16
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +86 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +6 -0
- sky/utils/ux_utils.py +36 -5
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
- sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED

@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '83f7c60583fed1ee759f73a5cf859239f86fb3f9'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250815'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))


@@ -98,6 +98,7 @@ from sky.client.sdk import cancel
 from sky.client.sdk import cost_report
 from sky.client.sdk import down
 from sky.client.sdk import download_logs
+from sky.client.sdk import endpoints
 from sky.client.sdk import exec  # pylint: disable=redefined-builtin
 from sky.client.sdk import get
 from sky.client.sdk import job_status
@@ -194,6 +195,7 @@ __all__ = [
     'down',
     'autostop',
     'cost_report',
+    'endpoints',
     # core APIs Job Management
     'queue',
     'cancel',
sky/adaptors/nebius.py
CHANGED

@@ -1,7 +1,8 @@
 """Nebius cloud adaptor."""
+import asyncio
 import os
 import threading
-from typing import List, Optional
+from typing import Any, Awaitable, List, Optional

 from sky import sky_logging
 from sky import skypilot_config
@@ -9,8 +10,49 @@ from sky.adaptors import common
 from sky.utils import annotations
 from sky.utils import ux_utils

+# Default read timeout for nebius SDK
+READ_TIMEOUT = 10
+
 logger = sky_logging.init_logger(__name__)

+_loop_lock = threading.Lock()
+_loop = None
+
+
+def _get_event_loop() -> asyncio.AbstractEventLoop:
+    """Get event loop for nebius sdk."""
+    global _loop
+
+    if _loop is not None:
+        return _loop
+
+    with _loop_lock:
+        if _loop is None:
+            # Create a new event loop in a dedicated thread
+            _loop = asyncio.new_event_loop()
+            threading.Thread(target=_loop.run_forever, daemon=True).start()
+
+    return _loop
+
+
+def sync_call(awaitable: Awaitable[Any]) -> Any:
+    """Synchronously run an awaitable in coroutine.
+
+    This wrapper is used to workaround:
+    https://github.com/nebius/pysdk/issues/76
+
+    Uses a dedicated background event loop to avoid conflicts
+    with existing asyncio contexts and prevent BlockingIOError.
+    """
+    loop = _get_event_loop()
+    future = asyncio.run_coroutine_threadsafe(_coro(awaitable), loop)
+    return future.result()
+
+
+async def _coro(awaitable: Awaitable[Any]) -> Any:
+    """Wrapper coroutine for awaitable."""
+    return await awaitable
+

 def tenant_id_path() -> str:
     return '~/.nebius/NEBIUS_TENANT_ID.txt'
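A self-contained sketch of the dedicated-background-loop pattern that `sync_call` relies on: one daemon thread runs a private event loop, and synchronous callers submit coroutines to it with `run_coroutine_threadsafe`. The demo coroutine is hypothetical; only the pattern mirrors the adaptor code above.

import asyncio
import threading

# One private loop, driven forever by a daemon thread.
_loop = asyncio.new_event_loop()
threading.Thread(target=_loop.run_forever, daemon=True).start()


def sync_call(coro):
    """Block the calling (synchronous) thread until `coro` finishes."""
    return asyncio.run_coroutine_threadsafe(coro, _loop).result()


async def _demo(x: int) -> int:
    await asyncio.sleep(0.01)
    return x * 2


if __name__ == '__main__':
    print(sync_call(_demo(21)))  # -> 42

This avoids calling asyncio.run() from code that may already be inside a running event loop, which is the conflict the adaptor works around.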
sky/backends/backend_utils.py
CHANGED

@@ -13,11 +13,13 @@ import sys
 import tempfile
 import time
 import typing
-from typing import Any, Dict, List, Optional, Sequence, Set, Tuple,
+from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
+                    TypeVar, Union)
 import uuid

 import colorama
 from packaging import version
+import psutil
 from typing_extensions import Literal

 import sky
@@ -61,6 +63,7 @@ from sky.utils import ux_utils
 from sky.workspaces import core as workspaces_core

 if typing.TYPE_CHECKING:
+    import grpc
     import requests
     from requests import adapters
     from requests.packages.urllib3.util import retry as retry_lib
@@ -79,6 +82,8 @@ else:
     adapters = adaptors_common.LazyImport('requests.adapters')
     retry_lib = adaptors_common.LazyImport(
         'requests.packages.urllib3.util.retry')
+    # To avoid requiring grpcio to be installed on the client side.
+    grpc = adaptors_common.LazyImport('grpc')

 logger = sky_logging.init_logger(__name__)

@@ -1781,6 +1786,7 @@ def _query_cluster_status_via_cloud_api(
         exceptions.ClusterStatusFetchingError: the cluster status cannot be
             fetched from the cloud provider.
     """
+    cluster_name = handle.cluster_name
     cluster_name_on_cloud = handle.cluster_name_on_cloud
     cluster_name_in_hint = common_utils.cluster_name_in_hint(
         handle.cluster_name, cluster_name_on_cloud)
@@ -1798,7 +1804,8 @@ def _query_cluster_status_via_cloud_api(
     cloud_name = repr(handle.launched_resources.cloud)
     try:
         node_status_dict = provision_lib.query_instances(
-            cloud_name, cluster_name_on_cloud,
+            cloud_name, cluster_name, cluster_name_on_cloud,
+            provider_config)
         logger.debug(f'Querying {cloud_name} cluster '
                      f'{cluster_name_in_hint} '
                      f'status:\n{pprint.pformat(node_status_dict)}')
@@ -2130,7 +2137,7 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         global_user_state.add_cluster_event(
             cluster_name,
             status_lib.ClusterStatus.UP,
-            'All nodes up
+            'All nodes up; SkyPilot runtime healthy.',
             global_user_state.ClusterEventType.STATUS_CHANGE,
             nop_if_duplicate=True)
         global_user_state.add_or_update_cluster(cluster_name,
@@ -2227,9 +2234,9 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         [status[1] for status in node_statuses if status[1] is not None])

     if some_nodes_terminated:
-        init_reason =
+        init_reason = 'one or more nodes terminated'
     elif some_nodes_not_stopped:
-        init_reason =
+        init_reason = 'some nodes are up and some nodes are stopped'
     logger.debug('The cluster is abnormal. Setting to INIT status. '
                  f'node_statuses: {node_statuses}')
     if record['autostop'] >= 0:
@@ -2313,12 +2320,26 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         # represent that the cluster is partially preempted.
         # TODO(zhwu): the definition of INIT should be audited/changed.
         # Adding a new status UNHEALTHY for abnormal status can be a choice.
+        init_reason_regex = None
+        if not status_reason:
+            # If there is not a status reason, don't re-add (and overwrite) the
+            # event if there is already an event with the same reason which may
+            # have a status reason.
+            # Some status reason clears after a certain time (e.g. k8s events
+            # are only stored for an hour by default), so it is possible that
+            # the previous event has a status reason, but now it does not.
+            init_reason_regex = f'^Cluster is abnormal because {init_reason} .*'
+        log_message = f'Cluster is abnormal because {init_reason}'
+        if status_reason:
+            log_message += f' ({status_reason})'
+        log_message += '. Transitioned to INIT.'
         global_user_state.add_cluster_event(
             cluster_name,
             status_lib.ClusterStatus.INIT,
-…
+            log_message,
             global_user_state.ClusterEventType.STATUS_CHANGE,
-            nop_if_duplicate=True
+            nop_if_duplicate=True,
+            duplicate_regex=init_reason_regex)
         global_user_state.add_or_update_cluster(cluster_name,
                                                 handle,
                                                 requested_resources=None,
@@ -3361,3 +3382,49 @@ def cluster_file_mounts_lock_id(cluster_name: str) -> str:
 def workspace_lock_id(workspace_name: str) -> str:
     """Get the lock ID for workspace operations."""
     return f'{workspace_name}_workspace'
+
+
+T = TypeVar('T')
+
+
+def invoke_skylet_with_retries(
+        handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
+        func: Callable[..., T]) -> T:
+    """Generic helper for making Skylet gRPC requests.
+
+    This method handles the common pattern of:
+    1. Try the gRPC request
+    2. If SSH tunnel is closed, recreate it and retry
+    """
+    max_attempts = 3
+    backoff = common_utils.Backoff(initial_backoff=0.5)
+    last_exception: Optional[Exception] = None
+
+    for _ in range(max_attempts):
+        try:
+            return func()
+        except grpc.RpcError as e:
+            last_exception = e
+            if e.code() == grpc.StatusCode.INTERNAL:
+                with ux_utils.print_exception_no_traceback():
+                    raise exceptions.SkyletInternalError(e.details())
+            elif e.code() == grpc.StatusCode.UNAVAILABLE:
+                recreate_tunnel = True
+                try:
+                    if handle.skylet_ssh_tunnel is not None:
+                        proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
+                        if proc.is_running(
+                        ) and proc.status() != psutil.STATUS_ZOMBIE:
+                            recreate_tunnel = False
+                except psutil.NoSuchProcess:
+                    pass
+
+                if recreate_tunnel:
+                    handle.open_and_update_skylet_tunnel()
+
+                time.sleep(backoff.current_backoff())
+            else:
+                raise e
+
+    raise RuntimeError(f'Failed to invoke Skylet after {max_attempts} attempts'
+                      ) from last_exception
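A generic sketch of the retry-and-repair pattern that `invoke_skylet_with_retries` implements: retry the call with exponential backoff and run a repair step (re-opening the SSH tunnel, in the code above) before the next attempt. Here a plain `ConnectionError` stands in for `grpc.StatusCode.UNAVAILABLE`; none of these names come from the package.

import time
from typing import Callable, Optional, TypeVar

T = TypeVar('T')


def retry_with_backoff(func: Callable[[], T],
                       repair: Callable[[], None],
                       max_attempts: int = 3,
                       initial_backoff: float = 0.5) -> T:
    """Retry func(), repairing the transport between attempts."""
    delay = initial_backoff
    last_exc: Optional[Exception] = None
    for _ in range(max_attempts):
        try:
            return func()
        except ConnectionError as e:  # stand-in for a retriable RPC error
            last_exc = e
            repair()                  # e.g. reopen a dead tunnel
            time.sleep(delay)
            delay *= 2                # exponential backoff
    raise RuntimeError(f'Failed after {max_attempts} attempts') from last_exc


if __name__ == '__main__':
    calls = {'n': 0}

    def flaky() -> str:
        calls['n'] += 1
        if calls['n'] < 3:
            raise ConnectionError('transport down')
        return 'ok'

    print(retry_with_backoff(flaky, repair=lambda: None, initial_backoff=0.01))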
sky/backends/cloud_vm_ray_backend.py
CHANGED

@@ -1,5 +1,6 @@
 """Backend: runs on cloud virtual machines, managed by Ray."""
 import copy
+import dataclasses
 import enum
 import inspect
 import json
@@ -20,6 +21,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
                     Union)

 import colorama
+import psutil
 import yaml

 import sky
@@ -37,6 +39,7 @@ from sky import resources as resources_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky import task as task_lib
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
 from sky.backends import wheel_utils
 from sky.clouds import cloud as sky_cloud
@@ -76,7 +79,18 @@ from sky.utils import ux_utils
 from sky.utils import volume as volume_lib

 if typing.TYPE_CHECKING:
+    import grpc
+
     from sky import dag
+    from sky.schemas.generated import autostopv1_pb2
+    from sky.schemas.generated import autostopv1_pb2_grpc
+else:
+    # To avoid requiring grpcio to be installed on the client side.
+    grpc = adaptors_common.LazyImport('grpc')
+    autostopv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.autostopv1_pb2')
+    autostopv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.autostopv1_pb2_grpc')

 Path = str

@@ -1354,8 +1368,11 @@ class RetryingVmProvisioner(object):
         if not dryrun:
             os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
             os.system(f'touch {log_path}')
+
         rich_utils.force_update_status(
-            ux_utils.spinner_message('Launching',
+            ux_utils.spinner_message('Launching',
+                                     log_path,
+                                     cluster_name=cluster_name))

         # Get previous cluster status
         cluster_exists = prev_cluster_status is not None
@@ -1525,6 +1542,7 @@ class RetryingVmProvisioner(object):
             requested_resources=requested_resources,
             ready=False,
             is_managed=self._is_managed,
+            provision_log_path=log_abs_path,
         )

         # Add cluster event for actual provisioning start.
@@ -1670,7 +1688,9 @@ class RetryingVmProvisioner(object):
                 config_dict['handle'] = handle
                 logger.info(
                     ux_utils.finishing_message(
-                        f'Cluster launched: {cluster_name!r}.',
+                        f'Cluster launched: {cluster_name!r}.',
+                        log_path,
+                        cluster_name=cluster_name))
                 return config_dict

             # The cluster is not ready. We must perform error recording and/or
@@ -1804,7 +1824,8 @@ class RetryingVmProvisioner(object):
             log_abs_path,
             stream_logs=False,
             start_streaming_at='Shared connection to',
-            line_processor=log_utils.RayUpLineProcessor(
+            line_processor=log_utils.RayUpLineProcessor(
+                log_abs_path, cluster_name=cluster_handle.cluster_name),
             # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
             # time during 'ray up' if insufficient capacity occurs.
             env=dict(
@@ -2206,6 +2227,12 @@ class RetryingVmProvisioner(object):
         return config_dict


+@dataclasses.dataclass
+class SSHTunnelInfo:
+    port: int
+    pid: int
+
+
 class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     """A pickle-able handle to a cluster created by CloudVmRayBackend.

@@ -2225,10 +2252,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     - (optional) Launched resources
     - (optional) Docker user name
     - (optional) If TPU(s) are managed, a path to a deletion script.
+    - (optional) Skylet SSH tunnel info.
     """
     # Bump if any fields get added/removed/changed, and add backward
     # compaitibility logic in __setstate__.
-    _VERSION =
+    _VERSION = 11

     def __init__(
             self,
@@ -2261,6 +2289,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.launched_nodes = launched_nodes
         self.launched_resources = launched_resources
         self.docker_user: Optional[str] = None
+        self.is_grpc_enabled = True
+        self.skylet_ssh_tunnel: Optional[SSHTunnelInfo] = None

     def __repr__(self):
         return (f'ResourceHandle('
@@ -2276,7 +2306,9 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'\n\tlaunched_resources={self.launched_nodes}x '
                 f'{self.launched_resources}, '
                 f'\n\tdocker_user={self.docker_user},'
-                f'\n\tssh_user={self.ssh_user}'
+                f'\n\tssh_user={self.ssh_user},'
+                f'\n\tis_grpc_enabled={self.is_grpc_enabled},'
+                f'\n\tskylet_ssh_tunnel={self.skylet_ssh_tunnel}')

     def get_cluster_name(self):
         return self.cluster_name
@@ -2600,6 +2632,66 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                                     cluster_config_file)
         self.docker_user = docker_user

+    def get_grpc_channel(self) -> 'grpc.Channel':
+        if self.skylet_ssh_tunnel is None:
+            self.open_and_update_skylet_tunnel()
+        assert self.skylet_ssh_tunnel is not None
+        return grpc.insecure_channel(f'localhost:{self.skylet_ssh_tunnel.port}')
+
+    def _cleanup_ssh_tunnel(self, tunnel_info: SSHTunnelInfo) -> None:
+        """Clean up an SSH tunnel by terminating the process."""
+        try:
+            proc = psutil.Process(tunnel_info.pid)
+            if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
+                logger.debug(
+                    f'Terminating SSH tunnel process {tunnel_info.pid}')
+                proc.terminate()
+                try:
+                    proc.wait(timeout=3)
+                except psutil.TimeoutExpired:
+                    proc.kill()
+                    proc.wait(timeout=1)
+        except psutil.NoSuchProcess:
+            pass
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning(
+                f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
+
+    def open_and_update_skylet_tunnel(self) -> None:
+        """Opens an SSH tunnel to the Skylet on the head node,
+        updates the cluster handle, and persists it to the database."""
+        local_port = common_utils.find_free_port(10000)
+        runners = self.get_command_runners()
+        head_runner = runners[0]
+        if isinstance(head_runner, command_runner.SSHCommandRunner):
+            # Disabling ControlMaster makes things easier to reason about
+            # with respect to resource management/ownership,
+            # as killing the process will close the tunnel too.
+            head_runner.disable_control_master = True
+
+        cmd = head_runner.port_forward_command([(local_port,
+                                                 constants.SKYLET_GRPC_PORT)])
+        ssh_tunnel_proc = subprocess.Popen(cmd)
+        tunnel_info = SSHTunnelInfo(port=local_port, pid=ssh_tunnel_proc.pid)
+        try:
+            grpc.channel_ready_future(
+                grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
+                    timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
+            # Clean up existing tunnel before setting up the new one.
+            if self.skylet_ssh_tunnel is not None:
+                self._cleanup_ssh_tunnel(self.skylet_ssh_tunnel)
+            self.skylet_ssh_tunnel = tunnel_info
+            global_user_state.update_cluster_handle(self.cluster_name, self)
+        except grpc.FutureTimeoutError as e:
+            self._cleanup_ssh_tunnel(tunnel_info)
+            logger.warning(
+                f'Skylet gRPC channel for cluster {self.cluster_name} not '
+                f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
+            raise e
+        except Exception as e:
+            self._cleanup_ssh_tunnel(tunnel_info)
+            raise e
+
     @property
     def cluster_yaml(self) -> Optional[str]:
         if self._cluster_yaml is None:
@@ -2697,6 +2789,10 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                     os.path.expanduser(state['_cluster_yaml'])):
                 state['_cluster_yaml'] = None

+        if version < 11:
+            state['is_grpc_enabled'] = False
+            state['skylet_ssh_tunnel'] = None
+
         self.__dict__.update(state)

         # Because the update_cluster_ips and update_ssh_ports
@@ -2736,6 +2832,27 @@ class LocalResourcesHandle(CloudVmRayResourceHandle):
         return [command_runner.LocalProcessCommandRunner()]


+class SkyletClient:
+    """The client to interact with a remote cluster through Skylet."""
+
+    def __init__(self, channel: 'grpc.Channel'):
+        self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
+
+    def set_autostop(
+        self,
+        request: 'autostopv1_pb2.SetAutostopRequest',
+        timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'autostopv1_pb2.SetAutostopResponse':
+        return self._autostop_stub.SetAutostop(request, timeout=timeout)
+
+    def is_autostopping(
+        self,
+        request: 'autostopv1_pb2.IsAutostoppingRequest',
+        timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'autostopv1_pb2.IsAutostoppingResponse':
+        return self._autostop_stub.IsAutostopping(request, timeout=timeout)
+
+
 @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     """Backend: runs on cloud virtual machines, managed by Ray.
@@ -3010,7 +3127,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             is_managed=self._is_managed)
         log_path = os.path.join(self.log_dir, 'provision.log')
         rich_utils.force_update_status(
-            ux_utils.spinner_message('Launching',
+            ux_utils.spinner_message('Launching',
+                                     log_path,
+                                     cluster_name=cluster_name))
         config_dict = retry_provisioner.provision_with_retries(
             task, to_provision_config, dryrun, stream_logs,
             skip_unnecessary_provisioning)
@@ -4659,6 +4778,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             logger.debug(f'instance statuses attempt {attempts + 1}')
             node_status_dict = provision_lib.query_instances(
                 repr(cloud),
+                handle.cluster_name,
                 cluster_name_on_cloud,
                 config['provider'],
                 non_terminated_only=False)
@@ -4768,17 +4888,30 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Check if we're stopping spot
         assert (handle.launched_resources is not None and
                 handle.launched_resources.cloud is not None), handle
-…
+        if handle.is_grpc_enabled:
+            request = autostopv1_pb2.SetAutostopRequest(
+                idle_minutes=idle_minutes_to_autostop,
+                backend=self.NAME,
+                wait_for=wait_for.to_protobuf() if wait_for is not None else
+                autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
+                down=down,
+            )
+            backend_utils.invoke_skylet_with_retries(
+                handle, lambda: SkyletClient(handle.get_grpc_channel()).
+                set_autostop(request))
+        else:
+            logger.info(
+                'Using legacy remote execution for set_autostop on '
+                'cluster %s.', handle.cluster_name)
+            code = autostop_lib.AutostopCodeGen.set_autostop(
+                idle_minutes_to_autostop, self.NAME, wait_for, down)
+            returncode, _, stderr = self.run_on_head(
+                handle, code, require_outputs=True, stream_logs=stream_logs)
+            subprocess_utils.handle_returncode(returncode,
+                                               code,
+                                               'Failed to set autostop',
+                                               stderr=stderr,
+                                               stream_logs=stream_logs)
         global_user_state.set_cluster_autostop_value(
             handle.cluster_name, idle_minutes_to_autostop, down)

@@ -4803,18 +4936,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # The head node of the cluster is not UP or in an abnormal state.
             # We cannot check if the cluster is autostopping.
             return False
-…
+        if handle.is_grpc_enabled:
+            request = autostopv1_pb2.IsAutostoppingRequest()
+            response = backend_utils.invoke_skylet_with_retries(
+                handle, lambda: SkyletClient(handle.get_grpc_channel()).
+                is_autostopping(request))
+            return response.is_autostopping
+        else:
+            logger.info(
+                'Using legacy remote execution for is_autostopping on '
+                'cluster %s.', handle.cluster_name)
+            code = autostop_lib.AutostopCodeGen.is_autostopping()
+            returncode, stdout, stderr = self.run_on_head(
+                handle, code, require_outputs=True, stream_logs=stream_logs)
+            if returncode == 0:
+                return message_utils.decode_payload(stdout)
+            logger.debug('Failed to check if cluster is autostopping with '
+                         f'{returncode}: {stdout+stderr}\n'
+                         f'Command: {code}')
+            return False

     # TODO(zhwu): Refactor this to a CommandRunner class, so different backends
     # can support its own command runner.
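A hedged, self-contained sketch of the tunnel-then-dial pattern used by `open_and_update_skylet_tunnel` above: pick a free local port, start an SSH port-forward as a child process, then block until a gRPC channel through the tunnel becomes ready. The host, ports, and SSH options are placeholders, not values from the package (requires `grpcio` and an `ssh` binary).

import socket
import subprocess

import grpc


def find_free_port() -> int:
    with socket.socket() as s:
        s.bind(('localhost', 0))
        return s.getsockname()[1]


def open_tunnel_and_dial(remote_host: str,
                         remote_grpc_port: int,
                         timeout: float = 10.0) -> grpc.Channel:
    local_port = find_free_port()
    # -N: no remote command; -L: forward local_port to the remote gRPC port.
    proc = subprocess.Popen([
        'ssh', '-N', '-o', 'ControlMaster=no',
        '-L', f'{local_port}:localhost:{remote_grpc_port}', remote_host
    ])
    channel = grpc.insecure_channel(f'localhost:{local_port}')
    try:
        grpc.channel_ready_future(channel).result(timeout=timeout)
    except grpc.FutureTimeoutError:
        proc.terminate()  # tear the tunnel down if the server never answers
        raise
    return channel

Disabling ControlMaster, as the diff's comment notes, keeps tunnel ownership simple: killing the child process is enough to close the forward.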
sky/catalog/cudo_catalog.py
CHANGED

@@ -4,7 +4,7 @@ import typing
 from typing import Dict, List, Optional, Tuple, Union

 from sky.catalog import common
-…
+from sky.provision.cudo import cudo_machine_type as cudo_mt
 from sky.utils import ux_utils

 if typing.TYPE_CHECKING:
@@ -22,6 +22,8 @@ TIMEOUT = 10
 PARENT_ID_TEMPLATE = 'project-{}public-images'
 ACCELERATOR_MANUFACTURER = 'NVIDIA'

+VRAM = {'L40S': 49152, 'H100': 81920, 'H200': 144384, 'B200': 184320}
+

 @dataclass
 class PresetInfo:
@@ -196,17 +198,18 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
             key=lambda x:
             (bool(x.gpu), x.region, x.platform_name, x.vcpu)):
         gpu_info = ''
-        if preset.gpu > 0:
+        if preset.gpu > 0 and preset.accelerator_name:
             gpu_info_dict = {
                 'Gpus': [{
                     'Name': preset.accelerator_name,
                     'Manufacturer': preset.accelerator_manufacturer,
                     'Count': preset.gpu,
                     'MemoryInfo': {
-                        'SizeInMiB': preset.
+                        'SizeInMiB': VRAM.get(preset.accelerator_name, 0)
                     },
                 }],
-                'TotalGpuMemoryInMiB': preset.
+                'TotalGpuMemoryInMiB': VRAM.get(preset.accelerator_name, 0)
+                                       * preset.gpu,
             }
             gpu_info = json.dumps(gpu_info_dict).replace('"', '\'')
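The `VRAM` table above maps accelerator names to per-GPU memory in MiB. A small illustrative sketch of how such values roll up into the catalog's `MemoryInfo` / `TotalGpuMemoryInMiB` fields (the helper is a sketch, not the fetcher's own code):

VRAM = {'L40S': 49152, 'H100': 81920, 'H200': 144384, 'B200': 184320}


def gpu_info(accelerator_name: str, count: int) -> dict:
    per_gpu_mib = VRAM.get(accelerator_name, 0)
    return {
        'Gpus': [{
            'Name': accelerator_name,
            'Manufacturer': 'NVIDIA',
            'Count': count,
            'MemoryInfo': {'SizeInMiB': per_gpu_mib},
        }],
        'TotalGpuMemoryInMiB': per_gpu_mib * count,
    }


if __name__ == '__main__':
    # 8x H100 -> 81920 MiB each, 655360 MiB in total.
    print(gpu_info('H100', 8)['TotalGpuMemoryInMiB'])  # 655360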