skypilot-nightly 1.0.0.dev20250718__py3-none-any.whl → 1.0.0.dev20250723__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/admin_policy.py +11 -4
- sky/backends/backend_utils.py +50 -24
- sky/backends/cloud_vm_ray_backend.py +41 -38
- sky/catalog/__init__.py +3 -1
- sky/catalog/aws_catalog.py +8 -5
- sky/catalog/azure_catalog.py +8 -5
- sky/catalog/common.py +8 -2
- sky/catalog/cudo_catalog.py +5 -2
- sky/catalog/do_catalog.py +4 -1
- sky/catalog/fluidstack_catalog.py +5 -2
- sky/catalog/gcp_catalog.py +8 -5
- sky/catalog/hyperbolic_catalog.py +5 -2
- sky/catalog/ibm_catalog.py +8 -5
- sky/catalog/lambda_catalog.py +8 -5
- sky/catalog/nebius_catalog.py +8 -5
- sky/catalog/oci_catalog.py +8 -5
- sky/catalog/paperspace_catalog.py +4 -1
- sky/catalog/runpod_catalog.py +5 -2
- sky/catalog/scp_catalog.py +8 -5
- sky/catalog/vast_catalog.py +5 -2
- sky/catalog/vsphere_catalog.py +4 -1
- sky/client/cli/command.py +63 -25
- sky/client/sdk.py +61 -11
- sky/clouds/aws.py +12 -7
- sky/clouds/azure.py +12 -7
- sky/clouds/cloud.py +9 -8
- sky/clouds/cudo.py +13 -7
- sky/clouds/do.py +12 -7
- sky/clouds/fluidstack.py +11 -6
- sky/clouds/gcp.py +12 -7
- sky/clouds/hyperbolic.py +11 -6
- sky/clouds/ibm.py +11 -6
- sky/clouds/kubernetes.py +7 -3
- sky/clouds/lambda_cloud.py +11 -6
- sky/clouds/nebius.py +14 -12
- sky/clouds/oci.py +12 -7
- sky/clouds/paperspace.py +12 -7
- sky/clouds/runpod.py +12 -7
- sky/clouds/scp.py +11 -6
- sky/clouds/vast.py +14 -8
- sky/clouds/vsphere.py +11 -6
- sky/core.py +6 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{1043-734e57d2b27dfe5d.js → 1043-869d9c78bf5dd3df.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1141-d8c6404a7c6fffe6.js → 1141-e49a159c30a6c4a7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +30 -0
- sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +6 -0
- sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +15 -0
- sky/dashboard/out/_next/static/chunks/{2641.35edc9ccaeaad9e3.js → 2641.74c19c4d45a2c034.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{4725.4c849b1e05c8e9ad.js → 4725.66125dcd9832aa5d.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +16 -0
- sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +15 -0
- sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +1 -0
- sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +55 -0
- sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +1 -0
- sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +41 -0
- sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +6 -0
- sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +1 -0
- sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +30 -0
- sky/dashboard/out/_next/static/chunks/{9984.2b5e3fa69171bff9.js → 9984.0460de9d3adf5582.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa406155b4223d0d.js → [job]-2186770cc2de1623.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0c37ee1ac5f3474d.js → [cluster]-95afb019ab85801c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-c5b357bfd9502fbe.js → [job]-dc0299ffefebcdbe.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-19e98664bdd61643.js → users-6790fcefd5487b13.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +1 -0
- sky/dashboard/out/_next/static/css/b3227360726f12eb.css +3 -0
- sky/dashboard/out/_next/static/mym3Ciwp-zqU7ZpOLGnrW/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +93 -32
- sky/global_user_state.py +12 -143
- sky/jobs/state.py +9 -88
- sky/jobs/utils.py +28 -13
- sky/provision/nebius/utils.py +3 -6
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/serve/client/sdk.py +6 -2
- sky/serve/controller.py +7 -3
- sky/serve/serve_state.py +1 -1
- sky/serve/serve_utils.py +171 -75
- sky/serve/server/core.py +17 -6
- sky/server/common.py +4 -3
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/requests.py +1 -1
- sky/setup_files/MANIFEST.in +2 -0
- sky/setup_files/alembic.ini +148 -0
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/configs.py +1 -1
- sky/skylet/constants.py +4 -0
- sky/skylet/job_lib.py +1 -1
- sky/skypilot_config.py +1 -1
- sky/users/permission.py +1 -1
- sky/utils/common_utils.py +85 -3
- sky/utils/config_utils.py +15 -0
- sky/utils/db/__init__.py +0 -0
- sky/utils/{db_utils.py → db/db_utils.py} +59 -0
- sky/utils/db/migration_utils.py +93 -0
- sky/utils/locks.py +319 -0
- sky/utils/schemas.py +38 -34
- sky/utils/timeline.py +41 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/RECORD +134 -125
- sky/dashboard/out/_next/static/FUjweqdImyeYhMYFON-Se/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1746.27d40aedc22bd2d6.js +0 -60
- sky/dashboard/out/_next/static/chunks/1871-76491ac174a95278.js +0 -6
- sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +0 -1
- sky/dashboard/out/_next/static/chunks/2875.c24c6d57dc82e436.js +0 -25
- sky/dashboard/out/_next/static/chunks/3785.95b94f18aaec7233.js +0 -1
- sky/dashboard/out/_next/static/chunks/3947-b059261d6fa88a1f.js +0 -35
- sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +0 -16
- sky/dashboard/out/_next/static/chunks/5491.918ffed0ba7a5294.js +0 -20
- sky/dashboard/out/_next/static/chunks/6990-dcb411b566e64cde.js +0 -1
- sky/dashboard/out/_next/static/chunks/804-9f5e98ce84d46bdd.js +0 -21
- sky/dashboard/out/_next/static/chunks/9025.133e9ba5c780afeb.js +0 -6
- sky/dashboard/out/_next/static/chunks/938-6a9ffdaa21eee969.js +0 -1
- sky/dashboard/out/_next/static/chunks/9470-b6f6a35283863a6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.46e613d000c55859.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-771a40cde532309b.js +0 -20
- sky/dashboard/out/_next/static/chunks/pages/clusters-102d169e87913ba1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-a1e43d9ef51a9cea.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-6b0575ea521af4f3.js +0 -1
- sky/dashboard/out/_next/static/css/219887b94512388c.css +0 -3
- /sky/dashboard/out/_next/static/{FUjweqdImyeYhMYFON-Se → mym3Ciwp-zqU7ZpOLGnrW}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '874bc28c3a4b7322d30cfc544b257647379b59ed'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250718'
+__version__ = '1.0.0.dev20250723'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -104,6 +104,7 @@ from sky.client.sdk import job_status
 from sky.client.sdk import launch
 from sky.client.sdk import optimize
 from sky.client.sdk import queue
+from sky.client.sdk import reload_config
 from sky.client.sdk import start
 from sky.client.sdk import status
 from sky.client.sdk import stop
@@ -185,6 +186,7 @@ __all__ = [
     'optimize',
     'launch',
     'exec',
+    'reload_config',
     # core APIs
     'status',
     'start',
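
Taken together, these hunks bump the embedded version/commit metadata and re-export the SDK's reload_config through the top-level package. A minimal sanity check against the new wheel (a sketch; it only touches names visible in this diff):

import sky

print(sky.__version__)  # '1.0.0.dev20250723'
# When built from the wheel, _get_git_commit() falls back to the baked-in SHA.
print(sky.__commit__)
assert 'reload_config' in sky.__all__  # newly exported in this release
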
sky/admin_policy.py
CHANGED
@@ -121,11 +121,17 @@ class MutatedUserRequest:
                 dict(self.skypilot_config),)).model_dump_json()
 
     @classmethod
-    def decode(cls, mutated_user_request_body: str
+    def decode(cls, mutated_user_request_body: str,
+               original_request: UserRequest) -> 'MutatedUserRequest':
         mutated_user_request_body = _MutatedUserRequestBody.model_validate_json(
             mutated_user_request_body)
-
-            common_utils.read_yaml_all_str(mutated_user_request_body.task)[0])
+        task = sky.Task.from_yaml_config(
+            common_utils.read_yaml_all_str(mutated_user_request_body.task)[0])
+        # Some internal Task fields are not serialized. We need to manually
+        # restore them from the original request.
+        task.managed_job_dag = original_request.task.managed_job_dag
+        task.service_name = original_request.task.service_name
+        return cls(task=task,
                    skypilot_config=config_utils.Config.from_dict(
                        common_utils.read_yaml_all_str(
                            mutated_user_request_body.skypilot_config)[0],))
@@ -243,7 +249,8 @@ class RestfulAdminPolicy(PolicyTemplate):
                     f'{self.policy_url}: {e}') from None
 
             try:
-                mutated_user_request = MutatedUserRequest.decode(
+                mutated_user_request = MutatedUserRequest.decode(
+                    response.json(), user_request)
             except Exception as e:  # pylint: disable=broad-except
                 with ux_utils.print_exception_no_traceback():
                     raise exceptions.RestfulPolicyError(
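
The decode() change exists because round-tripping a Task through YAML drops fields that are not part of the task spec. A sketch of the new call pattern (names taken from the hunks above; the surrounding policy plumbing is omitted):

# Inside RestfulAdminPolicy, after POSTing the encoded request:
mutated = MutatedUserRequest.decode(response.json(), user_request)

# Fields that YAML serialization cannot carry are copied back from the
# original in-memory request, so downstream consumers still see them.
assert mutated.task.managed_job_dag is user_request.task.managed_job_dag
assert mutated.task.service_name == user_request.task.service_name
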
sky/backends/backend_utils.py
CHANGED
@@ -17,7 +17,6 @@ from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
 import uuid
 
 import colorama
-import filelock
 from packaging import version
 from typing_extensions import Literal
 
@@ -45,6 +44,7 @@ from sky.utils import common_utils
 from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import env_options
+from sky.utils import locks
 from sky.utils import registry
 from sky.utils import resources_utils
 from sky.utils import rich_utils
@@ -104,23 +104,18 @@ WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
 # Fixed IP addresses are used to avoid DNS lookup blocking the check, for
 # machine with no internet connection.
 # Refer to: https://stackoverflow.com/questions/3764291/how-can-i-see-if-theres-an-available-and-active-network-connection-in-python  # pylint: disable=line-too-long
-_TEST_IP_LIST = ['https://
+_TEST_IP_LIST = ['https://8.8.8.8', 'https://1.1.1.1']
 
 # Allow each CPU thread take 2 tasks.
 # Note: This value cannot be too small, otherwise OOM issue may occur.
 DEFAULT_TASK_CPU_DEMAND = 0.5
 
-# Filelocks for the cluster status change.
-CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
 CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
 
 # Time that must elapse since the last status check before we should re-check if
 # the cluster has been terminated or autostopped.
 _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
 
-# Filelocks for updating cluster's file_mounts.
-CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
-    '~/.sky/.{}_file_mounts.lock')
 CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
 
 # Remote dir that holds our runtime files.
@@ -1635,18 +1630,28 @@ def get_node_ips(cluster_yaml: str,
 
 def check_network_connection():
     # Tolerate 3 retries as it is observed that connections can fail.
-    adapter = adapters.HTTPAdapter(max_retries=retry_lib.Retry(total=3))
     http = requests.Session()
-    http.mount('https://',
-    http.mount('http://',
-
-
-
-
-
-
-
-
+    http.mount('https://', adapters.HTTPAdapter())
+    http.mount('http://', adapters.HTTPAdapter())
+
+    # Alternate between IPs on each retry
+    max_retries = 3
+    timeout = 0.5
+
+    for _ in range(max_retries):
+        for ip in _TEST_IP_LIST:
+            try:
+                http.head(ip, timeout=timeout)
+                return
+            except (requests.Timeout, requests.exceptions.ConnectionError):
+                continue
+
+        timeout *= 2  # Double the timeout for next retry
+
+    # If we get here, all IPs failed
+    # Assume network connection is down
+    raise exceptions.NetworkError('Could not refresh the cluster. '
+                                  'Network seems down.')
 
 
 @timeline.event
@@ -1995,9 +2000,20 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
 
     total_nodes = handle.launched_nodes * handle.num_ips_per_node
 
+    cloud_name = repr(handle.launched_resources.cloud).lower()
     for i in range(5):
-
-
+        try:
+            ready_head, ready_workers, output, stderr = (
+                get_node_counts_from_ray_status(head_runner))
+        except RuntimeError as e:
+            logger.debug(f'Refreshing status ({cluster_name!r}) attempt'
+                         f' {i}: {common_utils.format_exception(e)}')
+            if cloud_name != 'kubernetes':
+                raise e
+            # We retry for kubernetes because coreweave can have a
+            # transient network issue.
+            time.sleep(1)
+            continue
         if ready_head + ready_workers == total_nodes:
             return True
         logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
@@ -2284,8 +2300,7 @@ def refresh_cluster_record(
 
     # The loop logic allows us to notice if the status was updated in the
     # global_user_state by another process and stop trying to get the lock.
-
-    lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
+    lock = locks.get_lock(cluster_status_lock_id(cluster_name))
    start_time = time.perf_counter()
 
     # Loop until we have an up-to-date status or until we acquire the lock.
@@ -2309,7 +2324,8 @@ def refresh_cluster_record(
                 return record
             # Update and return the cluster status.
             return _update_cluster_status(cluster_name)
-
+
+        except locks.LockTimeout:
             # lock.acquire() will throw a Timeout exception if the lock is not
             # available and we have blocking=False.
             pass
@@ -2610,7 +2626,7 @@ def is_controller_accessible(
             need_connection_check):
         # Check ssh connection if (1) controller is in INIT state, or (2) we failed to fetch the
         # status, both of which can happen when controller's status lock is held by another `sky jobs launch` or
-        # `sky serve up`. If we have
+        # `sky serve up`. If we have controller's head_ip available and it is ssh-reachable,
         # we can allow access to the controller.
         ssh_credentials = ssh_credential_from_yaml(handle.cluster_yaml,
                                                    handle.docker_user,
@@ -3187,3 +3203,13 @@ def get_endpoints(cluster: str,
     return {
         port_num: urls[0].url() for port_num, urls in port_details.items()
     }
+
+
+def cluster_status_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster status operations."""
+    return f'{cluster_name}_status'
+
+
+def cluster_file_mounts_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster file mounts operations."""
+    return f'{cluster_name}_file_mounts'
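
Across these hunks, per-cluster filelock paths are replaced by the new locking abstraction added in this release (sky/utils/locks.py, +319 lines in the file list above), keyed by string lock IDs. A sketch of the pattern the diff establishes, using only names that appear in it:

from sky.backends import backend_utils
from sky.utils import locks

lock_id = backend_utils.cluster_status_lock_id('my-cluster')  # 'my-cluster_status'
lock = locks.get_lock(lock_id)  # a timeout form also appears: get_lock(lock_id, lock_timeout)
try:
    with lock:
        pass  # update cluster status under the lock
except locks.LockTimeout:
    pass  # another process holds the lock; callers retry or give up
lock.force_unlock()  # teardown paths use this to break stale locks
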
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -20,7 +20,6 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
                     Union)
 
 import colorama
-import filelock
 import yaml
 
 import sky
@@ -64,6 +63,7 @@ from sky.utils import common_utils
 from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import env_options
+from sky.utils import locks
 from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import registry
@@ -2916,9 +2916,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Check if the cluster is owned by the current user. Raise
         # exceptions.ClusterOwnerIdentityMismatchError
         backend_utils.check_owner_identity(cluster_name)
-
-
-        with timeline.FileLockEvent(lock_path):
+        lock_id = backend_utils.cluster_status_lock_id(cluster_name)
+        with timeline.DistributedLockEvent(lock_id):
             # Try to launch the exiting cluster first. If no existing cluster,
             # this function will create a to_provision_config with required
             # resources.
@@ -3065,7 +3064,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
                     self._update_after_cluster_provisioned(
                         handle, to_provision_config.prev_handle, task,
-                        prev_cluster_status,
+                        prev_cluster_status, lock_id, config_hash)
                     return handle, False
 
             cluster_config_file = config_dict['ray']
@@ -3137,7 +3136,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
                 self._update_after_cluster_provisioned(
                     handle, to_provision_config.prev_handle, task,
-                    prev_cluster_status,
+                    prev_cluster_status, lock_id, config_hash)
                 return handle, False
 
     def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3155,7 +3154,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             prev_handle: Optional[CloudVmRayResourceHandle],
             task: task_lib.Task,
             prev_cluster_status: Optional[status_lib.ClusterStatus],
-
+            lock_id: str, config_hash: str) -> None:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, handle.launched_resources)
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3237,7 +3236,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             handle.cached_external_ssh_ports, handle.docker_user,
             handle.ssh_user)
 
-
+        locks.get_lock(lock_id).force_unlock()
 
     def _sync_workdir(self, handle: CloudVmRayResourceHandle,
                       workdir: Union[Path, Dict[str, Any]],
@@ -3819,8 +3818,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 is_identity_mismatch_and_purge = True
             else:
                 raise
-
-
+        lock_id = backend_utils.cluster_status_lock_id(cluster_name)
+        lock = locks.get_lock(lock_id)
         # Retry in case new cluster operation comes in and holds the lock
         # right after the lock is removed.
         n_attempts = 2
@@ -3828,7 +3827,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             n_attempts -= 1
             # In case other running cluster operations are still holding the
             # lock.
-
+            lock.force_unlock()
             # We have to kill the cluster requests, because `down` and `stop`
             # should be higher priority than the cluster requests, and we should
             # release the lock from other requests.
@@ -3847,9 +3846,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     f'cluster {handle.cluster_name}: '
                     f'{common_utils.format_exception(e, use_bracket=True)}')
             try:
-                with
-                        lock_path,
-                        backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
+                with lock:
                     self.teardown_no_lock(
                         handle,
                         terminate,
@@ -3862,14 +3859,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         refresh_cluster_status=(
                             not is_identity_mismatch_and_purge))
                 if terminate:
-
+                    lock.force_unlock()
                 break
-            except
+            except locks.LockTimeout as e:
                 logger.debug(f'Failed to acquire lock for {cluster_name}, '
                              f'retrying...')
                 if n_attempts <= 0:
                     raise RuntimeError(
-                        f'Cluster {cluster_name!r} is locked by {
+                        f'Cluster {cluster_name!r} is locked by {lock_id}. '
                         'Check to see if it is still being launched') from e
 
     # --- CloudVMRayBackend Specific APIs ---
@@ -3988,12 +3985,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         return dict(zip(job_ids, local_log_dirs))
 
     @context_utils.cancellation_guard
-    def tail_logs(
-
-
-
-
-
+    def tail_logs(
+            self,
+            handle: CloudVmRayResourceHandle,
+            job_id: Optional[int],
+            managed_job_id: Optional[int] = None,
+            follow: bool = True,
+            tail: int = 0,
+            require_outputs: bool = False,
+            stream_logs: bool = True,
+            process_stream: bool = False) -> Union[int, Tuple[int, str, str]]:
         """Tail the logs of a job.
 
         Args:
@@ -4003,6 +4004,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             follow: Whether to follow the logs.
             tail: The number of lines to display from the end of the
                 log file. If 0, print all lines.
+            require_outputs: Whether to return the stdout/stderr of the command.
+            stream_logs: Whether to stream the logs to stdout/stderr.
+            process_stream: Whether to process the stream.
 
         Returns:
             The exit code of the tail command. Returns code 100 if the job has
@@ -4022,18 +4026,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
         signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
         try:
-
+            final = self.run_on_head(
                 handle,
                 code,
-                stream_logs=
-                process_stream=
+                stream_logs=stream_logs,
+                process_stream=process_stream,
+                require_outputs=require_outputs,
                 # Allocate a pseudo-terminal to disable output buffering.
                 # Otherwise, there may be 5 minutes delay in logging.
                 ssh_mode=command_runner.SshMode.INTERACTIVE,
             )
         except SystemExit as e:
-
-            return
+            final = e.code
+        return final
 
     def tail_managed_job_logs(self,
                               handle: CloudVmRayResourceHandle,
@@ -5237,18 +5242,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # reconstruct them during cluster restart.
                 continue
             storage_mounts_metadata[dst] = storage_obj.handle
-
-            backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+        lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
         lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
         try:
-            with
+            with locks.get_lock(lock_id, lock_timeout):
                 global_user_state.set_cluster_storage_mounts_metadata(
                     cluster_name, storage_mounts_metadata)
-        except
+        except locks.LockTimeout as e:
             raise RuntimeError(
                 f'Failed to store metadata for cluster {cluster_name!r} due to '
                 'a timeout when trying to access local database. Please '
-                f'try again or manually remove the lock at {
+                f'try again or manually remove the lock at {lock_id}. '
                 f'{common_utils.format_exception(e)}') from None
 
     def get_storage_mounts_metadata(
@@ -5259,19 +5263,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         After retrieving storage_mounts_metadata, it converts back the
         StorageMetadata to Storage object and restores 'storage_mounts.'
         """
-
-            backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+        lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
         lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
         try:
-            with
+            with locks.get_lock(lock_id, lock_timeout):
                 storage_mounts_metadata = (
                     global_user_state.get_cluster_storage_mounts_metadata(
                         cluster_name))
-        except
+        except locks.LockTimeout as e:
             raise RuntimeError(
                 f'Failed to retrieve metadata for cluster {cluster_name!r} '
                 'due to a timeout when trying to access local database. '
-                f'Please try again or manually remove the lock at {
+                f'Please try again or manually remove the lock at {lock_id}.'
                 f' {common_utils.format_exception(e)}') from None
 
         if storage_mounts_metadata is None:
sky/catalog/__init__.py
CHANGED
@@ -221,6 +221,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
                               disk_tier: Optional[
                                   resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None,
                               clouds: CloudFilter = None) -> Optional[str]:
     """Returns the cloud's default instance type for given #vCPUs and memory.
 
@@ -234,7 +236,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
         the given CPU and memory requirement.
     """
     return _map_clouds_catalog(clouds, 'get_default_instance_type', cpus,
-                               memory, disk_tier)
+                               memory, disk_tier, region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/aws_catalog.py
CHANGED
@@ -230,10 +230,12 @@ def get_vcpus_mem_from_instance_type(
                                                         instance_type)
 
 
-def get_default_instance_type(
-
-
-
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # unused
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
@@ -247,7 +249,8 @@ def get_default_instance_type(
     df = _get_df()
     df = df[df['InstanceType'].str.startswith(instance_type_prefix)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
-                                                      memory_gb_or_ratio
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/azure_catalog.py
CHANGED
@@ -114,10 +114,12 @@ def _get_instance_family(instance_type: str) -> str:
     return instance_family
 
 
-def get_default_instance_type(
-
-
-
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
     if memory is None:
@@ -133,7 +135,8 @@ def get_default_instance_type(
 
     df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
-                                                      memory_gb_or_ratio
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/common.py
CHANGED
@@ -476,8 +476,11 @@ def _filter_region_zone(df: 'pd.DataFrame', region: Optional[str],
 
 
 def get_instance_type_for_cpus_mem_impl(
-        df: 'pd.DataFrame',
-
+        df: 'pd.DataFrame',
+        cpus: Optional[str],
+        memory_gb_or_ratio: Optional[str],
+        region: Optional[str] = None,
+        zone: Optional[str] = None) -> Optional[str]:
     """Returns the cheapest instance type that satisfies the requirements.
 
     Args:
@@ -490,7 +493,10 @@ def get_instance_type_for_cpus_mem_impl(
         returned instance type should have at least the given memory size.
         If the string ends with "x", then the returned instance type should
         have at least the given number of vCPUs times the given ratio.
+        region: The region to filter by.
+        zone: The zone to filter by.
     """
+    df = _filter_region_zone(df, region, zone)
     df = _filter_with_cpus(df, cpus)
     df = _filter_with_mem(df, memory_gb_or_ratio)
     if df.empty:
sky/catalog/cudo_catalog.py
CHANGED
@@ -51,7 +51,9 @@ def get_vcpus_mem_from_instance_type(
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[str] = None
+                              disk_tier: Optional[str] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier
     # NOTE: After expanding catalog to multiple entries, you may
     # want to specify a default instance type or family.
@@ -62,7 +64,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
     if memory is None:
         memory_gb_or_ratio = f'{_DEFAULT_MEMORY_CPU_RATIO}x'
     return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
-                                                      memory_gb_or_ratio
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/do_catalog.py
CHANGED
@@ -52,11 +52,14 @@ def get_default_instance_type(
     cpus: Optional[str] = None,
     memory: Optional[str] = None,
     disk_tier: Optional[str] = None,
+    region: Optional[str] = None,
+    zone: Optional[str] = None,
 ) -> Optional[str]:
     # NOTE: After expanding catalog to multiple entries, you may
     # want to specify a default instance type or family.
     del disk_tier  # unused
-    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory
+    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
+                                                      zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/fluidstack_catalog.py
CHANGED
@@ -52,7 +52,9 @@ def get_vcpus_mem_from_instance_type(
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[str] = None
+                              disk_tier: Optional[str] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # unused
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
@@ -61,7 +63,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
     else:
         memory_gb_or_ratio = memory
     return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
-                                                      memory_gb_or_ratio
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/gcp_catalog.py
CHANGED
@@ -279,10 +279,12 @@ def get_vcpus_mem_from_instance_type(
     return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
 
 
-def get_default_instance_type(
-
-
-
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
     if memory is None:
@@ -300,7 +302,8 @@ def get_default_instance_type(
 
     df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
-                                                      memory_gb_or_ratio
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/hyperbolic_catalog.py
CHANGED
@@ -67,9 +67,12 @@ def get_zone_shell_cmd() -> Optional[str]:
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[str] = None
+                              disk_tier: Optional[str] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # Unused
-    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory
+    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
+                                                      zone)
 
 
 def get_instance_type_for_accelerator(
sky/catalog/ibm_catalog.py
CHANGED
@@ -92,10 +92,12 @@ def list_accelerators(
                                         case_sensitive, all_regions)
 
 
-def get_default_instance_type(
-
-
-
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # unused
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
@@ -107,7 +109,8 @@ def get_default_instance_type(
     instance_type_prefix = f'{_DEFAULT_INSTANCE_FAMILY}-'
     df = _df[_df['InstanceType'].str.startswith(instance_type_prefix)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
-                                                      memory_gb_or_ratio
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def is_image_tag_valid(tag: str, region: Optional[str]) -> bool: