skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/common.py +15 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/oci.py +32 -1
- sky/authentication.py +20 -8
- sky/backends/backend_utils.py +44 -0
- sky/backends/cloud_vm_ray_backend.py +202 -41
- sky/backends/wheel_utils.py +4 -1
- sky/check.py +31 -1
- sky/cli.py +39 -43
- sky/cloud_stores.py +71 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +137 -50
- sky/clouds/cloud.py +4 -0
- sky/clouds/do.py +303 -0
- sky/clouds/gcp.py +9 -0
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/oci.py +20 -9
- sky/clouds/service_catalog/__init__.py +7 -3
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/clouds/utils/oci_utils.py +15 -2
- sky/core.py +8 -5
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +19 -4
- sky/data/mounting_utils.py +99 -15
- sky/data/storage.py +961 -130
- sky/global_user_state.py +1 -1
- sky/jobs/__init__.py +2 -0
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +46 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +290 -21
- sky/jobs/utils.py +346 -95
- sky/optimizer.py +6 -3
- sky/provision/aws/config.py +59 -29
- sky/provision/azure/instance.py +1 -1
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +306 -0
- sky/provision/docker_utils.py +22 -11
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +3 -2
- sky/provision/kubernetes/utils.py +125 -20
- sky/provision/oci/query_utils.py +17 -14
- sky/provision/provisioner.py +0 -1
- sky/provision/runpod/instance.py +10 -1
- sky/provision/runpod/utils.py +170 -13
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/setup_files/dependencies.py +4 -1
- sky/skylet/constants.py +8 -4
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/providers/command_runner.py +5 -7
- sky/skylet/skylet.py +1 -1
- sky/task.py +28 -1
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/jobs-controller.yaml.j2 +41 -7
- sky/templates/runpod-ray.yml.j2 +13 -0
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/usage/usage_lib.py +10 -2
- sky/utils/accelerator_registry.py +12 -8
- sky/utils/controller_utils.py +114 -39
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/log_utils.py +2 -0
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +27 -0
- sky/utils/subprocess_utils.py +54 -10
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/clouds/do.py
ADDED
@@ -0,0 +1,303 @@
|
|
1
|
+
""" Digital Ocean Cloud. """
|
2
|
+
|
3
|
+
import json
|
4
|
+
import typing
|
5
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
6
|
+
|
7
|
+
from sky import clouds
|
8
|
+
from sky.adaptors import do
|
9
|
+
from sky.clouds import service_catalog
|
10
|
+
from sky.provision.do import utils as do_utils
|
11
|
+
from sky.utils import resources_utils
|
12
|
+
|
13
|
+
if typing.TYPE_CHECKING:
|
14
|
+
from sky import resources as resources_lib
|
15
|
+
|
16
|
+
_CREDENTIAL_FILE = 'config.yaml'
|
17
|
+
|
18
|
+
|
19
|
+
@clouds.CLOUD_REGISTRY.register(aliases=['digitalocean'])
|
20
|
+
class DO(clouds.Cloud):
|
21
|
+
"""Digital Ocean Cloud"""
|
22
|
+
|
23
|
+
_REPR = 'DO'
|
24
|
+
_CLOUD_UNSUPPORTED_FEATURES = {
|
25
|
+
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
|
26
|
+
'Migrating '
|
27
|
+
f'disk is not supported in {_REPR}.',
|
28
|
+
clouds.CloudImplementationFeatures.SPOT_INSTANCE:
|
29
|
+
'Spot instances are '
|
30
|
+
f'not supported in {_REPR}.',
|
31
|
+
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
32
|
+
'Custom disk tiers'
|
33
|
+
f' is not supported in {_REPR}.',
|
34
|
+
}
|
35
|
+
# DO maximum node name length defined as <= 255
|
36
|
+
# https://docs.digitalocean.com/reference/api/api-reference/#operation/droplets_create
|
37
|
+
# 255 - 8 = 247 characters since
|
38
|
+
# our provisioner adds additional `-worker`.
|
39
|
+
_MAX_CLUSTER_NAME_LEN_LIMIT = 247
|
40
|
+
_regions: List[clouds.Region] = []
|
41
|
+
|
42
|
+
# Using the latest SkyPilot provisioner API to provision and check status.
|
43
|
+
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
44
|
+
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
|
45
|
+
|
46
|
+
@classmethod
|
47
|
+
def _unsupported_features_for_resources(
|
48
|
+
cls, resources: 'resources_lib.Resources'
|
49
|
+
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
50
|
+
"""The features not supported based on the resources provided.
|
51
|
+
|
52
|
+
This method is used by check_features_are_supported() to check if the
|
53
|
+
cloud implementation supports all the requested features.
|
54
|
+
|
55
|
+
Returns:
|
56
|
+
A dict of {feature: reason} for the features not supported by the
|
57
|
+
cloud implementation.
|
58
|
+
"""
|
59
|
+
del resources # unused
|
60
|
+
return cls._CLOUD_UNSUPPORTED_FEATURES
|
61
|
+
|
62
|
+
@classmethod
|
63
|
+
def _max_cluster_name_length(cls) -> Optional[int]:
|
64
|
+
return cls._MAX_CLUSTER_NAME_LEN_LIMIT
|
65
|
+
|
66
|
+
@classmethod
|
67
|
+
def regions_with_offering(
|
68
|
+
cls,
|
69
|
+
instance_type: str,
|
70
|
+
accelerators: Optional[Dict[str, int]],
|
71
|
+
use_spot: bool,
|
72
|
+
region: Optional[str],
|
73
|
+
zone: Optional[str],
|
74
|
+
) -> List[clouds.Region]:
|
75
|
+
assert zone is None, 'DO does not support zones.'
|
76
|
+
del accelerators, zone # unused
|
77
|
+
if use_spot:
|
78
|
+
return []
|
79
|
+
regions = service_catalog.get_region_zones_for_instance_type(
|
80
|
+
instance_type, use_spot, 'DO')
|
81
|
+
if region is not None:
|
82
|
+
regions = [r for r in regions if r.name == region]
|
83
|
+
return regions
|
84
|
+
|
85
|
+
@classmethod
|
86
|
+
def get_vcpus_mem_from_instance_type(
|
87
|
+
cls,
|
88
|
+
instance_type: str,
|
89
|
+
) -> Tuple[Optional[float], Optional[float]]:
|
90
|
+
return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
|
91
|
+
clouds='DO')
|
92
|
+
|
93
|
+
@classmethod
|
94
|
+
def zones_provision_loop(
|
95
|
+
cls,
|
96
|
+
*,
|
97
|
+
region: str,
|
98
|
+
num_nodes: int,
|
99
|
+
instance_type: str,
|
100
|
+
accelerators: Optional[Dict[str, int]] = None,
|
101
|
+
use_spot: bool = False,
|
102
|
+
) -> Iterator[None]:
|
103
|
+
del num_nodes # unused
|
104
|
+
regions = cls.regions_with_offering(instance_type,
|
105
|
+
accelerators,
|
106
|
+
use_spot,
|
107
|
+
region=region,
|
108
|
+
zone=None)
|
109
|
+
for r in regions:
|
110
|
+
assert r.zones is None, r
|
111
|
+
yield r.zones
|
112
|
+
|
113
|
+
def instance_type_to_hourly_cost(
|
114
|
+
self,
|
115
|
+
instance_type: str,
|
116
|
+
use_spot: bool,
|
117
|
+
region: Optional[str] = None,
|
118
|
+
zone: Optional[str] = None,
|
119
|
+
) -> float:
|
120
|
+
return service_catalog.get_hourly_cost(
|
121
|
+
instance_type,
|
122
|
+
use_spot=use_spot,
|
123
|
+
region=region,
|
124
|
+
zone=zone,
|
125
|
+
clouds='DO',
|
126
|
+
)
|
127
|
+
|
128
|
+
def accelerators_to_hourly_cost(
|
129
|
+
self,
|
130
|
+
accelerators: Dict[str, int],
|
131
|
+
use_spot: bool,
|
132
|
+
region: Optional[str] = None,
|
133
|
+
zone: Optional[str] = None,
|
134
|
+
) -> float:
|
135
|
+
"""Returns the hourly cost of the accelerators, in dollars/hour."""
|
136
|
+
# The accelerator price is included in the instance price.
|
137
|
+
del accelerators, use_spot, region, zone # unused
|
138
|
+
return 0.0
|
139
|
+
|
140
|
+
def get_egress_cost(self, num_gigabytes: float) -> float:
|
141
|
+
return 0.0
|
142
|
+
|
143
|
+
def __repr__(self):
|
144
|
+
return self._REPR
|
145
|
+
|
146
|
+
@classmethod
|
147
|
+
def get_default_instance_type(
|
148
|
+
cls,
|
149
|
+
cpus: Optional[str] = None,
|
150
|
+
memory: Optional[str] = None,
|
151
|
+
disk_tier: Optional[resources_utils.DiskTier] = None,
|
152
|
+
) -> Optional[str]:
|
153
|
+
"""Returns the default instance type for DO."""
|
154
|
+
return service_catalog.get_default_instance_type(cpus=cpus,
|
155
|
+
memory=memory,
|
156
|
+
disk_tier=disk_tier,
|
157
|
+
clouds='DO')
|
158
|
+
|
159
|
+
@classmethod
|
160
|
+
def get_accelerators_from_instance_type(
|
161
|
+
cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
162
|
+
return service_catalog.get_accelerators_from_instance_type(
|
163
|
+
instance_type, clouds='DO')
|
164
|
+
|
165
|
+
@classmethod
|
166
|
+
def get_zone_shell_cmd(cls) -> Optional[str]:
|
167
|
+
return None
|
168
|
+
|
169
|
+
def make_deploy_resources_variables(
|
170
|
+
self,
|
171
|
+
resources: 'resources_lib.Resources',
|
172
|
+
cluster_name: resources_utils.ClusterName,
|
173
|
+
region: 'clouds.Region',
|
174
|
+
zones: Optional[List['clouds.Zone']],
|
175
|
+
num_nodes: int,
|
176
|
+
dryrun: bool = False) -> Dict[str, Optional[str]]:
|
177
|
+
del zones, dryrun, cluster_name
|
178
|
+
|
179
|
+
r = resources
|
180
|
+
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
181
|
+
if acc_dict is not None:
|
182
|
+
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
|
183
|
+
else:
|
184
|
+
custom_resources = None
|
185
|
+
image_id = None
|
186
|
+
if (resources.image_id is not None and
|
187
|
+
resources.extract_docker_image() is None):
|
188
|
+
if None in resources.image_id:
|
189
|
+
image_id = resources.image_id[None]
|
190
|
+
else:
|
191
|
+
assert region.name in resources.image_id
|
192
|
+
image_id = resources.image_id[region.name]
|
193
|
+
return {
|
194
|
+
'instance_type': resources.instance_type,
|
195
|
+
'custom_resources': custom_resources,
|
196
|
+
'region': region.name,
|
197
|
+
**({
|
198
|
+
'image_id': image_id
|
199
|
+
} if image_id else {})
|
200
|
+
}
|
201
|
+
|
202
|
+
def _get_feasible_launchable_resources(
    self, resources: 'resources_lib.Resources'
) -> resources_utils.FeasibleResources:
    """Returns a list of feasible resources for the given resources.

    Resolution order:
      1. Spot requests yield an empty result.
      2. An explicit ``instance_type`` is returned as-is (with the
         accelerators field cleared).
      3. Without accelerators, the catalog's default instance type for the
         requested cpus/memory is used; if the catalog returns ``None``
         the result is empty.
      4. With accelerators, every catalog instance type offering the
         (single) accelerator is returned, together with the fuzzy
         candidate names reported by the catalog.

    Args:
        resources: The (possibly partially specified) requested resources.

    Returns:
        A ``FeasibleResources`` built from the matching resources list, the
        fuzzy candidate list, and a ``None`` hint.
    """
    if resources.use_spot:
        # TODO: Add hints to all return values in this method to help
        # users understand why the resources are not launchable.
        return resources_utils.FeasibleResources([], [], None)
    if resources.instance_type is not None:
        assert resources.is_launchable(), resources
        resources = resources.copy(accelerators=None)
        return resources_utils.FeasibleResources([resources], [], None)

    def _make(instance_list):
        # Pin each candidate to DO and to a concrete instance type; the
        # accelerators/cpus fields are cleared on the copies.
        resource_list = []
        for instance_type in instance_list:
            r = resources.copy(
                cloud=DO(),
                instance_type=instance_type,
                accelerators=None,
                cpus=None,
            )
            resource_list.append(r)
        return resource_list

    # Currently, handle a filter on accelerators only.
    accelerators = resources.accelerators
    if accelerators is None:
        # Return a default instance type
        default_instance_type = DO.get_default_instance_type(
            cpus=resources.cpus,
            memory=resources.memory,
            disk_tier=resources.disk_tier)
        if default_instance_type is None:
            # get_default_instance_type() is Optional: nothing in the
            # catalog matched, so report no feasible resources instead of
            # emitting a resource whose instance_type is None.
            return resources_utils.FeasibleResources([], [], None)
        return resources_utils.FeasibleResources(
            _make([default_instance_type]), [], None)

    assert len(accelerators) == 1, resources
    acc, acc_count = list(accelerators.items())[0]
    (instance_list, fuzzy_candidate_list) = (
        service_catalog.get_instance_type_for_accelerator(
            acc,
            acc_count,
            use_spot=resources.use_spot,
            cpus=resources.cpus,
            memory=resources.memory,
            region=resources.region,
            zone=resources.zone,
            clouds='DO',
        ))
    if instance_list is None:
        return resources_utils.FeasibleResources([], fuzzy_candidate_list,
                                                 None)
    return resources_utils.FeasibleResources(_make(instance_list),
                                             fuzzy_candidate_list, None)
|
256
|
+
|
257
|
+
@classmethod
|
258
|
+
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
259
|
+
"""Verify that the user has valid credentials for DO."""
|
260
|
+
try:
|
261
|
+
# attempt to make a CURL request for listing instances
|
262
|
+
do_utils.client().droplets.list()
|
263
|
+
except do.exceptions().HttpResponseError as err:
|
264
|
+
return False, str(err)
|
265
|
+
except do_utils.DigitalOceanError as err:
|
266
|
+
return False, str(err)
|
267
|
+
|
268
|
+
return True, None
|
269
|
+
|
270
|
+
def get_credential_file_mounts(self) -> Dict[str, str]:
|
271
|
+
try:
|
272
|
+
do_utils.client()
|
273
|
+
return {
|
274
|
+
f'~/.config/doctl/{_CREDENTIAL_FILE}': do_utils.CREDENTIALS_PATH
|
275
|
+
}
|
276
|
+
except do_utils.DigitalOceanError:
|
277
|
+
return {}
|
278
|
+
|
279
|
+
@classmethod
|
280
|
+
def get_current_user_identity(cls) -> Optional[List[str]]:
|
281
|
+
# NOTE: used for very advanced SkyPilot functionality
|
282
|
+
# Can implement later if desired
|
283
|
+
return None
|
284
|
+
|
285
|
+
@classmethod
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
    """Returns the ``size_gigabytes`` value reported by DO for an image.

    Args:
        image_id: Identifier of the image to look up.
        region: Unused.

    Returns:
        The ``size_gigabytes`` field of the image returned by the DO API.

    Raises:
        do_utils.DigitalOceanError: If the API request fails with an HTTP
            error, or the response lacks the expected ``image`` /
            ``size_gigabytes`` keys.
    """
    del region  # unused
    try:
        response = do_utils.client().images.get(image_id=image_id)
        return response['image']['size_gigabytes']
    except do.exceptions().HttpResponseError as err:
        # `response` is never bound when the request itself raises, so the
        # message must reference `image_id`; formatting `response` here
        # would fail with UnboundLocalError and mask the real error.
        raise do_utils.DigitalOceanError(
            'HTTP error while retrieving size of '
            f'image_id {image_id}: {err.error.message}') from err
    except KeyError as err:
        raise do_utils.DigitalOceanError(
            f'No image_id `{image_id}` found') from err
|
298
|
+
|
299
|
+
def instance_type_exists(self, instance_type: str) -> bool:
|
300
|
+
return service_catalog.instance_type_exists(instance_type, 'DO')
|
301
|
+
|
302
|
+
def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
|
303
|
+
return service_catalog.validate_region_zone(region, zone, clouds='DO')
|
sky/clouds/gcp.py
CHANGED
@@ -132,6 +132,9 @@ class GCPIdentityType(enum.Enum):
|
|
132
132
|
|
133
133
|
SHARED_CREDENTIALS_FILE = ''
|
134
134
|
|
135
|
+
def can_credential_expire(self) -> bool:
|
136
|
+
return self == GCPIdentityType.SHARED_CREDENTIALS_FILE
|
137
|
+
|
135
138
|
|
136
139
|
@clouds.CLOUD_REGISTRY.register
|
137
140
|
class GCP(clouds.Cloud):
|
@@ -863,6 +866,12 @@ class GCP(clouds.Cloud):
|
|
863
866
|
pass
|
864
867
|
return credentials
|
865
868
|
|
869
|
+
@functools.lru_cache(maxsize=1)
|
870
|
+
def can_credential_expire(self) -> bool:
|
871
|
+
identity_type = self._get_identity_type()
|
872
|
+
return identity_type is not None and identity_type.can_credential_expire(
|
873
|
+
)
|
874
|
+
|
866
875
|
@classmethod
|
867
876
|
def _get_identity_type(cls) -> Optional[GCPIdentityType]:
|
868
877
|
try:
|
sky/clouds/kubernetes.py
CHANGED
@@ -131,7 +131,7 @@ class Kubernetes(clouds.Cloud):
|
|
131
131
|
'Ignoring these contexts.')
|
132
132
|
|
133
133
|
@classmethod
|
134
|
-
def
|
134
|
+
def existing_allowed_contexts(cls) -> List[str]:
|
135
135
|
"""Get existing allowed contexts.
|
136
136
|
|
137
137
|
If None is returned in the list, it means that we are running in a pod
|
@@ -175,7 +175,7 @@ class Kubernetes(clouds.Cloud):
|
|
175
175
|
use_spot: bool, region: Optional[str],
|
176
176
|
zone: Optional[str]) -> List[clouds.Region]:
|
177
177
|
del accelerators, zone, use_spot # unused
|
178
|
-
existing_contexts = cls.
|
178
|
+
existing_contexts = cls.existing_allowed_contexts()
|
179
179
|
|
180
180
|
regions = []
|
181
181
|
for context in existing_contexts:
|
@@ -591,7 +591,7 @@ class Kubernetes(clouds.Cloud):
|
|
591
591
|
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
592
592
|
# Test using python API
|
593
593
|
try:
|
594
|
-
existing_allowed_contexts = cls.
|
594
|
+
existing_allowed_contexts = cls.existing_allowed_contexts()
|
595
595
|
except ImportError as e:
|
596
596
|
return (False,
|
597
597
|
f'{common_utils.format_exception(e, use_bracket=True)}')
|
sky/clouds/oci.py
CHANGED
@@ -232,6 +232,14 @@ class OCI(clouds.Cloud):
|
|
232
232
|
listing_id = None
|
233
233
|
res_ver = None
|
234
234
|
|
235
|
+
os_type = None
|
236
|
+
if ':' in image_id:
|
237
|
+
# OS type provided in the --image-id. This is the case where
|
238
|
+
# custom image's ocid provided in the --image-id parameter.
|
239
|
+
# - ocid1.image...aaa:oraclelinux (os type is oraclelinux)
|
240
|
+
# - ocid1.image...aaa (OS not provided)
|
241
|
+
image_id, os_type = image_id.replace(' ', '').split(':')
|
242
|
+
|
235
243
|
cpus = resources.cpus
|
236
244
|
instance_type_arr = resources.instance_type.split(
|
237
245
|
oci_utils.oci_config.INSTANCE_TYPE_RES_SPERATOR)
|
@@ -297,15 +305,18 @@ class OCI(clouds.Cloud):
|
|
297
305
|
cpus=None if cpus is None else float(cpus),
|
298
306
|
disk_tier=resources.disk_tier)
|
299
307
|
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
308
|
+
if os_type is None:
|
309
|
+
# OS type is not determined yet. So try to get it from vms.csv
|
310
|
+
image_str = self._get_image_str(
|
311
|
+
image_id=resources.image_id,
|
312
|
+
instance_type=resources.instance_type,
|
313
|
+
region=region.name)
|
314
|
+
|
315
|
+
# pylint: disable=import-outside-toplevel
|
316
|
+
from sky.clouds.service_catalog import oci_catalog
|
317
|
+
os_type = oci_catalog.get_image_os_from_tag(tag=image_str,
|
318
|
+
region=region.name)
|
319
|
+
logger.debug(f'OS type for the image {image_id} is {os_type}')
|
309
320
|
|
310
321
|
return {
|
311
322
|
'instance_type': instance_type,
|
@@ -10,6 +10,7 @@ from sky.clouds.service_catalog.constants import CATALOG_DIR
|
|
10
10
|
from sky.clouds.service_catalog.constants import CATALOG_SCHEMA_VERSION
|
11
11
|
from sky.clouds.service_catalog.constants import HOSTED_CATALOG_DIR_URL
|
12
12
|
from sky.utils import resources_utils
|
13
|
+
from sky.utils import subprocess_utils
|
13
14
|
|
14
15
|
if typing.TYPE_CHECKING:
|
15
16
|
from sky.clouds import cloud
|
@@ -31,8 +32,7 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs):
|
|
31
32
|
if single:
|
32
33
|
clouds = [clouds] # type: ignore
|
33
34
|
|
34
|
-
|
35
|
-
for cloud in clouds:
|
35
|
+
def _execute_catalog_method(cloud: str):
|
36
36
|
try:
|
37
37
|
cloud_module = importlib.import_module(
|
38
38
|
f'sky.clouds.service_catalog.{cloud.lower()}_catalog')
|
@@ -46,7 +46,11 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs):
|
|
46
46
|
raise AttributeError(
|
47
47
|
f'Module "{cloud}_catalog" does not '
|
48
48
|
f'implement the "{method_name}" method') from None
|
49
|
-
|
49
|
+
return method(*args, **kwargs)
|
50
|
+
|
51
|
+
results = subprocess_utils.run_in_parallel(_execute_catalog_method,
|
52
|
+
args=list(clouds),
|
53
|
+
num_threads=len(clouds))
|
50
54
|
if single:
|
51
55
|
return results[0]
|
52
56
|
return results
|
@@ -47,10 +47,6 @@ TPU_RETRY_CNT = 3
|
|
47
47
|
TPU_V4_ZONES = ['us-central2-b']
|
48
48
|
# TPU v3 pods are available in us-east1-d, but hidden in the skus.
|
49
49
|
# We assume the TPU prices are the same as us-central1.
|
50
|
-
# TPU v6e's pricing info is not available on the SKUs. However, in
|
51
|
-
# https://cloud.google.com/tpu/pricing, it listed the price for 4 regions:
|
52
|
-
# us-east1, us-east5, europe-west4, and asia-northeast1. We hardcode them here
|
53
|
-
# and filtered out the other regions (us-central{1,2}, us-south1).
|
54
50
|
HIDDEN_TPU_DF = pd.read_csv(
|
55
51
|
io.StringIO(
|
56
52
|
textwrap.dedent("""\
|
@@ -62,49 +58,10 @@ HIDDEN_TPU_DF = pd.read_csv(
|
|
62
58
|
,tpu-v3-512,1,,,tpu-v3-512,512.0,153.6,us-east1,us-east1-d
|
63
59
|
,tpu-v3-1024,1,,,tpu-v3-1024,1024.0,307.2,us-east1,us-east1-d
|
64
60
|
,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
|
65
|
-
,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-b
|
66
|
-
,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-c
|
67
|
-
,tpu-v6e-1,1,,,tpu-v6e-1,2.97,,europe-west4,europe-west4-a
|
68
|
-
,tpu-v6e-1,1,,,tpu-v6e-1,3.24,,asia-northeast1,asia-northeast1-b
|
69
|
-
,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east1,us-east1-d
|
70
|
-
,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-b
|
71
|
-
,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-c
|
72
|
-
,tpu-v6e-4,1,,,tpu-v6e-4,11.88,,europe-west4,europe-west4-a
|
73
|
-
,tpu-v6e-4,1,,,tpu-v6e-4,12.96,,asia-northeast1,asia-northeast1-b
|
74
|
-
,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east1,us-east1-d
|
75
|
-
,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-b
|
76
|
-
,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-c
|
77
|
-
,tpu-v6e-8,1,,,tpu-v6e-8,23.76,,europe-west4,europe-west4-a
|
78
|
-
,tpu-v6e-8,1,,,tpu-v6e-8,25.92,,asia-northeast1,asia-northeast1-b
|
79
|
-
,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east1,us-east1-d
|
80
|
-
,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-b
|
81
|
-
,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-c
|
82
|
-
,tpu-v6e-16,1,,,tpu-v6e-16,47.52,,europe-west4,europe-west4-a
|
83
|
-
,tpu-v6e-16,1,,,tpu-v6e-16,51.84,,asia-northeast1,asia-northeast1-b
|
84
|
-
,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east1,us-east1-d
|
85
|
-
,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-b
|
86
|
-
,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-c
|
87
|
-
,tpu-v6e-32,1,,,tpu-v6e-32,95.04,,europe-west4,europe-west4-a
|
88
|
-
,tpu-v6e-32,1,,,tpu-v6e-32,103.68,,asia-northeast1,asia-northeast1-b
|
89
|
-
,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east1,us-east1-d
|
90
|
-
,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-b
|
91
|
-
,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-c
|
92
|
-
,tpu-v6e-64,1,,,tpu-v6e-64,190.08,,europe-west4,europe-west4-a
|
93
|
-
,tpu-v6e-64,1,,,tpu-v6e-64,207.36,,asia-northeast1,asia-northeast1-b
|
94
|
-
,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east1,us-east1-d
|
95
|
-
,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-b
|
96
|
-
,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-c
|
97
|
-
,tpu-v6e-128,1,,,tpu-v6e-128,380.16,,europe-west4,europe-west4-a
|
98
|
-
,tpu-v6e-128,1,,,tpu-v6e-128,414.72,,asia-northeast1,asia-northeast1-b
|
99
|
-
,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east1,us-east1-d
|
100
|
-
,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-b
|
101
|
-
,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-c
|
102
|
-
,tpu-v6e-256,1,,,tpu-v6e-256,760.32,,europe-west4,europe-west4-a
|
103
|
-
,tpu-v6e-256,1,,,tpu-v6e-256,829.44,,asia-northeast1,asia-northeast1-b
|
104
|
-
,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east1,us-east1-d
|
105
61
|
""")))
|
106
62
|
|
107
|
-
|
63
|
+
# TPU V6e price for us-central2 is missing in the SKUs.
|
64
|
+
TPU_V6E_MISSING_REGIONS = ['us-central2']
|
108
65
|
|
109
66
|
# TPU V5 is not visible in specific zones. We hardcode the missing zones here.
|
110
67
|
# NOTE(dev): Keep the zones and the df in sync.
|
@@ -670,6 +627,8 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
|
|
670
627
|
return 'TpuV5p'
|
671
628
|
assert tpu_version == 'v5litepod', tpu_version
|
672
629
|
return 'TpuV5e'
|
630
|
+
if tpu_version.startswith('v6e'):
|
631
|
+
return 'TpuV6e'
|
673
632
|
return f'Tpu-{tpu_version}'
|
674
633
|
|
675
634
|
def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]:
|
@@ -684,10 +643,10 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
|
|
684
643
|
# whether the TPU is a single device or a pod.
|
685
644
|
# For TPU-v4, the pricing is uniform, and thus the pricing API
|
686
645
|
# only provides the price of TPU-v4 pods.
|
687
|
-
# The price shown for v5 TPU is per chip hour, so there is
|
688
|
-
# keyword in the description.
|
646
|
+
# The price shown for v5 & v6e TPU is per chip hour, so there is
|
647
|
+
# no 'Pod' keyword in the description.
|
689
648
|
is_pod = ((num_cores > 8 or tpu_version == 'v4') and
|
690
|
-
not tpu_version.startswith('v5'))
|
649
|
+
not tpu_version.startswith('v5') and tpu_version != 'v6e')
|
691
650
|
|
692
651
|
for sku in gce_skus + tpu_skus:
|
693
652
|
if tpu_region not in sku['serviceRegions']:
|
@@ -718,7 +677,9 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
|
|
718
677
|
# for v5e. Reference here:
|
719
678
|
# https://cloud.google.com/tpu/docs/v5p#using-accelerator-type
|
720
679
|
# https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config
|
721
|
-
|
680
|
+
# v6e is also per chip price. Reference here:
|
681
|
+
# https://cloud.google.com/tpu/docs/v6e#configurations
|
682
|
+
core_per_sku = (1 if tpu_version in ['v5litepod', 'v6e'] else
|
722
683
|
2 if tpu_version == 'v5p' else 8)
|
723
684
|
tpu_core_price = tpu_device_price / core_per_sku
|
724
685
|
tpu_price = num_cores * tpu_core_price
|
@@ -738,8 +699,6 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
|
|
738
699
|
spot_str = 'spot ' if spot else ''
|
739
700
|
print(f'The {spot_str}price of {tpu_name} in {tpu_region} is '
|
740
701
|
'not found in SKUs or hidden TPU price DF.')
|
741
|
-
# TODO(tian): Hack. Should investigate how to retrieve the price
|
742
|
-
# for TPU-v6e.
|
743
702
|
if (tpu_name.startswith('tpu-v6e') and
|
744
703
|
tpu_region in TPU_V6E_MISSING_REGIONS):
|
745
704
|
if not spot:
|
@@ -0,0 +1,111 @@
|
|
1
|
+
"""Digital ocean service catalog.
|
2
|
+
|
3
|
+
This module loads the service catalog file and can be used to
|
4
|
+
query instance types and pricing information for digital ocean.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import typing
|
8
|
+
from typing import Dict, List, Optional, Tuple, Union
|
9
|
+
|
10
|
+
from sky.clouds.service_catalog import common
|
11
|
+
from sky.utils import ux_utils
|
12
|
+
|
13
|
+
if typing.TYPE_CHECKING:
|
14
|
+
from sky.clouds import cloud
|
15
|
+
|
16
|
+
_df = common.read_catalog('do/vms.csv')
|
17
|
+
|
18
|
+
|
19
|
+
def instance_type_exists(instance_type: str) -> bool:
|
20
|
+
return common.instance_type_exists_impl(_df, instance_type)
|
21
|
+
|
22
|
+
|
23
|
+
def validate_region_zone(
|
24
|
+
region: Optional[str],
|
25
|
+
zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
|
26
|
+
if zone is not None:
|
27
|
+
with ux_utils.print_exception_no_traceback():
|
28
|
+
raise ValueError('DO does not support zones.')
|
29
|
+
return common.validate_region_zone_impl('DO', _df, region, zone)
|
30
|
+
|
31
|
+
|
32
|
+
def get_hourly_cost(
|
33
|
+
instance_type: str,
|
34
|
+
use_spot: bool = False,
|
35
|
+
region: Optional[str] = None,
|
36
|
+
zone: Optional[str] = None,
|
37
|
+
) -> float:
|
38
|
+
"""Returns the cost, or the cheapest cost among all zones for spot."""
|
39
|
+
if zone is not None:
|
40
|
+
with ux_utils.print_exception_no_traceback():
|
41
|
+
raise ValueError('DO does not support zones.')
|
42
|
+
return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
|
43
|
+
zone)
|
44
|
+
|
45
|
+
|
46
|
+
def get_vcpus_mem_from_instance_type(
|
47
|
+
instance_type: str,) -> Tuple[Optional[float], Optional[float]]:
|
48
|
+
return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
|
49
|
+
|
50
|
+
|
51
|
+
def get_default_instance_type(
|
52
|
+
cpus: Optional[str] = None,
|
53
|
+
memory: Optional[str] = None,
|
54
|
+
disk_tier: Optional[str] = None,
|
55
|
+
) -> Optional[str]:
|
56
|
+
# NOTE: After expanding catalog to multiple entries, you may
|
57
|
+
# want to specify a default instance type or family.
|
58
|
+
del disk_tier # unused
|
59
|
+
return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
|
60
|
+
|
61
|
+
|
62
|
+
def get_accelerators_from_instance_type(
|
63
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
64
|
+
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
65
|
+
|
66
|
+
|
67
|
+
def get_instance_type_for_accelerator(
|
68
|
+
acc_name: str,
|
69
|
+
acc_count: int,
|
70
|
+
cpus: Optional[str] = None,
|
71
|
+
memory: Optional[str] = None,
|
72
|
+
use_spot: bool = False,
|
73
|
+
region: Optional[str] = None,
|
74
|
+
zone: Optional[str] = None,
|
75
|
+
) -> Tuple[Optional[List[str]], List[str]]:
|
76
|
+
"""Returns a list of instance types that have the given accelerator."""
|
77
|
+
if zone is not None:
|
78
|
+
with ux_utils.print_exception_no_traceback():
|
79
|
+
raise ValueError('DO does not support zones.')
|
80
|
+
return common.get_instance_type_for_accelerator_impl(
|
81
|
+
df=_df,
|
82
|
+
acc_name=acc_name,
|
83
|
+
acc_count=acc_count,
|
84
|
+
cpus=cpus,
|
85
|
+
memory=memory,
|
86
|
+
use_spot=use_spot,
|
87
|
+
region=region,
|
88
|
+
zone=zone,
|
89
|
+
)
|
90
|
+
|
91
|
+
|
92
|
+
def get_region_zones_for_instance_type(instance_type: str,
|
93
|
+
use_spot: bool) -> List['cloud.Region']:
|
94
|
+
df = _df[_df['InstanceType'] == instance_type]
|
95
|
+
return common.get_region_zones(df, use_spot)
|
96
|
+
|
97
|
+
|
98
|
+
def list_accelerators(
|
99
|
+
gpus_only: bool,
|
100
|
+
name_filter: Optional[str],
|
101
|
+
region_filter: Optional[str],
|
102
|
+
quantity_filter: Optional[int],
|
103
|
+
case_sensitive: bool = True,
|
104
|
+
all_regions: bool = False,
|
105
|
+
require_price: bool = True,
|
106
|
+
) -> Dict[str, List[common.InstanceTypeInfo]]:
|
107
|
+
"""Returns all instance types in DO offering GPUs."""
|
108
|
+
del require_price # unused
|
109
|
+
return common.list_accelerators_impl('DO', _df, gpus_only, name_filter,
|
110
|
+
region_filter, quantity_filter,
|
111
|
+
case_sensitive, all_regions)
|
@@ -115,6 +115,16 @@ def _list_accelerators(
|
|
115
115
|
|
116
116
|
If the user does not have sufficient permissions to list pods in all
|
117
117
|
namespaces, the function will return free GPUs as -1.
|
118
|
+
|
119
|
+
Returns:
|
120
|
+
A tuple of three dictionaries:
|
121
|
+
- qtys_map: Dict mapping accelerator names to lists of InstanceTypeInfo
|
122
|
+
objects with quantity information.
|
123
|
+
- total_accelerators_capacity: Dict mapping accelerator names to their
|
124
|
+
total capacity in the cluster.
|
125
|
+
- total_accelerators_available: Dict mapping accelerator names to their
|
126
|
+
current availability. Returns -1 for each accelerator if
|
127
|
+
realtime=False or if insufficient permissions.
|
118
128
|
"""
|
119
129
|
# TODO(romilb): This should be refactored to use get_kubernetes_node_info()
|
120
130
|
# function from kubernetes_utils.
|
@@ -243,6 +253,10 @@ def _list_accelerators(
|
|
243
253
|
|
244
254
|
accelerators_available = accelerator_count - allocated_qty
|
245
255
|
|
256
|
+
# Initialize the entry if it doesn't exist yet
|
257
|
+
if accelerator_name not in total_accelerators_available:
|
258
|
+
total_accelerators_available[accelerator_name] = 0
|
259
|
+
|
246
260
|
if accelerators_available >= min_quantity_filter:
|
247
261
|
quantized_availability = min_quantity_filter * (
|
248
262
|
accelerators_available // min_quantity_filter)
|