skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/common.py +15 -9
  3. sky/adaptors/do.py +20 -0
  4. sky/adaptors/oci.py +32 -1
  5. sky/authentication.py +20 -8
  6. sky/backends/backend_utils.py +44 -0
  7. sky/backends/cloud_vm_ray_backend.py +202 -41
  8. sky/backends/wheel_utils.py +4 -1
  9. sky/check.py +31 -1
  10. sky/cli.py +39 -43
  11. sky/cloud_stores.py +71 -2
  12. sky/clouds/__init__.py +2 -0
  13. sky/clouds/aws.py +137 -50
  14. sky/clouds/cloud.py +4 -0
  15. sky/clouds/do.py +303 -0
  16. sky/clouds/gcp.py +9 -0
  17. sky/clouds/kubernetes.py +3 -3
  18. sky/clouds/oci.py +20 -9
  19. sky/clouds/service_catalog/__init__.py +7 -3
  20. sky/clouds/service_catalog/constants.py +1 -1
  21. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
  22. sky/clouds/service_catalog/do_catalog.py +111 -0
  23. sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
  24. sky/clouds/utils/oci_utils.py +15 -2
  25. sky/core.py +8 -5
  26. sky/data/data_transfer.py +37 -0
  27. sky/data/data_utils.py +19 -4
  28. sky/data/mounting_utils.py +99 -15
  29. sky/data/storage.py +961 -130
  30. sky/global_user_state.py +1 -1
  31. sky/jobs/__init__.py +2 -0
  32. sky/jobs/constants.py +8 -7
  33. sky/jobs/controller.py +19 -22
  34. sky/jobs/core.py +46 -2
  35. sky/jobs/recovery_strategy.py +114 -143
  36. sky/jobs/scheduler.py +283 -0
  37. sky/jobs/state.py +290 -21
  38. sky/jobs/utils.py +346 -95
  39. sky/optimizer.py +6 -3
  40. sky/provision/aws/config.py +59 -29
  41. sky/provision/azure/instance.py +1 -1
  42. sky/provision/do/__init__.py +11 -0
  43. sky/provision/do/config.py +14 -0
  44. sky/provision/do/constants.py +10 -0
  45. sky/provision/do/instance.py +287 -0
  46. sky/provision/do/utils.py +306 -0
  47. sky/provision/docker_utils.py +22 -11
  48. sky/provision/gcp/instance_utils.py +15 -9
  49. sky/provision/kubernetes/instance.py +3 -2
  50. sky/provision/kubernetes/utils.py +125 -20
  51. sky/provision/oci/query_utils.py +17 -14
  52. sky/provision/provisioner.py +0 -1
  53. sky/provision/runpod/instance.py +10 -1
  54. sky/provision/runpod/utils.py +170 -13
  55. sky/resources.py +1 -1
  56. sky/serve/autoscalers.py +359 -301
  57. sky/serve/controller.py +10 -8
  58. sky/serve/core.py +84 -7
  59. sky/serve/load_balancer.py +27 -10
  60. sky/serve/replica_managers.py +1 -3
  61. sky/serve/serve_state.py +10 -5
  62. sky/serve/serve_utils.py +28 -1
  63. sky/serve/service.py +4 -3
  64. sky/serve/service_spec.py +31 -0
  65. sky/setup_files/dependencies.py +4 -1
  66. sky/skylet/constants.py +8 -4
  67. sky/skylet/events.py +7 -3
  68. sky/skylet/job_lib.py +10 -30
  69. sky/skylet/log_lib.py +8 -8
  70. sky/skylet/log_lib.pyi +3 -0
  71. sky/skylet/providers/command_runner.py +5 -7
  72. sky/skylet/skylet.py +1 -1
  73. sky/task.py +28 -1
  74. sky/templates/do-ray.yml.j2 +98 -0
  75. sky/templates/jobs-controller.yaml.j2 +41 -7
  76. sky/templates/runpod-ray.yml.j2 +13 -0
  77. sky/templates/sky-serve-controller.yaml.j2 +4 -0
  78. sky/usage/usage_lib.py +10 -2
  79. sky/utils/accelerator_registry.py +12 -8
  80. sky/utils/controller_utils.py +114 -39
  81. sky/utils/db_utils.py +18 -4
  82. sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
  83. sky/utils/log_utils.py +2 -0
  84. sky/utils/resources_utils.py +25 -21
  85. sky/utils/schemas.py +27 -0
  86. sky/utils/subprocess_utils.py +54 -10
  87. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
  88. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
  89. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
  90. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
  91. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
  92. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/clouds/do.py ADDED
@@ -0,0 +1,303 @@
+""" Digital Ocean Cloud. """
+
+import json
+import typing
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from sky import clouds
+from sky.adaptors import do
+from sky.clouds import service_catalog
+from sky.provision.do import utils as do_utils
+from sky.utils import resources_utils
+
+if typing.TYPE_CHECKING:
+    from sky import resources as resources_lib
+
+_CREDENTIAL_FILE = 'config.yaml'
+
+
+@clouds.CLOUD_REGISTRY.register(aliases=['digitalocean'])
+class DO(clouds.Cloud):
+    """Digital Ocean Cloud"""
+
+    _REPR = 'DO'
+    _CLOUD_UNSUPPORTED_FEATURES = {
+        clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
+            'Migrating '
+            f'disk is not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.SPOT_INSTANCE:
+            'Spot instances are '
+            f'not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
+            'Custom disk tiers'
+            f' is not supported in {_REPR}.',
+    }
+    # DO maximum node name length defined as <= 255
+    # https://docs.digitalocean.com/reference/api/api-reference/#operation/droplets_create
+    # 255 - 8 = 247 characters since
+    # our provisioner adds additional `-worker`.
+    _MAX_CLUSTER_NAME_LEN_LIMIT = 247
+    _regions: List[clouds.Region] = []
+
+    # Using the latest SkyPilot provisioner API to provision and check status.
+    PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
+    STATUS_VERSION = clouds.StatusVersion.SKYPILOT
+
+    @classmethod
+    def _unsupported_features_for_resources(
+        cls, resources: 'resources_lib.Resources'
+    ) -> Dict[clouds.CloudImplementationFeatures, str]:
+        """The features not supported based on the resources provided.
+
+        This method is used by check_features_are_supported() to check if the
+        cloud implementation supports all the requested features.
+
+        Returns:
+            A dict of {feature: reason} for the features not supported by the
+            cloud implementation.
+        """
+        del resources  # unused
+        return cls._CLOUD_UNSUPPORTED_FEATURES
+
+    @classmethod
+    def _max_cluster_name_length(cls) -> Optional[int]:
+        return cls._MAX_CLUSTER_NAME_LEN_LIMIT
+
+    @classmethod
+    def regions_with_offering(
+        cls,
+        instance_type: str,
+        accelerators: Optional[Dict[str, int]],
+        use_spot: bool,
+        region: Optional[str],
+        zone: Optional[str],
+    ) -> List[clouds.Region]:
+        assert zone is None, 'DO does not support zones.'
+        del accelerators, zone  # unused
+        if use_spot:
+            return []
+        regions = service_catalog.get_region_zones_for_instance_type(
+            instance_type, use_spot, 'DO')
+        if region is not None:
+            regions = [r for r in regions if r.name == region]
+        return regions
+
+    @classmethod
+    def get_vcpus_mem_from_instance_type(
+        cls,
+        instance_type: str,
+    ) -> Tuple[Optional[float], Optional[float]]:
+        return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
+                                                                clouds='DO')
+
+    @classmethod
+    def zones_provision_loop(
+        cls,
+        *,
+        region: str,
+        num_nodes: int,
+        instance_type: str,
+        accelerators: Optional[Dict[str, int]] = None,
+        use_spot: bool = False,
+    ) -> Iterator[None]:
+        del num_nodes  # unused
+        regions = cls.regions_with_offering(instance_type,
+                                            accelerators,
+                                            use_spot,
+                                            region=region,
+                                            zone=None)
+        for r in regions:
+            assert r.zones is None, r
+            yield r.zones
+
+    def instance_type_to_hourly_cost(
+        self,
+        instance_type: str,
+        use_spot: bool,
+        region: Optional[str] = None,
+        zone: Optional[str] = None,
+    ) -> float:
+        return service_catalog.get_hourly_cost(
+            instance_type,
+            use_spot=use_spot,
+            region=region,
+            zone=zone,
+            clouds='DO',
+        )
+
+    def accelerators_to_hourly_cost(
+        self,
+        accelerators: Dict[str, int],
+        use_spot: bool,
+        region: Optional[str] = None,
+        zone: Optional[str] = None,
+    ) -> float:
+        """Returns the hourly cost of the accelerators, in dollars/hour."""
+        # the acc price is include in the instance price.
+        del accelerators, use_spot, region, zone  # unused
+        return 0.0
+
+    def get_egress_cost(self, num_gigabytes: float) -> float:
+        return 0.0
+
+    def __repr__(self):
+        return self._REPR
+
+    @classmethod
+    def get_default_instance_type(
+        cls,
+        cpus: Optional[str] = None,
+        memory: Optional[str] = None,
+        disk_tier: Optional[resources_utils.DiskTier] = None,
+    ) -> Optional[str]:
+        """Returns the default instance type for DO."""
+        return service_catalog.get_default_instance_type(cpus=cpus,
+                                                         memory=memory,
+                                                         disk_tier=disk_tier,
+                                                         clouds='DO')
+
+    @classmethod
+    def get_accelerators_from_instance_type(
+            cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
+        return service_catalog.get_accelerators_from_instance_type(
+            instance_type, clouds='DO')
+
+    @classmethod
+    def get_zone_shell_cmd(cls) -> Optional[str]:
+        return None
+
+    def make_deploy_resources_variables(
+            self,
+            resources: 'resources_lib.Resources',
+            cluster_name: resources_utils.ClusterName,
+            region: 'clouds.Region',
+            zones: Optional[List['clouds.Zone']],
+            num_nodes: int,
+            dryrun: bool = False) -> Dict[str, Optional[str]]:
+        del zones, dryrun, cluster_name
+
+        r = resources
+        acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
+        if acc_dict is not None:
+            custom_resources = json.dumps(acc_dict, separators=(',', ':'))
+        else:
+            custom_resources = None
+        image_id = None
+        if (resources.image_id is not None and
+                resources.extract_docker_image() is None):
+            if None in resources.image_id:
+                image_id = resources.image_id[None]
+            else:
+                assert region.name in resources.image_id
+                image_id = resources.image_id[region.name]
+        return {
+            'instance_type': resources.instance_type,
+            'custom_resources': custom_resources,
+            'region': region.name,
+            **({
+                'image_id': image_id
+            } if image_id else {})
+        }
+
+    def _get_feasible_launchable_resources(
+        self, resources: 'resources_lib.Resources'
+    ) -> resources_utils.FeasibleResources:
+        """Returns a list of feasible resources for the given resources."""
+        if resources.use_spot:
+            # TODO: Add hints to all return values in this method to help
+            # users understand why the resources are not launchable.
+            return resources_utils.FeasibleResources([], [], None)
+        if resources.instance_type is not None:
+            assert resources.is_launchable(), resources
+            resources = resources.copy(accelerators=None)
+            return resources_utils.FeasibleResources([resources], [], None)
+
+        def _make(instance_list):
+            resource_list = []
+            for instance_type in instance_list:
+                r = resources.copy(
+                    cloud=DO(),
+                    instance_type=instance_type,
+                    accelerators=None,
+                    cpus=None,
+                )
+                resource_list.append(r)
+            return resource_list
+
+        # Currently, handle a filter on accelerators only.
+        accelerators = resources.accelerators
+        if accelerators is None:
+            # Return a default instance type
+            default_instance_type = DO.get_default_instance_type(
+                cpus=resources.cpus,
+                memory=resources.memory,
+                disk_tier=resources.disk_tier)
+            return resources_utils.FeasibleResources(
+                _make([default_instance_type]), [], None)
+
+        assert len(accelerators) == 1, resources
+        acc, acc_count = list(accelerators.items())[0]
+        (instance_list, fuzzy_candidate_list) = (
+            service_catalog.get_instance_type_for_accelerator(
+                acc,
+                acc_count,
+                use_spot=resources.use_spot,
+                cpus=resources.cpus,
+                memory=resources.memory,
+                region=resources.region,
+                zone=resources.zone,
+                clouds='DO',
+            ))
+        if instance_list is None:
+            return resources_utils.FeasibleResources([], fuzzy_candidate_list,
+                                                     None)
+        return resources_utils.FeasibleResources(_make(instance_list),
+                                                 fuzzy_candidate_list, None)
+
+    @classmethod
+    def check_credentials(cls) -> Tuple[bool, Optional[str]]:
+        """Verify that the user has valid credentials for DO."""
+        try:
+            # attempt to make a CURL request for listing instances
+            do_utils.client().droplets.list()
+        except do.exceptions().HttpResponseError as err:
+            return False, str(err)
+        except do_utils.DigitalOceanError as err:
+            return False, str(err)
+
+        return True, None
+
+    def get_credential_file_mounts(self) -> Dict[str, str]:
+        try:
+            do_utils.client()
+            return {
+                f'~/.config/doctl/{_CREDENTIAL_FILE}': do_utils.CREDENTIALS_PATH
+            }
+        except do_utils.DigitalOceanError:
+            return {}
+
+    @classmethod
+    def get_current_user_identity(cls) -> Optional[List[str]]:
+        # NOTE: used for very advanced SkyPilot functionality
+        # Can implement later if desired
+        return None
+
+    @classmethod
+    def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
+        del region
+        try:
+            response = do_utils.client().images.get(image_id=image_id)
+            return response['image']['size_gigabytes']
+        except do.exceptions().HttpResponseError as err:
+            raise do_utils.DigitalOceanError(
+                'HTTP error while retrieving size of '
+                f'image_id {response}: {err.error.message}') from err
+        except KeyError as err:
+            raise do_utils.DigitalOceanError(
+                f'No image_id `{image_id}` found') from err
+
+    def instance_type_exists(self, instance_type: str) -> bool:
+        return service_catalog.instance_type_exists(instance_type, 'DO')
+
+    def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
+        return service_catalog.validate_region_zone(region, zone, clouds='DO')
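
A minimal usage sketch (not part of the wheel) of how the new DO cloud class might be exercised programmatically, using only methods that appear in this diff: credentials are validated first, then instance sizing falls back to the catalog default.

# Hedged sketch: exercising the DO cloud class added above.
from sky.clouds.do import DO

ok, reason = DO.check_credentials()
if not ok:
    print(f'DO credentials unusable: {reason}')
else:
    # Catalog-backed default sizing when no accelerator is requested.
    print(DO.get_default_instance_type(cpus='8', memory='16'))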
sky/clouds/gcp.py CHANGED
@@ -132,6 +132,9 @@ class GCPIdentityType(enum.Enum):
 
     SHARED_CREDENTIALS_FILE = ''
 
+    def can_credential_expire(self) -> bool:
+        return self == GCPIdentityType.SHARED_CREDENTIALS_FILE
+
 
 @clouds.CLOUD_REGISTRY.register
 class GCP(clouds.Cloud):
@@ -863,6 +866,12 @@ class GCP(clouds.Cloud):
             pass
         return credentials
 
+    @functools.lru_cache(maxsize=1)
+    def can_credential_expire(self) -> bool:
+        identity_type = self._get_identity_type()
+        return identity_type is not None and identity_type.can_credential_expire(
+        )
+
     @classmethod
     def _get_identity_type(cls) -> Optional[GCPIdentityType]:
         try:
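
The new hook lets callers distinguish identities whose credentials can go stale (a shared credentials file) from service accounts. A hedged sketch of how a caller might consume it; `sync_to_cluster` is a stand-in helper, not a SkyPilot API, and only the two cloud methods it calls appear in this diff.

# Hedged sketch: skip credential re-syncing for identity types that never expire.
def sync_to_cluster(file_mounts: dict) -> None:
    print(f'would re-upload: {sorted(file_mounts)}')  # stand-in for the real upload

def maybe_refresh(cloud) -> None:
    if cloud.can_credential_expire():
        sync_to_cluster(cloud.get_credential_file_mounts())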
sky/clouds/kubernetes.py CHANGED
@@ -131,7 +131,7 @@ class Kubernetes(clouds.Cloud):
                 'Ignoring these contexts.')
 
     @classmethod
-    def _existing_allowed_contexts(cls) -> List[str]:
+    def existing_allowed_contexts(cls) -> List[str]:
         """Get existing allowed contexts.
 
         If None is returned in the list, it means that we are running in a pod
@@ -175,7 +175,7 @@ class Kubernetes(clouds.Cloud):
                               use_spot: bool, region: Optional[str],
                               zone: Optional[str]) -> List[clouds.Region]:
         del accelerators, zone, use_spot  # unused
-        existing_contexts = cls._existing_allowed_contexts()
+        existing_contexts = cls.existing_allowed_contexts()
 
         regions = []
         for context in existing_contexts:
@@ -591,7 +591,7 @@ class Kubernetes(clouds.Cloud):
     def check_credentials(cls) -> Tuple[bool, Optional[str]]:
         # Test using python API
         try:
-            existing_allowed_contexts = cls._existing_allowed_contexts()
+            existing_allowed_contexts = cls.existing_allowed_contexts()
         except ImportError as e:
             return (False,
                     f'{common_utils.format_exception(e, use_bracket=True)}')
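
With the rename from `_existing_allowed_contexts` to a public classmethod, other modules can enumerate the usable kubeconfig contexts directly. A hedged sketch:

# Hedged sketch: list usable contexts via the now-public classmethod.
from sky.clouds.kubernetes import Kubernetes

for ctx in Kubernetes.existing_allowed_contexts():
    # Per the docstring above, None means we are running inside a pod.
    print(ctx if ctx is not None else 'in-cluster context')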
sky/clouds/oci.py CHANGED
@@ -232,6 +232,14 @@ class OCI(clouds.Cloud):
         listing_id = None
         res_ver = None
 
+        os_type = None
+        if ':' in image_id:
+            # OS type provided in the --image-id. This is the case where
+            # custom image's ocid provided in the --image-id parameter.
+            # - ocid1.image...aaa:oraclelinux (os type is oraclelinux)
+            # - ocid1.image...aaa (OS not provided)
+            image_id, os_type = image_id.replace(' ', '').split(':')
+
         cpus = resources.cpus
         instance_type_arr = resources.instance_type.split(
             oci_utils.oci_config.INSTANCE_TYPE_RES_SPERATOR)
@@ -297,15 +305,18 @@ class OCI(clouds.Cloud):
             cpus=None if cpus is None else float(cpus),
             disk_tier=resources.disk_tier)
 
-        image_str = self._get_image_str(image_id=resources.image_id,
-                                        instance_type=resources.instance_type,
-                                        region=region.name)
-
-        # pylint: disable=import-outside-toplevel
-        from sky.clouds.service_catalog import oci_catalog
-        os_type = oci_catalog.get_image_os_from_tag(tag=image_str,
-                                                    region=region.name)
-        logger.debug(f'OS type for the image {image_str} is {os_type}')
+        if os_type is None:
+            # OS type is not determined yet. So try to get it from vms.csv
+            image_str = self._get_image_str(
+                image_id=resources.image_id,
+                instance_type=resources.instance_type,
+                region=region.name)
+
+            # pylint: disable=import-outside-toplevel
+            from sky.clouds.service_catalog import oci_catalog
+            os_type = oci_catalog.get_image_os_from_tag(tag=image_str,
+                                                        region=region.name)
+            logger.debug(f'OS type for the image {image_id} is {os_type}')
 
         return {
             'instance_type': instance_type,
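
The new parsing convention accepts an optional OS type suffix on the image OCID. A standalone sketch of the split logic added above (not part of the wheel):

# Standalone sketch of the `ocid:os_type` parsing convention added above.
from typing import Optional, Tuple

def split_image_id(image_id: str) -> Tuple[str, Optional[str]]:
    os_type = None
    if ':' in image_id:
        image_id, os_type = image_id.replace(' ', '').split(':')
    return image_id, os_type

print(split_image_id('ocid1.image...aaa:oraclelinux'))  # ('ocid1.image...aaa', 'oraclelinux')
print(split_image_id('ocid1.image...aaa'))              # ('ocid1.image...aaa', None)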
sky/clouds/service_catalog/__init__.py CHANGED
@@ -10,6 +10,7 @@ from sky.clouds.service_catalog.constants import CATALOG_DIR
 from sky.clouds.service_catalog.constants import CATALOG_SCHEMA_VERSION
 from sky.clouds.service_catalog.constants import HOSTED_CATALOG_DIR_URL
 from sky.utils import resources_utils
+from sky.utils import subprocess_utils
 
 if typing.TYPE_CHECKING:
     from sky.clouds import cloud
@@ -31,8 +32,7 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs):
     if single:
         clouds = [clouds]  # type: ignore
 
-    results = []
-    for cloud in clouds:
+    def _execute_catalog_method(cloud: str):
         try:
             cloud_module = importlib.import_module(
                 f'sky.clouds.service_catalog.{cloud.lower()}_catalog')
@@ -46,7 +46,11 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs):
             raise AttributeError(
                 f'Module "{cloud}_catalog" does not '
                 f'implement the "{method_name}" method') from None
-        results.append(method(*args, **kwargs))
+        return method(*args, **kwargs)
+
+    results = subprocess_utils.run_in_parallel(_execute_catalog_method,
+                                               args=list(clouds),
+                                               num_threads=len(clouds))
     if single:
         return results[0]
     return results
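
The sequential loop is replaced by a thread-pool fan-out so per-cloud catalog modules are loaded concurrently. A hedged sketch of the same pattern with a stand-in worker; it assumes run_in_parallel returns results in input order, which the single-cloud `results[0]` fast path above relies on.

# Hedged sketch of the fan-out pattern above, with a stand-in worker.
from sky.utils import subprocess_utils

def _lookup(cloud: str) -> str:
    return cloud.upper()  # stand-in for importing `<cloud>_catalog` and calling a method

results = subprocess_utils.run_in_parallel(_lookup,
                                           args=['aws', 'gcp', 'do'],
                                           num_threads=3)
print(results)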
sky/clouds/service_catalog/constants.py CHANGED
@@ -4,4 +4,4 @@ CATALOG_SCHEMA_VERSION = 'v6'
 CATALOG_DIR = '~/.sky/catalogs'
 ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
               'kubernetes', 'runpod', 'vsphere', 'cudo', 'fluidstack',
-              'paperspace')
+              'paperspace', 'do')
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py CHANGED
@@ -47,10 +47,6 @@ TPU_RETRY_CNT = 3
 TPU_V4_ZONES = ['us-central2-b']
 # TPU v3 pods are available in us-east1-d, but hidden in the skus.
 # We assume the TPU prices are the same as us-central1.
-# TPU v6e's pricing info is not available on the SKUs. However, in
-# https://cloud.google.com/tpu/pricing, it listed the price for 4 regions:
-# us-east1, us-east5, europe-west4, and asia-northeast1. We hardcode them here
-# and filtered out the other regions (us-central{1,2}, us-south1).
 HIDDEN_TPU_DF = pd.read_csv(
     io.StringIO(
         textwrap.dedent("""\
@@ -62,49 +58,10 @@ HIDDEN_TPU_DF = pd.read_csv(
         ,tpu-v3-512,1,,,tpu-v3-512,512.0,153.6,us-east1,us-east1-d
         ,tpu-v3-1024,1,,,tpu-v3-1024,1024.0,307.2,us-east1,us-east1-d
         ,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
-        ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-b
-        ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-c
-        ,tpu-v6e-1,1,,,tpu-v6e-1,2.97,,europe-west4,europe-west4-a
-        ,tpu-v6e-1,1,,,tpu-v6e-1,3.24,,asia-northeast1,asia-northeast1-b
-        ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east1,us-east1-d
-        ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-b
-        ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-c
-        ,tpu-v6e-4,1,,,tpu-v6e-4,11.88,,europe-west4,europe-west4-a
-        ,tpu-v6e-4,1,,,tpu-v6e-4,12.96,,asia-northeast1,asia-northeast1-b
-        ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east1,us-east1-d
-        ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-b
-        ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-c
-        ,tpu-v6e-8,1,,,tpu-v6e-8,23.76,,europe-west4,europe-west4-a
-        ,tpu-v6e-8,1,,,tpu-v6e-8,25.92,,asia-northeast1,asia-northeast1-b
-        ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east1,us-east1-d
-        ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-b
-        ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-c
-        ,tpu-v6e-16,1,,,tpu-v6e-16,47.52,,europe-west4,europe-west4-a
-        ,tpu-v6e-16,1,,,tpu-v6e-16,51.84,,asia-northeast1,asia-northeast1-b
-        ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east1,us-east1-d
-        ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-b
-        ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-c
-        ,tpu-v6e-32,1,,,tpu-v6e-32,95.04,,europe-west4,europe-west4-a
-        ,tpu-v6e-32,1,,,tpu-v6e-32,103.68,,asia-northeast1,asia-northeast1-b
-        ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east1,us-east1-d
-        ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-b
-        ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-c
-        ,tpu-v6e-64,1,,,tpu-v6e-64,190.08,,europe-west4,europe-west4-a
-        ,tpu-v6e-64,1,,,tpu-v6e-64,207.36,,asia-northeast1,asia-northeast1-b
-        ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east1,us-east1-d
-        ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-b
-        ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-c
-        ,tpu-v6e-128,1,,,tpu-v6e-128,380.16,,europe-west4,europe-west4-a
-        ,tpu-v6e-128,1,,,tpu-v6e-128,414.72,,asia-northeast1,asia-northeast1-b
-        ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east1,us-east1-d
-        ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-b
-        ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-c
-        ,tpu-v6e-256,1,,,tpu-v6e-256,760.32,,europe-west4,europe-west4-a
-        ,tpu-v6e-256,1,,,tpu-v6e-256,829.44,,asia-northeast1,asia-northeast1-b
-        ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east1,us-east1-d
         """)))
 
-TPU_V6E_MISSING_REGIONS = ['us-central1', 'us-central2', 'us-south1']
+# TPU V6e price for us-central2 is missing in the SKUs.
+TPU_V6E_MISSING_REGIONS = ['us-central2']
 
 # TPU V5 is not visible in specific zones. We hardcode the missing zones here.
 # NOTE(dev): Keep the zones and the df in sync.
@@ -670,6 +627,8 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
                 return 'TpuV5p'
             assert tpu_version == 'v5litepod', tpu_version
             return 'TpuV5e'
+        if tpu_version.startswith('v6e'):
+            return 'TpuV6e'
         return f'Tpu-{tpu_version}'
 
     def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]:
@@ -684,10 +643,10 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
         # whether the TPU is a single device or a pod.
         # For TPU-v4, the pricing is uniform, and thus the pricing API
         # only provides the price of TPU-v4 pods.
-        # The price shown for v5 TPU is per chip hour, so there is no 'Pod'
-        # keyword in the description.
+        # The price shown for v5 & v6e TPU is per chip hour, so there is
+        # no 'Pod' keyword in the description.
         is_pod = ((num_cores > 8 or tpu_version == 'v4') and
-                  not tpu_version.startswith('v5'))
+                  not tpu_version.startswith('v5') and tpu_version != 'v6e')
 
         for sku in gce_skus + tpu_skus:
             if tpu_region not in sku['serviceRegions']:
@@ -718,7 +677,9 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
             # for v5e. Reference here:
             # https://cloud.google.com/tpu/docs/v5p#using-accelerator-type
            # https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config
-            core_per_sku = (1 if tpu_version == 'v5litepod' else
+            # v6e is also per chip price. Reference here:
+            # https://cloud.google.com/tpu/docs/v6e#configurations
+            core_per_sku = (1 if tpu_version in ['v5litepod', 'v6e'] else
                             2 if tpu_version == 'v5p' else 8)
             tpu_core_price = tpu_device_price / core_per_sku
             tpu_price = num_cores * tpu_core_price
@@ -738,8 +699,6 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
         spot_str = 'spot ' if spot else ''
         print(f'The {spot_str}price of {tpu_name} in {tpu_region} is '
               'not found in SKUs or hidden TPU price DF.')
-        # TODO(tian): Hack. Should investigate how to retrieve the price
-        # for TPU-v6e.
         if (tpu_name.startswith('tpu-v6e') and
                 tpu_region in TPU_V6E_MISSING_REGIONS):
             if not spot:
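
The per-core arithmetic above can be summarized as: v5e and v6e SKUs are priced per chip, v5p per 2 cores, and everything else per 8-core device. A small worked sketch with illustrative (not real) prices:

# Worked sketch of the core_per_sku arithmetic above; prices are illustrative.
def tpu_hourly_price(tpu_version: str, num_cores: int, tpu_device_price: float) -> float:
    core_per_sku = (1 if tpu_version in ['v5litepod', 'v6e'] else
                    2 if tpu_version == 'v5p' else 8)
    tpu_core_price = tpu_device_price / core_per_sku
    return num_cores * tpu_core_price

print(tpu_hourly_price('v6e', 8, 2.7))   # 8 chips * 2.7 per chip-hour = 21.6
print(tpu_hourly_price('v5p', 8, 2.0))   # 8 cores * (2.0 / 2 cores) = 8.0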
sky/clouds/service_catalog/do_catalog.py ADDED
@@ -0,0 +1,111 @@
+"""Digital ocean service catalog.
+
+This module loads the service catalog file and can be used to
+query instance types and pricing information for digital ocean.
+"""
+
+import typing
+from typing import Dict, List, Optional, Tuple, Union
+
+from sky.clouds.service_catalog import common
+from sky.utils import ux_utils
+
+if typing.TYPE_CHECKING:
+    from sky.clouds import cloud
+
+_df = common.read_catalog('do/vms.csv')
+
+
+def instance_type_exists(instance_type: str) -> bool:
+    return common.instance_type_exists_impl(_df, instance_type)
+
+
+def validate_region_zone(
+        region: Optional[str],
+        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
+    if zone is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('DO does not support zones.')
+    return common.validate_region_zone_impl('DO', _df, region, zone)
+
+
+def get_hourly_cost(
+    instance_type: str,
+    use_spot: bool = False,
+    region: Optional[str] = None,
+    zone: Optional[str] = None,
+) -> float:
+    """Returns the cost, or the cheapest cost among all zones for spot."""
+    if zone is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('DO does not support zones.')
+    return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
+                                       zone)
+
+
+def get_vcpus_mem_from_instance_type(
+        instance_type: str,) -> Tuple[Optional[float], Optional[float]]:
+    return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
+
+
+def get_default_instance_type(
+    cpus: Optional[str] = None,
+    memory: Optional[str] = None,
+    disk_tier: Optional[str] = None,
+) -> Optional[str]:
+    # NOTE: After expanding catalog to multiple entries, you may
+    # want to specify a default instance type or family.
+    del disk_tier  # unused
+    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
+
+
+def get_accelerators_from_instance_type(
+        instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
+    return common.get_accelerators_from_instance_type_impl(_df, instance_type)
+
+
+def get_instance_type_for_accelerator(
+    acc_name: str,
+    acc_count: int,
+    cpus: Optional[str] = None,
+    memory: Optional[str] = None,
+    use_spot: bool = False,
+    region: Optional[str] = None,
+    zone: Optional[str] = None,
+) -> Tuple[Optional[List[str]], List[str]]:
+    """Returns a list of instance types that have the given accelerator."""
+    if zone is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('DO does not support zones.')
+    return common.get_instance_type_for_accelerator_impl(
+        df=_df,
+        acc_name=acc_name,
+        acc_count=acc_count,
+        cpus=cpus,
+        memory=memory,
+        use_spot=use_spot,
+        region=region,
+        zone=zone,
+    )
+
+
+def get_region_zones_for_instance_type(instance_type: str,
+                                       use_spot: bool) -> List['cloud.Region']:
+    df = _df[_df['InstanceType'] == instance_type]
+    return common.get_region_zones(df, use_spot)
+
+
+def list_accelerators(
+    gpus_only: bool,
+    name_filter: Optional[str],
+    region_filter: Optional[str],
+    quantity_filter: Optional[int],
+    case_sensitive: bool = True,
+    all_regions: bool = False,
+    require_price: bool = True,
+) -> Dict[str, List[common.InstanceTypeInfo]]:
+    """Returns all instance types in DO offering GPUs."""
+    del require_price  # unused
+    return common.list_accelerators_impl('DO', _df, gpus_only, name_filter,
+                                         region_filter, quantity_filter,
+                                         case_sensitive, all_regions)
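
Hedged usage sketch: these functions are normally reached through the generic service_catalog entry points with clouds='DO' (the dispatch shown earlier in _map_clouds_catalog), mirroring the calls the DO cloud class in this diff already makes.

# Hedged sketch: querying the DO catalog through the generic dispatcher.
from sky.clouds import service_catalog

inst = service_catalog.get_default_instance_type(cpus='8', clouds='DO')
if inst is not None:
    print(inst, service_catalog.get_hourly_cost(inst, use_spot=False, clouds='DO'))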
sky/clouds/service_catalog/kubernetes_catalog.py CHANGED
@@ -115,6 +115,16 @@ def _list_accelerators(
 
     If the user does not have sufficient permissions to list pods in all
     namespaces, the function will return free GPUs as -1.
+
+    Returns:
+        A tuple of three dictionaries:
+        - qtys_map: Dict mapping accelerator names to lists of InstanceTypeInfo
+          objects with quantity information.
+        - total_accelerators_capacity: Dict mapping accelerator names to their
+          total capacity in the cluster.
+        - total_accelerators_available: Dict mapping accelerator names to their
+          current availability. Returns -1 for each accelerator if
+          realtime=False or if insufficient permissions.
     """
     # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
     # function from kubernetes_utils.
@@ -243,6 +253,10 @@ def _list_accelerators(
 
                 accelerators_available = accelerator_count - allocated_qty
 
+                # Initialize the entry if it doesn't exist yet
+                if accelerator_name not in total_accelerators_available:
+                    total_accelerators_available[accelerator_name] = 0
+
                 if accelerators_available >= min_quantity_filter:
                     quantized_availability = min_quantity_filter * (
                         accelerators_available // min_quantity_filter)
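
The added guard is the usual initialize-before-accumulate pattern, so an accelerator with zero free units still gets an entry in the availability map. A generic, self-contained sketch of the same idea:

# Generic sketch of the initialize-before-accumulate fix above.
total_available: dict = {}
for name, free in [('H100', 4), ('H100', 0), ('A100', 0)]:
    if name not in total_available:
        total_available[name] = 0  # zero-availability accelerators still appear
    if free > 0:
        total_available[name] += free
print(total_available)  # {'H100': 4, 'A100': 0}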