skypilot-nightly 1.0.0.dev20241025__py3-none-any.whl → 1.0.0.dev20241027__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +15 -0
  3. sky/cli.py +1 -1
  4. sky/clouds/aws.py +4 -7
  5. sky/clouds/azure.py +7 -9
  6. sky/clouds/cloud.py +11 -7
  7. sky/clouds/cudo.py +4 -7
  8. sky/clouds/fluidstack.py +4 -7
  9. sky/clouds/gcp.py +2 -2
  10. sky/clouds/ibm.py +4 -7
  11. sky/clouds/kubernetes.py +4 -7
  12. sky/clouds/lambda_cloud.py +4 -7
  13. sky/clouds/oci.py +9 -8
  14. sky/clouds/paperspace.py +4 -7
  15. sky/clouds/runpod.py +4 -7
  16. sky/clouds/scp.py +4 -7
  17. sky/clouds/service_catalog/__init__.py +1 -1
  18. sky/clouds/service_catalog/aws_catalog.py +2 -2
  19. sky/clouds/service_catalog/azure_catalog.py +16 -5
  20. sky/clouds/service_catalog/common.py +15 -6
  21. sky/clouds/service_catalog/cudo_catalog.py +2 -2
  22. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +21 -11
  23. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  24. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  25. sky/clouds/service_catalog/lambda_catalog.py +2 -2
  26. sky/clouds/service_catalog/oci_catalog.py +2 -2
  27. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  28. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  29. sky/clouds/service_catalog/scp_catalog.py +2 -2
  30. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  31. sky/clouds/vsphere.py +4 -7
  32. sky/jobs/controller.py +29 -34
  33. sky/provision/kubernetes/instance.py +66 -2
  34. sky/resources.py +1 -1
  35. sky/utils/resources_utils.py +13 -1
  36. {skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241027.dist-info}/METADATA +1 -1
  37. {skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241027.dist-info}/RECORD +41 -41
  38. {skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241027.dist-info}/LICENSE +0 -0
  39. {skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241027.dist-info}/WHEEL +0 -0
  40. {skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241027.dist-info}/entry_points.txt +0 -0
  41. {skypilot_nightly-1.0.0.dev20241025.dist-info → skypilot_nightly-1.0.0.dev20241027.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '057bc4b44755ac1e9dadc680e022c369e8ddff52'
8
+ _SKYPILOT_COMMIT_SHA = 'c0c17483d1f692ad639144050f5f6fa0966e47a5'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241025'
38
+ __version__ = '1.0.0.dev20241027'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -2713,6 +2713,21 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2713
2713
  f' Existing:\t{handle.launched_nodes}x '
2714
2714
  f'{handle.launched_resources}\n'
2715
2715
  f'{mismatch_str}')
2716
+ else:
2717
+ # For fractional acc count clusters, we round up the number of accs
2718
+ # to 1 (sky/utils/resources_utils.py::make_ray_custom_resources_str)
2719
+ # Here we scale the required acc count to (required / launched) * 1
2720
+ # so the total number of accs is the same as the requested number.
2721
+ launched_accs = launched_resources.accelerators
2722
+ if (launched_accs is not None and
2723
+ valid_resource.accelerators is not None):
2724
+ for _, count in launched_accs.items():
2725
+ if isinstance(count, float) and not count.is_integer():
2726
+ valid_resource = valid_resource.copy(
2727
+ accelerators={
2728
+ k: v / count
2729
+ for k, v in valid_resource.accelerators.items()
2730
+ })
2716
2731
  return valid_resource
2717
2732
 
2718
2733
  def _provision(
sky/cli.py CHANGED
@@ -3519,7 +3519,7 @@ def jobs():
3519
3519
  default=None,
3520
3520
  type=str,
3521
3521
  hidden=True,
3522
- help=('Alias for --name, the name of the spot job.'))
3522
+ help=('Alias for --name, the name of the managed job.'))
3523
3523
  @click.option('--job-recovery',
3524
3524
  default=None,
3525
3525
  type=str,
sky/clouds/aws.py CHANGED
@@ -2,13 +2,12 @@
2
2
  import enum
3
3
  import fnmatch
4
4
  import functools
5
- import json
6
5
  import os
7
6
  import re
8
7
  import subprocess
9
8
  import time
10
9
  import typing
11
- from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
10
+ from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
12
11
 
13
12
  from sky import clouds
14
13
  from sky import exceptions
@@ -383,7 +382,7 @@ class AWS(clouds.Cloud):
383
382
  def get_accelerators_from_instance_type(
384
383
  cls,
385
384
  instance_type: str,
386
- ) -> Optional[Dict[str, int]]:
385
+ ) -> Optional[Dict[str, Union[int, float]]]:
387
386
  return service_catalog.get_accelerators_from_instance_type(
388
387
  instance_type, clouds='aws')
389
388
 
@@ -411,10 +410,8 @@ class AWS(clouds.Cloud):
411
410
  r = resources
412
411
  # r.accelerators is cleared but .instance_type encodes the info.
413
412
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
414
- if acc_dict is not None:
415
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
416
- else:
417
- custom_resources = None
413
+ custom_resources = resources_utils.make_ray_custom_resources_str(
414
+ acc_dict)
418
415
 
419
416
  if r.extract_docker_image() is not None:
420
417
  image_id_to_use = None
sky/clouds/azure.py CHANGED
@@ -1,12 +1,11 @@
1
1
  """Azure."""
2
2
  import functools
3
- import json
4
3
  import os
5
4
  import re
6
5
  import subprocess
7
6
  import textwrap
8
7
  import typing
9
- from typing import Any, Dict, Iterator, List, Optional, Tuple
8
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
10
9
 
11
10
  import colorama
12
11
 
@@ -39,9 +38,9 @@ _DEFAULT_AZURE_UBUNTU_HPC_IMAGE_GB = 30
39
38
  _DEFAULT_AZURE_UBUNTU_2004_IMAGE_GB = 150
40
39
  _DEFAULT_SKYPILOT_IMAGE_GB = 30
41
40
 
42
- _DEFAULT_CPU_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
43
- _DEFAULT_GPU_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
44
- _DEFAULT_V1_IMAGE_ID = 'skypilot:v1-ubuntu-2004'
41
+ _DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-v2'
42
+ _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2'
43
+ _DEFAULT_V1_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v1'
45
44
  _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
46
45
  _FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
47
46
 
@@ -272,7 +271,7 @@ class Azure(clouds.Cloud):
272
271
  def get_accelerators_from_instance_type(
273
272
  cls,
274
273
  instance_type: str,
275
- ) -> Optional[Dict[str, int]]:
274
+ ) -> Optional[Dict[str, Union[int, float]]]:
276
275
  return service_catalog.get_accelerators_from_instance_type(
277
276
  instance_type, clouds='azure')
278
277
 
@@ -304,10 +303,9 @@ class Azure(clouds.Cloud):
304
303
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
305
304
  acc_count = None
306
305
  if acc_dict is not None:
307
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
308
306
  acc_count = str(sum(acc_dict.values()))
309
- else:
310
- custom_resources = None
307
+ custom_resources = resources_utils.make_ray_custom_resources_str(
308
+ acc_dict)
311
309
 
312
310
  if (resources.image_id is None or
313
311
  resources.extract_docker_image() is not None):
sky/clouds/cloud.py CHANGED
@@ -9,8 +9,9 @@ reused across cloud object creation.
9
9
  """
10
10
  import collections
11
11
  import enum
12
+ import math
12
13
  import typing
13
- from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple
14
+ from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
14
15
 
15
16
  from sky import exceptions
16
17
  from sky import skypilot_config
@@ -306,7 +307,7 @@ class Cloud:
306
307
  def get_accelerators_from_instance_type(
307
308
  cls,
308
309
  instance_type: str,
309
- ) -> Optional[Dict[str, int]]:
310
+ ) -> Optional[Dict[str, Union[int, float]]]:
310
311
  """Returns {acc: acc_count} held by 'instance_type', if any."""
311
312
  raise NotImplementedError
312
313
 
@@ -673,8 +674,9 @@ class Cloud:
673
674
  assert resources.is_launchable(), resources
674
675
 
675
676
  def _equal_accelerators(
676
- acc_requested: Optional[Dict[str, int]],
677
- acc_from_instance_type: Optional[Dict[str, int]]) -> bool:
677
+ acc_requested: Optional[Dict[str, Union[int, float]]],
678
+ acc_from_instance_type: Optional[Dict[str, Union[int,
679
+ float]]]) -> bool:
678
680
  """Check the requested accelerators equals to the instance type
679
681
 
680
682
  Check the requested accelerators equals to the accelerators
@@ -689,12 +691,14 @@ class Cloud:
689
691
  for acc in acc_requested:
690
692
  if acc not in acc_from_instance_type:
691
693
  return False
692
- if acc_requested[acc] != acc_from_instance_type[acc]:
694
+ # Avoid float point precision issue.
695
+ if not math.isclose(acc_requested[acc],
696
+ acc_from_instance_type[acc]):
693
697
  return False
694
698
  return True
695
699
 
696
- acc_from_instance_type = (cls.get_accelerators_from_instance_type(
697
- resources.instance_type))
700
+ acc_from_instance_type = cls.get_accelerators_from_instance_type(
701
+ resources.instance_type)
698
702
  if not _equal_accelerators(resources.accelerators,
699
703
  acc_from_instance_type):
700
704
  with ux_utils.print_exception_no_traceback():
sky/clouds/cudo.py CHANGED
@@ -1,8 +1,7 @@
1
1
  """Cudo Compute"""
2
- import json
3
2
  import subprocess
4
3
  import typing
5
- from typing import Dict, Iterator, List, Optional, Tuple
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
6
5
 
7
6
  from sky import clouds
8
7
  from sky.clouds import service_catalog
@@ -183,7 +182,7 @@ class Cudo(clouds.Cloud):
183
182
  def get_accelerators_from_instance_type(
184
183
  cls,
185
184
  instance_type: str,
186
- ) -> Optional[Dict[str, int]]:
185
+ ) -> Optional[Dict[str, Union[int, float]]]:
187
186
  return service_catalog.get_accelerators_from_instance_type(
188
187
  instance_type, clouds='cudo')
189
188
 
@@ -202,10 +201,8 @@ class Cudo(clouds.Cloud):
202
201
  del zones, cluster_name # unused
203
202
  r = resources
204
203
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
205
- if acc_dict is not None:
206
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
207
- else:
208
- custom_resources = None
204
+ custom_resources = resources_utils.make_ray_custom_resources_str(
205
+ acc_dict)
209
206
 
210
207
  return {
211
208
  'instance_type': resources.instance_type,
sky/clouds/fluidstack.py CHANGED
@@ -1,8 +1,7 @@
1
1
  """Fluidstack Cloud."""
2
- import json
3
2
  import os
4
3
  import typing
5
- from typing import Dict, Iterator, List, Optional, Tuple
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
6
5
 
7
6
  import requests
8
7
 
@@ -155,7 +154,7 @@ class Fluidstack(clouds.Cloud):
155
154
  def get_accelerators_from_instance_type(
156
155
  cls,
157
156
  instance_type: str,
158
- ) -> Optional[Dict[str, int]]:
157
+ ) -> Optional[Dict[str, Union[int, float]]]:
159
158
  return service_catalog.get_accelerators_from_instance_type(
160
159
  instance_type, clouds='fluidstack')
161
160
 
@@ -184,10 +183,8 @@ class Fluidstack(clouds.Cloud):
184
183
 
185
184
  r = resources
186
185
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
187
- if acc_dict is not None:
188
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
189
- else:
190
- custom_resources = None
186
+ custom_resources = resources_utils.make_ray_custom_resources_str(
187
+ acc_dict)
191
188
 
192
189
  return {
193
190
  'instance_type': resources.instance_type,
sky/clouds/gcp.py CHANGED
@@ -7,7 +7,7 @@ import re
7
7
  import subprocess
8
8
  import time
9
9
  import typing
10
- from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
10
+ from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
11
11
 
12
12
  import colorama
13
13
 
@@ -669,7 +669,7 @@ class GCP(clouds.Cloud):
669
669
  def get_accelerators_from_instance_type(
670
670
  cls,
671
671
  instance_type: str,
672
- ) -> Optional[Dict[str, int]]:
672
+ ) -> Optional[Dict[str, Union[int, float]]]:
673
673
  # GCP handles accelerators separately from regular instance types,
674
674
  # hence return none here.
675
675
  return None
sky/clouds/ibm.py CHANGED
@@ -1,8 +1,7 @@
1
1
  """IBM Web Services."""
2
- import json
3
2
  import os
4
3
  import typing
5
- from typing import Any, Dict, Iterator, List, Optional, Tuple
4
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
6
5
 
7
6
  import colorama
8
7
 
@@ -206,10 +205,8 @@ class IBM(clouds.Cloud):
206
205
  'IBM does not currently support spot instances in this framework'
207
206
 
208
207
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
209
- if acc_dict is not None:
210
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
211
- else:
212
- custom_resources = None
208
+ custom_resources = resources_utils.make_ray_custom_resources_str(
209
+ acc_dict)
213
210
 
214
211
  instance_resources = _get_profile_resources(r.instance_type)
215
212
 
@@ -247,7 +244,7 @@ class IBM(clouds.Cloud):
247
244
  def get_accelerators_from_instance_type(
248
245
  cls,
249
246
  instance_type: str,
250
- ) -> Optional[Dict[str, int]]:
247
+ ) -> Optional[Dict[str, Union[int, float]]]:
251
248
  """Returns {acc: acc_count} held by 'instance_type', if any."""
252
249
  return service_catalog.get_accelerators_from_instance_type(
253
250
  instance_type, clouds='ibm')
sky/clouds/kubernetes.py CHANGED
@@ -1,10 +1,9 @@
1
1
  """Kubernetes."""
2
2
  import functools
3
- import json
4
3
  import os
5
4
  import re
6
5
  import typing
7
- from typing import Dict, Iterator, List, Optional, Tuple
6
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
8
7
 
9
8
  from sky import clouds
10
9
  from sky import sky_logging
@@ -271,7 +270,7 @@ class Kubernetes(clouds.Cloud):
271
270
  def get_accelerators_from_instance_type(
272
271
  cls,
273
272
  instance_type: str,
274
- ) -> Optional[Dict[str, int]]:
273
+ ) -> Optional[Dict[str, Union[int, float]]]:
275
274
  inst = kubernetes_utils.KubernetesInstanceType.from_instance_type(
276
275
  instance_type)
277
276
  return {
@@ -328,10 +327,8 @@ class Kubernetes(clouds.Cloud):
328
327
 
329
328
  r = resources
330
329
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
331
- if acc_dict is not None:
332
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
333
- else:
334
- custom_resources = None
330
+ custom_resources = resources_utils.make_ray_custom_resources_str(
331
+ acc_dict)
335
332
 
336
333
  # resources.memory and cpus are None if they are not explicitly set.
337
334
  # We fetch the default values for the instance type in that case.
sky/clouds/lambda_cloud.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """Lambda Cloud."""
2
- import json
3
2
  import typing
4
- from typing import Dict, Iterator, List, Optional, Tuple
3
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
5
4
 
6
5
  import requests
7
6
 
@@ -136,7 +135,7 @@ class Lambda(clouds.Cloud):
136
135
  def get_accelerators_from_instance_type(
137
136
  cls,
138
137
  instance_type: str,
139
- ) -> Optional[Dict[str, int]]:
138
+ ) -> Optional[Dict[str, Union[int, float]]]:
140
139
  return service_catalog.get_accelerators_from_instance_type(
141
140
  instance_type, clouds='lambda')
142
141
 
@@ -164,10 +163,8 @@ class Lambda(clouds.Cloud):
164
163
 
165
164
  r = resources
166
165
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
167
- if acc_dict is not None:
168
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
169
- else:
170
- custom_resources = None
166
+ custom_resources = resources_utils.make_ray_custom_resources_str(
167
+ acc_dict)
171
168
 
172
169
  resources_vars = {
173
170
  'instance_type': resources.instance_type,
sky/clouds/oci.py CHANGED
@@ -20,11 +20,10 @@ History:
20
20
  - Hysun He (hysun.he@oracle.com) @ Oct 13, 2024:
21
21
  Support more OS types additional to ubuntu for OCI resources.
22
22
  """
23
- import json
24
23
  import logging
25
24
  import os
26
25
  import typing
27
- from typing import Dict, Iterator, List, Optional, Tuple
26
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
28
27
 
29
28
  from sky import clouds
30
29
  from sky import exceptions
@@ -193,7 +192,7 @@ class OCI(clouds.Cloud):
193
192
  def get_accelerators_from_instance_type(
194
193
  cls,
195
194
  instance_type: str,
196
- ) -> Optional[Dict[str, int]]:
195
+ ) -> Optional[Dict[str, Union[int, float]]]:
197
196
  return service_catalog.get_accelerators_from_instance_type(
198
197
  instance_type, clouds='oci')
199
198
 
@@ -213,10 +212,8 @@ class OCI(clouds.Cloud):
213
212
 
214
213
  acc_dict = self.get_accelerators_from_instance_type(
215
214
  resources.instance_type)
216
- if acc_dict is not None:
217
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
218
- else:
219
- custom_resources = None
215
+ custom_resources = resources_utils.make_ray_custom_resources_str(
216
+ acc_dict)
220
217
 
221
218
  image_str = self._get_image_id(resources.image_id, region.name,
222
219
  resources.instance_type)
@@ -468,7 +465,11 @@ class OCI(clouds.Cloud):
468
465
  api_key_file = oci_cfg[
469
466
  'key_file'] if 'key_file' in oci_cfg else 'BadConf'
470
467
  sky_cfg_file = oci_utils.oci_config.get_sky_user_config_file()
471
- except (ImportError, oci_adaptor.oci.exceptions.ConfigFileNotFound):
468
+ # Must catch ImportError before any oci_adaptor.oci.exceptions
469
+ # because oci_adaptor.oci.exceptions can throw ImportError.
470
+ except ImportError:
471
+ return {}
472
+ except oci_adaptor.oci.exceptions.ConfigFileNotFound:
472
473
  return {}
473
474
 
474
475
  # OCI config and API key file are mandatory
sky/clouds/paperspace.py CHANGED
@@ -1,8 +1,7 @@
1
1
  """ Paperspace Cloud. """
2
2
 
3
- import json
4
3
  import typing
5
- from typing import Dict, Iterator, List, Optional, Tuple
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
6
5
 
7
6
  import requests
8
7
 
@@ -162,7 +161,7 @@ class Paperspace(clouds.Cloud):
162
161
 
163
162
  @classmethod
164
163
  def get_accelerators_from_instance_type(
165
- cls, instance_type: str) -> Optional[Dict[str, int]]:
164
+ cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
166
165
  return service_catalog.get_accelerators_from_instance_type(
167
166
  instance_type, clouds='paperspace')
168
167
 
@@ -181,10 +180,8 @@ class Paperspace(clouds.Cloud):
181
180
 
182
181
  r = resources
183
182
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
184
- if acc_dict is not None:
185
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
186
- else:
187
- custom_resources = None
183
+ custom_resources = resources_utils.make_ray_custom_resources_str(
184
+ acc_dict)
188
185
 
189
186
  return {
190
187
  'instance_type': resources.instance_type,
sky/clouds/runpod.py CHANGED
@@ -1,8 +1,7 @@
1
1
  """ RunPod Cloud. """
2
2
 
3
- import json
4
3
  import typing
5
- from typing import Dict, Iterator, List, Optional, Tuple
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
6
5
 
7
6
  from sky import clouds
8
7
  from sky.clouds import service_catalog
@@ -147,7 +146,7 @@ class RunPod(clouds.Cloud):
147
146
 
148
147
  @classmethod
149
148
  def get_accelerators_from_instance_type(
150
- cls, instance_type: str) -> Optional[Dict[str, int]]:
149
+ cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
151
150
  return service_catalog.get_accelerators_from_instance_type(
152
151
  instance_type, clouds='runpod')
153
152
 
@@ -166,10 +165,8 @@ class RunPod(clouds.Cloud):
166
165
 
167
166
  r = resources
168
167
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
169
- if acc_dict is not None:
170
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
171
- else:
172
- custom_resources = None
168
+ custom_resources = resources_utils.make_ray_custom_resources_str(
169
+ acc_dict)
173
170
 
174
171
  if r.image_id is None:
175
172
  image_id = 'runpod/base:0.0.2'
sky/clouds/scp.py CHANGED
@@ -4,9 +4,8 @@ This module includes the set of functions
4
4
  to access the SCP catalog and check credentials for the SCP access.
5
5
  """
6
6
 
7
- import json
8
7
  import typing
9
- from typing import Dict, Iterator, List, Optional, Tuple
8
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
10
9
 
11
10
  from sky import clouds
12
11
  from sky import exceptions
@@ -160,7 +159,7 @@ class SCP(clouds.Cloud):
160
159
  def get_accelerators_from_instance_type(
161
160
  cls,
162
161
  instance_type: str,
163
- ) -> Optional[Dict[str, int]]:
162
+ ) -> Optional[Dict[str, Union[int, float]]]:
164
163
  return service_catalog.get_accelerators_from_instance_type(
165
164
  instance_type, clouds='scp')
166
165
 
@@ -188,11 +187,9 @@ class SCP(clouds.Cloud):
188
187
 
189
188
  r = resources
190
189
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
190
+ custom_resources = resources_utils.make_ray_custom_resources_str(
191
+ acc_dict)
191
192
 
192
- if acc_dict is not None:
193
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
194
- else:
195
- custom_resources = None
196
193
  image_id = self._get_image_id(r.image_id, region.name, r.instance_type)
197
194
  return {
198
195
  'instance_type': resources.instance_type,
sky/clouds/service_catalog/__init__.py CHANGED
@@ -238,7 +238,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
238
238
 
239
239
  def get_accelerators_from_instance_type(
240
240
  instance_type: str,
241
- clouds: CloudFilter = None) -> Optional[Dict[str, int]]:
241
+ clouds: CloudFilter = None) -> Optional[Dict[str, Union[int, float]]]:
242
242
  """Returns the accelerators from a instance type."""
243
243
  return _map_clouds_catalog(clouds, 'get_accelerators_from_instance_type',
244
244
  instance_type)
sky/clouds/service_catalog/aws_catalog.py CHANGED
@@ -8,7 +8,7 @@ import hashlib
8
8
  import os
9
9
  import threading
10
10
  import typing
11
- from typing import Dict, List, Optional, Tuple
11
+ from typing import Dict, List, Optional, Tuple, Union
12
12
 
13
13
  from sky import exceptions
14
14
  from sky import sky_logging
@@ -243,7 +243,7 @@ def get_default_instance_type(
243
243
 
244
244
 
245
245
  def get_accelerators_from_instance_type(
246
- instance_type: str) -> Optional[Dict[str, int]]:
246
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
247
247
  return common.get_accelerators_from_instance_type_impl(
248
248
  _get_df(), instance_type)
249
249
 
sky/clouds/service_catalog/azure_catalog.py CHANGED
@@ -4,14 +4,17 @@ This module loads the service catalog file and can be used to query
4
4
  instance types and pricing information for Azure.
5
5
  """
6
6
  import re
7
- from typing import Dict, List, Optional, Tuple
7
+ from typing import Dict, List, Optional, Tuple, Union
8
8
 
9
9
  from sky import clouds as cloud_lib
10
+ from sky import sky_logging
10
11
  from sky.clouds import Azure
11
12
  from sky.clouds.service_catalog import common
12
13
  from sky.utils import resources_utils
13
14
  from sky.utils import ux_utils
14
15
 
16
+ logger = sky_logging.init_logger(__name__)
17
+
15
18
  # This list should match the list of regions in
16
19
  # skypilot image generation Packer script's replication_regions
17
20
  # sky/clouds/service_catalog/images/skypilot-azure-cpu-ubuntu.pkr.hcl
@@ -134,7 +137,7 @@ def get_default_instance_type(
134
137
 
135
138
 
136
139
  def get_accelerators_from_instance_type(
137
- instance_type: str) -> Optional[Dict[str, int]]:
140
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
138
141
  return common.get_accelerators_from_instance_type_impl(_df, instance_type)
139
142
 
140
143
 
@@ -154,6 +157,7 @@ def get_instance_type_for_accelerator(
154
157
  if zone is not None:
155
158
  with ux_utils.print_exception_no_traceback():
156
159
  raise ValueError('Azure does not support zones.')
160
+
157
161
  return common.get_instance_type_for_accelerator_impl(df=_df,
158
162
  acc_name=acc_name,
159
163
  acc_count=acc_count,
@@ -191,9 +195,16 @@ def list_accelerators(
191
195
 
192
196
  def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
193
197
  """Returns the image id from the tag."""
194
- # Azure images are not region-specific.
195
- del region # Unused.
196
- return common.get_image_id_from_tag_impl(_image_df, tag, None)
198
+ global _image_df
199
+ image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
200
+ if image_id is None:
201
+ # Refresh the image catalog and try again, if the image tag is not
202
+ # found.
203
+ logger.debug('Refreshing the image catalog and trying again.')
204
+ _image_df = common.read_catalog('azure/images.csv',
205
+ pull_frequency_hours=0)
206
+ image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
207
+ return image_id
197
208
 
198
209
 
199
210
  def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
sky/clouds/service_catalog/common.py CHANGED
@@ -5,7 +5,7 @@ import hashlib
5
5
  import os
6
6
  import time
7
7
  import typing
8
- from typing import Callable, Dict, List, NamedTuple, Optional, Tuple
8
+ from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union
9
9
 
10
10
  import filelock
11
11
  import requests
@@ -481,7 +481,7 @@ def get_instance_type_for_cpus_mem_impl(
481
481
  def get_accelerators_from_instance_type_impl(
482
482
  df: 'pd.DataFrame',
483
483
  instance_type: str,
484
- ) -> Optional[Dict[str, int]]:
484
+ ) -> Optional[Dict[str, Union[int, float]]]:
485
485
  df = _get_instance_type(df, instance_type, None)
486
486
  if len(df) == 0:
487
487
  with ux_utils.print_exception_no_traceback():
@@ -490,13 +490,19 @@ def get_accelerators_from_instance_type_impl(
490
490
  acc_name, acc_count = row['AcceleratorName'], row['AcceleratorCount']
491
491
  if pd.isnull(acc_name):
492
492
  return None
493
- return {acc_name: int(acc_count)}
493
+
494
+ def _convert(value):
495
+ if int(value) == value:
496
+ return int(value)
497
+ return float(value)
498
+
499
+ return {acc_name: _convert(acc_count)}
494
500
 
495
501
 
496
502
  def get_instance_type_for_accelerator_impl(
497
503
  df: 'pd.DataFrame',
498
504
  acc_name: str,
499
- acc_count: int,
505
+ acc_count: Union[int, float],
500
506
  cpus: Optional[str] = None,
501
507
  memory: Optional[str] = None,
502
508
  use_spot: bool = False,
@@ -509,7 +515,7 @@ def get_instance_type_for_accelerator_impl(
509
515
  accelerators with sorted prices and a list of candidates with fuzzy search.
510
516
  """
511
517
  result = df[(df['AcceleratorName'].str.fullmatch(acc_name, case=False)) &
512
- (df['AcceleratorCount'] == acc_count)]
518
+ (abs(df['AcceleratorCount'] - acc_count) <= 0.01)]
513
519
  result = _filter_region_zone(result, region, zone)
514
520
  if len(result) == 0:
515
521
  fuzzy_result = df[
@@ -522,8 +528,11 @@ def get_instance_type_for_accelerator_impl(
522
528
  fuzzy_candidate_list = []
523
529
  if len(fuzzy_result) > 0:
524
530
  for _, row in fuzzy_result.iterrows():
531
+ acc_cnt = float(row['AcceleratorCount'])
532
+ acc_count_display = (int(acc_cnt) if acc_cnt.is_integer() else
533
+ f'{acc_cnt:.2f}')
525
534
  fuzzy_candidate_list.append(f'{row["AcceleratorName"]}:'
526
- f'{int(row["AcceleratorCount"])}')
535
+ f'{acc_count_display}')
527
536
  return (None, fuzzy_candidate_list)
528
537
 
529
538
  result = _filter_with_cpus(result, cpus)