skypilot-nightly 1.0.0.dev20241026__py3-none-any.whl → 1.0.0.dev20241028__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +15 -0
- sky/clouds/aws.py +4 -7
- sky/clouds/azure.py +4 -6
- sky/clouds/cloud.py +11 -7
- sky/clouds/cudo.py +4 -7
- sky/clouds/fluidstack.py +4 -7
- sky/clouds/gcp.py +2 -2
- sky/clouds/ibm.py +4 -7
- sky/clouds/kubernetes.py +4 -7
- sky/clouds/lambda_cloud.py +4 -7
- sky/clouds/oci.py +4 -7
- sky/clouds/paperspace.py +4 -7
- sky/clouds/runpod.py +4 -7
- sky/clouds/scp.py +4 -7
- sky/clouds/service_catalog/__init__.py +1 -1
- sky/clouds/service_catalog/aws_catalog.py +2 -2
- sky/clouds/service_catalog/azure_catalog.py +3 -2
- sky/clouds/service_catalog/common.py +15 -6
- sky/clouds/service_catalog/cudo_catalog.py +2 -2
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +21 -11
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/lambda_catalog.py +2 -2
- sky/clouds/service_catalog/oci_catalog.py +2 -2
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/vsphere.py +4 -7
- sky/jobs/controller.py +28 -33
- sky/resources.py +1 -1
- sky/utils/resources_utils.py +13 -1
- {skypilot_nightly-1.0.0.dev20241026.dist-info → skypilot_nightly-1.0.0.dev20241028.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241026.dist-info → skypilot_nightly-1.0.0.dev20241028.dist-info}/RECORD +39 -39
- {skypilot_nightly-1.0.0.dev20241026.dist-info → skypilot_nightly-1.0.0.dev20241028.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241026.dist-info → skypilot_nightly-1.0.0.dev20241028.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241026.dist-info → skypilot_nightly-1.0.0.dev20241028.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241026.dist-info → skypilot_nightly-1.0.0.dev20241028.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = 'c0c17483d1f692ad639144050f5f6fa0966e47a5'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20241026'
|
38
|
+
__version__ = '1.0.0.dev20241028'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
@@ -2713,6 +2713,21 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
2713
2713
|
f' Existing:\t{handle.launched_nodes}x '
|
2714
2714
|
f'{handle.launched_resources}\n'
|
2715
2715
|
f'{mismatch_str}')
|
2716
|
+
else:
|
2717
|
+
# For fractional acc count clusters, we round up the number of accs
|
2718
|
+
# to 1 (sky/utils/resources_utils.py::make_ray_custom_resources_str)
|
2719
|
+
# Here we scale the required acc count to (required / launched) * 1
|
2720
|
+
# so the total number of accs is the same as the requested number.
|
2721
|
+
launched_accs = launched_resources.accelerators
|
2722
|
+
if (launched_accs is not None and
|
2723
|
+
valid_resource.accelerators is not None):
|
2724
|
+
for _, count in launched_accs.items():
|
2725
|
+
if isinstance(count, float) and not count.is_integer():
|
2726
|
+
valid_resource = valid_resource.copy(
|
2727
|
+
accelerators={
|
2728
|
+
k: v / count
|
2729
|
+
for k, v in valid_resource.accelerators.items()
|
2730
|
+
})
|
2716
2731
|
return valid_resource
|
2717
2732
|
|
2718
2733
|
def _provision(
|
sky/clouds/aws.py
CHANGED
@@ -2,13 +2,12 @@
|
|
2
2
|
import enum
|
3
3
|
import fnmatch
|
4
4
|
import functools
|
5
|
-
import json
|
6
5
|
import os
|
7
6
|
import re
|
8
7
|
import subprocess
|
9
8
|
import time
|
10
9
|
import typing
|
11
|
-
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
|
10
|
+
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
|
12
11
|
|
13
12
|
from sky import clouds
|
14
13
|
from sky import exceptions
|
@@ -383,7 +382,7 @@ class AWS(clouds.Cloud):
|
|
383
382
|
def get_accelerators_from_instance_type(
|
384
383
|
cls,
|
385
384
|
instance_type: str,
|
386
|
-
) -> Optional[Dict[str, int]]:
|
385
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
387
386
|
return service_catalog.get_accelerators_from_instance_type(
|
388
387
|
instance_type, clouds='aws')
|
389
388
|
|
@@ -411,10 +410,8 @@ class AWS(clouds.Cloud):
|
|
411
410
|
r = resources
|
412
411
|
# r.accelerators is cleared but .instance_type encodes the info.
|
413
412
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
414
|
-
|
415
|
-
|
416
|
-
else:
|
417
|
-
custom_resources = None
|
413
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
414
|
+
acc_dict)
|
418
415
|
|
419
416
|
if r.extract_docker_image() is not None:
|
420
417
|
image_id_to_use = None
|
sky/clouds/azure.py
CHANGED
@@ -1,12 +1,11 @@
|
|
1
1
|
"""Azure."""
|
2
2
|
import functools
|
3
|
-
import json
|
4
3
|
import os
|
5
4
|
import re
|
6
5
|
import subprocess
|
7
6
|
import textwrap
|
8
7
|
import typing
|
9
|
-
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
8
|
+
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
10
9
|
|
11
10
|
import colorama
|
12
11
|
|
@@ -272,7 +271,7 @@ class Azure(clouds.Cloud):
|
|
272
271
|
def get_accelerators_from_instance_type(
|
273
272
|
cls,
|
274
273
|
instance_type: str,
|
275
|
-
) -> Optional[Dict[str, int]]:
|
274
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
276
275
|
return service_catalog.get_accelerators_from_instance_type(
|
277
276
|
instance_type, clouds='azure')
|
278
277
|
|
@@ -304,10 +303,9 @@ class Azure(clouds.Cloud):
|
|
304
303
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
305
304
|
acc_count = None
|
306
305
|
if acc_dict is not None:
|
307
|
-
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
|
308
306
|
acc_count = str(sum(acc_dict.values()))
|
309
|
-
|
310
|
-
|
307
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
308
|
+
acc_dict)
|
311
309
|
|
312
310
|
if (resources.image_id is None or
|
313
311
|
resources.extract_docker_image() is not None):
|
sky/clouds/cloud.py
CHANGED
@@ -9,8 +9,9 @@ reused across cloud object creation.
|
|
9
9
|
"""
|
10
10
|
import collections
|
11
11
|
import enum
|
12
|
+
import math
|
12
13
|
import typing
|
13
|
-
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple
|
14
|
+
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
|
14
15
|
|
15
16
|
from sky import exceptions
|
16
17
|
from sky import skypilot_config
|
@@ -306,7 +307,7 @@ class Cloud:
|
|
306
307
|
def get_accelerators_from_instance_type(
|
307
308
|
cls,
|
308
309
|
instance_type: str,
|
309
|
-
) -> Optional[Dict[str, int]]:
|
310
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
310
311
|
"""Returns {acc: acc_count} held by 'instance_type', if any."""
|
311
312
|
raise NotImplementedError
|
312
313
|
|
@@ -673,8 +674,9 @@ class Cloud:
|
|
673
674
|
assert resources.is_launchable(), resources
|
674
675
|
|
675
676
|
def _equal_accelerators(
|
676
|
-
|
677
|
-
|
677
|
+
acc_requested: Optional[Dict[str, Union[int, float]]],
|
678
|
+
acc_from_instance_type: Optional[Dict[str, Union[int,
|
679
|
+
float]]]) -> bool:
|
678
680
|
"""Check the requested accelerators equals to the instance type
|
679
681
|
|
680
682
|
Check the requested accelerators equals to the accelerators
|
@@ -689,12 +691,14 @@ class Cloud:
|
|
689
691
|
for acc in acc_requested:
|
690
692
|
if acc not in acc_from_instance_type:
|
691
693
|
return False
|
692
|
-
|
694
|
+
# Avoid floating-point precision issues.
|
695
|
+
if not math.isclose(acc_requested[acc],
|
696
|
+
acc_from_instance_type[acc]):
|
693
697
|
return False
|
694
698
|
return True
|
695
699
|
|
696
|
-
acc_from_instance_type =
|
697
|
-
resources.instance_type)
|
700
|
+
acc_from_instance_type = cls.get_accelerators_from_instance_type(
|
701
|
+
resources.instance_type)
|
698
702
|
if not _equal_accelerators(resources.accelerators,
|
699
703
|
acc_from_instance_type):
|
700
704
|
with ux_utils.print_exception_no_traceback():
|
sky/clouds/cudo.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
"""Cudo Compute"""
|
2
|
-
import json
|
3
2
|
import subprocess
|
4
3
|
import typing
|
5
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
4
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
6
5
|
|
7
6
|
from sky import clouds
|
8
7
|
from sky.clouds import service_catalog
|
@@ -183,7 +182,7 @@ class Cudo(clouds.Cloud):
|
|
183
182
|
def get_accelerators_from_instance_type(
|
184
183
|
cls,
|
185
184
|
instance_type: str,
|
186
|
-
) -> Optional[Dict[str, int]]:
|
185
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
187
186
|
return service_catalog.get_accelerators_from_instance_type(
|
188
187
|
instance_type, clouds='cudo')
|
189
188
|
|
@@ -202,10 +201,8 @@ class Cudo(clouds.Cloud):
|
|
202
201
|
del zones, cluster_name # unused
|
203
202
|
r = resources
|
204
203
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
205
|
-
|
206
|
-
|
207
|
-
else:
|
208
|
-
custom_resources = None
|
204
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
205
|
+
acc_dict)
|
209
206
|
|
210
207
|
return {
|
211
208
|
'instance_type': resources.instance_type,
|
sky/clouds/fluidstack.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
"""Fluidstack Cloud."""
|
2
|
-
import json
|
3
2
|
import os
|
4
3
|
import typing
|
5
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
4
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
6
5
|
|
7
6
|
import requests
|
8
7
|
|
@@ -155,7 +154,7 @@ class Fluidstack(clouds.Cloud):
|
|
155
154
|
def get_accelerators_from_instance_type(
|
156
155
|
cls,
|
157
156
|
instance_type: str,
|
158
|
-
) -> Optional[Dict[str, int]]:
|
157
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
159
158
|
return service_catalog.get_accelerators_from_instance_type(
|
160
159
|
instance_type, clouds='fluidstack')
|
161
160
|
|
@@ -184,10 +183,8 @@ class Fluidstack(clouds.Cloud):
|
|
184
183
|
|
185
184
|
r = resources
|
186
185
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
187
|
-
|
188
|
-
|
189
|
-
else:
|
190
|
-
custom_resources = None
|
186
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
187
|
+
acc_dict)
|
191
188
|
|
192
189
|
return {
|
193
190
|
'instance_type': resources.instance_type,
|
sky/clouds/gcp.py
CHANGED
@@ -7,7 +7,7 @@ import re
|
|
7
7
|
import subprocess
|
8
8
|
import time
|
9
9
|
import typing
|
10
|
-
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
|
10
|
+
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
|
11
11
|
|
12
12
|
import colorama
|
13
13
|
|
@@ -669,7 +669,7 @@ class GCP(clouds.Cloud):
|
|
669
669
|
def get_accelerators_from_instance_type(
|
670
670
|
cls,
|
671
671
|
instance_type: str,
|
672
|
-
) -> Optional[Dict[str, int]]:
|
672
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
673
673
|
# GCP handles accelerators separately from regular instance types,
|
674
674
|
# hence return none here.
|
675
675
|
return None
|
sky/clouds/ibm.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
"""IBM Web Services."""
|
2
|
-
import json
|
3
2
|
import os
|
4
3
|
import typing
|
5
|
-
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
4
|
+
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
6
5
|
|
7
6
|
import colorama
|
8
7
|
|
@@ -206,10 +205,8 @@ class IBM(clouds.Cloud):
|
|
206
205
|
'IBM does not currently support spot instances in this framework'
|
207
206
|
|
208
207
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
209
|
-
|
210
|
-
|
211
|
-
else:
|
212
|
-
custom_resources = None
|
208
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
209
|
+
acc_dict)
|
213
210
|
|
214
211
|
instance_resources = _get_profile_resources(r.instance_type)
|
215
212
|
|
@@ -247,7 +244,7 @@ class IBM(clouds.Cloud):
|
|
247
244
|
def get_accelerators_from_instance_type(
|
248
245
|
cls,
|
249
246
|
instance_type: str,
|
250
|
-
) -> Optional[Dict[str, int]]:
|
247
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
251
248
|
"""Returns {acc: acc_count} held by 'instance_type', if any."""
|
252
249
|
return service_catalog.get_accelerators_from_instance_type(
|
253
250
|
instance_type, clouds='ibm')
|
sky/clouds/kubernetes.py
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
"""Kubernetes."""
|
2
2
|
import functools
|
3
|
-
import json
|
4
3
|
import os
|
5
4
|
import re
|
6
5
|
import typing
|
7
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
6
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
8
7
|
|
9
8
|
from sky import clouds
|
10
9
|
from sky import sky_logging
|
@@ -271,7 +270,7 @@ class Kubernetes(clouds.Cloud):
|
|
271
270
|
def get_accelerators_from_instance_type(
|
272
271
|
cls,
|
273
272
|
instance_type: str,
|
274
|
-
) -> Optional[Dict[str, int]]:
|
273
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
275
274
|
inst = kubernetes_utils.KubernetesInstanceType.from_instance_type(
|
276
275
|
instance_type)
|
277
276
|
return {
|
@@ -328,10 +327,8 @@ class Kubernetes(clouds.Cloud):
|
|
328
327
|
|
329
328
|
r = resources
|
330
329
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
331
|
-
|
332
|
-
|
333
|
-
else:
|
334
|
-
custom_resources = None
|
330
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
331
|
+
acc_dict)
|
335
332
|
|
336
333
|
# resources.memory and cpus are None if they are not explicitly set.
|
337
334
|
# We fetch the default values for the instance type in that case.
|
sky/clouds/lambda_cloud.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
"""Lambda Cloud."""
|
2
|
-
import json
|
3
2
|
import typing
|
4
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
3
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
5
4
|
|
6
5
|
import requests
|
7
6
|
|
@@ -136,7 +135,7 @@ class Lambda(clouds.Cloud):
|
|
136
135
|
def get_accelerators_from_instance_type(
|
137
136
|
cls,
|
138
137
|
instance_type: str,
|
139
|
-
) -> Optional[Dict[str, int]]:
|
138
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
140
139
|
return service_catalog.get_accelerators_from_instance_type(
|
141
140
|
instance_type, clouds='lambda')
|
142
141
|
|
@@ -164,10 +163,8 @@ class Lambda(clouds.Cloud):
|
|
164
163
|
|
165
164
|
r = resources
|
166
165
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
167
|
-
|
168
|
-
|
169
|
-
else:
|
170
|
-
custom_resources = None
|
166
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
167
|
+
acc_dict)
|
171
168
|
|
172
169
|
resources_vars = {
|
173
170
|
'instance_type': resources.instance_type,
|
sky/clouds/oci.py
CHANGED
@@ -20,11 +20,10 @@ History:
|
|
20
20
|
- Hysun He (hysun.he@oracle.com) @ Oct 13, 2024:
|
21
21
|
Support more OS types additional to ubuntu for OCI resources.
|
22
22
|
"""
|
23
|
-
import json
|
24
23
|
import logging
|
25
24
|
import os
|
26
25
|
import typing
|
27
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
26
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
28
27
|
|
29
28
|
from sky import clouds
|
30
29
|
from sky import exceptions
|
@@ -193,7 +192,7 @@ class OCI(clouds.Cloud):
|
|
193
192
|
def get_accelerators_from_instance_type(
|
194
193
|
cls,
|
195
194
|
instance_type: str,
|
196
|
-
) -> Optional[Dict[str, int]]:
|
195
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
197
196
|
return service_catalog.get_accelerators_from_instance_type(
|
198
197
|
instance_type, clouds='oci')
|
199
198
|
|
@@ -213,10 +212,8 @@ class OCI(clouds.Cloud):
|
|
213
212
|
|
214
213
|
acc_dict = self.get_accelerators_from_instance_type(
|
215
214
|
resources.instance_type)
|
216
|
-
|
217
|
-
|
218
|
-
else:
|
219
|
-
custom_resources = None
|
215
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
216
|
+
acc_dict)
|
220
217
|
|
221
218
|
image_str = self._get_image_id(resources.image_id, region.name,
|
222
219
|
resources.instance_type)
|
sky/clouds/paperspace.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
""" Paperspace Cloud. """
|
2
2
|
|
3
|
-
import json
|
4
3
|
import typing
|
5
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
4
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
6
5
|
|
7
6
|
import requests
|
8
7
|
|
@@ -162,7 +161,7 @@ class Paperspace(clouds.Cloud):
|
|
162
161
|
|
163
162
|
@classmethod
|
164
163
|
def get_accelerators_from_instance_type(
|
165
|
-
cls, instance_type: str) -> Optional[Dict[str, int]]:
|
164
|
+
cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
166
165
|
return service_catalog.get_accelerators_from_instance_type(
|
167
166
|
instance_type, clouds='paperspace')
|
168
167
|
|
@@ -181,10 +180,8 @@ class Paperspace(clouds.Cloud):
|
|
181
180
|
|
182
181
|
r = resources
|
183
182
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
184
|
-
|
185
|
-
|
186
|
-
else:
|
187
|
-
custom_resources = None
|
183
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
184
|
+
acc_dict)
|
188
185
|
|
189
186
|
return {
|
190
187
|
'instance_type': resources.instance_type,
|
sky/clouds/runpod.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
""" RunPod Cloud. """
|
2
2
|
|
3
|
-
import json
|
4
3
|
import typing
|
5
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
4
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
6
5
|
|
7
6
|
from sky import clouds
|
8
7
|
from sky.clouds import service_catalog
|
@@ -147,7 +146,7 @@ class RunPod(clouds.Cloud):
|
|
147
146
|
|
148
147
|
@classmethod
|
149
148
|
def get_accelerators_from_instance_type(
|
150
|
-
cls, instance_type: str) -> Optional[Dict[str, int]]:
|
149
|
+
cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
151
150
|
return service_catalog.get_accelerators_from_instance_type(
|
152
151
|
instance_type, clouds='runpod')
|
153
152
|
|
@@ -166,10 +165,8 @@ class RunPod(clouds.Cloud):
|
|
166
165
|
|
167
166
|
r = resources
|
168
167
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
169
|
-
|
170
|
-
|
171
|
-
else:
|
172
|
-
custom_resources = None
|
168
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
169
|
+
acc_dict)
|
173
170
|
|
174
171
|
if r.image_id is None:
|
175
172
|
image_id = 'runpod/base:0.0.2'
|
sky/clouds/scp.py
CHANGED
@@ -4,9 +4,8 @@ This module includes the set of functions
|
|
4
4
|
to access the SCP catalog and check credentials for the SCP access.
|
5
5
|
"""
|
6
6
|
|
7
|
-
import json
|
8
7
|
import typing
|
9
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
8
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
10
9
|
|
11
10
|
from sky import clouds
|
12
11
|
from sky import exceptions
|
@@ -160,7 +159,7 @@ class SCP(clouds.Cloud):
|
|
160
159
|
def get_accelerators_from_instance_type(
|
161
160
|
cls,
|
162
161
|
instance_type: str,
|
163
|
-
) -> Optional[Dict[str, int]]:
|
162
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
164
163
|
return service_catalog.get_accelerators_from_instance_type(
|
165
164
|
instance_type, clouds='scp')
|
166
165
|
|
@@ -188,11 +187,9 @@ class SCP(clouds.Cloud):
|
|
188
187
|
|
189
188
|
r = resources
|
190
189
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
190
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
191
|
+
acc_dict)
|
191
192
|
|
192
|
-
if acc_dict is not None:
|
193
|
-
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
|
194
|
-
else:
|
195
|
-
custom_resources = None
|
196
193
|
image_id = self._get_image_id(r.image_id, region.name, r.instance_type)
|
197
194
|
return {
|
198
195
|
'instance_type': resources.instance_type,
|
@@ -238,7 +238,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
|
|
238
238
|
|
239
239
|
def get_accelerators_from_instance_type(
|
240
240
|
instance_type: str,
|
241
|
-
clouds: CloudFilter = None) -> Optional[Dict[str, int]]:
|
241
|
+
clouds: CloudFilter = None) -> Optional[Dict[str, Union[int, float]]]:
|
242
242
|
"""Returns the accelerators from a instance type."""
|
243
243
|
return _map_clouds_catalog(clouds, 'get_accelerators_from_instance_type',
|
244
244
|
instance_type)
|
@@ -8,7 +8,7 @@ import hashlib
|
|
8
8
|
import os
|
9
9
|
import threading
|
10
10
|
import typing
|
11
|
-
from typing import Dict, List, Optional, Tuple
|
11
|
+
from typing import Dict, List, Optional, Tuple, Union
|
12
12
|
|
13
13
|
from sky import exceptions
|
14
14
|
from sky import sky_logging
|
@@ -243,7 +243,7 @@ def get_default_instance_type(
|
|
243
243
|
|
244
244
|
|
245
245
|
def get_accelerators_from_instance_type(
|
246
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
246
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
247
247
|
return common.get_accelerators_from_instance_type_impl(
|
248
248
|
_get_df(), instance_type)
|
249
249
|
|
@@ -4,7 +4,7 @@ This module loads the service catalog file and can be used to query
|
|
4
4
|
instance types and pricing information for Azure.
|
5
5
|
"""
|
6
6
|
import re
|
7
|
-
from typing import Dict, List, Optional, Tuple
|
7
|
+
from typing import Dict, List, Optional, Tuple, Union
|
8
8
|
|
9
9
|
from sky import clouds as cloud_lib
|
10
10
|
from sky import sky_logging
|
@@ -137,7 +137,7 @@ def get_default_instance_type(
|
|
137
137
|
|
138
138
|
|
139
139
|
def get_accelerators_from_instance_type(
|
140
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
140
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
141
141
|
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
142
142
|
|
143
143
|
|
@@ -157,6 +157,7 @@ def get_instance_type_for_accelerator(
|
|
157
157
|
if zone is not None:
|
158
158
|
with ux_utils.print_exception_no_traceback():
|
159
159
|
raise ValueError('Azure does not support zones.')
|
160
|
+
|
160
161
|
return common.get_instance_type_for_accelerator_impl(df=_df,
|
161
162
|
acc_name=acc_name,
|
162
163
|
acc_count=acc_count,
|
@@ -5,7 +5,7 @@ import hashlib
|
|
5
5
|
import os
|
6
6
|
import time
|
7
7
|
import typing
|
8
|
-
from typing import Callable, Dict, List, NamedTuple, Optional, Tuple
|
8
|
+
from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union
|
9
9
|
|
10
10
|
import filelock
|
11
11
|
import requests
|
@@ -481,7 +481,7 @@ def get_instance_type_for_cpus_mem_impl(
|
|
481
481
|
def get_accelerators_from_instance_type_impl(
|
482
482
|
df: 'pd.DataFrame',
|
483
483
|
instance_type: str,
|
484
|
-
) -> Optional[Dict[str, int]]:
|
484
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
485
485
|
df = _get_instance_type(df, instance_type, None)
|
486
486
|
if len(df) == 0:
|
487
487
|
with ux_utils.print_exception_no_traceback():
|
@@ -490,13 +490,19 @@ def get_accelerators_from_instance_type_impl(
|
|
490
490
|
acc_name, acc_count = row['AcceleratorName'], row['AcceleratorCount']
|
491
491
|
if pd.isnull(acc_name):
|
492
492
|
return None
|
493
|
-
|
493
|
+
|
494
|
+
def _convert(value):
|
495
|
+
if int(value) == value:
|
496
|
+
return int(value)
|
497
|
+
return float(value)
|
498
|
+
|
499
|
+
return {acc_name: _convert(acc_count)}
|
494
500
|
|
495
501
|
|
496
502
|
def get_instance_type_for_accelerator_impl(
|
497
503
|
df: 'pd.DataFrame',
|
498
504
|
acc_name: str,
|
499
|
-
acc_count: int,
|
505
|
+
acc_count: Union[int, float],
|
500
506
|
cpus: Optional[str] = None,
|
501
507
|
memory: Optional[str] = None,
|
502
508
|
use_spot: bool = False,
|
@@ -509,7 +515,7 @@ def get_instance_type_for_accelerator_impl(
|
|
509
515
|
accelerators with sorted prices and a list of candidates with fuzzy search.
|
510
516
|
"""
|
511
517
|
result = df[(df['AcceleratorName'].str.fullmatch(acc_name, case=False)) &
|
512
|
-
(df['AcceleratorCount']
|
518
|
+
(abs(df['AcceleratorCount'] - acc_count) <= 0.01)]
|
513
519
|
result = _filter_region_zone(result, region, zone)
|
514
520
|
if len(result) == 0:
|
515
521
|
fuzzy_result = df[
|
@@ -522,8 +528,11 @@ def get_instance_type_for_accelerator_impl(
|
|
522
528
|
fuzzy_candidate_list = []
|
523
529
|
if len(fuzzy_result) > 0:
|
524
530
|
for _, row in fuzzy_result.iterrows():
|
531
|
+
acc_cnt = float(row['AcceleratorCount'])
|
532
|
+
acc_count_display = (int(acc_cnt) if acc_cnt.is_integer() else
|
533
|
+
f'{acc_cnt:.2f}')
|
525
534
|
fuzzy_candidate_list.append(f'{row["AcceleratorName"]}:'
|
526
|
-
f'{
|
535
|
+
f'{acc_count_display}')
|
527
536
|
return (None, fuzzy_candidate_list)
|
528
537
|
|
529
538
|
result = _filter_with_cpus(result, cpus)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
"""Cudo Compute Offerings Catalog."""
|
2
2
|
|
3
3
|
import typing
|
4
|
-
from typing import Dict, List, Optional, Tuple
|
4
|
+
from typing import Dict, List, Optional, Tuple, Union
|
5
5
|
|
6
6
|
from sky.clouds.service_catalog import common
|
7
7
|
import sky.provision.cudo.cudo_machine_type as cudo_mt
|
@@ -66,7 +66,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
|
|
66
66
|
|
67
67
|
|
68
68
|
def get_accelerators_from_instance_type(
|
69
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
69
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
70
70
|
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
71
71
|
|
72
72
|
|
@@ -93,14 +93,15 @@ def get_regions() -> List[str]:
|
|
93
93
|
# We have to manually remove it.
|
94
94
|
DEPRECATED_FAMILIES = ['standardNVSv2Family']
|
95
95
|
|
96
|
-
#
|
97
|
-
#
|
98
|
-
# TODO(zhwu,tian): support fractional GPUs, which can be done on
|
99
|
-
# kubernetes as well.
|
96
|
+
# Azure has fractional A10 instance types, which still show as having 1 A10 GPU
|
97
|
+
# in the API response. We manually change the number of GPUs to a float here.
|
100
98
|
# Ref: https://learn.microsoft.com/en-us/azure/virtual-machines/nva10v5-series
|
101
|
-
|
102
|
-
|
103
|
-
|
99
|
+
# TODO(zhwu,tian): Support fractional GPUs on k8s as well.
|
100
|
+
# TODO(tian): Maybe we should support literally fractional count, i.e. A10:1/6
|
101
|
+
# instead of float point count (A10:0.167).
|
102
|
+
AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS = {
|
103
|
+
f'Standard_NV{vcpu}ads_A10_v5': round(vcpu / 36, 3) for vcpu in [6, 12, 18]
|
104
|
+
}
|
104
105
|
|
105
106
|
USEFUL_COLUMNS = [
|
106
107
|
'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
|
@@ -274,6 +275,19 @@ def get_all_regions_instance_types_df(region_set: Set[str]):
|
|
274
275
|
axis='columns',
|
275
276
|
)
|
276
277
|
|
278
|
+
def _upd_a10_gpu_count(row):
|
279
|
+
new_gpu_cnt = AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS.get(
|
280
|
+
row['InstanceType'])
|
281
|
+
if new_gpu_cnt is not None:
|
282
|
+
return new_gpu_cnt
|
283
|
+
return row['AcceleratorCount']
|
284
|
+
|
285
|
+
# Manually update the GPU count for fractional A10 instance types.
|
286
|
+
# Those instance types have fractional GPU count, but Azure API returns
|
287
|
+
# 1 GPU count for them. We manually update the GPU count here.
|
288
|
+
df_ret['AcceleratorCount'] = df_ret.apply(_upd_a10_gpu_count,
|
289
|
+
axis='columns')
|
290
|
+
|
277
291
|
# As of Dec 2023, a few H100 instance types fetched from Azure APIs do not
|
278
292
|
# have pricing:
|
279
293
|
#
|
@@ -299,10 +313,6 @@ def get_all_regions_instance_types_df(region_set: Set[str]):
|
|
299
313
|
after_drop_len = len(df_ret)
|
300
314
|
print(f'Dropped {before_drop_len - after_drop_len} duplicated rows')
|
301
315
|
|
302
|
-
# Filter out instance types that only contain a fractional of GPU.
|
303
|
-
df_ret = df_ret.loc[~df_ret['InstanceType'].isin(FILTERED_A10_INSTANCE_TYPES
|
304
|
-
)]
|
305
|
-
|
306
316
|
# Filter out deprecated families
|
307
317
|
df_ret = df_ret.loc[~df_ret['family'].isin(DEPRECATED_FAMILIES)]
|
308
318
|
df_ret = df_ret[USEFUL_COLUMNS]
|
@@ -4,7 +4,7 @@ This module loads the service catalog file and can be used to query
|
|
4
4
|
instance types and pricing information for FluidStack.
|
5
5
|
"""
|
6
6
|
import typing
|
7
|
-
from typing import Dict, List, Optional, Tuple
|
7
|
+
from typing import Dict, List, Optional, Tuple, Union
|
8
8
|
|
9
9
|
from sky.clouds.service_catalog import common
|
10
10
|
from sky.utils import ux_utils
|
@@ -65,7 +65,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
|
|
65
65
|
|
66
66
|
|
67
67
|
def get_accelerators_from_instance_type(
|
68
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
68
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
69
69
|
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
70
70
|
|
71
71
|
|
@@ -4,7 +4,7 @@ This module loads the service catalog file and can be used to query
|
|
4
4
|
instance types and pricing information for IBM.
|
5
5
|
"""
|
6
6
|
|
7
|
-
from typing import Dict, List, Optional, Tuple
|
7
|
+
from typing import Dict, List, Optional, Tuple, Union
|
8
8
|
|
9
9
|
from sky import sky_logging
|
10
10
|
from sky.adaptors import ibm
|
@@ -43,7 +43,7 @@ def get_vcpus_mem_from_instance_type(
|
|
43
43
|
|
44
44
|
|
45
45
|
def get_accelerators_from_instance_type(
|
46
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
46
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
47
47
|
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
48
48
|
|
49
49
|
|
@@ -4,7 +4,7 @@ This module loads the service catalog file and can be used to query
|
|
4
4
|
instance types and pricing information for Lambda.
|
5
5
|
"""
|
6
6
|
import typing
|
7
|
-
from typing import Dict, List, Optional, Tuple
|
7
|
+
from typing import Dict, List, Optional, Tuple, Union
|
8
8
|
|
9
9
|
from sky.clouds.service_catalog import common
|
10
10
|
from sky.utils import resources_utils
|
@@ -72,7 +72,7 @@ def get_default_instance_type(
|
|
72
72
|
|
73
73
|
|
74
74
|
def get_accelerators_from_instance_type(
|
75
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
75
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
76
76
|
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
77
77
|
|
78
78
|
|
@@ -14,7 +14,7 @@ History:
|
|
14
14
|
import logging
|
15
15
|
import threading
|
16
16
|
import typing
|
17
|
-
from typing import Dict, List, Optional, Tuple
|
17
|
+
from typing import Dict, List, Optional, Tuple, Union
|
18
18
|
|
19
19
|
from sky.adaptors import oci as oci_adaptor
|
20
20
|
from sky.clouds import OCI
|
@@ -131,7 +131,7 @@ def get_default_instance_type(
|
|
131
131
|
|
132
132
|
|
133
133
|
def get_accelerators_from_instance_type(
|
134
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
134
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
135
135
|
return common.get_accelerators_from_instance_type_impl(
|
136
136
|
_get_df(), instance_type)
|
137
137
|
|
@@ -5,7 +5,7 @@ query instance types and pricing information for Paperspace.
|
|
5
5
|
"""
|
6
6
|
|
7
7
|
import typing
|
8
|
-
from typing import Dict, List, Optional, Tuple
|
8
|
+
from typing import Dict, List, Optional, Tuple, Union
|
9
9
|
|
10
10
|
from sky.clouds.service_catalog import common
|
11
11
|
from sky.utils import ux_utils
|
@@ -60,7 +60,7 @@ def get_default_instance_type(
|
|
60
60
|
|
61
61
|
|
62
62
|
def get_accelerators_from_instance_type(
|
63
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
63
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
64
64
|
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
65
65
|
|
66
66
|
|
@@ -5,7 +5,7 @@ query instance types and pricing information for RunPod.
|
|
5
5
|
"""
|
6
6
|
|
7
7
|
import typing
|
8
|
-
from typing import Dict, List, Optional, Tuple
|
8
|
+
from typing import Dict, List, Optional, Tuple, Union
|
9
9
|
|
10
10
|
from sky.clouds.service_catalog import common
|
11
11
|
from sky.utils import ux_utils
|
@@ -56,7 +56,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
|
|
56
56
|
|
57
57
|
|
58
58
|
def get_accelerators_from_instance_type(
|
59
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
59
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
60
60
|
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
61
61
|
|
62
62
|
|
@@ -5,7 +5,7 @@ instance types and pricing information for SCP.
|
|
5
5
|
"""
|
6
6
|
|
7
7
|
import typing
|
8
|
-
from typing import Dict, List, Optional, Tuple
|
8
|
+
from typing import Dict, List, Optional, Tuple, Union
|
9
9
|
|
10
10
|
from sky.clouds.service_catalog import common
|
11
11
|
from sky.utils import resources_utils
|
@@ -67,7 +67,7 @@ def get_default_instance_type(
|
|
67
67
|
|
68
68
|
|
69
69
|
def get_accelerators_from_instance_type(
|
70
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
70
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
71
71
|
return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
72
72
|
|
73
73
|
|
@@ -2,7 +2,7 @@
|
|
2
2
|
import io
|
3
3
|
import os
|
4
4
|
import typing
|
5
|
-
from typing import Dict, List, Optional, Tuple
|
5
|
+
from typing import Dict, List, Optional, Tuple, Union
|
6
6
|
|
7
7
|
from sky.adaptors import common as adaptors_common
|
8
8
|
from sky.clouds.service_catalog import common
|
@@ -85,7 +85,7 @@ def get_default_instance_type(
|
|
85
85
|
|
86
86
|
|
87
87
|
def get_accelerators_from_instance_type(
|
88
|
-
instance_type: str) -> Optional[Dict[str, int]]:
|
88
|
+
instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
89
89
|
return common.get_accelerators_from_instance_type_impl(
|
90
90
|
_get_df(), instance_type)
|
91
91
|
|
sky/clouds/vsphere.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
"""Vsphere cloud implementation."""
|
2
|
-
import json
|
3
2
|
import subprocess
|
4
3
|
import typing
|
5
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
4
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
6
5
|
|
7
6
|
import requests
|
8
7
|
|
@@ -152,7 +151,7 @@ class Vsphere(clouds.Cloud):
|
|
152
151
|
def get_accelerators_from_instance_type(
|
153
152
|
cls,
|
154
153
|
instance_type: str,
|
155
|
-
) -> Optional[Dict[str, int]]:
|
154
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
156
155
|
return service_catalog.get_accelerators_from_instance_type(
|
157
156
|
instance_type, clouds=_CLOUD_VSPHERE)
|
158
157
|
|
@@ -182,10 +181,8 @@ class Vsphere(clouds.Cloud):
|
|
182
181
|
zone_names = [zone.name for zone in zones]
|
183
182
|
r = resources
|
184
183
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
185
|
-
|
186
|
-
|
187
|
-
else:
|
188
|
-
custom_resources = None
|
184
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
185
|
+
acc_dict)
|
189
186
|
|
190
187
|
return {
|
191
188
|
'instance_type': resources.instance_type,
|
sky/jobs/controller.py
CHANGED
@@ -340,48 +340,28 @@ class JobsController:
|
|
340
340
|
common_utils.format_exception(reason, use_bracket=True)
|
341
341
|
for reason in e.reasons))
|
342
342
|
logger.error(failure_reason)
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
failure_type=managed_job_state.ManagedJobStatus.
|
347
|
-
FAILED_PRECHECKS,
|
348
|
-
failure_reason=failure_reason,
|
349
|
-
callback_func=managed_job_utils.event_callback_func(
|
350
|
-
job_id=self._job_id,
|
351
|
-
task_id=task_id,
|
352
|
-
task=self._dag.tasks[task_id]))
|
343
|
+
self._update_failed_task_state(
|
344
|
+
task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
|
345
|
+
failure_reason)
|
353
346
|
except exceptions.ManagedJobReachedMaxRetriesError as e:
|
354
347
|
# Please refer to the docstring of self._run for the cases when
|
355
348
|
# this exception can occur.
|
356
|
-
|
349
|
+
failure_reason = common_utils.format_exception(e)
|
350
|
+
logger.error(failure_reason)
|
357
351
|
# The managed job should be marked as FAILED_NO_RESOURCE, as the
|
358
352
|
# managed job may be able to launch next time.
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
failure_type=managed_job_state.ManagedJobStatus.
|
363
|
-
FAILED_NO_RESOURCE,
|
364
|
-
failure_reason=common_utils.format_exception(e),
|
365
|
-
callback_func=managed_job_utils.event_callback_func(
|
366
|
-
job_id=self._job_id,
|
367
|
-
task_id=task_id,
|
368
|
-
task=self._dag.tasks[task_id]))
|
353
|
+
self._update_failed_task_state(
|
354
|
+
task_id, managed_job_state.ManagedJobStatus.FAILED_NO_RESOURCE,
|
355
|
+
failure_reason)
|
369
356
|
except (Exception, SystemExit) as e: # pylint: disable=broad-except
|
370
357
|
with ux_utils.enable_traceback():
|
371
358
|
logger.error(traceback.format_exc())
|
372
|
-
msg = ('Unexpected error occurred: '
|
373
|
-
|
359
|
+
msg = ('Unexpected error occurred: ' +
|
360
|
+
common_utils.format_exception(e, use_bracket=True))
|
374
361
|
logger.error(msg)
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
failure_type=managed_job_state.ManagedJobStatus.
|
379
|
-
FAILED_CONTROLLER,
|
380
|
-
failure_reason=msg,
|
381
|
-
callback_func=managed_job_utils.event_callback_func(
|
382
|
-
job_id=self._job_id,
|
383
|
-
task_id=task_id,
|
384
|
-
task=self._dag.tasks[task_id]))
|
362
|
+
self._update_failed_task_state(
|
363
|
+
task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
|
364
|
+
msg)
|
385
365
|
finally:
|
386
366
|
# This will set all unfinished tasks to CANCELLING, and will not
|
387
367
|
# affect the jobs in terminal states.
|
@@ -396,6 +376,21 @@ class JobsController:
|
|
396
376
|
managed_job_state.set_cancelled(job_id=self._job_id,
|
397
377
|
callback_func=callback_func)
|
398
378
|
|
379
|
+
def _update_failed_task_state(
|
380
|
+
self, task_id: int,
|
381
|
+
failure_type: managed_job_state.ManagedJobStatus,
|
382
|
+
failure_reason: str):
|
383
|
+
"""Update the state of the failed task."""
|
384
|
+
managed_job_state.set_failed(
|
385
|
+
self._job_id,
|
386
|
+
task_id=task_id,
|
387
|
+
failure_type=failure_type,
|
388
|
+
failure_reason=failure_reason,
|
389
|
+
callback_func=managed_job_utils.event_callback_func(
|
390
|
+
job_id=self._job_id,
|
391
|
+
task_id=task_id,
|
392
|
+
task=self._dag.tasks[task_id]))
|
393
|
+
|
399
394
|
|
400
395
|
def _run_controller(job_id: int, dag_yaml: str, retry_until_up: bool):
|
401
396
|
"""Runs the controller in a remote process for interruption."""
|
sky/resources.py
CHANGED
@@ -392,7 +392,7 @@ class Resources:
|
|
392
392
|
|
393
393
|
@property
|
394
394
|
@functools.lru_cache(maxsize=1)
|
395
|
-
def accelerators(self) -> Optional[Dict[str, int]]:
|
395
|
+
def accelerators(self) -> Optional[Dict[str, Union[int, float]]]:
|
396
396
|
"""Returns the accelerators field directly or by inferring.
|
397
397
|
|
398
398
|
For example, Resources(AWS, 'p3.2xlarge') has its accelerators field
|
sky/utils/resources_utils.py
CHANGED
@@ -2,9 +2,11 @@
|
|
2
2
|
import dataclasses
|
3
3
|
import enum
|
4
4
|
import itertools
|
5
|
+
import json
|
6
|
+
import math
|
5
7
|
import re
|
6
8
|
import typing
|
7
|
-
from typing import List, Optional, Set
|
9
|
+
from typing import Dict, List, Optional, Set, Union
|
8
10
|
|
9
11
|
from sky import skypilot_config
|
10
12
|
from sky.clouds import cloud_registry
|
@@ -163,6 +165,16 @@ def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle',
|
|
163
165
|
return _DEFAULT_MESSAGE_HANDLE_INITIALIZING
|
164
166
|
|
165
167
|
|
168
|
+
def make_ray_custom_resources_str(
|
169
|
+
resource_dict: Optional[Dict[str, Union[int, float]]]) -> Optional[str]:
|
170
|
+
"""Convert resources to Ray custom resources format."""
|
171
|
+
if resource_dict is None:
|
172
|
+
return None
|
173
|
+
# Ray does not allow fractional resources, so we need to ceil the values.
|
174
|
+
ceiled_dict = {k: math.ceil(v) for k, v in resource_dict.items()}
|
175
|
+
return json.dumps(ceiled_dict, separators=(',', ':'))
|
176
|
+
|
177
|
+
|
166
178
|
@dataclasses.dataclass
|
167
179
|
class FeasibleResources:
|
168
180
|
"""Feasible resources returned by cloud.
|
{skypilot_nightly-1.0.0.dev20241026.dist-info → skypilot_nightly-1.0.0.dev20241028.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=VJgULLW-g5zmsZdNK6RE_nYOPiwoAau82qlC-KSxDjs,5882
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
|
4
4
|
sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
|
@@ -10,7 +10,7 @@ sky/exceptions.py,sha256=KBIEJHgrw6OMBL8H65o-Gk6qYQEV1SR9gBwMjnMnxxg,8858
|
|
10
10
|
sky/execution.py,sha256=CbrKMgfc2JgLqZqwPvmYKxbWAQKYqHpOLpUEOb-k2m0,24679
|
11
11
|
sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
|
12
12
|
sky/optimizer.py,sha256=OzxWiA6ZC0tyJ1eNMy4e72vitjfLKfbOLF9ywZOccXU,59343
|
13
|
-
sky/resources.py,sha256=
|
13
|
+
sky/resources.py,sha256=bm004Ms2qlBqEr0N_TEUybDOXJVhLF8yOwkhoqb1t9c,67478
|
14
14
|
sky/sky_logging.py,sha256=oLmTmwkuucIto3LHXLJfMcyRpYSkmZAZa5XzQPA5IHk,4434
|
15
15
|
sky/skypilot_config.py,sha256=E3g65cX3P3dT9b5N0GgFBG6yB0FXwIGpisKoozmJmWU,9094
|
16
16
|
sky/status_lib.py,sha256=J7Jb4_Dz0v2T64ttOdyUgpokvl4S0sBJrMfH7Fvo51A,1457
|
@@ -31,7 +31,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
|
|
31
31
|
sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
|
32
32
|
sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
|
33
33
|
sky/backends/backend_utils.py,sha256=PA21DAXspXuTZDQ5qA3G5RGJ0oUTpJ7XatRRvhtmtt0,126993
|
34
|
-
sky/backends/cloud_vm_ray_backend.py,sha256=
|
34
|
+
sky/backends/cloud_vm_ray_backend.py,sha256=D9x5TT_4IE_WhzgLjj-I7nliRmnCY9DKVwcLuJicx7s,237775
|
35
35
|
sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
|
36
36
|
sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
|
37
37
|
sky/backends/wheel_utils.py,sha256=3QS4T_Ydvo4DbYhogtyADyNBEf04I6jUCL71M285shQ,7963
|
@@ -40,42 +40,42 @@ sky/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
40
|
sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG2yg,8723
|
41
41
|
sky/benchmark/benchmark_utils.py,sha256=eb-i6zYoo-Zkod-T9qtCu1FcYLw--Yyos1SyibUPZNE,26194
|
42
42
|
sky/clouds/__init__.py,sha256=WuNIJEnZmBO72tU5awgaaL3rdvFRSkgaYNNeuY68dXo,1356
|
43
|
-
sky/clouds/aws.py,sha256=
|
44
|
-
sky/clouds/azure.py,sha256=
|
45
|
-
sky/clouds/cloud.py,sha256=
|
43
|
+
sky/clouds/aws.py,sha256=dVZ8auaa2z2Ifl9iiRT06IeEFaNtZhANKtHVLT6Gcno,49474
|
44
|
+
sky/clouds/azure.py,sha256=wgR78HMn64EwgQlU9Klv-j3neVbgo4NKycaBkNHyiBc,30158
|
45
|
+
sky/clouds/cloud.py,sha256=A5F4a71ciPyljWEs6vT-4RmdGT-AE9NkhS8gJ4Vgi_I,35165
|
46
46
|
sky/clouds/cloud_registry.py,sha256=oLoYFjm_SDTgdHokY7b6A5Utq80HXRQNxV0fLjDdVsQ,2361
|
47
|
-
sky/clouds/cudo.py,sha256=
|
48
|
-
sky/clouds/fluidstack.py,sha256=
|
49
|
-
sky/clouds/gcp.py,sha256=
|
50
|
-
sky/clouds/ibm.py,sha256=
|
51
|
-
sky/clouds/kubernetes.py,sha256=
|
52
|
-
sky/clouds/lambda_cloud.py,sha256=
|
53
|
-
sky/clouds/oci.py,sha256=
|
54
|
-
sky/clouds/paperspace.py,sha256=
|
55
|
-
sky/clouds/runpod.py,sha256=
|
56
|
-
sky/clouds/scp.py,sha256=
|
57
|
-
sky/clouds/vsphere.py,sha256=
|
58
|
-
sky/clouds/service_catalog/__init__.py,sha256=
|
59
|
-
sky/clouds/service_catalog/aws_catalog.py,sha256=
|
60
|
-
sky/clouds/service_catalog/azure_catalog.py,sha256=
|
61
|
-
sky/clouds/service_catalog/common.py,sha256=
|
47
|
+
sky/clouds/cudo.py,sha256=UiY273Sln7VOYDYx93yWiWH_RLlOKZ2cm7mA31ld4A8,13094
|
48
|
+
sky/clouds/fluidstack.py,sha256=ufve4wXo_VmaEkxqTw2Jnt-DORBDRnqUPU1kB_mD89s,12383
|
49
|
+
sky/clouds/gcp.py,sha256=BjCehW3s0IYkRDdEEDm0vYWXQDpOV8KU98OMVRPnQNg,54676
|
50
|
+
sky/clouds/ibm.py,sha256=w8bo1EIY_YWYNu0fy-OpAyr6DZviU0tpIXUsiV01rVE,21423
|
51
|
+
sky/clouds/kubernetes.py,sha256=WbbxJ9IIF3HtroGJhc4akZV-Pf3_sroVcHUxKIXIk5I,28643
|
52
|
+
sky/clouds/lambda_cloud.py,sha256=ExL_uixdFrF9qSL5JYXpaOXCZ9_eOA2q444kcmBHBXQ,12644
|
53
|
+
sky/clouds/oci.py,sha256=sHJrVhUhOKvJ-skbd2ZJ82IR63OXp43krmyPpM8BZqw,27084
|
54
|
+
sky/clouds/paperspace.py,sha256=4cjNua6jpuxmfidvLY4tSRX1oj_QaaHDinPMunGplyU,10868
|
55
|
+
sky/clouds/runpod.py,sha256=_4myVdGIvQshkka8fn6mBXHgz5TZqhrNhAEM2_HrCT8,11487
|
56
|
+
sky/clouds/scp.py,sha256=NivPvzQxA90R37QR3fgTup8ScGfxKsXAhH0xklAj5QU,15817
|
57
|
+
sky/clouds/vsphere.py,sha256=ZzlcQBzv0aaRYXwZOrdKIGFK94LaOfDSV3lJBg9xyc4,12256
|
58
|
+
sky/clouds/service_catalog/__init__.py,sha256=cFZ3HLdQVa42xOhK2XxuB_xPIX4X1UR89InR4y2y_78,14757
|
59
|
+
sky/clouds/service_catalog/aws_catalog.py,sha256=vTI7h5bjZg3lItT9RBaSwY1Fl0vX5UN1CgMDM6-C1pw,13059
|
60
|
+
sky/clouds/service_catalog/azure_catalog.py,sha256=5Q51x_WEKvQ2YSgJvZHRH3URlbwIstYuwpjaWW_wJlw,8149
|
61
|
+
sky/clouds/service_catalog/common.py,sha256=GcKjtJTuPbpHoqh6CKoTfDJ2EWB9yFiIRmRUgr6oJI4,27615
|
62
62
|
sky/clouds/service_catalog/config.py,sha256=ylzqewdEBjDg4awvFek6ldYmFrnvD2bVGLZuLPvEVYA,1793
|
63
63
|
sky/clouds/service_catalog/constants.py,sha256=ai2yOlsVqBnEpbxaEHXt61COsHBLwOfw6GZXntEPj7k,411
|
64
|
-
sky/clouds/service_catalog/cudo_catalog.py,sha256=
|
65
|
-
sky/clouds/service_catalog/fluidstack_catalog.py,sha256=
|
64
|
+
sky/clouds/service_catalog/cudo_catalog.py,sha256=V_takvL6dWTGQaTLCEvjKIotCDPnMujiNUZ87kZKGVI,4673
|
65
|
+
sky/clouds/service_catalog/fluidstack_catalog.py,sha256=21-cvrYEYTIi7n3ZNF2e7_0QX-PF4BkhlVJUWQOvKrY,5059
|
66
66
|
sky/clouds/service_catalog/gcp_catalog.py,sha256=v_5fsB3dB9oD8U7lBKnCe5ii6AUWEOiQjNarMnU_qLA,24379
|
67
|
-
sky/clouds/service_catalog/ibm_catalog.py,sha256=
|
67
|
+
sky/clouds/service_catalog/ibm_catalog.py,sha256=1iK0KvbI82U7sySb7chr-qm_16x3tTnZ6nIo7o76ouc,4493
|
68
68
|
sky/clouds/service_catalog/kubernetes_catalog.py,sha256=Eezfl-tx3obgy3d2Kz2XR-_ezj_y8Dxk4oOW7Hy_g-o,8599
|
69
|
-
sky/clouds/service_catalog/lambda_catalog.py,sha256=
|
70
|
-
sky/clouds/service_catalog/oci_catalog.py,sha256=
|
71
|
-
sky/clouds/service_catalog/paperspace_catalog.py,sha256=
|
72
|
-
sky/clouds/service_catalog/runpod_catalog.py,sha256=
|
73
|
-
sky/clouds/service_catalog/scp_catalog.py,sha256=
|
74
|
-
sky/clouds/service_catalog/vsphere_catalog.py,sha256=
|
69
|
+
sky/clouds/service_catalog/lambda_catalog.py,sha256=2R-ccu63BbdvO6X80MtxiniA-jLewXb6I0Ye1rYD9fY,5302
|
70
|
+
sky/clouds/service_catalog/oci_catalog.py,sha256=DQaP0iQlxZEHWJs862ilynUfUEQDIjCGltS7kSadgYo,8572
|
71
|
+
sky/clouds/service_catalog/paperspace_catalog.py,sha256=MOlfoGRChjEwMzu4nRAho8DrIwwUJ3QlRzrMA1RLqvE,3789
|
72
|
+
sky/clouds/service_catalog/runpod_catalog.py,sha256=oWYVgSMiK3DxBE5AgROyExIq9kCTaOr3hDLSc31kqTU,4205
|
73
|
+
sky/clouds/service_catalog/scp_catalog.py,sha256=nrtD0hAZd1rUDsFuHI1hrBgAVSE5YprdWoYSXQooIqU,5195
|
74
|
+
sky/clouds/service_catalog/vsphere_catalog.py,sha256=OV3Czi3vwRSW4lqVPHxU_GND0ox322gmhv3kb11Q8AM,4412
|
75
75
|
sky/clouds/service_catalog/data_fetchers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
76
76
|
sky/clouds/service_catalog/data_fetchers/analyze.py,sha256=VdksJQs3asFE8H5T3ZV1FJas2xD9WEX6c-V5p7y-wp4,2084
|
77
77
|
sky/clouds/service_catalog/data_fetchers/fetch_aws.py,sha256=ro2zazdkDF6z9bE7QFyjoeb4VFxmbNZ1WK5IQrdoQWk,23003
|
78
|
-
sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=
|
78
|
+
sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=L1JsX1YrhpyI7ylzEPBBNE9XOZM0K0FIXbBUMj9h0MQ,12803
|
79
79
|
sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
|
80
80
|
sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=35nO_VaDOgp5W13kt_lIANSk_CNf7gBiZGJ5fGyZu6o,6808
|
81
81
|
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=5CbgU90ldiKVgaagQTnYBJVsgVGE3cMwtF7KpBiTtvU,29873
|
@@ -95,7 +95,7 @@ sky/data/storage.py,sha256=x8YYY4zVBdit_5oAR_MXV-TM9qDefV_ZV4z0irv6ZaU,163102
|
|
95
95
|
sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
|
96
96
|
sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
|
97
97
|
sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
|
98
|
-
sky/jobs/controller.py,sha256=
|
98
|
+
sky/jobs/controller.py,sha256=zSdawmXg-9SZ91jJg5_OSFVlntu9xupLs-CiPwG1QdQ,26412
|
99
99
|
sky/jobs/core.py,sha256=RkBFaKDlovmdzqlOAgQ0xAimZFgo4pXq3qaQkAvGsGk,16908
|
100
100
|
sky/jobs/recovery_strategy.py,sha256=UOEaVGSpRbCnCzlD8cgyjhCPIBIeBeCXCutoSic5aiA,25545
|
101
101
|
sky/jobs/state.py,sha256=C6R5Yq7ftBqGPa_71tUjflBMKAaJ1FTTdbgjAwmbJsI,23231
|
@@ -254,7 +254,7 @@ sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
|
|
254
254
|
sky/utils/env_options.py,sha256=3oAaUPxowL6vI2XmxXrH56V7Myj9IJWsL-MXFmRFVdI,1294
|
255
255
|
sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM,1384
|
256
256
|
sky/utils/log_utils.py,sha256=ptv2sbsiJSgk4NvdccrMsUR-MvOKnbu4BQiRSishgk0,12472
|
257
|
-
sky/utils/resources_utils.py,sha256=
|
257
|
+
sky/utils/resources_utils.py,sha256=Xqi7gxPYw2y5wl5okUI5zx5LEij0hJF_V3Zi8q7TXYg,7890
|
258
258
|
sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
|
259
259
|
sky/utils/schemas.py,sha256=qo9j1TJZXqgJlBgbQfqz1oIZAxc3CN8uWooKYPQXXIY,28878
|
260
260
|
sky/utils/subprocess_utils.py,sha256=3R54Elc2n8DQeO6Y8MCDJ6N6v27HDGpbNMIfCquqXYQ,6552
|
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
|
|
274
274
|
sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
|
275
275
|
sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
|
276
276
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
|
277
|
-
skypilot_nightly-1.0.0.
|
278
|
-
skypilot_nightly-1.0.0.
|
279
|
-
skypilot_nightly-1.0.0.
|
280
|
-
skypilot_nightly-1.0.0.
|
281
|
-
skypilot_nightly-1.0.0.
|
282
|
-
skypilot_nightly-1.0.0.
|
277
|
+
skypilot_nightly-1.0.0.dev20241028.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
278
|
+
skypilot_nightly-1.0.0.dev20241028.dist-info/METADATA,sha256=8eFLTibwR6C40v9UaWQJTpI65ybVGgMyoeGkQzxl-qk,19540
|
279
|
+
skypilot_nightly-1.0.0.dev20241028.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
|
280
|
+
skypilot_nightly-1.0.0.dev20241028.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
281
|
+
skypilot_nightly-1.0.0.dev20241028.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
282
|
+
skypilot_nightly-1.0.0.dev20241028.dist-info/RECORD,,
|
File without changes
|
{skypilot_nightly-1.0.0.dev20241026.dist-info → skypilot_nightly-1.0.0.dev20241028.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|