torchx-nightly 2024.6.18__py3-none-any.whl → 2024.6.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchx-nightly might be problematic. Click here for more details.
- torchx/schedulers/devices.py +15 -4
- torchx/specs/api.py +7 -1
- torchx/specs/named_resources_aws.py +7 -2
- {torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/METADATA +1 -1
- {torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/RECORD +9 -9
- {torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/LICENSE +0 -0
- {torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/WHEEL +0 -0
- {torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/top_level.txt +0 -0
torchx/schedulers/devices.py
CHANGED
|
@@ -7,25 +7,36 @@
|
|
|
7
7
|
|
|
8
8
|
# pyre-strict
|
|
9
9
|
import warnings
|
|
10
|
+
from functools import partial
|
|
10
11
|
from typing import Callable, Dict, List, Mapping
|
|
11
12
|
|
|
12
13
|
from torchx.specs.api import DeviceMount
|
|
14
|
+
from torchx.specs.named_resources_aws import EFA_DEVICE, NEURON_DEVICE
|
|
13
15
|
|
|
14
16
|
|
|
15
|
-
def
|
|
17
|
+
def to_devicemounts(num_devices: int, device_type: str) -> List[DeviceMount]:
|
|
16
18
|
device_mounts = []
|
|
17
19
|
for device_index in range(0, num_devices):
|
|
18
20
|
device_mounts.append(
|
|
19
21
|
DeviceMount(
|
|
20
|
-
src_path=
|
|
21
|
-
dst_path=
|
|
22
|
+
src_path=device_type + str(device_index),
|
|
23
|
+
dst_path=device_type + str(device_index),
|
|
22
24
|
)
|
|
23
25
|
)
|
|
24
26
|
return device_mounts
|
|
25
27
|
|
|
26
28
|
|
|
29
|
+
neuron_to_devicemounts: Callable[[int], List[DeviceMount]] = partial(
|
|
30
|
+
to_devicemounts, device_type="/dev/neuron"
|
|
31
|
+
)
|
|
32
|
+
efa_to_devicemounts: Callable[[int], List[DeviceMount]] = partial(
|
|
33
|
+
to_devicemounts, device_type="/dev/infiniband/uverbs"
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
|
|
27
37
|
DEVICES: Mapping[str, Callable[[int], List[DeviceMount]]] = {
|
|
28
|
-
|
|
38
|
+
EFA_DEVICE: efa_to_devicemounts,
|
|
39
|
+
NEURON_DEVICE: neuron_to_devicemounts,
|
|
29
40
|
}
|
|
30
41
|
|
|
31
42
|
|
torchx/specs/api.py
CHANGED
|
@@ -237,11 +237,15 @@ class RetryPolicy(str, Enum):
|
|
|
237
237
|
application to deal with failed replica departures and
|
|
238
238
|
replacement replica admittance.
|
|
239
239
|
2. APPLICATION: Restarts the entire application.
|
|
240
|
-
|
|
240
|
+
3. HOT_SPARE: Restarts the replicas for a role as long as quorum (min_replicas)
|
|
241
|
+
is not violated using extra hosts as spares. It does not really support
|
|
242
|
+
elasticity and just uses the delta between num_replicas and min_replicas
|
|
243
|
+
as spares (EXPERIMENTAL).
|
|
241
244
|
"""
|
|
242
245
|
|
|
243
246
|
REPLICA = "REPLICA"
|
|
244
247
|
APPLICATION = "APPLICATION"
|
|
248
|
+
HOT_SPARE = "HOT_SPARE"
|
|
245
249
|
|
|
246
250
|
|
|
247
251
|
class MountType(str, Enum):
|
|
@@ -340,6 +344,8 @@ class Role:
|
|
|
340
344
|
and num_replicas depending on the cluster resources and
|
|
341
345
|
policies. If the scheduler doesn't support auto scaling this
|
|
342
346
|
field is ignored and the job size will be num_replicas.
|
|
347
|
+
EXPERIMENTAL: For HOT_SPARE restart policy this field is used to
|
|
348
|
+
indicate the quorum required for the job to run.
|
|
343
349
|
max_retries: max number of retries before giving up
|
|
344
350
|
retry_policy: retry behavior upon replica failures
|
|
345
351
|
resource: Resource requirement for the role. The role should be scheduled
|
|
torchx/specs/named_resources_aws.py
CHANGED
|
@@ -37,6 +37,7 @@ from typing import Callable, Mapping
|
|
|
37
37
|
from torchx.specs.api import Resource
|
|
38
38
|
|
|
39
39
|
EFA_DEVICE = "vpc.amazonaws.com/efa"
|
|
40
|
+
NEURON_DEVICE = "aws.amazon.com/neurondevice"
|
|
40
41
|
|
|
41
42
|
# ecs and ec2 have memtax and currently AWS Batch uses hard memory limits
|
|
42
43
|
# so we have to account for mem tax when registering these resources for AWS
|
|
@@ -255,7 +256,11 @@ def aws_g5_48xlarge() -> Resource:
|
|
|
255
256
|
|
|
256
257
|
def aws_trn1_2xlarge() -> Resource:
|
|
257
258
|
return Resource(
|
|
258
|
-
cpu=8,
|
|
259
|
+
cpu=8,
|
|
260
|
+
gpu=0,
|
|
261
|
+
memMB=32 * GiB,
|
|
262
|
+
capabilities={K8S_ITYPE: "trn1.2xlarge"},
|
|
263
|
+
devices={NEURON_DEVICE: 1},
|
|
259
264
|
)
|
|
260
265
|
|
|
261
266
|
|
|
@@ -265,7 +270,7 @@ def aws_trn1_32xlarge() -> Resource:
|
|
|
265
270
|
gpu=0,
|
|
266
271
|
memMB=512 * GiB,
|
|
267
272
|
capabilities={K8S_ITYPE: "trn1.32xlarge"},
|
|
268
|
-
devices={EFA_DEVICE: 8},
|
|
273
|
+
devices={EFA_DEVICE: 8, NEURON_DEVICE: 16},
|
|
269
274
|
)
|
|
270
275
|
|
|
271
276
|
|
|
{torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/RECORD
CHANGED
|
@@ -68,7 +68,7 @@ torchx/schedulers/__init__.py,sha256=M9SBZiNdE3KI_yc1-BiRtAetfTgtX07uKkuvGUeZQLU
|
|
|
68
68
|
torchx/schedulers/api.py,sha256=s2hI87uAWtU2SHMNBKjAqelzQU_GKp_BjcxdtjVVDDk,14155
|
|
69
69
|
torchx/schedulers/aws_batch_scheduler.py,sha256=7qxy3UFRq0F731-kTjEi6VABWKD60o0req6CBMsTohU,27975
|
|
70
70
|
torchx/schedulers/aws_sagemaker_scheduler.py,sha256=dPah3yaKFUVm-ZZrzFbyM_abP-LCTd-AcAjZ6t2iycU,20699
|
|
71
|
-
torchx/schedulers/devices.py,sha256=
|
|
71
|
+
torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
|
|
72
72
|
torchx/schedulers/docker_scheduler.py,sha256=IrDlmeH-tg_f3krA04Y81nK9dmuYfEPbYOuCjSQkIHA,16541
|
|
73
73
|
torchx/schedulers/gcp_batch_scheduler.py,sha256=dlUfvjfMuQiRcSXQAdwxqdadwPhOf82L5u-ejRWtFgE,16226
|
|
74
74
|
torchx/schedulers/ids.py,sha256=3E-_vwVYC-8Tv8kjuY9-W7TbOe_-Laqd8a65uIN3hQY,1798
|
|
@@ -83,11 +83,11 @@ torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80Rx
|
|
|
83
83
|
torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
|
|
84
84
|
torchx/schedulers/ray/ray_driver.py,sha256=Wl-1jldL8veVKzmYDEeR2va3JSlAjZpFE1h8HWE9YVE,12286
|
|
85
85
|
torchx/specs/__init__.py,sha256=vF-WUu_4NZP30lCtNYg0YVenY6wRQ8k7K36fOxqbOKc,5477
|
|
86
|
-
torchx/specs/api.py,sha256=
|
|
86
|
+
torchx/specs/api.py,sha256=Y5uT7a-qZ4pP4kSfS6yYP1jUzQWLPI4qLQeyuBv5uDQ,36085
|
|
87
87
|
torchx/specs/builders.py,sha256=QDcQrnCO4bdSaiP0216XbCgTsnLutO_1_FW5jDiEIWI,9939
|
|
88
88
|
torchx/specs/file_linter.py,sha256=IeiomB1BgHUlT-ZsvGxar3llY63NOupfLBrOrD_---A,11860
|
|
89
89
|
torchx/specs/finder.py,sha256=MnwxG_UC4a-3X2wQ37ANEQR6D1TvriCLyuVYBh_-wuI,16249
|
|
90
|
-
torchx/specs/named_resources_aws.py,sha256=
|
|
90
|
+
torchx/specs/named_resources_aws.py,sha256=NDzF9srT7hiS5NGwEJc_sbuwxXMtq8l3rVG0QnVqpJE,8114
|
|
91
91
|
torchx/specs/named_resources_generic.py,sha256=Sg4tAdqiiWDrDz2Lj_pnfsjzGIXKTou73wPseh6j55w,2646
|
|
92
92
|
torchx/specs/test/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
|
|
93
93
|
torchx/specs/test/components/a/__init__.py,sha256=kdxEgnI8QBSBiuTjaB4qDD7JX84hWowyPWU4B2Cqe9A,561
|
|
@@ -113,9 +113,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
|
|
|
113
113
|
torchx/workspace/api.py,sha256=1heBmPgB-W5Zf9gwViM7NrqvHpZlVYeMN7jpY8Qkytc,5479
|
|
114
114
|
torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
|
|
115
115
|
torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
|
|
116
|
-
torchx_nightly-2024.6.
|
|
117
|
-
torchx_nightly-2024.6.
|
|
118
|
-
torchx_nightly-2024.6.
|
|
119
|
-
torchx_nightly-2024.6.
|
|
120
|
-
torchx_nightly-2024.6.
|
|
121
|
-
torchx_nightly-2024.6.
|
|
116
|
+
torchx_nightly-2024.6.21.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
|
|
117
|
+
torchx_nightly-2024.6.21.dist-info/METADATA,sha256=nz1Jh0khbL27eBONb-oD_ir0Ju4qZXg9mlCt858zBB4,6184
|
|
118
|
+
torchx_nightly-2024.6.21.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
119
|
+
torchx_nightly-2024.6.21.dist-info/entry_points.txt,sha256=3JYZFlX9aWzR-Gs_qsx1zq7mlqbFz6Mi9rQUULW8caI,170
|
|
120
|
+
torchx_nightly-2024.6.21.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
|
|
121
|
+
torchx_nightly-2024.6.21.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|