torchx-nightly 2024.6.18__py3-none-any.whl → 2024.6.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchx-nightly might be problematic. Click here for more details.

@@ -7,25 +7,36 @@
7
7
 
8
8
  # pyre-strict
9
9
  import warnings
10
+ from functools import partial
10
11
  from typing import Callable, Dict, List, Mapping
11
12
 
12
13
  from torchx.specs.api import DeviceMount
14
+ from torchx.specs.named_resources_aws import EFA_DEVICE, NEURON_DEVICE
13
15
 
14
16
 
15
- def efa_to_devicemounts(num_devices: int) -> List[DeviceMount]:
17
+ def to_devicemounts(num_devices: int, device_type: str) -> List[DeviceMount]:
16
18
  device_mounts = []
17
19
  for device_index in range(0, num_devices):
18
20
  device_mounts.append(
19
21
  DeviceMount(
20
- src_path="/dev/infiniband/uverbs" + str(device_index),
21
- dst_path="/dev/infiniband/uverbs" + str(device_index),
22
+ src_path=device_type + str(device_index),
23
+ dst_path=device_type + str(device_index),
22
24
  )
23
25
  )
24
26
  return device_mounts
25
27
 
26
28
 
29
+ neuron_to_devicemounts: Callable[[int], List[DeviceMount]] = partial(
30
+ to_devicemounts, device_type="/dev/neuron"
31
+ )
32
+ efa_to_devicemounts: Callable[[int], List[DeviceMount]] = partial(
33
+ to_devicemounts, device_type="/dev/infiniband/uverbs"
34
+ )
35
+
36
+
27
37
  DEVICES: Mapping[str, Callable[[int], List[DeviceMount]]] = {
28
- "vpc.amazonaws.com/efa": efa_to_devicemounts,
38
+ EFA_DEVICE: efa_to_devicemounts,
39
+ NEURON_DEVICE: neuron_to_devicemounts,
29
40
  }
30
41
 
31
42
 
torchx/specs/api.py CHANGED
@@ -237,11 +237,15 @@ class RetryPolicy(str, Enum):
237
237
  application to deal with failed replica departures and
238
238
  replacement replica admittance.
239
239
  2. APPLICATION: Restarts the entire application.
240
-
240
+ 3. HOT_SPARE: Restarts the replicas for a role as long as quorum (min_replicas)
241
+ is not violated using extra hosts as spares. It does not really support
242
+ elasticity and just uses the delta between num_replicas and min_replicas
243
+ as spares (EXPERIMENTAL).
241
244
  """
242
245
 
243
246
  REPLICA = "REPLICA"
244
247
  APPLICATION = "APPLICATION"
248
+ HOT_SPARE = "HOT_SPARE"
245
249
 
246
250
 
247
251
  class MountType(str, Enum):
@@ -340,6 +344,8 @@ class Role:
340
344
  and num_replicas depending on the cluster resources and
341
345
  policies. If the scheduler doesn't support auto scaling this
342
346
  field is ignored and the job size will be num_replicas.
347
+ EXPERIMENTAL: For HOT_SPARE restart policy this field is used to
348
+ indicate the quorum required for the job to run.
343
349
  max_retries: max number of retries before giving up
344
350
  retry_policy: retry behavior upon replica failures
345
351
  resource: Resource requirement for the role. The role should be scheduled
@@ -37,6 +37,7 @@ from typing import Callable, Mapping
37
37
  from torchx.specs.api import Resource
38
38
 
39
39
  EFA_DEVICE = "vpc.amazonaws.com/efa"
40
+ NEURON_DEVICE = "aws.amazon.com/neurondevice"
40
41
 
41
42
  # ecs and ec2 have memtax and currently AWS Batch uses hard memory limits
42
43
  # so we have to account for mem tax when registering these resources for AWS
@@ -255,7 +256,11 @@ def aws_g5_48xlarge() -> Resource:
255
256
 
256
257
  def aws_trn1_2xlarge() -> Resource:
257
258
  return Resource(
258
- cpu=8, gpu=0, memMB=32 * GiB, capabilities={K8S_ITYPE: "trn1.2xlarge"}
259
+ cpu=8,
260
+ gpu=0,
261
+ memMB=32 * GiB,
262
+ capabilities={K8S_ITYPE: "trn1.2xlarge"},
263
+ devices={NEURON_DEVICE: 1},
259
264
  )
260
265
 
261
266
 
@@ -265,7 +270,7 @@ def aws_trn1_32xlarge() -> Resource:
265
270
  gpu=0,
266
271
  memMB=512 * GiB,
267
272
  capabilities={K8S_ITYPE: "trn1.32xlarge"},
268
- devices={EFA_DEVICE: 8},
273
+ devices={EFA_DEVICE: 8, NEURON_DEVICE: 16},
269
274
  )
270
275
 
271
276
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: torchx-nightly
3
- Version: 2024.6.18
3
+ Version: 2024.6.21
4
4
  Summary: TorchX SDK and Components
5
5
  Home-page: https://github.com/pytorch/torchx
6
6
  Author: TorchX Devs
@@ -68,7 +68,7 @@ torchx/schedulers/__init__.py,sha256=M9SBZiNdE3KI_yc1-BiRtAetfTgtX07uKkuvGUeZQLU
68
68
  torchx/schedulers/api.py,sha256=s2hI87uAWtU2SHMNBKjAqelzQU_GKp_BjcxdtjVVDDk,14155
69
69
  torchx/schedulers/aws_batch_scheduler.py,sha256=7qxy3UFRq0F731-kTjEi6VABWKD60o0req6CBMsTohU,27975
70
70
  torchx/schedulers/aws_sagemaker_scheduler.py,sha256=dPah3yaKFUVm-ZZrzFbyM_abP-LCTd-AcAjZ6t2iycU,20699
71
- torchx/schedulers/devices.py,sha256=BnjZnbXGTWiZKLmMKE3zPDOkb4Vao3jgPVa01aV6vyY,1367
71
+ torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
72
72
  torchx/schedulers/docker_scheduler.py,sha256=IrDlmeH-tg_f3krA04Y81nK9dmuYfEPbYOuCjSQkIHA,16541
73
73
  torchx/schedulers/gcp_batch_scheduler.py,sha256=dlUfvjfMuQiRcSXQAdwxqdadwPhOf82L5u-ejRWtFgE,16226
74
74
  torchx/schedulers/ids.py,sha256=3E-_vwVYC-8Tv8kjuY9-W7TbOe_-Laqd8a65uIN3hQY,1798
@@ -83,11 +83,11 @@ torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80Rx
83
83
  torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
84
84
  torchx/schedulers/ray/ray_driver.py,sha256=Wl-1jldL8veVKzmYDEeR2va3JSlAjZpFE1h8HWE9YVE,12286
85
85
  torchx/specs/__init__.py,sha256=vF-WUu_4NZP30lCtNYg0YVenY6wRQ8k7K36fOxqbOKc,5477
86
- torchx/specs/api.py,sha256=1qmajrQcTKhVWDdsuWc57y2s0UYO1tD0p7XYg20cm5Q,35609
86
+ torchx/specs/api.py,sha256=Y5uT7a-qZ4pP4kSfS6yYP1jUzQWLPI4qLQeyuBv5uDQ,36085
87
87
  torchx/specs/builders.py,sha256=QDcQrnCO4bdSaiP0216XbCgTsnLutO_1_FW5jDiEIWI,9939
88
88
  torchx/specs/file_linter.py,sha256=IeiomB1BgHUlT-ZsvGxar3llY63NOupfLBrOrD_---A,11860
89
89
  torchx/specs/finder.py,sha256=MnwxG_UC4a-3X2wQ37ANEQR6D1TvriCLyuVYBh_-wuI,16249
90
- torchx/specs/named_resources_aws.py,sha256=gmMbzzaYy3WzwDFo5iyd28icX7zRbUOseLl8yudY13Q,7988
90
+ torchx/specs/named_resources_aws.py,sha256=NDzF9srT7hiS5NGwEJc_sbuwxXMtq8l3rVG0QnVqpJE,8114
91
91
  torchx/specs/named_resources_generic.py,sha256=Sg4tAdqiiWDrDz2Lj_pnfsjzGIXKTou73wPseh6j55w,2646
92
92
  torchx/specs/test/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
93
93
  torchx/specs/test/components/a/__init__.py,sha256=kdxEgnI8QBSBiuTjaB4qDD7JX84hWowyPWU4B2Cqe9A,561
@@ -113,9 +113,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
113
113
  torchx/workspace/api.py,sha256=1heBmPgB-W5Zf9gwViM7NrqvHpZlVYeMN7jpY8Qkytc,5479
114
114
  torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
115
115
  torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
116
- torchx_nightly-2024.6.18.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
117
- torchx_nightly-2024.6.18.dist-info/METADATA,sha256=oypt_fwq9wzJbR8VbKz8RI-wd47bO0PdDC9AQSaS0kA,6184
118
- torchx_nightly-2024.6.18.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
119
- torchx_nightly-2024.6.18.dist-info/entry_points.txt,sha256=3JYZFlX9aWzR-Gs_qsx1zq7mlqbFz6Mi9rQUULW8caI,170
120
- torchx_nightly-2024.6.18.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
121
- torchx_nightly-2024.6.18.dist-info/RECORD,,
116
+ torchx_nightly-2024.6.21.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
117
+ torchx_nightly-2024.6.21.dist-info/METADATA,sha256=nz1Jh0khbL27eBONb-oD_ir0Ju4qZXg9mlCt858zBB4,6184
118
+ torchx_nightly-2024.6.21.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
119
+ torchx_nightly-2024.6.21.dist-info/entry_points.txt,sha256=3JYZFlX9aWzR-Gs_qsx1zq7mlqbFz6Mi9rQUULW8caI,170
120
+ torchx_nightly-2024.6.21.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
121
+ torchx_nightly-2024.6.21.dist-info/RECORD,,