PyPI - torchx-nightly - Versions diffs - 2024.6.18__py3-none-any.whl → 2024.6.21__py3-none-any.whl - Mend

torchx-nightly 2024.6.18__py3-none-any.whl → 2024.6.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchx-nightly might be problematic. Click here for more details.

Files changed (9) hide show

torchx/schedulers/devices.py CHANGED Viewed

@@ -7,25 +7,36 @@
 # pyre-strict
 import warnings
+from functools import partial
 from typing import Callable, Dict, List, Mapping
 from torchx.specs.api import DeviceMount
+from torchx.specs.named_resources_aws import EFA_DEVICE, NEURON_DEVICE
-def efa_to_devicemounts(num_devices: int) -> List[DeviceMount]:
+def to_devicemounts(num_devices: int, device_type: str) -> List[DeviceMount]:
     device_mounts = []
     for device_index in range(0, num_devices):
         device_mounts.append(
             DeviceMount(
-                src_path="/dev/infiniband/uverbs" + str(device_index),
-                dst_path="/dev/infiniband/uverbs" + str(device_index),
+                src_path=device_type + str(device_index),
+                dst_path=device_type + str(device_index),
             )
         )
     return device_mounts
+neuron_to_devicemounts: Callable[[int], List[DeviceMount]] = partial(
+    to_devicemounts, device_type="/dev/neuron"
+)
+efa_to_devicemounts: Callable[[int], List[DeviceMount]] = partial(
+    to_devicemounts, device_type="/dev/infiniband/uverbs"
+)
 DEVICES: Mapping[str, Callable[[int], List[DeviceMount]]] = {
-    "vpc.amazonaws.com/efa": efa_to_devicemounts,
+    EFA_DEVICE: efa_to_devicemounts,
+    NEURON_DEVICE: neuron_to_devicemounts,
 }

torchx/specs/api.py CHANGED Viewed

@@ -237,11 +237,15 @@ class RetryPolicy(str, Enum):
                 application to deal with failed replica departures and
                 replacement replica admittance.
     2. APPLICATION: Restarts the entire application.
+    3. HOT_SPARE: Restarts the replicas for a role as long as quorum (min_replicas)
+                is not violated using extra hosts as spares. It does not really support
+                elasticity and just uses the delta between num_replicas and min_replicas
+                as spares (EXPERIMENTAL).
     """
     REPLICA = "REPLICA"
     APPLICATION = "APPLICATION"
+    HOT_SPARE = "HOT_SPARE"
 class MountType(str, Enum):
@@ -340,6 +344,8 @@ class Role:
                 and num_replicas depending on the cluster resources and
                 policies. If the scheduler doesn't support auto scaling this
                 field is ignored and the job size will be num_replicas.
+                EXPERIMENTAL: For HOT_SPARE restart policy this field is used to
+                indicate the quorum required for the job to run.
             max_retries: max number of retries before giving up
             retry_policy: retry behavior upon replica failures
             resource: Resource requirement for the role. The role should be scheduled

torchx/specs/named_resources_aws.py CHANGED Viewed

@@ -37,6 +37,7 @@ from typing import Callable, Mapping
 from torchx.specs.api import Resource
 EFA_DEVICE = "vpc.amazonaws.com/efa"
+NEURON_DEVICE = "aws.amazon.com/neurondevice"
 # ecs and ec2 have memtax and currently AWS Batch uses hard memory limits
 # so we have to account for mem tax when registering these resources for AWS
@@ -255,7 +256,11 @@ def aws_g5_48xlarge() -> Resource:
 def aws_trn1_2xlarge() -> Resource:
     return Resource(
-        cpu=8, gpu=0, memMB=32 * GiB, capabilities={K8S_ITYPE: "trn1.2xlarge"}
+        cpu=8,
+        gpu=0,
+        memMB=32 * GiB,
+        capabilities={K8S_ITYPE: "trn1.2xlarge"},
+        devices={NEURON_DEVICE: 1},
     )
@@ -265,7 +270,7 @@ def aws_trn1_32xlarge() -> Resource:
         gpu=0,
         memMB=512 * GiB,
         capabilities={K8S_ITYPE: "trn1.32xlarge"},
-        devices={EFA_DEVICE: 8},
+        devices={EFA_DEVICE: 8, NEURON_DEVICE: 16},
     )

{torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: torchx-nightly
-Version: 2024.6.18
+Version: 2024.6.21
 Summary: TorchX SDK and Components
 Home-page: https://github.com/pytorch/torchx
 Author: TorchX Devs

{torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/RECORD RENAMED Viewed

@@ -68,7 +68,7 @@ torchx/schedulers/__init__.py,sha256=M9SBZiNdE3KI_yc1-BiRtAetfTgtX07uKkuvGUeZQLU
 torchx/schedulers/api.py,sha256=s2hI87uAWtU2SHMNBKjAqelzQU_GKp_BjcxdtjVVDDk,14155
 torchx/schedulers/aws_batch_scheduler.py,sha256=7qxy3UFRq0F731-kTjEi6VABWKD60o0req6CBMsTohU,27975
 torchx/schedulers/aws_sagemaker_scheduler.py,sha256=dPah3yaKFUVm-ZZrzFbyM_abP-LCTd-AcAjZ6t2iycU,20699
-torchx/schedulers/devices.py,sha256=BnjZnbXGTWiZKLmMKE3zPDOkb4Vao3jgPVa01aV6vyY,1367
+torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
 torchx/schedulers/docker_scheduler.py,sha256=IrDlmeH-tg_f3krA04Y81nK9dmuYfEPbYOuCjSQkIHA,16541
 torchx/schedulers/gcp_batch_scheduler.py,sha256=dlUfvjfMuQiRcSXQAdwxqdadwPhOf82L5u-ejRWtFgE,16226
 torchx/schedulers/ids.py,sha256=3E-_vwVYC-8Tv8kjuY9-W7TbOe_-Laqd8a65uIN3hQY,1798
@@ -83,11 +83,11 @@ torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80Rx
 torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
 torchx/schedulers/ray/ray_driver.py,sha256=Wl-1jldL8veVKzmYDEeR2va3JSlAjZpFE1h8HWE9YVE,12286
 torchx/specs/__init__.py,sha256=vF-WUu_4NZP30lCtNYg0YVenY6wRQ8k7K36fOxqbOKc,5477
-torchx/specs/api.py,sha256=1qmajrQcTKhVWDdsuWc57y2s0UYO1tD0p7XYg20cm5Q,35609
+torchx/specs/api.py,sha256=Y5uT7a-qZ4pP4kSfS6yYP1jUzQWLPI4qLQeyuBv5uDQ,36085
 torchx/specs/builders.py,sha256=QDcQrnCO4bdSaiP0216XbCgTsnLutO_1_FW5jDiEIWI,9939
 torchx/specs/file_linter.py,sha256=IeiomB1BgHUlT-ZsvGxar3llY63NOupfLBrOrD_---A,11860
 torchx/specs/finder.py,sha256=MnwxG_UC4a-3X2wQ37ANEQR6D1TvriCLyuVYBh_-wuI,16249
-torchx/specs/named_resources_aws.py,sha256=gmMbzzaYy3WzwDFo5iyd28icX7zRbUOseLl8yudY13Q,7988
+torchx/specs/named_resources_aws.py,sha256=NDzF9srT7hiS5NGwEJc_sbuwxXMtq8l3rVG0QnVqpJE,8114
 torchx/specs/named_resources_generic.py,sha256=Sg4tAdqiiWDrDz2Lj_pnfsjzGIXKTou73wPseh6j55w,2646
 torchx/specs/test/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
 torchx/specs/test/components/a/__init__.py,sha256=kdxEgnI8QBSBiuTjaB4qDD7JX84hWowyPWU4B2Cqe9A,561
@@ -113,9 +113,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
 torchx/workspace/api.py,sha256=1heBmPgB-W5Zf9gwViM7NrqvHpZlVYeMN7jpY8Qkytc,5479
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2024.6.18.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
-torchx_nightly-2024.6.18.dist-info/METADATA,sha256=oypt_fwq9wzJbR8VbKz8RI-wd47bO0PdDC9AQSaS0kA,6184
-torchx_nightly-2024.6.18.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-torchx_nightly-2024.6.18.dist-info/entry_points.txt,sha256=3JYZFlX9aWzR-Gs_qsx1zq7mlqbFz6Mi9rQUULW8caI,170
-torchx_nightly-2024.6.18.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
-torchx_nightly-2024.6.18.dist-info/RECORD,,
+torchx_nightly-2024.6.21.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2024.6.21.dist-info/METADATA,sha256=nz1Jh0khbL27eBONb-oD_ir0Ju4qZXg9mlCt858zBB4,6184
+torchx_nightly-2024.6.21.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+torchx_nightly-2024.6.21.dist-info/entry_points.txt,sha256=3JYZFlX9aWzR-Gs_qsx1zq7mlqbFz6Mi9rQUULW8caI,170
+torchx_nightly-2024.6.21.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2024.6.21.dist-info/RECORD,,

{torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/LICENSE RENAMED Viewed

File without changes

{torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/WHEEL RENAMED Viewed

File without changes

{torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{torchx_nightly-2024.6.18.dist-info → torchx_nightly-2024.6.21.dist-info}/top_level.txt RENAMED Viewed

File without changes

torchx-nightly 2024.6.18__py3-none-any.whl → 2024.6.21__py3-none-any.whl

Potentially problematic release.

torchx-nightly 2024.6.18__py3-none-any.whl → 2024.6.21__py3-none-any.whl