PyPI - xmanager-slurm - Versions diffs - 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl - Mend

xmanager-slurm 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xmanager-slurm might be problematic. Click here for more details.

Files changed (29) hide show

xm_slurm/__init__.py +2 -1
xm_slurm/batching.py +11 -11
xm_slurm/config.py +10 -10
xm_slurm/contrib/clusters/drac.py +15 -29
xm_slurm/dependencies.py +7 -7
xm_slurm/execution.py +10 -0
xm_slurm/executors.py +82 -12
xm_slurm/experimental/parameter_controller.py +18 -14
xm_slurm/job_blocks.py +3 -3
xm_slurm/packageables.py +23 -23
xm_slurm/packaging/registry.py +14 -14
xm_slurm/packaging/router.py +3 -3
xm_slurm/packaging/utils.py +5 -5
xm_slurm/resources.py +198 -28
xm_slurm/status.py +2 -2
xm_slurm/templates/slurm/fragments/monitor.bash.j2 +2 -0
xm_slurm/templates/slurm/job-array.bash.j2 +1 -1
xm_slurm/templates/slurm/job-group.bash.j2 +1 -0
xm_slurm/templates/slurm/job.bash.j2 +8 -1
xm_slurm/templates/slurm/library/retry.bash +62 -0
xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +8 -7
xm_slurm/templates/slurm/runtimes/podman.bash.j2 +4 -3
xm_slurm/utils.py +8 -0
{xmanager_slurm-0.4.13.dist-info → xmanager_slurm-0.4.15.dist-info}/METADATA +1 -1
xmanager_slurm-0.4.15.dist-info/RECORD +52 -0
xmanager_slurm-0.4.13.dist-info/RECORD +0 -51
{xmanager_slurm-0.4.13.dist-info → xmanager_slurm-0.4.15.dist-info}/WHEEL +0 -0
{xmanager_slurm-0.4.13.dist-info → xmanager_slurm-0.4.15.dist-info}/entry_points.txt +0 -0
{xmanager_slurm-0.4.13.dist-info → xmanager_slurm-0.4.15.dist-info}/licenses/LICENSE.md +0 -0

xm_slurm/packageables.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import importlib.resources as resources
 import pathlib
 import sys
-from typing import Literal, Mapping, Sequence
+import typing as tp
 from xmanager import xm
@@ -14,7 +14,7 @@ def docker_image(
     *,
     image: str,
     args: xm.UserArgs | None = None,
-    env_vars: Mapping[str, str] | None = None,
+    env_vars: tp.Mapping[str, str] | None = None,
 ) -> xm.Packageable:
     """Creates a packageable for a pre-built Docker image.
@@ -39,12 +39,12 @@ def docker_container(
     dockerfile: pathlib.Path | None = None,
     context: pathlib.Path | None = None,
     target: str | None = None,
-    ssh: Sequence[str] | Literal[True] | None = None,
-    build_args: Mapping[str, str] | None = None,
-    cache_from: str | Sequence[str] | None = None,
-    labels: Mapping[str, str] | None = None,
+    ssh: tp.Sequence[str] | tp.Literal[True] | None = None,
+    build_args: tp.Mapping[str, str] | None = None,
+    cache_from: str | tp.Sequence[str] | None = None,
+    labels: tp.Mapping[str, str] | None = None,
     args: xm.UserArgs | None = None,
-    env_vars: Mapping[str, str] | None = None,
+    env_vars: tp.Mapping[str, str] | None = None,
 ) -> xm.Packageable:
     """Creates a Docker container packageable from a dockerfile.
@@ -104,13 +104,13 @@ def python_container(
     context: pathlib.Path | None = None,
     requirements: pathlib.Path | None = None,
     base_image: str = "docker.io/python:{major}.{minor}-slim",
-    extra_system_packages: Sequence[str] = (),
-    extra_python_packages: Sequence[str] = (),
-    cache_from: str | Sequence[str] | None = None,
-    labels: Mapping[str, str] | None = None,
-    ssh: Sequence[str] | Literal[True] | None = None,
+    extra_system_packages: tp.Sequence[str] = (),
+    extra_python_packages: tp.Sequence[str] = (),
+    cache_from: str | tp.Sequence[str] | None = None,
+    labels: tp.Mapping[str, str] | None = None,
+    ssh: tp.Sequence[str] | tp.Literal[True] | None = None,
     args: xm.UserArgs | None = None,
-    env_vars: Mapping[str, str] | None = None,
+    env_vars: tp.Mapping[str, str] | None = None,
 ) -> xm.Packageable:
     """Creates a Python container from a base image using pip from a `requirements.txt` file.
@@ -181,11 +181,11 @@ def mamba_container(
     context: pathlib.Path | None = None,
     environment: pathlib.Path | None = None,
     base_image: str = "gcr.io/distroless/base-debian10",
-    cache_from: str | Sequence[str] | None = None,
-    labels: Mapping[str, str] | None = None,
-    ssh: Sequence[str] | Literal[True] | None = None,
+    cache_from: str | tp.Sequence[str] | None = None,
+    labels: tp.Mapping[str, str] | None = None,
+    ssh: tp.Sequence[str] | tp.Literal[True] | None = None,
     args: xm.UserArgs | None = None,
-    env_vars: Mapping[str, str] | None = None,
+    env_vars: tp.Mapping[str, str] | None = None,
 ) -> xm.Packageable:
     """Creates a Conda container from a base image using mamba from a `environment.yml` file.
@@ -249,13 +249,13 @@ def uv_container(
     entrypoint: xm.ModuleName | xm.CommandList,
     context: pathlib.Path | None = None,
     base_image: str = "docker.io/python:{major}.{minor}-slim-bookworm",
-    extra_system_packages: Sequence[str] = (),
-    extra_python_packages: Sequence[str] = (),
-    cache_from: str | Sequence[str] | None = None,
-    labels: Mapping[str, str] | None = None,
-    ssh: Sequence[str] | Literal[True] | None = None,
+    extra_system_packages: tp.Sequence[str] = (),
+    extra_python_packages: tp.Sequence[str] = (),
+    cache_from: str | tp.Sequence[str] | None = None,
+    labels: tp.Mapping[str, str] | None = None,
+    ssh: tp.Sequence[str] | tp.Literal[True] | None = None,
     args: xm.UserArgs | None = None,
-    env_vars: Mapping[str, str] | None = None,
+    env_vars: tp.Mapping[str, str] | None = None,
 ) -> xm.Packageable:
     """Creates a Python container from a base image using uv from a `uv.lock` file.

xm_slurm/packaging/registry.py CHANGED Viewed

@@ -1,31 +1,31 @@
 import dataclasses
-from typing import Callable, Generic, ParamSpec, Sequence, Type, TypeVar
+import typing as tp
 from xmanager import xm
-T_co = TypeVar("T_co", covariant=True)
-P = ParamSpec("P")
-ExecutableSpecT = TypeVar("ExecutableSpecT", bound=xm.ExecutableSpec)
+T_co = tp.TypeVar("T_co", covariant=True)
+P = tp.ParamSpec("P")
+ExecutableSpecT = tp.TypeVar("ExecutableSpecT", bound=xm.ExecutableSpec)
 @dataclasses.dataclass(frozen=True)
-class IndexedContainer(Generic[T_co]):
+class IndexedContainer(tp.Generic[T_co]):
     index: int
     value: T_co
-RegistrationCallable = Callable[
-    [Sequence[IndexedContainer[xm.Packageable]]],
-    Sequence[IndexedContainer[xm.Executable]],
+RegistrationCallable = tp.Callable[
+    [tp.Sequence[IndexedContainer[xm.Packageable]]],
+    tp.Sequence[IndexedContainer[xm.Executable]],
 ]
-_REGISTRY: dict[Type[xm.ExecutableSpec], RegistrationCallable] = {}
+_REGISTRY: dict[tp.Type[xm.ExecutableSpec], RegistrationCallable] = {}
 def register(
-    *typs: Type[ExecutableSpecT],
-) -> Callable[[RegistrationCallable], RegistrationCallable]:
+    *typs: tp.Type[ExecutableSpecT],
+) -> tp.Callable[[RegistrationCallable], RegistrationCallable]:
     def decorator(
         registration_callable: RegistrationCallable,
     ) -> RegistrationCallable:
@@ -38,8 +38,8 @@ def register(
 def route(
-    typ: Type[ExecutableSpecT],
-    packageables: Sequence[IndexedContainer[xm.Packageable]],
-) -> Sequence[IndexedContainer[xm.Executable]]:
+    typ: tp.Type[ExecutableSpecT],
+    packageables: tp.Sequence[IndexedContainer[xm.Packageable]],
+) -> tp.Sequence[IndexedContainer[xm.Executable]]:
     global _REGISTRY
     return _REGISTRY[typ](packageables)

xm_slurm/packaging/router.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import collections
 import logging
-from typing import Sequence, Type
+import typing as tp
 from xmanager import xm
@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
 def package(
-    packageables: Sequence[xm.Packageable],
+    packageables: tp.Sequence[xm.Packageable],
 ) -> list[xm.Executable]:
     """
     Takes as input a list of packageables and returns a mapping of
@@ -23,7 +23,7 @@ def package(
     # Docker targets to be collected.
     # These are a mapping from `DockerTarget` to the latest digest of the image.
     targets_by_type = collections.defaultdict[
-        Type[xm.ExecutableSpec], list[IndexedContainer[xm.Packageable]]
+        tp.Type[xm.ExecutableSpec], list[IndexedContainer[xm.Packageable]]
     ](list)
     # Collect dockerfiles that need to be built locally

xm_slurm/packaging/utils.py CHANGED Viewed

@@ -1,20 +1,20 @@
 import collections
 import logging
-from typing import ParamSpec, Sequence, TypeVar
+import typing as tp
 from xmanager import xm
 from xm_slurm.packaging.registry import IndexedContainer
-T = TypeVar("T")
-P = ParamSpec("P")
-ReturnT = TypeVar("ReturnT")
+T = tp.TypeVar("T")
+P = tp.ParamSpec("P")
+ReturnT = tp.TypeVar("ReturnT")
 logger = logging.getLogger(__name__)
 def collect_executors_by_executable(
-    targets: Sequence[IndexedContainer[xm.Packageable]],
+    targets: tp.Sequence[IndexedContainer[xm.Packageable]],
 ) -> dict[xm.ExecutableSpec, set[xm.ExecutorSpec]]:
     executors_by_executable = collections.defaultdict(set)
     for target in targets:

xm_slurm/resources.py CHANGED Viewed

@@ -1,9 +1,13 @@
+import builtins
+import collections.abc
+import datetime as dt
 import enum
 import itertools
 import math
-from typing import Mapping
+import re
+import typing as tp
-from xm_slurm import config
+from xm_slurm import config, utils
 class ResourceType(enum.IntEnum):
@@ -57,32 +61,123 @@ assert AcceleratorType | {
 } == set(ResourceType.__members__.values()), "Resource types are not exhaustive."
-ResourceQuantity = int | float
 class FeatureType(enum.IntEnum):
     NVIDIA_MIG = 1
     NVIDIA_NVLINK = 2
+class InvalidTopologyError(Exception):
+    """An unrecognized topology has been provided."""
+TOPOLOGY_REGEX = re.compile(r"^(?P<dims>[\d]+(?:x[\d]+)*)$")
+class Topology:
+    mesh: str
+    dimensions: list[int]
+    switches: int | None
+    switches_grace_period: dt.timedelta | None
+    def __init__(
+        self,
+        mesh: str,
+        /,
+        *,
+        switches: int | None = None,
+        switches_grace_period: dt.timedelta | None = None,
+    ):
+        mesh_match = TOPOLOGY_REGEX.fullmatch(mesh)
+        if not mesh_match:
+            raise InvalidTopologyError(f"Invalid topology mesh: {mesh!r}.")
+        self.mesh = mesh
+        self.dimensions = list(map(int, mesh_match.group("dims").split("x")))
+        if switches is not None:
+            assert (
+                isinstance(switches, int) and switches > 0
+            ), "Switches must be a positive integer."
+        self.switches = switches
+        if switches_grace_period is not None:
+            assert isinstance(
+                switches_grace_period, dt.timedelta
+            ), "Switches grace period must be a `datetime.timedelta`."
+        self.switches_grace_period = switches_grace_period
+    @property
+    def chip_count(self) -> int:
+        return math.prod(self.dimensions)
+    @property
+    def ndim(self) -> int:
+        return len(self.dimensions)
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Topology):
+            return False
+        return (
+            self.mesh == other.mesh
+            and self.switches == other.switches
+            and self.switches_grace_period == other.switches_grace_period
+        )
+    def __hash__(self) -> int:
+        return hash((self.mesh, self.switches, self.switches_grace_period))
+ResourceQuantity = int | float | Topology
+def _parse_resource_quantity(
+    resource_name: ResourceType | str, value: ResourceQuantity
+) -> tuple[float, Topology | None]:
+    if isinstance(resource_name, ResourceType):
+        resource_name = resource_name.name
+    match value:
+        case Topology() as topology:
+            return topology.chip_count, topology
+        case builtins.str() as topology_str if (
+            "x" in topology_str and TOPOLOGY_REGEX.fullmatch(topology_str) is not None
+        ):
+            topology = Topology(topology_str)
+            return topology.chip_count, topology
+        case builtins.str() as num_str:
+            try:
+                value = float(num_str)
+                return int(value) if value.is_integer() else value, None
+            except ValueError as e:
+                raise ValueError(
+                    f"Couldn't parse resource quantity for {resource_name}. "
+                    f"{num_str!r} was given."
+                ) from e
+        case int() | float():
+            return value, None
+        case _:
+            raise ValueError(f"Invalid resource quantity: {value!r} for {resource_name!r}.")
 class JobRequirements:
     replicas: int
     location: str | None
     accelerator: ResourceType | None
-    cluster: config.SlurmClusterConfig | None = None
+    topology: Topology | None
+    cluster: config.SlurmClusterConfig
     def __init__(
         self,
         *,
-        resources: Mapping[ResourceType | str, ResourceQuantity] | None = None,
-        replicas: int = 1,
-        location: str | None = None,
-        cluster: config.SlurmClusterConfig | None = None,
+        resources: tp.Mapping[ResourceType | str, ResourceQuantity] | None = None,
+        replicas: int | None = None,
+        location: str | tp.Iterable[str] | None = None,
+        cluster: config.SlurmClusterConfig,
         **kw_resources: ResourceQuantity,
     ):
-        self.replicas = replicas or 1
+        if isinstance(location, collections.abc.Iterable) and not isinstance(location, str):
+            location = ",".join(location)
         self.location = location
         self.accelerator = None
+        self.topology = None
         self.cluster = cluster
         if resources is None:
@@ -90,6 +185,7 @@ class JobRequirements:
         self.task_requirements: dict[ResourceType | str, ResourceQuantity] = {}
         for resource_name, value in itertools.chain(resources.items(), kw_resources.items()):
+            quantity, topology = _parse_resource_quantity(resource_name, value)
             match resource_name:
                 case str() if resource_name.upper() in ResourceType.__members__:
                     resource = ResourceType[resource_name.upper()]
@@ -106,58 +202,132 @@ class JobRequirements:
                 if self.accelerator is not None:
                     raise ValueError("Accelerator already set.")
                 self.accelerator = resource  # type: ignore
+                self.topology = topology or Topology(f"{quantity:g}")
+            elif topology is not None:
+                raise ValueError(
+                    f"A topology was specified for a non-accelerator resource: {resource_name!r}."
+                )
             if resource in self.task_requirements:
                 raise ValueError(f"{resource} has been specified twice.")
-            self.task_requirements[resource] = value
+            self.task_requirements[resource] = quantity
+        if self.topology is not None and self.topology.ndim > 2:
+            raise ValueError("Topologies with more than 2 dimensions are not supported.")
+        if (
+            self.accelerator is not None
+            and self.topology is not None
+            and len(self.topology.dimensions) == 2
+        ):
+            if replicas is not None and replicas != self.topology.dimensions[1]:
+                raise ValueError(
+                    f"For multihost GPUs with topology {self.topology}, replicas should"
+                    f"be either None or {self.topology.dimensions[1]}. Found: "
+                    f"{replicas}"
+                )
+            replicas = self.topology.dimensions[1]
+        if replicas is not None and replicas <= 0:
+            raise ValueError(f"Replicas must be a positive integer, got {replicas!r}")
+        self.replicas = replicas or 1
     def to_directives(self) -> list[str]:
-        if self.cluster is None:
-            raise ValueError("Cannnot derive Slurm directives for requirements without a cluster.")
         directives = []
         for resource, value in self.task_requirements.items():
             match resource:
                 case ResourceType.EPHEMERAL_STORAGE | ResourceType.DISK:
-                    assert isinstance(value, int), "Disk space must be an integer"
+                    assert isinstance(
+                        value, int
+                    ), f"Disk space must be an integer, got {type(value)!r}"
                     directives.append(f"--tmp={math.ceil(value / 2**20)}M")
                 case ResourceType.MEMORY | ResourceType.RAM:
                     num_cpus = self.task_requirements.get(ResourceType.CPU, 1)
-                    assert isinstance(value, (int, float)), "Memory must be an integer or float"
-                    assert isinstance(num_cpus, int), "CPU must be an integer"
+                    assert isinstance(
+                        value, (int, float)
+                    ), f"Memory must be an integer or float, got {type(value)!r}"
+                    assert isinstance(
+                        num_cpus, int
+                    ), f"CPU must be an integer, got {type(num_cpus)!r}"
                     directives.append(f"--mem-per-cpu={math.ceil(value / num_cpus / 2**20)}M")
                 case ResourceType.CPU:
-                    assert isinstance(value, int), "CPU must be an integer"
+                    assert isinstance(value, int), f"CPU must be an integer, got {type(value)!r}"
                     directives.append(f"--cpus-per-task={value}")
                 case ResourceType.GPU:
-                    assert isinstance(value, int), "GPU must be an integer"
-                    directives.append(f"--gpus-per-task={value}")
+                    assert isinstance(value, int), f"GPU must be an integer, got {type(value)!r}"
+                    directives.append(f"--gpus={value}")
                 case ResourceType() if resource in AcceleratorType:
-                    assert isinstance(value, int), "Accelerator must be an integer"
+                    assert isinstance(
+                        value, int
+                    ), f"Accelerator must be an integer, got {type(value)!r}"
                     resource_type = self.cluster.resources.get(resource, None)
                     if resource_type is None:
                         raise ValueError(
                             f"Cluster {self.cluster.name} does not map resource type {resource!r}."
                         )
-                    directives.append(f"--gpus-per-task={resource_type}:{value}")
+                    directives.append(f"--gpus={resource_type}:{value}")
                 case str():
                     directives.append(f"--gres={resource}:{value}")
-        directives.append(f"--ntasks={self.replicas}")
         if self.location:
+            assert isinstance(
+                self.location, str
+            ), f"Location must be a string, got {type(self.location)!r}"
             directives.append(f"--nodelist={self.location}")
+        assert (
+            isinstance(self.replicas, int) and self.replicas > 0
+        ), f"Replicas must be a positive integer, got {self.replicas!r}"
+        directives.append(f"--ntasks={self.replicas}")
+        if self.topology is not None:
+            assert self.accelerator is not None, "Accelerator must be set."
+            match self.accelerator:
+                case ResourceType.GPU:
+                    directives.append(f"--gpus-per-task={self.topology.dimensions[0]}")
+                case ResourceType() if self.accelerator in AcceleratorType:
+                    resource_type = self.cluster.resources[self.accelerator]
+                    directives.append(
+                        f"--gpus-per-task={resource_type}:{self.topology.dimensions[0]}"
+                    )
+            if self.topology.switches is not None:
+                switches_timeout = (
+                    f"@{utils.timestr_from_timedelta(self.topology.switches_grace_period)}"
+                    if self.topology.switches_grace_period is not None
+                    else ""
+                )
+                directives.append(f"--switches={self.topology.switches}{switches_timeout}")
         return directives
     def replace(
         self,
-        cluster: config.SlurmClusterConfig | None,
+        replicas: int | None = None,
+        location: str | None = None,
+        cluster: config.SlurmClusterConfig | None = None,
         **kw_resources: ResourceQuantity,
     ) -> "JobRequirements":
+        # Merge kw_resources into existing task_requirements, removing conflicting enum keys
+        merged_resources = dict(self.task_requirements)
+        # Remove ResourceType keys that will be overridden by string keys in kw_resources
+        for key in list(merged_resources.keys()):
+            if isinstance(key, ResourceType) and any(
+                ResourceType[name.upper()] == key
+                for name in kw_resources
+                if name.upper() in ResourceType.__members__
+            ):
+                del merged_resources[key]
+        merged_resources.update(kw_resources)  # type: ignore
         return JobRequirements(
-            resources=self.task_requirements | kw_resources,  # type: ignore
-            replicas=self.replicas,
-            cluster=cluster or self.cluster,
+            resources=merged_resources,
+            replicas=replicas if replicas is not None else self.replicas,
+            location=location if location is not None else self.location,
+            cluster=cluster if cluster is not None else self.cluster,
         )
     def __repr__(self) -> str:
@@ -169,7 +339,7 @@ class JobRequirements:
             args.append(f"{resource.lower()}={value!r}")
         if self.replicas != 1:
-            args.append(f"replicas={self.replicas}")
+            args.append(f"replicas={self.replicas!r}")
         if self.cluster is not None:
             args.append(f"cluster={self.cluster!r}")

xm_slurm/status.py CHANGED Viewed

@@ -2,7 +2,7 @@
 import enum
 import re
-from typing import Sequence
+import typing as tp
 from xmanager import xm
@@ -151,7 +151,7 @@ class SlurmWorkUnitStatus(xm.ExperimentUnitStatus):
     """Status of a Slurm experiment job."""
     @classmethod
-    def aggregate(cls, states: Sequence[SlurmJobState]) -> "SlurmWorkUnitStatus":
+    def aggregate(cls, states: tp.Sequence[SlurmJobState]) -> "SlurmWorkUnitStatus":
         """Aggregate a sequence of statuses into a single status."""
         assert len(states) > 0, "Cannot aggregate empty sequence of statuses."
         max_error_state: SlurmJobState | None = None

xm_slurm/templates/slurm/fragments/monitor.bash.j2 CHANGED Viewed

@@ -25,6 +25,7 @@ __xm_slurm_wait_for_children() {
   while [ ${#children[@]} -gt 0 ]; do
 {% endraw %}
     echo "INFO: Waiting for child processes to finish..."
+    set +e
 {% if requeue_on_timeout %}
     # Wait on either one of the child processes or the timeout process.
     wait -n -p child_pid "${children[@]}" "${timeout_pid}"
@@ -32,6 +33,7 @@ __xm_slurm_wait_for_children() {
     wait -n -p child_pid "${children[@]}"
 {% endif %}
     local child_exit_code=$?
+    set -e
 {% if requeue_on_timeout %}
     # If the finished process is the watchdog, trigger the timeout handling.

xm_slurm/templates/slurm/job-array.bash.j2 CHANGED Viewed

@@ -7,9 +7,9 @@
 {% block bootstrap %}
 srun \
+  --label \
   --unbuffered \
   --kill-on-bad-exit=0 \
-  --overlap \
   --export="ALL" \
   bash <<'SRUN_EOF' &
 set -Eeuxo pipefail

xm_slurm/templates/slurm/job-group.bash.j2 CHANGED Viewed

@@ -29,6 +29,7 @@
 {% block bootstrap %}
 {% for job in job_group.jobs.values() +%}
 srun \
+  --label \
   --unbuffered \
   --kill-on-bad-exit=0 \
   --export="ALL" \

xm_slurm/templates/slurm/job.bash.j2 CHANGED Viewed

@@ -27,6 +27,13 @@
 {% endblock directives %}
 set -Eeuxo pipefail
+{% if stdlib %}
+# --- Helper functions ---
+{% for fn in stdlib %}
+{{ fn }}
+{% endfor %}
+{% endif %}
 {% block prolog %}
 {% if cluster.prolog %}
 {{- cluster.prolog -}}
@@ -52,9 +59,9 @@ export {{ key }}="{{ value }}"
 {% block bootstrap %}
 srun \
+  --label \
   --unbuffered \
   --kill-on-bad-exit=0 \
-  --overlap \
   --export="ALL" \
   bash <<'SRUN_EOF' &
 set -Eeuxo pipefail

xm_slurm/templates/slurm/library/retry.bash ADDED Viewed

@@ -0,0 +1,62 @@
+# retry: rerun a command if it exits with certain codes
+# Options:
+#   -c CODE   Retry on this exit code (repeatable).
+#   -n N      Max attempts (incl. first). Default: unlimited
+#   -d SECS   Initial delay before first retry. Default: 1
+#   -b FACTOR Integer backoff multiplier per retry. Default: 1 (no backoff)
+#   -q        Quiet (no logs)
+# Usage:
+#   retry [-c CODE ...] [-n N] [-d SECS] [-b FACTOR] [-q] -- cmd arg1 arg2 ...
+retry() {
+  local -a codes=()
+  local -i max=-1 delay=1 backoff=1 quiet=0 status
+  local opt OPTIND=1
+  while getopts ":c:n:d:b:q" opt; do
+    case "$opt" in
+      c) codes+=("$OPTARG") ;;
+      n) max=$OPTARG ;;
+      d) delay=$OPTARG ;;
+      b) backoff=$OPTARG ;;
+      q) quiet=1 ;;
+      :) printf 'retry: option -%s requires an argument\n' "$OPTARG" >&2; return 2 ;;
+      \?) printf 'retry: invalid option -- %s\n' "$OPTARG" >&2; return 2 ;;
+    esac
+  done
+  shift $((OPTIND-1))
+  (( $# )) || { printf 'retry: missing command\n' >&2; return 2; }
+  ((${#codes[@]})) || { printf 'retry: no return codes specified\n' >&2; return 2; }
+  for ((attempt=1; ; attempt++)); do
+    if "$@"; then                    # safe with set -e (exception context)
+      return 0
+    else
+      status=$?                       # capture failing status immediately
+    fi
+    # retryable?
+    local retryable=0 c
+    for c in "${codes[@]}"; do
+      (( status == c )) && { retryable=1; break; }
+    done
+    # stop if not retryable OR we've just hit the max attempt
+    if (( !retryable )) || (( max >= 0 && attempt >= max )); then
+      (( quiet )) || {
+        if (( attempt > 1 )); then
+          printf 'retry: giving up after %d attempts; last exit=%d\n' "$attempt" "$status" >&2
+        else
+          printf 'retry: command failed; exit=%d\n' "$status" >&2
+        fi
+      }
+      return "$status"               # propagate exact code; errexit will catch
+    fi
+    (( quiet )) || printf 'retry: attempt %d failed with %d; retrying in %ds...\n' \
+                          "$attempt" "$status" "$delay" >&2
+    sleep "$delay" || :              # never trip set -e if sleep errors
+    (( delay *= backoff ))
+  done
+}
+export -f retry

xmanager-slurm 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl

Potentially problematic release.

xmanager-slurm 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl