PyPI - torchx-nightly - Versions diffs - 2025.7.29__py3-none-any.whl → 2025.7.31__py3-none-any.whl - Mend

torchx-nightly 2025.7.29__py3-none-any.whl → 2025.7.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchx-nightly might be problematic. Click here for more details.

Files changed (7) hide show

torchx/schedulers/slurm_scheduler.py CHANGED Viewed

@@ -18,6 +18,7 @@ import os.path
 import shlex
 import subprocess
 import tempfile
+import warnings
 from dataclasses import dataclass
 from datetime import datetime
 from subprocess import CalledProcessError, PIPE
@@ -72,6 +73,55 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
     return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
+def version() -> Tuple[int, int]:
+    """
+    Detects the slurm version by running ``sinfo --version``.
+    If the command cannot be run (``sinfo`` is not found or exits non-zero), a
+    ``RuntimeWarning`` is emitted and the version is assumed to be
+    ``slurm 24.05.8``.
+    Returns:
+        Tuple[int, int] slurm version as a tuple of ints (major, minor).
+    Raises:
+        ValueError: if the output is not of the form ``slurm <major>.<minor>[...]``.
+    """
+    cmd = ["sinfo", "--version"]
+    try:
+        out = subprocess.check_output(cmd, stderr=PIPE, encoding="utf-8")
+    except (CalledProcessError, FileNotFoundError):
+        out = "slurm 24.05.8"
+        # Interpolate the command so the warning names what actually failed
+        # instead of printing a literal ``{sinfo_cmd}`` placeholder.
+        sinfo_cmd = " ".join(cmd)
+        warnings.warn(
+            f"Error running: `{sinfo_cmd}` to get SLURM version. Are you running outside the "
+            "cluster's login or head node? This typically happens when running in `--dryrun`"
+            " mode. Assuming version is `slurm 24.05.8`.",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+    # sinfo --version returns in the form "slurm 24.1.0"
+    # Splitting on any whitespace tolerates a trailing newline or extra tokens.
+    tokens = out.split()
+    if len(tokens) < 2:
+        raise ValueError(f"unexpected `sinfo --version` output: {out!r}")
+    # Only major and minor are returned, so only those two components are
+    # converted; a non-numeric patch suffix no longer breaks parsing.
+    major, minor = [int(v) for v in tokens[1].split(".")[:2]]
+    return (major, minor)
+def _should_use_gpus_per_node_from_version() -> bool:
+    """
+    Decides whether ``gpus-per-node`` should be used, based on the slurm
+    version reported by ``version()``.
+    Change Reference: https://fburl.com/sqwqzxn6
+    > select/linear - Reject jobs asking for GRES per job|socket|task or cpus|mem per GRES.
+    Returns:
+        ``True`` in slurm ``version>=24.11.0``, ``False`` otherwise.
+    """
+    min_major, min_minor = 24, 11
+    major, minor = version()
+    if major != min_major:
+        # A differing major version settles the comparison on its own.
+        return major > min_major
+    # Same major version: the minor version decides.
+    return minor >= min_minor
 SBATCH_JOB_OPTIONS = {
     "comment",
     "mail-user",
@@ -81,6 +131,7 @@ SBATCH_GROUP_OPTIONS = {
     "partition",
     "time",
     "constraint",
+    "qos",
 }
 log: logging.Logger = logging.getLogger(__name__)
@@ -106,6 +157,7 @@ SlurmOpts = TypedDict(
         "mail-user": Optional[str],
         "mail-type": Optional[str],
         "job_dir": Optional[str],
+        "qos": Optional[str],
     },
     total=False,
 )
@@ -126,7 +178,11 @@ class SlurmReplicaRequest:
     @classmethod
     def from_role(
-        cls, name: str, role: Role, cfg: SlurmOpts, nomem: bool
+        cls,
+        name: str,
+        role: Role,
+        cfg: SlurmOpts,
+        nomem: bool,
     ) -> "SlurmReplicaRequest":
         """
         ``from_role`` creates a SlurmReplicaRequest for the specific role and
@@ -149,7 +205,11 @@ class SlurmReplicaRequest:
             if not nomem and resource.memMB > 0:
                 sbatch_opts.setdefault("mem", str(resource.memMB))
             if resource.gpu > 0:
-                sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
+                # Use smart GPU allocation based on automatically detected Slurm version
+                if _should_use_gpus_per_node_from_version():
+                    sbatch_opts.setdefault("gpus-per-node", str(resource.gpu))
+                else:
+                    sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
         srun_opts = {
             "output": f"slurm-{macros.app_id}-{name}.out",
@@ -378,6 +438,11 @@ class SlurmScheduler(
             iteration, jobs will be tracked in ``.torchxslurmjobdirs``.
             """,
         )
+        opts.add(
+            "qos",
+            type_=str,
+            help="Quality of Service (QoS) to assign to the job.",
+        )
         return opts
     def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:

{torchx_nightly-2025.7.29.dist-info → torchx_nightly-2025.7.31.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: torchx-nightly
-Version: 2025.7.29
+Version: 2025.7.31
 Summary: TorchX SDK and Components
 Home-page: https://github.com/pytorch/torchx
 Author: TorchX Devs

{torchx_nightly-2025.7.29.dist-info → torchx_nightly-2025.7.31.dist-info}/RECORD RENAMED Viewed

@@ -77,7 +77,7 @@ torchx/schedulers/kubernetes_scheduler.py,sha256=0_loGJ7WnxEr9dhgFt3Gw-7nVLirMDV
 torchx/schedulers/local_scheduler.py,sha256=lOtVtmMIhytdju1Dlc3p99VALMY3qYRDPqjxdyTAbQQ,41877
 torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
 torchx/schedulers/ray_scheduler.py,sha256=T-jsGSOa8O-h1kTUU7Q7Fk1RILL1Yzvuos_WFSQF8Fo,15795
-torchx/schedulers/slurm_scheduler.py,sha256=zM_9XYVm7sQ8NGN-N26D-2YIfE83JS3mvpPb40CDKcA,26411
+torchx/schedulers/slurm_scheduler.py,sha256=GlHGZBIklIyvBxxzw2CtFvCBmLQGCy_o8kf5lf411Ng,28592
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
 torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
 torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
@@ -115,9 +115,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
 torchx/workspace/api.py,sha256=PtDkGTC5lX03pRoYpuMz2KCmM1ZOycRP1UknqvNb97Y,6341
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2025.7.29.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
-torchx_nightly-2025.7.29.dist-info/METADATA,sha256=RDIAT6e-_Gu037E5bWrN68JJB4u4BEKriSNJFGq4GrU,6104
-torchx_nightly-2025.7.29.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-torchx_nightly-2025.7.29.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
-torchx_nightly-2025.7.29.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
-torchx_nightly-2025.7.29.dist-info/RECORD,,
+torchx_nightly-2025.7.31.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2025.7.31.dist-info/METADATA,sha256=wzQ4vn137b-jWuJM7WNlldxfchoZZAwmAbBYttz4anU,6104
+torchx_nightly-2025.7.31.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+torchx_nightly-2025.7.31.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2025.7.31.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2025.7.31.dist-info/RECORD,,

{torchx_nightly-2025.7.29.dist-info → torchx_nightly-2025.7.31.dist-info}/LICENSE RENAMED Viewed

File without changes

{torchx_nightly-2025.7.29.dist-info → torchx_nightly-2025.7.31.dist-info}/WHEEL RENAMED Viewed

File without changes

{torchx_nightly-2025.7.29.dist-info → torchx_nightly-2025.7.31.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{torchx_nightly-2025.7.29.dist-info → torchx_nightly-2025.7.31.dist-info}/top_level.txt RENAMED Viewed

File without changes

torchx-nightly 2025.7.29__py3-none-any.whl → 2025.7.31__py3-none-any.whl

Potentially problematic release.

torchx-nightly 2025.7.29__py3-none-any.whl → 2025.7.31__py3-none-any.whl