torchx-nightly 2025.7.29__py3-none-any.whl → 2025.7.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchx-nightly might be problematic. Click here for more details.
- torchx/schedulers/slurm_scheduler.py +67 -2
- {torchx_nightly-2025.7.29.dist-info → torchx_nightly-2025.7.31.dist-info}/METADATA +1 -1
- {torchx_nightly-2025.7.29.dist-info → torchx_nightly-2025.7.31.dist-info}/RECORD +7 -7
- {torchx_nightly-2025.7.29.dist-info → torchx_nightly-2025.7.31.dist-info}/LICENSE +0 -0
- {torchx_nightly-2025.7.29.dist-info → torchx_nightly-2025.7.31.dist-info}/WHEEL +0 -0
- {torchx_nightly-2025.7.29.dist-info → torchx_nightly-2025.7.31.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.7.29.dist-info → torchx_nightly-2025.7.31.dist-info}/top_level.txt +0 -0
|
@@ -18,6 +18,7 @@ import os.path
|
|
|
18
18
|
import shlex
|
|
19
19
|
import subprocess
|
|
20
20
|
import tempfile
|
|
21
|
+
import warnings
|
|
21
22
|
from dataclasses import dataclass
|
|
22
23
|
from datetime import datetime
|
|
23
24
|
from subprocess import CalledProcessError, PIPE
|
|
@@ -72,6 +73,55 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
|
|
|
72
73
|
return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
|
|
73
74
|
|
|
74
75
|
|
|
76
|
+
def version() -> Tuple[int, int]:
    """
    Uses ``sinfo --version`` to get the slurm version. If the command fails
    (``sinfo`` is not installed or exits with a non-zero status), a
    ``RuntimeWarning`` is emitted and the version is assumed to be
    ``slurm 24.05.8``.

    Returns:
    -------
        Tuple[int, int] slurm version as a tuple of ints (major, minor).

    Raises:
    ------
        ValueError if the command output does not look like
        ``<name> <major>.<minor>[.<patch>]``.
    """

    cmd = ["sinfo", "--version"]
    # Human-readable form of the command, used in warning/error messages.
    sinfo_cmd = " ".join(cmd)
    try:
        out = subprocess.check_output(cmd, stderr=PIPE, encoding="utf-8")
    except (CalledProcessError, FileNotFoundError):
        out = "slurm 24.05.8"
        warnings.warn(
            f"Error running: `{sinfo_cmd}` to get SLURM version. Are you running outside the "
            "cluster's login or head node? This typically happens when running in `--dryrun`"
            " mode. Assuming version is `slurm 24.05.8`.",
            RuntimeWarning,
            stacklevel=2,
        )

    # sinfo --version returns in the form "slurm 24.1.0"
    # Split on any whitespace so a trailing newline or extra tokens after the
    # version literal cannot break the parse.
    fields = out.split()
    if len(fields) < 2:
        raise ValueError(f"Unexpected output from `{sinfo_cmd}`: {out!r}")
    version_literal = fields[1]

    # Only major and minor are needed; slice BEFORE converting so that a
    # non-numeric patch component (e.g. "0-rc1") is never passed to int().
    major, minor = (int(v) for v in version_literal.split(".")[:2])

    return (major, minor)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _should_use_gpus_per_node_from_version() -> bool:
    """
    Decide between ``gpus-per-node`` and ``gpus-per-task`` by comparing the
    slurm version reported by ``version()`` against ``24.11``.

    Change Reference: https://fburl.com/sqwqzxn6
    > select/linear - Reject jobs asking for GRES per job|socket|task or cpus|mem per GRES.

    Returns:
        ``True`` in slurm ``version>=24.11.0``, ``False`` otherwise.
    """

    min_major, min_minor = 24, 11
    detected = version()
    major, minor = detected[0], detected[1]

    # A different major version settles the comparison on its own.
    if major != min_major:
        return major > min_major

    # Same major version: the minor version decides.
    return minor >= min_minor
|
|
123
|
+
|
|
124
|
+
|
|
75
125
|
SBATCH_JOB_OPTIONS = {
|
|
76
126
|
"comment",
|
|
77
127
|
"mail-user",
|
|
@@ -81,6 +131,7 @@ SBATCH_GROUP_OPTIONS = {
|
|
|
81
131
|
"partition",
|
|
82
132
|
"time",
|
|
83
133
|
"constraint",
|
|
134
|
+
"qos",
|
|
84
135
|
}
|
|
85
136
|
|
|
86
137
|
log: logging.Logger = logging.getLogger(__name__)
|
|
@@ -106,6 +157,7 @@ SlurmOpts = TypedDict(
|
|
|
106
157
|
"mail-user": Optional[str],
|
|
107
158
|
"mail-type": Optional[str],
|
|
108
159
|
"job_dir": Optional[str],
|
|
160
|
+
"qos": Optional[str],
|
|
109
161
|
},
|
|
110
162
|
total=False,
|
|
111
163
|
)
|
|
@@ -126,7 +178,11 @@ class SlurmReplicaRequest:
|
|
|
126
178
|
|
|
127
179
|
@classmethod
|
|
128
180
|
def from_role(
|
|
129
|
-
cls,
|
|
181
|
+
cls,
|
|
182
|
+
name: str,
|
|
183
|
+
role: Role,
|
|
184
|
+
cfg: SlurmOpts,
|
|
185
|
+
nomem: bool,
|
|
130
186
|
) -> "SlurmReplicaRequest":
|
|
131
187
|
"""
|
|
132
188
|
``from_role`` creates a SlurmReplicaRequest for the specific role and
|
|
@@ -149,7 +205,11 @@ class SlurmReplicaRequest:
|
|
|
149
205
|
if not nomem and resource.memMB > 0:
|
|
150
206
|
sbatch_opts.setdefault("mem", str(resource.memMB))
|
|
151
207
|
if resource.gpu > 0:
|
|
152
|
-
|
|
208
|
+
# Use smart GPU allocation based on automatically detected Slurm version
|
|
209
|
+
if _should_use_gpus_per_node_from_version():
|
|
210
|
+
sbatch_opts.setdefault("gpus-per-node", str(resource.gpu))
|
|
211
|
+
else:
|
|
212
|
+
sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
|
|
153
213
|
|
|
154
214
|
srun_opts = {
|
|
155
215
|
"output": f"slurm-{macros.app_id}-{name}.out",
|
|
@@ -378,6 +438,11 @@ class SlurmScheduler(
|
|
|
378
438
|
iteration, jobs will be tracked in ``.torchxslurmjobdirs``.
|
|
379
439
|
""",
|
|
380
440
|
)
|
|
441
|
+
opts.add(
|
|
442
|
+
"qos",
|
|
443
|
+
type_=str,
|
|
444
|
+
help="Quality of Service (QoS) to assign to the job.",
|
|
445
|
+
)
|
|
381
446
|
return opts
|
|
382
447
|
|
|
383
448
|
def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:
|
|
@@ -77,7 +77,7 @@ torchx/schedulers/kubernetes_scheduler.py,sha256=0_loGJ7WnxEr9dhgFt3Gw-7nVLirMDV
|
|
|
77
77
|
torchx/schedulers/local_scheduler.py,sha256=lOtVtmMIhytdju1Dlc3p99VALMY3qYRDPqjxdyTAbQQ,41877
|
|
78
78
|
torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
|
|
79
79
|
torchx/schedulers/ray_scheduler.py,sha256=T-jsGSOa8O-h1kTUU7Q7Fk1RILL1Yzvuos_WFSQF8Fo,15795
|
|
80
|
-
torchx/schedulers/slurm_scheduler.py,sha256=
|
|
80
|
+
torchx/schedulers/slurm_scheduler.py,sha256=GlHGZBIklIyvBxxzw2CtFvCBmLQGCy_o8kf5lf411Ng,28592
|
|
81
81
|
torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
|
|
82
82
|
torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
|
|
83
83
|
torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
|
|
@@ -115,9 +115,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
|
|
|
115
115
|
torchx/workspace/api.py,sha256=PtDkGTC5lX03pRoYpuMz2KCmM1ZOycRP1UknqvNb97Y,6341
|
|
116
116
|
torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
|
|
117
117
|
torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
|
|
118
|
-
torchx_nightly-2025.7.
|
|
119
|
-
torchx_nightly-2025.7.
|
|
120
|
-
torchx_nightly-2025.7.
|
|
121
|
-
torchx_nightly-2025.7.
|
|
122
|
-
torchx_nightly-2025.7.
|
|
123
|
-
torchx_nightly-2025.7.
|
|
118
|
+
torchx_nightly-2025.7.31.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
|
|
119
|
+
torchx_nightly-2025.7.31.dist-info/METADATA,sha256=wzQ4vn137b-jWuJM7WNlldxfchoZZAwmAbBYttz4anU,6104
|
|
120
|
+
torchx_nightly-2025.7.31.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
121
|
+
torchx_nightly-2025.7.31.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
|
|
122
|
+
torchx_nightly-2025.7.31.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
|
|
123
|
+
torchx_nightly-2025.7.31.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|