xmanager-slurm 0.4.16__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xmanager-slurm might be problematic.

xm_slurm/executors.py CHANGED
@@ -37,6 +37,7 @@ class Slurm(xm.Executor):
  switches: Maximum count of leaf switches desired for the job allocation.
  switches_grace_period: Maximum time to wait for that number of switches.
  bind: How to bind tasks to resource (memory, GPU, or generic resource).
+ bind_flag: Generic resource task binding options.
  account: The account to charge the job to.
  partition: The partition to run the job in.
  qos: The quality of service to run the job with.
@@ -59,6 +60,7 @@ class Slurm(xm.Executor):
  requirements: resources.JobRequirements
  time: dt.timedelta
  bind: tp.Mapping[ResourceBindType | str, str | None] | None = None
+ bind_flag: str | None = None

  # Placement
  account: str | None = None
@@ -109,6 +111,8 @@ class Slurm(xm.Executor):
  )
  if value is not None and not isinstance(value, str):
  raise TypeError(f"bind value must be None or a string, got {type(value)}")
+ if self.bind_flag is not None and not isinstance(self.bind_flag, str):
+ raise TypeError(f"bind_flag must be a string, got {type(self.bind_flag)}")

  if not isinstance(self.timeout_signal, signal.Signals):
  raise TypeError(
@@ -133,28 +137,13 @@ class Slurm(xm.Executor):
  def Spec(cls, tag: str | None = None) -> SlurmSpec:
  return SlurmSpec(tag=tag)

- def to_directives(self) -> list[str]:
+ def batch_directives(self) -> list[str]:
  # Job requirements
- directives = self.requirements.to_directives()
+ directives = self.requirements.batch_directives()

  # Time
  directives.append(f"--time={utils.timestr_from_timedelta(self.time)}")

- # Resource binding
- if self.bind is not None:
- for resource, value in self.bind.items():
- if value is None:
- value = "none"
- match resource:
- case resources.ResourceType.MEMORY | resources.ResourceType.RAM:
- directives.append(f"--mem-bind={value}")
- case resources.ResourceType.GPU:
- directives.append(f"--gpu-bind={value}")
- case str():
- directives.append(f"--tres-bind=gres/{resource}:{value}")
- case _:
- raise ValueError(f"Unsupported resource type {resource!r} for binding.")
-
  # Job dependency handling
  directives.append(
  f"--kill-on-invalid-dep={'yes' if self.kill_on_invalid_dependencies else 'no'}"
@@ -196,3 +185,26 @@ class Slurm(xm.Executor):
  directives.append("--no-requeue")

  return directives
+
+ def step_directives(self) -> list[str]:
+ directives = self.requirements.step_directives()
+
+ # Resource binding
+ if self.bind is not None:
+ for resource, value in self.bind.items():
+ if value is None:
+ value = "none"
+ match resource:
+ case resources.ResourceType.MEMORY | resources.ResourceType.RAM:
+ directives.append(f"--mem-bind={value}")
+ case resources.ResourceType.GPU:
+ directives.append(f"--gpu-bind={value}")
+ case str():
+ directives.append(f"--tres-bind=gres/{resource}:{value}")
+ case _:
+ raise ValueError(f"Unsupported resource type {resource!r} for binding.")
+
+ if self.bind_flag is not None:
+ directives.append(f"--gres-flags={self.bind_flag}")
+
+ return directives
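
The net effect in executors.py: allocation-level options stay in batch_directives() (emitted as #SBATCH lines), while the resource-binding options (--mem-bind, --gpu-bind, --tres-bind) move into the new step_directives(), together with the new bind_flag field that is passed through as --gres-flags. The standalone sketch below mirrors that mapping; the ResourceType enum and the free function here are simplified stand-ins for illustration, not the package's actual classes.

# Illustrative stand-in for the mapping Slurm.step_directives() performs in 0.4.17.
import enum


class ResourceType(enum.Enum):
    MEMORY = enum.auto()
    RAM = enum.auto()
    GPU = enum.auto()


def step_directives(bind, bind_flag=None):
    directives = []
    if bind is not None:
        for resource, value in bind.items():
            if value is None:
                value = "none"
            match resource:
                case ResourceType.MEMORY | ResourceType.RAM:
                    directives.append(f"--mem-bind={value}")
                case ResourceType.GPU:
                    directives.append(f"--gpu-bind={value}")
                case str():
                    directives.append(f"--tres-bind=gres/{resource}:{value}")
                case _:
                    raise ValueError(f"Unsupported resource type {resource!r} for binding.")
    if bind_flag is not None:
        directives.append(f"--gres-flags={bind_flag}")
    return directives


print(step_directives({ResourceType.GPU: "closest", "shard": None}, bind_flag="enforce-binding"))
# ['--gpu-bind=closest', '--tres-bind=gres/shard:none', '--gres-flags=enforce-binding']

Since these are ordinary srun options, values such as "closest" for --gpu-bind or "enforce-binding" for --gres-flags are handed to Slurm verbatim.
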
xm_slurm/resources.py CHANGED
@@ -232,7 +232,7 @@ class JobRequirements:
  raise ValueError(f"Replicas must be a positive integer, got {replicas!r}")
  self.replicas = replicas or 1

- def to_directives(self) -> list[str]:
+ def batch_directives(self) -> list[str]:
  directives = []

  for resource, value in self.task_requirements.items():
@@ -302,6 +302,9 @@ class JobRequirements:

  return directives

+ def step_directives(self) -> list[str]:
+ return []
+
  def replace(
  self,
  replicas: int | None = None,
@@ -9,8 +9,11 @@
  srun \
  --label \
  --unbuffered \
- --kill-on-bad-exit=0 \
+ --kill-on-bad-exit=1 \
  --export="ALL" \
+ {% for directive in job.executor.step_directives() %}
+ {{ directive }} \
+ {% endfor %}
  bash <<'SRUN_EOF' &
  set -Eeuxo pipefail

@@ -21,7 +21,9 @@
  {% else %}
  #SBATCH --job-name=xm[{{ job_name }}@{{ experiment_id }}]
  {% endif %}
- {{ job.executor.to_directives() | join("\n") }}
+ {% for directive in job.executor.batch_directives() %}
+ #SBATCH {{ directive }}
+ {% endfor %}
  {{ "\n#SBATCH hetjob\n" if not loop.last }}
  {% endfor %}
  {% endblock directives %}
@@ -31,8 +33,11 @@
  srun \
  --label \
  --unbuffered \
- --kill-on-bad-exit=0 \
+ --kill-on-bad-exit=1 \
  --export="ALL" \
+ {% for directive in job.executor.step_directives() %}
+ {{ directive }} \
+ {% endfor %}
  --het-group={{ loop.index0 }} \
  bash <<'SRUN_EOF' &
  set -Eeuxo pipefail
@@ -21,7 +21,7 @@
  {% endif %}
  #SBATCH --job-name=xm[{{ experiment_id }}]
  {% endif %}
- {% for directive in job.executor.to_directives() %}
+ {% for directive in job.executor.batch_directives() %}
  #SBATCH {{ directive }}
  {% endfor %}
  {% endblock directives %}
@@ -61,8 +61,11 @@ export {{ key }}="{{ value }}"
  srun \
  --label \
  --unbuffered \
- --kill-on-bad-exit=0 \
+ --kill-on-bad-exit=1 \
  --export="ALL" \
+ {% for directive in job.executor.step_directives() %}
+ {{ directive }} \
+ {% endfor %}
  bash <<'SRUN_EOF' &
  set -Eeuxo pipefail
  {{ run(cluster, job) }}
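
The srun blocks in the job templates all change the same way: they now run with --kill-on-bad-exit=1, so a task exiting non-zero terminates the whole job step rather than being ignored, and the executor's step_directives() are interpolated as extra srun options. A minimal Jinja rendering sketch follows (simplified template and a stand-in job object, not the packaged templates themselves):

# Renders a trimmed-down version of the srun block shown in the diff above.
import types

import jinja2

TEMPLATE = """\
srun \\
  --label \\
  --unbuffered \\
  --kill-on-bad-exit=1 \\
  --export="ALL" \\
{% for directive in job.executor.step_directives() %}
  {{ directive }} \\
{% endfor %}
  bash <<'SRUN_EOF' &
"""

job = types.SimpleNamespace(
    executor=types.SimpleNamespace(
        step_directives=lambda: ["--gpu-bind=closest", "--gres-flags=enforce-binding"],
    )
)

print(jinja2.Template(TEMPLATE, trim_blocks=True, lstrip_blocks=True).render(job=job))
# srun \
#   --label \
#   --unbuffered \
#   --kill-on-bad-exit=1 \
#   --export="ALL" \
#   --gpu-bind=closest \
#   --gres-flags=enforce-binding \
#   bash <<'SRUN_EOF' &
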
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: xmanager-slurm
- Version: 0.4.16
+ Version: 0.4.17
  Summary: Slurm backend for XManager.
  Project-URL: GitHub, https://github.com/jessefarebro/xm-slurm
  Author-email: Jesse Farebrother <jfarebro@cs.mcgill.ca>
@@ -6,13 +6,13 @@ xm_slurm/constants.py,sha256=zefVtlFdflgSolie5g_rVxWV-Zpydxapchm3y0a2FDc,999
  xm_slurm/dependencies.py,sha256=G-8vfmvSptZH6c_Ow51SwT84Dr6LI1clRj8F8wOUkiw,6421
  xm_slurm/executables.py,sha256=fGmrFBl-258bMn6ip5adYeM7xxUHAeIbDN9zD2FDGtY,6373
  xm_slurm/execution.py,sha256=mTy5u2oP2StIbGzjaSiGCUAwXuBFOiaJ5ephWoc25hI,31799
- xm_slurm/executors.py,sha256=karM5u2UEG2IWi0z548_vasyBACrXGV675rCllJmwZw,8616
+ xm_slurm/executors.py,sha256=27oiMwF84axeTcrcwL0f5seeLL_1j79OjiM_JZjioFs,9112
  xm_slurm/experiment.py,sha256=94r0mhtUPUzw4eaUEz0kpsufC25wEGqlDhV4Fcr1ukY,39883
  xm_slurm/filesystem.py,sha256=4rKtq3t-KDgxJbSGt6JVyRJT_3lCN_vIKTcwKHpTo3I,4389
  xm_slurm/job_blocks.py,sha256=BFOOYgeodoGIQsB5PdC7SsOUou5aZx-1qbQ7lcqqylI,604
  xm_slurm/metadata_context.py,sha256=mksVRbVUuistL1uE7TC-fkW-Y69On52jN_svP1e1kiQ,7841
  xm_slurm/packageables.py,sha256=aEZUQpddfq4FK6h4f6kgGEI4XcOufhm68MjoDFOYR4U,12261
- xm_slurm/resources.py,sha256=aC8MzO_7fB9IAdTACvhwVOaNDjLOlWnCh428-8_IDYA,12322
+ xm_slurm/resources.py,sha256=sTfwPc0QHRgfckOFq300FZ4fvtPfE4hq8B27DIvf6m4,12388
  xm_slurm/status.py,sha256=JIBCJPOYsmeJOQbzdACXA2vTWK7g8YWWhzpGP79e7JE,6911
  xm_slurm/types.py,sha256=TsVykDm-LazVkrjeJrTwCMs4Q8APKhy7BTk0yKIhFNg,805
  xm_slurm/utils.py,sha256=9w98HlXF0U9cKKtoB8QtGm0CnB0MnnzBARKlbbVNNpU,6211
@@ -37,16 +37,16 @@ xm_slurm/templates/docker/mamba.Dockerfile,sha256=Sgxr5IA5T-pT1Shumb5k3JngoG4pgC
  xm_slurm/templates/docker/python.Dockerfile,sha256=U4b4QVkopckQ0o9jJIE7d_M6TvExEYlYDirNwCoZ7W4,865
  xm_slurm/templates/docker/uv.Dockerfile,sha256=L2UJMX2c8waMdrRhiqPytQe3pTBu6u5PpMhJYsKkbEg,1040
  xm_slurm/templates/slurm/entrypoint.bash.j2,sha256=MRdSVwgGrgQdpEhqfkP35IidgsblrtVXB1YWzvE9hkk,666
- xm_slurm/templates/slurm/job-array.bash.j2,sha256=j7jkJjSbe39XvSTJ9rmK2oVnHdntElIhdS5PFpZzpFs,550
- xm_slurm/templates/slurm/job-group.bash.j2,sha256=vH5HwneVsVSHx6dPZwbLa4KT9NedRbrZ7cWNE5pXi-M,1113
- xm_slurm/templates/slurm/job.bash.j2,sha256=JnK0D8_3tVNpnvPwM5yL_rjLcjqhuHiCtolDjUGAwpk,2084
+ xm_slurm/templates/slurm/job-array.bash.j2,sha256=7cc0nZvEcHhZoo7jXI3fJWgMcc6z5H5FmopPRaklylI,637
+ xm_slurm/templates/slurm/job-group.bash.j2,sha256=9H3zfJy8RZGFf00ZQJGmMEPyWQ9YMZfvGoD4Q8hMx9Y,1244
+ xm_slurm/templates/slurm/job.bash.j2,sha256=GBKY3DPCODPTtEBfuvfaZAua_ZEd5cqPrShtPGE_IpY,2174
  xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=ri5FgoKs6_bQVf5DO8SL4rJf4UsLxV34aOV-OD8VWDU,2526
  xm_slurm/templates/slurm/fragments/proxy.bash.j2,sha256=VJLglZo-Nvx9R-qe3rHTxr07CylTQ6Z9NwBzvIpAZrA,814
  xm_slurm/templates/slurm/library/retry.bash,sha256=bLe59qvfWEk17rE1wZ4EHiHba3RvR2WWZPq-kSe8RAA,2164
  xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=XxAQWLxZogL7zjn7tuzKn-DkYUJMx_HjaRzpVkz97lM,2414
  xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=8N1ZtwHyXxP-Cjo4HBPsJiZXcTvf7q2GzvW9ao8_aok,1208
- xmanager_slurm-0.4.16.dist-info/METADATA,sha256=j0282EV56cTAG80Q4R419IE-Z74OJVvAkP--IPMCwuo,1007
- xmanager_slurm-0.4.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- xmanager_slurm-0.4.16.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
- xmanager_slurm-0.4.16.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
- xmanager_slurm-0.4.16.dist-info/RECORD,,
+ xmanager_slurm-0.4.17.dist-info/METADATA,sha256=YUBZ6woSk-9-0GzFuFlTPy6hhaxFI-WPGVt7APaviT0,1007
+ xmanager_slurm-0.4.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ xmanager_slurm-0.4.17.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
+ xmanager_slurm-0.4.17.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
+ xmanager_slurm-0.4.17.dist-info/RECORD,,