xmanager-slurm 0.4.15__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xmanager-slurm might be problematic. Click here for more details.

@@ -26,6 +26,7 @@ def mila(
26
26
  "/home/mila/${USER:0:1}/$USER/.local/state/xm-slurm": "/xm-slurm-state",
27
27
  "/home/mila/${USER:0:1}/$USER/.ssh": "/home/mila/${USER:0:1}/$USER/.ssh",
28
28
  }
29
+ mounts = dict(mounts) | {"/dev/infiniband": "/dev/infiniband"}
29
30
 
30
31
  return config.SlurmClusterConfig(
31
32
  name="mila",
@@ -29,6 +29,7 @@ def _drac_cluster(
29
29
  "/home/$USER/.ssh": "/home/$USER/.ssh",
30
30
  "/home/$USER/.local/state/xm-slurm": "/xm-slurm-state",
31
31
  }
32
+ mounts = dict(mounts) | {"/dev/infiniband": "/dev/infiniband"}
32
33
 
33
34
  return config.SlurmClusterConfig(
34
35
  name=name,
@@ -3,7 +3,9 @@
3
3
 
4
4
  # Bundle will be where our built sandbox image is stored
5
5
  # container-workdir will be our container's scratch directory
6
- mkdir -p "$SLURM_TMPDIR"/{container,container-workdir,container-overlay}
6
+ # TODO(jfarebro): We can make this more efficient by doing an srun per node and downloading the container once per node.
7
+ # but this requires apptainer support to have an overlay per procid
8
+ mkdir -p "$SLURM_TMPDIR"/{container-"$SLURM_PROCID",container-workdir-"$SLURM_PROCID",container-overlay-"$SLURM_PROCID"}
7
9
 
8
10
  retry -c 255 -n 10 -d 1 -b 2 -- \
9
11
  {% if job.executable.credentials %}
@@ -14,19 +16,21 @@ retry -c 255 -n 10 -d 1 -b 2 -- \
14
16
  --force \
15
17
  --sandbox \
16
18
  --fix-perms \
17
- "$SLURM_TMPDIR"/container \
19
+ "$SLURM_TMPDIR"/container-"$SLURM_PROCID" \
18
20
  docker://{{ job.executable.image }}
19
21
 
20
22
  {% if runtime == "singularity" and cluster.mounts %}
21
23
  {% for source, dest in cluster.mounts.items() %}
22
- mkdir -p "$SLURM_TMPDIR"/container/{{ dest | trim('/') }}
24
+ mkdir -p "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/{{ dest | trim('/') }}
23
25
  {% endfor %}
24
26
  {% endif %}
25
27
 
26
- cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/container/xm-slurm-entrypoint.sh
28
+ cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/xm-slurm-entrypoint.sh
27
29
  {{ entrypoint(cluster, job) }}
28
30
  ENTRYPOINT_EOF
29
- chmod +x "$SLURM_TMPDIR"/container/xm-slurm-entrypoint.sh
31
+ chmod +x "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/xm-slurm-entrypoint.sh
32
+
33
+ for var in "${!SLURM_@}"; do export "{{ runtime | upper }}ENV_${var}=${!var}"; done
30
34
 
31
35
  exec {{ runtime }} exec \
32
36
  {% if job.executor.requirements.accelerator %}
@@ -45,16 +49,16 @@ exec {{ runtime }} exec \
45
49
  --bind {{ source }}:{{ dest }} \
46
50
  {% endfor %}
47
51
  {% endif %}
48
- --workdir "$SLURM_TMPDIR"/container-workdir \
52
+ --workdir "$SLURM_TMPDIR"/container-workdir-"$SLURM_PROCID" \
49
53
  {% if (cluster.runtime | string) == "apptainer" %}
50
- --overlay "$SLURM_TMPDIR"/container-overlay \
54
+ --overlay "$SLURM_TMPDIR"/container-overlay-"$SLURM_PROCID" \
51
55
  {% else %}
52
56
  --writable \
53
57
  {% endif %}
54
58
  {% if job.executable.workdir %}
55
59
  --pwd {{ job.executable.workdir }} \
56
60
  {% endif %}
57
- "$SLURM_TMPDIR"/container \
61
+ "$SLURM_TMPDIR"/container-"$SLURM_PROCID" \
58
62
  /xm-slurm-entrypoint.sh \
59
63
  {% for arg in job.executable.args.to_list() %}
60
64
  {{ arg }} \
@@ -16,6 +16,7 @@ exec podman run \
16
16
  --entrypoint /xm-slurm-entrypoint.sh \
17
17
  --pull never \
18
18
  --restart no \
19
+ --env "SLURM_*" \
19
20
  --rm \
20
21
  {% if job.executor.requirements.accelerator %}
21
22
  --device nvidia.com/gpu=all \
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xmanager-slurm
3
- Version: 0.4.15
3
+ Version: 0.4.16
4
4
  Summary: Slurm backend for XManager.
5
5
  Project-URL: GitHub, https://github.com/jessefarebro/xm-slurm
6
6
  Author-email: Jesse Farebrother <jfarebro@cs.mcgill.ca>
@@ -22,8 +22,8 @@ xm_slurm/api/models.py,sha256=_INVh0j-4-rRs0WASyg4fNB6NF1L1nUeGgQ6-XnbwsM,1610
22
22
  xm_slurm/api/sqlite/client.py,sha256=jAesCKDuYwnNcAxwJk_1b1TB8cT_QGbSjo1UE3mZjEQ,14037
23
23
  xm_slurm/api/web/client.py,sha256=uO67Y7fnQ-w__Vm_A5BEuy7Qi8wQcWk3vIsBGEBkyfk,6261
24
24
  xm_slurm/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
- xm_slurm/contrib/clusters/__init__.py,sha256=XFCVnkThiU3_8uA_tUgDByOBanXNHrxDvfmuptmQ2KE,2214
26
- xm_slurm/contrib/clusters/drac.py,sha256=_iubsmzz5tK2KTaKqSuykS3IDtsdXqJ0MXep1THSJUM,7020
25
+ xm_slurm/contrib/clusters/__init__.py,sha256=oYkuo4jAA6_VLIJs4k1Rjcp6l3qN0qmJExfkPWfmho4,2281
26
+ xm_slurm/contrib/clusters/drac.py,sha256=-Nawzyx3p471f8Fti_tbweHn8h4ItRN4ec-ktHnUE4Q,7087
27
27
  xm_slurm/experimental/parameter_controller.py,sha256=IrFzq104LkZrhzuirit5GUZDXDvv2bBSYNMh3orsiPY,8518
28
28
  xm_slurm/packaging/__init__.py,sha256=dh307yLpUT9KN7rJ1e9fYC6hegGKfZcGboUq9nGpDVQ,233
29
29
  xm_slurm/packaging/docker.py,sha256=-DWcB9qqbeHmIEqyfF0-v6xOT25ae90u2x-QZ7kluOw,13579
@@ -43,10 +43,10 @@ xm_slurm/templates/slurm/job.bash.j2,sha256=JnK0D8_3tVNpnvPwM5yL_rjLcjqhuHiCtolD
43
43
  xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=ri5FgoKs6_bQVf5DO8SL4rJf4UsLxV34aOV-OD8VWDU,2526
44
44
  xm_slurm/templates/slurm/fragments/proxy.bash.j2,sha256=VJLglZo-Nvx9R-qe3rHTxr07CylTQ6Z9NwBzvIpAZrA,814
45
45
  xm_slurm/templates/slurm/library/retry.bash,sha256=bLe59qvfWEk17rE1wZ4EHiHba3RvR2WWZPq-kSe8RAA,2164
46
- xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=v0LwHM-kBW8sJqVcVA2jYr1n44imDSZrJqmqlr5uTGc,1980
47
- xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=zWLsFEuVzOMSETOmv4A5ZCV4oQHwCipiR6wi79XVzNI,1188
48
- xmanager_slurm-0.4.15.dist-info/METADATA,sha256=xgJUFDConlb4R5W0cOK3xb2fuIR7x_tjNpOybMagT_A,1007
49
- xmanager_slurm-0.4.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
50
- xmanager_slurm-0.4.15.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
51
- xmanager_slurm-0.4.15.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
52
- xmanager_slurm-0.4.15.dist-info/RECORD,,
46
+ xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=XxAQWLxZogL7zjn7tuzKn-DkYUJMx_HjaRzpVkz97lM,2414
47
+ xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=8N1ZtwHyXxP-Cjo4HBPsJiZXcTvf7q2GzvW9ao8_aok,1208
48
+ xmanager_slurm-0.4.16.dist-info/METADATA,sha256=j0282EV56cTAG80Q4R419IE-Z74OJVvAkP--IPMCwuo,1007
49
+ xmanager_slurm-0.4.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
50
+ xmanager_slurm-0.4.16.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
51
+ xmanager_slurm-0.4.16.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
52
+ xmanager_slurm-0.4.16.dist-info/RECORD,,