xmanager-slurm 0.4.15__tar.gz → 0.4.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xmanager-slurm might be problematic. Click here for more details.

Files changed (126)
  1. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/PKG-INFO +1 -1
  2. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/pyproject.toml +1 -1
  3. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/uv.lock +1 -1
  4. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/contrib/clusters/__init__.py +1 -0
  5. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/contrib/clusters/drac.py +1 -0
  6. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +12 -8
  7. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/slurm/runtimes/podman.bash.j2 +1 -0
  8. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/.devcontainer.json +0 -0
  9. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/.github/workflows/ci.yml +0 -0
  10. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/.github/workflows/deploy-docs.yml +0 -0
  11. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/.gitignore +0 -0
  12. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/.pre-commit-config.yaml +0 -0
  13. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/.python-version +0 -0
  14. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/.vscode/settings.json +0 -0
  15. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/LICENSE.md +0 -0
  16. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/README.md +0 -0
  17. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/docs/api/executables.rst +0 -0
  18. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/docs/api/executors.rst +0 -0
  19. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/docs/api/packageables.rst +0 -0
  20. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/docs/assets/workflow-dark.svg +0 -0
  21. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/docs/assets/workflow-light.svg +0 -0
  22. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/docs/conf.py +0 -0
  23. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/docs/getting-started/xmanager.md +0 -0
  24. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/docs/guides/index.md +0 -0
  25. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/docs/guides/remote-dev.md +0 -0
  26. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/docs/index.md +0 -0
  27. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/conda/environment.yml +0 -0
  28. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/conda/launch.py +0 -0
  29. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/conda/main.py +0 -0
  30. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/conda/pyproject.toml +0 -0
  31. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/custom-dockerfile/Dockerfile +0 -0
  32. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/custom-dockerfile/launch.py +0 -0
  33. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/custom-dockerfile/pyproject.toml +0 -0
  34. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-array-sweep/launch.py +0 -0
  35. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-array-sweep/main.py +0 -0
  36. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-array-sweep/pyproject.toml +0 -0
  37. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-array-sweep/uv.lock +0 -0
  38. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-dependencies/eval.py +0 -0
  39. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-dependencies/launch.py +0 -0
  40. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-dependencies/pyproject.toml +0 -0
  41. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-dependencies/train.py +0 -0
  42. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-dependencies/uv.lock +0 -0
  43. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-group/Dockerfile +0 -0
  44. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-group/launch.py +0 -0
  45. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-group/pyproject.toml +0 -0
  46. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-group/uv.lock +0 -0
  47. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-timeout/launch.py +0 -0
  48. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-timeout/main.py +0 -0
  49. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-timeout/pyproject.toml +0 -0
  50. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/job-timeout/uv.lock +0 -0
  51. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/metadata/launch.py +0 -0
  52. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/metadata/main.py +0 -0
  53. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/metadata/pyproject.toml +0 -0
  54. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/metadata/requirements.txt +0 -0
  55. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/parameter-controller/launch.py +0 -0
  56. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/parameter-controller/main.py +0 -0
  57. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/parameter-controller/pyproject.toml +0 -0
  58. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/parameter-controller/requirements.txt +0 -0
  59. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/pip/launch.py +0 -0
  60. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/pip/main.py +0 -0
  61. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/pip/pyproject.toml +0 -0
  62. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/pip/requirements.txt +0 -0
  63. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/uv/launch.py +0 -0
  64. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/uv/pyproject.toml +0 -0
  65. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/examples/uv/uv.lock +0 -0
  66. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/conftest.py +0 -0
  67. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/fixtures/slurm/Dockerfile +0 -0
  68. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/fixtures/slurm/README.md +0 -0
  69. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/fixtures/slurm/cgroup.conf +0 -0
  70. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/fixtures/slurm/docker-compose.yml +0 -0
  71. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/fixtures/slurm/docker-entrypoint.sh +0 -0
  72. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/fixtures/slurm/host_ed25519 +0 -0
  73. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/fixtures/slurm/host_ed25519.pub +0 -0
  74. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/fixtures/slurm/id_ed25519 +0 -0
  75. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/fixtures/slurm/id_ed25519.pub +0 -0
  76. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/fixtures/slurm/slurm.conf +0 -0
  77. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/fixtures/slurm/slurmdbd.conf +0 -0
  78. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/fixtures/slurm/sshd_config +0 -0
  79. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/integration/test_remote_execution.py +0 -0
  80. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/test_dependencies.py +0 -0
  81. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/test_executors.py +0 -0
  82. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/test_resources.py +0 -0
  83. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/tests/test_utils.py +0 -0
  84. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/__init__.py +0 -0
  85. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/api/__init__.py +0 -0
  86. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/api/abc.py +0 -0
  87. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/api/models.py +0 -0
  88. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/api/sqlite/client.py +0 -0
  89. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/api/web/client.py +0 -0
  90. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/batching.py +0 -0
  91. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/config.py +0 -0
  92. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/console.py +0 -0
  93. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/constants.py +0 -0
  94. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/contrib/__init__.py +0 -0
  95. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/dependencies.py +0 -0
  96. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/executables.py +0 -0
  97. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/execution.py +0 -0
  98. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/executors.py +0 -0
  99. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/experiment.py +0 -0
  100. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/experimental/parameter_controller.py +0 -0
  101. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/filesystem.py +0 -0
  102. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/job_blocks.py +0 -0
  103. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/metadata_context.py +0 -0
  104. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/packageables.py +0 -0
  105. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/packaging/__init__.py +0 -0
  106. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/packaging/docker.py +0 -0
  107. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/packaging/registry.py +0 -0
  108. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/packaging/router.py +0 -0
  109. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/packaging/utils.py +0 -0
  110. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/resources.py +0 -0
  111. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/scripts/_cloudpickle.py +0 -0
  112. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/scripts/cli.py +0 -0
  113. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/status.py +0 -0
  114. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/docker/docker-bake.hcl.j2 +0 -0
  115. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/docker/mamba.Dockerfile +0 -0
  116. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/docker/python.Dockerfile +0 -0
  117. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/docker/uv.Dockerfile +0 -0
  118. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/slurm/entrypoint.bash.j2 +0 -0
  119. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/slurm/fragments/monitor.bash.j2 +0 -0
  120. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/slurm/fragments/proxy.bash.j2 +0 -0
  121. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/slurm/job-array.bash.j2 +0 -0
  122. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/slurm/job-group.bash.j2 +0 -0
  123. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/slurm/job.bash.j2 +0 -0
  124. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/templates/slurm/library/retry.bash +0 -0
  125. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/types.py +0 -0
  126. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.16}/xm_slurm/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xmanager-slurm
3
- Version: 0.4.15
3
+ Version: 0.4.16
4
4
  Summary: Slurm backend for XManager.
5
5
  Project-URL: GitHub, https://github.com/jessefarebro/xm-slurm
6
6
  Author-email: Jesse Farebrother <jfarebro@cs.mcgill.ca>
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
5
5
  [project]
6
6
  name = "xmanager-slurm"
7
7
  description = "Slurm backend for XManager."
8
- version = "0.4.15"
8
+ version = "0.4.16"
9
9
  # readme = "README.md"
10
10
  requires-python = ">=3.10"
11
11
  license = { text = "MIT" }
@@ -2251,7 +2251,7 @@ wheels = [
2251
2251
 
2252
2252
  [[package]]
2253
2253
  name = "xmanager-slurm"
2254
- version = "0.4.15"
2254
+ version = "0.4.16"
2255
2255
  source = { editable = "." }
2256
2256
  dependencies = [
2257
2257
  { name = "aiofile" },
@@ -26,6 +26,7 @@ def mila(
26
26
  "/home/mila/${USER:0:1}/$USER/.local/state/xm-slurm": "/xm-slurm-state",
27
27
  "/home/mila/${USER:0:1}/$USER/.ssh": "/home/mila/${USER:0:1}/$USER/.ssh",
28
28
  }
29
+ mounts = dict(mounts) | {"/dev/infiniband": "/dev/infiniband"}
29
30
 
30
31
  return config.SlurmClusterConfig(
31
32
  name="mila",
@@ -29,6 +29,7 @@ def _drac_cluster(
29
29
  "/home/$USER/.ssh": "/home/$USER/.ssh",
30
30
  "/home/$USER/.local/state/xm-slurm": "/xm-slurm-state",
31
31
  }
32
+ mounts = dict(mounts) | {"/dev/infiniband": "/dev/infiniband"}
32
33
 
33
34
  return config.SlurmClusterConfig(
34
35
  name=name,
@@ -3,7 +3,9 @@
3
3
 
4
4
  # Bundle will be where our built sandbox image is stored
5
5
  # container-workdir will be our container's scratch directory
6
- mkdir -p "$SLURM_TMPDIR"/{container,container-workdir,container-overlay}
6
+ # TODO(jfarebro): We can make this more efficient by doing an srun per node and downloading the container once per node.
7
+ # but this requires apptainer support to have an overlay per procid
8
+ mkdir -p "$SLURM_TMPDIR"/{container-"$SLURM_PROCID",container-workdir-"$SLURM_PROCID",container-overlay-"$SLURM_PROCID"}
7
9
 
8
10
  retry -c 255 -n 10 -d 1 -b 2 -- \
9
11
  {% if job.executable.credentials %}
@@ -14,19 +16,21 @@ retry -c 255 -n 10 -d 1 -b 2 -- \
14
16
  --force \
15
17
  --sandbox \
16
18
  --fix-perms \
17
- "$SLURM_TMPDIR"/container \
19
+ "$SLURM_TMPDIR"/container-"$SLURM_PROCID" \
18
20
  docker://{{ job.executable.image }}
19
21
 
20
22
  {% if runtime == "singularity" and cluster.mounts %}
21
23
  {% for source, dest in cluster.mounts.items() %}
22
- mkdir -p "$SLURM_TMPDIR"/container/{{ dest | trim('/') }}
24
+ mkdir -p "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/{{ dest | trim('/') }}
23
25
  {% endfor %}
24
26
  {% endif %}
25
27
 
26
- cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/container/xm-slurm-entrypoint.sh
28
+ cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/xm-slurm-entrypoint.sh
27
29
  {{ entrypoint(cluster, job) }}
28
30
  ENTRYPOINT_EOF
29
- chmod +x "$SLURM_TMPDIR"/container/xm-slurm-entrypoint.sh
31
+ chmod +x "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/xm-slurm-entrypoint.sh
32
+
33
+ for var in "${!SLURM_@}"; do export "{{ runtime | upper }}ENV_${var}=${!var}"; done
30
34
 
31
35
  exec {{ runtime }} exec \
32
36
  {% if job.executor.requirements.accelerator %}
@@ -45,16 +49,16 @@ exec {{ runtime }} exec \
45
49
  --bind {{ source }}:{{ dest }} \
46
50
  {% endfor %}
47
51
  {% endif %}
48
- --workdir "$SLURM_TMPDIR"/container-workdir \
52
+ --workdir "$SLURM_TMPDIR"/container-workdir-"$SLURM_PROCID" \
49
53
  {% if (cluster.runtime | string) == "apptainer" %}
50
- --overlay "$SLURM_TMPDIR"/container-overlay \
54
+ --overlay "$SLURM_TMPDIR"/container-overlay-"$SLURM_PROCID" \
51
55
  {% else %}
52
56
  --writable \
53
57
  {% endif %}
54
58
  {% if job.executable.workdir %}
55
59
  --pwd {{ job.executable.workdir }} \
56
60
  {% endif %}
57
- "$SLURM_TMPDIR"/container \
61
+ "$SLURM_TMPDIR"/container-"$SLURM_PROCID" \
58
62
  /xm-slurm-entrypoint.sh \
59
63
  {% for arg in job.executable.args.to_list() %}
60
64
  {{ arg }} \
@@ -16,6 +16,7 @@ exec podman run \
16
16
  --entrypoint /xm-slurm-entrypoint.sh \
17
17
  --pull never \
18
18
  --restart no \
19
+ --env "SLURM_*" \
19
20
  --rm \
20
21
  {% if job.executor.requirements.accelerator %}
21
22
  --device nvidia.com/gpu=all \