xmanager-slurm 0.4.15.tar.gz → 0.4.17.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (126)
  1. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/PKG-INFO +1 -1
  2. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/pyproject.toml +1 -1
  3. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/test_executors.py +25 -14
  4. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/test_resources.py +34 -34
  5. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/uv.lock +1 -1
  6. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/contrib/clusters/__init__.py +1 -0
  7. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/contrib/clusters/drac.py +1 -0
  8. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/executors.py +29 -17
  9. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/resources.py +4 -1
  10. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/job-array.bash.j2 +4 -1
  11. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/job-group.bash.j2 +7 -2
  12. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/job.bash.j2 +5 -2
  13. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +12 -8
  14. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/runtimes/podman.bash.j2 +1 -0
  15. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/.devcontainer.json +0 -0
  16. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/.github/workflows/ci.yml +0 -0
  17. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/.github/workflows/deploy-docs.yml +0 -0
  18. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/.gitignore +0 -0
  19. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/.pre-commit-config.yaml +0 -0
  20. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/.python-version +0 -0
  21. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/.vscode/settings.json +0 -0
  22. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/LICENSE.md +0 -0
  23. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/README.md +0 -0
  24. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/docs/api/executables.rst +0 -0
  25. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/docs/api/executors.rst +0 -0
  26. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/docs/api/packageables.rst +0 -0
  27. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/docs/assets/workflow-dark.svg +0 -0
  28. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/docs/assets/workflow-light.svg +0 -0
  29. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/docs/conf.py +0 -0
  30. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/docs/getting-started/xmanager.md +0 -0
  31. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/docs/guides/index.md +0 -0
  32. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/docs/guides/remote-dev.md +0 -0
  33. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/docs/index.md +0 -0
  34. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/conda/environment.yml +0 -0
  35. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/conda/launch.py +0 -0
  36. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/conda/main.py +0 -0
  37. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/conda/pyproject.toml +0 -0
  38. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/custom-dockerfile/Dockerfile +0 -0
  39. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/custom-dockerfile/launch.py +0 -0
  40. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/custom-dockerfile/pyproject.toml +0 -0
  41. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-array-sweep/launch.py +0 -0
  42. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-array-sweep/main.py +0 -0
  43. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-array-sweep/pyproject.toml +0 -0
  44. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-array-sweep/uv.lock +0 -0
  45. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-dependencies/eval.py +0 -0
  46. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-dependencies/launch.py +0 -0
  47. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-dependencies/pyproject.toml +0 -0
  48. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-dependencies/train.py +0 -0
  49. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-dependencies/uv.lock +0 -0
  50. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-group/Dockerfile +0 -0
  51. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-group/launch.py +0 -0
  52. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-group/pyproject.toml +0 -0
  53. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-group/uv.lock +0 -0
  54. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-timeout/launch.py +0 -0
  55. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-timeout/main.py +0 -0
  56. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-timeout/pyproject.toml +0 -0
  57. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/job-timeout/uv.lock +0 -0
  58. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/metadata/launch.py +0 -0
  59. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/metadata/main.py +0 -0
  60. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/metadata/pyproject.toml +0 -0
  61. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/metadata/requirements.txt +0 -0
  62. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/parameter-controller/launch.py +0 -0
  63. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/parameter-controller/main.py +0 -0
  64. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/parameter-controller/pyproject.toml +0 -0
  65. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/parameter-controller/requirements.txt +0 -0
  66. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/pip/launch.py +0 -0
  67. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/pip/main.py +0 -0
  68. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/pip/pyproject.toml +0 -0
  69. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/pip/requirements.txt +0 -0
  70. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/uv/launch.py +0 -0
  71. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/uv/pyproject.toml +0 -0
  72. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/examples/uv/uv.lock +0 -0
  73. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/conftest.py +0 -0
  74. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/Dockerfile +0 -0
  75. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/README.md +0 -0
  76. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/cgroup.conf +0 -0
  77. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/docker-compose.yml +0 -0
  78. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/docker-entrypoint.sh +0 -0
  79. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/host_ed25519 +0 -0
  80. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/host_ed25519.pub +0 -0
  81. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/id_ed25519 +0 -0
  82. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/id_ed25519.pub +0 -0
  83. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/slurm.conf +0 -0
  84. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/slurmdbd.conf +0 -0
  85. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/sshd_config +0 -0
  86. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/integration/test_remote_execution.py +0 -0
  87. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/test_dependencies.py +0 -0
  88. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/tests/test_utils.py +0 -0
  89. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/__init__.py +0 -0
  90. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/api/__init__.py +0 -0
  91. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/api/abc.py +0 -0
  92. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/api/models.py +0 -0
  93. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/api/sqlite/client.py +0 -0
  94. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/api/web/client.py +0 -0
  95. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/batching.py +0 -0
  96. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/config.py +0 -0
  97. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/console.py +0 -0
  98. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/constants.py +0 -0
  99. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/contrib/__init__.py +0 -0
  100. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/dependencies.py +0 -0
  101. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/executables.py +0 -0
  102. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/execution.py +0 -0
  103. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/experiment.py +0 -0
  104. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/experimental/parameter_controller.py +0 -0
  105. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/filesystem.py +0 -0
  106. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/job_blocks.py +0 -0
  107. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/metadata_context.py +0 -0
  108. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/packageables.py +0 -0
  109. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/packaging/__init__.py +0 -0
  110. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/packaging/docker.py +0 -0
  111. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/packaging/registry.py +0 -0
  112. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/packaging/router.py +0 -0
  113. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/packaging/utils.py +0 -0
  114. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/scripts/_cloudpickle.py +0 -0
  115. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/scripts/cli.py +0 -0
  116. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/status.py +0 -0
  117. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/docker/docker-bake.hcl.j2 +0 -0
  118. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/docker/mamba.Dockerfile +0 -0
  119. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/docker/python.Dockerfile +0 -0
  120. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/docker/uv.Dockerfile +0 -0
  121. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/entrypoint.bash.j2 +0 -0
  122. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/fragments/monitor.bash.j2 +0 -0
  123. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/fragments/proxy.bash.j2 +0 -0
  124. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/library/retry.bash +0 -0
  125. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/types.py +0 -0
  126. {xmanager_slurm-0.4.15 → xmanager_slurm-0.4.17}/xm_slurm/utils.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: xmanager-slurm
- Version: 0.4.15
+ Version: 0.4.17
  Summary: Slurm backend for XManager.
  Project-URL: GitHub, https://github.com/jessefarebro/xm-slurm
  Author-email: Jesse Farebrother <jfarebro@cs.mcgill.ca>

pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
  [project]
  name = "xmanager-slurm"
  description = "Slurm backend for XManager."
- version = "0.4.15"
+ version = "0.4.17"
  # readme = "README.md"
  requires-python = ">=3.10"
  license = { text = "MIT" }

tests/test_executors.py
@@ -226,7 +226,7 @@ def test_slurm_executor_bind_accepts_custom_strings(
  def test_slurm_executor_to_directives_default(basic_requirements):
      """Test directive generation with minimal configuration."""
      executor = executors.Slurm(requirements=basic_requirements, time=dt.timedelta(hours=1))
-     directives = executor.to_directives()
+     directives = executor.batch_directives()
      assert any(d.startswith("--time=") for d in directives)
      assert any(d.startswith("--kill-on-invalid-dep=") for d in directives)
      assert any(d.startswith("--signal=") for d in directives)
@@ -244,7 +244,7 @@ def test_slurm_executor_to_directives_default(basic_requirements):
  def test_slurm_executor_to_directives_time(basic_requirements, time, expected):
      """Test time directive generation."""
      executor = executors.Slurm(requirements=basic_requirements, time=time)
-     directives = executor.to_directives()
+     directives = executor.batch_directives()
      time_directive = [d for d in directives if d.startswith("--time=")][0]
      assert time_directive == expected

@@ -278,7 +278,7 @@ def test_slurm_executor_to_directives_executor_params(
          time=dt.timedelta(hours=1),
          **executor_kwargs,
      )
-     directives = executor.to_directives()
+     directives = executor.batch_directives()
      assert expected_directive in directives


@@ -294,7 +294,7 @@ def test_slurm_executor_to_directives_reservation(basic_requirements, reservatio
      executor = executors.Slurm(
          requirements=basic_requirements, time=dt.timedelta(hours=1), reservation=reservation
      )
-     directives = executor.to_directives()
+     directives = executor.batch_directives()
      reservation_directive = [d for d in directives if d.startswith("--reservation=")][0]
      assert reservation_directive == expected

@@ -321,7 +321,7 @@ def test_slurm_executor_to_directives_boolean_flags(
          time=dt.timedelta(hours=1),
          **flag_kwargs,
      )
-     directives = executor.to_directives()
+     directives = executor.batch_directives()
      for expected in expected_directives:
          assert any(expected in d for d in directives)
      for unexpected in ["--exclusive", "--oversubscribe", "--overcommit"]:
@@ -348,7 +348,7 @@ def test_slurm_executor_to_directives_timeout_signal(
          timeout_signal=timeout_signal,
          timeout_signal_grace_period=grace_period,
      )
-     directives = executor.to_directives()
+     directives = executor.batch_directives()
      signal_directive = [d for d in directives if d.startswith("--signal=")][0]
      assert signal_directive == f"--signal={expected_signal}"

@@ -369,7 +369,7 @@ def test_slurm_executor_to_directives_requeue(basic_requirements, requeue, max_a
          requeue=requeue,
          requeue_max_attempts=max_attempts,
      )
-     directives = executor.to_directives()
+     directives = executor.batch_directives()
      requeue_directives = [d for d in directives if "requeue" in d]
      assert expected in requeue_directives

@@ -381,7 +381,7 @@ def test_slurm_executor_to_directives_bind_gpu(basic_requirements):
          time=dt.timedelta(hours=1),
          bind={resources.ResourceType.GPU: "closest"},
      )
-     directives = executor.to_directives()
+     directives = executor.step_directives()
      gpu_bind_directive = [d for d in directives if d.startswith("--gpu-bind=")][0]
      assert gpu_bind_directive == "--gpu-bind=closest"

@@ -404,7 +404,7 @@ def test_slurm_executor_to_directives_bind_resource_types(
          time=dt.timedelta(hours=1),
          bind={resource: value},
      )
-     directives = executor.to_directives()
+     directives = executor.step_directives()
      assert any(expected in d for d in directives)


@@ -425,14 +425,14 @@ def test_slurm_executor_to_directives_bind_custom_gres(
          time=dt.timedelta(hours=1),
          bind={custom_gres: value},
      )
-     directives = executor.to_directives()
+     directives = executor.step_directives()
      assert any(expected in d for d in directives)


  def test_slurm_executor_to_directives_includes_requirements_directives(basic_requirements):
      """Test that to_directives includes directives from requirements."""
      executor = executors.Slurm(requirements=basic_requirements, time=dt.timedelta(hours=1))
-     directives = executor.to_directives()
+     directives = executor.batch_directives()
      assert any(d.startswith("--cpus-per-task=") for d in directives)
      assert any(d.startswith("--mem-per-cpu=") for d in directives)

@@ -476,11 +476,22 @@ def test_slurm_executor_with_multiple_binds(basic_requirements):
              resources.ResourceType.MEMORY: "local",
          },
      )
-     directives = executor.to_directives()
+     directives = executor.step_directives()
      assert any("--gpu-bind=closest" in d for d in directives)
      assert any("--mem-bind=local" in d for d in directives)


+ def test_slurm_executor_bind_flag_sets_gres_flags(basic_requirements):
+     """Test that bind_flag produces the correct --gres-flags directive."""
+     executor = executors.Slurm(
+         requirements=basic_requirements,
+         time=dt.timedelta(hours=1),
+         bind_flag="enforce-binding",
+     )
+     directives = executor.step_directives()
+     assert "--gres-flags=enforce-binding" in directives
+
+
  @pytest.mark.parametrize("grace_period_secs", [1, 30, 3600])
  def test_slurm_executor_with_various_grace_periods(basic_requirements, grace_period_secs):
      """Test executor with various timeout grace periods."""
@@ -489,7 +500,7 @@ def test_slurm_executor_with_various_grace_periods(basic_requirements, grace_per
          time=dt.timedelta(hours=24),
          timeout_signal_grace_period=dt.timedelta(seconds=grace_period_secs),
      )
-     directives = executor.to_directives()
+     directives = executor.batch_directives()
      signal_directive = [d for d in directives if d.startswith("--signal=")][0]
      assert f"@{grace_period_secs}" in signal_directive

@@ -520,7 +531,7 @@ def test_slurm_executor_multiple_signal_types(basic_requirements, sig):
          time=dt.timedelta(hours=1),
          timeout_signal=sig,
      )
-     directives = executor.to_directives()
+     directives = executor.batch_directives()
      signal_directive = [d for d in directives if d.startswith("--signal=")][0]
      expected_sig_name = sig.name.removeprefix("SIG")
      assert expected_sig_name in signal_directive

tests/test_resources.py
@@ -277,7 +277,7 @@ def test_job_requirements_to_directives_cpu(
  ) -> None:
      """Test CPU directive generation."""
      req = resources.JobRequirements(cpu=4, cluster=dummy_cluster_config)
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert "--cpus-per-task=4" in directives


@@ -290,7 +290,7 @@ def test_job_requirements_to_directives_memory(
          memory=8 * 1024**3, # 8GB
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      # 8GB / 2 CPUs / 2^20 bytes per MB = 4096 MB
      assert "--mem-per-cpu=4096M" in directives

@@ -303,7 +303,7 @@ def test_job_requirements_to_directives_memory_with_single_cpu(
          memory=2 * 1024**3, # 2GB
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      # 2GB / 1 CPU / 2^20 = 2048 MB
      assert "--mem-per-cpu=2048M" in directives

@@ -317,7 +317,7 @@ def test_job_requirements_to_directives_ram_alias(
          ram=8 * 1024**3,
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert "--mem-per-cpu=4096M" in directives


@@ -329,7 +329,7 @@ def test_job_requirements_to_directives_disk(
          disk=1024**3, # 1GB
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      # 1GB / 2^20 MB = 1024 MB
      assert "--tmp=1024M" in directives

@@ -342,7 +342,7 @@ def test_job_requirements_to_directives_ephemeral_storage_alias(
          ephemeral_storage=2 * 1024**3, # 2GB
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      # 2GB / 2^20 MB = 2048 MB
      assert "--tmp=2048M" in directives

@@ -355,7 +355,7 @@ def test_job_requirements_to_directives_generic_gpu(
          gpu=4,
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert "--gpus=4" in directives


@@ -367,7 +367,7 @@ def test_job_requirements_to_directives_specific_gpu(
          a100=2,
          cluster=cluster_with_gpu_mapping,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert "--gpus=a100:2" in directives


@@ -394,7 +394,7 @@ def test_job_requirements_to_directives_various_gpu_types(
      )
      kwargs = {accelerator_type.name.lower(): count}
      req = resources.JobRequirements(**kwargs, cluster=cluster) # type: ignore
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert f"--gpus={gpu_name}:{count}" in directives


@@ -407,7 +407,7 @@ def test_job_requirements_to_directives_unmapped_gpu_raises_error(
          cluster=dummy_cluster_config,
      )
      with pytest.raises(ValueError, match="does not map resource type"):
-         req.to_directives()
+         req.batch_directives()


  def test_job_requirements_to_directives_custom_gres(
@@ -418,7 +418,7 @@ def test_job_requirements_to_directives_custom_gres(
          custom_resource="2", # type: ignore
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert "--gres=custom_resource:2" in directives


@@ -431,7 +431,7 @@ def test_job_requirements_to_directives_replicas(
          replicas=8,
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert "--ntasks=8" in directives


@@ -444,7 +444,7 @@ def test_job_requirements_to_directives_location(
          location="node[001-005]",
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert "--nodelist=node[001-005]" in directives


@@ -461,7 +461,7 @@ def test_job_requirements_to_directives_combined_all_resources(
          location="gpu-nodes[001-004]",
          cluster=cluster_with_gpu_mapping,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert "--cpus-per-task=8" in directives
      assert "--mem-per-cpu=8192M" in directives # 64GB / 8 CPUs / 2^20
      assert "--tmp=512000M" in directives # 500GB / 2^20
@@ -655,7 +655,7 @@ def test_job_requirements_invalid_memory_type(
      req = resources.JobRequirements(cluster=dummy_cluster_config)
      req.task_requirements[resources.ResourceType.MEMORY] = "invalid" # type: ignore
      with pytest.raises(AssertionError, match="Memory must be an integer or float"):
-         req.to_directives()
+         req.batch_directives()


  def test_job_requirements_invalid_cpu_type(
@@ -665,7 +665,7 @@ def test_job_requirements_invalid_cpu_type(
      req = resources.JobRequirements(cluster=dummy_cluster_config)
      req.task_requirements[resources.ResourceType.CPU] = 4.5 # type: ignore
      with pytest.raises(AssertionError, match="CPU must be an integer"):
-         req.to_directives()
+         req.batch_directives()


  def test_job_requirements_invalid_gpu_type(
@@ -675,7 +675,7 @@ def test_job_requirements_invalid_gpu_type(
      req = resources.JobRequirements(cluster=dummy_cluster_config)
      req.task_requirements[resources.ResourceType.GPU] = 1.5 # type: ignore
      with pytest.raises(AssertionError, match="GPU must be an integer"):
-         req.to_directives()
+         req.batch_directives()


  def test_job_requirements_invalid_disk_type(
@@ -685,7 +685,7 @@ def test_job_requirements_invalid_disk_type(
      req = resources.JobRequirements(cluster=dummy_cluster_config)
      req.task_requirements[resources.ResourceType.DISK] = 1.5 # type: ignore
      with pytest.raises(AssertionError, match="Disk space must be an integer"):
-         req.to_directives()
+         req.batch_directives()


  def test_job_requirements_invalid_replicas(
@@ -695,7 +695,7 @@ def test_job_requirements_invalid_replicas(
      req = resources.JobRequirements(cluster=dummy_cluster_config)
      req.replicas = -1
      with pytest.raises(AssertionError, match="Replicas must be a positive integer"):
-         req.to_directives()
+         req.batch_directives()


  def test_job_requirements_invalid_replicas_zero(
@@ -705,7 +705,7 @@ def test_job_requirements_invalid_replicas_zero(
      req = resources.JobRequirements(cluster=dummy_cluster_config)
      req.replicas = 0
      with pytest.raises(AssertionError, match="Replicas must be a positive integer"):
-         req.to_directives()
+         req.batch_directives()


  def test_job_requirements_invalid_location_type(
@@ -715,7 +715,7 @@ def test_job_requirements_invalid_location_type(
      req = resources.JobRequirements(cluster=dummy_cluster_config)
      req.location = 123 # type: ignore
      with pytest.raises(AssertionError, match="Location must be a string"):
-         req.to_directives()
+         req.batch_directives()


  @pytest.mark.parametrize(
@@ -740,7 +740,7 @@ def test_job_requirements_memory_directive_calculation(
          memory=memory_bytes,
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert f"--mem-per-cpu={expected_mem_per_cpu_mb}M" in directives


@@ -763,7 +763,7 @@ def test_job_requirements_disk_directive_calculation(
          disk=disk_bytes,
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert f"--tmp={expected_tmp_mb}M" in directives


@@ -776,7 +776,7 @@ def test_job_requirements_fractional_memory_rounds_correctly(
          memory=10 * 1024**3, # 10GB / 3 = 3.33GB per CPU
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      # math.ceil(10 * 1024^3 / 3 / 2^20) = math.ceil(3413.33...) = 3414
      assert "--mem-per-cpu=3414M" in directives

@@ -789,7 +789,7 @@ def test_job_requirements_fractional_disk_rounds_correctly(
          disk=5 * 1024**3 + 512 * 1024**2, # 5.5GB
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      # math.ceil((5*1024^3 + 512*1024^2) / 2^20) = math.ceil(5632) = 5632
      assert "--tmp=5632M" in directives

@@ -799,7 +799,7 @@ def test_job_requirements_empty_task_requirements(
  ) -> None:
      """Test job requirements with no specific resources."""
      req = resources.JobRequirements(cluster=dummy_cluster_config)
-     directives = req.to_directives()
+     directives = req.batch_directives()
      # Should still have ntasks directive
      assert "--ntasks=1" in directives
      assert len(directives) == 1
@@ -835,8 +835,8 @@ def test_job_requirements_multiple_directives_order_independence(
          memory=4 * 1024**3,
          cluster=cluster_with_gpu_mapping,
      )
-     directives1 = set(req1.to_directives())
-     directives2 = set(req2.to_directives())
+     directives1 = set(req1.batch_directives())
+     directives2 = set(req2.batch_directives())
      assert directives1 == directives2


@@ -1150,7 +1150,7 @@ def test_topology_directive_gpus_per_task(dummy_cluster_config: config.SlurmClus
          gpu=topo,
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert "--gpus-per-task=4" in directives


@@ -1163,7 +1163,7 @@ def test_topology_directive_gpus_per_task_with_specific_accelerator(
          a100=topo,
          cluster=cluster_with_gpu_mapping,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert "--gpus-per-task=a100:2" in directives


@@ -1176,7 +1176,7 @@ def test_topology_directive_ntasks_from_topology(
          gpu=topo,
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert "--ntasks=3" in directives


@@ -1187,7 +1187,7 @@ def test_topology_directive_switches(dummy_cluster_config: config.SlurmClusterCo
          gpu=topo,
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert any("--switches=" in d for d in directives)


@@ -1201,7 +1201,7 @@ def test_topology_directive_switches_with_timeout(
          gpu=topo,
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      # Should have format like "--switches=2@5:00" (5 minutes in SLURM format)
      switches_directives = [d for d in directives if "--switches=" in d]
      assert len(switches_directives) > 0
@@ -1218,7 +1218,7 @@ def test_topology_1d_topology_no_ntasks_override(
          replicas=4,
          cluster=dummy_cluster_config,
      )
-     directives = req.to_directives()
+     directives = req.batch_directives()
      assert "--ntasks=4" in directives


uv.lock
@@ -2251,7 +2251,7 @@ wheels = [

  [[package]]
  name = "xmanager-slurm"
- version = "0.4.15"
+ version = "0.4.17"
  source = { editable = "." }
  dependencies = [
      { name = "aiofile" },

xm_slurm/contrib/clusters/__init__.py
@@ -26,6 +26,7 @@ def mila(
          "/home/mila/${USER:0:1}/$USER/.local/state/xm-slurm": "/xm-slurm-state",
          "/home/mila/${USER:0:1}/$USER/.ssh": "/home/mila/${USER:0:1}/$USER/.ssh",
      }
+     mounts = dict(mounts) | {"/dev/infiniband": "/dev/infiniband"}

      return config.SlurmClusterConfig(
          name="mila",

xm_slurm/contrib/clusters/drac.py
@@ -29,6 +29,7 @@ def _drac_cluster(
          "/home/$USER/.ssh": "/home/$USER/.ssh",
          "/home/$USER/.local/state/xm-slurm": "/xm-slurm-state",
      }
+     mounts = dict(mounts) | {"/dev/infiniband": "/dev/infiniband"}

      return config.SlurmClusterConfig(
          name=name,
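
Note: both cluster hunks add the InfiniBand device with the same dict-union idiom, building a new mapping rather than mutating the literal defined above it. A minimal sketch of that pattern (illustrative only; the surrounding cluster configuration is omitted and the example paths mirror the drac.py hunk):

    # Python 3.9+ dict union: `dict(mounts) | {...}` returns a fresh dict with the extra entry.
    mounts = {
        "/home/$USER/.ssh": "/home/$USER/.ssh",
        "/home/$USER/.local/state/xm-slurm": "/xm-slurm-state",
    }
    mounts = dict(mounts) | {"/dev/infiniband": "/dev/infiniband"}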

xm_slurm/executors.py
@@ -37,6 +37,7 @@ class Slurm(xm.Executor):
          switches: Maximum count of leaf switches desired for the job allocation.
          switches_grace_period: Maximum time to wait for that number of switches.
          bind: How to bind tasks to resource (memory, GPU, or generic resource).
+         bind_flag: Generic resource task binding options.
          account: The account to charge the job to.
          partition: The partition to run the job in.
          qos: The quality of service to run the job with.
@@ -59,6 +60,7 @@ class Slurm(xm.Executor):
      requirements: resources.JobRequirements
      time: dt.timedelta
      bind: tp.Mapping[ResourceBindType | str, str | None] | None = None
+     bind_flag: str | None = None

      # Placement
      account: str | None = None
@@ -109,6 +111,8 @@ class Slurm(xm.Executor):
                  )
                  if value is not None and not isinstance(value, str):
                      raise TypeError(f"bind value must be None or a string, got {type(value)}")
+         if self.bind_flag is not None and not isinstance(self.bind_flag, str):
+             raise TypeError(f"bind_flag must be a string, got {type(self.bind_flag)}")

          if not isinstance(self.timeout_signal, signal.Signals):
              raise TypeError(
@@ -133,28 +137,13 @@ class Slurm(xm.Executor):
      def Spec(cls, tag: str | None = None) -> SlurmSpec:
          return SlurmSpec(tag=tag)

-     def to_directives(self) -> list[str]:
+     def batch_directives(self) -> list[str]:
          # Job requirements
-         directives = self.requirements.to_directives()
+         directives = self.requirements.batch_directives()

          # Time
          directives.append(f"--time={utils.timestr_from_timedelta(self.time)}")

-         # Resource binding
-         if self.bind is not None:
-             for resource, value in self.bind.items():
-                 if value is None:
-                     value = "none"
-                 match resource:
-                     case resources.ResourceType.MEMORY | resources.ResourceType.RAM:
-                         directives.append(f"--mem-bind={value}")
-                     case resources.ResourceType.GPU:
-                         directives.append(f"--gpu-bind={value}")
-                     case str():
-                         directives.append(f"--tres-bind=gres/{resource}:{value}")
-                     case _:
-                         raise ValueError(f"Unsupported resource type {resource!r} for binding.")
-
          # Job dependency handling
          directives.append(
              f"--kill-on-invalid-dep={'yes' if self.kill_on_invalid_dependencies else 'no'}"
@@ -196,3 +185,26 @@ class Slurm(xm.Executor):
              directives.append("--no-requeue")

          return directives
+
+     def step_directives(self) -> list[str]:
+         directives = self.requirements.step_directives()
+
+         # Resource binding
+         if self.bind is not None:
+             for resource, value in self.bind.items():
+                 if value is None:
+                     value = "none"
+                 match resource:
+                     case resources.ResourceType.MEMORY | resources.ResourceType.RAM:
+                         directives.append(f"--mem-bind={value}")
+                     case resources.ResourceType.GPU:
+                         directives.append(f"--gpu-bind={value}")
+                     case str():
+                         directives.append(f"--tres-bind=gres/{resource}:{value}")
+                     case _:
+                         raise ValueError(f"Unsupported resource type {resource!r} for binding.")
+
+         if self.bind_flag is not None:
+             directives.append(f"--gres-flags={self.bind_flag}")
+
+         return directives
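
Note: the executors.py change splits directive generation in two. batch_directives() feeds the #SBATCH header (requirements, --time, dependency, signal, and requeue handling), while the new step_directives() carries per-step srun options: the bind mappings and the new bind_flag, emitted as --gres-flags. A usage sketch based on the signatures visible in the hunks and tests above (the import paths are inferred from the file layout, and the cluster object is an assumption; the tests construct it via fixtures):

    import datetime as dt
    from xm_slurm import executors, resources

    cluster = ...  # a config.SlurmClusterConfig (assumed; e.g. from xm_slurm.contrib.clusters)

    executor = executors.Slurm(
        requirements=resources.JobRequirements(cpu=4, gpu=1, cluster=cluster),
        time=dt.timedelta(hours=1),
        bind={resources.ResourceType.GPU: "closest"},
        bind_flag="enforce-binding",
    )

    executor.batch_directives()  # sbatch-level: --cpus-per-task=4, --gpus=1, --time=..., --signal=..., ...
    executor.step_directives()   # srun-level: --gpu-bind=closest, --gres-flags=enforce-binding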

xm_slurm/resources.py
@@ -232,7 +232,7 @@ class JobRequirements:
              raise ValueError(f"Replicas must be a positive integer, got {replicas!r}")
          self.replicas = replicas or 1

-     def to_directives(self) -> list[str]:
+     def batch_directives(self) -> list[str]:
          directives = []

          for resource, value in self.task_requirements.items():
@@ -302,6 +302,9 @@ class JobRequirements:

          return directives

+     def step_directives(self) -> list[str]:
+         return []
+
      def replace(
          self,
          replicas: int | None = None,

xm_slurm/templates/slurm/job-array.bash.j2
@@ -9,8 +9,11 @@
  srun \
    --label \
    --unbuffered \
-   --kill-on-bad-exit=0 \
+   --kill-on-bad-exit=1 \
    --export="ALL" \
+   {% for directive in job.executor.step_directives() %}
+   {{ directive }} \
+   {% endfor %}
    bash <<'SRUN_EOF' &
  set -Eeuxo pipefail


xm_slurm/templates/slurm/job-group.bash.j2
@@ -21,7 +21,9 @@
  {% else %}
  #SBATCH --job-name=xm[{{ job_name }}@{{ experiment_id }}]
  {% endif %}
- {{ job.executor.to_directives() | join("\n") }}
+ {% for directive in job.executor.batch_directives() %}
+ #SBATCH {{ directive }}
+ {% endfor %}
  {{ "\n#SBATCH hetjob\n" if not loop.last }}
  {% endfor %}
  {% endblock directives %}
@@ -31,8 +33,11 @@
  srun \
    --label \
    --unbuffered \
-   --kill-on-bad-exit=0 \
+   --kill-on-bad-exit=1 \
    --export="ALL" \
+   {% for directive in job.executor.step_directives() %}
+   {{ directive }} \
+   {% endfor %}
    --het-group={{ loop.index0 }} \
    bash <<'SRUN_EOF' &
  set -Eeuxo pipefail

xm_slurm/templates/slurm/job.bash.j2
@@ -21,7 +21,7 @@
  {% endif %}
  #SBATCH --job-name=xm[{{ experiment_id }}]
  {% endif %}
- {% for directive in job.executor.to_directives() %}
+ {% for directive in job.executor.batch_directives() %}
  #SBATCH {{ directive }}
  {% endfor %}
  {% endblock directives %}
@@ -61,8 +61,11 @@ export {{ key }}="{{ value }}"
  srun \
    --label \
    --unbuffered \
-   --kill-on-bad-exit=0 \
+   --kill-on-bad-exit=1 \
    --export="ALL" \
+   {% for directive in job.executor.step_directives() %}
+   {{ directive }} \
+   {% endfor %}
    bash <<'SRUN_EOF' &
  set -Eeuxo pipefail
  {{ run(cluster, job) }}

xm_slurm/templates/slurm/runtimes/apptainer.bash.j2
@@ -3,7 +3,9 @@

  # Bundle will be where our built sandbox image is stored
  # container-workdir will be our container's scratch directory
- mkdir -p "$SLURM_TMPDIR"/{container,container-workdir,container-overlay}
+ # TODO(jfarebro): We can make this more efficient by doing an srun per node and downloading the container once per node.
+ # but this requires apptainer support to have an overlay per procid
+ mkdir -p "$SLURM_TMPDIR"/{container-"$SLURM_PROCID",container-workdir-"$SLURM_PROCID",container-overlay-"$SLURM_PROCID"}

  retry -c 255 -n 10 -d 1 -b 2 -- \
  {% if job.executable.credentials %}
@@ -14,19 +16,21 @@ retry -c 255 -n 10 -d 1 -b 2 -- \
    --force \
    --sandbox \
    --fix-perms \
-   "$SLURM_TMPDIR"/container \
+   "$SLURM_TMPDIR"/container-"$SLURM_PROCID" \
    docker://{{ job.executable.image }}

  {% if runtime == "singularity" and cluster.mounts %}
  {% for source, dest in cluster.mounts.items() %}
- mkdir -p "$SLURM_TMPDIR"/container/{{ dest | trim('/') }}
+ mkdir -p "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/{{ dest | trim('/') }}
  {% endfor %}
  {% endif %}

- cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/container/xm-slurm-entrypoint.sh
+ cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/xm-slurm-entrypoint.sh
  {{ entrypoint(cluster, job) }}
  ENTRYPOINT_EOF
- chmod +x "$SLURM_TMPDIR"/container/xm-slurm-entrypoint.sh
+ chmod +x "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/xm-slurm-entrypoint.sh
+
+ for var in "${!SLURM_@}"; do export "{{ runtime | upper }}ENV_${var}=${!var}"; done

  exec {{ runtime }} exec \
  {% if job.executor.requirements.accelerator %}
@@ -45,16 +49,16 @@ exec {{ runtime }} exec \
    --bind {{ source }}:{{ dest }} \
  {% endfor %}
  {% endif %}
-   --workdir "$SLURM_TMPDIR"/container-workdir \
+   --workdir "$SLURM_TMPDIR"/container-workdir-"$SLURM_PROCID" \
  {% if (cluster.runtime | string) == "apptainer" %}
-   --overlay "$SLURM_TMPDIR"/container-overlay \
+   --overlay "$SLURM_TMPDIR"/container-overlay-"$SLURM_PROCID" \
  {% else %}
    --writable \
  {% endif %}
  {% if job.executable.workdir %}
    --pwd {{ job.executable.workdir }} \
  {% endif %}
-   "$SLURM_TMPDIR"/container \
+   "$SLURM_TMPDIR"/container-"$SLURM_PROCID" \
    /xm-slurm-entrypoint.sh \
  {% for arg in job.executable.args.to_list() %}
    {{ arg }} \
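
Note: both runtime templates now forward the job's SLURM_* environment into the container. The apptainer/singularity template does it with the "${!SLURM_@}" export loop above (Apptainer and Singularity import variables carrying an APPTAINERENV_/SINGULARITYENV_ prefix), and the podman template adds --env "SLURM_*" below. A rough Python illustration of what that forwarding amounts to (explanatory only; the real work happens in the bash/Jinja lines):

    import os

    # Collect the Slurm-provided environment (SLURM_PROCID, SLURM_TMPDIR, ...).
    slurm_env = {k: v for k, v in os.environ.items() if k.startswith("SLURM_")}

    # Prefixed copies are what the template's export loop produces; the container
    # runtime strips the prefix and injects each variable into the container.
    forwarded = {f"APPTAINERENV_{k}": v for k, v in slurm_env.items()}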

xm_slurm/templates/slurm/runtimes/podman.bash.j2
@@ -16,6 +16,7 @@ exec podman run \
    --entrypoint /xm-slurm-entrypoint.sh \
    --pull never \
    --restart no \
+   --env "SLURM_*" \
    --rm \
  {% if job.executor.requirements.accelerator %}
    --device nvidia.com/gpu=all \