xmanager-slurm 0.4.16__tar.gz → 0.4.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xmanager-slurm might be problematic.
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/PKG-INFO +1 -1
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/pyproject.toml +1 -1
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/test_executors.py +25 -14
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/test_resources.py +34 -34
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/uv.lock +1 -1
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/executors.py +29 -17
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/resources.py +4 -1
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/job-array.bash.j2 +4 -1
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/job-group.bash.j2 +7 -2
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/job.bash.j2 +5 -2
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/.devcontainer.json +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/.github/workflows/ci.yml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/.github/workflows/deploy-docs.yml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/.gitignore +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/.pre-commit-config.yaml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/.python-version +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/.vscode/settings.json +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/LICENSE.md +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/README.md +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/docs/api/executables.rst +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/docs/api/executors.rst +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/docs/api/packageables.rst +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/docs/assets/workflow-dark.svg +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/docs/assets/workflow-light.svg +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/docs/conf.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/docs/getting-started/xmanager.md +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/docs/guides/index.md +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/docs/guides/remote-dev.md +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/docs/index.md +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/conda/environment.yml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/conda/launch.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/conda/main.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/conda/pyproject.toml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/custom-dockerfile/Dockerfile +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/custom-dockerfile/launch.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/custom-dockerfile/pyproject.toml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-array-sweep/launch.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-array-sweep/main.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-array-sweep/pyproject.toml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-array-sweep/uv.lock +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-dependencies/eval.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-dependencies/launch.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-dependencies/pyproject.toml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-dependencies/train.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-dependencies/uv.lock +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-group/Dockerfile +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-group/launch.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-group/pyproject.toml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-group/uv.lock +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-timeout/launch.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-timeout/main.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-timeout/pyproject.toml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/job-timeout/uv.lock +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/metadata/launch.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/metadata/main.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/metadata/pyproject.toml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/metadata/requirements.txt +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/parameter-controller/launch.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/parameter-controller/main.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/parameter-controller/pyproject.toml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/parameter-controller/requirements.txt +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/pip/launch.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/pip/main.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/pip/pyproject.toml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/pip/requirements.txt +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/uv/launch.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/uv/pyproject.toml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/examples/uv/uv.lock +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/conftest.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/Dockerfile +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/README.md +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/cgroup.conf +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/docker-compose.yml +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/docker-entrypoint.sh +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/host_ed25519 +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/host_ed25519.pub +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/id_ed25519 +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/id_ed25519.pub +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/slurm.conf +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/slurmdbd.conf +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/fixtures/slurm/sshd_config +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/integration/test_remote_execution.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/test_dependencies.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/test_utils.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/__init__.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/api/__init__.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/api/abc.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/api/models.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/api/sqlite/client.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/api/web/client.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/batching.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/config.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/console.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/constants.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/contrib/__init__.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/contrib/clusters/__init__.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/contrib/clusters/drac.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/dependencies.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/executables.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/execution.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/experiment.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/experimental/parameter_controller.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/filesystem.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/job_blocks.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/metadata_context.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/packageables.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/packaging/__init__.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/packaging/docker.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/packaging/registry.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/packaging/router.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/packaging/utils.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/scripts/_cloudpickle.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/scripts/cli.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/status.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/docker/docker-bake.hcl.j2 +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/docker/mamba.Dockerfile +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/docker/python.Dockerfile +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/docker/uv.Dockerfile +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/entrypoint.bash.j2 +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/fragments/monitor.bash.j2 +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/fragments/proxy.bash.j2 +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/library/retry.bash +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/runtimes/podman.bash.j2 +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/types.py +0 -0
- {xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/utils.py +0 -0

{xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/test_executors.py

@@ -226,7 +226,7 @@ def test_slurm_executor_bind_accepts_custom_strings(
 def test_slurm_executor_to_directives_default(basic_requirements):
     """Test directive generation with minimal configuration."""
     executor = executors.Slurm(requirements=basic_requirements, time=dt.timedelta(hours=1))
-    directives = executor.
+    directives = executor.batch_directives()
     assert any(d.startswith("--time=") for d in directives)
     assert any(d.startswith("--kill-on-invalid-dep=") for d in directives)
     assert any(d.startswith("--signal=") for d in directives)
@@ -244,7 +244,7 @@ def test_slurm_executor_to_directives_default(basic_requirements):
 def test_slurm_executor_to_directives_time(basic_requirements, time, expected):
     """Test time directive generation."""
     executor = executors.Slurm(requirements=basic_requirements, time=time)
-    directives = executor.
+    directives = executor.batch_directives()
     time_directive = [d for d in directives if d.startswith("--time=")][0]
     assert time_directive == expected

@@ -278,7 +278,7 @@ def test_slurm_executor_to_directives_executor_params(
         time=dt.timedelta(hours=1),
         **executor_kwargs,
     )
-    directives = executor.
+    directives = executor.batch_directives()
     assert expected_directive in directives


@@ -294,7 +294,7 @@ def test_slurm_executor_to_directives_reservation(basic_requirements, reservatio
     executor = executors.Slurm(
         requirements=basic_requirements, time=dt.timedelta(hours=1), reservation=reservation
     )
-    directives = executor.
+    directives = executor.batch_directives()
     reservation_directive = [d for d in directives if d.startswith("--reservation=")][0]
     assert reservation_directive == expected

@@ -321,7 +321,7 @@ def test_slurm_executor_to_directives_boolean_flags(
         time=dt.timedelta(hours=1),
         **flag_kwargs,
     )
-    directives = executor.
+    directives = executor.batch_directives()
     for expected in expected_directives:
         assert any(expected in d for d in directives)
     for unexpected in ["--exclusive", "--oversubscribe", "--overcommit"]:
@@ -348,7 +348,7 @@ def test_slurm_executor_to_directives_timeout_signal(
         timeout_signal=timeout_signal,
         timeout_signal_grace_period=grace_period,
     )
-    directives = executor.
+    directives = executor.batch_directives()
     signal_directive = [d for d in directives if d.startswith("--signal=")][0]
     assert signal_directive == f"--signal={expected_signal}"

@@ -369,7 +369,7 @@ def test_slurm_executor_to_directives_requeue(basic_requirements, requeue, max_a
         requeue=requeue,
         requeue_max_attempts=max_attempts,
     )
-    directives = executor.
+    directives = executor.batch_directives()
     requeue_directives = [d for d in directives if "requeue" in d]
     assert expected in requeue_directives

@@ -381,7 +381,7 @@ def test_slurm_executor_to_directives_bind_gpu(basic_requirements):
         time=dt.timedelta(hours=1),
         bind={resources.ResourceType.GPU: "closest"},
     )
-    directives = executor.
+    directives = executor.step_directives()
     gpu_bind_directive = [d for d in directives if d.startswith("--gpu-bind=")][0]
     assert gpu_bind_directive == "--gpu-bind=closest"

@@ -404,7 +404,7 @@ def test_slurm_executor_to_directives_bind_resource_types(
         time=dt.timedelta(hours=1),
         bind={resource: value},
     )
-    directives = executor.
+    directives = executor.step_directives()
     assert any(expected in d for d in directives)


@@ -425,14 +425,14 @@ def test_slurm_executor_to_directives_bind_custom_gres(
         time=dt.timedelta(hours=1),
         bind={custom_gres: value},
     )
-    directives = executor.
+    directives = executor.step_directives()
     assert any(expected in d for d in directives)


 def test_slurm_executor_to_directives_includes_requirements_directives(basic_requirements):
     """Test that to_directives includes directives from requirements."""
     executor = executors.Slurm(requirements=basic_requirements, time=dt.timedelta(hours=1))
-    directives = executor.
+    directives = executor.batch_directives()
     assert any(d.startswith("--cpus-per-task=") for d in directives)
     assert any(d.startswith("--mem-per-cpu=") for d in directives)

@@ -476,11 +476,22 @@ def test_slurm_executor_with_multiple_binds(basic_requirements):
             resources.ResourceType.MEMORY: "local",
         },
     )
-    directives = executor.
+    directives = executor.step_directives()
     assert any("--gpu-bind=closest" in d for d in directives)
     assert any("--mem-bind=local" in d for d in directives)


+def test_slurm_executor_bind_flag_sets_gres_flags(basic_requirements):
+    """Test that bind_flag produces the correct --gres-flags directive."""
+    executor = executors.Slurm(
+        requirements=basic_requirements,
+        time=dt.timedelta(hours=1),
+        bind_flag="enforce-binding",
+    )
+    directives = executor.step_directives()
+    assert "--gres-flags=enforce-binding" in directives
+
+
 @pytest.mark.parametrize("grace_period_secs", [1, 30, 3600])
 def test_slurm_executor_with_various_grace_periods(basic_requirements, grace_period_secs):
     """Test executor with various timeout grace periods."""
@@ -489,7 +500,7 @@ def test_slurm_executor_with_various_grace_periods(basic_requirements, grace_per
         time=dt.timedelta(hours=24),
         timeout_signal_grace_period=dt.timedelta(seconds=grace_period_secs),
     )
-    directives = executor.
+    directives = executor.batch_directives()
     signal_directive = [d for d in directives if d.startswith("--signal=")][0]
     assert f"@{grace_period_secs}" in signal_directive

@@ -520,7 +531,7 @@ def test_slurm_executor_multiple_signal_types(basic_requirements, sig):
         time=dt.timedelta(hours=1),
         timeout_signal=sig,
     )
-    directives = executor.
+    directives = executor.batch_directives()
     signal_directive = [d for d in directives if d.startswith("--signal=")][0]
     expected_sig_name = sig.name.removeprefix("SIG")
     assert expected_sig_name in signal_directive
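
Taken together, these test changes show the shape of the new API: allocation-level options now come from `batch_directives()`, while per-step options such as resource binding and the new `bind_flag` come from `step_directives()`. A minimal sketch in the style of the tests above, reusing the `basic_requirements` fixture from `tests/test_executors.py` (the test below is illustrative and not part of the release):

```python
import datetime as dt

from xm_slurm import executors, resources


def test_batch_and_step_directives_split(basic_requirements):
    executor = executors.Slurm(
        requirements=basic_requirements,
        time=dt.timedelta(hours=1),
        bind={resources.ResourceType.GPU: "closest"},
        bind_flag="enforce-binding",
    )
    # #SBATCH-level options come from batch_directives() ...
    assert any(d.startswith("--time=") for d in executor.batch_directives())
    # ... while srun-level binding options come from step_directives().
    step = executor.step_directives()
    assert "--gpu-bind=closest" in step
    assert "--gres-flags=enforce-binding" in step
```
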
{xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/tests/test_resources.py

@@ -277,7 +277,7 @@ def test_job_requirements_to_directives_cpu(
 ) -> None:
     """Test CPU directive generation."""
     req = resources.JobRequirements(cpu=4, cluster=dummy_cluster_config)
-    directives = req.
+    directives = req.batch_directives()
     assert "--cpus-per-task=4" in directives


@@ -290,7 +290,7 @@ def test_job_requirements_to_directives_memory(
         memory=8 * 1024**3,  # 8GB
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     # 8GB / 2 CPUs / 2^20 bytes per MB = 4096 MB
     assert "--mem-per-cpu=4096M" in directives

@@ -303,7 +303,7 @@ def test_job_requirements_to_directives_memory_with_single_cpu(
         memory=2 * 1024**3,  # 2GB
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     # 2GB / 1 CPU / 2^20 = 2048 MB
     assert "--mem-per-cpu=2048M" in directives

@@ -317,7 +317,7 @@ def test_job_requirements_to_directives_ram_alias(
         ram=8 * 1024**3,
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert "--mem-per-cpu=4096M" in directives


@@ -329,7 +329,7 @@ def test_job_requirements_to_directives_disk(
         disk=1024**3,  # 1GB
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     # 1GB / 2^20 MB = 1024 MB
     assert "--tmp=1024M" in directives

@@ -342,7 +342,7 @@ def test_job_requirements_to_directives_ephemeral_storage_alias(
         ephemeral_storage=2 * 1024**3,  # 2GB
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     # 2GB / 2^20 MB = 2048 MB
     assert "--tmp=2048M" in directives

@@ -355,7 +355,7 @@ def test_job_requirements_to_directives_generic_gpu(
         gpu=4,
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert "--gpus=4" in directives


@@ -367,7 +367,7 @@ def test_job_requirements_to_directives_specific_gpu(
         a100=2,
         cluster=cluster_with_gpu_mapping,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert "--gpus=a100:2" in directives


@@ -394,7 +394,7 @@ def test_job_requirements_to_directives_various_gpu_types(
     )
     kwargs = {accelerator_type.name.lower(): count}
     req = resources.JobRequirements(**kwargs, cluster=cluster)  # type: ignore
-    directives = req.
+    directives = req.batch_directives()
     assert f"--gpus={gpu_name}:{count}" in directives


@@ -407,7 +407,7 @@ def test_job_requirements_to_directives_unmapped_gpu_raises_error(
         cluster=dummy_cluster_config,
     )
     with pytest.raises(ValueError, match="does not map resource type"):
-        req.
+        req.batch_directives()


 def test_job_requirements_to_directives_custom_gres(
@@ -418,7 +418,7 @@ def test_job_requirements_to_directives_custom_gres(
         custom_resource="2",  # type: ignore
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert "--gres=custom_resource:2" in directives


@@ -431,7 +431,7 @@ def test_job_requirements_to_directives_replicas(
         replicas=8,
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert "--ntasks=8" in directives


@@ -444,7 +444,7 @@ def test_job_requirements_to_directives_location(
         location="node[001-005]",
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert "--nodelist=node[001-005]" in directives


@@ -461,7 +461,7 @@ def test_job_requirements_to_directives_combined_all_resources(
         location="gpu-nodes[001-004]",
         cluster=cluster_with_gpu_mapping,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert "--cpus-per-task=8" in directives
     assert "--mem-per-cpu=8192M" in directives  # 64GB / 8 CPUs / 2^20
     assert "--tmp=512000M" in directives  # 500GB / 2^20
@@ -655,7 +655,7 @@ def test_job_requirements_invalid_memory_type(
     req = resources.JobRequirements(cluster=dummy_cluster_config)
     req.task_requirements[resources.ResourceType.MEMORY] = "invalid"  # type: ignore
     with pytest.raises(AssertionError, match="Memory must be an integer or float"):
-        req.
+        req.batch_directives()


 def test_job_requirements_invalid_cpu_type(
@@ -665,7 +665,7 @@ def test_job_requirements_invalid_cpu_type(
     req = resources.JobRequirements(cluster=dummy_cluster_config)
     req.task_requirements[resources.ResourceType.CPU] = 4.5  # type: ignore
     with pytest.raises(AssertionError, match="CPU must be an integer"):
-        req.
+        req.batch_directives()


 def test_job_requirements_invalid_gpu_type(
@@ -675,7 +675,7 @@ def test_job_requirements_invalid_gpu_type(
     req = resources.JobRequirements(cluster=dummy_cluster_config)
     req.task_requirements[resources.ResourceType.GPU] = 1.5  # type: ignore
     with pytest.raises(AssertionError, match="GPU must be an integer"):
-        req.
+        req.batch_directives()


 def test_job_requirements_invalid_disk_type(
@@ -685,7 +685,7 @@ def test_job_requirements_invalid_disk_type(
     req = resources.JobRequirements(cluster=dummy_cluster_config)
     req.task_requirements[resources.ResourceType.DISK] = 1.5  # type: ignore
     with pytest.raises(AssertionError, match="Disk space must be an integer"):
-        req.
+        req.batch_directives()


 def test_job_requirements_invalid_replicas(
@@ -695,7 +695,7 @@ def test_job_requirements_invalid_replicas(
     req = resources.JobRequirements(cluster=dummy_cluster_config)
     req.replicas = -1
     with pytest.raises(AssertionError, match="Replicas must be a positive integer"):
-        req.
+        req.batch_directives()


 def test_job_requirements_invalid_replicas_zero(
@@ -705,7 +705,7 @@ def test_job_requirements_invalid_replicas_zero(
     req = resources.JobRequirements(cluster=dummy_cluster_config)
     req.replicas = 0
     with pytest.raises(AssertionError, match="Replicas must be a positive integer"):
-        req.
+        req.batch_directives()


 def test_job_requirements_invalid_location_type(
@@ -715,7 +715,7 @@ def test_job_requirements_invalid_location_type(
     req = resources.JobRequirements(cluster=dummy_cluster_config)
     req.location = 123  # type: ignore
     with pytest.raises(AssertionError, match="Location must be a string"):
-        req.
+        req.batch_directives()


 @pytest.mark.parametrize(
@@ -740,7 +740,7 @@ def test_job_requirements_memory_directive_calculation(
         memory=memory_bytes,
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert f"--mem-per-cpu={expected_mem_per_cpu_mb}M" in directives


@@ -763,7 +763,7 @@ def test_job_requirements_disk_directive_calculation(
         disk=disk_bytes,
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert f"--tmp={expected_tmp_mb}M" in directives


@@ -776,7 +776,7 @@ def test_job_requirements_fractional_memory_rounds_correctly(
         memory=10 * 1024**3,  # 10GB / 3 = 3.33GB per CPU
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     # math.ceil(10 * 1024^3 / 3 / 2^20) = math.ceil(3413.33...) = 3414
     assert "--mem-per-cpu=3414M" in directives

@@ -789,7 +789,7 @@ def test_job_requirements_fractional_disk_rounds_correctly(
         disk=5 * 1024**3 + 512 * 1024**2,  # 5.5GB
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     # math.ceil((5*1024^3 + 512*1024^2) / 2^20) = math.ceil(5632) = 5632
     assert "--tmp=5632M" in directives

@@ -799,7 +799,7 @@ def test_job_requirements_empty_task_requirements(
 ) -> None:
     """Test job requirements with no specific resources."""
     req = resources.JobRequirements(cluster=dummy_cluster_config)
-    directives = req.
+    directives = req.batch_directives()
     # Should still have ntasks directive
     assert "--ntasks=1" in directives
     assert len(directives) == 1
@@ -835,8 +835,8 @@ def test_job_requirements_multiple_directives_order_independence(
         memory=4 * 1024**3,
         cluster=cluster_with_gpu_mapping,
     )
-    directives1 = set(req1.
-    directives2 = set(req2.
+    directives1 = set(req1.batch_directives())
+    directives2 = set(req2.batch_directives())
     assert directives1 == directives2


@@ -1150,7 +1150,7 @@ def test_topology_directive_gpus_per_task(dummy_cluster_config: config.SlurmClus
         gpu=topo,
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert "--gpus-per-task=4" in directives


@@ -1163,7 +1163,7 @@ def test_topology_directive_gpus_per_task_with_specific_accelerator(
         a100=topo,
         cluster=cluster_with_gpu_mapping,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert "--gpus-per-task=a100:2" in directives


@@ -1176,7 +1176,7 @@ def test_topology_directive_ntasks_from_topology(
         gpu=topo,
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert "--ntasks=3" in directives


@@ -1187,7 +1187,7 @@ def test_topology_directive_switches(dummy_cluster_config: config.SlurmClusterCo
         gpu=topo,
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert any("--switches=" in d for d in directives)


@@ -1201,7 +1201,7 @@ def test_topology_directive_switches_with_timeout(
         gpu=topo,
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     # Should have format like "--switches=2@5:00" (5 minutes in SLURM format)
     switches_directives = [d for d in directives if "--switches=" in d]
     assert len(switches_directives) > 0
@@ -1218,7 +1218,7 @@ def test_topology_1d_topology_no_ntasks_override(
         replicas=4,
         cluster=dummy_cluster_config,
     )
-    directives = req.
+    directives = req.batch_directives()
     assert "--ntasks=4" in directives

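
For readers checking the expected values in these assertions: the --mem-per-cpu and --tmp figures follow the same ceiling division over MiB spelled out in the test comments. A standalone arithmetic check (plain Python, nothing from xm_slurm needed):

```python
import math

# 10 GiB split across 3 CPUs -> math.ceil(3413.33...) = 3414 -> "--mem-per-cpu=3414M"
assert math.ceil(10 * 1024**3 / 3 / 2**20) == 3414

# 5.5 GiB of scratch -> exactly 5632 MiB -> "--tmp=5632M"
assert math.ceil((5 * 1024**3 + 512 * 1024**2) / 2**20) == 5632
```
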
{xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/executors.py

@@ -37,6 +37,7 @@ class Slurm(xm.Executor):
         switches: Maximum count of leaf switches desired for the job allocation.
         switches_grace_period: Maximum time to wait for that number of switches.
         bind: How to bind tasks to resource (memory, GPU, or generic resource).
+        bind_flag: Generic resource task binding options.
         account: The account to charge the job to.
         partition: The partition to run the job in.
         qos: The quality of service to run the job with.
@@ -59,6 +60,7 @@ class Slurm(xm.Executor):
     requirements: resources.JobRequirements
     time: dt.timedelta
     bind: tp.Mapping[ResourceBindType | str, str | None] | None = None
+    bind_flag: str | None = None

     # Placement
     account: str | None = None
@@ -109,6 +111,8 @@ class Slurm(xm.Executor):
                     )
                 if value is not None and not isinstance(value, str):
                     raise TypeError(f"bind value must be None or a string, got {type(value)}")
+        if self.bind_flag is not None and not isinstance(self.bind_flag, str):
+            raise TypeError(f"bind_flag must be a string, got {type(self.bind_flag)}")

         if not isinstance(self.timeout_signal, signal.Signals):
             raise TypeError(
@@ -133,28 +137,13 @@ class Slurm(xm.Executor):
     def Spec(cls, tag: str | None = None) -> SlurmSpec:
         return SlurmSpec(tag=tag)

-    def 
+    def batch_directives(self) -> list[str]:
         # Job requirements
-        directives = self.requirements.
+        directives = self.requirements.batch_directives()

         # Time
         directives.append(f"--time={utils.timestr_from_timedelta(self.time)}")

-        # Resource binding
-        if self.bind is not None:
-            for resource, value in self.bind.items():
-                if value is None:
-                    value = "none"
-                match resource:
-                    case resources.ResourceType.MEMORY | resources.ResourceType.RAM:
-                        directives.append(f"--mem-bind={value}")
-                    case resources.ResourceType.GPU:
-                        directives.append(f"--gpu-bind={value}")
-                    case str():
-                        directives.append(f"--tres-bind=gres/{resource}:{value}")
-                    case _:
-                        raise ValueError(f"Unsupported resource type {resource!r} for binding.")
-
         # Job dependency handling
         directives.append(
             f"--kill-on-invalid-dep={'yes' if self.kill_on_invalid_dependencies else 'no'}"
@@ -196,3 +185,26 @@ class Slurm(xm.Executor):
             directives.append("--no-requeue")

         return directives
+
+    def step_directives(self) -> list[str]:
+        directives = self.requirements.step_directives()
+
+        # Resource binding
+        if self.bind is not None:
+            for resource, value in self.bind.items():
+                if value is None:
+                    value = "none"
+                match resource:
+                    case resources.ResourceType.MEMORY | resources.ResourceType.RAM:
+                        directives.append(f"--mem-bind={value}")
+                    case resources.ResourceType.GPU:
+                        directives.append(f"--gpu-bind={value}")
+                    case str():
+                        directives.append(f"--tres-bind=gres/{resource}:{value}")
+                    case _:
+                        raise ValueError(f"Unsupported resource type {resource!r} for binding.")
+
+        if self.bind_flag is not None:
+            directives.append(f"--gres-flags={self.bind_flag}")
+
+        return directives
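
As a reading aid for the relocated binding logic, here is a standalone mirror of the mapping that `step_directives()` applies. It is a simplified sketch rather than the library code: plain strings stand in for `resources.ResourceType` members, and the `nvme` gres name is made up for illustration.

```python
def step_directives(bind: dict[str, str | None], bind_flag: str | None) -> list[str]:
    """Mirror of the bind -> srun option mapping shown in the diff above."""
    directives: list[str] = []
    for resource, value in bind.items():
        value = "none" if value is None else value
        match resource:
            case "MEMORY" | "RAM":
                directives.append(f"--mem-bind={value}")
            case "GPU":
                directives.append(f"--gpu-bind={value}")
            case str():
                # Any other string is treated as a generic resource (gres) name.
                directives.append(f"--tres-bind=gres/{resource}:{value}")
    if bind_flag is not None:
        directives.append(f"--gres-flags={bind_flag}")
    return directives


print(step_directives({"GPU": "closest", "nvme": "single"}, "enforce-binding"))
# ['--gpu-bind=closest', '--tres-bind=gres/nvme:single', '--gres-flags=enforce-binding']
```
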
{xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/resources.py

@@ -232,7 +232,7 @@ class JobRequirements:
             raise ValueError(f"Replicas must be a positive integer, got {replicas!r}")
         self.replicas = replicas or 1

-    def 
+    def batch_directives(self) -> list[str]:
         directives = []

         for resource, value in self.task_requirements.items():
@@ -302,6 +302,9 @@ class JobRequirements:

         return directives

+    def step_directives(self) -> list[str]:
+        return []
+
     def replace(
         self,
         replicas: int | None = None,
{xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/job-array.bash.j2

@@ -9,8 +9,11 @@
 srun \
   --label \
   --unbuffered \
-  --kill-on-bad-exit=
+  --kill-on-bad-exit=1 \
   --export="ALL" \
+  {% for directive in job.executor.step_directives() %}
+  {{ directive }} \
+  {% endfor %}
   bash <<'SRUN_EOF' &
 set -Eeuxo pipefail

{xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/job-group.bash.j2

@@ -21,7 +21,9 @@
 {% else %}
 #SBATCH --job-name=xm[{{ job_name }}@{{ experiment_id }}]
 {% endif %}
-{
+{% for directive in job.executor.batch_directives() %}
+#SBATCH {{ directive }}
+{% endfor %}
 {{ "\n#SBATCH hetjob\n" if not loop.last }}
 {% endfor %}
 {% endblock directives %}
@@ -31,8 +33,11 @@
 srun \
   --label \
   --unbuffered \
-  --kill-on-bad-exit=
+  --kill-on-bad-exit=1 \
   --export="ALL" \
+  {% for directive in job.executor.step_directives() %}
+  {{ directive }} \
+  {% endfor %}
   --het-group={{ loop.index0 }} \
   bash <<'SRUN_EOF' &
 set -Eeuxo pipefail
{xmanager_slurm-0.4.16 → xmanager_slurm-0.4.17}/xm_slurm/templates/slurm/job.bash.j2

@@ -21,7 +21,7 @@
 {% endif %}
 #SBATCH --job-name=xm[{{ experiment_id }}]
 {% endif %}
-{% for directive in job.executor.
+{% for directive in job.executor.batch_directives() %}
 #SBATCH {{ directive }}
 {% endfor %}
 {% endblock directives %}
@@ -61,8 +61,11 @@ export {{ key }}="{{ value }}"
 srun \
   --label \
   --unbuffered \
-  --kill-on-bad-exit=
+  --kill-on-bad-exit=1 \
   --export="ALL" \
+  {% for directive in job.executor.step_directives() %}
+  {{ directive }} \
+  {% endfor %}
   bash <<'SRUN_EOF' &
 set -Eeuxo pipefail
 {{ run(cluster, job) }}
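
To see what the template change amounts to at render time, here is a rough, self-contained sketch (not the shipped template) of how the `step_directives()` loop splices per-step options into the `srun` invocation. It assumes only jinja2, which the real templates already use, and a placeholder `job.sh`.

```python
import jinja2

# Simplified stand-in for the srun block in the templates above.
template = jinja2.Template(
    "srun \\\n"
    "  --label \\\n"
    "  --unbuffered \\\n"
    "  --kill-on-bad-exit=1 \\\n"
    '  --export="ALL" \\\n'
    "{% for directive in directives %}"
    "  {{ directive }} \\\n"
    "{% endfor %}"
    "  bash job.sh"
)

# Per-step options come from the executor's step_directives() in the real templates.
print(template.render(directives=["--gpu-bind=closest", "--gres-flags=enforce-binding"]))
```
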