xmanager-slurm 0.4.8__tar.gz → 0.4.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xmanager-slurm might be problematic. Click here for more details.
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/PKG-INFO +1 -1
- xmanager_slurm-0.4.10/examples/job-timeout/launch.py +53 -0
- xmanager_slurm-0.4.10/examples/job-timeout/main.py +11 -0
- xmanager_slurm-0.4.10/examples/uv/pyproject.toml +6 -0
- xmanager_slurm-0.4.10/examples/uv/uv.lock +66 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/pyproject.toml +1 -1
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/uv.lock +2 -1
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/executors.py +5 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/packaging/docker.py +5 -8
- xmanager_slurm-0.4.10/xm_slurm/templates/slurm/fragments/monitor.bash.j2 +76 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/job.bash.j2 +1 -1
- xmanager_slurm-0.4.8/xm_slurm/templates/slurm/fragments/monitor.bash.j2 +0 -37
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.devcontainer.json +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.github/workflows/ci.yml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.github/workflows/deploy-docs.yml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.gitignore +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.pdm-build/.gitignore +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.pdm-build/xm_slurm-0.1+editable.dist-info/METADATA +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.pdm-build/xm_slurm-0.1+editable.dist-info/WHEEL +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.pdm-build/xm_slurm.pth +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.pre-commit-config.yaml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.python-version +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.vscode/settings.json +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/LICENSE.md +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/README.md +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/api/executables.rst +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/api/executors.rst +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/api/packageables.rst +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/assets/workflow-dark.svg +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/assets/workflow-light.svg +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/conf.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/getting-started/xmanager.md +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/guides/index.md +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/guides/remote-dev.md +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/docs/index.md +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/conda/environment.yml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/conda/launch.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/conda/main.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/conda/pyproject.toml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/custom-dockerfile/Dockerfile +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/custom-dockerfile/launch.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/custom-dockerfile/pyproject.toml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-array-sweep/launch.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-array-sweep/main.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-array-sweep/pyproject.toml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-array-sweep/uv.lock +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-dependencies/eval.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-dependencies/launch.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-dependencies/pyproject.toml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-dependencies/train.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-dependencies/uv.lock +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-group/Dockerfile +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-group/launch.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-group/pyproject.toml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/job-group/uv.lock +0 -0
- {xmanager_slurm-0.4.8/examples/uv → xmanager_slurm-0.4.10/examples/job-timeout}/pyproject.toml +0 -0
- {xmanager_slurm-0.4.8/examples/uv → xmanager_slurm-0.4.10/examples/job-timeout}/uv.lock +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/metadata/launch.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/metadata/main.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/metadata/pyproject.toml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/metadata/requirements.txt +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/parameter-controller/launch.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/parameter-controller/main.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/parameter-controller/pyproject.toml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/parameter-controller/requirements.txt +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/pip/launch.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/pip/main.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/pip/pyproject.toml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/pip/requirements.txt +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/uv/launch.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/conftest.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/Dockerfile +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/README.md +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/cgroup.conf +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/docker-compose.yml +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/docker-entrypoint.sh +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/host_ed25519 +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/host_ed25519.pub +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/id_ed25519 +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/id_ed25519.pub +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/slurm.conf +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/slurmdbd.conf +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/sshd_config +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/test_remote_execution.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/test_dependencies.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/__init__.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/api/__init__.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/api/abc.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/api/models.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/api/sqlite/client.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/api/web/client.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/batching.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/config.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/console.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/constants.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/contrib/__init__.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/contrib/clusters/__init__.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/contrib/clusters/drac.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/dependencies.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/executables.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/execution.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/experiment.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/experimental/parameter_controller.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/filesystem.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/job_blocks.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/metadata_context.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/packageables.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/packaging/__init__.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/packaging/registry.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/packaging/router.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/packaging/utils.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/resources.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/scripts/_cloudpickle.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/scripts/cli.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/status.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/docker/docker-bake.hcl.j2 +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/docker/mamba.Dockerfile +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/docker/python.Dockerfile +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/docker/uv.Dockerfile +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/entrypoint.bash.j2 +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/fragments/proxy.bash.j2 +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/job-array.bash.j2 +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/job-group.bash.j2 +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/runtimes/podman.bash.j2 +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/types.py +0 -0
- {xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/utils.py +0 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import datetime as dt
|
|
2
|
+
|
|
3
|
+
from absl import app
|
|
4
|
+
from xmanager import xm
|
|
5
|
+
|
|
6
|
+
import xm_slurm
|
|
7
|
+
import xm_slurm.contrib.clusters
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@xm.run_in_asyncio_loop
|
|
11
|
+
async def main(_):
|
|
12
|
+
async with xm_slurm.create_experiment("My Experiment") as experiment:
|
|
13
|
+
# Step 1: Specify the executor specification
|
|
14
|
+
executor_spec = xm_slurm.Slurm.Spec(tag="ghcr.io/jessefarebro/xm-slurm/test:latest")
|
|
15
|
+
|
|
16
|
+
# Step 2: Specify the executable and package it
|
|
17
|
+
[executable] = experiment.package(
|
|
18
|
+
[
|
|
19
|
+
xm_slurm.uv_container(
|
|
20
|
+
executor_spec=executor_spec,
|
|
21
|
+
# Equivalent of running `main.py 900`
|
|
22
|
+
entrypoint=xm.CommandList(["main.py", "900"]),
|
|
23
|
+
),
|
|
24
|
+
],
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Step 3: Construct requirements & executor
|
|
28
|
+
requirements = xm_slurm.JobRequirements(
|
|
29
|
+
CPU=1,
|
|
30
|
+
RAM=1.0 * xm.GiB,
|
|
31
|
+
GPU=1,
|
|
32
|
+
replicas=1,
|
|
33
|
+
cluster=xm_slurm.contrib.clusters.mila(),
|
|
34
|
+
)
|
|
35
|
+
executor = xm_slurm.Slurm(
|
|
36
|
+
requirements=requirements,
|
|
37
|
+
time=dt.timedelta(minutes=10),
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
# Step 4: Schedule job
|
|
41
|
+
wu = await experiment.add(
|
|
42
|
+
xm.Job(
|
|
43
|
+
executable=executable,
|
|
44
|
+
executor=executor,
|
|
45
|
+
)
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
await wu.wait_until_complete()
|
|
49
|
+
print(f"Job finished executing with status {await wu.get_status()}")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
if __name__ == "__main__":
|
|
53
|
+
app.run(main)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
version = 1
|
|
2
|
+
requires-python = ">=3.10"
|
|
3
|
+
|
|
4
|
+
[[package]]
|
|
5
|
+
name = "markdown-it-py"
|
|
6
|
+
version = "3.0.0"
|
|
7
|
+
source = { registry = "https://pypi.org/simple" }
|
|
8
|
+
dependencies = [
|
|
9
|
+
{ name = "mdurl" },
|
|
10
|
+
]
|
|
11
|
+
sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596 }
|
|
12
|
+
wheels = [
|
|
13
|
+
{ url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528 },
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[[package]]
|
|
17
|
+
name = "mdurl"
|
|
18
|
+
version = "0.1.2"
|
|
19
|
+
source = { registry = "https://pypi.org/simple" }
|
|
20
|
+
sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729 }
|
|
21
|
+
wheels = [
|
|
22
|
+
{ url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 },
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[[package]]
|
|
26
|
+
name = "pygments"
|
|
27
|
+
version = "2.18.0"
|
|
28
|
+
source = { registry = "https://pypi.org/simple" }
|
|
29
|
+
sdist = { url = "https://files.pythonhosted.org/packages/8e/62/8336eff65bcbc8e4cb5d05b55faf041285951b6e80f33e2bff2024788f31/pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199", size = 4891905 }
|
|
30
|
+
wheels = [
|
|
31
|
+
{ url = "https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a", size = 1205513 },
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[[package]]
|
|
35
|
+
name = "rich"
|
|
36
|
+
version = "13.9.2"
|
|
37
|
+
source = { registry = "https://pypi.org/simple" }
|
|
38
|
+
dependencies = [
|
|
39
|
+
{ name = "markdown-it-py" },
|
|
40
|
+
{ name = "pygments" },
|
|
41
|
+
{ name = "typing-extensions", marker = "python_full_version < '3.11'" },
|
|
42
|
+
]
|
|
43
|
+
sdist = { url = "https://files.pythonhosted.org/packages/aa/9e/1784d15b057b0075e5136445aaea92d23955aad2c93eaede673718a40d95/rich-13.9.2.tar.gz", hash = "sha256:51a2c62057461aaf7152b4d611168f93a9fc73068f8ded2790f29fe2b5366d0c", size = 222843 }
|
|
44
|
+
wheels = [
|
|
45
|
+
{ url = "https://files.pythonhosted.org/packages/67/91/5474b84e505a6ccc295b2d322d90ff6aa0746745717839ee0c5fb4fdcceb/rich-13.9.2-py3-none-any.whl", hash = "sha256:8c82a3d3f8dcfe9e734771313e606b39d8247bb6b826e196f4914b333b743cf1", size = 242117 },
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
[[package]]
|
|
49
|
+
name = "typing-extensions"
|
|
50
|
+
version = "4.12.2"
|
|
51
|
+
source = { registry = "https://pypi.org/simple" }
|
|
52
|
+
sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 }
|
|
53
|
+
wheels = [
|
|
54
|
+
{ url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 },
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
[[package]]
|
|
58
|
+
name = "xm-slurm-example"
|
|
59
|
+
version = "0.0.1"
|
|
60
|
+
source = { virtual = "." }
|
|
61
|
+
dependencies = [
|
|
62
|
+
{ name = "rich" },
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
[package.metadata]
|
|
66
|
+
requires-dist = [{ name = "rich" }]
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
version = 1
|
|
2
|
+
revision = 1
|
|
2
3
|
requires-python = ">=3.10"
|
|
3
4
|
resolution-markers = [
|
|
4
5
|
"python_full_version < '3.11'",
|
|
@@ -2250,7 +2251,7 @@ wheels = [
|
|
|
2250
2251
|
|
|
2251
2252
|
[[package]]
|
|
2252
2253
|
name = "xmanager-slurm"
|
|
2253
|
-
version = "0.4.8"
|
|
2254
|
+
version = "0.4.10"
|
|
2254
2255
|
source = { editable = "." }
|
|
2255
2256
|
dependencies = [
|
|
2256
2257
|
{ name = "aiofile" },
|
|
@@ -57,8 +57,13 @@ class Slurm(xm.Executor):
|
|
|
57
57
|
|
|
58
58
|
requeue: bool = True # Is this job eligible for requeueing?
|
|
59
59
|
requeue_on_exit_code: int = 42 # The exit code that triggers requeueing
|
|
60
|
+
requeue_on_timeout: bool = True # Should the job requeue upon timeout minus the grace period
|
|
60
61
|
requeue_max_attempts: int = 5 # How many times to attempt requeueing
|
|
61
62
|
|
|
63
|
+
@property
|
|
64
|
+
def requeue_timeout(self) -> dt.timedelta:
|
|
65
|
+
return self.time - self.timeout_signal_grace_period
|
|
66
|
+
|
|
62
67
|
def __post_init__(self) -> None:
|
|
63
68
|
if not isinstance(self.time, dt.timedelta):
|
|
64
69
|
raise TypeError(f"time must be a `datetime.timedelta`, got {type(self.time)}")
|
|
@@ -139,15 +139,12 @@ class DockerClient:
|
|
|
139
139
|
)
|
|
140
140
|
return _parse_credentials_from_config(podman_config_path)
|
|
141
141
|
|
|
142
|
-
def inspect(
|
|
143
|
-
self, image: ImageURI, element: str | None = None, type: tp.Literal["image"] = "image"
|
|
144
|
-
) -> dict[str, tp.Any]:
|
|
142
|
+
def inspect(self, image: ImageURI, element: str) -> dict[str, tp.Any]:
|
|
145
143
|
output = utils.run_command(
|
|
146
144
|
xm.merge_args(
|
|
147
145
|
self._client_call,
|
|
148
|
-
["inspect"],
|
|
149
|
-
["--format", f"{{{{json .{element}}}}}"]
|
|
150
|
-
["--type", type] if type else [],
|
|
146
|
+
["buildx", "imagetools", "inspect"],
|
|
147
|
+
["--format", f"{{{{json .{element}}}}}"],
|
|
151
148
|
[str(image)],
|
|
152
149
|
),
|
|
153
150
|
check=True,
|
|
@@ -259,7 +256,7 @@ class DockerClient:
|
|
|
259
256
|
uri = ImageURI(target.value.executor_spec.tag).with_digest(
|
|
260
257
|
executable_metadata["containerimage.digest"]
|
|
261
258
|
)
|
|
262
|
-
config = self.inspect(uri, "Config")
|
|
259
|
+
config = self.inspect(uri, "Image.Config")
|
|
263
260
|
if "WorkingDir" not in config:
|
|
264
261
|
raise ValueError(
|
|
265
262
|
"Docker image does not have a working directory. "
|
|
@@ -320,7 +317,7 @@ def _(
|
|
|
320
317
|
|
|
321
318
|
uri = ImageURI(target.value.executable_spec.image)
|
|
322
319
|
|
|
323
|
-
config = client.inspect(uri, "Config")
|
|
320
|
+
config = client.inspect(uri, "Image.Config")
|
|
324
321
|
if "WorkingDir" not in config:
|
|
325
322
|
raise ValueError(
|
|
326
323
|
"Docker image does not have a working directory. "
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
{% macro monitor(requeue_max_attempts, requeue_exit_code, requeue_on_timeout, requeue_timeout) -%}
|
|
2
|
+
__xm_slurm_wait_for_children() {
|
|
3
|
+
if [[ -n "${SLURM_ARRAY_JOB_ID:-}" ]]; then
|
|
4
|
+
local -r JOB_ID="${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}"
|
|
5
|
+
else
|
|
6
|
+
local -r JOB_ID="${SLURM_JOB_ID}"
|
|
7
|
+
fi
|
|
8
|
+
|
|
9
|
+
# If there are no child jobs we should error out
|
|
10
|
+
children=( $(jobs -p) )
|
|
11
|
+
{% raw %}
|
|
12
|
+
if [ ${#children[@]} -eq 0 ]; then
|
|
13
|
+
{% endraw %}
|
|
14
|
+
echo "ERROR: no child jobs exist..." >&2
|
|
15
|
+
exit 1
|
|
16
|
+
fi
|
|
17
|
+
|
|
18
|
+
{% if requeue_on_timeout %}
|
|
19
|
+
# Start a watchdog process to signal timeout.
|
|
20
|
+
sleep {{ requeue_timeout }} &
|
|
21
|
+
timeout_pid=$!
|
|
22
|
+
{% endif %}
|
|
23
|
+
|
|
24
|
+
{% raw %}
|
|
25
|
+
while [ ${#children[@]} -gt 0 ]; do
|
|
26
|
+
{% endraw %}
|
|
27
|
+
echo "INFO: Waiting for child processes to finish..."
|
|
28
|
+
{% if requeue_on_timeout %}
|
|
29
|
+
# Wait on either one of the child processes or the timeout process.
|
|
30
|
+
wait -n -p child_pid "${children[@]}" "${timeout_pid}"
|
|
31
|
+
{% else %}
|
|
32
|
+
wait -n -p child_pid "${children[@]}"
|
|
33
|
+
{% endif %}
|
|
34
|
+
local child_exit_code=$?
|
|
35
|
+
|
|
36
|
+
{% if requeue_on_timeout %}
|
|
37
|
+
# If the finished process is the watchdog, trigger the timeout handling.
|
|
38
|
+
if [ "${child_pid}" = "${timeout_pid}" ]; then
|
|
39
|
+
echo "INFO: Timeout of {{ requeue_timeout }} seconds reached. Killing remaining processes: ${children[*]}" >&2
|
|
40
|
+
kill "${children[@]}" 2>/dev/null || true
|
|
41
|
+
scontrol requeue "${JOB_ID}"
|
|
42
|
+
exit {{ requeue_exit_code }}
|
|
43
|
+
fi
|
|
44
|
+
{% endif %}
|
|
45
|
+
|
|
46
|
+
echo "INFO: Process ${child_pid} finished with exit code ${child_exit_code}."
|
|
47
|
+
|
|
48
|
+
# Handle the exit code of the finished process.
|
|
49
|
+
if [ "${child_exit_code}" -eq "{{ requeue_exit_code }}" ] && [ "${SLURM_RESTART_COUNT:-0}" -le "{{ requeue_max_attempts }}" ]; then
|
|
50
|
+
echo "INFO: Received requeue exit code {{ requeue_exit_code }} from process ${child_pid}. Requeuing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
|
|
51
|
+
scontrol requeue "${JOB_ID}"
|
|
52
|
+
exit {{ requeue_exit_code }}
|
|
53
|
+
elif [ "${child_exit_code}" -ne 0 ]; then
|
|
54
|
+
echo "ERROR: Process ${child_pid} exited with code ${child_exit_code}." >&2
|
|
55
|
+
exit "${child_exit_code}"
|
|
56
|
+
fi
|
|
57
|
+
|
|
58
|
+
# Remove the finished PID from the array in a concise way.
|
|
59
|
+
for i in "${!children[@]}"; do
|
|
60
|
+
if [ "${children[i]}" = "$child_pid" ]; then
|
|
61
|
+
unset 'children[i]'
|
|
62
|
+
break
|
|
63
|
+
fi
|
|
64
|
+
done
|
|
65
|
+
|
|
66
|
+
# Reindex the array.
|
|
67
|
+
children=( "${children[@]}" )
|
|
68
|
+
done
|
|
69
|
+
|
|
70
|
+
{% if requeue_on_timeout %}
|
|
71
|
+
kill "$timeout_pid" 2>/dev/null || true
|
|
72
|
+
{% endif %}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
__xm_slurm_wait_for_children
|
|
76
|
+
{%- endmacro %}
|
|
@@ -73,7 +73,7 @@ echo "[INFO] Start timestamp: $(date)"
|
|
|
73
73
|
|
|
74
74
|
{% block monitor -%}
|
|
75
75
|
{% from 'fragments/monitor.bash.j2' import monitor %}
|
|
76
|
-
{{ monitor(job.executor.requeue_max_attempts, job.executor.requeue_on_exit_code) }}
|
|
76
|
+
{{ monitor(job.executor.requeue_max_attempts, job.executor.requeue_on_exit_code, job.executor.requeue_on_timeout, job.executor.requeue_timeout.seconds) }}
|
|
77
77
|
{%- endblock monitor %}
|
|
78
78
|
|
|
79
79
|
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
{% macro monitor(requeue_max_attempts, requeue_exit_code) -%}
|
|
2
|
-
__xm_slurm_wait_for_children() {
|
|
3
|
-
if [[ -n "${SLURM_ARRAY_JOB_ID:-}" ]]; then
|
|
4
|
-
local -r JOB_ID="${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}"
|
|
5
|
-
else
|
|
6
|
-
local -r JOB_ID="${SLURM_JOB_ID}"
|
|
7
|
-
fi
|
|
8
|
-
|
|
9
|
-
# If there are no child jobs we should error out
|
|
10
|
-
if [ -z "$(jobs -p)" ]; then
|
|
11
|
-
echo "ERROR: no child jobs exist..." >&2
|
|
12
|
-
exit -1
|
|
13
|
-
fi
|
|
14
|
-
|
|
15
|
-
# Loop through all job IDs in the background job list and wait for them to finish
|
|
16
|
-
for job in "$(jobs -p)"; do
|
|
17
|
-
echo "INFO: Waiting for job ${job} to finish..."
|
|
18
|
-
set +e
|
|
19
|
-
wait "${job}"
|
|
20
|
-
local -r JOB_EXIT_CODE="${?}"
|
|
21
|
-
set -e
|
|
22
|
-
|
|
23
|
-
if [ "${JOB_EXIT_CODE}" -eq "{{ requeue_exit_code }}" ] && [ "${SLURM_RESTART_COUNT-0}" -le "{{ requeue_max_attempts }}" ]; then
|
|
24
|
-
echo "INFO: Received requeue exit code {{ requeue_exit_code }} from job ${job}. Requeing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
|
|
25
|
-
scontrol requeue "${JOB_ID}"
|
|
26
|
-
exit {{ requeue_exit_code }}
|
|
27
|
-
elif [ "${JOB_EXIT_CODE}" -ne 0 ]; then
|
|
28
|
-
echo "ERROR: Job ${job} exited with code ${JOB_EXIT_CODE}." >&2
|
|
29
|
-
exit "${JOB_EXIT_CODE}"
|
|
30
|
-
else
|
|
31
|
-
echo "INFO: Job ${job} exited successfully." >&2
|
|
32
|
-
fi
|
|
33
|
-
done
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
__xm_slurm_wait_for_children
|
|
37
|
-
{%- endmacro %}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.pdm-build/xm_slurm-0.1+editable.dist-info/METADATA
RENAMED
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/.pdm-build/xm_slurm-0.1+editable.dist-info/WHEEL
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xmanager_slurm-0.4.8/examples/uv → xmanager_slurm-0.4.10/examples/job-timeout}/pyproject.toml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/examples/parameter-controller/requirements.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/docker-compose.yml
RENAMED
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/docker-entrypoint.sh
RENAMED
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/host_ed25519
RENAMED
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/host_ed25519.pub
RENAMED
|
File without changes
|
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/id_ed25519.pub
RENAMED
|
File without changes
|
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/tests/integration/fixtures/slurm/slurmdbd.conf
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/experimental/parameter_controller.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/fragments/proxy.bash.j2
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/runtimes/apptainer.bash.j2
RENAMED
|
File without changes
|
{xmanager_slurm-0.4.8 → xmanager_slurm-0.4.10}/xm_slurm/templates/slurm/runtimes/podman.bash.j2
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|