xmanager-slurm 0.4.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xmanager-slurm might be problematic. Click here for more details.
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/PKG-INFO +1 -1
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/job-array-sweep/launch.py +1 -1
- xmanager_slurm-0.4.1/examples/job-dependencies/eval.py +13 -0
- xmanager_slurm-0.4.1/examples/job-dependencies/launch.py +104 -0
- xmanager_slurm-0.4.1/examples/job-dependencies/pyproject.toml +6 -0
- xmanager_slurm-0.4.1/examples/job-dependencies/train.py +19 -0
- xmanager_slurm-0.4.1/examples/job-dependencies/uv.lock +86 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/pyproject.toml +4 -1
- xmanager_slurm-0.4.1/tests/test_dependencies.py +149 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/uv.lock +1 -1
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/__init__.py +4 -2
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/api.py +1 -1
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/config.py +7 -2
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/constants.py +4 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/contrib/clusters/__init__.py +25 -0
- xmanager_slurm-0.4.1/xm_slurm/dependencies.py +171 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/executables.py +20 -15
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/execution.py +246 -96
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/executors.py +8 -12
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/experiment.py +374 -83
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/experimental/parameter_controller.py +12 -10
- xmanager_slurm-0.4.0/xm_slurm/packaging/docker/local.py → xmanager_slurm-0.4.1/xm_slurm/packaging/docker.py +126 -32
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/packaging/router.py +3 -1
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/packaging/utils.py +4 -28
- xmanager_slurm-0.4.1/xm_slurm/scripts/cli.py +52 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/templates/docker/mamba.Dockerfile +1 -1
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/templates/slurm/fragments/monitor.bash.j2 +5 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/templates/slurm/job-array.bash.j2 +1 -2
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/templates/slurm/job.bash.j2 +4 -3
- xmanager_slurm-0.4.1/xm_slurm/types.py +23 -0
- xmanager_slurm-0.4.0/xm_slurm/packaging/docker/__init__.py +0 -69
- xmanager_slurm-0.4.0/xm_slurm/packaging/docker/abc.py +0 -112
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/.devcontainer.json +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/.github/workflows/ci.yml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/.github/workflows/deploy-docs.yml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/.gitignore +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/.pre-commit-config.yaml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/.vscode/settings.json +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/LICENSE.md +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/README.md +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/docs/api/executables.rst +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/docs/api/executors.rst +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/docs/api/packageables.rst +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/docs/assets/workflow-dark.svg +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/docs/assets/workflow-light.svg +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/docs/conf.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/docs/getting-started/xmanager.md +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/docs/guides/index.md +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/docs/guides/remote-dev.md +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/docs/index.md +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/conda/environment.yml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/conda/launch.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/conda/main.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/conda/pyproject.toml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/custom-dockerfile/Dockerfile +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/custom-dockerfile/launch.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/custom-dockerfile/pyproject.toml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/job-array-sweep/main.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/job-array-sweep/pyproject.toml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/job-array-sweep/uv.lock +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/job-group/Dockerfile +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/job-group/launch.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/job-group/pyproject.toml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/job-group/uv.lock +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/metadata/launch.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/metadata/main.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/metadata/pyproject.toml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/metadata/requirements.txt +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/parameter-controller/launch.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/parameter-controller/main.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/parameter-controller/pyproject.toml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/parameter-controller/requirements.txt +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/pip/launch.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/pip/main.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/pip/pyproject.toml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/pip/requirements.txt +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/uv/launch.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/uv/pyproject.toml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/examples/uv/uv.lock +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/conftest.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/fixtures/slurm/Dockerfile +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/fixtures/slurm/README.md +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/fixtures/slurm/cgroup.conf +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/fixtures/slurm/docker-compose.yml +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/fixtures/slurm/docker-entrypoint.sh +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/fixtures/slurm/host_ed25519 +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/fixtures/slurm/host_ed25519.pub +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/fixtures/slurm/id_ed25519 +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/fixtures/slurm/id_ed25519.pub +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/fixtures/slurm/slurm.conf +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/fixtures/slurm/slurmdbd.conf +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/fixtures/slurm/sshd_config +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/tests/integration/test_remote_execution.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/batching.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/console.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/contrib/__init__.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/contrib/clusters/drac.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/job_blocks.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/packageables.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/packaging/__init__.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/packaging/registry.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/resources.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/scripts/_cloudpickle.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/status.py +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/templates/docker/docker-bake.hcl.j2 +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/templates/docker/python.Dockerfile +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/templates/docker/uv.Dockerfile +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/templates/slurm/fragments/proxy.bash.j2 +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/templates/slurm/job-group.bash.j2 +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/templates/slurm/runtimes/podman.bash.j2 +0 -0
- {xmanager_slurm-0.4.0 → xmanager_slurm-0.4.1}/xm_slurm/utils.py +0 -0
|
@@ -46,7 +46,7 @@ async def main(_):
|
|
|
46
46
|
)
|
|
47
47
|
)
|
|
48
48
|
|
|
49
|
-
args = [
|
|
49
|
+
args = [xm_slurm.JobArgs(args={"scale": scale}) for scale in range(3)]
|
|
50
50
|
wus = await experiment.add(make_job, args)
|
|
51
51
|
|
|
52
52
|
for wu, status in zip(wus, await asyncio.gather(*[wu.get_status() for wu in wus])):
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from absl import app, flags
|
|
3
|
+
|
|
4
|
+
INPUT_FILE = flags.DEFINE_string("input_file", "result.npy", "Input file path")
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def main(_):
|
|
8
|
+
result = np.load(INPUT_FILE.value)
|
|
9
|
+
print(f"Received result: {result}")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
if __name__ == "__main__":
|
|
13
|
+
app.run(main)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import datetime as dt
|
|
3
|
+
import pathlib
|
|
4
|
+
|
|
5
|
+
from absl import app
|
|
6
|
+
from xmanager import xm
|
|
7
|
+
|
|
8
|
+
import xm_slurm
|
|
9
|
+
import xm_slurm.contrib.clusters
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@xm.run_in_asyncio_loop
|
|
13
|
+
async def main(_):
|
|
14
|
+
async with xm_slurm.create_experiment("My Experiment") as experiment:
|
|
15
|
+
# Step 1: Specify the executor specification
|
|
16
|
+
executor_spec = xm_slurm.Slurm.Spec(tag="ghcr.io/jessefarebro/xm-slurm/test:latest")
|
|
17
|
+
|
|
18
|
+
# Step 2: Specify the executable and package it
|
|
19
|
+
[train_executable, eval_executable] = experiment.package(
|
|
20
|
+
[
|
|
21
|
+
xm_slurm.uv_container(
|
|
22
|
+
executor_spec=executor_spec,
|
|
23
|
+
entrypoint=xm.CommandList(["train.py"]),
|
|
24
|
+
),
|
|
25
|
+
xm_slurm.uv_container(
|
|
26
|
+
executor_spec=executor_spec,
|
|
27
|
+
entrypoint=xm.CommandList(["eval.py"]),
|
|
28
|
+
),
|
|
29
|
+
],
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
workdir = pathlib.Path(f"/scratch/xm-slurm-examples/{experiment.experiment_id}")
|
|
33
|
+
|
|
34
|
+
# Step 4: Schedule train job
|
|
35
|
+
train_executor = xm_slurm.Slurm(
|
|
36
|
+
requirements=xm_slurm.JobRequirements(
|
|
37
|
+
CPU=1,
|
|
38
|
+
RAM=1.0 * xm.GiB,
|
|
39
|
+
GPU=1,
|
|
40
|
+
replicas=1,
|
|
41
|
+
cluster=xm_slurm.contrib.clusters.mila(),
|
|
42
|
+
),
|
|
43
|
+
time=dt.timedelta(hours=1),
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
async def make_train_job(wu: xm.WorkUnit, args):
|
|
47
|
+
await wu.add(
|
|
48
|
+
xm.Job(
|
|
49
|
+
executable=train_executable,
|
|
50
|
+
executor=train_executor,
|
|
51
|
+
args=xm.merge_args(
|
|
52
|
+
[
|
|
53
|
+
"--output_file",
|
|
54
|
+
(workdir / f"{wu.work_unit_id}" / "result.npy").as_posix(),
|
|
55
|
+
],
|
|
56
|
+
args,
|
|
57
|
+
),
|
|
58
|
+
),
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
train_wus = await experiment.add(
|
|
62
|
+
make_train_job,
|
|
63
|
+
args=[xm_slurm.JobArgs(args=["--seed", seed]) for seed in range(5)],
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
# Step 5: Schedule eval job
|
|
67
|
+
eval_executor = xm_slurm.Slurm(
|
|
68
|
+
requirements=xm_slurm.JobRequirements(
|
|
69
|
+
CPU=1,
|
|
70
|
+
RAM=1.0 * xm.GiB,
|
|
71
|
+
GPU=1,
|
|
72
|
+
replicas=1,
|
|
73
|
+
cluster=xm_slurm.contrib.clusters.mila(),
|
|
74
|
+
),
|
|
75
|
+
time=dt.timedelta(hours=1),
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
eval_wus = await experiment.add(
|
|
79
|
+
xm.Job(
|
|
80
|
+
executable=eval_executable,
|
|
81
|
+
executor=eval_executor,
|
|
82
|
+
),
|
|
83
|
+
args=[
|
|
84
|
+
xm_slurm.JobArgs(
|
|
85
|
+
args=[
|
|
86
|
+
"--input_file",
|
|
87
|
+
(workdir / f"{wu.work_unit_id}" / "result.npy").as_posix(),
|
|
88
|
+
]
|
|
89
|
+
)
|
|
90
|
+
for wu in train_wus
|
|
91
|
+
],
|
|
92
|
+
dependency=[train_wu.after_completed() for train_wu in train_wus],
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
for wu in asyncio.as_completed([
|
|
96
|
+
*[train_wu.wait_until_complete() for train_wu in train_wus],
|
|
97
|
+
*[eval_wu.wait_until_complete() for eval_wu in eval_wus],
|
|
98
|
+
]):
|
|
99
|
+
wu = await wu
|
|
100
|
+
print(f"Work Unit {wu!r} finished executing with status {await wu.get_status()}")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
if __name__ == "__main__":
|
|
104
|
+
app.run(main)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from absl import app, flags
|
|
5
|
+
|
|
6
|
+
OUTPUT_FILE = flags.DEFINE_string("output_file", "result.npy", "Output file path")
|
|
7
|
+
SEED = flags.DEFINE_integer("seed", 0, "Random seed")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main(_):
|
|
11
|
+
np.random.seed(SEED.value)
|
|
12
|
+
|
|
13
|
+
pathlib.Path(OUTPUT_FILE.value).parent.mkdir(parents=True, exist_ok=True)
|
|
14
|
+
result = np.random.random((32,))
|
|
15
|
+
np.save(OUTPUT_FILE.value, result)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
if __name__ == "__main__":
|
|
19
|
+
app.run(main)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
version = 1
|
|
2
|
+
requires-python = ">=3.10"
|
|
3
|
+
|
|
4
|
+
[[package]]
|
|
5
|
+
name = "absl-py"
|
|
6
|
+
version = "2.1.0"
|
|
7
|
+
source = { registry = "https://pypi.org/simple" }
|
|
8
|
+
sdist = { url = "https://files.pythonhosted.org/packages/7a/8f/fc001b92ecc467cc32ab38398bd0bfb45df46e7523bf33c2ad22a505f06e/absl-py-2.1.0.tar.gz", hash = "sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff", size = 118055 }
|
|
9
|
+
wheels = [
|
|
10
|
+
{ url = "https://files.pythonhosted.org/packages/a2/ad/e0d3c824784ff121c03cc031f944bc7e139a8f1870ffd2845cc2dd76f6c4/absl_py-2.1.0-py3-none-any.whl", hash = "sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308", size = 133706 },
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[[package]]
|
|
14
|
+
name = "numpy"
|
|
15
|
+
version = "2.1.2"
|
|
16
|
+
source = { registry = "https://pypi.org/simple" }
|
|
17
|
+
sdist = { url = "https://files.pythonhosted.org/packages/4b/d1/8a730ea07f4a37d94f9172f4ce1d81064b7a64766b460378be278952de75/numpy-2.1.2.tar.gz", hash = "sha256:13532a088217fa624c99b843eeb54640de23b3414b14aa66d023805eb731066c", size = 18878063 }
|
|
18
|
+
wheels = [
|
|
19
|
+
{ url = "https://files.pythonhosted.org/packages/1c/a2/40a76d357f168e9f9f06d6cc2c8e22dd5fb2bfbe63fe2c433057258c145a/numpy-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:30d53720b726ec36a7f88dc873f0eec8447fbc93d93a8f079dfac2629598d6ee", size = 21150947 },
|
|
20
|
+
{ url = "https://files.pythonhosted.org/packages/b5/d0/ba271ea9108d7278d3889a7eb38d77370a88713fb94339964e71ac184d4a/numpy-2.1.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e8d3ca0a72dd8846eb6f7dfe8f19088060fcb76931ed592d29128e0219652884", size = 13758184 },
|
|
21
|
+
{ url = "https://files.pythonhosted.org/packages/7c/b9/5c6507439cd756201010f7937bf90712c2469052ae094584af14557dd64f/numpy-2.1.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:fc44e3c68ff00fd991b59092a54350e6e4911152682b4782f68070985aa9e648", size = 5354091 },
|
|
22
|
+
{ url = "https://files.pythonhosted.org/packages/60/21/7938cf724d9e84e45fb886f3fc794ab431d71facfebc261e3e9f19f3233a/numpy-2.1.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:7c1c60328bd964b53f8b835df69ae8198659e2b9302ff9ebb7de4e5a5994db3d", size = 6887169 },
|
|
23
|
+
{ url = "https://files.pythonhosted.org/packages/09/8d/42a124657f5d31902fca73921b25a0d022cead2b32ce7e6975762cd2995a/numpy-2.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6cdb606a7478f9ad91c6283e238544451e3a95f30fb5467fbf715964341a8a86", size = 13888165 },
|
|
24
|
+
{ url = "https://files.pythonhosted.org/packages/fb/25/ba023652a39a2c127200e85aed975fc6119b421e2c348e5d0171e2046edb/numpy-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d666cb72687559689e9906197e3bec7b736764df6a2e58ee265e360663e9baf7", size = 16326954 },
|
|
25
|
+
{ url = "https://files.pythonhosted.org/packages/34/58/23e6b07fad492b7c47cf09cd8bad6983658f0f925b6c535fd008e3e86274/numpy-2.1.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c6eef7a2dbd0abfb0d9eaf78b73017dbfd0b54051102ff4e6a7b2980d5ac1a03", size = 16702916 },
|
|
26
|
+
{ url = "https://files.pythonhosted.org/packages/91/24/37b5cf2dc7d385ac97f7b7fe50cba312abb70a2a5eac74c23af028811f73/numpy-2.1.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:12edb90831ff481f7ef5f6bc6431a9d74dc0e5ff401559a71e5e4611d4f2d466", size = 14384372 },
|
|
27
|
+
{ url = "https://files.pythonhosted.org/packages/ea/ec/0f6d471058a01d1a05a50d2793898de1549280fa715a8537987ee866b5d9/numpy-2.1.2-cp310-cp310-win32.whl", hash = "sha256:a65acfdb9c6ebb8368490dbafe83c03c7e277b37e6857f0caeadbbc56e12f4fb", size = 6535361 },
|
|
28
|
+
{ url = "https://files.pythonhosted.org/packages/c2/3d/293cc5927f916a7bc6bf74da8f6defab63d1b13f0959d7e21878ad8a20d8/numpy-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:860ec6e63e2c5c2ee5e9121808145c7bf86c96cca9ad396c0bd3e0f2798ccbe2", size = 12865501 },
|
|
29
|
+
{ url = "https://files.pythonhosted.org/packages/aa/9c/9a6ec3ae89cd0648d419781284308f2956d2a61d932b5ac9682c956a171b/numpy-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b42a1a511c81cc78cbc4539675713bbcf9d9c3913386243ceff0e9429ca892fe", size = 21154845 },
|
|
30
|
+
{ url = "https://files.pythonhosted.org/packages/02/69/9f05c4ecc75fabf297b17743996371b4c3dfc4d92e15c5c38d8bb3db8d74/numpy-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:faa88bc527d0f097abdc2c663cddf37c05a1c2f113716601555249805cf573f1", size = 13789409 },
|
|
31
|
+
{ url = "https://files.pythonhosted.org/packages/34/4e/f95c99217bf77bbfaaf660d693c10bd0dc03b6032d19316d316088c9e479/numpy-2.1.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:c82af4b2ddd2ee72d1fc0c6695048d457e00b3582ccde72d8a1c991b808bb20f", size = 5352097 },
|
|
32
|
+
{ url = "https://files.pythonhosted.org/packages/06/13/f5d87a497c16658e9af8920449b0b5692b469586b8231340c672962071c5/numpy-2.1.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:13602b3174432a35b16c4cfb5de9a12d229727c3dd47a6ce35111f2ebdf66ff4", size = 6891195 },
|
|
33
|
+
{ url = "https://files.pythonhosted.org/packages/6c/89/691ac07429ac061b344d5e37fa8e94be51a6017734aea15f2d9d7c6d119a/numpy-2.1.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ebec5fd716c5a5b3d8dfcc439be82a8407b7b24b230d0ad28a81b61c2f4659a", size = 13895153 },
|
|
34
|
+
{ url = "https://files.pythonhosted.org/packages/23/69/538317f0d925095537745f12aced33be1570bbdc4acde49b33748669af96/numpy-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2b49c3c0804e8ecb05d59af8386ec2f74877f7ca8fd9c1e00be2672e4d399b1", size = 16338306 },
|
|
35
|
+
{ url = "https://files.pythonhosted.org/packages/af/03/863fe7062c2106d3c151f7df9353f2ae2237c1dd6900f127a3eb1f24cb1b/numpy-2.1.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2cbba4b30bf31ddbe97f1c7205ef976909a93a66bb1583e983adbd155ba72ac2", size = 16710893 },
|
|
36
|
+
{ url = "https://files.pythonhosted.org/packages/70/77/0ad9efe25482009873f9660d29a40a8c41a6f0e8b541195e3c95c70684c5/numpy-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8e00ea6fc82e8a804433d3e9cedaa1051a1422cb6e443011590c14d2dea59146", size = 14398048 },
|
|
37
|
+
{ url = "https://files.pythonhosted.org/packages/3e/0f/e785fe75544db9f2b0bb1c181e13ceff349ce49753d807fd9672916aa06d/numpy-2.1.2-cp311-cp311-win32.whl", hash = "sha256:5006b13a06e0b38d561fab5ccc37581f23c9511879be7693bd33c7cd15ca227c", size = 6533458 },
|
|
38
|
+
{ url = "https://files.pythonhosted.org/packages/d4/96/450054662295125af861d48d2c4bc081dadcf1974a879b2104613157aa62/numpy-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:f1eb068ead09f4994dec71c24b2844f1e4e4e013b9629f812f292f04bd1510d9", size = 12870896 },
|
|
39
|
+
{ url = "https://files.pythonhosted.org/packages/a0/7d/554a6838f37f3ada5a55f25173c619d556ae98092a6e01afb6e710501d70/numpy-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d7bf0a4f9f15b32b5ba53147369e94296f5fffb783db5aacc1be15b4bf72f43b", size = 20848077 },
|
|
40
|
+
{ url = "https://files.pythonhosted.org/packages/b0/29/cb48a402ea879e645b16218718f3f7d9588a77d674a9dcf22e4c43487636/numpy-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b1d0fcae4f0949f215d4632be684a539859b295e2d0cb14f78ec231915d644db", size = 13493242 },
|
|
41
|
+
{ url = "https://files.pythonhosted.org/packages/56/44/f899b0581766c230da42f751b7b8896d096640b19b312164c267e48d36cb/numpy-2.1.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:f751ed0a2f250541e19dfca9f1eafa31a392c71c832b6bb9e113b10d050cb0f1", size = 5089219 },
|
|
42
|
+
{ url = "https://files.pythonhosted.org/packages/79/8f/b987070d45161a7a4504afc67ed38544ed2c0ed5576263599a0402204a9c/numpy-2.1.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:bd33f82e95ba7ad632bc57837ee99dba3d7e006536200c4e9124089e1bf42426", size = 6620167 },
|
|
43
|
+
{ url = "https://files.pythonhosted.org/packages/c4/a7/af3329fda3c3ec31d9b650e42bbcd3422fc62a765cbb1405fde4177a0996/numpy-2.1.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b8cde4f11f0a975d1fd59373b32e2f5a562ade7cde4f85b7137f3de8fbb29a0", size = 13604905 },
|
|
44
|
+
{ url = "https://files.pythonhosted.org/packages/9b/b4/e3c7e6fab0f77fff6194afa173d1f2342073d91b1d3b4b30b17c3fb4407a/numpy-2.1.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d95f286b8244b3649b477ac066c6906fbb2905f8ac19b170e2175d3d799f4df", size = 16041825 },
|
|
45
|
+
{ url = "https://files.pythonhosted.org/packages/e9/50/6828e66a78aa03147c111f84d55f33ce2dde547cb578d6744a3b06a0124b/numpy-2.1.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ab4754d432e3ac42d33a269c8567413bdb541689b02d93788af4131018cbf366", size = 16409541 },
|
|
46
|
+
{ url = "https://files.pythonhosted.org/packages/bf/72/66af7916d9c3c6dbfbc8acdd4930c65461e1953374a2bc43d00f948f004a/numpy-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e585c8ae871fd38ac50598f4763d73ec5497b0de9a0ab4ef5b69f01c6a046142", size = 14081134 },
|
|
47
|
+
{ url = "https://files.pythonhosted.org/packages/dc/5a/59a67d84f33fe00ae74f0b5b69dd4f93a586a4aba7f7e19b54b2133db038/numpy-2.1.2-cp312-cp312-win32.whl", hash = "sha256:9c6c754df29ce6a89ed23afb25550d1c2d5fdb9901d9c67a16e0b16eaf7e2550", size = 6237784 },
|
|
48
|
+
{ url = "https://files.pythonhosted.org/packages/4c/79/73735a6a5dad6059c085f240a4e74c9270feccd2bc66e4d31b5ca01d329c/numpy-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:456e3b11cb79ac9946c822a56346ec80275eaf2950314b249b512896c0d2505e", size = 12568254 },
|
|
49
|
+
{ url = "https://files.pythonhosted.org/packages/16/72/716fa1dbe92395a9a623d5049203ff8ddb0cfce65b9df9117c3696ccc011/numpy-2.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a84498e0d0a1174f2b3ed769b67b656aa5460c92c9554039e11f20a05650f00d", size = 20834690 },
|
|
50
|
+
{ url = "https://files.pythonhosted.org/packages/1e/fb/3e85a39511586053b5c6a59a643879e376fae22230ebfef9cfabb0e032e2/numpy-2.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4d6ec0d4222e8ffdab1744da2560f07856421b367928026fb540e1945f2eeeaf", size = 13507474 },
|
|
51
|
+
{ url = "https://files.pythonhosted.org/packages/35/eb/5677556d9ba13436dab51e129f98d4829d95cd1b6bd0e199c14485a4bdb9/numpy-2.1.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:259ec80d54999cc34cd1eb8ded513cb053c3bf4829152a2e00de2371bd406f5e", size = 5074742 },
|
|
52
|
+
{ url = "https://files.pythonhosted.org/packages/3e/c5/6c5ef5ba41b65a7e51bed50dbf3e1483eb578055633dd013e811a28e96a1/numpy-2.1.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:675c741d4739af2dc20cd6c6a5c4b7355c728167845e3c6b0e824e4e5d36a6c3", size = 6606787 },
|
|
53
|
+
{ url = "https://files.pythonhosted.org/packages/08/ac/f2f29dd4fd325b379c7dc932a0ebab22f0e031dbe80b2f6019b291a3a544/numpy-2.1.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b2d4e667895cc55e3ff2b56077e4c8a5604361fc21a042845ea3ad67465aa8", size = 13601333 },
|
|
54
|
+
{ url = "https://files.pythonhosted.org/packages/44/26/63f5f4e5089654dfb858f4892215ed968cd1a68e6f4a83f9961f84f855cb/numpy-2.1.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43cca367bf94a14aca50b89e9bc2061683116cfe864e56740e083392f533ce7a", size = 16038090 },
|
|
55
|
+
{ url = "https://files.pythonhosted.org/packages/1d/21/015e0594de9c3a8d5edd24943d2bd23f102ec71aec026083f822f86497e2/numpy-2.1.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:76322dcdb16fccf2ac56f99048af32259dcc488d9b7e25b51e5eca5147a3fb98", size = 16410865 },
|
|
56
|
+
{ url = "https://files.pythonhosted.org/packages/df/01/c1bcf9e6025d79077fbf3f3ee503b50aa7bfabfcd8f4b54f5829f4c00f3f/numpy-2.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:32e16a03138cabe0cb28e1007ee82264296ac0983714094380b408097a418cfe", size = 14078077 },
|
|
57
|
+
{ url = "https://files.pythonhosted.org/packages/ba/06/db9d127d63bd11591770ba9f3d960f8041e0f895184b9351d4b1b5b56983/numpy-2.1.2-cp313-cp313-win32.whl", hash = "sha256:242b39d00e4944431a3cd2db2f5377e15b5785920421993770cddb89992c3f3a", size = 6234904 },
|
|
58
|
+
{ url = "https://files.pythonhosted.org/packages/a9/96/9f61f8f95b6e0ea0aa08633b704c75d1882bdcb331bdf8bfd63263b25b00/numpy-2.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:f2ded8d9b6f68cc26f8425eda5d3877b47343e68ca23d0d0846f4d312ecaa445", size = 12561910 },
|
|
59
|
+
{ url = "https://files.pythonhosted.org/packages/36/b8/033f627821784a48e8f75c218033471eebbaacdd933f8979c79637a1b44b/numpy-2.1.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:2ffef621c14ebb0188a8633348504a35c13680d6da93ab5cb86f4e54b7e922b5", size = 20857719 },
|
|
60
|
+
{ url = "https://files.pythonhosted.org/packages/96/46/af5726fde5b74ed83f2f17a73386d399319b7ed4d51279fb23b721d0816d/numpy-2.1.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ad369ed238b1959dfbade9018a740fb9392c5ac4f9b5173f420bd4f37ba1f7a0", size = 13518826 },
|
|
61
|
+
{ url = "https://files.pythonhosted.org/packages/db/6e/8ce677edf36da1c4dae80afe5529f47690697eb55b4864673af260ccea7b/numpy-2.1.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:d82075752f40c0ddf57e6e02673a17f6cb0f8eb3f587f63ca1eaab5594da5b17", size = 5115036 },
|
|
62
|
+
{ url = "https://files.pythonhosted.org/packages/6a/ba/3cce44fb1b8438042c11847048812a776f75ee0e7070179c22e4cfbf420c/numpy-2.1.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:1600068c262af1ca9580a527d43dc9d959b0b1d8e56f8a05d830eea39b7c8af6", size = 6628641 },
|
|
63
|
+
{ url = "https://files.pythonhosted.org/packages/59/c8/e722998720ccbd35ffbcf1d1b8ed0aa2304af88d3f1c38e06ebf983599b3/numpy-2.1.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a26ae94658d3ba3781d5e103ac07a876b3e9b29db53f68ed7df432fd033358a8", size = 13574803 },
|
|
64
|
+
{ url = "https://files.pythonhosted.org/packages/7c/8e/fc1fdd83a55476765329ac2913321c4aed5b082a7915095628c4ca30ea72/numpy-2.1.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13311c2db4c5f7609b462bc0f43d3c465424d25c626d95040f073e30f7570e35", size = 16021174 },
|
|
65
|
+
{ url = "https://files.pythonhosted.org/packages/2a/b6/a790742aa88067adb4bd6c89a946778c1417d4deaeafce3ca928f26d4c52/numpy-2.1.2-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:2abbf905a0b568706391ec6fa15161fad0fb5d8b68d73c461b3c1bab6064dd62", size = 16400117 },
|
|
66
|
+
{ url = "https://files.pythonhosted.org/packages/48/6f/129e3c17e3befe7fefdeaa6890f4c4df3f3cf0831aa053802c3862da67aa/numpy-2.1.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ef444c57d664d35cac4e18c298c47d7b504c66b17c2ea91312e979fcfbdfb08a", size = 14066202 },
|
|
67
|
+
{ url = "https://files.pythonhosted.org/packages/73/c9/3e1d6bbe6d3d2e2c5a9483b24b2f29a229b323f62054278a3bba7fee11e5/numpy-2.1.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:bdd407c40483463898b84490770199d5714dcc9dd9b792f6c6caccc523c00952", size = 20981945 },
|
|
68
|
+
{ url = "https://files.pythonhosted.org/packages/6e/62/989c4988bde1a8e08117fccc3bab73d2886421fb98cde597168714f3c54e/numpy-2.1.2-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:da65fb46d4cbb75cb417cddf6ba5e7582eb7bb0b47db4b99c9fe5787ce5d91f5", size = 6750558 },
|
|
69
|
+
{ url = "https://files.pythonhosted.org/packages/53/b1/00ef9f30975f1312a53257f68e57b4513d14d537e03d507e2773a684b1e8/numpy-2.1.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c193d0b0238638e6fc5f10f1b074a6993cb13b0b431f64079a509d63d3aa8b7", size = 16141552 },
|
|
70
|
+
{ url = "https://files.pythonhosted.org/packages/c0/ec/0c04903b48dfea6be1d7b47ba70f98709fb7198fd970784a1400c391d522/numpy-2.1.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a7d80b2e904faa63068ead63107189164ca443b42dd1930299e0d1cb041cec2e", size = 12789924 },
|
|
71
|
+
]
|
|
72
|
+
|
|
73
|
+
[[package]]
|
|
74
|
+
name = "xm-slurm-example"
|
|
75
|
+
version = "0.0.1"
|
|
76
|
+
source = { virtual = "." }
|
|
77
|
+
dependencies = [
|
|
78
|
+
{ name = "absl-py" },
|
|
79
|
+
{ name = "numpy" },
|
|
80
|
+
]
|
|
81
|
+
|
|
82
|
+
[package.metadata]
|
|
83
|
+
requires-dist = [
|
|
84
|
+
{ name = "absl-py" },
|
|
85
|
+
{ name = "numpy" },
|
|
86
|
+
]
|
|
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "xmanager-slurm"
|
|
7
7
|
description = "Slurm backend for XManager."
|
|
8
|
-
version = "0.4.0"
|
|
8
|
+
version = "0.4.1"
|
|
9
9
|
# readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
11
11
|
license = { text = "MIT" }
|
|
@@ -36,6 +36,9 @@ dependencies = [
|
|
|
36
36
|
[project.urls]
|
|
37
37
|
"GitHub" = "https://github.com/jessefarebro/xm-slurm"
|
|
38
38
|
|
|
39
|
+
[project.scripts]
|
|
40
|
+
xm = "xm_slurm.scripts.cli:main"
|
|
41
|
+
|
|
39
42
|
[tool.uv]
|
|
40
43
|
dev-dependencies = [
|
|
41
44
|
"mypy>=1.11.2",
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
import datetime as dt
|
|
2
|
+
from typing import Callable
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from xm_slurm import config, dependencies, execution
|
|
7
|
+
|
|
8
|
+
SlurmHandleGenerator = Callable[[str], execution.SlurmHandle]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@pytest.fixture
def slurm_handle() -> SlurmHandleGenerator:
    """Provide a factory that builds a `SlurmHandle` for a given job id.

    Every handle produced shares one dummy SSH config pointing at
    ``localhost``, experiment id ``0`` and the job name ``"job"``.
    """
    localhost_ssh = config.SlurmSSHConfig(host="localhost")

    def make_handle(job_id: str):
        handle_kwargs = dict(
            experiment_id=0,
            ssh=localhost_ssh,
            slurm_job=job_id,
            job_name="job",
        )
        return execution.SlurmHandle(**handle_kwargs)

    return make_handle
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_slurm_job_dependency_and(slurm_handle: SlurmHandleGenerator):
    """`&` on two `after` dependencies gives an AND node rendered with a comma."""
    first, second = (
        dependencies.SlurmJobDependencyAfter([slurm_handle(job_id)])
        for job_id in ("123", "456")
    )

    conjunction = first & second

    assert isinstance(conjunction, dependencies.SlurmJobDependencyAND)
    rendered = conjunction.to_dependency_str()
    assert rendered == "after:123,after:456"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_slurm_job_dependency_or(slurm_handle: SlurmHandleGenerator):
    """`|` on two `after` dependencies gives an OR node rendered with a `?`."""
    first, second = (
        dependencies.SlurmJobDependencyAfter([slurm_handle(job_id)])
        for job_id in ("123", "456")
    )

    disjunction = first | second

    assert isinstance(disjunction, dependencies.SlurmJobDependencyOR)
    rendered = disjunction.to_dependency_str()
    assert rendered == "after:123?after:456"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_slurm_job_dependency_mixing_logical_operations(slurm_handle: SlurmHandleGenerator):
    """Mixing `&` and `|` in one expression raises `SlurmDependencyException`."""
    first, second, third = (
        dependencies.SlurmJobDependencyAfter([slurm_handle(job_id)])
        for job_id in ("123", "456", "789")
    )
    expected_message = "Slurm only supports chaining dependencies with the same logical operator. "

    # `&` binds tighter than `|`; the parentheses only spell out the grouping
    # Python already applies to the unparenthesized expressions.
    with pytest.raises(dependencies.SlurmDependencyException, match=expected_message):
        (first & second) | third  # type: ignore

    with pytest.raises(dependencies.SlurmDependencyException, match=expected_message):
        first | (second & third)  # type: ignore
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_slurm_job_dependency_chaining_and(slurm_handle: SlurmHandleGenerator):
    """Chaining four dependencies with `&` renders one comma-separated string."""
    links = [
        dependencies.SlurmJobDependencyAfter([slurm_handle(str(job_number))])
        for job_number in range(1, 5)
    ]

    # Fold left-to-right, exactly as `a & b & c & d` associates.
    chained = links[0]
    for link in links[1:]:
        chained = chained & link

    assert isinstance(chained, dependencies.SlurmJobDependencyAND)
    assert chained.to_dependency_str() == "after:1,after:2,after:3,after:4"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_slurm_job_dependency_chaining_or(slurm_handle: SlurmHandleGenerator):
    """Chaining four dependencies with `|` renders one `?`-separated string."""
    links = [
        dependencies.SlurmJobDependencyAfter([slurm_handle(str(job_number))])
        for job_number in range(1, 5)
    ]

    # Fold left-to-right, exactly as `a | b | c | d` associates.
    chained = links[0]
    for link in links[1:]:
        chained = chained | link

    assert isinstance(chained, dependencies.SlurmJobDependencyOR)
    assert chained.to_dependency_str() == "after:1?after:2?after:3?after:4"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_slurm_job_dependency_after(slurm_handle: SlurmHandleGenerator):
    """A single-handle `after` dependency renders as ``after:<job id>``."""
    handles = [slurm_handle("123")]
    rendered = dependencies.SlurmJobDependencyAfter(handles).to_dependency_str()
    assert rendered == "after:123"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_slurm_job_dependency_after_with_time(slurm_handle: SlurmHandleGenerator):
    """A ten-minute `time` is appended to the job id as ``+10``."""
    ten_minutes = dt.timedelta(minutes=10)
    delayed = dependencies.SlurmJobDependencyAfter([slurm_handle("123")], time=ten_minutes)
    assert delayed.to_dependency_str() == "after:123+10"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_slurm_job_dependency_after_with_invalid_time(slurm_handle: SlurmHandleGenerator):
    """Constructing an `after` dependency with a 30-second `time` is rejected."""
    half_minute = dt.timedelta(seconds=30)
    expected_message = "Time must be specified in exact minutes"

    with pytest.raises(dependencies.SlurmDependencyException, match=expected_message):
        dependencies.SlurmJobDependencyAfter([slurm_handle("123")], time=half_minute)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@pytest.mark.parametrize(
    "dependency_cls,dependency_type",
    [
        (dependencies.SlurmJobDependencyAfter, "after"),
        (dependencies.SlurmJobDependencyAfterAny, "afterany"),
        (dependencies.SlurmJobDependencyAfterNotOK, "afternotok"),
        (dependencies.SlurmJobDependencyAfterOK, "afterok"),
    ],
)
def test_slurm_job_dependency_after_not_ok(
    slurm_handle: SlurmHandleGenerator,
    dependency_cls: type,
    dependency_type: str,
):
    """Each dependency class joins multiple job ids with `:` after its type prefix."""
    handles = [slurm_handle(job_id) for job_id in ("123", "456")]
    rendered = dependency_cls(handles).to_dependency_str()
    assert rendered == f"{dependency_type}:123:456"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@pytest.mark.parametrize(
    "dependency_cls",
    [
        dependencies.SlurmJobDependencyAfter,
        dependencies.SlurmJobDependencyAfterAny,
        dependencies.SlurmJobDependencyAfterNotOK,
        dependencies.SlurmJobDependencyAfterOK,
    ],
)
def test_slurm_job_dependency_after_no_handles(dependency_cls: type):
    """Constructing any dependency class with an empty handle list raises."""
    expected_message = "Dependency doesn't have any handles."
    with pytest.raises(dependencies.SlurmDependencyException, match=expected_message):
        dependency_cls([])
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def test_dependency_flatten(slurm_handle: SlurmHandleGenerator):
    """`flatten()` on a chained AND returns the operands as a tuple, in order."""
    first, second, third = (
        dependencies.SlurmJobDependencyAfter([slurm_handle(job_id)])
        for job_id in ("1", "2", "3")
    )

    chained = first & second & third
    expected_leaves = (first, second, third)

    assert chained.flatten() == expected_leaves
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def test_dependency_traverse(slurm_handle: SlurmHandleGenerator):
    """`traverse()` applies a mapping function to every dependency in the chain.

    Every `afterok` leaf is swapped for an `afternotok` on the same handles,
    and the transformed chain is checked leaf by leaf and as a rendered string.
    """
    first, second, third = (
        dependencies.SlurmJobDependencyAfterOK([slurm_handle(job_id)])
        for job_id in ("1", "2", "3")
    )
    chained = first & second & third

    def invert_after_ok(node: dependencies.SlurmJobDependency):
        # Anything that is not an `afterok` dependency is passed through untouched.
        if not isinstance(node, dependencies.SlurmJobDependencyAfterOK):
            return node
        return dependencies.SlurmJobDependencyAfterNotOK(node.handles)

    inverted = chained.traverse(invert_after_ok)

    assert all(
        isinstance(leaf, dependencies.SlurmJobDependencyAfterNotOK)
        for leaf in inverted.flatten()
    )
    assert inverted.to_dependency_str() == "afternotok:1,afternotok:2,afternotok:3"
|
|
@@ -10,6 +10,7 @@ from xm_slurm.experiment import (
|
|
|
10
10
|
get_current_work_unit,
|
|
11
11
|
get_experiment,
|
|
12
12
|
)
|
|
13
|
+
from xm_slurm.job_blocks import JobArgs
|
|
13
14
|
from xm_slurm.packageables import (
|
|
14
15
|
conda_container,
|
|
15
16
|
docker_container,
|
|
@@ -34,13 +35,14 @@ __all__ = [
|
|
|
34
35
|
"get_current_experiment",
|
|
35
36
|
"get_current_work_unit",
|
|
36
37
|
"get_experiment",
|
|
38
|
+
"JobArgs",
|
|
37
39
|
"JobRequirements",
|
|
38
40
|
"mamba_container",
|
|
39
|
-
"uv_container",
|
|
40
41
|
"python_container",
|
|
41
42
|
"ResourceQuantity",
|
|
42
43
|
"ResourceType",
|
|
43
44
|
"Slurm",
|
|
44
|
-
"SlurmSpec",
|
|
45
45
|
"SlurmExperiment",
|
|
46
|
+
"SlurmSpec",
|
|
47
|
+
"uv_container",
|
|
46
48
|
]
|
|
@@ -322,7 +322,7 @@ class XManagerSqliteAPI(XManagerAPI):
|
|
|
322
322
|
db_path = Path(os.environ["XM_SLURM_STATE_DIR"]) / "db.sqlite3"
|
|
323
323
|
else:
|
|
324
324
|
db_path = Path.home() / ".local" / "state" / "xm-slurm" / "db.sqlite3"
|
|
325
|
-
|
|
325
|
+
# db_path is passed as a lazy %-style argument, so the message needs a `%s`
# placeholder; without one logging cannot format the record when it is emitted.
logger.debug("Looking for db at: %s", db_path)
|
|
326
326
|
db_path.parent.mkdir(parents=True, exist_ok=True)
|
|
327
327
|
engine = create_engine(f"sqlite:///{db_path}")
|
|
328
328
|
Base.metadata.create_all(engine)
|
|
@@ -5,9 +5,10 @@ import getpass
|
|
|
5
5
|
import json
|
|
6
6
|
import os
|
|
7
7
|
import pathlib
|
|
8
|
-
from typing import Literal, Mapping, NamedTuple
|
|
8
|
+
from typing import Callable, Literal, Mapping, NamedTuple
|
|
9
9
|
|
|
10
10
|
import asyncssh
|
|
11
|
+
from xmanager import xm
|
|
11
12
|
|
|
12
13
|
from xm_slurm import constants
|
|
13
14
|
|
|
@@ -137,7 +138,7 @@ class SlurmSSHConfig:
|
|
|
137
138
|
)
|
|
138
139
|
|
|
139
140
|
def __hash__(self):
|
|
140
|
-
return hash((self.host, self.host_public_key, self.user, self.port))
|
|
141
|
+
return hash((type(self), self.host, self.host_public_key, self.user, self.port))
|
|
141
142
|
|
|
142
143
|
|
|
143
144
|
@dataclasses.dataclass(frozen=True, kw_only=True)
|
|
@@ -176,6 +177,9 @@ class SlurmClusterConfig:
|
|
|
176
177
|
|
|
177
178
|
features: Mapping["xm_slurm.FeatureType", str] = dataclasses.field(default_factory=dict) # type: ignore # noqa: F821
|
|
178
179
|
|
|
180
|
+
# Function to validate the Slurm executor config
|
|
181
|
+
validate: Callable[[xm.Job], None] | None = None
|
|
182
|
+
|
|
179
183
|
def __post_init__(self) -> None:
|
|
180
184
|
for src, dst in self.mounts.items():
|
|
181
185
|
if not isinstance(src, (str, os.PathLike)):
|
|
@@ -194,6 +198,7 @@ class SlurmClusterConfig:
|
|
|
194
198
|
|
|
195
199
|
def __hash__(self):
|
|
196
200
|
return hash((
|
|
201
|
+
type(self),
|
|
197
202
|
self.ssh,
|
|
198
203
|
self.cwd,
|
|
199
204
|
self.prolog,
|
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
import re
|
|
2
2
|
|
|
3
|
+
# Anchored pattern for a job id string:
#   * `jobid`       - the leading run of digits (required)
#   * `componentid` - digits following a `+` (optional)
#   * `arraytaskid` - digits following a `_` (optional)
# At most one of the two suffixes can be present.
SLURM_JOB_ID_REGEX = re.compile(
    r"^(?P<jobid>\d+)(?:(?:\+(?P<componentid>\d+))|(?:_(?P<arraytaskid>\d+)))?$"
)
|
|
6
|
+
|
|
3
7
|
IMAGE_URI_REGEX = re.compile(
|
|
4
8
|
r"^(?P<scheme>(?:[^:]+://)?)?(?P<domain>[^/]+)(?P<path>/[^:]*)?(?::(?P<tag>[^@]+))?@?(?P<digest>.+)?$"
|
|
5
9
|
)
|
|
@@ -1,13 +1,20 @@
|
|
|
1
|
+
import datetime as dt
|
|
2
|
+
import logging
|
|
1
3
|
import os
|
|
2
4
|
|
|
5
|
+
from xmanager import xm
|
|
6
|
+
|
|
3
7
|
from xm_slurm import config, resources
|
|
4
8
|
from xm_slurm.contrib.clusters import drac
|
|
9
|
+
from xm_slurm.executors import Slurm
|
|
5
10
|
|
|
6
11
|
# ComputeCanada alias
|
|
7
12
|
cc = drac
|
|
8
13
|
|
|
9
14
|
__all__ = ["drac", "mila", "cc"]
|
|
10
15
|
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
11
18
|
|
|
12
19
|
def mila(
|
|
13
20
|
*,
|
|
@@ -24,6 +31,23 @@ def mila(
|
|
|
24
31
|
"/home/mila/${USER:0:1}/$USER/.ssh": "/home/mila/${USER:0:1}/$USER/.ssh",
|
|
25
32
|
}
|
|
26
33
|
|
|
34
|
+
def validate(job: xm.Job) -> None:
    """Warn when a job wants requeue with a grace period outside a `-grace` partition.

    The check is advisory: it only logs a warning and never rejects the job.

    Args:
        job: The job to inspect; its executor must be a `Slurm` executor.

    Raises:
        AssertionError: If `job.executor` is not a `Slurm` instance.
    """
    assert isinstance(job.executor, Slurm)

    wants_requeue_with_grace_period = (
        job.executor.requeue and job.executor.timeout_signal_grace_period > dt.timedelta(0)
    )
    # An unset partition falls back to "main", so `partition` is never None
    # below and no separate None check is needed.
    partition = job.executor.partition or "main"

    if wants_requeue_with_grace_period and not partition.endswith("-grace"):
        logger.warning(
            f"Job {job.name} wants requeue with grace period, but partition `{partition}` does not end with '-grace'. "
            "Mila Cluster requires you specify a grace partition. "
            "This may result in the job not being requeued properly."
        )
|
|
50
|
+
|
|
27
51
|
return config.SlurmClusterConfig(
|
|
28
52
|
name="mila",
|
|
29
53
|
ssh=config.SlurmSSHConfig(
|
|
@@ -58,4 +82,5 @@ def mila(
|
|
|
58
82
|
resources.FeatureType.NVIDIA_MIG: "mig",
|
|
59
83
|
resources.FeatureType.NVIDIA_NVLINK: "nvlink",
|
|
60
84
|
},
|
|
85
|
+
validate=validate,
|
|
61
86
|
)
|