xmanager-slurm 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xm_slurm/__init__.py +47 -0
- xm_slurm/api/__init__.py +33 -0
- xm_slurm/api/abc.py +65 -0
- xm_slurm/api/models.py +70 -0
- xm_slurm/api/sqlite/client.py +358 -0
- xm_slurm/api/web/client.py +173 -0
- xm_slurm/batching.py +139 -0
- xm_slurm/config.py +189 -0
- xm_slurm/console.py +3 -0
- xm_slurm/constants.py +19 -0
- xm_slurm/contrib/__init__.py +0 -0
- xm_slurm/contrib/clusters/__init__.py +67 -0
- xm_slurm/contrib/clusters/drac.py +242 -0
- xm_slurm/dependencies.py +171 -0
- xm_slurm/executables.py +215 -0
- xm_slurm/execution.py +995 -0
- xm_slurm/executors.py +210 -0
- xm_slurm/experiment.py +1016 -0
- xm_slurm/experimental/parameter_controller.py +206 -0
- xm_slurm/filesystems.py +129 -0
- xm_slurm/job_blocks.py +21 -0
- xm_slurm/metadata_context.py +253 -0
- xm_slurm/packageables.py +309 -0
- xm_slurm/packaging/__init__.py +8 -0
- xm_slurm/packaging/docker.py +348 -0
- xm_slurm/packaging/registry.py +45 -0
- xm_slurm/packaging/router.py +56 -0
- xm_slurm/packaging/utils.py +22 -0
- xm_slurm/resources.py +350 -0
- xm_slurm/scripts/_cloudpickle.py +28 -0
- xm_slurm/scripts/cli.py +90 -0
- xm_slurm/status.py +197 -0
- xm_slurm/templates/docker/docker-bake.hcl.j2 +54 -0
- xm_slurm/templates/docker/mamba.Dockerfile +29 -0
- xm_slurm/templates/docker/python.Dockerfile +32 -0
- xm_slurm/templates/docker/uv.Dockerfile +38 -0
- xm_slurm/templates/slurm/entrypoint.bash.j2 +27 -0
- xm_slurm/templates/slurm/fragments/monitor.bash.j2 +78 -0
- xm_slurm/templates/slurm/fragments/proxy.bash.j2 +31 -0
- xm_slurm/templates/slurm/job-array.bash.j2 +31 -0
- xm_slurm/templates/slurm/job-group.bash.j2 +47 -0
- xm_slurm/templates/slurm/job.bash.j2 +90 -0
- xm_slurm/templates/slurm/library/retry.bash +62 -0
- xm_slurm/templates/slurm/runtimes/apptainer.bash.j2 +73 -0
- xm_slurm/templates/slurm/runtimes/podman.bash.j2 +43 -0
- xm_slurm/types.py +23 -0
- xm_slurm/utils.py +196 -0
- xmanager_slurm-0.4.19.dist-info/METADATA +28 -0
- xmanager_slurm-0.4.19.dist-info/RECORD +52 -0
- xmanager_slurm-0.4.19.dist-info/WHEEL +4 -0
- xmanager_slurm-0.4.19.dist-info/entry_points.txt +2 -0
- xmanager_slurm-0.4.19.dist-info/licenses/LICENSE.md +227 -0
xm_slurm/templates/slurm/library/retry.bash
ADDED

@@ -0,0 +1,62 @@
+# retry: rerun a command if it exits with certain codes
+# Options:
+#   -c CODE    Retry on this exit code (repeatable).
+#   -n N       Max attempts (incl. first). Default: unlimited
+#   -d SECS    Initial delay before first retry. Default: 1
+#   -b FACTOR  Integer backoff multiplier per retry. Default: 1 (no backoff)
+#   -q         Quiet (no logs)
+# Usage:
+#   retry [-c CODE ...] [-n N] [-d SECS] [-b FACTOR] [-q] -- cmd arg1 arg2 ...
+retry() {
+  local -a codes=()
+  local -i max=-1 delay=1 backoff=1 quiet=0 status
+  local opt OPTIND=1
+
+  while getopts ":c:n:d:b:q" opt; do
+    case "$opt" in
+      c) codes+=("$OPTARG") ;;
+      n) max=$OPTARG ;;
+      d) delay=$OPTARG ;;
+      b) backoff=$OPTARG ;;
+      q) quiet=1 ;;
+      :) printf 'retry: option -%s requires an argument\n' "$OPTARG" >&2; return 2 ;;
+      \?) printf 'retry: invalid option -- %s\n' "$OPTARG" >&2; return 2 ;;
+    esac
+  done
+  shift $((OPTIND-1))
+  (( $# )) || { printf 'retry: missing command\n' >&2; return 2; }
+
+  ((${#codes[@]})) || { printf 'retry: no return codes specified\n' >&2; return 2; }
+
+  for ((attempt=1; ; attempt++)); do
+    if "$@"; then  # safe with set -e (exception context)
+      return 0
+    else
+      status=$?  # capture failing status immediately
+    fi
+
+    # retryable?
+    local retryable=0 c
+    for c in "${codes[@]}"; do
+      (( status == c )) && { retryable=1; break; }
+    done
+
+    # stop if not retryable OR we've just hit the max attempt
+    if (( !retryable )) || (( max >= 0 && attempt >= max )); then
+      (( quiet )) || {
+        if (( attempt > 1 )); then
+          printf 'retry: giving up after %d attempts; last exit=%d\n' "$attempt" "$status" >&2
+        else
+          printf 'retry: command failed; exit=%d\n' "$status" >&2
+        fi
+      }
+      return "$status"  # propagate exact code; errexit will catch
+    fi
+
+    (( quiet )) || printf 'retry: attempt %d failed with %d; retrying in %ds...\n' \
+      "$attempt" "$status" "$delay" >&2
+    sleep "$delay" || :  # never trip set -e if sleep errors
+    (( delay *= backoff )) || :  # a zero product must not trip errexit either
+  done
+}
+export -f retry
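Both runtime templates below wrap their image pull/build in this helper as retry -c 255 -n 10 -d 1 -b 2 -- ..., i.e. at most 10 attempts (the first run plus up to 9 retries) on exit code 255, sleeping 1, 2, 4, ... seconds between attempts.
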
xm_slurm/templates/slurm/runtimes/apptainer.bash.j2
ADDED

@@ -0,0 +1,73 @@
+{% macro run(cluster, job) -%}
+{%- set runtime = (cluster.runtime | string | lower) -%}
+
+# Bundle will be where our built sandbox image is stored
+# container-workdir will be our container's scratch directory
+# TODO(jfarebro): We can make this more efficient by doing an srun per node and downloading the container once per node,
+# but this requires Apptainer to support an overlay per procid.
+mkdir -p "$SLURM_TMPDIR"/{container-"$SLURM_PROCID",container-workdir-"$SLURM_PROCID",container-overlay-"$SLURM_PROCID"}
+
+retry -c 255 -n 10 -d 1 -b 2 -- \
+{% if job.executable.credentials %}
+env {{ runtime | upper }}_DOCKER_USERNAME="{{ job.executable.credentials.username }}" {{ runtime | upper }}_DOCKER_PASSWORD="{{ job.executable.credentials.password }}" time {{ runtime }} build \
+{% else %}
+time {{ runtime }} build \
+{% endif %}
+    --force \
+    --sandbox \
+    --fix-perms \
+    "$SLURM_TMPDIR"/container-"$SLURM_PROCID" \
+    docker://{{ job.executable.image }}
+
+{% if runtime == "singularity" and cluster.mounts %}
+{% for source, dest in cluster.mounts.items() %}
+mkdir -p "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/{{ dest | trim('/') }}
+{% endfor %}
+{% endif %}
+
+cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/xm-slurm-entrypoint.sh
+{{ entrypoint(cluster, job) }}
+ENTRYPOINT_EOF
+chmod +x "$SLURM_TMPDIR"/container-"$SLURM_PROCID"/xm-slurm-entrypoint.sh
+
+for var in "${!SLURM_@}"; do export "{{ runtime | upper }}ENV_${var}=${!var}"; done
+
+exec {{ runtime }} exec \
+{% if job.executor.requirements.accelerator %}
+    --nv \
+{% endif %}
+    --no-init \
+    --no-umask \
+    --no-home \
+    --cleanenv \
+{% if runtime == "apptainer" %}
+    --no-eval \
+{% endif %}
+    --containall \
+{% if cluster.mounts %}
+{% for source, dest in cluster.mounts.items() %}
+    --bind {{ source }}:{{ dest }} \
+{% endfor %}
+{% endif %}
+    --workdir "$SLURM_TMPDIR"/container-workdir-"$SLURM_PROCID" \
+{% if runtime == "apptainer" %}
+    --overlay "$SLURM_TMPDIR"/container-overlay-"$SLURM_PROCID" \
+{% else %}
+    --writable \
+{% endif %}
+{% if job.executable.workdir %}
+    --pwd {{ job.executable.workdir }} \
+{% endif %}
+    "$SLURM_TMPDIR"/container-"$SLURM_PROCID" \
+    /xm-slurm-entrypoint.sh \
+{% for arg in job.executable.args.to_list() %}
+    {{ arg }} \
+{% endfor %}
+{% for arg in job.args.to_list() %}
+    {{ arg }} \
+{% endfor %}
+{% if caller %}
+    {{- caller() -}}
+{% endif %}
+    "$@"
+{%- endmacro %}
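Note the environment hand-off above: because the container starts with --cleanenv, the loop over "${!SLURM_@}" re-exports every SLURM_* variable under the APPTAINERENV_/SINGULARITYENV_ prefix, which Apptainer/Singularity strip and inject into the container. The podman template below achieves the same with --env "SLURM_*".
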
xm_slurm/templates/slurm/runtimes/podman.bash.j2
ADDED

@@ -0,0 +1,43 @@
+{% macro run(cluster, job) -%}
+retry -c 255 -n 10 -d 1 -b 2 -- \
+time podman pull \
+{% if job.executable.credentials %}
+    --creds {{ job.executable.credentials.username }}:{{ job.executable.credentials.password }} \
+{% endif %}
+    {{ job.executable.image }}
+
+cat << 'ENTRYPOINT_EOF' > "$SLURM_TMPDIR"/xm-slurm-entrypoint.sh
+{{ entrypoint(cluster, job) }}
+ENTRYPOINT_EOF
+chmod +x "$SLURM_TMPDIR"/xm-slurm-entrypoint.sh
+
+exec podman run \
+    --mount type=bind,src="$SLURM_TMPDIR"/xm-slurm-entrypoint.sh,dst=/xm-slurm-entrypoint.sh,ro \
+    --entrypoint /xm-slurm-entrypoint.sh \
+    --pull never \
+    --restart no \
+    --env "SLURM_*" \
+    --rm \
+{% if job.executor.requirements.accelerator %}
+    --device nvidia.com/gpu=all \
+{% endif %}
+{% if cluster.mounts %}
+{% for source, dest in cluster.mounts.items() %}
+    --mount type=bind,src={{ source }},dst={{ dest }} \
+{% endfor %}
+{% endif %}
+{% if job.executable.workdir %}
+    --workdir {{ job.executable.workdir }} \
+{% endif %}
+    {{ job.executable.image }} \
+{% for arg in job.executable.args.to_list() %}
+    {{ arg }} \
+{% endfor %}
+{% for arg in job.args.to_list() %}
+    {{ arg }} \
+{% endfor %}
+{% if caller %}
+    {{- caller() -}}
+{% endif %}
+    "$@"
+{% endmacro %}
xm_slurm/types.py
ADDED
@@ -0,0 +1,23 @@
+import typing as tp
+
+InstanceT_contra = tp.TypeVar("InstanceT_contra", contravariant=True)
+GetterT_co = tp.TypeVar("GetterT_co", covariant=True)
+SetterT_contra = tp.TypeVar("SetterT_contra", contravariant=True)
+
+
+class Descriptor(tp.Protocol[GetterT_co, SetterT_contra]):
+    def __set_name__(self, owner: tp.Type[tp.Any], name: str) -> None: ...
+
+    @tp.overload
+    def __get__(
+        self, instance: InstanceT_contra, owner: tp.Type[InstanceT_contra] | None = None
+    ) -> GetterT_co: ...
+
+    @tp.overload
+    def __get__(self, instance: None, owner: tp.Type[InstanceT_contra]) -> GetterT_co: ...
+
+    def __get__(
+        self, instance: InstanceT_contra | None, owner: tp.Type[InstanceT_contra] | None = None
+    ) -> GetterT_co: ...
+
+    def __set__(self, instance: tp.Any, value: SetterT_contra) -> None: ...
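For illustration, a minimal sketch of a concrete descriptor that structurally satisfies Descriptor[str, str]; the EnvVar and Job names are invented for this example and are not part of the package.

import typing as tp

from xm_slurm.types import Descriptor


class EnvVar:
    """Hypothetical data descriptor: str getter, str setter."""

    def __set_name__(self, owner: tp.Type[tp.Any], name: str) -> None:
        self._name = name

    def __get__(self, instance: tp.Any, owner: tp.Type[tp.Any] | None = None) -> str:
        if instance is None:  # class-level access
            return ""
        # The data descriptor takes precedence over the instance dict,
        # so __get__ always runs and reads the stored value back out.
        return instance.__dict__.get(self._name, "")

    def __set__(self, instance: tp.Any, value: str) -> None:
        instance.__dict__[self._name] = value


class Job:
    partition = EnvVar()


d: Descriptor[str, str] = EnvVar()  # structural match, checked statically
job = Job()
job.partition = "gpu"
assert job.partition == "gpu"
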
xm_slurm/utils.py
ADDED
@@ -0,0 +1,196 @@
+import asyncio
+import datetime as dt
+import functools
+import logging
+import os
+import pathlib
+import pty
+import re
+import select
+import shutil
+import subprocess
+import sys
+import typing as tp
+
+from xmanager import xm
+
+T = tp.TypeVar("T")
+P = tp.ParamSpec("P")
+
+logger = logging.getLogger(__name__)
+
+
+class CachedAwaitable(tp.Awaitable[T]):
+    def __init__(self, awaitable: tp.Awaitable[T]):
+        self.awaitable = awaitable
+        self.result: asyncio.Future[T] | None = None
+
+    def __await__(self):
+        if not self.result:
+            future = asyncio.get_event_loop().create_future()
+            self.result = future
+            try:
+                result = yield from self.awaitable.__await__()
+                future.set_result(result)
+            except Exception as e:
+                future.set_exception(e)
+
+        if not self.result.done():
+            yield from self.result
+        return self.result.result()
+
+
+def reawaitable(f: tp.Callable[P, tp.Awaitable[T]]) -> tp.Callable[P, CachedAwaitable[T]]:
+    @functools.wraps(f)
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> CachedAwaitable[T]:
+        return CachedAwaitable(f(*args, **kwargs))
+
+    return wrapper
+
+
+@functools.cache
+def find_project_root() -> pathlib.Path:
+    launch_script_path: pathlib.Path | None = None
+    launch_script_path = pathlib.Path(sys.argv[0])
+
+    if sys.argv[0].endswith(".py"):
+        launch_script_path = pathlib.Path(sys.argv[0]).resolve()
+    else:
+        main_file_path = getattr(sys.modules["__main__"], "__file__", None)
+        if main_file_path and os.access(main_file_path, os.R_OK):
+            launch_script_path = pathlib.Path(main_file_path).resolve()
+
+    pdir = launch_script_path.parent if launch_script_path else pathlib.Path.cwd().resolve()
+    while pdir != pdir.parent:
+        if (
+            (pdir / "pyproject.toml").exists()
+            or (pdir / "setup.py").exists()
+            or (pdir / "setup.cfg").exists()
+            or (pdir / "requirements.txt").exists()
+            or (pdir / "requirements.in").exists()
+            or (pdir / "uv.lock").exists()
+            or (pdir / ".venv").exists()
+        ):
+            return pdir
+        pdir = pdir.parent
+
+    raise RuntimeError(f"Could not find project root from {sys.argv[0]}. Please specify `context`.")
+
+
+# Cursor commands to filter out from the command data stream
+_CURSOR_ESCAPE_SEQUENCES_REGEX = re.compile(
+    rb"\x1b\[\?25[hl]"  # Matches cursor show/hide commands (CSI ?25h and CSI ?25l)
+    rb"|\x1b\[[0-9;]*[Hf]"  # Matches cursor position commands (CSI n;mH and CSI n;mf)
+    rb"|\x1b\[s"  # Matches cursor save position (CSI s)
+    rb"|\x1b\[u"  # Matches cursor restore position (CSI u)
+    rb"|\x1b\[2J"  # Matches clear screen (CSI 2J)
+    rb"|\x1b\[K"  # Matches clear line (CSI K)
+)
+
+
+def run_command(
+    args: tp.Sequence[str] | xm.SequentialArgs,
+    env: dict[str, str] | None = None,
+    tty: bool = False,
+    cwd: str | os.PathLike[str] | None = None,
+    stdin: tp.IO[tp.AnyStr] | str | None = None,
+    check: bool = False,
+    return_stdout: bool = False,
+    return_stderr: bool = False,
+) -> subprocess.CompletedProcess[str]:
+    if isinstance(args, xm.SequentialArgs):
+        args = args.to_list()
+    args = list(args)
+
+    executable = shutil.which(args[0])
+    if not executable:
+        raise RuntimeError(f"Couldn't find executable {args[0]}")
+    executable = pathlib.Path(executable)
+
+    subprocess_env = os.environ.copy() | (env if env else {})
+    if executable.name == "docker" and args[1] == "buildx":
+        subprocess_env |= {"DOCKER_CLI_EXPERIMENTAL": "enabled"}
+
+    logger.debug(f"command: {' '.join(args)}")
+
+    stdout_master, stdout_slave = pty.openpty()
+    stderr_master, stderr_slave = pty.openpty()
+
+    stdout_data, stderr_data = b"", b""
+    with subprocess.Popen(
+        executable=executable,
+        args=args,
+        shell=False,
+        text=True,
+        bufsize=0,
+        stdin=subprocess.PIPE if stdin else None,
+        stdout=stdout_slave,
+        stderr=stderr_slave,
+        start_new_session=True,
+        close_fds=True,
+        cwd=cwd,
+        env=subprocess_env,
+    ) as process:
+        os.close(stdout_slave)
+        os.close(stderr_slave)
+
+        if stdin and process.stdin:
+            process.stdin.write(stdin if isinstance(stdin, str) else tp.cast(str, stdin.read()))
+            process.stdin.close()
+
+        fds = [stdout_master, stderr_master]
+        while fds:
+            rlist, _, _ = select.select(fds, [], [])
+            for fd in rlist:
+                try:
+                    data = os.read(fd, 1024)
+                except OSError:
+                    data = None
+
+                if not data:
+                    os.close(fd)
+                    fds.remove(fd)
+                    continue
+
+                data = _CURSOR_ESCAPE_SEQUENCES_REGEX.sub(b"", data)
+
+                if fd == stdout_master:
+                    if return_stdout:
+                        stdout_data += data
+                    if tty:
+                        os.write(pty.STDOUT_FILENO, data)
+                elif fd == stderr_master:
+                    if return_stderr:
+                        stderr_data += data
+                    if tty:
+                        os.write(pty.STDERR_FILENO, data)
+                else:
+                    raise RuntimeError("Unexpected file descriptor")
+
+    stdout = stdout_data.decode(errors="replace") if stdout_data else ""
+    stderr = stderr_data.decode(errors="replace") if stderr_data else ""
+
+    logger.debug(f"return code: {process.returncode}")
+    if stdout:
+        logger.debug(f"stdout: {stdout}")
+    if stderr:
+        logger.debug(f"stderr: {stderr}")
+
+    retcode = process.poll()
+    assert retcode is not None
+
+    if check and retcode:
+        raise subprocess.CalledProcessError(retcode, process.args)
+    return subprocess.CompletedProcess(
+        process.args,
+        retcode,
+        stdout=stdout,
+        stderr=stderr,
+    )
+
+
+def timestr_from_timedelta(time: dt.timedelta) -> str:
+    days = time.days
+    hours, remainder = divmod(time.seconds, 3600)
+    minutes, seconds = divmod(remainder, 60)
+    return f"{days}-{hours:02}:{minutes:02}:{seconds:02}"
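A hedged usage sketch of reawaitable (the coroutine name below is invented): the decorated coroutine's body runs once, and every later await on the same handle returns the cached result.

import asyncio

from xm_slurm.utils import reawaitable


@reawaitable
async def remote_call() -> int:  # hypothetical example coroutine
    print("running once")
    await asyncio.sleep(0)
    return 42


async def main() -> None:
    handle = remote_call()
    assert await handle == 42
    assert await handle == 42  # body did not run again; result came from the cache


asyncio.run(main())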
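Similarly, a minimal sketch of run_command and timestr_from_timedelta, assuming a POSIX host with echo on PATH (pty is POSIX-only):

import datetime as dt

from xm_slurm.utils import run_command, timestr_from_timedelta

# Runs `echo hello` under a pseudo-terminal, mirroring output to our TTY
# and capturing it (a pty typically renders the newline as "\r\n").
result = run_command(["echo", "hello"], tty=True, return_stdout=True)
assert result.returncode == 0
assert "hello" in result.stdout

# Slurm-style D-HH:MM:SS string, e.g. for a job's time limit:
print(timestr_from_timedelta(dt.timedelta(days=1, hours=2, minutes=30)))  # 1-02:30:00
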
xmanager_slurm-0.4.19.dist-info/METADATA
ADDED

@@ -0,0 +1,28 @@
+Metadata-Version: 2.4
+Name: xmanager-slurm
+Version: 0.4.19
+Summary: Slurm backend for XManager.
+Project-URL: GitHub, https://github.com/jessefarebro/xm-slurm
+Author-email: Jesse Farebrother <jfarebro@cs.mcgill.ca>
+License: MIT
+License-File: LICENSE.md
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.10
+Requires-Dist: aiofile>=3.9.0
+Requires-Dist: asyncssh>=2.19.0
+Requires-Dist: backoff>=2.2.1
+Requires-Dist: cloudpickle>=3.0.0
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: humanize>=4.8.0
+Requires-Dist: jinja2>=3.1.2
+Requires-Dist: more-itertools>=10.2.0
+Requires-Dist: rich>=13.5.2
+Requires-Dist: toml>=0.10.2
+Requires-Dist: wrapt>=1.16.0
+Requires-Dist: xmanager>=0.5.0
xmanager_slurm-0.4.19.dist-info/RECORD
ADDED

@@ -0,0 +1,52 @@
+xm_slurm/__init__.py,sha256=VNbvBLbv5ccbPxQUpbiwgoo72qI3FrATTloevufstzY,1112
+xm_slurm/batching.py,sha256=ynbMRItxNtBn0SbkhHbrv5ugYuHeMw-7BP7a_-I6Oqg,4384
+xm_slurm/config.py,sha256=srM8_UZSweYmMdaZzQ25FWTTutZbpc5CkQThUvXxmgQ,5944
+xm_slurm/console.py,sha256=UpMqeJ0C8i0pkue1AHnnyyX0bFJ9zZeJ7HBR6yhuA8A,54
+xm_slurm/constants.py,sha256=zefVtlFdflgSolie5g_rVxWV-Zpydxapchm3y0a2FDc,999
+xm_slurm/dependencies.py,sha256=G-8vfmvSptZH6c_Ow51SwT84Dr6LI1clRj8F8wOUkiw,6421
+xm_slurm/executables.py,sha256=fGmrFBl-258bMn6ip5adYeM7xxUHAeIbDN9zD2FDGtY,6373
+xm_slurm/execution.py,sha256=htX4nZyLYexIwmFeT79Vta0SiUmcnq1YQJBc04zDyU0,37653
+xm_slurm/executors.py,sha256=27oiMwF84axeTcrcwL0f5seeLL_1j79OjiM_JZjioFs,9112
+xm_slurm/experiment.py,sha256=32FCtG9USi4rMKPFY2B4wt8_JB01VH1ozWcznCyxc50,39878
+xm_slurm/filesystems.py,sha256=4rKtq3t-KDgxJbSGt6JVyRJT_3lCN_vIKTcwKHpTo3I,4389
+xm_slurm/job_blocks.py,sha256=BFOOYgeodoGIQsB5PdC7SsOUou5aZx-1qbQ7lcqqylI,604
+xm_slurm/metadata_context.py,sha256=mksVRbVUuistL1uE7TC-fkW-Y69On52jN_svP1e1kiQ,7841
+xm_slurm/packageables.py,sha256=aEZUQpddfq4FK6h4f6kgGEI4XcOufhm68MjoDFOYR4U,12261
+xm_slurm/resources.py,sha256=G1rkZ1tBnTFpF9ZUb3Ui7NFrLYgRwe1GPW2g8Aap1Qk,12359
+xm_slurm/status.py,sha256=JIBCJPOYsmeJOQbzdACXA2vTWK7g8YWWhzpGP79e7JE,6911
+xm_slurm/types.py,sha256=TsVykDm-LazVkrjeJrTwCMs4Q8APKhy7BTk0yKIhFNg,805
+xm_slurm/utils.py,sha256=9w98HlXF0U9cKKtoB8QtGm0CnB0MnnzBARKlbbVNNpU,6211
+xm_slurm/api/__init__.py,sha256=cyao3LZ3uLftu1wIv1aN7Qvsl6gYzYpkxeehTHZ0fA8,1089
+xm_slurm/api/abc.py,sha256=-lS2OndnOuEiwNdr8ccQKkwMd1iDmKMmkBOSTvo5H5w,1816
+xm_slurm/api/models.py,sha256=_INVh0j-4-rRs0WASyg4fNB6NF1L1nUeGgQ6-XnbwsM,1610
+xm_slurm/api/sqlite/client.py,sha256=jAesCKDuYwnNcAxwJk_1b1TB8cT_QGbSjo1UE3mZjEQ,14037
+xm_slurm/api/web/client.py,sha256=uO67Y7fnQ-w__Vm_A5BEuy7Qi8wQcWk3vIsBGEBkyfk,6261
+xm_slurm/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+xm_slurm/contrib/clusters/__init__.py,sha256=lIEHCVADz2zzMnT2ecZFJMu9YpLDbhntRDTiqPM_mTo,2279
+xm_slurm/contrib/clusters/drac.py,sha256=cTgayyBLQmYL5RVwjmImO9UXnsiYftT4JDOLms_UOAg,7546
+xm_slurm/experimental/parameter_controller.py,sha256=IrFzq104LkZrhzuirit5GUZDXDvv2bBSYNMh3orsiPY,8518
+xm_slurm/packaging/__init__.py,sha256=dh307yLpUT9KN7rJ1e9fYC6hegGKfZcGboUq9nGpDVQ,233
+xm_slurm/packaging/docker.py,sha256=-DWcB9qqbeHmIEqyfF0-v6xOT25ae90u2x-QZ7kluOw,13579
+xm_slurm/packaging/registry.py,sha256=Hq56KhqsQRxgr_y1EQhcZORlnrs13xY5vDGge5WEgYU,1134
+xm_slurm/packaging/router.py,sha256=ORBbY4oNPDUXsWgEvQzBmbWq1HEFFgTN_BnRVK0nads,2140
+xm_slurm/packaging/utils.py,sha256=KlU_GGkFH1Xu5VZkAMqRilmq6SV1iLai80beEZ3UQmw,616
+xm_slurm/scripts/_cloudpickle.py,sha256=dlJYf2SceOuUn8wi-ozuoYAQg71wqD2MUVOUCyOwWIY,647
+xm_slurm/scripts/cli.py,sha256=zzsQpvkx9VThAeQPM34iDK9wAWfCVCIIvLNI12UaMhw,2577
+xm_slurm/templates/docker/docker-bake.hcl.j2,sha256=7qSJl2VN5poz-Hh8Gjo7--qR-k3lmfGtBu2mNbfG2uA,1499
+xm_slurm/templates/docker/mamba.Dockerfile,sha256=Sgxr5IA5T-pT1Shumb5k3JngoG4pgCdBXjzqslFJdZI,753
+xm_slurm/templates/docker/python.Dockerfile,sha256=U4b4QVkopckQ0o9jJIE7d_M6TvExEYlYDirNwCoZ7W4,865
+xm_slurm/templates/docker/uv.Dockerfile,sha256=L2UJMX2c8waMdrRhiqPytQe3pTBu6u5PpMhJYsKkbEg,1040
+xm_slurm/templates/slurm/entrypoint.bash.j2,sha256=MRdSVwgGrgQdpEhqfkP35IidgsblrtVXB1YWzvE9hkk,666
+xm_slurm/templates/slurm/job-array.bash.j2,sha256=7cc0nZvEcHhZoo7jXI3fJWgMcc6z5H5FmopPRaklylI,637
+xm_slurm/templates/slurm/job-group.bash.j2,sha256=9H3zfJy8RZGFf00ZQJGmMEPyWQ9YMZfvGoD4Q8hMx9Y,1244
+xm_slurm/templates/slurm/job.bash.j2,sha256=GBKY3DPCODPTtEBfuvfaZAua_ZEd5cqPrShtPGE_IpY,2174
+xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=ri5FgoKs6_bQVf5DO8SL4rJf4UsLxV34aOV-OD8VWDU,2526
+xm_slurm/templates/slurm/fragments/proxy.bash.j2,sha256=VJLglZo-Nvx9R-qe3rHTxr07CylTQ6Z9NwBzvIpAZrA,814
+xm_slurm/templates/slurm/library/retry.bash,sha256=bLe59qvfWEk17rE1wZ4EHiHba3RvR2WWZPq-kSe8RAA,2164
+xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=XxAQWLxZogL7zjn7tuzKn-DkYUJMx_HjaRzpVkz97lM,2414
+xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=8N1ZtwHyXxP-Cjo4HBPsJiZXcTvf7q2GzvW9ao8_aok,1208
+xmanager_slurm-0.4.19.dist-info/METADATA,sha256=paJPyvR8SoMbGEo2SAvGTiYK2JHXJLdbC9KEGRLqnXs,1007
+xmanager_slurm-0.4.19.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+xmanager_slurm-0.4.19.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
+xmanager_slurm-0.4.19.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
+xmanager_slurm-0.4.19.dist-info/RECORD,,