xmanager-slurm 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl
This diff shows the changes between publicly available package versions as published to their public registry, and is provided for informational purposes only.
- xm_slurm/executors.py +5 -0
- xm_slurm/templates/slurm/fragments/monitor.bash.j2 +56 -17
- xm_slurm/templates/slurm/job.bash.j2 +1 -1
- {xmanager_slurm-0.4.8.dist-info → xmanager_slurm-0.4.9.dist-info}/METADATA +1 -1
- {xmanager_slurm-0.4.8.dist-info → xmanager_slurm-0.4.9.dist-info}/RECORD +8 -8
- {xmanager_slurm-0.4.8.dist-info → xmanager_slurm-0.4.9.dist-info}/WHEEL +0 -0
- {xmanager_slurm-0.4.8.dist-info → xmanager_slurm-0.4.9.dist-info}/entry_points.txt +0 -0
- {xmanager_slurm-0.4.8.dist-info → xmanager_slurm-0.4.9.dist-info}/licenses/LICENSE.md +0 -0
xm_slurm/executors.py
CHANGED
@@ -57,8 +57,13 @@ class Slurm(xm.Executor):
 
     requeue: bool = True  # Is this job eligible for requeueing?
     requeue_on_exit_code: int = 42  # The exit code that triggers requeueing
+    requeue_on_timeout: bool = True  # Should the job requeue upon reaching its time limit minus the grace period?
     requeue_max_attempts: int = 5  # How many times to attempt requeueing
 
+    @property
+    def requeue_timeout(self) -> dt.timedelta:
+        return self.time - self.timeout_signal_grace_period
+
     def __post_init__(self) -> None:
         if not isinstance(self.time, dt.timedelta):
             raise TypeError(f"time must be a `datetime.timedelta`, got {type(self.time)}")
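For reference, the new requeue_timeout property is plain timedelta arithmetic: the watchdog deadline is the job's walltime minus the grace period reserved for the timeout signal. A minimal sketch with hypothetical values (the attribute names mirror the executor fields above):

import datetime as dt

# Hypothetical values; the names mirror the Slurm executor fields.
time = dt.timedelta(hours=4)                           # total requested walltime
timeout_signal_grace_period = dt.timedelta(minutes=5)  # reserved before the limit

requeue_timeout = time - timeout_signal_grace_period
print(requeue_timeout)                  # 3:55:00
print(requeue_timeout.seconds)          # 14100 (sub-day seconds component)
print(requeue_timeout.total_seconds())  # 14100.0 (full duration in seconds)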
xm_slurm/templates/slurm/fragments/monitor.bash.j2
CHANGED

@@ -1,4 +1,4 @@
-{% macro monitor(requeue_max_attempts, requeue_exit_code) -%}
+{% macro monitor(requeue_max_attempts, requeue_exit_code, requeue_on_timeout, requeue_timeout) -%}
 __xm_slurm_wait_for_children() {
   if [[ -n "${SLURM_ARRAY_JOB_ID:-}" ]]; then
     local -r JOB_ID="${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}"
@@ -7,30 +7,69 @@ __xm_slurm_wait_for_children() {
   fi
 
   # If there are no child jobs we should error out
-
+  children=( $(jobs -p) )
+  {% raw %}
+  if [ ${#children[@]} -eq 0 ]; then
+  {% endraw %}
     echo "ERROR: no child jobs exist..." >&2
-    exit
+    exit 1
   fi
 
-
-
-
-
-
-    local -r JOB_EXIT_CODE="${?}"
-    set -e
+  {% if requeue_on_timeout %}
+  # Start a watchdog process to signal timeout.
+  sleep {{ requeue_timeout }} &
+  timeout_pid=$!
+  {% endif %}
 
-
-
+  {% raw %}
+  while [ ${#children[@]} -gt 0 ]; do
+  {% endraw %}
+    echo "INFO: Waiting for child processes to finish..."
+    {% if requeue_on_timeout %}
+    # Wait on either one of the child processes or the timeout process.
+    wait -n -p child_pid "${children[@]}" "${timeout_pid}"
+    {% else %}
+    wait -n -p child_pid "${children[@]}"
+    {% endif %}
+    local child_exit_code=$?
+
+    {% if requeue_on_timeout %}
+    # If the finished process is the watchdog, trigger the timeout handling.
+    if [ "${child_pid}" = "${timeout_pid}" ]; then
+      echo "INFO: Timeout of {{ requeue_timeout }} seconds reached. Killing remaining processes: ${children[*]}" >&2
+      kill "${children[@]}" 2>/dev/null || true
       scontrol requeue "${JOB_ID}"
       exit {{ requeue_exit_code }}
-    elif [ "${JOB_EXIT_CODE}" -ne 0 ]; then
-      echo "ERROR: Job ${job} exited with code ${JOB_EXIT_CODE}." >&2
-      exit "${JOB_EXIT_CODE}"
-    else
-      echo "INFO: Job ${job} exited successfully." >&2
     fi
+    {% endif %}
+
+    echo "INFO: Process ${child_pid} finished with exit code ${child_exit_code}."
+
+    # Handle the exit code of the finished process.
+    if [ "${child_exit_code}" -eq "{{ requeue_exit_code }}" ] && [ "${SLURM_RESTART_COUNT:-0}" -le "{{ requeue_max_attempts }}" ]; then
+      echo "INFO: Received requeue exit code {{ requeue_exit_code }} from process ${child_pid}. Requeuing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
+      scontrol requeue "${JOB_ID}"
+      exit {{ requeue_exit_code }}
+    elif [ "${child_exit_code}" -ne 0 ]; then
+      echo "ERROR: Process ${child_pid} exited with code ${child_exit_code}." >&2
+      exit "${child_exit_code}"
+    fi
+
+    # Remove the finished PID from the array.
+    for i in "${!children[@]}"; do
+      if [ "${children[i]}" = "$child_pid" ]; then
+        unset 'children[i]'
+        break
+      fi
+    done
+
+    # Reindex the array.
+    children=( "${children[@]}" )
   done
+
+  {% if requeue_on_timeout %}
+  kill "$timeout_pid" 2>/dev/null || true
+  {% endif %}
 }
 
 __xm_slurm_wait_for_children
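The rewritten monitor is built around a watchdog pattern: a background `sleep {{ requeue_timeout }}` races the real child processes, `wait -n -p child_pid` returns whichever PID finishes first, and if the watchdog wins, the survivors are killed and the job is requeued with the requeue exit code. A minimal Python analogue of that control flow, not the shipped code (the commands and timings are illustrative):

import subprocess
import time

REQUEUE_EXIT_CODE = 42   # mirrors requeue_on_exit_code in the executor
REQUEUE_TIMEOUT = 10.0   # seconds; stands in for {{ requeue_timeout }}

# Two child processes standing in for the job's tasks.
children = [subprocess.Popen(["sleep", "3"]) for _ in range(2)]
deadline = time.monotonic() + REQUEUE_TIMEOUT

while children:
    for proc in list(children):
        code = proc.poll()
        if code is None:
            continue                # still running
        if code != 0:
            raise SystemExit(code)  # like `exit "${child_exit_code}"`
        children.remove(proc)       # like `unset 'children[i]'`
    if time.monotonic() >= deadline:
        # The watchdog fired first: kill survivors, then requeue.
        for proc in children:
            proc.kill()
        # The real template runs `scontrol requeue "${JOB_ID}"` here.
        raise SystemExit(REQUEUE_EXIT_CODE)
    time.sleep(0.1)

print("INFO: all child processes exited successfully")

The bash version avoids this polling loop entirely: `wait -n -p`, available in bash 5.1 and later, blocks until any listed PID exits and reports which one it was.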
xm_slurm/templates/slurm/job.bash.j2
CHANGED

@@ -73,7 +73,7 @@ echo "[INFO] Start timestamp: $(date)"
 
 {% block monitor -%}
 {% from 'fragments/monitor.bash.j2' import monitor %}
-{{ monitor(job.executor.requeue_max_attempts, job.executor.requeue_on_exit_code) }}
+{{ monitor(job.executor.requeue_max_attempts, job.executor.requeue_on_exit_code, job.executor.requeue_on_timeout, job.executor.requeue_timeout.seconds) }}
 {%- endblock monitor %}
 
 
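The call site now forwards the two new arguments and passes `requeue_timeout.seconds` so the template's `sleep` receives an integer. A self-contained rendering sketch (the inline fragment is a stand-in for the shipped template, not a copy of it):

import datetime as dt
import jinja2

# Stand-in fragment: just enough of the macro to show the argument flow.
fragment = jinja2.Template(
    "{% macro monitor(requeue_max_attempts, requeue_exit_code,"
    " requeue_on_timeout, requeue_timeout) -%}\n"
    "{% if requeue_on_timeout %}sleep {{ requeue_timeout }} &\n{% endif %}"
    "# requeue on exit code {{ requeue_exit_code }},"
    " up to {{ requeue_max_attempts }} attempts\n"
    "{%- endmacro -%}\n"
    "{{ monitor(5, 42, True, requeue_timeout.seconds) }}"
)
print(fragment.render(requeue_timeout=dt.timedelta(hours=3, minutes=55)))
# sleep 14100 &
# # requeue on exit code 42, up to 5 attempts

Note that `timedelta.seconds` is the sub-day component: it equals `total_seconds()` only for limits under 24 hours, which is worth keeping in mind for multi-day allocations.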
{xmanager_slurm-0.4.8.dist-info → xmanager_slurm-0.4.9.dist-info}/RECORD
CHANGED

@@ -6,7 +6,7 @@ xm_slurm/constants.py,sha256=zefVtlFdflgSolie5g_rVxWV-Zpydxapchm3y0a2FDc,999
 xm_slurm/dependencies.py,sha256=-5gN_tpfs3dOA7H5_MIHO2ratb7F5Pm_yjkR5rZcgI8,6421
 xm_slurm/executables.py,sha256=fGmrFBl-258bMn6ip5adYeM7xxUHAeIbDN9zD2FDGtY,6373
 xm_slurm/execution.py,sha256=c0aV1h2tKQFyAGM6JLd16MWFgpRLKAbcutZz17xPUSw,31400
-xm_slurm/executors.py,sha256=
+xm_slurm/executors.py,sha256=bUgKcgtvf-nPGjcuHRzUAqD1r3_vwea_h-Y9MAB-Kqo,4887
 xm_slurm/experiment.py,sha256=94r0mhtUPUzw4eaUEz0kpsufC25wEGqlDhV4Fcr1ukY,39883
 xm_slurm/filesystem.py,sha256=4rKtq3t-KDgxJbSGt6JVyRJT_3lCN_vIKTcwKHpTo3I,4389
 xm_slurm/job_blocks.py,sha256=_F8CKCs5BQFj40a2-mjG71HfacvWoBXBDPDKEaKTbXc,616
@@ -39,13 +39,13 @@ xm_slurm/templates/docker/uv.Dockerfile,sha256=L2UJMX2c8waMdrRhiqPytQe3pTBu6u5Pp
 xm_slurm/templates/slurm/entrypoint.bash.j2,sha256=MRdSVwgGrgQdpEhqfkP35IidgsblrtVXB1YWzvE9hkk,666
 xm_slurm/templates/slurm/job-array.bash.j2,sha256=smxmSSzBEUHm6MJF-nYPVVjK6CLKrb1fRxF_tfrzAX8,552
 xm_slurm/templates/slurm/job-group.bash.j2,sha256=Cp8YhNOxYqaOkl4MFjQlcaLMGZwdDh97m8OGT5RWbAo,1101
-xm_slurm/templates/slurm/job.bash.j2,sha256=
-xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=
+xm_slurm/templates/slurm/job.bash.j2,sha256=pNKir1tkmRTGDiGxlQ3DkUaW9Zos_gdkkXJC_xX5Cxo,1985
+xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=BJ1brSjhESOe9VX_OYaPyy9-qE3uiFlzxp8ZkFcTw8Y,2504
 xm_slurm/templates/slurm/fragments/proxy.bash.j2,sha256=VJLglZo-Nvx9R-qe3rHTxr07CylTQ6Z9NwBzvIpAZrA,814
 xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=lE2EWVCK2O-n08RL4_MJYIikVTvODjcYKuv7Eh73Q2w,1932
 xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=3j7K5eyXt_WhXK0EoMlxnhlmFVJ2JyxRKbsMRaDqzSs,1148
-xmanager_slurm-0.4.
-xmanager_slurm-0.4.
-xmanager_slurm-0.4.
-xmanager_slurm-0.4.
-xmanager_slurm-0.4.
+xmanager_slurm-0.4.9.dist-info/METADATA,sha256=WWPRzVrTsK5t8kD732EIejSlNgQ8KP01Ln7eM1Mj4e4,1042
+xmanager_slurm-0.4.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+xmanager_slurm-0.4.9.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
+xmanager_slurm-0.4.9.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
+xmanager_slurm-0.4.9.dist-info/RECORD,,
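Each RECORD row has the form `path,sha256=<digest>,size`: the digest is the file's SHA-256, urlsafe-base64-encoded with the trailing padding stripped, followed by the size in bytes. A small sketch for reproducing an entry from an unpacked wheel (the path is illustrative):

import base64
import hashlib

def record_hash(path: str) -> str:
    """RECORD-style hash: sha256, urlsafe base64, padding stripped."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# Against an unpacked 0.4.9 wheel, record_hash("xm_slurm/executors.py")
# should reproduce the sha256 value recorded above.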
{xmanager_slurm-0.4.8.dist-info → xmanager_slurm-0.4.9.dist-info}/WHEEL
File without changes

{xmanager_slurm-0.4.8.dist-info → xmanager_slurm-0.4.9.dist-info}/entry_points.txt
File without changes

{xmanager_slurm-0.4.8.dist-info → xmanager_slurm-0.4.9.dist-info}/licenses/LICENSE.md
File without changes