xmanager-slurm 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xmanager-slurm might be problematic. Click here for more details.

xm_slurm/executors.py CHANGED
@@ -57,8 +57,13 @@ class Slurm(xm.Executor):
57
57
 
58
58
  requeue: bool = True # Is this job eligible for requeueing?
59
59
  requeue_on_exit_code: int = 42 # The exit code that triggers requeueing
60
+ requeue_on_timeout: bool = True # Should the job requeue once it has run for its time limit minus the timeout signal grace period?
60
61
  requeue_max_attempts: int = 5 # How many times to attempt requeueing
61
62
 
63
+ @property
64
+ def requeue_timeout(self) -> dt.timedelta:
65
+ return self.time - self.timeout_signal_grace_period
66
+
62
67
  def __post_init__(self) -> None:
63
68
  if not isinstance(self.time, dt.timedelta):
64
69
  raise TypeError(f"time must be a `datetime.timedelta`, got {type(self.time)}")
@@ -1,4 +1,4 @@
1
- {% macro monitor(requeue_max_attempts, requeue_exit_code) -%}
1
+ {% macro monitor(requeue_max_attempts, requeue_exit_code, requeue_on_timeout, requeue_timeout) -%}
2
2
  __xm_slurm_wait_for_children() {
3
3
  if [[ -n "${SLURM_ARRAY_JOB_ID:-}" ]]; then
4
4
  local -r JOB_ID="${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}"
@@ -7,30 +7,69 @@ __xm_slurm_wait_for_children() {
7
7
  fi
8
8
 
9
9
  # If there are no child jobs we should error out
10
- if [ -z "$(jobs -p)" ]; then
10
+ children=( $(jobs -p) )
11
+ {% raw %}
12
+ if [ ${#children[@]} -eq 0 ]; then
13
+ {% endraw %}
11
14
  echo "ERROR: no child jobs exist..." >&2
12
- exit -1
15
+ exit 1
13
16
  fi
14
17
 
15
- # Loop through all job IDs in the background job list and wait for them to finish
16
- for job in "$(jobs -p)"; do
17
- echo "INFO: Waiting for job ${job} to finish..."
18
- set +e
19
- wait "${job}"
20
- local -r JOB_EXIT_CODE="${?}"
21
- set -e
18
+ {% if requeue_on_timeout %}
19
+ # Start a watchdog process to signal timeout.
20
+ sleep {{ requeue_timeout }} &
21
+ timeout_pid=$!
22
+ {% endif %}
22
23
 
23
- if [ "${JOB_EXIT_CODE}" -eq "{{ requeue_exit_code }}" ] && [ "${SLURM_RESTART_COUNT-0}" -le "{{ requeue_max_attempts }}" ]; then
24
- echo "INFO: Received requeue exit code {{ requeue_exit_code }} from job ${job}. Requeing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
24
+ {% raw %}
25
+ while [ ${#children[@]} -gt 0 ]; do
26
+ {% endraw %}
27
+ echo "INFO: Waiting for child processes to finish..."
28
+ {% if requeue_on_timeout %}
29
+ # Wait on either one of the child processes or the timeout process.
30
+ wait -n -p child_pid "${children[@]}" "${timeout_pid}"
31
+ {% else %}
32
+ wait -n -p child_pid "${children[@]}"
33
+ {% endif %}
34
+ local child_exit_code=$?
35
+
36
+ {% if requeue_on_timeout %}
37
+ # If the finished process is the watchdog, trigger the timeout handling.
38
+ if [ "${child_pid}" = "${timeout_pid}" ]; then
39
+ echo "INFO: Timeout of {{ requeue_timeout }} seconds reached. Killing remaining processes: ${children[*]}" >&2
40
+ kill "${children[@]}" 2>/dev/null || true
25
41
  scontrol requeue "${JOB_ID}"
26
42
  exit {{ requeue_exit_code }}
27
- elif [ "${JOB_EXIT_CODE}" -ne 0 ]; then
28
- echo "ERROR: Job ${job} exited with code ${JOB_EXIT_CODE}." >&2
29
- exit "${JOB_EXIT_CODE}"
30
- else
31
- echo "INFO: Job ${job} exited successfully." >&2
32
43
  fi
44
+ {% endif %}
45
+
46
+ echo "INFO: Process ${child_pid} finished with exit code ${child_exit_code}."
47
+
48
+ # Handle the exit code of the finished process.
49
+ if [ "${child_exit_code}" -eq "{{ requeue_exit_code }}" ] && [ "${SLURM_RESTART_COUNT:-0}" -le "{{ requeue_max_attempts }}" ]; then
50
+ echo "INFO: Received requeue exit code {{ requeue_exit_code }} from process ${child_pid}. Requeuing Slurm job ${JOB_ID} after ${SLURM_RESTART_COUNT-0} restarts." >&2
51
+ scontrol requeue "${JOB_ID}"
52
+ exit {{ requeue_exit_code }}
53
+ elif [ "${child_exit_code}" -ne 0 ]; then
54
+ echo "ERROR: Process ${child_pid} exited with code ${child_exit_code}." >&2
55
+ exit "${child_exit_code}"
56
+ fi
57
+
58
+ # Remove the finished PID from the array of children still being waited on.
59
+ for i in "${!children[@]}"; do
60
+ if [ "${children[i]}" = "$child_pid" ]; then
61
+ unset 'children[i]'
62
+ break
63
+ fi
64
+ done
65
+
66
+ # Reindex the array.
67
+ children=( "${children[@]}" )
33
68
  done
69
+
70
+ {% if requeue_on_timeout %}
71
+ kill "$timeout_pid" 2>/dev/null || true
72
+ {% endif %}
34
73
  }
35
74
 
36
75
  __xm_slurm_wait_for_children
@@ -73,7 +73,7 @@ echo "[INFO] Start timestamp: $(date)"
73
73
 
74
74
  {% block monitor -%}
75
75
  {% from 'fragments/monitor.bash.j2' import monitor %}
76
- {{ monitor(job.executor.requeue_max_attempts, job.executor.requeue_on_exit_code) }}
76
+ {#- Pass the watchdog timeout as the FULL duration in whole seconds. `timedelta.seconds` is only the seconds component (0-86399) and drops whole days, so multi-day time limits would yield a far too short (possibly zero) sleep. -#}
+ {{ monitor(job.executor.requeue_max_attempts, job.executor.requeue_on_exit_code, job.executor.requeue_on_timeout, job.executor.requeue_timeout.total_seconds() | int) }}
77
77
  {%- endblock monitor %}
78
78
 
79
79
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xmanager-slurm
3
- Version: 0.4.8
3
+ Version: 0.4.9
4
4
  Summary: Slurm backend for XManager.
5
5
  Project-URL: GitHub, https://github.com/jessefarebro/xm-slurm
6
6
  Author-email: Jesse Farebrother <jfarebro@cs.mcgill.ca>
@@ -6,7 +6,7 @@ xm_slurm/constants.py,sha256=zefVtlFdflgSolie5g_rVxWV-Zpydxapchm3y0a2FDc,999
6
6
  xm_slurm/dependencies.py,sha256=-5gN_tpfs3dOA7H5_MIHO2ratb7F5Pm_yjkR5rZcgI8,6421
7
7
  xm_slurm/executables.py,sha256=fGmrFBl-258bMn6ip5adYeM7xxUHAeIbDN9zD2FDGtY,6373
8
8
  xm_slurm/execution.py,sha256=c0aV1h2tKQFyAGM6JLd16MWFgpRLKAbcutZz17xPUSw,31400
9
- xm_slurm/executors.py,sha256=fMtxGUCi4vEKmb_p4JEpqPUTh7L_f1LcR_TamMLAWNg,4667
9
+ xm_slurm/executors.py,sha256=bUgKcgtvf-nPGjcuHRzUAqD1r3_vwea_h-Y9MAB-Kqo,4887
10
10
  xm_slurm/experiment.py,sha256=94r0mhtUPUzw4eaUEz0kpsufC25wEGqlDhV4Fcr1ukY,39883
11
11
  xm_slurm/filesystem.py,sha256=4rKtq3t-KDgxJbSGt6JVyRJT_3lCN_vIKTcwKHpTo3I,4389
12
12
  xm_slurm/job_blocks.py,sha256=_F8CKCs5BQFj40a2-mjG71HfacvWoBXBDPDKEaKTbXc,616
@@ -39,13 +39,13 @@ xm_slurm/templates/docker/uv.Dockerfile,sha256=L2UJMX2c8waMdrRhiqPytQe3pTBu6u5Pp
39
39
  xm_slurm/templates/slurm/entrypoint.bash.j2,sha256=MRdSVwgGrgQdpEhqfkP35IidgsblrtVXB1YWzvE9hkk,666
40
40
  xm_slurm/templates/slurm/job-array.bash.j2,sha256=smxmSSzBEUHm6MJF-nYPVVjK6CLKrb1fRxF_tfrzAX8,552
41
41
  xm_slurm/templates/slurm/job-group.bash.j2,sha256=Cp8YhNOxYqaOkl4MFjQlcaLMGZwdDh97m8OGT5RWbAo,1101
42
- xm_slurm/templates/slurm/job.bash.j2,sha256=d35VYHdAKkgVK8s4XnUDJwQR0gLnDWRJu-Ldz-qALmQ,1914
43
- xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=HYqYhXsTv8TCed5UaGCZVGIYsqxSKHcnPyNNTHWNvxc,1279
42
+ xm_slurm/templates/slurm/job.bash.j2,sha256=pNKir1tkmRTGDiGxlQ3DkUaW9Zos_gdkkXJC_xX5Cxo,1985
43
+ xm_slurm/templates/slurm/fragments/monitor.bash.j2,sha256=BJ1brSjhESOe9VX_OYaPyy9-qE3uiFlzxp8ZkFcTw8Y,2504
44
44
  xm_slurm/templates/slurm/fragments/proxy.bash.j2,sha256=VJLglZo-Nvx9R-qe3rHTxr07CylTQ6Z9NwBzvIpAZrA,814
45
45
  xm_slurm/templates/slurm/runtimes/apptainer.bash.j2,sha256=lE2EWVCK2O-n08RL4_MJYIikVTvODjcYKuv7Eh73Q2w,1932
46
46
  xm_slurm/templates/slurm/runtimes/podman.bash.j2,sha256=3j7K5eyXt_WhXK0EoMlxnhlmFVJ2JyxRKbsMRaDqzSs,1148
47
- xmanager_slurm-0.4.8.dist-info/METADATA,sha256=8QQ9xbptuObTCHB8WaYr0rSKOgxK5ojCaeD7mN9Qvl0,1042
48
- xmanager_slurm-0.4.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
49
- xmanager_slurm-0.4.8.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
50
- xmanager_slurm-0.4.8.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
51
- xmanager_slurm-0.4.8.dist-info/RECORD,,
47
+ xmanager_slurm-0.4.9.dist-info/METADATA,sha256=WWPRzVrTsK5t8kD732EIejSlNgQ8KP01Ln7eM1Mj4e4,1042
48
+ xmanager_slurm-0.4.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
49
+ xmanager_slurm-0.4.9.dist-info/entry_points.txt,sha256=_HLGmLgxuQLOPmF2gOFYDVq2HqtMVD_SzigHvUh8TCY,49
50
+ xmanager_slurm-0.4.9.dist-info/licenses/LICENSE.md,sha256=IxstXr3MPHwTJ5jMrByHrQsR1ZAGQ2U_uz_4qzI_15Y,11756
51
+ xmanager_slurm-0.4.9.dist-info/RECORD,,