stepup-queue 1.0.6__tar.gz → 1.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.6
3
+ Version: 1.0.7
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -24,7 +24,7 @@ Classifier: Topic :: Software Development :: Build Tools
24
24
  Requires-Python: >=3.11
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
- Requires-Dist: stepup<4.0.0,>=3.1.3
27
+ Requires-Dist: stepup<4.0.0,>=3.1.4
28
28
  Provides-Extra: dev
29
29
  Requires-Dist: psutil; extra == "dev"
30
30
  Requires-Dist: pytest; extra == "dev"
@@ -28,7 +28,7 @@ classifiers = [
28
28
  ]
29
29
  dependencies = [
30
30
  # Ensure changes to these dependencies are reflected in .github/requirements-old.txt
31
- "stepup>=3.1.3,<4.0.0",
31
+ "stepup>=3.1.4,<4.0.0",
32
32
  ]
33
33
  dynamic = ["version"]
34
34
 
@@ -49,6 +49,9 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
49
49
  # Cancel running job (if any), clean log and resubmit
50
50
  path_log = Path("slurmjob.log")
51
51
  job_id, cluster = read_jobid_cluster(path_log)
52
- work_thread.runsh(f"scancel -M {cluster} {job_id}")
52
+ if cluster is None:
53
+ work_thread.runsh(f"scancel {job_id}")
54
+ else:
55
+ work_thread.runsh(f"scancel -M {cluster} {job_id}")
53
56
  path_log.remove_p()
54
57
  return submit_once_and_wait(work_thread, args.ext, args.rc, args.onchange != "ignore")
@@ -20,16 +20,17 @@
20
20
  """Tool to cancel jobs."""
21
21
 
22
22
  import argparse
23
- import os
23
+ import subprocess
24
24
 
25
25
  from path import Path
26
26
 
27
- from .sbatch import FIRST_LINE
27
+ from .sbatch import FIRST_LINE, parse_sbatch
28
28
 
29
29
 
30
30
  def canceljobs_tool(args: argparse.Namespace) -> int:
31
31
  if len(args.paths) == 0:
32
32
  args.paths = [Path(".")]
33
+
33
34
  # Iterate over all slurmjob.log files in the specified directories, and kill them.
34
35
  job_ids = {}
35
36
  for path in args.paths:
@@ -39,18 +40,42 @@ def canceljobs_tool(args: argparse.Namespace) -> int:
39
40
  if not path.is_dir():
40
41
  print(f"Path {path} is not a directory.")
41
42
  continue
42
- for job_log in path.glob("**/slurmjob.log"):
43
- job_id, cluster = read_jobid_cluster(job_log)
44
- print(f"Found job {job_id} on cluster {cluster} in {job_log}")
45
- job_ids.setdefault(cluster, []).append(job_id)
46
- # Cancel 100 at a time to avoid exceeding the command line length limit.
43
+ print(f"Searching recursively in {path}")
44
+ paths_log = list(path.glob("**/slurmjob.log"))
45
+ if (path / "slurmjob.log").is_file():
46
+ paths_log.append(path / "slurmjob.log")
47
+ for job_log in paths_log:
48
+ try:
49
+ job_id, cluster = read_jobid_cluster(job_log)
50
+ msg = f"Found job {job_id} in {job_log}"
51
+ if cluster is not None:
52
+ msg += f" on cluster {cluster}"
53
+ print(msg)
54
+ job_ids.setdefault(cluster, []).append(job_id)
55
+ except ValueError as e:
56
+ print(f"Warning: Could not read job ID from {job_log}: {e}")
57
+ continue
58
+
59
+ returncode = 0
60
+ # Cancel at most 100 at a time to avoid exceeding the command line length limit,
61
+ # and to play nice with SLURM.
47
62
  for cluster, cluster_job_ids in job_ids.items():
48
63
  while len(cluster_job_ids) > 0:
49
- command = f"scancel -M {cluster} " + " ".join(cluster_job_ids[:100])
50
- print(command)
51
- os.system(command)
64
+ cancel_ids = cluster_job_ids[:100]
52
65
  cluster_job_ids[:] = cluster_job_ids[100:]
53
66
 
67
+ command_args = ["scancel"]
68
+ if cluster is not None:
69
+ command_args.extend(["-M", cluster])
70
+ command_args.extend(str(job_id) for job_id in cancel_ids)
71
+
72
+ # Using subprocess.run for better control and error handling
73
+ print(f"Executing: {' '.join(command_args)}")
74
+ result = subprocess.run(command_args, check=False)
75
+ if result.returncode != 0:
76
+ returncode = 1
77
+ return returncode
78
+
54
79
 
55
80
  def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
56
81
  """Read the job ID and cluster from the job log file."""
@@ -58,8 +83,7 @@ def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
58
83
  lines = f.readlines()
59
84
  if len(lines) < 3 or lines[0][:-1] != FIRST_LINE:
60
85
  raise ValueError(f"Invalid first line in {job_log}.")
61
- job_id, cluster = lines[2].split()[-1].split(";")
62
- return job_id, cluster
86
+ return parse_sbatch(lines[2].split()[-1])
63
87
 
64
88
 
65
89
  def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
@@ -28,15 +28,17 @@ from datetime import datetime
28
28
 
29
29
  from path import Path
30
30
 
31
- from stepup.core.utils import string_to_bool
32
31
  from stepup.core.worker import WorkThread
33
32
 
34
33
  FIRST_LINE = "StepUp Queue sbatch wait log format version 2"
35
- SCONTROL_FAILED = "The command `scontrol show job` failed!\n"
36
- DEBUG = string_to_bool(os.getenv("STEPUP_SBATCH_DEBUG", "0"))
34
+ SBATCH_RETRY_NUM = int(os.getenv("STEPUP_SBATCH_RETRY_NUM", "5"))
35
+ SBATCH_RETRY_DELAY_MIN = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MIN", "60"))
36
+ SBATCH_RETRY_DELAY_MAX = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MAX", "120"))
37
37
  CACHE_TIMEOUT = int(os.getenv("STEPUP_SBATCH_CACHE_TIMEOUT", "30"))
38
- POLLING_INTERVAL = int(os.getenv("STEPUP_SBATCH_POLLING_INTERVAL", "10"))
39
- TIME_MARGIN = int(os.getenv("STEPUP_SBATCH_TIME_MARGIN", "15"))
38
+ POLLING_MIN = int(os.getenv("STEPUP_SBATCH_POLLING_MIN", "10"))
39
+ POLLING_MAX = max(int(os.getenv("STEPUP_SBATCH_POLLING_MAX", "20")), POLLING_MIN)
40
+ SACCT_START = os.getenv("STEPUP_SACCT_START_TIME", "now-7days")
41
+ UNLISTED_TIMEOUT = int(os.getenv("STEPUP_SBATCH_UNLISTED_TIMEOUT", "600"))
40
42
 
41
43
 
42
44
  def submit_once_and_wait(
@@ -68,11 +70,7 @@ def submit_once_and_wait(
68
70
  """
69
71
  # Read previously logged steps
70
72
  path_log = Path("slurmjob.log")
71
- if path_log.is_file():
72
- previous_lines = read_log(path_log, validate_inp_digest)
73
- else:
74
- previous_lines = []
75
- _init_log(path_log)
73
+ previous_lines = read_log(path_log, validate_inp_digest) if path_log.is_file() else []
76
74
 
77
75
  # Go through or skip steps.
78
76
  submit_time, status = read_step(previous_lines)
@@ -80,6 +78,8 @@ def submit_once_and_wait(
80
78
  # A new job must be submitted.
81
79
  submit_time = time.time()
82
80
  sbatch_stdout = submit_job(work_thread, job_ext, sbatch_rc)
81
+ # Create a new log file after submitting the job.
82
+ _init_log(path_log)
83
83
  log_step(path_log, f"Submitted {sbatch_stdout}")
84
84
  rndsleep()
85
85
  else:
@@ -103,12 +103,17 @@ def submit_once_and_wait(
103
103
  work_thread, submit_time, jobid, cluster, previous_lines, path_log, status
104
104
  )
105
105
 
106
- # Get the return code from the job
107
- with open("slurmjob.ret") as fh:
108
- returncode = fh.read().strip()
109
- if returncode == "":
110
- raise ValueError("The job did not return a return code, e.g. because it was cancelled.")
111
- return int(returncode)
106
+ if status == "COMPLETED":
107
+ # Get the return code from the job
108
+ with open("slurmjob.ret") as fh:
109
+ returncode = fh.read().strip()
110
+ try:
111
+ return int(returncode)
112
+ except ValueError as exc:
113
+ raise ValueError(
114
+ f"Could not parse return code from slurmjob.ret. Got '{returncode}'"
115
+ ) from exc
116
+ raise RuntimeError(f"Job ended with status '{status}'.")
112
117
 
113
118
 
114
119
  def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
@@ -141,6 +146,50 @@ def _init_log(path_log: str):
141
146
  print(inp_digest, file=fh)
142
147
 
143
148
 
149
+ # From: https://slurm.schedmd.com/job_state_codes.html
150
+ KNOWN_JOB_STATES = [
151
+ # -- Job states
152
+ # done
153
+ "BOOT_FAIL",
154
+ "CANCELLED",
155
+ "COMPLETED",
156
+ "DEADLINE",
157
+ "FAILED",
158
+ "NODE_FAIL",
159
+ "OUT_OF_MEMORY",
160
+ "PREEMPTED",
161
+ "TIMEOUT",
162
+ # waiting or running
163
+ "PENDING",
164
+ "RUNNING",
165
+ "SUSPENDED",
166
+ # -- Job flags
167
+ # done
168
+ "LAUNCH_FAILED",
169
+ "RECONFIG_FAIL",
170
+ "REVOKED",
171
+ "STOPPED",
172
+ # waiting or running
173
+ "COMPLETING",
174
+ "CONFIGURING",
175
+ "EXPEDITING",
176
+ "POWER_UP_NODE",
177
+ "REQUEUED",
178
+ "REQUEUE_FED",
179
+ "REQUEUE_HOLD",
180
+ "RESIZING",
181
+ "RESV_DEL_HOLD",
182
+ "SIGNALING",
183
+ "SPECIAL_EXIT",
184
+ "STAGE_OUT",
185
+ "UPDATE_DB",
186
+ # -- Specific to this script
187
+ # to be ignored (same as waiting or running), must not be logged
188
+ "invalid",
189
+ "unlisted",
190
+ ]
191
+
192
+
144
193
  def _read_or_poll_status(
145
194
  work_thread: WorkThread,
146
195
  submit_time: float,
@@ -155,7 +204,7 @@ def _read_or_poll_status(
155
204
  Parameters
156
205
  ----------
157
206
  work_thread
158
- The work thread to use for launching the scontrol command.
207
+ The work thread to use for launching the sacct command.
159
208
  submit_time
160
209
  The timestamp when the job was submitted.
161
210
  jobid
@@ -165,7 +214,6 @@ def _read_or_poll_status(
165
214
  previous_lines
166
215
  Lines from an existing log file to be processed first.
167
216
  (It will be gradually emptied.)
168
- path_log
169
217
  The log file to write new polling results to.
170
218
  last_status
171
219
  The status from the previous iteration.
@@ -179,19 +227,40 @@ def _read_or_poll_status(
179
227
  True when the waiting is over.
180
228
  """
181
229
  # First try to replay previously logged steps
182
- status_time, status = read_step(previous_lines)
230
+ _, status = read_step(previous_lines)
183
231
  if status is None:
184
232
  # All previously logged steps are processed.
185
- # Call scontrol and parse its response.
233
+ # Call sacct and parse its response.
186
234
  rndsleep()
187
- status_time, status = get_status(work_thread, jobid, cluster)
235
+ _, status = get_status(work_thread, jobid, cluster)
188
236
  # Log only if the status changed, and is not invalid or unlisted.
189
237
  # These two statuses are (potentially) transient and should not be logged.
190
238
  if status != last_status and status not in ["invalid", "unlisted"]:
191
239
  log_step(path_log, status)
192
- done = (status_time > submit_time + TIME_MARGIN) and (
193
- status not in ["PENDING", "CONFIGURING", "RUNNING", "invalid"]
194
- )
240
+ if status not in KNOWN_JOB_STATES:
241
+ raise ValueError(f"Unknown job status '{status}' obtained from scheduler.")
242
+
243
+ # Determine if the job is done
244
+ done = status in [
245
+ "BOOT_FAIL",
246
+ "CANCELLED",
247
+ "COMPLETED",
248
+ "DEADLINE",
249
+ "FAILED",
250
+ "NODE_FAIL",
251
+ "OUT_OF_MEMORY",
252
+ "PREEMPTED",
253
+ "TIMEOUT",
254
+ "LAUNCH_FAILED",
255
+ "RECONFIG_FAIL",
256
+ "REVOKED",
257
+ "STOPPED",
258
+ ]
259
+ if status == "unlisted" and time.time() > submit_time + UNLISTED_TIMEOUT:
260
+ # If the job remains unlisted for too long, we declare it failed.
261
+ # This prevents an infinite loop if the job ID was wrong or purged.
262
+ done = True
263
+
195
264
  return status, done
196
265
 
197
266
 
@@ -232,7 +301,7 @@ def read_step(lines: list[str]) -> str | None:
232
301
 
233
302
  def rndsleep():
234
303
  """Randomized sleep to distribute I/O load evenly."""
235
- sleep_seconds = 1 if DEBUG else random.randint(POLLING_INTERVAL, POLLING_INTERVAL + TIME_MARGIN)
304
+ sleep_seconds = random.randint(POLLING_MIN, POLLING_MAX)
236
305
  time.sleep(sleep_seconds)
237
306
 
238
307
 
@@ -241,36 +310,56 @@ JOB_SCRIPT_WRAPPER = """\
241
310
  {sbatch_header}
242
311
 
243
312
  touch slurmjob.ret
244
- chmod +x '{job_script}'
245
313
  ./'{job_script}'
246
314
  RETURN_CODE=$?
247
315
  echo $RETURN_CODE > slurmjob.ret
248
316
  exit $RETURN_CODE
249
317
  """
250
318
 
319
+ RE_SBATCH_STDOUT = re.compile(r"#\s*SBATCH\b.*(--output|-o)")
320
+ RE_SBATCH_STDERR = re.compile(r"#\s*SBATCH\b.*(--error|-e)")
321
+ RE_SBATCH_ARRAY = re.compile(r"#\s*SBATCH\b.*(--array|-a)")
322
+ RE_SBATCH = re.compile(r"#\s*SBATCH\b")
323
+
251
324
 
252
325
  def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None) -> str:
253
326
  """Submit a job with sbatch."""
254
- # Copy the #SBATCH lines from the job script.
327
+ # Verify that the job script is executable.
255
328
  path_job = f"slurmjob{job_ext}"
329
+ if not os.access(path_job, os.X_OK):
330
+ raise ValueError("The job script must be executable.")
331
+
332
+ # Copy the #SBATCH lines from the job script and perform some checks.
256
333
  with open(path_job) as f:
257
- sbatch_header = "\n".join(line for line in f if line.startswith("#SBATCH"))
334
+ sbatch_header = []
335
+ first_line = next(f)
336
+ if not first_line.startswith("#!"):
337
+ raise ValueError("The job script must start with a shebang line.")
338
+ for line in f:
339
+ if RE_SBATCH_STDOUT.match(line):
340
+ raise ValueError("The job script must not contain a #SBATCH --output/-o line.")
341
+ if RE_SBATCH_STDERR.match(line):
342
+ raise ValueError("The job script must not contain a #SBATCH --error/-e line.")
343
+ if RE_SBATCH_ARRAY.match(line):
344
+ raise ValueError("StepUp Queue does not support array jobs. (Found -a or --array)")
345
+ if RE_SBATCH.match(line):
346
+ sbatch_header.append(line.strip())
347
+ sbatch_header = "\n".join(sbatch_header)
258
348
 
259
349
  command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
260
350
  if sbatch_rc is not None:
261
351
  command = f"{sbatch_rc} < /dev/null && {command}"
262
- returncode, stdout, stderr = work_thread.runsh(
263
- command,
264
- stdin=JOB_SCRIPT_WRAPPER.format(
265
- sbatch_header=sbatch_header,
266
- job_script=path_job,
267
- ),
268
- )
269
- if returncode != 0:
352
+ stdin = JOB_SCRIPT_WRAPPER.format(sbatch_header=sbatch_header, job_script=path_job)
353
+ for _ in range(SBATCH_RETRY_NUM):
354
+ returncode, stdout, stderr = work_thread.runsh(command, stdin=stdin)
355
+ if returncode == 0:
356
+ return stdout.strip()
270
357
  if not (stderr is None or stderr == ""):
271
358
  print(stderr)
272
- raise RuntimeError(f"sbatch failed with return code {returncode}.")
273
- return stdout.strip()
359
+ delay = random.randint(SBATCH_RETRY_DELAY_MIN, SBATCH_RETRY_DELAY_MAX)
360
+ print(f"sbatch failed with return code {returncode}. Retrying in {delay} seconds.")
361
+ time.sleep(delay)
362
+ raise RuntimeError(f"sbatch failed {SBATCH_RETRY_NUM} times. Giving up.")
274
363
 
275
364
 
276
365
  def log_step(path_log: Path, step: str):
@@ -292,12 +381,12 @@ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
292
381
 
293
382
 
294
383
  def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
295
- """Load cached scontrol output or run scontrol if outdated.
384
+ """Load cached sacct output or run sacct if outdated.
296
385
 
297
386
  Parameters
298
387
  ----------
299
388
  work_thread
300
- The work thread to use for launching the scontrol command.
389
+ The work thread to use for launching the sacct command.
301
390
  jobid
302
391
  The job to wait for.
303
392
  cluster
@@ -306,24 +395,22 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
306
395
  Returns
307
396
  -------
308
397
  status
309
- A status reported by scontrol,
310
- or `invalid` if scontrol failed (retry scontrol later),
398
+ A status reported by sacct,
399
+ or `invalid` if sacct failed (retry sacct later),
311
400
  or `unlisted` if the job is not found (probably ended long ago).
312
401
  """
313
402
  # Load cached output or run again
314
- command = "scontrol show job"
315
- path_out = Path(os.getenv("HOME")) / ".cache/stepup-queue"
403
+ command = f"sacct -o 'jobid,state' -PXn -S {SACCT_START}"
404
+ path_out = Path(os.getenv("ROOT")) / ".stepup/queue"
316
405
  if cluster is None:
317
- path_out /= "sbatch_wait.out"
406
+ path_out /= "sbatch_wait_sacct.out"
318
407
  else:
319
408
  command += f" --cluster={cluster}"
320
- path_out /= f"sbatch_wait.{cluster}.out"
321
- status_time, scontrol_out, returncode = cached_run(
322
- work_thread, command, path_out, CACHE_TIMEOUT
323
- )
409
+ path_out /= f"sbatch_wait_sacct.{cluster}.out"
410
+ status_time, sacct_out, returncode = cached_run(work_thread, command, path_out, CACHE_TIMEOUT)
324
411
  if returncode != 0:
325
412
  return status_time, "invalid"
326
- return status_time, parse_scontrol_out(scontrol_out, jobid)
413
+ return status_time, parse_sacct_out(sacct_out, jobid)
327
414
 
328
415
 
329
416
  def cached_run(
@@ -405,13 +492,13 @@ def parse_cache_header(header: str) -> tuple[float, int]:
405
492
  CACHE_HEADER_LENGTH = len(make_cache_header(time.time(), 0))
406
493
 
407
494
 
408
- def parse_scontrol_out(scontrol_out: str, jobid: int) -> str:
409
- """Get the job state for a specific from from the output of ``scontrol show job``.
495
+ def parse_sacct_out(sacct_out: str, jobid: int) -> str:
496
+ """Get the job state for a specific job from the output of ``sacct -o 'jobid,state' -PXn``.
410
497
 
411
498
  Parameters
412
499
  ----------
413
- scontrol_out
414
- A string with the output of ``scontrol show job``.
500
+ sacct_out
501
+ A string with the output of ``sacct -o 'jobid,state' -PXn``.
415
502
  jobid
416
503
  The jobid of interest.
417
504
 
@@ -423,12 +510,13 @@ def parse_scontrol_out(scontrol_out: str, jobid: int) -> str:
423
510
  - Any of the SLURM job states.
424
511
  - `unlisted` if the job cannot be found,
425
512
  which practically means it has ended long ago.
513
+ - `invalid` if the sacct output cannot be parsed.
426
514
  """
427
- match = re.search(
428
- f"JobId={jobid}.*?JobState=(?P<state>[A-Z]+)",
429
- scontrol_out,
430
- flags=re.MULTILINE | re.DOTALL,
431
- )
432
- if match is not None:
433
- return match.group("state")
515
+ try:
516
+ for line in sacct_out.splitlines():
517
+ columns = line.strip().split("|")
518
+ if int(columns[0]) == jobid:
519
+ return columns[1].strip().split()[0]
520
+ except (ValueError, IndexError):
521
+ return "invalid"
434
522
  return "unlisted"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.6
3
+ Version: 1.0.7
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -24,7 +24,7 @@ Classifier: Topic :: Software Development :: Build Tools
24
24
  Requires-Python: >=3.11
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
- Requires-Dist: stepup<4.0.0,>=3.1.3
27
+ Requires-Dist: stepup<4.0.0,>=3.1.4
28
28
  Provides-Extra: dev
29
29
  Requires-Dist: psutil; extra == "dev"
30
30
  Requires-Dist: pytest; extra == "dev"
@@ -1,4 +1,4 @@
1
- stepup<4.0.0,>=3.1.3
1
+ stepup<4.0.0,>=3.1.4
2
2
 
3
3
  [dev]
4
4
  psutil
File without changes
File without changes
File without changes
File without changes