stepup-queue 1.0.2__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/PKG-INFO +1 -1
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/changelog.md +10 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/examples/slurm-basic/dynamic-template.sh +1 -1
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/examples/slurm-basic/fail/slurmjob.sh +1 -1
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/examples/slurm-basic/pass/slurmjob.py +1 -1
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/examples/slurm-perpetual/step1/slurmjob.sh +1 -1
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/examples/slurm-perpetual/step2/slurmjob.sh +1 -1
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/examples/slurm-perpetual/workflow.sh +7 -4
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/stepup/queue/sbatch.py +28 -17
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/stepup_queue.egg-info/PKG-INFO +1 -1
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/tests/test_sbatch.py +5 -3
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/.editorconfig +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/.github/requirements-old.txt +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/.github/scripts/extract-notes.sh +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/.github/workflows/mkdocs.yaml +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/.github/workflows/pytest.yaml +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/.github/workflows/release.yaml +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/.gitignore +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/.markdownlint-cli2.jsonc +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/.pre-commit-config.yaml +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/LICENSE +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/README.md +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/development.md +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/examples/slurm-basic/.gitignore +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/examples/slurm-basic/README.md +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/examples/slurm-basic/plan.py +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/examples/slurm-perpetual/.gitignore +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/examples/slurm-perpetual/README.md +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/examples/slurm-perpetual/plan.py +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/index.md +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/installation.md +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/license.md +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/stepup.queue.api.md +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/docs/usage.md +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/mkdocs.yaml +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/overrides/main.html +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/pyproject.toml +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/setup.cfg +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/stepup/queue/__init__.py +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/stepup/queue/actions.py +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/stepup/queue/api.py +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/stepup/queue/canceljobs.py +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/stepup_queue.egg-info/SOURCES.txt +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/stepup_queue.egg-info/dependency_links.txt +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/stepup_queue.egg-info/entry_points.txt +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/stepup_queue.egg-info/requires.txt +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/stepup_queue.egg-info/top_level.txt +0 -0
- {stepup_queue-1.0.2 → stepup_queue-1.0.3}/tests/conftest.py +0 -0
|
@@ -12,6 +12,15 @@ and this project adheres to [Effort-based Versioning](https://jacobtomlinson.dev
|
|
|
12
12
|
|
|
13
13
|
(no changes yet)
|
|
14
14
|
|
|
15
|
+
## [1.0.3][] - 2025-05-16 {: #v1.0.3 }
|
|
16
|
+
|
|
17
|
+
### Fixed
|
|
18
|
+
|
|
19
|
+
- Fixed errors in the example job scripts.
|
|
20
|
+
- Improved handling of `scontrol` failures.
|
|
21
|
+
|
|
22
|
+
### Added
|
|
23
|
+
|
|
15
24
|
## [1.0.2][] - 2025-05-14 {: #v1.0.2 }
|
|
16
25
|
|
|
17
26
|
### Added
|
|
@@ -38,6 +47,7 @@ It was adapted to integrate well with StepUp Core 3.
|
|
|
38
47
|
This release also features the `stepup canceljobs` tool, which was not present in Parman.
|
|
39
48
|
|
|
40
49
|
[Unreleased]: https://github.com/reproducible-reporting/stepup-queue
|
|
50
|
+
[1.0.3]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.3
|
|
41
51
|
[1.0.2]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.2
|
|
42
52
|
[1.0.1]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.1
|
|
43
53
|
[1.0.0]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.0
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
|
-
#SBATCH --job-name
|
|
2
|
+
#SBATCH --job-name stepup
|
|
3
3
|
#SBATCH --nodes=1
|
|
4
|
-
#SBATCH --
|
|
4
|
+
#SBATCH --ntasks=1
|
|
5
5
|
#SBATCH --cpus-per-task=1
|
|
6
|
-
#SBATCH --output=
|
|
6
|
+
#SBATCH --output=stepup-%j.out
|
|
7
7
|
#SBATCH --time=00:01:00
|
|
8
8
|
|
|
9
|
-
# In production, --time=
|
|
9
|
+
# In production, --time=12:00:00 is a reasonable time limit.
|
|
10
|
+
echo "StepUp workflow job starts:" $(date)
|
|
10
11
|
|
|
11
12
|
# If needed, load required modules and activate a relevant virtual environment.
|
|
12
13
|
# For example:
|
|
@@ -50,3 +51,5 @@ if [ -f ${STEPUP_QUEUE_FLAG_DIR}/resubmit ]; then
|
|
|
50
51
|
else
|
|
51
52
|
echo "Stepup was stopped gracefully."
|
|
52
53
|
fi
|
|
54
|
+
|
|
55
|
+
echo "StepUp workflow job ends:" $(date)
|
|
@@ -177,10 +177,10 @@ def _read_or_poll_status(
|
|
|
177
177
|
# Call scontrol and parse its response.
|
|
178
178
|
rndsleep()
|
|
179
179
|
status_time, status = get_status(work_thread, jobid, cluster)
|
|
180
|
-
if status
|
|
180
|
+
if status != last_status:
|
|
181
181
|
log_step(path_log, status)
|
|
182
182
|
done = (status_time > submit_time + TIME_MARGIN) and (
|
|
183
|
-
status not in ["PENDING", "CONFIGURING", "RUNNING"]
|
|
183
|
+
status not in ["PENDING", "CONFIGURING", "RUNNING", "invalid"]
|
|
184
184
|
)
|
|
185
185
|
return status, done
|
|
186
186
|
|
|
@@ -281,7 +281,7 @@ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
|
|
|
281
281
|
raise ValueError(f"Cannot parse sbatch output: {stdout}")
|
|
282
282
|
|
|
283
283
|
|
|
284
|
-
def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str
|
|
284
|
+
def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
285
285
|
"""Load cached scontrol output or run scontrol if outdated.
|
|
286
286
|
|
|
287
287
|
Parameters
|
|
@@ -296,10 +296,9 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str
|
|
|
296
296
|
Returns
|
|
297
297
|
-------
|
|
298
298
|
status
|
|
299
|
-
A status reported by scontrol
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
(Try again later.)
|
|
299
|
+
A status reported by scontrol,
|
|
300
|
+
or `invalid` if scontrol failed (retry scontrol later),
|
|
301
|
+
or `unlisted` if the job is not found (probably ended long ago).
|
|
303
302
|
"""
|
|
304
303
|
# Load cached output or run again
|
|
305
304
|
command = "scontrol show job"
|
|
@@ -309,11 +308,17 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str
|
|
|
309
308
|
else:
|
|
310
309
|
command += f" --cluster={cluster}"
|
|
311
310
|
path_out /= f"sbatch_wait.{cluster}.out"
|
|
312
|
-
status_time, scontrol_out = cached_run(
|
|
311
|
+
status_time, scontrol_out, returncode = cached_run(
|
|
312
|
+
work_thread, command, path_out, CACHE_TIMEOUT
|
|
313
|
+
)
|
|
314
|
+
if returncode != 0:
|
|
315
|
+
return status_time, "invalid"
|
|
313
316
|
return status_time, parse_scontrol_out(scontrol_out, jobid)
|
|
314
317
|
|
|
315
318
|
|
|
316
|
-
def cached_run(
|
|
319
|
+
def cached_run(
|
|
320
|
+
work_thread: WorkThread, command: str, path_out: Path, cache_timeout
|
|
321
|
+
) -> tuple[float, str, int]:
|
|
317
322
|
"""Execute a command if its previous output is outdated.
|
|
318
323
|
|
|
319
324
|
Parameters
|
|
@@ -329,8 +334,12 @@ def cached_run(work_thread: WorkThread, command: str, path_out: Path, cache_time
|
|
|
329
334
|
|
|
330
335
|
Returns
|
|
331
336
|
-------
|
|
337
|
+
cache_time
|
|
338
|
+
The time when the command was last executed.
|
|
332
339
|
stdout
|
|
333
340
|
The output of the command, either new or cached.
|
|
341
|
+
returncode
|
|
342
|
+
The return code of the (cached) command.
|
|
334
343
|
|
|
335
344
|
Notes
|
|
336
345
|
-----
|
|
@@ -345,7 +354,7 @@ def cached_run(work_thread: WorkThread, command: str, path_out: Path, cache_time
|
|
|
345
354
|
fcntl.lockf(fh, fcntl.LOCK_EX)
|
|
346
355
|
fh.seek(0)
|
|
347
356
|
header = fh.read(CACHE_HEADER_LENGTH)
|
|
348
|
-
cache_time,
|
|
357
|
+
cache_time, returncode = parse_cache_header(header)
|
|
349
358
|
if cache_time is None or time.time() > cache_time + cache_timeout:
|
|
350
359
|
returncode, stdout, _ = work_thread.runsh(command)
|
|
351
360
|
# Go the the beginning of the file before truncating.
|
|
@@ -358,8 +367,8 @@ def cached_run(work_thread: WorkThread, command: str, path_out: Path, cache_time
|
|
|
358
367
|
fh.write(stdout)
|
|
359
368
|
fh.flush()
|
|
360
369
|
os.fsync(fh.fileno())
|
|
361
|
-
return cache_time, stdout
|
|
362
|
-
return cache_time, fh.read()
|
|
370
|
+
return cache_time, stdout, returncode
|
|
371
|
+
return cache_time, fh.read(), returncode
|
|
363
372
|
|
|
364
373
|
|
|
365
374
|
def make_cache_header(cache_time: float, returncode: int):
|
|
@@ -386,7 +395,7 @@ def parse_cache_header(header: str) -> tuple[float, int]:
|
|
|
386
395
|
CACHE_HEADER_LENGTH = len(make_cache_header(time.time(), 0))
|
|
387
396
|
|
|
388
397
|
|
|
389
|
-
def parse_scontrol_out(scontrol_out: str, jobid: int) -> str
|
|
398
|
+
def parse_scontrol_out(scontrol_out: str, jobid: int) -> str:
|
|
390
399
|
"""Get the job state for a specific from from the output of ``scontrol show job``.
|
|
391
400
|
|
|
392
401
|
Parameters
|
|
@@ -399,10 +408,12 @@ def parse_scontrol_out(scontrol_out: str, jobid: int) -> str | None:
|
|
|
399
408
|
Returns
|
|
400
409
|
-------
|
|
401
410
|
jobstate
|
|
402
|
-
The status of the job
|
|
411
|
+
The status of the job. This can be:
|
|
412
|
+
|
|
413
|
+
- Any of the SLURM job states.
|
|
414
|
+
- `unlisted` if the job cannot be found,
|
|
415
|
+
which practically means it has ended long ago.
|
|
403
416
|
"""
|
|
404
|
-
if scontrol_out == SCONTROL_FAILED:
|
|
405
|
-
return "Invalid"
|
|
406
417
|
match = re.search(
|
|
407
418
|
f"JobId={jobid}.*?JobState=(?P<state>[A-Z]+)",
|
|
408
419
|
scontrol_out,
|
|
@@ -410,4 +421,4 @@ def parse_scontrol_out(scontrol_out: str, jobid: int) -> str | None:
|
|
|
410
421
|
)
|
|
411
422
|
if match is not None:
|
|
412
423
|
return match.group("state")
|
|
413
|
-
return
|
|
424
|
+
return "unlisted"
|
|
@@ -56,15 +56,17 @@ def test_parse_sbatch():
|
|
|
56
56
|
def test_cached_run(path_tmp: Path):
|
|
57
57
|
path_out = path_tmp / "date.txt"
|
|
58
58
|
work_thread = WorkThread("<test>")
|
|
59
|
-
cache_time1, out1 = cached_run(work_thread, "date", path_out, 1)
|
|
60
|
-
cache_time2, out2 = cached_run(work_thread, "date", path_out, 10)
|
|
59
|
+
cache_time1, out1, ret1 = cached_run(work_thread, "date", path_out, 1)
|
|
60
|
+
cache_time2, out2, ret2 = cached_run(work_thread, "date", path_out, 10)
|
|
61
61
|
assert cache_time1 == pytest.approx(cache_time2, 1e-4)
|
|
62
62
|
assert out1 != ""
|
|
63
63
|
assert out1 == out2
|
|
64
|
+
assert ret1 == ret2
|
|
64
65
|
time.sleep(2)
|
|
65
|
-
cache_time3, out3 = cached_run(work_thread, "date", path_out, 1)
|
|
66
|
+
cache_time3, out3, ret3 = cached_run(work_thread, "date", path_out, 1)
|
|
66
67
|
assert abs(cache_time1 - cache_time3) > 0.5
|
|
67
68
|
assert out1 != out3
|
|
69
|
+
assert ret1 == ret3
|
|
68
70
|
|
|
69
71
|
|
|
70
72
|
SCONTROL_OUT = """\
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|