py-cluster-api 0.2.4__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/.github/workflows/ci.yml +2 -2
  2. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/CLAUDE.md +4 -2
  3. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/PKG-INFO +34 -41
  4. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/README.md +33 -40
  5. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/_types.py +21 -3
  6. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/config.py +7 -1
  7. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/core.py +75 -42
  8. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/executors/local.py +33 -21
  9. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/executors/lsf.py +42 -21
  10. py_cluster_api-0.4.0/docs/API.md +77 -0
  11. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/docs/Development.md +7 -4
  12. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/pixi.lock +6 -3
  13. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/pyproject.toml +2 -2
  14. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/cluster_config.example.yaml +3 -0
  15. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_core.py +36 -7
  16. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_local.py +10 -0
  17. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_lsf.py +74 -5
  18. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_reconnect.py +15 -17
  19. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/.gitignore +0 -0
  20. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/LICENSE +0 -0
  21. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/__init__.py +0 -0
  22. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/exceptions.py +0 -0
  23. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/executors/__init__.py +0 -0
  24. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/monitor.py +0 -0
  25. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/script.py +0 -0
  26. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/__init__.py +0 -0
  27. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/conftest.py +0 -0
  28. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_config.py +0 -0
  29. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_integration.py +0 -0
  30. {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_monitor.py +0 -0
@@ -18,12 +18,12 @@ jobs:
18
18
  runs-on: ubuntu-latest
19
19
 
20
20
  steps:
21
- - uses: actions/checkout@v4
21
+ - uses: actions/checkout@v5
22
22
 
23
23
  - name: Set up Pixi
24
24
  uses: prefix-dev/setup-pixi@v0.9.0
25
25
  with:
26
- pixi-version: v0.55.0
26
+ pixi-version: v0.65.0
27
27
  cache: true
28
28
 
29
29
  - name: Lint
@@ -2,6 +2,8 @@
2
2
 
3
3
  Generic Python library for submitting and monitoring jobs on HPC clusters. Wraps scheduler CLIs (bsub/bjobs/bkill) behind an async executor abstraction with an active polling monitor that fires callbacks on job completion. Inspired by dask-jobqueue's script templating and Nextflow's portable config profiles, but unlike dask-jobqueue, this library actively polls the scheduler rather than relying on workers phoning home.
4
4
 
5
+ Key capabilities beyond submit/poll/cancel: `reconnect()` rediscovers running jobs after a process restart (requires `job_name_prefix`), and `cancel_by_name()` kills jobs by name pattern (LSF only).
6
+
5
7
  Founding principles: async-only API, executors are thin wrappers around scheduler CLIs, all state lives in `JobRecord` dataclasses tracked in-process, monitoring is poll-based via `bjobs -json`, and configuration uses Nextflow-style YAML profiles.
6
8
 
7
9
  Always use `pixi run` to run commands — never invoke python, pytest, ruff, or other tools directly.
@@ -22,7 +24,7 @@ pixi run check # lint + test
22
24
 
23
25
  - `cluster_api/` — library source
24
26
  - `core.py` — abstract `Executor` base class
25
- - `_types.py` — `JobStatus`, `JobRecord`, `ResourceSpec`, `JobExitCondition`, `ArrayElement`
27
+ - `_types.py` — `JobStatus`, `JobRecord`, `ResourceSpec` (`cpus`, `gpus`, …), `JobExitCondition`, `ArrayElement`
26
28
  - `config.py` — YAML config loader with profiles
27
29
  - `script.py` — script rendering (`render_script`) and writing (`write_script`)
28
30
  - `monitor.py` — async polling loop + callback dispatch
@@ -41,7 +43,7 @@ Explicit `stdout_path` / `stderr_path` in `ResourceSpec` override these defaults
41
43
 
42
44
  ## Testing
43
45
 
44
- All tests mock `Executor._call()` to avoid needing a real scheduler (except `test_local.py` which runs real subprocesses). Use `unittest.mock.patch` with `AsyncMock` for async method mocking.
46
+ All tests mock `Executor._call()` to avoid needing a real scheduler (except `test_local.py` which runs real subprocesses, and `test_integration.py` which requires a live LSF cluster and is skipped by default). Use `unittest.mock.patch` with `AsyncMock` for async method mocking.
45
47
 
46
48
  ## Style
47
49
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: py-cluster-api
3
- Version: 0.2.4
3
+ Version: 0.4.0
4
4
  Summary: Generic Python library for running jobs on HPC clusters
5
5
  Project-URL: Homepage, https://github.com/JaneliaSciComp/py-cluster-api
6
6
  Project-URL: Repository, https://github.com/JaneliaSciComp/py-cluster-api
@@ -54,12 +54,17 @@ Description-Content-Type: text/markdown
54
54
 
55
55
  [![CI](https://github.com/JaneliaSciComp/py-cluster-api/actions/workflows/ci.yml/badge.svg)](https://github.com/JaneliaSciComp/py-cluster-api/actions/workflows/ci.yml)
56
56
 
57
- A Python library for submitting and monitoring jobs on HPC clusters. Supports running arbitrary executables (Nextflow pipelines, Python scripts, Java tools, etc.) on LSF clusters and taking action when jobs complete via async callbacks.
57
+ A Python library for submitting and monitoring jobs on HPC clusters. Supports running arbitrary executables (Nextflow pipelines, Python scripts, Java tools, etc.) on clusters and taking action when jobs complete via async callbacks.
58
+
59
+ ## Executors
60
+
61
+ * Local Subprocess
62
+ * IBM Platform LSF
63
+ * We will accept PRs that implement and test additional executors (SLURM, etc.)
58
64
 
59
65
  ## Features
60
66
 
61
67
  - **Async-first** — built on `asyncio` for non-blocking job submission and monitoring
62
- - **LSF executor** — submit via `bsub`, monitor via `bjobs -json`, cancel via `bkill`
63
68
  - **Local executor** — run jobs as local subprocesses for development and testing, including array jobs
64
69
  - **Job monitoring** — polls the scheduler and fires callbacks on job completion, failure, or cancellation
65
70
  - **Job arrays** — submit array jobs with per-element log files
@@ -97,7 +102,7 @@ async def main():
97
102
  job = await executor.submit(
98
103
  command="nextflow run nf-core/rnaseq --input samples.csv",
99
104
  name="rnaseq-run",
100
- resources=ResourceSpec(cpus=4, memory="32 GB", walltime="24:00", queue="long"),
105
+ resources=ResourceSpec(cpus=4, gpus=1, memory="32 GB", walltime="24:00", queue="long"),
101
106
  env={"NXF_WORK": "/scratch/work"},
102
107
  )
103
108
  job.on_success(lambda j: print(f"Done! Job {j.job_id}, peak mem: {j.max_mem}"))
@@ -131,6 +136,26 @@ async def run_array():
131
136
 
132
137
  The array index environment variable depends on the executor: LSF uses `$LSB_JOBINDEX`, while the local executor uses `$ARRAY_INDEX`.
133
138
 
139
+ ### Reconnecting After Restart
140
+
141
+ If your process crashes or restarts, `reconnect()` rediscovers running jobs from the scheduler and resumes tracking them. Requires `job_name_prefix` to be set in config.
142
+
143
+ ```python
144
+ async def resume():
145
+ executor = create_executor(profile="janelia_lsf")
146
+ monitor = JobMonitor(executor)
147
+ await monitor.start()
148
+
149
+ recovered = await executor.reconnect()
150
+ for job in recovered:
151
+ print(f"Reconnected to {job.job_id} ({job.name}), status={job.status}")
152
+ job.on_exit(lambda j: print(f"Job {j.job_id} finished: {j.status}"))
153
+
154
+ if recovered:
155
+ await monitor.wait_for(*recovered)
156
+ await monitor.stop()
157
+ ```
158
+
134
159
  ### Local Testing
135
160
 
136
161
  ```python
@@ -166,6 +191,7 @@ profiles:
166
191
  janelia_lsf:
167
192
  executor: lsf
168
193
  queue: normal
194
+ gpus: 1
169
195
  memory: "8 GB"
170
196
  walltime: "04:00"
171
197
  script_prologue:
@@ -182,15 +208,16 @@ profiles:
182
208
  |---|---|---|
183
209
  | `executor` | `"local"` | Backend: `lsf` or `local` |
184
210
  | `cpus` | `None` | Default CPU count |
211
+ | `gpus` | `None` | Default GPU count |
185
212
  | `memory` | `None` | Default memory (e.g. `"8 GB"`) |
186
213
  | `walltime` | `None` | Default wall time (e.g. `"04:00"`) |
187
214
  | `queue` | `None` | Default queue/partition |
188
215
  | `poll_interval` | `10.0` | Seconds between status polls |
189
- | `job_name_prefix` | `"capi"` | Prefix for all job names |
216
+ | `job_name_prefix` | `None` | Optional prefix prepended to job names. When set, polling filters by `{prefix}-*` and `reconnect()` is available; when unset, the user controls the full job name and polling queries all jobs |
190
217
  | `shebang` | `"#!/bin/bash"` | Script shebang line |
191
218
  | `script_prologue` | `[]` | Lines inserted before the command |
192
219
  | `script_epilogue` | `[]` | Lines inserted after the command |
193
- | `extra_directives` | `[]` | Additional scheduler flags (directive prefix added automatically) |
220
+ | `extra_directives` | `[]` | Additional scheduler directive lines appended verbatim to the script header (e.g. `"#BSUB -P myproject"`) |
194
221
  | `directives_skip` | `[]` | Substrings to filter out of directives |
195
222
  | `extra_args` | `[]` | Extra CLI args appended to the submit command (e.g. `bsub`) |
196
223
  | `lsf_units` | `"MB"` | LSF memory units (`KB`, `MB`, `GB`) |
@@ -201,41 +228,7 @@ profiles:
201
228
 
202
229
  ## API Reference
203
230
 
204
- ### `create_executor(profile=None, config_path=None, **overrides)`
205
-
206
- Factory function that loads config and returns an `Executor` instance.
207
-
208
- ### `Executor`
209
-
210
- Abstract base class. Key methods:
211
-
212
- - `submit(command, name, resources=None, prologue=None, epilogue=None, env=None, metadata=None)` — submit a job, returns `JobRecord`
213
- - `submit_array(command, name, array_range, ...)` — submit a job array
214
- - `cancel(job_id)` — cancel a job by ID
215
- - `cancel_by_name(name_pattern)` — cancel by name pattern (LSF only)
216
- - `cancel_all()` — cancel all tracked jobs
217
- - `poll()` — query scheduler and update job statuses
218
- - `jobs` / `active_jobs` — properties returning tracked job dicts
219
-
220
- ### `JobRecord`
221
-
222
- Tracks a submitted job. Fields include `job_id`, `name`, `status`, `exit_code`, `exec_host`, `max_mem`, `submit_time`, `start_time`, `finish_time`, and `metadata`.
223
-
224
- - `on_success(callback)` — register callback for exit code 0
225
- - `on_failure(callback)` — register callback for non-zero exit
226
- - `on_exit(callback, condition=ANY)` — register callback for any exit condition
227
- - `is_terminal` — whether the job has finished
228
-
229
- ### `JobMonitor`
230
-
231
- Async polling loop that drives status updates and callback dispatch.
232
-
233
- - `start()` / `stop()` — control the polling loop
234
- - `wait_for(*records, timeout=None)` — block until jobs reach a terminal state
235
-
236
- ### `ResourceSpec`
237
-
238
- Resource requirements: `cpus`, `memory`, `walltime`, `queue`, `work_dir`, `stdout_path`, `stderr_path`, `extra_directives`, `extra_args`.
231
+ See [docs/API.md](docs/API.md) for the full API reference and error handling guide.
239
232
 
240
233
  ## Development
241
234
 
@@ -2,12 +2,17 @@
2
2
 
3
3
  [![CI](https://github.com/JaneliaSciComp/py-cluster-api/actions/workflows/ci.yml/badge.svg)](https://github.com/JaneliaSciComp/py-cluster-api/actions/workflows/ci.yml)
4
4
 
5
- A Python library for submitting and monitoring jobs on HPC clusters. Supports running arbitrary executables (Nextflow pipelines, Python scripts, Java tools, etc.) on LSF clusters and taking action when jobs complete via async callbacks.
5
+ A Python library for submitting and monitoring jobs on HPC clusters. Supports running arbitrary executables (Nextflow pipelines, Python scripts, Java tools, etc.) on clusters and taking action when jobs complete via async callbacks.
6
+
7
+ ## Executors
8
+
9
+ * Local Subprocess
10
+ * IBM Platform LSF
11
+ * We will accept PRs that implement and test additional executors (SLURM, etc.)
6
12
 
7
13
  ## Features
8
14
 
9
15
  - **Async-first** — built on `asyncio` for non-blocking job submission and monitoring
10
- - **LSF executor** — submit via `bsub`, monitor via `bjobs -json`, cancel via `bkill`
11
16
  - **Local executor** — run jobs as local subprocesses for development and testing, including array jobs
12
17
  - **Job monitoring** — polls the scheduler and fires callbacks on job completion, failure, or cancellation
13
18
  - **Job arrays** — submit array jobs with per-element log files
@@ -45,7 +50,7 @@ async def main():
45
50
  job = await executor.submit(
46
51
  command="nextflow run nf-core/rnaseq --input samples.csv",
47
52
  name="rnaseq-run",
48
- resources=ResourceSpec(cpus=4, memory="32 GB", walltime="24:00", queue="long"),
53
+ resources=ResourceSpec(cpus=4, gpus=1, memory="32 GB", walltime="24:00", queue="long"),
49
54
  env={"NXF_WORK": "/scratch/work"},
50
55
  )
51
56
  job.on_success(lambda j: print(f"Done! Job {j.job_id}, peak mem: {j.max_mem}"))
@@ -79,6 +84,26 @@ async def run_array():
79
84
 
80
85
  The array index environment variable depends on the executor: LSF uses `$LSB_JOBINDEX`, while the local executor uses `$ARRAY_INDEX`.
81
86
 
87
+ ### Reconnecting After Restart
88
+
89
+ If your process crashes or restarts, `reconnect()` rediscovers running jobs from the scheduler and resumes tracking them. Requires `job_name_prefix` to be set in config.
90
+
91
+ ```python
92
+ async def resume():
93
+ executor = create_executor(profile="janelia_lsf")
94
+ monitor = JobMonitor(executor)
95
+ await monitor.start()
96
+
97
+ recovered = await executor.reconnect()
98
+ for job in recovered:
99
+ print(f"Reconnected to {job.job_id} ({job.name}), status={job.status}")
100
+ job.on_exit(lambda j: print(f"Job {j.job_id} finished: {j.status}"))
101
+
102
+ if recovered:
103
+ await monitor.wait_for(*recovered)
104
+ await monitor.stop()
105
+ ```
106
+
82
107
  ### Local Testing
83
108
 
84
109
  ```python
@@ -114,6 +139,7 @@ profiles:
114
139
  janelia_lsf:
115
140
  executor: lsf
116
141
  queue: normal
142
+ gpus: 1
117
143
  memory: "8 GB"
118
144
  walltime: "04:00"
119
145
  script_prologue:
@@ -130,15 +156,16 @@ profiles:
130
156
  |---|---|---|
131
157
  | `executor` | `"local"` | Backend: `lsf` or `local` |
132
158
  | `cpus` | `None` | Default CPU count |
159
+ | `gpus` | `None` | Default GPU count |
133
160
  | `memory` | `None` | Default memory (e.g. `"8 GB"`) |
134
161
  | `walltime` | `None` | Default wall time (e.g. `"04:00"`) |
135
162
  | `queue` | `None` | Default queue/partition |
136
163
  | `poll_interval` | `10.0` | Seconds between status polls |
137
- | `job_name_prefix` | `"capi"` | Prefix for all job names |
164
+ | `job_name_prefix` | `None` | Optional prefix prepended to job names. When set, polling filters by `{prefix}-*` and `reconnect()` is available; when unset, the user controls the full job name and polling queries all jobs |
138
165
  | `shebang` | `"#!/bin/bash"` | Script shebang line |
139
166
  | `script_prologue` | `[]` | Lines inserted before the command |
140
167
  | `script_epilogue` | `[]` | Lines inserted after the command |
141
- | `extra_directives` | `[]` | Additional scheduler flags (directive prefix added automatically) |
168
+ | `extra_directives` | `[]` | Additional scheduler directive lines appended verbatim to the script header (e.g. `"#BSUB -P myproject"`) |
142
169
  | `directives_skip` | `[]` | Substrings to filter out of directives |
143
170
  | `extra_args` | `[]` | Extra CLI args appended to the submit command (e.g. `bsub`) |
144
171
  | `lsf_units` | `"MB"` | LSF memory units (`KB`, `MB`, `GB`) |
@@ -149,41 +176,7 @@ profiles:
149
176
 
150
177
  ## API Reference
151
178
 
152
- ### `create_executor(profile=None, config_path=None, **overrides)`
153
-
154
- Factory function that loads config and returns an `Executor` instance.
155
-
156
- ### `Executor`
157
-
158
- Abstract base class. Key methods:
159
-
160
- - `submit(command, name, resources=None, prologue=None, epilogue=None, env=None, metadata=None)` — submit a job, returns `JobRecord`
161
- - `submit_array(command, name, array_range, ...)` — submit a job array
162
- - `cancel(job_id)` — cancel a job by ID
163
- - `cancel_by_name(name_pattern)` — cancel by name pattern (LSF only)
164
- - `cancel_all()` — cancel all tracked jobs
165
- - `poll()` — query scheduler and update job statuses
166
- - `jobs` / `active_jobs` — properties returning tracked job dicts
167
-
168
- ### `JobRecord`
169
-
170
- Tracks a submitted job. Fields include `job_id`, `name`, `status`, `exit_code`, `exec_host`, `max_mem`, `submit_time`, `start_time`, `finish_time`, and `metadata`.
171
-
172
- - `on_success(callback)` — register callback for exit code 0
173
- - `on_failure(callback)` — register callback for non-zero exit
174
- - `on_exit(callback, condition=ANY)` — register callback for any exit condition
175
- - `is_terminal` — whether the job has finished
176
-
177
- ### `JobMonitor`
178
-
179
- Async polling loop that drives status updates and callback dispatch.
180
-
181
- - `start()` / `stop()` — control the polling loop
182
- - `wait_for(*records, timeout=None)` — block until jobs reach a terminal state
183
-
184
- ### `ResourceSpec`
185
-
186
- Resource requirements: `cpus`, `memory`, `walltime`, `queue`, `work_dir`, `stdout_path`, `stderr_path`, `extra_directives`, `extra_args`.
179
+ See [docs/API.md](docs/API.md) for the full API reference and error handling guide.
187
180
 
188
181
  ## Development
189
182
 
@@ -34,7 +34,25 @@ _TERMINAL_STATUSES = frozenset({JobStatus.DONE, JobStatus.FAILED, JobStatus.KILL
34
34
 
35
35
  @dataclass
36
36
  class ResourceSpec:
37
- """Resource requirements for a job."""
37
+ """Resource requirements for a job.
38
+
39
+ Fields:
40
+ cpus: Number of CPU cores to request.
41
+ gpus: Number of GPUs to request.
42
+ memory: Memory limit as a string with unit, e.g. ``"16GB"`` or ``"500MB"``.
43
+ Passed directly to the scheduler directive.
44
+ walltime: Wall-clock time limit, e.g. ``"1:00"`` (h:mm) or ``"24:00:00"``.
45
+ Format depends on the target scheduler.
46
+ queue: Scheduler queue / partition name.
47
+ work_dir: Working directory for the job (defaults to ``os.getcwd()``).
48
+ stdout_path: Explicit path for stdout log. Overrides the executor's
49
+ default log naming (see CLAUDE.md § Log File Naming).
50
+ stderr_path: Explicit path for stderr log. Same override behaviour.
51
+ extra_directives: Raw scheduler directives injected into the job script
52
+ header (e.g. ``["#BSUB -R 'rusage[mem=16GB]'"]``).
53
+ extra_args: Extra command-line arguments appended to the submit command
54
+ (e.g. ``["-q", "gpu"]`` for ``bsub``).
55
+ """
38
56
 
39
57
  cpus: int | None = None
40
58
  gpus: int | None = None
@@ -149,8 +167,8 @@ class JobRecord:
149
167
  return JobStatus.RUNNING
150
168
 
151
169
  # All expected elements accounted for and terminal
152
- if JobStatus.KILLED in statuses:
153
- return JobStatus.KILLED
154
170
  if JobStatus.FAILED in statuses:
155
171
  return JobStatus.FAILED
172
+ if JobStatus.KILLED in statuses:
173
+ return JobStatus.KILLED
156
174
  return JobStatus.DONE
@@ -58,6 +58,7 @@ class ClusterConfig:
58
58
  completed_retention_minutes: float = 10.0
59
59
  command_timeout: float = 100.0
60
60
  suppress_job_email: bool = True
61
+ poll_all_users: bool = False
61
62
 
62
63
 
63
64
  _CONFIG_SEARCH_PATHS = [
@@ -109,7 +110,12 @@ def load_config(
109
110
 
110
111
  profiles = raw.pop("profiles", {})
111
112
 
112
- if profile and profile in profiles:
113
+ if profile:
114
+ if profile not in profiles:
115
+ available = ", ".join(sorted(profiles)) if profiles else "(none)"
116
+ raise ValueError(
117
+ f"Unknown profile {profile!r}; available profiles: {available}"
118
+ )
113
119
  raw = {**raw, **profiles[profile]}
114
120
 
115
121
  if overrides:
@@ -7,8 +7,6 @@ import asyncio
7
7
  import logging
8
8
  import os
9
9
  import re
10
- import secrets
11
- import string
12
10
  from datetime import datetime, timezone
13
11
  from typing import Any
14
12
 
@@ -18,8 +16,10 @@ from ._types import ArrayElement, JobRecord, JobStatus, ResourceSpec
18
16
 
19
17
  logger = logging.getLogger(__name__)
20
18
 
19
+ # Check for array element IDs like "12345[1]"
21
20
  _ARRAY_ELEMENT_RE = re.compile(r"^(.+)\[(\d+)\]$")
22
21
 
22
+ # Matches any character that is unsafe in a scheduler job name
23
23
  _UNSAFE_NAME_RE = re.compile(r"[^\w\-.]")
24
24
 
25
25
 
@@ -29,7 +29,37 @@ def _sanitize_job_name(name: str) -> str:
29
29
 
30
30
 
31
31
  class Executor(abc.ABC):
32
- """Abstract base for cluster job executors."""
32
+ """Abstract base for cluster job executors.
33
+
34
+ Lifecycle:
35
+ 1. **Construct** — instantiate with a ``ClusterConfig``.
36
+ 2. **Submit** — call :meth:`submit` or :meth:`submit_array` to enqueue
37
+ jobs. Each returns a :class:`JobRecord` tracked in-process.
38
+ 3. **Poll** — call :meth:`poll` (usually via :class:`~cluster_api.monitor.JobMonitor`)
39
+ to query the scheduler and update every tracked ``JobRecord``.
40
+ 4. **Cancel** — call :meth:`cancel`, :meth:`cancel_all`, or
41
+ :meth:`cancel_by_name` to kill running jobs.
42
+
43
+ Subclass requirements:
44
+ Must implement:
45
+ - :meth:`_submit_job` — run the scheduler submit command.
46
+ - :meth:`_build_status_args` — build the CLI args for a status query.
47
+ - :meth:`_parse_job_statuses` — parse status output into per-job dicts.
48
+
49
+ May override:
50
+ - :meth:`_submit_array_job` — array submission (default delegates
51
+ to ``_submit_job``).
52
+ - :meth:`_cancel_job` — cancel a single job.
53
+ - :meth:`cancel_by_name` — cancel by name pattern.
54
+ - :meth:`reconnect` — rediscover running jobs after restart.
55
+
56
+ Class attributes:
57
+ submit_command: CLI executable used for submission (e.g. ``"bsub"``).
58
+ cancel_command: CLI executable used for cancellation (e.g. ``"bkill"``).
59
+ status_command: CLI executable used for status queries (e.g. ``"bjobs"``).
60
+ job_id_regexp: Regex with a ``job_id`` named group, applied to submit
61
+ output to extract the job ID.
62
+ """
33
63
 
34
64
  submit_command: str
35
65
  cancel_command: str
@@ -39,13 +69,7 @@ class Executor(abc.ABC):
39
69
  def __init__(self, config: ClusterConfig) -> None:
40
70
  self.config = config
41
71
  self._jobs: dict[str, JobRecord] = {}
42
- if config.job_name_prefix:
43
- self._prefix = config.job_name_prefix
44
- else:
45
- # Generate a random prefix so concurrent users/sessions don't
46
- # see each other's jobs when polling by name.
47
- alphabet = string.ascii_lowercase + string.digits
48
- self._prefix = "".join(secrets.choice(alphabet) for _ in range(5))
72
+ self._prefix = config.job_name_prefix # None if not configured
49
73
 
50
74
  # --- Submission ---
51
75
 
@@ -61,7 +85,7 @@ class Executor(abc.ABC):
61
85
  ) -> JobRecord:
62
86
  """Submit a job to the scheduler."""
63
87
  resources = resources or ResourceSpec()
64
- full_name = _sanitize_job_name(f"{self._prefix}-{name}")
88
+ full_name = _sanitize_job_name(f"{self._prefix}-{name}" if self._prefix else name)
65
89
 
66
90
  job_id, script_path = await self._submit_job(
67
91
  command, full_name, resources, prologue, epilogue, env,
@@ -96,7 +120,7 @@ class Executor(abc.ABC):
96
120
  ) -> JobRecord:
97
121
  """Submit a job array to the scheduler."""
98
122
  resources = resources or ResourceSpec()
99
- full_name = _sanitize_job_name(f"{self._prefix}-{name}")
123
+ full_name = _sanitize_job_name(f"{self._prefix}-{name}" if self._prefix else name)
100
124
 
101
125
  job_id, script_path = await self._submit_array_job(
102
126
  command, full_name, array_range, resources, prologue, epilogue,
@@ -172,18 +196,26 @@ class Executor(abc.ABC):
172
196
 
173
197
  # --- Cancellation ---
174
198
 
175
- async def cancel(self, job_id: str) -> None:
176
- """Cancel a job by ID."""
177
- cmd = [self.cancel_command, job_id]
178
- logger.debug("Running: %s", " ".join(cmd))
179
- await self._call(cmd, timeout=self.config.command_timeout)
199
+ async def cancel(self, job_id: str, *, done: bool = False) -> None:
200
+ """Cancel a job by ID.
201
+
202
+ Args:
203
+ job_id: The job ID to cancel.
204
+ done: If True, mark the job as DONE instead of KILLED.
205
+ Subclasses may translate this into scheduler-specific flags.
206
+ """
207
+ await self._cancel_job(job_id, done=done)
180
208
  if job_id in self._jobs:
181
- self._jobs[job_id].status = JobStatus.KILLED
182
- logger.info("Cancelled job %s", job_id)
209
+ self._jobs[job_id].status = JobStatus.DONE if done else JobStatus.KILLED
210
+ logger.info("Cancelled job %s (done=%s)", job_id, done)
211
+
212
+ async def _cancel_job(self, job_id: str, *, done: bool = False) -> None:
213
+ """Run the scheduler cancel command. Override in subclasses that support cancellation."""
214
+ raise NotImplementedError("cancel is not supported by this executor")
183
215
 
184
216
  async def cancel_by_name(self, name_pattern: str) -> None:
185
217
  """Cancel jobs by name pattern. Override in subclasses for native support."""
186
- raise NotImplementedError("cancel_by_name not supported by this executor")
218
+ raise NotImplementedError("cancel_by_name is not supported by this executor")
187
219
 
188
220
  async def reconnect(self) -> list[JobRecord]:
189
221
  """Reconnect to running jobs and resume tracking them.
@@ -195,12 +227,12 @@ class Executor(abc.ABC):
195
227
  Returns:
196
228
  List of newly created ``JobRecord`` instances.
197
229
  """
198
- raise NotImplementedError("reconnect not supported by this executor")
230
+ raise NotImplementedError("reconnect is not supported by this executor")
199
231
 
200
- async def cancel_all(self) -> None:
232
+ async def cancel_all(self, *, done: bool = False) -> None:
201
233
  """Cancel all tracked jobs."""
202
234
  to_cancel = [jid for jid, r in self._jobs.items() if not r.is_terminal]
203
- await asyncio.gather(*(self.cancel(jid) for jid in to_cancel))
235
+ await asyncio.gather(*(self.cancel(jid, done=done) for jid in to_cancel))
204
236
 
205
237
  # --- Status polling ---
206
238
 
@@ -272,9 +304,9 @@ class Executor(abc.ABC):
272
304
  @staticmethod
273
305
  async def _call(
274
306
  cmd: list[str],
275
- shell: bool = False,
276
307
  timeout: float = 100.0,
277
308
  env: dict[str, str] | None = None,
309
+ stdin_file: str | None = None,
278
310
  ) -> str:
279
311
  """Run a subprocess and return stdout.
280
312
 
@@ -284,31 +316,32 @@ class Executor(abc.ABC):
284
316
  if env:
285
317
  full_env = {**os.environ, **env}
286
318
 
287
- if shell:
288
- proc = await asyncio.create_subprocess_shell(
289
- cmd if isinstance(cmd, str) else " ".join(cmd),
290
- stdout=asyncio.subprocess.PIPE,
291
- stderr=asyncio.subprocess.PIPE,
292
- env=full_env,
293
- )
294
- else:
319
+ stdin_fh = None
320
+ try:
321
+ if stdin_file:
322
+ stdin_fh = open(stdin_file) # noqa: SIM115
295
323
  proc = await asyncio.create_subprocess_exec(
296
324
  *cmd,
325
+ stdin=stdin_fh,
297
326
  stdout=asyncio.subprocess.PIPE,
298
327
  stderr=asyncio.subprocess.PIPE,
299
328
  env=full_env,
300
329
  )
301
330
 
302
- try:
303
- stdout, stderr = await asyncio.wait_for(
304
- proc.communicate(),
305
- timeout=timeout,
306
- )
307
- except asyncio.TimeoutError:
308
- proc.kill()
309
- raise CommandTimeoutError(
310
- f"Command timed out after {timeout}s: {cmd}"
311
- )
331
+ try:
332
+ stdout, stderr = await asyncio.wait_for(
333
+ proc.communicate(),
334
+ timeout=timeout,
335
+ )
336
+ except asyncio.TimeoutError:
337
+ proc.kill()
338
+ await proc.wait()
339
+ raise CommandTimeoutError(
340
+ f"Command timed out after {timeout}s: {cmd}"
341
+ )
342
+ finally:
343
+ if stdin_fh:
344
+ stdin_fh.close()
312
345
 
313
346
  out = stdout.decode().strip()
314
347
  err = stderr.decode().strip()
@@ -88,6 +88,12 @@ class LocalExecutor(Executor):
88
88
  cwd: str | None = None,
89
89
  ) -> tuple[str, str | None]:
90
90
  """Spawn one subprocess per array element with ARRAY_INDEX env var."""
91
+ if max_concurrent is not None:
92
+ logger.warning(
93
+ "LocalExecutor does not support max_concurrent; "
94
+ "all %d elements will run simultaneously",
95
+ array_range[1] - array_range[0] + 1,
96
+ )
91
97
  header = self.build_header(name, resources)
92
98
  script = render_script(self.config, command, header, prologue, epilogue)
93
99
  script_path = write_script(resources.work_dir, script, name, next(self._script_counter))
@@ -191,37 +197,43 @@ class LocalExecutor(Executor):
191
197
 
192
198
  return {jid: r.status for jid, r in self._jobs.items()}
193
199
 
194
- async def cancel(self, job_id: str) -> None:
200
+ async def cancel(self, job_id: str, *, done: bool = False) -> None:
195
201
  """Terminate a local subprocess (or all element processes for an array job)."""
196
- # Kill single-job process if present
202
+ # Collect all live processes for this job (single + array elements)
203
+ live: list[tuple[str, asyncio.subprocess.Process]] = []
197
204
  proc = self._processes.get(job_id)
198
205
  if proc and proc.returncode is None:
199
- proc.terminate()
200
- try:
201
- await asyncio.wait_for(proc.wait(), timeout=5.0)
202
- except asyncio.TimeoutError:
203
- proc.kill()
204
-
205
- self._close_output_files(job_id)
206
-
207
- # Kill array element processes matching "{job_id}[*]"
206
+ live.append((job_id, proc))
208
207
  prefix = f"{job_id}["
209
208
  for key, proc in self._processes.items():
210
209
  if key.startswith(prefix) and proc.returncode is None:
211
- proc.terminate()
212
- try:
213
- await asyncio.wait_for(proc.wait(), timeout=5.0)
214
- except asyncio.TimeoutError:
215
- proc.kill()
216
- self._close_output_files(key)
217
-
210
+ live.append((key, proc))
211
+
212
+ # Send SIGTERM to all, then wait concurrently
213
+ for _key, p in live:
214
+ p.terminate()
215
+ if live:
216
+ tasks = [asyncio.ensure_future(p.wait()) for _key, p in live]
217
+ _, pending = await asyncio.wait(tasks, timeout=5.0)
218
+ # SIGKILL any that didn't exit in time
219
+ for _key, p in live:
220
+ if p.returncode is None:
221
+ p.kill()
222
+ # Reap the killed processes
223
+ if pending:
224
+ await asyncio.wait(pending, timeout=5.0)
225
+
226
+ for key, _p in live:
227
+ self._close_output_files(key)
228
+
229
+ target_status = JobStatus.DONE if done else JobStatus.KILLED
218
230
  if job_id in self._jobs:
219
231
  record = self._jobs[job_id]
220
- record.status = JobStatus.KILLED
232
+ record.status = target_status
221
233
  for elem in record.array_elements.values():
222
234
  if elem.status not in {JobStatus.DONE, JobStatus.FAILED, JobStatus.KILLED}:
223
- elem.status = JobStatus.KILLED
224
- logger.info("Cancelled local job %s", job_id)
235
+ elem.status = target_status
236
+ logger.info("Cancelled local job %s (done=%s)", job_id, done)
225
237
 
226
238
  def _open_output_files(
227
239
  self,
@@ -11,7 +11,7 @@ import re
11
11
  from datetime import datetime, timezone
12
12
  from typing import Any
13
13
 
14
- from .._types import ArrayElement, JobRecord, JobStatus, ResourceSpec
14
+ from .._types import ArrayElement, JobRecord, JobStatus, ResourceSpec, _TERMINAL_STATUSES
15
15
  from ..config import ClusterConfig, parse_memory_bytes
16
16
  from ..core import Executor, _ARRAY_ELEMENT_RE
17
17
  from ..exceptions import ClusterAPIError, CommandFailedError
@@ -82,8 +82,8 @@ class LSFExecutor(Executor):
82
82
 
83
83
  out = resources.stdout_path or f"{resources.work_dir}/stdout.%J.log"
84
84
  err = resources.stderr_path or f"{resources.work_dir}/stderr.%J.log"
85
- lines.append(f"{p} -o {out}")
86
- lines.append(f"{p} -e {err}")
85
+ lines.append(f'{p} -o "{out}"')
86
+ lines.append(f'{p} -e "{err}"')
87
87
 
88
88
  # Queue
89
89
  queue = resources.queue or self.config.queue
@@ -116,7 +116,7 @@ class LSFExecutor(Executor):
116
116
  lines.append(f"{p} -W {walltime}")
117
117
 
118
118
  # Working directory
119
- lines.append(f"{p} -cwd {resources.work_dir}")
119
+ lines.append(f'{p} -cwd "{resources.work_dir}"')
120
120
 
121
121
  # Custom cluster options
122
122
  if resources.extra_directives:
@@ -145,12 +145,11 @@ class LSFExecutor(Executor):
145
145
  ) -> str:
146
146
  """Run bsub with a script file and return raw output."""
147
147
  submit_env = self._build_submit_env(env)
148
- extra = " ".join(extra_args) + " " if extra_args else ""
149
- cmd = f"{self.submit_command} {extra}< {script_path}"
150
- logger.debug("Running: %s", cmd)
148
+ cmd = [self.submit_command, *(extra_args or [])]
149
+ logger.debug("Running: %s < %s", cmd, script_path)
151
150
  return await self._call(
152
151
  cmd,
153
- shell=True,
152
+ stdin_file=script_path,
154
153
  env=submit_env,
155
154
  timeout=self.config.command_timeout,
156
155
  )
@@ -220,14 +219,12 @@ class LSFExecutor(Executor):
220
219
 
221
220
  def _build_status_args(self) -> list[str]:
222
221
  """Build bjobs command with JSON output."""
223
- prefix = self._prefix
224
- args = [
225
- self.status_command,
226
- "-J", f"{prefix}-*",
227
- "-a",
228
- "-o", _BJOBS_FIELDS,
229
- "-json",
230
- ]
222
+ args = [self.status_command]
223
+ if self.config.poll_all_users:
224
+ args.extend(["-u", "all"])
225
+ if self._prefix:
226
+ args.extend(["-J", f"{self._prefix}-*"])
227
+ args.extend(["-a", "-o", _BJOBS_FIELDS, "-json"])
231
228
  return args
232
229
 
233
230
  def _parse_job_statuses(
@@ -282,11 +279,26 @@ class LSFExecutor(Executor):
282
279
 
283
280
  return result
284
281
 
282
+ async def _cancel_job(self, job_id: str, *, done: bool = False) -> None:
283
+ """Run bkill, with ``-d`` when *done* is True."""
284
+ cmd = [self.cancel_command]
285
+ if done:
286
+ cmd.append("-d")
287
+ cmd.append(job_id)
288
+ logger.debug("Running: %s", " ".join(cmd))
289
+ await self._call(cmd, timeout=self.config.command_timeout)
290
+
285
291
  async def cancel_by_name(self, name_pattern: str) -> None:
286
292
  """Cancel jobs matching name pattern via bkill -J."""
287
293
  cmd = [self.cancel_command, "-J", name_pattern]
288
294
  logger.debug("Running: %s", " ".join(cmd))
289
- await self._call(cmd, timeout=self.config.command_timeout)
295
+ try:
296
+ await self._call(cmd, timeout=self.config.command_timeout)
297
+ except CommandFailedError as e:
298
+ if "No matching job" in str(e) or "No unfinished job" in str(e):
299
+ logger.debug("No jobs matched pattern %s", name_pattern)
300
+ return
301
+ raise
290
302
  # Update in-memory state for matching jobs
291
303
  for record in self._jobs.values():
292
304
  if not record.is_terminal and fnmatch.fnmatch(record.name, name_pattern):
@@ -302,13 +314,16 @@ class LSFExecutor(Executor):
302
314
  "Cannot reconnect: no job_name_prefix was configured. "
303
315
  "Set job_name_prefix in config to enable reconnection."
304
316
  )
305
- return [
306
- self.status_command,
317
+ args = [self.status_command]
318
+ if self.config.poll_all_users:
319
+ args.extend(["-u", "all"])
320
+ args.extend([
307
321
  "-J", f"{self._prefix}-*",
308
322
  "-a",
309
323
  "-o", _BJOBS_RECONNECT_FIELDS,
310
324
  "-json",
311
- ]
325
+ ])
326
+ return args
312
327
 
313
328
  async def reconnect(self) -> list[JobRecord]:
314
329
  """Reconnect to running jobs and resume tracking them.
@@ -347,11 +362,14 @@ class LSFExecutor(Executor):
347
362
  new_records: list[JobRecord] = []
348
363
  now = datetime.now(timezone.utc)
349
364
 
350
- # Process single (non-array) jobs
365
+ # Process single (non-array) jobs, skipping terminal ones
366
+ # (-a returns DONE/EXIT jobs too; no point reconnecting to those)
351
367
  for job_id, entries in singles.items():
352
368
  if job_id in self._jobs:
353
369
  continue
354
370
  _, status, meta = entries[0]
371
+ if status in _TERMINAL_STATUSES:
372
+ continue
355
373
  record = JobRecord(
356
374
  job_id=job_id,
357
375
  name=meta.get("job_name") or "",
@@ -371,9 +389,12 @@ class LSFExecutor(Executor):
371
389
  new_records.append(record)
372
390
 
373
391
  # Process array elements, grouping under parent
392
+ # Skip arrays where every visible element is already terminal
374
393
  for parent_id, elements in arrays.items():
375
394
  if parent_id in self._jobs:
376
395
  continue
396
+ if all(s in _TERMINAL_STATUSES for _, s, _ in elements):
397
+ continue
377
398
  indices = sorted(idx for idx, _, _ in elements)
378
399
  array_range = (min(indices), max(indices))
379
400
 
@@ -0,0 +1,77 @@
1
+ # API Reference
2
+
3
+ ## `create_executor(profile=None, config_path=None, **overrides)`
4
+
5
+ Factory function that loads config and returns an `Executor` instance.
6
+
7
+ ## `Executor`
8
+
9
+ Abstract base class. Key methods:
10
+
11
+ - `submit(command, name, resources=None, prologue=None, epilogue=None, env=None, metadata=None)` — submit a job, returns `JobRecord`
12
+ - `submit_array(command, name, array_range, ...)` — submit a job array
13
+ - `cancel(job_id, *, done=False)` — cancel a job by ID. By default marks the job as `KILLED`; pass `done=True` to mark it as `DONE` instead (useful for graceful pipeline termination where you don't want downstream logic to treat the cancellation as a failure)
14
+ - `cancel_by_name(name_pattern)` — cancel jobs matching a name pattern (LSF only)
15
+ - `cancel_all(*, done=False)` — cancel all tracked non-terminal jobs
16
+ - `reconnect()` — rediscover running jobs after a process restart (requires `job_name_prefix`)
17
+ - `poll()` — query scheduler and update job statuses
18
+ - `jobs` / `active_jobs` — properties returning tracked job dicts
19
+
20
+ ## `JobRecord`
21
+
22
+ Tracks a submitted job. Fields include `job_id`, `name`, `status`, `exit_code`, `exec_host`, `max_mem`, `submit_time`, `start_time`, `finish_time`, and `metadata`.
23
+
24
+ - `on_success(callback)` — register callback for exit code 0
25
+ - `on_failure(callback)` — register callback for non-zero exit
26
+ - `on_exit(callback, condition=ANY)` — register callback for the given exit condition (any exit by default)
27
+ - `is_terminal` — whether the job has finished
28
+
29
+ ## `JobMonitor`
30
+
31
+ Async polling loop that drives status updates and callback dispatch.
32
+
33
+ - `start()` / `stop()` — control the polling loop
34
+ - `wait_for(*records, timeout=None)` — block until jobs reach a terminal state
35
+
36
+ The monitor does not support `async with`, so use `try/finally` to ensure cleanup:
37
+
38
+ ```python
39
+ monitor = JobMonitor(executor)
40
+ await monitor.start()
41
+ try:
42
+ job = await executor.submit(command="echo hi", name="test")
43
+ await monitor.wait_for(job)
44
+ finally:
45
+ await monitor.stop()
46
+ ```
47
+
48
+ ## `ResourceSpec`
49
+
50
+ Resource requirements: `cpus`, `gpus`, `memory`, `walltime`, `queue`, `work_dir`, `stdout_path`, `stderr_path`, `extra_directives`, `extra_args`.
51
+
52
+ ## Error Handling
53
+
54
+ All exceptions inherit from `ClusterAPIError`, so you can catch broadly or narrowly:
55
+
56
+ ```python
57
+ from cluster_api import ClusterAPIError, SubmitError, CommandTimeoutError, CommandFailedError
58
+
59
+ try:
60
+ job = await executor.submit(command="echo hi", name="test")
61
+ except SubmitError as e:
62
+ # Could not parse job ID from scheduler output
63
+ print(f"Submission failed: {e}")
64
+ except CommandTimeoutError as e:
65
+ # Scheduler command (bsub, bjobs, bkill) exceeded command_timeout
66
+ print(f"Scheduler timed out: {e}")
67
+ except CommandFailedError as e:
68
+ # Scheduler command returned a non-zero exit code
69
+ print(f"Scheduler error: {e}")
70
+ ```
71
+
72
+ | Exception | Raised when |
73
+ |---|---|
74
+ | `ClusterAPIError` | Base class for all library errors |
75
+ | `SubmitError` | Job ID could not be parsed from submit output |
76
+ | `CommandTimeoutError` | A scheduler CLI command exceeded `command_timeout` |
77
+ | `CommandFailedError` | A scheduler CLI command exited with non-zero status |
@@ -37,6 +37,7 @@ pixi run check # lint + test together
37
37
  | `test_lsf.py` | `LSFExecutor` header building, bsub submission, bjobs parsing, array rewriting | No — mocks `_call()` |
38
38
  | `test_local.py` | `LocalExecutor` end-to-end (submit, poll, output files, callbacks, array jobs) | **Yes** — runs real bash subprocesses |
39
39
  | `test_monitor.py` | `JobMonitor` polling loop, callback dispatch, zombie detection, purging | No — mocks `poll()` |
40
+ | `test_reconnect.py` | `LSFExecutor.reconnect()` — rediscovering running jobs after restart | No — mocks `_call()` |
40
41
  | `test_integration.py` | Full LSF round-trips (submit, monitor, cancel, arrays, metadata) | **Yes** — requires a live LSF cluster |
41
42
 
42
43
  ### Writing tests
@@ -102,8 +103,10 @@ JobMonitor (monitor.py) # async polling loop → callbacks + zombie detection
102
103
  ```
103
104
 
104
105
  - `build_header()` (per executor) produces directive lines from `ResourceSpec` + config defaults.
105
- - `extra_directives` (config-level and per-job) append custom flags — the directive prefix (e.g. `#BSUB`) is added automatically, so users write `"-P myproject"` not `"#BSUB -P myproject"`.
106
- - `extra_args` (config-level and per-job) append raw arguments to the submit command line (e.g. `bsub -P myproject script.sh`), bypassing the script entirely.
106
+ - `extra_directives` has two levels with different behaviour:
107
+ - **Config-level** (`ClusterConfig.extra_directives`): appended verbatim to the script header — users must include the full prefix, e.g. `"#BSUB -P myproject"`.
108
+ - **ResourceSpec-level** (`ResourceSpec.extra_directives`): the directive prefix is added automatically, so users write `"-P myproject"` and the executor produces `"#BSUB -P myproject"`.
109
+ - `extra_args` (config-level and per-job) append raw arguments to the submit command line, bypassing the script entirely. Both levels are merged at submit time: config-level args come first, then per-job (`ResourceSpec.extra_args`) args are appended.
107
110
  - `directives_skip` filters out unwanted directive lines by substring match.
108
111
  - Scripts are written to `{work_dir}/{safe_name}.{counter}.sh` and made executable.
109
112
 
@@ -133,8 +136,8 @@ Terminal jobs are purged from memory after `completed_retention_minutes` (once a
133
136
  ### Key design decisions
134
137
 
135
138
  - **Poll-based monitoring** — unlike dask-jobqueue (which relies on workers phoning home), this library actively polls the scheduler. This means it works with any executable, not just Python workers.
136
- - **File-based submission** — jobs are submitted via `bsub script.sh`, passing the script file path directly. The script is always written to disk before submission.
137
- - **Job name prefixing** — all jobs get a `{prefix}-{name}` name. The prefix is either configured (`job_name_prefix`) or randomly generated, so concurrent sessions don't collide when polling by name.
139
+ - **Stdin-based submission** — job scripts are written to disk, then submitted via stdin redirection (`bsub < script.sh`). The script file is kept on disk for debugging.
140
+ - **Job name prefixing** — when `job_name_prefix` is configured, all jobs get a `{prefix}-{name}` name and polling filters by that prefix. When unset, the user controls the full job name and polling queries all jobs. `reconnect()` requires a prefix to be set.
138
141
  - **Array status aggregation** — parent array job status is computed from element statuses. Only transitions to terminal when ALL expected elements are terminal.
139
142
 
140
143
  ## Module reference
@@ -5,6 +5,8 @@ environments:
5
5
  - url: https://conda.anaconda.org/conda-forge/
6
6
  indexes:
7
7
  - https://pypi.org/simple
8
+ options:
9
+ pypi-prerelease-mode: if-necessary-or-explicit
8
10
  packages:
9
11
  linux-64:
10
12
  - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
@@ -52,6 +54,8 @@ environments:
52
54
  - url: https://conda.anaconda.org/conda-forge/
53
55
  indexes:
54
56
  - https://pypi.org/simple
57
+ options:
58
+ pypi-prerelease-mode: if-necessary-or-explicit
55
59
  packages:
56
60
  linux-64:
57
61
  - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
@@ -841,8 +845,8 @@ packages:
841
845
  timestamp: 1764896838868
842
846
  - pypi: ./
843
847
  name: py-cluster-api
844
- version: 0.2.4
845
- sha256: fa7e3d392473de2f63cc6a0aff42c8491418e1cfb4d15f4d75d37dcdb48426f2
848
+ version: 0.4.0
849
+ sha256: 1dd95e2002e0e1b4908c3ea27e6c9b575ae0e2e514cf00a01289b554502ce15d
846
850
  requires_dist:
847
851
  - pyyaml
848
852
  - pytest ; extra == 'test'
@@ -851,7 +855,6 @@ packages:
851
855
  - build ; extra == 'release'
852
856
  - twine ; extra == 'release'
853
857
  requires_python: '>=3.10'
854
- editable: true
855
858
  - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda
856
859
  sha256: 79db7928d13fab2d892592223d7570f5061c192f27b9febd1a418427b719acc6
857
860
  md5: 12c566707c80111f9799308d9e265aef
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "py-cluster-api"
3
- version = "0.2.4"
3
+ version = "0.4.0"
4
4
  description = "Generic Python library for running jobs on HPC clusters"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -44,7 +44,7 @@ asyncio_mode = "auto"
44
44
  markers = ["integration: tests that submit real jobs to the cluster (deselected by default)"]
45
45
  addopts = "-m 'not integration'"
46
46
 
47
- [tool.pixi.project]
47
+ [tool.pixi.workspace]
48
48
  channels = ["conda-forge"]
49
49
  platforms = ["linux-64"]
50
50
 
@@ -7,6 +7,9 @@ memory: "1 GB"
7
7
  lsf_units: MB
8
8
  suppress_job_email: true
9
9
 
10
+ # Optional: request GPU resources
11
+ # gpus: 1
12
+
10
13
  # Optional: prologue commands to run before each job
11
14
  # script_prologue:
12
15
  # - "module load java/11"
@@ -115,21 +115,39 @@ class TestPrefix:
115
115
  executor = LocalExecutor(default_config)
116
116
  assert executor._prefix == "test"
117
117
 
118
- def test_random_prefix_when_none(self):
118
+ def test_no_prefix_when_none(self):
119
119
  from cluster_api.config import ClusterConfig
120
120
 
121
121
  config = ClusterConfig()
122
122
  executor = LocalExecutor(config)
123
- assert len(executor._prefix) == 5
124
- assert executor._prefix.isalnum()
123
+ assert executor._prefix is None
125
124
 
126
- def test_random_prefix_is_unique(self):
125
+ async def test_submit_no_prefix(self, work_dir):
127
126
  from cluster_api.config import ClusterConfig
128
127
 
129
128
  config = ClusterConfig()
130
- a = LocalExecutor(config)
131
- b = LocalExecutor(config)
132
- assert a._prefix != b._prefix
129
+ executor = LocalExecutor(config)
130
+ job = await executor.submit(
131
+ command="echo hello",
132
+ name="my-job",
133
+ resources=ResourceSpec(work_dir=work_dir),
134
+ )
135
+ assert job.name == "my-job"
136
+ await executor.cancel(job.job_id)
137
+
138
+ async def test_submit_array_no_prefix(self, work_dir):
139
+ from cluster_api.config import ClusterConfig
140
+
141
+ config = ClusterConfig()
142
+ executor = LocalExecutor(config)
143
+ job = await executor.submit_array(
144
+ command="echo hello",
145
+ name="my-array",
146
+ array_range=(1, 2),
147
+ resources=ResourceSpec(work_dir=work_dir),
148
+ )
149
+ assert job.name == "my-array"
150
+ await executor.cancel(job.job_id)
133
151
 
134
152
 
135
153
  class TestSanitizeJobName:
@@ -186,3 +204,14 @@ class TestCancelAll:
186
204
 
187
205
  await executor.cancel_all()
188
206
  assert job.status == JobStatus.KILLED
207
+
208
+ async def test_cancel_all_done(self, default_config, work_dir):
209
+ executor = LocalExecutor(default_config)
210
+ job = await executor.submit(
211
+ command="sleep 60", name="sleeper",
212
+ resources=ResourceSpec(work_dir=work_dir),
213
+ )
214
+ assert not job.is_terminal
215
+
216
+ await executor.cancel_all(done=True)
217
+ assert job.status == JobStatus.DONE
@@ -70,6 +70,16 @@ class TestLocalSubmitAndPoll:
70
70
  await executor.cancel(job.job_id)
71
71
  assert job.status == JobStatus.KILLED
72
72
 
73
+ async def test_cancel_done(self, default_config, work_dir):
74
+ executor = LocalExecutor(default_config)
75
+ job = await executor.submit(
76
+ command="sleep 60", name="cancel-done-test",
77
+ resources=ResourceSpec(work_dir=work_dir),
78
+ )
79
+
80
+ await asyncio.sleep(0.1)
81
+ await executor.cancel(job.job_id, done=True)
82
+ assert job.status == JobStatus.DONE
73
83
 
74
84
  async def test_multiple_jobs(self, default_config, work_dir):
75
85
  executor = LocalExecutor(default_config)
@@ -8,6 +8,7 @@ from unittest.mock import AsyncMock, patch
8
8
  import pytest
9
9
 
10
10
  from cluster_api._types import ArrayElement, JobRecord, JobStatus, ResourceSpec
11
+ from cluster_api.exceptions import CommandFailedError
11
12
  from cluster_api.executors.lsf import (
12
13
  LSFExecutor,
13
14
  _LSF_STATUS_MAP,
@@ -61,7 +62,7 @@ class TestBuildHeader:
61
62
  assert any("-n 4" in line for line in lines)
62
63
  assert any("span[hosts=1]" in line for line in lines)
63
64
  assert any("-W 08:00" in line for line in lines)
64
- assert any("-cwd /scratch" in line for line in lines)
65
+ assert any('-cwd "/scratch"' in line for line in lines)
65
66
 
66
67
  def test_single_cpu_no_span(self, lsf_config):
67
68
  executor = LSFExecutor(lsf_config)
@@ -267,6 +268,17 @@ class TestBuildStatusArgs:
267
268
  assert "-json" in args
268
269
  assert "test-*" in args
269
270
 
271
+ def test_status_args_no_prefix(self):
272
+ from cluster_api.config import ClusterConfig
273
+
274
+ config = ClusterConfig(executor="lsf", lsf_units="MB")
275
+ executor = LSFExecutor(config)
276
+ args = executor._build_status_args()
277
+ assert "bjobs" in args
278
+ assert "-a" in args
279
+ assert "-json" in args
280
+ assert "-J" not in args
281
+
270
282
 
271
283
  class TestSubmission:
272
284
 
@@ -285,11 +297,11 @@ class TestSubmission:
285
297
  assert job.job_id == "12345"
286
298
  assert job.name == "test-my-job"
287
299
  assert job.status == JobStatus.PENDING
288
- # Verify shell redirect submission
300
+ # Verify bsub invocation with stdin_file
289
301
  cmd = mock_call.call_args[0][0]
290
- assert "bsub" in cmd
291
- assert "< " in cmd
292
- assert cmd.endswith(".sh")
302
+ assert cmd[0] == "bsub"
303
+ kwargs = mock_call.call_args[1]
304
+ assert kwargs["stdin_file"].endswith(".sh")
293
305
 
294
306
 
295
307
  async def test_submit_email_suppression(self, lsf_config, work_dir):
@@ -350,6 +362,53 @@ class TestArrayScriptRewriting:
350
362
  assert "stderr.%J.%I.log" in script
351
363
 
352
364
 
365
+ class TestCancel:
366
+
367
+ async def test_cancel_passes_d_flag_when_done(self, lsf_config):
368
+ executor = LSFExecutor(lsf_config)
369
+ with patch.object(
370
+ executor, "_call",
371
+ new_callable=AsyncMock,
372
+ return_value="Job <123> is being submitted",
373
+ ):
374
+ job = await executor.submit(
375
+ command="echo hi", name="cancel-done",
376
+ resources=ResourceSpec(work_dir="/tmp"),
377
+ )
378
+
379
+ with patch.object(
380
+ executor, "_call",
381
+ new_callable=AsyncMock,
382
+ return_value="",
383
+ ) as mock_call:
384
+ await executor.cancel(job.job_id, done=True)
385
+ args = mock_call.call_args[0][0]
386
+ assert args == ["bkill", "-d", job.job_id]
387
+ assert job.status == JobStatus.DONE
388
+
389
+ async def test_cancel_without_done_flag(self, lsf_config):
390
+ executor = LSFExecutor(lsf_config)
391
+ with patch.object(
392
+ executor, "_call",
393
+ new_callable=AsyncMock,
394
+ return_value="Job <456> is being submitted",
395
+ ):
396
+ job = await executor.submit(
397
+ command="echo hi", name="cancel-kill",
398
+ resources=ResourceSpec(work_dir="/tmp"),
399
+ )
400
+
401
+ with patch.object(
402
+ executor, "_call",
403
+ new_callable=AsyncMock,
404
+ return_value="",
405
+ ) as mock_call:
406
+ await executor.cancel(job.job_id)
407
+ args = mock_call.call_args[0][0]
408
+ assert args == ["bkill", job.job_id]
409
+ assert job.status == JobStatus.KILLED
410
+
411
+
353
412
  class TestCancelByName:
354
413
 
355
414
  async def test_cancel_by_name(self, lsf_config):
@@ -366,6 +425,16 @@ class TestCancelByName:
366
425
  assert "-J" in args
367
426
  assert "test-*" in args
368
427
 
428
+ async def test_cancel_by_name_no_match(self, lsf_config):
429
+ """bkill -J returns non-zero when no jobs match; should not raise."""
430
+ executor = LSFExecutor(lsf_config)
431
+ with patch.object(
432
+ executor, "_call",
433
+ new_callable=AsyncMock,
434
+ side_effect=CommandFailedError("No matching job found"),
435
+ ):
436
+ await executor.cancel_by_name("nonexistent-*")
437
+
369
438
 
370
439
  class TestParseLsfTime:
371
440
  def test_standard_format(self):
@@ -61,7 +61,8 @@ class TestReconnectByPrefix:
61
61
  assert job.resources is None
62
62
  assert job.exec_host == "node01"
63
63
 
64
- async def test_completed_job(self, lsf_config):
64
+ async def test_completed_job_skipped(self, lsf_config):
65
+ """Terminal jobs from -a flag should not be reconnected."""
65
66
  executor = LSFExecutor(lsf_config)
66
67
  output = _make_bjobs_json([
67
68
  _make_record(
@@ -73,11 +74,10 @@ class TestReconnectByPrefix:
73
74
  with patch.object(executor, "_call", new_callable=AsyncMock, return_value=output):
74
75
  jobs = await executor.reconnect()
75
76
 
76
- assert len(jobs) == 1
77
- assert jobs[0].status == JobStatus.DONE
78
- assert jobs[0].exit_code == 0
77
+ assert len(jobs) == 0
79
78
 
80
- async def test_multiple_jobs(self, lsf_config):
79
+ async def test_multiple_jobs_filters_terminal(self, lsf_config):
80
+ """Only non-terminal jobs should be reconnected; DONE/EXIT are skipped."""
81
81
  executor = LSFExecutor(lsf_config)
82
82
  output = _make_bjobs_json([
83
83
  _make_record(job_id="100", job_name="test-a", stat="RUN"),
@@ -91,13 +91,12 @@ class TestReconnectByPrefix:
91
91
  with patch.object(executor, "_call", new_callable=AsyncMock, return_value=output):
92
92
  jobs = await executor.reconnect()
93
93
 
94
- assert len(jobs) == 3
94
+ assert len(jobs) == 2
95
95
  ids = {j.job_id for j in jobs}
96
- assert ids == {"100", "101", "102"}
96
+ assert ids == {"100", "101"}
97
97
  by_id = {j.job_id: j for j in jobs}
98
98
  assert by_id["100"].status == JobStatus.RUNNING
99
99
  assert by_id["101"].status == JobStatus.PENDING
100
- assert by_id["102"].status == JobStatus.DONE
101
100
 
102
101
  async def test_skips_already_tracked(self, lsf_config, work_dir):
103
102
  executor = LSFExecutor(lsf_config)
@@ -232,9 +231,9 @@ class TestReconnectArrayJobs:
232
231
 
233
232
  assert jobs[0].metadata["array_range"] == (5, 10)
234
233
 
235
- async def test_status_computed(self, lsf_config):
234
+ async def test_all_terminal_array_skipped(self, lsf_config):
235
+ """Array where all visible elements are terminal should not be reconnected."""
236
236
  executor = LSFExecutor(lsf_config)
237
- # All elements done → parent status should be DONE
238
237
  output = _make_bjobs_json([
239
238
  _make_record(job_id="600[1]", job_name="test-alldone", stat="DONE", exit_code="0"),
240
239
  _make_record(job_id="600[2]", job_name="test-alldone", stat="DONE", exit_code="0"),
@@ -243,9 +242,10 @@ class TestReconnectArrayJobs:
243
242
  with patch.object(executor, "_call", new_callable=AsyncMock, return_value=output):
244
243
  jobs = await executor.reconnect()
245
244
 
246
- assert jobs[0].status == JobStatus.DONE
245
+ assert len(jobs) == 0
247
246
 
248
- async def test_status_computed_with_failure(self, lsf_config):
247
+ async def test_all_terminal_array_with_failure_skipped(self, lsf_config):
248
+ """Array where all elements are terminal (even with failures) should not be reconnected."""
249
249
  executor = LSFExecutor(lsf_config)
250
250
  output = _make_bjobs_json([
251
251
  _make_record(job_id="700[1]", job_name="test-mixed", stat="DONE", exit_code="0"),
@@ -255,8 +255,7 @@ class TestReconnectArrayJobs:
255
255
  with patch.object(executor, "_call", new_callable=AsyncMock, return_value=output):
256
256
  jobs = await executor.reconnect()
257
257
 
258
- assert jobs[0].status == JobStatus.FAILED
259
- assert jobs[0].failed_element_indices == [2]
258
+ assert len(jobs) == 0
260
259
 
261
260
  async def test_mixed_single_and_array(self, lsf_config):
262
261
  executor = LSFExecutor(lsf_config)
@@ -280,8 +279,8 @@ class TestReconnectArrayJobs:
280
279
  executor = LSFExecutor(lsf_config)
281
280
  output = _make_bjobs_json([
282
281
  _make_record(
283
- job_id="1000[1]", job_name="test-meta", stat="DONE",
284
- exit_code="0", exec_host="node01", max_mem="256 MB",
282
+ job_id="1000[1]", job_name="test-meta", stat="RUN",
283
+ exec_host="node01", max_mem="256 MB",
285
284
  ),
286
285
  ])
287
286
 
@@ -291,7 +290,6 @@ class TestReconnectArrayJobs:
291
290
  elem = jobs[0].array_elements[1]
292
291
  assert elem.exec_host == "node01"
293
292
  assert elem.max_mem == "256 MB"
294
- assert elem.exit_code == 0
295
293
 
296
294
 
297
295
  class TestReconnectThenPoll:
File without changes