py-cluster-api 0.2.4__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/.github/workflows/ci.yml +2 -2
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/CLAUDE.md +4 -2
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/PKG-INFO +34 -41
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/README.md +33 -40
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/_types.py +21 -3
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/config.py +7 -1
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/core.py +75 -42
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/executors/local.py +33 -21
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/executors/lsf.py +42 -21
- py_cluster_api-0.4.0/docs/API.md +77 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/docs/Development.md +7 -4
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/pixi.lock +6 -3
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/pyproject.toml +2 -2
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/cluster_config.example.yaml +3 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_core.py +36 -7
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_local.py +10 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_lsf.py +74 -5
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_reconnect.py +15 -17
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/.gitignore +0 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/LICENSE +0 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/__init__.py +0 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/exceptions.py +0 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/executors/__init__.py +0 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/monitor.py +0 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/cluster_api/script.py +0 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/__init__.py +0 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/conftest.py +0 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_config.py +0 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_integration.py +0 -0
- {py_cluster_api-0.2.4 → py_cluster_api-0.4.0}/tests/test_monitor.py +0 -0
|
@@ -18,12 +18,12 @@ jobs:
|
|
|
18
18
|
runs-on: ubuntu-latest
|
|
19
19
|
|
|
20
20
|
steps:
|
|
21
|
-
- uses: actions/checkout@
|
|
21
|
+
- uses: actions/checkout@v5
|
|
22
22
|
|
|
23
23
|
- name: Set up Pixi
|
|
24
24
|
uses: prefix-dev/setup-pixi@v0.9.0
|
|
25
25
|
with:
|
|
26
|
-
pixi-version: v0.
|
|
26
|
+
pixi-version: v0.65.0
|
|
27
27
|
cache: true
|
|
28
28
|
|
|
29
29
|
- name: Lint
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
Generic Python library for submitting and monitoring jobs on HPC clusters. Wraps scheduler CLIs (bsub/bjobs/bkill) behind an async executor abstraction with an active polling monitor that fires callbacks on job completion. Inspired by dask-jobqueue's script templating and Nextflow's portable config profiles, but unlike dask-jobqueue, this library actively polls the scheduler rather than relying on workers phoning home.
|
|
4
4
|
|
|
5
|
+
Key capabilities beyond submit/poll/cancel: `reconnect()` rediscovers running jobs after a process restart (requires `job_name_prefix`), and `cancel_by_name()` kills jobs by name pattern (LSF only).
|
|
6
|
+
|
|
5
7
|
Founding principles: async-only API, executors are thin wrappers around scheduler CLIs, all state lives in `JobRecord` dataclasses tracked in-process, monitoring is poll-based via `bjobs -json`, and configuration uses Nextflow-style YAML profiles.
|
|
6
8
|
|
|
7
9
|
Always use `pixi run` to run commands — never invoke python, pytest, ruff, or other tools directly.
|
|
@@ -22,7 +24,7 @@ pixi run check # lint + test
|
|
|
22
24
|
|
|
23
25
|
- `cluster_api/` — library source
|
|
24
26
|
- `core.py` — abstract `Executor` base class
|
|
25
|
-
- `_types.py` — `JobStatus`, `JobRecord`, `ResourceSpec`, `JobExitCondition`, `ArrayElement`
|
|
27
|
+
- `_types.py` — `JobStatus`, `JobRecord`, `ResourceSpec` (`cpus`, `gpus`, …), `JobExitCondition`, `ArrayElement`
|
|
26
28
|
- `config.py` — YAML config loader with profiles
|
|
27
29
|
- `script.py` — script rendering (`render_script`) and writing (`write_script`)
|
|
28
30
|
- `monitor.py` — async polling loop + callback dispatch
|
|
@@ -41,7 +43,7 @@ Explicit `stdout_path` / `stderr_path` in `ResourceSpec` override these defaults
|
|
|
41
43
|
|
|
42
44
|
## Testing
|
|
43
45
|
|
|
44
|
-
All tests mock `Executor._call()` to avoid needing a real scheduler (except `test_local.py` which runs real subprocesses). Use `unittest.mock.patch` with `AsyncMock` for async method mocking.
|
|
46
|
+
All tests mock `Executor._call()` to avoid needing a real scheduler (except `test_local.py` which runs real subprocesses, and `test_integration.py` which requires a live LSF cluster and is skipped by default). Use `unittest.mock.patch` with `AsyncMock` for async method mocking.
|
|
45
47
|
|
|
46
48
|
## Style
|
|
47
49
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: py-cluster-api
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Generic Python library for running jobs on HPC clusters
|
|
5
5
|
Project-URL: Homepage, https://github.com/JaneliaSciComp/py-cluster-api
|
|
6
6
|
Project-URL: Repository, https://github.com/JaneliaSciComp/py-cluster-api
|
|
@@ -54,12 +54,17 @@ Description-Content-Type: text/markdown
|
|
|
54
54
|
|
|
55
55
|
[](https://github.com/JaneliaSciComp/py-cluster-api/actions/workflows/ci.yml)
|
|
56
56
|
|
|
57
|
-
A Python library for submitting and monitoring jobs on HPC clusters. Supports running arbitrary executables (Nextflow pipelines, Python scripts, Java tools, etc.) on
|
|
57
|
+
A Python library for submitting and monitoring jobs on HPC clusters. Supports running arbitrary executables (Nextflow pipelines, Python scripts, Java tools, etc.) on clusters and taking action when jobs complete via async callbacks.
|
|
58
|
+
|
|
59
|
+
## Executors
|
|
60
|
+
|
|
61
|
+
* Local Subprocess
|
|
62
|
+
* IBM Platform LSF
|
|
63
|
+
* We will accept PRs that implement and test additional executors (SLURM, etc.)
|
|
58
64
|
|
|
59
65
|
## Features
|
|
60
66
|
|
|
61
67
|
- **Async-first** — built on `asyncio` for non-blocking job submission and monitoring
|
|
62
|
-
- **LSF executor** — submit via `bsub`, monitor via `bjobs -json`, cancel via `bkill`
|
|
63
68
|
- **Local executor** — run jobs as local subprocesses for development and testing, including array jobs
|
|
64
69
|
- **Job monitoring** — polls the scheduler and fires callbacks on job completion, failure, or cancellation
|
|
65
70
|
- **Job arrays** — submit array jobs with per-element log files
|
|
@@ -97,7 +102,7 @@ async def main():
|
|
|
97
102
|
job = await executor.submit(
|
|
98
103
|
command="nextflow run nf-core/rnaseq --input samples.csv",
|
|
99
104
|
name="rnaseq-run",
|
|
100
|
-
resources=ResourceSpec(cpus=4, memory="32 GB", walltime="24:00", queue="long"),
|
|
105
|
+
resources=ResourceSpec(cpus=4, gpus=1, memory="32 GB", walltime="24:00", queue="long"),
|
|
101
106
|
env={"NXF_WORK": "/scratch/work"},
|
|
102
107
|
)
|
|
103
108
|
job.on_success(lambda j: print(f"Done! Job {j.job_id}, peak mem: {j.max_mem}"))
|
|
@@ -131,6 +136,26 @@ async def run_array():
|
|
|
131
136
|
|
|
132
137
|
The array index environment variable depends on the executor: LSF uses `$LSB_JOBINDEX`, while the local executor uses `$ARRAY_INDEX`.
|
|
133
138
|
|
|
139
|
+
### Reconnecting After Restart
|
|
140
|
+
|
|
141
|
+
If your process crashes or restarts, `reconnect()` rediscovers running jobs from the scheduler and resumes tracking them. Requires `job_name_prefix` to be set in config.
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
async def resume():
|
|
145
|
+
executor = create_executor(profile="janelia_lsf")
|
|
146
|
+
monitor = JobMonitor(executor)
|
|
147
|
+
await monitor.start()
|
|
148
|
+
|
|
149
|
+
recovered = await executor.reconnect()
|
|
150
|
+
for job in recovered:
|
|
151
|
+
print(f"Reconnected to {job.job_id} ({job.name}), status={job.status}")
|
|
152
|
+
job.on_exit(lambda j: print(f"Job {j.job_id} finished: {j.status}"))
|
|
153
|
+
|
|
154
|
+
if recovered:
|
|
155
|
+
await monitor.wait_for(*recovered)
|
|
156
|
+
await monitor.stop()
|
|
157
|
+
```
|
|
158
|
+
|
|
134
159
|
### Local Testing
|
|
135
160
|
|
|
136
161
|
```python
|
|
@@ -166,6 +191,7 @@ profiles:
|
|
|
166
191
|
janelia_lsf:
|
|
167
192
|
executor: lsf
|
|
168
193
|
queue: normal
|
|
194
|
+
gpus: 1
|
|
169
195
|
memory: "8 GB"
|
|
170
196
|
walltime: "04:00"
|
|
171
197
|
script_prologue:
|
|
@@ -182,15 +208,16 @@ profiles:
|
|
|
182
208
|
|---|---|---|
|
|
183
209
|
| `executor` | `"local"` | Backend: `lsf` or `local` |
|
|
184
210
|
| `cpus` | `None` | Default CPU count |
|
|
211
|
+
| `gpus` | `None` | Default GPU count |
|
|
185
212
|
| `memory` | `None` | Default memory (e.g. `"8 GB"`) |
|
|
186
213
|
| `walltime` | `None` | Default wall time (e.g. `"04:00"`) |
|
|
187
214
|
| `queue` | `None` | Default queue/partition |
|
|
188
215
|
| `poll_interval` | `10.0` | Seconds between status polls |
|
|
189
|
-
| `job_name_prefix` | `
|
|
216
|
+
| `job_name_prefix` | `None` | Optional prefix prepended to job names. When set, polling filters by `{prefix}-*` and `reconnect()` is available; when unset, the user controls the full job name and polling queries all jobs |
|
|
190
217
|
| `shebang` | `"#!/bin/bash"` | Script shebang line |
|
|
191
218
|
| `script_prologue` | `[]` | Lines inserted before the command |
|
|
192
219
|
| `script_epilogue` | `[]` | Lines inserted after the command |
|
|
193
|
-
| `extra_directives` | `[]` | Additional scheduler
|
|
220
|
+
| `extra_directives` | `[]` | Additional scheduler directive lines appended verbatim to the script header (e.g. `"#BSUB -P myproject"`) |
|
|
194
221
|
| `directives_skip` | `[]` | Substrings to filter out of directives |
|
|
195
222
|
| `extra_args` | `[]` | Extra CLI args appended to the submit command (e.g. `bsub`) |
|
|
196
223
|
| `lsf_units` | `"MB"` | LSF memory units (`KB`, `MB`, `GB`) |
|
|
@@ -201,41 +228,7 @@ profiles:
|
|
|
201
228
|
|
|
202
229
|
## API Reference
|
|
203
230
|
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
Factory function that loads config and returns an `Executor` instance.
|
|
207
|
-
|
|
208
|
-
### `Executor`
|
|
209
|
-
|
|
210
|
-
Abstract base class. Key methods:
|
|
211
|
-
|
|
212
|
-
- `submit(command, name, resources=None, prologue=None, epilogue=None, env=None, metadata=None)` — submit a job, returns `JobRecord`
|
|
213
|
-
- `submit_array(command, name, array_range, ...)` — submit a job array
|
|
214
|
-
- `cancel(job_id)` — cancel a job by ID
|
|
215
|
-
- `cancel_by_name(name_pattern)` — cancel by name pattern (LSF only)
|
|
216
|
-
- `cancel_all()` — cancel all tracked jobs
|
|
217
|
-
- `poll()` — query scheduler and update job statuses
|
|
218
|
-
- `jobs` / `active_jobs` — properties returning tracked job dicts
|
|
219
|
-
|
|
220
|
-
### `JobRecord`
|
|
221
|
-
|
|
222
|
-
Tracks a submitted job. Fields include `job_id`, `name`, `status`, `exit_code`, `exec_host`, `max_mem`, `submit_time`, `start_time`, `finish_time`, and `metadata`.
|
|
223
|
-
|
|
224
|
-
- `on_success(callback)` — register callback for exit code 0
|
|
225
|
-
- `on_failure(callback)` — register callback for non-zero exit
|
|
226
|
-
- `on_exit(callback, condition=ANY)` — register callback for any exit condition
|
|
227
|
-
- `is_terminal` — whether the job has finished
|
|
228
|
-
|
|
229
|
-
### `JobMonitor`
|
|
230
|
-
|
|
231
|
-
Async polling loop that drives status updates and callback dispatch.
|
|
232
|
-
|
|
233
|
-
- `start()` / `stop()` — control the polling loop
|
|
234
|
-
- `wait_for(*records, timeout=None)` — block until jobs reach a terminal state
|
|
235
|
-
|
|
236
|
-
### `ResourceSpec`
|
|
237
|
-
|
|
238
|
-
Resource requirements: `cpus`, `memory`, `walltime`, `queue`, `work_dir`, `stdout_path`, `stderr_path`, `extra_directives`, `extra_args`.
|
|
231
|
+
See [docs/API.md](docs/API.md) for the full API reference and error handling guide.
|
|
239
232
|
|
|
240
233
|
## Development
|
|
241
234
|
|
|
@@ -2,12 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://github.com/JaneliaSciComp/py-cluster-api/actions/workflows/ci.yml)
|
|
4
4
|
|
|
5
|
-
A Python library for submitting and monitoring jobs on HPC clusters. Supports running arbitrary executables (Nextflow pipelines, Python scripts, Java tools, etc.) on
|
|
5
|
+
A Python library for submitting and monitoring jobs on HPC clusters. Supports running arbitrary executables (Nextflow pipelines, Python scripts, Java tools, etc.) on clusters and taking action when jobs complete via async callbacks.
|
|
6
|
+
|
|
7
|
+
## Executors
|
|
8
|
+
|
|
9
|
+
* Local Subprocess
|
|
10
|
+
* IBM Platform LSF
|
|
11
|
+
* We will accept PRs that implement and test additional executors (SLURM, etc.)
|
|
6
12
|
|
|
7
13
|
## Features
|
|
8
14
|
|
|
9
15
|
- **Async-first** — built on `asyncio` for non-blocking job submission and monitoring
|
|
10
|
-
- **LSF executor** — submit via `bsub`, monitor via `bjobs -json`, cancel via `bkill`
|
|
11
16
|
- **Local executor** — run jobs as local subprocesses for development and testing, including array jobs
|
|
12
17
|
- **Job monitoring** — polls the scheduler and fires callbacks on job completion, failure, or cancellation
|
|
13
18
|
- **Job arrays** — submit array jobs with per-element log files
|
|
@@ -45,7 +50,7 @@ async def main():
|
|
|
45
50
|
job = await executor.submit(
|
|
46
51
|
command="nextflow run nf-core/rnaseq --input samples.csv",
|
|
47
52
|
name="rnaseq-run",
|
|
48
|
-
resources=ResourceSpec(cpus=4, memory="32 GB", walltime="24:00", queue="long"),
|
|
53
|
+
resources=ResourceSpec(cpus=4, gpus=1, memory="32 GB", walltime="24:00", queue="long"),
|
|
49
54
|
env={"NXF_WORK": "/scratch/work"},
|
|
50
55
|
)
|
|
51
56
|
job.on_success(lambda j: print(f"Done! Job {j.job_id}, peak mem: {j.max_mem}"))
|
|
@@ -79,6 +84,26 @@ async def run_array():
|
|
|
79
84
|
|
|
80
85
|
The array index environment variable depends on the executor: LSF uses `$LSB_JOBINDEX`, while the local executor uses `$ARRAY_INDEX`.
|
|
81
86
|
|
|
87
|
+
### Reconnecting After Restart
|
|
88
|
+
|
|
89
|
+
If your process crashes or restarts, `reconnect()` rediscovers running jobs from the scheduler and resumes tracking them. Requires `job_name_prefix` to be set in config.
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
async def resume():
|
|
93
|
+
executor = create_executor(profile="janelia_lsf")
|
|
94
|
+
monitor = JobMonitor(executor)
|
|
95
|
+
await monitor.start()
|
|
96
|
+
|
|
97
|
+
recovered = await executor.reconnect()
|
|
98
|
+
for job in recovered:
|
|
99
|
+
print(f"Reconnected to {job.job_id} ({job.name}), status={job.status}")
|
|
100
|
+
job.on_exit(lambda j: print(f"Job {j.job_id} finished: {j.status}"))
|
|
101
|
+
|
|
102
|
+
if recovered:
|
|
103
|
+
await monitor.wait_for(*recovered)
|
|
104
|
+
await monitor.stop()
|
|
105
|
+
```
|
|
106
|
+
|
|
82
107
|
### Local Testing
|
|
83
108
|
|
|
84
109
|
```python
|
|
@@ -114,6 +139,7 @@ profiles:
|
|
|
114
139
|
janelia_lsf:
|
|
115
140
|
executor: lsf
|
|
116
141
|
queue: normal
|
|
142
|
+
gpus: 1
|
|
117
143
|
memory: "8 GB"
|
|
118
144
|
walltime: "04:00"
|
|
119
145
|
script_prologue:
|
|
@@ -130,15 +156,16 @@ profiles:
|
|
|
130
156
|
|---|---|---|
|
|
131
157
|
| `executor` | `"local"` | Backend: `lsf` or `local` |
|
|
132
158
|
| `cpus` | `None` | Default CPU count |
|
|
159
|
+
| `gpus` | `None` | Default GPU count |
|
|
133
160
|
| `memory` | `None` | Default memory (e.g. `"8 GB"`) |
|
|
134
161
|
| `walltime` | `None` | Default wall time (e.g. `"04:00"`) |
|
|
135
162
|
| `queue` | `None` | Default queue/partition |
|
|
136
163
|
| `poll_interval` | `10.0` | Seconds between status polls |
|
|
137
|
-
| `job_name_prefix` | `
|
|
164
|
+
| `job_name_prefix` | `None` | Optional prefix prepended to job names. When set, polling filters by `{prefix}-*` and `reconnect()` is available; when unset, the user controls the full job name and polling queries all jobs |
|
|
138
165
|
| `shebang` | `"#!/bin/bash"` | Script shebang line |
|
|
139
166
|
| `script_prologue` | `[]` | Lines inserted before the command |
|
|
140
167
|
| `script_epilogue` | `[]` | Lines inserted after the command |
|
|
141
|
-
| `extra_directives` | `[]` | Additional scheduler
|
|
168
|
+
| `extra_directives` | `[]` | Additional scheduler directive lines appended verbatim to the script header (e.g. `"#BSUB -P myproject"`) |
|
|
142
169
|
| `directives_skip` | `[]` | Substrings to filter out of directives |
|
|
143
170
|
| `extra_args` | `[]` | Extra CLI args appended to the submit command (e.g. `bsub`) |
|
|
144
171
|
| `lsf_units` | `"MB"` | LSF memory units (`KB`, `MB`, `GB`) |
|
|
@@ -149,41 +176,7 @@ profiles:
|
|
|
149
176
|
|
|
150
177
|
## API Reference
|
|
151
178
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
Factory function that loads config and returns an `Executor` instance.
|
|
155
|
-
|
|
156
|
-
### `Executor`
|
|
157
|
-
|
|
158
|
-
Abstract base class. Key methods:
|
|
159
|
-
|
|
160
|
-
- `submit(command, name, resources=None, prologue=None, epilogue=None, env=None, metadata=None)` — submit a job, returns `JobRecord`
|
|
161
|
-
- `submit_array(command, name, array_range, ...)` — submit a job array
|
|
162
|
-
- `cancel(job_id)` — cancel a job by ID
|
|
163
|
-
- `cancel_by_name(name_pattern)` — cancel by name pattern (LSF only)
|
|
164
|
-
- `cancel_all()` — cancel all tracked jobs
|
|
165
|
-
- `poll()` — query scheduler and update job statuses
|
|
166
|
-
- `jobs` / `active_jobs` — properties returning tracked job dicts
|
|
167
|
-
|
|
168
|
-
### `JobRecord`
|
|
169
|
-
|
|
170
|
-
Tracks a submitted job. Fields include `job_id`, `name`, `status`, `exit_code`, `exec_host`, `max_mem`, `submit_time`, `start_time`, `finish_time`, and `metadata`.
|
|
171
|
-
|
|
172
|
-
- `on_success(callback)` — register callback for exit code 0
|
|
173
|
-
- `on_failure(callback)` — register callback for non-zero exit
|
|
174
|
-
- `on_exit(callback, condition=ANY)` — register callback for any exit condition
|
|
175
|
-
- `is_terminal` — whether the job has finished
|
|
176
|
-
|
|
177
|
-
### `JobMonitor`
|
|
178
|
-
|
|
179
|
-
Async polling loop that drives status updates and callback dispatch.
|
|
180
|
-
|
|
181
|
-
- `start()` / `stop()` — control the polling loop
|
|
182
|
-
- `wait_for(*records, timeout=None)` — block until jobs reach a terminal state
|
|
183
|
-
|
|
184
|
-
### `ResourceSpec`
|
|
185
|
-
|
|
186
|
-
Resource requirements: `cpus`, `memory`, `walltime`, `queue`, `work_dir`, `stdout_path`, `stderr_path`, `extra_directives`, `extra_args`.
|
|
179
|
+
See [docs/API.md](docs/API.md) for the full API reference and error handling guide.
|
|
187
180
|
|
|
188
181
|
## Development
|
|
189
182
|
|
|
@@ -34,7 +34,25 @@ _TERMINAL_STATUSES = frozenset({JobStatus.DONE, JobStatus.FAILED, JobStatus.KILL
|
|
|
34
34
|
|
|
35
35
|
@dataclass
|
|
36
36
|
class ResourceSpec:
|
|
37
|
-
"""Resource requirements for a job.
|
|
37
|
+
"""Resource requirements for a job.
|
|
38
|
+
|
|
39
|
+
Fields:
|
|
40
|
+
cpus: Number of CPU cores to request.
|
|
41
|
+
gpus: Number of GPUs to request.
|
|
42
|
+
memory: Memory limit as a string with unit, e.g. ``"16GB"`` or ``"500MB"``.
|
|
43
|
+
Passed directly to the scheduler directive.
|
|
44
|
+
walltime: Wall-clock time limit, e.g. ``"1:00"`` (h:mm) or ``"24:00:00"``.
|
|
45
|
+
Format depends on the target scheduler.
|
|
46
|
+
queue: Scheduler queue / partition name.
|
|
47
|
+
work_dir: Working directory for the job (defaults to ``os.getcwd()``).
|
|
48
|
+
stdout_path: Explicit path for stdout log. Overrides the executor's
|
|
49
|
+
default log naming (see CLAUDE.md § Log File Naming).
|
|
50
|
+
stderr_path: Explicit path for stderr log. Same override behaviour.
|
|
51
|
+
extra_directives: Raw scheduler directives injected into the job script
|
|
52
|
+
header (e.g. ``["#BSUB -R 'rusage[mem=16GB]'"]``).
|
|
53
|
+
extra_args: Extra command-line arguments appended to the submit command
|
|
54
|
+
(e.g. ``["-q", "gpu"]`` for ``bsub``).
|
|
55
|
+
"""
|
|
38
56
|
|
|
39
57
|
cpus: int | None = None
|
|
40
58
|
gpus: int | None = None
|
|
@@ -149,8 +167,8 @@ class JobRecord:
|
|
|
149
167
|
return JobStatus.RUNNING
|
|
150
168
|
|
|
151
169
|
# All expected elements accounted for and terminal
|
|
152
|
-
if JobStatus.KILLED in statuses:
|
|
153
|
-
return JobStatus.KILLED
|
|
154
170
|
if JobStatus.FAILED in statuses:
|
|
155
171
|
return JobStatus.FAILED
|
|
172
|
+
if JobStatus.KILLED in statuses:
|
|
173
|
+
return JobStatus.KILLED
|
|
156
174
|
return JobStatus.DONE
|
|
@@ -58,6 +58,7 @@ class ClusterConfig:
|
|
|
58
58
|
completed_retention_minutes: float = 10.0
|
|
59
59
|
command_timeout: float = 100.0
|
|
60
60
|
suppress_job_email: bool = True
|
|
61
|
+
poll_all_users: bool = False
|
|
61
62
|
|
|
62
63
|
|
|
63
64
|
_CONFIG_SEARCH_PATHS = [
|
|
@@ -109,7 +110,12 @@ def load_config(
|
|
|
109
110
|
|
|
110
111
|
profiles = raw.pop("profiles", {})
|
|
111
112
|
|
|
112
|
-
if profile
|
|
113
|
+
if profile:
|
|
114
|
+
if profile not in profiles:
|
|
115
|
+
available = ", ".join(sorted(profiles)) if profiles else "(none)"
|
|
116
|
+
raise ValueError(
|
|
117
|
+
f"Unknown profile {profile!r}; available profiles: {available}"
|
|
118
|
+
)
|
|
113
119
|
raw = {**raw, **profiles[profile]}
|
|
114
120
|
|
|
115
121
|
if overrides:
|
|
@@ -7,8 +7,6 @@ import asyncio
|
|
|
7
7
|
import logging
|
|
8
8
|
import os
|
|
9
9
|
import re
|
|
10
|
-
import secrets
|
|
11
|
-
import string
|
|
12
10
|
from datetime import datetime, timezone
|
|
13
11
|
from typing import Any
|
|
14
12
|
|
|
@@ -18,8 +16,10 @@ from ._types import ArrayElement, JobRecord, JobStatus, ResourceSpec
|
|
|
18
16
|
|
|
19
17
|
logger = logging.getLogger(__name__)
|
|
20
18
|
|
|
19
|
+
# Check for array element IDs like "12345[1]"
|
|
21
20
|
_ARRAY_ELEMENT_RE = re.compile(r"^(.+)\[(\d+)\]$")
|
|
22
21
|
|
|
22
|
+
# Matches characters that are unsafe in scheduler job names
|
|
23
23
|
_UNSAFE_NAME_RE = re.compile(r"[^\w\-.]")
|
|
24
24
|
|
|
25
25
|
|
|
@@ -29,7 +29,37 @@ def _sanitize_job_name(name: str) -> str:
|
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class Executor(abc.ABC):
|
|
32
|
-
"""Abstract base for cluster job executors.
|
|
32
|
+
"""Abstract base for cluster job executors.
|
|
33
|
+
|
|
34
|
+
Lifecycle:
|
|
35
|
+
1. **Construct** — instantiate with a ``ClusterConfig``.
|
|
36
|
+
2. **Submit** — call :meth:`submit` or :meth:`submit_array` to enqueue
|
|
37
|
+
jobs. Each returns a :class:`JobRecord` tracked in-process.
|
|
38
|
+
3. **Poll** — call :meth:`poll` (usually via :class:`~cluster_api.monitor.JobMonitor`)
|
|
39
|
+
to query the scheduler and update every tracked ``JobRecord``.
|
|
40
|
+
4. **Cancel** — call :meth:`cancel`, :meth:`cancel_all`, or
|
|
41
|
+
:meth:`cancel_by_name` to kill running jobs.
|
|
42
|
+
|
|
43
|
+
Subclass requirements:
|
|
44
|
+
Must implement:
|
|
45
|
+
- :meth:`_submit_job` — run the scheduler submit command.
|
|
46
|
+
- :meth:`_build_status_args` — build the CLI args for a status query.
|
|
47
|
+
- :meth:`_parse_job_statuses` — parse status output into per-job dicts.
|
|
48
|
+
|
|
49
|
+
May override:
|
|
50
|
+
- :meth:`_submit_array_job` — array submission (default delegates
|
|
51
|
+
to ``_submit_job``).
|
|
52
|
+
- :meth:`_cancel_job` — cancel a single job.
|
|
53
|
+
- :meth:`cancel_by_name` — cancel by name pattern.
|
|
54
|
+
- :meth:`reconnect` — rediscover running jobs after restart.
|
|
55
|
+
|
|
56
|
+
Class attributes:
|
|
57
|
+
submit_command: CLI executable used for submission (e.g. ``"bsub"``).
|
|
58
|
+
cancel_command: CLI executable used for cancellation (e.g. ``"bkill"``).
|
|
59
|
+
status_command: CLI executable used for status queries (e.g. ``"bjobs"``).
|
|
60
|
+
job_id_regexp: Regex with a ``job_id`` named group, applied to submit
|
|
61
|
+
output to extract the job ID.
|
|
62
|
+
"""
|
|
33
63
|
|
|
34
64
|
submit_command: str
|
|
35
65
|
cancel_command: str
|
|
@@ -39,13 +69,7 @@ class Executor(abc.ABC):
|
|
|
39
69
|
def __init__(self, config: ClusterConfig) -> None:
|
|
40
70
|
self.config = config
|
|
41
71
|
self._jobs: dict[str, JobRecord] = {}
|
|
42
|
-
|
|
43
|
-
self._prefix = config.job_name_prefix
|
|
44
|
-
else:
|
|
45
|
-
# Generate a random prefix so concurrent users/sessions don't
|
|
46
|
-
# see each other's jobs when polling by name.
|
|
47
|
-
alphabet = string.ascii_lowercase + string.digits
|
|
48
|
-
self._prefix = "".join(secrets.choice(alphabet) for _ in range(5))
|
|
72
|
+
self._prefix = config.job_name_prefix # None if not configured
|
|
49
73
|
|
|
50
74
|
# --- Submission ---
|
|
51
75
|
|
|
@@ -61,7 +85,7 @@ class Executor(abc.ABC):
|
|
|
61
85
|
) -> JobRecord:
|
|
62
86
|
"""Submit a job to the scheduler."""
|
|
63
87
|
resources = resources or ResourceSpec()
|
|
64
|
-
full_name = _sanitize_job_name(f"{self._prefix}-{name}")
|
|
88
|
+
full_name = _sanitize_job_name(f"{self._prefix}-{name}" if self._prefix else name)
|
|
65
89
|
|
|
66
90
|
job_id, script_path = await self._submit_job(
|
|
67
91
|
command, full_name, resources, prologue, epilogue, env,
|
|
@@ -96,7 +120,7 @@ class Executor(abc.ABC):
|
|
|
96
120
|
) -> JobRecord:
|
|
97
121
|
"""Submit a job array to the scheduler."""
|
|
98
122
|
resources = resources or ResourceSpec()
|
|
99
|
-
full_name = _sanitize_job_name(f"{self._prefix}-{name}")
|
|
123
|
+
full_name = _sanitize_job_name(f"{self._prefix}-{name}" if self._prefix else name)
|
|
100
124
|
|
|
101
125
|
job_id, script_path = await self._submit_array_job(
|
|
102
126
|
command, full_name, array_range, resources, prologue, epilogue,
|
|
@@ -172,18 +196,26 @@ class Executor(abc.ABC):
|
|
|
172
196
|
|
|
173
197
|
# --- Cancellation ---
|
|
174
198
|
|
|
175
|
-
async def cancel(self, job_id: str) -> None:
|
|
176
|
-
"""Cancel a job by ID.
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
199
|
+
async def cancel(self, job_id: str, *, done: bool = False) -> None:
|
|
200
|
+
"""Cancel a job by ID.
|
|
201
|
+
|
|
202
|
+
Args:
|
|
203
|
+
job_id: The job ID to cancel.
|
|
204
|
+
done: If True, mark the job as DONE instead of KILLED.
|
|
205
|
+
Subclasses may translate this into scheduler-specific flags.
|
|
206
|
+
"""
|
|
207
|
+
await self._cancel_job(job_id, done=done)
|
|
180
208
|
if job_id in self._jobs:
|
|
181
|
-
self._jobs[job_id].status = JobStatus.KILLED
|
|
182
|
-
logger.info("Cancelled job %s", job_id)
|
|
209
|
+
self._jobs[job_id].status = JobStatus.DONE if done else JobStatus.KILLED
|
|
210
|
+
logger.info("Cancelled job %s (done=%s)", job_id, done)
|
|
211
|
+
|
|
212
|
+
async def _cancel_job(self, job_id: str, *, done: bool = False) -> None:
|
|
213
|
+
"""Run the scheduler cancel command. Must be implemented by subclasses."""
|
|
214
|
+
raise NotImplementedError("cancel is not supported by this executor")
|
|
183
215
|
|
|
184
216
|
async def cancel_by_name(self, name_pattern: str) -> None:
|
|
185
217
|
"""Cancel jobs by name pattern. Override in subclasses for native support."""
|
|
186
|
-
raise NotImplementedError("cancel_by_name not supported by this executor")
|
|
218
|
+
raise NotImplementedError("cancel_by_name is not supported by this executor")
|
|
187
219
|
|
|
188
220
|
async def reconnect(self) -> list[JobRecord]:
|
|
189
221
|
"""Reconnect to running jobs and resume tracking them.
|
|
@@ -195,12 +227,12 @@ class Executor(abc.ABC):
|
|
|
195
227
|
Returns:
|
|
196
228
|
List of newly created ``JobRecord`` instances.
|
|
197
229
|
"""
|
|
198
|
-
raise NotImplementedError("reconnect not supported by this executor")
|
|
230
|
+
raise NotImplementedError("reconnect is not supported by this executor")
|
|
199
231
|
|
|
200
|
-
async def cancel_all(self) -> None:
|
|
232
|
+
async def cancel_all(self, *, done: bool = False) -> None:
|
|
201
233
|
"""Cancel all tracked jobs."""
|
|
202
234
|
to_cancel = [jid for jid, r in self._jobs.items() if not r.is_terminal]
|
|
203
|
-
await asyncio.gather(*(self.cancel(jid) for jid in to_cancel))
|
|
235
|
+
await asyncio.gather(*(self.cancel(jid, done=done) for jid in to_cancel))
|
|
204
236
|
|
|
205
237
|
# --- Status polling ---
|
|
206
238
|
|
|
@@ -272,9 +304,9 @@ class Executor(abc.ABC):
|
|
|
272
304
|
@staticmethod
|
|
273
305
|
async def _call(
|
|
274
306
|
cmd: list[str],
|
|
275
|
-
shell: bool = False,
|
|
276
307
|
timeout: float = 100.0,
|
|
277
308
|
env: dict[str, str] | None = None,
|
|
309
|
+
stdin_file: str | None = None,
|
|
278
310
|
) -> str:
|
|
279
311
|
"""Run a subprocess and return stdout.
|
|
280
312
|
|
|
@@ -284,31 +316,32 @@ class Executor(abc.ABC):
|
|
|
284
316
|
if env:
|
|
285
317
|
full_env = {**os.environ, **env}
|
|
286
318
|
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
stderr=asyncio.subprocess.PIPE,
|
|
292
|
-
env=full_env,
|
|
293
|
-
)
|
|
294
|
-
else:
|
|
319
|
+
stdin_fh = None
|
|
320
|
+
try:
|
|
321
|
+
if stdin_file:
|
|
322
|
+
stdin_fh = open(stdin_file) # noqa: SIM115
|
|
295
323
|
proc = await asyncio.create_subprocess_exec(
|
|
296
324
|
*cmd,
|
|
325
|
+
stdin=stdin_fh,
|
|
297
326
|
stdout=asyncio.subprocess.PIPE,
|
|
298
327
|
stderr=asyncio.subprocess.PIPE,
|
|
299
328
|
env=full_env,
|
|
300
329
|
)
|
|
301
330
|
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
331
|
+
try:
|
|
332
|
+
stdout, stderr = await asyncio.wait_for(
|
|
333
|
+
proc.communicate(),
|
|
334
|
+
timeout=timeout,
|
|
335
|
+
)
|
|
336
|
+
except asyncio.TimeoutError:
|
|
337
|
+
proc.kill()
|
|
338
|
+
await proc.wait()
|
|
339
|
+
raise CommandTimeoutError(
|
|
340
|
+
f"Command timed out after {timeout}s: {cmd}"
|
|
341
|
+
)
|
|
342
|
+
finally:
|
|
343
|
+
if stdin_fh:
|
|
344
|
+
stdin_fh.close()
|
|
312
345
|
|
|
313
346
|
out = stdout.decode().strip()
|
|
314
347
|
err = stderr.decode().strip()
|
|
@@ -88,6 +88,12 @@ class LocalExecutor(Executor):
|
|
|
88
88
|
cwd: str | None = None,
|
|
89
89
|
) -> tuple[str, str | None]:
|
|
90
90
|
"""Spawn one subprocess per array element with ARRAY_INDEX env var."""
|
|
91
|
+
if max_concurrent is not None:
|
|
92
|
+
logger.warning(
|
|
93
|
+
"LocalExecutor does not support max_concurrent; "
|
|
94
|
+
"all %d elements will run simultaneously",
|
|
95
|
+
array_range[1] - array_range[0] + 1,
|
|
96
|
+
)
|
|
91
97
|
header = self.build_header(name, resources)
|
|
92
98
|
script = render_script(self.config, command, header, prologue, epilogue)
|
|
93
99
|
script_path = write_script(resources.work_dir, script, name, next(self._script_counter))
|
|
@@ -191,37 +197,43 @@ class LocalExecutor(Executor):
|
|
|
191
197
|
|
|
192
198
|
return {jid: r.status for jid, r in self._jobs.items()}
|
|
193
199
|
|
|
194
|
-
async def cancel(self, job_id: str) -> None:
|
|
200
|
+
async def cancel(self, job_id: str, *, done: bool = False) -> None:
|
|
195
201
|
"""Terminate a local subprocess (or all element processes for an array job)."""
|
|
196
|
-
#
|
|
202
|
+
# Collect all live processes for this job (single + array elements)
|
|
203
|
+
live: list[tuple[str, asyncio.subprocess.Process]] = []
|
|
197
204
|
proc = self._processes.get(job_id)
|
|
198
205
|
if proc and proc.returncode is None:
|
|
199
|
-
|
|
200
|
-
try:
|
|
201
|
-
await asyncio.wait_for(proc.wait(), timeout=5.0)
|
|
202
|
-
except asyncio.TimeoutError:
|
|
203
|
-
proc.kill()
|
|
204
|
-
|
|
205
|
-
self._close_output_files(job_id)
|
|
206
|
-
|
|
207
|
-
# Kill array element processes matching "{job_id}[*]"
|
|
206
|
+
live.append((job_id, proc))
|
|
208
207
|
prefix = f"{job_id}["
|
|
209
208
|
for key, proc in self._processes.items():
|
|
210
209
|
if key.startswith(prefix) and proc.returncode is None:
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
210
|
+
live.append((key, proc))
|
|
211
|
+
|
|
212
|
+
# Send SIGTERM to all, then wait concurrently
|
|
213
|
+
for _key, p in live:
|
|
214
|
+
p.terminate()
|
|
215
|
+
if live:
|
|
216
|
+
tasks = [asyncio.ensure_future(p.wait()) for _key, p in live]
|
|
217
|
+
_, pending = await asyncio.wait(tasks, timeout=5.0)
|
|
218
|
+
# SIGKILL any that didn't exit in time
|
|
219
|
+
for _key, p in live:
|
|
220
|
+
if p.returncode is None:
|
|
221
|
+
p.kill()
|
|
222
|
+
# Reap the killed processes
|
|
223
|
+
if pending:
|
|
224
|
+
await asyncio.wait(pending, timeout=5.0)
|
|
225
|
+
|
|
226
|
+
for key, _p in live:
|
|
227
|
+
self._close_output_files(key)
|
|
228
|
+
|
|
229
|
+
target_status = JobStatus.DONE if done else JobStatus.KILLED
|
|
218
230
|
if job_id in self._jobs:
|
|
219
231
|
record = self._jobs[job_id]
|
|
220
|
-
record.status =
|
|
232
|
+
record.status = target_status
|
|
221
233
|
for elem in record.array_elements.values():
|
|
222
234
|
if elem.status not in {JobStatus.DONE, JobStatus.FAILED, JobStatus.KILLED}:
|
|
223
|
-
elem.status =
|
|
224
|
-
logger.info("Cancelled local job %s", job_id)
|
|
235
|
+
elem.status = target_status
|
|
236
|
+
logger.info("Cancelled local job %s (done=%s)", job_id, done)
|
|
225
237
|
|
|
226
238
|
def _open_output_files(
|
|
227
239
|
self,
|
|
@@ -11,7 +11,7 @@ import re
|
|
|
11
11
|
from datetime import datetime, timezone
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
|
-
from .._types import ArrayElement, JobRecord, JobStatus, ResourceSpec
|
|
14
|
+
from .._types import ArrayElement, JobRecord, JobStatus, ResourceSpec, _TERMINAL_STATUSES
|
|
15
15
|
from ..config import ClusterConfig, parse_memory_bytes
|
|
16
16
|
from ..core import Executor, _ARRAY_ELEMENT_RE
|
|
17
17
|
from ..exceptions import ClusterAPIError, CommandFailedError
|
|
@@ -82,8 +82,8 @@ class LSFExecutor(Executor):
|
|
|
82
82
|
|
|
83
83
|
out = resources.stdout_path or f"{resources.work_dir}/stdout.%J.log"
|
|
84
84
|
err = resources.stderr_path or f"{resources.work_dir}/stderr.%J.log"
|
|
85
|
-
lines.append(f
|
|
86
|
-
lines.append(f
|
|
85
|
+
lines.append(f'{p} -o "{out}"')
|
|
86
|
+
lines.append(f'{p} -e "{err}"')
|
|
87
87
|
|
|
88
88
|
# Queue
|
|
89
89
|
queue = resources.queue or self.config.queue
|
|
@@ -116,7 +116,7 @@ class LSFExecutor(Executor):
|
|
|
116
116
|
lines.append(f"{p} -W {walltime}")
|
|
117
117
|
|
|
118
118
|
# Working directory
|
|
119
|
-
lines.append(f
|
|
119
|
+
lines.append(f'{p} -cwd "{resources.work_dir}"')
|
|
120
120
|
|
|
121
121
|
# Custom cluster options
|
|
122
122
|
if resources.extra_directives:
|
|
@@ -145,12 +145,11 @@ class LSFExecutor(Executor):
|
|
|
145
145
|
) -> str:
|
|
146
146
|
"""Run bsub with a script file and return raw output."""
|
|
147
147
|
submit_env = self._build_submit_env(env)
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
logger.debug("Running: %s", cmd)
|
|
148
|
+
cmd = [self.submit_command, *(extra_args or [])]
|
|
149
|
+
logger.debug("Running: %s < %s", cmd, script_path)
|
|
151
150
|
return await self._call(
|
|
152
151
|
cmd,
|
|
153
|
-
|
|
152
|
+
stdin_file=script_path,
|
|
154
153
|
env=submit_env,
|
|
155
154
|
timeout=self.config.command_timeout,
|
|
156
155
|
)
|
|
@@ -220,14 +219,12 @@ class LSFExecutor(Executor):
|
|
|
220
219
|
|
|
221
220
|
def _build_status_args(self) -> list[str]:
|
|
222
221
|
"""Build bjobs command with JSON output."""
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
"-
|
|
228
|
-
|
|
229
|
-
"-json",
|
|
230
|
-
]
|
|
222
|
+
args = [self.status_command]
|
|
223
|
+
if self.config.poll_all_users:
|
|
224
|
+
args.extend(["-u", "all"])
|
|
225
|
+
if self._prefix:
|
|
226
|
+
args.extend(["-J", f"{self._prefix}-*"])
|
|
227
|
+
args.extend(["-a", "-o", _BJOBS_FIELDS, "-json"])
|
|
231
228
|
return args
|
|
232
229
|
|
|
233
230
|
def _parse_job_statuses(
|
|
@@ -282,11 +279,26 @@ class LSFExecutor(Executor):
|
|
|
282
279
|
|
|
283
280
|
return result
|
|
284
281
|
|
|
282
|
+
async def _cancel_job(self, job_id: str, *, done: bool = False) -> None:
|
|
283
|
+
"""Run bkill, with ``-d`` when *done* is True."""
|
|
284
|
+
cmd = [self.cancel_command]
|
|
285
|
+
if done:
|
|
286
|
+
cmd.append("-d")
|
|
287
|
+
cmd.append(job_id)
|
|
288
|
+
logger.debug("Running: %s", " ".join(cmd))
|
|
289
|
+
await self._call(cmd, timeout=self.config.command_timeout)
|
|
290
|
+
|
|
285
291
|
async def cancel_by_name(self, name_pattern: str) -> None:
|
|
286
292
|
"""Cancel jobs matching name pattern via bkill -J."""
|
|
287
293
|
cmd = [self.cancel_command, "-J", name_pattern]
|
|
288
294
|
logger.debug("Running: %s", " ".join(cmd))
|
|
289
|
-
|
|
295
|
+
try:
|
|
296
|
+
await self._call(cmd, timeout=self.config.command_timeout)
|
|
297
|
+
except CommandFailedError as e:
|
|
298
|
+
if "No matching job" in str(e) or "No unfinished job" in str(e):
|
|
299
|
+
logger.debug("No jobs matched pattern %s", name_pattern)
|
|
300
|
+
return
|
|
301
|
+
raise
|
|
290
302
|
# Update in-memory state for matching jobs
|
|
291
303
|
for record in self._jobs.values():
|
|
292
304
|
if not record.is_terminal and fnmatch.fnmatch(record.name, name_pattern):
|
|
@@ -302,13 +314,16 @@ class LSFExecutor(Executor):
|
|
|
302
314
|
"Cannot reconnect: no job_name_prefix was configured. "
|
|
303
315
|
"Set job_name_prefix in config to enable reconnection."
|
|
304
316
|
)
|
|
305
|
-
|
|
306
|
-
|
|
317
|
+
args = [self.status_command]
|
|
318
|
+
if self.config.poll_all_users:
|
|
319
|
+
args.extend(["-u", "all"])
|
|
320
|
+
args.extend([
|
|
307
321
|
"-J", f"{self._prefix}-*",
|
|
308
322
|
"-a",
|
|
309
323
|
"-o", _BJOBS_RECONNECT_FIELDS,
|
|
310
324
|
"-json",
|
|
311
|
-
]
|
|
325
|
+
])
|
|
326
|
+
return args
|
|
312
327
|
|
|
313
328
|
async def reconnect(self) -> list[JobRecord]:
|
|
314
329
|
"""Reconnect to running jobs and resume tracking them.
|
|
@@ -347,11 +362,14 @@ class LSFExecutor(Executor):
|
|
|
347
362
|
new_records: list[JobRecord] = []
|
|
348
363
|
now = datetime.now(timezone.utc)
|
|
349
364
|
|
|
350
|
-
# Process single (non-array) jobs
|
|
365
|
+
# Process single (non-array) jobs, skipping terminal ones
|
|
366
|
+
# (-a returns DONE/EXIT jobs too; no point reconnecting to those)
|
|
351
367
|
for job_id, entries in singles.items():
|
|
352
368
|
if job_id in self._jobs:
|
|
353
369
|
continue
|
|
354
370
|
_, status, meta = entries[0]
|
|
371
|
+
if status in _TERMINAL_STATUSES:
|
|
372
|
+
continue
|
|
355
373
|
record = JobRecord(
|
|
356
374
|
job_id=job_id,
|
|
357
375
|
name=meta.get("job_name") or "",
|
|
@@ -371,9 +389,12 @@ class LSFExecutor(Executor):
|
|
|
371
389
|
new_records.append(record)
|
|
372
390
|
|
|
373
391
|
# Process array elements, grouping under parent
|
|
392
|
+
# Skip arrays where every visible element is already terminal
|
|
374
393
|
for parent_id, elements in arrays.items():
|
|
375
394
|
if parent_id in self._jobs:
|
|
376
395
|
continue
|
|
396
|
+
if all(s in _TERMINAL_STATUSES for _, s, _ in elements):
|
|
397
|
+
continue
|
|
377
398
|
indices = sorted(idx for idx, _, _ in elements)
|
|
378
399
|
array_range = (min(indices), max(indices))
|
|
379
400
|
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# API Reference
|
|
2
|
+
|
|
3
|
+
## `create_executor(profile=None, config_path=None, **overrides)`
|
|
4
|
+
|
|
5
|
+
Factory function that loads config and returns an `Executor` instance.
|
|
6
|
+
|
|
7
|
+
## `Executor`
|
|
8
|
+
|
|
9
|
+
Abstract base class. Key methods:
|
|
10
|
+
|
|
11
|
+
- `submit(command, name, resources=None, prologue=None, epilogue=None, env=None, metadata=None)` — submit a job, returns `JobRecord`
|
|
12
|
+
- `submit_array(command, name, array_range, ...)` — submit a job array
|
|
13
|
+
- `cancel(job_id, *, done=False)` — cancel a job by ID. By default marks the job as `KILLED`; pass `done=True` to mark it as `DONE` instead (useful for graceful pipeline termination where you don't want downstream logic to treat the cancellation as a failure)
|
|
14
|
+
- `cancel_by_name(name_pattern)` — cancel jobs matching a name pattern (LSF only)
|
|
15
|
+
- `cancel_all(*, done=False)` — cancel all tracked non-terminal jobs
|
|
16
|
+
- `reconnect()` — rediscover running jobs after a process restart (requires `job_name_prefix`)
|
|
17
|
+
- `poll()` — query scheduler and update job statuses
|
|
18
|
+
- `jobs` / `active_jobs` — properties returning tracked job dicts
|
|
19
|
+
|
|
20
|
+
## `JobRecord`
|
|
21
|
+
|
|
22
|
+
Tracks a submitted job. Fields include `job_id`, `name`, `status`, `exit_code`, `exec_host`, `max_mem`, `submit_time`, `start_time`, `finish_time`, and `metadata`.
|
|
23
|
+
|
|
24
|
+
- `on_success(callback)` — register callback for exit code 0
|
|
25
|
+
- `on_failure(callback)` — register callback for non-zero exit
|
|
26
|
+
- `on_exit(callback, condition=ANY)` — register callback for any exit condition
|
|
27
|
+
- `is_terminal` — whether the job has finished
|
|
28
|
+
|
|
29
|
+
## `JobMonitor`
|
|
30
|
+
|
|
31
|
+
Async polling loop that drives status updates and callback dispatch.
|
|
32
|
+
|
|
33
|
+
- `start()` / `stop()` — control the polling loop
|
|
34
|
+
- `wait_for(*records, timeout=None)` — block until jobs reach a terminal state
|
|
35
|
+
|
|
36
|
+
The monitor does not support `async with`, so use `try/finally` to ensure cleanup:
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
monitor = JobMonitor(executor)
|
|
40
|
+
await monitor.start()
|
|
41
|
+
try:
|
|
42
|
+
job = await executor.submit(command="echo hi", name="test")
|
|
43
|
+
await monitor.wait_for(job)
|
|
44
|
+
finally:
|
|
45
|
+
await monitor.stop()
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## `ResourceSpec`
|
|
49
|
+
|
|
50
|
+
Resource requirements: `cpus`, `gpus`, `memory`, `walltime`, `queue`, `work_dir`, `stdout_path`, `stderr_path`, `extra_directives`, `extra_args`.
|
|
51
|
+
|
|
52
|
+
## Error Handling
|
|
53
|
+
|
|
54
|
+
All exceptions inherit from `ClusterAPIError`, so you can catch broadly or narrowly:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from cluster_api import ClusterAPIError, SubmitError, CommandTimeoutError, CommandFailedError
|
|
58
|
+
|
|
59
|
+
try:
|
|
60
|
+
job = await executor.submit(command="echo hi", name="test")
|
|
61
|
+
except SubmitError as e:
|
|
62
|
+
# Could not parse job ID from scheduler output
|
|
63
|
+
print(f"Submission failed: {e}")
|
|
64
|
+
except CommandTimeoutError as e:
|
|
65
|
+
# Scheduler command (bsub, bjobs, bkill) exceeded command_timeout
|
|
66
|
+
print(f"Scheduler timed out: {e}")
|
|
67
|
+
except CommandFailedError as e:
|
|
68
|
+
# Scheduler command returned a non-zero exit code
|
|
69
|
+
print(f"Scheduler error: {e}")
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
| Exception | Raised when |
|
|
73
|
+
|---|---|
|
|
74
|
+
| `ClusterAPIError` | Base class for all library errors |
|
|
75
|
+
| `SubmitError` | Job ID could not be parsed from submit output |
|
|
76
|
+
| `CommandTimeoutError` | A scheduler CLI command exceeded `command_timeout` |
|
|
77
|
+
| `CommandFailedError` | A scheduler CLI command exited with non-zero status |
|
|
@@ -37,6 +37,7 @@ pixi run check # lint + test together
|
|
|
37
37
|
| `test_lsf.py` | `LSFExecutor` header building, bsub submission, bjobs parsing, array rewriting | No — mocks `_call()` |
|
|
38
38
|
| `test_local.py` | `LocalExecutor` end-to-end (submit, poll, output files, callbacks, array jobs) | **Yes** — runs real bash subprocesses |
|
|
39
39
|
| `test_monitor.py` | `JobMonitor` polling loop, callback dispatch, zombie detection, purging | No — mocks `poll()` |
|
|
40
|
+
| `test_reconnect.py` | `LSFExecutor.reconnect()` — rediscovering running jobs after restart | No — mocks `_call()` |
|
|
40
41
|
| `test_integration.py` | Full LSF round-trips (submit, monitor, cancel, arrays, metadata) | **Yes** — requires a live LSF cluster |
|
|
41
42
|
|
|
42
43
|
### Writing tests
|
|
@@ -102,8 +103,10 @@ JobMonitor (monitor.py) # async polling loop → callbacks + zombie detection
|
|
|
102
103
|
```
|
|
103
104
|
|
|
104
105
|
- `build_header()` (per executor) produces directive lines from `ResourceSpec` + config defaults.
|
|
105
|
-
- `extra_directives`
|
|
106
|
-
- `
|
|
106
|
+
- `extra_directives` has two levels with different behaviour:
|
|
107
|
+
- **Config-level** (`ClusterConfig.extra_directives`): appended verbatim to the script header — users must include the full prefix, e.g. `"#BSUB -P myproject"`.
|
|
108
|
+
- **ResourceSpec-level** (`ResourceSpec.extra_directives`): the directive prefix is added automatically, so users write `"-P myproject"` and the executor produces `"#BSUB -P myproject"`.
|
|
109
|
+
- `extra_args` (config-level and per-job) append raw arguments to the submit command line, bypassing the script entirely. Both levels are merged at submit time: config-level args come first, then per-job (`ResourceSpec.extra_args`) args are appended.
|
|
107
110
|
- `directives_skip` filters out unwanted directive lines by substring match.
|
|
108
111
|
- Scripts are written to `{work_dir}/{safe_name}.{counter}.sh` and made executable.
|
|
109
112
|
|
|
@@ -133,8 +136,8 @@ Terminal jobs are purged from memory after `completed_retention_minutes` (once a
|
|
|
133
136
|
### Key design decisions
|
|
134
137
|
|
|
135
138
|
- **Poll-based monitoring** — unlike dask-jobqueue (which relies on workers phoning home), this library actively polls the scheduler. This means it works with any executable, not just Python workers.
|
|
136
|
-
- **
|
|
137
|
-
- **Job name prefixing** — all jobs get a `{prefix}-{name}` name
|
|
139
|
+
- **Stdin-based submission** — job scripts are written to disk, then submitted via stdin redirection (`bsub < script.sh`). The script file is kept on disk for debugging.
|
|
140
|
+
- **Job name prefixing** — when `job_name_prefix` is configured, all jobs get a `{prefix}-{name}` name and polling filters by that prefix. When unset, the user controls the full job name and polling queries all jobs. `reconnect()` requires a prefix to be set.
|
|
138
141
|
- **Array status aggregation** — parent array job status is computed from element statuses. Only transitions to terminal when ALL expected elements are terminal.
|
|
139
142
|
|
|
140
143
|
## Module reference
|
|
@@ -5,6 +5,8 @@ environments:
|
|
|
5
5
|
- url: https://conda.anaconda.org/conda-forge/
|
|
6
6
|
indexes:
|
|
7
7
|
- https://pypi.org/simple
|
|
8
|
+
options:
|
|
9
|
+
pypi-prerelease-mode: if-necessary-or-explicit
|
|
8
10
|
packages:
|
|
9
11
|
linux-64:
|
|
10
12
|
- conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
|
|
@@ -52,6 +54,8 @@ environments:
|
|
|
52
54
|
- url: https://conda.anaconda.org/conda-forge/
|
|
53
55
|
indexes:
|
|
54
56
|
- https://pypi.org/simple
|
|
57
|
+
options:
|
|
58
|
+
pypi-prerelease-mode: if-necessary-or-explicit
|
|
55
59
|
packages:
|
|
56
60
|
linux-64:
|
|
57
61
|
- conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
|
|
@@ -841,8 +845,8 @@ packages:
|
|
|
841
845
|
timestamp: 1764896838868
|
|
842
846
|
- pypi: ./
|
|
843
847
|
name: py-cluster-api
|
|
844
|
-
version: 0.
|
|
845
|
-
sha256:
|
|
848
|
+
version: 0.4.0
|
|
849
|
+
sha256: 1dd95e2002e0e1b4908c3ea27e6c9b575ae0e2e514cf00a01289b554502ce15d
|
|
846
850
|
requires_dist:
|
|
847
851
|
- pyyaml
|
|
848
852
|
- pytest ; extra == 'test'
|
|
@@ -851,7 +855,6 @@ packages:
|
|
|
851
855
|
- build ; extra == 'release'
|
|
852
856
|
- twine ; extra == 'release'
|
|
853
857
|
requires_python: '>=3.10'
|
|
854
|
-
editable: true
|
|
855
858
|
- conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda
|
|
856
859
|
sha256: 79db7928d13fab2d892592223d7570f5061c192f27b9febd1a418427b719acc6
|
|
857
860
|
md5: 12c566707c80111f9799308d9e265aef
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "py-cluster-api"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.4.0"
|
|
4
4
|
description = "Generic Python library for running jobs on HPC clusters"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
@@ -44,7 +44,7 @@ asyncio_mode = "auto"
|
|
|
44
44
|
markers = ["integration: tests that submit real jobs to the cluster (deselected by default)"]
|
|
45
45
|
addopts = "-m 'not integration'"
|
|
46
46
|
|
|
47
|
-
[tool.pixi.
|
|
47
|
+
[tool.pixi.workspace]
|
|
48
48
|
channels = ["conda-forge"]
|
|
49
49
|
platforms = ["linux-64"]
|
|
50
50
|
|
|
@@ -115,21 +115,39 @@ class TestPrefix:
|
|
|
115
115
|
executor = LocalExecutor(default_config)
|
|
116
116
|
assert executor._prefix == "test"
|
|
117
117
|
|
|
118
|
-
def
|
|
118
|
+
def test_no_prefix_when_none(self):
|
|
119
119
|
from cluster_api.config import ClusterConfig
|
|
120
120
|
|
|
121
121
|
config = ClusterConfig()
|
|
122
122
|
executor = LocalExecutor(config)
|
|
123
|
-
assert
|
|
124
|
-
assert executor._prefix.isalnum()
|
|
123
|
+
assert executor._prefix is None
|
|
125
124
|
|
|
126
|
-
def
|
|
125
|
+
async def test_submit_no_prefix(self, work_dir):
|
|
127
126
|
from cluster_api.config import ClusterConfig
|
|
128
127
|
|
|
129
128
|
config = ClusterConfig()
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
129
|
+
executor = LocalExecutor(config)
|
|
130
|
+
job = await executor.submit(
|
|
131
|
+
command="echo hello",
|
|
132
|
+
name="my-job",
|
|
133
|
+
resources=ResourceSpec(work_dir=work_dir),
|
|
134
|
+
)
|
|
135
|
+
assert job.name == "my-job"
|
|
136
|
+
await executor.cancel(job.job_id)
|
|
137
|
+
|
|
138
|
+
async def test_submit_array_no_prefix(self, work_dir):
|
|
139
|
+
from cluster_api.config import ClusterConfig
|
|
140
|
+
|
|
141
|
+
config = ClusterConfig()
|
|
142
|
+
executor = LocalExecutor(config)
|
|
143
|
+
job = await executor.submit_array(
|
|
144
|
+
command="echo hello",
|
|
145
|
+
name="my-array",
|
|
146
|
+
array_range=(1, 2),
|
|
147
|
+
resources=ResourceSpec(work_dir=work_dir),
|
|
148
|
+
)
|
|
149
|
+
assert job.name == "my-array"
|
|
150
|
+
await executor.cancel(job.job_id)
|
|
133
151
|
|
|
134
152
|
|
|
135
153
|
class TestSanitizeJobName:
|
|
@@ -186,3 +204,14 @@ class TestCancelAll:
|
|
|
186
204
|
|
|
187
205
|
await executor.cancel_all()
|
|
188
206
|
assert job.status == JobStatus.KILLED
|
|
207
|
+
|
|
208
|
+
async def test_cancel_all_done(self, default_config, work_dir):
|
|
209
|
+
executor = LocalExecutor(default_config)
|
|
210
|
+
job = await executor.submit(
|
|
211
|
+
command="sleep 60", name="sleeper",
|
|
212
|
+
resources=ResourceSpec(work_dir=work_dir),
|
|
213
|
+
)
|
|
214
|
+
assert not job.is_terminal
|
|
215
|
+
|
|
216
|
+
await executor.cancel_all(done=True)
|
|
217
|
+
assert job.status == JobStatus.DONE
|
|
@@ -70,6 +70,16 @@ class TestLocalSubmitAndPoll:
|
|
|
70
70
|
await executor.cancel(job.job_id)
|
|
71
71
|
assert job.status == JobStatus.KILLED
|
|
72
72
|
|
|
73
|
+
async def test_cancel_done(self, default_config, work_dir):
|
|
74
|
+
executor = LocalExecutor(default_config)
|
|
75
|
+
job = await executor.submit(
|
|
76
|
+
command="sleep 60", name="cancel-done-test",
|
|
77
|
+
resources=ResourceSpec(work_dir=work_dir),
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
await asyncio.sleep(0.1)
|
|
81
|
+
await executor.cancel(job.job_id, done=True)
|
|
82
|
+
assert job.status == JobStatus.DONE
|
|
73
83
|
|
|
74
84
|
async def test_multiple_jobs(self, default_config, work_dir):
|
|
75
85
|
executor = LocalExecutor(default_config)
|
|
@@ -8,6 +8,7 @@ from unittest.mock import AsyncMock, patch
|
|
|
8
8
|
import pytest
|
|
9
9
|
|
|
10
10
|
from cluster_api._types import ArrayElement, JobRecord, JobStatus, ResourceSpec
|
|
11
|
+
from cluster_api.exceptions import CommandFailedError
|
|
11
12
|
from cluster_api.executors.lsf import (
|
|
12
13
|
LSFExecutor,
|
|
13
14
|
_LSF_STATUS_MAP,
|
|
@@ -61,7 +62,7 @@ class TestBuildHeader:
|
|
|
61
62
|
assert any("-n 4" in line for line in lines)
|
|
62
63
|
assert any("span[hosts=1]" in line for line in lines)
|
|
63
64
|
assert any("-W 08:00" in line for line in lines)
|
|
64
|
-
assert any(
|
|
65
|
+
assert any('-cwd "/scratch"' in line for line in lines)
|
|
65
66
|
|
|
66
67
|
def test_single_cpu_no_span(self, lsf_config):
|
|
67
68
|
executor = LSFExecutor(lsf_config)
|
|
@@ -267,6 +268,17 @@ class TestBuildStatusArgs:
|
|
|
267
268
|
assert "-json" in args
|
|
268
269
|
assert "test-*" in args
|
|
269
270
|
|
|
271
|
+
def test_status_args_no_prefix(self):
|
|
272
|
+
from cluster_api.config import ClusterConfig
|
|
273
|
+
|
|
274
|
+
config = ClusterConfig(executor="lsf", lsf_units="MB")
|
|
275
|
+
executor = LSFExecutor(config)
|
|
276
|
+
args = executor._build_status_args()
|
|
277
|
+
assert "bjobs" in args
|
|
278
|
+
assert "-a" in args
|
|
279
|
+
assert "-json" in args
|
|
280
|
+
assert "-J" not in args
|
|
281
|
+
|
|
270
282
|
|
|
271
283
|
class TestSubmission:
|
|
272
284
|
|
|
@@ -285,11 +297,11 @@ class TestSubmission:
|
|
|
285
297
|
assert job.job_id == "12345"
|
|
286
298
|
assert job.name == "test-my-job"
|
|
287
299
|
assert job.status == JobStatus.PENDING
|
|
288
|
-
# Verify
|
|
300
|
+
# Verify bsub invocation with stdin_file
|
|
289
301
|
cmd = mock_call.call_args[0][0]
|
|
290
|
-
assert "bsub"
|
|
291
|
-
|
|
292
|
-
assert
|
|
302
|
+
assert cmd[0] == "bsub"
|
|
303
|
+
kwargs = mock_call.call_args[1]
|
|
304
|
+
assert kwargs["stdin_file"].endswith(".sh")
|
|
293
305
|
|
|
294
306
|
|
|
295
307
|
async def test_submit_email_suppression(self, lsf_config, work_dir):
|
|
@@ -350,6 +362,53 @@ class TestArrayScriptRewriting:
|
|
|
350
362
|
assert "stderr.%J.%I.log" in script
|
|
351
363
|
|
|
352
364
|
|
|
365
|
+
class TestCancel:
|
|
366
|
+
|
|
367
|
+
async def test_cancel_passes_d_flag_when_done(self, lsf_config):
|
|
368
|
+
executor = LSFExecutor(lsf_config)
|
|
369
|
+
with patch.object(
|
|
370
|
+
executor, "_call",
|
|
371
|
+
new_callable=AsyncMock,
|
|
372
|
+
return_value="Job <123> is being submitted",
|
|
373
|
+
):
|
|
374
|
+
job = await executor.submit(
|
|
375
|
+
command="echo hi", name="cancel-done",
|
|
376
|
+
resources=ResourceSpec(work_dir="/tmp"),
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
with patch.object(
|
|
380
|
+
executor, "_call",
|
|
381
|
+
new_callable=AsyncMock,
|
|
382
|
+
return_value="",
|
|
383
|
+
) as mock_call:
|
|
384
|
+
await executor.cancel(job.job_id, done=True)
|
|
385
|
+
args = mock_call.call_args[0][0]
|
|
386
|
+
assert args == ["bkill", "-d", job.job_id]
|
|
387
|
+
assert job.status == JobStatus.DONE
|
|
388
|
+
|
|
389
|
+
async def test_cancel_without_done_flag(self, lsf_config):
|
|
390
|
+
executor = LSFExecutor(lsf_config)
|
|
391
|
+
with patch.object(
|
|
392
|
+
executor, "_call",
|
|
393
|
+
new_callable=AsyncMock,
|
|
394
|
+
return_value="Job <456> is being submitted",
|
|
395
|
+
):
|
|
396
|
+
job = await executor.submit(
|
|
397
|
+
command="echo hi", name="cancel-kill",
|
|
398
|
+
resources=ResourceSpec(work_dir="/tmp"),
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
with patch.object(
|
|
402
|
+
executor, "_call",
|
|
403
|
+
new_callable=AsyncMock,
|
|
404
|
+
return_value="",
|
|
405
|
+
) as mock_call:
|
|
406
|
+
await executor.cancel(job.job_id)
|
|
407
|
+
args = mock_call.call_args[0][0]
|
|
408
|
+
assert args == ["bkill", job.job_id]
|
|
409
|
+
assert job.status == JobStatus.KILLED
|
|
410
|
+
|
|
411
|
+
|
|
353
412
|
class TestCancelByName:
|
|
354
413
|
|
|
355
414
|
async def test_cancel_by_name(self, lsf_config):
|
|
@@ -366,6 +425,16 @@ class TestCancelByName:
|
|
|
366
425
|
assert "-J" in args
|
|
367
426
|
assert "test-*" in args
|
|
368
427
|
|
|
428
|
+
async def test_cancel_by_name_no_match(self, lsf_config):
|
|
429
|
+
"""bkill -J returns non-zero when no jobs match; should not raise."""
|
|
430
|
+
executor = LSFExecutor(lsf_config)
|
|
431
|
+
with patch.object(
|
|
432
|
+
executor, "_call",
|
|
433
|
+
new_callable=AsyncMock,
|
|
434
|
+
side_effect=CommandFailedError("No matching job found"),
|
|
435
|
+
):
|
|
436
|
+
await executor.cancel_by_name("nonexistent-*")
|
|
437
|
+
|
|
369
438
|
|
|
370
439
|
class TestParseLsfTime:
|
|
371
440
|
def test_standard_format(self):
|
|
@@ -61,7 +61,8 @@ class TestReconnectByPrefix:
|
|
|
61
61
|
assert job.resources is None
|
|
62
62
|
assert job.exec_host == "node01"
|
|
63
63
|
|
|
64
|
-
async def
|
|
64
|
+
async def test_completed_job_skipped(self, lsf_config):
|
|
65
|
+
"""Terminal jobs from -a flag should not be reconnected."""
|
|
65
66
|
executor = LSFExecutor(lsf_config)
|
|
66
67
|
output = _make_bjobs_json([
|
|
67
68
|
_make_record(
|
|
@@ -73,11 +74,10 @@ class TestReconnectByPrefix:
|
|
|
73
74
|
with patch.object(executor, "_call", new_callable=AsyncMock, return_value=output):
|
|
74
75
|
jobs = await executor.reconnect()
|
|
75
76
|
|
|
76
|
-
assert len(jobs) ==
|
|
77
|
-
assert jobs[0].status == JobStatus.DONE
|
|
78
|
-
assert jobs[0].exit_code == 0
|
|
77
|
+
assert len(jobs) == 0
|
|
79
78
|
|
|
80
|
-
async def
|
|
79
|
+
async def test_multiple_jobs_filters_terminal(self, lsf_config):
|
|
80
|
+
"""Only non-terminal jobs should be reconnected; DONE/EXIT are skipped."""
|
|
81
81
|
executor = LSFExecutor(lsf_config)
|
|
82
82
|
output = _make_bjobs_json([
|
|
83
83
|
_make_record(job_id="100", job_name="test-a", stat="RUN"),
|
|
@@ -91,13 +91,12 @@ class TestReconnectByPrefix:
|
|
|
91
91
|
with patch.object(executor, "_call", new_callable=AsyncMock, return_value=output):
|
|
92
92
|
jobs = await executor.reconnect()
|
|
93
93
|
|
|
94
|
-
assert len(jobs) ==
|
|
94
|
+
assert len(jobs) == 2
|
|
95
95
|
ids = {j.job_id for j in jobs}
|
|
96
|
-
assert ids == {"100", "101"
|
|
96
|
+
assert ids == {"100", "101"}
|
|
97
97
|
by_id = {j.job_id: j for j in jobs}
|
|
98
98
|
assert by_id["100"].status == JobStatus.RUNNING
|
|
99
99
|
assert by_id["101"].status == JobStatus.PENDING
|
|
100
|
-
assert by_id["102"].status == JobStatus.DONE
|
|
101
100
|
|
|
102
101
|
async def test_skips_already_tracked(self, lsf_config, work_dir):
|
|
103
102
|
executor = LSFExecutor(lsf_config)
|
|
@@ -232,9 +231,9 @@ class TestReconnectArrayJobs:
|
|
|
232
231
|
|
|
233
232
|
assert jobs[0].metadata["array_range"] == (5, 10)
|
|
234
233
|
|
|
235
|
-
async def
|
|
234
|
+
async def test_all_terminal_array_skipped(self, lsf_config):
|
|
235
|
+
"""Array where all visible elements are terminal should not be reconnected."""
|
|
236
236
|
executor = LSFExecutor(lsf_config)
|
|
237
|
-
# All elements done → parent status should be DONE
|
|
238
237
|
output = _make_bjobs_json([
|
|
239
238
|
_make_record(job_id="600[1]", job_name="test-alldone", stat="DONE", exit_code="0"),
|
|
240
239
|
_make_record(job_id="600[2]", job_name="test-alldone", stat="DONE", exit_code="0"),
|
|
@@ -243,9 +242,10 @@ class TestReconnectArrayJobs:
|
|
|
243
242
|
with patch.object(executor, "_call", new_callable=AsyncMock, return_value=output):
|
|
244
243
|
jobs = await executor.reconnect()
|
|
245
244
|
|
|
246
|
-
assert jobs
|
|
245
|
+
assert len(jobs) == 0
|
|
247
246
|
|
|
248
|
-
async def
|
|
247
|
+
async def test_all_terminal_array_with_failure_skipped(self, lsf_config):
|
|
248
|
+
"""Array where all elements are terminal (even with failures) should not be reconnected."""
|
|
249
249
|
executor = LSFExecutor(lsf_config)
|
|
250
250
|
output = _make_bjobs_json([
|
|
251
251
|
_make_record(job_id="700[1]", job_name="test-mixed", stat="DONE", exit_code="0"),
|
|
@@ -255,8 +255,7 @@ class TestReconnectArrayJobs:
|
|
|
255
255
|
with patch.object(executor, "_call", new_callable=AsyncMock, return_value=output):
|
|
256
256
|
jobs = await executor.reconnect()
|
|
257
257
|
|
|
258
|
-
assert jobs
|
|
259
|
-
assert jobs[0].failed_element_indices == [2]
|
|
258
|
+
assert len(jobs) == 0
|
|
260
259
|
|
|
261
260
|
async def test_mixed_single_and_array(self, lsf_config):
|
|
262
261
|
executor = LSFExecutor(lsf_config)
|
|
@@ -280,8 +279,8 @@ class TestReconnectArrayJobs:
|
|
|
280
279
|
executor = LSFExecutor(lsf_config)
|
|
281
280
|
output = _make_bjobs_json([
|
|
282
281
|
_make_record(
|
|
283
|
-
job_id="1000[1]", job_name="test-meta", stat="
|
|
284
|
-
|
|
282
|
+
job_id="1000[1]", job_name="test-meta", stat="RUN",
|
|
283
|
+
exec_host="node01", max_mem="256 MB",
|
|
285
284
|
),
|
|
286
285
|
])
|
|
287
286
|
|
|
@@ -291,7 +290,6 @@ class TestReconnectArrayJobs:
|
|
|
291
290
|
elem = jobs[0].array_elements[1]
|
|
292
291
|
assert elem.exec_host == "node01"
|
|
293
292
|
assert elem.max_mem == "256 MB"
|
|
294
|
-
assert elem.exit_code == 0
|
|
295
293
|
|
|
296
294
|
|
|
297
295
|
class TestReconnectThenPoll:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|