lsst-ctrl-mpexec 29.2025.2400-py3-none-any.whl → 29.2025.3200-py3-none-any.whl
- lsst/ctrl/mpexec/__init__.py +1 -2
- lsst/ctrl/mpexec/cli/butler_factory.py +464 -0
- lsst/ctrl/mpexec/cli/cmd/commands.py +7 -1
- lsst/ctrl/mpexec/cli/opt/optionGroups.py +0 -13
- lsst/ctrl/mpexec/cli/opt/options.py +0 -46
- lsst/ctrl/mpexec/cli/script/build.py +49 -36
- lsst/ctrl/mpexec/cli/script/pre_exec_init_qbb.py +3 -1
- lsst/ctrl/mpexec/cli/script/qgraph.py +0 -25
- lsst/ctrl/mpexec/cli/script/run.py +2 -1
- lsst/ctrl/mpexec/cli/script/run_qbb.py +2 -1
- lsst/ctrl/mpexec/cmdLineFwk.py +30 -556
- lsst/ctrl/mpexec/execFixupDataId.py +9 -101
- lsst/ctrl/mpexec/executionGraphFixup.py +12 -37
- lsst/ctrl/mpexec/log_capture.py +9 -195
- lsst/ctrl/mpexec/mpGraphExecutor.py +60 -696
- lsst/ctrl/mpexec/quantumGraphExecutor.py +20 -90
- lsst/ctrl/mpexec/reports.py +30 -206
- lsst/ctrl/mpexec/separablePipelineExecutor.py +12 -263
- lsst/ctrl/mpexec/showInfo.py +2 -2
- lsst/ctrl/mpexec/simple_pipeline_executor.py +11 -590
- lsst/ctrl/mpexec/singleQuantumExecutor.py +75 -532
- lsst/ctrl/mpexec/taskFactory.py +12 -38
- lsst/ctrl/mpexec/version.py +1 -1
- {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/METADATA +1 -1
- lsst_ctrl_mpexec-29.2025.3200.dist-info/RECORD +51 -0
- lsst/ctrl/mpexec/dotTools.py +0 -100
- lsst_ctrl_mpexec-29.2025.2400.dist-info/RECORD +0 -51
- {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/WHEEL +0 -0
- {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/entry_points.txt +0 -0
- {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/licenses/LICENSE +0 -0
- {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/top_level.txt +0 -0
- {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/zip-safe +0 -0
lsst/ctrl/mpexec/mpGraphExecutor.py
@@ -25,401 +25,43 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.

-from __future__ import annotations
+__all__ = ("MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError")

-__all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]
-
-import importlib
-import logging
-import multiprocessing
-import pickle
-import signal
-import sys
-import threading
-import time
-import uuid
-from collections.abc import Iterable
-from enum import Enum
 from typing import Literal

-from lsst.daf.butler.cli.cliLog import CliLog
-from lsst.pipe.base import InvalidQuantumError, RepeatableQuantumError
-from lsst.pipe.base.graph.graph import QuantumGraph, QuantumNode
-from lsst.pipe.base.pipeline_graph import TaskNode
-from lsst.utils.threads import disable_implicit_threading
-
-from .executionGraphFixup import ExecutionGraphFixup
-from .quantumGraphExecutor import QuantumExecutor, QuantumGraphExecutor
-from .reports import ExecutionStatus, QuantumReport, Report
-
-_LOG = logging.getLogger(__name__)
-
-
-# Possible states for the executing task:
-# - PENDING: job has not started yet
-# - RUNNING: job is currently executing
-# - FINISHED: job finished successfully
-# - FAILED: job execution failed (process returned non-zero status)
-# - TIMED_OUT: job is killed due to too long execution time
-# - FAILED_DEP: one of the dependencies of this job has failed/timed out
-JobState = Enum("JobState", "PENDING RUNNING FINISHED FAILED TIMED_OUT FAILED_DEP")
-
-
-class _Job:
-    """Class representing a job running single task.
-
-    Parameters
-    ----------
-    qnode: `~lsst.pipe.base.QuantumNode`
-        Quantum and some associated information.
-    """
-
-    def __init__(self, qnode: QuantumNode, fail_fast: bool = False):
-        self.qnode = qnode
-        self._fail_fast = fail_fast
-        self.process: multiprocessing.process.BaseProcess | None = None
-        self._state = JobState.PENDING
-        self.started: float = 0.0
-        self._rcv_conn: multiprocessing.connection.Connection | None = None
-        self._terminated = False
-
-    @property
-    def state(self) -> JobState:
-        """Job processing state (JobState)."""
-        return self._state
-
-    @property
-    def terminated(self) -> bool:
-        """Return `True` if job was killed by stop() method and negative exit
-        code is returned from child process (`bool`).
-        """
-        if self._terminated:
-            assert self.process is not None, "Process must be started"
-            if self.process.exitcode is not None:
-                return self.process.exitcode < 0
-        return False
-
-    def start(
-        self,
-        quantumExecutor: QuantumExecutor,
-        startMethod: Literal["spawn"] | Literal["forkserver"],
-    ) -> None:
-        """Start process which runs the task.
-
-        Parameters
-        ----------
-        quantumExecutor : `QuantumExecutor`
-            Executor for single quantum.
-        startMethod : `str`, optional
-            Start method from `multiprocessing` module.
-        """
-        # Unpickling of quantum has to happen after butler/executor, also we
-        # want to setup logging before unpickling anything that can generate
-        # messages, this is why things are pickled manually here.
-        qe_pickle = pickle.dumps(quantumExecutor)
-        task_node_pickle = pickle.dumps(self.qnode.task_node)
-        quantum_pickle = pickle.dumps(self.qnode.quantum)
-        self._rcv_conn, snd_conn = multiprocessing.Pipe(False)
-        logConfigState = CliLog.configState
-
-        mp_ctx = multiprocessing.get_context(startMethod)
-        self.process = mp_ctx.Process(  # type: ignore[attr-defined]
-            target=_Job._executeJob,
-            args=(
-                qe_pickle,
-                task_node_pickle,
-                quantum_pickle,
-                self.qnode.nodeId,
-                logConfigState,
-                snd_conn,
-                self._fail_fast,
-            ),
-            name=f"task-{self.qnode.quantum.dataId}",
-        )
-        # mypy is getting confused by multiprocessing.
-        assert self.process is not None
-        self.process.start()
-        self.started = time.time()
-        self._state = JobState.RUNNING
-
-    @staticmethod
-    def _executeJob(
-        quantumExecutor_pickle: bytes,
-        task_node_pickle: bytes,
-        quantum_pickle: bytes,
-        quantum_id: uuid.UUID | None,
-        logConfigState: list,
-        snd_conn: multiprocessing.connection.Connection,
-        fail_fast: bool,
-    ) -> None:
-        """Execute a job with arguments.
-
-        Parameters
-        ----------
-        quantumExecutor_pickle : `bytes`
-            Executor for single quantum, pickled.
-        task_node_pickle : `bytes`
-            Task definition structure, pickled.
-        quantum_pickle : `bytes`
-            Quantum for this task execution in pickled form.
-        logConfigState : `list`
-            Logging state from parent process.
-        snd_conn : `multiprocessing.Connection`
-            Connection to send job report to parent process.
-        fail_fast : `bool`
-            If `True` then kill subprocess on RepeatableQuantumError.
-        """
-        # This terrible hack is a workaround for Python threading bug:
-        # https://github.com/python/cpython/issues/102512. Should be removed
-        # when fix for that bug is deployed. Inspired by
-        # https://github.com/QubesOS/qubes-core-admin-client/pull/236/files.
-        thread = threading.current_thread()
-        if isinstance(thread, threading._DummyThread):
-            if getattr(thread, "_tstate_lock", "") is None:
-                thread._set_tstate_lock()  # type: ignore[attr-defined]
-
-        if logConfigState and not CliLog.configState:
-            # means that we are in a new spawned Python process and we have to
-            # re-initialize logging
-            CliLog.replayConfigState(logConfigState)
-
-        quantumExecutor: QuantumExecutor = pickle.loads(quantumExecutor_pickle)
-        task_node: TaskNode = pickle.loads(task_node_pickle)
-        quantum = pickle.loads(quantum_pickle)
-        report: QuantumReport | None = None
-        # Catch a few known failure modes and stop the process immediately,
-        # with exception-specific exit code.
-        try:
-            _, report = quantumExecutor.execute(task_node, quantum, quantum_id=quantum_id)
-        except RepeatableQuantumError as exc:
-            report = QuantumReport.from_exception(
-                exception=exc,
-                dataId=quantum.dataId,
-                taskLabel=task_node.label,
-                exitCode=exc.EXIT_CODE if fail_fast else None,
-            )
-            if fail_fast:
-                _LOG.warning("Caught repeatable quantum error for %s (%s):", task_node.label, quantum.dataId)
-                _LOG.warning(exc, exc_info=True)
-                sys.exit(exc.EXIT_CODE)
-            else:
-                raise
-        except InvalidQuantumError as exc:
-            _LOG.fatal("Invalid quantum error for %s (%s): %s", task_node.label, quantum.dataId)
-            _LOG.fatal(exc, exc_info=True)
-            report = QuantumReport.from_exception(
-                exception=exc,
-                dataId=quantum.dataId,
-                taskLabel=task_node.label,
-                exitCode=exc.EXIT_CODE,
-            )
-            sys.exit(exc.EXIT_CODE)
-        except Exception as exc:
-            _LOG.debug("exception from task %s dataId %s: %s", task_node.label, quantum.dataId, exc)
-            report = QuantumReport.from_exception(
-                exception=exc,
-                dataId=quantum.dataId,
-                taskLabel=task_node.label,
-            )
-            raise
-        finally:
-            if report is not None:
-                # If sending fails we do not want this new exception to be
-                # exposed.
-                try:
-                    _LOG.debug("sending report for task %s dataId %s", task_node.label, quantum.dataId)
-                    snd_conn.send(report)
-                except Exception:
-                    pass
-
-    def stop(self) -> None:
-        """Stop the process."""
-        assert self.process is not None, "Process must be started"
-        self.process.terminate()
-        # give it 1 second to finish or KILL
-        for _ in range(10):
-            time.sleep(0.1)
-            if not self.process.is_alive():
-                break
-        else:
-            _LOG.debug("Killing process %s", self.process.name)
-            self.process.kill()
-        self._terminated = True
-
-    def cleanup(self) -> None:
-        """Release processes resources, has to be called for each finished
-        process.
-        """
-        if self.process and not self.process.is_alive():
-            self.process.close()
-            self.process = None
-            self._rcv_conn = None
-
-    def report(self) -> QuantumReport:
-        """Return task report, should be called after process finishes and
-        before cleanup().
-        """
-        assert self.process is not None, "Process must be started"
-        assert self._rcv_conn is not None, "Process must be started"
-        try:
-            report = self._rcv_conn.recv()
-            report.exitCode = self.process.exitcode
-        except Exception:
-            # Likely due to the process killed, but there may be other reasons.
-            # Exit code should not be None, this is to keep mypy happy.
-            exitcode = self.process.exitcode if self.process.exitcode is not None else -1
-            assert self.qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
-            report = QuantumReport.from_exit_code(
-                exitCode=exitcode,
-                dataId=self.qnode.quantum.dataId,
-                taskLabel=self.qnode.task_node.label,
-            )
-        if self.terminated:
-            # Means it was killed, assume it's due to timeout
-            report.status = ExecutionStatus.TIMEOUT
-        return report
-
-    def failMessage(self) -> str:
-        """Return a message describing task failure."""
-        assert self.process is not None, "Process must be started"
-        assert self.process.exitcode is not None, "Process has to finish"
-        exitcode = self.process.exitcode
-        if exitcode < 0:
-            # Negative exit code means it is killed by signal
-            signum = -exitcode
-            msg = f"Task {self} failed, killed by signal {signum}"
-            # Just in case this is some very odd signal, expect ValueError
-            try:
-                strsignal = signal.strsignal(signum)
-                msg = f"{msg} ({strsignal})"
-            except ValueError:
-                pass
-        elif exitcode > 0:
-            msg = f"Task {self} failed, exit code={exitcode}"
-        else:
-            msg = ""
-        return msg
-
-    def __str__(self) -> str:
-        return f"<{self.qnode.task_node.label} dataId={self.qnode.quantum.dataId}>"
-
-
-class _JobList:
-    """Simple list of _Job instances with few convenience methods.
-
-    Parameters
-    ----------
-    iterable : iterable of `~lsst.pipe.base.QuantumNode`
-        Sequence of Quanta to execute. This has to be ordered according to
-        task dependencies.
-    """
-
-    def __init__(self, iterable: Iterable[QuantumNode]):
-        self.jobs = [_Job(qnode) for qnode in iterable]
-        self.pending = self.jobs[:]
-        self.running: list[_Job] = []
-        self.finishedNodes: set[QuantumNode] = set()
-        self.failedNodes: set[QuantumNode] = set()
-        self.timedOutNodes: set[QuantumNode] = set()
-
-    def submit(
-        self,
-        job: _Job,
-        quantumExecutor: QuantumExecutor,
-        startMethod: Literal["spawn"] | Literal["forkserver"],
-    ) -> None:
-        """Submit one more job for execution.
-
-        Parameters
-        ----------
-        job : `_Job`
-            Job to submit.
-        quantumExecutor : `QuantumExecutor`
-            Executor for single quantum.
-        startMethod : `str`, optional
-            Start method from `multiprocessing` module.
-        """
-        # this will raise if job is not in pending list
-        self.pending.remove(job)
-        job.start(quantumExecutor, startMethod)
-        self.running.append(job)
-
-    def setJobState(self, job: _Job, state: JobState) -> None:
-        """Update job state.
+from deprecated.sphinx import deprecated

-        Parameters
-        ----------
-        job : `_Job`
-            Job to update.
-        state : `JobState`
-            New job state, note that only FINISHED, FAILED, TIMED_OUT, or
-            FAILED_DEP state is acceptable.
-        """
-        allowedStates = (JobState.FINISHED, JobState.FAILED, JobState.TIMED_OUT, JobState.FAILED_DEP)
-        assert state in allowedStates, f"State {state} not allowed here"
+import lsst.pipe.base.mp_graph_executor
+from lsst.pipe.base.execution_graph_fixup import ExecutionGraphFixup
+from lsst.pipe.base.quantum_graph_executor import QuantumExecutor
+from lsst.pipe.base.quantum_reports import Report

-
-        if job.state == JobState.PENDING:
-            self.pending.remove(job)
-        elif job.state == JobState.RUNNING:
-            self.running.remove(job)
+# TODO[DM-51962]: Remove this module.

-        qnode = job.qnode
-        # it should not be in any of these, but just in case
-        self.finishedNodes.discard(qnode)
-        self.failedNodes.discard(qnode)
-        self.timedOutNodes.discard(qnode)

-        job._state = state
-        if state == JobState.FINISHED:
-            self.finishedNodes.add(qnode)
-        elif state == JobState.FAILED:
-            self.failedNodes.add(qnode)
-        elif state == JobState.FAILED_DEP:
-            self.failedNodes.add(qnode)
-        elif state == JobState.TIMED_OUT:
-            self.failedNodes.add(qnode)
-            self.timedOutNodes.add(qnode)
-        else:
-            raise ValueError(f"Unexpected state value: {state}")
-
-    def cleanup(self) -> None:
-        """Do periodic cleanup for jobs that did not finish correctly.
-
-        If timed out jobs are killed but take too long to stop then regular
-        cleanup will not work for them. Here we check all timed out jobs
-        periodically and do cleanup if they managed to die by this time.
-        """
-        for job in self.jobs:
-            if job.state == JobState.TIMED_OUT and job.process is not None:
-                job.cleanup()
-
-
-class MPGraphExecutorError(Exception):
-    """Exception class for errors raised by MPGraphExecutor."""
-
-    pass
-
-
-class MPTimeoutError(MPGraphExecutorError):
-    """Exception raised when task execution times out."""
-
-    pass
-
-
-class MPGraphExecutor(QuantumGraphExecutor):
+@deprecated(
+    "The MPGraphExecutor class has moved to lsst.pipe.base.mp_graph_executor. "
+    "This forwarding shim will be removed after v30.",
+    version="v30",
+    category=FutureWarning,
+)
+class MPGraphExecutor(lsst.pipe.base.mp_graph_executor.MPGraphExecutor):
     """Implementation of QuantumGraphExecutor using same-host multiprocess
     execution of Quanta.

+    This is a deprecated backwards-compatibility shim for
+    `lsst.pipe.base.mp_graph_executor.MPGraphExecutor`, which has
+    the same functionality with very minor interface changes.
+
     Parameters
     ----------
     numProc : `int`
         Number of processes to use for executing tasks.
     timeout : `float`
         Time in seconds to wait for tasks to finish.
-    quantumExecutor : `QuantumExecutor`
+    quantumExecutor : `lsst.pipe.base.quantum_graph_executor.QuantumExecutor`
         Executor for single quantum. For multiprocess-style execution when
-        ``numProc`` is greater than one this instance must support pickle.
+        ``num_proc`` is greater than one this instance must support pickle.
     startMethod : `str`, optional
         Start method from `multiprocessing` module, `None` selects the best
         one for current platform.
@@ -428,7 +70,9 @@ class MPGraphExecutor(QuantumGraphExecutor):
     pdb : `str`, optional
         Debugger to import and use (via the ``post_mortem`` function) in the
         event of an exception.
-    executionGraphFixup : `ExecutionGraphFixup`, optional
+    executionGraphFixup : \
+        `lsst.pipe.base.execution_graph_fixup.ExecutionGraphFixup`, \
+        optional
         Instance used for modification of execution graph.
     """

@@ -443,331 +87,51 @@ class MPGraphExecutor(QuantumGraphExecutor):
         pdb: str | None = None,
         executionGraphFixup: ExecutionGraphFixup | None = None,
     ):
-        self.numProc = numProc
-        self.timeout = timeout
-        self.quantumExecutor = quantumExecutor
-        self.failFast = failFast
-        self.pdb = pdb
-        self.executionGraphFixup = executionGraphFixup
-        self.report: Report | None = None
-
-
-        if startMethod is None:
-            startMethod = "spawn"
-        self.startMethod = startMethod
-
-    def execute(self, graph: QuantumGraph) -> None:
-        # Docstring inherited from QuantumGraphExecutor.execute
-        graph = self._fixupQuanta(graph)
-        self.report = Report(qgraphSummary=graph.getSummary())
-        try:
-            if self.numProc > 1:
-                self._executeQuantaMP(graph, self.report)
-            else:
-                self._executeQuantaInProcess(graph, self.report)
-        except Exception as exc:
-            self.report.set_exception(exc)
-            raise
-
-    def _fixupQuanta(self, graph: QuantumGraph) -> QuantumGraph:
-        """Call fixup code to modify execution graph.
-
-        Parameters
-        ----------
-        graph : `~lsst.pipe.base.QuantumGraph`
-            `~lsst.pipe.base.QuantumGraph` to modify.
-
-        Returns
-        -------
-        graph : `~lsst.pipe.base.QuantumGraph`
-            Modified `~lsst.pipe.base.QuantumGraph`.
-
-        Raises
-        ------
-        MPGraphExecutorError
-            Raised if execution graph cannot be ordered after modification,
-            i.e. it has dependency cycles.
-        """
-        if not self.executionGraphFixup:
-            return graph
-
-        _LOG.debug("Call execution graph fixup method")
-        graph = self.executionGraphFixup.fixupQuanta(graph)
-
-        # Detect if there is now a cycle created within the graph
-        if graph.findCycle():
-            raise MPGraphExecutorError("Updated execution graph has dependency cycle.")
-
-        return graph
-
-    def _executeQuantaInProcess(self, graph: QuantumGraph, report: Report) -> None:
-        """Execute all Quanta in current process.
-
-        Parameters
-        ----------
-        graph : `~lsst.pipe.base.QuantumGraph`
-            `~lsst.pipe.base.QuantumGraph` that is to be executed.
-        report : `Report`
-            Object for reporting execution status.
-        """
-        successCount, totalCount = 0, len(graph)
-        failedNodes: set[QuantumNode] = set()
-        for qnode in graph:
-            assert qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
-            task_node = qnode.task_node
-
-            # Any failed inputs mean that the quantum has to be skipped.
-            inputNodes = graph.determineInputsToQuantumNode(qnode)
-            if inputNodes & failedNodes:
-                _LOG.error(
-                    "Upstream job failed for task <%s dataId=%s>, skipping this task.",
-                    task_node.label,
-                    qnode.quantum.dataId,
-                )
-                failedNodes.add(qnode)
-                failed_quantum_report = QuantumReport(
-                    status=ExecutionStatus.SKIPPED,
-                    dataId=qnode.quantum.dataId,
-                    taskLabel=task_node.label,
-                )
-                report.quantaReports.append(failed_quantum_report)
-                continue
-
-            _LOG.debug("Executing %s", qnode)
-            fail_exit_code: int | None = None
-            try:
-                # For some exception types we want to exit immediately with
-                # exception-specific exit code, but we still want to start
-                # debugger before exiting if debugging is enabled.
-                try:
-                    _, quantum_report = self.quantumExecutor.execute(
-                        task_node, qnode.quantum, quantum_id=qnode.nodeId
-                    )
-                    if quantum_report:
-                        report.quantaReports.append(quantum_report)
-                    successCount += 1
-                except RepeatableQuantumError as exc:
-                    if self.failFast:
-                        _LOG.warning(
-                            "Caught repeatable quantum error for %s (%s):",
-                            task_node.label,
-                            qnode.quantum.dataId,
-                        )
-                        _LOG.warning(exc, exc_info=True)
-                        fail_exit_code = exc.EXIT_CODE
-                    raise
-                except InvalidQuantumError as exc:
-                    _LOG.fatal("Invalid quantum error for %s (%s):", task_node.label, qnode.quantum.dataId)
-                    _LOG.fatal(exc, exc_info=True)
-                    fail_exit_code = exc.EXIT_CODE
-                    raise
-            except Exception as exc:
-                quantum_report = QuantumReport.from_exception(
-                    exception=exc,
-                    dataId=qnode.quantum.dataId,
-                    taskLabel=task_node.label,
-                )
-                report.quantaReports.append(quantum_report)
-
-                if self.pdb and sys.stdin.isatty() and sys.stdout.isatty():
-                    _LOG.error(
-                        "Task <%s dataId=%s> failed; dropping into pdb.",
-                        task_node.label,
-                        qnode.quantum.dataId,
-                        exc_info=exc,
-                    )
-                    try:
-                        pdb = importlib.import_module(self.pdb)
-                    except ImportError as imp_exc:
-                        raise MPGraphExecutorError(
-                            f"Unable to import specified debugger module ({self.pdb}): {imp_exc}"
-                        ) from exc
-                    if not hasattr(pdb, "post_mortem"):
-                        raise MPGraphExecutorError(
-                            f"Specified debugger module ({self.pdb}) can't debug with post_mortem",
-                        ) from exc
-                    pdb.post_mortem(exc.__traceback__)
-                failedNodes.add(qnode)
-                report.status = ExecutionStatus.FAILURE
-
-                # If exception specified an exit code then just exit with that
-                # code, otherwise crash if fail-fast option is enabled.
-                if fail_exit_code is not None:
-                    sys.exit(fail_exit_code)
-                if self.failFast:
-                    raise MPGraphExecutorError(
-                        f"Task <{task_node.label} dataId={qnode.quantum.dataId}> failed."
-                    ) from exc
-                else:
-                    # Note that there could be exception safety issues, which
-                    # we presently ignore.
-                    _LOG.error(
-                        "Task <%s dataId=%s> failed; processing will continue for remaining tasks.",
-                        task_node.label,
-                        qnode.quantum.dataId,
-                        exc_info=exc,
-                    )
-
-        _LOG.info(
-            "Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
-            successCount,
-            len(failedNodes),
-            totalCount - successCount - len(failedNodes),
-            totalCount,
-        )
-
-        # Raise an exception if there were any failures.
-        if failedNodes:
-            raise MPGraphExecutorError("One or more tasks failed during execution.")
-
-    def _executeQuantaMP(self, graph: QuantumGraph, report: Report) -> None:
-        """Execute all Quanta in separate processes.
-
-        Parameters
-        ----------
-        graph : `~lsst.pipe.base.QuantumGraph`
-            `~lsst.pipe.base.QuantumGraph` that is to be executed.
-        report : `Report`
-            Object for reporting execution status.
-        """
-        disable_implicit_threading()  # To prevent thread contention
-
-        _LOG.debug("Using %r for multiprocessing start method", self.startMethod)
-
-        # re-pack input quantum data into jobs list
-        jobs = _JobList(graph)
-
-        # check that all tasks can run in sub-process
-        for job in jobs.jobs:
-            task_node = job.qnode.task_node
-            if not task_node.task_class.canMultiprocess:
-                raise MPGraphExecutorError(
-                    f"Task {task_node.label!r} does not support multiprocessing; use single process"
-                )
+        super().__init__(
+            num_proc=numProc,
+            timeout=timeout,
+            quantum_executor=quantumExecutor,
+            start_method=startMethod,
+            fail_fast=failFast,
+            pdb=pdb,
+            execution_graph_fixup=executionGraphFixup,
+        )

-
-
-
-            _LOG.debug("#runningJobs: %s", len(jobs.running))
+    @property
+    def numProc(self) -> int:
+        return self._num_proc

-
-
-
-                if not job.process.is_alive():
-                    _LOG.debug("finished: %s", job)
-                    # finished
-                    exitcode = job.process.exitcode
-                    quantum_report = job.report()
-                    report.quantaReports.append(quantum_report)
-                    if exitcode == 0:
-                        jobs.setJobState(job, JobState.FINISHED)
-                        job.cleanup()
-                        _LOG.debug("success: %s took %.3f seconds", job, time.time() - job.started)
-                    else:
-                        if job.terminated:
-                            # Was killed due to timeout.
-                            if report.status == ExecutionStatus.SUCCESS:
-                                # Do not override global FAILURE status
-                                report.status = ExecutionStatus.TIMEOUT
-                            message = f"Timeout ({self.timeout} sec) for task {job}, task is killed"
-                            jobs.setJobState(job, JobState.TIMED_OUT)
-                        else:
-                            report.status = ExecutionStatus.FAILURE
-                            # failMessage() has to be called before cleanup()
-                            message = job.failMessage()
-                            jobs.setJobState(job, JobState.FAILED)
+    @property
+    def timeout(self) -> float:
+        return self._timeout

-
-
-
-                        # stop all running jobs
-                        for stopJob in jobs.running:
-                            if stopJob is not job:
-                                stopJob.stop()
-                        if job.state is JobState.TIMED_OUT:
-                            raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
-                        else:
-                            raise MPGraphExecutorError(message)
-                    else:
-                        _LOG.error("%s; processing will continue for remaining tasks.", message)
-            else:
-                # check for timeout
-                now = time.time()
-                if now - job.started > self.timeout:
-                    # Try to kill it, and there is a chance that it
-                    # finishes successfully before it gets killed. Exit
-                    # status is handled by the code above on next
-                    # iteration.
-                    _LOG.debug("Terminating job %s due to timeout", job)
-                    job.stop()
+    @property
+    def quantumExecutor(self) -> QuantumExecutor:
+        return self._quantum_executor

-
-
-
-            for job in jobs.pending:
-                jobInputNodes = graph.determineInputsToQuantumNode(job.qnode)
-                assert job.qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
-                if jobInputNodes & jobs.failedNodes:
-                    quantum_report = QuantumReport(
-                        status=ExecutionStatus.SKIPPED,
-                        dataId=job.qnode.quantum.dataId,
-                        taskLabel=job.qnode.task_node.label,
-                    )
-                    report.quantaReports.append(quantum_report)
-                    jobs.setJobState(job, JobState.FAILED_DEP)
-                    _LOG.error("Upstream job failed for task %s, skipping this task.", job)
+    @property
+    def failFast(self) -> bool:
+        return self._fail_fast

-
-
-
-                jobInputNodes = graph.determineInputsToQuantumNode(job.qnode)
-                if jobInputNodes <= jobs.finishedNodes:
-                    # all dependencies have completed, can start new job
-                    if len(jobs.running) < self.numProc:
-                        _LOG.debug("Submitting %s", job)
-                        jobs.submit(job, self.quantumExecutor, self.startMethod)
-                    if len(jobs.running) >= self.numProc:
-                        # Cannot start any more jobs, wait until something
-                        # finishes.
-                        break
+    @property
+    def pdb(self) -> str | None:
+        return self._pdb

-
-
+    @property
+    def executionGraphFixup(self) -> ExecutionGraphFixup | None:
+        return self._execution_graph_fixup

-
-
-
-                finishedCount, failedCount = newFinished, newFailed
-                totalCount = len(jobs.jobs)
-                _LOG.info(
-                    "Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
-                    finishedCount,
-                    failedCount,
-                    totalCount - finishedCount - failedCount,
-                    totalCount,
-                )
+    @property
+    def report(self) -> Report | None:
+        return self._report

-
-
-
-            if jobs.running:
-                time.sleep(0.1)
+    @property
+    def startMethod(self) -> str:
+        return self._start_method

-        if jobs.failedNodes:
-            # print list of failed jobs
-            _LOG.error("Failed jobs:")
-            for job in jobs.jobs:
-                if job.state != JobState.FINISHED:
-                    _LOG.error(" - %s: %s", job.state.name, job)

-
-            if jobs.timedOutNodes:
-                raise MPTimeoutError("One or more tasks timed out during execution.")
-            else:
-                raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")
+# We can't make these forwarders warn by subclassing, because an 'except'
+# statement on a derived class won't catch a base class instance.

-    def getReport(self) -> Report:
-
-        if self.report is None:
-            raise RuntimeError("getReport() called before execute()")
-        return self.report
+MPGraphExecutorError = lsst.pipe.base.mp_graph_executor.MPGraphExecutorError
+MPTimeoutError = lsst.pipe.base.mp_graph_executor.MPTimeoutError