lsst-ctrl-mpexec 29.2025.2400__py3-none-any.whl → 29.2025.3200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. lsst/ctrl/mpexec/__init__.py +1 -2
  2. lsst/ctrl/mpexec/cli/butler_factory.py +464 -0
  3. lsst/ctrl/mpexec/cli/cmd/commands.py +7 -1
  4. lsst/ctrl/mpexec/cli/opt/optionGroups.py +0 -13
  5. lsst/ctrl/mpexec/cli/opt/options.py +0 -46
  6. lsst/ctrl/mpexec/cli/script/build.py +49 -36
  7. lsst/ctrl/mpexec/cli/script/pre_exec_init_qbb.py +3 -1
  8. lsst/ctrl/mpexec/cli/script/qgraph.py +0 -25
  9. lsst/ctrl/mpexec/cli/script/run.py +2 -1
  10. lsst/ctrl/mpexec/cli/script/run_qbb.py +2 -1
  11. lsst/ctrl/mpexec/cmdLineFwk.py +30 -556
  12. lsst/ctrl/mpexec/execFixupDataId.py +9 -101
  13. lsst/ctrl/mpexec/executionGraphFixup.py +12 -37
  14. lsst/ctrl/mpexec/log_capture.py +9 -195
  15. lsst/ctrl/mpexec/mpGraphExecutor.py +60 -696
  16. lsst/ctrl/mpexec/quantumGraphExecutor.py +20 -90
  17. lsst/ctrl/mpexec/reports.py +30 -206
  18. lsst/ctrl/mpexec/separablePipelineExecutor.py +12 -263
  19. lsst/ctrl/mpexec/showInfo.py +2 -2
  20. lsst/ctrl/mpexec/simple_pipeline_executor.py +11 -590
  21. lsst/ctrl/mpexec/singleQuantumExecutor.py +75 -532
  22. lsst/ctrl/mpexec/taskFactory.py +12 -38
  23. lsst/ctrl/mpexec/version.py +1 -1
  24. {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/METADATA +1 -1
  25. lsst_ctrl_mpexec-29.2025.3200.dist-info/RECORD +51 -0
  26. lsst/ctrl/mpexec/dotTools.py +0 -100
  27. lsst_ctrl_mpexec-29.2025.2400.dist-info/RECORD +0 -51
  28. {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/WHEEL +0 -0
  29. {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/entry_points.txt +0 -0
  30. {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/licenses/COPYRIGHT +0 -0
  31. {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/licenses/LICENSE +0 -0
  32. {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/licenses/bsd_license.txt +0 -0
  33. {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/licenses/gpl-v3.0.txt +0 -0
  34. {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/top_level.txt +0 -0
  35. {lsst_ctrl_mpexec-29.2025.2400.dist-info → lsst_ctrl_mpexec-29.2025.3200.dist-info}/zip-safe +0 -0
@@ -25,401 +25,43 @@
25
25
  # You should have received a copy of the GNU General Public License
26
26
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
27
27
 
28
- from __future__ import annotations
28
+ __all__ = ("MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError")
29
29
 
30
- __all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]
31
-
32
- import importlib
33
- import logging
34
- import multiprocessing
35
- import pickle
36
- import signal
37
- import sys
38
- import threading
39
- import time
40
- import uuid
41
- from collections.abc import Iterable
42
- from enum import Enum
43
30
  from typing import Literal
44
31
 
45
- from lsst.daf.butler.cli.cliLog import CliLog
46
- from lsst.pipe.base import InvalidQuantumError, RepeatableQuantumError
47
- from lsst.pipe.base.graph.graph import QuantumGraph, QuantumNode
48
- from lsst.pipe.base.pipeline_graph import TaskNode
49
- from lsst.utils.threads import disable_implicit_threading
50
-
51
- from .executionGraphFixup import ExecutionGraphFixup
52
- from .quantumGraphExecutor import QuantumExecutor, QuantumGraphExecutor
53
- from .reports import ExecutionStatus, QuantumReport, Report
54
-
55
- _LOG = logging.getLogger(__name__)
56
-
57
-
58
- # Possible states for the executing task:
59
- # - PENDING: job has not started yet
60
- # - RUNNING: job is currently executing
61
- # - FINISHED: job finished successfully
62
- # - FAILED: job execution failed (process returned non-zero status)
63
- # - TIMED_OUT: job is killed due to too long execution time
64
- # - FAILED_DEP: one of the dependencies of this job has failed/timed out
65
- JobState = Enum("JobState", "PENDING RUNNING FINISHED FAILED TIMED_OUT FAILED_DEP")
66
-
67
-
68
- class _Job:
69
- """Class representing a job running single task.
70
-
71
- Parameters
72
- ----------
73
- qnode: `~lsst.pipe.base.QuantumNode`
74
- Quantum and some associated information.
75
- """
76
-
77
- def __init__(self, qnode: QuantumNode, fail_fast: bool = False):
78
- self.qnode = qnode
79
- self._fail_fast = fail_fast
80
- self.process: multiprocessing.process.BaseProcess | None = None
81
- self._state = JobState.PENDING
82
- self.started: float = 0.0
83
- self._rcv_conn: multiprocessing.connection.Connection | None = None
84
- self._terminated = False
85
-
86
- @property
87
- def state(self) -> JobState:
88
- """Job processing state (JobState)."""
89
- return self._state
90
-
91
- @property
92
- def terminated(self) -> bool:
93
- """Return `True` if job was killed by stop() method and negative exit
94
- code is returned from child process (`bool`).
95
- """
96
- if self._terminated:
97
- assert self.process is not None, "Process must be started"
98
- if self.process.exitcode is not None:
99
- return self.process.exitcode < 0
100
- return False
101
-
102
- def start(
103
- self,
104
- quantumExecutor: QuantumExecutor,
105
- startMethod: Literal["spawn"] | Literal["forkserver"],
106
- ) -> None:
107
- """Start process which runs the task.
108
-
109
- Parameters
110
- ----------
111
- quantumExecutor : `QuantumExecutor`
112
- Executor for single quantum.
113
- startMethod : `str`, optional
114
- Start method from `multiprocessing` module.
115
- """
116
- # Unpickling of quantum has to happen after butler/executor, also we
117
- # want to setup logging before unpickling anything that can generate
118
- # messages, this is why things are pickled manually here.
119
- qe_pickle = pickle.dumps(quantumExecutor)
120
- task_node_pickle = pickle.dumps(self.qnode.task_node)
121
- quantum_pickle = pickle.dumps(self.qnode.quantum)
122
- self._rcv_conn, snd_conn = multiprocessing.Pipe(False)
123
- logConfigState = CliLog.configState
124
-
125
- mp_ctx = multiprocessing.get_context(startMethod)
126
- self.process = mp_ctx.Process( # type: ignore[attr-defined]
127
- target=_Job._executeJob,
128
- args=(
129
- qe_pickle,
130
- task_node_pickle,
131
- quantum_pickle,
132
- self.qnode.nodeId,
133
- logConfigState,
134
- snd_conn,
135
- self._fail_fast,
136
- ),
137
- name=f"task-{self.qnode.quantum.dataId}",
138
- )
139
- # mypy is getting confused by multiprocessing.
140
- assert self.process is not None
141
- self.process.start()
142
- self.started = time.time()
143
- self._state = JobState.RUNNING
144
-
145
- @staticmethod
146
- def _executeJob(
147
- quantumExecutor_pickle: bytes,
148
- task_node_pickle: bytes,
149
- quantum_pickle: bytes,
150
- quantum_id: uuid.UUID | None,
151
- logConfigState: list,
152
- snd_conn: multiprocessing.connection.Connection,
153
- fail_fast: bool,
154
- ) -> None:
155
- """Execute a job with arguments.
156
-
157
- Parameters
158
- ----------
159
- quantumExecutor_pickle : `bytes`
160
- Executor for single quantum, pickled.
161
- task_node_pickle : `bytes`
162
- Task definition structure, pickled.
163
- quantum_pickle : `bytes`
164
- Quantum for this task execution in pickled form.
165
- logConfigState : `list`
166
- Logging state from parent process.
167
- snd_conn : `multiprocessing.Connection`
168
- Connection to send job report to parent process.
169
- fail_fast : `bool`
170
- If `True` then kill subprocess on RepeatableQuantumError.
171
- """
172
- # This terrible hack is a workaround for Python threading bug:
173
- # https://github.com/python/cpython/issues/102512. Should be removed
174
- # when fix for that bug is deployed. Inspired by
175
- # https://github.com/QubesOS/qubes-core-admin-client/pull/236/files.
176
- thread = threading.current_thread()
177
- if isinstance(thread, threading._DummyThread):
178
- if getattr(thread, "_tstate_lock", "") is None:
179
- thread._set_tstate_lock() # type: ignore[attr-defined]
180
-
181
- if logConfigState and not CliLog.configState:
182
- # means that we are in a new spawned Python process and we have to
183
- # re-initialize logging
184
- CliLog.replayConfigState(logConfigState)
185
-
186
- quantumExecutor: QuantumExecutor = pickle.loads(quantumExecutor_pickle)
187
- task_node: TaskNode = pickle.loads(task_node_pickle)
188
- quantum = pickle.loads(quantum_pickle)
189
- report: QuantumReport | None = None
190
- # Catch a few known failure modes and stop the process immediately,
191
- # with exception-specific exit code.
192
- try:
193
- _, report = quantumExecutor.execute(task_node, quantum, quantum_id=quantum_id)
194
- except RepeatableQuantumError as exc:
195
- report = QuantumReport.from_exception(
196
- exception=exc,
197
- dataId=quantum.dataId,
198
- taskLabel=task_node.label,
199
- exitCode=exc.EXIT_CODE if fail_fast else None,
200
- )
201
- if fail_fast:
202
- _LOG.warning("Caught repeatable quantum error for %s (%s):", task_node.label, quantum.dataId)
203
- _LOG.warning(exc, exc_info=True)
204
- sys.exit(exc.EXIT_CODE)
205
- else:
206
- raise
207
- except InvalidQuantumError as exc:
208
- _LOG.fatal("Invalid quantum error for %s (%s): %s", task_node.label, quantum.dataId)
209
- _LOG.fatal(exc, exc_info=True)
210
- report = QuantumReport.from_exception(
211
- exception=exc,
212
- dataId=quantum.dataId,
213
- taskLabel=task_node.label,
214
- exitCode=exc.EXIT_CODE,
215
- )
216
- sys.exit(exc.EXIT_CODE)
217
- except Exception as exc:
218
- _LOG.debug("exception from task %s dataId %s: %s", task_node.label, quantum.dataId, exc)
219
- report = QuantumReport.from_exception(
220
- exception=exc,
221
- dataId=quantum.dataId,
222
- taskLabel=task_node.label,
223
- )
224
- raise
225
- finally:
226
- if report is not None:
227
- # If sending fails we do not want this new exception to be
228
- # exposed.
229
- try:
230
- _LOG.debug("sending report for task %s dataId %s", task_node.label, quantum.dataId)
231
- snd_conn.send(report)
232
- except Exception:
233
- pass
234
-
235
- def stop(self) -> None:
236
- """Stop the process."""
237
- assert self.process is not None, "Process must be started"
238
- self.process.terminate()
239
- # give it 1 second to finish or KILL
240
- for _ in range(10):
241
- time.sleep(0.1)
242
- if not self.process.is_alive():
243
- break
244
- else:
245
- _LOG.debug("Killing process %s", self.process.name)
246
- self.process.kill()
247
- self._terminated = True
248
-
249
- def cleanup(self) -> None:
250
- """Release processes resources, has to be called for each finished
251
- process.
252
- """
253
- if self.process and not self.process.is_alive():
254
- self.process.close()
255
- self.process = None
256
- self._rcv_conn = None
257
-
258
- def report(self) -> QuantumReport:
259
- """Return task report, should be called after process finishes and
260
- before cleanup().
261
- """
262
- assert self.process is not None, "Process must be started"
263
- assert self._rcv_conn is not None, "Process must be started"
264
- try:
265
- report = self._rcv_conn.recv()
266
- report.exitCode = self.process.exitcode
267
- except Exception:
268
- # Likely due to the process killed, but there may be other reasons.
269
- # Exit code should not be None, this is to keep mypy happy.
270
- exitcode = self.process.exitcode if self.process.exitcode is not None else -1
271
- assert self.qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
272
- report = QuantumReport.from_exit_code(
273
- exitCode=exitcode,
274
- dataId=self.qnode.quantum.dataId,
275
- taskLabel=self.qnode.task_node.label,
276
- )
277
- if self.terminated:
278
- # Means it was killed, assume it's due to timeout
279
- report.status = ExecutionStatus.TIMEOUT
280
- return report
281
-
282
- def failMessage(self) -> str:
283
- """Return a message describing task failure."""
284
- assert self.process is not None, "Process must be started"
285
- assert self.process.exitcode is not None, "Process has to finish"
286
- exitcode = self.process.exitcode
287
- if exitcode < 0:
288
- # Negative exit code means it is killed by signal
289
- signum = -exitcode
290
- msg = f"Task {self} failed, killed by signal {signum}"
291
- # Just in case this is some very odd signal, expect ValueError
292
- try:
293
- strsignal = signal.strsignal(signum)
294
- msg = f"{msg} ({strsignal})"
295
- except ValueError:
296
- pass
297
- elif exitcode > 0:
298
- msg = f"Task {self} failed, exit code={exitcode}"
299
- else:
300
- msg = ""
301
- return msg
302
-
303
- def __str__(self) -> str:
304
- return f"<{self.qnode.task_node.label} dataId={self.qnode.quantum.dataId}>"
305
-
306
-
307
- class _JobList:
308
- """Simple list of _Job instances with few convenience methods.
309
-
310
- Parameters
311
- ----------
312
- iterable : iterable of `~lsst.pipe.base.QuantumNode`
313
- Sequence of Quanta to execute. This has to be ordered according to
314
- task dependencies.
315
- """
316
-
317
- def __init__(self, iterable: Iterable[QuantumNode]):
318
- self.jobs = [_Job(qnode) for qnode in iterable]
319
- self.pending = self.jobs[:]
320
- self.running: list[_Job] = []
321
- self.finishedNodes: set[QuantumNode] = set()
322
- self.failedNodes: set[QuantumNode] = set()
323
- self.timedOutNodes: set[QuantumNode] = set()
324
-
325
- def submit(
326
- self,
327
- job: _Job,
328
- quantumExecutor: QuantumExecutor,
329
- startMethod: Literal["spawn"] | Literal["forkserver"],
330
- ) -> None:
331
- """Submit one more job for execution.
332
-
333
- Parameters
334
- ----------
335
- job : `_Job`
336
- Job to submit.
337
- quantumExecutor : `QuantumExecutor`
338
- Executor for single quantum.
339
- startMethod : `str`, optional
340
- Start method from `multiprocessing` module.
341
- """
342
- # this will raise if job is not in pending list
343
- self.pending.remove(job)
344
- job.start(quantumExecutor, startMethod)
345
- self.running.append(job)
346
-
347
- def setJobState(self, job: _Job, state: JobState) -> None:
348
- """Update job state.
32
+ from deprecated.sphinx import deprecated
349
33
 
350
- Parameters
351
- ----------
352
- job : `_Job`
353
- Job to submit.
354
- state : `JobState`
355
- New job state, note that only FINISHED, FAILED, TIMED_OUT, or
356
- FAILED_DEP state is acceptable.
357
- """
358
- allowedStates = (JobState.FINISHED, JobState.FAILED, JobState.TIMED_OUT, JobState.FAILED_DEP)
359
- assert state in allowedStates, f"State {state} not allowed here"
34
+ import lsst.pipe.base.mp_graph_executor
35
+ from lsst.pipe.base.execution_graph_fixup import ExecutionGraphFixup
36
+ from lsst.pipe.base.quantum_graph_executor import QuantumExecutor
37
+ from lsst.pipe.base.quantum_reports import Report
360
38
 
361
- # remove job from pending/running lists
362
- if job.state == JobState.PENDING:
363
- self.pending.remove(job)
364
- elif job.state == JobState.RUNNING:
365
- self.running.remove(job)
39
+ # TODO[DM-51962]: Remove this module.
366
40
 
367
- qnode = job.qnode
368
- # it should not be in any of these, but just in case
369
- self.finishedNodes.discard(qnode)
370
- self.failedNodes.discard(qnode)
371
- self.timedOutNodes.discard(qnode)
372
41
 
373
- job._state = state
374
- if state == JobState.FINISHED:
375
- self.finishedNodes.add(qnode)
376
- elif state == JobState.FAILED:
377
- self.failedNodes.add(qnode)
378
- elif state == JobState.FAILED_DEP:
379
- self.failedNodes.add(qnode)
380
- elif state == JobState.TIMED_OUT:
381
- self.failedNodes.add(qnode)
382
- self.timedOutNodes.add(qnode)
383
- else:
384
- raise ValueError(f"Unexpected state value: {state}")
385
-
386
- def cleanup(self) -> None:
387
- """Do periodic cleanup for jobs that did not finish correctly.
388
-
389
- If timed out jobs are killed but take too long to stop then regular
390
- cleanup will not work for them. Here we check all timed out jobs
391
- periodically and do cleanup if they managed to die by this time.
392
- """
393
- for job in self.jobs:
394
- if job.state == JobState.TIMED_OUT and job.process is not None:
395
- job.cleanup()
396
-
397
-
398
- class MPGraphExecutorError(Exception):
399
- """Exception class for errors raised by MPGraphExecutor."""
400
-
401
- pass
402
-
403
-
404
- class MPTimeoutError(MPGraphExecutorError):
405
- """Exception raised when task execution times out."""
406
-
407
- pass
408
-
409
-
410
- class MPGraphExecutor(QuantumGraphExecutor):
42
+ @deprecated(
43
+ "The MPGraphExecutor class has moved to lsst.pipe.base.mp_graph_executor. "
44
+ "This forwarding shim will be removed after v30.",
45
+ version="v30",
46
+ category=FutureWarning,
47
+ )
48
+ class MPGraphExecutor(lsst.pipe.base.mp_graph_executor.MPGraphExecutor):
411
49
  """Implementation of QuantumGraphExecutor using same-host multiprocess
412
50
  execution of Quanta.
413
51
 
52
+ This is a deprecated backwards-compatibility shim for
53
+ `lsst.pipe.base.mp_graph_executor.MPGraphExecutor`, which has
54
+ the same functionality with very minor interface changes.
55
+
414
56
  Parameters
415
57
  ----------
416
58
  numProc : `int`
417
59
  Number of processes to use for executing tasks.
418
60
  timeout : `float`
419
61
  Time in seconds to wait for tasks to finish.
420
- quantumExecutor : `QuantumExecutor`
62
+ quantumExecutor : `lsst.pipe.base.quantum_graph_executor.QuantumExecutor`
421
63
  Executor for single quantum. For multiprocess-style execution when
422
- ``numProc`` is greater than one this instance must support pickle.
64
+ ``num_proc`` is greater than one this instance must support pickle.
423
65
  startMethod : `str`, optional
424
66
  Start method from `multiprocessing` module, `None` selects the best
425
67
  one for current platform.
@@ -428,7 +70,9 @@ class MPGraphExecutor(QuantumGraphExecutor):
428
70
  pdb : `str`, optional
429
71
  Debugger to import and use (via the ``post_mortem`` function) in the
430
72
  event of an exception.
431
- executionGraphFixup : `ExecutionGraphFixup`, optional
73
+ executionGraphFixup : \
74
+ `lsst.pipe.base.execution_graph_fixup.ExecutionGraphFixup`, \
75
+ optional
432
76
  Instance used for modification of execution graph.
433
77
  """
434
78
 
@@ -443,331 +87,51 @@ class MPGraphExecutor(QuantumGraphExecutor):
443
87
  pdb: str | None = None,
444
88
  executionGraphFixup: ExecutionGraphFixup | None = None,
445
89
  ):
446
- self.numProc = numProc
447
- self.timeout = timeout
448
- self.quantumExecutor = quantumExecutor
449
- self.failFast = failFast
450
- self.pdb = pdb
451
- self.executionGraphFixup = executionGraphFixup
452
- self.report: Report | None = None
453
-
454
- # We set default start method as spawn for all platforms.
455
- if startMethod is None:
456
- startMethod = "spawn"
457
- self.startMethod = startMethod
458
-
459
- def execute(self, graph: QuantumGraph) -> None:
460
- # Docstring inherited from QuantumGraphExecutor.execute
461
- graph = self._fixupQuanta(graph)
462
- self.report = Report(qgraphSummary=graph.getSummary())
463
- try:
464
- if self.numProc > 1:
465
- self._executeQuantaMP(graph, self.report)
466
- else:
467
- self._executeQuantaInProcess(graph, self.report)
468
- except Exception as exc:
469
- self.report.set_exception(exc)
470
- raise
471
-
472
- def _fixupQuanta(self, graph: QuantumGraph) -> QuantumGraph:
473
- """Call fixup code to modify execution graph.
474
-
475
- Parameters
476
- ----------
477
- graph : `~lsst.pipe.base.QuantumGraph`
478
- `~lsst.pipe.base.QuantumGraph` to modify.
479
-
480
- Returns
481
- -------
482
- graph : `~lsst.pipe.base.QuantumGraph`
483
- Modified `~lsst.pipe.base.QuantumGraph`.
484
-
485
- Raises
486
- ------
487
- MPGraphExecutorError
488
- Raised if execution graph cannot be ordered after modification,
489
- i.e. it has dependency cycles.
490
- """
491
- if not self.executionGraphFixup:
492
- return graph
493
-
494
- _LOG.debug("Call execution graph fixup method")
495
- graph = self.executionGraphFixup.fixupQuanta(graph)
496
-
497
- # Detect if there is now a cycle created within the graph
498
- if graph.findCycle():
499
- raise MPGraphExecutorError("Updated execution graph has dependency cycle.")
500
-
501
- return graph
502
-
503
- def _executeQuantaInProcess(self, graph: QuantumGraph, report: Report) -> None:
504
- """Execute all Quanta in current process.
505
-
506
- Parameters
507
- ----------
508
- graph : `~lsst.pipe.base.QuantumGraph`
509
- `~lsst.pipe.base.QuantumGraph` that is to be executed.
510
- report : `Report`
511
- Object for reporting execution status.
512
- """
513
- successCount, totalCount = 0, len(graph)
514
- failedNodes: set[QuantumNode] = set()
515
- for qnode in graph:
516
- assert qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
517
- task_node = qnode.task_node
518
-
519
- # Any failed inputs mean that the quantum has to be skipped.
520
- inputNodes = graph.determineInputsToQuantumNode(qnode)
521
- if inputNodes & failedNodes:
522
- _LOG.error(
523
- "Upstream job failed for task <%s dataId=%s>, skipping this task.",
524
- task_node.label,
525
- qnode.quantum.dataId,
526
- )
527
- failedNodes.add(qnode)
528
- failed_quantum_report = QuantumReport(
529
- status=ExecutionStatus.SKIPPED,
530
- dataId=qnode.quantum.dataId,
531
- taskLabel=task_node.label,
532
- )
533
- report.quantaReports.append(failed_quantum_report)
534
- continue
535
-
536
- _LOG.debug("Executing %s", qnode)
537
- fail_exit_code: int | None = None
538
- try:
539
- # For some exception types we want to exit immediately with
540
- # exception-specific exit code, but we still want to start
541
- # debugger before exiting if debugging is enabled.
542
- try:
543
- _, quantum_report = self.quantumExecutor.execute(
544
- task_node, qnode.quantum, quantum_id=qnode.nodeId
545
- )
546
- if quantum_report:
547
- report.quantaReports.append(quantum_report)
548
- successCount += 1
549
- except RepeatableQuantumError as exc:
550
- if self.failFast:
551
- _LOG.warning(
552
- "Caught repeatable quantum error for %s (%s):",
553
- task_node.label,
554
- qnode.quantum.dataId,
555
- )
556
- _LOG.warning(exc, exc_info=True)
557
- fail_exit_code = exc.EXIT_CODE
558
- raise
559
- except InvalidQuantumError as exc:
560
- _LOG.fatal("Invalid quantum error for %s (%s):", task_node.label, qnode.quantum.dataId)
561
- _LOG.fatal(exc, exc_info=True)
562
- fail_exit_code = exc.EXIT_CODE
563
- raise
564
- except Exception as exc:
565
- quantum_report = QuantumReport.from_exception(
566
- exception=exc,
567
- dataId=qnode.quantum.dataId,
568
- taskLabel=task_node.label,
569
- )
570
- report.quantaReports.append(quantum_report)
571
-
572
- if self.pdb and sys.stdin.isatty() and sys.stdout.isatty():
573
- _LOG.error(
574
- "Task <%s dataId=%s> failed; dropping into pdb.",
575
- task_node.label,
576
- qnode.quantum.dataId,
577
- exc_info=exc,
578
- )
579
- try:
580
- pdb = importlib.import_module(self.pdb)
581
- except ImportError as imp_exc:
582
- raise MPGraphExecutorError(
583
- f"Unable to import specified debugger module ({self.pdb}): {imp_exc}"
584
- ) from exc
585
- if not hasattr(pdb, "post_mortem"):
586
- raise MPGraphExecutorError(
587
- f"Specified debugger module ({self.pdb}) can't debug with post_mortem",
588
- ) from exc
589
- pdb.post_mortem(exc.__traceback__)
590
- failedNodes.add(qnode)
591
- report.status = ExecutionStatus.FAILURE
592
-
593
- # If exception specified an exit code then just exit with that
594
- # code, otherwise crash if fail-fast option is enabled.
595
- if fail_exit_code is not None:
596
- sys.exit(fail_exit_code)
597
- if self.failFast:
598
- raise MPGraphExecutorError(
599
- f"Task <{task_node.label} dataId={qnode.quantum.dataId}> failed."
600
- ) from exc
601
- else:
602
- # Note that there could be exception safety issues, which
603
- # we presently ignore.
604
- _LOG.error(
605
- "Task <%s dataId=%s> failed; processing will continue for remaining tasks.",
606
- task_node.label,
607
- qnode.quantum.dataId,
608
- exc_info=exc,
609
- )
610
-
611
- _LOG.info(
612
- "Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
613
- successCount,
614
- len(failedNodes),
615
- totalCount - successCount - len(failedNodes),
616
- totalCount,
617
- )
618
-
619
- # Raise an exception if there were any failures.
620
- if failedNodes:
621
- raise MPGraphExecutorError("One or more tasks failed during execution.")
622
-
623
- def _executeQuantaMP(self, graph: QuantumGraph, report: Report) -> None:
624
- """Execute all Quanta in separate processes.
625
-
626
- Parameters
627
- ----------
628
- graph : `~lsst.pipe.base.QuantumGraph`
629
- `~lsst.pipe.base.QuantumGraph` that is to be executed.
630
- report : `Report`
631
- Object for reporting execution status.
632
- """
633
- disable_implicit_threading() # To prevent thread contention
634
-
635
- _LOG.debug("Using %r for multiprocessing start method", self.startMethod)
636
-
637
- # re-pack input quantum data into jobs list
638
- jobs = _JobList(graph)
639
-
640
- # check that all tasks can run in sub-process
641
- for job in jobs.jobs:
642
- task_node = job.qnode.task_node
643
- if not task_node.task_class.canMultiprocess:
644
- raise MPGraphExecutorError(
645
- f"Task {task_node.label!r} does not support multiprocessing; use single process"
646
- )
90
+ super().__init__(
91
+ num_proc=numProc,
92
+ timeout=timeout,
93
+ quantum_executor=quantumExecutor,
94
+ start_method=startMethod,
95
+ fail_fast=failFast,
96
+ pdb=pdb,
97
+ execution_graph_fixup=executionGraphFixup,
98
+ )
647
99
 
648
- finishedCount, failedCount = 0, 0
649
- while jobs.pending or jobs.running:
650
- _LOG.debug("#pendingJobs: %s", len(jobs.pending))
651
- _LOG.debug("#runningJobs: %s", len(jobs.running))
100
+ @property
101
+ def numProc(self) -> int:
102
+ return self._num_proc
652
103
 
653
- # See if any jobs have finished
654
- for job in jobs.running:
655
- assert job.process is not None, "Process cannot be None"
656
- if not job.process.is_alive():
657
- _LOG.debug("finished: %s", job)
658
- # finished
659
- exitcode = job.process.exitcode
660
- quantum_report = job.report()
661
- report.quantaReports.append(quantum_report)
662
- if exitcode == 0:
663
- jobs.setJobState(job, JobState.FINISHED)
664
- job.cleanup()
665
- _LOG.debug("success: %s took %.3f seconds", job, time.time() - job.started)
666
- else:
667
- if job.terminated:
668
- # Was killed due to timeout.
669
- if report.status == ExecutionStatus.SUCCESS:
670
- # Do not override global FAILURE status
671
- report.status = ExecutionStatus.TIMEOUT
672
- message = f"Timeout ({self.timeout} sec) for task {job}, task is killed"
673
- jobs.setJobState(job, JobState.TIMED_OUT)
674
- else:
675
- report.status = ExecutionStatus.FAILURE
676
- # failMessage() has to be called before cleanup()
677
- message = job.failMessage()
678
- jobs.setJobState(job, JobState.FAILED)
104
+ @property
105
+ def timeout(self) -> float:
106
+ return self._timeout
679
107
 
680
- job.cleanup()
681
- _LOG.debug("failed: %s", job)
682
- if self.failFast or exitcode == InvalidQuantumError.EXIT_CODE:
683
- # stop all running jobs
684
- for stopJob in jobs.running:
685
- if stopJob is not job:
686
- stopJob.stop()
687
- if job.state is JobState.TIMED_OUT:
688
- raise MPTimeoutError(f"Timeout ({self.timeout} sec) for task {job}.")
689
- else:
690
- raise MPGraphExecutorError(message)
691
- else:
692
- _LOG.error("%s; processing will continue for remaining tasks.", message)
693
- else:
694
- # check for timeout
695
- now = time.time()
696
- if now - job.started > self.timeout:
697
- # Try to kill it, and there is a chance that it
698
- # finishes successfully before it gets killed. Exit
699
- # status is handled by the code above on next
700
- # iteration.
701
- _LOG.debug("Terminating job %s due to timeout", job)
702
- job.stop()
108
+ @property
109
+ def quantumExecutor(self) -> QuantumExecutor:
110
+ return self._quantum_executor
703
111
 
704
- # Fail jobs whose inputs failed, this may need several iterations
705
- # if the order is not right, will be done in the next loop.
706
- if jobs.failedNodes:
707
- for job in jobs.pending:
708
- jobInputNodes = graph.determineInputsToQuantumNode(job.qnode)
709
- assert job.qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
710
- if jobInputNodes & jobs.failedNodes:
711
- quantum_report = QuantumReport(
712
- status=ExecutionStatus.SKIPPED,
713
- dataId=job.qnode.quantum.dataId,
714
- taskLabel=job.qnode.task_node.label,
715
- )
716
- report.quantaReports.append(quantum_report)
717
- jobs.setJobState(job, JobState.FAILED_DEP)
718
- _LOG.error("Upstream job failed for task %s, skipping this task.", job)
112
+ @property
113
+ def failFast(self) -> bool:
114
+ return self._fail_fast
719
115
 
720
- # see if we can start more jobs
721
- if len(jobs.running) < self.numProc:
722
- for job in jobs.pending:
723
- jobInputNodes = graph.determineInputsToQuantumNode(job.qnode)
724
- if jobInputNodes <= jobs.finishedNodes:
725
- # all dependencies have completed, can start new job
726
- if len(jobs.running) < self.numProc:
727
- _LOG.debug("Submitting %s", job)
728
- jobs.submit(job, self.quantumExecutor, self.startMethod)
729
- if len(jobs.running) >= self.numProc:
730
- # Cannot start any more jobs, wait until something
731
- # finishes.
732
- break
116
+ @property
117
+ def pdb(self) -> str | None:
118
+ return self._pdb
733
119
 
734
- # Do cleanup for timed out jobs if necessary.
735
- jobs.cleanup()
120
+ @property
121
+ def executionGraphFixup(self) -> ExecutionGraphFixup | None:
122
+ return self._execution_graph_fixup
736
123
 
737
- # Print progress message if something changed.
738
- newFinished, newFailed = len(jobs.finishedNodes), len(jobs.failedNodes)
739
- if (finishedCount, failedCount) != (newFinished, newFailed):
740
- finishedCount, failedCount = newFinished, newFailed
741
- totalCount = len(jobs.jobs)
742
- _LOG.info(
743
- "Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
744
- finishedCount,
745
- failedCount,
746
- totalCount - finishedCount - failedCount,
747
- totalCount,
748
- )
124
+ @property
125
+ def report(self) -> Report | None:
126
+ return self._report
749
127
 
750
- # Here we want to wait until one of the running jobs completes
751
- # but multiprocessing does not provide an API for that, for now
752
- # just sleep a little bit and go back to the loop.
753
- if jobs.running:
754
- time.sleep(0.1)
128
+ @property
129
+ def startMethod(self) -> str:
130
+ return self._start_method
755
131
 
756
- if jobs.failedNodes:
757
- # print list of failed jobs
758
- _LOG.error("Failed jobs:")
759
- for job in jobs.jobs:
760
- if job.state != JobState.FINISHED:
761
- _LOG.error(" - %s: %s", job.state.name, job)
762
132
 
763
- # if any job failed raise an exception
764
- if jobs.failedNodes == jobs.timedOutNodes:
765
- raise MPTimeoutError("One or more tasks timed out during execution.")
766
- else:
767
- raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")
133
+ # We can't make these forwarders warn by subclassing, because an 'except'
134
+ # statement on a derived class won't catch a base class instance.
768
135
 
769
- def getReport(self) -> Report | None:
770
- # Docstring inherited from base class
771
- if self.report is None:
772
- raise RuntimeError("getReport() called before execute()")
773
- return self.report
136
+ MPGraphExecutorError = lsst.pipe.base.mp_graph_executor.MPGraphExecutorError
137
+ MPTimeoutError = lsst.pipe.base.mp_graph_executor.MPTimeoutError