lsst-pipe-base 29.2025.3000__py3-none-any.whl → 29.2025.3200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. lsst/pipe/base/__init__.py +0 -1
  2. lsst/pipe/base/_datasetQueryConstraints.py +1 -1
  3. lsst/pipe/base/all_dimensions_quantum_graph_builder.py +10 -46
  4. lsst/pipe/base/caching_limited_butler.py +8 -4
  5. lsst/pipe/base/connectionTypes.py +19 -19
  6. lsst/pipe/base/connections.py +2 -2
  7. lsst/pipe/base/exec_fixup_data_id.py +131 -0
  8. lsst/pipe/base/execution_graph_fixup.py +69 -0
  9. lsst/pipe/base/graph/graphSummary.py +4 -4
  10. lsst/pipe/base/log_capture.py +227 -0
  11. lsst/pipe/base/mp_graph_executor.py +786 -0
  12. lsst/pipe/base/pipeline_graph/_pipeline_graph.py +40 -10
  13. lsst/pipe/base/pipeline_graph/_tasks.py +106 -0
  14. lsst/pipe/base/pipeline_graph/io.py +1 -1
  15. lsst/pipe/base/quantum_graph_builder.py +85 -58
  16. lsst/pipe/base/quantum_graph_executor.py +125 -0
  17. lsst/pipe/base/quantum_graph_skeleton.py +60 -1
  18. lsst/pipe/base/quantum_reports.py +334 -0
  19. lsst/pipe/base/script/transfer_from_graph.py +4 -1
  20. lsst/pipe/base/separable_pipeline_executor.py +296 -0
  21. lsst/pipe/base/simple_pipeline_executor.py +674 -0
  22. lsst/pipe/base/single_quantum_executor.py +635 -0
  23. lsst/pipe/base/taskFactory.py +18 -12
  24. lsst/pipe/base/tests/in_memory_limited_butler.py +223 -0
  25. lsst/pipe/base/tests/mocks/__init__.py +1 -0
  26. lsst/pipe/base/tests/mocks/_in_memory_repo.py +357 -0
  27. lsst/pipe/base/tests/mocks/_pipeline_task.py +19 -2
  28. lsst/pipe/base/version.py +1 -1
  29. {lsst_pipe_base-29.2025.3000.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/METADATA +1 -1
  30. {lsst_pipe_base-29.2025.3000.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/RECORD +38 -28
  31. lsst/pipe/base/executionButlerBuilder.py +0 -493
  32. {lsst_pipe_base-29.2025.3000.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/WHEEL +0 -0
  33. {lsst_pipe_base-29.2025.3000.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/entry_points.txt +0 -0
  34. {lsst_pipe_base-29.2025.3000.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/licenses/COPYRIGHT +0 -0
  35. {lsst_pipe_base-29.2025.3000.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/licenses/LICENSE +0 -0
  36. {lsst_pipe_base-29.2025.3000.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/licenses/bsd_license.txt +0 -0
  37. {lsst_pipe_base-29.2025.3000.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/licenses/gpl-v3.0.txt +0 -0
  38. {lsst_pipe_base-29.2025.3000.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/top_level.txt +0 -0
  39. {lsst_pipe_base-29.2025.3000.dist-info → lsst_pipe_base-29.2025.3200.dist-info}/zip-safe +0 -0
lsst/pipe/base/mp_graph_executor.py
@@ -0,0 +1,786 @@
+ # This file is part of pipe_base.
+ #
+ # Developed for the LSST Data Management System.
+ # This product includes software developed by the LSST Project
+ # (http://www.lsst.org).
+ # See the COPYRIGHT file at the top-level directory of this distribution
+ # for details of code ownership.
+ #
+ # This software is dual licensed under the GNU General Public License and also
+ # under a 3-clause BSD license. Recipients may choose which of these licenses
+ # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+ # respectively. If you choose the GPL option then the following text applies
+ # (but note that there is still no warranty even if you opt for BSD instead):
+ #
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ from __future__ import annotations
+
+ __all__ = ["MPGraphExecutor", "MPGraphExecutorError", "MPTimeoutError"]
+
+ import enum
+ import importlib
+ import logging
+ import multiprocessing
+ import pickle
+ import signal
+ import sys
+ import threading
+ import time
+ import uuid
+ from collections.abc import Iterable
+ from typing import Literal
+
+ from lsst.daf.butler.cli.cliLog import CliLog
+ from lsst.utils.threads import disable_implicit_threading
+
+ from ._status import InvalidQuantumError, RepeatableQuantumError
+ from .execution_graph_fixup import ExecutionGraphFixup
+ from .graph import QuantumGraph, QuantumNode
+ from .pipeline_graph import TaskNode
+ from .quantum_graph_executor import QuantumExecutor, QuantumGraphExecutor
+ from .quantum_reports import ExecutionStatus, QuantumReport, Report
+
+ _LOG = logging.getLogger(__name__)
+
+
+ class JobState(enum.Enum):
+     """Possible state for an executing task."""
+
+     PENDING = enum.auto()
+     """The job has not started yet."""
+
+     RUNNING = enum.auto()
+     """The job is currently executing."""
+
+     FINISHED = enum.auto()
+     """The job finished successfully."""
+
+     FAILED = enum.auto()
+     """The job execution failed (process returned non-zero status)."""
+
+     TIMED_OUT = enum.auto()
+     """The job was killed due to too long execution time."""
+
+     FAILED_DEP = enum.auto()
+     """One of the dependencies of this job failed or timed out."""
+
+
+ class _Job:
+     """Class representing a job running a single task.
+
+     Parameters
+     ----------
+     qnode : `QuantumNode`
+         Quantum and some associated information.
+     fail_fast : `bool`, optional
+         If `True` then the subprocess exits immediately, with an
+         exception-specific exit code, on `RepeatableQuantumError`.
+     """
+
+     def __init__(self, qnode: QuantumNode, fail_fast: bool = False):
+         self.qnode = qnode
+         self._fail_fast = fail_fast
+         self.process: multiprocessing.process.BaseProcess | None = None
+         self._state = JobState.PENDING
+         self.started: float = 0.0
+         self._rcv_conn: multiprocessing.connection.Connection | None = None
+         self._terminated = False
+
+     @property
+     def state(self) -> JobState:
+         """Job processing state (`JobState`)."""
+         return self._state
+
+     @property
+     def terminated(self) -> bool:
+         """Return `True` if the job was killed by the `stop` method and the
+         child process returned a negative exit code (`bool`).
+         """
+         if self._terminated:
+             assert self.process is not None, "Process must be started"
+             if self.process.exitcode is not None:
+                 return self.process.exitcode < 0
+         return False
+
+     def start(
+         self,
+         quantumExecutor: QuantumExecutor,
+         startMethod: Literal["spawn"] | Literal["forkserver"],
+     ) -> None:
+         """Start process which runs the task.
+
+         Parameters
+         ----------
+         quantumExecutor : `QuantumExecutor`
+             Executor for single quantum.
+         startMethod : `str`
+             Start method from `multiprocessing` module.
+         """
+         # Unpickling of the quantum has to happen after the butler/executor,
+         # and we want to set up logging before unpickling anything that can
+         # generate messages; this is why things are pickled manually here.
+         qe_pickle = pickle.dumps(quantumExecutor)
+         task_node_pickle = pickle.dumps(self.qnode.task_node)
+         quantum_pickle = pickle.dumps(self.qnode.quantum)
+         self._rcv_conn, snd_conn = multiprocessing.Pipe(False)
+         logConfigState = CliLog.configState
+
+         mp_ctx = multiprocessing.get_context(startMethod)
+         self.process = mp_ctx.Process(  # type: ignore[attr-defined]
+             target=_Job._executeJob,
+             args=(
+                 qe_pickle,
+                 task_node_pickle,
+                 quantum_pickle,
+                 self.qnode.nodeId,
+                 logConfigState,
+                 snd_conn,
+                 self._fail_fast,
+             ),
+             name=f"task-{self.qnode.quantum.dataId}",
+         )
+         # mypy is getting confused by multiprocessing.
+         assert self.process is not None
+         self.process.start()
+         self.started = time.time()
+         self._state = JobState.RUNNING
+
+     @staticmethod
+     def _executeJob(
+         quantumExecutor_pickle: bytes,
+         task_node_pickle: bytes,
+         quantum_pickle: bytes,
+         quantum_id: uuid.UUID | None,
+         logConfigState: list,
+         snd_conn: multiprocessing.connection.Connection,
+         fail_fast: bool,
+     ) -> None:
+ """Execute a job with arguments.
168
+
169
+ Parameters
170
+ ----------
171
+ quantumExecutor_pickle : `bytes`
172
+ Executor for single quantum, pickled.
173
+ task_node_pickle : `bytes`
174
+ Task definition structure, pickled.
175
+ quantum_pickle : `bytes`
176
+ Quantum for this task execution in pickled form.
177
+ logConfigState : `list`
178
+ Logging state from parent process.
179
+ snd_conn : `multiprocessing.Connection`
180
+ Connection to send job report to parent process.
181
+ fail_fast : `bool`
182
+ If `True` then kill subprocess on RepeatableQuantumError.
183
+ """
184
+         # This terrible hack is a workaround for Python threading bug:
+         # https://github.com/python/cpython/issues/102512. Should be removed
+         # when fix for that bug is deployed. Inspired by
+         # https://github.com/QubesOS/qubes-core-admin-client/pull/236/files.
+         thread = threading.current_thread()
+         if isinstance(thread, threading._DummyThread):
+             if getattr(thread, "_tstate_lock", "") is None:
+                 thread._set_tstate_lock()  # type: ignore[attr-defined]
+
+         if logConfigState and not CliLog.configState:
+             # means that we are in a new spawned Python process and we have to
+             # re-initialize logging
+             CliLog.replayConfigState(logConfigState)
+
+         quantumExecutor: QuantumExecutor = pickle.loads(quantumExecutor_pickle)
+         task_node: TaskNode = pickle.loads(task_node_pickle)
+         quantum = pickle.loads(quantum_pickle)
+         report: QuantumReport | None = None
+         # Catch a few known failure modes and stop the process immediately,
+         # with exception-specific exit code.
+         try:
+             _, report = quantumExecutor.execute(task_node, quantum, quantum_id=quantum_id)
+         except RepeatableQuantumError as exc:
+             report = QuantumReport.from_exception(
+                 exception=exc,
+                 dataId=quantum.dataId,
+                 taskLabel=task_node.label,
+                 exitCode=exc.EXIT_CODE if fail_fast else None,
+             )
+             if fail_fast:
+                 _LOG.warning("Caught repeatable quantum error for %s (%s):", task_node.label, quantum.dataId)
+                 _LOG.warning(exc, exc_info=True)
+                 sys.exit(exc.EXIT_CODE)
+             else:
+                 raise
+         except InvalidQuantumError as exc:
+ _LOG.fatal("Invalid quantum error for %s (%s): %s", task_node.label, quantum.dataId)
221
+             _LOG.fatal(exc, exc_info=True)
+             report = QuantumReport.from_exception(
+                 exception=exc,
+                 dataId=quantum.dataId,
+                 taskLabel=task_node.label,
+                 exitCode=exc.EXIT_CODE,
+             )
+             sys.exit(exc.EXIT_CODE)
+         except Exception as exc:
+             _LOG.debug("exception from task %s dataId %s: %s", task_node.label, quantum.dataId, exc)
+             report = QuantumReport.from_exception(
+                 exception=exc,
+                 dataId=quantum.dataId,
+                 taskLabel=task_node.label,
+             )
+             raise
+         finally:
+             if report is not None:
+                 # If sending fails we do not want this new exception to be
+                 # exposed.
+                 try:
+                     _LOG.debug("sending report for task %s dataId %s", task_node.label, quantum.dataId)
+                     snd_conn.send(report)
+                 except Exception:
+                     pass
+
+     def stop(self) -> None:
+         """Stop the process."""
+         assert self.process is not None, "Process must be started"
+         self.process.terminate()
+         # give it 1 second to finish or KILL
+         for _ in range(10):
+             time.sleep(0.1)
+             if not self.process.is_alive():
+                 break
+         else:
+             _LOG.debug("Killing process %s", self.process.name)
+             self.process.kill()
+         self._terminated = True
+
+     def cleanup(self) -> None:
+         """Release process resources; has to be called for each finished
+         process.
+         """
+         if self.process and not self.process.is_alive():
+             self.process.close()
+             self.process = None
+             self._rcv_conn = None
+
+     def report(self) -> QuantumReport:
+         """Return the task report; should be called after the process
+         finishes and before cleanup().
+         """
+         assert self.process is not None, "Process must be started"
+         assert self._rcv_conn is not None, "Process must be started"
+         try:
+             report = self._rcv_conn.recv()
+             report.exitCode = self.process.exitcode
+         except Exception:
+             # Likely due to the process being killed, but there may be other
+             # reasons. Exit code should not be None; this is to keep mypy
+             # happy.
+             exitcode = self.process.exitcode if self.process.exitcode is not None else -1
+             assert self.qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
+             report = QuantumReport.from_exit_code(
+                 exitCode=exitcode,
+                 dataId=self.qnode.quantum.dataId,
+                 taskLabel=self.qnode.task_node.label,
+             )
+         if self.terminated:
+             # Means it was killed; assume it is due to a timeout.
+             report.status = ExecutionStatus.TIMEOUT
+         return report
+
+     def failMessage(self) -> str:
+         """Return a message describing task failure."""
+         assert self.process is not None, "Process must be started"
+         assert self.process.exitcode is not None, "Process has to finish"
+         exitcode = self.process.exitcode
+         if exitcode < 0:
+             # A negative exit code means it was killed by a signal.
+             signum = -exitcode
+             msg = f"Task {self} failed, killed by signal {signum}"
+             # Just in case this is some very odd signal, expect ValueError.
+             try:
+                 strsignal = signal.strsignal(signum)
+                 msg = f"{msg} ({strsignal})"
+             except ValueError:
+                 pass
+         elif exitcode > 0:
+             msg = f"Task {self} failed, exit code={exitcode}"
+         else:
+             msg = ""
+         return msg
+
+     def __str__(self) -> str:
+         return f"<{self.qnode.task_node.label} dataId={self.qnode.quantum.dataId}>"
+
+
+ class _JobList:
+     """Simple list of _Job instances with a few convenience methods.
+
+     Parameters
+     ----------
+     iterable : `~collections.abc.Iterable` [ `QuantumNode` ]
+         Sequence of Quanta to execute. This has to be ordered according to
+         task dependencies.
+     """
+
+     def __init__(self, iterable: Iterable[QuantumNode]):
+         self.jobs = [_Job(qnode) for qnode in iterable]
+         self.pending = self.jobs[:]
+         self.running: list[_Job] = []
+         self.finishedNodes: set[QuantumNode] = set()
+         self.failedNodes: set[QuantumNode] = set()
+         self.timedOutNodes: set[QuantumNode] = set()
+
+     def submit(
+         self,
+         job: _Job,
+         quantumExecutor: QuantumExecutor,
+         startMethod: Literal["spawn"] | Literal["forkserver"],
+     ) -> None:
+         """Submit one more job for execution.
+
+         Parameters
+         ----------
+         job : `_Job`
+             Job to submit.
+         quantumExecutor : `QuantumExecutor`
+             Executor for single quantum.
+         startMethod : `str`
+             Start method from `multiprocessing` module.
+         """
+         # this will raise if job is not in pending list
+         self.pending.remove(job)
+         job.start(quantumExecutor, startMethod)
+         self.running.append(job)
+
+     def setJobState(self, job: _Job, state: JobState) -> None:
+         """Update job state.
+
+         Parameters
+         ----------
+         job : `_Job`
+             Job whose state is to be updated.
+         state : `JobState`
+             New job state; only FINISHED, FAILED, TIMED_OUT, or FAILED_DEP
+             is acceptable.
+         """
+         allowedStates = (JobState.FINISHED, JobState.FAILED, JobState.TIMED_OUT, JobState.FAILED_DEP)
+         assert state in allowedStates, f"State {state} not allowed here"
+
+         # remove job from pending/running lists
+         if job.state == JobState.PENDING:
+             self.pending.remove(job)
+         elif job.state == JobState.RUNNING:
+             self.running.remove(job)
+
+         qnode = job.qnode
+         # it should not be in any of these, but just in case
+         self.finishedNodes.discard(qnode)
+         self.failedNodes.discard(qnode)
+         self.timedOutNodes.discard(qnode)
+
+         job._state = state
+         if state == JobState.FINISHED:
+             self.finishedNodes.add(qnode)
+         elif state == JobState.FAILED:
+             self.failedNodes.add(qnode)
+         elif state == JobState.FAILED_DEP:
+             self.failedNodes.add(qnode)
+         elif state == JobState.TIMED_OUT:
+             self.failedNodes.add(qnode)
+             self.timedOutNodes.add(qnode)
+         else:
+             raise ValueError(f"Unexpected state value: {state}")
+
+     def cleanup(self) -> None:
+         """Do periodic cleanup for jobs that did not finish correctly.
+
+         If timed out jobs are killed but take too long to stop then regular
+         cleanup will not work for them. Here we check all timed out jobs
+         periodically and do cleanup if they managed to die by this time.
+         """
+         for job in self.jobs:
+             if job.state == JobState.TIMED_OUT and job.process is not None:
+                 job.cleanup()
+
+
+ class MPGraphExecutorError(Exception):
+     """Exception class for errors raised by MPGraphExecutor."""
+
+     pass
+
+
+ class MPTimeoutError(MPGraphExecutorError):
+     """Exception raised when task execution times out."""
+
+     pass
+
+
+ class MPGraphExecutor(QuantumGraphExecutor):
+     """Implementation of QuantumGraphExecutor using same-host multiprocess
+     execution of Quanta.
+
+     Parameters
+     ----------
+     num_proc : `int`
+         Number of processes to use for executing tasks.
+     timeout : `float`
+         Time in seconds to wait for tasks to finish.
+     quantum_executor : `.quantum_graph_executor.QuantumExecutor`
+         Executor for single quantum. For multiprocess-style execution when
+         ``num_proc`` is greater than one this instance must support pickle.
+     start_method : `str`, optional
+         Start method from `multiprocessing` module; `None` selects the
+         default (``"spawn"`` on all platforms).
+     fail_fast : `bool`, optional
+         If set to ``True`` then stop processing on first error from any task.
+     pdb : `str`, optional
+         Debugger to import and use (via the ``post_mortem`` function) in the
+         event of an exception.
+     execution_graph_fixup : `.execution_graph_fixup.ExecutionGraphFixup`, \
+         optional
+         Instance used for modification of execution graph.
+     """
+
+     def __init__(
+         self,
+         *,
+         num_proc: int,
+         timeout: float,
+         quantum_executor: QuantumExecutor,
+         start_method: Literal["spawn"] | Literal["forkserver"] | None = None,
+         fail_fast: bool = False,
+         pdb: str | None = None,
+         execution_graph_fixup: ExecutionGraphFixup | None = None,
+     ):
+         self._num_proc = num_proc
+         self._timeout = timeout
+         self._quantum_executor = quantum_executor
+         self._fail_fast = fail_fast
+         self._pdb = pdb
+         self._execution_graph_fixup = execution_graph_fixup
+         self._report: Report | None = None
+
+         # We set default start method as spawn for all platforms.
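+         # (Note: "fork" is not an accepted value here; the Literal annotation
+         # on start_method restricts callers to "spawn" or "forkserver", and
+         # fork is generally unsafe once a process has started threads.)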
+         if start_method is None:
+             start_method = "spawn"
+         self._start_method = start_method
+
+     def execute(self, graph: QuantumGraph) -> None:
+         # Docstring inherited from QuantumGraphExecutor.execute
+         graph = self._fixupQuanta(graph)
+         self._report = Report(qgraphSummary=graph.getSummary())
+         try:
+             if self._num_proc > 1:
+                 self._executeQuantaMP(graph, self._report)
+             else:
+                 self._executeQuantaInProcess(graph, self._report)
+         except Exception as exc:
+             self._report.set_exception(exc)
+             raise
+
+     def _fixupQuanta(self, graph: QuantumGraph) -> QuantumGraph:
+         """Call fixup code to modify execution graph.
+
+         Parameters
+         ----------
+         graph : `.QuantumGraph`
+             `.QuantumGraph` to modify.
+
+         Returns
+         -------
+         graph : `.QuantumGraph`
+             Modified `.QuantumGraph`.
+
+         Raises
+         ------
+         MPGraphExecutorError
+             Raised if execution graph cannot be ordered after modification,
+             i.e. it has dependency cycles.
+         """
+         if not self._execution_graph_fixup:
+             return graph
+
+         _LOG.debug("Call execution graph fixup method")
+         graph = self._execution_graph_fixup.fixupQuanta(graph)
+
+         # Detect if there is now a cycle created within the graph
+         if graph.findCycle():
+             raise MPGraphExecutorError("Updated execution graph has dependency cycle.")
+
+         return graph
+
+     def _executeQuantaInProcess(self, graph: QuantumGraph, report: Report) -> None:
+         """Execute all Quanta in current process.
+
+         Parameters
+         ----------
+         graph : `.QuantumGraph`
+             `.QuantumGraph` that is to be executed.
+         report : `Report`
+             Object for reporting execution status.
+         """
+         successCount, totalCount = 0, len(graph)
+         failedNodes: set[QuantumNode] = set()
+         for qnode in graph:
+             assert qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
+             task_node = qnode.task_node
+
+             # Any failed inputs mean that the quantum has to be skipped.
+             inputNodes = graph.determineInputsToQuantumNode(qnode)
+             if inputNodes & failedNodes:
+                 _LOG.error(
+                     "Upstream job failed for task <%s dataId=%s>, skipping this task.",
+                     task_node.label,
+                     qnode.quantum.dataId,
+                 )
+                 failedNodes.add(qnode)
+                 failed_quantum_report = QuantumReport(
+                     status=ExecutionStatus.SKIPPED,
+                     dataId=qnode.quantum.dataId,
+                     taskLabel=task_node.label,
+                 )
+                 report.quantaReports.append(failed_quantum_report)
+                 continue
+
+             _LOG.debug("Executing %s", qnode)
+             fail_exit_code: int | None = None
+             try:
+                 # For some exception types we want to exit immediately with
+                 # exception-specific exit code, but we still want to start
+                 # debugger before exiting if debugging is enabled.
+                 try:
+                     _, quantum_report = self._quantum_executor.execute(
+                         task_node, qnode.quantum, quantum_id=qnode.nodeId
+                     )
+                     if quantum_report:
+                         report.quantaReports.append(quantum_report)
+                     successCount += 1
+                 except RepeatableQuantumError as exc:
+                     if self._fail_fast:
+                         _LOG.warning(
+                             "Caught repeatable quantum error for %s (%s):",
+                             task_node.label,
+                             qnode.quantum.dataId,
+                         )
+                         _LOG.warning(exc, exc_info=True)
+                         fail_exit_code = exc.EXIT_CODE
+                     raise
+                 except InvalidQuantumError as exc:
+                     _LOG.fatal("Invalid quantum error for %s (%s):", task_node.label, qnode.quantum.dataId)
+                     _LOG.fatal(exc, exc_info=True)
+                     fail_exit_code = exc.EXIT_CODE
+                     raise
+             except Exception as exc:
+                 quantum_report = QuantumReport.from_exception(
+                     exception=exc,
+                     dataId=qnode.quantum.dataId,
+                     taskLabel=task_node.label,
+                 )
+                 report.quantaReports.append(quantum_report)
+
+                 if self._pdb and sys.stdin.isatty() and sys.stdout.isatty():
+                     _LOG.error(
+                         "Task <%s dataId=%s> failed; dropping into pdb.",
+                         task_node.label,
+                         qnode.quantum.dataId,
+                         exc_info=exc,
+                     )
+                     try:
+                         pdb = importlib.import_module(self._pdb)
+                     except ImportError as imp_exc:
+                         raise MPGraphExecutorError(
+                             f"Unable to import specified debugger module ({self._pdb}): {imp_exc}"
+                         ) from exc
+                     if not hasattr(pdb, "post_mortem"):
+                         raise MPGraphExecutorError(
+                             f"Specified debugger module ({self._pdb}) can't debug with post_mortem",
+                         ) from exc
+                     pdb.post_mortem(exc.__traceback__)
+                 failedNodes.add(qnode)
+                 report.status = ExecutionStatus.FAILURE
+
+                 # If exception specified an exit code then just exit with that
+                 # code, otherwise crash if fail-fast option is enabled.
+                 if fail_exit_code is not None:
+                     sys.exit(fail_exit_code)
+                 if self._fail_fast:
+                     raise MPGraphExecutorError(
+                         f"Task <{task_node.label} dataId={qnode.quantum.dataId}> failed."
+                     ) from exc
+                 else:
+                     # Note that there could be exception safety issues, which
+                     # we presently ignore.
+                     _LOG.error(
+                         "Task <%s dataId=%s> failed; processing will continue for remaining tasks.",
+                         task_node.label,
+                         qnode.quantum.dataId,
+                         exc_info=exc,
+                     )
+
+         _LOG.info(
+             "Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
+             successCount,
+             len(failedNodes),
+             totalCount - successCount - len(failedNodes),
+             totalCount,
+         )
+
+         # Raise an exception if there were any failures.
+         if failedNodes:
+             raise MPGraphExecutorError("One or more tasks failed during execution.")
+
+     def _executeQuantaMP(self, graph: QuantumGraph, report: Report) -> None:
+         """Execute all Quanta in separate processes.
+
+         Parameters
+         ----------
+         graph : `.QuantumGraph`
+             `.QuantumGraph` that is to be executed.
+         report : `Report`
+             Object for reporting execution status.
+         """
+         disable_implicit_threading()  # To prevent thread contention
+
+         _LOG.debug("Using %r for multiprocessing start method", self._start_method)
+
+         # re-pack input quantum data into jobs list
+         jobs = _JobList(graph)
+
+         # check that all tasks can run in sub-process
+         for job in jobs.jobs:
+             task_node = job.qnode.task_node
+             if not task_node.task_class.canMultiprocess:
+                 raise MPGraphExecutorError(
+                     f"Task {task_node.label!r} does not support multiprocessing; use single process"
+                 )
+
+         finishedCount, failedCount = 0, 0
+         while jobs.pending or jobs.running:
+             _LOG.debug("#pendingJobs: %s", len(jobs.pending))
+             _LOG.debug("#runningJobs: %s", len(jobs.running))
+
+             # See if any jobs have finished
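+             # (setJobState() removes entries from jobs.running while this
+             # loop iterates over it; any job skipped as a result is examined
+             # on the next pass of the enclosing while loop.)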
+             for job in jobs.running:
+                 assert job.process is not None, "Process cannot be None"
+                 if not job.process.is_alive():
+                     _LOG.debug("finished: %s", job)
+                     # finished
+                     exitcode = job.process.exitcode
+                     quantum_report = job.report()
+                     report.quantaReports.append(quantum_report)
+                     if exitcode == 0:
+                         jobs.setJobState(job, JobState.FINISHED)
+                         job.cleanup()
+                         _LOG.debug("success: %s took %.3f seconds", job, time.time() - job.started)
+                     else:
+                         if job.terminated:
+                             # Was killed due to timeout.
+                             if report.status == ExecutionStatus.SUCCESS:
+                                 # Do not override global FAILURE status
+                                 report.status = ExecutionStatus.TIMEOUT
+                             message = f"Timeout ({self._timeout} sec) for task {job}, task is killed"
+                             jobs.setJobState(job, JobState.TIMED_OUT)
+                         else:
+                             report.status = ExecutionStatus.FAILURE
+                             # failMessage() has to be called before cleanup()
+                             message = job.failMessage()
+                             jobs.setJobState(job, JobState.FAILED)
+
+                         job.cleanup()
+                         _LOG.debug("failed: %s", job)
+                         if self._fail_fast or exitcode == InvalidQuantumError.EXIT_CODE:
+                             # stop all running jobs
+                             for stopJob in jobs.running:
+                                 if stopJob is not job:
+                                     stopJob.stop()
+                             if job.state is JobState.TIMED_OUT:
+                                 raise MPTimeoutError(f"Timeout ({self._timeout} sec) for task {job}.")
+                             else:
+                                 raise MPGraphExecutorError(message)
+                         else:
+                             _LOG.error("%s; processing will continue for remaining tasks.", message)
+                 else:
+                     # check for timeout
+                     now = time.time()
+                     if now - job.started > self._timeout:
+                         # Try to kill it, and there is a chance that it
+                         # finishes successfully before it gets killed. Exit
+                         # status is handled by the code above on next
+                         # iteration.
+                         _LOG.debug("Terminating job %s due to timeout", job)
+                         job.stop()
+
+             # Fail jobs whose inputs failed. This may need several passes if
+             # the ordering is not right; leftovers are handled on the next
+             # iteration of the enclosing loop.
+             if jobs.failedNodes:
+                 for job in jobs.pending:
+                     jobInputNodes = graph.determineInputsToQuantumNode(job.qnode)
+                     assert job.qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
+                     if jobInputNodes & jobs.failedNodes:
+                         quantum_report = QuantumReport(
+                             status=ExecutionStatus.SKIPPED,
+                             dataId=job.qnode.quantum.dataId,
+                             taskLabel=job.qnode.task_node.label,
+                         )
+                         report.quantaReports.append(quantum_report)
+                         jobs.setJobState(job, JobState.FAILED_DEP)
+                         _LOG.error("Upstream job failed for task %s, skipping this task.", job)
+
+             # see if we can start more jobs
+             if len(jobs.running) < self._num_proc:
+                 for job in jobs.pending:
+                     jobInputNodes = graph.determineInputsToQuantumNode(job.qnode)
+                     if jobInputNodes <= jobs.finishedNodes:
+                         # all dependencies have completed, can start new job
+                         if len(jobs.running) < self._num_proc:
+                             _LOG.debug("Submitting %s", job)
+                             jobs.submit(job, self._quantum_executor, self._start_method)
+                         if len(jobs.running) >= self._num_proc:
+                             # Cannot start any more jobs, wait until something
+                             # finishes.
+                             break
+
+             # Do cleanup for timed out jobs if necessary.
+             jobs.cleanup()
+
+             # Print progress message if something changed.
+             newFinished, newFailed = len(jobs.finishedNodes), len(jobs.failedNodes)
+             if (finishedCount, failedCount) != (newFinished, newFailed):
+                 finishedCount, failedCount = newFinished, newFailed
+                 totalCount = len(jobs.jobs)
+                 _LOG.info(
+                     "Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
+                     finishedCount,
+                     failedCount,
+                     totalCount - finishedCount - failedCount,
+                     totalCount,
+                 )
+
+             # Here we want to wait until one of the running jobs completes
+             # but multiprocessing does not provide an API for that, for now
+             # just sleep a little bit and go back to the loop.
+             if jobs.running:
+                 time.sleep(0.1)
+
+         if jobs.failedNodes:
+             # print list of failed jobs
+             _LOG.error("Failed jobs:")
+             for job in jobs.jobs:
+                 if job.state != JobState.FINISHED:
+                     _LOG.error(" - %s: %s", job.state.name, job)
+
+             # If any job failed raise an exception; raise MPTimeoutError only
+             # when every failure was a timeout.
+             if jobs.failedNodes == jobs.timedOutNodes:
+                 raise MPTimeoutError("One or more tasks timed out during execution.")
+             else:
+                 raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")
+
+     def getReport(self) -> Report | None:
+         # Docstring inherited from base class
+         if self._report is None:
+             raise RuntimeError("getReport() called before execute()")
+         return self._report
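
For orientation, a minimal usage sketch of the new executor follows. It is illustrative only and not part of the diff: `qgraph` stands for a previously built `QuantumGraph`, and `sq_executor` for a pickleable `QuantumExecutor` instance (for example, one built from the new single_quantum_executor.py module also added in this release).

    from lsst.pipe.base.mp_graph_executor import MPGraphExecutor, MPGraphExecutorError

    # The constructor is keyword-only; see the class docstring in the diff.
    executor = MPGraphExecutor(
        num_proc=4,        # run up to four quanta in parallel subprocesses
        timeout=3600.0,    # kill any quantum still running after an hour
        quantum_executor=sq_executor,
        fail_fast=False,   # keep running quanta whose inputs succeeded
    )
    try:
        executor.execute(qgraph)
    except MPGraphExecutorError:
        # One or more quanta failed, timed out, or were skipped.
        pass
    report = executor.getReport()  # Report with per-quantum QuantumReport entries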