lsst-pipe-base 29.2025.3900-py3-none-any.whl → 29.2025.4100-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/pipe/base/_task_metadata.py +15 -0
- lsst/pipe/base/dot_tools.py +14 -152
- lsst/pipe/base/exec_fixup_data_id.py +17 -44
- lsst/pipe/base/execution_graph_fixup.py +49 -18
- lsst/pipe/base/graph/_versionDeserializers.py +6 -5
- lsst/pipe/base/graph/graph.py +30 -10
- lsst/pipe/base/graph/graphSummary.py +30 -0
- lsst/pipe/base/graph_walker.py +119 -0
- lsst/pipe/base/log_capture.py +5 -2
- lsst/pipe/base/mermaid_tools.py +11 -64
- lsst/pipe/base/mp_graph_executor.py +298 -236
- lsst/pipe/base/pipeline_graph/io.py +1 -1
- lsst/pipe/base/quantum_graph/__init__.py +32 -0
- lsst/pipe/base/quantum_graph/_common.py +632 -0
- lsst/pipe/base/quantum_graph/_multiblock.py +808 -0
- lsst/pipe/base/quantum_graph/_predicted.py +1950 -0
- lsst/pipe/base/quantum_graph/visualization.py +302 -0
- lsst/pipe/base/quantum_graph_builder.py +292 -34
- lsst/pipe/base/quantum_graph_executor.py +2 -1
- lsst/pipe/base/quantum_provenance_graph.py +16 -7
- lsst/pipe/base/quantum_reports.py +45 -0
- lsst/pipe/base/separable_pipeline_executor.py +126 -15
- lsst/pipe/base/simple_pipeline_executor.py +44 -43
- lsst/pipe/base/single_quantum_executor.py +1 -40
- lsst/pipe/base/tests/mocks/__init__.py +1 -1
- lsst/pipe/base/tests/mocks/_pipeline_task.py +16 -1
- lsst/pipe/base/tests/mocks/{_in_memory_repo.py → _repo.py} +324 -45
- lsst/pipe/base/tests/mocks/_storage_class.py +51 -0
- lsst/pipe/base/tests/simpleQGraph.py +11 -5
- lsst/pipe/base/version.py +1 -1
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/METADATA +2 -1
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/RECORD +40 -34
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/WHEEL +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/entry_points.txt +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/LICENSE +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/top_level.txt +0 -0
- {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/zip-safe +0 -0
lsst/pipe/base/mp_graph_executor.py (+298 −236; removed lines appear truncated where the source diff view elided them):

```diff
@@ -39,16 +39,20 @@ import sys
  import threading
  import time
  import uuid
- from
- from typing import Literal
+ from typing import Literal, cast
 
+ import networkx
+
+ from lsst.daf.butler import DataCoordinate, Quantum
  from lsst.daf.butler.cli.cliLog import CliLog
  from lsst.utils.threads import disable_implicit_threading
 
  from ._status import InvalidQuantumError, RepeatableQuantumError
  from .execution_graph_fixup import ExecutionGraphFixup
- from .graph import QuantumGraph
+ from .graph import QuantumGraph
+ from .graph_walker import GraphWalker
  from .pipeline_graph import TaskNode
+ from .quantum_graph import PredictedQuantumGraph, PredictedQuantumInfo
  from .quantum_graph_executor import QuantumExecutor, QuantumGraphExecutor
  from .quantum_reports import ExecutionStatus, QuantumReport, Report
 
```
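The new imports hint at the data structure the rest of this diff is built around: a `networkx.DiGraph` whose nodes are quantum UUIDs, with task and data-ID information attached as node attributes. A minimal sketch of that shape, with simplified stand-in attribute values rather than real `Quantum`/`DataCoordinate` objects:

```python
# Sketch only: illustrates the graph shape implied by the new imports, not the
# package's construction code. The attribute names ("task_label", "data_id")
# are taken from the hunks below; the values here are plain Python stand-ins.
import uuid

import networkx

xgraph = networkx.DiGraph()
a, b = uuid.uuid4(), uuid.uuid4()
# Each node is a quantum ID; node attributes carry what the executor needs.
xgraph.add_node(a, task_label="isr", data_id={"detector": 1})
xgraph.add_node(b, task_label="characterizeImage", data_id={"detector": 1})
# An edge means the downstream quantum consumes an output of the upstream one.
xgraph.add_edge(a, b)

assert networkx.is_directed_acyclic_graph(xgraph)
print(list(networkx.topological_sort(xgraph)))  # execution order: a before b
```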
```diff
@@ -82,13 +86,18 @@ class _Job:
 
  Parameters
  ----------
-
-
+ quantum_id : `uuid.UUID`
+ ID of the quantum this job executes.
+ quantum : `lsst.daf.butler.Quantum`
+ Description of the inputs and outputs.
+ task_node : `.pipeline_graph.TaskNode`
+ Description of the task and configuration.
  """
 
- def __init__(self,
- self.
- self.
+ def __init__(self, quantum_id: uuid.UUID, quantum: Quantum, task_node: TaskNode):
+ self.quantum_id = quantum_id
+ self.quantum = quantum
+ self.task_node = task_node
  self.process: multiprocessing.process.BaseProcess | None = None
  self._state = JobState.PENDING
  self.started: float = 0.0
```
```diff
@@ -115,6 +124,7 @@ class _Job:
  self,
  quantumExecutor: QuantumExecutor,
  startMethod: Literal["spawn"] | Literal["forkserver"],
+ fail_fast: bool,
  ) -> None:
  """Start process which runs the task.
 
```
```diff
@@ -124,13 +134,15 @@ class _Job:
  Executor for single quantum.
  startMethod : `str`, optional
  Start method from `multiprocessing` module.
+ fail_fast : `bool`, optional
+ If `True` then kill subprocess on RepeatableQuantumError.
  """
  # Unpickling of quantum has to happen after butler/executor, also we
  # want to setup logging before unpickling anything that can generate
  # messages, this is why things are pickled manually here.
  qe_pickle = pickle.dumps(quantumExecutor)
- task_node_pickle = pickle.dumps(self.
- quantum_pickle = pickle.dumps(self.
+ task_node_pickle = pickle.dumps(self.task_node)
+ quantum_pickle = pickle.dumps(self.quantum)
  self._rcv_conn, snd_conn = multiprocessing.Pipe(False)
  logConfigState = CliLog.configState
 
```
```diff
@@ -141,12 +153,12 @@ class _Job:
  qe_pickle,
  task_node_pickle,
  quantum_pickle,
- self.
+ self.quantum_id,
  logConfigState,
  snd_conn,
-
+ fail_fast,
  ),
- name=f"task-{self.
+ name=f"task-{self.quantum.dataId}",
  )
  # mypy is getting confused by multiprocessing.
  assert self.process is not None
```
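The comment about pickling manually is the key design point in this hunk: the payload is serialized by the parent and only deserialized in the child after logging (and, in the real code, the executor state) has been set up, so nothing emitted during unpickling is lost. A self-contained sketch of that pattern, with illustrative names rather than the package's internals:

```python
# Hedged sketch of "pickle manually, unpickle late" for a spawned subprocess.
import multiprocessing
import pickle


def _child(payload_pickle: bytes) -> None:
    # Configure process-wide state (here just logging) *before* unpickling,
    # so anything emitted while reconstructing the payload is captured.
    import logging

    logging.basicConfig(level=logging.INFO)
    payload = pickle.loads(payload_pickle)
    logging.info("child received %s", payload)


if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    payload_pickle = pickle.dumps({"task": "isr", "detector": 1})
    proc = ctx.Process(target=_child, args=(payload_pickle,), name="task-demo")
    proc.start()
    proc.join()
```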
```diff
@@ -285,12 +297,12 @@ class _Job:
  # Likely due to the process killed, but there may be other reasons.
  # Exit code should not be None, this is to keep mypy happy.
  exitcode = self.process.exitcode if self.process.exitcode is not None else -1
- assert self.
+ assert self.quantum.dataId is not None, "Quantum DataId cannot be None"
  report = QuantumReport.from_exit_code(
- quantumId=self.
+ quantumId=self.quantum_id,
  exitCode=exitcode,
- dataId=self.
- taskLabel=self.
+ dataId=self.quantum.dataId,
+ taskLabel=self.task_node.label,
  )
  if self.terminated:
  # Means it was killed, assume it's due to timeout
```
```diff
@@ -319,7 +331,7 @@ class _Job:
  return msg
 
  def __str__(self) -> str:
- return f"<{self.
+ return f"<{self.task_node.label} dataId={self.quantum.dataId}>"
 
 
  class _JobList:
```
```diff
@@ -327,42 +339,55 @@ class _JobList:
 
  Parameters
  ----------
-
-
- task dependencies.
+ xgraph : `networkx.DiGraph`
+ Directed acyclic graph of quantum IDs.
  """
 
- def __init__(self,
- self.jobs =
-
-
-
-
-
+ def __init__(self, xgraph: networkx.DiGraph):
+ self.jobs = {
+ quantum_id: _Job(
+ quantum_id=quantum_id,
+ quantum=xgraph.nodes[quantum_id]["quantum"],
+ task_node=xgraph.nodes[quantum_id]["pipeline_node"],
+ )
+ for quantum_id in xgraph
+ }
+ self.walker: GraphWalker[uuid.UUID] = GraphWalker(xgraph.copy())
+ self.pending = set(next(self.walker, ()))
+ self.running: set[uuid.UUID] = set()
+ self.finished: set[uuid.UUID] = set()
+ self.failed: set[uuid.UUID] = set()
+ self.timed_out: set[uuid.UUID] = set()
 
  def submit(
  self,
- job: _Job,
  quantumExecutor: QuantumExecutor,
  startMethod: Literal["spawn"] | Literal["forkserver"],
-
-
+ fail_fast: bool = False,
+ ) -> _Job:
+ """Submit a pending job for execution.
 
  Parameters
  ----------
- job : `_Job`
- Job to submit.
  quantumExecutor : `QuantumExecutor`
  Executor for single quantum.
  startMethod : `str`, optional
  Start method from `multiprocessing` module.
+ fail_fast : `bool`, optional
+ If `True` then kill subprocess on RepeatableQuantumError.
+
+ Returns
+ -------
+ job : `_Job`
+ The job that was submitted.
  """
-
- self.
- job.start(quantumExecutor, startMethod)
- self.running.
+ quantum_id = self.pending.pop()
+ job = self.jobs[quantum_id]
+ job.start(quantumExecutor, startMethod, fail_fast=fail_fast)
+ self.running.add(job.quantum_id)
+ return job
 
- def setJobState(self, job: _Job, state: JobState) ->
+ def setJobState(self, job: _Job, state: JobState) -> list[_Job]:
  """Update job state.
 
  Parameters
```
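`_JobList` now drives scheduling through `GraphWalker`, whose API can only be inferred from this diff: `next(walker, ())` yields the currently ready quanta, `finish()` unlocks successors, and `fail()` returns everything downstream of a failure. A rough, hypothetical stand-in with the same behaviour on a plain `networkx.DiGraph`:

```python
# Hypothetical ReadySetWalker: a stand-in for GraphWalker, inferred from usage
# in this diff, not the real lsst.pipe.base.graph_walker implementation.
import networkx


class ReadySetWalker:
    def __init__(self, xgraph: networkx.DiGraph) -> None:
        self._xgraph = xgraph
        # Count unfinished dependencies for every node.
        self._remaining_deps = {n: xgraph.in_degree(n) for n in xgraph}
        self._ready = [n for n, deg in self._remaining_deps.items() if deg == 0]

    def __iter__(self):
        return self

    def __next__(self):
        # Hand out the current batch of unblocked nodes, then wait for
        # finish() calls to refill it.
        if not self._ready:
            raise StopIteration
        batch, self._ready = self._ready, []
        return batch

    def finish(self, node) -> None:
        # A finished node unblocks successors whose other dependencies are done.
        for succ in self._xgraph.successors(node):
            self._remaining_deps[succ] -= 1
            if self._remaining_deps[succ] == 0:
                self._ready.append(succ)

    def fail(self, node):
        # Everything downstream of a failure can never run.
        return networkx.descendants(self._xgraph, node)
```

With a walker like this, `pending = set(next(walker, ()))` seeds the ready set and each `finish()` call releases newly unblocked quanta, which matches how `submit()` and `setJobState()` use it above.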
```diff
@@ -370,36 +395,49 @@ class _JobList:
  job : `_Job`
  Job to submit.
  state : `JobState`
- New job state
-
+ New job state; note that only the FINISHED, FAILED, and TIMED_OUT
+ states are acceptable.
+
+ Returns
+ -------
+ blocked : `list` [ `_Job` ]
+ Additional jobs that have been marked as failed because this job
+ was upstream of them and failed or timed out.
  """
- allowedStates = (JobState.FINISHED, JobState.FAILED, JobState.TIMED_OUT
+ allowedStates = (JobState.FINISHED, JobState.FAILED, JobState.TIMED_OUT)
  assert state in allowedStates, f"State {state} not allowed here"
 
  # remove job from pending/running lists
  if job.state == JobState.PENDING:
- self.pending.remove(job)
+ self.pending.remove(job.quantum_id)
  elif job.state == JobState.RUNNING:
- self.running.remove(job)
+ self.running.remove(job.quantum_id)
 
-
+ quantum_id = job.quantum_id
  # it should not be in any of these, but just in case
- self.
- self.
- self.
-
+ self.finished.discard(quantum_id)
+ self.failed.discard(quantum_id)
+ self.timed_out.discard(quantum_id)
  job._state = state
-
-
-
-
-
-
-
-
-
-
-
+ match job.state:
+ case JobState.FINISHED:
+ self.finished.add(quantum_id)
+ self.walker.finish(quantum_id)
+ self.pending.update(next(self.walker, ()))
+ return []
+ case JobState.FAILED:
+ self.failed.add(quantum_id)
+ case JobState.TIMED_OUT:
+ self.failed.add(quantum_id)
+ self.timed_out.add(quantum_id)
+ case _:
+ raise ValueError(f"Unexpected state value: {state}")
+ blocked: list[_Job] = []
+ for downstream_quantum_id in self.walker.fail(quantum_id):
+ self.failed.add(downstream_quantum_id)
+ blocked.append(self.jobs[downstream_quantum_id])
+ self.jobs[downstream_quantum_id]._state = JobState.FAILED_DEP
+ return blocked
 
  def cleanup(self) -> None:
  """Do periodic cleanup for jobs that did not finish correctly.
```
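The new `setJobState()` return value makes upstream failures explicit: a failed or timed-out quantum blocks everything downstream of it, and those jobs come back marked `FAILED_DEP`. The effect is the same as taking the descendants of the failed node in the DAG; a hedged illustration with plain networkx (the real code routes this through `GraphWalker.fail`):

```python
# Hedged illustration of downstream-failure propagation, not package code.
import uuid

import networkx

g = networkx.DiGraph()
a, b, c = uuid.uuid4(), uuid.uuid4(), uuid.uuid4()
g.add_edges_from([(a, b), (b, c)])  # a -> b -> c

failed = a
blocked = networkx.descendants(g, failed)  # {b, c}: report these as skipped
print(f"{len(blocked)} downstream quanta blocked by upstream failure")
```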
```diff
@@ -408,8 +446,10 @@ class _JobList:
  cleanup will not work for them. Here we check all timed out jobs
  periodically and do cleanup if they managed to die by this time.
  """
- for
-
+ for quantum_id in self.timed_out:
+ job = self.jobs[quantum_id]
+ assert job.state == JobState.TIMED_OUT, "Job state should be consistent with the set it's in."
+ if job.process is not None:
  job.cleanup()
 
 
```
```diff
@@ -475,31 +515,43 @@ class MPGraphExecutor(QuantumGraphExecutor):
  start_method = "spawn"
  self._start_method = start_method
 
- def execute(self, graph: QuantumGraph) -> None:
+ def execute(self, graph: QuantumGraph | PredictedQuantumGraph) -> None:
  # Docstring inherited from QuantumGraphExecutor.execute
-
-
+ old_graph: QuantumGraph | None = None
+ if isinstance(graph, QuantumGraph):
+ old_graph = graph
+ new_graph = PredictedQuantumGraph.from_old_quantum_graph(old_graph)
+ else:
+ new_graph = graph
+ xgraph = self._make_xgraph(new_graph, old_graph)
+ self._report = Report(qgraphSummary=new_graph._make_summary())
  try:
  if self._num_proc > 1:
- self.
+ self._execute_quanta_mp(xgraph, self._report)
  else:
- self.
+ self._execute_quanta_in_process(xgraph, self._report)
  except Exception as exc:
  self._report.set_exception(exc)
  raise
 
- def
-
+ def _make_xgraph(
+ self, new_graph: PredictedQuantumGraph, old_graph: QuantumGraph | None
+ ) -> networkx.DiGraph:
+ """Obtain a networkx DAG from a quantum graph, applying any fixup and
+ adding `lsst.daf.butler.Quantum` and `~.pipeline_graph.TaskNode`
+ attributes.
 
  Parameters
  ----------
-
-
+ new_graph : `.quantum_graph.PredictedQuantumGraph`
+ New quantum graph object.
+ old_graph : `.QuantumGraph` or `None`
+ Equivalent old quantum graph object.
 
  Returns
  -------
-
-
+ xgraph : `networkx.DiGraph`
+ NetworkX DAG with quantum IDs as node keys.
 
  Raises
  ------
```
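`execute()` now accepts either graph type and normalizes to the new one, keeping the old object around only for backwards-compatible code paths. A hedged sketch of that dual-input handling with placeholder classes (not the real lsst.pipe.base types):

```python
# Placeholder OldGraph/NewGraph classes stand in for QuantumGraph and
# PredictedQuantumGraph; from_old() is a stand-in for from_old_quantum_graph().
from __future__ import annotations


class OldGraph:
    pass


class NewGraph:
    @classmethod
    def from_old(cls, old: OldGraph) -> NewGraph:
        return cls()


def execute(graph: OldGraph | NewGraph) -> None:
    old_graph: OldGraph | None = None
    if isinstance(graph, OldGraph):
        old_graph = graph
        new_graph = NewGraph.from_old(old_graph)
    else:
        new_graph = graph
    # ... build the execution DAG from new_graph; old_graph is only needed if
    # a legacy fixup has to run against the old representation ...
```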
```diff
@@ -507,147 +559,171 @@ class MPGraphExecutor(QuantumGraphExecutor):
  Raised if execution graph cannot be ordered after modification,
  i.e. it has dependency cycles.
  """
-
-
-
-
-
-
-
-
-
-
-
-
-
+ new_graph.build_execution_quanta()
+ xgraph = new_graph.quantum_only_xgraph.copy()
+ if self._execution_graph_fixup:
+ try:
+ self._execution_graph_fixup.fixup_graph(xgraph, new_graph.quanta_by_task)
+ except NotImplementedError:
+ # Backwards compatibility.
+ if old_graph is None:
+ old_graph = new_graph.to_old_quantum_graph()
+ old_graph = self._execution_graph_fixup.fixupQuanta(old_graph)
+ # Adding all of the edges from old_graph is overkill, but the
+ # only option we really have to make sure we add any new ones.
+ xgraph.update([(a.nodeId, b.nodeId) for a, b in old_graph.graph.edges])
+ if networkx.dag.has_cycle(xgraph):
+ raise MPGraphExecutorError("Updated execution graph has dependency cycle.")
+ return xgraph
+
+ def _execute_quanta_in_process(self, xgraph: networkx.DiGraph, report: Report) -> None:
  """Execute all Quanta in current process.
 
  Parameters
  ----------
-
-
+ xgraph : `networkx.DiGraph`
+ DAG to execute. Should have quantum IDs for nodes and ``quantum``
+ (`lsst.daf.butler.Quantum`) and ``pipeline_node``
+ (`lsst.pipe.base.pipeline_graph.TaskNode`) attributes in addition
+ to those provided by
+ `.quantum_graph.PredictedQuantumGraph.quantum_only_xgraph`.
  report : `Report`
  Object for reporting execution status.
  """
- successCount, totalCount = 0, len(graph)
- failedNodes: set[QuantumNode] = set()
- for qnode in graph:
- assert qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
- task_node = qnode.task_node
-
- # Any failed inputs mean that the quantum has to be skipped.
- inputNodes = graph.determineInputsToQuantumNode(qnode)
- if inputNodes & failedNodes:
- _LOG.error(
- "Upstream job failed for task <%s dataId=%s>, skipping this task.",
- task_node.label,
- qnode.quantum.dataId,
- )
- failedNodes.add(qnode)
- failed_quantum_report = QuantumReport(
- quantumId=qnode.nodeId,
- status=ExecutionStatus.SKIPPED,
- dataId=qnode.quantum.dataId,
- taskLabel=task_node.label,
- )
- report.quantaReports.append(failed_quantum_report)
- continue
 
-
-
-
-
-
-
+ def tiebreaker_sort_key(quantum_id: uuid.UUID) -> tuple:
+ node_state = xgraph.nodes[quantum_id]
+ return (node_state["task_label"],) + node_state["data_id"].required_values
+
+ success_count, failed_count, total_count = 0, 0, len(xgraph.nodes)
+ walker = GraphWalker[uuid.UUID](xgraph.copy())
+ for unblocked_quanta in walker:
+ for quantum_id in sorted(unblocked_quanta, key=tiebreaker_sort_key):
+ node_state: PredictedQuantumInfo = xgraph.nodes[quantum_id]
+ data_id = node_state["data_id"]
+ task_node = node_state["pipeline_node"]
+ quantum = node_state["quantum"]
+
+ _LOG.debug("Executing %s (%s@%s)", quantum_id, task_node.label, data_id)
+ fail_exit_code: int | None = None
  try:
-
-
-
-
-
-
- except RepeatableQuantumError as exc:
- if self._fail_fast:
- _LOG.warning(
- "Caught repeatable quantum error for %s (%s):",
- task_node.label,
- qnode.quantum.dataId,
+ # For some exception types we want to exit immediately with
+ # exception-specific exit code, but we still want to start
+ # debugger before exiting if debugging is enabled.
+ try:
+ _, quantum_report = self._quantum_executor.execute(
+ task_node, quantum, quantum_id=quantum_id
  )
-
+ if quantum_report:
+ report.quantaReports.append(quantum_report)
+ success_count += 1
+ walker.finish(quantum_id)
+ except RepeatableQuantumError as exc:
+ if self._fail_fast:
+ _LOG.warning(
+ "Caught repeatable quantum error for %s (%s@%s):",
+ quantum_id,
+ task_node.label,
+ data_id,
+ )
+ _LOG.warning(exc, exc_info=True)
+ fail_exit_code = exc.EXIT_CODE
+ raise
+ except InvalidQuantumError as exc:
+ _LOG.fatal(
+ "Invalid quantum error for %s (%s@%s):", quantum_id, task_node.label, data_id
+ )
+ _LOG.fatal(exc, exc_info=True)
  fail_exit_code = exc.EXIT_CODE
-
- except
-
-
-
-
- except Exception as exc:
- quantum_report = QuantumReport.from_exception(
- quantumId=qnode.nodeId,
- exception=exc,
- dataId=qnode.quantum.dataId,
- taskLabel=task_node.label,
- )
- report.quantaReports.append(quantum_report)
-
- if self._pdb and sys.stdin.isatty() and sys.stdout.isatty():
- _LOG.error(
- "Task <%s dataId=%s> failed; dropping into pdb.",
- task_node.label,
- qnode.quantum.dataId,
- exc_info=exc,
+ raise
+ except Exception as exc:
+ quantum_report = QuantumReport.from_exception(
+ exception=exc,
+ dataId=data_id,
+ taskLabel=task_node.label,
  )
-
-
-
-
-
-
-
+ report.quantaReports.append(quantum_report)
+
+ if self._pdb and sys.stdin.isatty() and sys.stdout.isatty():
+ _LOG.error(
+ "%s (%s@%s) failed; dropping into pdb.",
+ quantum_id,
+ task_node.label,
+ data_id,
+ exc_info=exc,
+ )
+ try:
+ pdb = importlib.import_module(self._pdb)
+ except ImportError as imp_exc:
+ raise MPGraphExecutorError(
+ f"Unable to import specified debugger module ({self._pdb}): {imp_exc}"
+ ) from exc
+ if not hasattr(pdb, "post_mortem"):
+ raise MPGraphExecutorError(
+ f"Specified debugger module ({self._pdb}) can't debug with post_mortem",
+ ) from exc
+ pdb.post_mortem(exc.__traceback__)
+
+ report.status = ExecutionStatus.FAILURE
+ failed_count += 1
+
+ # If exception specified an exit code then just exit with
+ # that code, otherwise crash if fail-fast option is
+ # enabled.
+ if fail_exit_code is not None:
+ sys.exit(fail_exit_code)
+ if self._fail_fast:
  raise MPGraphExecutorError(
- f"
+ f"Quantum {quantum_id} ({task_node.label}@{data_id}) failed."
  ) from exc
-
-
-
-
-
-
-
-
- if self._fail_fast:
- raise MPGraphExecutorError(
- f"Task <{task_node.label} dataId={qnode.quantum.dataId}> failed."
- ) from exc
- else:
- # Note that there could be exception safety issues, which
- # we presently ignore.
- _LOG.error(
- "Task <%s dataId=%s> failed; processing will continue for remaining tasks.",
- task_node.label,
- qnode.quantum.dataId,
- exc_info=exc,
- )
+ else:
+ _LOG.error(
+ "%s (%s@%s) failed; processing will continue for remaining tasks.",
+ quantum_id,
+ task_node.label,
+ data_id,
+ exc_info=exc,
+ )
 
-
-
-
-
-
-
-
+ for downstream_quantum_id in walker.fail(quantum_id):
+ downstream_node_state = xgraph.nodes[downstream_quantum_id]
+ failed_quantum_report = QuantumReport(
+ status=ExecutionStatus.SKIPPED,
+ dataId=downstream_node_state["data_id"],
+ taskLabel=downstream_node_state["task_label"],
+ )
+ report.quantaReports.append(failed_quantum_report)
+ _LOG.error(
+ "Upstream job failed for task %s (%s@%s), skipping this quantum.",
+ downstream_quantum_id,
+ downstream_node_state["task_label"],
+ downstream_node_state["data_id"],
+ )
+ failed_count += 1
+
+ _LOG.info(
+ "Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
+ success_count,
+ failed_count,
+ total_count - success_count - failed_count,
+ total_count,
+ )
 
  # Raise an exception if there were any failures.
- if
+ if failed_count:
  raise MPGraphExecutorError("One or more tasks failed during execution.")
 
- def
+ def _execute_quanta_mp(self, xgraph: networkx.DiGraph, report: Report) -> None:
  """Execute all Quanta in separate processes.
 
  Parameters
  ----------
-
-
+ xgraph : `networkx.DiGraph`
+ DAG to execute. Should have quantum IDs for nodes and ``quantum``
+ (`lsst.daf.butler.Quantum`) and ``task_node``
+ (`lsst.pipe.base.pipeline_graph.TaskNode`) attributes in addition
+ to those provided by
+ `.quantum_graph.PredictedQuantumGraph.quantum_only_xgraph`.
  report : `Report`
  Object for reporting execution status.
  """
```
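The in-process loop above uses a two-level `try` so that exceptions carrying their own exit code (`RepeatableQuantumError`, `InvalidQuantumError`) can both be reported and, when requested, turn into an immediate `sys.exit`, while still giving an interactive debugger a chance first. A stripped-down, hypothetical sketch of that control flow, with a placeholder exception class:

```python
# Hedged sketch of the nested error handling; RetryableError stands in for the
# exit-code-carrying exceptions, and run_one for one quantum's execution.
import importlib
import sys


class RetryableError(Exception):
    EXIT_CODE = 2


def run_one(work, *, fail_fast: bool, pdb_module: str | None = None) -> None:
    fail_exit_code: int | None = None
    try:
        try:
            work()
        except RetryableError as exc:
            if fail_fast:
                fail_exit_code = exc.EXIT_CODE
            raise
    except Exception as exc:
        # Optionally drop into a debugger loaded by module name.
        if pdb_module and sys.stdin.isatty() and sys.stdout.isatty():
            debugger = importlib.import_module(pdb_module)
            debugger.post_mortem(exc.__traceback__)
        # Exit with the exception-specific code, or re-raise under fail-fast.
        if fail_exit_code is not None:
            sys.exit(fail_exit_code)
        if fail_fast:
            raise
```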
```diff
@@ -656,14 +732,13 @@ class MPGraphExecutor(QuantumGraphExecutor):
  _LOG.debug("Using %r for multiprocessing start method", self._start_method)
 
  # re-pack input quantum data into jobs list
- jobs = _JobList(
+ jobs = _JobList(xgraph)
 
  # check that all tasks can run in sub-process
- for job in jobs.jobs:
-
- if not task_node.task_class.canMultiprocess:
+ for job in jobs.jobs.values():
+ if not job.task_node.task_class.canMultiprocess:
  raise MPGraphExecutorError(
- f"Task {task_node.label!r} does not support multiprocessing; use single process"
+ f"Task {job.task_node.label!r} does not support multiprocessing; use single process"
  )
 
  finishedCount, failedCount = 0, 0
```
```diff
@@ -672,8 +747,10 @@
  _LOG.debug("#runningJobs: %s", len(jobs.running))
 
  # See if any jobs have finished
- for
+ for quantum_id in list(jobs.running):  # iterate over a copy so we can remove.
+ job = jobs.jobs[quantum_id]
  assert job.process is not None, "Process cannot be None"
+ blocked: list[_Job] = []
  if not job.process.is_alive():
  _LOG.debug("finished: %s", job)
  # finished
```
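The multiprocessing path polls rather than blocks: each pass walks a copy of the running set, checks `is_alive()`, and reaps finished children. A minimal, generic version of that loop (not the package's code):

```python
# Generic sketch of polling child processes; names are illustrative only.
import multiprocessing
import time


def wait_for_jobs(running: dict[str, multiprocessing.Process]) -> None:
    while running:
        for name, proc in list(running.items()):  # copy: we may remove entries
            if not proc.is_alive():
                print(f"{name} exited with code {proc.exitcode}")
                del running[name]
        time.sleep(0.1)
```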
```diff
@@ -691,20 +768,21 @@
  # Do not override global FAILURE status
  report.status = ExecutionStatus.TIMEOUT
  message = f"Timeout ({self._timeout} sec) for task {job}, task is killed"
- jobs.setJobState(job, JobState.TIMED_OUT)
+ blocked = jobs.setJobState(job, JobState.TIMED_OUT)
  else:
  report.status = ExecutionStatus.FAILURE
  # failMessage() has to be called before cleanup()
  message = job.failMessage()
- jobs.setJobState(job, JobState.FAILED)
+ blocked = jobs.setJobState(job, JobState.FAILED)
 
  job.cleanup()
  _LOG.debug("failed: %s", job)
  if self._fail_fast or exitcode == InvalidQuantumError.EXIT_CODE:
  # stop all running jobs
- for
-
-
+ for stop_quantum_id in jobs.running:
+ stop_job = jobs.jobs[stop_quantum_id]
+ if stop_job is not job:
+ stop_job.stop()
  if job.state is JobState.TIMED_OUT:
  raise MPTimeoutError(f"Timeout ({self._timeout} sec) for task {job}.")
  else:
```
```diff
@@ -722,42 +800,26 @@
  _LOG.debug("Terminating job %s due to timeout", job)
  job.stop()
 
-
-
-
-
-
-
-
-
-
- status=ExecutionStatus.SKIPPED,
- dataId=job.qnode.quantum.dataId,
- taskLabel=job.qnode.task_node.label,
- )
- report.quantaReports.append(quantum_report)
- jobs.setJobState(job, JobState.FAILED_DEP)
- _LOG.error("Upstream job failed for task %s, skipping this task.", job)
+ for downstream_job in blocked:
+ quantum_report = QuantumReport(
+ quantumId=downstream_job.quantum_id,
+ status=ExecutionStatus.SKIPPED,
+ dataId=cast(DataCoordinate, downstream_job.quantum.dataId),
+ taskLabel=downstream_job.task_node.label,
+ )
+ report.quantaReports.append(quantum_report)
+ _LOG.error("Upstream job failed for task %s, skipping this task.", downstream_job)
 
  # see if we can start more jobs
-
-
-
- if jobInputNodes <= jobs.finishedNodes:
- # all dependencies have completed, can start new job
- if len(jobs.running) < self._num_proc:
- _LOG.debug("Submitting %s", job)
- jobs.submit(job, self._quantum_executor, self._start_method)
- if len(jobs.running) >= self._num_proc:
- # Cannot start any more jobs, wait until something
- # finishes.
- break
+ while len(jobs.running) < self._num_proc and jobs.pending:
+ job = jobs.submit(self._quantum_executor, self._start_method)
+ _LOG.debug("Submitted %s", job)
 
  # Do cleanup for timed out jobs if necessary.
  jobs.cleanup()
 
  # Print progress message if something changed.
- newFinished, newFailed = len(jobs.
+ newFinished, newFailed = len(jobs.finished), len(jobs.failed)
  if (finishedCount, failedCount) != (newFinished, newFailed):
  finishedCount, failedCount = newFinished, newFailed
  totalCount = len(jobs.jobs)
```
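The old code rescanned every pending job and compared its input nodes against the finished set; the new loop simply drains the ready set while there is process capacity, because `GraphWalker` only ever exposes quanta whose dependencies are complete. Roughly, under hypothetical names for the `_JobList` bookkeeping:

```python
# Hedged sketch of capacity-limited submission from a ready set.
from collections.abc import Callable


def fill_worker_slots(
    pending: set[str], running: set[str], num_proc: int, start_job: Callable[[str], None]
) -> None:
    while len(running) < num_proc and pending:
        job_id = pending.pop()   # only dependency-free work is ever in `pending`
        start_job(job_id)
        running.add(job_id)
```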
```diff
@@ -775,20 +837,20 @@
  if jobs.running:
  time.sleep(0.1)
 
- if jobs.
+ if jobs.failed:
  # print list of failed jobs
  _LOG.error("Failed jobs:")
- for
-
-
+ for quantum_id in jobs.failed:
+ job = jobs.jobs[quantum_id]
+ _LOG.error(" - %s: %s", job.state.name, job)
 
  # if any job failed raise an exception
- if jobs.
+ if jobs.failed == jobs.timed_out:
  raise MPTimeoutError("One or more tasks timed out during execution.")
  else:
  raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")
 
- def getReport(self) -> Report
+ def getReport(self) -> Report:
  # Docstring inherited from base class
  if self._report is None:
  raise RuntimeError("getReport() called before execute()")
```