lsst-pipe-base 29.2025.3900__py3-none-any.whl → 29.2025.4100__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. lsst/pipe/base/_task_metadata.py +15 -0
  2. lsst/pipe/base/dot_tools.py +14 -152
  3. lsst/pipe/base/exec_fixup_data_id.py +17 -44
  4. lsst/pipe/base/execution_graph_fixup.py +49 -18
  5. lsst/pipe/base/graph/_versionDeserializers.py +6 -5
  6. lsst/pipe/base/graph/graph.py +30 -10
  7. lsst/pipe/base/graph/graphSummary.py +30 -0
  8. lsst/pipe/base/graph_walker.py +119 -0
  9. lsst/pipe/base/log_capture.py +5 -2
  10. lsst/pipe/base/mermaid_tools.py +11 -64
  11. lsst/pipe/base/mp_graph_executor.py +298 -236
  12. lsst/pipe/base/pipeline_graph/io.py +1 -1
  13. lsst/pipe/base/quantum_graph/__init__.py +32 -0
  14. lsst/pipe/base/quantum_graph/_common.py +632 -0
  15. lsst/pipe/base/quantum_graph/_multiblock.py +808 -0
  16. lsst/pipe/base/quantum_graph/_predicted.py +1950 -0
  17. lsst/pipe/base/quantum_graph/visualization.py +302 -0
  18. lsst/pipe/base/quantum_graph_builder.py +292 -34
  19. lsst/pipe/base/quantum_graph_executor.py +2 -1
  20. lsst/pipe/base/quantum_provenance_graph.py +16 -7
  21. lsst/pipe/base/quantum_reports.py +45 -0
  22. lsst/pipe/base/separable_pipeline_executor.py +126 -15
  23. lsst/pipe/base/simple_pipeline_executor.py +44 -43
  24. lsst/pipe/base/single_quantum_executor.py +1 -40
  25. lsst/pipe/base/tests/mocks/__init__.py +1 -1
  26. lsst/pipe/base/tests/mocks/_pipeline_task.py +16 -1
  27. lsst/pipe/base/tests/mocks/{_in_memory_repo.py → _repo.py} +324 -45
  28. lsst/pipe/base/tests/mocks/_storage_class.py +51 -0
  29. lsst/pipe/base/tests/simpleQGraph.py +11 -5
  30. lsst/pipe/base/version.py +1 -1
  31. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/METADATA +2 -1
  32. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/RECORD +40 -34
  33. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/WHEEL +0 -0
  34. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/entry_points.txt +0 -0
  35. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/COPYRIGHT +0 -0
  36. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/LICENSE +0 -0
  37. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/bsd_license.txt +0 -0
  38. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/licenses/gpl-v3.0.txt +0 -0
  39. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/top_level.txt +0 -0
  40. {lsst_pipe_base-29.2025.3900.dist-info → lsst_pipe_base-29.2025.4100.dist-info}/zip-safe +0 -0
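
All of the hunks that follow are from item 11, lsst/pipe/base/mp_graph_executor.py (+298 −236); the other files' diffs are not expanded in this view.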
@@ -39,16 +39,20 @@ import sys
  import threading
  import time
  import uuid
- from collections.abc import Iterable
- from typing import Literal
+ from typing import Literal, cast

+ import networkx
+
+ from lsst.daf.butler import DataCoordinate, Quantum
  from lsst.daf.butler.cli.cliLog import CliLog
  from lsst.utils.threads import disable_implicit_threading

  from ._status import InvalidQuantumError, RepeatableQuantumError
  from .execution_graph_fixup import ExecutionGraphFixup
- from .graph import QuantumGraph, QuantumNode
+ from .graph import QuantumGraph
+ from .graph_walker import GraphWalker
  from .pipeline_graph import TaskNode
+ from .quantum_graph import PredictedQuantumGraph, PredictedQuantumInfo
  from .quantum_graph_executor import QuantumExecutor, QuantumGraphExecutor
  from .quantum_reports import ExecutionStatus, QuantumReport, Report

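Two of the new imports carry most of this refactor: GraphWalker (item 8, the new lsst/pipe/base/graph_walker.py) and PredictedQuantumGraph (the new quantum_graph subpackage, items 13–17). Scheduling moves from QuantumNode objects to a networkx DiGraph keyed by quantum UUID, walked in dependency order. GraphWalker's implementation is not part of this view; the sketch below reproduces only the behavior its call sites in this file rely on — iteration yields batches of ready nodes, finish() unblocks successors, fail() prunes a node's descendants and returns them — and is an assumption, not the real class.

    # Minimal GraphWalker-like iterator, inferred from the call sites in this
    # file (next(walker, ()), walker.finish(id), walker.fail(id)); the real
    # lsst.pipe.base.graph_walker.GraphWalker is new in this release and its
    # body is not shown in this diff.
    import networkx


    class ToyGraphWalker:
        """Yield batches of DAG nodes whose predecessors have all finished."""

        def __init__(self, xgraph: networkx.DiGraph):
            self._xgraph = xgraph
            self._remaining = dict(xgraph.in_degree())  # node -> unfinished predecessor count
            self._ready = {n for n, d in self._remaining.items() if d == 0}

        def __iter__(self):
            return self

        def __next__(self):
            if not self._remaining:
                raise StopIteration
            batch, self._ready = self._ready, set()
            return batch  # may be empty while already-yielded nodes are still in flight

        def finish(self, node):
            # Mark ``node`` done; successors whose predecessors are now all
            # finished join the next yielded batch.
            del self._remaining[node]
            for succ in self._xgraph.successors(node):
                if succ in self._remaining:
                    self._remaining[succ] -= 1
                    if self._remaining[succ] == 0:
                        self._ready.add(succ)

        def fail(self, node):
            # Drop ``node`` and everything downstream; return the descendants
            # so the caller can report them as skipped.
            descendants = networkx.descendants(self._xgraph, node)
            for blocked in descendants | {node}:
                self._remaining.pop(blocked, None)
                self._ready.discard(blocked)
            return descendants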
@@ -82,13 +86,18 @@ class _Job:

      Parameters
      ----------
-     qnode: `QuantumNode`
-         Quantum and some associated information.
+     quantum_id : `uuid.UUID`
+         ID of the quantum this job executes.
+     quantum : `lsst.daf.butler.Quantum`
+         Description of the inputs and outputs.
+     task_node : `.pipeline_graph.TaskNode`
+         Description of the task and configuration.
      """

-     def __init__(self, qnode: QuantumNode, fail_fast: bool = False):
-         self.qnode = qnode
-         self._fail_fast = fail_fast
+     def __init__(self, quantum_id: uuid.UUID, quantum: Quantum, task_node: TaskNode):
+         self.quantum_id = quantum_id
+         self.quantum = quantum
+         self.task_node = task_node
          self.process: multiprocessing.process.BaseProcess | None = None
          self._state = JobState.PENDING
          self.started: float = 0.0
@@ -115,6 +124,7 @@ class _Job:
          self,
          quantumExecutor: QuantumExecutor,
          startMethod: Literal["spawn"] | Literal["forkserver"],
+         fail_fast: bool,
      ) -> None:
          """Start process which runs the task.

@@ -124,13 +134,15 @@ class _Job:
              Executor for single quantum.
          startMethod : `str`, optional
              Start method from `multiprocessing` module.
+         fail_fast : `bool`, optional
+             If `True` then kill subprocess on RepeatableQuantumError.
          """
          # Unpickling of quantum has to happen after butler/executor, also we
          # want to setup logging before unpickling anything that can generate
          # messages, this is why things are pickled manually here.
          qe_pickle = pickle.dumps(quantumExecutor)
-         task_node_pickle = pickle.dumps(self.qnode.task_node)
-         quantum_pickle = pickle.dumps(self.qnode.quantum)
+         task_node_pickle = pickle.dumps(self.task_node)
+         quantum_pickle = pickle.dumps(self.quantum)
          self._rcv_conn, snd_conn = multiprocessing.Pipe(False)
          logConfigState = CliLog.configState

@@ -141,12 +153,12 @@ class _Job:
                  qe_pickle,
                  task_node_pickle,
                  quantum_pickle,
-                 self.qnode.nodeId,
+                 self.quantum_id,
                  logConfigState,
                  snd_conn,
-                 self._fail_fast,
+                 fail_fast,
              ),
-             name=f"task-{self.qnode.quantum.dataId}",
+             name=f"task-{self.quantum.dataId}",
          )
          # mypy is getting confused by multiprocessing.
          assert self.process is not None
@@ -285,12 +297,12 @@ class _Job:
          # Likely due to the process killed, but there may be other reasons.
          # Exit code should not be None, this is to keep mypy happy.
          exitcode = self.process.exitcode if self.process.exitcode is not None else -1
-         assert self.qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
+         assert self.quantum.dataId is not None, "Quantum DataId cannot be None"
          report = QuantumReport.from_exit_code(
-             quantumId=self.qnode.nodeId,
+             quantumId=self.quantum_id,
              exitCode=exitcode,
-             dataId=self.qnode.quantum.dataId,
-             taskLabel=self.qnode.task_node.label,
+             dataId=self.quantum.dataId,
+             taskLabel=self.task_node.label,
          )
          if self.terminated:
              # Means it was killed, assume it's due to timeout
@@ -319,7 +331,7 @@ class _Job:
          return msg

      def __str__(self) -> str:
-         return f"<{self.qnode.task_node.label} dataId={self.qnode.quantum.dataId}>"
+         return f"<{self.task_node.label} dataId={self.quantum.dataId}>"


  class _JobList:
@@ -327,42 +339,55 @@ class _JobList:

      Parameters
      ----------
-     iterable : ~collections.abc.Iterable` [ `QuantumNode` ]
-         Sequence of Quanta to execute. This has to be ordered according to
-         task dependencies.
+     xgraph : `networkx.DiGraph`
+         Directed acyclic graph of quantum IDs.
      """

-     def __init__(self, iterable: Iterable[QuantumNode]):
-         self.jobs = [_Job(qnode) for qnode in iterable]
-         self.pending = self.jobs[:]
-         self.running: list[_Job] = []
-         self.finishedNodes: set[QuantumNode] = set()
-         self.failedNodes: set[QuantumNode] = set()
-         self.timedOutNodes: set[QuantumNode] = set()
+     def __init__(self, xgraph: networkx.DiGraph):
+         self.jobs = {
+             quantum_id: _Job(
+                 quantum_id=quantum_id,
+                 quantum=xgraph.nodes[quantum_id]["quantum"],
+                 task_node=xgraph.nodes[quantum_id]["pipeline_node"],
+             )
+             for quantum_id in xgraph
+         }
+         self.walker: GraphWalker[uuid.UUID] = GraphWalker(xgraph.copy())
+         self.pending = set(next(self.walker, ()))
+         self.running: set[uuid.UUID] = set()
+         self.finished: set[uuid.UUID] = set()
+         self.failed: set[uuid.UUID] = set()
+         self.timed_out: set[uuid.UUID] = set()

      def submit(
          self,
-         job: _Job,
          quantumExecutor: QuantumExecutor,
          startMethod: Literal["spawn"] | Literal["forkserver"],
-     ) -> None:
-         """Submit one more job for execution.
+         fail_fast: bool = False,
+     ) -> _Job:
+         """Submit a pending job for execution.

          Parameters
          ----------
-         job : `_Job`
-             Job to submit.
          quantumExecutor : `QuantumExecutor`
              Executor for single quantum.
          startMethod : `str`, optional
              Start method from `multiprocessing` module.
+         fail_fast : `bool`, optional
+             If `True` then kill subprocess on RepeatableQuantumError.
+
+         Returns
+         -------
+         job : `_Job`
+             The job that was submitted.
          """
-         # this will raise if job is not in pending list
-         self.pending.remove(job)
-         job.start(quantumExecutor, startMethod)
-         self.running.append(job)
+         quantum_id = self.pending.pop()
+         job = self.jobs[quantum_id]
+         job.start(quantumExecutor, startMethod, fail_fast=fail_fast)
+         self.running.add(job.quantum_id)
+         return job

-     def setJobState(self, job: _Job, state: JobState) -> None:
+     def setJobState(self, job: _Job, state: JobState) -> list[_Job]:
          """Update job state.

          Parameters
@@ -370,36 +395,49 @@ class _JobList:
          job : `_Job`
              Job to submit.
          state : `JobState`
-             New job state, note that only FINISHED, FAILED, TIMED_OUT, or
-             FAILED_DEP state is acceptable.
+             New job state; note that only the FINISHED, FAILED, and TIMED_OUT
+             states are acceptable.
+
+         Returns
+         -------
+         blocked : `list` [ `_Job` ]
+             Additional jobs that have been marked as failed because this job
+             was upstream of them and failed or timed out.
          """
-         allowedStates = (JobState.FINISHED, JobState.FAILED, JobState.TIMED_OUT, JobState.FAILED_DEP)
+         allowedStates = (JobState.FINISHED, JobState.FAILED, JobState.TIMED_OUT)
          assert state in allowedStates, f"State {state} not allowed here"

          # remove job from pending/running lists
          if job.state == JobState.PENDING:
-             self.pending.remove(job)
+             self.pending.remove(job.quantum_id)
          elif job.state == JobState.RUNNING:
-             self.running.remove(job)
+             self.running.remove(job.quantum_id)

-         qnode = job.qnode
+         quantum_id = job.quantum_id
          # it should not be in any of these, but just in case
-         self.finishedNodes.discard(qnode)
-         self.failedNodes.discard(qnode)
-         self.timedOutNodes.discard(qnode)
-
+         self.finished.discard(quantum_id)
+         self.failed.discard(quantum_id)
+         self.timed_out.discard(quantum_id)
          job._state = state
-         if state == JobState.FINISHED:
-             self.finishedNodes.add(qnode)
-         elif state == JobState.FAILED:
-             self.failedNodes.add(qnode)
-         elif state == JobState.FAILED_DEP:
-             self.failedNodes.add(qnode)
-         elif state == JobState.TIMED_OUT:
-             self.failedNodes.add(qnode)
-             self.timedOutNodes.add(qnode)
-         else:
-             raise ValueError(f"Unexpected state value: {state}")
+         match job.state:
+             case JobState.FINISHED:
+                 self.finished.add(quantum_id)
+                 self.walker.finish(quantum_id)
+                 self.pending.update(next(self.walker, ()))
+                 return []
+             case JobState.FAILED:
+                 self.failed.add(quantum_id)
+             case JobState.TIMED_OUT:
+                 self.failed.add(quantum_id)
+                 self.timed_out.add(quantum_id)
+             case _:
+                 raise ValueError(f"Unexpected state value: {state}")
+         blocked: list[_Job] = []
+         for downstream_quantum_id in self.walker.fail(quantum_id):
+             self.failed.add(downstream_quantum_id)
+             blocked.append(self.jobs[downstream_quantum_id])
+             self.jobs[downstream_quantum_id]._state = JobState.FAILED_DEP
+         return blocked

      def cleanup(self) -> None:
          """Do periodic cleanup for jobs that did not finish correctly.
@@ -408,8 +446,10 @@ class _JobList:
          cleanup will not work for them. Here we check all timed out jobs
          periodically and do cleanup if they managed to die by this time.
          """
-         for job in self.jobs:
-             if job.state == JobState.TIMED_OUT and job.process is not None:
+         for quantum_id in self.timed_out:
+             job = self.jobs[quantum_id]
+             assert job.state == JobState.TIMED_OUT, "Job state should be consistent with the set it's in."
+             if job.process is not None:
                  job.cleanup()


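_JobList above now keys jobs by quantum UUID and reads two attributes, "quantum" and "pipeline_node", off each graph node. A hypothetical illustration of that node contract, with plain strings standing in for the real Quantum and TaskNode objects:

    # Hypothetical illustration of the node attributes _JobList.__init__ reads;
    # the attribute names are taken from this diff, the values are stand-ins.
    import uuid

    import networkx

    xgraph = networkx.DiGraph()
    upstream, downstream = uuid.uuid4(), uuid.uuid4()
    xgraph.add_node(upstream, quantum="<Quantum isr/visit=1>", pipeline_node="<TaskNode isr>")
    xgraph.add_node(downstream, quantum="<Quantum calibrate/visit=1>", pipeline_node="<TaskNode calibrate>")
    xgraph.add_edge(upstream, downstream)  # downstream consumes upstream's outputs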
@@ -475,31 +515,43 @@ class MPGraphExecutor(QuantumGraphExecutor):
              start_method = "spawn"
          self._start_method = start_method

-     def execute(self, graph: QuantumGraph) -> None:
+     def execute(self, graph: QuantumGraph | PredictedQuantumGraph) -> None:
          # Docstring inherited from QuantumGraphExecutor.execute
-         graph = self._fixupQuanta(graph)
-         self._report = Report(qgraphSummary=graph.getSummary())
+         old_graph: QuantumGraph | None = None
+         if isinstance(graph, QuantumGraph):
+             old_graph = graph
+             new_graph = PredictedQuantumGraph.from_old_quantum_graph(old_graph)
+         else:
+             new_graph = graph
+         xgraph = self._make_xgraph(new_graph, old_graph)
+         self._report = Report(qgraphSummary=new_graph._make_summary())
          try:
              if self._num_proc > 1:
-                 self._executeQuantaMP(graph, self._report)
+                 self._execute_quanta_mp(xgraph, self._report)
              else:
-                 self._executeQuantaInProcess(graph, self._report)
+                 self._execute_quanta_in_process(xgraph, self._report)
          except Exception as exc:
              self._report.set_exception(exc)
              raise

-     def _fixupQuanta(self, graph: QuantumGraph) -> QuantumGraph:
-         """Call fixup code to modify execution graph.
+     def _make_xgraph(
+         self, new_graph: PredictedQuantumGraph, old_graph: QuantumGraph | None
+     ) -> networkx.DiGraph:
+         """Obtain a networkx DAG from a quantum graph, applying any fixup and
+         adding `lsst.daf.butler.Quantum` and `~.pipeline_graph.TaskNode`
+         attributes.

          Parameters
          ----------
-         graph : `.QuantumGraph`
-             `.QuantumGraph` to modify.
+         new_graph : `.quantum_graph.PredictedQuantumGraph`
+             New quantum graph object.
+         old_graph : `.QuantumGraph` or `None`
+             Equivalent old quantum graph object.

          Returns
          -------
-         graph : `.QuantumGraph`
-             Modified `.QuantumGraph`.
+         xgraph : `networkx.DiGraph`
+             NetworkX DAG with quantum IDs as node keys.

          Raises
          ------
@@ -507,147 +559,171 @@ class MPGraphExecutor(QuantumGraphExecutor):
              Raised if execution graph cannot be ordered after modification,
              i.e. it has dependency cycles.
          """
-         if not self._execution_graph_fixup:
-             return graph
-
-         _LOG.debug("Call execution graph fixup method")
-         graph = self._execution_graph_fixup.fixupQuanta(graph)
-
-         # Detect if there is now a cycle created within the graph
-         if graph.findCycle():
-             raise MPGraphExecutorError("Updated execution graph has dependency cycle.")
-
-         return graph
-
-     def _executeQuantaInProcess(self, graph: QuantumGraph, report: Report) -> None:
+         new_graph.build_execution_quanta()
+         xgraph = new_graph.quantum_only_xgraph.copy()
+         if self._execution_graph_fixup:
+             try:
+                 self._execution_graph_fixup.fixup_graph(xgraph, new_graph.quanta_by_task)
+             except NotImplementedError:
+                 # Backwards compatibility.
+                 if old_graph is None:
+                     old_graph = new_graph.to_old_quantum_graph()
+                 old_graph = self._execution_graph_fixup.fixupQuanta(old_graph)
+                 # Adding all of the edges from old_graph is overkill, but the
+                 # only option we really have to make sure we add any new ones.
+                 xgraph.update([(a.nodeId, b.nodeId) for a, b in old_graph.graph.edges])
+         if networkx.dag.has_cycle(xgraph):
+             raise MPGraphExecutorError("Updated execution graph has dependency cycle.")
+         return xgraph
+
+     def _execute_quanta_in_process(self, xgraph: networkx.DiGraph, report: Report) -> None:
          """Execute all Quanta in current process.

          Parameters
          ----------
-         graph : `.QuantumGraph`
-             `.QuantumGraph` that is to be executed.
+         xgraph : `networkx.DiGraph`
+             DAG to execute. Should have quantum IDs for nodes and ``quantum``
+             (`lsst.daf.butler.Quantum`) and ``pipeline_node``
+             (`lsst.pipe.base.pipeline_graph.TaskNode`) attributes in addition
+             to those provided by
+             `.quantum_graph.PredictedQuantumGraph.quantum_only_xgraph`.
          report : `Report`
              Object for reporting execution status.
          """
-         successCount, totalCount = 0, len(graph)
-         failedNodes: set[QuantumNode] = set()
-         for qnode in graph:
-             assert qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
-             task_node = qnode.task_node
-
-             # Any failed inputs mean that the quantum has to be skipped.
-             inputNodes = graph.determineInputsToQuantumNode(qnode)
-             if inputNodes & failedNodes:
-                 _LOG.error(
-                     "Upstream job failed for task <%s dataId=%s>, skipping this task.",
-                     task_node.label,
-                     qnode.quantum.dataId,
-                 )
-                 failedNodes.add(qnode)
-                 failed_quantum_report = QuantumReport(
-                     quantumId=qnode.nodeId,
-                     status=ExecutionStatus.SKIPPED,
-                     dataId=qnode.quantum.dataId,
-                     taskLabel=task_node.label,
-                 )
-                 report.quantaReports.append(failed_quantum_report)
-                 continue

-             _LOG.debug("Executing %s", qnode)
-             fail_exit_code: int | None = None
-             try:
-                 # For some exception types we want to exit immediately with
-                 # exception-specific exit code, but we still want to start
-                 # debugger before exiting if debugging is enabled.
+         def tiebreaker_sort_key(quantum_id: uuid.UUID) -> tuple:
+             node_state = xgraph.nodes[quantum_id]
+             return (node_state["task_label"],) + node_state["data_id"].required_values
+
+         success_count, failed_count, total_count = 0, 0, len(xgraph.nodes)
+         walker = GraphWalker[uuid.UUID](xgraph.copy())
+         for unblocked_quanta in walker:
+             for quantum_id in sorted(unblocked_quanta, key=tiebreaker_sort_key):
+                 node_state: PredictedQuantumInfo = xgraph.nodes[quantum_id]
+                 data_id = node_state["data_id"]
+                 task_node = node_state["pipeline_node"]
+                 quantum = node_state["quantum"]
+
+                 _LOG.debug("Executing %s (%s@%s)", quantum_id, task_node.label, data_id)
+                 fail_exit_code: int | None = None
                  try:
-                     _, quantum_report = self._quantum_executor.execute(
-                         task_node, qnode.quantum, quantum_id=qnode.nodeId
-                     )
-                     if quantum_report:
-                         report.quantaReports.append(quantum_report)
-                     successCount += 1
-                 except RepeatableQuantumError as exc:
-                     if self._fail_fast:
-                         _LOG.warning(
-                             "Caught repeatable quantum error for %s (%s):",
-                             task_node.label,
-                             qnode.quantum.dataId,
+                     # For some exception types we want to exit immediately with
+                     # exception-specific exit code, but we still want to start
+                     # debugger before exiting if debugging is enabled.
+                     try:
+                         _, quantum_report = self._quantum_executor.execute(
+                             task_node, quantum, quantum_id=quantum_id
                          )
-                         _LOG.warning(exc, exc_info=True)
+                         if quantum_report:
+                             report.quantaReports.append(quantum_report)
+                         success_count += 1
+                         walker.finish(quantum_id)
+                     except RepeatableQuantumError as exc:
+                         if self._fail_fast:
+                             _LOG.warning(
+                                 "Caught repeatable quantum error for %s (%s@%s):",
+                                 quantum_id,
+                                 task_node.label,
+                                 data_id,
+                             )
+                             _LOG.warning(exc, exc_info=True)
+                             fail_exit_code = exc.EXIT_CODE
+                         raise
+                     except InvalidQuantumError as exc:
+                         _LOG.fatal(
+                             "Invalid quantum error for %s (%s@%s):", quantum_id, task_node.label, data_id
+                         )
+                         _LOG.fatal(exc, exc_info=True)
                          fail_exit_code = exc.EXIT_CODE
-                     raise
-                 except InvalidQuantumError as exc:
-                     _LOG.fatal("Invalid quantum error for %s (%s):", task_node.label, qnode.quantum.dataId)
-                     _LOG.fatal(exc, exc_info=True)
-                     fail_exit_code = exc.EXIT_CODE
-                     raise
-                 except Exception as exc:
-                     quantum_report = QuantumReport.from_exception(
-                         quantumId=qnode.nodeId,
-                         exception=exc,
-                         dataId=qnode.quantum.dataId,
-                         taskLabel=task_node.label,
-                     )
-                     report.quantaReports.append(quantum_report)
-
-                     if self._pdb and sys.stdin.isatty() and sys.stdout.isatty():
-                         _LOG.error(
-                             "Task <%s dataId=%s> failed; dropping into pdb.",
-                             task_node.label,
-                             qnode.quantum.dataId,
-                             exc_info=exc,
+                         raise
+                 except Exception as exc:
+                     quantum_report = QuantumReport.from_exception(
+                         exception=exc,
+                         dataId=data_id,
+                         taskLabel=task_node.label,
                      )
-                     try:
-                         pdb = importlib.import_module(self._pdb)
-                     except ImportError as imp_exc:
-                         raise MPGraphExecutorError(
-                             f"Unable to import specified debugger module ({self._pdb}): {imp_exc}"
-                         ) from exc
-                     if not hasattr(pdb, "post_mortem"):
+                     report.quantaReports.append(quantum_report)
+
+                     if self._pdb and sys.stdin.isatty() and sys.stdout.isatty():
+                         _LOG.error(
+                             "%s (%s@%s) failed; dropping into pdb.",
+                             quantum_id,
+                             task_node.label,
+                             data_id,
+                             exc_info=exc,
+                         )
+                         try:
+                             pdb = importlib.import_module(self._pdb)
+                         except ImportError as imp_exc:
+                             raise MPGraphExecutorError(
+                                 f"Unable to import specified debugger module ({self._pdb}): {imp_exc}"
+                             ) from exc
+                         if not hasattr(pdb, "post_mortem"):
+                             raise MPGraphExecutorError(
+                                 f"Specified debugger module ({self._pdb}) can't debug with post_mortem",
+                             ) from exc
+                         pdb.post_mortem(exc.__traceback__)
+
+                     report.status = ExecutionStatus.FAILURE
+                     failed_count += 1
+
+                     # If exception specified an exit code then just exit with
+                     # that code, otherwise crash if fail-fast option is
+                     # enabled.
+                     if fail_exit_code is not None:
+                         sys.exit(fail_exit_code)
+                     if self._fail_fast:
                          raise MPGraphExecutorError(
-                             f"Specified debugger module ({self._pdb}) can't debug with post_mortem",
+                             f"Quantum {quantum_id} ({task_node.label}@{data_id}) failed."
                          ) from exc
-                     pdb.post_mortem(exc.__traceback__)
-                     failedNodes.add(qnode)
-                     report.status = ExecutionStatus.FAILURE
-
-                     # If exception specified an exit code then just exit with that
-                     # code, otherwise crash if fail-fast option is enabled.
-                     if fail_exit_code is not None:
-                         sys.exit(fail_exit_code)
-                     if self._fail_fast:
-                         raise MPGraphExecutorError(
-                             f"Task <{task_node.label} dataId={qnode.quantum.dataId}> failed."
-                         ) from exc
-                     else:
-                         # Note that there could be exception safety issues, which
-                         # we presently ignore.
-                         _LOG.error(
-                             "Task <%s dataId=%s> failed; processing will continue for remaining tasks.",
-                             task_node.label,
-                             qnode.quantum.dataId,
-                             exc_info=exc,
-                         )
+                     else:
+                         _LOG.error(
+                             "%s (%s@%s) failed; processing will continue for remaining tasks.",
+                             quantum_id,
+                             task_node.label,
+                             data_id,
+                             exc_info=exc,
+                         )

-         _LOG.info(
-             "Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
-             successCount,
-             len(failedNodes),
-             totalCount - successCount - len(failedNodes),
-             totalCount,
-         )
+                     for downstream_quantum_id in walker.fail(quantum_id):
+                         downstream_node_state = xgraph.nodes[downstream_quantum_id]
+                         failed_quantum_report = QuantumReport(
+                             status=ExecutionStatus.SKIPPED,
+                             dataId=downstream_node_state["data_id"],
+                             taskLabel=downstream_node_state["task_label"],
+                         )
+                         report.quantaReports.append(failed_quantum_report)
+                         _LOG.error(
+                             "Upstream job failed for task %s (%s@%s), skipping this quantum.",
+                             downstream_quantum_id,
+                             downstream_node_state["task_label"],
+                             downstream_node_state["data_id"],
+                         )
+                         failed_count += 1
+
+         _LOG.info(
+             "Executed %d quanta successfully, %d failed and %d remain out of total %d quanta.",
+             success_count,
+             failed_count,
+             total_count - success_count - failed_count,
+             total_count,
+         )

          # Raise an exception if there were any failures.
-         if failedNodes:
+         if failed_count:
              raise MPGraphExecutorError("One or more tasks failed during execution.")

-     def _executeQuantaMP(self, graph: QuantumGraph, report: Report) -> None:
+     def _execute_quanta_mp(self, xgraph: networkx.DiGraph, report: Report) -> None:
          """Execute all Quanta in separate processes.

          Parameters
          ----------
-         graph : `.QuantumGraph`
-             `.QuantumGraph` that is to be executed.
+         xgraph : `networkx.DiGraph`
+             DAG to execute. Should have quantum IDs for nodes and ``quantum``
+             (`lsst.daf.butler.Quantum`) and ``task_node``
+             (`lsst.pipe.base.pipeline_graph.TaskNode`) attributes in addition
+             to those provided by
+             `.quantum_graph.PredictedQuantumGraph.quantum_only_xgraph`.
          report : `Report`
              Object for reporting execution status.
          """
@@ -656,14 +732,13 @@ class MPGraphExecutor(QuantumGraphExecutor):
          _LOG.debug("Using %r for multiprocessing start method", self._start_method)

          # re-pack input quantum data into jobs list
-         jobs = _JobList(graph)
+         jobs = _JobList(xgraph)

          # check that all tasks can run in sub-process
-         for job in jobs.jobs:
-             task_node = job.qnode.task_node
-             if not task_node.task_class.canMultiprocess:
+         for job in jobs.jobs.values():
+             if not job.task_node.task_class.canMultiprocess:
                  raise MPGraphExecutorError(
-                     f"Task {task_node.label!r} does not support multiprocessing; use single process"
+                     f"Task {job.task_node.label!r} does not support multiprocessing; use single process"
                  )

          finishedCount, failedCount = 0, 0
@@ -672,8 +747,10 @@ class MPGraphExecutor(QuantumGraphExecutor):
              _LOG.debug("#runningJobs: %s", len(jobs.running))

              # See if any jobs have finished
-             for job in jobs.running:
+             for quantum_id in list(jobs.running):  # iterate over a copy so we can remove.
+                 job = jobs.jobs[quantum_id]
                  assert job.process is not None, "Process cannot be None"
+                 blocked: list[_Job] = []
                  if not job.process.is_alive():
                      _LOG.debug("finished: %s", job)
                      # finished
@@ -691,20 +768,21 @@ class MPGraphExecutor(QuantumGraphExecutor):
                                  # Do not override global FAILURE status
                                  report.status = ExecutionStatus.TIMEOUT
                              message = f"Timeout ({self._timeout} sec) for task {job}, task is killed"
-                             jobs.setJobState(job, JobState.TIMED_OUT)
+                             blocked = jobs.setJobState(job, JobState.TIMED_OUT)
                          else:
                              report.status = ExecutionStatus.FAILURE
                              # failMessage() has to be called before cleanup()
                              message = job.failMessage()
-                             jobs.setJobState(job, JobState.FAILED)
+                             blocked = jobs.setJobState(job, JobState.FAILED)

                          job.cleanup()
                          _LOG.debug("failed: %s", job)
                          if self._fail_fast or exitcode == InvalidQuantumError.EXIT_CODE:
                              # stop all running jobs
-                             for stopJob in jobs.running:
-                                 if stopJob is not job:
-                                     stopJob.stop()
+                             for stop_quantum_id in jobs.running:
+                                 stop_job = jobs.jobs[stop_quantum_id]
+                                 if stop_job is not job:
+                                     stop_job.stop()
                              if job.state is JobState.TIMED_OUT:
                                  raise MPTimeoutError(f"Timeout ({self._timeout} sec) for task {job}.")
                              else:
@@ -722,42 +800,26 @@ class MPGraphExecutor(QuantumGraphExecutor):
                          _LOG.debug("Terminating job %s due to timeout", job)
                          job.stop()

-             # Fail jobs whose inputs failed, this may need several iterations
-             # if the order is not right, will be done in the next loop.
-             if jobs.failedNodes:
-                 for job in jobs.pending:
-                     jobInputNodes = graph.determineInputsToQuantumNode(job.qnode)
-                     assert job.qnode.quantum.dataId is not None, "Quantum DataId cannot be None"
-                     if jobInputNodes & jobs.failedNodes:
-                         quantum_report = QuantumReport(
-                             quantumId=job.qnode.nodeId,
-                             status=ExecutionStatus.SKIPPED,
-                             dataId=job.qnode.quantum.dataId,
-                             taskLabel=job.qnode.task_node.label,
-                         )
-                         report.quantaReports.append(quantum_report)
-                         jobs.setJobState(job, JobState.FAILED_DEP)
-                         _LOG.error("Upstream job failed for task %s, skipping this task.", job)
+                 for downstream_job in blocked:
+                     quantum_report = QuantumReport(
+                         quantumId=downstream_job.quantum_id,
+                         status=ExecutionStatus.SKIPPED,
+                         dataId=cast(DataCoordinate, downstream_job.quantum.dataId),
+                         taskLabel=downstream_job.task_node.label,
+                     )
+                     report.quantaReports.append(quantum_report)
+                     _LOG.error("Upstream job failed for task %s, skipping this task.", downstream_job)

              # see if we can start more jobs
-             if len(jobs.running) < self._num_proc:
-                 for job in jobs.pending:
-                     jobInputNodes = graph.determineInputsToQuantumNode(job.qnode)
-                     if jobInputNodes <= jobs.finishedNodes:
-                         # all dependencies have completed, can start new job
-                         if len(jobs.running) < self._num_proc:
-                             _LOG.debug("Submitting %s", job)
-                             jobs.submit(job, self._quantum_executor, self._start_method)
-                         if len(jobs.running) >= self._num_proc:
-                             # Cannot start any more jobs, wait until something
-                             # finishes.
-                             break
+             while len(jobs.running) < self._num_proc and jobs.pending:
+                 job = jobs.submit(self._quantum_executor, self._start_method)
+                 _LOG.debug("Submitted %s", job)

              # Do cleanup for timed out jobs if necessary.
              jobs.cleanup()

              # Print progress message if something changed.
-             newFinished, newFailed = len(jobs.finishedNodes), len(jobs.failedNodes)
+             newFinished, newFailed = len(jobs.finished), len(jobs.failed)
              if (finishedCount, failedCount) != (newFinished, newFailed):
                  finishedCount, failedCount = newFinished, newFailed
                  totalCount = len(jobs.jobs)
@@ -775,20 +837,20 @@ class MPGraphExecutor(QuantumGraphExecutor):
              if jobs.running:
                  time.sleep(0.1)

-         if jobs.failedNodes:
+         if jobs.failed:
              # print list of failed jobs
              _LOG.error("Failed jobs:")
-             for job in jobs.jobs:
-                 if job.state != JobState.FINISHED:
-                     _LOG.error(" - %s: %s", job.state.name, job)
+             for quantum_id in jobs.failed:
+                 job = jobs.jobs[quantum_id]
+                 _LOG.error(" - %s: %s", job.state.name, job)

              # if any job failed raise an exception
-             if jobs.failedNodes == jobs.timedOutNodes:
+             if jobs.failed == jobs.timed_out:
                  raise MPTimeoutError("One or more tasks timed out during execution.")
              else:
                  raise MPGraphExecutorError("One or more tasks failed or timed out during execution.")

-     def getReport(self) -> Report | None:
+     def getReport(self) -> Report:
          # Docstring inherited from base class
          if self._report is None:
              raise RuntimeError("getReport() called before execute()")
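
Net effect: execute() accepts either graph flavor and converts old QuantumGraph objects up front via PredictedQuantumGraph.from_old_quantum_graph(), so existing callers keep working, and getReport() is now typed to always return a Report. A usage sketch follows; the constructor arguments are guesses from the self._num_proc/self._timeout/self._quantum_executor attributes used above, since the __init__ signature is not part of this diff.

    from lsst.pipe.base.mp_graph_executor import MPGraphExecutor

    # quantum_executor: any QuantumExecutor implementation; graph: either a
    # QuantumGraph or a PredictedQuantumGraph. Argument names are assumptions.
    executor = MPGraphExecutor(num_proc=4, timeout=3600.0, quantum_executor=quantum_executor)
    executor.execute(graph)
    report = executor.getReport()  # Report, never None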