lsst-pipe-base 29.2025.1100-py3-none-any.whl → 29.2025.1200-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
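The headline change in this version pair is that `QuantumProvenanceGraph` construction, metadata/log reads, and summarization become parallelizable (`n_cores`) and can use a quantum-backed butler (`use_qbb`), with the constructor now able to assemble the graph directly. Below is a minimal sketch of how the new API reads, based on the signatures in this diff; the repository path, graph URIs, and the module import path are illustrative assumptions, not values taken from the diff.

```python
# Illustrative sketch based on the signatures shown in this diff; the repo
# path and .qgraph URIs are placeholders, and the module import path is
# assumed rather than confirmed by the diff itself.
from lsst.daf.butler import Butler
from lsst.pipe.base.quantum_provenance_graph import QuantumProvenanceGraph

butler = Butler("/repo/main")  # hypothetical repository

# The constructor now accepts the butler and graphs directly and runs
# assemble_quantum_provenance_graph() itself, reading metadata with a
# quantum-backed butler by default and fanning reads out over threads.
qpg = QuantumProvenanceGraph(
    butler,
    ["attempt1.qgraph", "attempt2.qgraph"],  # must be in chronological order
    read_caveats="lazy",  # new default; was "exhaustive" in 29.2025.1100
    use_qbb=True,
    n_cores=4,
)

# to_summary() no longer needs the butler (the argument is kept but
# ignored); failed-quantum logs are fetched on a thread pool.
summary = qpg.to_summary(do_store_logs=True, n_cores=4)
summary.pprint(brief=True)
```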
@@ -39,25 +39,42 @@ __all__ = (
  "QuantumProvenanceGraph",
  )

+ import concurrent.futures
  import dataclasses
+ import datetime
  import itertools
  import logging
  import textwrap
+ import threading
  import uuid
- from collections.abc import Iterator, Mapping, Sequence, Set
+ from collections.abc import Callable, Iterator, Mapping, Sequence, Set
  from enum import Enum
- from typing import TYPE_CHECKING, ClassVar, Literal, TypedDict, cast
+ from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypedDict, cast

  import astropy.table
  import networkx
  import pydantic

- from lsst.daf.butler import Butler, DataCoordinate, DataIdValue, DatasetRef
+ from lsst.daf.butler import (
+ Butler,
+ ButlerConfig,
+ ButlerLogRecords,
+ DataCoordinate,
+ DataIdValue,
+ DatasetId,
+ DatasetRef,
+ DatasetType,
+ DimensionUniverse,
+ LimitedButler,
+ MissingDatasetTypeError,
+ QuantumBackedButler,
+ )
  from lsst.resources import ResourcePathExpression
- from lsst.utils.logging import getLogger
+ from lsst.utils.logging import PeriodicLogger, getLogger

  from ._status import QuantumSuccessCaveats
- from .graph import QuantumGraph
+ from .automatic_connection_constants import LOG_OUTPUT_TEMPLATE, METADATA_OUTPUT_TEMPLATE
+ from .graph import QuantumGraph, QuantumNode

  if TYPE_CHECKING:
  from ._task_metadata import TaskMetadata
@@ -178,7 +195,7 @@ class ExceptionInfo(pydantic.BaseModel):
  """Additional metadata included in the exception."""

  @classmethod
- def from_metadata(cls, md: TaskMetadata) -> ExceptionInfo:
+ def _from_metadata(cls, md: TaskMetadata) -> ExceptionInfo:
  """Construct from task metadata.

  Parameters
@@ -476,7 +493,7 @@ class UnsuccessfulQuantumSummary(pydantic.BaseModel):
  """

  @classmethod
- def from_info(cls, info: QuantumInfo) -> UnsuccessfulQuantumSummary:
+ def _from_info(cls, info: QuantumInfo) -> UnsuccessfulQuantumSummary:
  """Summarize all relevant information from the `QuantumInfo` in an
  `UnsuccessfulQuantumSummary`; return an `UnsuccessfulQuantumSummary`.

@@ -595,7 +612,12 @@ class TaskSummary(pydantic.BaseModel):
  this module) associated with the particular issue identified.
  """

- def add_quantum_info(self, info: QuantumInfo, butler: Butler, do_store_logs: bool = True) -> None:
+ def _add_quantum_info(
+ self,
+ info: QuantumInfo,
+ log_getter: Callable[[DatasetRef], ButlerLogRecords] | None,
+ executor: concurrent.futures.Executor,
+ ) -> concurrent.futures.Future[None] | None:
  """Add a `QuantumInfo` to a `TaskSummary`.

  Unpack the `QuantumInfo` object, sorting quanta of each status into
@@ -607,12 +629,19 @@ class TaskSummary(pydantic.BaseModel):
  ----------
  info : `QuantumInfo`
  The `QuantumInfo` object to add to the `TaskSummary`.
- butler : `lsst.daf.butler.Butler`
- The butler repo used for the graph being inspected, which can be
- queried for errors and logs.
- do_store_logs : `bool`, optional
- Store error messages from Butler logs associated with failed quanta
- if `True`.
+ log_getter : `~collections.abc.Callable` or `None`
+ A callable that can be passed a `~lsst.daf.butler.DatasetRef` for
+ a log dataset to retrieve those logs, or `None` to not load any
+ logs.
+ executor : `concurrent.futures.Executor`
+ A possibly-parallel executor that should be used to schedule
+ log dataset reads.
+
+ Returns
+ -------
+ future : `concurrent.futures.Future` or `None`
+ A future that represents a parallelized log read and summary
+ update.
  """
  try:
  final_run, final_quantum_run = QuantumRun.find_final(info)
@@ -637,35 +666,45 @@ class TaskSummary(pydantic.BaseModel):
  exception=final_quantum_run.exception,
  )
  )
+ return None
  case QuantumInfoStatus.WONKY:
- self.wonky_quanta.append(UnsuccessfulQuantumSummary.from_info(info))
+ self.wonky_quanta.append(UnsuccessfulQuantumSummary._from_info(info))
+ return None
  case QuantumInfoStatus.BLOCKED:
  self.n_blocked += 1
+ return None
  case QuantumInfoStatus.FAILED:
- failed_quantum_summary = UnsuccessfulQuantumSummary.from_info(info)
- if do_store_logs:
- for quantum_run in info["runs"].values():
- try:
- log = butler.get(quantum_run.log_ref)
- except LookupError:
- failed_quantum_summary.messages.append(
- f"Logs not ingested for {quantum_run.log_ref!r}"
- )
- except FileNotFoundError:
- failed_quantum_summary.messages.append(
- f"Logs missing or corrupt for {quantum_run.log_ref!r}"
- )
- else:
- failed_quantum_summary.messages.extend(
- [record.message for record in log if record.levelno >= logging.ERROR]
- )
+ failed_quantum_summary = UnsuccessfulQuantumSummary._from_info(info)
+ future: concurrent.futures.Future[None] | None = None
+ if log_getter:
+
+ def callback() -> None:
+ for quantum_run in info["runs"].values():
+ try:
+ log = log_getter(quantum_run.log_ref)
+ except LookupError:
+ failed_quantum_summary.messages.append(
+ f"Logs not ingested for {quantum_run.log_ref!r}"
+ )
+ except FileNotFoundError:
+ failed_quantum_summary.messages.append(
+ f"Logs missing or corrupt for {quantum_run.log_ref!r}"
+ )
+ else:
+ failed_quantum_summary.messages.extend(
+ [record.message for record in log if record.levelno >= logging.ERROR]
+ )
+
+ future = executor.submit(callback)
  self.failed_quanta.append(failed_quantum_summary)
+ return future
  case QuantumInfoStatus.UNKNOWN:
  self.n_unknown += 1
+ return None
  case unrecognized_state:
  raise AssertionError(f"Unrecognized quantum status {unrecognized_state!r}")

- def add_data_id_group(self, other_summary: TaskSummary) -> None:
+ def _add_data_id_group(self, other_summary: TaskSummary) -> None:
  """Add information from a `TaskSummary` over one dataquery-identified
  group to another, as part of aggregating `Summary` reports.

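For failed quanta, `_add_quantum_info` now defers the log read: it wraps the read in a closure, submits it to the caller's executor, and returns the `Future` so the caller can drain the futures and surface any errors. A self-contained sketch of that pattern follows; everything in it is a toy stand-in, not pipe_base code.

```python
# Toy stand-in for the pattern above: submit per-item closures to a shared
# executor, collect the futures, then re-raise the first stored exception.
import concurrent.futures

def add_item(item: int, executor: concurrent.futures.Executor) -> concurrent.futures.Future[None] | None:
    if item % 3 == 0:  # stand-in for the FAILED branch
        def callback() -> None:
            # Stand-in for the log fetch; the real callback appends error
            # messages to a summary object (list.append is GIL-safe).
            _ = f"fetched logs for {item}"
        return executor.submit(callback)
    return None  # other status branches do no deferred I/O

futures: list[concurrent.futures.Future[None]] = []
with concurrent.futures.ThreadPoolExecutor(4) as executor:
    for item in range(12):
        if (future := add_item(item, executor)) is not None:
            futures.append(future)
    for future in concurrent.futures.as_completed(futures):
        if (err := future.exception()) is not None:
            raise err
```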
@@ -712,7 +751,7 @@ class CursedDatasetSummary(pydantic.BaseModel):
  """

  @classmethod
- def from_info(cls, info: DatasetInfo, producer_info: QuantumInfo) -> CursedDatasetSummary:
+ def _from_info(cls, info: DatasetInfo, producer_info: QuantumInfo) -> CursedDatasetSummary:
  """Summarize all relevant information from the `DatasetInfo` in a
  `CursedDatasetSummary`; return a `CursedDatasetSummary`.

@@ -797,7 +836,7 @@ class DatasetTypeSummary(pydantic.BaseModel):
  """A list of all unsuccessful datasets by their name and data_id.
  """

- def add_dataset_info(self, info: DatasetInfo, producer_info: QuantumInfo) -> None:
+ def _add_dataset_info(self, info: DatasetInfo, producer_info: QuantumInfo) -> None:
  """Add a `DatasetInfo` to a `DatasetTypeSummary`.

  Unpack the `DatasetInfo` object, sorting datasets of each status into
@@ -822,13 +861,13 @@ class DatasetTypeSummary(pydantic.BaseModel):
  case DatasetInfoStatus.UNSUCCESSFUL:
  self.unsuccessful_datasets.append(dict(info["data_id"].mapping))
  case DatasetInfoStatus.CURSED:
- self.cursed_datasets.append(CursedDatasetSummary.from_info(info, producer_info))
+ self.cursed_datasets.append(CursedDatasetSummary._from_info(info, producer_info))
  case DatasetInfoStatus.PREDICTED_ONLY:
  self.n_predicted_only += 1
  case unrecognized_state:
  raise AssertionError(f"Unrecognized dataset status {unrecognized_state!r}")

- def add_data_id_group(self, other_summary: DatasetTypeSummary) -> None:
+ def _add_data_id_group(self, other_summary: DatasetTypeSummary) -> None:
  """Add information from a `DatasetTypeSummary` over one
  dataquery-identified group to another, as part of aggregating `Summary`
  reports.
@@ -889,10 +928,10 @@ class Summary(pydantic.BaseModel):
  for summary in summaries:
  for label, task_summary in summary.tasks.items():
  result_task_summary = result.tasks.setdefault(label, TaskSummary())
- result_task_summary.add_data_id_group(task_summary)
+ result_task_summary._add_data_id_group(task_summary)
  for dataset_type, dataset_type_summary in summary.datasets.items():
  result_dataset_summary = result.datasets.setdefault(dataset_type, DatasetTypeSummary())
- result_dataset_summary.add_data_id_group(dataset_type_summary)
+ result_dataset_summary._add_data_id_group(dataset_type_summary)
  return result

  def pprint(self, brief: bool = False, datasets: bool = True) -> None:
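The loop above merges per-data-query-group `TaskSummary` and `DatasetTypeSummary` objects into a combined report via the renamed `_add_data_id_group` helpers. A hedged usage sketch follows; it assumes the enclosing classmethod is `Summary.aggregate` (its name falls outside this hunk) and that expectation counts are summed across groups.

```python
# Hedged sketch: assumes the merge loop above lives in Summary.aggregate
# and that _add_data_id_group sums per-group expectations; neither the
# method name nor that behavior is visible inside this hunk.
from lsst.pipe.base.quantum_provenance_graph import Summary, TaskSummary

group_a = Summary()
group_a.tasks["isr"] = TaskSummary(n_expected=10)
group_b = Summary()
group_b.tasks["isr"] = TaskSummary(n_expected=12)

combined = Summary.aggregate([group_a, group_b])
print(combined.tasks["isr"].n_expected)  # 22, if expectations are summed
```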
@@ -1126,19 +1165,53 @@ class QuantumProvenanceGraph:
  """A set of already-run, merged quantum graphs with provenance
  information.

- Step through all the quantum graphs associated with certain tasks or
- processing steps. For each graph/attempt, the status of each quantum and
- dataset is recorded in `QuantumProvenanceGraph.__add_new_graph` and
- outcomes of quanta over multiple runs are resolved in
- `QuantumProvenanceGraph.__resolve_duplicates`. These can be called outside
- the class in the correct order by
- `QuantumProvenanceGraph.assemble_quantum_provenance_graph`. At the end of
- this process, we can combine all attempts into a summary using the
- `QuantumProvenanceGraph.to_summary` method. This serves to answer the
- question 'What happened to this data ID?' in a wholistic sense.
- """
-
- def __init__(self) -> None:
+ Parameters
+ ----------
+ butler : `lsst.daf.butler.Butler`
+ The Butler used for this report. This should match the Butler used
+ for the run associated with the executed quantum graph.
+ qgraphs : `~collections.abc.Sequence` [`QuantumGraph` |\
+ `~lsst.resources.ResourcePathExpression`]
+ A list of either quantum graph objects or their URIs, to be used
+ to assemble the `QuantumProvenanceGraph`.
+ collections : `~collections.abc.Sequence` [`str`] | `None`
+ Collections to use in `lsst.daf.butler.Butler.query_datasets` when
+ testing which datasets are available at a high level.
+ where : `str`
+ A "where" string to use to constrain the datasets; should be provided
+ if ``collections`` includes many datasets that are not in any graphs,
+ to select just those that might be (e.g. when sharding over dimensions
+ and using a final collection that spans multiple shards).
+ curse_failed_logs : `bool`
+ Mark log datasets as CURSED if they are visible in the final output
+ collection. Note that a campaign-level collection must be used here for
+ `collections` if `curse_failed_logs` is `True`.
+ read_caveats : `str` or `None`, optional
+ Whether to read metadata files to get flags that describe qualified
+ successes. If `None`, no metadata files will be read and all
+ ``caveats`` fields will be `None`. If "exhaustive", all metadata files
+ will be read. If "lazy", only metadata files where at least one
+ predicted output is missing will be read.
+ use_qbb : `bool`, optional
+ If `True`, use a quantum-backed butler when reading metadata files.
+ Note that some butler database queries are still run even if this is
+ `True`; this does not avoid database access entirely.
+ n_cores : `int`, optional
+ Number of threads to use for parallelization.
+ """
+
+ def __init__(
+ self,
+ butler: Butler | None = None,
+ qgraphs: Sequence[QuantumGraph | ResourcePathExpression] = (),
+ *,
+ collections: Sequence[str] | None = None,
+ where: str = "",
+ curse_failed_logs: bool = False,
+ read_caveats: Literal["lazy", "exhaustive"] | None = "lazy",
+ use_qbb: bool = True,
+ n_cores: int = 1,
+ ) -> None:
  # The graph we annotate as we step through all the graphs associated
  # with the processing to create the `QuantumProvenanceGraph`.
  self._xgraph = networkx.DiGraph()
@@ -1150,6 +1223,24 @@ class QuantumProvenanceGraph:
  # Bool representing whether the graph has been finalized. This is set
  # to True when resolve_duplicates completes.
  self._finalized: bool = False
+ # In order to both parallelize metadata/log reads and potentially use
+ # QBB to do it, we in general need one butler for each output_run and
+ # thread combination. This dict is keyed by the former, and the
+ # wrapper type used for the value handles the latter.
+ self._butler_wrappers: dict[str, _ThreadLocalButlerWrapper] = {}
+ if butler is not None:
+ self.assemble_quantum_provenance_graph(
+ butler,
+ qgraphs,
+ collections=collections,
+ where=where,
+ curse_failed_logs=curse_failed_logs,
+ read_caveats=read_caveats,
+ use_qbb=use_qbb,
+ n_cores=n_cores,
+ )
+ elif qgraphs:
+ raise TypeError("'butler' must be provided if `qgraphs` is.")

  @property
  def quanta(self) -> Mapping[str, Set[QuantumKey]]:
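The `_butler_wrappers` comment above explains the need for one butler per output run and per thread; `_ThreadLocalButlerWrapper` (whose definition is truncated at the end of this diff) handles the per-thread half. Below is a generic sketch of the `threading.local` pattern such a wrapper could use; the real class's `wrap_qbb`/`wrap_full` constructors and `.butler` property are only visible as call sites here, so everything else is an assumption.

```python
# Generic sketch of a per-thread lazy-construction wrapper, assuming only
# the call sites visible in this diff (wrap_qbb/wrap_full factories and a
# .butler property); the real implementation is truncated below.
import threading
from collections.abc import Callable
from typing import Generic, TypeVar

T = TypeVar("T")

class ThreadLocalFactory(Generic[T]):
    """Build one instance of T per thread from a shared factory."""

    def __init__(self, factory: Callable[[], T]) -> None:
        self._factory = factory
        self._local = threading.local()

    @property
    def value(self) -> T:
        # Lazily construct an instance the first time each thread asks.
        if not hasattr(self._local, "instance"):
            self._local.instance = self._factory()
        return self._local.instance
```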
@@ -1195,245 +1286,562 @@ class QuantumProvenanceGraph:
  """
  return self._xgraph.nodes[key]

- def __add_new_graph(
+ def to_summary(
+ self, butler: Butler | None = None, do_store_logs: bool = True, n_cores: int = 1
+ ) -> Summary:
+ """Summarize the `QuantumProvenanceGraph`.
+
+ Parameters
+ ----------
+ butler : `lsst.daf.butler.Butler`, optional
+ Ignored; accepted for backwards compatibility.
+ do_store_logs : `bool`
+ Store the logs in the summary dictionary.
+ n_cores : `int`, optional
+ Number of threads to use for parallelization.
+
+ Returns
+ -------
+ result : `Summary`
+ A struct containing counts of quanta and datasets in each of
+ the overall states defined in `QuantumInfo` and `DatasetInfo`,
+ as well as diagnostic information and error messages for failed
+ quanta and strange edge cases, and a list of recovered quanta.
+ """
+ status_log = PeriodicLogger(_LOG)
+ if not self._finalized:
+ raise RuntimeError(
+ """resolve_duplicates must be called to finalize the
+ QuantumProvenanceGraph before making a summary."""
+ )
+ result = Summary()
+ futures: list[concurrent.futures.Future[None]] = []
+ _LOG.verbose("Summarizing %s tasks.", len(self._quanta.keys()))
+ with concurrent.futures.ThreadPoolExecutor(n_cores) as executor:
+ for m, (task_label, quanta) in enumerate(self._quanta.items()):
+ task_summary = TaskSummary()
+ task_summary.n_expected = len(quanta)
+ for n, quantum_key in enumerate(quanta):
+ quantum_info = self.get_quantum_info(quantum_key)
+ future = task_summary._add_quantum_info(
+ quantum_info,
+ log_getter=self._butler_get if do_store_logs else None,
+ executor=executor,
+ )
+ if future is not None:
+ futures.append(future)
+ status_log.log(
+ "Summarized %s of %s quanta of task %s of %s.",
+ n + 1,
+ len(quanta),
+ m + 1,
+ len(self._quanta.keys()),
+ )
+ result.tasks[task_label] = task_summary
+ for n, future in enumerate(concurrent.futures.as_completed(futures)):
+ if (err := future.exception()) is not None:
+ raise err
+ status_log.log("Loaded messages from %s of %s log datasets.", n + 1, len(futures))
+ _LOG.verbose("Summarizing %s dataset types.", len(self._datasets.keys()))
+ for m, (dataset_type_name, datasets) in enumerate(self._datasets.items()):
+ dataset_type_summary = DatasetTypeSummary(producer="")
+ dataset_type_summary.n_expected = len(datasets)
+ for n, dataset_key in enumerate(datasets):
+ dataset_info = self.get_dataset_info(dataset_key)
+ producer_key = self.get_producer_of(dataset_key)
+ producer_info = self.get_quantum_info(producer_key)
+ # Not ideal, but hard to get out of the graph at the moment.
+ # Change after DM-40441
+ dataset_type_summary.producer = producer_key.task_label
+ dataset_type_summary._add_dataset_info(dataset_info, producer_info)
+ status_log.log(
+ "Summarized %s of %s datasets of type %s of %s.",
+ n + 1,
+ len(datasets),
+ m + 1,
+ len(self._datasets.keys()),
+ )
+ result.datasets[dataset_type_name] = dataset_type_summary
+ return result
+
+ def iter_outputs_of(self, quantum_key: QuantumKey) -> Iterator[DatasetKey]:
+ """Iterate through the outputs of a quantum, yielding the keys of
+ all of the datasets produced by the quantum.
+
+ Parameters
+ ----------
+ quantum_key : `QuantumKey`
+ The key for the quantum whose outputs are needed.
+ """
+ yield from self._xgraph.successors(quantum_key)
+
+ def get_producer_of(self, dataset_key: DatasetKey) -> QuantumKey:
+ """Unpack the predecessor (producer quantum) of a given dataset key
+ from a graph.
+
+ Parameters
+ ----------
+ dataset_key : `DatasetKey`
+ The key for the dataset whose producer quantum is needed.
+
+ Returns
+ -------
+ result : `QuantumKey`
+ The key for the quantum which produced the dataset.
+ """
+ (result,) = self._xgraph.predecessors(dataset_key)
+ return result
+
+ def iter_downstream(
+ self, key: QuantumKey | DatasetKey
+ ) -> Iterator[tuple[QuantumKey, QuantumInfo] | tuple[DatasetKey, DatasetInfo]]:
+ """Iterate over the quanta and datasets that are downstream of a
+ quantum or dataset.
+
+ Parameters
+ ----------
+ key : `QuantumKey` or `DatasetKey`
+ Starting node.
+
+ Returns
+ -------
+ iter : `~collections.abc.Iterator` [ `tuple` ]
+ An iterator over pairs of (`QuantumKey`, `QuantumInfo`) or
+ (`DatasetKey`, `DatasetInfo`).
+ """
+ for key in networkx.dag.descendants(self._xgraph, key):
+ yield (key, self._xgraph.nodes[key]) # type: ignore
+
+ def assemble_quantum_provenance_graph(
+ self,
+ butler: Butler,
+ qgraphs: Sequence[QuantumGraph | ResourcePathExpression],
+ collections: Sequence[str] | None = None,
+ where: str = "",
+ curse_failed_logs: bool = False,
+ read_caveats: Literal["lazy", "exhaustive"] | None = "lazy",
+ use_qbb: bool = True,
+ n_cores: int = 1,
+ ) -> None:
+ """Assemble the quantum provenance graph from a list of all graphs
+ corresponding to processing attempts.
+
+ Parameters
+ ----------
+ butler : `lsst.daf.butler.Butler`
+ The Butler used for this report. This should match the Butler used
+ for the run associated with the executed quantum graph.
+ qgraphs : `~collections.abc.Sequence` [`QuantumGraph` |\
+ `~lsst.resources.ResourcePathExpression`]
+ A list of either quantum graph objects or their URIs, to be used
+ to assemble the `QuantumProvenanceGraph`.
+ collections : `~collections.abc.Sequence` [`str`] | `None`
+ Collections to use in `lsst.daf.butler.Butler.query_datasets` when
+ testing which datasets are available at a high level.
+ where : `str`
+ A "where" string to use to constrain the datasets; should be
+ provided if ``collections`` includes many datasets that are not in
+ any graphs, to select just those that might be (e.g. when sharding
+ over dimensions and using a final collection that spans multiple
+ shards).
+ curse_failed_logs : `bool`
+ Mark log datasets as CURSED if they are visible in the final
+ output collection. Note that a campaign-level collection must be
+ used here for `collections` if `curse_failed_logs` is `True`.
+ read_caveats : `str` or `None`, optional
+ Whether to read metadata files to get flags that describe qualified
+ successes. If `None`, no metadata files will be read and all
+ ``caveats`` fields will be `None`. If "exhaustive", all
+ metadata files will be read. If "lazy", only metadata files where
+ at least one predicted output is missing will be read.
+ use_qbb : `bool`, optional
+ If `True`, use a quantum-backed butler when reading metadata files.
+ Note that some butler database queries are still run even if this
+ is `True`; this does not avoid database access entirely.
+ n_cores : `int`, optional
+ Number of threads to use for parallelization.
+ """
+ if read_caveats not in ("lazy", "exhaustive", None):
+ raise TypeError(
+ f"Invalid option {read_caveats!r} for read_caveats; should be 'lazy', 'exhaustive', or None."
+ )
+ output_runs = []
+ last_time: datetime.datetime | None = None
+ for graph in qgraphs:
+ if not isinstance(graph, QuantumGraph):
+ _LOG.verbose("Loading quantum graph %r.", graph)
+ qgraph = QuantumGraph.loadUri(graph)
+ else:
+ qgraph = graph
+ assert qgraph.metadata is not None, "Saved QGs always have metadata."
+ self._add_new_graph(butler, qgraph, read_caveats=read_caveats, use_qbb=use_qbb, n_cores=n_cores)
+ output_runs.append(qgraph.metadata["output_run"])
+ if last_time is not None and last_time > qgraph.metadata["time"]:
+ raise RuntimeError("Quantum graphs must be passed in chronological order.")
+ last_time = qgraph.metadata["time"]
+ if not collections:
+ # We reverse the order of the associated output runs because the
+ # query in _resolve_duplicates must be done most-recent first.
+ collections = list(reversed(output_runs))
+ assert not curse_failed_logs, (
+ "curse_failed_logs option must be used with one campaign-level collection."
+ )
+ self._resolve_duplicates(butler, collections, where, curse_failed_logs)
+
+ def _add_new_graph(
  self,
  butler: Butler,
- qgraph: QuantumGraph | ResourcePathExpression,
+ qgraph: QuantumGraph,
  read_caveats: Literal["lazy", "exhaustive"] | None,
+ use_qbb: bool = True,
+ n_cores: int = 1,
  ) -> None:
  """Add a new quantum graph to the `QuantumProvenanceGraph`.

- Notes
- -----
- The algorithm: step through the quantum graph. Annotate a
- `networkx.DiGraph` (`QuantumProvenanceGraph._xgraph`) with all of the
- relevant information: quanta, dataset types and their associated run
- collections (these unique quanta- and dataset type-run
- collection combinations are encapsulated in the classes
- `DatasetRun` and `QuantumRun`). For each new quantum, annotate
- the status of the `QuantumRun` by inspecting the graph. If a
- DatasetType was produced, annotate this in the run by setting
- `DatasetRun.produced = True`. If a quantum is given BLOCKED
- or FAILED status, annotate all their successors in the graph
- as BLOCKED. For each new quantum, use the transition between
- the current and last `QuantumRun.status` to determine the status
- to assign to the overall `QuantumInfo`. For example, if a
- previous run associated with a quantum had the status FAILED,
- and the status from the new graph reads SUCCESSFUL, we can
- mark the overall quantum status as SUCCESSFUL and list the data_id
- as RECOVERED.
-
  Parameters
  ----------
  butler : `lsst.daf.butler.Butler`
  The Butler used for this report. This should match the Butler
  used for the run associated with the executed quantum graph.
- qgraph : `QuantumGraph` | `ResourcePathExpression`
- Either the associated quantum graph object or the uri of the
- location of said quantum graph.
+ qgraph : `QuantumGraph`
+ The quantum graph object to add.
  read_caveats : `str` or `None`
  Whether to read metadata files to get flags that describe qualified
  successes. If `None`, no metadata files will be read and all
  ``caveats`` fields will be `None`. If "exhaustive", all
  metadata files will be read. If "lazy", only metadata files where
  at least one predicted output is missing will be read.
+ use_qbb : `bool`, optional
+ If `True`, use a quantum-backed butler when reading metadata files.
+ Note that some butler database queries are still run even if this
+ is `True`; this does not avoid database access entirely.
+ n_cores : `int`, optional
+ Number of threads to use for parallelization.
  """
- # first we load the quantum graph and associated output run collection
- if not isinstance(qgraph, QuantumGraph):
- qgraph = QuantumGraph.loadUri(qgraph)
- assert qgraph.metadata is not None, "Saved QGs always have metadata."
+ status_log = PeriodicLogger(_LOG)
  output_run = qgraph.metadata["output_run"]
+ # Add QuantumRun and DatasetRun (and nodes/edges, as needed) to the
+ # QPG for all quanta in the QG.
+ _LOG.verbose("Adding output run to provenance graph.")
  new_quanta: list[QuantumKey] = []
- for node in qgraph:
- # make a key to refer to the quantum and add it to the quantum
- # provenance graph.
- quantum_key = QuantumKey(
- node.taskDef.label, cast(DataCoordinate, node.quantum.dataId).required_values
- )
- self._xgraph.add_node(quantum_key)
- # use the key to get a `QuantumInfo` object for the quantum
- # and set defaults for its values.
- quantum_info = self.get_quantum_info(quantum_key)
- quantum_info.setdefault("messages", [])
- quantum_info.setdefault("runs", {})
- quantum_info.setdefault("data_id", cast(DataCoordinate, node.quantum.dataId))
- quantum_info.setdefault("status", QuantumInfoStatus.UNKNOWN)
- quantum_info.setdefault("recovered", False)
- new_quanta.append(quantum_key)
- self._quanta.setdefault(quantum_key.task_label, set()).add(quantum_key)
- metadata_ref = node.quantum.outputs[f"{node.taskDef.label}_metadata"][0]
- log_ref = node.quantum.outputs[f"{node.taskDef.label}_log"][0]
- # associate run collections with specific quanta. this is important
- # if the same quanta are processed in multiple runs as in recovery
- # workflows.
- quantum_runs = quantum_info.setdefault("runs", {})
- # the `QuantumRun` here is the specific quantum-run collection
- # combination.
- quantum_runs[output_run] = QuantumRun(id=node.nodeId, metadata_ref=metadata_ref, log_ref=log_ref)
- # For each of the outputs of the quanta (datasets) make a key to
- # refer to the dataset.
- for ref in itertools.chain.from_iterable(node.quantum.outputs.values()):
- dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values)
- # add datasets to the nodes of the graph, with edges on the
- # quanta.
- self._xgraph.add_edge(quantum_key, dataset_key)
- # use the dataset key to make a `DatasetInfo` object for
- # the dataset and set defaults for its values.
- dataset_info = self.get_dataset_info(dataset_key)
- dataset_info.setdefault("data_id", ref.dataId)
- dataset_info.setdefault("status", DatasetInfoStatus.PREDICTED_ONLY)
- dataset_info.setdefault("messages", [])
- self._datasets.setdefault(dataset_key.dataset_type_name, set()).add(dataset_key)
- dataset_runs = dataset_info.setdefault("runs", {})
- # make a `DatasetRun` for the specific dataset-run
- # collection combination.
- dataset_runs[output_run] = DatasetRun(id=ref.id)
- # save metadata and logs for easier status interpretation later
- if dataset_key.dataset_type_name.endswith("_metadata"):
- quantum_info["metadata"] = dataset_key
- quantum_runs[output_run].metadata_ref = ref
- if dataset_key.dataset_type_name.endswith("_log"):
- quantum_info["log"] = dataset_key
- quantum_runs[output_run].log_ref = ref
- for ref in itertools.chain.from_iterable(node.quantum.inputs.values()):
- dataset_key = DatasetKey(ref.datasetType.nameAndComponent()[0], ref.dataId.required_values)
- if dataset_key in self._xgraph:
- # add another edge if the input datasetType and quantum are
- # in the graph
- self._xgraph.add_edge(dataset_key, quantum_key)
- for dataset_type_name in self._datasets:
- for ref in butler.registry.queryDatasets(dataset_type_name, collections=output_run):
- # find the datasets in the butler
+ for n, node in enumerate(qgraph):
+ new_quanta.append(self._add_new_quantum(node, output_run))
+ status_log.log("Added nodes for %s of %s quanta.", n + 1, len(qgraph))
+ # Query for datasets in the output run to see which ones were actually
+ # produced.
+ _LOG.verbose("Querying for existence for %s dataset types.", len(self._datasets.keys()))
+ for m, dataset_type_name in enumerate(self._datasets):
+ try:
+ refs = butler.query_datasets(
+ dataset_type_name, collections=output_run, explain=False, limit=None
+ )
+ except MissingDatasetTypeError:
+ continue
+ for n, ref in enumerate(refs):
  dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values)
  dataset_info = self.get_dataset_info(dataset_key)
  dataset_run = dataset_info["runs"][output_run] # dataset run (singular)
- # if the dataset is in the output run collection, we produced
- # it!
  dataset_run.produced = True
- # the outputs of failed or blocked quanta in this run.
- blocked: set[DatasetKey] = set()
- for quantum_key in new_quanta:
- quantum_info = self.get_quantum_info(quantum_key)
- quantum_run = quantum_info["runs"][output_run]
- metadata_key = quantum_info["metadata"]
- log_key = quantum_info["log"]
- metadata_dataset_run = self.get_dataset_info(metadata_key)["runs"][output_run]
- log_dataset_run = self.get_dataset_info(log_key)["runs"][output_run]
- # if we do have metadata, we know that the task finished.
- if metadata_dataset_run.produced:
- # if we also have logs, this is a success.
- if log_dataset_run.produced:
- quantum_run.status = QuantumRunStatus.SUCCESSFUL
- else:
- # if we have metadata and no logs, this is a very rare
- # case. either the task ran successfully and the datastore
- # died immediately afterwards, or some supporting
- # infrastructure for transferring the logs to the datastore
- # failed.
- quantum_run.status = QuantumRunStatus.LOGS_MISSING
- # If requested, read caveats from metadata.
- if read_caveats == "exhaustive" or (
- read_caveats == "lazy"
- and not all(
- self.get_dataset_info(dataset_key)["runs"][output_run].produced
- for dataset_key in self._xgraph.successors(quantum_key)
- )
+ status_log.log(
+ "Updated status for %s of %s datasets of %s of %s types.",
+ n + 1,
+ len(refs),
+ m + 1,
+ len(self._datasets.keys()),
+ )
+ if use_qbb:
+ _LOG.verbose("Using quantum-backed butler for metadata loads.")
+ self._butler_wrappers[output_run] = _ThreadLocalButlerWrapper.wrap_qbb(butler, qgraph)
+ else:
+ _LOG.verbose("Using full butler for metadata loads.")
+ self._butler_wrappers[output_run] = _ThreadLocalButlerWrapper.wrap_full(butler)
+
+ _LOG.verbose("Setting quantum status from dataset existence.")
+ # Update quantum status information based on which datasets were
+ # produced.
+ blocked: set[DatasetKey] = set() # the outputs of failed or blocked quanta in this run.
+ with concurrent.futures.ThreadPoolExecutor(n_cores) as executor:
+ futures: list[concurrent.futures.Future[None]] = []
+ for n, quantum_key in enumerate(new_quanta):
+ if (
+ self._update_run_status(quantum_key, output_run, blocked) == QuantumRunStatus.SUCCESSFUL
+ and read_caveats is not None
  ):
- md = butler.get(quantum_run.metadata_ref, storageClass="TaskMetadata")
- try:
- # Int conversion guards against spurious conversion to
- # float that can apparently sometimes happen in
- # TaskMetadata.
- quantum_run.caveats = QuantumSuccessCaveats(int(md["quantum"]["caveats"]))
- except LookupError:
- pass
- try:
- quantum_run.exception = ExceptionInfo.from_metadata(
- md[quantum_key.task_label]["failure"]
- )
- except LookupError:
- pass
- # missing metadata means that the task did not finish.
+ self._update_caveats(quantum_key, output_run, read_caveats, executor, futures)
+ self._update_info_status(quantum_key, output_run)
+ status_log.log("Updated status for %s of %s quanta.", n + 1, len(new_quanta))
+ for n, future in enumerate(concurrent.futures.as_completed(futures)):
+ if (err := future.exception()) is not None:
+ raise err
+ status_log.log("Added exception/caveat information for %s of %s quanta.", n + 1, len(futures))
+
+ def _add_new_quantum(self, node: QuantumNode, output_run: str) -> QuantumKey:
+ """Add a quantum from a new quantum graph to the provenance graph.
+
+ Parameters
+ ----------
+ node : `QuantumNode`
+ Node in the quantum graph.
+ output_run : `str`
+ Output run collection.
+
+ Returns
+ -------
+ quantum_key : `QuantumKey`
+ Key for the new or existing node in the provenance graph.
+
+ Notes
+ -----
+ This method adds new quantum and dataset nodes to the provenance graph
+ if they don't already exist, while adding new `QuantumRun` and
+ `DatasetRun` objects to both new and existing nodes. All status
+ information on those nodes is set to initial, default values that
+ generally reflect quanta that have not been attempted to be run.
+ """
+ # make a key to refer to the quantum and add it to the quantum
+ # provenance graph.
+ quantum_key = QuantumKey(
+ node.taskDef.label, cast(DataCoordinate, node.quantum.dataId).required_values
+ )
+ self._xgraph.add_node(quantum_key)
+ # use the key to get a `QuantumInfo` object for the quantum
+ # and set defaults for its values.
+ quantum_info = self.get_quantum_info(quantum_key)
+ quantum_info.setdefault("messages", [])
+ quantum_info.setdefault("runs", {})
+ quantum_info.setdefault("data_id", cast(DataCoordinate, node.quantum.dataId))
+ quantum_info.setdefault("status", QuantumInfoStatus.UNKNOWN)
+ quantum_info.setdefault("recovered", False)
+ self._quanta.setdefault(quantum_key.task_label, set()).add(quantum_key)
+ metadata_ref = node.quantum.outputs[f"{node.taskDef.label}_metadata"][0]
+ log_ref = node.quantum.outputs[f"{node.taskDef.label}_log"][0]
+ # associate run collections with specific quanta. this is important
+ # if the same quanta are processed in multiple runs as in recovery
+ # workflows.
+ quantum_runs = quantum_info.setdefault("runs", {})
+ # the `QuantumRun` here is the specific quantum-run collection
+ # combination.
+ quantum_runs[output_run] = QuantumRun(id=node.nodeId, metadata_ref=metadata_ref, log_ref=log_ref)
+ # For each of the outputs of the quanta (datasets) make a key to
+ # refer to the dataset.
+ for ref in itertools.chain.from_iterable(node.quantum.outputs.values()):
+ dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values)
+ # add datasets to the nodes of the graph, with edges on the
+ # quanta.
+ self._xgraph.add_edge(quantum_key, dataset_key)
+ # use the dataset key to make a `DatasetInfo` object for
+ # the dataset and set defaults for its values.
+ dataset_info = self.get_dataset_info(dataset_key)
+ dataset_info.setdefault("data_id", ref.dataId)
+ dataset_info.setdefault("status", DatasetInfoStatus.PREDICTED_ONLY)
+ dataset_info.setdefault("messages", [])
+ self._datasets.setdefault(dataset_key.dataset_type_name, set()).add(dataset_key)
+ dataset_runs = dataset_info.setdefault("runs", {})
+ # make a `DatasetRun` for the specific dataset-run
+ # collection combination.
+ dataset_runs[output_run] = DatasetRun(id=ref.id)
+ # save metadata and logs for easier status interpretation later
+ if dataset_key.dataset_type_name.endswith("_metadata"):
+ quantum_info["metadata"] = dataset_key
+ quantum_runs[output_run].metadata_ref = ref
+ if dataset_key.dataset_type_name.endswith("_log"):
+ quantum_info["log"] = dataset_key
+ quantum_runs[output_run].log_ref = ref
+ for ref in itertools.chain.from_iterable(node.quantum.inputs.values()):
+ dataset_key = DatasetKey(ref.datasetType.nameAndComponent()[0], ref.dataId.required_values)
+ if dataset_key in self._xgraph:
+ # add another edge if the input datasetType and quantum are
+ # in the graph
+ self._xgraph.add_edge(dataset_key, quantum_key)
+ return quantum_key
+
+ def _update_run_status(
+ self, quantum_key: QuantumKey, output_run: str, blocked: set[DatasetKey]
+ ) -> QuantumRunStatus:
+ """Update the status of this quantum in its own output run, using
+ information in the graph about which of its output datasets exist.
+
+ Parameters
+ ----------
+ quantum_key : `QuantumKey`
+ Key for the node in the provenance graph.
+ output_run : `str`
+ Output run collection.
+ blocked : `set` [ `DatasetKey` ]
+ A set of output datasets (for all quanta, not just this one) that
+ were blocked by failures. Will be modified in place.
+
+ Returns
+ -------
+ run_status : `QuantumRunStatus`
+ Run-specific status for this quantum.
+ """
+ quantum_info = self.get_quantum_info(quantum_key)
+ quantum_run = quantum_info["runs"][output_run]
+ metadata_key = quantum_info["metadata"]
+ log_key = quantum_info["log"]
+ metadata_dataset_run = self.get_dataset_info(metadata_key)["runs"][output_run]
+ log_dataset_run = self.get_dataset_info(log_key)["runs"][output_run]
+ # if we do have metadata, we know that the task finished.
+ if metadata_dataset_run.produced:
+ # if we also have logs, this is a success.
+ if log_dataset_run.produced:
+ quantum_run.status = QuantumRunStatus.SUCCESSFUL
  else:
- # if we have logs and no metadata, the task not finishing is
- # a failure in the task itself. This includes all payload
- # errors and some other problems.
- if log_dataset_run.produced:
- quantum_run.status = QuantumRunStatus.FAILED
- # if a quantum fails, all its successor datasets are
- # blocked.
- blocked.update(self._xgraph.successors(quantum_key))
- # if we are missing metadata and logs, either the task was not
- # started, or a hard external environmental error prevented
- # it from writing logs or metadata.
+ # if we have metadata and no logs, this is a very rare
+ # case. either the task ran successfully and the datastore
+ # died immediately afterwards, or some supporting
+ # infrastructure for transferring the logs to the datastore
+ # failed.
+ quantum_run.status = QuantumRunStatus.LOGS_MISSING
+
+ # missing metadata means that the task did not finish.
+ else:
+ # if we have logs and no metadata, the task not finishing is
+ # a failure in the task itself. This includes all payload
+ # errors and some other problems.
+ if log_dataset_run.produced:
+ quantum_run.status = QuantumRunStatus.FAILED
+ # if a quantum fails, all its successor datasets are
+ # blocked.
+ blocked.update(self._xgraph.successors(quantum_key))
+ # if we are missing metadata and logs, either the task was not
+ # started, or a hard external environmental error prevented
+ # it from writing logs or metadata.
+ else:
+ # if none of this quantum's inputs were blocked, the
+ # metadata must just be missing.
+ if blocked.isdisjoint(self._xgraph.predecessors(quantum_key)):
+ # None of this quantum's inputs were blocked.
+ quantum_run.status = QuantumRunStatus.METADATA_MISSING
+ # otherwise we can assume from no metadata and no logs
+ # that the task was blocked by an upstream failure.
  else:
- # if none of this quantum's inputs were blocked, the
- # metadata must just be missing.
- if blocked.isdisjoint(self._xgraph.predecessors(quantum_key)):
- # None of this quantum's inputs were blocked.
- quantum_run.status = QuantumRunStatus.METADATA_MISSING
- # otherwise we can assume from no metadata and no logs
- # that the task was blocked by an upstream failure.
- else:
- quantum_run.status = QuantumRunStatus.BLOCKED
- blocked.update(self._xgraph.successors(quantum_key))
-
- # Now we can start using state transitions to mark overall status.
- last_status = quantum_info["status"]
- new_status: QuantumInfoStatus
- match last_status, quantum_run.status:
- # A quantum can never escape a WONKY state.
- case (QuantumInfoStatus.WONKY, _):
- new_status = QuantumInfoStatus.WONKY
- # Any transition to a success (excluding from WONKY) is
- # a success; any transition from a failed state is also a
- # recovery.
- case (_, QuantumRunStatus.SUCCESSFUL):
- new_status = QuantumInfoStatus.SUCCESSFUL
- if (
- last_status != QuantumInfoStatus.SUCCESSFUL
- and last_status != QuantumInfoStatus.UNKNOWN
- ):
- quantum_info["recovered"] = True
- # Missing logs are one of the categories of wonky quanta. They
- # interfere with our ability to discern quantum status and are
- # signs of weird things afoot in processing. Add a message
- # noting why this quantum is being marked as wonky to be stored
- # in its `UnsuccessfulQuantumInfo`.
- case (_, QuantumRunStatus.LOGS_MISSING):
- new_status = QuantumInfoStatus.WONKY
- quantum_info["messages"].append(f"Logs missing for run {output_run!r}.")
- # Leaving a successful state is another category of wonky
- # quanta. If a previous success fails on a subsequent run,
- # a human should inspect why. Add a message noting why this
- # quantum is being marked as wonky to be stored in its
- # `UnsuccessfulQuantumInfo`.
- case (QuantumInfoStatus.SUCCESSFUL, _):
- new_status = QuantumInfoStatus.WONKY
- quantum_info["messages"].append(
- f"Status went from successful in run {list(quantum_info['runs'].values())[-1]!r} "
- f"to {quantum_run.status!r} in {output_run!r}."
- )
- # If a quantum status is unknown and it moves to blocked, we
- # know for sure that it is a blocked quantum.
- case (QuantumInfoStatus.UNKNOWN, QuantumRunStatus.BLOCKED):
- new_status = QuantumInfoStatus.BLOCKED
- # A transition into blocked does not change the overall quantum
- # status for a failure.
- case (_, QuantumRunStatus.BLOCKED):
- new_status = last_status
- # If a quantum transitions from any state into missing
- # metadata, we don't have enough information to diagnose its
- # state.
- case (_, QuantumRunStatus.METADATA_MISSING):
- new_status = QuantumInfoStatus.UNKNOWN
- # Any transition into failure is a failed state.
- case (_, QuantumRunStatus.FAILED):
- new_status = QuantumInfoStatus.FAILED
- # Update `QuantumInfo.status` for this quantum.
- quantum_info["status"] = new_status
-
- def __resolve_duplicates(
+ quantum_run.status = QuantumRunStatus.BLOCKED
+ blocked.update(self._xgraph.successors(quantum_key))
+ return quantum_run.status
+
+ def _update_info_status(self, quantum_key: QuantumKey, output_run: str) -> QuantumInfoStatus:
+ """Update the status of this quantum across all runs with the status
+ for its latest run.
+
+ Parameters
+ ----------
+ quantum_key : `QuantumKey`
+ Key for the node in the provenance graph.
+ output_run : `str`
+ Output run collection.
+
+ Returns
+ -------
+ info_status : `QuantumInfoStatus`
+ Overall status for this quantum across all runs.
+ """
+ # Now we can start using state transitions to mark overall status.
+ quantum_info = self.get_quantum_info(quantum_key)
+ quantum_run = quantum_info["runs"][output_run]
+ last_status = quantum_info["status"]
+ new_status: QuantumInfoStatus
+ match last_status, quantum_run.status:
+ # A quantum can never escape a WONKY state.
+ case (QuantumInfoStatus.WONKY, _):
+ new_status = QuantumInfoStatus.WONKY
+ # Any transition to a success (excluding from WONKY) is
+ # a success; any transition from a failed state is also a
+ # recovery.
+ case (_, QuantumRunStatus.SUCCESSFUL):
+ new_status = QuantumInfoStatus.SUCCESSFUL
+ if last_status != QuantumInfoStatus.SUCCESSFUL and last_status != QuantumInfoStatus.UNKNOWN:
+ quantum_info["recovered"] = True
+ # Missing logs are one of the categories of wonky quanta. They
+ # interfere with our ability to discern quantum status and are
+ # signs of weird things afoot in processing. Add a message
+ # noting why this quantum is being marked as wonky to be stored
+ # in its `UnsuccessfulQuantumInfo`.
+ case (_, QuantumRunStatus.LOGS_MISSING):
+ new_status = QuantumInfoStatus.WONKY
+ quantum_info["messages"].append(f"Logs missing for run {output_run!r}.")
+ # Leaving a successful state is another category of wonky
+ # quanta. If a previous success fails on a subsequent run,
+ # a human should inspect why. Add a message noting why this
+ # quantum is being marked as wonky to be stored in its
+ # `UnsuccessfulQuantumInfo`.
+ case (QuantumInfoStatus.SUCCESSFUL, _):
+ new_status = QuantumInfoStatus.WONKY
+ quantum_info["messages"].append(
+ f"Status went from successful in run {list(quantum_info['runs'].values())[-1]!r} "
+ f"to {quantum_run.status!r} in {output_run!r}."
+ )
+ # If a quantum status is unknown and it moves to blocked, we
+ # know for sure that it is a blocked quantum.
+ case (QuantumInfoStatus.UNKNOWN, QuantumRunStatus.BLOCKED):
+ new_status = QuantumInfoStatus.BLOCKED
+ # A transition into blocked does not change the overall quantum
+ # status for a failure.
+ case (_, QuantumRunStatus.BLOCKED):
+ new_status = last_status
+ # If a quantum transitions from any state into missing
+ # metadata, we don't have enough information to diagnose its
+ # state.
+ case (_, QuantumRunStatus.METADATA_MISSING):
+ new_status = QuantumInfoStatus.UNKNOWN
+ # Any transition into failure is a failed state.
+ case (_, QuantumRunStatus.FAILED):
+ new_status = QuantumInfoStatus.FAILED
+ # Update `QuantumInfo.status` for this quantum.
+ quantum_info["status"] = new_status
+ return new_status
+
+ def _update_caveats(
+ self,
+ quantum_key: QuantumKey,
+ output_run: str,
+ read_caveats: Literal["lazy", "exhaustive"],
+ executor: concurrent.futures.Executor,
+ futures: list[concurrent.futures.Future[None]],
+ ) -> None:
+ """Read quantum success caveats and exception information from task
+ metadata.
+
+ Parameters
+ ----------
+ quantum_key : `QuantumKey`
+ Key for the node in the provenance graph.
+ output_run : `str`
+ Output run collection.
+ read_caveats : `str`
+ Whether to read metadata files to get flags that describe qualified
+ successes. If "exhaustive", all metadata files will be read. If
+ "lazy", only metadata files where at least one predicted output is
+ missing will be read.
+ executor : `concurrent.futures.Executor`
+ A possibly-parallel executor used to schedule metadata reads.
+ futures : `list` [ `concurrent.futures.Future` ]
+ List to which a future representing the scheduled metadata read
+ will be appended.
+ """
+ if read_caveats == "lazy" and all(
+ self.get_dataset_info(dataset_key)["runs"][output_run].produced
+ for dataset_key in self._xgraph.successors(quantum_key)
+ ):
+ return
+ quantum_info = self.get_quantum_info(quantum_key)
+ quantum_run = quantum_info["runs"][output_run]
+
+ def read_metadata() -> None:
+ md = self._butler_get(quantum_run.metadata_ref, storageClass="TaskMetadata")
+ try:
+ # Int conversion guards against spurious conversion to
+ # float that can apparently sometimes happen in
+ # TaskMetadata.
+ quantum_run.caveats = QuantumSuccessCaveats(int(md["quantum"]["caveats"]))
+ except LookupError:
+ pass
+ try:
+ quantum_run.exception = ExceptionInfo._from_metadata(md[quantum_key.task_label]["failure"])
+ except LookupError:
+ pass
+
+ futures.append(executor.submit(read_metadata))
+
+ def _resolve_duplicates(
  self,
  butler: Butler,
  collections: Sequence[str] | None = None,
@@ -1450,7 +1858,7 @@ class QuantumProvenanceGraph:
  dataset, mark the producer quantum as WONKY.

  This method should be called after
- `QuantumProvenanceGraph.__add_new_graph` has been called on every graph
+ `QuantumProvenanceGraph._add_new_graph` has been called on every graph
  associated with the data processing.

  Parameters
@@ -1458,19 +1866,20 @@ class QuantumProvenanceGraph:
  butler : `lsst.daf.butler.Butler`
  The Butler used for this report. This should match the Butler used
  for the run associated with the executed quantum graph.
-
- collections : `Sequence` [`str`] | `None`
- Collections to use in `lsst.daf.butler.registry.queryDatasets` if
- paring down the query would be useful.
-
+ collections : `~collections.abc.Sequence` [`str`] | `None`
+ Collections to use in `lsst.daf.butler.Butler.query_datasets` when
+ testing which datasets are available at a high level.
  where : `str`
- A "where" string to use to constrain the collections, if passed.
-
+ A "where" string to use to constrain the datasets; should be
+ provided if ``collections`` includes many datasets that are not in
+ any graphs, to select just those that might be (e.g. when sharding
+ over dimensions and using a final collection that spans multiple
+ shards).
  curse_failed_logs : `bool`
  Mark log datasets as CURSED if they are visible in the final
  output collection. Note that a campaign-level collection must be
  used here for `collections` if `curse_failed_logs` is `True`; if
- `__resolve_duplicates` is run on a list of group-level collections
+ `_resolve_duplicates` is run on a list of group-level collections
  then each will only show log datasets from their own failures as
  visible and datasets from others will be marked as cursed.
  """
@@ -1483,14 +1892,17 @@ class QuantumProvenanceGraph:
  been added, or make a new graph with all constituent
  attempts."""
  )
- for dataset_type_name in self._datasets:
+ status_log = PeriodicLogger(_LOG)
+ _LOG.verbose("Querying for dataset visibility.")
+ for m, dataset_type_name in enumerate(self._datasets):
  # find datasets in a larger collection.
- for ref in butler.registry.queryDatasets(
- dataset_type_name,
- collections=collections,
- findFirst=True,
- where=where,
- ):
+ try:
+ refs = butler.query_datasets(
+ dataset_type_name, collections=collections, where=where, limit=None, explain=False
+ )
+ except MissingDatasetTypeError:
+ continue
+ for n, ref in enumerate(refs):
  dataset_key = DatasetKey(ref.datasetType.name, ref.dataId.required_values)
  try:
  dataset_info = self.get_dataset_info(dataset_key)
@@ -1500,9 +1912,16 @@ class QuantumProvenanceGraph:
  continue
  # queryable datasets are `visible`.
  dataset_info["runs"][ref.run].visible = True
-
- for task_quanta in self._quanta.values():
- for quantum_key in task_quanta:
+ status_log.log(
+ "Updated visibility for %s of %s datasets of type %s of %s.",
+ n + 1,
+ len(refs),
+ m + 1,
+ len(self._datasets.keys()),
+ )
+ _LOG.verbose("Updating task status from dataset visibility.")
+ for m, task_quanta in enumerate(self._quanta.values()):
+ for n, quantum_key in enumerate(task_quanta):
  # runs associated with visible datasets.
  visible_runs: set[str] = set()
  quantum_info = self.get_quantum_info(quantum_key)
@@ -1570,168 +1989,118 @@ class QuantumProvenanceGraph:
  + f"from {str(dataset_info['runs'])};"
  + f"{str(dataset_info['status'])}"
  )
+ status_log.log(
+ "Updated task status from visibility for %s of %s quanta of task %s of %s.",
+ n + 1,
+ len(task_quanta),
+ m + 1,
+ len(self._quanta.keys()),
+ )
  # If we make it all the way through resolve_duplicates, set
  # self._finalized = True so that it cannot be run again.
  self._finalized = True

- def assemble_quantum_provenance_graph(
- self,
- butler: Butler,
- qgraphs: Sequence[QuantumGraph | ResourcePathExpression],
- collections: Sequence[str] | None = None,
- where: str = "",
- curse_failed_logs: bool = False,
- read_caveats: Literal["lazy", "exhaustive"] | None = "exhaustive",
- ) -> None:
- """Assemble the quantum provenance graph from a list of all graphs
- corresponding to processing attempts.
+ def _butler_get(self, ref: DatasetRef, **kwargs: Any) -> Any:
+ return self._butler_wrappers[ref.run].butler.get(ref, **kwargs)

- This method calls the private method `__add_new_graph` on each of the
- constituent graphs, verifying that the graphs have been passed in
- order. After `__add_new_graph` has been called on all graphs in the
- `Sequence`, the method calls `__resolve_duplicates`.

- Parameters
- ----------
- butler : `lsst.daf.butler.Butler`
- The Butler used for this report. This should match the Butler used
- for the run associated with the executed quantum graph.
- qgraphs : `Sequence` [`QuantumGraph` | `ResourcePathExpression`]
- A list of either quantum graph objects or their uri's, to be used
- to assemble the `QuantumProvenanceGraph`.
- collections : `Sequence` [`str`] | `None`
- Collections to use in `lsst.daf.butler.registry.queryDatasets` if
- paring down the query would be useful.
- where : `str`
- A "where" string to use to constrain the collections, if passed.
- curse_failed_logs : `bool`
- Mark log datasets as CURSED if they are visible in the final
- output collection. Note that a campaign-level collection must be
- used here for `collections` if `curse_failed_logs` is `True`; if
- `__resolve_duplicates` is run on a list of group-level collections
- then each will only show log datasets from their own failures as
- visible and datasets from others will be marked as cursed.
- read_caveats : `str` or `None`, optional
- Whether to read metadata files to get flags that describe qualified
- successes. If `None`, no metadata files will be read and all
- ``caveats`` fields will be `None`. If "exhaustive", all
- metadata files will be read. If "lazy", only metadata files where
- at least one predicted output is missing will be read.
- """
- if read_caveats not in ("lazy", "exhaustive", None):
- raise TypeError(
- f"Invalid option {read_caveats!r} for read_caveats; should be 'lazy', 'exhaustive', or None."
- )
- output_runs = []
- for graph in qgraphs:
- qgraph = graph if isinstance(graph, QuantumGraph) else QuantumGraph.loadUri(graph)
- assert qgraph.metadata is not None, "Saved QGs always have metadata."
- self.__add_new_graph(butler, qgraph, read_caveats=read_caveats)
- output_runs.append(qgraph.metadata["output_run"])
- # If the user has not passed a `collections` variable
- if not collections:
- # We reverse the order of the associated output runs because the
- # query in __resolve_duplicates must be done most recent-first.
- collections = list(reversed(output_runs))
- assert not curse_failed_logs, (
- "curse_failed_logs option must be used with one campaign-level collection."
- )
- self.__resolve_duplicates(butler, collections, where, curse_failed_logs)
-
- def to_summary(self, butler: Butler, do_store_logs: bool = True) -> Summary:
- """Summarize the `QuantumProvenanceGraph`.
-
- Parameters
- ----------
- butler : `lsst.daf.butler.Butler`
- The Butler used for this report.
- do_store_logs : `bool`
- Store the logs in the summary dictionary.
-
- Returns
- -------
- result : `Summary`
- A struct containing counts of quanta and datasets in each of
- the overall states defined in `QuantumInfo` and `DatasetInfo`,
- as well as diagnostic information and error messages for failed
- quanta and strange edge cases, and a list of recovered quanta.
- """
- if not self._finalized:
- raise RuntimeError(
- """resolve_duplicates must be called to finalize the
- QuantumProvenanceGraph before making a summary."""
- )
- result = Summary()
- for task_label, quanta in self._quanta.items():
- task_summary = TaskSummary()
- task_summary.n_expected = len(quanta)
- for quantum_key in quanta:
- quantum_info = self.get_quantum_info(quantum_key)
- task_summary.add_quantum_info(quantum_info, butler, do_store_logs)
- result.tasks[task_label] = task_summary
-
- for dataset_type_name, datasets in self._datasets.items():
- dataset_type_summary = DatasetTypeSummary(producer="")
- dataset_type_summary.n_expected = len(datasets)
- for dataset_key in datasets:
- dataset_info = self.get_dataset_info(dataset_key)
- producer_key = self.get_producer_of(dataset_key)
- producer_info = self.get_quantum_info(producer_key)
- # Not ideal, but hard to get out of the graph at the moment.
- # Change after DM-40441
- dataset_type_summary.producer = producer_key.task_label
- dataset_type_summary.add_dataset_info(dataset_info, producer_info)
+ class _ThreadLocalButlerWrapper:
+ """A wrapper for a thread-local limited butler.
1684
2009
 
1685
- result.datasets[dataset_type_name] = dataset_type_summary
1686
- return result
2010
+ Parameters
2011
+ ----------
2012
+ factory : `~collections.abc.Callable`
2013
+ A callable that takes no arguments and returns a limited butler.
2014
+ """
1687
2015
 
1688
- def iter_outputs_of(self, quantum_key: QuantumKey) -> Iterator[DatasetKey]:
1689
- """Iterate through the outputs of a quantum, yielding all the
1690
- `DatasetKey`s produced by the quantum.
2016
+ def __init__(self, factory: Callable[[], LimitedButler]):
2017
+ self._factory = factory
2018
+ self._thread_local = threading.local()
1691
2019
 
1692
- Parameters
1693
- ----------
1694
- quantum_key : `QuantumKey`
1695
- The key for the quantum whose outputs are needed.
1696
- """
1697
- yield from self._xgraph.successors(quantum_key)
1698
-
1699
- def get_producer_of(self, dataset_key: DatasetKey) -> QuantumKey:
1700
- """Unpack the predecessor (producer quantum) of a given dataset key
1701
- from a graph.
2020
+ @classmethod
2021
+ def wrap_qbb(cls, full_butler: Butler, qg: QuantumGraph) -> _ThreadLocalButlerWrapper:
2022
+ """Wrap a `~lsst.daf.butler.QuantumBackedButler` suitable for reading
2023
+ log and metadata files.
1702
2024
 
1703
2025
  Parameters
1704
2026
  ----------
1705
- dataset_key : `DatasetKey`
1706
- The key for the dataset whose producer quantum is needed.
2027
+ full_butler : `~lsst.daf.butler.Butler`
2028
+ Full butler to draw datastore and dimension configuration from.
2029
+ qg : `QuantumGraph`
2030
+ Quantum graph whose log and metadata outputs will be readable.
1707
2031
 
1708
2032
  Returns
1709
2033
  -------
1710
- result : `QuantumKey`
1711
- The key for the quantum which produced the dataset.
2034
+ wrapper : `_ThreadLocalButlerWrapper`
2035
+ A wrapper that provides access to a thread-local QBB, constructing
2036
+ it on first use.
1712
2037
  """
1713
- (result,) = self._xgraph.predecessors(dataset_key)
1714
- return result
2038
+ dataset_ids = []
2039
+ for task_label in qg.pipeline_graph.tasks.keys():
2040
+ for quantum in qg.get_task_quanta(task_label).values():
2041
+ dataset_ids.append(quantum.outputs[LOG_OUTPUT_TEMPLATE.format(label=task_label)][0].id)
2042
+ dataset_ids.append(quantum.outputs[METADATA_OUTPUT_TEMPLATE.format(label=task_label)][0].id)
2043
+ try:
2044
+ butler_config = full_butler._config # type: ignore[attr-defined]
2045
+ except AttributeError:
2046
+ raise RuntimeError("use_qbb=True requires a direct butler.") from None
2047
+ factory = _QuantumBackedButlerFactory(
2048
+ butler_config,
2049
+ dataset_ids,
2050
+ full_butler.dimensions,
2051
+ dataset_types={dt.name: dt for dt in qg.registryDatasetTypes()},
2052
+ )
2053
+ return cls(factory)
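`wrap_qbb` seeds the quantum-backed butler with only the log and metadata dataset IDs, because those are the only datasets the report reads. For reference, the automatic connection templates expand per task label roughly as in the sketch below; the exact expansions are an assumption, and "isr" is a hypothetical task label:

from lsst.pipe.base.automatic_connection_constants import (
    LOG_OUTPUT_TEMPLATE,
    METADATA_OUTPUT_TEMPLATE,
)

label = "isr"  # hypothetical task label
log_name = LOG_OUTPUT_TEMPLATE.format(label=label)        # e.g. "isr_log"
meta_name = METADATA_OUTPUT_TEMPLATE.format(label=label)  # e.g. "isr_metadata"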
1715
2054
 
1716
- def iter_downstream(
1717
- self, key: QuantumKey | DatasetKey
1718
- ) -> Iterator[tuple[QuantumKey, QuantumInfo] | tuple[DatasetKey, DatasetInfo]]:
1719
- """Iterate over the quanta and datasets that are downstream of a
1720
- quantum or dataset.
2055
+ @classmethod
2056
+ def wrap_full(cls, full_butler: Butler) -> _ThreadLocalButlerWrapper:
2057
+ """Wrap a full `~lsst.daf.butler.Butler`.
1721
2058
 
1722
2059
  Parameters
1723
2060
  ----------
1724
- key : `QuantumKey` or `DatasetKey`
1725
- Starting node.
2061
+ full_butler : `~lsst.daf.butler.Butler`
2062
+ Full butler to clone when making thread-local copies.
1726
2063
 
1727
2064
  Returns
1728
2065
  -------
1729
- iter : `~collections.abc.Iterator` [ `tuple` ]
1730
- An iterator over pairs of (`QuantumKey`, `QuantumInfo`) or
1731
- (`DatasetKey`, `DatasetInfo`).
2066
+ wrapper : `_ThreadLocalButlerWrapper`
2067
+ A wrapper that provides access to a thread-local butler,
2068
+ constructing it on first use.
1732
2069
  """
1733
- for key in networkx.dag.descendants(self._xgraph, key):
1734
- yield (key, self._xgraph.nodes[key]) # type: ignore
2070
+ return cls(full_butler.clone)
2071
+
2072
+ @property
2073
+ def butler(self) -> LimitedButler:
2074
+ """The wrapped butler, constructed on first use within each thread."""
2075
+ if (butler := getattr(self._thread_local, "butler", None)) is None:
2076
+ self._thread_local.butler = self._factory()
2077
+ butler = self._thread_local.butler
2078
+ return butler
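The property above is the standard lazy per-thread cache: `threading.local()` gives every thread its own attribute namespace, so the factory runs at most once per thread and no locking is required. A self-contained sketch of the same pattern with generic names (nothing below is part of the package):

import threading

class ThreadLocalResource:
    def __init__(self, factory):
        self._factory = factory
        self._local = threading.local()

    @property
    def resource(self):
        # getattr misses the first time each thread looks, so the factory
        # is invoked once per thread and the result is cached thereafter.
        if (obj := getattr(self._local, "obj", None)) is None:
            obj = self._local.obj = self._factory()
        return obj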
2079
+
2080
+
2081
+ @dataclasses.dataclass
2082
+ class _QuantumBackedButlerFactory:
2083
+ """A factory for `~lsst.daf.butler.QuantumBackedButler`, for use by
2084
+ `_ThreadLocalButlerWrapper`.
2085
+ """
2086
+
2087
+ config: ButlerConfig
2088
+ dataset_ids: list[DatasetId]
2089
+ universe: DimensionUniverse
2090
+ dataset_types: dict[str, DatasetType]
2091
+
2092
+ def __call__(self) -> QuantumBackedButler:
2093
+ return QuantumBackedButler.from_predicted(
2094
+ self.config,
2095
+ predicted_inputs=self.dataset_ids,
2096
+ predicted_outputs=[],
2097
+ dimensions=self.universe,
2098
+ # We don't need the datastore records in the QG because we're
2099
+ # only going to read metadata and logs, and those are never
2100
+ # overall inputs.
2101
+ datastore_records={},
2102
+ dataset_types=self.dataset_types,
2103
+ )
1735
2104
 
1736
2105
 
1737
2106
  def _cli() -> None: