lsst-pipe-base 30.2026.200__py3-none-any.whl → 30.2026.400__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/pipe/base/_instrument.py +10 -12
- lsst/pipe/base/_status.py +29 -10
- lsst/pipe/base/automatic_connection_constants.py +9 -1
- lsst/pipe/base/cli/cmd/__init__.py +16 -2
- lsst/pipe/base/cli/cmd/commands.py +42 -4
- lsst/pipe/base/connectionTypes.py +72 -160
- lsst/pipe/base/connections.py +3 -6
- lsst/pipe/base/execution_reports.py +0 -5
- lsst/pipe/base/log_capture.py +8 -4
- lsst/pipe/base/log_on_close.py +79 -0
- lsst/pipe/base/mp_graph_executor.py +51 -15
- lsst/pipe/base/pipeline.py +3 -4
- lsst/pipe/base/pipelineIR.py +0 -6
- lsst/pipe/base/pipelineTask.py +5 -7
- lsst/pipe/base/pipeline_graph/_edges.py +19 -7
- lsst/pipe/base/pipeline_graph/_pipeline_graph.py +8 -0
- lsst/pipe/base/quantum_graph/_common.py +7 -4
- lsst/pipe/base/quantum_graph/_multiblock.py +6 -16
- lsst/pipe/base/quantum_graph/_predicted.py +111 -10
- lsst/pipe/base/quantum_graph/_provenance.py +727 -26
- lsst/pipe/base/quantum_graph/aggregator/_communicators.py +26 -50
- lsst/pipe/base/quantum_graph/aggregator/_config.py +78 -9
- lsst/pipe/base/quantum_graph/aggregator/_ingester.py +12 -11
- lsst/pipe/base/quantum_graph/aggregator/_scanner.py +48 -234
- lsst/pipe/base/quantum_graph/aggregator/_structs.py +6 -116
- lsst/pipe/base/quantum_graph/aggregator/_supervisor.py +24 -18
- lsst/pipe/base/quantum_graph/aggregator/_writer.py +33 -350
- lsst/pipe/base/quantum_graph/formatter.py +171 -0
- lsst/pipe/base/quantum_graph/ingest_graph.py +356 -0
- lsst/pipe/base/quantum_graph_executor.py +116 -13
- lsst/pipe/base/quantum_provenance_graph.py +17 -2
- lsst/pipe/base/separable_pipeline_executor.py +18 -2
- lsst/pipe/base/single_quantum_executor.py +59 -41
- lsst/pipe/base/struct.py +4 -0
- lsst/pipe/base/version.py +1 -1
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/METADATA +2 -1
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/RECORD +45 -42
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/WHEEL +1 -1
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/entry_points.txt +0 -0
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/licenses/LICENSE +0 -0
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/top_level.txt +0 -0
- {lsst_pipe_base-30.2026.200.dist-info → lsst_pipe_base-30.2026.400.dist-info}/zip-safe +0 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
# This file is part of pipe_base.
|
|
2
|
+
#
|
|
3
|
+
# Developed for the LSST Data Management System.
|
|
4
|
+
# This product includes software developed by the LSST Project
|
|
5
|
+
# (http://www.lsst.org).
|
|
6
|
+
# See the COPYRIGHT file at the top-level directory of this distribution
|
|
7
|
+
# for details of code ownership.
|
|
8
|
+
#
|
|
9
|
+
# This software is dual licensed under the GNU General Public License and also
|
|
10
|
+
# under a 3-clause BSD license. Recipients may choose which of these licenses
|
|
11
|
+
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
|
|
12
|
+
# respectively. If you choose the GPL option then the following text applies
|
|
13
|
+
# (but note that there is still no warranty even if you opt for BSD instead):
|
|
14
|
+
#
|
|
15
|
+
# This program is free software: you can redistribute it and/or modify
|
|
16
|
+
# it under the terms of the GNU General Public License as published by
|
|
17
|
+
# the Free Software Foundation, either version 3 of the License, or
|
|
18
|
+
# (at your option) any later version.
|
|
19
|
+
#
|
|
20
|
+
# This program is distributed in the hope that it will be useful,
|
|
21
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
22
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
23
|
+
# GNU General Public License for more details.
|
|
24
|
+
#
|
|
25
|
+
# You should have received a copy of the GNU General Public License
|
|
26
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
27
|
+
|
|
28
|
+
"""A tool for ingesting provenance quantum graphs (written by the `aggregator`
|
|
29
|
+
module) and [re-]ingesting other datasets (metadata/logs/configs) backed by the
|
|
30
|
+
same file. This "finalizes" the RUN collection, prohibiting (at least
|
|
31
|
+
conceptually) further processing.
|
|
32
|
+
|
|
33
|
+
This always proceeds in three steps, so we can resume efficiently:
|
|
34
|
+
|
|
35
|
+
1. First we ask the butler to "forget" any metadata/log/config datasets that
|
|
36
|
+
exist in the output RUN collection, removing any record of them from the
|
|
37
|
+
butler database while preserving their files.
|
|
38
|
+
|
|
39
|
+
2. Next we ingest the ``run_provenance`` graph dataset itself.
|
|
40
|
+
|
|
41
|
+
3. Finally, in batches of quanta, we use a
|
|
42
|
+
`~lsst.daf.butler.QuantumBackedButler` to delete the original
|
|
43
|
+
metadata/log/config files and ingest new versions of those datasets into the
|
|
44
|
+
butler.
|
|
45
|
+
|
|
46
|
+
Thus, at any point, if the ``run_provenance`` dataset has not been ingested,
|
|
47
|
+
we know any metadata/log/config datasets that have been ingested are backed by
|
|
48
|
+
the original files.
|
|
49
|
+
|
|
50
|
+
Moreover, if the ``run_provenance`` dataset has been ingested, any existing
|
|
51
|
+
metadata/log/config datasets must be backed by the graph file, and the original
|
|
52
|
+
files for those datasets will have been deleted.
|
|
53
|
+
|
|
54
|
+
We also know that at all times the metadata/log/config *content* is safely
|
|
55
|
+
present in either the original files in the butler storage or in an
|
|
56
|
+
already-ingested ``run_provenance`` dataset.
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
from __future__ import annotations
|
|
60
|
+
|
|
61
|
+
__all__ = ("ingest_graph",)
|
|
62
|
+
|
|
63
|
+
import dataclasses
|
|
64
|
+
import itertools
|
|
65
|
+
import uuid
|
|
66
|
+
from collections.abc import Iterator
|
|
67
|
+
from contextlib import contextmanager
|
|
68
|
+
|
|
69
|
+
from lsst.daf.butler import (
|
|
70
|
+
Butler,
|
|
71
|
+
Config,
|
|
72
|
+
DataCoordinate,
|
|
73
|
+
DatasetRef,
|
|
74
|
+
DatasetType,
|
|
75
|
+
FileDataset,
|
|
76
|
+
QuantumBackedButler,
|
|
77
|
+
)
|
|
78
|
+
from lsst.daf.butler.registry.sql_registry import SqlRegistry
|
|
79
|
+
from lsst.resources import ResourcePath, ResourcePathExpression
|
|
80
|
+
from lsst.utils.logging import getLogger
|
|
81
|
+
|
|
82
|
+
from ..automatic_connection_constants import PROVENANCE_DATASET_TYPE_NAME, PROVENANCE_STORAGE_CLASS
|
|
83
|
+
from ._provenance import (
|
|
84
|
+
ProvenanceDatasetInfo,
|
|
85
|
+
ProvenanceInitQuantumInfo,
|
|
86
|
+
ProvenanceQuantumGraph,
|
|
87
|
+
ProvenanceQuantumGraphReader,
|
|
88
|
+
ProvenanceQuantumInfo,
|
|
89
|
+
)
|
|
90
|
+
from .formatter import ProvenanceFormatter
|
|
91
|
+
|
|
92
|
+
_LOG = getLogger(__name__)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def ingest_graph(
    butler_config: str | Config,
    uri: ResourcePathExpression | None = None,
    *,
    transfer: str | None = "move",
    batch_size: int = 10000,
    output_run: str | None = None,
) -> None:
    """Ingest a provenance graph into a butler repository.

    Parameters
    ----------
    butler_config : `str`
        Path or alias for the butler repository, or a butler repository config
        object.
    uri : convertible to `lsst.resources.ResourcePath` or `None`, optional
        Location of the provenance quantum graph to ingest. `None` indicates
        that the quantum graph has already been ingested, but other ingests
        and/or deletions failed and need to be resumed.
    transfer : `str` or `None`, optional
        Transfer mode forwarded to `~lsst.daf.butler.Butler.ingest` when
        ingesting the provenance graph file itself. Defaults to ``"move"``.
    batch_size : `int`, optional
        Number of datasets to process in each transaction.
    output_run : `str`, optional
        Output `~lsst.daf.butler.CollectionType.RUN` collection name. Only
        needs to be provided if ``uri`` is `None`. If it is provided the
        output run in the graph is checked against it.

    Notes
    -----
    After this operation, no further processing may be done in the
    `~lsst.daf.butler.CollectionType.RUN` collection.

    If this process is interrupted, it can pick up where it left off if run
    again (at the cost of some duplicate work to figure out how much progress
    it had made).
    """
    with _GraphIngester.open(butler_config, uri, output_run) as helper:
        helper.fetch_already_ingested_datasets()
        if not helper.graph_already_ingested:
            # A uri must have been given, or read_graph would have found the
            # already-ingested graph dataset in the butler.
            assert uri is not None
            helper.forget_ingested_datasets(batch_size=batch_size)
            helper.ingest_graph_dataset(uri, transfer=transfer)
        helper.clean_and_reingest_datasets(batch_size=batch_size)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@dataclasses.dataclass
class _GraphIngester:
    """Helper that implements the resumable three-step ingest described in
    the module docstring: forget original-file-backed datasets, ingest the
    provenance graph, then delete original files and re-ingest the datasets
    backed by the graph file.
    """

    # Path/alias/config for the butler repository; also used to construct a
    # QuantumBackedButler in `make_qbb`.
    butler_config: str | Config
    # Writeable full butler, with its default collections set to output_run.
    butler: Butler
    # The provenance graph being ingested (quanta and init-quanta loaded).
    graph: ProvenanceQuantumGraph
    # Whether the ``run_provenance`` dataset already exists in the registry.
    graph_already_ingested: bool
    # Total number of metadata/log/config datasets, for progress reporting.
    n_datasets: int
    # IDs of datasets currently registered in the output RUN collection.
    datasets_already_ingested: set[uuid.UUID] = dataclasses.field(default_factory=set)

    @property
    def output_run(self) -> str:
        """The output RUN collection name recorded in the graph header."""
        return self.graph.header.output_run

    @classmethod
    @contextmanager
    def open(
        cls,
        butler_config: str | Config,
        uri: ResourcePathExpression | None,
        output_run: str | None,
    ) -> Iterator[_GraphIngester]:
        """Construct an ingester as a context manager, opening a writeable
        butler and reading the provenance graph.

        Parameters
        ----------
        butler_config : `str` or `lsst.daf.butler.Config`
            Path or alias for the butler repository, or a repository config.
        uri : convertible to `lsst.resources.ResourcePath` or `None`
            Location of the provenance graph file, or `None` if it has
            already been ingested.
        output_run : `str` or `None`
            Expected output RUN collection name; checked against the graph
            header if not `None`.

        Yields
        ------
        ingester : `_GraphIngester`
            Fully initialized ingester helper.

        Raises
        ------
        ValueError
            Raised if ``output_run`` disagrees with the graph header.
        """
        with Butler.from_config(butler_config, collections=output_run, writeable=True) as butler:
            butler.registry.registerDatasetType(
                DatasetType(PROVENANCE_DATASET_TYPE_NAME, butler.dimensions.empty, PROVENANCE_STORAGE_CLASS)
            )
            graph, graph_already_ingested = cls.read_graph(butler, uri)
            if output_run is not None and graph.header.output_run != output_run:
                raise ValueError(
                    f"Given output run {output_run!r} does not match the graph "
                    f"header {graph.header.output_run!r}."
                )
            # Each quantum has a metadata and a log dataset; each init-quantum
            # has a config dataset.
            n_datasets = 2 * len(graph.quantum_only_xgraph) + len(graph.init_quanta)
            yield cls(
                butler_config=butler_config,
                butler=butler,
                graph=graph,
                graph_already_ingested=graph_already_ingested,
                n_datasets=n_datasets,
            )

    @staticmethod
    def read_graph(
        butler: Butler,
        uri: ResourcePathExpression | None,
    ) -> tuple[ProvenanceQuantumGraph, bool]:
        """Read the provenance graph, either from an external file or from
        the butler itself.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Butler to query (and, when ``uri`` is `None`, read from).
        uri : convertible to `lsst.resources.ResourcePath` or `None`
            Location of the graph file, or `None` if already ingested.

        Returns
        -------
        graph : `ProvenanceQuantumGraph`
            The provenance graph with quanta and init-quanta loaded.
        already_ingested : `bool`
            Whether the graph dataset already exists in the output run.
        """
        if uri is not None:
            _LOG.info("Reading the pre-ingest provenance graph.")
            with ProvenanceQuantumGraphReader.open(uri) as reader:
                reader.read_quanta()
                reader.read_init_quanta()
                graph = reader.graph
            already_ingested = (
                butler.find_dataset(PROVENANCE_DATASET_TYPE_NAME, collections=[graph.header.output_run])
                is not None
            )
            return graph, already_ingested
        else:
            _LOG.info("Reading the already-ingested provenance graph.")
            # Skip per-dataset provenance; we only need quanta/init-quanta.
            parameters = {"datasets": [], "read_init_quanta": True}
            return butler.get(PROVENANCE_DATASET_TYPE_NAME, parameters=parameters), True

    def fetch_already_ingested_datasets(self) -> None:
        """Populate `datasets_already_ingested` from the registry's records
        for the output RUN collection.
        """
        _LOG.info("Querying for existing datasets in %r.", self.output_run)
        self.datasets_already_ingested.update(self.butler.registry._fetch_run_dataset_ids(self.output_run))

    def iter_datasets(self) -> Iterator[tuple[uuid.UUID, ProvenanceDatasetInfo]]:
        """Iterate over all metadata, log, and config dataset nodes in the
        graph, yielding ``(dataset_id, dataset_info)`` pairs.
        """
        xgraph = self.graph.bipartite_xgraph
        for task_label, quanta_for_task in self.graph.quanta_by_task.items():
            _LOG.verbose(
                "Batching up metadata and log datasets from %d %s quanta.", len(quanta_for_task), task_label
            )
            for quantum_id in quanta_for_task.values():
                quantum_info: ProvenanceQuantumInfo = xgraph.nodes[quantum_id]
                metadata_id = quantum_info["metadata_id"]
                yield metadata_id, xgraph.nodes[metadata_id]
                log_id = quantum_info["log_id"]
                yield log_id, xgraph.nodes[log_id]
        _LOG.verbose("Batching up config datasets from %d tasks.", len(self.graph.init_quanta))
        # The task label keys are not needed here, only the quantum IDs.
        for quantum_id in self.graph.init_quanta.values():
            init_quantum_info: ProvenanceInitQuantumInfo = xgraph.nodes[quantum_id]
            config_id = init_quantum_info["config_id"]
            yield config_id, xgraph.nodes[config_id]

    def forget_ingested_datasets(self, batch_size: int) -> None:
        """Step 1: drop registry/datastore records for datasets still backed
        by their original files, in batches.

        Parameters
        ----------
        batch_size : `int`
            Number of datasets to forget per transaction.
        """
        _LOG.info(
            "Dropping database records for metadata/log/config datasets backed by their original files."
        )
        to_forget: list[DatasetRef] = []
        n_forgotten: int = 0
        n_skipped: int = 0
        for dataset_id, dataset_info in self.iter_datasets():
            if dataset_info["produced"] and dataset_id in self.datasets_already_ingested:
                to_forget.append(self._make_ref_from_info(dataset_id, dataset_info))
                self.datasets_already_ingested.remove(dataset_id)
                if len(to_forget) >= batch_size:
                    n_forgotten += self._run_forget(to_forget, n_forgotten + n_skipped)
            else:
                n_skipped += 1
        # Flush any final partial batch.
        n_forgotten += self._run_forget(to_forget, n_forgotten + n_skipped)
        _LOG.info(
            "Removed database records for %d metadata/log/config datasets, while %d were already absent.",
            n_forgotten,
            n_skipped,
        )

    def _run_forget(self, to_forget: list[DatasetRef], n_current: int) -> int:
        """Forget one batch of datasets in a single transaction and clear the
        batch list, returning the number forgotten.
        """
        if to_forget:
            _LOG.verbose(
                "Forgetting a %d-dataset batch; %d/%d forgotten so far or already absent.",
                len(to_forget),
                n_current,
                self.n_datasets,
            )
            with self.butler.registry.transaction():
                self.butler._datastore.forget(to_forget)
                self.butler.registry.removeDatasets(to_forget)
        n = len(to_forget)
        to_forget.clear()
        return n

    def ingest_graph_dataset(self, uri: ResourcePathExpression, transfer: str | None) -> None:
        """Step 2: ingest the provenance graph file as a butler dataset.

        Parameters
        ----------
        uri : convertible to `lsst.resources.ResourcePath`
            Location of the provenance graph file.
        transfer : `str` or `None`
            Transfer mode forwarded to `~lsst.daf.butler.Butler.ingest`.
        """
        _LOG.info("Ingesting the provenance quantum graph.")
        dataset_type = DatasetType(
            PROVENANCE_DATASET_TYPE_NAME, self.butler.dimensions.empty, PROVENANCE_STORAGE_CLASS
        )
        self.butler.registry.registerDatasetType(dataset_type)
        ref = DatasetRef(dataset_type, DataCoordinate.make_empty(self.butler.dimensions), run=self.output_run)
        uri = ResourcePath(uri)
        self.butler.ingest(
            # We use .abspath() since butler assumes paths are relative to the
            # repo root, while users expect them to be relative to the CWD in
            # this context.
            FileDataset(refs=[ref], path=uri.abspath(), formatter=ProvenanceFormatter),
            transfer=transfer,
        )

    def clean_and_reingest_datasets(self, batch_size: int) -> None:
        """Step 3: delete original metadata/log/config files and re-ingest
        those datasets backed by the provenance graph file, in batches.

        Parameters
        ----------
        batch_size : `int`
            Number of datasets to process per batch.
        """
        _LOG.info(
            "Deleting original metadata/log/config files and re-ingesting them with provenance graph backing."
        )
        direct_uri = self.butler.getURI(PROVENANCE_DATASET_TYPE_NAME, collections=[self.output_run])
        qbb = self.make_qbb()
        to_process: list[DatasetRef] = []
        n_processed: int = 0
        n_skipped: int = 0
        n_not_produced: int = 0
        for dataset_id, dataset_info in self.iter_datasets():
            if not dataset_info["produced"]:
                n_not_produced += 1
            elif dataset_id not in self.datasets_already_ingested:
                to_process.append(self._make_ref_from_info(dataset_id, dataset_info))
                if len(to_process) >= batch_size:
                    n_processed += self._run_clean_and_ingest(
                        qbb, direct_uri, to_process, n_processed + n_skipped
                    )
            else:
                n_skipped += 1
        # Flush any final partial batch.
        n_processed += self._run_clean_and_ingest(qbb, direct_uri, to_process, n_processed + n_skipped)
        _LOG.info(
            "Deleted and re-ingested %d metadata/log/config datasets "
            "(%d had already been processed, %d were not produced).",
            n_processed,
            n_skipped,
            n_not_produced,
        )

    def _run_clean_and_ingest(
        self, qbb: QuantumBackedButler, direct_uri: ResourcePath, to_process: list[DatasetRef], n_current: int
    ) -> int:
        """Delete original files for one batch of datasets and re-ingest them
        with the provenance graph file as their backing, returning the number
        processed.
        """
        if not to_process:
            return 0
        _LOG.verbose(
            # Fixed wording: this step both deletes originals and re-ingests.
            "Deleting and re-ingesting a %d-dataset batch; %d/%d complete.",
            len(to_process),
            n_current,
            self.n_datasets,
        )
        sql_registry: SqlRegistry = self.butler._registry  # type: ignore[attr-defined]
        expanded_refs = sql_registry.expand_refs(to_process)
        # We need to pass predict=True to keep QBB/FileDatastore from wasting
        # time doing existence checks, since ResourcePath.mremove will ignore
        # nonexistent files anyway.
        original_uris = list(
            itertools.chain.from_iterable(
                ref_uris.iter_all() for ref_uris in qbb.get_many_uris(expanded_refs, predict=True).values()
            )
        )
        removal_status = ResourcePath.mremove(original_uris, do_raise=False)
        for path, status in removal_status.items():
            # Missing files are expected on resume; anything else is fatal.
            if not status.success and not isinstance(status.exception, FileNotFoundError):
                assert status.exception is not None, "Exception should be set if success=False."
                status.exception.add_note(f"Attempting to delete original file at {path}.")
                raise status.exception
        file_dataset = FileDataset(refs=expanded_refs, path=direct_uri, formatter=ProvenanceFormatter)
        self.butler.ingest(file_dataset, transfer=None)
        n = len(to_process)
        to_process.clear()
        return n

    @staticmethod
    def _make_ref_from_info(dataset_id: uuid.UUID, dataset_info: ProvenanceDatasetInfo) -> DatasetRef:
        """Build a `~lsst.daf.butler.DatasetRef` from a graph dataset node."""
        return DatasetRef(
            dataset_info["pipeline_node"].dataset_type,
            dataset_info["data_id"],
            run=dataset_info["run"],
            id=dataset_id,
        )

    def make_qbb(self) -> QuantumBackedButler:
        """Construct a `~lsst.daf.butler.QuantumBackedButler` that can
        resolve URIs for the graph's dataset types without registry access.
        """
        dataset_types = {d.name: d.dataset_type for d in self.graph.pipeline_graph.dataset_types.values()}
        return QuantumBackedButler.from_predicted(
            config=self.butler_config,
            predicted_inputs=(),
            predicted_outputs=(),
            dimensions=self.butler.dimensions,
            datastore_records={},
            dataset_types=dataset_types,
        )
|
|
@@ -27,23 +27,113 @@
|
|
|
27
27
|
|
|
28
28
|
from __future__ import annotations
|
|
29
29
|
|
|
30
|
-
__all__ = ["QuantumExecutor", "QuantumGraphExecutor"]
|
|
30
|
+
__all__ = ["QuantumExecutionResult", "QuantumExecutor", "QuantumGraphExecutor"]
|
|
31
31
|
|
|
32
32
|
from abc import ABC, abstractmethod
|
|
33
|
-
from typing import TYPE_CHECKING
|
|
33
|
+
from typing import TYPE_CHECKING, Self
|
|
34
|
+
|
|
35
|
+
from lsst.daf.butler import Quantum
|
|
34
36
|
|
|
35
37
|
from .quantum_reports import QuantumReport, Report
|
|
36
38
|
|
|
37
39
|
if TYPE_CHECKING:
|
|
38
40
|
import uuid
|
|
39
41
|
|
|
40
|
-
from lsst.daf.butler import
|
|
42
|
+
from lsst.daf.butler.logging import ButlerLogRecords
|
|
41
43
|
|
|
44
|
+
from ._task_metadata import TaskMetadata
|
|
42
45
|
from .graph import QuantumGraph
|
|
43
46
|
from .pipeline_graph import TaskNode
|
|
44
47
|
from .quantum_graph import PredictedQuantumGraph
|
|
45
48
|
|
|
46
49
|
|
|
50
|
+
class QuantumExecutionResult(tuple[Quantum, QuantumReport | None]):
    """A result struct that captures information about a single quantum's
    execution.

    Parameters
    ----------
    quantum : `lsst.daf.butler.Quantum`
        Quantum that was executed.
    report : `.quantum_reports.QuantumReport`
        Report with basic information about the execution.
    task_metadata : `TaskMetadata`, optional
        Metadata saved by the task and executor during execution.
    skipped_existing : `bool`, optional
        If `True`, this quantum was not executed because it appeared to have
        already been executed successfully.
    adjusted_no_work : `bool`, optional
        If `True`, this quantum was not executed because the
        `PipelineTaskConnections.adjustQuanta` hook raised `NoWorkFound`.

    Notes
    -----
    For backwards compatibility, this class is a two-element tuple that allows
    the ``quantum`` and ``report`` attributes to be unpacked. Additional
    regular attributes may be added by executors (but the tuple must remain
    only two elements to enable the current unpacking interface).
    """

    def __new__(
        cls,
        quantum: Quantum,
        report: QuantumReport | None,
        *,
        task_metadata: TaskMetadata | None = None,
        skipped_existing: bool | None = None,
        adjusted_no_work: bool | None = None,
    ) -> Self:
        # Only (quantum, report) go into the tuple itself; the keyword-only
        # arguments are stored as regular attributes by __init__ below.
        return super().__new__(cls, (quantum, report))

    # We need to define both __init__ and __new__ because tuple inheritance
    # requires __new__ and numpydoc requires __init__.

    def __init__(
        self,
        quantum: Quantum,
        report: QuantumReport | None,
        *,
        task_metadata: TaskMetadata | None = None,
        skipped_existing: bool | None = None,
        adjusted_no_work: bool | None = None,
    ):
        # quantum and report are handled by __new__ (tuple is immutable).
        self._task_metadata = task_metadata
        self._skipped_existing = skipped_existing
        self._adjusted_no_work = adjusted_no_work

    @property
    def quantum(self) -> Quantum:
        """The quantum actually executed."""
        return self[0]

    @property
    def report(self) -> QuantumReport | None:
        """Structure describing the status of the execution of a quantum.

        This is `None` if the implementation does not support this feature.
        """
        return self[1]

    @property
    def task_metadata(self) -> TaskMetadata | None:
        """Metadata saved by the task and executor during execution."""
        return self._task_metadata

    @property
    def skipped_existing(self) -> bool | None:
        """If `True`, this quantum was not executed because it appeared to have
        already been executed successfully.
        """
        return self._skipped_existing

    @property
    def adjusted_no_work(self) -> bool | None:
        """If `True`, this quantum was not executed because the
        `PipelineTaskConnections.adjustQuanta` hook raised `NoWorkFound`.
        """
        return self._adjusted_no_work
|
|
135
|
+
|
|
136
|
+
|
|
47
137
|
class QuantumExecutor(ABC):
|
|
48
138
|
"""Class which abstracts execution of a single Quantum.
|
|
49
139
|
|
|
@@ -55,8 +145,14 @@ class QuantumExecutor(ABC):
|
|
|
55
145
|
|
|
56
146
|
@abstractmethod
|
|
57
147
|
def execute(
|
|
58
|
-
self,
|
|
59
|
-
|
|
148
|
+
self,
|
|
149
|
+
task_node: TaskNode,
|
|
150
|
+
/,
|
|
151
|
+
quantum: Quantum,
|
|
152
|
+
quantum_id: uuid.UUID | None = None,
|
|
153
|
+
*,
|
|
154
|
+
log_records: ButlerLogRecords | None = None,
|
|
155
|
+
) -> QuantumExecutionResult:
|
|
60
156
|
"""Execute single quantum.
|
|
61
157
|
|
|
62
158
|
Parameters
|
|
@@ -67,15 +163,18 @@ class QuantumExecutor(ABC):
|
|
|
67
163
|
Quantum for this execution.
|
|
68
164
|
quantum_id : `uuid.UUID` or `None`, optional
|
|
69
165
|
The ID of the quantum to be executed.
|
|
166
|
+
log_records : `lsst.daf.butler.ButlerLogRecords`, optional
|
|
167
|
+
Container that should be used to store logs in memory before
|
|
168
|
+
writing them to the butler. This disables streaming log (since
|
|
169
|
+
we'd have to store them in memory anyway), but it permits the
|
|
170
|
+
caller to prepend logs to be stored in the butler and allows task
|
|
171
|
+
logs to be inspected by the caller after execution is complete.
|
|
70
172
|
|
|
71
173
|
Returns
|
|
72
174
|
-------
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
Structure describing the status of the execution of a quantum.
|
|
77
|
-
`None` is returned if implementation does not support this
|
|
78
|
-
feature.
|
|
175
|
+
result : `QuantumExecutionResult`
|
|
176
|
+
Result struct. May also be unpacked as a 2-tuple (see type
|
|
177
|
+
documentation).
|
|
79
178
|
|
|
80
179
|
Notes
|
|
81
180
|
-----
|
|
@@ -93,7 +192,9 @@ class QuantumGraphExecutor(ABC):
|
|
|
93
192
|
"""
|
|
94
193
|
|
|
95
194
|
@abstractmethod
|
|
96
|
-
def execute(
|
|
195
|
+
def execute(
|
|
196
|
+
self, graph: QuantumGraph | PredictedQuantumGraph, *, provenance_graph_file: str | None = None
|
|
197
|
+
) -> None:
|
|
97
198
|
"""Execute whole graph.
|
|
98
199
|
|
|
99
200
|
Implementation of this method depends on particular execution model
|
|
@@ -103,8 +204,10 @@ class QuantumGraphExecutor(ABC):
|
|
|
103
204
|
|
|
104
205
|
Parameters
|
|
105
206
|
----------
|
|
106
|
-
graph : `.QuantumGraph`
|
|
207
|
+
graph : `.QuantumGraph` or `.quantum_graph.PredictedQuantumGraph`
|
|
107
208
|
Execution graph.
|
|
209
|
+
provenance_graph_file : `str`, optional
|
|
210
|
+
A filename to write provenance to.
|
|
108
211
|
"""
|
|
109
212
|
raise NotImplementedError()
|
|
110
213
|
|
|
@@ -79,6 +79,7 @@ from .automatic_connection_constants import (
|
|
|
79
79
|
METADATA_OUTPUT_CONNECTION_NAME,
|
|
80
80
|
METADATA_OUTPUT_STORAGE_CLASS,
|
|
81
81
|
METADATA_OUTPUT_TEMPLATE,
|
|
82
|
+
PROVENANCE_DATASET_TYPE_NAME,
|
|
82
83
|
)
|
|
83
84
|
from .graph import QuantumGraph, QuantumNode
|
|
84
85
|
|
|
@@ -1513,8 +1514,22 @@ class QuantumProvenanceGraph:
|
|
|
1513
1514
|
len(self._datasets.keys()),
|
|
1514
1515
|
)
|
|
1515
1516
|
if use_qbb:
|
|
1516
|
-
|
|
1517
|
-
|
|
1517
|
+
provenance_graph_ref: DatasetRef | None = None
|
|
1518
|
+
try:
|
|
1519
|
+
provenance_graph_ref = butler.find_dataset(
|
|
1520
|
+
PROVENANCE_DATASET_TYPE_NAME, collections=output_run
|
|
1521
|
+
)
|
|
1522
|
+
except MissingDatasetTypeError:
|
|
1523
|
+
pass
|
|
1524
|
+
if provenance_graph_ref is not None:
|
|
1525
|
+
_LOG.warning(
|
|
1526
|
+
"Cannot use QBB for metadata/log reads after provenance has been ingested; "
|
|
1527
|
+
"falling back to full butler."
|
|
1528
|
+
)
|
|
1529
|
+
self._butler_wrappers[output_run] = _ThreadLocalButlerWrapper.wrap_full(butler)
|
|
1530
|
+
else:
|
|
1531
|
+
_LOG.verbose("Using quantum-backed butler for metadata loads.")
|
|
1532
|
+
self._butler_wrappers[output_run] = _ThreadLocalButlerWrapper.wrap_qbb(butler, qgraph)
|
|
1518
1533
|
else:
|
|
1519
1534
|
_LOG.verbose("Using full butler for metadata loads.")
|
|
1520
1535
|
self._butler_wrappers[output_run] = _ThreadLocalButlerWrapper.wrap_full(butler)
|
|
@@ -40,7 +40,8 @@ from collections.abc import Iterable
|
|
|
40
40
|
from typing import Any
|
|
41
41
|
|
|
42
42
|
import lsst.resources
|
|
43
|
-
from lsst.daf.butler import Butler
|
|
43
|
+
from lsst.daf.butler import Butler, DatasetRef
|
|
44
|
+
from lsst.daf.butler._rubin.temporary_for_ingest import TemporaryForIngest
|
|
44
45
|
|
|
45
46
|
from ._quantumContext import ExecutionResources
|
|
46
47
|
from .all_dimensions_quantum_graph_builder import AllDimensionsQuantumGraphBuilder
|
|
@@ -362,6 +363,8 @@ class SeparablePipelineExecutor:
|
|
|
362
363
|
fail_fast: bool = False,
|
|
363
364
|
graph_executor: QuantumGraphExecutor | None = None,
|
|
364
365
|
num_proc: int = 1,
|
|
366
|
+
*,
|
|
367
|
+
provenance_dataset_ref: DatasetRef | None = None,
|
|
365
368
|
) -> None:
|
|
366
369
|
"""Run a pipeline in the form of a prepared quantum graph.
|
|
367
370
|
|
|
@@ -384,6 +387,14 @@ class SeparablePipelineExecutor:
|
|
|
384
387
|
The number of processes that can be used to run the pipeline. The
|
|
385
388
|
default value ensures that no subprocess is created. Only used with
|
|
386
389
|
the default graph executor.
|
|
390
|
+
provenance_dataset_ref : `lsst.daf.butler.DatasetRef`, optional
|
|
391
|
+
Dataset that should be used to save provenance. Provenance is only
|
|
392
|
+
supported when running in a single process (at least for the
|
|
393
|
+
default quantum executor), and should not be used with
|
|
394
|
+
``skip_existing_in=[output_run]`` when retrying a previous
|
|
395
|
+
execution attempt. The caller is responsible for registering the
|
|
396
|
+
dataset type and for ensuring that the dimensions of this dataset
|
|
397
|
+
do not lead to uniqueness conflicts.
|
|
387
398
|
"""
|
|
388
399
|
if not graph_executor:
|
|
389
400
|
quantum_executor = SingleQuantumExecutor(
|
|
@@ -404,4 +415,9 @@ class SeparablePipelineExecutor:
|
|
|
404
415
|
# forked processes.
|
|
405
416
|
self._butler.registry.resetConnectionPool()
|
|
406
417
|
|
|
407
|
-
|
|
418
|
+
if provenance_dataset_ref is not None:
|
|
419
|
+
with TemporaryForIngest(self._butler, provenance_dataset_ref) as temporary:
|
|
420
|
+
graph_executor.execute(graph, provenance_graph_file=temporary.ospath)
|
|
421
|
+
temporary.ingest()
|
|
422
|
+
else:
|
|
423
|
+
graph_executor.execute(graph)
|