lsst-pipe-base 29.0.0rc1__py3-none-any.whl → 29.2025.1100__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries, and is provided for informational purposes only.
- lsst/pipe/base/_datasetQueryConstraints.py +1 -1
- lsst/pipe/base/all_dimensions_quantum_graph_builder.py +638 -357
- lsst/pipe/base/pipeline_graph/visualization/_mermaid.py +151 -24
- lsst/pipe/base/prerequisite_helpers.py +1 -1
- lsst/pipe/base/quantum_graph_builder.py +3 -3
- lsst/pipe/base/tests/mocks/_data_id_match.py +4 -0
- lsst/pipe/base/version.py +1 -1
- {lsst_pipe_base-29.0.0rc1.dist-info → lsst_pipe_base-29.2025.1100.dist-info}/METADATA +3 -1
- {lsst_pipe_base-29.0.0rc1.dist-info → lsst_pipe_base-29.2025.1100.dist-info}/RECORD +17 -17
- {lsst_pipe_base-29.0.0rc1.dist-info → lsst_pipe_base-29.2025.1100.dist-info}/WHEEL +1 -1
- {lsst_pipe_base-29.0.0rc1.dist-info → lsst_pipe_base-29.2025.1100.dist-info}/COPYRIGHT +0 -0
- {lsst_pipe_base-29.0.0rc1.dist-info → lsst_pipe_base-29.2025.1100.dist-info}/LICENSE +0 -0
- {lsst_pipe_base-29.0.0rc1.dist-info → lsst_pipe_base-29.2025.1100.dist-info}/bsd_license.txt +0 -0
- {lsst_pipe_base-29.0.0rc1.dist-info → lsst_pipe_base-29.2025.1100.dist-info}/entry_points.txt +0 -0
- {lsst_pipe_base-29.0.0rc1.dist-info → lsst_pipe_base-29.2025.1100.dist-info}/gpl-v3.0.txt +0 -0
- {lsst_pipe_base-29.0.0rc1.dist-info → lsst_pipe_base-29.2025.1100.dist-info}/top_level.txt +0 -0
- {lsst_pipe_base-29.0.0rc1.dist-info → lsst_pipe_base-29.2025.1100.dist-info}/zip-safe +0 -0
@@ -36,8 +36,7 @@ __all__ = ("AllDimensionsQuantumGraphBuilder", "DatasetQueryConstraintVariant")
 import dataclasses
 import itertools
 from collections import defaultdict
-from collections.abc import
-from contextlib import contextmanager
+from collections.abc import Iterable, Mapping
 from typing import TYPE_CHECKING, Any, TypeAlias, final
 
 from lsst.daf.butler import (
@@ -46,9 +45,9 @@ from lsst.daf.butler import (
     DataIdValue,
     DimensionGroup,
     DimensionRecord,
+    DimensionUniverse,
     MissingDatasetTypeError,
 )
-from lsst.daf.butler.queries import Query
 from lsst.utils.logging import LsstLogAdapter
 from lsst.utils.timer import timeMethod
 
@@ -128,262 +127,308 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
         # There is some chance that the dimension query for one subgraph would
         # be the same as or a dimension-subset of another. This is an
         # optimization opportunity we're not currently taking advantage of.
-        …
+        tree = _DimensionGroupTree(subgraph)
+        self._query_for_data_ids(tree)
+        skeleton = self._make_subgraph_skeleton(tree)
+        self._find_followup_datasets(tree, skeleton)
+        dimension_records = self._fetch_most_dimension_records(tree)
         leftovers = self._attach_most_dimension_records(skeleton, dimension_records)
         self._fetch_leftover_dimension_records(leftovers, dimension_records)
         self._attach_leftover_dimension_records(skeleton, leftovers, dimension_records)
         return skeleton
 
+    def _query_for_data_ids(self, tree: _DimensionGroupTree) -> None:
+        """Query for data IDs and use the result to populate the dimension
+        group tree.
+
+        Parameters
+        ----------
+        tree : `_DimensionGroupTree`
+            Tree with dimension group branches that holds subgraph-specific
+            state for this builder, to be modified in place.
+        """
+        self.log.debug("Analyzing subgraph dimensions and overall-inputs.")
+        constraint_datasets: set[str] = set()
+        self.log.debug("Building query for data IDs.")
+        if self.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
+            self.log.debug("Constraining graph query using all datasets not marked as deferred.")
+            constraint_datasets = {
+                name
+                for name, dataset_type_node in tree.overall_inputs.items()
+                if (dataset_type_node.is_initial_query_constraint and dataset_type_node.dimensions)
+            }
+        elif self.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
+            self.log.debug("Not using dataset existence to constrain query.")
+        elif self.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
+            constraint = set(self.dataset_query_constraint)
+            inputs = tree.overall_inputs - tree.empty_dimensions_branch.dataset_types.keys()
+            if remainder := constraint.difference(inputs):
+                self.log.debug(
+                    "Ignoring dataset types %s in dataset query constraint that are not inputs to this "
+                    "subgraph, on the assumption that they are relevant for a different subgraph.",
+                    remainder,
+                )
+            constraint.intersection_update(inputs)
+            self.log.debug(f"Constraining graph query using {constraint}")
+            constraint_datasets = constraint
+        else:
+            raise QuantumGraphBuilderError(
+                f"Unable to handle type {self.dataset_query_constraint} given as datasetQueryConstraint."
+            )
+        query_cmd: list[str] = []
+        with self.butler.query() as query:
+            query_cmd.append("with butler.query() as query:")
+            query_cmd.append(f"    query = query.join_dimensions({list(tree.all_dimensions.names)})")
+            query = query.join_dimensions(tree.all_dimensions)
+            if constraint_datasets:
+                query_cmd.append(f"    collections = {list(self.input_collections)}")
+                for dataset_type_name in constraint_datasets:
+                    query_cmd.append(f"    query = query.join_dataset_search({dataset_type_name!r}, collections)")
+                    query = query.join_dataset_search(dataset_type_name, self.input_collections)
+            query_cmd.append(
+                f"    query = query.where({dict(tree.subgraph.data_id.mapping)}, "
+                f"{self.where!r}, bind={self.bind!r})"
+            )
+            query = query.where(tree.subgraph.data_id, self.where, bind=self.bind)
+            self.log.verbose("Querying for data IDs via: %s", "\n".join(query_cmd))
+            # Allow duplicates from common skypix overlaps to make some queries
+            # run faster.
+            query._allow_duplicate_overlaps = True
+            self.log.info("Iterating over query results to associate quanta with datasets.")
+            # Iterate over query results, populating data IDs for datasets,
+            # quanta, and edges. We populate only the first level of the tree
+            # in the first pass, so we can be done with the query results as
+            # quickly as possible in case that holds a connection/cursor open.
+            n_rows = 0
+            for common_data_id in query.data_ids(tree.all_dimensions):
+                for branch_dimensions, branch in tree.trunk_branches.items():
+                    data_id = common_data_id.subset(branch_dimensions)
+                    branch.data_ids.add(data_id)
+                n_rows += 1
+            if n_rows == 0:
+                # A single multiline log plays better with log aggregators like
+                # Loki.
+                lines = ["Initial data ID query returned no rows, so QuantumGraph will be empty."]
+                try:
+                    lines.extend(query.explain_no_results())
+                finally:
+                    lines.append("To reproduce this query for debugging purposes, run:")
+                    lines.extend(query_cmd)
+                    # If an exception was raised, write a partial.
+                    self.log.error("\n".join(lines))
+                return
+        self.log.verbose("Processed %s initial data ID query rows.", n_rows)
+        # We now recursively populate the data IDs of the rest of the tree.
+        tree.project_data_ids(self.log)
+
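
The `query_cmd` bookkeeping in the new `_query_for_data_ids` is worth calling out: every call made on the real query builder is mirrored by a line of equivalent user-facing code text, so a failed or empty query can be reported with a ready-to-paste reproduction snippet. A minimal sketch of that pattern, with a hypothetical query-builder object standing in for the butler query:

```python
# Minimal sketch of the reproduction-log pattern used in _query_for_data_ids.
# The wrapped query object is a hypothetical stand-in; only the bookkeeping
# technique is the point.
class RecordedQuery:
    def __init__(self, query):
        self.query = query
        self.cmd = ["with butler.query() as query:"]

    def apply(self, method_name, *args):
        # Mirror the real builder call with a line of reproducible code text.
        arg_text = ", ".join(repr(a) for a in args)
        self.cmd.append(f"    query = query.{method_name}({arg_text})")
        self.query = getattr(self.query, method_name)(*args)
        return self

    def reproduction(self):
        # Ready-to-paste snippet for error messages and verbose logs.
        return "\n".join(self.cmd)
```
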
     @timeMethod
-    def _make_subgraph_skeleton(self,
-        """Build a `QuantumGraphSkeleton` by
-        …
+    def _make_subgraph_skeleton(self, tree: _DimensionGroupTree) -> QuantumGraphSkeleton:
+        """Build a `QuantumGraphSkeleton` by processing the data IDs in the
+        dimension group tree.
 
         Parameters
         ----------
-        …
+        tree : `_DimensionGroupTree`
+            Tree with dimension group branches that holds subgraph-specific
+            state for this builder.
 
         Returns
         -------
         skeleton : `QuantumGraphSkeleton`
             Preliminary quantum graph.
         """
-        …
-        # don't want to add nodes for init datasets here.
-        skeleton = QuantumGraphSkeleton(query.subgraph.tasks)
-        empty_dimensions_dataset_keys = {}
-        for dataset_type_name in query.empty_dimensions_dataset_types.keys():
-            dataset_key = skeleton.add_dataset_node(dataset_type_name, self.empty_data_id)
-            empty_dimensions_dataset_keys[dataset_type_name] = dataset_key
-            if ref := self.empty_dimensions_datasets.inputs.get(dataset_key):
-                skeleton.set_dataset_ref(ref, dataset_key)
-            if ref := self.empty_dimensions_datasets.outputs_for_skip.get(dataset_key):
-                skeleton.set_output_for_skip(ref)
-            if ref := self.empty_dimensions_datasets.outputs_in_the_way.get(dataset_key):
-                skeleton.set_output_in_the_way(ref)
-        empty_dimensions_quantum_keys = []
-        for task_label in query.empty_dimensions_tasks.keys():
-            empty_dimensions_quantum_keys.append(skeleton.add_quantum_node(task_label, self.empty_data_id))
-        self.log.info("Iterating over query results to associate quanta with datasets.")
-        # Iterate over query results, populating data IDs for datasets and
-        # quanta and then connecting them to each other. This is the slowest
-        # client-side part of QG generation, and it's often the slowest part
-        # overall, so inside this loop is where it's really critical to avoid
-        # expensive things, especially in the nested loops.
-        n_rows = 0
-        for common_data_id in query.butler_query.data_ids():
-            # Create a data ID for each set of dimensions used by one or more
-            # tasks or dataset types, and use that to record all quanta and
-            # dataset data IDs for this row.
-            dataset_keys_for_row: dict[str, DatasetKey] = empty_dimensions_dataset_keys.copy()
-            quantum_keys_for_row: list[QuantumKey] = empty_dimensions_quantum_keys.copy()
-            for dimensions, (task_nodes, dataset_type_nodes) in query.grouped_by_dimensions.items():
-                data_id = common_data_id.subset(dimensions)
-                for dataset_type_name in dataset_type_nodes.keys():
-                    dataset_keys_for_row[dataset_type_name] = skeleton.add_dataset_node(
-                        dataset_type_name, data_id
-                    )
-                for task_label in task_nodes.keys():
-                    quantum_keys_for_row.append(skeleton.add_quantum_node(task_label, data_id))
-            # Whether these quanta are new or existing, we can now associate
-            # the dataset data IDs for this row with them. The fact that a
-            # quantum data ID and a dataset data ID both came from the same
-            # result row is what tells us they should be associated. Many of
-            # these associates will be duplicates (because another query row
-            # that differed from this one only in irrelevant dimensions already
-            # added them), and our use of sets should take care of that.
-            for quantum_key in quantum_keys_for_row:
-                for read_edge in self._pipeline_graph.tasks[quantum_key.task_label].inputs.values():
-                    skeleton.add_input_edge(
-                        quantum_key, dataset_keys_for_row[read_edge.parent_dataset_type_name]
-                    )
-                for write_edge in self._pipeline_graph.tasks[quantum_key.task_label].iter_all_outputs():
-                    skeleton.add_output_edge(
-                        quantum_key, dataset_keys_for_row[write_edge.parent_dataset_type_name]
-                    )
-            n_rows += 1
-        if n_rows == 0:
-            query.log_failure(self.log)
-        else:
-            n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in query.subgraph.tasks)
-            self.log.info(
-                "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges from %d query row(s).",
-                n_quanta,
-                skeleton.n_nodes - n_quanta,
-                skeleton.n_edges,
-                n_rows,
+        skeleton = QuantumGraphSkeleton(tree.subgraph.tasks)
+        for branch_dimensions, branch in tree.trunk_branches.items():
+            self.log.verbose(
+                "Adding nodes and edges for %s %s data ID(s).",
+                len(branch.data_ids),
+                branch_dimensions,
             )
+            branch.update_skeleton(skeleton, self.log)
+        n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in tree.subgraph.tasks)
+        self.log.info(
+            "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges.",
+            n_quanta,
+            skeleton.n_nodes - n_quanta,
+            skeleton.n_edges,
+        )
         return skeleton
 
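
The reason the new skeleton construction is faster than the old per-row nested loops is visible in this hunk: data IDs are projected down to each branch's smaller dimension set once, and duplicates collapse in a set before any nodes or edges are created. A toy illustration of that projection step, using plain dicts and key tuples rather than real `DataCoordinate` and `DimensionGroup` objects:

```python
# Toy illustration of projecting query rows onto smaller dimension subsets;
# dicts stand in for DataCoordinate and tuples of keys for DimensionGroup.
rows = [
    {"visit": v, "detector": d, "tract": v // 2}
    for v in range(4)
    for d in range(3)
]

def project(rows, keys):
    # Deduplicate on the subset of dimensions a branch actually cares about.
    return {tuple(row[k] for k in keys) for row in rows}

print(len(rows))                        # 12 query rows
print(len(project(rows, ("visit",))))   # 4 distinct visit data IDs
print(len(project(rows, ("tract",))))   # 2 distinct tract data IDs
```
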
     @timeMethod
-    def _find_followup_datasets(self,
-        """Populate `existing_datasets` by performing follow-up queries
-        …
+    def _find_followup_datasets(self, tree: _DimensionGroupTree, skeleton: QuantumGraphSkeleton) -> None:
+        """Populate `existing_datasets` by performing follow-up queries with
+        the data IDs in the dimension group tree.
 
         Parameters
         ----------
-        …
+        tree : `_DimensionGroupTree`
+            Tree with dimension group branches that holds subgraph-specific
+            state for this builder.
+        skeleton : `.quantum_graph_skeleton.QuantumGraphSkeleton`
+            In-progress quantum graph to modify in place.
         """
-        …
+        dataset_key: DatasetKey | PrerequisiteDatasetKey
+        for dataset_type_name in tree.empty_dimensions_branch.dataset_types.keys():
+            dataset_key = DatasetKey(dataset_type_name, self.empty_data_id.required_values)
+            if ref := self.empty_dimensions_datasets.inputs.get(dataset_key):
+                skeleton.set_dataset_ref(ref, dataset_key)
+            if ref := self.empty_dimensions_datasets.outputs_for_skip.get(dataset_key):
+                skeleton.set_output_for_skip(ref)
+            if ref := self.empty_dimensions_datasets.outputs_in_the_way.get(dataset_key):
+                skeleton.set_output_in_the_way(ref)
+        for dimensions, branch in tree.branches_by_dimensions.items():
+            if not branch.has_followup_queries:
+                continue
+            if not branch.data_ids:
+                continue
             # Iterate over regular input/output dataset type nodes with these
-            # dimensions to find those datasets using
-            …
-                        dataset_type_node.name, self.input_collections
-            …
-                    pass
-                self.log.verbose(
-                    "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
-                )
-                continue
-            if self.skip_existing_in:
-                # Dataset type is an intermediate or output; need to find
-                # these if only they're from previously executed quanta
-                # that we might skip...
-                count = 0
-                try:
-                    for ref in query.butler_query.datasets(dataset_type_node.name, self.skip_existing_in):
-                        skeleton.set_output_for_skip(ref)
-                        count += 1
-                        if ref.run == self.output_run:
-                            skeleton.set_output_in_the_way(ref)
-                except MissingDatasetTypeError:
-                    pass
-                self.log.verbose(
-                    "Found %d output dataset(s) of type %r in %s.",
-                    count,
-                    dataset_type_node.name,
-                    self.skip_existing_in,
-                )
-            if self.output_run_exists and not self.skip_existing_starts_with_output_run:
-                # ...or if they're in the way and would need to be
-                # clobbered (and we haven't already found them in the
-                # previous block).
-                count = 0
-                try:
-                    for ref in query.butler_query.datasets(dataset_type_node.name, [self.output_run]):
-                        skeleton.set_output_in_the_way(ref)
-                        count += 1
-                except MissingDatasetTypeError:
-                    pass
-                self.log.verbose(
-                    "Found %d output dataset(s) of type %r in %s.",
-                    count,
-                    dataset_type_node.name,
-                    self.output_run,
-                )
-            # Iterate over tasks with these dimensions to perform follow-up
-            # queries for prerequisite inputs, which may have dimensions that
-            # were not in ``query.butler_query.dimensions`` and/or require
-            # temporal joins to calibration validity ranges.
-            for task_node in tasks_in_group.values():
-                task_prerequisite_info = self.prerequisite_info[task_node.label]
-                for connection_name, finder in list(task_prerequisite_info.finders.items()):
-                    if finder.lookup_function is not None:
+            # dimensions to find those datasets using followup queries.
+            with self.butler.query() as butler_query:
+                butler_query = butler_query.join_data_coordinates(branch.data_ids)
+                for dataset_type_node in branch.dataset_types.values():
+                    if dataset_type_node.name in tree.overall_inputs:
+                        # Dataset type is an overall input; we always need to
+                        # try to find these.
+                        count = 0
+                        try:
+                            for ref in butler_query.datasets(dataset_type_node.name, self.input_collections):
+                                skeleton.set_dataset_ref(ref)
+                                count += 1
+                        except MissingDatasetTypeError:
+                            pass
                         self.log.verbose(
-                            "
-                            "(lookup function provided).",
-                            finder.dataset_type_node.name,
-                            task_node.label,
+                            "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
                         )
                         continue
-        …
-                        if
-                        if
-        …
+                    if self.skip_existing_in:
+                        # Dataset type is an intermediate or output; need to
+                        # find these if only they're from previously executed
+                        # quanta that we might skip...
+                        count = 0
+                        try:
+                            for ref in butler_query.datasets(dataset_type_node.name, self.skip_existing_in):
+                                skeleton.set_output_for_skip(ref)
+                                count += 1
+                                if ref.run == self.output_run:
+                                    skeleton.set_output_in_the_way(ref)
+                        except MissingDatasetTypeError:
+                            pass
+                        self.log.verbose(
+                            "Found %d output dataset(s) of type %r in %s.",
+                            count,
+                            dataset_type_node.name,
+                            self.skip_existing_in,
+                        )
+                    if self.output_run_exists and not self.skip_existing_starts_with_output_run:
+                        # ...or if they're in the way and would need to be
+                        # clobbered (and we haven't already found them in the
+                        # previous block).
+                        count = 0
+                        try:
+                            for ref in butler_query.datasets(dataset_type_node.name, [self.output_run]):
+                                skeleton.set_output_in_the_way(ref)
+                                count += 1
+                        except MissingDatasetTypeError:
+                            pass
+                        self.log.verbose(
+                            "Found %d output dataset(s) of type %r in %s.",
+                            count,
+                            dataset_type_node.name,
+                            self.output_run,
+                        )
+                # Iterate over tasks with these dimensions to perform follow-up
+                # queries for prerequisite inputs, which may have dimensions
+                # that were not in ``tree.all_dimensions`` and/or require
+                # temporal joins to calibration validity ranges.
+                for task_node in branch.tasks.values():
+                    task_prerequisite_info = self.prerequisite_info[task_node.label]
+                    for connection_name, finder in list(task_prerequisite_info.finders.items()):
+                        if finder.lookup_function is not None:
                             self.log.verbose(
                                 "Deferring prerequisite input %r of task %r to per-quantum processing "
-                                "(
+                                "(lookup function provided).",
                                 finder.dataset_type_node.name,
                                 task_node.label,
                             )
                             continue
-        …
+                        # We also fall back to the base class if there is a
+                        # nontrivial spatial or temporal join in the lookup.
+                        if finder.dataset_skypix or finder.dataset_other_spatial:
+                            if task_prerequisite_info.bounds.spatial_connections:
+                                self.log.verbose(
+                                    "Deferring prerequisite input %r of task %r to per-quantum processing "
+                                    "(for spatial-bounds-connections handling).",
+                                    finder.dataset_type_node.name,
+                                    task_node.label,
+                                )
+                                continue
+                            if not task_node.dimensions.spatial:
+                                self.log.verbose(
+                                    "Deferring prerequisite input %r of task %r to per-quantum processing "
+                                    "(dataset has spatial data IDs, but task does not).",
+                                    finder.dataset_type_node.name,
+                                    task_node.label,
+                                )
+                                continue
+                        if finder.dataset_has_timespan:
+                            if task_prerequisite_info.bounds.spatial_connections:
+                                self.log.verbose(
+                                    "Deferring prerequisite input %r of task %r to per-quantum processing "
+                                    "(for temporal-bounds-connections handling).",
+                                    finder.dataset_type_node.name,
+                                    task_node.label,
+                                )
+                                continue
+                            if not task_node.dimensions.temporal:
+                                self.log.verbose(
+                                    "Deferring prerequisite input %r of task %r to per-quantum processing "
+                                    "(dataset has temporal data IDs, but task does not).",
+                                    finder.dataset_type_node.name,
+                                    task_node.label,
+                                )
+                                continue
+                        # We have a simple case where we can do a single query
+                        # that joins the query we already have for the task
+                        # data IDs to the datasets we're looking for.
+                        count = 0
+                        try:
+                            query_results = list(
+                                butler_query.join_dataset_search(
+                                    finder.dataset_type_node.dataset_type, self.input_collections
+                                )
+                                .general(
+                                    dimensions | finder.dataset_type_node.dataset_type.dimensions,
+                                    dataset_fields={finder.dataset_type_node.name: ...},
+                                    find_first=True,
+                                )
+                                .iter_tuples(finder.dataset_type_node.dataset_type)
                             )
-        …
+                        except MissingDatasetTypeError:
+                            query_results = []
+                        for data_id, refs, _ in query_results:
+                            ref = refs[0]
+                            dataset_key = skeleton.add_prerequisite_node(ref)
+                            quantum_key = QuantumKey(
+                                task_node.label, data_id.subset(dimensions).required_values
                             )
-            .
+                            skeleton.add_input_edge(quantum_key, dataset_key)
+                            count += 1
+                        # Remove this finder from the mapping so the base class
+                        # knows it doesn't have to look for these
+                        # prerequisites.
+                        del task_prerequisite_info.finders[connection_name]
+                        self.log.verbose(
+                            "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
+                            count,
+                            finder.dataset_type_node.name,
+                            task_node.label,
                         )
-        …
-                    ref = refs[0]
-                    dataset_key = skeleton.add_prerequisite_node(ref)
-                    quantum_key = QuantumKey(task_node.label, data_id.subset(dimensions).required_values)
-                    skeleton.add_input_edge(quantum_key, dataset_key)
-                    count += 1
-                # Remove this finder from the mapping so the base class
-                # knows it doesn't have to look for these prerequisites.
-                del task_prerequisite_info.finders[connection_name]
-                self.log.verbose(
-                    "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
-                    count,
-                    finder.dataset_type_node.name,
-                    task_node.label,
-                )
+            if not branch.record_elements:
+                # Delete data ID sets we don't need anymore.
+                del branch.data_ids
 
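
The follow-up pattern introduced above is the same for every branch: upload the branch's already-known data IDs into a fresh butler query with `join_data_coordinates`, so the dataset search is constrained to exactly the data IDs that appear in the graph. A condensed sketch of that shape, assuming a `butler` has already been constructed and using a placeholder dataset type name and collection; `branch_data_ids` stands for the `DataCoordinate` set accumulated for one branch:

```python
from lsst.daf.butler import Butler, MissingDatasetTypeError

butler = Butler("repo", collections=["HSC/raw/all"])  # placeholder repo/collection
# branch_data_ids: set of DataCoordinate objects accumulated for one branch.
with butler.query() as q:
    # Constrain the query to the data IDs we already know appear in the graph.
    q = q.join_data_coordinates(branch_data_ids)
    try:
        refs = list(q.datasets("raw", ["HSC/raw/all"]))  # "raw" is a placeholder
    except MissingDatasetTypeError:
        refs = []  # unregistered dataset type: treat as "none found"
```
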
     @timeMethod
-    def _fetch_most_dimension_records(self,
+    def _fetch_most_dimension_records(self, tree: _DimensionGroupTree) -> DimensionRecordsMap:
         """Query for dimension records for all non-prerequisite data IDs (and
         possibly some prerequisite data IDs).
 
@@ -407,12 +452,17 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
         """
        self.log.verbose("Performing follow-up queries for dimension records.")
         result: dict[str, dict[tuple[DataIdValue, ...], DimensionRecord]] = {}
-        for
-        …
+        for branch in tree.branches_by_dimensions.values():
+            if not branch.record_elements:
+                continue
+            if not branch.data_ids:
+                continue
+            with self.butler.query() as butler_query:
+                butler_query = butler_query.join_data_coordinates(branch.data_ids)
+                for element in branch.record_elements:
                     result[element] = {
                         record.dataId.required_values: record
-                        for record in
+                        for record in butler_query.dimension_records(element)
                     }
         return result
 
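
The result built here is a two-level cache, element name to required-values tuple to record, so the later attachment passes can expand data IDs without further butler round-trips. Its shape, sketched with `butler_query` and `branch` assumed to be set up exactly as in the loop body above:

```python
# Shape of the dimension-record cache built by _fetch_most_dimension_records;
# butler_query and branch are assumed to be set up as in the method above.
records: dict[str, dict[tuple, object]] = {}
for element in branch.record_elements:
    records[element] = {
        rec.dataId.required_values: rec
        for rec in butler_query.dimension_records(element)
    }
# Later passes can then look up a record without another query, e.g.
# records["visit"][visit_data_id.required_values].
```
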
@@ -532,172 +582,396 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
         skeleton.set_data_id(node_key, expanded_data_id)
 
 
-@dataclasses.dataclass(eq=False, repr=False)
-class
-"""A
-…
+@dataclasses.dataclass(eq=False, repr=False, slots=True)
+class _DimensionGroupTwig:
+    """A small side-branch of the tree of dimensions groups that tracks the
+    tasks and dataset types with a particular set of dimensions that appear in
+    the edges populated by its parent branch.
 
-    …
-    an instance wrapped with a context manager. This controls the lifetime of
-    the temporary table referenced by `common_data_ids`.
+    See `_DimensionGroupTree` for more details.
     """
 
-    …
-    """
+    parent_edge_tasks: set[str] = dataclasses.field(default_factory=set)
+    """Task labels for tasks whose quanta have the dimensions of this twig and
+    are endpoints of edges that have the combined dimensions of this twig's
+    parent branch.
+    """
 
-    …
+    parent_edge_dataset_types: set[str] = dataclasses.field(default_factory=set)
+    """Dataset type names for datasets whose quanta have the dimensions of this
+    twig and are endpoints of edges that have the combined dimensions of this
+    twig's parent branch.
+    """
+
+
+@dataclasses.dataclass(eq=False, repr=False, slots=True)
+class _DimensionGroupBranch:
+    """A node in the tree of dimension groups that are used to recursively
+    process query data IDs into a quantum graph.
     """
 
-    …
-    """The
+    tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
+    """The task nodes whose quanta have these dimensions, keyed by task label.
+    """
 
-    …
-    """The dataset
-    …
+    dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
+    """The dataset type nodes whose datasets have these dimensions, keyed by
+    dataset type name.
+    """
 
-    …
+    record_elements: list[str] = dataclasses.field(default_factory=list)
+    """The names of dimension elements whose records should be looked up via
+    these dimensions.
     """
 
-    …
-    """
-    …
+    data_ids: set[DataCoordinate] = dataclasses.field(default_factory=set)
+    """All data IDs with these dimensions seen in the QuantumGraph."""
+
+    input_edges: list[tuple[str, str]] = dataclasses.field(default_factory=list)
+    """Dataset type -> task edges that are populated by this set of dimensions.
+
+    These are cases where `dimensions` is the union of the task and dataset
+    type dimensions.
     """
 
-    …
-    """
-    …
+    output_edges: list[tuple[str, str]] = dataclasses.field(default_factory=list)
+    """Task -> dataset type edges that are populated by this set of dimensions.
+
+    These are cases where `dimensions` is the union of the task and dataset
+    type dimensions.
     """
 
-    …
-    """
+    branches: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(default_factory=dict)
+    """Child branches whose dimensions are strict subsets of this branch's
+    dimensions.
+    """
 
-    …
+    twigs: defaultdict[DimensionGroup, _DimensionGroupTwig] = dataclasses.field(
+        default_factory=lambda: defaultdict(_DimensionGroupTwig)
+    )
+    """Small branches for all of the dimensions that appear on one side of any
+    edge in `input_edges` or `output_edges`.
+    """
+
+    @property
+    def has_followup_queries(self) -> bool:
+        """Whether we will need to perform follow-up queries with these
+        dimensions.
+        """
+        return bool(self.tasks or self.dataset_types or self.record_elements)
+
+    @staticmethod
+    def populate_record_elements(
+        all_dimensions: DimensionGroup, branches: dict[DimensionGroup, _DimensionGroupBranch]
+    ) -> None:
+        """Ensure we have branches for all dimension elements we'll need to
+        fetch dimension records for.
 
         Parameters
         ----------
-        …
+        all_dimensions : `~lsst.daf.butler.DimensionGroup`
+            All dimensions that appear in the quantum graph.
+        branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
+                `_DimensionGroupBranch` ]
+            Flat mapping of all branches to update in-place. New branches may
+            be added and existing branches may have their `record_element`
+            attributes updated.
+        """
+        for element_name in all_dimensions.elements:
+            element = all_dimensions.universe[element_name]
+            if element.minimal_group in branches:
+                branches[element.minimal_group].record_elements.append(element_name)
+            else:
+                branches[element.minimal_group] = _DimensionGroupBranch(record_elements=[element_name])
+
+    @staticmethod
+    def populate_edges(
+        pipeline_graph: PipelineGraph, branches: dict[DimensionGroup, _DimensionGroupBranch]
+    ) -> None:
+        """Ensure we have branches for all edges in the graph.
+
+        Parameters
+        ----------
+        pipeline_graph : `~..pipeline_graph.PipelineGraph`
+            Graph of tasks and dataset types.
+        branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
+                `_DimensionGroupBranch` ]
+            Flat mapping of all branches to update in-place. New branches may
+            be added and existing branches may have their `input_edges`,
+            `output_edges`, and `twigs` attributes updated.
+        """
+
+        def update_edge_branch(
+            task_node: TaskNode, dataset_type_node: DatasetTypeNode
+        ) -> _DimensionGroupBranch:
+            union_dimensions = task_node.dimensions.union(dataset_type_node.dimensions)
+            if (branch := branches.get(union_dimensions)) is None:
+                branch = _DimensionGroupBranch()
+                branches[union_dimensions] = branch
+            branch.twigs[dataset_type_node.dimensions].parent_edge_dataset_types.add(dataset_type_node.name)
+            branch.twigs[task_node.dimensions].parent_edge_tasks.add(task_node.label)
+            return branch
+
+        for task_node in pipeline_graph.tasks.values():
+            for dataset_type_node in pipeline_graph.inputs_of(task_node.label).values():
+                assert dataset_type_node is not None, "Pipeline graph is resolved."
+                if dataset_type_node.is_prerequisite:
+                    continue
+                branch = update_edge_branch(task_node, dataset_type_node)
+                branch.input_edges.append((dataset_type_node.name, task_node.label))
+            for dataset_type_node in pipeline_graph.outputs_of(task_node.label).values():
+                assert dataset_type_node is not None, "Pipeline graph is resolved."
+                branch = update_edge_branch(task_node, dataset_type_node)
+                branch.output_edges.append((task_node.label, dataset_type_node.name))
+
+    @staticmethod
+    def find_next_uncontained_dimensions(
+        parent_dimensions: DimensionGroup | None, candidates: Iterable[DimensionGroup]
+    ) -> list[DimensionGroup]:
+        """Find dimension groups that are not a subset of any other dimension
+        groups in a set.
+
+        Parameters
+        ----------
+        parent_dimensions : `~lsst.daf.butler.DimensionGroup` or `None`
+            If not `None`, first filter out any candidates that are not strict
+            subsets of these dimensions.
+        candidates : `~collections.abc.Iterable` [\
+                `~lsst.daf.butler.DimensionGroup` ]
+            Iterable of dimension groups to consider.
 
         Returns
         -------
-        …
-        the
+        uncontained : `list` [ `~lsst.daf.butler.DimensionGroup` ]
+            Dimension groups that are not contained by any other dimension
+            group in the set of filtered candidates.
         """
-        …
-        result.grouped_by_dimensions = result.subgraph.group_by_dimensions()
-        (
-            result.empty_dimensions_tasks,
-            result.empty_dimensions_dataset_types,
-        ) = result.grouped_by_dimensions.pop(builder.universe.empty)
-        result.overall_inputs = {
-            name: node  # type: ignore
-            for name, node in result.subgraph.iter_overall_inputs()
-            if not node.is_prerequisite  # type: ignore
-        }
-        dimension_names: set[str] = set()
-        for dimensions_for_group in result.grouped_by_dimensions.keys():
-            dimension_names.update(dimensions_for_group.names)
-        dimensions = builder.universe.conform(dimension_names)
-        datasets: set[str] = set()
-        builder.log.debug("Building query for data IDs.")
-        if builder.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
-            builder.log.debug("Constraining graph query using all datasets not marked as deferred.")
-            datasets = {
-                name
-                for name, dataset_type_node in result.overall_inputs.items()
-                if (
-                    dataset_type_node.is_initial_query_constraint
-                    and name not in result.empty_dimensions_dataset_types
-                )
-            }
-        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
-            builder.log.debug("Not using dataset existence to constrain query.")
-        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
-            constraint = set(builder.dataset_query_constraint)
-            inputs = result.overall_inputs - result.empty_dimensions_dataset_types.keys()
-            if remainder := constraint.difference(inputs):
-                builder.log.debug(
-                    "Ignoring dataset types %s in dataset query constraint that are not inputs to this "
-                    "subgraph, on the assumption that they are relevant for a different subraph.",
-                    remainder,
-                )
-            constraint.intersection_update(inputs)
-            builder.log.debug(f"Constraining graph query using {constraint}")
-            datasets = constraint
+        if parent_dimensions is None:
+            refined_candidates = candidates
         else:
-        …
+            refined_candidates = [dimensions for dimensions in candidates if dimensions < parent_dimensions]
+        return [
+            dimensions
+            for dimensions in refined_candidates
+            if not any(dimensions < other for other in refined_candidates)
+        ]
+
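
`DimensionGroup` supports strict-subset comparison with `<`, which is all `find_next_uncontained_dimensions` relies on, so the selection of "uncontained" groups can be illustrated with plain frozensets:

```python
# The uncontained-groups filter, illustrated with frozensets (which support
# the same "<" strict-subset operator as DimensionGroup).
def uncontained(candidates):
    return [c for c in candidates if not any(c < other for other in candidates)]

groups = [frozenset({"visit", "detector"}), frozenset({"visit"}), frozenset({"tract"})]
print(uncontained(groups))
# [frozenset({'visit', 'detector'}), frozenset({'tract'})] -- {'visit'} is a
# strict subset of {'visit', 'detector'} and is therefore dropped.
```
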
+    @classmethod
+    def populate_branches(
+        cls,
+        parent_dimensions: DimensionGroup | None,
+        branches: dict[DimensionGroup, _DimensionGroupBranch],
+    ) -> dict[DimensionGroup, _DimensionGroupBranch]:
+        """Transform a flat mapping of dimension group branches into a tree.
+
+        Parameters
+        ----------
+        parent_dimensions : `~lsst.daf.butler.DimensionGroup` or `None`
+            If not `None`, ignore any candidates in `branches` that are not
+            strict subsets of these dimensions.
+        branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
+                `_DimensionGroupBranch` ]
+            Flat mapping of all branches to update in-place, by populating
+            the `branches` attributes to form a tree and removing entries that
+            have been put into the tree.
+
+        Returns
+        -------
+        uncontained_branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
+                `_DimensionGroupBranch` ]
+            Branches whose dimensions were not subsets of any others in the
+            mapping except those that were supersets of ``parent_dimensions``.
+        """
+        result: dict[DimensionGroup, _DimensionGroupBranch] = {}
+        for parent_branch_dimensions in cls.find_next_uncontained_dimensions(
+            parent_dimensions, branches.keys()
+        ):
+            parent_branch = branches.pop(parent_branch_dimensions)
+            result[parent_branch_dimensions] = parent_branch
+            for child_branch_dimensions, child_branch in cls.populate_branches(
+                parent_branch_dimensions, branches
+            ).items():
+                parent_branch.branches[child_branch_dimensions] = child_branch
+        return result
 
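
Because each recursive call pops the branches it claims, every dimension group ends up in exactly one place in the tree, which is the property the `_DimensionGroupTree` docstring below calls out. The recursion in miniature, again with frozensets standing in for dimension groups:

```python
# populate_branches in miniature: flat is a dict keyed by frozensets; each
# level claims the maximal (uncontained) groups and recursively nests the
# rest beneath them. Popped entries can only be claimed once, so each group
# appears exactly once in the resulting tree.
def build_tree(parent, flat):
    candidates = [d for d in flat if parent is None or d < parent]
    maximal = [d for d in candidates if not any(d < other for other in candidates)]
    tree = {}
    for dims in maximal:
        # Claim this group, then nest whatever strict subsets remain under it.
        tree[dims] = (flat.pop(dims), build_tree(dims, flat))
    return tree
```
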
-    def
-        """
+    def project_data_ids(self, log: LsstLogAdapter, log_indent: str = "    ") -> None:
+        """Populate the data ID sets of child branches from the data IDs in
+        this branch, recursively.
 
         Parameters
         ----------
-        …
+        log : `lsst.logging.LsstLogAdapter`
+            Logger to use for status reporting.
+        log_indent : `str`, optional
+            Indentation to prefix the log message. This is used when recursing
+            to make the branch structure clear.
         """
-        …
+        for data_id in self.data_ids:
+            for branch_dimensions, branch in self.branches.items():
+                branch.data_ids.add(data_id.subset(branch_dimensions))
+        for branch_dimensions, branch in self.branches.items():
+            log.debug("%sProjecting query data IDs to %s.", log_indent, branch_dimensions)
+            branch.project_data_ids(log, log_indent + "    ")
+
+    def update_skeleton(
+        self, skeleton: QuantumGraphSkeleton, log: LsstLogAdapter, log_indent: str = "    "
+    ) -> None:
+        """Process the data ID sets of this branch and its children recursively
+        to add nodes and edges to the under-construction quantum graph.
+
+        Parameters
+        ----------
+        skeleton : `QuantumGraphSkeleton`
+            Under-construction quantum graph to modify in place.
+        log : `lsst.logging.LsstLogAdapter`
+            Logger to use for status reporting.
+        log_indent : `str`, optional
+            Indentation to prefix the log message. This is used when recursing
+            to make the branch structure clear.
+        """
+        for branch_dimensions, branch in self.branches.items():
+            log.verbose(
+                "%sAdding nodes and edges for %s %s data ID(s).",
+                log_indent,
+                len(branch.data_ids),
+                branch_dimensions,
+            )
+            branch.update_skeleton(skeleton, log, log_indent + "    ")
+        for data_id in self.data_ids:
+            for task_label in self.tasks:
+                skeleton.add_quantum_node(task_label, data_id)
+            for dataset_type_name in self.dataset_types:
+                skeleton.add_dataset_node(dataset_type_name, data_id)
+            quantum_keys: dict[str, QuantumKey] = {}
+            dataset_keys: dict[str, DatasetKey] = {}
+            for twig_dimensions, twig in self.twigs.items():
+                twig_data_id = data_id.subset(twig_dimensions)
+                for task_label in twig.parent_edge_tasks:
+                    quantum_keys[task_label] = QuantumKey(task_label, twig_data_id.required_values)
+                for dataset_type_name in twig.parent_edge_dataset_types:
+                    dataset_keys[dataset_type_name] = DatasetKey(
+                        dataset_type_name, twig_data_id.required_values
+                    )
+            for dataset_type_name, task_label in self.input_edges:
+                skeleton.add_input_edge(quantum_keys[task_label], dataset_keys[dataset_type_name])
+            for task_label, dataset_type_name in self.output_edges:
+                skeleton.add_output_edge(quantum_keys[task_label], dataset_keys[dataset_type_name])
+        if not self.has_followup_queries:
+            # Delete data IDs we don't need anymore to save memory.
+            del self.data_ids
+
+
+@dataclasses.dataclass(eq=False, repr=False)
+class _DimensionGroupTree:
+    """A tree of dimension groups in which branches are subsets of their
+    parents.
 
-    …
+    This class holds all of the per-subgraph state for this QG builder
+    subclass.
+
+    Notes
+    -----
+    The full set of dimensions referenced by any task or dataset type (except
+    prerequisite inputs) forms the conceptual "trunk" of this tree. Each
+    branch has a subset of the dimensions of its parent branch, and each set
+    of dimensions appears exactly once in a tree (so there is some flexibility
+    in where certain dimension subsets may appear; right now this is resolved
+    somewhat arbitrarily).
+    We do not add branches for every possible dimension subset; a branch is
+    created for a `~lsst.daf.butler.DimensionGroup` if:
+
+    - if there is a task whose quanta have those dimensions;
+    - if there is a non-prerequisite dataset type with those dimensions;
+    - if there is an edge for which the union of the task and dataset type
+      dimensions are those dimensions;
+    - if there is a dimension element in any task or non-prerequisite dataset
+      type dimensions whose `~lsst.daf.butler.DimensionElement.minimal_group`
+      is those dimensions.
+
+    We process the initial data query by recursing through this tree structure
+    to populate a data ID set for each branch
+    (`_DimensionGroupBranch.project_data_ids`), and then process those sets
+    recursively (`_DimensionGroupBranch.update_skeleton`). This can be far
+    faster than the non-recursive processing the QG builder used to use because
+    the set of data IDs is smaller (sometimes dramatically smaller) as we move
+    to smaller sets of dimensions.
+
+    In addition to their child branches, a branch that is used to define graph
+    edges also has "twigs", which are a flatter set of dimension subsets for
+    each of the tasks and dataset types that appear in that branch's edges.
+    The same twig dimensions can appear in multiple branches, and twig
+    dimensions can be the same as their parent branch's (but not a superset).
+    """
+
+    subgraph: PipelineGraph
+    """Graph of this subset of the pipeline."""
+
+    all_dimensions: DimensionGroup = dataclasses.field(init=False)
+    """The union of all dimensions that appear in any task or
+    (non-prerequisite) dataset type in this subgraph.
+    """
+
+    empty_dimensions_branch: _DimensionGroupBranch = dataclasses.field(init=False)
+    """The tasks and dataset types of this subset of this pipeline that have
+    empty dimensions.
+
+    Prerequisite dataset types are not included.
+    """
+
+    trunk_branches: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(init=False)
+    """The top-level branches in the tree of dimension groups.
+    """
+
+    branches_by_dimensions: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(init=False)
+    """The tasks and dataset types of this subset of the pipeline, grouped
+    by their dimensions.
+
+    The tasks and dataset types with empty dimensions are not included; they're
+    in `empty_dimensions_tree` since they are usually used differently.
+    Prerequisite dataset types are also not included.
+
+    This is a flatter view of the objects in `trunk_branches`.
+    """
+
+    overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(init=False)
+    """Pipeline graph nodes for all non-prerequisite, non-init overall-input
+    dataset types for this subset of the pipeline.
+    """
+
+    def __post_init__(self) -> None:
+        universe = self.subgraph.universe
+        assert universe is not None, "Pipeline graph is resolved."
+        self.branches_by_dimensions = {
+            dimensions: _DimensionGroupBranch(tasks, dataset_types)
+            for dimensions, (tasks, dataset_types) in self.subgraph.group_by_dimensions().items()
+        }
+        self.all_dimensions = _union_dimensions(self.branches_by_dimensions.keys(), universe)
+        _DimensionGroupBranch.populate_record_elements(self.all_dimensions, self.branches_by_dimensions)
+        _DimensionGroupBranch.populate_edges(self.subgraph, self.branches_by_dimensions)
+        self.trunk_branches = _DimensionGroupBranch.populate_branches(
+            None, self.branches_by_dimensions.copy()
+        )
+        self.empty_dimensions_branch = self.branches_by_dimensions.pop(
+            universe.empty, _DimensionGroupBranch()
+        )
+        self.overall_inputs = {
+            name: node  # type: ignore
+            for name, node in self.subgraph.iter_overall_inputs()
+            if not node.is_prerequisite  # type: ignore
+        }
+
+    def project_data_ids(self, log: LsstLogAdapter) -> None:
+        """Recursively populate the data ID sets of the dimension group tree
+        from the data ID sets of the trunk branches.
 
         Parameters
         ----------
-        log : `logging.
-        …
+        log : `lsst.logging.LsstLogAdapter`
+            Logger to use for status reporting.
         """
-        …
-            header.extend(self.butler_query.explain_no_results())
-            header.append("To reproduce this query for debugging purposes, run:")
-        finally:
-            # If an exception was raised, write a partial.
-            log.error(self.format_query_cmd(*header))
+        for branch_dimensions, branch in self.trunk_branches.items():
+            log.debug("Projecting query data IDs to %s.", branch_dimensions)
+            branch.project_data_ids(log)
 
 
 class DimensionRecordAttacher:
@@ -829,3 +1103,10 @@ class DataIdExpansionLeftovers:
     missing_record_data_ids: defaultdict[str, set[tuple[DataIdValue, ...]]] = dataclasses.field(
         default_factory=lambda: defaultdict(set)
     )
+
+
+def _union_dimensions(groups: Iterable[DimensionGroup], universe: DimensionUniverse) -> DimensionGroup:
+    dimension_names: set[str] = set()
+    for dimensions_for_group in groups:
+        dimension_names.update(dimensions_for_group.names)
+    return universe.conform(dimension_names)