lsst-pipe-base 29.2025.1000-py3-none-any.whl → 29.2025.1200-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20)
  1. lsst/pipe/base/_datasetQueryConstraints.py +1 -1
  2. lsst/pipe/base/all_dimensions_quantum_graph_builder.py +642 -357
  3. lsst/pipe/base/connections.py +179 -2
  4. lsst/pipe/base/pipeline_graph/visualization/_mermaid.py +157 -24
  5. lsst/pipe/base/prerequisite_helpers.py +1 -1
  6. lsst/pipe/base/quantum_graph_builder.py +91 -60
  7. lsst/pipe/base/quantum_graph_skeleton.py +20 -0
  8. lsst/pipe/base/quantum_provenance_graph.py +790 -421
  9. lsst/pipe/base/tests/mocks/_data_id_match.py +4 -0
  10. lsst/pipe/base/version.py +1 -1
  11. {lsst_pipe_base-29.2025.1000.dist-info → lsst_pipe_base-29.2025.1200.dist-info}/METADATA +5 -2
  12. {lsst_pipe_base-29.2025.1000.dist-info → lsst_pipe_base-29.2025.1200.dist-info}/RECORD +20 -20
  13. {lsst_pipe_base-29.2025.1000.dist-info → lsst_pipe_base-29.2025.1200.dist-info}/WHEEL +1 -1
  14. {lsst_pipe_base-29.2025.1000.dist-info → lsst_pipe_base-29.2025.1200.dist-info}/entry_points.txt +0 -0
  15. {lsst_pipe_base-29.2025.1000.dist-info → lsst_pipe_base-29.2025.1200.dist-info/licenses}/COPYRIGHT +0 -0
  16. {lsst_pipe_base-29.2025.1000.dist-info → lsst_pipe_base-29.2025.1200.dist-info/licenses}/LICENSE +0 -0
  17. {lsst_pipe_base-29.2025.1000.dist-info → lsst_pipe_base-29.2025.1200.dist-info/licenses}/bsd_license.txt +0 -0
  18. {lsst_pipe_base-29.2025.1000.dist-info → lsst_pipe_base-29.2025.1200.dist-info/licenses}/gpl-v3.0.txt +0 -0
  19. {lsst_pipe_base-29.2025.1000.dist-info → lsst_pipe_base-29.2025.1200.dist-info}/top_level.txt +0 -0
  20. {lsst_pipe_base-29.2025.1000.dist-info → lsst_pipe_base-29.2025.1200.dist-info}/zip-safe +0 -0
@@ -36,8 +36,7 @@ __all__ = ("AllDimensionsQuantumGraphBuilder", "DatasetQueryConstraintVariant")
  import dataclasses
  import itertools
  from collections import defaultdict
- from collections.abc import Iterator, Mapping
- from contextlib import contextmanager
+ from collections.abc import Iterable, Mapping
  from typing import TYPE_CHECKING, Any, TypeAlias, final

  from lsst.daf.butler import (
@@ -46,9 +45,9 @@ from lsst.daf.butler import (
  DataIdValue,
  DimensionGroup,
  DimensionRecord,
+ DimensionUniverse,
  MissingDatasetTypeError,
  )
- from lsst.daf.butler.queries import Query
  from lsst.utils.logging import LsstLogAdapter
  from lsst.utils.timer import timeMethod

@@ -128,262 +127,312 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
  # There is some chance that the dimension query for one subgraph would
  # be the same as or a dimension-subset of another. This is an
  # optimization opportunity we're not currently taking advantage of.
- with _AllDimensionsQuery.from_builder(self, subgraph) as query:
- skeleton = self._make_subgraph_skeleton(query)
- self._find_followup_datasets(query, skeleton)
- dimension_records = self._fetch_most_dimension_records(query)
+ tree = _DimensionGroupTree(subgraph)
+ self._query_for_data_ids(tree)
+ skeleton = self._make_subgraph_skeleton(tree)
+ self._find_followup_datasets(tree, skeleton)
+ dimension_records = self._fetch_most_dimension_records(tree)
  leftovers = self._attach_most_dimension_records(skeleton, dimension_records)
  self._fetch_leftover_dimension_records(leftovers, dimension_records)
  self._attach_leftover_dimension_records(skeleton, leftovers, dimension_records)
  return skeleton

+ def _query_for_data_ids(self, tree: _DimensionGroupTree) -> None:
+ """Query for data IDs and use the result to populate the dimension
+ group tree.
+
+ Parameters
+ ----------
+ tree : `_DimensionGroupTree`
+ Tree with dimension group branches that holds subgraph-specific
+ state for this builder, to be modified in place.
+ """
+ self.log.debug("Analyzing subgraph dimensions and overall-inputs.")
+ constraint_datasets: set[str] = set()
+ self.log.debug("Building query for data IDs.")
+ if self.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
+ self.log.debug("Constraining graph query using all datasets not marked as deferred.")
+ constraint_datasets = {
+ name
+ for name, dataset_type_node in tree.overall_inputs.items()
+ if (dataset_type_node.is_initial_query_constraint and dataset_type_node.dimensions)
+ }
+ elif self.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
+ self.log.debug("Not using dataset existence to constrain query.")
+ elif self.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
+ constraint = set(self.dataset_query_constraint)
+ inputs = tree.overall_inputs - tree.empty_dimensions_branch.dataset_types.keys()
+ if remainder := constraint.difference(inputs):
+ self.log.debug(
+ "Ignoring dataset types %s in dataset query constraint that are not inputs to this "
+ "subgraph, on the assumption that they are relevant for a different subgraph.",
+ remainder,
+ )
+ constraint.intersection_update(inputs)
+ self.log.debug(f"Constraining graph query using {constraint}")
+ constraint_datasets = constraint
+ else:
+ raise QuantumGraphBuilderError(
+ f"Unable to handle type {self.dataset_query_constraint} given as datasetQueryConstraint."
+ )
+ query_cmd: list[str] = []
+ with self.butler.query() as query:
+ query_cmd.append("with butler.query() as query:")
+ query_cmd.append(f" query = query.join_dimensions({list(tree.all_dimensions.names)})")
+ query = query.join_dimensions(tree.all_dimensions)
+ if constraint_datasets:
+ query_cmd.append(f" collections = {list(self.input_collections)}")
+ for dataset_type_name in constraint_datasets:
+ query_cmd.append(f" query = query.join_dataset_search({dataset_type_name!r}, collections)")
+ query = query.join_dataset_search(dataset_type_name, self.input_collections)
+ query_cmd.append(
+ f" query = query.where({dict(tree.subgraph.data_id.mapping)}, "
+ f"{self.where!r}, bind={self.bind!r})"
+ )
+ query = query.where(tree.subgraph.data_id, self.where, bind=self.bind)
+ self.log.verbose("Querying for data IDs via: %s", "\n".join(query_cmd))
+ # Allow duplicates from common skypix overlaps to make some queries
+ # run faster.
+ query._allow_duplicate_overlaps = True
+ self.log.info("Iterating over query results to associate quanta with datasets.")
+ # Iterate over query results, populating data IDs for datasets,
+ # quanta, and edges. We populate only the first level of the tree
+ # in the first pass, so we can be done with the query results as
+ # quickly as possible in case that holds a connection/cursor open.
+ n_rows = 0
+ for common_data_id in query.data_ids(tree.all_dimensions):
+ for branch_dimensions, branch in tree.trunk_branches.items():
+ data_id = common_data_id.subset(branch_dimensions)
+ branch.data_ids.add(data_id)
+ n_rows += 1
+ if n_rows == 0:
+ # A single multiline log plays better with log aggregators like
+ # Loki.
+ lines = ["Initial data ID query returned no rows, so QuantumGraph will be empty."]
+ try:
+ lines.extend(query.explain_no_results())
+ finally:
+ lines.append("To reproduce this query for debugging purposes, run:")
+ lines.append("")
+ lines.extend(query_cmd)
+ lines.append(" print(query.any())")
+ lines.append("")
+ lines.append("And then try removing various constraints until query.any() returns True.")
+ # If an exception was raised, write a partial.
+ self.log.error("\n".join(lines))
+ return
+ self.log.verbose("Processed %s initial data ID query rows.", n_rows)
+ # We now recursively populate the data IDs of the rest of the tree.
+ tree.project_data_ids(self.log)
+
  @timeMethod
- def _make_subgraph_skeleton(self, query: _AllDimensionsQuery) -> QuantumGraphSkeleton:
- """Build a `QuantumGraphSkeleton` by iterating over the result rows
- of the initial data ID query.
+ def _make_subgraph_skeleton(self, tree: _DimensionGroupTree) -> QuantumGraphSkeleton:
+ """Build a `QuantumGraphSkeleton` by processing the data IDs in the
+ dimension group tree.

  Parameters
  ----------
- query : `_AllDimensionsQuery`
- Object representing the full-pipeline data ID query.
+ tree : `_DimensionGroupTree`
+ Tree with dimension group branches that holds subgraph-specific
+ state for this builder.

  Returns
  -------
  skeleton : `QuantumGraphSkeleton`
  Preliminary quantum graph.
  """
- # First we make containers of empty-dimensions quantum and dataset
- # keys, and add those to the skelton, since empty data IDs are
- # logically subsets of any data ID. We'll copy those to initialize the
- # containers of keys for each result row. We don't ever explicitly add
- # nodes to the skeleton for these, and that's okay because networkx
- # adds nodes implicitly when an edge to that node is added, and we
- # don't want to add nodes for init datasets here.
- skeleton = QuantumGraphSkeleton(query.subgraph.tasks)
- empty_dimensions_dataset_keys = {}
- for dataset_type_name in query.empty_dimensions_dataset_types.keys():
- dataset_key = skeleton.add_dataset_node(dataset_type_name, self.empty_data_id)
- empty_dimensions_dataset_keys[dataset_type_name] = dataset_key
- if ref := self.empty_dimensions_datasets.inputs.get(dataset_key):
- skeleton.set_dataset_ref(ref, dataset_key)
- if ref := self.empty_dimensions_datasets.outputs_for_skip.get(dataset_key):
- skeleton.set_output_for_skip(ref)
- if ref := self.empty_dimensions_datasets.outputs_in_the_way.get(dataset_key):
- skeleton.set_output_in_the_way(ref)
- empty_dimensions_quantum_keys = []
- for task_label in query.empty_dimensions_tasks.keys():
- empty_dimensions_quantum_keys.append(skeleton.add_quantum_node(task_label, self.empty_data_id))
- self.log.info("Iterating over query results to associate quanta with datasets.")
- # Iterate over query results, populating data IDs for datasets and
- # quanta and then connecting them to each other. This is the slowest
- # client-side part of QG generation, and it's often the slowest part
- # overall, so inside this loop is where it's really critical to avoid
- # expensive things, especially in the nested loops.
- n_rows = 0
- for common_data_id in query.butler_query.data_ids():
- # Create a data ID for each set of dimensions used by one or more
- # tasks or dataset types, and use that to record all quanta and
- # dataset data IDs for this row.
- dataset_keys_for_row: dict[str, DatasetKey] = empty_dimensions_dataset_keys.copy()
- quantum_keys_for_row: list[QuantumKey] = empty_dimensions_quantum_keys.copy()
- for dimensions, (task_nodes, dataset_type_nodes) in query.grouped_by_dimensions.items():
- data_id = common_data_id.subset(dimensions)
- for dataset_type_name in dataset_type_nodes.keys():
- dataset_keys_for_row[dataset_type_name] = skeleton.add_dataset_node(
- dataset_type_name, data_id
- )
- for task_label in task_nodes.keys():
- quantum_keys_for_row.append(skeleton.add_quantum_node(task_label, data_id))
- # Whether these quanta are new or existing, we can now associate
- # the dataset data IDs for this row with them. The fact that a
- # quantum data ID and a dataset data ID both came from the same
- # result row is what tells us they should be associated. Many of
- # these associates will be duplicates (because another query row
- # that differed from this one only in irrelevant dimensions already
- # added them), and our use of sets should take care of that.
- for quantum_key in quantum_keys_for_row:
- for read_edge in self._pipeline_graph.tasks[quantum_key.task_label].inputs.values():
- skeleton.add_input_edge(
- quantum_key, dataset_keys_for_row[read_edge.parent_dataset_type_name]
- )
- for write_edge in self._pipeline_graph.tasks[quantum_key.task_label].iter_all_outputs():
- skeleton.add_output_edge(
- quantum_key, dataset_keys_for_row[write_edge.parent_dataset_type_name]
- )
- n_rows += 1
- if n_rows == 0:
- query.log_failure(self.log)
- else:
- n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in query.subgraph.tasks)
- self.log.info(
- "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges from %d query row(s).",
- n_quanta,
- skeleton.n_nodes - n_quanta,
- skeleton.n_edges,
- n_rows,
+ skeleton = QuantumGraphSkeleton(tree.subgraph.tasks)
+ for branch_dimensions, branch in tree.trunk_branches.items():
+ self.log.verbose(
+ "Adding nodes and edges for %s %s data ID(s).",
+ len(branch.data_ids),
+ branch_dimensions,
  )
+ branch.update_skeleton(skeleton, self.log)
+ n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in tree.subgraph.tasks)
+ self.log.info(
+ "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges.",
+ n_quanta,
+ skeleton.n_nodes - n_quanta,
+ skeleton.n_edges,
+ )
  return skeleton

  @timeMethod
- def _find_followup_datasets(self, query: _AllDimensionsQuery, skeleton: QuantumGraphSkeleton) -> None:
- """Populate `existing_datasets` by performing follow-up queries joined
- to column-subsets of the initial data ID query.
+ def _find_followup_datasets(self, tree: _DimensionGroupTree, skeleton: QuantumGraphSkeleton) -> None:
+ """Populate `existing_datasets` by performing follow-up queries with
+ the data IDs in the dimension group tree.

  Parameters
  ----------
- query : `_AllDimensionsQuery`
- Object representing the full-pipeline data ID query.
+ tree : `_DimensionGroupTree`
+ Tree with dimension group branches that holds subgraph-specific
+ state for this builder.
+ skeleton : `.quantum_graph_skeleton.QuantumGraphSkeleton`
+ In-progress quantum graph to modify in place.
  """
- for dimensions, (tasks_in_group, dataset_types_in_group) in query.grouped_by_dimensions.items():
+ dataset_key: DatasetKey | PrerequisiteDatasetKey
+ for dataset_type_name in tree.empty_dimensions_branch.dataset_types.keys():
+ dataset_key = DatasetKey(dataset_type_name, self.empty_data_id.required_values)
+ if ref := self.empty_dimensions_datasets.inputs.get(dataset_key):
+ skeleton.set_dataset_ref(ref, dataset_key)
+ if ref := self.empty_dimensions_datasets.outputs_for_skip.get(dataset_key):
+ skeleton.set_output_for_skip(ref)
+ if ref := self.empty_dimensions_datasets.outputs_in_the_way.get(dataset_key):
+ skeleton.set_output_in_the_way(ref)
+ for dimensions, branch in tree.branches_by_dimensions.items():
+ if not branch.has_followup_queries:
+ continue
+ if not branch.data_ids:
+ continue
  # Iterate over regular input/output dataset type nodes with these
- # dimensions to find those datasets using straightforward followup
- # queries.
- for dataset_type_node in dataset_types_in_group.values():
- if dataset_type_node.name in query.overall_inputs:
- # Dataset type is an overall input; we always need to try
- # to find these.
- count = 0
- try:
- for ref in query.butler_query.datasets(
- dataset_type_node.name, self.input_collections
- ):
- skeleton.set_dataset_ref(ref)
- count += 1
- except MissingDatasetTypeError:
- pass
- self.log.verbose(
- "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
- )
- continue
- if self.skip_existing_in:
- # Dataset type is an intermediate or output; need to find
- # these if only they're from previously executed quanta
- # that we might skip...
- count = 0
- try:
- for ref in query.butler_query.datasets(dataset_type_node.name, self.skip_existing_in):
- skeleton.set_output_for_skip(ref)
- count += 1
- if ref.run == self.output_run:
- skeleton.set_output_in_the_way(ref)
- except MissingDatasetTypeError:
- pass
- self.log.verbose(
- "Found %d output dataset(s) of type %r in %s.",
- count,
- dataset_type_node.name,
- self.skip_existing_in,
- )
- if self.output_run_exists and not self.skip_existing_starts_with_output_run:
- # ...or if they're in the way and would need to be
- # clobbered (and we haven't already found them in the
- # previous block).
- count = 0
- try:
- for ref in query.butler_query.datasets(dataset_type_node.name, [self.output_run]):
- skeleton.set_output_in_the_way(ref)
- count += 1
- except MissingDatasetTypeError:
- pass
- self.log.verbose(
- "Found %d output dataset(s) of type %r in %s.",
- count,
- dataset_type_node.name,
- self.output_run,
- )
- # Iterate over tasks with these dimensions to perform follow-up
- # queries for prerequisite inputs, which may have dimensions that
- # were not in ``query.butler_query.dimensions`` and/or require
- # temporal joins to calibration validity ranges.
- for task_node in tasks_in_group.values():
- task_prerequisite_info = self.prerequisite_info[task_node.label]
- for connection_name, finder in list(task_prerequisite_info.finders.items()):
- if finder.lookup_function is not None:
+ # dimensions to find those datasets using followup queries.
+ with self.butler.query() as butler_query:
+ butler_query = butler_query.join_data_coordinates(branch.data_ids)
+ for dataset_type_node in branch.dataset_types.values():
+ if dataset_type_node.name in tree.overall_inputs:
+ # Dataset type is an overall input; we always need to
+ # try to find these.
+ count = 0
+ try:
+ for ref in butler_query.datasets(dataset_type_node.name, self.input_collections):
+ skeleton.set_dataset_ref(ref)
+ count += 1
+ except MissingDatasetTypeError:
+ pass
  self.log.verbose(
- "Deferring prerequisite input %r of task %r to per-quantum processing "
- "(lookup function provided).",
- finder.dataset_type_node.name,
- task_node.label,
+ "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
  )
  continue
- # We also fall back to the base class if there is a
- # nontrivial spatial or temporal join in the lookup.
- if finder.dataset_skypix or finder.dataset_other_spatial:
- if task_prerequisite_info.bounds.spatial_connections:
- self.log.verbose(
- "Deferring prerequisite input %r of task %r to per-quantum processing "
- "(for spatial-bounds-connections handling).",
- finder.dataset_type_node.name,
- task_node.label,
- )
- continue
- if not task_node.dimensions.spatial:
- self.log.verbose(
- "Deferring prerequisite input %r of task %r to per-quantum processing "
- "(dataset has spatial data IDs, but task does not).",
- finder.dataset_type_node.name,
- task_node.label,
- )
- continue
- if finder.dataset_has_timespan:
- if task_prerequisite_info.bounds.spatial_connections:
- self.log.verbose(
- "Deferring prerequisite input %r of task %r to per-quantum processing "
- "(for temporal-bounds-connections handling).",
- finder.dataset_type_node.name,
- task_node.label,
- )
- continue
- if not task_node.dimensions.temporal:
+ if self.skip_existing_in:
+ # Dataset type is an intermediate or output; need to
+ # find these if only they're from previously executed
+ # quanta that we might skip...
+ count = 0
+ try:
+ for ref in butler_query.datasets(dataset_type_node.name, self.skip_existing_in):
+ skeleton.set_output_for_skip(ref)
+ count += 1
+ if ref.run == self.output_run:
+ skeleton.set_output_in_the_way(ref)
+ except MissingDatasetTypeError:
+ pass
+ self.log.verbose(
+ "Found %d output dataset(s) of type %r in %s.",
+ count,
+ dataset_type_node.name,
+ self.skip_existing_in,
+ )
+ if self.output_run_exists and not self.skip_existing_starts_with_output_run:
+ # ...or if they're in the way and would need to be
+ # clobbered (and we haven't already found them in the
+ # previous block).
+ count = 0
+ try:
+ for ref in butler_query.datasets(dataset_type_node.name, [self.output_run]):
+ skeleton.set_output_in_the_way(ref)
+ count += 1
+ except MissingDatasetTypeError:
+ pass
+ self.log.verbose(
+ "Found %d output dataset(s) of type %r in %s.",
+ count,
+ dataset_type_node.name,
+ self.output_run,
+ )
+ # Iterate over tasks with these dimensions to perform follow-up
+ # queries for prerequisite inputs, which may have dimensions
+ # that were not in ``tree.all_dimensions`` and/or require
+ # temporal joins to calibration validity ranges.
+ for task_node in branch.tasks.values():
+ task_prerequisite_info = self.prerequisite_info[task_node.label]
+ for connection_name, finder in list(task_prerequisite_info.finders.items()):
+ if finder.lookup_function is not None:
  self.log.verbose(
  "Deferring prerequisite input %r of task %r to per-quantum processing "
- "(dataset has temporal data IDs, but task does not).",
+ "(lookup function provided).",
  finder.dataset_type_node.name,
  task_node.label,
  )
  continue
- # We have a simple case where we can do a single query
- # that joins the query we already have for the task data
- # IDs to the datasets we're looking for.
- count = 0
- try:
- query_results = list(
- # TODO[DM-46042]: We materialize here as a way to
- # to a SELECT DISTINCT on the main query with a
- # subset of its dimensions columns. It'd be better
- # to have a way to do this that just makes a
- # subquery or a CTE rather than a temporary table.
- query.butler_query.materialize(dimensions=dimensions, datasets=())
- .join_dataset_search(
- finder.dataset_type_node.dataset_type, self.input_collections
+ # We also fall back to the base class if there is a
+ # nontrivial spatial or temporal join in the lookup.
+ if finder.dataset_skypix or finder.dataset_other_spatial:
+ if task_prerequisite_info.bounds.spatial_connections:
+ self.log.verbose(
+ "Deferring prerequisite input %r of task %r to per-quantum processing "
+ "(for spatial-bounds-connections handling).",
+ finder.dataset_type_node.name,
+ task_node.label,
+ )
+ continue
+ if not task_node.dimensions.spatial:
+ self.log.verbose(
+ "Deferring prerequisite input %r of task %r to per-quantum processing "
+ "(dataset has spatial data IDs, but task does not).",
+ finder.dataset_type_node.name,
+ task_node.label,
+ )
+ continue
+ if finder.dataset_has_timespan:
+ if task_prerequisite_info.bounds.spatial_connections:
+ self.log.verbose(
+ "Deferring prerequisite input %r of task %r to per-quantum processing "
+ "(for temporal-bounds-connections handling).",
+ finder.dataset_type_node.name,
+ task_node.label,
+ )
+ continue
+ if not task_node.dimensions.temporal:
+ self.log.verbose(
+ "Deferring prerequisite input %r of task %r to per-quantum processing "
+ "(dataset has temporal data IDs, but task does not).",
+ finder.dataset_type_node.name,
+ task_node.label,
+ )
+ continue
+ # We have a simple case where we can do a single query
+ # that joins the query we already have for the task
+ # data IDs to the datasets we're looking for.
+ count = 0
+ try:
+ query_results = list(
+ butler_query.join_dataset_search(
+ finder.dataset_type_node.dataset_type, self.input_collections
+ )
+ .general(
+ dimensions | finder.dataset_type_node.dataset_type.dimensions,
+ dataset_fields={finder.dataset_type_node.name: ...},
+ find_first=True,
+ )
+ .iter_tuples(finder.dataset_type_node.dataset_type)
  )
- .general(
- dimensions | finder.dataset_type_node.dataset_type.dimensions,
- dataset_fields={finder.dataset_type_node.name: ...},
- find_first=True,
+ except MissingDatasetTypeError:
+ query_results = []
+ for data_id, refs, _ in query_results:
+ ref = refs[0]
+ dataset_key = skeleton.add_prerequisite_node(ref)
+ quantum_key = QuantumKey(
+ task_node.label, data_id.subset(dimensions).required_values
  )
- .iter_tuples(finder.dataset_type_node.dataset_type)
+ skeleton.add_input_edge(quantum_key, dataset_key)
+ count += 1
+ # Remove this finder from the mapping so the base class
+ # knows it doesn't have to look for these
+ # prerequisites.
+ del task_prerequisite_info.finders[connection_name]
+ self.log.verbose(
+ "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
+ count,
+ finder.dataset_type_node.name,
+ task_node.label,
  )
- except MissingDatasetTypeError:
- query_results = []
- for data_id, refs, _ in query_results:
- ref = refs[0]
- dataset_key = skeleton.add_prerequisite_node(ref)
- quantum_key = QuantumKey(task_node.label, data_id.subset(dimensions).required_values)
- skeleton.add_input_edge(quantum_key, dataset_key)
- count += 1
- # Remove this finder from the mapping so the base class
- # knows it doesn't have to look for these prerequisites.
- del task_prerequisite_info.finders[connection_name]
- self.log.verbose(
- "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
- count,
- finder.dataset_type_node.name,
- task_node.label,
- )
+ if not branch.record_elements:
+ # Delete data ID sets we don't need anymore.
+ del branch.data_ids

  @timeMethod
- def _fetch_most_dimension_records(self, query: _AllDimensionsQuery) -> DimensionRecordsMap:
+ def _fetch_most_dimension_records(self, tree: _DimensionGroupTree) -> DimensionRecordsMap:
  """Query for dimension records for all non-prerequisite data IDs (and
  possibly some prerequisite data IDs).

@@ -407,12 +456,17 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
  """
  self.log.verbose("Performing follow-up queries for dimension records.")
  result: dict[str, dict[tuple[DataIdValue, ...], DimensionRecord]] = {}
- for dimensions in query.grouped_by_dimensions.keys():
- for element in dimensions.elements:
- if element not in result:
+ for branch in tree.branches_by_dimensions.values():
+ if not branch.record_elements:
+ continue
+ if not branch.data_ids:
+ continue
+ with self.butler.query() as butler_query:
+ butler_query = butler_query.join_data_coordinates(branch.data_ids)
+ for element in branch.record_elements:
  result[element] = {
  record.dataId.required_values: record
- for record in query.butler_query.dimension_records(element)
+ for record in butler_query.dimension_records(element)
  }
  return result

@@ -532,172 +586,396 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
  skeleton.set_data_id(node_key, expanded_data_id)


- @dataclasses.dataclass(eq=False, repr=False)
- class _AllDimensionsQuery:
- """A helper class for `AllDimensionsQuantumGraphBuilder` that holds all
- per-subgraph state.
+ @dataclasses.dataclass(eq=False, repr=False, slots=True)
+ class _DimensionGroupTwig:
+ """A small side-branch of the tree of dimensions groups that tracks the
+ tasks and dataset types with a particular set of dimensions that appear in
+ the edges populated by its parent branch.

- This object should always be constructed by `from_builder`, which returns
- an instance wrapped with a context manager. This controls the lifetime of
- the temporary table referenced by `common_data_ids`.
+ See `_DimensionGroupTree` for more details.
  """

- subgraph: PipelineGraph
- """Graph of this subset of the pipeline."""
+ parent_edge_tasks: set[str] = dataclasses.field(default_factory=set)
+ """Task labels for tasks whose quanta have the dimensions of this twig and
+ are endpoints of edges that have the combined dimensions of this twig's
+ parent branch.
+ """

- grouped_by_dimensions: dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = (
- dataclasses.field(default_factory=dict)
- )
- """The tasks and dataset types of this subset of the pipeline, grouped
- by their dimensions.
+ parent_edge_dataset_types: set[str] = dataclasses.field(default_factory=set)
+ """Dataset type names for datasets whose quanta have the dimensions of this
+ twig and are endpoints of edges that have the combined dimensions of this
+ twig's parent branch.
+ """

- The tasks and dataset types with empty dimensions are not included; they're
- in other attributes since they are usually used differently. Prerequisite
- dataset types are also not included.
+
+ @dataclasses.dataclass(eq=False, repr=False, slots=True)
+ class _DimensionGroupBranch:
+ """A node in the tree of dimension groups that are used to recursively
+ process query data IDs into a quantum graph.
  """

- empty_dimensions_tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
- """The tasks of this subset of this pipeline that have empty dimensions."""
+ tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
+ """The task nodes whose quanta have these dimensions, keyed by task label.
+ """

- empty_dimensions_dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
- """The dataset types of this subset of this pipeline that have empty
- dimensions.
+ dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
+ """The dataset type nodes whose datasets have these dimensions, keyed by
+ dataset type name.
+ """

- Prerequisite dataset types are not included.
+ record_elements: list[str] = dataclasses.field(default_factory=list)
+ """The names of dimension elements whose records should be looked up via
+ these dimensions.
  """

- overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
- """Pipeline graph nodes for all non-prerequisite, non-init overall-input
- dataset types for this subset of the pipeline.
+ data_ids: set[DataCoordinate] = dataclasses.field(default_factory=set)
+ """All data IDs with these dimensions seen in the QuantumGraph."""
+
+ input_edges: list[tuple[str, str]] = dataclasses.field(default_factory=list)
+ """Dataset type -> task edges that are populated by this set of dimensions.
+
+ These are cases where `dimensions` is the union of the task and dataset
+ type dimensions.
  """

- query_cmd: list[str] = dataclasses.field(default_factory=list)
- """Python code (split across lines) that could be used to reproduce the
- initial query.
+ output_edges: list[tuple[str, str]] = dataclasses.field(default_factory=list)
+ """Task -> dataset type edges that are populated by this set of dimensions.
+
+ These are cases where `dimensions` is the union of the task and dataset
+ type dimensions.
  """

- butler_query: Query = dataclasses.field(init=False)
- """Results of the materialized initial data ID query."""
+ branches: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(default_factory=dict)
+ """Child branches whose dimensions are strict subsets of this branch's
+ dimensions.
+ """

- @classmethod
- @contextmanager
- def from_builder(
- cls, builder: AllDimensionsQuantumGraphBuilder, subgraph: PipelineGraph
- ) -> Iterator[_AllDimensionsQuery]:
- """Construct and run the query, returning an instance guarded by
- a context manager.
+ twigs: defaultdict[DimensionGroup, _DimensionGroupTwig] = dataclasses.field(
+ default_factory=lambda: defaultdict(_DimensionGroupTwig)
+ )
+ """Small branches for all of the dimensions that appear on one side of any
+ edge in `input_edges` or `output_edges`.
+ """
+
+ @property
+ def has_followup_queries(self) -> bool:
+ """Whether we will need to perform follow-up queries with these
+ dimensions.
+ """
+ return bool(self.tasks or self.dataset_types or self.record_elements)
+
+ @staticmethod
+ def populate_record_elements(
+ all_dimensions: DimensionGroup, branches: dict[DimensionGroup, _DimensionGroupBranch]
+ ) -> None:
+ """Ensure we have branches for all dimension elements we'll need to
+ fetch dimension records for.

  Parameters
  ----------
- builder : `AllDimensionsQuantumGraphBuilder`
- Builder object this helper is associated with.
- subgraph : `pipeline_graph.PipelineGraph`
- Subset of the pipeline being processed.
+ all_dimensions : `~lsst.daf.butler.DimensionGroup`
+ All dimensions that appear in the quantum graph.
+ branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
+ `_DimensionGroupBranch` ]
+ Flat mapping of all branches to update in-place. New branches may
+ be added and existing branches may have their `record_element`
+ attributes updated.
+ """
+ for element_name in all_dimensions.elements:
+ element = all_dimensions.universe[element_name]
+ if element.minimal_group in branches:
+ branches[element.minimal_group].record_elements.append(element_name)
+ else:
+ branches[element.minimal_group] = _DimensionGroupBranch(record_elements=[element_name])
+
+ @staticmethod
+ def populate_edges(
+ pipeline_graph: PipelineGraph, branches: dict[DimensionGroup, _DimensionGroupBranch]
+ ) -> None:
+ """Ensure we have branches for all edges in the graph.
+
+ Parameters
+ ----------
+ pipeline_graph : `~..pipeline_graph.PipelineGraph``
+ Graph of tasks and dataset types.
+ branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
+ `_DimensionGroupBranch` ]
+ Flat mapping of all branches to update in-place. New branches may
+ be added and existing branches may have their `input_edges`,
+ `output_edges`, and `twigs` attributes updated.
+ """
+
+ def update_edge_branch(
+ task_node: TaskNode, dataset_type_node: DatasetTypeNode
+ ) -> _DimensionGroupBranch:
+ union_dimensions = task_node.dimensions.union(dataset_type_node.dimensions)
+ if (branch := branches.get(union_dimensions)) is None:
+ branch = _DimensionGroupBranch()
+ branches[union_dimensions] = branch
+ branch.twigs[dataset_type_node.dimensions].parent_edge_dataset_types.add(dataset_type_node.name)
+ branch.twigs[task_node.dimensions].parent_edge_tasks.add(task_node.label)
+ return branch
+
+ for task_node in pipeline_graph.tasks.values():
+ for dataset_type_node in pipeline_graph.inputs_of(task_node.label).values():
+ assert dataset_type_node is not None, "Pipeline graph is resolved."
+ if dataset_type_node.is_prerequisite:
+ continue
+ branch = update_edge_branch(task_node, dataset_type_node)
+ branch.input_edges.append((dataset_type_node.name, task_node.label))
+ for dataset_type_node in pipeline_graph.outputs_of(task_node.label).values():
+ assert dataset_type_node is not None, "Pipeline graph is resolved."
+ branch = update_edge_branch(task_node, dataset_type_node)
+ branch.output_edges.append((task_node.label, dataset_type_node.name))
+
+ @staticmethod
+ def find_next_uncontained_dimensions(
+ parent_dimensions: DimensionGroup | None, candidates: Iterable[DimensionGroup]
+ ) -> list[DimensionGroup]:
+ """Find dimension groups that are not a subset of any other dimension
+ groups in a set.
+
+ Parameters
+ ----------
+ parent_dimensions : `~lsst.daf.butler.DimensionGroup` or `None`
+ If not `None`, first filter out any candidates that are not strict
+ subsets of these dimensions.
+ candidates : `~collections.abc.Iterable` [\
+ `~lsst.daf.butler.DimensionGroup` ]
+ Iterable of dimension groups to consider.

  Returns
  -------
- context : `AbstractContextManager` [ `_AllDimensionsQuery` ]
- An instance of this class, inside a context manager that manages
- the lifetime of its temporary database table.
+ uncontained : `list` [ `~lsst.daf.butler.DimensionGroup` ]
+ Dimension groups that are not contained by any other dimension
+ group in the set of filtered candidates.
  """
- result = cls(subgraph)
- builder.log.debug("Analyzing subgraph dimensions and overall-inputs.")
- result.grouped_by_dimensions = result.subgraph.group_by_dimensions()
- (
- result.empty_dimensions_tasks,
- result.empty_dimensions_dataset_types,
- ) = result.grouped_by_dimensions.pop(builder.universe.empty)
- result.overall_inputs = {
- name: node # type: ignore
- for name, node in result.subgraph.iter_overall_inputs()
- if not node.is_prerequisite # type: ignore
- }
- dimension_names: set[str] = set()
- for dimensions_for_group in result.grouped_by_dimensions.keys():
- dimension_names.update(dimensions_for_group.names)
- dimensions = builder.universe.conform(dimension_names)
- datasets: set[str] = set()
- builder.log.debug("Building query for data IDs.")
- if builder.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
- builder.log.debug("Constraining graph query using all datasets not marked as deferred.")
- datasets = {
- name
- for name, dataset_type_node in result.overall_inputs.items()
- if (
- dataset_type_node.is_initial_query_constraint
- and name not in result.empty_dimensions_dataset_types
- )
- }
- elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
- builder.log.debug("Not using dataset existence to constrain query.")
- elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
- constraint = set(builder.dataset_query_constraint)
- inputs = result.overall_inputs - result.empty_dimensions_dataset_types.keys()
- if remainder := constraint.difference(inputs):
- builder.log.debug(
- "Ignoring dataset types %s in dataset query constraint that are not inputs to this "
- "subgraph, on the assumption that they are relevant for a different subraph.",
- remainder,
- )
- constraint.intersection_update(inputs)
- builder.log.debug(f"Constraining graph query using {constraint}")
- datasets = constraint
+ if parent_dimensions is None:
+ refined_candidates = candidates
  else:
- raise QuantumGraphBuilderError(
- f"Unable to handle type {builder.dataset_query_constraint} given as datasetQueryConstraint."
- )
- with builder.butler.query() as query:
- result.query_cmd.append("with butler.query() as query:")
- result.query_cmd.append(f" query = query.join_dimensions({list(dimensions.names)})")
- query = query.join_dimensions(dimensions)
- if datasets:
- result.query_cmd.append(f" collections = {list(builder.input_collections)}")
- for dataset_type_name in datasets:
- result.query_cmd.append(
- f" query = query.join_dataset_search({dataset_type_name!r}, collections)"
- )
- query = query.join_dataset_search(dataset_type_name, builder.input_collections)
- result.query_cmd.append(
- f" query = query.where({dict(result.subgraph.data_id.mapping)}, "
- f"{builder.where!r}, bind={builder.bind!r})"
- )
- query = query.where(result.subgraph.data_id, builder.where, bind=builder.bind)
- builder.log.verbose(result.format_query_cmd("Querying for data IDs via:"))
- # Allow duplicates from common skypix overlaps to make some queries
- # run faster.
- query._allow_duplicate_overlaps = True
- result.butler_query = query.materialize()
- yield result
+ refined_candidates = [dimensions for dimensions in candidates if dimensions < parent_dimensions]
+ return [
+ dimensions
+ for dimensions in refined_candidates
+ if not any(dimensions < other for other in refined_candidates)
+ ]
+
+ @classmethod
+ def populate_branches(
+ cls,
+ parent_dimensions: DimensionGroup | None,
+ branches: dict[DimensionGroup, _DimensionGroupBranch],
+ ) -> dict[DimensionGroup, _DimensionGroupBranch]:
+ """Transform a flat mapping of dimension group branches into a tree.
+
+ Parameters
+ ----------
+ parent_dimensions : `~lsst.daf.butler.DimensionGroup` or `None`
+ If not `None`, ignore any candidates in `branches` that are not
+ strict subsets of these dimensions.
+ branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
+ `_DimensionGroupBranch` ]
+ Flat mapping of all branches to update in-place, by populating
+ the `branches` attributes to form a tree and removing entries that
+ have been put into the tree.
+
+ Returns
+ -------
+ uncontained_branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
+ `_DimensionGroupBranch` ]
+ Branches whose dimensions were not subsets of any others in the
+ mapping except those that were supersets of ``parent_dimensions``.
+ """
+ result: dict[DimensionGroup, _DimensionGroupBranch] = {}
+ for parent_branch_dimensions in cls.find_next_uncontained_dimensions(
+ parent_dimensions, branches.keys()
+ ):
+ parent_branch = branches.pop(parent_branch_dimensions)
+ result[parent_branch_dimensions] = parent_branch
+ for child_branch_dimensions, child_branch in cls.populate_branches(
+ parent_branch_dimensions, branches
+ ).items():
+ parent_branch.branches[child_branch_dimensions] = child_branch
+ return result

- def format_query_cmd(self, *header: str) -> str:
- """Format the butler query call used as a multi-line string.
+ def project_data_ids(self, log: LsstLogAdapter, log_indent: str = " ") -> None:
+ """Populate the data ID sets of child branches from the data IDs in
+ this branch, recursively.

  Parameters
  ----------
- *header : `str`
- Initial lines the of the returned string, not including newlines.
+ log : `lsst.logging.LsstLogAdapter`
+ Logger to use for status reporting.
+ log_indent : `str`, optional
+ Indentation to prefix the log message. This is used when recursing
+ to make the branch structure clear.
  """
- lines = list(header)
- lines.extend(self.query_cmd)
- return "\n".join(lines)
+ for data_id in self.data_ids:
+ for branch_dimensions, branch in self.branches.items():
+ branch.data_ids.add(data_id.subset(branch_dimensions))
+ for branch_dimensions, branch in self.branches.items():
+ log.debug("%sProjecting query data IDs to %s.", log_indent, branch_dimensions)
+ branch.project_data_ids(log, log_indent + " ")
+
+ def update_skeleton(
+ self, skeleton: QuantumGraphSkeleton, log: LsstLogAdapter, log_indent: str = " "
+ ) -> None:
+ """Process the data ID sets of this branch and its children recursively
+ to add nodes and edges to the under-construction quantum graph.
+
+ Parameters
+ ----------
+ skeleton : `QuantumGraphSkeleton`
+ Under-construction quantum graph to modify in place.
+ log : `lsst.logging.LsstLogAdapter`
+ Logger to use for status reporting.
+ log_indent : `str`, optional
+ Indentation to prefix the log message. This is used when recursing
+ to make the branch structure clear.
+ """
+ for branch_dimensions, branch in self.branches.items():
+ log.verbose(
+ "%sAdding nodes and edges for %s %s data ID(s).",
+ log_indent,
+ len(branch.data_ids),
+ branch_dimensions,
+ )
+ branch.update_skeleton(skeleton, log, log_indent + " ")
+ for data_id in self.data_ids:
+ for task_label in self.tasks:
+ skeleton.add_quantum_node(task_label, data_id)
+ for dataset_type_name in self.dataset_types:
+ skeleton.add_dataset_node(dataset_type_name, data_id)
+ quantum_keys: dict[str, QuantumKey] = {}
+ dataset_keys: dict[str, DatasetKey] = {}
+ for twig_dimensions, twig in self.twigs.items():
+ twig_data_id = data_id.subset(twig_dimensions)
+ for task_label in twig.parent_edge_tasks:
+ quantum_keys[task_label] = QuantumKey(task_label, twig_data_id.required_values)
+ for dataset_type_name in twig.parent_edge_dataset_types:
+ dataset_keys[dataset_type_name] = DatasetKey(
+ dataset_type_name, twig_data_id.required_values
+ )
+ for dataset_type_name, task_label in self.input_edges:
+ skeleton.add_input_edge(quantum_keys[task_label], dataset_keys[dataset_type_name])
+ for task_label, dataset_type_name in self.output_edges:
+ skeleton.add_output_edge(quantum_keys[task_label], dataset_keys[dataset_type_name])
+ if not self.has_followup_queries:
+ # Delete data IDs we don't need anymore to save memory.
+ del self.data_ids
+
+
+ @dataclasses.dataclass(eq=False, repr=False)
+ class _DimensionGroupTree:
+ """A tree of dimension groups in which branches are subsets of their
+ parents.

- def log_failure(self, log: LsstLogAdapter) -> None:
- """Emit an ERROR-level log message that attempts to explain
- why the initial data ID query returned no rows.
+ This class holds all of the per-subgraph state for this QG builder
+ subclass.
+
+ Notes
+ -----
+ The full set of dimensions referenced by any task or dataset type (except
+ prerequisite inputs) forms the conceptual "trunk" of this tree. Each
+ branch has a subset of the dimensions of its parent branch, and each set
+ of dimensions appears exactly once in a tree (so there is some flexibility
+ in where certain dimension subsets may appear; right now this is resolved
+ somewhat arbitrarily).
+ We do not add branches for every possible dimension subset; a branch is
+ created for a `~lsst.daf.butler.DimensionGroup` if:
+
+ - if there is a task whose quanta have those dimensions;
+ - if there is a non-prerequisite dataset type with those dimensions;
+ - if there is an edge for which the union of the task and dataset type
+ dimensions are those dimensions;
+ - if there is a dimension element in any task or non-prerequisite dataset
+ type dimensions whose `~lsst.daf.butler.DimensionElement.minimal_group`
+ is those dimensions.
+
+ We process the initial data query by recursing through this tree structure
+ to populate a data ID set for each branch
+ (`_DimensionGroupBranch.project_data_ids`), and then process those sets
+ recursively (`_DimensionGroupBranch.update_skeleton`). This can be far
+ faster than the non-recursive processing the QG builder used to use because
+ the set of data IDs is smaller (sometimes dramatically smaller) as we move
+ to smaller sets of dimensions.
+
+ In addition to their child branches, a branch that is used to define graph
+ edges also has "twigs", which are a flatter set of dimension subsets for
+ each of the tasks and dataset types that appear in that branch's edges.
+ The same twig dimensions can appear in multiple branches, and twig
+ dimensions can be the same as their parent branch's (but not a superset).
+ """
+
+ subgraph: PipelineGraph
+ """Graph of this subset of the pipeline."""
+
+ all_dimensions: DimensionGroup = dataclasses.field(init=False)
+ """The union of all dimensions that appear in any task or
+ (non-prerequisite) dataset type in this subgraph.
+ """
+
+ empty_dimensions_branch: _DimensionGroupBranch = dataclasses.field(init=False)
+ """The tasks and dataset types of this subset of this pipeline that have
+ empty dimensions.
+
+ Prerequisite dataset types are not included.
+ """
+
+ trunk_branches: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(init=False)
+ """The top-level branches in the tree of dimension groups.
+ """
+
+ branches_by_dimensions: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(init=False)
+ """The tasks and dataset types of this subset of the pipeline, grouped
+ by their dimensions.
+
+ The tasks and dataset types with empty dimensions are not included; they're
+ in `empty_dimensions_tree` since they are usually used differently.
+ Prerequisite dataset types are also not included.
+
+ This is a flatter view of the objects in `trunk_branches`.
+ """
+
+ overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(init=False)
+ """Pipeline graph nodes for all non-prerequisite, non-init overall-input
+ dataset types for this subset of the pipeline.
+ """
+
+ def __post_init__(self) -> None:
+ universe = self.subgraph.universe
+ assert universe is not None, "Pipeline graph is resolved."
+ self.branches_by_dimensions = {
+ dimensions: _DimensionGroupBranch(tasks, dataset_types)
+ for dimensions, (tasks, dataset_types) in self.subgraph.group_by_dimensions().items()
+ }
+ self.all_dimensions = _union_dimensions(self.branches_by_dimensions.keys(), universe)
+ _DimensionGroupBranch.populate_record_elements(self.all_dimensions, self.branches_by_dimensions)
+ _DimensionGroupBranch.populate_edges(self.subgraph, self.branches_by_dimensions)
+ self.trunk_branches = _DimensionGroupBranch.populate_branches(
+ None, self.branches_by_dimensions.copy()
+ )
+ self.empty_dimensions_branch = self.branches_by_dimensions.pop(
+ universe.empty, _DimensionGroupBranch()
+ )
+ self.overall_inputs = {
+ name: node # type: ignore
+ for name, node in self.subgraph.iter_overall_inputs()
+ if not node.is_prerequisite # type: ignore
+ }
+
+ def project_data_ids(self, log: LsstLogAdapter) -> None:
+ """Recursively populate the data ID sets of the dimension group tree
+ from the data ID sets of the trunk branches.

  Parameters
  ----------
- log : `logging.Logger`
- The logger to use to emit log messages.
+ log : `lsst.logging.LsstLogAdapter`
+ Logger to use for status reporting.
  """
- # A single multiline log plays better with log aggregators like Loki.
- header = ["Initial data ID query returned no rows, so QuantumGraph will be empty."]
- try:
- header.extend(self.butler_query.explain_no_results())
- header.append("To reproduce this query for debugging purposes, run:")
- finally:
- # If an exception was raised, write a partial.
- log.error(self.format_query_cmd(*header))
+ for branch_dimensions, branch in self.trunk_branches.items():
+ log.debug("Projecting query data IDs to %s.", branch_dimensions)
+ branch.project_data_ids(log)


  class DimensionRecordAttacher:
@@ -829,3 +1107,10 @@ class DataIdExpansionLeftovers:
  missing_record_data_ids: defaultdict[str, set[tuple[DataIdValue, ...]]] = dataclasses.field(
  default_factory=lambda: defaultdict(set)
  )
+
+
+ def _union_dimensions(groups: Iterable[DimensionGroup], universe: DimensionUniverse) -> DimensionGroup:
+ dimension_names: set[str] = set()
+ for dimensions_for_group in groups:
+ dimension_names.update(dimensions_for_group.names)
+ return universe.conform(dimension_names)