lsst-pipe-base 29.0.0rc1__py3-none-any.whl → 29.2025.1100__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,8 +36,7 @@ __all__ = ("AllDimensionsQuantumGraphBuilder", "DatasetQueryConstraintVariant")
36
36
  import dataclasses
37
37
  import itertools
38
38
  from collections import defaultdict
39
- from collections.abc import Iterator, Mapping
40
- from contextlib import contextmanager
39
+ from collections.abc import Iterable, Mapping
41
40
  from typing import TYPE_CHECKING, Any, TypeAlias, final
42
41
 
43
42
  from lsst.daf.butler import (
@@ -46,9 +45,9 @@ from lsst.daf.butler import (
46
45
  DataIdValue,
47
46
  DimensionGroup,
48
47
  DimensionRecord,
48
+ DimensionUniverse,
49
49
  MissingDatasetTypeError,
50
50
  )
51
- from lsst.daf.butler.queries import Query
52
51
  from lsst.utils.logging import LsstLogAdapter
53
52
  from lsst.utils.timer import timeMethod
54
53
 
@@ -128,262 +127,308 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
128
127
  # There is some chance that the dimension query for one subgraph would
129
128
  # be the same as or a dimension-subset of another. This is an
130
129
  # optimization opportunity we're not currently taking advantage of.
131
- with _AllDimensionsQuery.from_builder(self, subgraph) as query:
132
- skeleton = self._make_subgraph_skeleton(query)
133
- self._find_followup_datasets(query, skeleton)
134
- dimension_records = self._fetch_most_dimension_records(query)
130
+ tree = _DimensionGroupTree(subgraph)
131
+ self._query_for_data_ids(tree)
132
+ skeleton = self._make_subgraph_skeleton(tree)
133
+ self._find_followup_datasets(tree, skeleton)
134
+ dimension_records = self._fetch_most_dimension_records(tree)
135
135
  leftovers = self._attach_most_dimension_records(skeleton, dimension_records)
136
136
  self._fetch_leftover_dimension_records(leftovers, dimension_records)
137
137
  self._attach_leftover_dimension_records(skeleton, leftovers, dimension_records)
138
138
  return skeleton
139
139
 
140
+ def _query_for_data_ids(self, tree: _DimensionGroupTree) -> None:
141
+ """Query for data IDs and use the result to populate the dimension
142
+ group tree.
143
+
144
+ Parameters
145
+ ----------
146
+ tree : `_DimensionGroupTree`
147
+ Tree with dimension group branches that holds subgraph-specific
148
+ state for this builder, to be modified in place.
149
+ """
150
+ self.log.debug("Analyzing subgraph dimensions and overall-inputs.")
151
+ constraint_datasets: set[str] = set()
152
+ self.log.debug("Building query for data IDs.")
153
+ if self.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
154
+ self.log.debug("Constraining graph query using all datasets not marked as deferred.")
155
+ constraint_datasets = {
156
+ name
157
+ for name, dataset_type_node in tree.overall_inputs.items()
158
+ if (dataset_type_node.is_initial_query_constraint and dataset_type_node.dimensions)
159
+ }
160
+ elif self.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
161
+ self.log.debug("Not using dataset existence to constrain query.")
162
+ elif self.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
163
+ constraint = set(self.dataset_query_constraint)
164
+ inputs = tree.overall_inputs - tree.empty_dimensions_branch.dataset_types.keys()
165
+ if remainder := constraint.difference(inputs):
166
+ self.log.debug(
167
+ "Ignoring dataset types %s in dataset query constraint that are not inputs to this "
168
+ "subgraph, on the assumption that they are relevant for a different subgraph.",
169
+ remainder,
170
+ )
171
+ constraint.intersection_update(inputs)
172
+ self.log.debug(f"Constraining graph query using {constraint}")
173
+ constraint_datasets = constraint
174
+ else:
175
+ raise QuantumGraphBuilderError(
176
+ f"Unable to handle type {self.dataset_query_constraint} given as datasetQueryConstraint."
177
+ )
178
+ query_cmd: list[str] = []
179
+ with self.butler.query() as query:
180
+ query_cmd.append("with butler.query() as query:")
181
+ query_cmd.append(f" query = query.join_dimensions({list(tree.all_dimensions.names)})")
182
+ query = query.join_dimensions(tree.all_dimensions)
183
+ if constraint_datasets:
184
+ query_cmd.append(f" collections = {list(self.input_collections)}")
185
+ for dataset_type_name in constraint_datasets:
186
+ query_cmd.append(f" query = query.join_dataset_search({dataset_type_name!r}, collections)")
187
+ query = query.join_dataset_search(dataset_type_name, self.input_collections)
188
+ query_cmd.append(
189
+ f" query = query.where({dict(tree.subgraph.data_id.mapping)}, "
190
+ f"{self.where!r}, bind={self.bind!r})"
191
+ )
192
+ query = query.where(tree.subgraph.data_id, self.where, bind=self.bind)
193
+ self.log.verbose("Querying for data IDs via: %s", "\n".join(query_cmd))
194
+ # Allow duplicates from common skypix overlaps to make some queries
195
+ # run faster.
196
+ query._allow_duplicate_overlaps = True
197
+ self.log.info("Iterating over query results to associate quanta with datasets.")
198
+ # Iterate over query results, populating data IDs for datasets,
199
+ # quanta, and edges. We populate only the first level of the tree
200
+ # in the first pass, so we can be done with the query results as
201
+ # quickly as possible in case that holds a connection/cursor open.
202
+ n_rows = 0
203
+ for common_data_id in query.data_ids(tree.all_dimensions):
204
+ for branch_dimensions, branch in tree.trunk_branches.items():
205
+ data_id = common_data_id.subset(branch_dimensions)
206
+ branch.data_ids.add(data_id)
207
+ n_rows += 1
208
+ if n_rows == 0:
209
+ # A single multiline log plays better with log aggregators like
210
+ # Loki.
211
+ lines = ["Initial data ID query returned no rows, so QuantumGraph will be empty."]
212
+ try:
213
+ lines.extend(query.explain_no_results())
214
+ finally:
215
+ lines.append("To reproduce this query for debugging purposes, run:")
216
+ lines.extend(query_cmd)
217
+ # If an exception was raised, write a partial.
218
+ self.log.error("\n".join(lines))
219
+ return
220
+ self.log.verbose("Processed %s initial data ID query rows.", n_rows)
221
+ # We now recursively populate the data IDs of the rest of the tree.
222
+ tree.project_data_ids(self.log)
223
+
140
224
  @timeMethod
141
- def _make_subgraph_skeleton(self, query: _AllDimensionsQuery) -> QuantumGraphSkeleton:
142
- """Build a `QuantumGraphSkeleton` by iterating over the result rows
143
- of the initial data ID query.
225
+ def _make_subgraph_skeleton(self, tree: _DimensionGroupTree) -> QuantumGraphSkeleton:
226
+ """Build a `QuantumGraphSkeleton` by processing the data IDs in the
227
+ dimension group tree.
144
228
 
145
229
  Parameters
146
230
  ----------
147
- query : `_AllDimensionsQuery`
148
- Object representing the full-pipeline data ID query.
231
+ tree : `_DimensionGroupTree`
232
+ Tree with dimension group branches that holds subgraph-specific
233
+ state for this builder.
149
234
 
150
235
  Returns
151
236
  -------
152
237
  skeleton : `QuantumGraphSkeleton`
153
238
  Preliminary quantum graph.
154
239
  """
155
- # First we make containers of empty-dimensions quantum and dataset
156
- # keys, and add those to the skelton, since empty data IDs are
157
- # logically subsets of any data ID. We'll copy those to initialize the
158
- # containers of keys for each result row. We don't ever explicitly add
159
- # nodes to the skeleton for these, and that's okay because networkx
160
- # adds nodes implicitly when an edge to that node is added, and we
161
- # don't want to add nodes for init datasets here.
162
- skeleton = QuantumGraphSkeleton(query.subgraph.tasks)
163
- empty_dimensions_dataset_keys = {}
164
- for dataset_type_name in query.empty_dimensions_dataset_types.keys():
165
- dataset_key = skeleton.add_dataset_node(dataset_type_name, self.empty_data_id)
166
- empty_dimensions_dataset_keys[dataset_type_name] = dataset_key
167
- if ref := self.empty_dimensions_datasets.inputs.get(dataset_key):
168
- skeleton.set_dataset_ref(ref, dataset_key)
169
- if ref := self.empty_dimensions_datasets.outputs_for_skip.get(dataset_key):
170
- skeleton.set_output_for_skip(ref)
171
- if ref := self.empty_dimensions_datasets.outputs_in_the_way.get(dataset_key):
172
- skeleton.set_output_in_the_way(ref)
173
- empty_dimensions_quantum_keys = []
174
- for task_label in query.empty_dimensions_tasks.keys():
175
- empty_dimensions_quantum_keys.append(skeleton.add_quantum_node(task_label, self.empty_data_id))
176
- self.log.info("Iterating over query results to associate quanta with datasets.")
177
- # Iterate over query results, populating data IDs for datasets and
178
- # quanta and then connecting them to each other. This is the slowest
179
- # client-side part of QG generation, and it's often the slowest part
180
- # overall, so inside this loop is where it's really critical to avoid
181
- # expensive things, especially in the nested loops.
182
- n_rows = 0
183
- for common_data_id in query.butler_query.data_ids():
184
- # Create a data ID for each set of dimensions used by one or more
185
- # tasks or dataset types, and use that to record all quanta and
186
- # dataset data IDs for this row.
187
- dataset_keys_for_row: dict[str, DatasetKey] = empty_dimensions_dataset_keys.copy()
188
- quantum_keys_for_row: list[QuantumKey] = empty_dimensions_quantum_keys.copy()
189
- for dimensions, (task_nodes, dataset_type_nodes) in query.grouped_by_dimensions.items():
190
- data_id = common_data_id.subset(dimensions)
191
- for dataset_type_name in dataset_type_nodes.keys():
192
- dataset_keys_for_row[dataset_type_name] = skeleton.add_dataset_node(
193
- dataset_type_name, data_id
194
- )
195
- for task_label in task_nodes.keys():
196
- quantum_keys_for_row.append(skeleton.add_quantum_node(task_label, data_id))
197
- # Whether these quanta are new or existing, we can now associate
198
- # the dataset data IDs for this row with them. The fact that a
199
- # quantum data ID and a dataset data ID both came from the same
200
- # result row is what tells us they should be associated. Many of
201
- # these associates will be duplicates (because another query row
202
- # that differed from this one only in irrelevant dimensions already
203
- # added them), and our use of sets should take care of that.
204
- for quantum_key in quantum_keys_for_row:
205
- for read_edge in self._pipeline_graph.tasks[quantum_key.task_label].inputs.values():
206
- skeleton.add_input_edge(
207
- quantum_key, dataset_keys_for_row[read_edge.parent_dataset_type_name]
208
- )
209
- for write_edge in self._pipeline_graph.tasks[quantum_key.task_label].iter_all_outputs():
210
- skeleton.add_output_edge(
211
- quantum_key, dataset_keys_for_row[write_edge.parent_dataset_type_name]
212
- )
213
- n_rows += 1
214
- if n_rows == 0:
215
- query.log_failure(self.log)
216
- else:
217
- n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in query.subgraph.tasks)
218
- self.log.info(
219
- "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges from %d query row(s).",
220
- n_quanta,
221
- skeleton.n_nodes - n_quanta,
222
- skeleton.n_edges,
223
- n_rows,
240
+ skeleton = QuantumGraphSkeleton(tree.subgraph.tasks)
241
+ for branch_dimensions, branch in tree.trunk_branches.items():
242
+ self.log.verbose(
243
+ "Adding nodes and edges for %s %s data ID(s).",
244
+ len(branch.data_ids),
245
+ branch_dimensions,
224
246
  )
247
+ branch.update_skeleton(skeleton, self.log)
248
+ n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in tree.subgraph.tasks)
249
+ self.log.info(
250
+ "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges.",
251
+ n_quanta,
252
+ skeleton.n_nodes - n_quanta,
253
+ skeleton.n_edges,
254
+ )
225
255
  return skeleton
226
256
 
227
257
  @timeMethod
228
- def _find_followup_datasets(self, query: _AllDimensionsQuery, skeleton: QuantumGraphSkeleton) -> None:
229
- """Populate `existing_datasets` by performing follow-up queries joined
230
- to column-subsets of the initial data ID query.
258
+ def _find_followup_datasets(self, tree: _DimensionGroupTree, skeleton: QuantumGraphSkeleton) -> None:
259
+ """Populate `existing_datasets` by performing follow-up queries with
260
+ the data IDs in the dimension group tree.
231
261
 
232
262
  Parameters
233
263
  ----------
234
- query : `_AllDimensionsQuery`
235
- Object representing the full-pipeline data ID query.
264
+ tree : `_DimensionGroupTree`
265
+ Tree with dimension group branches that holds subgraph-specific
266
+ state for this builder.
267
+ skeleton : `.quantum_graph_skeleton.QuantumGraphSkeleton`
268
+ In-progress quantum graph to modify in place.
236
269
  """
237
- for dimensions, (tasks_in_group, dataset_types_in_group) in query.grouped_by_dimensions.items():
270
+ dataset_key: DatasetKey | PrerequisiteDatasetKey
271
+ for dataset_type_name in tree.empty_dimensions_branch.dataset_types.keys():
272
+ dataset_key = DatasetKey(dataset_type_name, self.empty_data_id.required_values)
273
+ if ref := self.empty_dimensions_datasets.inputs.get(dataset_key):
274
+ skeleton.set_dataset_ref(ref, dataset_key)
275
+ if ref := self.empty_dimensions_datasets.outputs_for_skip.get(dataset_key):
276
+ skeleton.set_output_for_skip(ref)
277
+ if ref := self.empty_dimensions_datasets.outputs_in_the_way.get(dataset_key):
278
+ skeleton.set_output_in_the_way(ref)
279
+ for dimensions, branch in tree.branches_by_dimensions.items():
280
+ if not branch.has_followup_queries:
281
+ continue
282
+ if not branch.data_ids:
283
+ continue
238
284
  # Iterate over regular input/output dataset type nodes with these
239
- # dimensions to find those datasets using straightforward followup
240
- # queries.
241
- for dataset_type_node in dataset_types_in_group.values():
242
- if dataset_type_node.name in query.overall_inputs:
243
- # Dataset type is an overall input; we always need to try
244
- # to find these.
245
- count = 0
246
- try:
247
- for ref in query.butler_query.datasets(
248
- dataset_type_node.name, self.input_collections
249
- ):
250
- skeleton.set_dataset_ref(ref)
251
- count += 1
252
- except MissingDatasetTypeError:
253
- pass
254
- self.log.verbose(
255
- "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
256
- )
257
- continue
258
- if self.skip_existing_in:
259
- # Dataset type is an intermediate or output; need to find
260
- # these if only they're from previously executed quanta
261
- # that we might skip...
262
- count = 0
263
- try:
264
- for ref in query.butler_query.datasets(dataset_type_node.name, self.skip_existing_in):
265
- skeleton.set_output_for_skip(ref)
266
- count += 1
267
- if ref.run == self.output_run:
268
- skeleton.set_output_in_the_way(ref)
269
- except MissingDatasetTypeError:
270
- pass
271
- self.log.verbose(
272
- "Found %d output dataset(s) of type %r in %s.",
273
- count,
274
- dataset_type_node.name,
275
- self.skip_existing_in,
276
- )
277
- if self.output_run_exists and not self.skip_existing_starts_with_output_run:
278
- # ...or if they're in the way and would need to be
279
- # clobbered (and we haven't already found them in the
280
- # previous block).
281
- count = 0
282
- try:
283
- for ref in query.butler_query.datasets(dataset_type_node.name, [self.output_run]):
284
- skeleton.set_output_in_the_way(ref)
285
- count += 1
286
- except MissingDatasetTypeError:
287
- pass
288
- self.log.verbose(
289
- "Found %d output dataset(s) of type %r in %s.",
290
- count,
291
- dataset_type_node.name,
292
- self.output_run,
293
- )
294
- # Iterate over tasks with these dimensions to perform follow-up
295
- # queries for prerequisite inputs, which may have dimensions that
296
- # were not in ``query.butler_query.dimensions`` and/or require
297
- # temporal joins to calibration validity ranges.
298
- for task_node in tasks_in_group.values():
299
- task_prerequisite_info = self.prerequisite_info[task_node.label]
300
- for connection_name, finder in list(task_prerequisite_info.finders.items()):
301
- if finder.lookup_function is not None:
285
+ # dimensions to find those datasets using followup queries.
286
+ with self.butler.query() as butler_query:
287
+ butler_query = butler_query.join_data_coordinates(branch.data_ids)
288
+ for dataset_type_node in branch.dataset_types.values():
289
+ if dataset_type_node.name in tree.overall_inputs:
290
+ # Dataset type is an overall input; we always need to
291
+ # try to find these.
292
+ count = 0
293
+ try:
294
+ for ref in butler_query.datasets(dataset_type_node.name, self.input_collections):
295
+ skeleton.set_dataset_ref(ref)
296
+ count += 1
297
+ except MissingDatasetTypeError:
298
+ pass
302
299
  self.log.verbose(
303
- "Deferring prerequisite input %r of task %r to per-quantum processing "
304
- "(lookup function provided).",
305
- finder.dataset_type_node.name,
306
- task_node.label,
300
+ "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
307
301
  )
308
302
  continue
309
- # We also fall back to the base class if there is a
310
- # nontrivial spatial or temporal join in the lookup.
311
- if finder.dataset_skypix or finder.dataset_other_spatial:
312
- if task_prerequisite_info.bounds.spatial_connections:
313
- self.log.verbose(
314
- "Deferring prerequisite input %r of task %r to per-quantum processing "
315
- "(for spatial-bounds-connections handling).",
316
- finder.dataset_type_node.name,
317
- task_node.label,
318
- )
319
- continue
320
- if not task_node.dimensions.spatial:
321
- self.log.verbose(
322
- "Deferring prerequisite input %r of task %r to per-quantum processing "
323
- "(dataset has spatial data IDs, but task does not).",
324
- finder.dataset_type_node.name,
325
- task_node.label,
326
- )
327
- continue
328
- if finder.dataset_has_timespan:
329
- if task_prerequisite_info.bounds.spatial_connections:
330
- self.log.verbose(
331
- "Deferring prerequisite input %r of task %r to per-quantum processing "
332
- "(for temporal-bounds-connections handling).",
333
- finder.dataset_type_node.name,
334
- task_node.label,
335
- )
336
- continue
337
- if not task_node.dimensions.temporal:
303
+ if self.skip_existing_in:
304
+ # Dataset type is an intermediate or output; need to
305
+ # find these only if they're from previously executed
306
+ # quanta that we might skip...
307
+ count = 0
308
+ try:
309
+ for ref in butler_query.datasets(dataset_type_node.name, self.skip_existing_in):
310
+ skeleton.set_output_for_skip(ref)
311
+ count += 1
312
+ if ref.run == self.output_run:
313
+ skeleton.set_output_in_the_way(ref)
314
+ except MissingDatasetTypeError:
315
+ pass
316
+ self.log.verbose(
317
+ "Found %d output dataset(s) of type %r in %s.",
318
+ count,
319
+ dataset_type_node.name,
320
+ self.skip_existing_in,
321
+ )
322
+ if self.output_run_exists and not self.skip_existing_starts_with_output_run:
323
+ # ...or if they're in the way and would need to be
324
+ # clobbered (and we haven't already found them in the
325
+ # previous block).
326
+ count = 0
327
+ try:
328
+ for ref in butler_query.datasets(dataset_type_node.name, [self.output_run]):
329
+ skeleton.set_output_in_the_way(ref)
330
+ count += 1
331
+ except MissingDatasetTypeError:
332
+ pass
333
+ self.log.verbose(
334
+ "Found %d output dataset(s) of type %r in %s.",
335
+ count,
336
+ dataset_type_node.name,
337
+ self.output_run,
338
+ )
339
+ # Iterate over tasks with these dimensions to perform follow-up
340
+ # queries for prerequisite inputs, which may have dimensions
341
+ # that were not in ``tree.all_dimensions`` and/or require
342
+ # temporal joins to calibration validity ranges.
343
+ for task_node in branch.tasks.values():
344
+ task_prerequisite_info = self.prerequisite_info[task_node.label]
345
+ for connection_name, finder in list(task_prerequisite_info.finders.items()):
346
+ if finder.lookup_function is not None:
338
347
  self.log.verbose(
339
348
  "Deferring prerequisite input %r of task %r to per-quantum processing "
340
- "(dataset has temporal data IDs, but task does not).",
349
+ "(lookup function provided).",
341
350
  finder.dataset_type_node.name,
342
351
  task_node.label,
343
352
  )
344
353
  continue
345
- # We have a simple case where we can do a single query
346
- # that joins the query we already have for the task data
347
- # IDs to the datasets we're looking for.
348
- count = 0
349
- try:
350
- query_results = list(
351
- # TODO[DM-46042]: We materialize here as a way to
352
- # to a SELECT DISTINCT on the main query with a
353
- # subset of its dimensions columns. It'd be better
354
- # to have a way to do this that just makes a
355
- # subquery or a CTE rather than a temporary table.
356
- query.butler_query.materialize(dimensions=dimensions, datasets=())
357
- .join_dataset_search(
358
- finder.dataset_type_node.dataset_type, self.input_collections
354
+ # We also fall back to the base class if there is a
355
+ # nontrivial spatial or temporal join in the lookup.
356
+ if finder.dataset_skypix or finder.dataset_other_spatial:
357
+ if task_prerequisite_info.bounds.spatial_connections:
358
+ self.log.verbose(
359
+ "Deferring prerequisite input %r of task %r to per-quantum processing "
360
+ "(for spatial-bounds-connections handling).",
361
+ finder.dataset_type_node.name,
362
+ task_node.label,
363
+ )
364
+ continue
365
+ if not task_node.dimensions.spatial:
366
+ self.log.verbose(
367
+ "Deferring prerequisite input %r of task %r to per-quantum processing "
368
+ "(dataset has spatial data IDs, but task does not).",
369
+ finder.dataset_type_node.name,
370
+ task_node.label,
371
+ )
372
+ continue
373
+ if finder.dataset_has_timespan:
374
+ if task_prerequisite_info.bounds.spatial_connections:
375
+ self.log.verbose(
376
+ "Deferring prerequisite input %r of task %r to per-quantum processing "
377
+ "(for temporal-bounds-connections handling).",
378
+ finder.dataset_type_node.name,
379
+ task_node.label,
380
+ )
381
+ continue
382
+ if not task_node.dimensions.temporal:
383
+ self.log.verbose(
384
+ "Deferring prerequisite input %r of task %r to per-quantum processing "
385
+ "(dataset has temporal data IDs, but task does not).",
386
+ finder.dataset_type_node.name,
387
+ task_node.label,
388
+ )
389
+ continue
390
+ # We have a simple case where we can do a single query
391
+ # that joins the query we already have for the task
392
+ # data IDs to the datasets we're looking for.
393
+ count = 0
394
+ try:
395
+ query_results = list(
396
+ butler_query.join_dataset_search(
397
+ finder.dataset_type_node.dataset_type, self.input_collections
398
+ )
399
+ .general(
400
+ dimensions | finder.dataset_type_node.dataset_type.dimensions,
401
+ dataset_fields={finder.dataset_type_node.name: ...},
402
+ find_first=True,
403
+ )
404
+ .iter_tuples(finder.dataset_type_node.dataset_type)
359
405
  )
360
- .general(
361
- dimensions | finder.dataset_type_node.dataset_type.dimensions,
362
- dataset_fields={finder.dataset_type_node.name: ...},
363
- find_first=True,
406
+ except MissingDatasetTypeError:
407
+ query_results = []
408
+ for data_id, refs, _ in query_results:
409
+ ref = refs[0]
410
+ dataset_key = skeleton.add_prerequisite_node(ref)
411
+ quantum_key = QuantumKey(
412
+ task_node.label, data_id.subset(dimensions).required_values
364
413
  )
365
- .iter_tuples(finder.dataset_type_node.dataset_type)
414
+ skeleton.add_input_edge(quantum_key, dataset_key)
415
+ count += 1
416
+ # Remove this finder from the mapping so the base class
417
+ # knows it doesn't have to look for these
418
+ # prerequisites.
419
+ del task_prerequisite_info.finders[connection_name]
420
+ self.log.verbose(
421
+ "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
422
+ count,
423
+ finder.dataset_type_node.name,
424
+ task_node.label,
366
425
  )
367
- except MissingDatasetTypeError:
368
- query_results = []
369
- for data_id, refs, _ in query_results:
370
- ref = refs[0]
371
- dataset_key = skeleton.add_prerequisite_node(ref)
372
- quantum_key = QuantumKey(task_node.label, data_id.subset(dimensions).required_values)
373
- skeleton.add_input_edge(quantum_key, dataset_key)
374
- count += 1
375
- # Remove this finder from the mapping so the base class
376
- # knows it doesn't have to look for these prerequisites.
377
- del task_prerequisite_info.finders[connection_name]
378
- self.log.verbose(
379
- "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
380
- count,
381
- finder.dataset_type_node.name,
382
- task_node.label,
383
- )
426
+ if not branch.record_elements:
427
+ # Delete data ID sets we don't need anymore.
428
+ del branch.data_ids
384
429
 
385
430
  @timeMethod
386
- def _fetch_most_dimension_records(self, query: _AllDimensionsQuery) -> DimensionRecordsMap:
431
+ def _fetch_most_dimension_records(self, tree: _DimensionGroupTree) -> DimensionRecordsMap:
387
432
  """Query for dimension records for all non-prerequisite data IDs (and
388
433
  possibly some prerequisite data IDs).
389
434
 
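
The new `_query_for_data_ids` makes only one pass over the query results: each row is immediately reduced to one data ID per trunk branch by subsetting to that branch's dimensions, and a per-branch set absorbs the duplicates. A minimal sketch of that projection, using plain dicts and tuples as stand-ins for butler `DataCoordinate` and `DimensionGroup` objects (all names below are illustrative, not part of the package):

```python
# Illustrative sketch only: plain dicts/tuples stand in for butler data IDs
# and dimension groups; the loop mirrors subsetting each query row to every
# trunk branch and deduplicating in a per-branch set.
rows = [
    {"visit": visit, "detector": detector, "patch": patch}
    for visit in (1, 2)
    for detector in (10, 11)
    for patch in (100,)
]

trunk_branches: dict[tuple[str, ...], set] = {
    ("visit", "detector"): set(),
    ("patch",): set(),
}

for row in rows:
    for dimensions, data_ids in trunk_branches.items():
        # Analogue of common_data_id.subset(branch_dimensions).
        data_ids.add(tuple(row[name] for name in dimensions))

assert len(trunk_branches[("visit", "detector")]) == 4
assert len(trunk_branches[("patch",)]) == 1  # duplicate rows collapsed
```
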
@@ -407,12 +452,17 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
407
452
  """
408
453
  self.log.verbose("Performing follow-up queries for dimension records.")
409
454
  result: dict[str, dict[tuple[DataIdValue, ...], DimensionRecord]] = {}
410
- for dimensions in query.grouped_by_dimensions.keys():
411
- for element in dimensions.elements:
412
- if element not in result:
455
+ for branch in tree.branches_by_dimensions.values():
456
+ if not branch.record_elements:
457
+ continue
458
+ if not branch.data_ids:
459
+ continue
460
+ with self.butler.query() as butler_query:
461
+ butler_query = butler_query.join_data_coordinates(branch.data_ids)
462
+ for element in branch.record_elements:
413
463
  result[element] = {
414
464
  record.dataId.required_values: record
415
- for record in query.butler_query.dimension_records(element)
465
+ for record in butler_query.dimension_records(element)
416
466
  }
417
467
  return result
418
468
 
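
`_fetch_most_dimension_records` keys each follow-up query's records by the record data ID's `required_values`, so the later attachment step is a dictionary lookup rather than another query. A small sketch of that indexing pattern with a stand-in record type (hypothetical names, not the butler API):

```python
# Illustrative sketch only: FakeRecord stands in for a butler DimensionRecord;
# the nested dict mirrors DimensionRecordsMap (element name -> data ID values
# -> record).
import dataclasses


@dataclasses.dataclass(frozen=True)
class FakeRecord:
    required_values: tuple[int, ...]
    payload: str


fetched = [FakeRecord((1,), "visit 1 metadata"), FakeRecord((2,), "visit 2 metadata")]

records_by_element: dict[str, dict[tuple[int, ...], FakeRecord]] = {
    "visit": {record.required_values: record for record in fetched}
}

assert records_by_element["visit"][(2,)].payload == "visit 2 metadata"
```
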
@@ -532,172 +582,396 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
532
582
  skeleton.set_data_id(node_key, expanded_data_id)
533
583
 
534
584
 
535
- @dataclasses.dataclass(eq=False, repr=False)
536
- class _AllDimensionsQuery:
537
- """A helper class for `AllDimensionsQuantumGraphBuilder` that holds all
538
- per-subgraph state.
585
+ @dataclasses.dataclass(eq=False, repr=False, slots=True)
586
+ class _DimensionGroupTwig:
587
+ """A small side-branch of the tree of dimensions groups that tracks the
588
+ tasks and dataset types with a particular set of dimensions that appear in
589
+ the edges populated by its parent branch.
539
590
 
540
- This object should always be constructed by `from_builder`, which returns
541
- an instance wrapped with a context manager. This controls the lifetime of
542
- the temporary table referenced by `common_data_ids`.
591
+ See `_DimensionGroupTree` for more details.
543
592
  """
544
593
 
545
- subgraph: PipelineGraph
546
- """Graph of this subset of the pipeline."""
594
+ parent_edge_tasks: set[str] = dataclasses.field(default_factory=set)
595
+ """Task labels for tasks whose quanta have the dimensions of this twig and
596
+ are endpoints of edges that have the combined dimensions of this twig's
597
+ parent branch.
598
+ """
547
599
 
548
- grouped_by_dimensions: dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = (
549
- dataclasses.field(default_factory=dict)
550
- )
551
- """The tasks and dataset types of this subset of the pipeline, grouped
552
- by their dimensions.
600
+ parent_edge_dataset_types: set[str] = dataclasses.field(default_factory=set)
601
+ """Dataset type names for datasets whose quanta have the dimensions of this
602
+ twig and are endpoints of edges that have the combined dimensions of this
603
+ twig's parent branch.
604
+ """
553
605
 
554
- The tasks and dataset types with empty dimensions are not included; they're
555
- in other attributes since they are usually used differently. Prerequisite
556
- dataset types are also not included.
606
+
607
+ @dataclasses.dataclass(eq=False, repr=False, slots=True)
608
+ class _DimensionGroupBranch:
609
+ """A node in the tree of dimension groups that are used to recursively
610
+ process query data IDs into a quantum graph.
557
611
  """
558
612
 
559
- empty_dimensions_tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
560
- """The tasks of this subset of this pipeline that have empty dimensions."""
613
+ tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
614
+ """The task nodes whose quanta have these dimensions, keyed by task label.
615
+ """
561
616
 
562
- empty_dimensions_dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
563
- """The dataset types of this subset of this pipeline that have empty
564
- dimensions.
617
+ dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
618
+ """The dataset type nodes whose datasets have these dimensions, keyed by
619
+ dataset type name.
620
+ """
565
621
 
566
- Prerequisite dataset types are not included.
622
+ record_elements: list[str] = dataclasses.field(default_factory=list)
623
+ """The names of dimension elements whose records should be looked up via
624
+ these dimensions.
567
625
  """
568
626
 
569
- overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
570
- """Pipeline graph nodes for all non-prerequisite, non-init overall-input
571
- dataset types for this subset of the pipeline.
627
+ data_ids: set[DataCoordinate] = dataclasses.field(default_factory=set)
628
+ """All data IDs with these dimensions seen in the QuantumGraph."""
629
+
630
+ input_edges: list[tuple[str, str]] = dataclasses.field(default_factory=list)
631
+ """Dataset type -> task edges that are populated by this set of dimensions.
632
+
633
+ These are cases where `dimensions` is the union of the task and dataset
634
+ type dimensions.
572
635
  """
573
636
 
574
- query_cmd: list[str] = dataclasses.field(default_factory=list)
575
- """Python code (split across lines) that could be used to reproduce the
576
- initial query.
637
+ output_edges: list[tuple[str, str]] = dataclasses.field(default_factory=list)
638
+ """Task -> dataset type edges that are populated by this set of dimensions.
639
+
640
+ These are cases where `dimensions` is the union of the task and dataset
641
+ type dimensions.
577
642
  """
578
643
 
579
- butler_query: Query = dataclasses.field(init=False)
580
- """Results of the materialized initial data ID query."""
644
+ branches: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(default_factory=dict)
645
+ """Child branches whose dimensions are strict subsets of this branch's
646
+ dimensions.
647
+ """
581
648
 
582
- @classmethod
583
- @contextmanager
584
- def from_builder(
585
- cls, builder: AllDimensionsQuantumGraphBuilder, subgraph: PipelineGraph
586
- ) -> Iterator[_AllDimensionsQuery]:
587
- """Construct and run the query, returning an instance guarded by
588
- a context manager.
649
+ twigs: defaultdict[DimensionGroup, _DimensionGroupTwig] = dataclasses.field(
650
+ default_factory=lambda: defaultdict(_DimensionGroupTwig)
651
+ )
652
+ """Small branches for all of the dimensions that appear on one side of any
653
+ edge in `input_edges` or `output_edges`.
654
+ """
655
+
656
+ @property
657
+ def has_followup_queries(self) -> bool:
658
+ """Whether we will need to perform follow-up queries with these
659
+ dimensions.
660
+ """
661
+ return bool(self.tasks or self.dataset_types or self.record_elements)
662
+
663
+ @staticmethod
664
+ def populate_record_elements(
665
+ all_dimensions: DimensionGroup, branches: dict[DimensionGroup, _DimensionGroupBranch]
666
+ ) -> None:
667
+ """Ensure we have branches for all dimension elements we'll need to
668
+ fetch dimension records for.
589
669
 
590
670
  Parameters
591
671
  ----------
592
- builder : `AllDimensionsQuantumGraphBuilder`
593
- Builder object this helper is associated with.
594
- subgraph : `pipeline_graph.PipelineGraph`
595
- Subset of the pipeline being processed.
672
+ all_dimensions : `~lsst.daf.butler.DimensionGroup`
673
+ All dimensions that appear in the quantum graph.
674
+ branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
675
+ `_DimensionGroupBranch` ]
676
+ Flat mapping of all branches to update in-place. New branches may
677
+ be added and existing branches may have their `record_elements`
678
+ attributes updated.
679
+ """
680
+ for element_name in all_dimensions.elements:
681
+ element = all_dimensions.universe[element_name]
682
+ if element.minimal_group in branches:
683
+ branches[element.minimal_group].record_elements.append(element_name)
684
+ else:
685
+ branches[element.minimal_group] = _DimensionGroupBranch(record_elements=[element_name])
686
+
687
+ @staticmethod
688
+ def populate_edges(
689
+ pipeline_graph: PipelineGraph, branches: dict[DimensionGroup, _DimensionGroupBranch]
690
+ ) -> None:
691
+ """Ensure we have branches for all edges in the graph.
692
+
693
+ Parameters
694
+ ----------
695
+ pipeline_graph : `~..pipeline_graph.PipelineGraph`
696
+ Graph of tasks and dataset types.
697
+ branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
698
+ `_DimensionGroupBranch` ]
699
+ Flat mapping of all branches to update in-place. New branches may
700
+ be added and existing branches may have their `input_edges`,
701
+ `output_edges`, and `twigs` attributes updated.
702
+ """
703
+
704
+ def update_edge_branch(
705
+ task_node: TaskNode, dataset_type_node: DatasetTypeNode
706
+ ) -> _DimensionGroupBranch:
707
+ union_dimensions = task_node.dimensions.union(dataset_type_node.dimensions)
708
+ if (branch := branches.get(union_dimensions)) is None:
709
+ branch = _DimensionGroupBranch()
710
+ branches[union_dimensions] = branch
711
+ branch.twigs[dataset_type_node.dimensions].parent_edge_dataset_types.add(dataset_type_node.name)
712
+ branch.twigs[task_node.dimensions].parent_edge_tasks.add(task_node.label)
713
+ return branch
714
+
715
+ for task_node in pipeline_graph.tasks.values():
716
+ for dataset_type_node in pipeline_graph.inputs_of(task_node.label).values():
717
+ assert dataset_type_node is not None, "Pipeline graph is resolved."
718
+ if dataset_type_node.is_prerequisite:
719
+ continue
720
+ branch = update_edge_branch(task_node, dataset_type_node)
721
+ branch.input_edges.append((dataset_type_node.name, task_node.label))
722
+ for dataset_type_node in pipeline_graph.outputs_of(task_node.label).values():
723
+ assert dataset_type_node is not None, "Pipeline graph is resolved."
724
+ branch = update_edge_branch(task_node, dataset_type_node)
725
+ branch.output_edges.append((task_node.label, dataset_type_node.name))
726
+
727
+ @staticmethod
728
+ def find_next_uncontained_dimensions(
729
+ parent_dimensions: DimensionGroup | None, candidates: Iterable[DimensionGroup]
730
+ ) -> list[DimensionGroup]:
731
+ """Find dimension groups that are not a subset of any other dimension
732
+ groups in a set.
733
+
734
+ Parameters
735
+ ----------
736
+ parent_dimensions : `~lsst.daf.butler.DimensionGroup` or `None`
737
+ If not `None`, first filter out any candidates that are not strict
738
+ subsets of these dimensions.
739
+ candidates : `~collections.abc.Iterable` [\
740
+ `~lsst.daf.butler.DimensionGroup` ]
741
+ Iterable of dimension groups to consider.
596
742
 
597
743
  Returns
598
744
  -------
599
- context : `AbstractContextManager` [ `_AllDimensionsQuery` ]
600
- An instance of this class, inside a context manager that manages
601
- the lifetime of its temporary database table.
745
+ uncontained : `list` [ `~lsst.daf.butler.DimensionGroup` ]
746
+ Dimension groups that are not contained by any other dimension
747
+ group in the set of filtered candidates.
602
748
  """
603
- result = cls(subgraph)
604
- builder.log.debug("Analyzing subgraph dimensions and overall-inputs.")
605
- result.grouped_by_dimensions = result.subgraph.group_by_dimensions()
606
- (
607
- result.empty_dimensions_tasks,
608
- result.empty_dimensions_dataset_types,
609
- ) = result.grouped_by_dimensions.pop(builder.universe.empty)
610
- result.overall_inputs = {
611
- name: node # type: ignore
612
- for name, node in result.subgraph.iter_overall_inputs()
613
- if not node.is_prerequisite # type: ignore
614
- }
615
- dimension_names: set[str] = set()
616
- for dimensions_for_group in result.grouped_by_dimensions.keys():
617
- dimension_names.update(dimensions_for_group.names)
618
- dimensions = builder.universe.conform(dimension_names)
619
- datasets: set[str] = set()
620
- builder.log.debug("Building query for data IDs.")
621
- if builder.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
622
- builder.log.debug("Constraining graph query using all datasets not marked as deferred.")
623
- datasets = {
624
- name
625
- for name, dataset_type_node in result.overall_inputs.items()
626
- if (
627
- dataset_type_node.is_initial_query_constraint
628
- and name not in result.empty_dimensions_dataset_types
629
- )
630
- }
631
- elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
632
- builder.log.debug("Not using dataset existence to constrain query.")
633
- elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
634
- constraint = set(builder.dataset_query_constraint)
635
- inputs = result.overall_inputs - result.empty_dimensions_dataset_types.keys()
636
- if remainder := constraint.difference(inputs):
637
- builder.log.debug(
638
- "Ignoring dataset types %s in dataset query constraint that are not inputs to this "
639
- "subgraph, on the assumption that they are relevant for a different subraph.",
640
- remainder,
641
- )
642
- constraint.intersection_update(inputs)
643
- builder.log.debug(f"Constraining graph query using {constraint}")
644
- datasets = constraint
749
+ if parent_dimensions is None:
750
+ refined_candidates = candidates
645
751
  else:
646
- raise QuantumGraphBuilderError(
647
- f"Unable to handle type {builder.dataset_query_constraint} given as datasetQueryConstraint."
648
- )
649
- with builder.butler.query() as query:
650
- result.query_cmd.append("with butler.query() as query:")
651
- result.query_cmd.append(f" query = query.join_dimensions({list(dimensions.names)})")
652
- query = query.join_dimensions(dimensions)
653
- if datasets:
654
- result.query_cmd.append(f" collections = {list(builder.input_collections)}")
655
- for dataset_type_name in datasets:
656
- result.query_cmd.append(
657
- f" query = query.join_dataset_search({dataset_type_name!r}, collections)"
658
- )
659
- query = query.join_dataset_search(dataset_type_name, builder.input_collections)
660
- result.query_cmd.append(
661
- f" query = query.where({dict(result.subgraph.data_id.mapping)}, "
662
- f"{builder.where!r}, bind={builder.bind!r})"
663
- )
664
- query = query.where(result.subgraph.data_id, builder.where, bind=builder.bind)
665
- builder.log.verbose(result.format_query_cmd("Querying for data IDs via:"))
666
- # Allow duplicates from common skypix overlaps to make some queries
667
- # run faster.
668
- query._allow_duplicate_overlaps = True
669
- result.butler_query = query.materialize()
670
- yield result
752
+ refined_candidates = [dimensions for dimensions in candidates if dimensions < parent_dimensions]
753
+ return [
754
+ dimensions
755
+ for dimensions in refined_candidates
756
+ if not any(dimensions < other for other in refined_candidates)
757
+ ]
758
+
759
+ @classmethod
760
+ def populate_branches(
761
+ cls,
762
+ parent_dimensions: DimensionGroup | None,
763
+ branches: dict[DimensionGroup, _DimensionGroupBranch],
764
+ ) -> dict[DimensionGroup, _DimensionGroupBranch]:
765
+ """Transform a flat mapping of dimension group branches into a tree.
766
+
767
+ Parameters
768
+ ----------
769
+ parent_dimensions : `~lsst.daf.butler.DimensionGroup` or `None`
770
+ If not `None`, ignore any candidates in `branches` that are not
771
+ strict subsets of these dimensions.
772
+ branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
773
+ `_DimensionGroupBranch` ]
774
+ Flat mapping of all branches to update in-place, by populating
775
+ the `branches` attributes to form a tree and removing entries that
776
+ have been put into the tree.
777
+
778
+ Returns
779
+ -------
780
+ uncontained_branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
781
+ `_DimensionGroupBranch` ]
782
+ Branches whose dimensions were not subsets of any others in the
783
+ mapping except those that were supersets of ``parent_dimensions``.
784
+ """
785
+ result: dict[DimensionGroup, _DimensionGroupBranch] = {}
786
+ for parent_branch_dimensions in cls.find_next_uncontained_dimensions(
787
+ parent_dimensions, branches.keys()
788
+ ):
789
+ parent_branch = branches.pop(parent_branch_dimensions)
790
+ result[parent_branch_dimensions] = parent_branch
791
+ for child_branch_dimensions, child_branch in cls.populate_branches(
792
+ parent_branch_dimensions, branches
793
+ ).items():
794
+ parent_branch.branches[child_branch_dimensions] = child_branch
795
+ return result
671
796
 
672
- def format_query_cmd(self, *header: str) -> str:
673
- """Format the butler query call used as a multi-line string.
797
+ def project_data_ids(self, log: LsstLogAdapter, log_indent: str = " ") -> None:
798
+ """Populate the data ID sets of child branches from the data IDs in
799
+ this branch, recursively.
674
800
 
675
801
  Parameters
676
802
  ----------
677
- *header : `str`
678
- Initial lines the of the returned string, not including newlines.
803
+ log : `lsst.utils.logging.LsstLogAdapter`
804
+ Logger to use for status reporting.
805
+ log_indent : `str`, optional
806
+ Indentation to prefix the log message. This is used when recursing
807
+ to make the branch structure clear.
679
808
  """
680
- lines = list(header)
681
- lines.extend(self.query_cmd)
682
- return "\n".join(lines)
809
+ for data_id in self.data_ids:
810
+ for branch_dimensions, branch in self.branches.items():
811
+ branch.data_ids.add(data_id.subset(branch_dimensions))
812
+ for branch_dimensions, branch in self.branches.items():
813
+ log.debug("%sProjecting query data IDs to %s.", log_indent, branch_dimensions)
814
+ branch.project_data_ids(log, log_indent + " ")
815
+
816
+ def update_skeleton(
817
+ self, skeleton: QuantumGraphSkeleton, log: LsstLogAdapter, log_indent: str = " "
818
+ ) -> None:
819
+ """Process the data ID sets of this branch and its children recursively
820
+ to add nodes and edges to the under-construction quantum graph.
821
+
822
+ Parameters
823
+ ----------
824
+ skeleton : `QuantumGraphSkeleton`
825
+ Under-construction quantum graph to modify in place.
826
+ log : `lsst.utils.logging.LsstLogAdapter`
827
+ Logger to use for status reporting.
828
+ log_indent : `str`, optional
829
+ Indentation to prefix the log message. This is used when recursing
830
+ to make the branch structure clear.
831
+ """
832
+ for branch_dimensions, branch in self.branches.items():
833
+ log.verbose(
834
+ "%sAdding nodes and edges for %s %s data ID(s).",
835
+ log_indent,
836
+ len(branch.data_ids),
837
+ branch_dimensions,
838
+ )
839
+ branch.update_skeleton(skeleton, log, log_indent + " ")
840
+ for data_id in self.data_ids:
841
+ for task_label in self.tasks:
842
+ skeleton.add_quantum_node(task_label, data_id)
843
+ for dataset_type_name in self.dataset_types:
844
+ skeleton.add_dataset_node(dataset_type_name, data_id)
845
+ quantum_keys: dict[str, QuantumKey] = {}
846
+ dataset_keys: dict[str, DatasetKey] = {}
847
+ for twig_dimensions, twig in self.twigs.items():
848
+ twig_data_id = data_id.subset(twig_dimensions)
849
+ for task_label in twig.parent_edge_tasks:
850
+ quantum_keys[task_label] = QuantumKey(task_label, twig_data_id.required_values)
851
+ for dataset_type_name in twig.parent_edge_dataset_types:
852
+ dataset_keys[dataset_type_name] = DatasetKey(
853
+ dataset_type_name, twig_data_id.required_values
854
+ )
855
+ for dataset_type_name, task_label in self.input_edges:
856
+ skeleton.add_input_edge(quantum_keys[task_label], dataset_keys[dataset_type_name])
857
+ for task_label, dataset_type_name in self.output_edges:
858
+ skeleton.add_output_edge(quantum_keys[task_label], dataset_keys[dataset_type_name])
859
+ if not self.has_followup_queries:
860
+ # Delete data IDs we don't need anymore to save memory.
861
+ del self.data_ids
862
+
863
+
864
+ @dataclasses.dataclass(eq=False, repr=False)
865
+ class _DimensionGroupTree:
866
+ """A tree of dimension groups in which branches are subsets of their
867
+ parents.
683
868
 
684
- def log_failure(self, log: LsstLogAdapter) -> None:
685
- """Emit an ERROR-level log message that attempts to explain
686
- why the initial data ID query returned no rows.
869
+ This class holds all of the per-subgraph state for this QG builder
870
+ subclass.
871
+
872
+ Notes
873
+ -----
874
+ The full set of dimensions referenced by any task or dataset type (except
875
+ prerequisite inputs) forms the conceptual "trunk" of this tree. Each
876
+ branch has a subset of the dimensions of its parent branch, and each set
877
+ of dimensions appears exactly once in a tree (so there is some flexibility
878
+ in where certain dimension subsets may appear; right now this is resolved
879
+ somewhat arbitrarily).
880
+ We do not add branches for every possible dimension subset; a branch is
881
+ created for a `~lsst.daf.butler.DimensionGroup` if:
882
+
883
+ - if there is a task whose quanta have those dimensions;
884
+ - if there is a non-prerequisite dataset type with those dimensions;
885
+ - if there is an edge for which the union of the task and dataset type
886
+ dimensions are those dimensions;
887
+ - if there is a dimension element in any task or non-prerequisite dataset
888
+ type dimensions whose `~lsst.daf.butler.DimensionElement.minimal_group`
889
+ is those dimensions.
890
+
891
+ We process the initial data query by recursing through this tree structure
892
+ to populate a data ID set for each branch
893
+ (`_DimensionGroupBranch.project_data_ids`), and then process those sets
894
+ recursively (`_DimensionGroupBranch.update_skeleton`). This can be far
895
+ faster than the non-recursive processing the QG builder used to use because
896
+ the set of data IDs is smaller (sometimes dramatically smaller) as we move
897
+ to smaller sets of dimensions.
898
+
899
+ In addition to their child branches, a branch that is used to define graph
900
+ edges also has "twigs", which are a flatter set of dimension subsets for
901
+ each of the tasks and dataset types that appear in that branch's edges.
902
+ The same twig dimensions can appear in multiple branches, and twig
903
+ dimensions can be the same as their parent branch's (but not a superset).
904
+ """
905
+
906
+ subgraph: PipelineGraph
907
+ """Graph of this subset of the pipeline."""
908
+
909
+ all_dimensions: DimensionGroup = dataclasses.field(init=False)
910
+ """The union of all dimensions that appear in any task or
911
+ (non-prerequisite) dataset type in this subgraph.
912
+ """
913
+
914
+ empty_dimensions_branch: _DimensionGroupBranch = dataclasses.field(init=False)
915
+ """The tasks and dataset types of this subset of this pipeline that have
916
+ empty dimensions.
917
+
918
+ Prerequisite dataset types are not included.
919
+ """
920
+
921
+ trunk_branches: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(init=False)
922
+ """The top-level branches in the tree of dimension groups.
923
+ """
924
+
925
+ branches_by_dimensions: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(init=False)
926
+ """The tasks and dataset types of this subset of the pipeline, grouped
927
+ by their dimensions.
928
+
929
+ The tasks and dataset types with empty dimensions are not included; they're
930
+ in `empty_dimensions_branch` since they are usually used differently.
931
+ Prerequisite dataset types are also not included.
932
+
933
+ This is a flatter view of the objects in `trunk_branches`.
934
+ """
935
+
936
+ overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(init=False)
937
+ """Pipeline graph nodes for all non-prerequisite, non-init overall-input
938
+ dataset types for this subset of the pipeline.
939
+ """
940
+
941
+ def __post_init__(self) -> None:
942
+ universe = self.subgraph.universe
943
+ assert universe is not None, "Pipeline graph is resolved."
944
+ self.branches_by_dimensions = {
945
+ dimensions: _DimensionGroupBranch(tasks, dataset_types)
946
+ for dimensions, (tasks, dataset_types) in self.subgraph.group_by_dimensions().items()
947
+ }
948
+ self.all_dimensions = _union_dimensions(self.branches_by_dimensions.keys(), universe)
949
+ _DimensionGroupBranch.populate_record_elements(self.all_dimensions, self.branches_by_dimensions)
950
+ _DimensionGroupBranch.populate_edges(self.subgraph, self.branches_by_dimensions)
951
+ self.trunk_branches = _DimensionGroupBranch.populate_branches(
952
+ None, self.branches_by_dimensions.copy()
953
+ )
954
+ self.empty_dimensions_branch = self.branches_by_dimensions.pop(
955
+ universe.empty, _DimensionGroupBranch()
956
+ )
957
+ self.overall_inputs = {
958
+ name: node # type: ignore
959
+ for name, node in self.subgraph.iter_overall_inputs()
960
+ if not node.is_prerequisite # type: ignore
961
+ }
962
+
963
+ def project_data_ids(self, log: LsstLogAdapter) -> None:
964
+ """Recursively populate the data ID sets of the dimension group tree
965
+ from the data ID sets of the trunk branches.
687
966
 
688
967
  Parameters
689
968
  ----------
690
- log : `logging.Logger`
691
- The logger to use to emit log messages.
969
+ log : `lsst.utils.logging.LsstLogAdapter`
970
+ Logger to use for status reporting.
692
971
  """
693
- # A single multiline log plays better with log aggregators like Loki.
694
- header = ["Initial data ID query returned no rows, so QuantumGraph will be empty."]
695
- try:
696
- header.extend(self.butler_query.explain_no_results())
697
- header.append("To reproduce this query for debugging purposes, run:")
698
- finally:
699
- # If an exception was raised, write a partial.
700
- log.error(self.format_query_cmd(*header))
972
+ for branch_dimensions, branch in self.trunk_branches.items():
973
+ log.debug("Projecting query data IDs to %s.", branch_dimensions)
974
+ branch.project_data_ids(log)
701
975
 
702
976
 
703
977
  class DimensionRecordAttacher:
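
`_DimensionGroupBranch.populate_branches` turns the flat mapping of dimension groups into a containment tree by repeatedly peeling off the groups that are not strict subsets of any remaining candidate (`find_next_uncontained_dimensions`) and recursing into each of them. A compact sketch of the same recursion, with `frozenset`s of dimension names standing in for `DimensionGroup` (illustrative only):

```python
# Illustrative sketch only: frozensets stand in for DimensionGroup, and "<"
# is the strict-subset test used to decide containment.
def find_uncontained(parent, candidates):
    if parent is not None:
        candidates = [c for c in candidates if c < parent]
    return [c for c in candidates if not any(c < other for other in candidates)]


def build_tree(parent, flat):
    tree = {}
    for dims in find_uncontained(parent, list(flat)):
        flat.pop(dims)
        tree[dims] = build_tree(dims, flat)  # children are strict subsets
    return tree


flat = dict.fromkeys(
    [
        frozenset({"visit", "detector"}),
        frozenset({"visit"}),
        frozenset({"detector"}),
        frozenset({"patch"}),
    ]
)
tree = build_tree(None, flat)

assert set(tree) == {frozenset({"visit", "detector"}), frozenset({"patch"})}
assert set(tree[frozenset({"visit", "detector"})]) == {frozenset({"visit"}), frozenset({"detector"})}
```
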
@@ -829,3 +1103,10 @@ class DataIdExpansionLeftovers:
829
1103
  missing_record_data_ids: defaultdict[str, set[tuple[DataIdValue, ...]]] = dataclasses.field(
830
1104
  default_factory=lambda: defaultdict(set)
831
1105
  )
1106
+
1107
+
1108
+ def _union_dimensions(groups: Iterable[DimensionGroup], universe: DimensionUniverse) -> DimensionGroup:
1109
+ dimension_names: set[str] = set()
1110
+ for dimensions_for_group in groups:
1111
+ dimension_names.update(dimensions_for_group.names)
1112
+ return universe.conform(dimension_names)
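
The new module-level `_union_dimensions` helper simply unions the dimension names of every group and lets the universe conform the result into a single `DimensionGroup`. The same idea with plain string sets (a sketch; `union_dimensions` below is a stand-in, with `frozenset` construction replacing `DimensionUniverse.conform`):

```python
# Illustrative sketch only: string sets stand in for DimensionGroup.names and
# frozenset construction stands in for universe.conform().
from collections.abc import Iterable


def union_dimensions(groups: Iterable[set[str]]) -> frozenset[str]:
    names: set[str] = set()
    for group in groups:
        names.update(group)
    return frozenset(names)


branches = [{"visit", "detector"}, {"tract", "patch"}, {"visit"}]
assert union_dimensions(branches) == {"visit", "detector", "tract", "patch"}
```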