lsst-pipe-base 29.2025.2600__py3-none-any.whl → 29.2025.2800__py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
@@ -34,8 +34,9 @@ from __future__ import annotations
 __all__ = ("AllDimensionsQuantumGraphBuilder", "DatasetQueryConstraintVariant")

 import dataclasses
+import itertools
 from collections import defaultdict
-from collections.abc import Iterable, Mapping
+from collections.abc import Callable, Iterable, Mapping
 from typing import TYPE_CHECKING, Any, final

 import astropy.table
@@ -44,10 +45,13 @@ from lsst.daf.butler import (
     Butler,
     DataCoordinate,
     DimensionDataAttacher,
+    DimensionElement,
     DimensionGroup,
     DimensionRecordSet,
     MissingDatasetTypeError,
+    SkyPixDimension,
 )
+from lsst.sphgeom import RangeSet
 from lsst.utils.logging import LsstLogAdapter, PeriodicLogger
 from lsst.utils.timer import timeMethod

@@ -132,14 +136,17 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
         # be the same as or a dimension-subset of another. This is an
         # optimization opportunity we're not currently taking advantage of.
         tree = _DimensionGroupTree(subgraph)
+        tree.build(self.dataset_query_constraint, self.data_id_tables, log=self.log)
+        tree.pprint(printer=self.log.debug)
         self._query_for_data_ids(tree)
+        dimension_records = self._fetch_most_dimension_records(tree)
+        tree.generate_data_ids(self.log)
         skeleton = self._make_subgraph_skeleton(tree)
         if not skeleton.has_any_quanta:
             # QG is going to be empty; exit early not just for efficiency, but
             # also so downstream code doesn't have to guard against this case.
             return skeleton
         self._find_followup_datasets(tree, skeleton)
-        dimension_records = self._fetch_most_dimension_records(tree)
         self._attach_dimension_records(skeleton, dimension_records)
         return skeleton

@@ -153,42 +160,14 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
             Tree with dimension group branches that holds subgraph-specific
             state for this builder, to be modified in place.
         """
-        self.log.debug("Analyzing subgraph dimensions and overall-inputs.")
-        constraint_datasets: set[str] = set()
-        self.log.debug("Building query for data IDs.")
-        if self.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
-            self.log.debug("Constraining graph query using all datasets not marked as deferred.")
-            constraint_datasets = {
-                name
-                for name, dataset_type_node in tree.overall_inputs.items()
-                if (dataset_type_node.is_initial_query_constraint and dataset_type_node.dimensions)
-            }
-        elif self.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
-            self.log.debug("Not using dataset existence to constrain query.")
-        elif self.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
-            constraint = set(self.dataset_query_constraint)
-            inputs = tree.overall_inputs - tree.empty_dimensions_branch.dataset_types.keys()
-            if remainder := constraint.difference(inputs):
-                self.log.debug(
-                    "Ignoring dataset types %s in dataset query constraint that are not inputs to this "
-                    "subgraph, on the assumption that they are relevant for a different subgraph.",
-                    remainder,
-                )
-                constraint.intersection_update(inputs)
-            self.log.debug(f"Constraining graph query using {constraint}")
-            constraint_datasets = constraint
-        else:
-            raise QuantumGraphBuilderError(
-                f"Unable to handle type {self.dataset_query_constraint} given as datasetQueryConstraint."
-            )
         query_cmd: list[str] = []
         with self.butler.query() as query:
             query_cmd.append("with butler.query() as query:")
-            query_cmd.append(f" query = query.join_dimensions({list(tree.all_dimensions.names)})")
-            query = query.join_dimensions(tree.all_dimensions)
-            if constraint_datasets:
+            query_cmd.append(f" query = query.join_dimensions({list(tree.queryable_dimensions.names)})")
+            query = query.join_dimensions(tree.queryable_dimensions)
+            if tree.dataset_constraint:
                 query_cmd.append(f" collections = {list(self.input_collections)}")
-                for dataset_type_name in constraint_datasets:
+                for dataset_type_name in tree.dataset_constraint:
                     query_cmd.append(f" query = query.join_dataset_search({dataset_type_name!r}, collections)")
                     try:
                         query = query.join_dataset_search(dataset_type_name, self.input_collections)
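Note: the query_cmd list assembled alongside the real butler query is a replay log; each call against the query is mirrored as a line of Python source so the whole query can be reported if something fails. The pattern, reduced to a sketch with a toy class (not the butler API):

class Query:
    """Toy stand-in that records a mirror of every call made against it."""

    def __init__(self) -> None:
        self.cmd_log = ["with butler.query() as query:"]

    def join_dimensions(self, dims: list[str]) -> "Query":
        # Record the call as source text, then perform it (here, a no-op).
        self.cmd_log.append(f"    query = query.join_dimensions({dims})")
        return self

query = Query().join_dimensions(["visit", "detector"])
print("\n".join(query.cmd_log))  # a replayable description of the query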
@@ -221,7 +200,7 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
             # quickly as possible in case that holds a connection/cursor open.
             n_rows = 0
             progress_logger: PeriodicLogger | None = None
-            for common_data_id in query.data_ids(tree.all_dimensions):
+            for common_data_id in query.data_ids(tree.queryable_dimensions):
                 if progress_logger is None:
                     # There can be a long wait between submitting the query and
                     # returning the first row, so we want to make sure we log
@@ -230,7 +209,7 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
                     # first log is seen.
                     self.log.info("Iterating over data ID query results.")
                     progress_logger = PeriodicLogger(self.log)
-                for branch_dimensions, branch in tree.trunk_branches.items():
+                for branch_dimensions, branch in tree.queryable_branches.items():
                     data_id = common_data_id.subset(branch_dimensions)
                     branch.data_ids.add(data_id)
                 n_rows += 1
@@ -272,13 +251,20 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
             Preliminary quantum graph.
         """
         skeleton = QuantumGraphSkeleton(tree.subgraph.tasks)
-        for branch_dimensions, branch in tree.trunk_branches.items():
+        for branch_dimensions, branch in tree.branches_by_dimensions.items():
+            self.log.verbose(
+                "Adding nodes for %s %s data ID(s).",
+                len(branch.data_ids),
+                branch_dimensions,
+            )
+            branch.update_skeleton_nodes(skeleton)
+        for branch_dimensions, branch in tree.branches_by_dimensions.items():
             self.log.verbose(
-                "Adding nodes and edges for %s %s data ID(s).",
+                "Adding edges for %s %s data ID(s).",
                 len(branch.data_ids),
                 branch_dimensions,
             )
-            branch.update_skeleton(skeleton, self.log)
+            branch.update_skeleton_edges(skeleton)
         n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in tree.subgraph.tasks)
         self.log.info(
             "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges.",
@@ -302,16 +288,18 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
             In-progress quantum graph to modify in place.
         """
         dataset_key: DatasetKey | PrerequisiteDatasetKey
-        for dataset_type_name in tree.empty_dimensions_branch.dataset_types.keys():
-            dataset_key = DatasetKey(dataset_type_name, self.empty_data_id.required_values)
-            if ref := self.empty_dimensions_datasets.inputs.get(dataset_key):
-                skeleton.set_dataset_ref(ref, dataset_key)
-            if ref := self.empty_dimensions_datasets.outputs_for_skip.get(dataset_key):
-                skeleton.set_output_for_skip(ref)
-            if ref := self.empty_dimensions_datasets.outputs_in_the_way.get(dataset_key):
-                skeleton.set_output_in_the_way(ref)
         for dimensions, branch in tree.branches_by_dimensions.items():
-            if not branch.has_followup_queries:
+            if not dimensions:
+                for dataset_type_name in branch.dataset_types.keys():
+                    dataset_key = DatasetKey(dataset_type_name, self.empty_data_id.required_values)
+                    if ref := self.empty_dimensions_datasets.inputs.get(dataset_key):
+                        skeleton.set_dataset_ref(ref, dataset_key)
+                    if ref := self.empty_dimensions_datasets.outputs_for_skip.get(dataset_key):
+                        skeleton.set_output_for_skip(ref)
+                    if ref := self.empty_dimensions_datasets.outputs_in_the_way.get(dataset_key):
+                        skeleton.set_output_in_the_way(ref)
+                continue
+            if not branch.dataset_types and not branch.tasks:
                 continue
             if not branch.data_ids:
                 continue
@@ -320,7 +308,7 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
             with self.butler.query() as butler_query:
                 butler_query = butler_query.join_data_coordinates(branch.data_ids)
                 for dataset_type_node in branch.dataset_types.values():
-                    if dataset_type_node.name in tree.overall_inputs:
+                    if tree.subgraph.producer_of(dataset_type_node.name) is None:
                         # Dataset type is an overall input; we always need to
                         # try to find these.
                         count = 0
@@ -457,9 +445,8 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
                             finder.dataset_type_node.name,
                             task_node.label,
                         )
-            if not branch.record_elements:
-                # Delete data ID sets we don't need anymore.
-                del branch.data_ids
+            # Delete data ID sets we don't need anymore to save memory.
+            del branch.data_ids

     @timeMethod
     def _fetch_most_dimension_records(self, tree: _DimensionGroupTree) -> list[DimensionRecordSet]:
@@ -468,8 +455,9 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):

         Parameters
         ----------
-        query : `_AllDimensionsQuery`
-            Object representing the materialized sub-pipeline data ID query.
+        tree : `_DimensionGroupTree`
+            Tree with dimension group branches that holds subgraph-specific
+            state for this builder.

         Returns
         -------
@@ -485,18 +473,15 @@ class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
         self.log.verbose("Performing follow-up queries for dimension records.")
         result: list[DimensionRecordSet] = []
         for branch in tree.branches_by_dimensions.values():
-            if not branch.record_elements:
+            if not branch.dimension_records:
                 continue
             if not branch.data_ids:
                 continue
             with self.butler.query() as butler_query:
                 butler_query = butler_query.join_data_coordinates(branch.data_ids)
-                for element in branch.record_elements:
-                    result.append(
-                        DimensionRecordSet(
-                            element, butler_query.dimension_records(element), universe=self.universe
-                        )
-                    )
+                for record_set in branch.dimension_records:
+                    record_set.update(butler_query.dimension_records(record_set.element.name))
+                    result.append(record_set)
         return result

     @timeMethod
@@ -575,10 +560,8 @@ class _DimensionGroupBranch:
     dataset type name.
     """

-    record_elements: list[str] = dataclasses.field(default_factory=list)
-    """The names of dimension elements whose records should be looked up via
-    these dimensions.
-    """
+    dimension_records: list[DimensionRecordSet] = dataclasses.field(default_factory=list)
+    """Sets of dimension records looked up with these dimensions."""

     data_ids: set[DataCoordinate] = dataclasses.field(default_factory=set)
     """All data IDs with these dimensions seen in the QuantumGraph."""
@@ -599,7 +582,8 @@ class _DimensionGroupBranch:

     branches: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(default_factory=dict)
     """Child branches whose dimensions are strict subsets of this branch's
-    dimensions.
+    dimensions, populated by projecting this branch's set of data IDs (i.e.
+    remove a dimension, then deduplicate).
     """

     twigs: defaultdict[DimensionGroup, _DimensionGroupTwig] = dataclasses.field(
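The projection named in the branches docstring is just subsetting each data ID to the child branch's dimensions and letting a set collapse the duplicates. A standalone sketch with plain tuples standing in for DataCoordinate (hypothetical helper, not the package API):

def project(data_ids: set[tuple], parent_dims: tuple[str, ...], child_dims: tuple[str, ...]) -> set[tuple]:
    # Positions of the child's dimensions within each parent tuple.
    positions = [parent_dims.index(d) for d in child_dims]
    # Building a set deduplicates, so child sets are often far smaller.
    return {tuple(data_id[p] for p in positions) for data_id in data_ids}

parent_ids = {(1, 10), (1, 11), (2, 10)}  # (visit, detector) pairs
assert project(parent_ids, ("visit", "detector"), ("visit",)) == {(1,), (2,)}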
@@ -609,146 +593,16 @@ class _DimensionGroupBranch:
     edge in `input_edges` or `output_edges`.
     """

-    @property
-    def has_followup_queries(self) -> bool:
-        """Whether we will need to perform follow-up queries with these
-        dimensions.
-        """
-        return bool(self.tasks or self.dataset_types or self.record_elements)
-
-    @staticmethod
-    def populate_record_elements(
-        all_dimensions: DimensionGroup, branches: dict[DimensionGroup, _DimensionGroupBranch]
-    ) -> None:
-        """Ensure we have branches for all dimension elements we'll need to
-        fetch dimension records for.
-
-        Parameters
-        ----------
-        all_dimensions : `~lsst.daf.butler.DimensionGroup`
-            All dimensions that appear in the quantum graph.
-        branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
-                `_DimensionGroupBranch` ]
-            Flat mapping of all branches to update in-place. New branches may
-            be added and existing branches may have their `record_elements`
-            attributes updated.
-        """
-        for element_name in all_dimensions.elements:
-            element = all_dimensions.universe[element_name]
-            if element.minimal_group in branches:
-                branches[element.minimal_group].record_elements.append(element_name)
-            else:
-                branches[element.minimal_group] = _DimensionGroupBranch(record_elements=[element_name])
-
-    @staticmethod
-    def populate_edges(
-        pipeline_graph: PipelineGraph, branches: dict[DimensionGroup, _DimensionGroupBranch]
+    def pprint(
+        self,
+        dimensions: DimensionGroup,
+        indent: str = " ",
+        suffix: str = "",
+        printer: Callable[[str], None] = print,
     ) -> None:
-        """Ensure we have branches for all edges in the graph.
-
-        Parameters
-        ----------
-        pipeline_graph : `~..pipeline_graph.PipelineGraph`
-            Graph of tasks and dataset types.
-        branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
-                `_DimensionGroupBranch` ]
-            Flat mapping of all branches to update in-place. New branches may
-            be added and existing branches may have their `input_edges`,
-            `output_edges`, and `twigs` attributes updated.
-        """
-
-        def update_edge_branch(
-            task_node: TaskNode, dataset_type_node: DatasetTypeNode
-        ) -> _DimensionGroupBranch:
-            union_dimensions = task_node.dimensions.union(dataset_type_node.dimensions)
-            if (branch := branches.get(union_dimensions)) is None:
-                branch = _DimensionGroupBranch()
-                branches[union_dimensions] = branch
-            branch.twigs[dataset_type_node.dimensions].parent_edge_dataset_types.add(dataset_type_node.name)
-            branch.twigs[task_node.dimensions].parent_edge_tasks.add(task_node.label)
-            return branch
-
-        for task_node in pipeline_graph.tasks.values():
-            for dataset_type_node in pipeline_graph.inputs_of(task_node.label).values():
-                assert dataset_type_node is not None, "Pipeline graph is resolved."
-                if dataset_type_node.is_prerequisite:
-                    continue
-                branch = update_edge_branch(task_node, dataset_type_node)
-                branch.input_edges.append((dataset_type_node.name, task_node.label))
-            for dataset_type_node in pipeline_graph.outputs_of(task_node.label).values():
-                assert dataset_type_node is not None, "Pipeline graph is resolved."
-                branch = update_edge_branch(task_node, dataset_type_node)
-                branch.output_edges.append((task_node.label, dataset_type_node.name))
-
-    @staticmethod
-    def find_next_uncontained_dimensions(
-        parent_dimensions: DimensionGroup | None, candidates: Iterable[DimensionGroup]
-    ) -> list[DimensionGroup]:
-        """Find dimension groups that are not a subset of any other dimension
-        groups in a set.
-
-        Parameters
-        ----------
-        parent_dimensions : `~lsst.daf.butler.DimensionGroup` or `None`
-            If not `None`, first filter out any candidates that are not strict
-            subsets of these dimensions.
-        candidates : `~collections.abc.Iterable` [\
-                `~lsst.daf.butler.DimensionGroup` ]
-            Iterable of dimension groups to consider.
-
-        Returns
-        -------
-        uncontained : `list` [ `~lsst.daf.butler.DimensionGroup` ]
-            Dimension groups that are not contained by any other dimension
-            group in the set of filtered candidates.
-        """
-        if parent_dimensions is None:
-            refined_candidates = candidates
-        else:
-            refined_candidates = [dimensions for dimensions in candidates if dimensions < parent_dimensions]
-        return [
-            dimensions
-            for dimensions in refined_candidates
-            if not any(dimensions < other for other in refined_candidates)
-        ]
-
-    @classmethod
-    def populate_branches(
-        cls,
-        parent_dimensions: DimensionGroup | None,
-        branches: dict[DimensionGroup, _DimensionGroupBranch],
-    ) -> dict[DimensionGroup, _DimensionGroupBranch]:
-        """Transform a flat mapping of dimension group branches into a tree.
-
-        Parameters
-        ----------
-        parent_dimensions : `~lsst.daf.butler.DimensionGroup` or `None`
-            If not `None`, ignore any candidates in `branches` that are not
-            strict subsets of these dimensions.
-        branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
-                `_DimensionGroupBranch` ]
-            Flat mapping of all branches to update in-place, by populating
-            the `branches` attributes to form a tree and removing entries that
-            have been put into the tree.
-
-        Returns
-        -------
-        uncontained_branches : `dict` [ `~lsst.daf.butler.DimensionGroup`,\
-                `_DimensionGroupBranch` ]
-            Branches whose dimensions were not subsets of any others in the
-            mapping except those that were supersets of ``parent_dimensions``.
-        """
-        result: dict[DimensionGroup, _DimensionGroupBranch] = {}
-        for parent_branch_dimensions in cls.find_next_uncontained_dimensions(
-            parent_dimensions, branches.keys()
-        ):
-            parent_branch = branches.pop(parent_branch_dimensions)
-            result[parent_branch_dimensions] = parent_branch
-            for child_branch_dimensions, child_branch in cls.populate_branches(
-                parent_branch_dimensions, branches
-            ).items():
-                parent_branch.branches[child_branch_dimensions] = child_branch
-        return result
+        printer(f"{indent}{dimensions}{suffix}")
+        for branch_dimensions, branch in self.branches.items():
+            branch.pprint(branch_dimensions, indent + " ", printer=printer)

     def project_data_ids(self, log: LsstLogAdapter, log_indent: str = " ") -> None:
         """Populate the data ID sets of child branches from the data IDs in
@@ -766,12 +620,10 @@ class _DimensionGroupBranch:
             for branch_dimensions, branch in self.branches.items():
                 branch.data_ids.add(data_id.subset(branch_dimensions))
         for branch_dimensions, branch in self.branches.items():
-            log.debug("%sProjecting query data IDs to %s.", log_indent, branch_dimensions)
+            log.verbose("%sProjecting query data ID(s) to %s.", log_indent, branch_dimensions)
             branch.project_data_ids(log, log_indent + " ")

-    def update_skeleton(
-        self, skeleton: QuantumGraphSkeleton, log: LsstLogAdapter, log_indent: str = " "
-    ) -> None:
+    def update_skeleton_nodes(self, skeleton: QuantumGraphSkeleton) -> None:
         """Process the data ID sets of this branch and its children recursively
         to add nodes and edges to the under-construction quantum graph.

@@ -779,25 +631,23 @@ class _DimensionGroupBranch:
         ----------
         skeleton : `QuantumGraphSkeleton`
             Under-construction quantum graph to modify in place.
-        log : `lsst.logging.LsstLogAdapter`
-            Logger to use for status reporting.
-        log_indent : `str`, optional
-            Indentation to prefix the log message. This is used when recursing
-            to make the branch structure clear.
         """
-        for branch_dimensions, branch in self.branches.items():
-            log.verbose(
-                "%sAdding nodes and edges for %s %s data ID(s).",
-                log_indent,
-                len(branch.data_ids),
-                branch_dimensions,
-            )
-            branch.update_skeleton(skeleton, log, log_indent + " ")
         for data_id in self.data_ids:
             for task_label in self.tasks:
                 skeleton.add_quantum_node(task_label, data_id)
             for dataset_type_name in self.dataset_types:
                 skeleton.add_dataset_node(dataset_type_name, data_id)
+
+    def update_skeleton_edges(self, skeleton: QuantumGraphSkeleton) -> None:
+        """Process the data ID sets of this branch to add edges to the
+        under-construction quantum graph.
+
+        Parameters
+        ----------
+        skeleton : `QuantumGraphSkeleton`
+            Under-construction quantum graph to modify in place.
+        """
+        for data_id in self.data_ids:
             quantum_keys: dict[str, QuantumKey] = {}
             dataset_keys: dict[str, DatasetKey] = {}
             for twig_dimensions, twig in self.twigs.items():
@@ -812,7 +662,7 @@ class _DimensionGroupBranch:
                 skeleton.add_input_edge(quantum_keys[task_label], dataset_keys[dataset_type_name])
             for task_label, dataset_type_name in self.output_edges:
                 skeleton.add_output_edge(quantum_keys[task_label], dataset_keys[dataset_type_name])
-        if not self.has_followup_queries:
+        if not self.dataset_types and not self.tasks:
             # Delete data IDs we don't need anymore to save memory.
             del self.data_ids

@@ -842,15 +692,18 @@ class _DimensionGroupTree:
       dimensions are those dimensions;
     - if there is a dimension element in any task or non-prerequisite dataset
       type dimensions whose `~lsst.daf.butler.DimensionElement.minimal_group`
-      is those dimensions.
+      is those dimensions (allowing us to look up dimension records).
+
+    In addition, for any dimension group that has unqueryable dimensions (e.g.
+    non-common skypix dimensions, like healpix), we create a branch for the
+    subset of the group with only queryable dimensions.

     We process the initial data query by recursing through this tree structure
     to populate a data ID set for each branch
-    (`_DimensionGroupBranch.project_data_ids`), and then process those sets
-    recursively (`_DimensionGroupBranch.update_skeleton`). This can be far
-    faster than the non-recursive processing the QG builder used to use because
-    the set of data IDs is smaller (sometimes dramatically smaller) as we move
-    to smaller sets of dimensions.
+    (`_DimensionGroupBranch.project_data_ids`), and then process those sets.
+    This can be far faster than the non-recursive processing the QG builder
+    used to use because the set of data IDs is smaller (sometimes dramatically
+    smaller) as we move to smaller sets of dimensions.

     In addition to their child branches, a branch that is used to define graph
     edges also has "twigs", which are a flatter set of dimension subsets for
@@ -867,31 +720,35 @@ class _DimensionGroupTree:
     (non-prerequisite) dataset type in this subgraph.
     """

-    empty_dimensions_branch: _DimensionGroupBranch = dataclasses.field(init=False)
-    """The tasks and dataset types of this subset of this pipeline that have
-    empty dimensions.
-
-    Prerequisite dataset types are not included.
-    """
-
-    trunk_branches: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(init=False)
-    """The top-level branches in the tree of dimension groups.
+    queryable_dimensions: DimensionGroup = dataclasses.field(init=False)
+    """All dimensions except those that cannot be queried for directly via the
+    butler (e.g. skypix systems other than the common one).
     """

     branches_by_dimensions: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(init=False)
     """The tasks and dataset types of this subset of the pipeline, grouped
     by their dimensions.
+    """
+
+    dataset_constraint: set[str] = dataclasses.field(default_factory=set)
+    """The names of dataset types used as query constraints."""

-    The tasks and dataset types with empty dimensions are not included; they're
-    in `empty_dimensions_tree` since they are usually used differently.
-    Prerequisite dataset types are also not included.
+    queryable_branches: dict[DimensionGroup, _DimensionGroupBranch] = dataclasses.field(default_factory=dict)
+    """The top-level branches in the tree of dimension groups populated by the
+    butler query.

-    This is a flatter view of the objects in `trunk_branches`.
+    Data IDs in these branches are populated from the top down, with each
+    branch a projection ("remove dimension, then deduplicate") of its parent,
+    starting with the query result rows.
     """

-    overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(init=False)
-    """Pipeline graph nodes for all non-prerequisite, non-init overall-input
-    dataset types for this subset of the pipeline.
+    generators: list[DataIdGenerator] = dataclasses.field(default_factory=list)
+    """Branches for dimension groups that are populated by algorithmically
+    generating data IDs from those in one or more other branches.
+
+    These are typically variants on the theme of adding a skypix dimension to
+    another set of dimensions by identifying the sky pixels that overlap the
+    region of the original dimensions.
     """

     def __post_init__(self) -> None:
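Two ideas from these new fields can be illustrated with small sketches. First, the queryable/unqueryable split is plain set arithmetic: every skypix dimension except the common one is dropped up front (plain sets standing in for DimensionGroup, with assumed dimension names):

all_dims = {"visit", "detector", "htm7", "healpix11"}
skypix_dims = {"htm7", "healpix11"}
common_skypix = "htm7"  # assumed common skypix dimension for this sketch

# Non-common skypix dimensions cannot be queried for directly; their data
# IDs come from generators instead.
unqueryable = skypix_dims - {common_skypix}
queryable = all_dims - unqueryable
assert queryable == {"visit", "detector", "htm7"}

Second, the generators described here share one shape: walk the data IDs of a source branch, find the sky-pixel index ranges overlapping each source region, and emit the source data ID extended with each index. A minimal sketch with callables standing in for the record lookup and the lsst.sphgeom pixelization (hypothetical signatures, not the real classes defined later in this diff):

from collections.abc import Callable, Iterable

def generate_overlaps(
    source_ids: set[tuple],
    region_of: Callable[[tuple], object],
    envelope: Callable[[object], Iterable[tuple[int, int]]],
) -> set[tuple]:
    # Extend each source data ID with every overlapping pixel index.
    target_ids: set[tuple] = set()
    for data_id in source_ids:
        for begin, end in envelope(region_of(data_id)):
            for index in range(begin, end):
                target_ids.add(data_id + (index,))
    return target_ids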
@@ -902,29 +759,751 @@ class _DimensionGroupTree:
             for dimensions, (tasks, dataset_types) in self.subgraph.group_by_dimensions().items()
         }
         self.all_dimensions = DimensionGroup.union(*self.branches_by_dimensions.keys(), universe=universe)
-        _DimensionGroupBranch.populate_record_elements(self.all_dimensions, self.branches_by_dimensions)
-        _DimensionGroupBranch.populate_edges(self.subgraph, self.branches_by_dimensions)
-        self.trunk_branches = _DimensionGroupBranch.populate_branches(
-            None, self.branches_by_dimensions.copy()
-        )
-        self.empty_dimensions_branch = self.branches_by_dimensions.pop(
-            universe.empty, _DimensionGroupBranch()
-        )
-        self.overall_inputs = {
+
+    def build(
+        self,
+        requested: DatasetQueryConstraintVariant,
+        data_id_tables: Iterable[astropy.table.Table],
+        *,
+        log: LsstLogAdapter,
+    ) -> None:
+        """Organize the branches into a tree.
+
+        Parameters
+        ----------
+        requested : `DatasetQueryConstraintVariant`
+            Query constraint specified by the user.
+        data_id_tables : `~collections.abc.Iterable` [ `astropy.table.Table` ]
+            Data ID tables being joined into the query.
+        log : `lsst.log.LsstLogAdapter`
+            Logger that supports ``verbose`` output.
+        """
+        universe = self.all_dimensions.universe
+        self._make_dimension_record_branches()
+        self._make_edge_branches()
+        self._set_dataset_constraint(requested, log)
+        # Work out which dimensions we can potentially query the database for.
+        # We start out by dropping all skypix dimensions other than the common
+        # one, and then we add them back in if a constraint dataset type or
+        # data ID table provides them.
+        unqueryable_skypix = universe.conform(self.all_dimensions.skypix - {universe.commonSkyPix.name})
+        self.queryable_dimensions = self.all_dimensions.difference(unqueryable_skypix)
+        for dataset_type_name in sorted(self.dataset_constraint):
+            dataset_type_dimensions = self.subgraph.dataset_types[dataset_type_name].dimensions
+            dataset_type_skypix = dataset_type_dimensions.intersection(unqueryable_skypix)
+            if dataset_type_skypix:
+                log.info(
+                    f"Including {dataset_type_skypix} in the set of dimensions to query via "
+                    f"{dataset_type_name}. If this query fails, exclude that dataset type "
+                    "from the constraint or provide a data ID table for missing spatial joins."
+                )
+                self.queryable_dimensions = self.queryable_dimensions.union(dataset_type_dimensions)
+        for data_id_table in data_id_tables:
+            table_dimensions = universe.conform(data_id_table.colnames)
+            if table_dimensions.skypix:
+                self.queryable_dimensions = self.queryable_dimensions.union(table_dimensions)
+        # Set up the tree to generate most data IDs by querying for them from
+        # the database and then projecting to subset dimensions.
+        branches_not_in_tree = set(self.branches_by_dimensions.keys())
+        self._make_queryable_branch_tree(branches_not_in_tree)
+        # Try to find ways to generate other data IDs directly from the
+        # queryable branches.
+        self._make_queryable_overlap_branch_generators(branches_not_in_tree)
+        # As long as there are still branches that haven't been inserted into
+        # the tree, try to add them as projections of generated branches or
+        # generators on generated branches.
+        while branches_not_in_tree:
+            # Look for projections first, since those are more efficient, and
+            # some may be available after we've added some generators.
+            # We intentionally add the same branch as a projection of multiple
+            # parents since (unlike queryable dimensions) there's no guarantee
+            # that each parent branch's data IDs would project to the same set
+            # (e.g. a visit-healpix overlap may yield different healpixels than
+            # a patch-healpix overlap, even if the visits and patches overlap).
+            for target_dimensions in sorted(branches_not_in_tree):
+                for generator in self.generators:
+                    if self._maybe_insert_projection_branch(
+                        target_dimensions, generator.dimensions, generator.branch.branches
+                    ):
+                        branches_not_in_tree.discard(target_dimensions)
+            if not self._make_general_overlap_branch_generator(branches_not_in_tree):
+                break
+        # After we've exhausted overlap generation, try generation via joins
+        # of dimensions we can already query for or generate.
+        while branches_not_in_tree:
+            if not self._make_join_branch_generator(branches_not_in_tree):
+                raise QuantumGraphBuilderError(f"Could not generate data IDs for {branches_not_in_tree}.")
+
+    def _set_dataset_constraint(self, requested: DatasetQueryConstraintVariant, log: LsstLogAdapter) -> None:
+        """Set the dataset query constraint.
+
+        Parameters
+        ----------
+        requested : `DatasetQueryConstraintVariant`
+            Query constraint specified by the user.
+        log : `lsst.log.LsstLogAdapter`
+            Logger that supports ``verbose`` output.
+        """
+        overall_inputs: dict[str, DatasetTypeNode] = {
             name: node  # type: ignore
             for name, node in self.subgraph.iter_overall_inputs()
             if not node.is_prerequisite  # type: ignore
         }
+        match requested:
+            case DatasetQueryConstraintVariant.ALL:
+                self.dataset_constraint = {
+                    name
+                    for name, dataset_type_node in overall_inputs.items()
+                    if (dataset_type_node.is_initial_query_constraint and dataset_type_node.dimensions)
+                }
+            case DatasetQueryConstraintVariant.OFF:
+                pass
+            case DatasetQueryConstraintVariant.LIST:
+                self.dataset_constraint = set(requested)
+                inputs = {
+                    name for name, dataset_type_node in overall_inputs.items() if dataset_type_node.dimensions
+                }
+                if remainder := self.dataset_constraint.difference(inputs):
+                    log.verbose(
+                        "Ignoring dataset types %s in dataset query constraint that are not inputs to this "
+                        "subgraph, on the assumption that they are relevant for a different subgraph.",
+                        remainder,
+                    )
+                    self.dataset_constraint.intersection_update(inputs)
+            case _:
+                raise QuantumGraphBuilderError(
+                    f"Unable to handle type {requested} given as dataset query constraint."
+                )
+
+    def _make_dimension_record_branches(self) -> None:
+        """Ensure we have branches for all dimension elements we'll need to
+        fetch dimension records for.
+        """
+        for element_name in self.all_dimensions.elements:
+            element = self.all_dimensions.universe[element_name]
+            record_set = DimensionRecordSet(element_name, universe=self.all_dimensions.universe)
+            if element.minimal_group in self.branches_by_dimensions:
+                self.branches_by_dimensions[element.minimal_group].dimension_records.append(record_set)
+            else:
+                self.branches_by_dimensions[element.minimal_group] = _DimensionGroupBranch(
+                    dimension_records=[record_set]
+                )
+
+    def _make_edge_branches(self) -> None:
+        """Ensure we have branches for all edges in the graph."""
+
+        def update_edge_branch(
+            task_node: TaskNode, dataset_type_node: DatasetTypeNode
+        ) -> _DimensionGroupBranch:
+            union_dimensions = task_node.dimensions.union(dataset_type_node.dimensions)
+            if (branch := self.branches_by_dimensions.get(union_dimensions)) is None:
+                branch = _DimensionGroupBranch()
+                self.branches_by_dimensions[union_dimensions] = branch
+            branch.twigs[dataset_type_node.dimensions].parent_edge_dataset_types.add(dataset_type_node.name)
+            branch.twigs[task_node.dimensions].parent_edge_tasks.add(task_node.label)
+            return branch
+
+        for task_node in self.subgraph.tasks.values():
+            for dataset_type_node in self.subgraph.inputs_of(task_node.label).values():
+                assert dataset_type_node is not None, "Pipeline graph is resolved."
+                if dataset_type_node.is_prerequisite:
+                    continue
+                branch = update_edge_branch(task_node, dataset_type_node)
+                branch.input_edges.append((dataset_type_node.name, task_node.label))
+            for dataset_type_node in self.subgraph.outputs_of(task_node.label).values():
+                assert dataset_type_node is not None, "Pipeline graph is resolved."
+                branch = update_edge_branch(task_node, dataset_type_node)
+                branch.output_edges.append((task_node.label, dataset_type_node.name))
+
+    def _make_queryable_branch_tree(self, branches_not_in_tree: set[DimensionGroup]) -> None:
+        """Assemble the branches with queryable dimensions into a tree, in
+        which each branch's dimensions are a subset of its parent's.
+
+        Parameters
+        ----------
+        branches_not_in_tree : `set` [ `lsst.daf.butler.DimensionGroup` ]
+            Dimensions that have not yet been inserted into the tree. Updated
+            in place.
+        """
+        for target_dimensions in sorted(branches_not_in_tree):
+            if target_dimensions.issubset(self.queryable_dimensions):
+                if self._maybe_insert_projection_branch(
+                    target_dimensions, self.queryable_dimensions, self.queryable_branches
+                ):
+                    branches_not_in_tree.remove(target_dimensions)
+                else:
+                    raise AssertionError(
+                        "Projection-branch insertion should not fail for queryable dimensions."
+                    )
+
+    def _maybe_insert_projection_branch(
+        self,
+        target_dimensions: DimensionGroup,
+        candidate_dimensions: DimensionGroup,
+        candidate_projection_branches: dict[DimensionGroup, _DimensionGroupBranch],
+    ) -> bool:
+        """Insert a branch at the appropriate location in a [sub]tree.
+
+        Branches are inserted below the first parent branch whose dimensions
+        are a superset of their own.
+
+        Parameters
+        ----------
+        target_dimensions : `lsst.daf.butler.DimensionGroup`
+            Dimensions of the branch to be inserted.
+        candidate_dimensions : `lsst.daf.butler.DimensionGroup`
+            Dimensions of the subtree the branch might be inserted under. If
+            this is not a superset of ``target_dimensions``, this method
+            returns `False` and nothing is done.
+        candidate_projection_branches : `dict` [ \
+                `lsst.daf.butler.DimensionGroup`, `_DimensionGroupBranch` ]
+            Subtree branches to be updated directly or indirectly (i.e. in a
+            nested branch).
+
+        Returns
+        -------
+        inserted : `bool`
+            Whether the branch was actually inserted.
+        """
+        if candidate_dimensions >= target_dimensions:
+            target_branch = self.branches_by_dimensions[target_dimensions]
+            for child_dimensions in list(candidate_projection_branches.keys()):
+                if self._maybe_insert_projection_branch(
+                    child_dimensions, target_dimensions, target_branch.branches
+                ):
+                    del candidate_projection_branches[child_dimensions]
+            for child_dimensions, child_branch in candidate_projection_branches.items():
+                if self._maybe_insert_projection_branch(
+                    target_dimensions, child_dimensions, child_branch.branches
+                ):
+                    return True
+            candidate_projection_branches[target_dimensions] = target_branch
+            return True
+        return False
+
+    def _make_queryable_overlap_branch_generators(self, branches_not_in_tree: set[DimensionGroup]) -> None:
+        """Add data ID generators for sets of dimensions that can only be
+        partially queried for, with the rest needing to be generated by
+        manipulating the data IDs of the queryable subset.
+
+        Parameters
+        ----------
+        branches_not_in_tree : `set` [ `lsst.daf.butler.DimensionGroup` ]
+            Dimensions that have not yet been inserted into the tree. Updated
+            in place.
+        """
+        for target_dimensions in sorted(branches_not_in_tree):
+            queryable_subset_dimensions = target_dimensions.intersection(self.queryable_dimensions)
+            # Make sure we actually have a branch to capture the queryable
+            # subset data IDs (i.e. in case we didn't already have one for some
+            # dataset type or task, etc).
+            if queryable_subset_dimensions not in self.branches_by_dimensions:
+                # If we have to make a new queryable branch, we also have to
+                # insert it into the tree so its data IDs get populated.
+                self.branches_by_dimensions[queryable_subset_dimensions] = _DimensionGroupBranch()
+                if not self._maybe_insert_projection_branch(
+                    queryable_subset_dimensions,
+                    self.queryable_dimensions,
+                    self.queryable_branches,
+                ):
+                    raise AssertionError(
+                        "Projection-branch insertion should not fail for queryable dimensions."
+                    )
+            if queryable_region_name := queryable_subset_dimensions.region_dimension:
+                # If there is a single well-defined region for the queryable
+                # subset, we can potentially generate skypix IDs from it.
+                # Do the target dimensions just add a single skypix dimension
+                # to the queryable subset?
+                remainder_dimensions = target_dimensions - queryable_subset_dimensions
+                if (remainder_skypix := get_single_skypix(remainder_dimensions)) is not None:
+                    queryable_region_element = target_dimensions.universe[queryable_region_name]
+                    self._append_data_id_generator(
+                        queryable_subset_dimensions,
+                        queryable_region_element,
+                        target_dimensions,
+                        remainder_skypix,
+                        branches_not_in_tree,
+                    )
+
+    def _append_data_id_generator(
+        self,
+        source_dimensions: DimensionGroup,
+        source_region_element: DimensionElement,
+        target_dimensions: DimensionGroup,
+        remainder_skypix: SkyPixDimension,
+        branches_not_in_tree: set[DimensionGroup],
+    ) -> None:
+        """Append an appropriate `DataIdGenerator` instance for generating
+        data IDs with the given characteristics.
+
+        Parameters
+        ----------
+        source_dimensions : `lsst.daf.butler.DimensionGroup`
+            Dimensions whose data IDs can already be populated, to use as a
+            starting point.
+        source_region_element : `lsst.daf.butler.DimensionElement`
+            Dimension element associated with the region for the source
+            dimensions. It is guaranteed that there is exactly one such
+            region.
+        target_dimensions : `lsst.daf.butler.DimensionGroup`
+            Dimensions of the data IDs to be generated.
+        remainder_skypix : `lsst.daf.butler.SkyPixDimension`
+            The single skypix dimension that is being added to
+            ``source_dimensions`` to yield ``target_dimensions``.
+        branches_not_in_tree : `set` [ `lsst.daf.butler.DimensionGroup` ]
+            Dimensions that have not yet been inserted into the tree. Updated
+            in place.
+        """
+        target_branch = self.branches_by_dimensions[target_dimensions]
+        # We want to do the overlap calculation without any extra dimensions
+        # beyond the two spatial dimensions, which may or may not be what we
+        # already have.
+        overlap_dimensions = source_region_element.minimal_group | remainder_skypix.minimal_group
+        generator: DataIdGenerator
+        if overlap_dimensions == target_dimensions:
+            if isinstance(source_region_element, SkyPixDimension):
+                if source_region_element.system == remainder_skypix.system:
+                    if source_region_element.level > remainder_skypix.level:
+                        generator = SkyPixGatherDataIdGenerator(
+                            target_branch,
+                            target_dimensions,
+                            source_dimensions,
+                            remainder_skypix,
+                            source_region_element,
+                        )
+                    else:
+                        generator = SkyPixScatterDataIdGenerator(
+                            target_branch,
+                            target_dimensions,
+                            source_dimensions,
+                            remainder_skypix,
+                            source_region_element,
+                        )
+                else:
+                    generator = CrossSystemDataIdGenerator(
+                        target_branch,
+                        target_dimensions,
+                        source_dimensions,
+                        remainder_skypix,
+                        source_region_element,
+                    )
+            else:
+                generator = DatabaseSourceDataIdGenerator(
+                    target_branch,
+                    target_dimensions,
+                    source_dimensions,
+                    remainder_skypix,
+                    source_region_element,
+                )
+            # We know we can populate the data IDs in remainder_skypix_branch
+            # from the target branch by projection. Even if it's already
+            # populated by some other generated branch, we want to populate it
+            # again in case that picks up additional sky pixels.
+            target_branch.branches[remainder_skypix.minimal_group] = self.branches_by_dimensions[
+                remainder_skypix.minimal_group
+            ]
+            branches_not_in_tree.discard(remainder_skypix.minimal_group)
+        else:
+            if overlap_dimensions not in self.branches_by_dimensions:
+                self.branches_by_dimensions[overlap_dimensions] = _DimensionGroupBranch()
+                branches_not_in_tree.add(overlap_dimensions)
+                self._append_data_id_generator(
+                    source_region_element.minimal_group,
+                    source_region_element,
+                    overlap_dimensions,
+                    remainder_skypix,
+                    branches_not_in_tree,
+                )
+            generator = JoinDataIdGenerator(
+                target_branch,
+                target_dimensions,
+                source_dimensions,
+                overlap_dimensions,
+            )
+        self.generators.append(generator)
+        branches_not_in_tree.remove(target_dimensions)
+
+    def _make_general_overlap_branch_generator(self, branches_not_in_tree: set[DimensionGroup]) -> bool:
+        """Add data ID generators for sets of dimensions that can be generated
+        via skypix envelopes of other generated data IDs.
+
+        This method should be called in a loop until it returns `False`
+        (indicating no progress was made) or ``branches_not_in_tree`` is empty
+        (indicating no more work to be done).
+
+        Parameters
+        ----------
+        branches_not_in_tree : `set` [ `lsst.daf.butler.DimensionGroup` ]
+            Dimensions that have not yet been inserted into the tree. Updated
+            in place.
+
+        Returns
+        -------
+        appended : `bool`
+            Whether a new data ID generator was successfully appended.
+        """
+        dimensions_done = sorted(self.branches_by_dimensions.keys() - branches_not_in_tree)
+        for source_dimensions in dimensions_done:
+            for target_dimensions in sorted(branches_not_in_tree):
+                if not source_dimensions <= target_dimensions:
+                    continue
+                remainder_dimensions = target_dimensions - source_dimensions
+                if (remainder_skypix := get_single_skypix(remainder_dimensions)) is not None:
+                    if source_region_name := source_dimensions.region_dimension:
+                        # If the target dimensions are just adding a single
+                        # skypix to the source dimensions and the source
+                        # dimensions have a single region column, we can
+                        # generate the skypix indices from the envelopes of
+                        # those regions.
+                        source_region_element = source_dimensions.universe[source_region_name]
+                        self._append_data_id_generator(
+                            source_dimensions,
+                            source_region_element,
+                            target_dimensions,
+                            remainder_skypix,
+                            branches_not_in_tree,
+                        )
+                        return True
+        return not branches_not_in_tree
+
+    def _make_join_branch_generator(self, branches_not_in_tree: set[DimensionGroup]) -> bool:
+        """Add data ID generators for sets of dimensions that can be generated
+        via inner joins of other generated data IDs.
+
+        This method should be called in a loop until it returns `False`
+        (indicating no progress was made) or ``branches_not_in_tree`` is empty
+        (indicating no more work to be done).
+
+        Parameters
+        ----------
+        branches_not_in_tree : `set` [ `lsst.daf.butler.DimensionGroup` ]
+            Dimensions that have not yet been inserted into the tree. Updated
+            in place.
+
+        Returns
+        -------
+        appended : `bool`
+            Whether a new data ID generator was successfully appended.
+        """
+        for target_dimensions in sorted(branches_not_in_tree):
+            dimensions_done = sorted(self.branches_by_dimensions.keys() - branches_not_in_tree)
+            candidates_by_common: dict[DimensionGroup, tuple[DimensionGroup, DimensionGroup]] = {}
+            for operand1, operand2 in itertools.combinations(dimensions_done, 2):
+                if operand1.union(operand2) == target_dimensions:
+                    candidates_by_common[operand1.intersection(operand2)] = (operand1, operand2)
+            if candidates_by_common:
+                # Because DimensionGroup defines a set-like inequality
+                # operator, 'max' returns the set of dimensions that contains
+                # as many of the other sets of dimensions as possible, which is
+                # a reasonable guess at the most-constrained join.
+                operand1, operand2 = candidates_by_common[max(candidates_by_common)]
+                generator = JoinDataIdGenerator(
+                    self.branches_by_dimensions[target_dimensions],
+                    target_dimensions,
+                    operand1,
+                    operand2,
+                )
+                self.generators.append(generator)
+                branches_not_in_tree.remove(target_dimensions)
+                return True
+        return not branches_not_in_tree

     def project_data_ids(self, log: LsstLogAdapter) -> None:
         """Recursively populate the data ID sets of the dimension group tree
-        from the data ID sets of the trunk branches.
+        from the data ID sets of the queryable branches.

         Parameters
         ----------
         log : `lsst.logging.LsstLogAdapter`
             Logger to use for status reporting.
         """
-        for branch_dimensions, branch in self.trunk_branches.items():
-            log.debug("Projecting query data IDs to %s.", branch_dimensions)
+        for branch_dimensions, branch in self.queryable_branches.items():
+            log.verbose("Projecting query data ID(s) to %s.", branch_dimensions)
             branch.project_data_ids(log)
+
+    def generate_data_ids(self, log: LsstLogAdapter) -> None:
+        """Run all data ID generators.
+
+        This runs data ID generators and projects data IDs to their subset
+        dimensions. It can only be called after queryable data IDs have been
+        populated and dimension records fetched.
+
+        Parameters
+        ----------
+        log : `lsst.logging.LsstLogAdapter`
+            Logger to use for status reporting.
+        """
+        for generator in self.generators:
+            generator.run(log, self.branches_by_dimensions)
+            generator.branch.project_data_ids(log, log_indent=" ")
+
+    def pprint(self, printer: Callable[[str], None] = print) -> None:
+        """Print a human-readable representation of the dimensions tree.
+
+        Parameters
+        ----------
+        printer : `~collections.abc.Callable`, optional
+            A function that takes a single string argument and prints a single
+            line (including a newline). Default is the built-in `print`
+            function.
+        """
+        printer("Queryable:")
+        for branch_dimensions, branch in self.queryable_branches.items():
+            branch.pprint(branch_dimensions, " ", printer=printer)
+        printer("Generator:")
+        for generator in self.generators:
+            generator.pprint(" ", printer=printer)
+
+
+def get_single_skypix(dimensions: DimensionGroup) -> SkyPixDimension | None:
+    """Try to coerce a dimension group to a single skypix dimension.
+
+    Parameters
+    ----------
+    dimensions : `lsst.daf.butler.DimensionGroup`
+        Input dimensions.
+
+    Returns
+    -------
+    skypix : `lsst.daf.butler.SkyPixDimension` or `None`
+        A skypix dimension that is the only dimension in the given group, or
+        `None` in all other cases.
+    """
+    if len(dimensions) == 1:
+        (name,) = dimensions.names
+        return dimensions.universe.skypix_dimensions.get(name)
+    return None
+
+
+@dataclasses.dataclass
+class DataIdGenerator:
+    """A base class for generators of quantum and dataset data IDs that
+    cannot be directly queried for.
+    """
+
+    branch: _DimensionGroupBranch
+    """Branch of the dimensions tree that this generator populates."""
+
+    dimensions: DimensionGroup
+    """Dimensions of the data IDs generated."""
+
+    source: DimensionGroup
+    """Dimensions of another set of data IDs that this generator uses as a
+    starting point.
+    """
+
+    def pprint(self, indent: str = " ", printer: Callable[[str], None] = print) -> None:
+        """Print a human-readable representation of this generator.
+
+        Parameters
+        ----------
+        indent : `str`
+            Blank spaces to prefix the output with (useful when this is nested
+            in a hierarchical object being printed).
+        printer : `~collections.abc.Callable`, optional
+            A function that takes a single string argument and prints a single
+            line (including a newline). Default is the built-in `print`
+            function.
+        """
+        self.branch.pprint(
+            self.dimensions,
+            indent,
+            f" <- {self.source} ({self.__class__.__name__})",
+            printer=printer,
+        )
+
+    def run(self, log: LsstLogAdapter, branches: Mapping[DimensionGroup, _DimensionGroupBranch]) -> None:
+        """Run the generator, populating its branch's data IDs.
+
+        Parameters
+        ----------
+        log : `lsst.log.LsstLogAdapter`
+            Logger with a ``verbose`` method as well as the built-in ones.
+        branches : `~collections.abc.Mapping`
+            Mapping of other dimension branches, keyed by their dimensions.
+        """
+        raise NotImplementedError()
+
+
+@dataclasses.dataclass
+class DatabaseSourceDataIdGenerator(DataIdGenerator):
+    """A data ID generator that generates skypix indices from the envelope of
+    regions stored in the database.
+    """
+
+    remainder_skypix: SkyPixDimension
+    """A single additional skypix dimension to be added to the source
+    dimensions.
+    """
+
+    source_element: DimensionElement
+    """Dimension element that the database-stored regions are associated with.
+    """
+
+    def run(self, log: LsstLogAdapter, branches: Mapping[DimensionGroup, _DimensionGroupBranch]) -> None:
+        # Docstring inherited.
+        source_branch = branches[self.source]
+        log.verbose(
+            "Generating %s data IDs via %s envelope of %s %s region(s).",
+            self.dimensions,
+            self.remainder_skypix,
+            len(source_branch.data_ids),
+            self.source_element,
+        )
+        pixelization = self.remainder_skypix.pixelization
+        (source_records,) = [
+            record_set
+            for record_set in source_branch.dimension_records
+            if record_set.element == self.source_element
+        ]
+        for source_data_id in source_branch.data_ids:
+            source_record = source_records.find(source_data_id)
+            for begin, end in pixelization.envelope(source_record.region):
+                for index in range(begin, end):
+                    target_data_id = DataCoordinate.standardize(
+                        source_data_id,
+                        **{self.remainder_skypix.name: index},  # type: ignore[arg-type]
+                    )
+                    self.branch.data_ids.add(target_data_id)
+
+
1370
+ @dataclasses.dataclass
1371
+ class CrossSystemDataIdGenerator(DataIdGenerator):
1372
+ """A data ID generator that generates skypix indices from the envelope of
1373
+ skypix regions from some other system (e.g. healpix from HTM).
1374
+ """
1375
+
1376
+ remainder_skypix: SkyPixDimension
1377
+ """A single additional skypix dimension to be added to the source
1378
+ dimensions.
1379
+ """
1380
+
1381
+ source_skypix: SkyPixDimension
1382
+ """Dimension element for the already-known skypix indices."""
1383
+
1384
+ def run(self, log: LsstLogAdapter, branches: Mapping[DimensionGroup, _DimensionGroupBranch]) -> None:
1385
+ # Docstring inherited.
1386
+ source_branch = branches[self.source]
1387
+ log.verbose(
1388
+ "Generating %s data IDs via %s envelope of %s %s region(s).",
1389
+ self.dimensions,
1390
+ self.remainder_skypix,
1391
+ len(source_branch.data_ids),
1392
+ self.source_skypix,
1393
+ )
1394
+ source_pixelization = self.source_skypix.pixelization
1395
+ remainder_pixelization = self.remainder_skypix.pixelization
1396
+ for source_data_id in source_branch.data_ids:
1397
+ source_region = source_pixelization.pixel(source_data_id[self.source_skypix.name])
1398
+ for begin, end in remainder_pixelization.envelope(source_region):
1399
+ for index in range(begin, end):
1400
+ target_data_id = DataCoordinate.standardize(
1401
+ source_data_id,
1402
+ **{self.remainder_skypix.name: index}, # type: ignore[arg-type]
1403
+ )
1404
+ self.branch.data_ids.add(target_data_id)
1405
+
1406
+
+@dataclasses.dataclass
+class SkyPixScatterDataIdGenerator(DataIdGenerator):
+    """A data ID generator that generates skypix indices at a high (fine)
+    level from low-level (coarse) indices in the same system.
+    """
+
+    remainder_skypix: SkyPixDimension
+    """A single additional skypix dimension to be added to the source
+    dimensions.
+    """
+
+    source_skypix: SkyPixDimension
+    """Dimension element for the already-known skypix indices."""
+
+    def run(self, log: LsstLogAdapter, branches: Mapping[DimensionGroup, _DimensionGroupBranch]) -> None:
+        # Docstring inherited.
+        factor = 4 ** (self.remainder_skypix.level - self.source_skypix.level)
+        source_branch = branches[self.source]
+        log.verbose(
+            "Generating %s data IDs by scaling %s %s IDs in %s by %s.",
+            self.dimensions,
+            len(source_branch.data_ids),
+            self.remainder_skypix,
+            self.source,
+            factor,
+        )
+        for source_data_id in source_branch.data_ids:
+            ranges = RangeSet(source_data_id[self.source_skypix.name])
+            ranges.scale(factor)
+            for begin, end in ranges:
+                for index in range(begin, end):
+                    target_data_id = DataCoordinate.standardize(
+                        source_data_id,
+                        **{self.remainder_skypix.name: index},  # type: ignore[arg-type]
+                    )
+                    self.branch.data_ids.add(target_data_id)
+
+
+@dataclasses.dataclass
+class SkyPixGatherDataIdGenerator(DataIdGenerator):
+    """A data ID generator that generates skypix indices at a low (coarse)
+    level from high-level (fine) indices in the same system.
+    """
+
+    remainder_skypix: SkyPixDimension
+    """A single additional skypix dimension to be added to the source
+    dimensions.
+    """
+
+    source_skypix: SkyPixDimension
+    """Dimension element for the already-known skypix indices."""
+
+    def run(self, log: LsstLogAdapter, branches: Mapping[DimensionGroup, _DimensionGroupBranch]) -> None:
+        # Docstring inherited.
+        factor = 4 ** (self.source_skypix.level - self.remainder_skypix.level)
+        source_branch = branches[self.source]
+        log.verbose(
+            "Generating %s data IDs by dividing %s %s IDs in %s by %s.",
+            self.dimensions,
+            len(source_branch.data_ids),
+            self.remainder_skypix,
+            self.source,
+            factor,
+        )
+        for source_data_id in source_branch.data_ids:
+            index = source_data_id[self.source_skypix.name] // factor
+            target_data_id = DataCoordinate.standardize(source_data_id, **{self.remainder_skypix.name: index})
+            self.branch.data_ids.add(target_data_id)
+
+
+@dataclasses.dataclass
+class JoinDataIdGenerator(DataIdGenerator):
+    """A data ID generator that does an inner join between two
+    already-populated sets of data IDs.
+    """
+
+    other: DimensionGroup
+    """Dimensions of the other data ID branches to join to those of
+    ``source``.
+    """
+
+    def run(self, log: LsstLogAdapter, branches: Mapping[DimensionGroup, _DimensionGroupBranch]) -> None:
+        # Docstring inherited.
+        source_branch = branches[self.source]
+        other_branch = branches[self.other]
+        log.verbose(
+            "Generating %s data IDs by joining %s (%s) to %s (%s).",
+            self.dimensions,
+            self.source,
+            len(source_branch.data_ids),
+            self.other,
+            len(other_branch.data_ids),
+        )
+        common = self.source & self.other
+        other_by_common: defaultdict[DataCoordinate, list[DataCoordinate]] = defaultdict(list)
+        for other_data_id in other_branch.data_ids:
+            other_by_common[other_data_id.subset(common)].append(other_data_id)
+        source_by_common: defaultdict[DataCoordinate, list[DataCoordinate]] = defaultdict(list)
+        for source_data_id in source_branch.data_ids:
+            source_by_common[source_data_id.subset(common)].append(source_data_id)
+        for common_data_id in other_by_common.keys() & source_by_common.keys():
+            for other_data_id in other_by_common[common_data_id]:
+                for source_data_id in source_by_common[common_data_id]:
+                    self.branch.data_ids.add(other_data_id.union(source_data_id))
+ self.branch.data_ids.add(other_data_id.union(source_data_id))