lsst-daf-butler 30.0.0rc2__py3-none-any.whl → 30.0.0rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. lsst/daf/butler/_butler.py +8 -5
  2. lsst/daf/butler/_butler_metrics.py +49 -2
  3. lsst/daf/butler/_formatter.py +2 -7
  4. lsst/daf/butler/_labeled_butler_factory.py +28 -8
  5. lsst/daf/butler/_rubin/temporary_for_ingest.py +207 -0
  6. lsst/daf/butler/configs/datastores/formatters.yaml +1 -0
  7. lsst/daf/butler/configs/storageClasses.yaml +15 -0
  8. lsst/daf/butler/datastore/record_data.py +1 -1
  9. lsst/daf/butler/datastores/fileDatastore.py +15 -12
  10. lsst/daf/butler/dimensions/_coordinate.py +5 -0
  11. lsst/daf/butler/direct_butler/_direct_butler.py +45 -28
  12. lsst/daf/butler/logging.py +9 -3
  13. lsst/daf/butler/registry/bridge/monolithic.py +17 -13
  14. lsst/daf/butler/registry/datasets/byDimensions/_manager.py +49 -45
  15. lsst/daf/butler/registry/expand_data_ids.py +93 -0
  16. lsst/daf/butler/registry/interfaces/_database.py +6 -1
  17. lsst/daf/butler/registry/sql_registry.py +2 -24
  18. lsst/daf/butler/remote_butler/_remote_butler.py +5 -1
  19. lsst/daf/butler/tests/hybrid_butler.py +4 -1
  20. lsst/daf/butler/tests/registry_data/lsstcam-subset.yaml +191 -0
  21. lsst/daf/butler/tests/testFormatters.py +2 -2
  22. lsst/daf/butler/transfers/_context.py +7 -6
  23. lsst/daf/butler/version.py +1 -1
  24. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/METADATA +1 -1
  25. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/RECORD +33 -30
  26. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/WHEEL +0 -0
  27. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/entry_points.txt +0 -0
  28. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/licenses/COPYRIGHT +0 -0
  29. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/licenses/LICENSE +0 -0
  30. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/licenses/bsd_license.txt +0 -0
  31. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/licenses/gpl-v3.0.txt +0 -0
  32. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/top_level.txt +0 -0
  33. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/zip-safe +0 -0
lsst/daf/butler/direct_butler/_direct_butler.py

@@ -1822,12 +1822,25 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  f" Example: {existing_datasets[0]}"
  )

+ # Calculate some statistics based on the given list of datasets.
+ n_files = len(datasets)
+ n_datasets = 0
+ for d in datasets:
+ n_datasets += len(d.refs)
+ sfiles = "s" if n_files != 1 else ""
+ srefs = "s" if n_datasets != 1 else ""
+
  # We use `datasets` rather `new_datasets` for the Registry
  # portion of this, to let it confirm that everything matches the
  # existing datasets.
  import_info = self._prepare_ingest_file_datasets(datasets, progress)

- with self.transaction():
+ with (
+ self._metrics.instrument_ingest(
+ n_datasets, _LOG, msg=f"Ingesting {n_files} file{sfiles} with {n_datasets} dataset{srefs}"
+ ),
+ self.transaction(),
+ ):
  self._ingest_file_datasets(datasets, import_info, progress)

  # Bulk-insert everything into Datastore.
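Note: the new ingest path wraps the registry and datastore work in a metrics context manager from _butler_metrics.py (also changed in this release). The sketch below shows only the general timing-context pattern that call suggests; instrument_ingest_sketch is a hypothetical stand-in, not the ButlerMetrics API.

    import logging
    import time
    from contextlib import contextmanager

    _LOG = logging.getLogger(__name__)

    @contextmanager
    def instrument_ingest_sketch(n_datasets: int, log: logging.Logger, msg: str = ""):
        # Time the enclosed block and report the supplied message when it finishes.
        start = time.monotonic()
        try:
            yield
        finally:
            log.info("%s (%d datasets, %.2f s)", msg, n_datasets, time.monotonic() - start)

    # Usage mirroring the new ingest code path (transaction and real ingest omitted):
    with instrument_ingest_sketch(3, _LOG, msg="Ingesting 2 files with 3 datasets"):
        pass  # registry inserts and datastore ingest would run here
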
@@ -1982,7 +1995,7 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  doImport(filename) # type: ignore

  def transfer_dimension_records_from(
- self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
+ self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
  ) -> None:
  # Allowed dimensions in the target butler.
  elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
@@ -2012,16 +2025,13 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  source_butler, data_ids, allowed_elements
  )

- can_query = True if isinstance(source_butler, Butler) else False
-
  additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
  for original_element, record_mapping in primary_records.items():
  # Get dimensions that depend on this dimension.
  populated_by = self.dimensions.get_elements_populated_by(
  self.dimensions[original_element.name] # type: ignore
  )
-
- for data_id in record_mapping.keys():
+ if populated_by:
  for element in populated_by:
  if element not in allowed_elements:
  continue
@@ -2040,28 +2050,32 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  # have to be scanned.
  continue

- if not can_query:
- raise RuntimeError(
- f"Transferring populated_by records like {element.name} requires a full Butler."
- )
+ if record_mapping:
+ if not isinstance(source_butler, Butler):
+ raise RuntimeError(
+ f"Transferring populated_by records like {element.name}"
+ " requires a full Butler."
+ )

- records = source_butler.query_dimension_records( # type: ignore
- element.name,
- explain=False,
- **data_id.mapping, # type: ignore
- )
- for record in records:
- additional_records[record.definition].setdefault(record.dataId, record)
+ with source_butler.query() as query:
+ records = query.join_data_coordinates(record_mapping.keys()).dimension_records(
+ element.name
+ )
+ for record in records:
+ additional_records[record.definition].setdefault(record.dataId, record)

  # The next step is to walk back through the additional records to
  # pick up any missing content (such as visit_definition needing to
  # know the exposure). Want to ensure we do not request records we
  # already have.
  missing_data_ids = set()
- for name, record_mapping in additional_records.items():
+ for record_mapping in additional_records.values():
  for data_id in record_mapping.keys():
- if data_id not in primary_records[name]:
- missing_data_ids.add(data_id)
+ for dimension in data_id.dimensions.required:
+ element = source_butler.dimensions[dimension]
+ dimension_key = data_id.subset(dimension)
+ if dimension_key not in primary_records[element]:
+ missing_data_ids.add(dimension_key)

  # Fill out the new records. Assume that these new records do not
  # also need to carry over additional populated_by records.
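Note: the replacement above batches the populated_by lookup through one query context per element instead of calling query_dimension_records once per data ID. A hedged sketch of that call pattern, using only the query methods visible in the diff; `butler`, `data_ids`, and the element name are illustrative assumptions:

    def fetch_populated_by_records(butler, data_ids, element_name="visit_definition"):
        """Collect dimension records for one populated_by element, constrained
        by many data IDs in a single query rather than one query per data ID."""
        records_by_data_id = {}
        with butler.query() as query:
            results = query.join_data_coordinates(data_ids).dimension_records(element_name)
            for record in results:
                records_by_data_id.setdefault(record.dataId, record)
        return records_by_data_id
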
@@ -2078,19 +2092,19 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  def _extract_dimension_records_from_data_ids(
  self,
  source_butler: LimitedButler | Butler,
- data_ids: set[DataCoordinate],
+ data_ids: Iterable[DataCoordinate],
  allowed_elements: frozenset[DimensionElement],
  ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
  dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)

+ data_ids = set(data_ids)
+ if not all(data_id.hasRecords() for data_id in data_ids):
+ if isinstance(source_butler, Butler):
+ data_ids = source_butler._expand_data_ids(data_ids)
+ else:
+ raise TypeError("Input butler needs to be a full butler to expand DataId.")
+
  for data_id in data_ids:
- # Need an expanded record, if not expanded that we need a full
- # butler with registry (allow mocks with registry too).
- if not data_id.hasRecords():
- if registry := getattr(source_butler, "registry", None):
- data_id = registry.expandDataId(data_id)
- else:
- raise TypeError("Input butler needs to be a full butler to expand DataId.")
  # If this butler doesn't know about a dimension in the source
  # butler things will break later.
  for element_name in data_id.dimensions.elements:
@@ -2569,6 +2583,9 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  """Immediately load caches that are used for common operations."""
  self._registry.preload_cache(load_dimension_record_cache=load_dimension_record_cache)

+ def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
+ return self._registry.expand_data_ids(data_ids)
+
  _config: ButlerConfig
  """Configuration for this Butler instance."""

lsst/daf/butler/logging.py

@@ -764,11 +764,17 @@ class ButlerLogRecords(MutableSequence[ButlerLogRecord]):


  class ButlerLogRecordHandler(StreamHandler):
- """Python log handler that accumulates records."""
+ """Python log handler that accumulates records.

- def __init__(self) -> None:
+ Parameters
+ ----------
+ records : `ButlerLogRecords`, optional
+ Container to store logs in.
+ """
+
+ def __init__(self, records: ButlerLogRecords | None = None) -> None:
  super().__init__()
- self.records = ButlerLogRecords([])
+ self.records = ButlerLogRecords([]) if records is None else records

  def emit(self, record: LogRecord) -> None:
  self.records.append(record)
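Note: with the new optional ``records`` argument, the handler appends into a caller-supplied container, so several handlers (or a handler recreated later) can share one ButlerLogRecords. A short usage sketch; the import path is assumed from the module location:

    import logging

    from lsst.daf.butler.logging import ButlerLogRecordHandler, ButlerLogRecords

    # A container that outlives any individual handler.
    shared_records = ButlerLogRecords([])
    handler = ButlerLogRecordHandler(records=shared_records)

    log = logging.getLogger("example")
    log.addHandler(handler)
    log.warning("Something worth keeping")

    # The handler appended into the shared container, not a private one.
    assert handler.records is shared_records
    print(len(shared_records))
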
lsst/daf/butler/registry/bridge/monolithic.py

@@ -215,20 +215,24 @@ class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
  def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
  # Docstring inherited from DatastoreRegistryBridge
  byId = {ref.id: ref for ref in refs}
- sql = (
- sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
- .select_from(self._tables.dataset_location)
- .where(
- sqlalchemy.sql.and_(
- self._tables.dataset_location.columns.datastore_name == self.datastoreName,
- self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
+ found: list[DatasetIdRef] = []
+ with self._db.session():
+ for batch in chunk_iterable(byId.keys(), 50000):
+ sql = (
+ sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
+ .select_from(self._tables.dataset_location)
+ .where(
+ sqlalchemy.sql.and_(
+ self._tables.dataset_location.columns.datastore_name == self.datastoreName,
+ self._tables.dataset_location.columns.dataset_id.in_(batch),
+ )
+ )
  )
- )
- )
- with self._db.query(sql) as sql_result:
- sql_rows = sql_result.fetchall()
- for row in sql_rows:
- yield byId[row.dataset_id]
+ with self._db.query(sql) as sql_result:
+ sql_ids = sql_result.scalars().all()
+ found.extend(byId[id] for id in sql_ids)
+
+ return found

  @contextmanager
  def emptyTrash(
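Note: both this change and the _manager.py changes below replace a single unbounded ``IN`` clause with batches of at most 50000 IDs, accumulating results as they go. A self-contained sketch of the pattern, with a local stand-in for ``lsst.utils.iteration.chunk_iterable`` and a fake lookup in place of the SQL query:

    from collections.abc import Iterable, Iterator
    from itertools import islice


    def chunked(values: Iterable[int], chunk_size: int = 50000) -> Iterator[list[int]]:
        # Local stand-in for lsst.utils.iteration.chunk_iterable.
        iterator = iter(values)
        while batch := list(islice(iterator, chunk_size)):
            yield batch


    def find_known_ids(all_ids: Iterable[int], lookup) -> list[int]:
        # Accumulate matches batch by batch instead of one huge IN (...) query.
        found: list[int] = []
        for batch in chunked(all_ids):
            found.extend(lookup(batch))
        return found


    known = {2, 4, 6}
    print(find_known_ids(range(10), lambda batch: [i for i in batch if i in known]))
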
lsst/daf/butler/registry/datasets/byDimensions/_manager.py

@@ -12,6 +12,8 @@ from typing import TYPE_CHECKING, Any, ClassVar
  import astropy.time
  import sqlalchemy

+ from lsst.utils.iteration import chunk_iterable
+
  from .... import ddl
  from ...._collection_type import CollectionType
  from ...._dataset_ref import DatasetId, DatasetIdFactory, DatasetIdGenEnum, DatasetRef
@@ -424,17 +426,18 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
  return result

  def get_dataset_refs(self, ids: list[DatasetId]) -> list[DatasetRef]:
- # Look up the dataset types corresponding to the given Dataset IDs.
- id_col = self._static.dataset.columns["id"]
- sql = sqlalchemy.sql.select(
- id_col,
- self._static.dataset.columns["dataset_type_id"],
- ).where(id_col.in_(ids))
- with self._db.query(sql) as sql_result:
- dataset_rows = sql_result.mappings().all()
- dataset_type_map: dict[DatasetId, DatasetType] = {
- row["id"]: self._get_dataset_type_by_id(row["dataset_type_id"]) for row in dataset_rows
- }
+ dataset_type_map: dict[DatasetId, DatasetType] = {}
+ for batch in chunk_iterable(set(ids), 50000):
+ # Look up the dataset types corresponding to the given Dataset IDs.
+ id_col = self._static.dataset.columns["id"]
+ sql = sqlalchemy.sql.select(
+ id_col,
+ self._static.dataset.columns["dataset_type_id"],
+ ).where(id_col.in_(batch))
+ with self._db.query(sql) as sql_result:
+ dataset_rows = sql_result.mappings().all()
+ for row in dataset_rows:
+ dataset_type_map[row["id"]] = self._get_dataset_type_by_id(row["dataset_type_id"])

  # Group the given dataset IDs by the DimensionGroup of their dataset
  # types -- there is a separate tags table for each DimensionGroup.
@@ -448,40 +451,41 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
  # data IDs corresponding to the UUIDs found from the dataset table.
  dynamic_tables = self._get_dynamic_tables(dimension_group)
  tags_table = self._get_tags_table(dynamic_tables)
- tags_sql = tags_table.select().where(tags_table.columns["dataset_id"].in_(datasets))
- # Join in the collection table to fetch the run name.
- collection_column = tags_table.columns[self._collections.getCollectionForeignKeyName()]
- joined_collections = self._collections.join_collections_sql(collection_column, tags_sql)
- tags_sql = joined_collections.joined_sql
- run_name_column = joined_collections.name_column
- tags_sql = tags_sql.add_columns(run_name_column)
- # Tags table includes run collections and tagged
- # collections.
- # In theory the data ID for a given dataset should be the
- # same in both, but nothing actually guarantees this.
- # So skip any tagged collections, using the run collection
- # as the definitive definition.
- tags_sql = tags_sql.where(joined_collections.type_column == int(CollectionType.RUN))
-
- with self._db.query(tags_sql) as sql_result:
- data_id_rows = sql_result.mappings().all()
-
- assert run_name_column.key is not None
- for data_id_row in data_id_rows:
- id = data_id_row["dataset_id"]
- dataset_type = dataset_type_map[id]
- run_name = data_id_row[run_name_column.key]
- data_id = DataCoordinate.from_required_values(
- dimension_group,
- tuple(data_id_row[dimension] for dimension in dimension_group.required),
- )
- ref = DatasetRef(
- datasetType=dataset_type,
- dataId=data_id,
- id=id,
- run=run_name,
- )
- output_refs.append(ref)
+ for batch in chunk_iterable(datasets, 50000):
+ tags_sql = tags_table.select().where(tags_table.columns["dataset_id"].in_(batch))
+ # Join in the collection table to fetch the run name.
+ collection_column = tags_table.columns[self._collections.getCollectionForeignKeyName()]
+ joined_collections = self._collections.join_collections_sql(collection_column, tags_sql)
+ tags_sql = joined_collections.joined_sql
+ run_name_column = joined_collections.name_column
+ tags_sql = tags_sql.add_columns(run_name_column)
+ # Tags table includes run collections and tagged
+ # collections.
+ # In theory the data ID for a given dataset should be the
+ # same in both, but nothing actually guarantees this.
+ # So skip any tagged collections, using the run collection
+ # as the definitive definition.
+ tags_sql = tags_sql.where(joined_collections.type_column == int(CollectionType.RUN))
+
+ with self._db.query(tags_sql) as sql_result:
+ data_id_rows = sql_result.mappings().all()
+
+ assert run_name_column.key is not None
+ for data_id_row in data_id_rows:
+ id = data_id_row["dataset_id"]
+ dataset_type = dataset_type_map[id]
+ run_name = data_id_row[run_name_column.key]
+ data_id = DataCoordinate.from_required_values(
+ dimension_group,
+ tuple(data_id_row[dimension] for dimension in dimension_group.required),
+ )
+ ref = DatasetRef(
+ datasetType=dataset_type,
+ dataId=data_id,
+ id=id,
+ run=run_name,
+ )
+ output_refs.append(ref)

  return output_refs

lsst/daf/butler/registry/expand_data_ids.py

@@ -0,0 +1,93 @@
+ # This file is part of daf_butler.
+ #
+ # Developed for the LSST Data Management System.
+ # This product includes software developed by the LSST Project
+ # (http://www.lsst.org).
+ # See the COPYRIGHT file at the top-level directory of this distribution
+ # for details of code ownership.
+ #
+ # This software is dual licensed under the GNU General Public License and also
+ # under a 3-clause BSD license. Recipients may choose which of these licenses
+ # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+ # respectively. If you choose the GPL option then the following text applies
+ # (but note that there is still no warranty even if you opt for BSD instead):
+ #
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ from __future__ import annotations
+
+ from collections import defaultdict
+ from collections.abc import Iterable
+
+ from ..dimensions import (
+ DataCoordinate,
+ DimensionDataAttacher,
+ DimensionGroup,
+ DimensionUniverse,
+ )
+ from ..dimensions.record_cache import DimensionRecordCache
+ from ..queries import QueryFactoryFunction
+
+
+ def expand_data_ids(
+ data_ids: Iterable[DataCoordinate],
+ universe: DimensionUniverse,
+ query_func: QueryFactoryFunction,
+ cache: DimensionRecordCache | None,
+ ) -> list[DataCoordinate]:
+ """Expand the given data IDs to look up implied dimension values and attach
+ dimension records.
+
+ Parameters
+ ----------
+ data_ids : `~collections.abc.Iterable` [ `DataCoordinate` ]
+ Data coordinates to be expanded.
+ universe : `DimensionUniverse`
+ Dimension universe associated with the given ``data_ids`` values.
+ query_func : QueryFactoryFunction
+ Function used to set up a Butler query context for looking up required
+ information from the database.
+ cache : `DimensionRecordCache` | None
+ Cache containing already-known dimension records. May be `None` if a
+ cache is not available.
+
+ Returns
+ -------
+ expanded : `list` [ `DataCoordinate` ]
+ List of `DataCoordinate` instances in the same order as the input
+ values. It is guaranteed that each `DataCoordinate` has
+ ``hasRecords()=True`` and ``hasFull()=True``.
+ """
+ output = list(data_ids)
+
+ grouped_by_dimensions: defaultdict[DimensionGroup, list[int]] = defaultdict(list)
+ for i, data_id in enumerate(data_ids):
+ if not data_id.hasRecords():
+ grouped_by_dimensions[data_id.dimensions].append(i)
+
+ if not grouped_by_dimensions:
+ # All given DataCoordinate values are already expanded.
+ return output
+
+ attacher = DimensionDataAttacher(
+ cache=cache,
+ dimensions=DimensionGroup.union(*grouped_by_dimensions.keys(), universe=universe),
+ )
+ for dimensions, indexes in grouped_by_dimensions.items():
+ with query_func() as query:
+ expanded = attacher.attach(dimensions, (output[index] for index in indexes), query)
+ for index, data_id in zip(indexes, expanded):
+ output[index] = data_id
+
+ return output
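Note: the new module leaves already-expanded data IDs untouched and only queries for the rest, grouping the work by DimensionGroup while remembering each data ID's position so the output order matches the input. A standalone sketch of that index-bookkeeping pattern, with plain strings standing in for data IDs:

    from collections import defaultdict


    def expand_preserving_order(items, needs_expansion, expand_group):
        # Mirror of the bookkeeping in expand_data_ids: group the positions of
        # items that still need work, expand each group, then write the results
        # back into their original slots.
        output = list(items)
        grouped = defaultdict(list)
        for i, item in enumerate(items):
            if needs_expansion(item):
                grouped[item[0]].append(i)  # the "dimension group" here is just the first letter
        for _key, indexes in grouped.items():
            expanded = expand_group(output[index] for index in indexes)
            for index, value in zip(indexes, expanded):
                output[index] = value
        return output


    print(expand_preserving_order(
        ["apple", "Ant", "banana", "Bee"],
        needs_expansion=str.islower,
        expand_group=lambda group: [s.upper() for s in group],
    ))
    # -> ['APPLE', 'Ant', 'BANANA', 'Bee']
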
lsst/daf/butler/registry/interfaces/_database.py

@@ -1562,7 +1562,12 @@ class Database(ABC):
  return None
  else:
  sql = table.insert()
- return [connection.execute(sql, row).inserted_primary_key[0] for row in rows]
+ ids = []
+ for row in rows:
+ key = connection.execute(sql, row).inserted_primary_key
+ assert key is not None
+ ids.append(key[0])
+ return ids

  @abstractmethod
  def replace(self, table: sqlalchemy.schema.Table, *rows: dict) -> None:
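Note: the list comprehension was unrolled so each row's ``inserted_primary_key`` can be checked against ``None`` before indexing; SQLAlchemy reports it per executed single-row insert, which is why the loop issues one statement per row. A runnable illustration against an in-memory SQLite database (the table and engine are illustrative, not the daf_butler schema):

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    thing = sqlalchemy.Table(
        "thing",
        metadata,
        sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True, autoincrement=True),
        sqlalchemy.Column("name", sqlalchemy.String(32)),
    )

    engine = sqlalchemy.create_engine("sqlite://")
    metadata.create_all(engine)

    rows = [{"name": "a"}, {"name": "b"}]
    with engine.begin() as connection:
        sql = thing.insert()
        ids = []
        for row in rows:
            # inserted_primary_key is defined per executed statement, so insert
            # the rows one at a time and collect each generated key.
            key = connection.execute(sql, row).inserted_primary_key
            assert key is not None
            ids.append(key[0])

    print(ids)  # e.g. [1, 2]
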
lsst/daf/butler/registry/sql_registry.py

@@ -34,7 +34,6 @@ __all__ = ("SqlRegistry",)
  import contextlib
  import logging
  import warnings
- from collections import defaultdict
  from collections.abc import Iterable, Iterator, Mapping, Sequence
  from typing import TYPE_CHECKING, Any

@@ -54,7 +53,6 @@ from ..dimensions import (
  DataCoordinate,
  DataId,
  DimensionConfig,
- DimensionDataAttacher,
  DimensionElement,
  DimensionGroup,
  DimensionRecord,
@@ -78,6 +76,7 @@ from ..registry.interfaces import ChainedCollectionRecord, ReadOnlyDatabaseError
  from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
  from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard
  from ..utils import transactional
+ from .expand_data_ids import expand_data_ids

  if TYPE_CHECKING:
  from .._butler_config import ButlerConfig
@@ -1415,28 +1414,7 @@ class SqlRegistry:
  return DataCoordinate.standardize(keys, dimensions=standardized.dimensions).expanded(records=records)

  def expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
- output = list(data_ids)
-
- grouped_by_dimensions: defaultdict[DimensionGroup, list[int]] = defaultdict(list)
- for i, data_id in enumerate(data_ids):
- if not data_id.hasRecords():
- grouped_by_dimensions[data_id.dimensions].append(i)
-
- if not grouped_by_dimensions:
- # All given DataCoordinate values are already expanded.
- return output
-
- attacher = DimensionDataAttacher(
- cache=self.dimension_record_cache,
- dimensions=DimensionGroup.union(*grouped_by_dimensions.keys(), universe=self.dimensions),
- )
- with self._query() as query:
- for dimensions, indexes in grouped_by_dimensions.items():
- expanded = attacher.attach(dimensions, (output[index] for index in indexes), query)
- for index, data_id in zip(indexes, expanded):
- output[index] = data_id
-
- return output
+ return expand_data_ids(data_ids, self.dimensions, self._query, self.dimension_record_cache)

  def expand_refs(self, dataset_refs: list[DatasetRef]) -> list[DatasetRef]:
  expanded_ids = self.expand_data_ids([ref.dataId for ref in dataset_refs])
lsst/daf/butler/remote_butler/_remote_butler.py

@@ -65,6 +65,7 @@ from ..dimensions import DataCoordinate, DataIdValue, DimensionConfig, Dimension
  from ..queries import Query
  from ..queries.tree import make_column_literal
  from ..registry import CollectionArgType, NoDefaultCollectionError, Registry, RegistryDefaults
+ from ..registry.expand_data_ids import expand_data_ids
  from ._collection_args import convert_collection_arg_to_glob_string_list
  from ._defaults import DefaultsHolder
  from ._get import convert_http_url_to_resource_path, get_dataset_as_python_object
@@ -633,7 +634,7 @@ class RemoteButler(Butler): # numpydoc ignore=PR02
  raise NotImplementedError()

  def transfer_dimension_records_from(
- self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
+ self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
  ) -> None:
  # Docstring inherited.
  raise NotImplementedError()
@@ -738,6 +739,9 @@ class RemoteButler(Butler): # numpydoc ignore=PR02
  def close(self) -> None:
  pass

+ def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
+ return expand_data_ids(data_ids, self.dimensions, self.query, None)
+
  @property
  def _file_transfer_source(self) -> RemoteFileTransferSource:
  return RemoteFileTransferSource(self._connection)
lsst/daf/butler/tests/hybrid_butler.py

@@ -338,7 +338,7 @@ class HybridButler(Butler):
  )

  def transfer_dimension_records_from(
- self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
+ self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
  ) -> None:
  return self._direct_butler.transfer_dimension_records_from(source_butler, source_refs)

@@ -425,6 +425,9 @@ class HybridButler(Butler):
  source_butler, data_ids, allowed_elements
  )

+ def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
+ return self._remote_butler._expand_data_ids(data_ids)
+
  @property
  def collection_chains(self) -> ButlerCollections:
  return HybridButlerCollections(self)