lsst-daf-butler 30.0.0rc3__py3-none-any.whl → 30.2025.5000__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. lsst/daf/butler/_butler.py +5 -8
  2. lsst/daf/butler/_butler_metrics.py +2 -49
  3. lsst/daf/butler/_formatter.py +9 -2
  4. lsst/daf/butler/_labeled_butler_factory.py +8 -28
  5. lsst/daf/butler/configs/datastores/formatters.yaml +0 -1
  6. lsst/daf/butler/configs/storageClasses.yaml +0 -15
  7. lsst/daf/butler/datastore/record_data.py +1 -1
  8. lsst/daf/butler/datastores/fileDatastore.py +0 -14
  9. lsst/daf/butler/dimensions/_coordinate.py +0 -5
  10. lsst/daf/butler/direct_butler/_direct_butler.py +28 -45
  11. lsst/daf/butler/logging.py +3 -9
  12. lsst/daf/butler/registry/bridge/monolithic.py +13 -17
  13. lsst/daf/butler/registry/datasets/byDimensions/_manager.py +45 -49
  14. lsst/daf/butler/registry/interfaces/_database.py +1 -6
  15. lsst/daf/butler/registry/sql_registry.py +24 -2
  16. lsst/daf/butler/remote_butler/_remote_butler.py +1 -5
  17. lsst/daf/butler/tests/hybrid_butler.py +1 -4
  18. lsst/daf/butler/transfers/_context.py +6 -7
  19. lsst/daf/butler/version.py +1 -1
  20. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/METADATA +1 -1
  21. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/RECORD +29 -32
  22. lsst/daf/butler/_rubin/temporary_for_ingest.py +0 -207
  23. lsst/daf/butler/registry/expand_data_ids.py +0 -93
  24. lsst/daf/butler/tests/registry_data/lsstcam-subset.yaml +0 -191
  25. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/WHEEL +0 -0
  26. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/entry_points.txt +0 -0
  27. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/licenses/COPYRIGHT +0 -0
  28. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/licenses/LICENSE +0 -0
  29. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/licenses/bsd_license.txt +0 -0
  30. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/licenses/gpl-v3.0.txt +0 -0
  31. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/top_level.txt +0 -0
  32. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/zip-safe +0 -0
lsst/daf/butler/_butler.py
@@ -1566,7 +1566,7 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
 
  @abstractmethod
  def transfer_dimension_records_from(
- self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
+ self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
  ) -> None:
  """Transfer dimension records to this Butler from another Butler.
 
@@ -1578,9 +1578,10 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
  `Butler` whose registry will be used to expand data IDs. If the
  source refs contain coordinates that are used to populate other
  records then this will also need to be a full `Butler`.
- source_refs : iterable of `DatasetRef` or `DataCoordinate`
- Datasets or data IDs defined in the source butler whose dimension
- records should be transferred to this butler.
+ source_refs : iterable of `DatasetRef`
+ Datasets defined in the source butler whose dimension records
+ should be transferred to this butler. In most circumstances.
+ transfer is faster if the dataset refs are expanded.
  """
  raise NotImplementedError()
 
@@ -2226,7 +2227,3 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
  @abstractmethod
  def close(self) -> None:
  raise NotImplementedError()
-
- @abstractmethod
- def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
- raise NotImplementedError()
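
The narrowed transfer_dimension_records_from signature above accepts only DatasetRef objects. A minimal usage sketch, assuming hypothetical source and target repositories and illustrative dataset type and collection names:

    # Hedged sketch only: repository paths, dataset type and collection names
    # are placeholders, not part of the diff.
    from lsst.daf.butler import Butler

    source = Butler.from_config("/repo/source")
    target = Butler.from_config("/repo/target", writeable=True)

    # Per the updated docstring, transfer is usually faster when the refs
    # carry their dimension records (are "expanded").
    refs = source.query_datasets("raw", collections="LSSTCam/raw/all")
    target.transfer_dimension_records_from(source, refs)
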
lsst/daf/butler/_butler_metrics.py
@@ -29,15 +29,12 @@ from __future__ import annotations
 
  from collections.abc import Callable, Iterator
  from contextlib import contextmanager
- from typing import Concatenate, ParamSpec
 
  from pydantic import BaseModel
 
  from lsst.utils.logging import LsstLoggers
  from lsst.utils.timer import time_this
 
- P = ParamSpec("P")
-
 
  class ButlerMetrics(BaseModel):
  """Metrics collected during Butler operations."""
@@ -48,26 +45,18 @@ class ButlerMetrics(BaseModel):
  time_in_get: float = 0.0
  """Wall-clock time, in seconds, spent in get()."""
 
- time_in_ingest: float = 0.0
- """Wall-clock time, in seconds, spent in ingest()."""
-
  n_get: int = 0
  """Number of datasets retrieved with get()."""
 
  n_put: int = 0
  """Number of datasets stored with put()."""
 
- n_ingest: int = 0
- """Number of datasets ingested."""
-
  def reset(self) -> None:
  """Reset all metrics."""
  self.time_in_put = 0.0
  self.time_in_get = 0.0
- self.time_in_ingest = 0.0
  self.n_get = 0
  self.n_put = 0
- self.n_ingest = 0
 
  def increment_get(self, duration: float) -> None:
  """Increment time for get().
@@ -91,31 +80,13 @@ class ButlerMetrics(BaseModel):
  self.time_in_put += duration
  self.n_put += 1
 
- def increment_ingest(self, duration: float, n_datasets: int) -> None:
- """Increment time and datasets for ingest().
-
- Parameters
- ----------
- duration : `float`
- Duration to add to the ingest() statistics.
- n_datasets : `int`
- Number of datasets to be ingested for this call.
- """
- self.time_in_ingest += duration
- self.n_ingest += n_datasets
-
  @contextmanager
  def _timer(
- self,
- handler: Callable[Concatenate[float, P], None],
- log: LsstLoggers | None = None,
- msg: str | None = None,
- *args: P.args,
- **kwargs: P.kwargs,
+ self, handler: Callable[[float], None], log: LsstLoggers | None = None, msg: str | None = None
  ) -> Iterator[None]:
  with time_this(log=log, msg=msg) as timer:
  yield
- handler(timer.duration, *args, **kwargs)
+ handler(timer.duration)
 
  @contextmanager
  def instrument_get(self, log: LsstLoggers | None = None, msg: str | None = None) -> Iterator[None]:
@@ -144,21 +115,3 @@ class ButlerMetrics(BaseModel):
  """
  with self._timer(self.increment_put, log=log, msg=msg):
  yield
-
- @contextmanager
- def instrument_ingest(
- self, n_datasets: int, log: LsstLoggers | None = None, msg: str | None = None
- ) -> Iterator[None]:
- """Run code and increment ingest statistics.
-
- Parameters
- ----------
- n_datasets : `int`
- Number of datasets being ingested.
- log : `logging.Logger` or `None`
- Logger to use for any timing information.
- msg : `str` or `None`
- Any message to be included in log output.
- """
- with self._timer(self.increment_ingest, n_datasets=n_datasets, log=log, msg=msg):
- yield
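
The _timer helper above loses its ParamSpec plumbing and takes a plain Callable[[float], None]. A standard-library sketch of that simplified pattern (not the ButlerMetrics class itself, which is a pydantic model delegating timing to lsst.utils.timer.time_this):

    # Sketch of the simplified handler signature using only the standard
    # library; class and attribute names are illustrative.
    import time
    from collections.abc import Callable, Iterator
    from contextlib import contextmanager


    class SketchMetrics:
        def __init__(self) -> None:
            self.time_in_get = 0.0
            self.n_get = 0

        @contextmanager
        def _timer(self, handler: Callable[[float], None]) -> Iterator[None]:
            start = time.perf_counter()
            yield
            handler(time.perf_counter() - start)  # handler now takes only the duration

        def increment_get(self, duration: float) -> None:
            self.time_in_get += duration
            self.n_get += 1

        @contextmanager
        def instrument_get(self) -> Iterator[None]:
            with self._timer(self.increment_get):
                yield


    metrics = SketchMetrics()
    with metrics.instrument_get():
        pass  # a butler.get() call would be timed here
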
lsst/daf/butler/_formatter.py
@@ -54,7 +54,6 @@ from ._config import Config
  from ._config_support import LookupKey, processLookupConfigs
  from ._file_descriptor import FileDescriptor
  from ._location import Location
- from ._rubin.temporary_for_ingest import TemporaryForIngest
  from .dimensions import DataCoordinate, DimensionUniverse
  from .mapping_factory import MappingFactory
 
@@ -1032,7 +1031,15 @@ class FormatterV2:
  """
  cache_manager = self._ensure_cache(cache_manager)
 
- with TemporaryForIngest.make_path(uri) as temporary_uri:
+ # Always write to a temporary even if
+ # using a local file system -- that gives us atomic writes.
+ # If a process is killed as the file is being written we do not
+ # want it to remain in the correct place but in corrupt state.
+ # For local files write to the output directory not temporary dir.
+ prefix = uri.dirname() if uri.isLocal else None
+ if prefix is not None:
+ prefix.mkdir()
+ with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
  # Need to configure the formatter to write to a different
  # location and that needs us to overwrite internals
  log.debug("Writing dataset to temporary location at %s", temporary_uri)
lsst/daf/butler/_labeled_butler_factory.py
@@ -30,9 +30,7 @@ from __future__ import annotations
  __all__ = ("LabeledButlerFactory", "LabeledButlerFactoryProtocol")
 
  from collections.abc import Mapping
- from contextlib import AbstractContextManager
- from logging import getLogger
- from typing import Any, Literal, Protocol, Self
+ from typing import Protocol
 
  from lsst.resources import ResourcePathExpression
 
@@ -42,8 +40,6 @@ from ._butler_repo_index import ButlerRepoIndex
  from ._utilities.named_locks import NamedLocks
  from ._utilities.thread_safe_cache import ThreadSafeCache
 
- _LOG = getLogger(__name__)
-
 
  class LabeledButlerFactoryProtocol(Protocol):
  """Callable to retrieve a butler from a label."""
@@ -51,7 +47,7 @@ class LabeledButlerFactoryProtocol(Protocol):
  def __call__(self, label: str) -> Butler: ...
 
 
- class LabeledButlerFactory(AbstractContextManager):
+ class LabeledButlerFactory:
  """Factory for efficiently instantiating Butler instances from the
  repository index file. This is intended for use from long-lived services
  that want to instantiate a separate Butler instance for each end user
@@ -64,9 +60,6 @@ class LabeledButlerFactory(AbstractContextManager):
  files. If not provided, defaults to the global repository index
  configured by the ``DAF_BUTLER_REPOSITORY_INDEX`` environment variable
  -- see `ButlerRepoIndex`.
- writeable : `bool`, optional
- If `True`, Butler instances created by this factory will be writeable.
- If `False` (the default), instances will be read-only.
 
  Notes
  -----
@@ -83,12 +76,11 @@ class LabeledButlerFactory(AbstractContextManager):
  safely be used by separate threads.
  """
 
- def __init__(self, repositories: Mapping[str, str] | None = None, writeable: bool = False) -> None:
+ def __init__(self, repositories: Mapping[str, str] | None = None) -> None:
  if repositories is None:
  self._repositories = None
  else:
  self._repositories = dict(repositories)
- self._writeable = writeable
 
  self._factories = ThreadSafeCache[str, _ButlerFactory]()
  self._initialization_locks = NamedLocks()
@@ -96,16 +88,6 @@ class LabeledButlerFactory(AbstractContextManager):
  # This may be overridden by unit tests.
  self._preload_unsafe_direct_butler_caches = True
 
- def __enter__(self) -> Self:
- return self
-
- def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> Literal[False]:
- try:
- self.close()
- except Exception:
- _LOG.exception("An exception occurred during LabeledButlerFactory.close()")
- return False
-
  def bind(self, access_token: str | None) -> LabeledButlerFactoryProtocol:
  """Create a callable factory function for generating Butler instances
  with out needing to specify access tokans again.
@@ -127,7 +109,7 @@ class LabeledButlerFactory(AbstractContextManager):
 
  return create
 
- def create_butler(self, label: str, *, access_token: str | None = None) -> Butler:
+ def create_butler(self, *, label: str, access_token: str | None) -> Butler:
  """Create a Butler instance.
 
  Parameters
@@ -136,7 +118,7 @@ class LabeledButlerFactory(AbstractContextManager):
  Label of the repository to instantiate, from the ``repositories``
  parameter to the `LabeledButlerFactory` constructor or the global
  repository index file.
- access_token : `str` | `None`, optional
+ access_token : `str` | `None`
  Gafaelfawr access token used to authenticate to a Butler server.
  This is required for any repositories configured to use
  `RemoteButler`. If you only use `DirectButler`, this may be
@@ -185,9 +167,7 @@ class LabeledButlerFactory(AbstractContextManager):
 
  match butler_type:
  case ButlerType.DIRECT:
- return _DirectButlerFactory(
- config, self._preload_unsafe_direct_butler_caches, self._writeable
- )
+ return _DirectButlerFactory(config, self._preload_unsafe_direct_butler_caches)
  case ButlerType.REMOTE:
  return _RemoteButlerFactory(config)
  case _:
@@ -209,12 +189,12 @@ class _ButlerFactory(Protocol):
 
 
  class _DirectButlerFactory(_ButlerFactory):
- def __init__(self, config: ButlerConfig, preload_unsafe_caches: bool, writeable: bool) -> None:
+ def __init__(self, config: ButlerConfig, preload_unsafe_caches: bool) -> None:
  import lsst.daf.butler.direct_butler
 
  # Create a 'template' Butler that will be cloned when callers request
  # an instance.
- self._butler = Butler.from_config(config, writeable=writeable)
+ self._butler = Butler.from_config(config)
  assert isinstance(self._butler, lsst.daf.butler.direct_butler.DirectButler)
 
  # Load caches so that data is available in cloned instances without
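
Putting the new LabeledButlerFactory signatures together, a hedged usage sketch; the repository label and index URL are hypothetical, and only the call signatures come from the hunks above:

    # Hypothetical label and config URL.
    from lsst.daf.butler import LabeledButlerFactory

    factory = LabeledButlerFactory({"embargo": "https://example.org/repo/embargo/butler.yaml"})

    # Both arguments are keyword-only in the new create_butler() signature;
    # None is acceptable for DirectButler repositories needing no token.
    butler = factory.create_butler(label="embargo", access_token=None)

    # Service pattern: bind a per-request access token once, then look up
    # repositories by label.
    bound = factory.bind(access_token=None)
    butler = bound("embargo")
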
lsst/daf/butler/configs/datastores/formatters.yaml
@@ -100,4 +100,3 @@ VisitBackgroundModel: lsst.daf.butler.formatters.json.JsonFormatter
  VignettingCorrection: lsst.ts.observatory.control.utils.extras.vignetting_storage.VignettingCorrectionFormatter
  SSPAuxiliaryFile: lsst.pipe.tasks.sspAuxiliaryFile.SSPAuxiliaryFileFormatter
  VisitGeometry: lsst.daf.butler.formatters.json.JsonFormatter
- ProvenanceQuantumGraph: lsst.pipe.base.quantum_graph.formatter.ProvenanceFormatter
lsst/daf/butler/configs/storageClasses.yaml
@@ -443,18 +443,3 @@ storageClasses:
  pytype: lsst.pipe.tasks.sspAuxiliaryFile.SSPAuxiliaryFile
  VisitGeometry:
  pytype: lsst.obs.base.visit_geometry.VisitGeometry
- ProvenanceQuantumGraph:
- pytype: lsst.pipe.base.quantum_graph.ProvenanceQuantumGraph
- parameters:
- - import_mode # lsst.pipe.base.pipeline_graph.TaskImportMode
- - quanta # iterable of uuid.UUID; quanta to read
- - datasets # iterable of uuid.UUID; datasets to read
- - read_init_quanta # bool, defaults to True; whether to read pre-exec-init info
- derivedComponents:
- packages: Packages # ignores node parameters
-
- # UUID keys can be quantum or data IDs (whichever is passed in via
- # parameters). Nested lists are attempts to run the quantum (last is
- # most recent).
- logs: StructuredDataDict # dict[uuid.UUID, list[ButlerLogRecords]]
- metadata: StructuredDataDict # dict[uuid.UUID, list[TaskMetadata]]
lsst/daf/butler/datastore/record_data.py
@@ -49,7 +49,7 @@ if TYPE_CHECKING:
  # Pydantic requires the possible value types to be explicitly enumerated in
  # order for `uuid.UUID` in particular to work. `typing.Any` does not work
  # here.
- _Record: TypeAlias = dict[str, int | str | None]
+ _Record: TypeAlias = dict[str, int | str | uuid.UUID | None]
 
 
  class SerializedDatastoreRecordData(pydantic.BaseModel):
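
The retained comment explains why uuid.UUID must be listed explicitly in the value union. A self-contained pydantic sketch of the widened alias (not the actual SerializedDatastoreRecordData model):

    # Demonstrates the widened value union only; model and field names here
    # are illustrative.
    import uuid
    from typing import TypeAlias

    import pydantic

    _Record: TypeAlias = dict[str, int | str | uuid.UUID | None]


    class DemoRecord(pydantic.BaseModel):
        record: _Record


    demo = DemoRecord(record={"dataset_id": uuid.uuid4(), "path": "a/b.fits", "size": 10, "checksum": None})
    assert isinstance(demo.record["dataset_id"], uuid.UUID)  # UUID survives validation
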
lsst/daf/butler/datastores/fileDatastore.py
@@ -3166,20 +3166,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
 
  def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
  # Docstring inherited from the base class.
-
- # This call to 'bridge.check' filters out "partially deleted" datasets.
- # Specifically, ones in the unusual edge state that:
- # 1. They have an entry in the registry dataset tables
- # 2. They were "trashed" from the datastore, so they are not
- # present in the "dataset_location" table.)
- # 3. But the trash has not been "emptied", so there are still entries
- # in the "opaque" datastore records table.
- #
- # As far as I can tell, this can only occur in the case of a concurrent
- # or aborted call to `Butler.pruneDatasets(unstore=True, purge=False)`.
- # Datasets (with or without files existing on disk) can persist in
- # this zombie state indefinitely, until someone manually empties
- # the trash.
  exported_refs = list(self._bridge.check(refs))
  ids = {ref.id for ref in exported_refs}
  records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
lsst/daf/butler/dimensions/_coordinate.py
@@ -755,11 +755,6 @@ class DataCoordinate:
  to_json = to_json_pydantic
  from_json: ClassVar[Callable[..., Self]] = cast(Callable[..., Self], classmethod(from_json_pydantic))
 
- @property
- def dataId(self) -> Self:
- """Return this `DataCoordinate` instance, unmodified."""
- return self
-
 
  DataId = DataCoordinate | Mapping[str, Any]
  """A type-annotation alias for signatures that accept both informal data ID
lsst/daf/butler/direct_butler/_direct_butler.py
@@ -1822,25 +1822,12 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  f" Example: {existing_datasets[0]}"
  )
 
- # Calculate some statistics based on the given list of datasets.
- n_files = len(datasets)
- n_datasets = 0
- for d in datasets:
- n_datasets += len(d.refs)
- sfiles = "s" if n_files != 1 else ""
- srefs = "s" if n_datasets != 1 else ""
-
  # We use `datasets` rather `new_datasets` for the Registry
  # portion of this, to let it confirm that everything matches the
  # existing datasets.
  import_info = self._prepare_ingest_file_datasets(datasets, progress)
 
- with (
- self._metrics.instrument_ingest(
- n_datasets, _LOG, msg=f"Ingesting {n_files} file{sfiles} with {n_datasets} dataset{srefs}"
- ),
- self.transaction(),
- ):
+ with self.transaction():
  self._ingest_file_datasets(datasets, import_info, progress)
 
  # Bulk-insert everything into Datastore.
@@ -1995,7 +1982,7 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  doImport(filename) # type: ignore
 
  def transfer_dimension_records_from(
- self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
+ self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
  ) -> None:
  # Allowed dimensions in the target butler.
  elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
@@ -2025,13 +2012,16 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  source_butler, data_ids, allowed_elements
  )
 
+ can_query = True if isinstance(source_butler, Butler) else False
+
  additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
  for original_element, record_mapping in primary_records.items():
  # Get dimensions that depend on this dimension.
  populated_by = self.dimensions.get_elements_populated_by(
  self.dimensions[original_element.name] # type: ignore
  )
- if populated_by:
+
+ for data_id in record_mapping.keys():
  for element in populated_by:
  if element not in allowed_elements:
  continue
@@ -2050,32 +2040,28 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  # have to be scanned.
  continue
 
- if record_mapping:
- if not isinstance(source_butler, Butler):
- raise RuntimeError(
- f"Transferring populated_by records like {element.name}"
- " requires a full Butler."
- )
+ if not can_query:
+ raise RuntimeError(
+ f"Transferring populated_by records like {element.name} requires a full Butler."
+ )
 
- with source_butler.query() as query:
- records = query.join_data_coordinates(record_mapping.keys()).dimension_records(
- element.name
- )
- for record in records:
- additional_records[record.definition].setdefault(record.dataId, record)
+ records = source_butler.query_dimension_records( # type: ignore
+ element.name,
+ explain=False,
+ **data_id.mapping, # type: ignore
+ )
+ for record in records:
+ additional_records[record.definition].setdefault(record.dataId, record)
 
  # The next step is to walk back through the additional records to
  # pick up any missing content (such as visit_definition needing to
  # know the exposure). Want to ensure we do not request records we
  # already have.
  missing_data_ids = set()
- for record_mapping in additional_records.values():
+ for name, record_mapping in additional_records.items():
  for data_id in record_mapping.keys():
- for dimension in data_id.dimensions.required:
- element = source_butler.dimensions[dimension]
- dimension_key = data_id.subset(dimension)
- if dimension_key not in primary_records[element]:
- missing_data_ids.add(dimension_key)
+ if data_id not in primary_records[name]:
+ missing_data_ids.add(data_id)
 
  # Fill out the new records. Assume that these new records do not
  # also need to carry over additional populated_by records.
@@ -2092,19 +2078,19 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  def _extract_dimension_records_from_data_ids(
  self,
  source_butler: LimitedButler | Butler,
- data_ids: Iterable[DataCoordinate],
+ data_ids: set[DataCoordinate],
  allowed_elements: frozenset[DimensionElement],
  ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
  dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
 
- data_ids = set(data_ids)
- if not all(data_id.hasRecords() for data_id in data_ids):
- if isinstance(source_butler, Butler):
- data_ids = source_butler._expand_data_ids(data_ids)
- else:
- raise TypeError("Input butler needs to be a full butler to expand DataId.")
-
  for data_id in data_ids:
+ # Need an expanded record, if not expanded that we need a full
+ # butler with registry (allow mocks with registry too).
+ if not data_id.hasRecords():
+ if registry := getattr(source_butler, "registry", None):
+ data_id = registry.expandDataId(data_id)
+ else:
+ raise TypeError("Input butler needs to be a full butler to expand DataId.")
  # If this butler doesn't know about a dimension in the source
  # butler things will break later.
  for element_name in data_id.dimensions.elements:
@@ -2583,9 +2569,6 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  """Immediately load caches that are used for common operations."""
  self._registry.preload_cache(load_dimension_record_cache=load_dimension_record_cache)
 
- def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
- return self._registry.expand_data_ids(data_ids)
-
  _config: ButlerConfig
  """Configuration for this Butler instance."""
 
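The rewritten _extract_dimension_records_from_data_ids falls back to the source butler's registry when a data ID arrives without its dimension records. A hedged sketch of that guard in isolation; the repository path and data ID values are placeholders:

    # Sketch of the expansion guard used above.
    from lsst.daf.butler import Butler, DataCoordinate

    source = Butler.from_config("/repo/source")
    data_id = DataCoordinate.standardize(
        {"instrument": "LSSTCam", "exposure": 2025042800123}, universe=source.dimensions
    )
    if not data_id.hasRecords():
        # A full Butler exposes .registry; a LimitedButler does not, which is
        # why the new code raises TypeError in that case.
        data_id = source.registry.expandDataId(data_id)
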
lsst/daf/butler/logging.py
@@ -764,17 +764,11 @@ class ButlerLogRecords(MutableSequence[ButlerLogRecord]):
 
 
  class ButlerLogRecordHandler(StreamHandler):
- """Python log handler that accumulates records.
+ """Python log handler that accumulates records."""
 
- Parameters
- ----------
- records : `ButlerLogRecords`, optional
- Container to store logs in.
- """
-
- def __init__(self, records: ButlerLogRecords | None = None) -> None:
+ def __init__(self) -> None:
  super().__init__()
- self.records = ButlerLogRecords([]) if records is None else records
+ self.records = ButlerLogRecords([])
 
  def emit(self, record: LogRecord) -> None:
  self.records.append(record)
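
With the simplified constructor above, ButlerLogRecordHandler always owns its ButlerLogRecords container. A short usage sketch; the logger name is arbitrary:

    import logging

    from lsst.daf.butler.logging import ButlerLogRecordHandler

    handler = ButlerLogRecordHandler()
    log = logging.getLogger("example.capture")
    log.addHandler(handler)

    log.warning("captured into handler.records via emit()")
    print(len(handler.records))

    log.removeHandler(handler)
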
lsst/daf/butler/registry/bridge/monolithic.py
@@ -215,24 +215,20 @@ class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
  def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
  # Docstring inherited from DatastoreRegistryBridge
  byId = {ref.id: ref for ref in refs}
- found: list[DatasetIdRef] = []
- with self._db.session():
- for batch in chunk_iterable(byId.keys(), 50000):
- sql = (
- sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
- .select_from(self._tables.dataset_location)
- .where(
- sqlalchemy.sql.and_(
- self._tables.dataset_location.columns.datastore_name == self.datastoreName,
- self._tables.dataset_location.columns.dataset_id.in_(batch),
- )
- )
+ sql = (
+ sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
+ .select_from(self._tables.dataset_location)
+ .where(
+ sqlalchemy.sql.and_(
+ self._tables.dataset_location.columns.datastore_name == self.datastoreName,
+ self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
  )
- with self._db.query(sql) as sql_result:
- sql_ids = sql_result.scalars().all()
- found.extend(byId[id] for id in sql_ids)
-
- return found
+ )
+ )
+ with self._db.query(sql) as sql_result:
+ sql_rows = sql_result.fetchall()
+ for row in sql_rows:
+ yield byId[row.dataset_id]
 
  @contextmanager
  def emptyTrash(
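
Both this hunk and the _manager.py changes below drop chunk_iterable batching in favour of a single IN clause. For reference, a generic sketch of the removed batching pattern; the keys and chunk size are illustrative and no database access is shown:

    from lsst.utils.iteration import chunk_iterable

    dataset_ids = [f"uuid-{i}" for i in range(120_000)]  # stand-in for real UUIDs

    for batch in chunk_iterable(dataset_ids, 50_000):
        # Each batch would drive one `... WHERE dataset_id IN (:batch)` query,
        # keeping the IN expression below database limits.
        print(len(batch))
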
lsst/daf/butler/registry/datasets/byDimensions/_manager.py
@@ -12,8 +12,6 @@ from typing import TYPE_CHECKING, Any, ClassVar
  import astropy.time
  import sqlalchemy
 
- from lsst.utils.iteration import chunk_iterable
-
  from .... import ddl
  from ...._collection_type import CollectionType
  from ...._dataset_ref import DatasetId, DatasetIdFactory, DatasetIdGenEnum, DatasetRef
@@ -426,18 +424,17 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
  return result
 
  def get_dataset_refs(self, ids: list[DatasetId]) -> list[DatasetRef]:
- dataset_type_map: dict[DatasetId, DatasetType] = {}
- for batch in chunk_iterable(set(ids), 50000):
- # Look up the dataset types corresponding to the given Dataset IDs.
- id_col = self._static.dataset.columns["id"]
- sql = sqlalchemy.sql.select(
- id_col,
- self._static.dataset.columns["dataset_type_id"],
- ).where(id_col.in_(batch))
- with self._db.query(sql) as sql_result:
- dataset_rows = sql_result.mappings().all()
- for row in dataset_rows:
- dataset_type_map[row["id"]] = self._get_dataset_type_by_id(row["dataset_type_id"])
+ # Look up the dataset types corresponding to the given Dataset IDs.
+ id_col = self._static.dataset.columns["id"]
+ sql = sqlalchemy.sql.select(
+ id_col,
+ self._static.dataset.columns["dataset_type_id"],
+ ).where(id_col.in_(ids))
+ with self._db.query(sql) as sql_result:
+ dataset_rows = sql_result.mappings().all()
+ dataset_type_map: dict[DatasetId, DatasetType] = {
+ row["id"]: self._get_dataset_type_by_id(row["dataset_type_id"]) for row in dataset_rows
+ }
 
  # Group the given dataset IDs by the DimensionGroup of their dataset
  # types -- there is a separate tags table for each DimensionGroup.
@@ -451,41 +448,40 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
  # data IDs corresponding to the UUIDs found from the dataset table.
  dynamic_tables = self._get_dynamic_tables(dimension_group)
  tags_table = self._get_tags_table(dynamic_tables)
- for batch in chunk_iterable(datasets, 50000):
- tags_sql = tags_table.select().where(tags_table.columns["dataset_id"].in_(batch))
- # Join in the collection table to fetch the run name.
- collection_column = tags_table.columns[self._collections.getCollectionForeignKeyName()]
- joined_collections = self._collections.join_collections_sql(collection_column, tags_sql)
- tags_sql = joined_collections.joined_sql
- run_name_column = joined_collections.name_column
- tags_sql = tags_sql.add_columns(run_name_column)
- # Tags table includes run collections and tagged
- # collections.
- # In theory the data ID for a given dataset should be the
- # same in both, but nothing actually guarantees this.
- # So skip any tagged collections, using the run collection
- # as the definitive definition.
- tags_sql = tags_sql.where(joined_collections.type_column == int(CollectionType.RUN))
-
- with self._db.query(tags_sql) as sql_result:
- data_id_rows = sql_result.mappings().all()
-
- assert run_name_column.key is not None
- for data_id_row in data_id_rows:
- id = data_id_row["dataset_id"]
- dataset_type = dataset_type_map[id]
- run_name = data_id_row[run_name_column.key]
- data_id = DataCoordinate.from_required_values(
- dimension_group,
- tuple(data_id_row[dimension] for dimension in dimension_group.required),
- )
- ref = DatasetRef(
- datasetType=dataset_type,
- dataId=data_id,
- id=id,
- run=run_name,
- )
- output_refs.append(ref)
+ tags_sql = tags_table.select().where(tags_table.columns["dataset_id"].in_(datasets))
+ # Join in the collection table to fetch the run name.
+ collection_column = tags_table.columns[self._collections.getCollectionForeignKeyName()]
+ joined_collections = self._collections.join_collections_sql(collection_column, tags_sql)
+ tags_sql = joined_collections.joined_sql
+ run_name_column = joined_collections.name_column
+ tags_sql = tags_sql.add_columns(run_name_column)
+ # Tags table includes run collections and tagged
+ # collections.
+ # In theory the data ID for a given dataset should be the
+ # same in both, but nothing actually guarantees this.
+ # So skip any tagged collections, using the run collection
+ # as the definitive definition.
+ tags_sql = tags_sql.where(joined_collections.type_column == int(CollectionType.RUN))
+
+ with self._db.query(tags_sql) as sql_result:
+ data_id_rows = sql_result.mappings().all()
+
+ assert run_name_column.key is not None
+ for data_id_row in data_id_rows:
+ id = data_id_row["dataset_id"]
+ dataset_type = dataset_type_map[id]
+ run_name = data_id_row[run_name_column.key]
+ data_id = DataCoordinate.from_required_values(
+ dimension_group,
+ tuple(data_id_row[dimension] for dimension in dimension_group.required),
+ )
+ ref = DatasetRef(
+ datasetType=dataset_type,
+ dataId=data_id,
+ id=id,
+ run=run_name,
+ )
+ output_refs.append(ref)
 
  return output_refs