lsst-daf-butler 30.0.0rc3__py3-none-any.whl → 30.2025.5000__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. lsst/daf/butler/_butler.py +5 -8
  2. lsst/daf/butler/_butler_metrics.py +2 -49
  3. lsst/daf/butler/_formatter.py +9 -2
  4. lsst/daf/butler/_labeled_butler_factory.py +8 -28
  5. lsst/daf/butler/configs/datastores/formatters.yaml +0 -1
  6. lsst/daf/butler/configs/storageClasses.yaml +0 -15
  7. lsst/daf/butler/datastore/record_data.py +1 -1
  8. lsst/daf/butler/datastores/fileDatastore.py +0 -14
  9. lsst/daf/butler/dimensions/_coordinate.py +0 -5
  10. lsst/daf/butler/direct_butler/_direct_butler.py +28 -45
  11. lsst/daf/butler/logging.py +3 -9
  12. lsst/daf/butler/registry/bridge/monolithic.py +13 -17
  13. lsst/daf/butler/registry/datasets/byDimensions/_manager.py +45 -49
  14. lsst/daf/butler/registry/interfaces/_database.py +1 -6
  15. lsst/daf/butler/registry/sql_registry.py +24 -2
  16. lsst/daf/butler/remote_butler/_remote_butler.py +1 -5
  17. lsst/daf/butler/tests/hybrid_butler.py +1 -4
  18. lsst/daf/butler/transfers/_context.py +6 -7
  19. lsst/daf/butler/version.py +1 -1
  20. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/METADATA +1 -1
  21. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/RECORD +29 -32
  22. lsst/daf/butler/_rubin/temporary_for_ingest.py +0 -207
  23. lsst/daf/butler/registry/expand_data_ids.py +0 -93
  24. lsst/daf/butler/tests/registry_data/lsstcam-subset.yaml +0 -191
  25. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/WHEEL +0 -0
  26. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/entry_points.txt +0 -0
  27. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/licenses/COPYRIGHT +0 -0
  28. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/licenses/LICENSE +0 -0
  29. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/licenses/bsd_license.txt +0 -0
  30. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/licenses/gpl-v3.0.txt +0 -0
  31. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/top_level.txt +0 -0
  32. {lsst_daf_butler-30.0.0rc3.dist-info → lsst_daf_butler-30.2025.5000.dist-info}/zip-safe +0 -0
lsst/daf/butler/_butler.py
@@ -1566,7 +1566,7 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
 
  @abstractmethod
  def transfer_dimension_records_from(
- self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
+ self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
  ) -> None:
  """Transfer dimension records to this Butler from another Butler.
 
@@ -1578,9 +1578,10 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
  `Butler` whose registry will be used to expand data IDs. If the
  source refs contain coordinates that are used to populate other
  records then this will also need to be a full `Butler`.
- source_refs : iterable of `DatasetRef` or `DataCoordinate`
- Datasets or data IDs defined in the source butler whose dimension
- records should be transferred to this butler.
+ source_refs : iterable of `DatasetRef`
+ Datasets defined in the source butler whose dimension records
+ should be transferred to this butler. In most circumstances.
+ transfer is faster if the dataset refs are expanded.
  """
  raise NotImplementedError()
 
@@ -2226,7 +2227,3 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
  @abstractmethod
  def close(self) -> None:
  raise NotImplementedError()
-
- @abstractmethod
- def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
- raise NotImplementedError()
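
The narrowed transfer_dimension_records_from signature above accepts only DatasetRef objects. A minimal usage sketch, assuming hypothetical source and target repositories and illustrative dataset type and collection names:

    # Hedged sketch only: repository paths, dataset type and collection names
    # are placeholders, not part of the diff.
    from lsst.daf.butler import Butler

    source = Butler.from_config("/repo/source")
    target = Butler.from_config("/repo/target", writeable=True)

    # Per the updated docstring, transfer is usually faster when the refs
    # carry their dimension records (are "expanded").
    refs = source.query_datasets("raw", collections="LSSTCam/raw/all")
    target.transfer_dimension_records_from(source, refs)
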
lsst/daf/butler/_butler_metrics.py
@@ -29,15 +29,12 @@ from __future__ import annotations
 
  from collections.abc import Callable, Iterator
  from contextlib import contextmanager
- from typing import Concatenate, ParamSpec
 
  from pydantic import BaseModel
 
  from lsst.utils.logging import LsstLoggers
  from lsst.utils.timer import time_this
 
- P = ParamSpec("P")
-
 
  class ButlerMetrics(BaseModel):
  """Metrics collected during Butler operations."""
@@ -48,26 +45,18 @@ class ButlerMetrics(BaseModel):
  time_in_get: float = 0.0
  """Wall-clock time, in seconds, spent in get()."""
 
- time_in_ingest: float = 0.0
- """Wall-clock time, in seconds, spent in ingest()."""
-
  n_get: int = 0
  """Number of datasets retrieved with get()."""
 
  n_put: int = 0
  """Number of datasets stored with put()."""
 
- n_ingest: int = 0
- """Number of datasets ingested."""
-
  def reset(self) -> None:
  """Reset all metrics."""
  self.time_in_put = 0.0
  self.time_in_get = 0.0
- self.time_in_ingest = 0.0
  self.n_get = 0
  self.n_put = 0
- self.n_ingest = 0
 
  def increment_get(self, duration: float) -> None:
  """Increment time for get().
@@ -91,31 +80,13 @@ class ButlerMetrics(BaseModel):
  self.time_in_put += duration
  self.n_put += 1
 
- def increment_ingest(self, duration: float, n_datasets: int) -> None:
- """Increment time and datasets for ingest().
-
- Parameters
- ----------
- duration : `float`
- Duration to add to the ingest() statistics.
- n_datasets : `int`
- Number of datasets to be ingested for this call.
- """
- self.time_in_ingest += duration
- self.n_ingest += n_datasets
-
  @contextmanager
  def _timer(
- self,
- handler: Callable[Concatenate[float, P], None],
- log: LsstLoggers | None = None,
- msg: str | None = None,
- *args: P.args,
- **kwargs: P.kwargs,
+ self, handler: Callable[[float], None], log: LsstLoggers | None = None, msg: str | None = None
  ) -> Iterator[None]:
  with time_this(log=log, msg=msg) as timer:
  yield
- handler(timer.duration, *args, **kwargs)
+ handler(timer.duration)
 
  @contextmanager
  def instrument_get(self, log: LsstLoggers | None = None, msg: str | None = None) -> Iterator[None]:
@@ -144,21 +115,3 @@ class ButlerMetrics(BaseModel):
  """
  with self._timer(self.increment_put, log=log, msg=msg):
  yield
-
- @contextmanager
- def instrument_ingest(
- self, n_datasets: int, log: LsstLoggers | None = None, msg: str | None = None
- ) -> Iterator[None]:
- """Run code and increment ingest statistics.
-
- Parameters
- ----------
- n_datasets : `int`
- Number of datasets being ingested.
- log : `logging.Logger` or `None`
- Logger to use for any timing information.
- msg : `str` or `None`
- Any message to be included in log output.
- """
- with self._timer(self.increment_ingest, n_datasets=n_datasets, log=log, msg=msg):
- yield
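
The _timer helper above loses its ParamSpec plumbing and takes a plain Callable[[float], None]. A standard-library sketch of that simplified pattern (not the ButlerMetrics class itself, which is a pydantic model delegating timing to lsst.utils.timer.time_this):

    # Sketch of the simplified handler signature using only the standard
    # library; class and attribute names are illustrative.
    import time
    from collections.abc import Callable, Iterator
    from contextlib import contextmanager


    class SketchMetrics:
        def __init__(self) -> None:
            self.time_in_get = 0.0
            self.n_get = 0

        @contextmanager
        def _timer(self, handler: Callable[[float], None]) -> Iterator[None]:
            start = time.perf_counter()
            yield
            handler(time.perf_counter() - start)  # handler now takes only the duration

        def increment_get(self, duration: float) -> None:
            self.time_in_get += duration
            self.n_get += 1

        @contextmanager
        def instrument_get(self) -> Iterator[None]:
            with self._timer(self.increment_get):
                yield


    metrics = SketchMetrics()
    with metrics.instrument_get():
        pass  # a butler.get() call would be timed here
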
lsst/daf/butler/_formatter.py
@@ -54,7 +54,6 @@ from ._config import Config
  from ._config_support import LookupKey, processLookupConfigs
  from ._file_descriptor import FileDescriptor
  from ._location import Location
- from ._rubin.temporary_for_ingest import TemporaryForIngest
  from .dimensions import DataCoordinate, DimensionUniverse
  from .mapping_factory import MappingFactory
 
@@ -1032,7 +1031,15 @@ class FormatterV2:
  """
  cache_manager = self._ensure_cache(cache_manager)
 
- with TemporaryForIngest.make_path(uri) as temporary_uri:
+ # Always write to a temporary even if
+ # using a local file system -- that gives us atomic writes.
+ # If a process is killed as the file is being written we do not
+ # want it to remain in the correct place but in corrupt state.
+ # For local files write to the output directory not temporary dir.
+ prefix = uri.dirname() if uri.isLocal else None
+ if prefix is not None:
+ prefix.mkdir()
+ with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
  # Need to configure the formatter to write to a different
  # location and that needs us to overwrite internals
  log.debug("Writing dataset to temporary location at %s", temporary_uri)
lsst/daf/butler/_labeled_butler_factory.py
@@ -30,9 +30,7 @@ from __future__ import annotations
  __all__ = ("LabeledButlerFactory", "LabeledButlerFactoryProtocol")
 
  from collections.abc import Mapping
- from contextlib import AbstractContextManager
- from logging import getLogger
- from typing import Any, Literal, Protocol, Self
+ from typing import Protocol
 
  from lsst.resources import ResourcePathExpression
 
@@ -42,8 +40,6 @@ from ._butler_repo_index import ButlerRepoIndex
  from ._utilities.named_locks import NamedLocks
  from ._utilities.thread_safe_cache import ThreadSafeCache
 
- _LOG = getLogger(__name__)
-
 
  class LabeledButlerFactoryProtocol(Protocol):
  """Callable to retrieve a butler from a label."""
@@ -51,7 +47,7 @@ class LabeledButlerFactoryProtocol(Protocol):
  def __call__(self, label: str) -> Butler: ...
 
 
- class LabeledButlerFactory(AbstractContextManager):
+ class LabeledButlerFactory:
  """Factory for efficiently instantiating Butler instances from the
  repository index file. This is intended for use from long-lived services
  that want to instantiate a separate Butler instance for each end user
@@ -64,9 +60,6 @@ class LabeledButlerFactory(AbstractContextManager):
  files. If not provided, defaults to the global repository index
  configured by the ``DAF_BUTLER_REPOSITORY_INDEX`` environment variable
  -- see `ButlerRepoIndex`.
- writeable : `bool`, optional
- If `True`, Butler instances created by this factory will be writeable.
- If `False` (the default), instances will be read-only.
 
  Notes
  -----
@@ -83,12 +76,11 @@ class LabeledButlerFactory(AbstractContextManager):
  safely be used by separate threads.
  """
 
- def __init__(self, repositories: Mapping[str, str] | None = None, writeable: bool = False) -> None:
+ def __init__(self, repositories: Mapping[str, str] | None = None) -> None:
  if repositories is None:
  self._repositories = None
  else:
  self._repositories = dict(repositories)
- self._writeable = writeable
 
  self._factories = ThreadSafeCache[str, _ButlerFactory]()
  self._initialization_locks = NamedLocks()
@@ -96,16 +88,6 @@ class LabeledButlerFactory(AbstractContextManager):
  # This may be overridden by unit tests.
  self._preload_unsafe_direct_butler_caches = True
 
- def __enter__(self) -> Self:
- return self
-
- def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> Literal[False]:
- try:
- self.close()
- except Exception:
- _LOG.exception("An exception occurred during LabeledButlerFactory.close()")
- return False
-
  def bind(self, access_token: str | None) -> LabeledButlerFactoryProtocol:
  """Create a callable factory function for generating Butler instances
  with out needing to specify access tokans again.
@@ -127,7 +109,7 @@ class LabeledButlerFactory(AbstractContextManager):
 
  return create
 
- def create_butler(self, label: str, *, access_token: str | None = None) -> Butler:
+ def create_butler(self, *, label: str, access_token: str | None) -> Butler:
  """Create a Butler instance.
 
  Parameters
@@ -136,7 +118,7 @@ class LabeledButlerFactory(AbstractContextManager):
  Label of the repository to instantiate, from the ``repositories``
  parameter to the `LabeledButlerFactory` constructor or the global
  repository index file.
- access_token : `str` | `None`, optional
+ access_token : `str` | `None`
  Gafaelfawr access token used to authenticate to a Butler server.
  This is required for any repositories configured to use
  `RemoteButler`. If you only use `DirectButler`, this may be
@@ -185,9 +167,7 @@ class LabeledButlerFactory(AbstractContextManager):
 
  match butler_type:
  case ButlerType.DIRECT:
- return _DirectButlerFactory(
- config, self._preload_unsafe_direct_butler_caches, self._writeable
- )
+ return _DirectButlerFactory(config, self._preload_unsafe_direct_butler_caches)
  case ButlerType.REMOTE:
  return _RemoteButlerFactory(config)
  case _:
@@ -209,12 +189,12 @@ class _ButlerFactory(Protocol):
 
 
  class _DirectButlerFactory(_ButlerFactory):
- def __init__(self, config: ButlerConfig, preload_unsafe_caches: bool, writeable: bool) -> None:
+ def __init__(self, config: ButlerConfig, preload_unsafe_caches: bool) -> None:
  import lsst.daf.butler.direct_butler
 
  # Create a 'template' Butler that will be cloned when callers request
  # an instance.
- self._butler = Butler.from_config(config, writeable=writeable)
+ self._butler = Butler.from_config(config)
  assert isinstance(self._butler, lsst.daf.butler.direct_butler.DirectButler)
 
  # Load caches so that data is available in cloned instances without
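
Putting the new LabeledButlerFactory signatures together, a hedged usage sketch; the repository label and index URL are hypothetical, and only the call signatures come from the hunks above:

    # Hypothetical label and config URL.
    from lsst.daf.butler import LabeledButlerFactory

    factory = LabeledButlerFactory({"embargo": "https://example.org/repo/embargo/butler.yaml"})

    # Both arguments are keyword-only in the new create_butler() signature;
    # None is acceptable for DirectButler repositories needing no token.
    butler = factory.create_butler(label="embargo", access_token=None)

    # Service pattern: bind a per-request access token once, then look up
    # repositories by label.
    bound = factory.bind(access_token=None)
    butler = bound("embargo")
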
lsst/daf/butler/configs/datastores/formatters.yaml
@@ -100,4 +100,3 @@ VisitBackgroundModel: lsst.daf.butler.formatters.json.JsonFormatter
  VignettingCorrection: lsst.ts.observatory.control.utils.extras.vignetting_storage.VignettingCorrectionFormatter
  SSPAuxiliaryFile: lsst.pipe.tasks.sspAuxiliaryFile.SSPAuxiliaryFileFormatter
  VisitGeometry: lsst.daf.butler.formatters.json.JsonFormatter
- ProvenanceQuantumGraph: lsst.pipe.base.quantum_graph.formatter.ProvenanceFormatter
lsst/daf/butler/configs/storageClasses.yaml
@@ -443,18 +443,3 @@ storageClasses:
  pytype: lsst.pipe.tasks.sspAuxiliaryFile.SSPAuxiliaryFile
  VisitGeometry:
  pytype: lsst.obs.base.visit_geometry.VisitGeometry
- ProvenanceQuantumGraph:
- pytype: lsst.pipe.base.quantum_graph.ProvenanceQuantumGraph
- parameters:
- - import_mode # lsst.pipe.base.pipeline_graph.TaskImportMode
- - quanta # iterable of uuid.UUID; quanta to read
- - datasets # iterable of uuid.UUID; datasets to read
- - read_init_quanta # bool, defaults to True; whether to read pre-exec-init info
- derivedComponents:
- packages: Packages # ignores node parameters
-
- # UUID keys can be quantum or data IDs (whichever is passed in via
- # parameters). Nested lists are attempts to run the quantum (last is
- # most recent).
- logs: StructuredDataDict # dict[uuid.UUID, list[ButlerLogRecords]]
- metadata: StructuredDataDict # dict[uuid.UUID, list[TaskMetadata]]
lsst/daf/butler/datastore/record_data.py
@@ -49,7 +49,7 @@ if TYPE_CHECKING:
  # Pydantic requires the possible value types to be explicitly enumerated in
  # order for `uuid.UUID` in particular to work. `typing.Any` does not work
  # here.
- _Record: TypeAlias = dict[str, int | str | None]
+ _Record: TypeAlias = dict[str, int | str | uuid.UUID | None]
 
 
  class SerializedDatastoreRecordData(pydantic.BaseModel):
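
The retained comment explains why uuid.UUID must be listed explicitly in the value union. A self-contained pydantic sketch of the widened alias (not the actual SerializedDatastoreRecordData model):

    # Demonstrates the widened value union only; model and field names here
    # are illustrative.
    import uuid
    from typing import TypeAlias

    import pydantic

    _Record: TypeAlias = dict[str, int | str | uuid.UUID | None]


    class DemoRecord(pydantic.BaseModel):
        record: _Record


    demo = DemoRecord(record={"dataset_id": uuid.uuid4(), "path": "a/b.fits", "size": 10, "checksum": None})
    assert isinstance(demo.record["dataset_id"], uuid.UUID)  # UUID survives validation
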
lsst/daf/butler/datastores/fileDatastore.py
@@ -3166,20 +3166,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
 
  def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
  # Docstring inherited from the base class.
-
- # This call to 'bridge.check' filters out "partially deleted" datasets.
- # Specifically, ones in the unusual edge state that:
- # 1. They have an entry in the registry dataset tables
- # 2. They were "trashed" from the datastore, so they are not
- # present in the "dataset_location" table.)
- # 3. But the trash has not been "emptied", so there are still entries
- # in the "opaque" datastore records table.
- #
- # As far as I can tell, this can only occur in the case of a concurrent
- # or aborted call to `Butler.pruneDatasets(unstore=True, purge=False)`.
- # Datasets (with or without files existing on disk) can persist in
- # this zombie state indefinitely, until someone manually empties
- # the trash.
  exported_refs = list(self._bridge.check(refs))
  ids = {ref.id for ref in exported_refs}
  records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}
lsst/daf/butler/dimensions/_coordinate.py
@@ -755,11 +755,6 @@ class DataCoordinate:
  to_json = to_json_pydantic
  from_json: ClassVar[Callable[..., Self]] = cast(Callable[..., Self], classmethod(from_json_pydantic))
 
- @property
- def dataId(self) -> Self:
- """Return this `DataCoordinate` instance, unmodified."""
- return self
-
 
  DataId = DataCoordinate | Mapping[str, Any]
  """A type-annotation alias for signatures that accept both informal data ID
lsst/daf/butler/direct_butler/_direct_butler.py
@@ -1822,25 +1822,12 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  f" Example: {existing_datasets[0]}"
  )
 
- # Calculate some statistics based on the given list of datasets.
- n_files = len(datasets)
- n_datasets = 0
- for d in datasets:
- n_datasets += len(d.refs)
- sfiles = "s" if n_files != 1 else ""
- srefs = "s" if n_datasets != 1 else ""
-
  # We use `datasets` rather `new_datasets` for the Registry
  # portion of this, to let it confirm that everything matches the
  # existing datasets.
  import_info = self._prepare_ingest_file_datasets(datasets, progress)
 
- with (
- self._metrics.instrument_ingest(
- n_datasets, _LOG, msg=f"Ingesting {n_files} file{sfiles} with {n_datasets} dataset{srefs}"
- ),
- self.transaction(),
- ):
+ with self.transaction():
  self._ingest_file_datasets(datasets, import_info, progress)
 
  # Bulk-insert everything into Datastore.
@@ -1995,7 +1982,7 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  doImport(filename) # type: ignore
 
  def transfer_dimension_records_from(
- self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
+ self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
  ) -> None:
  # Allowed dimensions in the target butler.
  elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
@@ -2025,13 +2012,16 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  source_butler, data_ids, allowed_elements
  )
 
+ can_query = True if isinstance(source_butler, Butler) else False
+
  additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
  for original_element, record_mapping in primary_records.items():
  # Get dimensions that depend on this dimension.
  populated_by = self.dimensions.get_elements_populated_by(
  self.dimensions[original_element.name] # type: ignore
  )
- if populated_by:
+
+ for data_id in record_mapping.keys():
  for element in populated_by:
  if element not in allowed_elements:
  continue
@@ -2050,32 +2040,28 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  # have to be scanned.
  continue
 
- if record_mapping:
- if not isinstance(source_butler, Butler):
- raise RuntimeError(
- f"Transferring populated_by records like {element.name}"
- " requires a full Butler."
- )
+ if not can_query:
+ raise RuntimeError(
+ f"Transferring populated_by records like {element.name} requires a full Butler."
+ )
 
- with source_butler.query() as query:
- records = query.join_data_coordinates(record_mapping.keys()).dimension_records(
- element.name
- )
- for record in records:
- additional_records[record.definition].setdefault(record.dataId, record)
+ records = source_butler.query_dimension_records( # type: ignore
+ element.name,
+ explain=False,
+ **data_id.mapping, # type: ignore
+ )
+ for record in records:
+ additional_records[record.definition].setdefault(record.dataId, record)
 
  # The next step is to walk back through the additional records to
  # pick up any missing content (such as visit_definition needing to
  # know the exposure). Want to ensure we do not request records we
  # already have.
  missing_data_ids = set()
- for record_mapping in additional_records.values():
+ for name, record_mapping in additional_records.items():
  for data_id in record_mapping.keys():
- for dimension in data_id.dimensions.required:
- element = source_butler.dimensions[dimension]
- dimension_key = data_id.subset(dimension)
- if dimension_key not in primary_records[element]:
- missing_data_ids.add(dimension_key)
+ if data_id not in primary_records[name]:
+ missing_data_ids.add(data_id)
 
  # Fill out the new records. Assume that these new records do not
  # also need to carry over additional populated_by records.
@@ -2092,19 +2078,19 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  def _extract_dimension_records_from_data_ids(
  self,
  source_butler: LimitedButler | Butler,
- data_ids: Iterable[DataCoordinate],
+ data_ids: set[DataCoordinate],
  allowed_elements: frozenset[DimensionElement],
  ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
  dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
 
- data_ids = set(data_ids)
- if not all(data_id.hasRecords() for data_id in data_ids):
- if isinstance(source_butler, Butler):
- data_ids = source_butler._expand_data_ids(data_ids)
- else:
- raise TypeError("Input butler needs to be a full butler to expand DataId.")
-
  for data_id in data_ids:
+ # Need an expanded record, if not expanded that we need a full
+ # butler with registry (allow mocks with registry too).
+ if not data_id.hasRecords():
+ if registry := getattr(source_butler, "registry", None):
+ data_id = registry.expandDataId(data_id)
+ else:
+ raise TypeError("Input butler needs to be a full butler to expand DataId.")
  # If this butler doesn't know about a dimension in the source
  # butler things will break later.
  for element_name in data_id.dimensions.elements:
@@ -2583,9 +2569,6 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  """Immediately load caches that are used for common operations."""
  self._registry.preload_cache(load_dimension_record_cache=load_dimension_record_cache)
 
- def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
- return self._registry.expand_data_ids(data_ids)
-
  _config: ButlerConfig
  """Configuration for this Butler instance."""
 
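The rewritten _extract_dimension_records_from_data_ids falls back to the source butler's registry when a data ID arrives without its dimension records. A hedged sketch of that guard in isolation; the repository path and data ID values are placeholders:

    # Sketch of the expansion guard used above.
    from lsst.daf.butler import Butler, DataCoordinate

    source = Butler.from_config("/repo/source")
    data_id = DataCoordinate.standardize(
        {"instrument": "LSSTCam", "exposure": 2025042800123}, universe=source.dimensions
    )
    if not data_id.hasRecords():
        # A full Butler exposes .registry; a LimitedButler does not, which is
        # why the new code raises TypeError in that case.
        data_id = source.registry.expandDataId(data_id)
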
lsst/daf/butler/logging.py
@@ -764,17 +764,11 @@ class ButlerLogRecords(MutableSequence[ButlerLogRecord]):
 
 
  class ButlerLogRecordHandler(StreamHandler):
- """Python log handler that accumulates records.
+ """Python log handler that accumulates records."""
 
- Parameters
- ----------
- records : `ButlerLogRecords`, optional
- Container to store logs in.
- """
-
- def __init__(self, records: ButlerLogRecords | None = None) -> None:
+ def __init__(self) -> None:
  super().__init__()
- self.records = ButlerLogRecords([]) if records is None else records
+ self.records = ButlerLogRecords([])
 
  def emit(self, record: LogRecord) -> None:
  self.records.append(record)
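
With the simplified constructor above, ButlerLogRecordHandler always owns its ButlerLogRecords container. A short usage sketch; the logger name is arbitrary:

    import logging

    from lsst.daf.butler.logging import ButlerLogRecordHandler

    handler = ButlerLogRecordHandler()
    log = logging.getLogger("example.capture")
    log.addHandler(handler)

    log.warning("captured into handler.records via emit()")
    print(len(handler.records))

    log.removeHandler(handler)
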
lsst/daf/butler/registry/bridge/monolithic.py
@@ -215,24 +215,20 @@ class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
  def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
  # Docstring inherited from DatastoreRegistryBridge
  byId = {ref.id: ref for ref in refs}
- found: list[DatasetIdRef] = []
- with self._db.session():
- for batch in chunk_iterable(byId.keys(), 50000):
- sql = (
- sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
- .select_from(self._tables.dataset_location)
- .where(
- sqlalchemy.sql.and_(
- self._tables.dataset_location.columns.datastore_name == self.datastoreName,
- self._tables.dataset_location.columns.dataset_id.in_(batch),
- )
- )
+ sql = (
+ sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
+ .select_from(self._tables.dataset_location)
+ .where(
+ sqlalchemy.sql.and_(
+ self._tables.dataset_location.columns.datastore_name == self.datastoreName,
+ self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
  )
- with self._db.query(sql) as sql_result:
- sql_ids = sql_result.scalars().all()
- found.extend(byId[id] for id in sql_ids)
-
- return found
+ )
+ )
+ with self._db.query(sql) as sql_result:
+ sql_rows = sql_result.fetchall()
+ for row in sql_rows:
+ yield byId[row.dataset_id]
 
  @contextmanager
  def emptyTrash(
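
Both this hunk and the _manager.py changes below drop chunk_iterable batching in favour of a single IN clause. For reference, a generic sketch of the removed batching pattern; the keys and chunk size are illustrative and no database access is shown:

    from lsst.utils.iteration import chunk_iterable

    dataset_ids = [f"uuid-{i}" for i in range(120_000)]  # stand-in for real UUIDs

    for batch in chunk_iterable(dataset_ids, 50_000):
        # Each batch would drive one `... WHERE dataset_id IN (:batch)` query,
        # keeping the IN expression below database limits.
        print(len(batch))
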
lsst/daf/butler/registry/datasets/byDimensions/_manager.py
@@ -12,8 +12,6 @@ from typing import TYPE_CHECKING, Any, ClassVar
  import astropy.time
  import sqlalchemy
 
- from lsst.utils.iteration import chunk_iterable
-
  from .... import ddl
  from ...._collection_type import CollectionType
  from ...._dataset_ref import DatasetId, DatasetIdFactory, DatasetIdGenEnum, DatasetRef
@@ -426,18 +424,17 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
  return result
 
  def get_dataset_refs(self, ids: list[DatasetId]) -> list[DatasetRef]:
- dataset_type_map: dict[DatasetId, DatasetType] = {}
- for batch in chunk_iterable(set(ids), 50000):
- # Look up the dataset types corresponding to the given Dataset IDs.
- id_col = self._static.dataset.columns["id"]
- sql = sqlalchemy.sql.select(
- id_col,
- self._static.dataset.columns["dataset_type_id"],
- ).where(id_col.in_(batch))
- with self._db.query(sql) as sql_result:
- dataset_rows = sql_result.mappings().all()
- for row in dataset_rows:
- dataset_type_map[row["id"]] = self._get_dataset_type_by_id(row["dataset_type_id"])
+ # Look up the dataset types corresponding to the given Dataset IDs.
+ id_col = self._static.dataset.columns["id"]
+ sql = sqlalchemy.sql.select(
+ id_col,
+ self._static.dataset.columns["dataset_type_id"],
+ ).where(id_col.in_(ids))
+ with self._db.query(sql) as sql_result:
+ dataset_rows = sql_result.mappings().all()
+ dataset_type_map: dict[DatasetId, DatasetType] = {
+ row["id"]: self._get_dataset_type_by_id(row["dataset_type_id"]) for row in dataset_rows
+ }
 
  # Group the given dataset IDs by the DimensionGroup of their dataset
  # types -- there is a separate tags table for each DimensionGroup.
@@ -451,41 +448,40 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
  # data IDs corresponding to the UUIDs found from the dataset table.
  dynamic_tables = self._get_dynamic_tables(dimension_group)
  tags_table = self._get_tags_table(dynamic_tables)
- for batch in chunk_iterable(datasets, 50000):
- tags_sql = tags_table.select().where(tags_table.columns["dataset_id"].in_(batch))
- # Join in the collection table to fetch the run name.
- collection_column = tags_table.columns[self._collections.getCollectionForeignKeyName()]
- joined_collections = self._collections.join_collections_sql(collection_column, tags_sql)
- tags_sql = joined_collections.joined_sql
- run_name_column = joined_collections.name_column
- tags_sql = tags_sql.add_columns(run_name_column)
- # Tags table includes run collections and tagged
- # collections.
- # In theory the data ID for a given dataset should be the
- # same in both, but nothing actually guarantees this.
- # So skip any tagged collections, using the run collection
- # as the definitive definition.
- tags_sql = tags_sql.where(joined_collections.type_column == int(CollectionType.RUN))
-
- with self._db.query(tags_sql) as sql_result:
- data_id_rows = sql_result.mappings().all()
-
- assert run_name_column.key is not None
- for data_id_row in data_id_rows:
- id = data_id_row["dataset_id"]
- dataset_type = dataset_type_map[id]
- run_name = data_id_row[run_name_column.key]
- data_id = DataCoordinate.from_required_values(
- dimension_group,
- tuple(data_id_row[dimension] for dimension in dimension_group.required),
- )
- ref = DatasetRef(
- datasetType=dataset_type,
- dataId=data_id,
- id=id,
- run=run_name,
- )
- output_refs.append(ref)
+ tags_sql = tags_table.select().where(tags_table.columns["dataset_id"].in_(datasets))
+ # Join in the collection table to fetch the run name.
+ collection_column = tags_table.columns[self._collections.getCollectionForeignKeyName()]
+ joined_collections = self._collections.join_collections_sql(collection_column, tags_sql)
+ tags_sql = joined_collections.joined_sql
+ run_name_column = joined_collections.name_column
+ tags_sql = tags_sql.add_columns(run_name_column)
+ # Tags table includes run collections and tagged
+ # collections.
+ # In theory the data ID for a given dataset should be the
+ # same in both, but nothing actually guarantees this.
+ # So skip any tagged collections, using the run collection
+ # as the definitive definition.
+ tags_sql = tags_sql.where(joined_collections.type_column == int(CollectionType.RUN))
+
+ with self._db.query(tags_sql) as sql_result:
+ data_id_rows = sql_result.mappings().all()
+
+ assert run_name_column.key is not None
+ for data_id_row in data_id_rows:
+ id = data_id_row["dataset_id"]
+ dataset_type = dataset_type_map[id]
+ run_name = data_id_row[run_name_column.key]
+ data_id = DataCoordinate.from_required_values(
+ dimension_group,
+ tuple(data_id_row[dimension] for dimension in dimension_group.required),
+ )
+ ref = DatasetRef(
+ datasetType=dataset_type,
+ dataId=data_id,
+ id=id,
+ run=run_name,
+ )
+ output_refs.append(ref)
 
  return output_refs