lsst-daf-butler 30.0.0rc2__py3-none-any.whl → 30.0.0rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. lsst/daf/butler/_butler.py +8 -5
  2. lsst/daf/butler/_butler_metrics.py +49 -2
  3. lsst/daf/butler/_formatter.py +2 -7
  4. lsst/daf/butler/_labeled_butler_factory.py +28 -8
  5. lsst/daf/butler/_rubin/temporary_for_ingest.py +207 -0
  6. lsst/daf/butler/configs/datastores/formatters.yaml +1 -0
  7. lsst/daf/butler/configs/storageClasses.yaml +15 -0
  8. lsst/daf/butler/datastore/record_data.py +1 -1
  9. lsst/daf/butler/datastores/fileDatastore.py +15 -12
  10. lsst/daf/butler/dimensions/_coordinate.py +5 -0
  11. lsst/daf/butler/direct_butler/_direct_butler.py +45 -28
  12. lsst/daf/butler/logging.py +9 -3
  13. lsst/daf/butler/registry/bridge/monolithic.py +17 -13
  14. lsst/daf/butler/registry/datasets/byDimensions/_manager.py +49 -45
  15. lsst/daf/butler/registry/expand_data_ids.py +93 -0
  16. lsst/daf/butler/registry/interfaces/_database.py +6 -1
  17. lsst/daf/butler/registry/sql_registry.py +2 -24
  18. lsst/daf/butler/remote_butler/_remote_butler.py +5 -1
  19. lsst/daf/butler/tests/hybrid_butler.py +4 -1
  20. lsst/daf/butler/tests/registry_data/lsstcam-subset.yaml +191 -0
  21. lsst/daf/butler/tests/testFormatters.py +2 -2
  22. lsst/daf/butler/transfers/_context.py +7 -6
  23. lsst/daf/butler/version.py +1 -1
  24. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/METADATA +1 -1
  25. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/RECORD +33 -30
  26. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/WHEEL +0 -0
  27. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/entry_points.txt +0 -0
  28. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/licenses/COPYRIGHT +0 -0
  29. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/licenses/LICENSE +0 -0
  30. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/licenses/bsd_license.txt +0 -0
  31. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/licenses/gpl-v3.0.txt +0 -0
  32. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/top_level.txt +0 -0
  33. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/zip-safe +0 -0
lsst/daf/butler/direct_butler/_direct_butler.py

@@ -1822,12 +1822,25 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  f" Example: {existing_datasets[0]}"
  )

+ # Calculate some statistics based on the given list of datasets.
+ n_files = len(datasets)
+ n_datasets = 0
+ for d in datasets:
+ n_datasets += len(d.refs)
+ sfiles = "s" if n_files != 1 else ""
+ srefs = "s" if n_datasets != 1 else ""
+
  # We use `datasets` rather `new_datasets` for the Registry
  # portion of this, to let it confirm that everything matches the
  # existing datasets.
  import_info = self._prepare_ingest_file_datasets(datasets, progress)

- with self.transaction():
+ with (
+ self._metrics.instrument_ingest(
+ n_datasets, _LOG, msg=f"Ingesting {n_files} file{sfiles} with {n_datasets} dataset{srefs}"
+ ),
+ self.transaction(),
+ ):
  self._ingest_file_datasets(datasets, import_info, progress)

  # Bulk-insert everything into Datastore.
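Note: the new ingest path wraps the registry and datastore work in a metrics context manager from _butler_metrics.py (also changed in this release). The sketch below shows only the general timing-context pattern that call suggests; instrument_ingest_sketch is a hypothetical stand-in, not the ButlerMetrics API.

    import logging
    import time
    from contextlib import contextmanager

    _LOG = logging.getLogger(__name__)

    @contextmanager
    def instrument_ingest_sketch(n_datasets: int, log: logging.Logger, msg: str = ""):
        # Time the enclosed block and report the supplied message when it finishes.
        start = time.monotonic()
        try:
            yield
        finally:
            log.info("%s (%d datasets, %.2f s)", msg, n_datasets, time.monotonic() - start)

    # Usage mirroring the new ingest code path (transaction and real ingest omitted):
    with instrument_ingest_sketch(3, _LOG, msg="Ingesting 2 files with 3 datasets"):
        pass  # registry inserts and datastore ingest would run here
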
@@ -1982,7 +1995,7 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  doImport(filename) # type: ignore

  def transfer_dimension_records_from(
- self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
+ self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
  ) -> None:
  # Allowed dimensions in the target butler.
  elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
@@ -2012,16 +2025,13 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  source_butler, data_ids, allowed_elements
  )

- can_query = True if isinstance(source_butler, Butler) else False
-
  additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
  for original_element, record_mapping in primary_records.items():
  # Get dimensions that depend on this dimension.
  populated_by = self.dimensions.get_elements_populated_by(
  self.dimensions[original_element.name] # type: ignore
  )
-
- for data_id in record_mapping.keys():
+ if populated_by:
  for element in populated_by:
  if element not in allowed_elements:
  continue
@@ -2040,28 +2050,32 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  # have to be scanned.
  continue

- if not can_query:
- raise RuntimeError(
- f"Transferring populated_by records like {element.name} requires a full Butler."
- )
+ if record_mapping:
+ if not isinstance(source_butler, Butler):
+ raise RuntimeError(
+ f"Transferring populated_by records like {element.name}"
+ " requires a full Butler."
+ )

- records = source_butler.query_dimension_records( # type: ignore
- element.name,
- explain=False,
- **data_id.mapping, # type: ignore
- )
- for record in records:
- additional_records[record.definition].setdefault(record.dataId, record)
+ with source_butler.query() as query:
+ records = query.join_data_coordinates(record_mapping.keys()).dimension_records(
+ element.name
+ )
+ for record in records:
+ additional_records[record.definition].setdefault(record.dataId, record)

  # The next step is to walk back through the additional records to
  # pick up any missing content (such as visit_definition needing to
  # know the exposure). Want to ensure we do not request records we
  # already have.
  missing_data_ids = set()
- for name, record_mapping in additional_records.items():
+ for record_mapping in additional_records.values():
  for data_id in record_mapping.keys():
- if data_id not in primary_records[name]:
- missing_data_ids.add(data_id)
+ for dimension in data_id.dimensions.required:
+ element = source_butler.dimensions[dimension]
+ dimension_key = data_id.subset(dimension)
+ if dimension_key not in primary_records[element]:
+ missing_data_ids.add(dimension_key)

  # Fill out the new records. Assume that these new records do not
  # also need to carry over additional populated_by records.
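Note: the replacement above batches the populated_by lookup through one query context per element instead of calling query_dimension_records once per data ID. A hedged sketch of that call pattern, using only the query methods visible in the diff; `butler`, `data_ids`, and the element name are illustrative assumptions:

    def fetch_populated_by_records(butler, data_ids, element_name="visit_definition"):
        """Collect dimension records for one populated_by element, constrained
        by many data IDs in a single query rather than one query per data ID."""
        records_by_data_id = {}
        with butler.query() as query:
            results = query.join_data_coordinates(data_ids).dimension_records(element_name)
            for record in results:
                records_by_data_id.setdefault(record.dataId, record)
        return records_by_data_id
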
@@ -2078,19 +2092,19 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  def _extract_dimension_records_from_data_ids(
  self,
  source_butler: LimitedButler | Butler,
- data_ids: set[DataCoordinate],
+ data_ids: Iterable[DataCoordinate],
  allowed_elements: frozenset[DimensionElement],
  ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
  dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)

+ data_ids = set(data_ids)
+ if not all(data_id.hasRecords() for data_id in data_ids):
+ if isinstance(source_butler, Butler):
+ data_ids = source_butler._expand_data_ids(data_ids)
+ else:
+ raise TypeError("Input butler needs to be a full butler to expand DataId.")
+
  for data_id in data_ids:
- # Need an expanded record, if not expanded that we need a full
- # butler with registry (allow mocks with registry too).
- if not data_id.hasRecords():
- if registry := getattr(source_butler, "registry", None):
- data_id = registry.expandDataId(data_id)
- else:
- raise TypeError("Input butler needs to be a full butler to expand DataId.")
  # If this butler doesn't know about a dimension in the source
  # butler things will break later.
  for element_name in data_id.dimensions.elements:
@@ -2569,6 +2583,9 @@ class DirectButler(Butler): # numpydoc ignore=PR02
  """Immediately load caches that are used for common operations."""
  self._registry.preload_cache(load_dimension_record_cache=load_dimension_record_cache)

+ def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
+ return self._registry.expand_data_ids(data_ids)
+
  _config: ButlerConfig
  """Configuration for this Butler instance."""

lsst/daf/butler/logging.py

@@ -764,11 +764,17 @@ class ButlerLogRecords(MutableSequence[ButlerLogRecord]):


  class ButlerLogRecordHandler(StreamHandler):
- """Python log handler that accumulates records."""
+ """Python log handler that accumulates records.

- def __init__(self) -> None:
+ Parameters
+ ----------
+ records : `ButlerLogRecords`, optional
+ Container to store logs in.
+ """
+
+ def __init__(self, records: ButlerLogRecords | None = None) -> None:
  super().__init__()
- self.records = ButlerLogRecords([])
+ self.records = ButlerLogRecords([]) if records is None else records

  def emit(self, record: LogRecord) -> None:
  self.records.append(record)
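Note: with the new optional ``records`` argument, the handler appends into a caller-supplied container, so several handlers (or a handler recreated later) can share one ButlerLogRecords. A short usage sketch; the import path is assumed from the module location:

    import logging

    from lsst.daf.butler.logging import ButlerLogRecordHandler, ButlerLogRecords

    # A container that outlives any individual handler.
    shared_records = ButlerLogRecords([])
    handler = ButlerLogRecordHandler(records=shared_records)

    log = logging.getLogger("example")
    log.addHandler(handler)
    log.warning("Something worth keeping")

    # The handler appended into the shared container, not a private one.
    assert handler.records is shared_records
    print(len(shared_records))
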
lsst/daf/butler/registry/bridge/monolithic.py

@@ -215,20 +215,24 @@ class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
  def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
  # Docstring inherited from DatastoreRegistryBridge
  byId = {ref.id: ref for ref in refs}
- sql = (
- sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
- .select_from(self._tables.dataset_location)
- .where(
- sqlalchemy.sql.and_(
- self._tables.dataset_location.columns.datastore_name == self.datastoreName,
- self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
+ found: list[DatasetIdRef] = []
+ with self._db.session():
+ for batch in chunk_iterable(byId.keys(), 50000):
+ sql = (
+ sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
+ .select_from(self._tables.dataset_location)
+ .where(
+ sqlalchemy.sql.and_(
+ self._tables.dataset_location.columns.datastore_name == self.datastoreName,
+ self._tables.dataset_location.columns.dataset_id.in_(batch),
+ )
+ )
  )
- )
- )
- with self._db.query(sql) as sql_result:
- sql_rows = sql_result.fetchall()
- for row in sql_rows:
- yield byId[row.dataset_id]
+ with self._db.query(sql) as sql_result:
+ sql_ids = sql_result.scalars().all()
+ found.extend(byId[id] for id in sql_ids)
+
+ return found

  @contextmanager
  def emptyTrash(
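Note: both this change and the _manager.py changes below replace a single unbounded ``IN`` clause with batches of at most 50000 IDs, accumulating results as they go. A self-contained sketch of the pattern, with a local stand-in for ``lsst.utils.iteration.chunk_iterable`` and a fake lookup in place of the SQL query:

    from collections.abc import Iterable, Iterator
    from itertools import islice


    def chunked(values: Iterable[int], chunk_size: int = 50000) -> Iterator[list[int]]:
        # Local stand-in for lsst.utils.iteration.chunk_iterable.
        iterator = iter(values)
        while batch := list(islice(iterator, chunk_size)):
            yield batch


    def find_known_ids(all_ids: Iterable[int], lookup) -> list[int]:
        # Accumulate matches batch by batch instead of one huge IN (...) query.
        found: list[int] = []
        for batch in chunked(all_ids):
            found.extend(lookup(batch))
        return found


    known = {2, 4, 6}
    print(find_known_ids(range(10), lambda batch: [i for i in batch if i in known]))
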
lsst/daf/butler/registry/datasets/byDimensions/_manager.py

@@ -12,6 +12,8 @@ from typing import TYPE_CHECKING, Any, ClassVar
  import astropy.time
  import sqlalchemy

+ from lsst.utils.iteration import chunk_iterable
+
  from .... import ddl
  from ...._collection_type import CollectionType
  from ...._dataset_ref import DatasetId, DatasetIdFactory, DatasetIdGenEnum, DatasetRef
@@ -424,17 +426,18 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
  return result

  def get_dataset_refs(self, ids: list[DatasetId]) -> list[DatasetRef]:
- # Look up the dataset types corresponding to the given Dataset IDs.
- id_col = self._static.dataset.columns["id"]
- sql = sqlalchemy.sql.select(
- id_col,
- self._static.dataset.columns["dataset_type_id"],
- ).where(id_col.in_(ids))
- with self._db.query(sql) as sql_result:
- dataset_rows = sql_result.mappings().all()
- dataset_type_map: dict[DatasetId, DatasetType] = {
- row["id"]: self._get_dataset_type_by_id(row["dataset_type_id"]) for row in dataset_rows
- }
+ dataset_type_map: dict[DatasetId, DatasetType] = {}
+ for batch in chunk_iterable(set(ids), 50000):
+ # Look up the dataset types corresponding to the given Dataset IDs.
+ id_col = self._static.dataset.columns["id"]
+ sql = sqlalchemy.sql.select(
+ id_col,
+ self._static.dataset.columns["dataset_type_id"],
+ ).where(id_col.in_(batch))
+ with self._db.query(sql) as sql_result:
+ dataset_rows = sql_result.mappings().all()
+ for row in dataset_rows:
+ dataset_type_map[row["id"]] = self._get_dataset_type_by_id(row["dataset_type_id"])

  # Group the given dataset IDs by the DimensionGroup of their dataset
  # types -- there is a separate tags table for each DimensionGroup.
@@ -448,40 +451,41 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
  # data IDs corresponding to the UUIDs found from the dataset table.
  dynamic_tables = self._get_dynamic_tables(dimension_group)
  tags_table = self._get_tags_table(dynamic_tables)
- tags_sql = tags_table.select().where(tags_table.columns["dataset_id"].in_(datasets))
- # Join in the collection table to fetch the run name.
- collection_column = tags_table.columns[self._collections.getCollectionForeignKeyName()]
- joined_collections = self._collections.join_collections_sql(collection_column, tags_sql)
- tags_sql = joined_collections.joined_sql
- run_name_column = joined_collections.name_column
- tags_sql = tags_sql.add_columns(run_name_column)
- # Tags table includes run collections and tagged
- # collections.
- # In theory the data ID for a given dataset should be the
- # same in both, but nothing actually guarantees this.
- # So skip any tagged collections, using the run collection
- # as the definitive definition.
- tags_sql = tags_sql.where(joined_collections.type_column == int(CollectionType.RUN))
-
- with self._db.query(tags_sql) as sql_result:
- data_id_rows = sql_result.mappings().all()
-
- assert run_name_column.key is not None
- for data_id_row in data_id_rows:
- id = data_id_row["dataset_id"]
- dataset_type = dataset_type_map[id]
- run_name = data_id_row[run_name_column.key]
- data_id = DataCoordinate.from_required_values(
- dimension_group,
- tuple(data_id_row[dimension] for dimension in dimension_group.required),
- )
- ref = DatasetRef(
- datasetType=dataset_type,
- dataId=data_id,
- id=id,
- run=run_name,
- )
- output_refs.append(ref)
+ for batch in chunk_iterable(datasets, 50000):
+ tags_sql = tags_table.select().where(tags_table.columns["dataset_id"].in_(batch))
+ # Join in the collection table to fetch the run name.
+ collection_column = tags_table.columns[self._collections.getCollectionForeignKeyName()]
+ joined_collections = self._collections.join_collections_sql(collection_column, tags_sql)
+ tags_sql = joined_collections.joined_sql
+ run_name_column = joined_collections.name_column
+ tags_sql = tags_sql.add_columns(run_name_column)
+ # Tags table includes run collections and tagged
+ # collections.
+ # In theory the data ID for a given dataset should be the
+ # same in both, but nothing actually guarantees this.
+ # So skip any tagged collections, using the run collection
+ # as the definitive definition.
+ tags_sql = tags_sql.where(joined_collections.type_column == int(CollectionType.RUN))
+
+ with self._db.query(tags_sql) as sql_result:
+ data_id_rows = sql_result.mappings().all()
+
+ assert run_name_column.key is not None
+ for data_id_row in data_id_rows:
+ id = data_id_row["dataset_id"]
+ dataset_type = dataset_type_map[id]
+ run_name = data_id_row[run_name_column.key]
+ data_id = DataCoordinate.from_required_values(
+ dimension_group,
+ tuple(data_id_row[dimension] for dimension in dimension_group.required),
+ )
+ ref = DatasetRef(
+ datasetType=dataset_type,
+ dataId=data_id,
+ id=id,
+ run=run_name,
+ )
+ output_refs.append(ref)

  return output_refs

lsst/daf/butler/registry/expand_data_ids.py

@@ -0,0 +1,93 @@
+ # This file is part of daf_butler.
+ #
+ # Developed for the LSST Data Management System.
+ # This product includes software developed by the LSST Project
+ # (http://www.lsst.org).
+ # See the COPYRIGHT file at the top-level directory of this distribution
+ # for details of code ownership.
+ #
+ # This software is dual licensed under the GNU General Public License and also
+ # under a 3-clause BSD license. Recipients may choose which of these licenses
+ # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+ # respectively. If you choose the GPL option then the following text applies
+ # (but note that there is still no warranty even if you opt for BSD instead):
+ #
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ from __future__ import annotations
+
+ from collections import defaultdict
+ from collections.abc import Iterable
+
+ from ..dimensions import (
+ DataCoordinate,
+ DimensionDataAttacher,
+ DimensionGroup,
+ DimensionUniverse,
+ )
+ from ..dimensions.record_cache import DimensionRecordCache
+ from ..queries import QueryFactoryFunction
+
+
+ def expand_data_ids(
+ data_ids: Iterable[DataCoordinate],
+ universe: DimensionUniverse,
+ query_func: QueryFactoryFunction,
+ cache: DimensionRecordCache | None,
+ ) -> list[DataCoordinate]:
+ """Expand the given data IDs to look up implied dimension values and attach
+ dimension records.
+
+ Parameters
+ ----------
+ data_ids : `~collections.abc.Iterable` [ `DataCoordinate` ]
+ Data coordinates to be expanded.
+ universe : `DimensionUniverse`
+ Dimension universe associated with the given ``data_ids`` values.
+ query_func : QueryFactoryFunction
+ Function used to set up a Butler query context for looking up required
+ information from the database.
+ cache : `DimensionRecordCache` | None
+ Cache containing already-known dimension records. May be `None` if a
+ cache is not available.
+
+ Returns
+ -------
+ expanded : `list` [ `DataCoordinate` ]
+ List of `DataCoordinate` instances in the same order as the input
+ values. It is guaranteed that each `DataCoordinate` has
+ ``hasRecords()=True`` and ``hasFull()=True``.
+ """
+ output = list(data_ids)
+
+ grouped_by_dimensions: defaultdict[DimensionGroup, list[int]] = defaultdict(list)
+ for i, data_id in enumerate(data_ids):
+ if not data_id.hasRecords():
+ grouped_by_dimensions[data_id.dimensions].append(i)
+
+ if not grouped_by_dimensions:
+ # All given DataCoordinate values are already expanded.
+ return output
+
+ attacher = DimensionDataAttacher(
+ cache=cache,
+ dimensions=DimensionGroup.union(*grouped_by_dimensions.keys(), universe=universe),
+ )
+ for dimensions, indexes in grouped_by_dimensions.items():
+ with query_func() as query:
+ expanded = attacher.attach(dimensions, (output[index] for index in indexes), query)
+ for index, data_id in zip(indexes, expanded):
+ output[index] = data_id
+
+ return output
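Note: the new module leaves already-expanded data IDs untouched and only queries for the rest, grouping the work by DimensionGroup while remembering each data ID's position so the output order matches the input. A standalone sketch of that index-bookkeeping pattern, with plain strings standing in for data IDs:

    from collections import defaultdict


    def expand_preserving_order(items, needs_expansion, expand_group):
        # Mirror of the bookkeeping in expand_data_ids: group the positions of
        # items that still need work, expand each group, then write the results
        # back into their original slots.
        output = list(items)
        grouped = defaultdict(list)
        for i, item in enumerate(items):
            if needs_expansion(item):
                grouped[item[0]].append(i)  # the "dimension group" here is just the first letter
        for _key, indexes in grouped.items():
            expanded = expand_group(output[index] for index in indexes)
            for index, value in zip(indexes, expanded):
                output[index] = value
        return output


    print(expand_preserving_order(
        ["apple", "Ant", "banana", "Bee"],
        needs_expansion=str.islower,
        expand_group=lambda group: [s.upper() for s in group],
    ))
    # -> ['APPLE', 'Ant', 'BANANA', 'Bee']
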
lsst/daf/butler/registry/interfaces/_database.py

@@ -1562,7 +1562,12 @@ class Database(ABC):
  return None
  else:
  sql = table.insert()
- return [connection.execute(sql, row).inserted_primary_key[0] for row in rows]
+ ids = []
+ for row in rows:
+ key = connection.execute(sql, row).inserted_primary_key
+ assert key is not None
+ ids.append(key[0])
+ return ids

  @abstractmethod
  def replace(self, table: sqlalchemy.schema.Table, *rows: dict) -> None:
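Note: the list comprehension was unrolled so each row's ``inserted_primary_key`` can be checked against ``None`` before indexing; SQLAlchemy reports it per executed single-row insert, which is why the loop issues one statement per row. A runnable illustration against an in-memory SQLite database (the table and engine are illustrative, not the daf_butler schema):

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    thing = sqlalchemy.Table(
        "thing",
        metadata,
        sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True, autoincrement=True),
        sqlalchemy.Column("name", sqlalchemy.String(32)),
    )

    engine = sqlalchemy.create_engine("sqlite://")
    metadata.create_all(engine)

    rows = [{"name": "a"}, {"name": "b"}]
    with engine.begin() as connection:
        sql = thing.insert()
        ids = []
        for row in rows:
            # inserted_primary_key is defined per executed statement, so insert
            # the rows one at a time and collect each generated key.
            key = connection.execute(sql, row).inserted_primary_key
            assert key is not None
            ids.append(key[0])

    print(ids)  # e.g. [1, 2]
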
lsst/daf/butler/registry/sql_registry.py

@@ -34,7 +34,6 @@ __all__ = ("SqlRegistry",)
  import contextlib
  import logging
  import warnings
- from collections import defaultdict
  from collections.abc import Iterable, Iterator, Mapping, Sequence
  from typing import TYPE_CHECKING, Any

@@ -54,7 +53,6 @@ from ..dimensions import (
  DataCoordinate,
  DataId,
  DimensionConfig,
- DimensionDataAttacher,
  DimensionElement,
  DimensionGroup,
  DimensionRecord,
@@ -78,6 +76,7 @@ from ..registry.interfaces import ChainedCollectionRecord, ReadOnlyDatabaseError
  from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
  from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard
  from ..utils import transactional
+ from .expand_data_ids import expand_data_ids

  if TYPE_CHECKING:
  from .._butler_config import ButlerConfig
@@ -1415,28 +1414,7 @@ class SqlRegistry:
  return DataCoordinate.standardize(keys, dimensions=standardized.dimensions).expanded(records=records)

  def expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
- output = list(data_ids)
-
- grouped_by_dimensions: defaultdict[DimensionGroup, list[int]] = defaultdict(list)
- for i, data_id in enumerate(data_ids):
- if not data_id.hasRecords():
- grouped_by_dimensions[data_id.dimensions].append(i)
-
- if not grouped_by_dimensions:
- # All given DataCoordinate values are already expanded.
- return output
-
- attacher = DimensionDataAttacher(
- cache=self.dimension_record_cache,
- dimensions=DimensionGroup.union(*grouped_by_dimensions.keys(), universe=self.dimensions),
- )
- with self._query() as query:
- for dimensions, indexes in grouped_by_dimensions.items():
- expanded = attacher.attach(dimensions, (output[index] for index in indexes), query)
- for index, data_id in zip(indexes, expanded):
- output[index] = data_id
-
- return output
+ return expand_data_ids(data_ids, self.dimensions, self._query, self.dimension_record_cache)

  def expand_refs(self, dataset_refs: list[DatasetRef]) -> list[DatasetRef]:
  expanded_ids = self.expand_data_ids([ref.dataId for ref in dataset_refs])
lsst/daf/butler/remote_butler/_remote_butler.py

@@ -65,6 +65,7 @@ from ..dimensions import DataCoordinate, DataIdValue, DimensionConfig, Dimension
  from ..queries import Query
  from ..queries.tree import make_column_literal
  from ..registry import CollectionArgType, NoDefaultCollectionError, Registry, RegistryDefaults
+ from ..registry.expand_data_ids import expand_data_ids
  from ._collection_args import convert_collection_arg_to_glob_string_list
  from ._defaults import DefaultsHolder
  from ._get import convert_http_url_to_resource_path, get_dataset_as_python_object
@@ -633,7 +634,7 @@ class RemoteButler(Butler): # numpydoc ignore=PR02
  raise NotImplementedError()

  def transfer_dimension_records_from(
- self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
+ self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
  ) -> None:
  # Docstring inherited.
  raise NotImplementedError()
@@ -738,6 +739,9 @@ class RemoteButler(Butler): # numpydoc ignore=PR02
  def close(self) -> None:
  pass

+ def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
+ return expand_data_ids(data_ids, self.dimensions, self.query, None)
+
  @property
  def _file_transfer_source(self) -> RemoteFileTransferSource:
  return RemoteFileTransferSource(self._connection)
lsst/daf/butler/tests/hybrid_butler.py

@@ -338,7 +338,7 @@ class HybridButler(Butler):
  )

  def transfer_dimension_records_from(
- self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
+ self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
  ) -> None:
  return self._direct_butler.transfer_dimension_records_from(source_butler, source_refs)

@@ -425,6 +425,9 @@ class HybridButler(Butler):
  source_butler, data_ids, allowed_elements
  )

+ def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
+ return self._remote_butler._expand_data_ids(data_ids)
+
  @property
  def collection_chains(self) -> ButlerCollections:
  return HybridButlerCollections(self)