lsst-daf-butler 30.0.0rc2-py3-none-any.whl → 30.0.1rc1-py3-none-any.whl

Files changed (88)
  1. lsst/daf/butler/_butler.py +27 -8
  2. lsst/daf/butler/_butler_collections.py +4 -4
  3. lsst/daf/butler/_butler_metrics.py +51 -2
  4. lsst/daf/butler/_dataset_provenance.py +1 -1
  5. lsst/daf/butler/_dataset_ref.py +1 -1
  6. lsst/daf/butler/_exceptions.py +2 -2
  7. lsst/daf/butler/_file_dataset.py +2 -1
  8. lsst/daf/butler/_formatter.py +14 -7
  9. lsst/daf/butler/_labeled_butler_factory.py +28 -8
  10. lsst/daf/butler/_query_all_datasets.py +2 -0
  11. lsst/daf/butler/_rubin/temporary_for_ingest.py +207 -0
  12. lsst/daf/butler/cli/cmd/_remove_runs.py +1 -12
  13. lsst/daf/butler/column_spec.py +4 -4
  14. lsst/daf/butler/configs/datastores/formatters.yaml +1 -0
  15. lsst/daf/butler/configs/storageClasses.yaml +15 -0
  16. lsst/daf/butler/datastore/_datastore.py +21 -1
  17. lsst/daf/butler/datastore/record_data.py +1 -1
  18. lsst/daf/butler/datastore/stored_file_info.py +2 -2
  19. lsst/daf/butler/datastores/chainedDatastore.py +4 -0
  20. lsst/daf/butler/datastores/fileDatastore.py +26 -13
  21. lsst/daf/butler/datastores/file_datastore/get.py +4 -4
  22. lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +5 -1
  23. lsst/daf/butler/datastores/file_datastore/transfer.py +2 -2
  24. lsst/daf/butler/datastores/inMemoryDatastore.py +8 -0
  25. lsst/daf/butler/ddl.py +2 -2
  26. lsst/daf/butler/dimensions/_coordinate.py +11 -8
  27. lsst/daf/butler/dimensions/_record_set.py +1 -1
  28. lsst/daf/butler/dimensions/_records.py +9 -3
  29. lsst/daf/butler/direct_butler/_direct_butler.py +85 -51
  30. lsst/daf/butler/direct_query_driver/_driver.py +5 -4
  31. lsst/daf/butler/direct_query_driver/_result_page_converter.py +1 -1
  32. lsst/daf/butler/formatters/parquet.py +6 -6
  33. lsst/daf/butler/logging.py +9 -3
  34. lsst/daf/butler/nonempty_mapping.py +1 -1
  35. lsst/daf/butler/persistence_context.py +8 -5
  36. lsst/daf/butler/queries/_general_query_results.py +1 -1
  37. lsst/daf/butler/queries/driver.py +1 -1
  38. lsst/daf/butler/queries/expression_factory.py +2 -2
  39. lsst/daf/butler/queries/expressions/parser/exprTree.py +1 -1
  40. lsst/daf/butler/queries/expressions/parser/parserYacc.py +1 -1
  41. lsst/daf/butler/queries/overlaps.py +2 -2
  42. lsst/daf/butler/queries/tree/_column_set.py +1 -1
  43. lsst/daf/butler/registry/_collection_record_cache.py +1 -1
  44. lsst/daf/butler/registry/_collection_summary_cache.py +5 -4
  45. lsst/daf/butler/registry/_registry.py +4 -0
  46. lsst/daf/butler/registry/bridge/monolithic.py +17 -13
  47. lsst/daf/butler/registry/databases/postgresql.py +2 -1
  48. lsst/daf/butler/registry/datasets/byDimensions/_dataset_type_cache.py +1 -1
  49. lsst/daf/butler/registry/datasets/byDimensions/_manager.py +53 -47
  50. lsst/daf/butler/registry/datasets/byDimensions/summaries.py +3 -2
  51. lsst/daf/butler/registry/expand_data_ids.py +93 -0
  52. lsst/daf/butler/registry/interfaces/_database.py +6 -1
  53. lsst/daf/butler/registry/interfaces/_datasets.py +2 -1
  54. lsst/daf/butler/registry/interfaces/_obscore.py +1 -1
  55. lsst/daf/butler/registry/obscore/_records.py +1 -1
  56. lsst/daf/butler/registry/obscore/_spatial.py +2 -2
  57. lsst/daf/butler/registry/queries/_results.py +2 -2
  58. lsst/daf/butler/registry/sql_registry.py +3 -25
  59. lsst/daf/butler/registry/wildcards.py +5 -5
  60. lsst/daf/butler/remote_butler/_get.py +1 -1
  61. lsst/daf/butler/remote_butler/_remote_butler.py +6 -1
  62. lsst/daf/butler/remote_butler/_remote_file_transfer_source.py +4 -0
  63. lsst/daf/butler/remote_butler/authentication/cadc.py +4 -3
  64. lsst/daf/butler/script/_pruneDatasets.py +4 -2
  65. lsst/daf/butler/script/configValidate.py +2 -2
  66. lsst/daf/butler/script/queryCollections.py +2 -2
  67. lsst/daf/butler/script/removeCollections.py +2 -0
  68. lsst/daf/butler/script/removeRuns.py +2 -0
  69. lsst/daf/butler/tests/cliCmdTestBase.py +2 -0
  70. lsst/daf/butler/tests/cliLogTestBase.py +2 -0
  71. lsst/daf/butler/tests/hybrid_butler.py +10 -2
  72. lsst/daf/butler/tests/registry_data/lsstcam-subset.yaml +191 -0
  73. lsst/daf/butler/tests/registry_data/spatial.py +4 -2
  74. lsst/daf/butler/tests/testFormatters.py +2 -2
  75. lsst/daf/butler/tests/utils.py +1 -1
  76. lsst/daf/butler/timespan_database_representation.py +3 -3
  77. lsst/daf/butler/transfers/_context.py +7 -6
  78. lsst/daf/butler/version.py +1 -1
  79. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1rc1.dist-info}/METADATA +3 -2
  80. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1rc1.dist-info}/RECORD +88 -85
  81. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1rc1.dist-info}/WHEEL +1 -1
  82. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1rc1.dist-info}/entry_points.txt +0 -0
  83. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1rc1.dist-info}/licenses/COPYRIGHT +0 -0
  84. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1rc1.dist-info}/licenses/LICENSE +0 -0
  85. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1rc1.dist-info}/licenses/bsd_license.txt +0 -0
  86. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1rc1.dist-info}/licenses/gpl-v3.0.txt +0 -0
  87. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1rc1.dist-info}/top_level.txt +0 -0
  88. {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.1rc1.dist-info}/zip-safe +0 -0
lsst/daf/butler/_butler.py
@@ -138,7 +138,10 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
  without_datastore : `bool`, optional
  If `True` do not attach a datastore to this butler. Any attempts
  to use a datastore will fail.
- **kwargs : `Any`
+ metrics : `ButlerMetrics` or `None`
+ External metrics object to be used for tracking butler usage. If `None`
+ a new metrics object is created.
+ **kwargs : `typing.Any`
  Additional keyword arguments passed to a constructor of actual butler
  class.

@@ -240,7 +243,7 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
  to use a datastore will fail.
  metrics : `ButlerMetrics` or `None`, optional
  Metrics object to record butler usage statistics.
- **kwargs : `Any`
+ **kwargs : `typing.Any`
  Default data ID key-value pairs. These may only identify
  "governor" dimensions like ``instrument`` and ``skymap``.

@@ -1390,6 +1393,10 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
  raised if any datasets with the same dataset ID already exist
  in the datastore.

+ Returns
+ -------
+ None
+
  Raises
  ------
  TypeError
@@ -1429,6 +1436,7 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
  *,
  transfer_dimensions: bool = False,
  dry_run: bool = False,
+ skip_existing: bool = False,
  ) -> None:
  """Ingest a Zip file into this butler.

@@ -1447,6 +1455,14 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
  If `True` the ingest will be processed without any modifications
  made to the target butler and as if the target butler did not
  have any of the datasets.
+ skip_existing : `bool`, optional
+ If `True`, a zip will not be ingested if the dataset entries listed
+ in the index with the same dataset ID already exists in the butler.
+ If `False` (the default), a `ConflictingDefinitionError` will be
+ raised if any datasets with the same dataset ID already exist
+ in the repository. If, somehow, some datasets are known to the
+ butler and some are not, this is currently treated as an error
+ rather than attempting to do a partial ingest.

  Notes
  -----
@@ -1566,7 +1582,7 @@ class Butler(LimitedButler): # numpydoc ignore=PR02

  @abstractmethod
  def transfer_dimension_records_from(
- self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
+ self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
  ) -> None:
  """Transfer dimension records to this Butler from another Butler.

@@ -1578,10 +1594,9 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
  `Butler` whose registry will be used to expand data IDs. If the
  source refs contain coordinates that are used to populate other
  records then this will also need to be a full `Butler`.
- source_refs : iterable of `DatasetRef`
- Datasets defined in the source butler whose dimension records
- should be transferred to this butler. In most circumstances.
- transfer is faster if the dataset refs are expanded.
+ source_refs : iterable of `DatasetRef` or `DataCoordinate`
+ Datasets or data IDs defined in the source butler whose dimension
+ records should be transferred to this butler.
  """
  raise NotImplementedError()

@@ -2025,7 +2040,7 @@ class Butler(LimitedButler): # numpydoc ignore=PR02

  Returns
  -------
- records : `list`[`DimensionRecord`]
+ records : `list` [`DimensionRecord`]
  Dimension records matching the given query parameters.

  Raises
@@ -2227,3 +2242,7 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
  @abstractmethod
  def close(self) -> None:
  raise NotImplementedError()
+
+ @abstractmethod
+ def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
+ raise NotImplementedError()
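
The hunks above add an externally supplied metrics object to the Butler constructors and a `skip_existing` option to `ingest_zip`. A minimal sketch of how the two might be used together; the repository path, Zip file name, and `writeable` flag are placeholders, and `ButlerMetrics` is assumed to be exported at the package level:

    from lsst.daf.butler import Butler, ButlerMetrics

    metrics = ButlerMetrics()
    butler = Butler.from_config("/repo/example", writeable=True, metrics=metrics)

    # With skip_existing=True a Zip whose indexed datasets are all already known
    # to the repository is skipped instead of raising ConflictingDefinitionError.
    butler.ingest_zip("datasets.zip", skip_existing=True)

    print(f"{metrics.n_ingest} datasets ingested in {metrics.time_in_ingest:.1f} s")
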
lsst/daf/butler/_butler_collections.py
@@ -360,10 +360,10 @@ class ButlerCollections(ABC, Sequence):
  name : `str`
  The name of the collection of interest.
  include_parents : `bool`, optional
- If `True` any parents of this collection will be included.
+ If `True` any parents of this collection will be included.
  include_summary : `bool`, optional
- If `True` dataset type names and governor dimensions of datasets
- stored in this collection will be included in the result.
+ If `True` dataset type names and governor dimensions of datasets
+ stored in this collection will be included in the result.

  Returns
  -------
@@ -464,7 +464,7 @@ class ButlerCollections(ABC, Sequence):

  Returns
  -------
- filtered : `~collections.abc.Mapping` [`str`, `list`[`str`]]
+ filtered : `~collections.abc.Mapping` [`str`, `list` [`str`]]
  Mapping of the dataset type name to its corresponding list of
  collection names.
  """
lsst/daf/butler/_butler_metrics.py
@@ -27,14 +27,19 @@

  from __future__ import annotations

+ __all__ = ["ButlerMetrics"]
+
  from collections.abc import Callable, Iterator
  from contextlib import contextmanager
+ from typing import Concatenate, ParamSpec

  from pydantic import BaseModel

  from lsst.utils.logging import LsstLoggers
  from lsst.utils.timer import time_this

+ P = ParamSpec("P")
+

  class ButlerMetrics(BaseModel):
  """Metrics collected during Butler operations."""
@@ -45,18 +50,26 @@ class ButlerMetrics(BaseModel):
  time_in_get: float = 0.0
  """Wall-clock time, in seconds, spent in get()."""

+ time_in_ingest: float = 0.0
+ """Wall-clock time, in seconds, spent in ingest()."""
+
  n_get: int = 0
  """Number of datasets retrieved with get()."""

  n_put: int = 0
  """Number of datasets stored with put()."""

+ n_ingest: int = 0
+ """Number of datasets ingested."""
+
  def reset(self) -> None:
  """Reset all metrics."""
  self.time_in_put = 0.0
  self.time_in_get = 0.0
+ self.time_in_ingest = 0.0
  self.n_get = 0
  self.n_put = 0
+ self.n_ingest = 0

  def increment_get(self, duration: float) -> None:
  """Increment time for get().
@@ -80,13 +93,31 @@ class ButlerMetrics(BaseModel):
  self.time_in_put += duration
  self.n_put += 1

+ def increment_ingest(self, duration: float, n_datasets: int) -> None:
+ """Increment time and datasets for ingest().
+
+ Parameters
+ ----------
+ duration : `float`
+ Duration to add to the ingest() statistics.
+ n_datasets : `int`
+ Number of datasets to be ingested for this call.
+ """
+ self.time_in_ingest += duration
+ self.n_ingest += n_datasets
+
  @contextmanager
  def _timer(
- self, handler: Callable[[float], None], log: LsstLoggers | None = None, msg: str | None = None
+ self,
+ handler: Callable[Concatenate[float, P], None],
+ log: LsstLoggers | None = None,
+ msg: str | None = None,
+ *args: P.args,
+ **kwargs: P.kwargs,
  ) -> Iterator[None]:
  with time_this(log=log, msg=msg) as timer:
  yield
- handler(timer.duration)
+ handler(timer.duration, *args, **kwargs)

  @contextmanager
  def instrument_get(self, log: LsstLoggers | None = None, msg: str | None = None) -> Iterator[None]:
@@ -115,3 +146,21 @@ class ButlerMetrics(BaseModel):
  """
  with self._timer(self.increment_put, log=log, msg=msg):
  yield
+
+ @contextmanager
+ def instrument_ingest(
+ self, n_datasets: int, log: LsstLoggers | None = None, msg: str | None = None
+ ) -> Iterator[None]:
+ """Run code and increment ingest statistics.
+
+ Parameters
+ ----------
+ n_datasets : `int`
+ Number of datasets being ingested.
+ log : `logging.Logger` or `None`
+ Logger to use for any timing information.
+ msg : `str` or `None`
+ Any message to be included in log output.
+ """
+ with self._timer(self.increment_ingest, n_datasets=n_datasets, log=log, msg=msg):
+ yield
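
A small sketch of the new `instrument_ingest` context manager added above: it times the enclosed block and adds the elapsed wall-clock time plus the supplied dataset count to the metrics. The dataset list here is a stand-in:

    from lsst.daf.butler import ButlerMetrics

    metrics = ButlerMetrics()
    datasets = ["a.fits", "b.fits"]  # stand-ins for the real FileDataset entries

    with metrics.instrument_ingest(len(datasets)):
        pass  # the actual ingest work would run here

    print(metrics.n_ingest)        # 2
    print(metrics.time_in_ingest)  # wall-clock seconds spent inside the block
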
lsst/daf/butler/_dataset_provenance.py
@@ -267,7 +267,7 @@ class DatasetProvenance(pydantic.BaseModel):
  use_upper : `bool` or `None`
  If `True` use upper case for provenance keys, if `False` use lower
  case, if `None` match the case of the prefix.
- keys : `tuple` of `str` | `int`
+ *keys : `tuple` of `str` | `int`
  Components of key to combine with prefix and separator.

  Returns
lsst/daf/butler/_dataset_ref.py
@@ -479,7 +479,7 @@ class DatasetRef:

  Parameters
  ----------
- simple : `dict` of [`str`, `Any`]
+ simple : `dict` of [`str`, `typing.Any`]
  The value returned by `to_simple()`.
  universe : `DimensionUniverse`
  The special graph of all known dimensions.
lsst/daf/butler/_exceptions.py
@@ -196,8 +196,8 @@ class ValidationError(RuntimeError):


  class EmptyQueryResultError(Exception):
- """Exception raised when query methods return an empty result and `explain`
- flag is set.
+ """Exception raised when query methods return an empty result and
+ ``explain`` flag is set.

  Parameters
  ----------
lsst/daf/butler/_file_dataset.py
@@ -129,7 +129,8 @@ class FileDataset:
  ----------
  dataset : `SerializedFileDataset`
  Object to deserialize.
- dataset_type_loader : `Callable` [[ `str` ], `DatasetType` ]
+ dataset_type_loader : `~collections.abc.Callable` \
+ [[ `str` ], `DatasetType` ]
  Function that takes a string dataset type name as its
  only parameter, and returns an instance of `DatasetType`.
  Used to deserialize the `DatasetRef` instances contained
lsst/daf/butler/_formatter.py
@@ -54,6 +54,7 @@ from ._config import Config
  from ._config_support import LookupKey, processLookupConfigs
  from ._file_descriptor import FileDescriptor
  from ._location import Location
+ from ._rubin.temporary_for_ingest import TemporaryForIngest
  from .dimensions import DataCoordinate, DimensionUniverse
  from .mapping_factory import MappingFactory

@@ -909,6 +910,10 @@ class FormatterV2:
  provenance : `DatasetProvenance` | `None`, optional
  Provenance to attach to the file being written.

+ Returns
+ -------
+ None
+
  Raises
  ------
  FormatterNotImplementedError
@@ -1031,13 +1036,7 @@ class FormatterV2:
  """
  cache_manager = self._ensure_cache(cache_manager)

- # Always write to a temporary even if
- # using a local file system -- that gives us atomic writes.
- # If a process is killed as the file is being written we do not
- # want it to remain in the correct place but in corrupt state.
- # For local files write to the output directory not temporary dir.
- prefix = uri.dirname() if uri.isLocal else None
- with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
+ with TemporaryForIngest.make_path(uri) as temporary_uri:
  # Need to configure the formatter to write to a different
  # location and that needs us to overwrite internals
  log.debug("Writing dataset to temporary location at %s", temporary_uri)
@@ -1142,6 +1141,10 @@ class FormatterV2:
  location : `Location`
  Location from which to extract a file extension.

+ Returns
+ -------
+ None
+
  Raises
  ------
  ValueError
@@ -1588,6 +1591,10 @@ class Formatter(metaclass=ABCMeta):
  location : `Location`
  Location from which to extract a file extension.

+ Returns
+ -------
+ None
+
  Raises
  ------
  NotImplementedError
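
The middle hunk above replaces the formatter's inline temporary-file logic with the new `TemporaryForIngest.make_path` helper (added later in this diff). A hedged sketch of the staging pattern that helper supports, using a throwaway directory; the file name and payload are placeholders:

    import tempfile

    from lsst.daf.butler._rubin.temporary_for_ingest import TemporaryForIngest
    from lsst.resources import ResourcePath

    with tempfile.TemporaryDirectory() as tmpdir:
        final = ResourcePath(tmpdir, forceDirectory=True).join("dataset.json")
        with TemporaryForIngest.make_path(final) as staging:
            staging.write(b"{}")  # stage the payload next to the target
            final.transfer_from(staging, transfer="move")  # then move it into place
        assert final.exists()
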
lsst/daf/butler/_labeled_butler_factory.py
@@ -30,7 +30,9 @@ from __future__ import annotations
  __all__ = ("LabeledButlerFactory", "LabeledButlerFactoryProtocol")

  from collections.abc import Mapping
- from typing import Protocol
+ from contextlib import AbstractContextManager
+ from logging import getLogger
+ from typing import Any, Literal, Protocol, Self

  from lsst.resources import ResourcePathExpression

@@ -40,6 +42,8 @@ from ._butler_repo_index import ButlerRepoIndex
  from ._utilities.named_locks import NamedLocks
  from ._utilities.thread_safe_cache import ThreadSafeCache

+ _LOG = getLogger(__name__)
+

  class LabeledButlerFactoryProtocol(Protocol):
  """Callable to retrieve a butler from a label."""
@@ -47,7 +51,7 @@ class LabeledButlerFactoryProtocol(Protocol):
  def __call__(self, label: str) -> Butler: ...


- class LabeledButlerFactory:
+ class LabeledButlerFactory(AbstractContextManager):
  """Factory for efficiently instantiating Butler instances from the
  repository index file. This is intended for use from long-lived services
  that want to instantiate a separate Butler instance for each end user
@@ -60,6 +64,9 @@ class LabeledButlerFactory:
  files. If not provided, defaults to the global repository index
  configured by the ``DAF_BUTLER_REPOSITORY_INDEX`` environment variable
  -- see `ButlerRepoIndex`.
+ writeable : `bool`, optional
+ If `True`, Butler instances created by this factory will be writeable.
+ If `False` (the default), instances will be read-only.

  Notes
  -----
@@ -76,11 +83,12 @@ class LabeledButlerFactory:
  safely be used by separate threads.
  """

- def __init__(self, repositories: Mapping[str, str] | None = None) -> None:
+ def __init__(self, repositories: Mapping[str, str] | None = None, writeable: bool = False) -> None:
  if repositories is None:
  self._repositories = None
  else:
  self._repositories = dict(repositories)
+ self._writeable = writeable

  self._factories = ThreadSafeCache[str, _ButlerFactory]()
  self._initialization_locks = NamedLocks()
@@ -88,6 +96,16 @@ class LabeledButlerFactory:
  # This may be overridden by unit tests.
  self._preload_unsafe_direct_butler_caches = True

+ def __enter__(self) -> Self:
+ return self
+
+ def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> Literal[False]:
+ try:
+ self.close()
+ except Exception:
+ _LOG.exception("An exception occurred during LabeledButlerFactory.close()")
+ return False
+
  def bind(self, access_token: str | None) -> LabeledButlerFactoryProtocol:
  """Create a callable factory function for generating Butler instances
  with out needing to specify access tokans again.
@@ -109,7 +127,7 @@ class LabeledButlerFactory:

  return create

- def create_butler(self, *, label: str, access_token: str | None) -> Butler:
+ def create_butler(self, label: str, *, access_token: str | None = None) -> Butler:
  """Create a Butler instance.

  Parameters
@@ -118,7 +136,7 @@ class LabeledButlerFactory:
  Label of the repository to instantiate, from the ``repositories``
  parameter to the `LabeledButlerFactory` constructor or the global
  repository index file.
- access_token : `str` | `None`
+ access_token : `str` | `None`, optional
  Gafaelfawr access token used to authenticate to a Butler server.
  This is required for any repositories configured to use
  `RemoteButler`. If you only use `DirectButler`, this may be
@@ -167,7 +185,9 @@

  match butler_type:
  case ButlerType.DIRECT:
- return _DirectButlerFactory(config, self._preload_unsafe_direct_butler_caches)
+ return _DirectButlerFactory(
+ config, self._preload_unsafe_direct_butler_caches, self._writeable
+ )
  case ButlerType.REMOTE:
  return _RemoteButlerFactory(config)
  case _:
@@ -189,12 +209,12 @@ class _ButlerFactory(Protocol):


  class _DirectButlerFactory(_ButlerFactory):
- def __init__(self, config: ButlerConfig, preload_unsafe_caches: bool) -> None:
+ def __init__(self, config: ButlerConfig, preload_unsafe_caches: bool, writeable: bool) -> None:
  import lsst.daf.butler.direct_butler

  # Create a 'template' Butler that will be cloned when callers request
  # an instance.
- self._butler = Butler.from_config(config)
+ self._butler = Butler.from_config(config, writeable=writeable)
  assert isinstance(self._butler, lsst.daf.butler.direct_butler.DirectButler)

  # Load caches so that data is available in cloned instances without
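
A short sketch of the factory changes above: construction with `writeable=True`, the new positional `label` argument to `create_butler`, and context-manager use so `close()` runs on exit. The repository label is a placeholder that would have to exist in the configured repository index, and the top-level import path is assumed:

    from lsst.daf.butler import LabeledButlerFactory

    with LabeledButlerFactory(writeable=True) as factory:
        butler = factory.create_butler("example_repo", access_token=None)
        ...  # use the writeable Butler clone here
    # factory.close() has been called automatically on exit.
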
lsst/daf/butler/_query_all_datasets.py
@@ -151,6 +151,8 @@ def _filter_collections_and_dataset_types(

  Parameters
  ----------
+ butler
+ Butler repository to use.
  collections
  List of collection names or collection search globs.
  dataset_type_query
lsst/daf/butler/_rubin/temporary_for_ingest.py
@@ -0,0 +1,207 @@
+ # This file is part of daf_butler.
+ #
+ # Developed for the LSST Data Management System.
+ # This product includes software developed by the LSST Project
+ # (http://www.lsst.org).
+ # See the COPYRIGHT file at the top-level directory of this distribution
+ # for details of code ownership.
+ #
+ # This software is dual licensed under the GNU General Public License and also
+ # under a 3-clause BSD license. Recipients may choose which of these licenses
+ # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+ # respectively. If you choose the GPL option then the following text applies
+ # (but note that there is still no warranty even if you opt for BSD instead):
+ #
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ from __future__ import annotations
+
+ __all__ = ("TemporaryForIngest",)
+
+ import dataclasses
+ import glob
+ from contextlib import contextmanager
+ from typing import TYPE_CHECKING, Self, cast
+
+ from lsst.resources import ResourcePath
+
+ if TYPE_CHECKING:
+ from collections.abc import Iterator
+ from types import TracebackType
+
+ from .._butler import Butler
+ from .._dataset_ref import DatasetRef
+ from .._file_dataset import FileDataset
+ from .._limited_butler import LimitedButler
+
+
+ @dataclasses.dataclass
+ class TemporaryForIngest:
+ """A context manager for generating temporary paths that will be ingested
+ as butler datasets.
+
+ Notes
+ -----
+ Neither this class nor its `make_path` method run ingest automatically when
+ their context manager is exited; the `ingest` method must always be called
+ explicitly.
+ """
+
+ butler: Butler
+ """Full butler to obtain a predicted path from and ingest into."""
+
+ ref: DatasetRef
+ """Description of the dataset to ingest."""
+
+ dataset: FileDataset = dataclasses.field(init=False)
+ """The dataset that will be passed to `Butler.ingest`."""
+
+ @property
+ def path(self) -> ResourcePath:
+ """The temporary path.
+
+ Guaranteed to be a local POSIX path.
+ """
+ return cast(ResourcePath, self.dataset.path)
+
+ @property
+ def ospath(self) -> str:
+ """The temporary path as a complete filename."""
+ return self.path.ospath
+
+ @classmethod
+ @contextmanager
+ def make_path(cls, final_path: ResourcePath) -> Iterator[ResourcePath]:
+ """Return a temporary path context manager given the predicted final
+ path.
+
+ Parameters
+ ----------
+ final_path : `lsst.resources.ResourcePath`
+ Predicted final path.
+
+ Returns
+ -------
+ context : `contextlib.AbstractContextManager`
+ A context manager that yields the temporary
+ `~lsst.resources.ResourcePath` when entered and deletes that file
+ when exited.
+ """
+ # Always write to a temporary even if using a local file system -- that
+ # gives us atomic writes. If a process is killed as the file is being
+ # written we do not want it to remain in the correct place but in
+ # corrupt state. For local files write to the output directory not
+ # temporary dir.
+ prefix = final_path.dirname() if final_path.isLocal else None
+ if prefix is not None:
+ prefix.mkdir()
+ with ResourcePath.temporary_uri(
+ suffix=cls._get_temporary_suffix(final_path), prefix=prefix
+ ) as temporary_path:
+ yield temporary_path
+
+ def ingest(self, record_validation_info: bool = True) -> None:
+ """Ingest the file into the butler.
+
+ Parameters
+ ----------
+ record_validation_info : `bool`, optional
+ Whether to- record the file size and checksum upon ingest.
+ """
+ self.butler.ingest(self.dataset, transfer="move", record_validation_info=record_validation_info)
+
+ def __enter__(self) -> Self:
+ from .._file_dataset import FileDataset
+
+ final_path = self.butler.getURI(self.ref, predict=True).replace(fragment="")
+ prefix = final_path.dirname() if final_path.isLocal else None
+ if prefix is not None:
+ prefix.mkdir()
+ self._temporary_path_context = self.make_path(final_path)
+ temporary_path = self._temporary_path_context.__enter__()
+ self.dataset = FileDataset(temporary_path, [self.ref], formatter=None)
+ return self
+
+ def __exit__(
+ self,
+ exc_type: type[BaseException] | None,
+ exc_value: BaseException | None,
+ traceback: TracebackType | None,
+ ) -> bool | None:
+ return self._temporary_path_context.__exit__(exc_type, exc_value, traceback)
+
+ @classmethod
+ def find_orphaned_temporaries_by_path(cls, final_path: ResourcePath) -> list[ResourcePath]:
+ """Search for temporary files that were not successfully ingested.
+
+ Parameters
+ ----------
+ final_path : `lsst.resources.ResourcePath`
+ Final path a successfully-ingested file would have.
+
+ Returns
+ -------
+ paths : `list` [ `lsst.resources.ResourcePath` ]
+ Files that look like temporaries that might have been created while
+ trying to write the target dataset.
+
+ Notes
+ -----
+ Orphaned files are only possible when a context manager is interrupted
+ by a hard error that prevents any cleanup code from running (e.g.
+ sudden loss of power).
+ """
+ if not final_path.isLocal:
+ # We return true tempfile for non-local predicted paths, so orphans
+ # are not our problem (the OS etc. will take care of them).
+ return []
+ return [
+ ResourcePath(filename)
+ for filename in glob.glob(
+ f"{glob.escape(final_path.dirname().ospath)}*{glob.escape(cls._get_temporary_suffix(final_path))}"
+ )
+ if filename != final_path.ospath
+ ]
+
+ @classmethod
+ def find_orphaned_temporaries_by_ref(cls, ref: DatasetRef, butler: LimitedButler) -> list[ResourcePath]:
+ """Search for temporary files that were not successfully ingested.
+
+ Parameters
+ ----------
+ ref : `..DatasetRef`
+ A dataset reference the temporaries correspond to.
+ butler : `lsst.daf.butler.LimitedButler`
+ Butler that can be used to obtain a predicted URI for a dataset.
+
+ Returns
+ -------
+ paths : `list` [ `lsst.resources.ResourcePath` ]
+ Files that look like temporaries that might have been created while
+ trying to write the target dataset.
+
+ Notes
+ -----
+ Orphaned files are only possible when a context manager is interrupted
+ by a hard error that prevents any cleanup code from running (e.g.
+ sudden loss of power).
+ """
+ final_path = butler.getURI(ref, predict=True).replace(fragment="")
+ return cls.find_orphaned_temporaries_by_path(final_path)
+
+ @staticmethod
+ def _get_temporary_suffix(path: ResourcePath) -> str:
+ ext = path.getExtension()
+ basename = path.basename().removesuffix(ext)
+ return f"{basename}.tmp{ext}"
lsst/daf/butler/cli/cmd/_remove_runs.py
@@ -114,18 +114,7 @@ def remove_runs(context: click.Context, confirm: bool, force: bool, **kwargs: An

  This command can be used to remove RUN collections and the datasets within
  them.
-
- Parameters
- ----------
- context : `click.Context`
- Context provided by Click.
- confirm : `bool`
- Confirmation for removal of the run.
- force : `bool`
- Force removal.
- **kwargs : `dict` [`str`, `str`]
- The parameters to pass to `~lsst.daf.butler.script.removeRuns`.
- """
+ """ # numpydoc ignore=PR01
  result = script.removeRuns(**kwargs)
  canRemoveRuns = len(result.runs)
  if not canRemoveRuns: