lsst-daf-butler 30.0.0rc2__py3-none-any.whl → 30.0.0rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/daf/butler/_butler.py +8 -5
- lsst/daf/butler/_butler_metrics.py +49 -2
- lsst/daf/butler/_formatter.py +2 -7
- lsst/daf/butler/_labeled_butler_factory.py +28 -8
- lsst/daf/butler/_rubin/temporary_for_ingest.py +207 -0
- lsst/daf/butler/configs/datastores/formatters.yaml +1 -0
- lsst/daf/butler/configs/storageClasses.yaml +15 -0
- lsst/daf/butler/datastore/record_data.py +1 -1
- lsst/daf/butler/datastores/fileDatastore.py +15 -12
- lsst/daf/butler/dimensions/_coordinate.py +5 -0
- lsst/daf/butler/direct_butler/_direct_butler.py +45 -28
- lsst/daf/butler/logging.py +9 -3
- lsst/daf/butler/registry/bridge/monolithic.py +17 -13
- lsst/daf/butler/registry/datasets/byDimensions/_manager.py +49 -45
- lsst/daf/butler/registry/expand_data_ids.py +93 -0
- lsst/daf/butler/registry/interfaces/_database.py +6 -1
- lsst/daf/butler/registry/sql_registry.py +2 -24
- lsst/daf/butler/remote_butler/_remote_butler.py +5 -1
- lsst/daf/butler/tests/hybrid_butler.py +4 -1
- lsst/daf/butler/tests/registry_data/lsstcam-subset.yaml +191 -0
- lsst/daf/butler/tests/testFormatters.py +2 -2
- lsst/daf/butler/transfers/_context.py +7 -6
- lsst/daf/butler/version.py +1 -1
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/METADATA +1 -1
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/RECORD +33 -30
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/WHEEL +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/entry_points.txt +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/licenses/LICENSE +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/top_level.txt +0 -0
- {lsst_daf_butler-30.0.0rc2.dist-info → lsst_daf_butler-30.0.0rc3.dist-info}/zip-safe +0 -0
lsst/daf/butler/_butler.py
CHANGED

@@ -1566,7 +1566,7 @@ class Butler(LimitedButler): # numpydoc ignore=PR02

     @abstractmethod
     def transfer_dimension_records_from(
-        self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
+        self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
     ) -> None:
         """Transfer dimension records to this Butler from another Butler.

@@ -1578,10 +1578,9 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
             `Butler` whose registry will be used to expand data IDs. If the
             source refs contain coordinates that are used to populate other
             records then this will also need to be a full `Butler`.
-        source_refs : iterable of `DatasetRef`
-            Datasets defined in the source butler whose dimension
-            should be transferred to this butler.
-            transfer is faster if the dataset refs are expanded.
+        source_refs : iterable of `DatasetRef` or `DataCoordinate`
+            Datasets or data IDs defined in the source butler whose dimension
+            records should be transferred to this butler.
         """
         raise NotImplementedError()

@@ -2227,3 +2226,7 @@ class Butler(LimitedButler): # numpydoc ignore=PR02
     @abstractmethod
     def close(self) -> None:
         raise NotImplementedError()
+
+    @abstractmethod
+    def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
+        raise NotImplementedError()
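
The widened signature means callers may now pass bare data IDs as well as dataset refs. A hedged usage sketch (not part of the diff; the repository paths and instrument name are assumptions):

    from lsst.daf.butler import Butler

    source = Butler.from_config("/path/to/source-repo")
    target = Butler.from_config("/path/to/target-repo", writeable=True)

    # Iterable[DatasetRef | DataCoordinate] is now accepted, so data IDs
    # queried from the source registry can be transferred directly.
    data_ids = source.registry.queryDataIds(["exposure"], instrument="LSSTCam")
    target.transfer_dimension_records_from(source, data_ids)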

lsst/daf/butler/_butler_metrics.py
CHANGED

@@ -29,12 +29,15 @@ from __future__ import annotations

 from collections.abc import Callable, Iterator
 from contextlib import contextmanager
+from typing import Concatenate, ParamSpec

 from pydantic import BaseModel

 from lsst.utils.logging import LsstLoggers
 from lsst.utils.timer import time_this

+P = ParamSpec("P")
+

 class ButlerMetrics(BaseModel):
     """Metrics collected during Butler operations."""

@@ -45,18 +48,26 @@ class ButlerMetrics(BaseModel):
     time_in_get: float = 0.0
     """Wall-clock time, in seconds, spent in get()."""

+    time_in_ingest: float = 0.0
+    """Wall-clock time, in seconds, spent in ingest()."""
+
     n_get: int = 0
     """Number of datasets retrieved with get()."""

     n_put: int = 0
     """Number of datasets stored with put()."""

+    n_ingest: int = 0
+    """Number of datasets ingested."""
+
     def reset(self) -> None:
         """Reset all metrics."""
         self.time_in_put = 0.0
         self.time_in_get = 0.0
+        self.time_in_ingest = 0.0
         self.n_get = 0
         self.n_put = 0
+        self.n_ingest = 0

     def increment_get(self, duration: float) -> None:
         """Increment time for get().

@@ -80,13 +91,31 @@ class ButlerMetrics(BaseModel):
         self.time_in_put += duration
         self.n_put += 1

+    def increment_ingest(self, duration: float, n_datasets: int) -> None:
+        """Increment time and datasets for ingest().
+
+        Parameters
+        ----------
+        duration : `float`
+            Duration to add to the ingest() statistics.
+        n_datasets : `int`
+            Number of datasets to be ingested for this call.
+        """
+        self.time_in_ingest += duration
+        self.n_ingest += n_datasets
+
     @contextmanager
     def _timer(
-        self,
+        self,
+        handler: Callable[Concatenate[float, P], None],
+        log: LsstLoggers | None = None,
+        msg: str | None = None,
+        *args: P.args,
+        **kwargs: P.kwargs,
     ) -> Iterator[None]:
         with time_this(log=log, msg=msg) as timer:
             yield
-        handler(timer.duration)
+        handler(timer.duration, *args, **kwargs)

     @contextmanager
     def instrument_get(self, log: LsstLoggers | None = None, msg: str | None = None) -> Iterator[None]:

@@ -115,3 +144,21 @@ class ButlerMetrics(BaseModel):
         """
         with self._timer(self.increment_put, log=log, msg=msg):
             yield
+
+    @contextmanager
+    def instrument_ingest(
+        self, n_datasets: int, log: LsstLoggers | None = None, msg: str | None = None
+    ) -> Iterator[None]:
+        """Run code and increment ingest statistics.
+
+        Parameters
+        ----------
+        n_datasets : `int`
+            Number of datasets being ingested.
+        log : `logging.Logger` or `None`
+            Logger to use for any timing information.
+        msg : `str` or `None`
+            Any message to be included in log output.
+        """
+        with self._timer(self.increment_ingest, n_datasets=n_datasets, log=log, msg=msg):
+            yield
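
A minimal sketch of how the new ingest counters might be exercised (illustrative only; the actual ingest call is elided):

    from lsst.daf.butler._butler_metrics import ButlerMetrics

    metrics = ButlerMetrics()
    with metrics.instrument_ingest(n_datasets=2, msg="ingesting raw files"):
        ...  # e.g. a Butler.ingest() call covering two FileDatasets

    # Both the wall-clock time and the dataset count are accumulated.
    print(metrics.time_in_ingest, metrics.n_ingest)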
lsst/daf/butler/_formatter.py
CHANGED

@@ -54,6 +54,7 @@ from ._config import Config
 from ._config_support import LookupKey, processLookupConfigs
 from ._file_descriptor import FileDescriptor
 from ._location import Location
+from ._rubin.temporary_for_ingest import TemporaryForIngest
 from .dimensions import DataCoordinate, DimensionUniverse
 from .mapping_factory import MappingFactory

@@ -1031,13 +1032,7 @@ class FormatterV2:
         """
         cache_manager = self._ensure_cache(cache_manager)

-        # Always write to a temporary even if
-        # using a local file system -- that gives us atomic writes.
-        # If a process is killed as the file is being written we do not
-        # want it to remain in the correct place but in corrupt state.
-        # For local files write to the output directory not temporary dir.
-        prefix = uri.dirname() if uri.isLocal else None
-        with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri:
+        with TemporaryForIngest.make_path(uri) as temporary_uri:
             # Need to configure the formatter to write to a different
             # location and that needs us to overwrite internals
             log.debug("Writing dataset to temporary location at %s", temporary_uri)
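
The removed inline temporary-file logic is now delegated to `TemporaryForIngest.make_path`. A hedged sketch of the atomic-write pattern it provides (the target path is hypothetical):

    from lsst.resources import ResourcePath

    from lsst.daf.butler._rubin.temporary_for_ingest import TemporaryForIngest

    target = ResourcePath("/repo/datastore/raw/raw_LSSTCam_123.fits")
    with TemporaryForIngest.make_path(target) as tmp:
        tmp.write(b"...")  # serialize the dataset to the sibling temporary file
        # Only a successful move makes the final path visible.
        target.transfer_from(tmp, transfer="move")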

lsst/daf/butler/_labeled_butler_factory.py
CHANGED

@@ -30,7 +30,9 @@ from __future__ import annotations
 __all__ = ("LabeledButlerFactory", "LabeledButlerFactoryProtocol")

 from collections.abc import Mapping
-from
+from contextlib import AbstractContextManager
+from logging import getLogger
+from typing import Any, Literal, Protocol, Self

 from lsst.resources import ResourcePathExpression

@@ -40,6 +42,8 @@ from ._butler_repo_index import ButlerRepoIndex
 from ._utilities.named_locks import NamedLocks
 from ._utilities.thread_safe_cache import ThreadSafeCache

+_LOG = getLogger(__name__)
+

 class LabeledButlerFactoryProtocol(Protocol):
     """Callable to retrieve a butler from a label."""

@@ -47,7 +51,7 @@ class LabeledButlerFactoryProtocol(Protocol):
     def __call__(self, label: str) -> Butler: ...


-class LabeledButlerFactory:
+class LabeledButlerFactory(AbstractContextManager):
     """Factory for efficiently instantiating Butler instances from the
     repository index file. This is intended for use from long-lived services
     that want to instantiate a separate Butler instance for each end user

@@ -60,6 +64,9 @@ class LabeledButlerFactory:
         files. If not provided, defaults to the global repository index
         configured by the ``DAF_BUTLER_REPOSITORY_INDEX`` environment variable
         -- see `ButlerRepoIndex`.
+    writeable : `bool`, optional
+        If `True`, Butler instances created by this factory will be writeable.
+        If `False` (the default), instances will be read-only.

     Notes
     -----

@@ -76,11 +83,12 @@ class LabeledButlerFactory:
    safely be used by separate threads.
    """

-    def __init__(self, repositories: Mapping[str, str] | None = None) -> None:
+    def __init__(self, repositories: Mapping[str, str] | None = None, writeable: bool = False) -> None:
         if repositories is None:
             self._repositories = None
         else:
             self._repositories = dict(repositories)
+        self._writeable = writeable

         self._factories = ThreadSafeCache[str, _ButlerFactory]()
         self._initialization_locks = NamedLocks()

@@ -88,6 +96,16 @@ class LabeledButlerFactory:
         # This may be overridden by unit tests.
         self._preload_unsafe_direct_butler_caches = True

+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> Literal[False]:
+        try:
+            self.close()
+        except Exception:
+            _LOG.exception("An exception occurred during LabeledButlerFactory.close()")
+        return False
+
     def bind(self, access_token: str | None) -> LabeledButlerFactoryProtocol:
         """Create a callable factory function for generating Butler instances
         without needing to specify access tokens again.

@@ -109,7 +127,7 @@ class LabeledButlerFactory:

         return create

-    def create_butler(self,
+    def create_butler(self, label: str, *, access_token: str | None = None) -> Butler:
         """Create a Butler instance.

         Parameters

@@ -118,7 +136,7 @@ class LabeledButlerFactory:
            Label of the repository to instantiate, from the ``repositories``
            parameter to the `LabeledButlerFactory` constructor or the global
            repository index file.
-        access_token : `str` | `None`
+        access_token : `str` | `None`, optional
            Gafaelfawr access token used to authenticate to a Butler server.
            This is required for any repositories configured to use
            `RemoteButler`. If you only use `DirectButler`, this may be

@@ -167,7 +185,9 @@ class LabeledButlerFactory:

         match butler_type:
             case ButlerType.DIRECT:
-                return _DirectButlerFactory(
+                return _DirectButlerFactory(
+                    config, self._preload_unsafe_direct_butler_caches, self._writeable
+                )
             case ButlerType.REMOTE:
                 return _RemoteButlerFactory(config)
             case _:

@@ -189,12 +209,12 @@ class _ButlerFactory(Protocol):


 class _DirectButlerFactory(_ButlerFactory):
-    def __init__(self, config: ButlerConfig, preload_unsafe_caches: bool) -> None:
+    def __init__(self, config: ButlerConfig, preload_unsafe_caches: bool, writeable: bool) -> None:
         import lsst.daf.butler.direct_butler

         # Create a 'template' Butler that will be cloned when callers request
         # an instance.
-        self._butler = Butler.from_config(config)
+        self._butler = Butler.from_config(config, writeable=writeable)
         assert isinstance(self._butler, lsst.daf.butler.direct_butler.DirectButler)

         # Load caches so that data is available in cloned instances without
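
A hedged sketch of the new context-manager and `writeable` support ("embargo" is a hypothetical repository label):

    from lsst.daf.butler._labeled_butler_factory import LabeledButlerFactory

    # close() now runs automatically when the factory is used as a context
    # manager; writeable=True propagates to the template DirectButler.
    with LabeledButlerFactory(writeable=True) as factory:
        butler = factory.create_butler("embargo", access_token=None)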

lsst/daf/butler/_rubin/temporary_for_ingest.py
ADDED

@@ -0,0 +1,207 @@
+# This file is part of daf_butler.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+__all__ = ("TemporaryForIngest",)
+
+import dataclasses
+import glob
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Self, cast
+
+from lsst.resources import ResourcePath
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+    from types import TracebackType
+
+    from .._butler import Butler
+    from .._dataset_ref import DatasetRef
+    from .._file_dataset import FileDataset
+    from .._limited_butler import LimitedButler
+
+
+@dataclasses.dataclass
+class TemporaryForIngest:
+    """A context manager for generating temporary paths that will be ingested
+    as butler datasets.
+
+    Notes
+    -----
+    Neither this class nor its `make_path` method run ingest automatically when
+    their context manager is exited; the `ingest` method must always be called
+    explicitly.
+    """
+
+    butler: Butler
+    """Full butler to obtain a predicted path from and ingest into."""
+
+    ref: DatasetRef
+    """Description of the dataset to ingest."""
+
+    dataset: FileDataset = dataclasses.field(init=False)
+    """The dataset that will be passed to `Butler.ingest`."""
+
+    @property
+    def path(self) -> ResourcePath:
+        """The temporary path.
+
+        Guaranteed to be a local POSIX path.
+        """
+        return cast(ResourcePath, self.dataset.path)
+
+    @property
+    def ospath(self) -> str:
+        """The temporary path as a complete filename."""
+        return self.path.ospath
+
+    @classmethod
+    @contextmanager
+    def make_path(cls, final_path: ResourcePath) -> Iterator[ResourcePath]:
+        """Return a temporary path context manager given the predicted final
+        path.
+
+        Parameters
+        ----------
+        final_path : `lsst.resources.ResourcePath`
+            Predicted final path.
+
+        Returns
+        -------
+        context : `contextlib.AbstractContextManager`
+            A context manager that yields the temporary
+            `~lsst.resources.ResourcePath` when entered and deletes that file
+            when exited.
+        """
+        # Always write to a temporary even if using a local file system -- that
+        # gives us atomic writes. If a process is killed as the file is being
+        # written we do not want it to remain in the correct place but in
+        # corrupt state. For local files write to the output directory not
+        # temporary dir.
+        prefix = final_path.dirname() if final_path.isLocal else None
+        if prefix is not None:
+            prefix.mkdir()
+        with ResourcePath.temporary_uri(
+            suffix=cls._get_temporary_suffix(final_path), prefix=prefix
+        ) as temporary_path:
+            yield temporary_path
+
+    def ingest(self, record_validation_info: bool = True) -> None:
+        """Ingest the file into the butler.
+
+        Parameters
+        ----------
+        record_validation_info : `bool`, optional
+            Whether to record the file size and checksum upon ingest.
+        """
+        self.butler.ingest(self.dataset, transfer="move", record_validation_info=record_validation_info)
+
+    def __enter__(self) -> Self:
+        from .._file_dataset import FileDataset
+
+        final_path = self.butler.getURI(self.ref, predict=True).replace(fragment="")
+        prefix = final_path.dirname() if final_path.isLocal else None
+        if prefix is not None:
+            prefix.mkdir()
+        self._temporary_path_context = self.make_path(final_path)
+        temporary_path = self._temporary_path_context.__enter__()
+        self.dataset = FileDataset(temporary_path, [self.ref], formatter=None)
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        traceback: TracebackType | None,
+    ) -> bool | None:
+        return self._temporary_path_context.__exit__(exc_type, exc_value, traceback)
+
+    @classmethod
+    def find_orphaned_temporaries_by_path(cls, final_path: ResourcePath) -> list[ResourcePath]:
+        """Search for temporary files that were not successfully ingested.
+
+        Parameters
+        ----------
+        final_path : `lsst.resources.ResourcePath`
+            Final path a successfully-ingested file would have.
+
+        Returns
+        -------
+        paths : `list` [ `lsst.resources.ResourcePath` ]
+            Files that look like temporaries that might have been created while
+            trying to write the target dataset.
+
+        Notes
+        -----
+        Orphaned files are only possible when a context manager is interrupted
+        by a hard error that prevents any cleanup code from running (e.g.
+        sudden loss of power).
+        """
+        if not final_path.isLocal:
+            # We return a true tempfile for non-local predicted paths, so
+            # orphans are not our problem (the OS etc. will take care of them).
+            return []
+        return [
+            ResourcePath(filename)
+            for filename in glob.glob(
+                f"{glob.escape(final_path.dirname().ospath)}*{glob.escape(cls._get_temporary_suffix(final_path))}"
+            )
+            if filename != final_path.ospath
+        ]
+
+    @classmethod
+    def find_orphaned_temporaries_by_ref(cls, ref: DatasetRef, butler: LimitedButler) -> list[ResourcePath]:
+        """Search for temporary files that were not successfully ingested.
+
+        Parameters
+        ----------
+        ref : `..DatasetRef`
+            A dataset reference the temporaries correspond to.
+        butler : `lsst.daf.butler.LimitedButler`
+            Butler that can be used to obtain a predicted URI for a dataset.
+
+        Returns
+        -------
+        paths : `list` [ `lsst.resources.ResourcePath` ]
+            Files that look like temporaries that might have been created while
+            trying to write the target dataset.
+
+        Notes
+        -----
+        Orphaned files are only possible when a context manager is interrupted
+        by a hard error that prevents any cleanup code from running (e.g.
+        sudden loss of power).
+        """
+        final_path = butler.getURI(ref, predict=True).replace(fragment="")
+        return cls.find_orphaned_temporaries_by_path(final_path)
+
+    @staticmethod
+    def _get_temporary_suffix(path: ResourcePath) -> str:
+        ext = path.getExtension()
+        basename = path.basename().removesuffix(ext)
+        return f"{basename}.tmp{ext}"
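
A usage sketch following the docstrings above (the `butler`, `ref`, and `payload` objects are assumed to exist; ingest must be called explicitly):

    from lsst.daf.butler._rubin.temporary_for_ingest import TemporaryForIngest

    with TemporaryForIngest(butler, ref) as tmp:
        with open(tmp.ospath, "wb") as stream:
            stream.write(payload)  # write the artifact to the local temporary path
        tmp.ingest(record_validation_info=True)  # move it into the datastore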

lsst/daf/butler/configs/datastores/formatters.yaml
CHANGED

@@ -100,3 +100,4 @@ VisitBackgroundModel: lsst.daf.butler.formatters.json.JsonFormatter
 VignettingCorrection: lsst.ts.observatory.control.utils.extras.vignetting_storage.VignettingCorrectionFormatter
 SSPAuxiliaryFile: lsst.pipe.tasks.sspAuxiliaryFile.SSPAuxiliaryFileFormatter
 VisitGeometry: lsst.daf.butler.formatters.json.JsonFormatter
+ProvenanceQuantumGraph: lsst.pipe.base.quantum_graph.formatter.ProvenanceFormatter

lsst/daf/butler/configs/storageClasses.yaml
CHANGED

@@ -443,3 +443,18 @@ storageClasses:
     pytype: lsst.pipe.tasks.sspAuxiliaryFile.SSPAuxiliaryFile
   VisitGeometry:
     pytype: lsst.obs.base.visit_geometry.VisitGeometry
+  ProvenanceQuantumGraph:
+    pytype: lsst.pipe.base.quantum_graph.ProvenanceQuantumGraph
+    parameters:
+      - import_mode  # lsst.pipe.base.pipeline_graph.TaskImportMode
+      - quanta  # iterable of uuid.UUID; quanta to read
+      - datasets  # iterable of uuid.UUID; datasets to read
+      - read_init_quanta  # bool, defaults to True; whether to read pre-exec-init info
+    derivedComponents:
+      packages: Packages  # ignores node parameters
+
+      # UUID keys can be quantum or data IDs (whichever is passed in via
+      # parameters). Nested lists are attempts to run the quantum (last is
+      # most recent).
+      logs: StructuredDataDict  # dict[uuid.UUID, list[ButlerLogRecords]]
+      metadata: StructuredDataDict  # dict[uuid.UUID, list[TaskMetadata]]
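
A hedged sketch of a parameterized read against the new storage class (the `butler` and `ref` objects and the parameter values are assumptions, not taken from the diff):

    import uuid

    # Only the named quanta are loaded, and pre-exec-init information is skipped.
    graph = butler.get(ref, parameters={"quanta": [uuid.UUID(int=0)], "read_init_quanta": False})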

lsst/daf/butler/datastore/record_data.py
CHANGED

@@ -49,7 +49,7 @@ if TYPE_CHECKING:
 # Pydantic requires the possible value types to be explicitly enumerated in
 # order for `uuid.UUID` in particular to work. `typing.Any` does not work
 # here.
-_Record: TypeAlias = dict[str, int | str |
+_Record: TypeAlias = dict[str, int | str | None]


 class SerializedDatastoreRecordData(pydantic.BaseModel):

lsst/daf/butler/datastores/fileDatastore.py
CHANGED

@@ -1068,9 +1068,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
         # Work out the name we want this ingested file to have
         # inside the datastore
         tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter)
-        if not tgtLocation.uri.dirname().exists():
-            log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
-            tgtLocation.uri.dirname().mkdir()

         # if we are transferring from a local file to a remote location
         # it may be more efficient to get the size and checksum of the

@@ -1311,12 +1308,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
                 f"and storage class type ({required_pytype})"
             )

-        uri = location.uri
-
-        if not uri.dirname().exists():
-            log.debug("Folder %s does not exist yet so creating it.", uri.dirname())
-            uri.dirname().mkdir()
-
         if self._transaction is None:
             raise RuntimeError("Attempting to write artifact without transaction enabled")

@@ -1332,6 +1323,7 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):

         # Register a callback to try to delete the uploaded data if
         # something fails below
+        uri = location.uri
         self._transaction.registerUndo("artifactWrite", _removeFileExists, uri)

         # Need to record the specified formatter but if this is a V1 formatter

@@ -2220,9 +2212,6 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
         else:
             # Name the zip file based on index contents.
             tgtLocation = self.locationFactory.fromPath(index.calculate_zip_file_path_in_store())
-            if not tgtLocation.uri.dirname().exists():
-                log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname())
-                tgtLocation.uri.dirname().mkdir()

         # Transfer the Zip file into the datastore.
         if not dry_run:

@@ -3177,6 +3166,20 @@ class FileDatastore(GenericBaseDatastore[StoredFileInfo]):

     def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]:
         # Docstring inherited from the base class.
+
+        # This call to 'bridge.check' filters out "partially deleted" datasets.
+        # Specifically, ones in the unusual edge state that:
+        # 1. They have an entry in the registry dataset tables
+        # 2. They were "trashed" from the datastore, so they are not
+        #    present in the "dataset_location" table.
+        # 3. But the trash has not been "emptied", so there are still entries
+        #    in the "opaque" datastore records table.
+        #
+        # As far as I can tell, this can only occur in the case of a concurrent
+        # or aborted call to `Butler.pruneDatasets(unstore=True, purge=False)`.
+        # Datasets (with or without files existing on disk) can persist in
+        # this zombie state indefinitely, until someone manually empties
+        # the trash.
         exported_refs = list(self._bridge.check(refs))
         ids = {ref.id for ref in exported_refs}
         records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids}

lsst/daf/butler/dimensions/_coordinate.py
CHANGED

@@ -755,6 +755,11 @@ class DataCoordinate:
     to_json = to_json_pydantic
     from_json: ClassVar[Callable[..., Self]] = cast(Callable[..., Self], classmethod(from_json_pydantic))

+    @property
+    def dataId(self) -> Self:
+        """Return this `DataCoordinate` instance, unmodified."""
+        return self
+

 DataId = DataCoordinate | Mapping[str, Any]
 """A type-annotation alias for signatures that accept both informal data ID