lsst-pipe-base 29.2025.4100__py3-none-any.whl → 29.2025.4300__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- lsst/pipe/base/_status.py +1 -1
- lsst/pipe/base/cli/cmd/__init__.py +2 -2
- lsst/pipe/base/cli/cmd/commands.py +116 -1
- lsst/pipe/base/graph_walker.py +8 -4
- lsst/pipe/base/pipeline_graph/_pipeline_graph.py +30 -5
- lsst/pipe/base/quantum_graph/__init__.py +1 -0
- lsst/pipe/base/quantum_graph/_common.py +2 -1
- lsst/pipe/base/quantum_graph/_multiblock.py +41 -7
- lsst/pipe/base/quantum_graph/_predicted.py +62 -5
- lsst/pipe/base/quantum_graph/_provenance.py +1209 -0
- lsst/pipe/base/quantum_graph/aggregator/__init__.py +143 -0
- lsst/pipe/base/quantum_graph/aggregator/_communicators.py +981 -0
- lsst/pipe/base/quantum_graph/aggregator/_config.py +139 -0
- lsst/pipe/base/quantum_graph/aggregator/_ingester.py +312 -0
- lsst/pipe/base/quantum_graph/aggregator/_progress.py +208 -0
- lsst/pipe/base/quantum_graph/aggregator/_scanner.py +371 -0
- lsst/pipe/base/quantum_graph/aggregator/_structs.py +167 -0
- lsst/pipe/base/quantum_graph/aggregator/_supervisor.py +225 -0
- lsst/pipe/base/quantum_graph/aggregator/_writer.py +593 -0
- lsst/pipe/base/resource_usage.py +183 -0
- lsst/pipe/base/simple_pipeline_executor.py +4 -1
- lsst/pipe/base/tests/util.py +31 -0
- lsst/pipe/base/version.py +1 -1
- {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/METADATA +1 -1
- {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/RECORD +33 -22
- {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/WHEEL +0 -0
- {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/entry_points.txt +0 -0
- {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/LICENSE +0 -0
- {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/bsd_license.txt +0 -0
- {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/gpl-v3.0.txt +0 -0
- {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/top_level.txt +0 -0
- {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/zip-safe +0 -0
lsst/pipe/base/quantum_graph/aggregator/_config.py
@@ -0,0 +1,139 @@
+# This file is part of pipe_base.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+__all__ = ("AggregatorConfig",)
+
+
+import pydantic
+
+
+class AggregatorConfig(pydantic.BaseModel):
+    """Configuration for the provenance aggregator."""
+
+    output_path: str | None = None
+    """Path for the output provenance quantum graph file.
+
+    At present this option is intended only for debugging.
+    """
+
+    worker_log_dir: str | None = None
+    """Path to a directory (POSIX only) for parallel worker logs."""
+
+    worker_log_level: str = "VERBOSE"
+    """Log level for worker processes/threads.
+
+    Per-quantum messages only appear at ``DEBUG`` level.
+    """
+
+    worker_profile_dir: str | None = None
+    """Path to a directory (POSIX only) for parallel worker profiling dumps.
+
+    This option is ignored when `n_processes` is `1`.
+    """
+
+    n_processes: int = 1
+    """Number of processes the scanner should use."""
+
+    assume_complete: bool = True
+    """If `True`, the aggregator can assume all quanta have run to completion
+    (including any automatic retries). If `False`, only successes can be
+    considered final, and quanta that appear to have failed or to have not been
+    executed are ignored.
+    """
+
+    defensive_ingest: bool = False
+    """If `True`, guard against datasets having already been ingested into the
+    central butler repository.
+
+    Defensive ingest mode is automatically turned on (with a warning emitted)
+    if an ingest attempt fails due to a database constraint violation. Enabling
+    defensive mode up-front avoids this warning and is slightly more efficient
+    when it is already known that some datasets have already been ingested.
+
+    Defensive mode does not guard against race conditions from multiple ingest
+    processes running simultaneously, as it relies on a one-time query to
+    determine what is already present in the central repository.
+    """
+
+    ingest_batch_size: int = 10000
+    """Number of butler datasets that must accumulate to trigger an ingest."""
+
+    register_dataset_types: bool = True
+    """Whether to register output dataset types in the central butler
+    repository before starting ingest.
+    """
+
+    update_output_chain: bool = True
+    """Whether to prepend the output `~lsst.daf.butler.CollectionType.RUN` to
+    the output `~lsst.daf.butler.CollectionType.CHAINED` collection.
+    """
+
+    dry_run: bool = False
+    """If `True`, do not actually perform any deletions or central butler
+    ingests.
+
+    Most log messages concerning deletions and ingests will still be emitted in
+    order to provide a better emulation of a real run.
+    """
+
+    interactive_status: bool = False
+    """Whether to use an interactive status display with progress bars.
+
+    If this is `True`, the `tqdm` module must be available. If this is
+    `False`, a periodic logger will be used to display status at a fixed
+    interval instead (see `log_status_interval`).
+    """
+
+    log_status_interval: float | None = None
+    """Interval (in seconds) between periodic logger status updates."""
+
+    worker_sleep: float = 0.01
+    """Time (in seconds) a worker should wait when there are no requests from
+    the main aggregator process.
+    """
+
+    zstd_level: int = 10
+    """ZStandard compression level to use for all compressed-JSON blocks."""
+
+    zstd_dict_size: int = 32768
+    """Size (in bytes) of the ZStandard compression dictionary."""
+
+    zstd_dict_n_inputs: int = 512
+    """Number of samples of each type (see below) to include in ZStandard
+    compression dictionary training.
+
+    Training is run on a random subset of the `PredictedQuantumDatasetsModel`
+    objects in the predicted graph, as well as the first provenance quanta,
+    logs, and metadata blocks encountered.
+    """
+
+    mock_storage_classes: bool = False
+    """Enable support for storage classes created by the
+    lsst.pipe.base.tests.mocks package.
+    """
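Since `AggregatorConfig` is a plain `pydantic.BaseModel`, it can be constructed directly with keyword overrides and validates its inputs on construction. A minimal sketch; the import path is taken from the new module above, and whether the name is also re-exported from the `aggregator` package is an assumption not confirmed by this diff:

```python
import pydantic

# Assumed import location (the private module added in this diff).
from lsst.pipe.base.quantum_graph.aggregator._config import AggregatorConfig

# Unset fields keep the defaults declared on the model.
config = AggregatorConfig(n_processes=8, defensive_ingest=True, log_status_interval=30.0)
assert config.ingest_batch_size == 10000

# pydantic rejects values it cannot coerce to the declared field type.
try:
    AggregatorConfig(n_processes="not a number")
except pydantic.ValidationError as err:
    print(err)
```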
lsst/pipe/base/quantum_graph/aggregator/_ingester.py
@@ -0,0 +1,312 @@
+# This file is part of pipe_base.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+__all__ = ("Ingester",)
+
+import dataclasses
+import logging
+import time
+import uuid
+from collections import defaultdict
+
+from lsst.daf.butler import Butler, CollectionType, DatasetRef, DimensionGroup
+from lsst.daf.butler.datastore.record_data import DatastoreRecordData
+from lsst.daf.butler.registry import ConflictingDefinitionError
+
+from ...pipeline_graph import TaskImportMode
+from .._common import DatastoreName
+from .._predicted import PredictedDatasetModel, PredictedQuantumGraphComponents, PredictedQuantumGraphReader
+from ._communicators import IngesterCommunicator
+
+
+@dataclasses.dataclass
+class Ingester:
+    """A helper class for the provenance aggregator that handles ingestion
+    into the central butler repository.
+    """
+
+    predicted_path: str
+    """Path to the predicted quantum graph."""
+
+    butler_path: str
+    """Path or alias to the central butler repository."""
+
+    comms: IngesterCommunicator
+    """Communicator object for this worker."""
+
+    predicted: PredictedQuantumGraphComponents = dataclasses.field(init=False)
+    """Components of the predicted graph."""
+
+    butler: Butler = dataclasses.field(init=False)
+    """Client for the central butler repository."""
+
+    n_datasets_ingested: int = 0
+    """Total number of datasets ingested by this invocation."""
+
+    n_datasets_skipped: int = 0
+    """Total number of datasets skipped because they were already present."""
+
+    n_producers_pending: int = 0
+    """Number of quanta whose outputs are currently pending ingest."""
+
+    refs_pending: defaultdict[DimensionGroup, list[DatasetRef]] = dataclasses.field(
+        default_factory=lambda: defaultdict(list)
+    )
+    """Dataset references pending ingest, grouped by their dimensions."""
+
+    records_pending: dict[DatastoreName, DatastoreRecordData] = dataclasses.field(default_factory=dict)
+    """Datastore records pending ingest, grouped by datastore name."""
+
+    already_ingested: set[uuid.UUID] | None = None
+    """A set of all dataset IDs already present in the output RUN
+    collection.
+
+    If this is not `None`, the ingester is in defensive ingest mode, either
+    because it was configured to query for these dataset IDs up front, or
+    because a transaction failed due to a dataset already being present.
+    """
+
+    last_ingest_time: float = dataclasses.field(default_factory=time.time)
+    """POSIX timestamp at which the last ingest transaction concluded."""
+
+    def __post_init__(self) -> None:
+        self.comms.log.verbose("Reading from predicted quantum graph.")
+        with PredictedQuantumGraphReader.open(
+            self.predicted_path, import_mode=TaskImportMode.DO_NOT_IMPORT
+        ) as reader:
+            # We only need the header and pipeline graph.
+            self.predicted = reader.components
+        if self.comms.config.mock_storage_classes:
+            import lsst.pipe.base.tests.mocks  # noqa: F401
+        self.comms.log.verbose("Initializing butler.")
+        self.butler = Butler.from_config(self.butler_path, writeable=not self.comms.config.dry_run)
+
+    @property
+    def n_datasets_pending(self) -> int:
+        """The number of butler datasets currently pending."""
+        return sum(len(v) for v in self.refs_pending.values())
+
+    @staticmethod
+    def run(predicted_path: str, butler_path: str, comms: IngesterCommunicator) -> None:
+        """Run the ingester.
+
+        Parameters
+        ----------
+        predicted_path : `str`
+            Path to the predicted quantum graph.
+        butler_path : `str`
+            Path or alias to the central butler repository.
+        comms : `IngesterCommunicator`
+            Communicator for the ingester.
+
+        Notes
+        -----
+        This method is designed to run as the ``target`` in
+        `WorkerContext.make_worker`.
+        """
+        with comms:
+            ingester = Ingester(predicted_path, butler_path, comms)
+            ingester.loop()
+
+    def loop(self) -> None:
+        """Run the main loop for the ingester."""
+        self.comms.log.verbose("Registering collections and dataset types.")
+        if not self.comms.config.dry_run:
+            if self.comms.config.register_dataset_types:
+                self.predicted.pipeline_graph.register_dataset_types(
+                    self.butler,
+                    include_inputs=False,
+                    include_packages=True,
+                    include_configs=True,
+                    include_logs=True,
+                )
+            self.butler.collections.register(self.predicted.header.output_run)
+            # Updating the output chain cannot happen inside the caching
+            # context.
+            if self.comms.config.update_output_chain:
+                self.update_output_chain()
+        with self.butler.registry.caching_context():
+            if self.comms.config.defensive_ingest:
+                self.fetch_already_ingested()
+            self.comms.log.info("Startup completed in %ss.", time.time() - self.last_ingest_time)
+            self.last_ingest_time = time.time()
+            for ingest_request in self.comms.poll():
+                self.n_producers_pending += 1
+                self.comms.log.debug(f"Got ingest request for producer {ingest_request.producer_id}.")
+                self.update_pending(ingest_request.datasets, ingest_request.records)
+                if self.n_datasets_pending > self.comms.config.ingest_batch_size:
+                    self.ingest()
+            self.comms.log.info("All ingest requests received.")
+            # We use 'while' in case this fails with a conflict and we switch
+            # to defensive mode (should be at most two iterations).
+            ingest_start_time = time.time()
+            while self.n_datasets_pending:
+                n_datasets = self.n_datasets_pending
+                self.ingest()
+                self.comms.log.verbose(
+                    "Gathered %d final datasets in %ss and ingested them in %ss.",
+                    n_datasets,
+                    ingest_start_time - self.last_ingest_time,
+                    time.time() - ingest_start_time,
+                )
+            if self.n_producers_pending:
+                # We can finish with returns pending if we filtered out all of
+                # the datasets we started with as already existing.
+                self.report()
+            self.comms.log_progress(
+                logging.INFO,
+                f"Ingested {self.n_datasets_ingested} dataset(s); "
+                f"skipped {self.n_datasets_skipped} already present.",
+            )
+
+    def ingest(self) -> None:
+        """Ingest all pending datasets and report success to the supervisor."""
+        ingest_start_time = time.time()
+        self.comms.log.verbose(
+            "Gathered %d datasets from %d quanta in %ss.",
+            self.n_datasets_pending,
+            self.n_producers_pending,
+            ingest_start_time - self.last_ingest_time,
+        )
+        try:
+            if not self.comms.config.dry_run:
+                with self.butler.registry.transaction():
+                    for refs in self.refs_pending.values():
+                        self.butler.registry._importDatasets(refs, expand=False, assume_new=True)
+                    self.butler._datastore.import_records(self.records_pending)
+            self.last_ingest_time = time.time()
+            self.comms.log.verbose(
+                "Ingested %d datasets from %d quanta in %ss.",
+                self.n_datasets_pending,
+                self.n_producers_pending,
+                self.last_ingest_time - ingest_start_time,
+            )
+            self.n_datasets_ingested += self.n_datasets_pending
+        except ConflictingDefinitionError:
+            if self.already_ingested is None:
+                self.comms.log_progress(
+                    logging.INFO,
+                    "Some outputs seem to have already been ingested; querying for existing datasets and "
+                    "switching to defensive ingest mode.",
+                )
+                self.fetch_already_ingested()
+                # We just return instead of trying again immediately because we
+                # might have just shrunk the number of pending datasets below
+                # the batch threshold.
+                return
+            else:
+                raise
+        self.report()
+        self.records_pending.clear()
+        self.refs_pending.clear()
+
+    def report(self) -> None:
+        """Report a successful ingest to the supervisor."""
+        self.comms.report_ingest(self.n_producers_pending)
+        self.n_producers_pending = 0
+
+    def fetch_already_ingested(self) -> None:
+        """Query for the UUIDs of all datasets already present in the output
+        RUN collection, and filter any pending datasets accordingly.
+        """
+        self.comms.log.info("Fetching all UUIDs in output collection %r.", self.predicted.header.output_run)
+        self.already_ingested = set(
+            self.butler.registry._fetch_run_dataset_ids(self.predicted.header.output_run)
+        )
+        kept: set[uuid.UUID] = set()
+        for dimensions, refs in self.refs_pending.items():
+            filtered_refs: list[DatasetRef] = []
+            for ref in refs:
+                if ref.id not in self.already_ingested:
+                    kept.add(ref.id)
+                    filtered_refs.append(ref)
+                else:
+                    self.n_datasets_skipped += 1
+            self.refs_pending[dimensions] = filtered_refs
+        for datastore_name, datastore_records in list(self.records_pending.items()):
+            if (filtered_records := datastore_records.subset(kept)) is not None:
+                self.records_pending[datastore_name] = filtered_records
+            else:
+                del self.records_pending[datastore_name]
+
+    def update_pending(
+        self, datasets: list[PredictedDatasetModel], records: dict[DatastoreName, DatastoreRecordData]
+    ) -> None:
+        """Add an ingest request to the pending-ingest data structures.
+
+        Parameters
+        ----------
+        datasets : `list` [ `PredictedDatasetModel` ]
+            Registry information about the datasets.
+        records : `dict` [ `str`, \
+                `lsst.daf.butler.datastore.record_data.DatastoreRecordData` ]
+            Datastore information about the datasets.
+        """
+        n_given = len(datasets)
+        if self.already_ingested is not None:
+            datasets = [d for d in datasets if d.dataset_id not in self.already_ingested]
+            kept = {d.dataset_id for d in datasets}
+            self.n_datasets_skipped += n_given - len(kept)
+            records = {
+                datastore_name: filtered_records
+                for datastore_name, original_records in records.items()
+                if (filtered_records := original_records.subset(kept)) is not None
+            }
+        for dataset in datasets:
+            ref = self.predicted.make_dataset_ref(dataset)
+            self.refs_pending[ref.datasetType.dimensions].append(ref)
+        for datastore_name, datastore_records in records.items():
+            if (existing_records := self.records_pending.get(datastore_name)) is not None:
+                existing_records.update(datastore_records)
+            else:
+                self.records_pending[datastore_name] = datastore_records
+
+    def update_output_chain(self) -> None:
+        """Update the output CHAINED collection to include the output RUN
+        collection (and the inputs, if the output CHAINED collection does not
+        exist).
+
+        Notes
+        -----
+        This method cannot be called inside the registry caching context.
+        """
+        if self.predicted.header.output is None:
+            return
+        self.comms.log.info(
+            "Updating output collection %s to include %s.",
+            self.predicted.header.output,
+            self.predicted.header.output_run,
+        )
+        if self.butler.collections.register(self.predicted.header.output, CollectionType.CHAINED):
+            # Chain is new; need to add inputs, but we want to flatten them
+            # first.
+            if self.predicted.header.inputs:
+                flattened = self.butler.collections.query(self.predicted.header.inputs, flatten_chains=True)
+                self.butler.collections.extend_chain(self.predicted.header.output, flattened)
+        self.butler.collections.prepend_chain(self.predicted.header.output, self.predicted.header.output_run)
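`Ingester.loop` and `Ingester.ingest` implement an accumulate-and-flush batching pattern: requests accumulate in `refs_pending` until `ingest_batch_size` is exceeded, everything pending is then committed in one transaction, and a final drain loop runs after the request stream ends. A stripped-down, butler-free sketch of that control flow, with the queue, threshold, and flush target all as illustrative stand-ins:

```python
# Standalone sketch of the accumulate-and-flush pattern from Ingester.loop;
# 'requests' stands in for comms.poll(), 'flush' for the butler transaction.
from collections import defaultdict


def drain(requests, batch_size=3):
    pending = defaultdict(list)  # group -> items, like refs_pending

    def n_pending():
        return sum(len(v) for v in pending.values())

    def flush():
        print(f"flushing {n_pending()} item(s)")
        pending.clear()

    for group, item in requests:
        pending[group].append(item)
        if n_pending() > batch_size:  # strict '>' matches the loop above
            flush()
    while n_pending():  # final drain, mirroring the trailing while loop
        flush()


drain([("a", 1), ("a", 2), ("b", 3), ("b", 4), ("a", 5)])
```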
lsst/pipe/base/quantum_graph/aggregator/_progress.py
@@ -0,0 +1,208 @@
+# This file is part of pipe_base.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+__all__ = ("Progress", "make_worker_log")
+
+import logging
+import os
+import time
+from types import TracebackType
+from typing import Self
+
+from lsst.utils.logging import TRACE, VERBOSE, LsstLogAdapter, PeriodicLogger, getLogger
+
+from ._config import AggregatorConfig
+
+
+class Progress:
+    """A helper class for the provenance aggregator that handles reporting
+    progress to the user.
+
+    This includes both logging (including periodic logging) and optional
+    progress bars.
+
+    Parameters
+    ----------
+    log : `lsst.utils.logging.LsstLogAdapter`
+        LSST-customized logger.
+    config : `AggregatorConfig`
+        Configuration for the aggregator.
+
+    Notes
+    -----
+    This class is a context manager in order to manage the redirection of
+    logging when progress bars for interactive display are in use. The context
+    manager does nothing otherwise.
+    """
+
+    def __init__(self, log: LsstLogAdapter, config: AggregatorConfig):
+        self.start = time.time()
+        self.log = log
+        self.config = config
+        self._periodic_log = PeriodicLogger(self.log, config.log_status_interval)
+        self._n_scanned: int = 0
+        self._n_ingested: int = 0
+        self._n_written: int = 0
+        self._n_quanta: int | None = None
+        self.interactive = config.interactive_status
+
+    def __enter__(self) -> Self:
+        if self.interactive:
+            from tqdm.contrib.logging import logging_redirect_tqdm
+
+            self._logging_redirect = logging_redirect_tqdm()
+            self._logging_redirect.__enter__()
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        traceback: TracebackType | None,
+    ) -> bool | None:
+        if self.interactive:
+            self._logging_redirect.__exit__(exc_type, exc_value, traceback)
+        return None
+
+    def set_n_quanta(self, n_quanta: int) -> None:
+        """Set the total number of quanta.
+
+        Parameters
+        ----------
+        n_quanta : `int`
+            Total number of quanta, including special "init" quanta.
+
+        Notes
+        -----
+        This method must be called before any of the ``report_*`` methods.
+        """
+        self._n_quanta = n_quanta
+        if self.interactive:
+            from tqdm import tqdm
+
+            self._scan_progress = tqdm(desc="Scanning", total=n_quanta, leave=False, unit="quanta")
+            self._ingest_progress = tqdm(
+                desc="Ingesting", total=n_quanta, leave=False, smoothing=0.1, unit="quanta"
+            )
+            if self.config.output_path is not None:
+                self._write_progress = tqdm(desc="Writing", total=n_quanta, leave=False, unit="quanta")
+
+    @property
+    def elapsed_time(self) -> float:
+        """The time in seconds since the start of the aggregator."""
+        return time.time() - self.start
+
+    def _log_status(self) -> None:
+        """Invoke the periodic logger with the current status."""
+        self._periodic_log.log(
+            "%s quanta scanned, %s quantum outputs ingested, "
+            "%s provenance quanta written (of %s) after %0.1fs.",
+            self._n_scanned,
+            self._n_ingested,
+            self._n_written,
+            self._n_quanta,
+            self.elapsed_time,
+        )
+
+    def report_scan(self) -> None:
+        """Report that a quantum was scanned."""
+        self._n_scanned += 1
+        if self.interactive:
+            self._scan_progress.update(1)
+        else:
+            self._log_status()
+
+    def finish_scans(self) -> None:
+        """Report that all scanning is done."""
+        if self.interactive:
+            self._scan_progress.close()
+
+    def report_ingests(self, n_quanta: int) -> None:
+        """Report that ingests for multiple quanta were completed.
+
+        Parameters
+        ----------
+        n_quanta : `int`
+            Number of quanta whose outputs were ingested.
+        """
+        self._n_ingested += n_quanta
+        if self.interactive:
+            self._ingest_progress.update(n_quanta)
+        else:
+            self._log_status()
+
+    def finish_ingests(self) -> None:
+        """Report that all ingests are done."""
+        if self.interactive:
+            self._ingest_progress.close()
+
+    def report_write(self) -> None:
+        """Report that a quantum's provenance was written."""
+        self._n_written += 1
+        if self.interactive:
+            self._write_progress.update()
+        else:
+            self._log_status()
+
+    def finish_writes(self) -> None:
+        """Report that all writes are done."""
+        if self.interactive:
+            self._write_progress.close()
+
+
+def make_worker_log(name: str, config: AggregatorConfig) -> LsstLogAdapter:
+    """Make a logger for a worker.
+
+    Parameters
+    ----------
+    name : `str`
+        Name of the worker, to be used as part of the name for the logger.
+    config : `AggregatorConfig`
+        Configuration for the aggregator.
+    """
+    base_log = logging.getLogger(f"lsst.pipe.base.quantum_graph.aggregator.{name}")
+    base_log.propagate = False
+    log = getLogger(logger=base_log)
+    if config.worker_log_dir is not None:
+        os.makedirs(config.worker_log_dir, exist_ok=True)
+        match config.worker_log_level.upper():
+            case "VERBOSE":
+                log.setLevel(VERBOSE)
+            case "TRACE":
+                log.setLevel(TRACE)
+            case std:
+                log.setLevel(getattr(logging, std))
+        handler = logging.FileHandler(os.path.join(config.worker_log_dir, f"{name}.log"))
+        handler.setFormatter(
+            logging.Formatter("%(levelname)s %(asctime)s.%(msecs)03d %(message)s", "%Y-%m-%dT%H:%M:%S")
+        )
+        log.addHandler(handler)
+    else:
+        log.addHandler(logging.NullHandler())
+    return log
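A hypothetical driver showing how the two pieces compose: `make_worker_log` builds an isolated, non-propagating logger (file-backed only when `worker_log_dir` is set), while `Progress` accepts any `LsstLogAdapter` as the target for its periodic status messages. Everything beyond the imported names is invented for illustration:

```python
import logging

from lsst.utils.logging import getLogger

# Assumed import locations (the private modules added in this diff).
from lsst.pipe.base.quantum_graph.aggregator._config import AggregatorConfig
from lsst.pipe.base.quantum_graph.aggregator._progress import Progress, make_worker_log

logging.basicConfig(level=logging.INFO)
config = AggregatorConfig(interactive_status=False, log_status_interval=5.0)

# Per-worker logger: with worker_log_dir unset it gets a NullHandler, so this
# message is discarded; set worker_log_dir to get per-worker log files.
worker_log = make_worker_log("example-worker", config)
worker_log.info("only visible when config.worker_log_dir is set")

# Progress with interactive_status=False falls back to the periodic logger.
with Progress(getLogger("aggregator.demo"), config) as progress:
    progress.set_n_quanta(100)  # must precede any report_* call
    for _ in range(100):
        progress.report_scan()  # PeriodicLogger throttles to one message per interval
    progress.finish_scans()
    progress.report_ingests(100)
    progress.finish_ingests()
```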