lsst-pipe-base 29.2025.4100__py3-none-any.whl → 29.2025.4300__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. lsst/pipe/base/_status.py +1 -1
  2. lsst/pipe/base/cli/cmd/__init__.py +2 -2
  3. lsst/pipe/base/cli/cmd/commands.py +116 -1
  4. lsst/pipe/base/graph_walker.py +8 -4
  5. lsst/pipe/base/pipeline_graph/_pipeline_graph.py +30 -5
  6. lsst/pipe/base/quantum_graph/__init__.py +1 -0
  7. lsst/pipe/base/quantum_graph/_common.py +2 -1
  8. lsst/pipe/base/quantum_graph/_multiblock.py +41 -7
  9. lsst/pipe/base/quantum_graph/_predicted.py +62 -5
  10. lsst/pipe/base/quantum_graph/_provenance.py +1209 -0
  11. lsst/pipe/base/quantum_graph/aggregator/__init__.py +143 -0
  12. lsst/pipe/base/quantum_graph/aggregator/_communicators.py +981 -0
  13. lsst/pipe/base/quantum_graph/aggregator/_config.py +139 -0
  14. lsst/pipe/base/quantum_graph/aggregator/_ingester.py +312 -0
  15. lsst/pipe/base/quantum_graph/aggregator/_progress.py +208 -0
  16. lsst/pipe/base/quantum_graph/aggregator/_scanner.py +371 -0
  17. lsst/pipe/base/quantum_graph/aggregator/_structs.py +167 -0
  18. lsst/pipe/base/quantum_graph/aggregator/_supervisor.py +225 -0
  19. lsst/pipe/base/quantum_graph/aggregator/_writer.py +593 -0
  20. lsst/pipe/base/resource_usage.py +183 -0
  21. lsst/pipe/base/simple_pipeline_executor.py +4 -1
  22. lsst/pipe/base/tests/util.py +31 -0
  23. lsst/pipe/base/version.py +1 -1
  24. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/METADATA +1 -1
  25. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/RECORD +33 -22
  26. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/WHEEL +0 -0
  27. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/entry_points.txt +0 -0
  28. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/COPYRIGHT +0 -0
  29. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/LICENSE +0 -0
  30. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/bsd_license.txt +0 -0
  31. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/licenses/gpl-v3.0.txt +0 -0
  32. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/top_level.txt +0 -0
  33. {lsst_pipe_base-29.2025.4100.dist-info → lsst_pipe_base-29.2025.4300.dist-info}/zip-safe +0 -0
lsst/pipe/base/_status.py CHANGED
@@ -166,7 +166,7 @@ class QuantumSuccessCaveats(enum.Flag):
         """
         return {
             "+": "at least one predicted output was missing, but not all were",
-            "*": "all predicated outputs were missing (besides logs and metadata)",
+            "*": "all predicted outputs were missing (besides logs and metadata)",
             "A": "adjustQuantum raised NoWorkFound; a regenerated QG would not include this quantum",
             "D": "algorithm considers data too bad to be processable",
             "U": "one or more input dataset was incomplete due to an upstream failure",
lsst/pipe/base/cli/cmd/__init__.py CHANGED
@@ -25,6 +25,6 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.

-__all__ = ["register_instrument", "transfer_from_graph", "zip_from_graph", "retrieve_artifacts_for_quanta"]
+__all__ = ["register_instrument", "transfer_from_graph", "zip_from_graph", "retrieve_artifacts_for_quanta", "aggregate_graph"]

-from .commands import register_instrument, retrieve_artifacts_for_quanta, transfer_from_graph, zip_from_graph
+from .commands import (register_instrument, retrieve_artifacts_for_quanta, transfer_from_graph, zip_from_graph, aggregate_graph)
lsst/pipe/base/cli/cmd/commands.py CHANGED
@@ -40,6 +40,7 @@ from lsst.daf.butler.cli.opt import (
 from lsst.daf.butler.cli.utils import ButlerCommand, split_commas, unwrap

 from ... import script
+from ...quantum_graph import aggregator
 from ..opt import instrument_argument, update_output_chain_option


@@ -140,7 +141,7 @@ def zip_from_graph(**kwargs: Any) -> None:
     "--include-outputs/--no-include-outputs",
     is_flag=True,
     default=True,
-    help="Whether to include outut datasets in retrieval.",
+    help="Whether to include output datasets in retrieval.",
 )
 @options_file_option()
 def retrieve_artifacts_for_quanta(**kwargs: Any) -> None:
@@ -153,3 +154,117 @@ def retrieve_artifacts_for_quanta(**kwargs: Any) -> None:
     """
     artifacts = script.retrieve_artifacts_for_quanta(**kwargs)
     print(f"Written {len(artifacts)} artifacts to {kwargs['dest']}.")
+
+
+_AGGREGATOR_DEFAULTS = aggregator.AggregatorConfig()
+
+
+@click.command(short_help="Scan for the outputs of an active or completed quantum graph.", cls=ButlerCommand)
+@click.argument("predicted_graph", required=True)
+@repo_argument(required=True, help="Path to the central butler repository.")
+@click.option(
+    "-o",
+    "--output",
+    "output_path",
+    default=_AGGREGATOR_DEFAULTS.output_path,
+    help=(
+        "Path to the output provenance quantum graph. THIS OPTION IS FOR "
+        "DEVELOPMENT AND DEBUGGING ONLY. IT MAY BE REMOVED IN THE FUTURE."
+    ),
+)
+@click.option(
+    "--processes",
+    "-j",
+    "n_processes",
+    default=_AGGREGATOR_DEFAULTS.n_processes,
+    type=click.IntRange(min=1),
+    help="Number of processes to use.",
+)
+@click.option(
+    "--complete/--incomplete",
+    "assume_complete",
+    default=_AGGREGATOR_DEFAULTS.assume_complete,
+    help="Whether execution has completed (and failures cannot be retried).",
+)
+@click.option(
+    "--dry-run",
+    is_flag=True,
+    default=_AGGREGATOR_DEFAULTS.dry_run,
+    help="Do not actually perform any central database ingests.",
+)
+@click.option(
+    "--interactive-status/--no-interactive-status",
+    "interactive_status",
+    default=_AGGREGATOR_DEFAULTS.interactive_status,
+    help="Use progress bars for status reporting instead of periodic logging.",
+)
+@click.option(
+    "--log-status-interval",
+    type=int,
+    default=_AGGREGATOR_DEFAULTS.log_status_interval,
+    help="Interval (in seconds) between periodic logger status updates.",
+)
+@click.option(
+    "--register-dataset-types/--no-register-dataset-types",
+    default=_AGGREGATOR_DEFAULTS.register_dataset_types,
+    help="Register output dataset types.",
+)
+@click.option(
+    "--update-output-chain/--no-update-output-chain",
+    default=_AGGREGATOR_DEFAULTS.update_output_chain,
+    help="Prepend the output RUN collection to the output CHAINED collection.",
+)
+@click.option(
+    "--worker-log-dir",
+    type=str,
+    default=_AGGREGATOR_DEFAULTS.worker_log_dir,
+    help="Path to a directory (POSIX only) for parallel worker logs.",
+)
+@click.option(
+    "--worker-log-level",
+    type=str,
+    default=_AGGREGATOR_DEFAULTS.worker_log_level,
+    help="Log level for worker processes/threads (use DEBUG for per-quantum messages).",
+)
+@click.option(
+    "--zstd-level",
+    type=int,
+    default=_AGGREGATOR_DEFAULTS.zstd_level,
+    help="Compression level for the provenance quantum graph file.",
+)
+@click.option(
+    "--zstd-dict-size",
+    type=int,
+    default=_AGGREGATOR_DEFAULTS.zstd_dict_size,
+    help="Size (in bytes) of the ZStandard compression dictionary.",
+)
+@click.option(
+    "--zstd-dict-n-inputs",
+    type=int,
+    default=_AGGREGATOR_DEFAULTS.zstd_dict_n_inputs,
+    help=("Number of samples of each type to include in ZStandard compression dictionary training."),
+)
+@click.option(
+    "--mock-storage-classes/--no-mock-storage-classes",
+    default=_AGGREGATOR_DEFAULTS.mock_storage_classes,
+    help="Enable support for storage classes created by the lsst.pipe.base.tests.mocks package.",
+)
+def aggregate_graph(predicted_graph: str, repo: str, **kwargs: Any) -> None:
+    """Scan for quantum graph's outputs to gather provenance, ingest datasets
+    into the central butler repository, and delete datasets that are no
+    longer needed.
+    """
+    # It'd be nice to allow to the user to provide a path to an
+    # AggregatorConfig JSON file for options that weren't provided, but Click
+    # 8.1 fundamentally cannot handle flag options that default to None rather
+    # than True or False (i.e. so they fall back to the config value when not
+    # set). It's not clear whether Click 8.2.x has actually fixed this; Click
+    # 8.2.0 tried but caused new problems.
+
+    config = aggregator.AggregatorConfig(**kwargs)
+    try:
+        aggregator.aggregate_graph(predicted_graph, repo, config)
+    except aggregator.FatalWorkerError as err:
+        # When this exception is raised, we'll have already logged the relevant
+        # traceback from a separate worker.
+        raise click.ClickException(str(err)) from None
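The new aggregate_graph command is a thin wrapper: it collects the CLI options into an AggregatorConfig and hands it to aggregator.aggregate_graph. A minimal sketch of driving the same machinery from Python, with placeholder paths and only a few of the config fields shown above:

    from lsst.pipe.base.quantum_graph import aggregator

    # Field names follow the CLI defaults above; the values here are placeholders.
    config = aggregator.AggregatorConfig(
        n_processes=4,
        assume_complete=True,
        dry_run=True,  # scan and report without ingesting into the central repo
        register_dataset_types=True,
    )
    try:
        aggregator.aggregate_graph("predicted.qg", "/path/to/repo", config)
    except aggregator.FatalWorkerError:
        # The failing worker has already logged its traceback.
        raise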
lsst/pipe/base/graph_walker.py CHANGED
@@ -81,10 +81,12 @@ class GraphWalker(Generic[_T]):
         Parameters
         ----------
         key : unspecified
-            NetworkX key of the node to mark finished.
+            NetworkX key of the node to mark finished. Does not need to have
+            been returned by the iterator yet.
         """
-        self._active.remove(key)
         self._incomplete.remove(key)
+        self._active.discard(key)
+        self._ready.discard(key)
         successors = list(self._xgraph.successors(key))
         for successor in successors:
             assert successor not in self._active, (
@@ -102,7 +104,8 @@
         Parameters
         ----------
         key : unspecified
-            NetworkX key of the node to mark as a failure.
+            NetworkX key of the node to mark as a failure. Does not need to
+            have been returned by the iterator yet.

         Returns
         -------
@@ -110,8 +113,9 @@
             NetworkX keys of nodes that were recursive descendants of the
             failed node, and will hence never be yielded by the iterator.
         """
-        self._active.remove(key)
         self._incomplete.remove(key)
+        self._active.discard(key)
+        self._ready.discard(key)
         descendants = list(networkx.dag.descendants(self._xgraph, key))
         self._xgraph.remove_node(key)
         self._xgraph.remove_nodes_from(descendants)
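Both methods now tolerate keys the walker has not yet yielded: the node is dropped from the ready/active bookkeeping with set.discard, which is a no-op for absent members, instead of set.remove, which raises KeyError. A stand-alone illustration of that difference using plain Python sets (not the GraphWalker internals themselves):

    active = {"quantum-a"}   # nodes already yielded by the iterator
    ready = {"quantum-b"}    # nodes eligible to be yielded next

    key = "quantum-b"        # marked finished before it was ever yielded
    ready.discard(key)       # harmless whether or not the key is present
    active.discard(key)      # previously .remove(key), which would raise KeyError here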
lsst/pipe/base/pipeline_graph/_pipeline_graph.py CHANGED
@@ -1697,7 +1697,15 @@ class PipelineGraph:
             PACKAGES_INIT_OUTPUT_NAME, self._universe.empty, PACKAGES_INIT_OUTPUT_STORAGE_CLASS
         )

-    def register_dataset_types(self, butler: Butler, include_packages: bool = True) -> None:
+    def register_dataset_types(
+        self,
+        butler: Butler,
+        include_packages: bool = True,
+        *,
+        include_inputs: bool = True,
+        include_configs: bool = True,
+        include_logs: bool = True,
+    ) -> None:
         """Register all dataset types in a data repository.

         Parameters
@@ -1709,11 +1717,28 @@
             software versions (this is not associated with a task and hence is
             not considered part of the pipeline graph in other respects, but it
             does get written with other provenance datasets).
-        """
-        dataset_types = [node.dataset_type for node in self.dataset_types.values()]
+        include_inputs : `bool`, optional
+            Whether to register overall-input dataset types as well as outputs.
+        include_configs : `bool`, optional
+            Whether to register task config dataset types.
+        include_logs : `bool`, optional
+            Whether to register task log dataset types.
+        """
+        dataset_types = {
+            node.name: node.dataset_type
+            for node in self.dataset_types.values()
+            if include_inputs or self.producer_of(node.name) is not None
+        }
         if include_packages:
-            dataset_types.append(self.packages_dataset_type)
-        for dataset_type in dataset_types:
+            dataset_types[self.packages_dataset_type.name] = self.packages_dataset_type
+        if not include_configs:
+            for task_node in self.tasks.values():
+                del dataset_types[task_node.init.config_output.dataset_type_name]
+        if not include_logs:
+            for task_node in self.tasks.values():
+                if task_node.log_output is not None:
+                    del dataset_types[task_node.log_output.dataset_type_name]
+        for dataset_type in dataset_types.values():
             butler.registry.registerDatasetType(dataset_type)

     def check_dataset_type_registrations(self, butler: Butler, include_packages: bool = True) -> None:
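With the new keyword-only flags, callers can restrict registration to the dataset types they actually need. A sketch of the call, assuming an existing PipelineGraph instance and a writeable butler (the repository path is a placeholder):

    from lsst.daf.butler import Butler

    butler = Butler("/path/to/repo", writeable=True)  # placeholder repo path
    pipeline_graph.register_dataset_types(
        butler,
        include_packages=True,
        include_inputs=False,    # leave overall-input dataset types alone
        include_configs=False,   # skip per-task config dataset types
        include_logs=True,
    )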
lsst/pipe/base/quantum_graph/__init__.py CHANGED
@@ -30,3 +30,4 @@ from __future__ import annotations
 from ._common import *
 from ._multiblock import *
 from ._predicted import *
+from ._provenance import *
lsst/pipe/base/quantum_graph/_common.py CHANGED
@@ -82,6 +82,7 @@ if TYPE_CHECKING:
 TaskLabel: TypeAlias = str
 DatasetTypeName: TypeAlias = str
 ConnectionName: TypeAlias = str
+DatasetIndex: TypeAlias = int
 QuantumIndex: TypeAlias = int
 DatastoreName: TypeAlias = str
 DimensionElementName: TypeAlias = str
@@ -326,7 +327,7 @@ class BaseQuantumGraph(ABC):
     ----------
     header : `HeaderModel`
         Structured metadata for the graph.
-    pipeline_graph : `..pipeline_graph.PipelineGraph`
+    pipeline_graph : `.pipeline_graph.PipelineGraph`
         Graph of tasks and dataset types. May contain a superset of the tasks
         and dataset types that actually have quanta and datasets in the quantum
         graph.
lsst/pipe/base/quantum_graph/_multiblock.py CHANGED
@@ -41,6 +41,7 @@ __all__ = (

 import dataclasses
 import logging
+import tempfile
 import uuid
 from collections.abc import Iterator
 from contextlib import contextmanager
@@ -501,13 +502,13 @@ class AddressReader:
         self.pages.clear()
         return self.rows

-    def find(self, key: uuid.UUID) -> AddressRow:
-        """Read the row for the given UUID.
+    def find(self, key: uuid.UUID | int) -> AddressRow:
+        """Read the row for the given UUID or integer index.

         Parameters
         ----------
-        key : `uuid.UUID`
-            UUID to find.
+        key : `uuid.UUID` or `int`
+            UUID or integer index to find.

         Returns
         -------
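AddressReader.find now dispatches on the key type, so a row can be looked up either by UUID or by its integer index. A short sketch, assuming reader is an open AddressReader and the UUID value is a placeholder:

    import uuid

    # Both forms return an AddressRow, or raise LookupError if the key is absent.
    row_by_uuid = reader.find(uuid.UUID("12345678-1234-5678-1234-567812345678"))
    row_by_index = reader.find(7)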
@@ -517,6 +518,8 @@
         match key:
             case uuid.UUID():
                 return self._find_uuid(key)
+            case int():
+                return self._find_index(key)
             case _:
                 raise TypeError(f"Invalid argument: {key}.")

@@ -546,6 +549,22 @@
         # Ran out of pages to search.
         raise LookupError(f"Address for {target} not found.")

+    def _find_index(self, target: int) -> AddressRow:
+        # First shortcut if we've already loaded this row.
+        if (row := self.rows_by_index.get(target)) is not None:
+            return row
+        if target < 0 or target >= self.n_rows:
+            raise LookupError(f"Address for index {target} not found.")
+        # Since all indexes should be present, we can predict the right page
+        # exactly.
+        page_index = target // self.rows_per_page
+        self._read_page(page_index)
+        try:
+            return self.rows_by_index[target]
+        except KeyError:
+            _LOG.debug("Index find failed: %s should have been in page %s.", target, page_index)
+            raise LookupError(f"Address for {target} not found.") from None
+
     def _read_page(self, page_index: int, page_stream: BytesIO | None = None) -> bool:
         page = self.pages[page_index]
         if page.read:
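Because every index from 0 to n_rows - 1 is present in the address file, _find_index can compute the page that holds a row directly instead of scanning. A tiny worked example of that arithmetic, with a made-up page size:

    rows_per_page = 128   # hypothetical page size
    target = 300          # row index to look up
    page_index = target // rows_per_page
    print(page_index)     # 2: row 300 lives on the third page (rows 256-383)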
@@ -594,7 +613,9 @@

     @classmethod
     @contextmanager
-    def open_in_zip(cls, zf: zipfile.ZipFile, name: str, int_size: int) -> Iterator[MultiblockWriter]:
+    def open_in_zip(
+        cls, zf: zipfile.ZipFile, name: str, int_size: int, use_tempfile: bool = False
+    ) -> Iterator[MultiblockWriter]:
         """Open a writer for a file in a zip archive.

         Parameters
@@ -605,14 +626,26 @@
             Base name for the multi-block file; an extension will be added.
         int_size : `int`
             Number of bytes to use for all integers.
+        use_tempfile : `bool`, optional
+            If `True`, send writes to a temporary file and only add the file to
+            the zip archive when the context manager closes. This involves
+            more overall I/O, but it permits multiple multi-block files to be
+            open for writing in the same zip archive at once.

         Returns
         -------
         writer : `contextlib.AbstractContextManager` [ `MultiblockWriter` ]
             Context manager that returns a writer when entered.
         """
-        with zf.open(f"{name}.mb", mode="w", force_zip64=True) as stream:
-            yield MultiblockWriter(stream, int_size)
+        filename = f"{name}.mb"
+        if use_tempfile:
+            with tempfile.NamedTemporaryFile(suffix=filename) as tmp:
+                yield MultiblockWriter(tmp, int_size)
+                tmp.flush()
+                zf.write(tmp.name, filename)
+        else:
+            with zf.open(f"{name}.mb", mode="w", force_zip64=True) as stream:
+                yield MultiblockWriter(stream, int_size)

     def write_bytes(self, id: uuid.UUID, data: bytes) -> Address:
         """Write raw bytes to the multi-block file.
@@ -629,6 +662,7 @@
         address : `Address`
             Address of the bytes just written.
         """
+        assert id not in self.addresses, "Duplicate write to multi-block file detected."
         self.stream.write(len(data).to_bytes(self.int_size))
         self.stream.write(data)
         block_size = len(data) + self.int_size
lsst/pipe/base/quantum_graph/_predicted.py CHANGED
@@ -347,8 +347,21 @@ class PredictedQuantumDatasetsModel(pydantic.BaseModel):
    the data repository.
    """

-    def iter_dataset_ids(self) -> Iterator[uuid.UUID]:
-        """Return an iterator over the UUIDs of all datasets referenced by this
+    def iter_input_dataset_ids(self) -> Iterator[uuid.UUID]:
+        """Return an iterator over the UUIDs of all datasets consumed by this
+        quantum.
+
+        Returns
+        -------
+        iter : `~collections.abc.Iterator` [ `uuid.UUID` ]
+            Iterator over dataset IDs.
+        """
+        for datasets in self.inputs.values():
+            for dataset in datasets:
+                yield dataset.dataset_id
+
+    def iter_output_dataset_ids(self) -> Iterator[uuid.UUID]:
+        """Return an iterator over the UUIDs of all datasets produced by this
         quantum.

         Returns
@@ -356,10 +369,22 @@
         iter : `~collections.abc.Iterator` [ `uuid.UUID` ]
             Iterator over dataset IDs.
         """
-        for datasets in itertools.chain(self.inputs.values(), self.outputs.values()):
+        for datasets in self.outputs.values():
             for dataset in datasets:
                 yield dataset.dataset_id

+    def iter_dataset_ids(self) -> Iterator[uuid.UUID]:
+        """Return an iterator over the UUIDs of all datasets referenced by this
+        quantum.
+
+        Returns
+        -------
+        iter : `~collections.abc.Iterator` [ `uuid.UUID` ]
+            Iterator over dataset IDs.
+        """
+        yield from self.iter_input_dataset_ids()
+        yield from self.iter_output_dataset_ids()
+
     def deserialize_datastore_records(self) -> dict[DatastoreName, DatastoreRecordData]:
         """Deserialize the mapping of datastore records."""
         return {
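Splitting the iterator lets callers treat inputs and outputs separately (for example, when only outputs need to be scanned), while iter_dataset_ids remains the union of the two. A small sketch, assuming quantum is a PredictedQuantumDatasetsModel instance:

    input_ids = set(quantum.iter_input_dataset_ids())
    output_ids = set(quantum.iter_output_dataset_ids())

    # iter_dataset_ids() now simply chains the two iterators.
    assert set(quantum.iter_dataset_ids()) == input_ids | output_ids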
@@ -774,7 +799,7 @@
         Approximate number of bytes to read at once from address files.
         Note that this does not set a page size for *all* reads, but it
         does affect the smallest, most numerous reads.
-    import_mode : `..pipeline_graph.TaskImportMode`, optional
+    import_mode : `.pipeline_graph.TaskImportMode`, optional
         How to handle importing the task classes referenced in the pipeline
         graph.

@@ -1498,6 +1523,38 @@
    This does include special "init" quanta.
    """

+    def make_dataset_ref(self, predicted: PredictedDatasetModel) -> DatasetRef:
+        """Make a `lsst.daf.butler.DatasetRef` from information in the
+        predicted quantum graph.
+
+        Parameters
+        ----------
+        predicted : `PredictedDatasetModel`
+            Model for the dataset in the predicted graph.
+
+        Returns
+        -------
+        ref : `lsst.daf.butler.DatasetRef`
+            A dataset reference. Data ID will be expanded if and only if
+            the dimension data has been loaded.
+        """
+        try:
+            dataset_type = self.pipeline_graph.dataset_types[predicted.dataset_type_name].dataset_type
+        except KeyError:
+            if predicted.dataset_type_name == acc.PACKAGES_INIT_OUTPUT_NAME:
+                dataset_type = self.pipeline_graph.packages_dataset_type
+            else:
+                raise
+        data_id = DataCoordinate.from_full_values(dataset_type.dimensions, tuple(predicted.data_coordinate))
+        if self.dimension_data is not None:
+            (data_id,) = self.dimension_data.attach(dataset_type.dimensions, [data_id])
+        return DatasetRef(
+            dataset_type,
+            data_id,
+            run=predicted.run,
+            id=predicted.dataset_id,
+        )
+
     def set_quantum_indices(self) -> None:
         """Populate the `quantum_indices` component by sorting the UUIDs in the
         `init_quanta` and `quantum_datasets` components (which must both be
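make_dataset_ref gives downstream code a single way to turn a serialized dataset model back into a butler DatasetRef. A sketch of the call, assuming components is a PredictedQuantumGraphComponents and predicted is one of its PredictedDatasetModel entries:

    ref = components.make_dataset_ref(predicted)

    # The ref carries the run and dataset ID recorded in the predicted graph;
    # its data ID is expanded only if components.dimension_data has been loaded.
    print(ref.run, ref.id)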
@@ -1813,7 +1870,7 @@
         Approximate number of bytes to read at once from address files.
         Note that this does not set a page size for *all* reads, but it
         does affect the smallest, most numerous reads.
-    import_mode : `..pipeline_graph.TaskImportMode`, optional
+    import_mode : `.pipeline_graph.TaskImportMode`, optional
         How to handle importing the task classes referenced in the pipeline
         graph.