PyPI - umami-preprocessing - Versions diffs - 0.2.3__tar.gz → 0.2.5__tar.gz - Mend

umami-preprocessing 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

{umami_preprocessing-0.2.3 → umami_preprocessing-0.2.5}/PKG-INFO RENAMED Viewed

@@ -1,25 +1,25 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: umami-preprocessing
-Version: 0.2.3
+Version: 0.2.5
 Summary: Preprocessing for jet tagging
 License: MIT
 Project-URL: Homepage, https://github.com/umami-hep/umami-preprocessing
 Requires-Python: <3.12,>=3.8
 Description-Content-Type: text/markdown
+Requires-Dist: atlas-ftag-tools==0.2.14
+Requires-Dist: dotmap==1.3.30
+Requires-Dist: puma-hep==0.4.9
 Requires-Dist: pyyaml-include==1.3
-Requires-Dist: PyYAML==6.0.1
+Requires-Dist: PyYAML>=6.0.1
 Requires-Dist: rich==12.6.0
-Requires-Dist: scipy==1.10.1
-Requires-Dist: puma-hep==0.4.2
-Requires-Dist: atlas-ftag-tools==0.2.8
-Requires-Dist: dotmap==1.3.30
+Requires-Dist: scipy>=1.15.2
 Provides-Extra: dev
-Requires-Dist: ruff==0.1.6; extra == "dev"
-Requires-Dist: mypy==1.5.1; extra == "dev"
+Requires-Dist: mypy==1.11.2; extra == "dev"
 Requires-Dist: pre-commit==3.5.0; extra == "dev"
-Requires-Dist: pytest>=7.0.1; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
 Requires-Dist: pytest-mock==3.11.1; extra == "dev"
-Requires-Dist: pytest-cov>=3.0.0; extra == "dev"
+Requires-Dist: pytest>=7.2.2; extra == "dev"
+Requires-Dist: ruff==0.6.2; extra == "dev"
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![codecov](https://codecov.io/gh/umami-hep/umami-preprocessing/graph/badge.svg?token=K8MJI20UZO)](https://codecov.io/gh/umami-hep/umami-preprocessing)

{umami_preprocessing-0.2.3 → umami_preprocessing-0.2.5}/pyproject.toml RENAMED Viewed

@@ -7,23 +7,23 @@ readme = "README.md"
 requires-python = "<3.12,>=3.8"
 dependencies = [
-    "pyyaml-include==1.3",
-    "PyYAML==6.0.1",
-    "rich==12.6.0",
-    "scipy==1.10.1",
-    "puma-hep==0.4.2",
-    "atlas-ftag-tools==0.2.8",
-    "dotmap==1.3.30"
+  "atlas-ftag-tools==0.2.14",
+  "dotmap==1.3.30",
+  "puma-hep==0.4.9",
+  "pyyaml-include==1.3",
+  "PyYAML>=6.0.1",
+  "rich==12.6.0",
+  "scipy>=1.15.2",
 ]
 [project.optional-dependencies]
 dev = [
-  "ruff==0.1.6",
-  "mypy==1.5.1",
+  "mypy==1.11.2",
   "pre-commit==3.5.0",
-  "pytest>=7.0.1",
+  "pytest-cov>=4.0.0",
   "pytest-mock==3.11.1",
-  "pytest-cov>=3.0.0",
+  "pytest>=7.2.2",
+  "ruff==0.6.2",
 ]
 [project.urls]

{umami_preprocessing-0.2.3 → umami_preprocessing-0.2.5}/umami_preprocessing.egg-info/PKG-INFO RENAMED Viewed

@@ -1,25 +1,25 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: umami-preprocessing
-Version: 0.2.3
+Version: 0.2.5
 Summary: Preprocessing for jet tagging
 License: MIT
 Project-URL: Homepage, https://github.com/umami-hep/umami-preprocessing
 Requires-Python: <3.12,>=3.8
 Description-Content-Type: text/markdown
+Requires-Dist: atlas-ftag-tools==0.2.14
+Requires-Dist: dotmap==1.3.30
+Requires-Dist: puma-hep==0.4.9
 Requires-Dist: pyyaml-include==1.3
-Requires-Dist: PyYAML==6.0.1
+Requires-Dist: PyYAML>=6.0.1
 Requires-Dist: rich==12.6.0
-Requires-Dist: scipy==1.10.1
-Requires-Dist: puma-hep==0.4.2
-Requires-Dist: atlas-ftag-tools==0.2.8
-Requires-Dist: dotmap==1.3.30
+Requires-Dist: scipy>=1.15.2
 Provides-Extra: dev
-Requires-Dist: ruff==0.1.6; extra == "dev"
-Requires-Dist: mypy==1.5.1; extra == "dev"
+Requires-Dist: mypy==1.11.2; extra == "dev"
 Requires-Dist: pre-commit==3.5.0; extra == "dev"
-Requires-Dist: pytest>=7.0.1; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
 Requires-Dist: pytest-mock==3.11.1; extra == "dev"
-Requires-Dist: pytest-cov>=3.0.0; extra == "dev"
+Requires-Dist: pytest>=7.2.2; extra == "dev"
+Requires-Dist: ruff==0.6.2; extra == "dev"
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![codecov](https://codecov.io/gh/umami-hep/umami-preprocessing/graph/badge.svg?token=K8MJI20UZO)](https://codecov.io/gh/umami-hep/umami-preprocessing)

umami_preprocessing-0.2.5/umami_preprocessing.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,15 @@
+atlas-ftag-tools==0.2.14
+dotmap==1.3.30
+puma-hep==0.4.9
+pyyaml-include==1.3
+PyYAML>=6.0.1
+rich==12.6.0
+scipy>=1.15.2
+[dev]
+mypy==1.11.2
+pre-commit==3.5.0
+pytest-cov>=4.0.0
+pytest-mock==3.11.1
+pytest>=7.2.2
+ruff==0.6.2

{umami_preprocessing-0.2.3 → umami_preprocessing-0.2.5}/upp/__init__.py RENAMED Viewed

@@ -2,4 +2,4 @@
 from __future__ import annotations
-__version__ = "v0.2.3"
+__version__ = "v0.2.5"

{umami_preprocessing-0.2.3 → umami_preprocessing-0.2.5}/upp/classes/preprocessing_config.py RENAMED Viewed

@@ -53,42 +53,56 @@ class PreprocessingConfig:
     Parameters
     ----------
+    config_path : Path
+        Path to the config yaml file that is used. Does not need to be set in config.
+    split : Split
+        For which part the preprocessing is run. Either train, val or test. This needs
+        to be set as a command line argument when running the program. Does not need
+        to be set in config.
+    config : dict
+        Dict with the loaded config. Does not need to be set in config.
     base_dir : Path
         Base directory for all other paths.
-    ntuple_dir : Path
+    ntuple_dir : Path, optional
         Directory containing the input h5 ntuples. If a relative path is given, it is
-        interpreted as relative to base_dir.
-    components_dir : Path
+        interpreted as relative to base_dir. By default Path("ntuples")
+    components_dir : Path, optional
         Directory for intermediate component files. If a relative path is given, it is
-        interpreted as relative to base_dir.
-    out_dir : Path
+        interpreted as relative to base_dir. By default Path("components")
+    out_dir : Path, optional
         Directory for output files. If a relative path is given, it is interpreted as
-        relative to base_dir.
-    out_fname : Path
-        Filename stem for the output files.
-    batch_size : int
+        relative to base_dir. By default Path("output")
+    out_fname : Path, optional
+        Filename stem for the output files. By default Path("pp_output.h5")
+    batch_size : int, optional
         Batch size for the preprocessing. For each batch select
         `sampling_fraction*batch_size_after_cuts`. It is recommended to choose high batch sizes
         especially for the `countup` method, to achieve the best agreement of target and resampled
-        distributions.
-    num_jets_estimate : int
+        distributions. By default 100_000
+    num_jets_estimate : int, optional
         Any of the following num_jets_estimate_* arguments that are not specified will default
         to this value. Is equal to 1_000_000 by default.
-    num_jets_estimate_available : int | None
+    num_jets_estimate_available : int, optional
         A subsample taken from the whole sample to estimate the number of jets after the cuts.
         Please keep this number high in order to not get a Poisson error of more than 5%.
         If time allows you can use -1 to get a precise number of jets and not just an estimate
         although it will be slow for large datasets. Is equal to num_jets_estimate by default.
-    num_jets_estimate_hist : int
+    num_jets_estimate_hist : int, optional
         Number of jets of each flavour that are used to construct histograms for probability
         density function estimation. Larger numbers give a better quality estimate of the pdfs.
         Is equal to num_jets_estimate by default.
-    num_jets_estimate_norm : int
+    num_jets_estimate_norm : int, optional
         Number of jets of each flavour that are used to estimate shifting and scaling during
         normalisation step. Larger numbers give better quality estimates.
         Is equal to num_jets_estimate by default.
-    jets_name : str
-        Name of the jets dataset in the input file.
+    num_jets_estimate_plotting : int, optional
+        Number of jets of each flavour used for plotting the initial and the final resampling
+        variable distributions. Larger numbers give a better estimate of the full distributions.
+        Is equal to num_jets_estimate by default.
+    merge_test_samples : bool, optional
+        Merge the test samples of the different processes into one file. By default False.
+    jets_name : str, optional
+        Name of the jets dataset in the input file. By default "jets".
     """
     config_path: Path
@@ -104,9 +118,11 @@ class PreprocessingConfig:
     num_jets_estimate_available: int | None = None
     num_jets_estimate_hist: int | None = None
     num_jets_estimate_norm: int | None = None
+    num_jets_estimate_plotting: int | None = None
     merge_test_samples: bool = False
     jets_name: str = "jets"
     flavour_config: Path | None = None
+    num_jets_per_output_file: int | None = None
     def __post_init__(self):
         # postprocess paths
@@ -117,6 +133,8 @@ class PreprocessingConfig:
                 self.num_jets_estimate_hist = self.num_jets_estimate
             if self.num_jets_estimate_norm is None:
                 self.num_jets_estimate_norm = self.num_jets_estimate
+            if self.num_jets_estimate_plotting is None:
+                self.num_jets_estimate_plotting = self.num_jets_estimate
         for field in dataclasses.fields(self):
             if field.type == "Path" and field.name != "out_fname" and field.name != "base_dir":

umami_preprocessing-0.2.5/upp/logger.py ADDED Viewed

@@ -0,0 +1,76 @@
+from __future__ import annotations
+import logging
+import sys
+from functools import partial
+from rich.console import Console
+from rich.logging import RichHandler
+from rich.progress import (
+    BarColumn,
+    Progress,
+    TextColumn,
+    TimeElapsedColumn,
+    TimeRemainingColumn,
+)
+# Detect if the program is executed in an interactive terminal
+_IS_TTY = sys.stderr.isatty()
+# One console object is reused everywhere so that Rich keeps a consistent idea
+# of whether it may emit ANSI control codes / animations.
+_console = Console(
+    width=100,
+    force_terminal=_IS_TTY,
+    force_interactive=_IS_TTY,
+    no_color=not _IS_TTY,
+)
+# Template for the progress bar
+ProgressBar = partial(
+    Progress,
+    TextColumn("[task.description]{task.description}"),
+    BarColumn(),
+    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+    TextColumn("•"),
+    TimeRemainingColumn(),
+    TextColumn("•"),
+    TimeElapsedColumn(),
+    refresh_per_second=1 if _IS_TTY else 0.05,
+    speed_estimate_period=30 if _IS_TTY else 120,
+    console=_console,
+    disable=not _IS_TTY,
+    transient=_IS_TTY,
+)
+# Helper to set up the logger
+def setup_logger(level: str = "INFO"):
+    """Set up the logger.
+    Configure Rich logging so that colourful / interactive output is used when
+    the program is attached to a terminal and plain text is written when it is
+    executed under a batch system such as Slurm (where stdout / stderr are files).
+    """
+    FORMAT = "%(message)s"
+    # In a batch job we create a console that never emits colour codes.
+    console = None
+    if not _IS_TTY:
+        console = Console(
+            width=120,
+            force_terminal=False,
+            force_interactive=False,
+            no_color=True,
+        )
+    handler = RichHandler(
+        show_time=False,
+        show_path=False,
+        markup=True,
+        rich_tracebacks=True,
+        console=console,
+    )
+    logging.basicConfig(level=level, format=FORMAT, handlers=[handler])
+    return logging

{umami_preprocessing-0.2.3 → umami_preprocessing-0.2.5}/upp/main.py RENAMED Viewed

@@ -21,7 +21,7 @@ from upp.logger import setup_logger
 from upp.stages.hist import create_histograms
 from upp.stages.merging import Merging
 from upp.stages.normalisation import Normalisation
-from upp.stages.plot import plot_initial_resampling_dists, plot_resampled_dists
+from upp.stages.plot import plot_resampling_dists
 from upp.stages.resampling import Resampling
@@ -79,7 +79,7 @@ def run_pp(args) -> None:
     # run the resampling
     if args.resample:
         resampling = Resampling(config)
-        resampling.run(region=args.region)
+        resampling.run(region=args.region, component=args.component)
     # run the merging
     if args.merge:
@@ -95,8 +95,8 @@ def run_pp(args) -> None:
     if args.plot:
         title = " Plotting "
         log.info(f"[bold green]{title:-^100}")
-        plot_initial_resampling_dists(config=config)
-        plot_resampled_dists(config=config, stage=args.split)
+        plot_resampling_dists(config=config, stage="initial")
+        plot_resampling_dists(config=config, stage=args.split)
     # print end info
     end = datetime.now()

umami_preprocessing-0.2.5/upp/stages/merging.py ADDED Viewed

@@ -0,0 +1,308 @@
+from __future__ import annotations
+import json
+import logging as log
+from copy import copy
+from pathlib import Path
+from typing import TYPE_CHECKING
+import numpy as np
+from ftag.hdf5 import H5Writer, join_structured_arrays
+from upp.logger import ProgressBar
+from upp.utils import path_append
+if TYPE_CHECKING:  # pragma: no cover
+    from upp.classes.components import Component, Components
+    from upp.classes.preprocessing_config import PreprocessingConfig
+class Merging:
+    """Merging Class to merge different components/regions."""
+    def __init__(self, config: PreprocessingConfig):
+        """Init the Merging class instance.
+        Parameters
+        ----------
+        config : PreprocessingConfig
+            Loaded preprocessing config as a PreprocessingConfig instance
+        """
+        self.config = config
+        self.components = config.components
+        self.variables = config.variables
+        self.batch_size = config.batch_size
+        self.jets_name = config.jets_name
+        self.rng = np.random.default_rng(42)
+        self.flavours = self.components.flavours
+        self.num_jets_per_output_file = config.num_jets_per_output_file
+        self.file_tag = "split"
+    def add_jet_flavour_label(self, jets: np.ndarray, component: Component) -> np.ndarray:
+        """Add the jet flavour label to the jets.
+        If already present, jets will be returned without any changes.
+        Parameters
+        ----------
+        jets : np.ndarray
+            Structured array with the jets and their variables
+        component : Component
+            Component instance the jets belong to; its flavour determines the label
+        Returns
+        -------
+        np.ndarray
+            Structured array of the jets and their variables with the
+            "flavour_label" added.
+        """
+        if "flavour_label" in jets.dtype.names:
+            return jets
+        int_label = self.flavours.index(component.flavour)
+        label_array = np.full(len(jets), int_label, dtype=[("flavour_label", "i4")])
+        return join_structured_arrays([jets, label_array])
+    def _open_writer(
+        self,
+        sample: str | None,
+        jets_in_file: int,
+        file_idx: int,
+        components: Components,
+    ) -> None:
+        """Create `self.writer` for the next output file and attach all static attributes.
+        Parameters
+        ----------
+        sample
+            Sample name (``None`` when all components are merged together, i.e. when the
+            split is not test or ``merge_test_samples`` is set).
+        jets_in_file
+            Capacity of the new file (= leading dimension of every dataset).
+        file_idx
+            Running part index (0, 1, 2, …); used only for the filename suffix.
+        components
+            The `Components` object we are currently merging needed for `jet_counts`, etc.
+        """
+        # Construct the filename
+        fname = Path(self.config.out_fname)
+        if sample:
+            fname = path_append(fname, sample)
+        if self.num_jets_per_output_file is not None:
+            suffix = f"{self.file_tag}_{file_idx:03d}"
+            fname = fname.with_name(f"{fname.stem}_{suffix}{fname.suffix}")
+        # Adjust shapes to the capacity of this file
+        shapes = {name: (jets_in_file,) + shape[1:] for name, shape in self.base_shapes.items()}
+        # Instantiate an H5Writer
+        self.writer = H5Writer(
+            fname,
+            self.dtypes,
+            shapes,
+            add_flavour_label=self.jets_name,
+            jets_name=self.jets_name,
+            num_jets=jets_in_file,
+        )
+        # Copy the metadata attributes
+        self.writer.add_attr(
+            "flavour_label",
+            [f.name for f in self.flavours],
+            self.jets_name,
+        )
+        self.writer.add_attr("unique_jets", components.unique_jets)
+        self.writer.add_attr("jet_counts", json.dumps(components.jet_counts))
+        self.writer.add_attr("dsids", str(components.dsids))
+        self.writer.add_attr("config", json.dumps(self.config.config))
+        self.writer.add_attr("upp_hash", self.config.git_hash)
+        # Log for debugging
+        log.debug(f"Setup merge output at {self.writer.dst}")
+    def write_chunk(self, components: Components) -> int:
+        """Read one chunk, merge and write it to disk.
+        Read one batch from every active component, merge them and write
+        them to disk. If the batch does not fit into the current file it is
+        split across files transparently.
+        Returns
+        -------
+        int
+            The number of jets that were consumed from the components
+            (== written to disk).  When all components are exhausted the
+            function returns 0 so that the caller can stop its loop.
+        """
+        # Init a merged dict
+        merged: dict[str, np.ndarray] = {}
+        # Loop over components
+        for component in components:
+            try:
+                # shallow copy because we will add a field
+                batch = copy(next(component.stream))
+                batch[self.jets_name] = self.add_jet_flavour_label(
+                    jets=batch[self.jets_name], component=component
+                )
+            except StopIteration:
+                component.complete = True
+            if component.complete:
+                continue
+            # Merge this component's arrays into the running dict
+            for name, array in batch.items():
+                if name not in merged:
+                    merged[name] = array
+                else:
+                    merged[name] = np.concatenate([merged[name], array])
+        # Stop if there is nothing more to read
+        if all(c.complete for c in components):
+            return 0
+        # Apply track selections
+        for name in self.variables.variables:
+            if name == self.jets_name:
+                continue
+            if selector := self.variables.selectors.get(name):
+                merged[name] = selector(merged[name])
+        # Get the total length of jets from the batch and how much
+        # capacity is left in the file
+        merged_len = len(merged[self.jets_name])
+        capacity_left = self.writer.num_jets - self.writer.num_written
+        # Check if the capacity of the given file is already zero
+        if capacity_left == 0:
+            # close the filled file
+            self.writer.close()
+            # open the next one
+            self._file_idx += 1
+            remaining_total = self.total_jets - self.jets_written
+            # Quit writing when no jets are left to write
+            if remaining_total == 0:
+                return 0
+            next_file_size = (
+                min(self.num_jets_per_output_file, remaining_total)
+                if self.num_jets_per_output_file
+                else remaining_total
+            )
+            self._open_writer(
+                self._sample,
+                next_file_size,
+                self._file_idx,
+                self.current_components,
+            )
+            # Recompute free space in the freshly-opened file
+            capacity_left = self.writer.num_jets - self.writer.num_written
+        # Check if the whole batch fits into the file
+        if merged_len <= capacity_left:
+            # whole batch fits
+            self.writer.write(merged)
+        else:
+            # Write the *head* that still fits into the present file
+            head = {n: a[:capacity_left] for n, a in merged.items()}
+            self.writer.write(head)
+            self.writer.close()
+            # Open a fresh file sized for the remaining jets
+            self._file_idx += 1
+            remaining_total = self.total_jets - (self.jets_written + capacity_left)
+            next_file_size = (
+                min(self.num_jets_per_output_file, remaining_total)
+                if self.num_jets_per_output_file
+                else remaining_total
+            )
+            self._open_writer(self._sample, next_file_size, self._file_idx, self.current_components)
+            # Write the *tail* that goes into the new file
+            tail = {n: a[capacity_left:] for n, a in merged.items()}
+            self.writer.write(tail)
+        # Update the running count of written jets (the caller advances the progress bar)
+        self.jets_written += merged_len
+        return merged_len
+    def write_components(self, sample: str | None, components: Components) -> None:
+        """
+        Merge *components* into one or more HDF5 files.
+        If ``self.num_jets_per_output_file`` is ``None`` the behaviour is identical to the
+        original implementation (exactly one output file).  Otherwise the function
+        keeps opening new `H5Writer`s whenever the current file reaches that jet
+        limit.  All heavy work (splitting batches, rolling files) is handled in
+        ``self.write_chunk``.
+        """
+        # Prepare every Component's reader
+        for component in components:
+            batch_size = self.batch_size * component.num_jets // components.num_jets + 1
+            component.setup_reader(
+                batch_size,
+                fname=component.out_path,
+                jets_name=self.jets_name,
+            )
+            component.stream = component.reader.stream(
+                self.variables.combined(),
+                component.reader.num_jets,
+            )
+            component.complete = False
+        # Cache dtype / base shapes once (re-used for every new file)
+        self.dtypes = components[0].reader.dtypes(self.variables.combined())
+        self.base_shapes = components[0].reader.shapes(components.num_jets, self.variables.keys())
+        # Bookkeeping shared with write_chunk
+        self.total_jets = components.num_jets
+        self.jets_written = 0
+        self._file_idx = 0
+        self._sample = sample
+        self.current_components = components
+        # decide capacity of the first file
+        first_file_size = (
+            min(self.num_jets_per_output_file, self.total_jets)
+            if self.num_jets_per_output_file
+            else self.total_jets
+        )
+        # Open the first output file
+        self._open_writer(sample, first_file_size, self._file_idx, components)
+        # Main merge loop (progress bar unchanged)
+        with ProgressBar() as progress:
+            task = progress.add_task(
+                f"[green]Merging {components.num_jets:,} jets...",
+                total=components.num_jets,
+            )
+            while True:
+                n = self.write_chunk(components)
+                if not n:
+                    break
+                progress.update(task, advance=n)
+        # Close Writer
+        self.writer.close()
+        label = "merged" if sample is None else sample
+        log.info(f"[bold green]Finished merging {components.num_jets:,} {label} jets!")
+    def run(self):
+        """Run merging of the components."""
+        title = " Running Merging "
+        log.info(f"[bold green]{title:-^100}")
+        if not self.config.is_test or self.config.merge_test_samples:
+            components = [(None, self.components)]
+        else:
+            components = self.components.groupby_sample()
+        for sample, comps in components:
+            self.write_components(sample, comps)

{umami_preprocessing-0.2.3 → umami_preprocessing-0.2.5}/upp/stages/normalisation.py RENAMED Viewed

@@ -241,14 +241,21 @@ class Normalisation:
         title = " Computing Normalisations "
         log.info(f"[bold green]{title:-^100}")
+        # Get the correct output names if multiple output files were written
+        if self.config.num_jets_per_output_file:
+            fname = self.config.out_fname.parent / f"{self.config.out_fname.stem}*.h5"
+        else:
+            fname = self.config.out_fname
         # Setup reader
         reader = H5Reader(
-            self.config.out_fname,
+            fname,
             self.config.batch_size,
             precision="full",
             jets_name=self.jets_name,
         )
-        log.debug(f"Setup reader at: {self.config.out_fname}")
+        log.debug(f"Setup reader at: {fname}")
         norm_dict = None
         class_dict = None

umami-preprocessing 0.2.3__tar.gz → 0.2.5__tar.gz

umami-preprocessing 0.2.3__tar.gz → 0.2.5__tar.gz