PyPI - umami-preprocessing - Versions diffs - 0.0.6__tar.gz → 0.2.0__tar.gz - Mend

umami-preprocessing 0.0.6 (tar.gz) → 0.2.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,26 +1,25 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: umami-preprocessing
-Version: 0.0.6
+Version: 0.2.0
 Summary: Preprocessing for jet tagging
 License: MIT
 Project-URL: Homepage, https://github.com/umami-hep/umami-preprocessing
-Requires-Python: >=3.8
+Requires-Python: <3.12,>=3.8
 Description-Content-Type: text/markdown
 Requires-Dist: pyyaml-include==1.3
 Requires-Dist: PyYAML==6.0.1
 Requires-Dist: rich==12.6.0
 Requires-Dist: scipy==1.10.1
-Requires-Dist: puma-hep==0.3.0
-Requires-Dist: atlas-ftag-tools==0.1.10
+Requires-Dist: puma-hep==0.4.1
+Requires-Dist: atlas-ftag-tools==0.2.7
 Requires-Dist: dotmap==1.3.30
 Provides-Extra: dev
-Requires-Dist: black==23.9.1; extra == "dev"
-Requires-Dist: ruff==0.0.289; extra == "dev"
+Requires-Dist: ruff==0.1.6; extra == "dev"
 Requires-Dist: mypy==1.5.1; extra == "dev"
-Requires-Dist: pre-commit==3.1.1; extra == "dev"
-Requires-Dist: pytest==7.2.2; extra == "dev"
+Requires-Dist: pre-commit==3.5.0; extra == "dev"
+Requires-Dist: pytest>=7.0.1; extra == "dev"
 Requires-Dist: pytest-mock==3.11.1; extra == "dev"
-Requires-Dist: pytest-cov==4.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=3.0.0; extra == "dev"
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![codecov](https://codecov.io/gh/umami-hep/umami-preprocessing/graph/badge.svg?token=K8MJI20UZO)](https://codecov.io/gh/umami-hep/umami-preprocessing)

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/pyproject.toml RENAMED Viewed

@@ -4,27 +4,26 @@ description = "Preprocessing for jet tagging"
 dynamic = ["version"]
 license = {text = "MIT"}
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = "<3.12,>=3.8"
 dependencies = [
     "pyyaml-include==1.3",
     "PyYAML==6.0.1",
     "rich==12.6.0",
     "scipy==1.10.1",
-    "puma-hep==0.3.0",
-    "atlas-ftag-tools==0.1.10",
+    "puma-hep==0.4.1",
+    "atlas-ftag-tools==0.2.7",
     "dotmap==1.3.30"
 ]
 [project.optional-dependencies]
 dev = [
-  "black==23.9.1",
-  "ruff==0.0.289",
+  "ruff==0.1.6",
   "mypy==1.5.1",
-  "pre-commit==3.1.1",
-  "pytest==7.2.2",
+  "pre-commit==3.5.0",
+  "pytest>=7.0.1",
   "pytest-mock==3.11.1",
-  "pytest-cov==4.0.0",
+  "pytest-cov>=3.0.0",
 ]
 [project.urls]
@@ -44,24 +43,17 @@ version = {attr = "upp.__version__"}
 requires = ["setuptools>=62"]
 build-backend = "setuptools.build_meta"
-[tool.black]
-line-length = 100
-preview = "True"
 [tool.ruff]
-select = ["I", "E", "W", "F", "B", "UP", "ARG", "SIM", "TID", "RUF", "D2", "D3", "D4"]
-ignore = ["D211", "D213", "RUF005"]
+lint.select = ["I", "E", "W", "F", "B", "UP", "ARG", "SIM", "TID", "RUF", "D2", "D3", "D4"]
+lint.ignore = ["D211", "D213", "RUF005"]
 line-length = 100
-[tool.ruff.isort]
+[tool.ruff.lint.isort]
 required-imports = ["from __future__ import annotations"]
-[tool.ruff.pydocstyle]
+[tool.ruff.lint.pydocstyle]
 convention = "numpy"  # Accepts: "google", "numpy", or "pep257".
-[mypy]
-ignore_missing_imports = "True"
 [tool.pytest.ini_options]
 log_cli_level = "debug"
 filterwarnings = ["ignore::DeprecationWarning"]

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/umami_preprocessing.egg-info/PKG-INFO RENAMED Viewed

@@ -1,26 +1,25 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: umami-preprocessing
-Version: 0.0.6
+Version: 0.2.0
 Summary: Preprocessing for jet tagging
 License: MIT
 Project-URL: Homepage, https://github.com/umami-hep/umami-preprocessing
-Requires-Python: >=3.8
+Requires-Python: <3.12,>=3.8
 Description-Content-Type: text/markdown
 Requires-Dist: pyyaml-include==1.3
 Requires-Dist: PyYAML==6.0.1
 Requires-Dist: rich==12.6.0
 Requires-Dist: scipy==1.10.1
-Requires-Dist: puma-hep==0.3.0
-Requires-Dist: atlas-ftag-tools==0.1.10
+Requires-Dist: puma-hep==0.4.1
+Requires-Dist: atlas-ftag-tools==0.2.7
 Requires-Dist: dotmap==1.3.30
 Provides-Extra: dev
-Requires-Dist: black==23.9.1; extra == "dev"
-Requires-Dist: ruff==0.0.289; extra == "dev"
+Requires-Dist: ruff==0.1.6; extra == "dev"
 Requires-Dist: mypy==1.5.1; extra == "dev"
-Requires-Dist: pre-commit==3.1.1; extra == "dev"
-Requires-Dist: pytest==7.2.2; extra == "dev"
+Requires-Dist: pre-commit==3.5.0; extra == "dev"
+Requires-Dist: pytest>=7.0.1; extra == "dev"
 Requires-Dist: pytest-mock==3.11.1; extra == "dev"
-Requires-Dist: pytest-cov==4.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=3.0.0; extra == "dev"
 [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![codecov](https://codecov.io/gh/umami-hep/umami-preprocessing/graph/badge.svg?token=K8MJI20UZO)](https://codecov.io/gh/umami-hep/umami-preprocessing)

umami_preprocessing-0.2.0/umami_preprocessing.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,15 @@
+pyyaml-include==1.3
+PyYAML==6.0.1
+rich==12.6.0
+scipy==1.10.1
+puma-hep==0.4.1
+atlas-ftag-tools==0.2.7
+dotmap==1.3.30
+[dev]
+ruff==0.1.6
+mypy==1.5.1
+pre-commit==3.5.0
+pytest>=7.0.1
+pytest-mock==3.11.1
+pytest-cov>=3.0.0

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/upp/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 """UPP: Umami PreProcessing."""
 from __future__ import annotations
-__version__ = "v0.0.6"
+__version__ = "v0.2.0"

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/upp/classes/components.py RENAMED Viewed

@@ -5,7 +5,7 @@ from dataclasses import dataclass
 from pathlib import Path
 import numpy as np
-from ftag import Cuts, Flavour, Flavours, Sample
+from ftag import Cuts, Label, Sample
 from ftag.hdf5 import H5Reader, H5Writer
 from upp.classes.region import Region
@@ -16,26 +16,28 @@ from upp.stages.hist import Hist
 class Component:
     region: Region
     sample: Sample
-    flavour: Flavour
+    flavour: Label
     global_cuts: Cuts
     dirname: Path
     num_jets: int
-    num_jets_estimate: int
-    equal_jets: bool = True
+    num_jets_estimate_available: int
+    equal_jets: bool
     def __post_init__(self):
         self.hist = Hist(self.dirname.parent.parent / "hists" / f"hist_{self.name}.h5")
-    def setup_reader(self, batch_size, fname=None, **kwargs):
+    def setup_reader(self, batch_size, jets_name="jets", fname=None, **kwargs):
         if fname is None:
             fname = self.sample.path
-        self.reader = H5Reader(fname, batch_size, equal_jets=self.equal_jets, **kwargs)
+        self.reader = H5Reader(
+            fname, batch_size, jets_name=jets_name, equal_jets=self.equal_jets, **kwargs
+        )
         log.debug(f"Setup component reader at: {fname}")
-    def setup_writer(self, variables):
+    def setup_writer(self, variables, jets_name="jets"):
         dtypes = self.reader.dtypes(variables.combined())
         shapes = self.reader.shapes(self.num_jets, variables.keys())
-        self.writer = H5Writer(self.out_path, dtypes, shapes)
+        self.writer = H5Writer(self.out_path, dtypes, shapes, jets_name=jets_name)
         log.debug(f"Setup component writer at: {self.out_path}")
     @property
@@ -61,7 +63,10 @@ class Component:
         self, num_req, sampling_frac=None, cuts=None, silent=False, raise_error=True
     ):
         # Check if num_jets jets are available after the cuts and sampling fraction
-        total = self.reader.estimate_available_jets(cuts, self.num_jets_estimate)
+        num_est = (
+            None if self.num_jets_estimate_available <= 0 else self.num_jets_estimate_available
+        )
+        total = self.reader.estimate_available_jets(cuts, num_est)
         available = total
         if sampling_frac:
             available = int(total * sampling_frac)
@@ -77,11 +82,17 @@ class Component:
         if not silent:
             log.debug(f"Sampling fraction {sampling_frac}")
-            log.info(f"Estimated {available:,} {self} jets available - {num_req:,} requested")
+            log.info(
+                f"Estimated {available:,} {self} jets available - {num_req:,} requested"
+                f"({self.reader.num_jets:,} in {self.sample})"
+            )
     def get_auto_sampling_frac(self, num_jets, cuts=None, silent=False):
-        total = self.reader.estimate_available_jets(cuts, self.num_jets_estimate)
-        auto_sampling_frac = round(1.05 * num_jets / total, 3)  # 1.05 is a tolerance factor
+        num_est = (
+            None if self.num_jets_estimate_available <= 0 else self.num_jets_estimate_available
+        )
+        total = self.reader.estimate_available_jets(cuts, num_est)
+        auto_sampling_frac = round(1.1 * num_jets / total, 3)  # 1.1 is a tolerance factor
         if not silent:
             log.debug(f"optimal sampling fraction {auto_sampling_frac:.3f}")
         return auto_sampling_frac
@@ -102,6 +113,7 @@ class Components:
     def from_config(cls, pp_cfg):
         components = []
         for c in pp_cfg.config["components"]:
+            assert "equal_jets" not in c, "equal_jets flag should be set in the sample config"
             region_cuts = Cuts.empty() if pp_cfg.is_test else Cuts.from_list(c["region"]["cuts"])
             region = Region(c["region"]["name"], region_cuts + pp_cfg.global_cuts)
             pattern = c["sample"]["pattern"]
@@ -119,11 +131,11 @@ class Components:
                     Component(
                         region,
                         sample,
-                        Flavours[name],
+                        pp_cfg.flavour_cont[name],
                         pp_cfg.global_cuts,
                         pp_cfg.components_dir,
                         num_jets,
-                        pp_cfg.num_jets_estimate,
+                        pp_cfg.num_jets_estimate_available,
                         equal_jets,
                     )
                 )
@@ -193,7 +205,7 @@ class Components:
     @property
     def dsids(self):
-        return list(set(sum([c.sample.dsid for c in self], [])))
+        return list(set(sum([c.sample.dsid for c in self], [])))  # noqa: RUF017
     def groupby_region(self):
         return [(r, Components([c for c in self if c.region == r])) for r in self.regions]
@@ -207,7 +219,7 @@ class Components:
     def __getitem__(self, index):
         if isinstance(index, int):
             return self.components[index]
-        if isinstance(index, (str, Flavour)):
+        if isinstance(index, (str, Label)):
             return self.components[self.flavours.index(index)]
     def __len__(self):

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/upp/classes/preprocessing_config.py RENAMED Viewed

@@ -6,12 +6,14 @@ import logging as log
 from copy import copy
 from dataclasses import dataclass
 from pathlib import Path
-from subprocess import CalledProcessError, check_output
 from typing import Literal
 import yaml
 from dotmap import DotMap
 from ftag import Cuts
+from ftag.git_check import get_git_hash
+from ftag.labels import LabelContainer
+from ftag.track_selector import TrackSelector
 from ftag.transform import Transform
 from yamlinclude import YamlIncludeConstructor
@@ -42,6 +44,7 @@ class PreprocessingConfig:
     For example:
     ```yaml
     global:
+        jets_name: jets
         batch_size: 1_000_000
         num_jets_estimate: 5_000_000
         base_dir: /my/stuff/
@@ -69,8 +72,21 @@ class PreprocessingConfig:
         especially to the `countup` method to achieve best agreement of target and resampled
         distributions.
     num_jets_estimate : int
+        Any of the following three arguments that is not specified defaults to this value.
+        Is equal to 1_000_000 by default.
+    num_jets_estimate_available : int | None
+        A subsample taken from the whole sample to estimate the number of jets after the cuts.
+        Please keep this number high in order to avoid a Poisson error of more than 5%.
+        If time allows, you can use -1 to get a precise number of jets rather than an estimate,
+        although it will be slow for large datasets. Defaults to max(num_jets_estimate, 1_000_000).
+    num_jets_estimate_hist : int
         Number of jets of each flavour that are used to construct histograms for probability
         density function estimation. Larger numbers give a better quality estimate of the pdfs.
+        Is equal to num_jets_estimate by default.
+    num_jets_estimate_norm : int
+        Number of jets of each flavour that are used to estimate shifting and scaling during
+        the normalisation step. Larger numbers give better quality estimates.
+        Is equal to num_jets_estimate by default.
     jets_name : str
         Name of the jets dataset in the input file.
     """
@@ -85,24 +101,50 @@ class PreprocessingConfig:
     out_fname: Path = Path("pp_output.h5")
     batch_size: int = 100_000
     num_jets_estimate: int = 1_000_000
+    num_jets_estimate_available: int | None = None
+    num_jets_estimate_hist: int | None = None
+    num_jets_estimate_norm: int | None = None
     merge_test_samples: bool = False
     jets_name: str = "jets"
+    flavour_config: Path | None = None
     def __post_init__(self):
         # postprocess paths
+        if self.num_jets_estimate:
+            if self.num_jets_estimate_available is None:
+                self.num_jets_estimate_available = max(self.num_jets_estimate, int(1e6))
+            if self.num_jets_estimate_hist is None:
+                self.num_jets_estimate_hist = self.num_jets_estimate
+            if self.num_jets_estimate_norm is None:
+                self.num_jets_estimate_norm = self.num_jets_estimate
         for field in dataclasses.fields(self):
-            if field.type == "Path" and field.name != "out_fname":
+            if field.type == "Path" and field.name != "out_fname" and field.name != "base_dir":
                 setattr(self, field.name, self.get_path(Path(getattr(self, field.name))))
         if not self.ntuple_dir.exists():
             raise FileNotFoundError(f"Path {self.ntuple_dir} does not exist")
         self.components_dir = self.components_dir / self.split
         self.out_fname = self.out_dir / path_append(self.out_fname, self.split)
+        self.flavour_cont = LabelContainer.from_yaml(self.flavour_config)
         # configure classes
         sampl_cfg = copy(self.config["resampling"])
-        self.sampl_cfg = ResamplingConfig(sampl_cfg.pop("variables"), **sampl_cfg)
+        if self.is_test:
+            sampl_cfg["method"] = None
+        self.sampl_cfg = ResamplingConfig(**sampl_cfg)
         self.components = Components.from_config(self)
-        self.variables = VariableConfig(self.config["variables"], self.jets_name, self.is_test)
+        # get track selectors
+        vc = self.config["variables"]
+        selectors = {}
+        for name, groups in vc.items():
+            if selection := groups.get("selection", None):
+                selectors[name] = TrackSelector(Cuts.from_list(selection))
+        # configure variables
+        self.variables = VariableConfig(
+            self.config["variables"], self.jets_name, self.is_test, selectors
+        )
         self.variables = self.variables.add_jet_vars(
             list(self.config["resampling"]["variables"].keys()), "labels"
         )
@@ -110,17 +152,13 @@ class PreprocessingConfig:
             Transform(**self.config["transform"]) if "transform" in self.config else None
         )
-        # copy config
-        try:
-            git_hash = check_output(
-                ["git", "rev-parse", "--short", "HEAD"], cwd=Path(__file__).parent
-            )
-            self.git_hash = git_hash.decode("ascii").strip()
-            self.config["pp_git_hash"] = self.git_hash
-        except CalledProcessError:
-            log.warning("Could not get git hash")
+        # reproducibility
+        self.git_hash = get_git_hash(Path(__file__).parent)
+        if self.git_hash is None:
             self.git_hash = __version__
-            self.config["pp_git_hash"] = self.git_hash
+        self.config["upp_hash"] = self.git_hash
+        # copy config
         self.copy_config()
     @classmethod

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/upp/classes/variable_config.py RENAMED Viewed

@@ -3,12 +3,15 @@ from __future__ import annotations
 from copy import deepcopy
 from dataclasses import dataclass
+from ftag.track_selector import TrackSelector
 @dataclass(frozen=True)
 class VariableConfig:
     variables: dict[str, dict[str, list[str]]]
     jets_name: str = "jets"
     keep_all: bool = False
+    selectors: dict[str, TrackSelector] | None = None
     def __post_init__(self):
         for track_vars in self.tracks.values():
@@ -33,7 +36,7 @@ class VariableConfig:
     def add_jet_vars(self, variables: list[str], kind: str = "inputs") -> VariableConfig:
         """Return a new VariableConfig instance."""
-        vc = VariableConfig(deepcopy(self.variables), self.jets_name, self.keep_all)
+        vc = VariableConfig(deepcopy(self.variables), self.jets_name, self.keep_all, self.selectors)
         vc.jets[kind] = list(dict.fromkeys(vc.jets[kind] + variables))
         return vc

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/upp/main.py RENAMED Viewed

@@ -8,11 +8,13 @@ To run without certain stages, include the corresponding negative flag.
 Note that all stages are required to run the pipeline. If you want to disable resampling,
 you need to set method: none in your config file.
 """
 from __future__ import annotations
 import argparse
 from datetime import datetime
-from pathlib import Path
+from ftag.cli_utils import HelpFormatter, valid_path
 from upp.classes.preprocessing_config import PreprocessingConfig
 from upp.logger import setup_logger
@@ -23,30 +25,24 @@ from upp.stages.plot import plot_initial_resampling_dists, plot_resampled_dists
 from upp.stages.resampling import Resampling
-class HelpFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): ...
-def parse_args():
-    abool = "store_true"
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=HelpFormatter,
-    )
-    parser.add_argument("--config", required=True, type=Path, help="Path to config file")
-    parser.add_argument("--prep", action=abool, default=None, help="Estimate and write PDFs")
+def parse_args(args):
+    _st = "store_true"
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=HelpFormatter)
+    parser.add_argument("--config", required=True, type=valid_path, help="Path to config file")
+    parser.add_argument("--prep", action=_st, default=None, help="Estimate and write PDFs")
     parser.add_argument("--no-prep", dest="prep", action="store_false")
-    parser.add_argument("--resample", action=abool, default=None, help="Run resampling")
+    parser.add_argument("--resample", action=_st, default=None, help="Run resampling")
     parser.add_argument("--no-resample", dest="resample", action="store_false")
-    parser.add_argument("--merge", action=abool, default=None, help="Run merging")
+    parser.add_argument("--merge", action=_st, default=None, help="Run merging")
     parser.add_argument("--no-merge", dest="merge", action="store_false")
-    parser.add_argument("--norm", action=abool, default=None, help="Compute normalisations")
+    parser.add_argument("--norm", action=_st, default=None, help="Compute normalisations")
     parser.add_argument("--no-norm", dest="norm", action="store_false")
-    parser.add_argument("--plot", action=abool, default=None, help="Plot resampled distributions")
+    parser.add_argument("--plot", action=_st, default=None, help="Plot output distributions")
     parser.add_argument("--no-plot", dest="plot", action="store_false")
     splits = ["train", "val", "test", "all"]
     parser.add_argument("--split", default="train", choices=splits, help="Which file to produce")
-    args = parser.parse_args()
+    args = parser.parse_args(args)
     d = vars(args)
     ignore = ["config", "split"]
     if not any(v for a, v in d.items() if a not in ignore):
@@ -65,7 +61,7 @@ def run_pp(args) -> None:
     log.info(f"Start time: {start.strftime('%Y-%m-%d %H:%M:%S')}")
     # load config
-    config = PreprocessingConfig.from_file(Path(args.config), args.split)
+    config = PreprocessingConfig.from_file(args.config, args.split)
     # create virtual datasets and pdf files
     if args.prep and args.split == "train":
@@ -88,6 +84,8 @@ def run_pp(args) -> None:
     # make plots
     if args.plot:
+        title = " Plotting "
+        log.info(f"[bold green]{title:-^100}")
         plot_initial_resampling_dists(config=config)
         plot_resampled_dists(config=config, stage=args.split)
@@ -99,8 +97,8 @@ def run_pp(args) -> None:
     log.info(f"Elapsed time: {str(end - start).split('.')[0]}")
-def main() -> None:
-    args = parse_args()
+def main(args=None) -> None:
+    args = parse_args(args)
     log = setup_logger()
     if args.split == "all":

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/upp/stages/hist.py RENAMED Viewed

@@ -130,19 +130,23 @@ def create_histograms(config) -> None:
     title = " Writing PDFs "
     log.info(f"[bold green]{title:-^100}")
-    log.info(f"[bold green]Estimating PDFs using {config.num_jets_estimate:,} jets...")
+    log.info(f"[bold green]Estimating PDFs using {config.num_jets_estimate_hist:,} jets...")
     sampl_vars = config.sampl_cfg.vars
     for c in config.components:
-        log.info(f"Estimating PDF for {c}")
-        c.setup_reader(config.batch_size)
+        log.info(f"Estimating {c} PDF using {config.num_jets_estimate_hist:,} samples...")
+        c.setup_reader(config.batch_size, config.jets_name)
         cuts_no_split = c.cuts.ignore(["eventNumber"])
+        ###
+        # TODO: return the number of jets here and pass to the next function to get started
+        ###
         c.check_num_jets(
-            config.num_jets_estimate,
+            config.num_jets_estimate_hist,
             cuts=cuts_no_split,
             silent=False,
             raise_error=False,
         )
-        jets = c.get_jets(sampl_vars, config.num_jets_estimate, cuts_no_split)
+        jets = c.get_jets(sampl_vars, config.num_jets_estimate_hist, cuts_no_split)
         c.hist.write_hist(jets, sampl_vars, config.sampl_cfg.flat_bins)
     log.info(f"[bold green]Saved to {config.components[0].hist.path.parent}/")

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/upp/stages/interpolation.py RENAMED Viewed

@@ -70,7 +70,7 @@ def upscale_array(
 def upscale_array_regionally(
     array: np.array,
     upscl: int,
-    regionlengthsd: list,
+    num_bins: list,
     order: int = 3,
     mode: str = "nearest",
     positive: bool = True,
@@ -83,7 +83,7 @@ def upscale_array_regionally(
         array to be upscaled
     upscl : int
         upscaling factor
-    regionlengthsd : list
+    num_bins : list
         list of lists of region lengths in each dimension,
         region lengths should sum to the length of the array in that dimension
     order : int, optional
@@ -99,10 +99,10 @@ def upscale_array_regionally(
         Array that is upscaled by a factor of upscl
     """
     up_array = np.empty(shape=[ds * upscl for ds in array.shape])
-    starts = [np.cumsum([0] + regionlengths)[:-1] for regionlengths in regionlengthsd]
+    starts = [np.cumsum([0] + regionlengths)[:-1] for regionlengths in num_bins]
     starts_grid = np.meshgrid(*starts)
     starts_grid = [starts_grid[i].flatten() for i in range(len(starts_grid))]
-    finishes = [np.cumsum(regionlengths) for regionlengths in regionlengthsd]
+    finishes = [np.cumsum(regionlengths) for regionlengths in num_bins]
     finishes_grid = np.meshgrid(*finishes)
     finishes_grid = [finishes_grid[i].flatten() for i in range(len(finishes_grid))]
     d = len(array.shape)

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/upp/stages/merging.py RENAMED Viewed

@@ -17,11 +17,13 @@ class Merging:
         self.components = config.components
         self.variables = config.variables
         self.batch_size = config.batch_size
-        self.jets_name = self.ppc.jets_name
+        self.jets_name = config.jets_name
         self.rng = np.random.default_rng(42)
         self.flavours = self.components.flavours
     def add_jet_flavour_label(self, jets, component):
+        if "flavour_label" in jets.dtype.names:
+            return jets
         int_label = self.flavours.index(component.flavour)
         label_array = np.full(len(jets), int_label, dtype=[("flavour_label", "i4")])
         return join_structured_arrays([jets, label_array])
@@ -49,6 +51,13 @@ class Merging:
         if all(c.complete for c in components):
             return False
+        # apply track selections
+        for name in self.variables.variables:
+            if name == self.jets_name:
+                continue
+            if selector := self.variables.selectors.get(name):
+                merged[name] = selector(merged[name])
         # write
         self.writer.write(merged)
         return len(merged[self.jets_name])
@@ -57,7 +66,7 @@ class Merging:
         # setup inputs
         for c in components:
             batch_size = self.batch_size * c.num_jets // components.num_jets + 1
-            c.setup_reader(batch_size, fname=c.out_path)
+            c.setup_reader(batch_size, fname=c.out_path, jets_name=self.jets_name)
             c.stream = c.reader.stream(self.variables.combined(), c.reader.num_jets)
             c.complete = False
@@ -70,6 +79,7 @@ class Merging:
             components[0].reader.dtypes(self.variables.combined()),
             components[0].reader.shapes(components.num_jets, self.variables.keys()),
             add_flavour_label=self.jets_name,
+            jets_name=self.jets_name,
         )
         self.writer.add_attr("flavour_label", [f.name for f in self.flavours], self.jets_name)
         self.writer.add_attr("unique_jets", components.unique_jets)

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/upp/stages/normalisation.py RENAMED Viewed

@@ -16,7 +16,7 @@ class Normalisation:
         self.components = config.components
         self.variables = config.variables
         self.jets_name = self.ppc.jets_name
-        self.num_jets = config.num_jets_estimate
+        self.num_jets = config.num_jets_estimate_norm
         self.norm_fname = config.out_dir / config.config.get("norm_fname", "norm_dict.yaml")
         self.class_fname = config.out_dir / config.config.get("class_fname", "class_dict.yaml")
@@ -62,7 +62,7 @@ class Normalisation:
         return combined
     def get_class_dict(self, batch):
-        ignore = ["VertexIndex", "ftagTruthParentBarcode", "barcode"]
+        ignore = ["VertexIndex", "ftagTruthParentBarcode", "barcode", "eventNumber", "jetFoldHash"]
         class_dict = {k: {} for k in self.variables}
         for name, array in batch.items():
             if name != self.variables.jets_name:
@@ -118,7 +118,9 @@ class Normalisation:
         log.info(f"[bold green]{title:-^100}")
         # setup reader
-        reader = H5Reader(self.ppc.out_fname, self.ppc.batch_size, precision="full")
+        reader = H5Reader(
+            self.ppc.out_fname, self.ppc.batch_size, precision="full", jets_name=self.jets_name
+        )
         log.debug(f"Setup reader at: {self.ppc.out_fname}")
         norm_dict = None

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/upp/stages/plot.py RENAMED Viewed

@@ -5,6 +5,7 @@ from pathlib import Path
 from ftag import Flavours
 from ftag.hdf5 import H5Reader
+from ftag.labels import LabelContainer
 from puma import Histogram, HistogramPlot
 from upp.utils import path_append
@@ -14,6 +15,7 @@ def load_jets(
     paths: str | list,
     variable: str,
     flavour_label="flavour_label",
+    jets_name="jets",
 ) -> dict:
     """
     Load the variables and labels from the jets in a given file(s).
@@ -28,15 +30,18 @@ def load_jets(
     flavour_label : str, optional
         Name of the flavour label variable which is used for the labels,
         by default "flavour_label"
+    jets_name: str, optional
+        Name of the jet dataset / the global objects
+        by default "jets"
     Returns
     -------
     dict
         Dict with the loaded variable and labels.
     """
-    variables = {"jets": [flavour_label, variable]}
-    reader = H5Reader(paths, batch_size=1000)
-    df = reader.load(variables, num_jets=10000)["jets"]
+    variables = {jets_name: [flavour_label, variable]}
+    reader = H5Reader(paths, batch_size=1000, jets_name=jets_name)
+    df = reader.load(variables, num_jets=10000)[jets_name]
     return df
@@ -45,8 +50,10 @@ def make_hist(
     flavours: list,
     variable: str,
     in_paths: str | list,
+    jets_name: str = "jets",
     bins_range: tuple | None = None,
     suffix: str = "",
+    flavour_cont: LabelContainer = Flavours,
 ) -> None:
     """
     Create and plot the histogram and save it to disk.
@@ -64,6 +71,9 @@ def make_hist(
         Variable that is to be histogrammed and plotted.
     in_paths : str
         Path to the files from which the jets are loaded.
+    jets_name: str, optional
+        Name of the jet dataset / the global objects
+        by default "jets"
     bins_range : tuple, optional
         bins_range argument from puma.HistogramPlot,
         by default None
@@ -72,11 +82,11 @@ def make_hist(
         output name, by default "".
     """
     # Load the variable from the jets
-    df = load_jets(in_paths, variable)
+    df = load_jets(in_paths, variable, jets_name=jets_name)
     # Setup the histogram
     plot = HistogramPlot(
-        ylabel="Normalised Number of jets",
+        ylabel=f"Normalised Number of {jets_name}",
         atlas_second_tag="$\\sqrt{s}=13$ TeV",
         xlabel=variable,
         bins=50,
@@ -94,8 +104,8 @@ def make_hist(
         plot.add(
             Histogram(
                 df[df["flavour_label"] == label_value][variable],
-                label=Flavours[label_string].label,
-                colour=Flavours[label_string].colour,
+                label=flavour_cont[label_string].label,
+                colour=flavour_cont[label_string].colour,
             )
         )
@@ -118,6 +128,7 @@ def make_hist_initial(
     flavours: list,
     variable: str,
     in_paths_list: str | list,
+    jets_name: str = "jets",
     bins_range: tuple | None = None,
     suffix: str = "",
     jets_to_plot: int = -1,
@@ -125,7 +136,7 @@ def make_hist_initial(
     suffixes: list | None = None,
     out_format: str = "png",
 ) -> None:
-    """Make inistal dist plots.
+    """Make initial distribution plots.
     Plot the initial distribution of the given variable
     for multiple different samples (like ttbar, zpext, etc.)
@@ -145,6 +156,9 @@ def make_hist_initial(
     in_paths_list : str | list
         String or list of strings with the paths to the files
         from which the jets are loaded.
+    jets_name: str, optional
+        Name of the jet dataset / the global objects
+        by default "jets"
     bins_range : tuple, optional
         bins_range argument from puma.HistogramPlot,
         by default None
@@ -163,7 +177,7 @@ def make_hist_initial(
     """
     # Setup the histogram
     plot = HistogramPlot(
-        ylabel="Normalised Number of jets",
+        ylabel=f"Normalised Number of {jets_name}",
         atlas_second_tag="$\\sqrt{s}=13$ TeV",
         xlabel=variable,
         bins=100,
@@ -187,7 +201,7 @@ def make_hist_initial(
     # Loop over the different samples
     for i, in_paths in enumerate(in_paths_list):
         # Load jets from the file
-        reader = H5Reader(in_paths, batch_size=10000)
+        reader = H5Reader(in_paths, batch_size=10000, jets_name=jets_name)
         # Loop over the flavours
         for flavour in flavours:
@@ -197,12 +211,10 @@ def make_hist_initial(
             plot.add(
                 Histogram(
                     reader.load(
-                        {"jets": [variable]},
+                        {jets_name: [variable]},
                         num_jets=jets_to_plot,
                         cuts=flavour.cuts,
-                    )[
-                        "jets"
-                    ][variable],
+                    )[jets_name][variable],
                     label=flavour.label + " " + suffixes[i],
                     colour=flavour.colour,
                     linestyle=linestiles[i],
@@ -250,6 +262,7 @@ def plot_initial_resampling_dists(config) -> None:
             flavours=config.components.flavours,
             variable=var,
             in_paths_list=paths,
+            jets_name=config.jets_name,
             jets_to_plot=100000,
             out_dir=config.out_dir / "plots",
             suffixes=suffixes,
@@ -260,6 +273,7 @@ def plot_initial_resampling_dists(config) -> None:
                 flavours=config.components.flavours,
                 variable=var,
                 in_paths_list=paths,
+                jets_name=config.jets_name,
                 bins_range=(0, 500e3),
                 suffix="low",
                 jets_to_plot=100000,
@@ -293,15 +307,19 @@ def plot_resampled_dists(config, stage: str) -> None:
         make_hist(
             stage=stage,
             flavours=config.components.flavours,
+            flavour_cont=config.flavour_cont,
             variable=var,
             in_paths=paths,
+            jets_name=config.jets_name,
         )
         if "pt" in var:
             make_hist(
                 stage=stage,
                 flavours=config.components.flavours,
+                flavour_cont=config.flavour_cont,
                 variable=var,
                 in_paths=paths,
+                jets_name=config.jets_name,
                 bins_range=(0, 500e3),
                 suffix="low",
             )

{umami-preprocessing-0.0.6 → umami_preprocessing-0.2.0}/upp/stages/resampling.py RENAMED Viewed

@@ -38,14 +38,14 @@ class Resampling:
         self.components = config.components
         self.variables = config.variables
         self.batch_size = config.batch_size
-        self.is_test = config.is_test
-        self.num_jets_estimate = config.num_jets_estimate
+        self.jets_name = config.jets_name
         self.upscale_pdf = config.sampl_cfg.upscale_pdf or 1
-        self.regionlengthsd = self.get_regionlengthsd_from_config()
+        self.num_bins = self.get_num_bins_from_config()
         self.methods_map = {
             "pdf": self.pdf_select_func,
             "countup": self.countup_select_func,
             "none": None,
+            None: None,
         }
         if self.config.method not in self.methods_map:
             raise ValueError(
@@ -97,7 +97,7 @@ class Resampling:
         num_samples = int(len(jets) * component.sampling_fraction)
         ratios = safe_divide(self.target.hist.pbin, component.hist.pbin)
         if self.upscale_pdf > 1:
-            ratios = upscale_array_regionally(ratios, self.upscale_pdf, self.regionlengthsd)
+            ratios = upscale_array_regionally(ratios, self.upscale_pdf, self.num_bins)
         probs = ratios[binnumbers]
         idx = random.choices(np.arange(len(jets)), weights=probs, k=num_samples)
         return idx
@@ -123,7 +123,7 @@ class Resampling:
                 # apply sampling
                 idx = np.arange(len(batch_out[self.variables.jets_name]))
-                if c != self.target and not self.is_test and self.select_func:
+                if c != self.target and self.select_func:
                     idx = self.select_func(batch_out[self.variables.jets_name], c)
                     if len(idx) == 0:
                         continue
@@ -177,6 +177,7 @@ class Resampling:
             reader = H5Reader(
                 sample.path,
                 self.batch_size,
+                jets_name=self.jets_name,
                 equal_jets=equal_jets_flag,
                 transform=self.transform,
             )
@@ -242,8 +243,8 @@ class Resampling:
         # setup i/o
         for c in self.components:
             # just used for the writer configuration
-            c.setup_reader(self.batch_size, transform=self.transform)
-            c.setup_writer(self.variables)
+            c.setup_reader(self.batch_size, jets_name=self.jets_name, transform=self.transform)
+            c.setup_writer(self.variables, jets_name=self.jets_name)
         # set sampling fraction if needed
         self.set_component_sampling_fractions()
@@ -254,7 +255,7 @@ class Resampling:
             f" {self.config.sampling_fraction}..."
         )
         for c in self.components:
-            frac = c.sampling_fraction if not self.is_test else 1
+            frac = c.sampling_fraction if self.select_func else 1
             c.check_num_jets(c.num_jets, sampling_frac=frac, cuts=c.cuts)
         # run resampling
@@ -268,7 +269,7 @@ class Resampling:
         log.info(f"[bold green]Estimated unqiue jets: {unique:,.0f}")
         log.info(f"[bold green]Saved to {self.components.out_dir}/")
-    def get_regionlengthsd_from_config(self) -> list[list[int]]:
+    def get_num_bins_from_config(self) -> list[list[int]]:
         """Get the lengths of the binning regions in each variable from the config.
         Returns
@@ -276,7 +277,7 @@ class Resampling:
         typing.List[typing.List[int]]
             lengths of the binning regions in each variable from the config
         """
-        regionlengthsd = []
+        num_bins = []
         for row in self.config.bins.values():
-            regionlengthsd.append([sub[-1] for sub in row])
-        return regionlengthsd
+            num_bins.append([sub[-1] for sub in row])
+        return num_bins

umami-preprocessing-0.0.6/umami_preprocessing.egg-info/requires.txt DELETED Viewed

@@ -1,16 +0,0 @@
-pyyaml-include==1.3
-PyYAML==6.0.1
-rich==12.6.0
-scipy==1.10.1
-puma-hep==0.3.0
-atlas-ftag-tools==0.1.10
-dotmap==1.3.30
-[dev]
-black==23.9.1
-ruff==0.0.289
-mypy==1.5.1
-pre-commit==3.1.1
-pytest==7.2.2
-pytest-mock==3.11.1
-pytest-cov==4.0.0