atlas-ftag-tools 0.2.12__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
atlas_ftag_tools-0.2.13.dist-info/METADATA CHANGED
@@ -1,11 +1,11 @@
  Metadata-Version: 2.4
  Name: atlas-ftag-tools
- Version: 0.2.12
+ Version: 0.2.13
  Summary: ATLAS Flavour Tagging Tools
  Author: Sam Van Stroud, Philipp Gadow
  License: MIT
  Project-URL: Homepage, https://github.com/umami-hep/atlas-ftag-tools/
- Requires-Python: <3.12,>=3.8
+ Requires-Python: <3.12,>=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: h5py>=3.0
atlas_ftag_tools-0.2.13.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
- atlas_ftag_tools-0.2.12.dist-info/licenses/LICENSE,sha256=R4o6bZfajQ1KxwcIeavTC00qYTdL33YGNe1hzfV53gM,11349
- ftag/__init__.py,sha256=CU1RjEu6pHq11LQ2kAy9YDittMHXB51fNWvuy1NFr7o,748
+ atlas_ftag_tools-0.2.13.dist-info/licenses/LICENSE,sha256=R4o6bZfajQ1KxwcIeavTC00qYTdL33YGNe1hzfV53gM,11349
+ ftag/__init__.py,sha256=UdYmO_mROM7jvqpPUMbnaQxdCrlR8O0KLlhatyMnapw,748
  ftag/cli_utils.py,sha256=w3TtQmUHSyAKChS3ewvOtcSDAUJAZGIIomaNi8f446U,298
  ftag/cuts.py,sha256=9_ooLZHaO3SnIQBNxwbaPZn-qptGdKnB27FdKQGTiTY,2933
  ftag/flavours.py,sha256=ShH4M2UjQZpZ_NlCctTm2q1tJbzYxjmGteioQ2GcqEU,114
@@ -13,7 +13,7 @@ ftag/region.py,sha256=ANv0dGI2W6NJqD9fp7EfqAUReH4FOjc1gwl_Qn8llcM,360
  ftag/sample.py,sha256=3N0FrRcu9l1sX8ohuGOHuMYGD0See6gMO4--7NzR2tE,2538
  ftag/track_selector.py,sha256=fJNk_kIBQriBqV4CPT_3ReJbOUnavDDzO-u3EQlRuyk,2654
  ftag/transform.py,sha256=uEGGJSnqoKOzLYQv650XdK_kDNw4Aw-5dc60z9Dp_y0,3963
- ftag/vds.py,sha256=wqj1cA6mIJ4enk8inkearo7ccTw5KCbvuNo2oon51fc,4565
+ ftag/vds.py,sha256=l6b54naOK7z0gZjvvtIAQv2Ky4X1w1yLrisZZZYqvbY,11259
  ftag/working_points.py,sha256=RJws2jPMEDQDspCbXUZBifS1CCBmlMJ5ax0eMyDzCRA,15949
  ftag/hdf5/__init__.py,sha256=8yzVQITge-HKkBQQ60eJwWmWDycYZjgVs-qVg4ShVr0,385
  ftag/hdf5/h5add_col.py,sha256=htS5wn4Tm4S3U6mrJ8s24VUnbI7o28Z6Ll-J_V68xTA,12558
@@ -25,8 +25,8 @@ ftag/hdf5/h5writer.py,sha256=SMurvZ8FPvqieZUaYRX2SBu-jIyZ6Fx8IasUrEOxIvM,7185
  ftag/utils/__init__.py,sha256=U3YyLY77-FzxRUbudxciieDoy_mnLlY3OfBquA3PnTE,524
  ftag/utils/logging.py,sha256=54NaQiC9Bh4vSznSqzoPfR-7tj1PXfmoH7yKgv_ZHZk,3192
  ftag/utils/metrics.py,sha256=zQI4nPeRDSyzqKpdOPmu0GU560xSWoW1wgL13rrja-I,12664
- atlas_ftag_tools-0.2.12.dist-info/METADATA,sha256=bGfabVRARSL6PZTsDqen30IkQVqVGw8Tg9lMCnzY-5w,2152
- atlas_ftag_tools-0.2.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- atlas_ftag_tools-0.2.12.dist-info/entry_points.txt,sha256=acr7WwxMIJ3x2I7AheNxNnpWE7sS8XE9MA1eUJGcU5A,169
- atlas_ftag_tools-0.2.12.dist-info/top_level.txt,sha256=qiYQuKcAvMim-31FwkT3MTQu7WQm0s58tPAia5KKWqs,5
- atlas_ftag_tools-0.2.12.dist-info/RECORD,,
+ atlas_ftag_tools-0.2.13.dist-info/METADATA,sha256=ZpQ5GggkLyizsv9uHEOvIlzRqPmC-4tNaoaMgV6unF4,2153
+ atlas_ftag_tools-0.2.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ atlas_ftag_tools-0.2.13.dist-info/entry_points.txt,sha256=acr7WwxMIJ3x2I7AheNxNnpWE7sS8XE9MA1eUJGcU5A,169
+ atlas_ftag_tools-0.2.13.dist-info/top_level.txt,sha256=qiYQuKcAvMim-31FwkT3MTQu7WQm0s58tPAia5KKWqs,5
+ atlas_ftag_tools-0.2.13.dist-info/RECORD,,
ftag/__init__.py CHANGED
@@ -2,7 +2,7 @@
  
  from __future__ import annotations
  
- __version__ = "v0.2.12"
+ __version__ = "v0.2.13"
  
  from . import hdf5, utils
  from .cuts import Cuts
ftag/vds.py CHANGED
@@ -8,42 +8,78 @@ import sys
  from pathlib import Path
  
  import h5py
+ import numpy as np
  
  
- def parse_args(args):
+ def parse_args(args=None):
      parser = argparse.ArgumentParser(
-         description="Create a lightweight wrapper around a set of h5 files"
+         description="Create a lightweight HDF5 wrapper (virtual datasets + "
+         "summed cutBookkeeper counts) around a set of .h5 files"
+     )
+     parser.add_argument(
+         "pattern",
+         type=Path,
+         help="quotes-enclosed glob pattern of files to merge, "
+         "or a regex if --use_regex is given",
      )
-     parser.add_argument("pattern", type=Path, help="quotes-enclosed glob pattern of files to merge")
      parser.add_argument("output", type=Path, help="path to output virtual file")
-     parser.add_argument("--use_regex", help="if provided pattern is a regex", action="store_true")
-     parser.add_argument("--regex_path", type=str, required="--regex" in sys.argv, default=None)
+     parser.add_argument(
+         "--use_regex",
+         action="store_true",
+         help="treat PATTERN as a regular expression instead of a glob",
+     )
+     parser.add_argument(
+         "--regex_path",
+         type=str,
+         required="--use_regex" in (args or sys.argv),
+         default=None,
+         help="directory whose entries the regex is applied to "
+         "(defaults to the current working directory)",
+     )
      return parser.parse_args(args)
  
  
- def get_virtual_layout(fnames: list[str], group: str):
-     # get sources
+ def get_virtual_layout(fnames: list[str], group: str) -> h5py.VirtualLayout:
+     """Concatenate a group from multiple files into a single virtual dataset.
+ 
+     Parameters
+     ----------
+     fnames : list[str]
+         List with the file names
+     group : str
+         Name of the group that is concatenated
+ 
+     Returns
+     -------
+     h5py.VirtualLayout
+         Virtual layout of the new virtual dataset
+     """
      sources = []
      total = 0
+ 
+     # Loop over the input files
      for fname in fnames:
-         with h5py.File(fname) as f:
-             vsource = h5py.VirtualSource(f[group])
-             total += vsource.shape[0]
-             sources.append(vsource)
+         with h5py.File(fname, "r") as f:
+             # Get the virtual source and add its length to the total
+             vsrc = h5py.VirtualSource(f[group])
+             total += vsrc.shape[0]
+             sources.append(vsrc)
  
-     # define layout of the vds
-     with h5py.File(fnames[0]) as f:
+     # Define the layout of the output vds
+     with h5py.File(fnames[0], "r") as f:
          dtype = f[group].dtype
          shape = f[group].shape
+ 
+     # Update the shape and finalize the output layout
      shape = (total,) + shape[1:]
      layout = h5py.VirtualLayout(shape=shape, dtype=dtype)
  
-     # fill the vds
+     # Fill the vds
      idx = 0
-     for source in sources:
-         length = source.shape[0]
-         layout[idx : idx + length] = source
+     for vsrc in sources:
+         length = vsrc.shape[0]
+         layout[idx : idx + length] = vsrc
          idx += length
  
      return layout
  
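
The virtual dataset built from this layout stores no event data of its own: h5py resolves every slice of the layout back to the source files at read time. A minimal sketch of reading such a wrapper, assuming a hypothetical "jets" group and the default output location used later in this file:

    import h5py

    # Reading the wrapper behaves like reading one concatenated file;
    # each slice is fetched from whichever source file backs it.
    with h5py.File("vds/vds.h5", "r") as f:
        jets = f["jets"]               # "jets" is a hypothetical group name
        print(jets.shape, jets.dtype)  # total length across all source files
        first_rows = jets[:10]         # served from the first source file
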
@@ -50,12 +86,50 @@
  
- def glob_re(pattern, regex_path):
+ def glob_re(pattern: str | None, regex_path: str | None) -> list[str] | None:
+     """Return the list of filenames that match a regex pattern inside regex_path.
+ 
+     Parameters
+     ----------
+     pattern : str | None
+         Pattern for the input files
+     regex_path : str | None
+         Directory whose entries the regex is applied to
+ 
+     Returns
+     -------
+     list[str] | None
+         List of the file basenames that matched the regex pattern
+     """
+     if pattern is None or regex_path is None:
+         return None
+ 
      return list(filter(re.compile(pattern).match, os.listdir(regex_path)))
  
  
- def regex_files_from_dir(reg_matched_fnames, regex_path):
+ def regex_files_from_dir(
+     reg_matched_fnames: list[str] | None,
+     regex_path: str | None,
+ ) -> list[str] | None:
+     """Turn a list of basenames into full paths; dive into sub-dirs if needed.
+ 
+     Parameters
+     ----------
+     reg_matched_fnames : list[str] | None
+         List of the regex matched file names
+     regex_path : str | None
+         Directory whose entries the regex was applied to
+ 
+     Returns
+     -------
+     list[str] | None
+         List of file paths (as strings) that matched the regex and any subsequent
+         globbing inside matched directories.
+     """
+     if reg_matched_fnames is None or regex_path is None:
+         return None
+ 
      parent_dir = regex_path or str(Path.cwd())
-     full_paths = [parent_dir + "/" + fname for fname in reg_matched_fnames]
-     paths_to_glob = [fname + "/*.h5" if Path(fname).is_dir() else fname for fname in full_paths]
-     nested_fnames = [glob.glob(fname) for fname in paths_to_glob]
+     full_paths = [Path(parent_dir) / fname for fname in reg_matched_fnames]
+     paths_to_glob = [str(fp / "*.h5") if fp.is_dir() else str(fp) for fp in full_paths]
+     nested_fnames = [glob.glob(p) for p in paths_to_glob]
      return sum(nested_fnames, [])
  
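
glob_re() matches the regex against the entries of regex_path only, and regex_files_from_dir() then expands any matched directory one level via a "*.h5" glob. A sketch of the intended call chain, with hypothetical paths:

    from ftag.vds import glob_re, regex_files_from_dir

    # Suppose /data holds run1.h5, run2.h5 and a directory batch3/ of .h5 files
    basenames = glob_re(r"(run\d+\.h5|batch\d+)", "/data")
    fnames = regex_files_from_dir(basenames, "/data")
    # -> ["/data/run1.h5", "/data/run2.h5", "/data/batch3/a.h5", ...]
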
@@ -62,1 +136,107 @@
  
+ def sum_counts_once(counts: np.ndarray) -> np.ndarray:
+     """Reduce the counts dataset of one file to a scalar record via summation.
+ 
+     Parameters
+     ----------
+     counts : np.ndarray
+         Array from the h5py dataset (counts) from the cutBookkeeper groups
+ 
+     Returns
+     -------
+     np.ndarray
+         Array with the summed variables for the file
+     """
+     dtype = counts.dtype
+     summed = np.zeros((), dtype=dtype)
+     for field in dtype.names:
+         summed[field] = counts[field].sum()
+     return summed
+ 
+ 
+ def check_subgroups(fnames: list[str], group_name: str = "cutBookkeeper") -> list[str]:
+     """Check which subgroups are available for the bookkeeper.
+ 
+     Find the intersection of sub-group names that have a 'counts' dataset
+     in every input file. (Using the intersection makes the script robust
+     even if a few files are missing a variation.)
+ 
+     Parameters
+     ----------
+     fnames : list[str]
+         List of the input files
+     group_name : str, optional
+         Group name in the h5 files of the bookkeeper, by default "cutBookkeeper"
+ 
+     Returns
+     -------
+     list[str]
+         Sorted list of the sub-group names common to all files
+ 
+     Raises
+     ------
+     KeyError
+         When a file does not have a bookkeeper
+     ValueError
+         When no common bookkeeper sub-groups were found
+     """
+     common: set[str] | None = None
+     for fname in fnames:
+         with h5py.File(fname, "r") as f:
+             if group_name not in f:
+                 raise KeyError(f"{fname} has no '{group_name}' group")
+             these = {
+                 name
+                 for name, item in f[group_name].items()
+                 if isinstance(item, h5py.Group) and "counts" in item
+             }
+             common = these if common is None else common & these
+     if not common:
+         raise ValueError("No common cutBookkeeper sub-groups with 'counts' found")
+     return sorted(common)
+ 
+ 
+ def aggregate_cutbookkeeper(
+     fnames: list[str],
+     group_name: str = "cutBookkeeper",
+ ) -> dict[str, np.ndarray] | None:
+     """Aggregate the cutBookkeeper in the input files.
+ 
+     For every input file:
+         For every sub-group (nominal, sysUp, sysDown, …):
+         1. Sum the 4-entry record array inside each file into 1 record
+         2. Add those records from all files together into a grand total
+     Returns a dict {subgroup_name: scalar-record-array}
+ 
+     Parameters
+     ----------
+     fnames : list[str]
+         List of the input files
+ 
+     Returns
+     -------
+     dict[str, np.ndarray] | None
+         Dict with the accumulated cutBookkeeper groups. If the cut bookkeeper
+         is not in the files, return None.
+     """
+     if any(group_name not in h5py.File(f, "r") for f in fnames):
+         return None
+ 
+     subgroups = check_subgroups(fnames, group_name=group_name)
+ 
+     # Initialise an accumulator per subgroup (dtype taken from 1st file)
+     accum: dict[str, np.ndarray] = {}
+     with h5py.File(fnames[0], "r") as f0:
+         for sg in subgroups:
+             dtype = f0[f"{group_name}/{sg}/counts"].dtype
+             accum[sg] = np.zeros((), dtype=dtype)
+ 
+     # Add each file's contribution field-wise
+     for fname in fnames:
+         with h5py.File(fname, "r") as f:
+             for sg in subgroups:
+                 per_file = sum_counts_once(f[f"{group_name}/{sg}/counts"][()])
+                 for fld in accum[sg].dtype.names:
+                     accum[sg][fld] += per_file[fld]
+ 
+     return accum
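
sum_counts_once() leans on NumPy structured arrays: a zero-dimensional record serves as the accumulator and each named field is summed independently. A self-contained sketch with a hypothetical counts dtype (the real field names come from the input files):

    import numpy as np

    # Hypothetical cutBookkeeper counts: one record per processing block
    dtype = np.dtype([("nEventsProcessed", "i8"), ("sumOfWeights", "f8")])
    counts = np.array([(10, 1.5), (20, 2.5)], dtype=dtype)

    summed = np.zeros((), dtype=dtype)  # scalar record, as in sum_counts_once()
    for field in dtype.names:
        summed[field] = counts[field].sum()
    print(summed)  # (30, 4.0)
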
@@ -63,59 +243,99 @@
+ 
+ 
  def create_virtual_file(
      pattern: Path | str,
-     out_fname: Path | None = None,
+     out_fname: Path | str | None = None,
      use_regex: bool = False,
      regex_path: str | None = None,
      overwrite: bool = False,
- ):
-     # get list of filenames
+     bookkeeper_name: str = "cutBookkeeper",
+ ) -> Path:
+     """Create the virtual dataset file for the given inputs.
+ 
+     Parameters
+     ----------
+     pattern : Path | str
+         Pattern of the input files used. Wildcard is supported
+     out_fname : Path | str | None, optional
+         Output path to which the virtual dataset file is written. By default None
+     use_regex : bool, optional
+         Whether to treat the pattern as a regex instead of a glob, by default False
+     regex_path : str | None, optional
+         Directory whose entries the regex is applied to, by default None
+     overwrite : bool, optional
+         Whether to overwrite an existing output file, by default False
+     bookkeeper_name : str, optional
+         Name of the cut bookkeeper group in the h5 files, by default "cutBookkeeper"
+ 
+     Returns
+     -------
+     Path
+         Path to which the output file is written
+ 
+     Raises
+     ------
+     FileNotFoundError
+         If no input files were found for the given pattern
+     ValueError
+         If no output file is given and the input comes from multiple directories
+     """
+     # Get list of filenames
      pattern_str = str(pattern)
-     if use_regex:
-         reg_matched_fnames = glob_re(pattern_str, regex_path)
-         print("reg matched fnames: ", reg_matched_fnames)
-         fnames = regex_files_from_dir(reg_matched_fnames, regex_path)
+ 
+     # Use regex to find input files, else use glob
+     if use_regex is True:
+         matched = glob_re(pattern_str, regex_path)
+         fnames = regex_files_from_dir(matched, regex_path)
      else:
          fnames = glob.glob(pattern_str)
+ 
+     # Throw an error if no input files were found
      if not fnames:
-         raise FileNotFoundError(f"No files matched pattern {pattern}")
-     print("Files to merge to vds: ", fnames)
+         raise FileNotFoundError(f"No files matched pattern {pattern!r}")
  
-     # infer output path if not given
+     # Infer output path if not given
      if out_fname is None:
-         assert len({Path(fname).parent for fname in fnames}) == 1
+         if len({Path(f).parent for f in fnames}) != 1:
+             raise ValueError("Give --output when files reside in multiple dirs")
          out_fname = Path(fnames[0]).parent / "vds" / "vds.h5"
      else:
          out_fname = Path(out_fname)
  
-     # check if file already exists
+     # If overwrite is not active and a file exists, stop here
      if not overwrite and out_fname.is_file():
          return out_fname
  
-     # identify common groups across all files
+     # Identify common groups across all files
      common_groups: set[str] = set()
      for fname in fnames:
-         with h5py.File(fname) as f:
+         with h5py.File(fname, "r") as f:
              groups = set(f.keys())
-             common_groups = groups if not common_groups else common_groups.intersection(groups)
- 
-     if not common_groups:
-         raise ValueError("No common groups found across files")
- 
-     # create virtual file
-     out_fname.parent.mkdir(exist_ok=True)
-     with h5py.File(out_fname, "w") as f:
-         for group in common_groups:
-             layout = get_virtual_layout(fnames, group)
-             f.create_virtual_dataset(group, layout)
-             attrs_dict: dict = {}
-             for fname in fnames:
-                 with h5py.File(fname) as g:
-                     for name, value in g[group].attrs.items():
-                         if name not in attrs_dict:
-                             attrs_dict[name] = []
-                         attrs_dict[name].append(value)
-             for name, value in attrs_dict.items():
-                 if len(value) > 0:
-                     f[group].attrs[name] = value[0]
+             common_groups = groups if not common_groups else common_groups & groups
+ 
+     # Ditch the bookkeeper; we will process it separately
+     common_groups.discard("cutBookkeeper")
+ 
+     # Ensure the directory of the output file exists
+     out_fname.parent.mkdir(parents=True, exist_ok=True)
+ 
+     # Build the output file
+     with h5py.File(out_fname, "w") as fout:
+         # Build "standard" groups
+         for gname in sorted(common_groups):
+             layout = get_virtual_layout(fnames, gname)
+             fout.create_virtual_dataset(gname, layout)
+ 
+             # Copy first-file attributes to the VDS object
+             with h5py.File(fnames[0], "r") as f0:
+                 for k, v in f0[gname].attrs.items():
+                     fout[gname].attrs[k] = v
+ 
+         # Build the cutBookkeeper
+         counts_total = aggregate_cutbookkeeper(fnames=fnames, group_name=bookkeeper_name)
+         if counts_total is not None:
+             for sg, record in counts_total.items():
+                 grp = fout.require_group(f"{bookkeeper_name}/{sg}")
+                 grp.create_dataset("counts", data=record, shape=(), dtype=record.dtype)
  
      return out_fname
  
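
With the new bookkeeper_name keyword and the Path return value, the whole pipeline is usable as a library call; a sketch with hypothetical paths:

    from ftag.vds import create_virtual_file

    # Merge all ntuples in one directory; with out_fname omitted the file
    # is written to the default location ntuples/vds/vds.h5
    out_path = create_virtual_file("ntuples/*.h5", overwrite=True)
    print(out_path)
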
@@ -123,19 +343,20 @@ def create_virtual_file(
  def main(args=None) -> None:
      args = parse_args(args)
      matching_mode = "Applying regex to" if args.use_regex else "Globbing"
-     print(f"{matching_mode} {args.pattern}...")
-     create_virtual_file(
-         args.pattern,
-         args.output,
+     print(f"{matching_mode} {args.pattern} ...")
+     out_path = create_virtual_file(
+         pattern=args.pattern,
+         out_fname=args.output,
          use_regex=args.use_regex,
          regex_path=args.regex_path,
          overwrite=True,
      )
-     with h5py.File(args.output) as f:
+ 
+     with h5py.File(out_path, "r") as f:
          key = next(iter(f.keys()))
-         num = len(f[key])
-         print(f"Virtual dataset '{key}' has {num:,} entries")
-         print(f"Saved virtual file to {args.output.resolve()}")
+         print(f"Virtual dataset '{key}' has {len(f[key]):,} entries")
+ 
+     print(f"Saved virtual file to {out_path.resolve()}")
  
  
  if __name__ == "__main__":
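
main() accepts an explicit argument list, so the CLI behaviour can be exercised from Python as well; a hypothetical example, equivalent to running the module with a quoted glob on the command line:

    from ftag.vds import main

    # Same as: python -m ftag.vds "ntuples/*.h5" ntuples/vds.h5
    main(["ntuples/*.h5", "ntuples/vds.h5"])

    # Regex mode; --regex_path becomes required once --use_regex is given
    main(["run\\d+\\.h5", "merged.h5", "--use_regex", "--regex_path", "ntuples"])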