PyPI - atlas-ftag-tools - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

atlas-ftag-tools 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

{atlas_ftag_tools-0.1.3.dist-info → atlas_ftag_tools-0.1.5.dist-info}/METADATA +7 -3
atlas_ftag_tools-0.1.5.dist-info/RECORD +19 -0
ftag/__init__.py +1 -1
ftag/cuts.py +1 -1
ftag/flavours.yaml +7 -2
ftag/hdf5/h5reader.py +101 -17
ftag/mock.py +1 -0
ftag/vds.py +23 -10
atlas_ftag_tools-0.1.3.dist-info/RECORD +0 -19
{atlas_ftag_tools-0.1.3.dist-info → atlas_ftag_tools-0.1.5.dist-info}/WHEEL +0 -0
{atlas_ftag_tools-0.1.3.dist-info → atlas_ftag_tools-0.1.5.dist-info}/entry_points.txt +0 -0
{atlas_ftag_tools-0.1.3.dist-info → atlas_ftag_tools-0.1.5.dist-info}/top_level.txt +0 -0

{atlas_ftag_tools-0.1.3.dist-info → atlas_ftag_tools-0.1.5.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: atlas-ftag-tools
-Version: 0.1.3
+Version: 0.1.5
 Summary: ATLAS Flavour Tagging Tools
 Author: Sam Van Stroud, Philipp Gadow
 License: MIT
@@ -39,7 +39,7 @@ If you want to use this package without modification, you can install from [pypi
 pip install atlas-ftag-tools
 ```
-To additionally install the development dependencies (for formatting and linting) rn
+To additionally install the development dependencies (for formatting and linting) use
 ```bash
 pip install atlas-ftag-tools[dev]
 ```
@@ -58,10 +58,11 @@ Include development dependencies with
 python -m pip install -e ".[dev]"
 ```
-You can set up pre-commit hooks with
+You can set up and run pre-commit hooks with
 ```bash
 pre-commit install
+pre-commit run --all-files
 ```
 To run the tests you can use the `pytest` or `coverage` command, for example
@@ -75,6 +76,9 @@ Running `coverage report` will display the test coverage.
 # Usage
+Please see the [example notebook](ftag/example.ipynb) for full usage.
+Additional functionality is also documented below.
 ## Create virtual file
 This package contains a script to easily merge a set of H5 files.

atlas_ftag_tools-0.1.5.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,19 @@
+ftag/__init__.py,sha256=XBQEZpFSnGyihB9F3eGOvB_5YknggY_L6fzwYszXLuQ,543
+ftag/cuts.py,sha256=lCnyHd4kbrt3CMXGE1ASCgaa07o1qOBn6GQek6lClVQ,2734
+ftag/flavour.py,sha256=sEelvHNLWmHsecQQrmRc8ktwykMMHnGX8ePDRrqQkuo,2460
+ftag/flavours.yaml,sha256=VrOGD5FUhMVPIW31whY-nSqNv98AcnLsPmPGmAcCg3w,3287
+ftag/mock.py,sha256=HUyYOPsRtkmzjLRNF2zs0kpVUrTRIHTsnIyDlXIZArU,3627
+ftag/region.py,sha256=-WxdC0Gy9zz3zEJ2pN779RcxXPG-QEROuMwMoP-Qs0g,353
+ftag/sample.py,sha256=uVNyxFYMMtkP-o2tjQatpo8mIH4ZNNe3mSFEPebYh_E,2622
+ftag/vds.py,sha256=8b5-zqDELUmxdO5Txdowe3v7XGS1pKgO20bhzUQqCxU,2945
+ftag/hdf5/__init__.py,sha256=A_a_4IUlZ2mSiDcfrZKBdja_3iTrUHvADM2lWx6g66g,325
+ftag/hdf5/h5reader.py,sha256=1_iyYfWI1ht1-p9vBBpGhw47ZKola_KhWxbrywoB-Jg,11751
+ftag/hdf5/h5utils.py,sha256=GKduv9b6JRSBirRdmNgGcmsINCMTj54kH4RQqxrM1t8,2363
+ftag/hdf5/h5writer.py,sha256=_N-DJSX283r-XsGczvLFA4_qaK4BkFkdKZAusHEvRjU,2919
+ftag/wps/discriminant.py,sha256=86ISONTuIjqTJO1A27oqkoCgDjAQinofiYNdcjfdkIk,1380
+ftag/wps/working_points.py,sha256=487NsQGGY2Qt4q8mXxKABMFa-YLsbrhkPLcYVdebeVk,4950
+atlas_ftag_tools-0.1.5.dist-info/METADATA,sha256=Uc4Z2zAMD7jsSKoV6o2LJwfm2X0KEWYRingGT_msE4I,4182
+atlas_ftag_tools-0.1.5.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+atlas_ftag_tools-0.1.5.dist-info/entry_points.txt,sha256=UKbRbwA9DxfsTPRBIVVDz3u15WdzhzgRKwXXSAXuQqc,73
+atlas_ftag_tools-0.1.5.dist-info/top_level.txt,sha256=qiYQuKcAvMim-31FwkT3MTQu7WQm0s58tPAia5KKWqs,5
+atlas_ftag_tools-0.1.5.dist-info/RECORD,,

ftag/__init__.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """atlas-ftag-tools - Common tools for ATLAS flavour tagging software."""
-__version__ = "v0.1.3"
+__version__ = "v0.1.5"
 import ftag.hdf5 as hdf5

ftag/cuts.py CHANGED Viewed

@@ -20,7 +20,7 @@ OPERATORS = {
     "notin": lambda x, y: ~np.isin(x, y),
 }
-for i in range(2, 20):
+for i in range(2, 101):
     OPERATORS[f"%{i}=="] = functools.partial(lambda x, y, i: (x % i) == y, i=i)
     OPERATORS[f"%{i}<="] = functools.partial(lambda x, y, i: (x % i) <= y, i=i)
     OPERATORS[f"%{i}>="] = functools.partial(lambda x, y, i: (x % i) >= y, i=i)

ftag/flavours.yaml CHANGED Viewed

@@ -49,12 +49,12 @@
 # Xbb tagging
 - name: hbb
-  label: Hbb
+  label: $H \rightarrow b\bar{b}$
   cuts: ["R10TruthLabel_R22v1 == 11"]
   colour: tab:blue
   category: xbb
 - name: hcc
-  label: Hcc
+  label: $H \rightarrow c\bar{c}$
   cuts: ["R10TruthLabel_R22v1 == 12"]
   colour: "#B45F06"
   category: xbb
@@ -63,6 +63,11 @@
   cuts: ["R10TruthLabel_R22v1 == 1"]
   colour: "#A300A3"
   category: xbb
+- name: inclusive_top
+  label: Inclusive Top
+  cuts: ["R10TruthLabel_R22v1 in (1,6,7)"]
+  colour: "#A300A3"
+  category: xbb
 - name: qcd
   label: QCD
   cuts: ["R10TruthLabel_R22v1 == 10"]

ftag/hdf5/h5reader.py CHANGED Viewed

@@ -26,9 +26,10 @@ class H5SingleReader:
     def __post_init__(self) -> None:
         self.sample = Sample(self.fname)
-        if len(self.sample.virtual_file()) != 1:
+        fname = self.sample.virtual_file()
+        if len(fname) != 1:
             raise ValueError("H5SingleReader should only read a single file")
-        self.fname = self.sample.virtual_file()[0]
+        self.fname = fname[0]
     @cached_property
     def num_jets(self) -> int:
@@ -57,21 +58,27 @@ class H5SingleReader:
                 isinf = np.isinf(array[var])
                 keep_idx = keep_idx & ~isinf.any(axis=-1)
                 if num_inf := isinf.sum():
-                    log.warn(
+                    log.warning(
                         f"{num_inf} inf values detected for variable {var} in"
                         f" {name} array. Removing the affected jets."
                     )
         return {name: array[keep_idx] for name, array in data.items()}
     def stream(
-        self, variables: dict | None = None, num_jets: int | None = None, cuts: Cuts | None = None
+        self,
+        variables: dict | None = None,
+        num_jets: int | None = None,
+        cuts: Cuts | None = None,
     ) -> Generator:
         if num_jets is None:
             num_jets = self.num_jets
         if num_jets > self.num_jets:
-            raise ValueError(
-                f"{num_jets:,} jets requested but only {self.num_jets:,} available in {self.fname}"
+            log.warning(
+                f"{num_jets:,} jets requested but only {self.num_jets:,} available in {self.fname}."
+                " Set to maximum available number!"
             )
+            num_jets = self.num_jets
         if variables is None:
             variables = {self.jets_name: None}
@@ -131,6 +138,9 @@ class H5Reader:
         Weights for different input datasets, by default None
     do_remove_inf : bool, optional
         Remove jets with inf values, by default False
+    equal_jets : bool, optional
+        Take the same number of jets (weighted) from each sample, by default True
+        If False, use all jets in each sample.
     """
     fname: Path | str | list[Path | str]
@@ -140,8 +150,16 @@ class H5Reader:
     shuffle: bool = True
     weights: list[float] | None = None
     do_remove_inf: bool = False
+    equal_jets: bool = True
     def __post_init__(self) -> None:
+        if not self.equal_jets:
+            log.warning(
+                "equal_jets is set to False, which will result in different number of jets taken"
+                " from each sample. Be aware that this can affect the resampling, so make sure you"
+                " know what you are doing."
+            )
         if isinstance(self.fname, (str, Path)):
             self.fname = [self.fname]
@@ -191,10 +209,14 @@ class H5Reader:
         Generator
             Generator of batches of selected jets.
         """
+        # Check if number of jets is given, if not, set to maximum available
         if num_jets is None:
             num_jets = self.num_jets
+        # Check if variables is given, if not, set to all
         if variables is None:
             variables = {self.jets_name: None}
         if self.jets_name not in variables or variables[self.jets_name] is not None:
             jet_vars = variables.get(self.jets_name, [])
             variables[self.jets_name] = list(jet_vars) + (cuts.variables if cuts else [])
@@ -207,12 +229,25 @@ class H5Reader:
         rng = np.random.default_rng(42)
         while True:
-            # yeild from each stream
             samples = []
-            for stream in streams:
-                try:
-                    samples.append(next(stream))
-                except StopIteration:
+            # Track which streams have been exhausted
+            streams_done = [False] * len(streams)
+            # for each unexhausted stream, get the next sample
+            for i, stream in enumerate(streams):
+                if not streams_done[i]:
+                    try:
+                        samples.append(next(stream))
+                    # if equal_jets is True, we can stop when any stream is done
+                    # otherwise if sample is exhausted, mark it as done
+                    except StopIteration:
+                        if self.equal_jets:
+                            return
+                        streams_done[i] = True
+                # if equal_jets is False, we need to keep going until all streams are done
+                if all(streams_done):
                     return
             # combine samples and shuffle
@@ -222,23 +257,72 @@ class H5Reader:
                 rng.shuffle(idx)
                 data = {name: array[idx] for name, array in data.items()}
-            # select
+            # yield batch
             yield data
     def load(
         self, variables: dict | None = None, num_jets: int | None = None, cuts: Cuts | None = None
     ) -> dict:
+        """Load multiple batches of selected jets into memory.
+        Parameters
+        ----------
+        variables : dict | None, optional
+            Dictionary of variables to load for each group, by default use all jet variables.
+        num_jets : int | None, optional
+            Total number of selected jets to load, by default all.
+        cuts : Cuts | None, optional
+            Selection cuts to apply, by default None
+        Returns
+        -------
+        dict
+            Dictionary of arrays for each group.
+        """
+        # handle default arguments
+        if num_jets == -1:
+            num_jets = self.num_jets
         if variables is None:
             variables = {self.jets_name: None}
+        # get data from each sample
         data: dict[str, list] = {name: [] for name in variables}
-        for sample in self.stream(variables, num_jets, cuts):
-            for name, array in sample.items():
+        for batch in self.stream(variables, num_jets, cuts):
+            for name, array in batch.items():
                 if name in data:
                     data[name].append(array)
+        # concatenate batches
         return {name: np.concatenate(array) for name, array in data.items()}
     def estimate_available_jets(self, cuts: Cuts, num: int = 1_000_000) -> int:
-        """Estimate the number of jets available after selection cuts, rounded down."""
-        all_jets = self.load({self.jets_name: cuts.variables}, num)[self.jets_name]
-        estimated_num_jets = len(cuts(all_jets).values) / len(all_jets) * self.num_jets
+        """Estimate the number of jets available after selection cuts (round down).
+        Parameters
+        ----------
+        cuts : Cuts
+            Selection cuts to apply.
+        num : int, optional
+            Number of jets to use for the estimation, by default 1_000_000.
+        Returns
+        -------
+        int
+            Estimated number of jets available after selection cuts,
+            rounded down to nearest thousand.
+        """
+        # if equal jets is True, available jets is based on the smallest sample
+        if self.equal_jets:
+            num_jets = []
+            for r in self.readers:
+                stream = r.stream({self.jets_name: cuts.variables}, num)
+                all_jets = np.concatenate([batch[self.jets_name] for batch in stream])
+                frac_selected = len(cuts(all_jets).values) / len(all_jets)
+                num_jets.append(frac_selected * r.num_jets)
+            estimated_num_jets = min(num_jets) * len(self.readers)
+        # otherwise, available jets is based on all samples
+        else:
+            all_jets = self.load({self.jets_name: cuts.variables}, num)[self.jets_name]
+            frac_selected = len(cuts(all_jets).values) / len(all_jets)
+            estimated_num_jets = frac_selected * self.num_jets
         return math.floor(estimated_num_jets / 1_000) * 1_000

ftag/mock.py CHANGED Viewed

@@ -92,6 +92,7 @@ def get_mock_file(num_jets=1000, tracks_name: str = "tracks", num_tracks: int =
     fname = NamedTemporaryFile(suffix=".h5", dir=mkdtemp()).name
     f = h5py.File(fname, "w")
     f.create_dataset("jets", data=jets)
+    f.attrs["test"] = "test"
     # setup tracks
     if tracks_name:

ftag/vds.py CHANGED Viewed

@@ -1,11 +1,22 @@
 from __future__ import annotations
+import argparse
 import glob
 from pathlib import Path
 import h5py
+def parse_args(args):
+    parser = argparse.ArgumentParser(
+        description="Create a lightweight wrapper around a set of h5 files"
+    )
+    parser.add_argument("pattern", type=Path, help="quotes-enclosed glob pattern of files to merge")
+    parser.add_argument("output", type=Path, help="path to output virtual file")
+    args = parser.parse_args(args)
+    return args
 def get_virtual_layout(fnames: list[str], group: str):
     # get sources
     sources = []
@@ -58,20 +69,22 @@ def create_virtual_file(
         for group in h5py.File(fnames[0]):
             layout = get_virtual_layout(fnames, group)
             f.create_virtual_dataset(group, layout)
+            attrs_dict: dict = {}
+            for fname in fnames:
+                with h5py.File(fname) as g:
+                    for name, value in g[group].attrs.items():
+                        if name not in attrs_dict:
+                            attrs_dict[name] = []
+                        attrs_dict[name].append(value)
+            for name, value in attrs_dict.items():
+                if len(value) > 0:
+                    f[group].attrs[name] = value[0]
     return out_fname
-def main():
-    import argparse
-    parser = argparse.ArgumentParser(
-        description="Create a lightweight wrapper around a set of h5 files"
-    )
-    parser.add_argument("pattern", type=Path, help="quotes-enclosed glob pattern of files to merge")
-    parser.add_argument("output", type=Path, help="path to output virtual file")
-    args = parser.parse_args()
+def main(args=None):
+    args = parse_args(args)
     print(f"Globbing {args.pattern}...")
     create_virtual_file(args.pattern, args.output, overwrite=True)
     with h5py.File(args.output) as f:

atlas_ftag_tools-0.1.3.dist-info/RECORD DELETED Viewed

@@ -1,19 +0,0 @@
-ftag/__init__.py,sha256=3VQyLgnMa0A0325TNda80-4qGbPPnQmkrQZq1-klRcA,543
-ftag/cuts.py,sha256=Ge4WXLPg3WNgGxg-g7oIgCbbNFcKZonvkyskU0fDuDg,2733
-ftag/flavour.py,sha256=sEelvHNLWmHsecQQrmRc8ktwykMMHnGX8ePDRrqQkuo,2460
-ftag/flavours.yaml,sha256=S4WoB_n2uqvjo8_mlvNA1wKUwz9aFLhpyXtWsR8uR80,3121
-ftag/mock.py,sha256=Y__r5zToQLqrBg7T1a5RF_ten_gwBHIqgQOtj2DhIhU,3598
-ftag/region.py,sha256=-WxdC0Gy9zz3zEJ2pN779RcxXPG-QEROuMwMoP-Qs0g,353
-ftag/sample.py,sha256=uVNyxFYMMtkP-o2tjQatpo8mIH4ZNNe3mSFEPebYh_E,2622
-ftag/vds.py,sha256=FmpP31YiSKBvh6TRIMWr-_aJHAkQs0Trhmqh2KLfT64,2402
-ftag/hdf5/__init__.py,sha256=A_a_4IUlZ2mSiDcfrZKBdja_3iTrUHvADM2lWx6g66g,325
-ftag/hdf5/h5reader.py,sha256=ayKX3xUiyV42avsCZQhcTYuNLPgJ3NQCS1qUjSggcKQ,8659
-ftag/hdf5/h5utils.py,sha256=GKduv9b6JRSBirRdmNgGcmsINCMTj54kH4RQqxrM1t8,2363
-ftag/hdf5/h5writer.py,sha256=_N-DJSX283r-XsGczvLFA4_qaK4BkFkdKZAusHEvRjU,2919
-ftag/wps/discriminant.py,sha256=86ISONTuIjqTJO1A27oqkoCgDjAQinofiYNdcjfdkIk,1380
-ftag/wps/working_points.py,sha256=487NsQGGY2Qt4q8mXxKABMFa-YLsbrhkPLcYVdebeVk,4950
-atlas_ftag_tools-0.1.3.dist-info/METADATA,sha256=stGxR0B4fZIJyJFpOO3vtJR9ytUrMeDHIP6qafWPDzI,4023
-atlas_ftag_tools-0.1.3.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
-atlas_ftag_tools-0.1.3.dist-info/entry_points.txt,sha256=UKbRbwA9DxfsTPRBIVVDz3u15WdzhzgRKwXXSAXuQqc,73
-atlas_ftag_tools-0.1.3.dist-info/top_level.txt,sha256=qiYQuKcAvMim-31FwkT3MTQu7WQm0s58tPAia5KKWqs,5
-atlas_ftag_tools-0.1.3.dist-info/RECORD,,

{atlas_ftag_tools-0.1.3.dist-info → atlas_ftag_tools-0.1.5.dist-info}/WHEEL RENAMED Viewed

File without changes

{atlas_ftag_tools-0.1.3.dist-info → atlas_ftag_tools-0.1.5.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{atlas_ftag_tools-0.1.3.dist-info → atlas_ftag_tools-0.1.5.dist-info}/top_level.txt RENAMED Viewed

File without changes

atlas-ftag-tools 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

atlas-ftag-tools 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl