PyPI - atlas-ftag-tools - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

atlas-ftag-tools 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

{atlas_ftag_tools-0.1.4.dist-info → atlas_ftag_tools-0.1.5.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: atlas-ftag-tools
-Version: 0.1.4
+Version: 0.1.5
 Summary: ATLAS Flavour Tagging Tools
 Author: Sam Van Stroud, Philipp Gadow
 License: MIT

{atlas_ftag_tools-0.1.4.dist-info → atlas_ftag_tools-0.1.5.dist-info}/RECORD RENAMED Viewed

@@ -1,19 +1,19 @@
-ftag/__init__.py,sha256=yf92K1TFG1_KK30N_6FgdGMh-arYNnl4YKQEXPmrJOk,543
+ftag/__init__.py,sha256=XBQEZpFSnGyihB9F3eGOvB_5YknggY_L6fzwYszXLuQ,543
 ftag/cuts.py,sha256=lCnyHd4kbrt3CMXGE1ASCgaa07o1qOBn6GQek6lClVQ,2734
 ftag/flavour.py,sha256=sEelvHNLWmHsecQQrmRc8ktwykMMHnGX8ePDRrqQkuo,2460
-ftag/flavours.yaml,sha256=woPpF8hDycjv_McKbHVqQQE072_P50f9KVNNckEbFKA,3245
+ftag/flavours.yaml,sha256=VrOGD5FUhMVPIW31whY-nSqNv98AcnLsPmPGmAcCg3w,3287
 ftag/mock.py,sha256=HUyYOPsRtkmzjLRNF2zs0kpVUrTRIHTsnIyDlXIZArU,3627
 ftag/region.py,sha256=-WxdC0Gy9zz3zEJ2pN779RcxXPG-QEROuMwMoP-Qs0g,353
 ftag/sample.py,sha256=uVNyxFYMMtkP-o2tjQatpo8mIH4ZNNe3mSFEPebYh_E,2622
 ftag/vds.py,sha256=8b5-zqDELUmxdO5Txdowe3v7XGS1pKgO20bhzUQqCxU,2945
 ftag/hdf5/__init__.py,sha256=A_a_4IUlZ2mSiDcfrZKBdja_3iTrUHvADM2lWx6g66g,325
-ftag/hdf5/h5reader.py,sha256=PlLv3VkGGywAbo8dpbLdwnXW2NTTHTlepFfG8nE00J8,8723
+ftag/hdf5/h5reader.py,sha256=1_iyYfWI1ht1-p9vBBpGhw47ZKola_KhWxbrywoB-Jg,11751
 ftag/hdf5/h5utils.py,sha256=GKduv9b6JRSBirRdmNgGcmsINCMTj54kH4RQqxrM1t8,2363
 ftag/hdf5/h5writer.py,sha256=_N-DJSX283r-XsGczvLFA4_qaK4BkFkdKZAusHEvRjU,2919
 ftag/wps/discriminant.py,sha256=86ISONTuIjqTJO1A27oqkoCgDjAQinofiYNdcjfdkIk,1380
 ftag/wps/working_points.py,sha256=487NsQGGY2Qt4q8mXxKABMFa-YLsbrhkPLcYVdebeVk,4950
-atlas_ftag_tools-0.1.4.dist-info/METADATA,sha256=DiUJuY2MIGmxugt723jC3_FmWXW61wVSS_jRyic3K1w,4182
-atlas_ftag_tools-0.1.4.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
-atlas_ftag_tools-0.1.4.dist-info/entry_points.txt,sha256=UKbRbwA9DxfsTPRBIVVDz3u15WdzhzgRKwXXSAXuQqc,73
-atlas_ftag_tools-0.1.4.dist-info/top_level.txt,sha256=qiYQuKcAvMim-31FwkT3MTQu7WQm0s58tPAia5KKWqs,5
-atlas_ftag_tools-0.1.4.dist-info/RECORD,,
+atlas_ftag_tools-0.1.5.dist-info/METADATA,sha256=Uc4Z2zAMD7jsSKoV6o2LJwfm2X0KEWYRingGT_msE4I,4182
+atlas_ftag_tools-0.1.5.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+atlas_ftag_tools-0.1.5.dist-info/entry_points.txt,sha256=UKbRbwA9DxfsTPRBIVVDz3u15WdzhzgRKwXXSAXuQqc,73
+atlas_ftag_tools-0.1.5.dist-info/top_level.txt,sha256=qiYQuKcAvMim-31FwkT3MTQu7WQm0s58tPAia5KKWqs,5
+atlas_ftag_tools-0.1.5.dist-info/RECORD,,

ftag/__init__.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """atlas-ftag-tools - Common tools for ATLAS flavour tagging software."""
-__version__ = "v0.1.4"
+__version__ = "v0.1.5"
 import ftag.hdf5 as hdf5

ftag/flavours.yaml CHANGED Viewed

@@ -49,12 +49,12 @@
 # Xbb tagging
 - name: hbb
-  label: Hbb
+  label: $H \rightarrow b\bar{b}$
   cuts: ["R10TruthLabel_R22v1 == 11"]
   colour: tab:blue
   category: xbb
 - name: hcc
-  label: Hcc
+  label: $H \rightarrow c\bar{c}$
   cuts: ["R10TruthLabel_R22v1 == 12"]
   colour: "#B45F06"
   category: xbb

ftag/hdf5/h5reader.py CHANGED Viewed

@@ -26,9 +26,10 @@ class H5SingleReader:
     def __post_init__(self) -> None:
         self.sample = Sample(self.fname)
-        if len(self.sample.virtual_file()) != 1:
+        fname = self.sample.virtual_file()
+        if len(fname) != 1:
             raise ValueError("H5SingleReader should only read a single file")
-        self.fname = self.sample.virtual_file()[0]
+        self.fname = fname[0]
     @cached_property
     def num_jets(self) -> int:
@@ -57,21 +58,27 @@ class H5SingleReader:
                 isinf = np.isinf(array[var])
                 keep_idx = keep_idx & ~isinf.any(axis=-1)
                 if num_inf := isinf.sum():
-                    log.warn(
+                    log.warning(
                         f"{num_inf} inf values detected for variable {var} in"
                         f" {name} array. Removing the affected jets."
                     )
         return {name: array[keep_idx] for name, array in data.items()}
     def stream(
-        self, variables: dict | None = None, num_jets: int | None = None, cuts: Cuts | None = None
+        self,
+        variables: dict | None = None,
+        num_jets: int | None = None,
+        cuts: Cuts | None = None,
     ) -> Generator:
         if num_jets is None:
             num_jets = self.num_jets
         if num_jets > self.num_jets:
-            raise ValueError(
-                f"{num_jets:,} jets requested but only {self.num_jets:,} available in {self.fname}"
+            log.warning(
+                f"{num_jets:,} jets requested but only {self.num_jets:,} available in {self.fname}."
+                " Set to maximum available number!"
             )
+            num_jets = self.num_jets
         if variables is None:
             variables = {self.jets_name: None}
@@ -131,6 +138,9 @@ class H5Reader:
         Weights for different input datasets, by default None
     do_remove_inf : bool, optional
         Remove jets with inf values, by default False
+    equal_jets : bool, optional
+        Take the same number of jets (weighted) from each sample, by default True
+        If False, use all jets in each sample.
     """
     fname: Path | str | list[Path | str]
@@ -140,8 +150,16 @@ class H5Reader:
     shuffle: bool = True
     weights: list[float] | None = None
     do_remove_inf: bool = False
+    equal_jets: bool = True
     def __post_init__(self) -> None:
+        if not self.equal_jets:
+            log.warning(
+                "equal_jets is set to False, which will result in different number of jets taken"
+                " from each sample. Be aware that this can affect the resampling, so make sure you"
+                " know what you are doing."
+            )
         if isinstance(self.fname, (str, Path)):
             self.fname = [self.fname]
@@ -191,10 +209,14 @@ class H5Reader:
         Generator
             Generator of batches of selected jets.
         """
+        # Check if number of jets is given, if not, set to maximum available
         if num_jets is None:
             num_jets = self.num_jets
+        # Check if variables are given; if not, set to all
         if variables is None:
             variables = {self.jets_name: None}
         if self.jets_name not in variables or variables[self.jets_name] is not None:
             jet_vars = variables.get(self.jets_name, [])
             variables[self.jets_name] = list(jet_vars) + (cuts.variables if cuts else [])
@@ -207,12 +229,25 @@ class H5Reader:
         rng = np.random.default_rng(42)
         while True:
-            # yeild from each stream
             samples = []
-            for stream in streams:
-                try:
-                    samples.append(next(stream))
-                except StopIteration:
+            # Track which streams have been exhausted
+            streams_done = [False] * len(streams)
+            # for each unexhausted stream, get the next sample
+            for i, stream in enumerate(streams):
+                if not streams_done[i]:
+                    try:
+                        samples.append(next(stream))
+                    # if equal_jets is True, we can stop when any stream is done
+                    # otherwise if sample is exhausted, mark it as done
+                    except StopIteration:
+                        if self.equal_jets:
+                            return
+                        streams_done[i] = True
+                # if equal_jets is False, we need to keep going until all streams are done
+                if all(streams_done):
                     return
             # combine samples and shuffle
@@ -222,25 +257,72 @@ class H5Reader:
                 rng.shuffle(idx)
                 data = {name: array[idx] for name, array in data.items()}
-            # select
+            # yield batch
             yield data
     def load(
         self, variables: dict | None = None, num_jets: int | None = None, cuts: Cuts | None = None
     ) -> dict:
+        """Load multiple batches of selected jets into memory.
+        Parameters
+        ----------
+        variables : dict | None, optional
+            Dictionary of variables to load for each group, by default use all jet variables.
+        num_jets : int | None, optional
+            Total number of selected jets to load, by default all.
+        cuts : Cuts | None, optional
+            Selection cuts to apply, by default None
+        Returns
+        -------
+        dict
+            Dictionary of arrays for each group.
+        """
+        # handle default arguments
         if num_jets == -1:
             num_jets = self.num_jets
         if variables is None:
             variables = {self.jets_name: None}
+        # get data from each sample
         data: dict[str, list] = {name: [] for name in variables}
-        for sample in self.stream(variables, num_jets, cuts):
-            for name, array in sample.items():
+        for batch in self.stream(variables, num_jets, cuts):
+            for name, array in batch.items():
                 if name in data:
                     data[name].append(array)
+        # concatenate batches
         return {name: np.concatenate(array) for name, array in data.items()}
     def estimate_available_jets(self, cuts: Cuts, num: int = 1_000_000) -> int:
-        """Estimate the number of jets available after selection cuts, rounded down."""
-        all_jets = self.load({self.jets_name: cuts.variables}, num)[self.jets_name]
-        estimated_num_jets = len(cuts(all_jets).values) / len(all_jets) * self.num_jets
+        """Estimate the number of jets available after selection cuts (round down).
+        Parameters
+        ----------
+        cuts : Cuts
+            Selection cuts to apply.
+        num : int, optional
+            Number of jets to use for the estimation, by default 1_000_000.
+        Returns
+        -------
+        int
+            Estimated number of jets available after selection cuts,
+            rounded down to nearest thousand.
+        """
+        # if equal jets is True, available jets is based on the smallest sample
+        if self.equal_jets:
+            num_jets = []
+            for r in self.readers:
+                stream = r.stream({self.jets_name: cuts.variables}, num)
+                all_jets = np.concatenate([batch[self.jets_name] for batch in stream])
+                frac_selected = len(cuts(all_jets).values) / len(all_jets)
+                num_jets.append(frac_selected * r.num_jets)
+            estimated_num_jets = min(num_jets) * len(self.readers)
+        # otherwise, available jets is based on all samples
+        else:
+            all_jets = self.load({self.jets_name: cuts.variables}, num)[self.jets_name]
+            frac_selected = len(cuts(all_jets).values) / len(all_jets)
+            estimated_num_jets = frac_selected * self.num_jets
         return math.floor(estimated_num_jets / 1_000) * 1_000

{atlas_ftag_tools-0.1.4.dist-info → atlas_ftag_tools-0.1.5.dist-info}/WHEEL RENAMED Viewed

File without changes

{atlas_ftag_tools-0.1.4.dist-info → atlas_ftag_tools-0.1.5.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{atlas_ftag_tools-0.1.4.dist-info → atlas_ftag_tools-0.1.5.dist-info}/top_level.txt RENAMED Viewed

File without changes

atlas-ftag-tools 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

atlas-ftag-tools 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl