PyPI - atlas-ftag-tools - Versions diffs - 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl - Mend

atlas-ftag-tools 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

{atlas_ftag_tools-0.2.8.dist-info → atlas_ftag_tools-0.2.10.dist-info}/METADATA +4 -3
{atlas_ftag_tools-0.2.8.dist-info → atlas_ftag_tools-0.2.10.dist-info}/RECORD +14 -12
{atlas_ftag_tools-0.2.8.dist-info → atlas_ftag_tools-0.2.10.dist-info}/WHEEL +1 -1
{atlas_ftag_tools-0.2.8.dist-info → atlas_ftag_tools-0.2.10.dist-info}/entry_points.txt +1 -1
ftag/__init__.py +6 -5
ftag/flavours.yaml +47 -4
ftag/fraction_optimization.py +184 -0
ftag/labels.py +10 -2
ftag/mock.py +58 -17
ftag/utils/__init__.py +24 -0
ftag/utils/logging.py +123 -0
ftag/utils/metrics.py +431 -0
ftag/working_points.py +547 -0
ftag/wps/__init__.py +0 -0
ftag/wps/discriminant.py +0 -131
ftag/wps/working_points.py +0 -316
{atlas_ftag_tools-0.2.8.dist-info → atlas_ftag_tools-0.2.10.dist-info}/top_level.txt +0 -0

ftag/working_points.py ADDED

@@ -0,0 +1,547 @@
+"""Calculate tagger working points."""
+from __future__ import annotations
+import argparse
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING
+import numpy as np
+import yaml
+from ftag import Flavours
+from ftag.cli_utils import HelpFormatter
+from ftag.cuts import Cuts
+from ftag.hdf5 import H5Reader
+from ftag.utils import get_discriminant
+if TYPE_CHECKING:  # pragma: no cover
+    from collections.abc import Sequence
+    from ftag.labels import Label, LabelContainer
+def parse_args(args: Sequence[str]) -> argparse.Namespace:
+    """Parse the input arguments into a Namespace.
+    Parameters
+    ----------
+    args : Sequence[str] | None
+        Sequence of string inputs to the script
+    Returns
+    -------
+    argparse.Namespace
+        Namespace with the parsed arguments
+    Raises
+    ------
+    ValueError
+        When both --effs and --disc_cuts are provided
+    ValueError
+        When neither --effs nor --disc_cuts are provided
+    ValueError
+        When the number of fraction values is not conistent
+    ValueError
+        When the sum of fraction values for a tagger is not equal to one
+    """
+    # Define the pre-parser which checks the --category
+    pre_parser = argparse.ArgumentParser(add_help=False)
+    pre_parser.add_argument(
+        "-c",
+        "--category",
+        default="single-btag",
+        type=str,
+        help="Label category to use for the working point calculation",
+    )
+    pre_parser.add_argument(
+        "-s",
+        "--signal",
+        default="bjets",
+        type=str,
+        help="Signal flavour which is to be used",
+    )
+    # Parse only --category/--signal and ignore for now all other args
+    pre_args, remaining_argv = pre_parser.parse_known_args(args=args)
+    # Create the "real" parser
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=HelpFormatter,
+    )
+    # Add --category/--signal so the help is correctly shown
+    parser.add_argument(
+        "-c",
+        "--category",
+        default="single-btag",
+        type=str,
+        help="Label category to use for the working point calculation",
+    )
+    parser.add_argument(
+        "-s",
+        "--signal",
+        default="bjets",
+        type=str,
+        help="Signal flavour which is to be used",
+    )
+    # Check which label category was chosen and load the corresponding flavours
+    flavours = Flavours.by_category(pre_args.category)
+    # Build the fraction value arguments for all classes (besides signal)
+    for flav in flavours:
+        # Skip signal
+        if flav.name == pre_args.signal:
+            continue
+        # Built fraction values for all background classes
+        parser.add_argument(
+            f"--{flav.frac_str}",
+            nargs="+",
+            required=True,
+            type=float,
+            help=f"{flav.frac_str} value(s) for each tagger",
+        )
+    # # Adding the other arguments
+    parser.add_argument(
+        "--ttbar",
+        required=True,
+        type=Path,
+        help="Path to ttbar sample (supports globbing)",
+    )
+    parser.add_argument(
+        "--zprime",
+        required=False,
+        type=Path,
+        help="Path to zprime (supports globbing). WPs from ttbar will be reused for zprime",
+    )
+    parser.add_argument(
+        "-t",
+        "--tagger",
+        nargs="+",
+        required=True,
+        type=str,
+        help="tagger name(s)",
+    )
+    parser.add_argument(
+        "-e",
+        "--effs",
+        nargs="+",
+        type=float,
+        help="Efficiency working point(s). If -r is specified, values should be 1/efficiency",
+    )
+    parser.add_argument(
+        "-r",
+        "--rejection",
+        default=None,
+        help="Use rejection of specified background class to determine working points",
+    )
+    parser.add_argument(
+        "-d",
+        "--disc_cuts",
+        nargs="+",
+        type=float,
+        help="D_x value(s) to calculate efficiency at",
+    )
+    parser.add_argument(
+        "-n",
+        "--num_jets",
+        default=1_000_000,
+        type=int,
+        help="Use this many jets (post selection)",
+    )
+    parser.add_argument(
+        "--ttbar_cuts",
+        nargs="+",
+        default=["pt > 20e3"],
+        type=list,
+        help="Selection to apply to ttbar (|eta| < 2.5 is always applied)",
+    )
+    parser.add_argument(
+        "--zprime_cuts",
+        nargs="+",
+        default=["pt > 250e3"],
+        type=list,
+        help="Selection to apply to zprime (|eta| < 2.5 is always applied)",
+    )
+    parser.add_argument(
+        "-o",
+        "--outfile",
+        type=Path,
+        help="Save results to yaml instead of printing",
+    )
+    # Final parse of all arguments
+    parsed_args = parser.parse_args(remaining_argv)
+    # Define the signal as an instance of Flavours
+    parsed_args.signal = Flavours[parsed_args.signal]
+    # Check that only --effs or --disc_cuts is given
+    if parsed_args.effs and parsed_args.disc_cuts:
+        raise ValueError("Cannot specify both --effs and --disc_cuts")
+    if not parsed_args.effs and not parsed_args.disc_cuts:
+        raise ValueError("Must specify either --effs or --disc_cuts")
+    # Check that all fraction values have the same length
+    for flav in flavours:
+        if flav.name != parsed_args.signal.name and len(getattr(parsed_args, flav.frac_str)) != len(
+            parsed_args.tagger
+        ):
+            raise ValueError(f"Number of {flav.frac_str} values must match number of taggers")
+    # Check that all fraction value combinations add up to one
+    for tagger_idx in range(len(parsed_args.tagger)):
+        fraction_value_sum = 0
+        for flav in flavours:
+            if flav.name != parsed_args.signal.name:
+                fraction_value_sum += getattr(parsed_args, flav.frac_str)[tagger_idx]
+        # Round the value to take machine precision into account
+        fraction_value_sum = np.round(fraction_value_sum, 8)
+        # Check it's equal to one
+        if fraction_value_sum != 1:
+            raise ValueError(
+                "Sum of the fraction values must be one! You gave "
+                f"{fraction_value_sum} for tagger {parsed_args.tagger[tagger_idx]}"
+            )
+    return parsed_args
+def get_fxs_from_args(args: argparse.Namespace, flavours: LabelContainer) -> list:
+    """Get the fraction values for each tagger from the argparsed inputs.
+    Parameters
+    ----------
+    args : argparse.Namespace
+        Input arguments parsed by the argparser
+    flavours : LabelContainer
+        LabelContainer instance of the labels that are used
+    Returns
+    -------
+    list
+        List of dicts with the fraction values. Each dict is for one tagger.
+    """
+    # Init the fraction_dict dict
+    fraction_dict = {}
+    # Add the fraction values to the dict
+    for flav in flavours:
+        if flav.name != args.signal.name:
+            fraction_dict[flav.frac_str] = vars(args)[flav.frac_str]
+    return [{k: v[i] for k, v in fraction_dict.items()} for i in range(len(args.tagger))]
+def get_eff_rej(
+    jets: np.ndarray,
+    disc: np.ndarray,
+    wp: float,
+    flavours: LabelContainer,
+) -> dict:
+    """Calculate the efficiency/rejection for each flavour.
+    Parameters
+    ----------
+    jets : np.ndarray
+        Loaded jets
+    disc : np.ndarray
+        Discriminant values of the jets
+    wp : float
+        Working point that is used
+    flavours : LabelContainer
+        LabelContainer instance of the flavours used
+    Returns
+    -------
+    dict
+        Dict with the efficiency/rejection values for each flavour
+    """
+    # Init an out dict
+    out: dict[str, dict] = {"eff": {}, "rej": {}}
+    # Loop over the flavours
+    for flav in flavours:
+        # Calculate discriminant values and efficiencies/rejections
+        flav_disc = disc[flav.cuts(jets).idx]
+        eff = sum(flav_disc > wp) / len(flav_disc)
+        out["eff"][flav.name] = float(f"{eff:.3g}")
+        out["rej"][flav.name] = float(f"{1 / eff:.3g}")
+    return out
+def get_rej_eff_at_disc(
+    jets: np.ndarray,
+    tagger: str,
+    signal: Label,
+    disc_cuts: list,
+    flavours: LabelContainer,
+    fraction_values: dict,
+) -> dict:
+    """Calculate the efficiency/rejection at a certain discriminant values.
+    Parameters
+    ----------
+    jets : np.ndarray
+        Loaded jets used
+    tagger : str
+        Name of the tagger
+    signal : Label
+        Label instance of the signal flavour
+    disc_cuts : list
+        List of discriminant cut values for which the efficiency/rejection is calculated
+    flavours : LabelContainer
+        LabelContainer instance of the flavours that are used
+    Returns
+    -------
+    dict
+        Dict with the discriminant cut values and their respective efficiencies/rejections
+    """
+    # Calculate discriminants
+    disc = get_discriminant(
+        jets=jets,
+        tagger=tagger,
+        signal=signal,
+        flavours=flavours,
+        fraction_values=fraction_values,
+    )
+    # Init out dict
+    ref_eff_dict: dict[str, dict] = {}
+    # Loop over the disc cut values
+    for dcut in disc_cuts:
+        ref_eff_dict[str(dcut)] = {"eff": {}, "rej": {}}
+        # Loop over the flavours
+        for flav in flavours:
+            e_discs = disc[flav.cuts(jets).idx]
+            eff = sum(e_discs > dcut) / len(e_discs)
+            ref_eff_dict[str(dcut)]["eff"][str(flav)] = float(f"{eff:.3g}")
+            ref_eff_dict[str(dcut)]["rej"][str(flav)] = 1 / float(f"{eff:.3g}")
+    return ref_eff_dict
+def setup_common_parts(
+    args: argparse.Namespace,
+) -> tuple[np.ndarray, np.ndarray | None, LabelContainer]:
+    """Load the jets from the files and setup the taggers.
+    Parameters
+    ----------
+    args : argparse.Namespace
+        Input arguments from the argparser
+    Returns
+    -------
+    tuple[dict, dict | None, list]
+        Outputs the ttbar jets, the zprime jets (if wanted, else None), and the flavours used.
+    """
+    # Get the used flavours
+    flavours = Flavours.by_category(args.category)
+    # Get the cuts for the samples
+    default_cuts = Cuts.from_list(["eta > -2.5", "eta < 2.5"])
+    ttbar_cuts = Cuts.from_list(args.ttbar_cuts) + default_cuts
+    zprime_cuts = Cuts.from_list(args.zprime_cuts) + default_cuts
+    # Prepare the loading of the jets
+    all_vars = list(set(sum((flav.cuts.variables for flav in flavours), [])))
+    reader = H5Reader(args.ttbar)
+    jet_vars = reader.dtypes()["jets"].names
+    # Create for all taggers the fraction values
+    for tagger in args.tagger:
+        all_vars += [
+            f"{tagger}_{flav.px}" for flav in flavours if (f"{tagger}_{flav.px}" in jet_vars)
+        ]
+    # Load ttbar jets
+    ttbar_jets = reader.load({"jets": all_vars}, args.num_jets, cuts=ttbar_cuts)["jets"]
+    zprime_jets = None
+    # Load zprime jets if needed
+    if args.zprime:
+        zprime_reader = H5Reader(args.zprime)
+        zprime_jets = zprime_reader.load({"jets": all_vars}, args.num_jets, cuts=zprime_cuts)[
+            "jets"
+        ]
+    else:
+        zprime_jets = None
+    return ttbar_jets, zprime_jets, flavours
+def get_working_points(args: argparse.Namespace) -> dict | None:
+    """Calculate the working points.
+    Parameters
+    ----------
+    args : argparse.Namespace
+        Input arguments from the argparser
+    Returns
+    -------
+    dict | None
+        Dict with the working points. If args.outfile is given, the function returns None and
+        stored the resulting dict in a yaml file in args.outfile.
+    """
+    # Load the jets and flavours and get the fraction values
+    ttbar_jets, zprime_jets, flavours = setup_common_parts(args=args)
+    fraction_values = get_fxs_from_args(args=args, flavours=flavours)
+    # Init an out dict
+    out = {}
+    # Loop over taggers
+    for i, tagger in enumerate(args.tagger):
+        # Calculate discriminant
+        out[tagger] = {"signal": str(args.signal), **fraction_values[i]}
+        disc = get_discriminant(
+            jets=ttbar_jets,
+            tagger=tagger,
+            signal=args.signal,
+            flavours=flavours,
+            fraction_values=fraction_values[i],
+        )
+        # Loop over efficiency working points
+        for eff in args.effs:
+            d = out[tagger][f"{eff:.0f}"] = {}
+            # Set the working point
+            wp_flavour = args.signal
+            if args.rejection:
+                eff = 100 / eff  # noqa: PLW2901
+                wp_flavour = args.rejection
+            # Calculate the discriminant value of the working point
+            wp_disc = disc[flavours[wp_flavour].cuts(ttbar_jets).idx]
+            wp = d["cut_value"] = round(float(np.percentile(wp_disc, 100 - eff)), 3)
+            # Calculate efficiency and rejection for each flavour
+            d["ttbar"] = get_eff_rej(
+                jets=ttbar_jets,
+                disc=disc,
+                wp=wp,
+                flavours=flavours,
+            )
+            # calculate for zprime
+            if args.zprime:
+                zprime_disc = get_discriminant(
+                    jets=zprime_jets,
+                    tagger=tagger,
+                    signal=args.signal,
+                    flavours=flavours,
+                    fraction_values=fraction_values[i],
+                )
+                d["zprime"] = get_eff_rej(
+                    jets=zprime_jets,
+                    disc=zprime_disc,
+                    wp=wp,
+                    flavours=flavours,
+                )
+    if args.outfile:
+        with open(args.outfile, "w") as f:
+            yaml.dump(out, f, sort_keys=False)
+            return None
+    else:
+        return out
+def get_efficiencies(args: argparse.Namespace) -> dict | None:
+    """Calculate the efficiencies for the given jets.
+    Parameters
+    ----------
+    args : argparse.Namespace
+        Input arguments from the argparser
+    Returns
+    -------
+    dict | None
+        Dict with the efficiencies. If args.outfile is given, the function returns None and
+        stored the resulting dict in a yaml file in args.outfile.
+    """
+    # Load the jets and flavours and get the fraction values
+    ttbar_jets, zprime_jets, flavours = setup_common_parts(args=args)
+    fraction_values = get_fxs_from_args(args=args, flavours=flavours)
+    # Init an out dict
+    out = {}
+    # Loop over the taggers
+    for i, tagger in enumerate(args.tagger):
+        out[tagger] = {"signal": str(args.signal), **fraction_values[i]}
+        out[tagger]["ttbar"] = get_rej_eff_at_disc(
+            jets=ttbar_jets,
+            tagger=tagger,
+            signal=args.signal,
+            disc_cuts=args.disc_cuts,
+            flavours=flavours,
+            fraction_values=fraction_values[i],
+        )
+        if args.zprime:
+            out[tagger]["zprime"] = get_rej_eff_at_disc(
+                jets=zprime_jets,
+                tagger=tagger,
+                signal=args.signal,
+                disc_cuts=args.disc_cuts,
+                flavours=flavours,
+                fraction_values=fraction_values[i],
+            )
+    if args.outfile:
+        with open(args.outfile, "w") as f:
+            yaml.dump(out, f, sort_keys=False)
+            return None
+    else:
+        return out
+def main(args: Sequence[str]) -> dict | None:
+    """Main function to run working point calculation.
+    Parameters
+    ----------
+    args : Sequence[str] | None, optional
+        Input arguments, by default None
+    Returns
+    -------
+    dict | None
+        The output dict with the calculated values. When --outfile
+        was given, the return value is None
+    """
+    parsed_args = parse_args(args=args)
+    if parsed_args.effs:
+        out = get_working_points(args=parsed_args)
+    elif parsed_args.disc_cuts:
+        out = get_efficiencies(args=parsed_args)
+    if out:
+        print(yaml.dump(out, sort_keys=False))
+        return out
+    return None
+if __name__ == "__main__":  # pragma: no cover
+    main(args=sys.argv[1:])

ftag/wps/__init__.py DELETED

File without changes

ftag/wps/discriminant.py DELETED

@@ -1,131 +0,0 @@
-from __future__ import annotations
-from typing import Callable
-import numpy as np
-from ftag import Flavours
-from ftag.labels import Label, remove_suffix
-def discriminant(
-    jets: np.ndarray,
-    tagger: str,
-    signal: Label,
-    fxs: dict[str, float],
-    epsilon: float = 1e-10,
-) -> np.ndarray:
-    """
-    Get the tagging discriminant.
-    Calculated as the logarithm of the ratio of a specified signal probability
-    to a weighted sum ofbackground probabilities.
-    Parameters
-    ----------
-    jets : np.ndarray
-        Structed jet array containing tagger scores.
-    tagger : str
-        Name of the tagger, used to construct field names.
-    signal : str
-        Type of signal.
-    fxs : dict[str, float]
-        Dict of background probability names and their fractions.
-        If a fraction is None, it is calculated as (1 - sum of provided fractions).
-    epsilon : float, optional
-        A small value added to probabilities to prevent division by zero, by default 1e-10.
-    Returns
-    -------
-    np.ndarray
-        The tagger discriminant values for the jets.
-    Raises
-    ------
-    ValueError
-        If a fraction is specified for a denominator that is not present in the input array.
-    """
-    denominator = 0.0
-    for d, fx in fxs.items():
-        name = f"{tagger}_{d}"
-        if fx > 0 and name not in jets.dtype.names:
-            raise ValueError(f"Nonzero fx for {d}, but '{name}' not found in input array.")
-        denominator += jets[name] * fx if name in jets.dtype.names else 0
-    signal_field = f"{tagger}_{signal.px}"
-    if signal_field not in jets.dtype.names:
-        signal_field = f"{tagger}_p{remove_suffix(signal.name, 'jets')}"
-    return np.log((jets[signal_field] + epsilon) / (denominator + epsilon))
-def tautag_dicriminant(jets, tagger, fb, fc, epsilon=1e-10):
-    fxs = {"pb": fb, "pc": fc, "pu": 1 - fb - fc}
-    return discriminant(jets, tagger, Flavours.taujets, fxs, epsilon=epsilon)
-def btag_discriminant(jets, tagger, fc, ftau=0, epsilon=1e-10):
-    fxs = {"pc": fc, "ptau": ftau, "pu": 1 - fc - ftau}
-    return discriminant(jets, tagger, Flavours.bjets, fxs, epsilon=epsilon)
-def ghostbtag_discriminant(jets, tagger, fc, ftau=0, epsilon=1e-10):
-    fxs = {"pghostc": fc, "pghosttau": ftau, "pghostu": 1 - fc - ftau}
-    return discriminant(jets, tagger, Flavours.ghostbjets, fxs, epsilon=epsilon)
-def ctag_discriminant(jets, tagger, fb, ftau=0, epsilon=1e-10):
-    fxs = {"pb": fb, "ptau": ftau, "pu": 1 - fb - ftau}
-    return discriminant(jets, tagger, Flavours.cjets, fxs, epsilon=epsilon)
-def hbb_discriminant(jets, tagger, ftop=0.25, fhcc=0.02, epsilon=1e-10):
-    fxs = {"phcc": fhcc, "ptop": ftop, "pqcd": 1 - ftop - fhcc}
-    return discriminant(jets, tagger, Flavours.hbb, fxs, epsilon=epsilon)
-def hcc_discriminant(jets, tagger, ftop=0.25, fhbb=0.3, epsilon=1e-10):
-    fxs = {"phbb": fhbb, "ptop": ftop, "pqcd": 1 - ftop - fhbb}
-    return discriminant(jets, tagger, Flavours.hcc, fxs, epsilon=epsilon)
-def get_discriminant(
-    jets: np.ndarray, tagger: str, signal: Label | str, epsilon: float = 1e-10, **fxs
-):
-    """Calculate the b-tag or c-tag discriminant for a given tagger.
-    Parameters
-    ----------
-    jets : np.ndarray
-        Structured array of jets containing tagger outputs
-    tagger : str
-        Name of the tagger
-    signal : Label
-        Signal flavour (bjets/cjets or hbb/hcc)
-    epsilon : float, optional
-        Small number to avoid division by zero, by default 1e-10
-    **fxs : dict
-        Fractions for the different background flavours.
-    Returns
-    -------
-    np.ndarray
-        Array of discriminant values.
-    Raises
-    ------
-    ValueError
-        If the signal flavour is not recognised.
-    """
-    tagger_funcs: dict[str, Callable] = {
-        "bjets": btag_discriminant,
-        "cjets": ctag_discriminant,
-        "taujets": tautag_dicriminant,
-        "hbb": hbb_discriminant,
-        "hcc": hcc_discriminant,
-        "ghostbjets": ghostbtag_discriminant,
-    }
-    if str(signal) not in tagger_funcs:
-        raise ValueError(f"Signal flavour must be one of {list(tagger_funcs.keys())}, not {signal}")
-    func: Callable = tagger_funcs[str(Flavours[signal])]
-    return func(jets, tagger, **fxs, epsilon=epsilon)

atlas-ftag-tools 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl

atlas-ftag-tools 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl