PyPI - atlas-ftag-tools - Versions diffs - 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl - Mend

atlas-ftag-tools 0.2.10py3-none-any.whl → 0.2.12py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

atlas_ftag_tools-0.2.12.dist-info/METADATA +53 -0
atlas_ftag_tools-0.2.12.dist-info/RECORD +32 -0
{atlas_ftag_tools-0.2.10.dist-info → atlas_ftag_tools-0.2.12.dist-info}/WHEEL +1 -1
{atlas_ftag_tools-0.2.10.dist-info → atlas_ftag_tools-0.2.12.dist-info}/entry_points.txt +1 -0
atlas_ftag_tools-0.2.12.dist-info/licenses/LICENSE +201 -0
ftag/__init__.py +11 -11
ftag/flavours.yaml +18 -13
ftag/hdf5/__init__.py +5 -3
ftag/hdf5/h5add_col.py +391 -0
ftag/hdf5/h5reader.py +17 -4
ftag/hdf5/h5utils.py +10 -1
ftag/hdf5/h5writer.py +86 -29
ftag/labeller.py +1 -1
ftag/mock.py +2 -2
ftag/utils/__init__.py +2 -2
ftag/vds.py +39 -4
atlas_ftag_tools-0.2.10.dist-info/METADATA +0 -151
atlas_ftag_tools-0.2.10.dist-info/RECORD +0 -30
{atlas_ftag_tools-0.2.10.dist-info → atlas_ftag_tools-0.2.12.dist-info}/top_level.txt +0 -0

ftag/hdf5/h5add_col.py ADDED Viewed

@@ -0,0 +1,391 @@
+# Utils to take an input h5 file, and append one or more columns to it
+from __future__ import annotations
+import argparse
+import importlib.util
+from pathlib import Path
+from typing import Callable
+import h5py
+import numpy as np
+from ftag.hdf5.h5reader import H5Reader
+from ftag.hdf5.h5writer import H5Writer
+def merge_dicts(dicts: list[dict[str, dict[str, np.ndarray]]]) -> dict[str, dict[str, np.ndarray]]:
+    """Combine several ``{group: {variable: array}}`` mappings into a single one.
+    Groups sharing a name are unioned. The arrays are stored by reference, they
+    are not copied.
+    Example
+    -------
+    Merging
+        {"jets": {"pt": a, "eta": b}}
+    with
+        {"jets": {"phi": c, "energy": d}}
+    gives
+        {"jets": {"pt": a, "eta": b, "phi": c, "energy": d}}
+    Parameters
+    ----------
+    dicts : list[dict[str, dict[str, np.ndarray]]]
+        Mappings to merge, processed in list order.
+    Returns
+    -------
+    dict[str, dict[str, np.ndarray]]
+        A new mapping holding every group and variable found in ``dicts``.
+    Raises
+    ------
+    ValueError
+        If the same variable name appears more than once within one group.
+    """
+    combined: dict[str, dict[str, np.ndarray]] = {}
+    for mapping in dicts:
+        for group_name, columns in mapping.items():
+            # Creates the group on first sight, otherwise reuses the existing one
+            target = combined.setdefault(group_name, {})
+            for column_name, values in columns.items():
+                if column_name in target:
+                    raise ValueError(
+                        f"Variable {column_name} already exists in group {group_name}."
+                    )
+                target[column_name] = values
+    return combined
+def get_shape(num_jets: int, batch: dict[str, np.ndarray]) -> dict[str, tuple[int, ...]]:
+    """Build the per-group output shapes expected by the H5Writer.
+    The leading dimension of every array is replaced by ``num_jets``; all
+    trailing dimensions are taken over from the batch unchanged.
+    Parameters
+    ----------
+    num_jets : int
+        Total number of jets that will be written
+    batch : dict[str, np.ndarray]
+        One batch, keyed by group name
+    Returns
+    -------
+    dict[str, tuple[int, ...]]
+        Output shape for each group in ``batch``
+    """
+    # For a 1D array shape[1:] is empty, so this yields (num_jets,)
+    return {name: (num_jets, *array.shape[1:]) for name, array in batch.items()}
+def get_all_groups(file: Path | str) -> dict[str, None]:
+    """List the top-level keys of an h5 file as a ``{name: None}`` mapping.
+    Parameters
+    ----------
+    file : Path | str
+        Path to the h5 file, opened read-only
+    Returns
+    -------
+    dict[str, None]
+        Every top-level key of the file mapped to None, so that passing the
+        result to h5read.stream(all_groups) returns all the groups in the file.
+    """
+    with h5py.File(file, "r") as h5file:
+        return {name: None for name in h5file.keys()}
+def h5_add_column(
+    input_file: str | Path,
+    output_file: str | Path | None,
+    append_function: Callable | list[Callable],
+    num_jets: int = -1,
+    input_groups: list[str] | None = None,
+    output_groups: list[str] | None = None,
+    reader_kwargs: dict | None = None,
+    writer_kwargs: dict | None = None,
+    overwrite: bool = False,
+) -> None:
+    """Appends one or more columns to one or more groups in an h5 file.
+    The input file is streamed batch by batch with an unshuffled H5Reader, every
+    append function is evaluated on each batch, and the batch plus the new
+    columns is written to a new file with an unshuffled H5Writer.
+    Parameters
+    ----------
+    input_file : str | Path
+        Input h5 file to read from.
+    output_file : str | Path | None
+        Output h5 file to write to. If None, the output is written next to the
+        input file as '<stem>_additional<suffix>'.
+    append_function : callable | list[callable]
+        A function, or sequence of functions, which take a batch from H5Reader and return
+        a dictionary of the form:
+            {
+                group1 : {
+                    new_column1 : data,
+                    new_column2 : data,
+                },
+                group2 : {
+                    new_column3 : data,
+                    new_column4 : data,
+                },
+                ...
+            }
+    num_jets : int, optional
+        Number of jets to read from the input file. If -1, reads all jets. By default -1.
+    input_groups : list[str] | None, optional
+        List of groups to read from the input file. If None, reads all groups. By default None.
+    output_groups : list[str] | None, optional
+        List of groups to write to the output file. If None, writes all groups. By default None.
+        Note that this is a subset of the input groups, and must include all groups that the
+        append functions wish to write to.
+    reader_kwargs : dict, optional
+        Additional arguments to pass to the H5Reader. By default None. Not modified.
+    writer_kwargs : dict, optional
+        Additional arguments to pass to the H5Writer. By default None. Not modified.
+        'precision' defaults to "full" when it is not given.
+    overwrite : bool, optional
+        If True, an existing output file is overwritten. If False (the default), a
+        FileExistsError is raised when the output file already exists. This also
+        applies to the automatically chosen output name.
+    Raises
+    ------
+    FileNotFoundError
+        If the input file does not exist.
+    FileExistsError
+        If the output file exists and overwrite is False.
+    ValueError
+        If the output file is the input file, the new variable already exists, the shape
+        is incorrect, or the output group is not in the input groups.
+    """
+    # `import numpy` alone does not load the recfunctions submodule
+    from numpy.lib import recfunctions as rfn
+    input_file = Path(input_file)
+    if not input_file.exists():
+        raise FileNotFoundError(f"Input file {input_file} does not exist.")
+    # Resolve the default name BEFORE the existence check, so that the default
+    # output path is protected by `overwrite` exactly like an explicit one
+    if output_file is None:
+        output_file = input_file.with_name(f"{input_file.stem}_additional{input_file.suffix}")
+    else:
+        output_file = Path(output_file)
+    if output_file.exists() and not overwrite:
+        raise FileExistsError(
+            f"Output file {output_file} already exists. Please choose a different name."
+        )
+    # The input is still being read while the output is written
+    if output_file.resolve() == input_file.resolve():
+        raise ValueError(f"Output file {output_file} must be different from the input file.")
+    # Work on copies so the caller's dictionaries are never modified
+    reader_kwargs = dict(reader_kwargs or {})
+    writer_kwargs = dict(writer_kwargs or {})
+    writer_kwargs.setdefault("precision", "full")
+    if callable(append_function):
+        append_function = [append_function]
+    else:
+        append_function = list(append_function)
+    reader = H5Reader(input_file, shuffle=False, **reader_kwargs)
+    njets = reader.num_jets if num_jets == -1 else num_jets
+    input_variables = (
+        get_all_groups(input_file) if input_groups is None else dict.fromkeys(input_groups)
+    )
+    if output_groups is None:
+        output_groups = list(input_variables)
+    missing = [group for group in output_groups if group not in input_variables]
+    if missing:
+        raise ValueError(
+            f"Output groups {missing} not in input groups {list(input_variables)}"
+        )
+    # Ceiling division; only used for the progress message
+    num_batches = max(1, -(-njets // reader.batch_size))
+    # The writer is created lazily, as the output dtypes are only known once
+    # the first batch has been extended with the new columns
+    writer = None
+    for i, batch in enumerate(reader.stream(input_variables, num_jets=njets)):
+        if (i + 1) % 10 == 0:
+            print(f"Processing batch {i + 1}/{num_batches} ({(i + 1) / num_batches * 100:.2f}%)")
+        to_append = merge_dicts([af(batch) for af in append_function])
+        for k, newvars in to_append.items():
+            if k not in output_groups:
+                raise ValueError(f"Trying to output to {k} but only {output_groups} are allowed")
+            for newkey, newval in newvars.items():
+                if newkey in batch[k].dtype.names:
+                    raise ValueError(
+                        f"Trying to append {newkey} to {k} but it already exists in batch"
+                    )
+                if newval.shape != batch[k].shape:
+                    raise ValueError(
+                        f"Trying to append {newkey} to {k} but the shape is not correct"
+                    )
+        to_write = {}
+        for key, str_array in batch.items():
+            if key not in output_groups:
+                continue
+            if key in to_append:
+                to_write[key] = rfn.append_fields(
+                    str_array,
+                    list(to_append[key].keys()),
+                    list(to_append[key].values()),
+                    usemask=False,
+                )
+            else:
+                to_write[key] = str_array
+        if writer is None:
+            writer = H5Writer(
+                output_file,
+                dtypes={key: str_array.dtype for key, str_array in to_write.items()},
+                shapes=get_shape(njets, to_write),
+                shuffle=False,
+                **writer_kwargs,
+            )
+        writer.write(to_write)
+    # Close the output explicitly rather than relying on garbage collection
+    if writer is not None:
+        writer.close()
+def parse_append_function(func_path: str | Path) -> Callable:
+    """Attempts to load the function specified by func_path.
+    The function should be specified as 'path/to/file.py:function_name'. The
+    file is executed as a module, so only point this at trusted code.
+    Parameters
+    ----------
+    func_path : str | Path
+        Path to the function to load. Should be of the form 'path/to/file.py:function_name'.
+        The string is split on its last colon, so the file path may itself contain colons.
+    Returns
+    -------
+    Callable
+        The function specified by func_path.
+    Raises
+    ------
+    ValueError
+        If the function path is not of the form 'path/to/file.py:function_name'.
+    FileNotFoundError
+        If the file does not exist.
+    ImportError
+        If the file cannot be imported.
+    AttributeError
+        If the function does not exist in the file.
+    """
+    func_path = str(func_path)
+    # Split on the LAST colon: a plain split(":") breaks on paths that contain
+    # a colon themselves (e.g. a Windows drive letter)
+    file_str, sep, func_name = func_path.rpartition(":")
+    if not sep or not file_str or not func_name:
+        raise ValueError(
+            "Function should be specified as 'path/to/file.py:function_name'"
+            f", got {func_path!r}"
+        )
+    file_path = Path(file_str).resolve()
+    if not file_path.is_file():
+        raise FileNotFoundError(f"No such file: {file_path}")
+    module_name = file_path.stem  # Just the filename without extension
+    spec = importlib.util.spec_from_file_location(module_name, str(file_path))
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    try:
+        return getattr(module, func_name)
+    except AttributeError:
+        raise AttributeError(f"Module {module_name} has no attribute {func_name}") from None
+def get_args(args):
+    """Parse the command line arguments for appending columns to an h5 file.
+    Parameters
+    ----------
+    args : list[str] | None
+        Argument strings to parse. If None, argparse falls back to sys.argv.
+    Returns
+    -------
+    argparse.Namespace
+        Namespace with the attributes input, append_function, output, num_jets,
+        input_groups, output_groups, reader_kwargs, writer_kwargs and overwrite.
+    """
+    import json
+    def json_dict(text: str) -> dict:
+        """Convert a JSON object given on the command line into a dict."""
+        # `type=dict` cannot work here: dict("some string") always raises
+        try:
+            value = json.loads(text)
+        except json.JSONDecodeError as err:
+            raise argparse.ArgumentTypeError(f"invalid JSON: {err}") from err
+        if not isinstance(value, dict):
+            raise argparse.ArgumentTypeError("expected a JSON object")
+        return value
+    parser = argparse.ArgumentParser(description="Append columns to an h5 file.")
+    parser.add_argument("--input", "-i", type=str, required=True, help="Input h5 file")
+    parser.add_argument(
+        "--append_function",
+        type=str,
+        nargs="+",
+        help="Function to append to the h5 file. Can be a list of functions.",
+        required=True,
+    )
+    parser.add_argument("--output", type=str, help="Output h5 file")
+    parser.add_argument(
+        "--num_jets", type=int, default=-1, help="Number of jets to read from the input file"
+    )
+    parser.add_argument(
+        "--input_groups",
+        type=str,
+        nargs="+",
+        default=None,
+        help="List of groups to read from the input file",
+    )
+    parser.add_argument(
+        "--output_groups",
+        type=str,
+        nargs="+",
+        default=None,
+        help="List of groups to write to the output file",
+    )
+    parser.add_argument(
+        "--reader_kwargs",
+        type=json_dict,
+        default=None,
+        help="Additional arguments for H5Reader, given as a JSON object",
+    )
+    parser.add_argument(
+        "--writer_kwargs",
+        type=json_dict,
+        default=None,
+        help="Additional arguments for H5Writer, given as a JSON object",
+    )
+    parser.add_argument(
+        "--overwrite", action="store_true", help="Overwrite the output file if it exists"
+    )
+    return parser.parse_args(args)
+def main(args=None):
+    """Command line entry point: parse the arguments, load the functions, run the append.
+    Parameters
+    ----------
+    args : list[str] | None, optional
+        Argument strings handed to get_args, by default None
+    """
+    parsed = get_args(args)
+    functions = []
+    for entry in parsed.append_function:
+        # Strings are 'file.py:function' specifications; anything else is used as is
+        if isinstance(entry, str):
+            functions.append(parse_append_function(entry))
+        else:
+            functions.append(entry)
+    options = {
+        "num_jets": parsed.num_jets,
+        "input_groups": parsed.input_groups,
+        "output_groups": parsed.output_groups,
+        "reader_kwargs": parsed.reader_kwargs,
+        "writer_kwargs": parsed.writer_kwargs,
+        "overwrite": parsed.overwrite,
+    }
+    h5_add_column(parsed.input, parsed.output, functions, **options)

ftag/hdf5/h5reader.py CHANGED Viewed

@@ -74,10 +74,12 @@ class H5SingleReader:
         num_jets: int | None = None,
         cuts: Cuts | None = None,
         start: int = 0,
+        skip_batches: int = 0,
     ) -> Generator:
         if num_jets is None:
             num_jets = self.num_jets
+        if skip_batches > 0:
+            assert not self.shuffle, "Cannot skip batches if shuffle is True"
         if num_jets > self.num_jets:
             log.warning(
                 f"{num_jets:,} jets requested but only {self.num_jets:,} available in {self.fname}."
@@ -97,7 +99,8 @@ class H5SingleReader:
             indices = list(range(start, self.num_jets + start, self.batch_size))
             if self.shuffle:
                 self.rng.shuffle(indices)
+            if skip_batches > 0:
+                indices = indices[skip_batches:]
             # loop over batches and read file
             for low in indices:
                 for name in variables:
@@ -176,7 +179,12 @@ class H5Reader:
         # calculate batch sizes
         if self.weights is None:
-            self.weights = [1 / len(self.fname)] * len(self.fname)
+            rows_per_file = [
+                H5SingleReader(f, jets_name=self.jets_name).num_jets for f in self.fname
+            ]
+            num_total = sum(rows_per_file)
+            self.weights = [num / num_total for num in rows_per_file]
         self.batch_sizes = [int(w * self.batch_size) for w in self.weights]
         # create readers
@@ -233,6 +241,7 @@ class H5Reader:
         num_jets: int | None = None,
         cuts: Cuts | None = None,
         start: int = 0,
+        skip_batches: int = 0,
     ) -> Generator:
         """Generate batches of selected jets.
@@ -246,6 +255,8 @@ class H5Reader:
             Selection cuts to apply, by default None
         start : int, optional
             Starting index of the first jet to read, by default 0
+        skip_batches : int, optional
+            Number of batches to skip, by default 0
         Yields
         ------
@@ -266,7 +277,9 @@ class H5Reader:
         # get streams for selected jets from each reader
         streams = [
-            r.stream(variables, int(r.num_jets / self.num_jets * num_jets), cuts, start)
+            r.stream(
+                variables, int(r.num_jets / self.num_jets * num_jets), cuts, start, skip_batches
+            )
             for r in self.readers
         ]

ftag/hdf5/h5utils.py CHANGED Viewed

@@ -13,6 +13,7 @@ def get_dtype(
     variables: list[str] | None = None,
     precision: str | None = None,
     transform: Transform | None = None,
+    full_precision_vars: list[str] | None = None,
 ) -> np.dtype:
     """Return a dtype based on an existing dataset and requested variables.
@@ -26,6 +27,8 @@ def get_dtype(
         Precision to cast floats to, "half" or "full", by default None
     transform : Transform | None, optional
         Transform to apply to variables names, by default None
+    full_precision_vars : list[str] | None, optional
+        List of variables to keep in full precision, by default None
     Returns
     -------
@@ -39,6 +42,8 @@ def get_dtype(
     """
     if variables is None:
         variables = ds.dtype.names
+    if full_precision_vars is None:
+        full_precision_vars = []
     if (missing := set(variables) - set(ds.dtype.names)) and transform is not None:
         variables = transform.map_variable_names(ds.name, variables, inverse=True)
@@ -50,7 +55,10 @@ def get_dtype(
     dtype = [(n, x) for n, x in ds.dtype.descr if n in variables]
     if precision:
-        dtype = [(n, cast_dtype(x, precision)) for n, x in dtype]
+        dtype = [
+            (n, cast_dtype(x, precision)) if n not in full_precision_vars else (n, x)
+            for n, x in dtype
+        ]
     return np.dtype(dtype)
@@ -78,6 +86,7 @@ def cast_dtype(typestr: str, precision: str) -> np.dtype:
     t = np.dtype(typestr)
     if t.kind != "f":
         return t
     if precision == "half":
         return np.dtype("f2")
     if precision == "full":

ftag/hdf5/h5writer.py CHANGED Viewed

@@ -31,8 +31,11 @@ class H5Writer:
         Compression algorithm to use. Default is "lzf".
     precision : str | None, optional
         Precision to use. Default is None.
+    full_precision_vars : list[str] | None, optional
+        List of variables to store in full precision. Default is None.
     shuffle : bool, optional
         Whether to shuffle the jets before writing. Default is True.
     """
     dst: Path | str
@@ -42,19 +45,30 @@ class H5Writer:
     add_flavour_label: bool = False
     compression: str = "lzf"
     precision: str = "full"
+    full_precision_vars: list[str] | None = None
     shuffle: bool = True
+    num_jets: int | None = None  # Allow dynamic mode by defaulting to None
     def __post_init__(self):
         self.num_written = 0
         self.rng = np.random.default_rng(42)
-        self.num_jets = [shape[0] for shape in self.shapes.values()]
-        assert len(set(self.num_jets)) == 1, "Must have same number of jets per group"
-        self.num_jets = self.num_jets[0]
+        # Infer number of jets from shapes if not explicitly passed
+        inferred_num_jets = [shape[0] for shape in self.shapes.values()]
+        if self.num_jets is None:
+            assert len(set(inferred_num_jets)) == 1, "Shapes must agree in first dimension"
+            self.fixed_mode = False
+        else:
+            self.fixed_mode = True
+            for name in self.shapes:
+                self.shapes[name] = (self.num_jets,) + self.shapes[name][1:]
         if self.precision == "full":
             self.fp_dtype = np.float32
         elif self.precision == "half":
             self.fp_dtype = np.float16
+        elif self.precision is None:
+            self.fp_dtype = None
         else:
             raise ValueError(f"Invalid precision: {self.precision}")
@@ -67,16 +81,34 @@ class H5Writer:
             self.create_ds(name, dtype)
     @classmethod
-    def from_file(cls, source: Path, num_jets: int | None = None, **kwargs) -> H5Writer:
+    def from_file(
+        cls, source: Path, num_jets: int | None = 0, variables=None, **kwargs
+    ) -> H5Writer:
         with h5py.File(source, "r") as f:
             dtypes = {name: ds.dtype for name, ds in f.items()}
             shapes = {name: ds.shape for name, ds in f.items()}
-            if num_jets is not None:
+            if variables:
+                new_dtye = {}
+                new_shape = {}
+                for name, ds in f.items():
+                    if name not in variables:
+                        continue
+                    new_dtye[name] = ftag.hdf5.get_dtype(
+                        ds,
+                        variables=variables[name],
+                        precision=kwargs.get("precision"),
+                        full_precision_vars=kwargs.get("full_precision_vars"),
+                    )
+                    new_shape[name] = ds.shape
+                dtypes = new_dtye
+                shapes = new_shape
+            if num_jets != 0:
                 shapes = {name: (num_jets,) + shape[1:] for name, shape in shapes.items()}
             compression = [ds.compression for ds in f.values()]
             assert len(set(compression)) == 1, "Must have same compression for all groups"
             compression = compression[0]
-            if compression not in kwargs:
+            if "compression" not in kwargs:
                 kwargs["compression"] = compression
         return cls(dtypes=dtypes, shapes=shapes, **kwargs)
@@ -84,29 +116,47 @@ class H5Writer:
         if name == self.jets_name and self.add_flavour_label and "flavour_label" not in dtype.names:
             dtype = np.dtype([*dtype.descr, ("flavour_label", "i4")])
-        # adjust dtype based on specified precision
+        fp_vars = self.full_precision_vars or []
+        # If no precision is defined, or the field is in full_precision_vars, or its non-float,
+        # keep it at the original dtype
         dtype = np.dtype([
-            (field, self.fp_dtype if np.issubdtype(dt, np.floating) else dt)
+            (
+                field,
+                (
+                    self.fp_dtype
+                    if (self.fp_dtype and field not in fp_vars and np.issubdtype(dt, np.floating))
+                    else dt
+                ),
+            )
             for field, dt in dtype.descr
         ])
-        # optimal chunking is around 100 jets, only aply for track groups
         shape = self.shapes[name]
         chunks = (100,) + shape[1:] if shape[1:] else None
-        # note: enabling the hd5 shuffle filter doesn't improve write performance
-        self.file.create_dataset(
-            name, dtype=dtype, shape=shape, compression=self.compression, chunks=chunks
-        )
+        if self.fixed_mode:
+            self.file.create_dataset(
+                name, dtype=dtype, shape=shape, compression=self.compression, chunks=chunks
+            )
+        else:
+            maxshape = (None,) + shape[1:]
+            self.file.create_dataset(
+                name,
+                dtype=dtype,
+                shape=(0,) + shape[1:],
+                maxshape=maxshape,
+                compression=self.compression,
+                chunks=chunks,
+            )
     def close(self) -> None:
-        with h5py.File(self.dst) as f:
-            written = len(f[self.jets_name])
-        if self.num_written != written:
-            raise ValueError(
-                f"Attemped to close file {self.dst} when only {self.num_written:,} out of"
-                f" {written:,} jets have been written"
-            )
+        if self.fixed_mode:
+            written = len(self.file[self.jets_name])
+            if self.num_written != written:
+                raise ValueError(
+                    f"Attempted to close file {self.dst} when only {self.num_written:,} out of"
+                    f" {written:,} jets have been written"
+                )
         self.file.close()
     def get_attr(self, name, group=None):
@@ -126,18 +176,25 @@ class H5Writer:
                 for attr_name, value in ds.attrs.items():
                     self.add_attr(attr_name, value, group=name)
-    def write(self, data: dict[str, np.array]) -> None:
-        if (total := self.num_written + len(data[self.jets_name])) > self.num_jets:
-            raise ValueError(
-                f"Attempted to write more jets than expected: {total:,} > {self.num_jets:,}"
-            )
-        idx = np.arange(len(data[self.jets_name]))
+    def write(self, data: dict[str, np.ndarray]) -> None:
+        batch_size = len(data[self.jets_name])
+        idx = np.arange(batch_size)
         if self.shuffle:
             self.rng.shuffle(idx)
             data = {name: array[idx] for name, array in data.items()}
         low = self.num_written
-        high = low + len(idx)
+        high = low + batch_size
+        if self.fixed_mode and high > self.num_jets:
+            raise ValueError(
+                f"Attempted to write more jets than expected: {high:,} > {self.num_jets:,}"
+            )
         for group in self.dtypes:
-            self.file[group][low:high] = data[group]
-        self.num_written += len(idx)
+            ds = self.file[group]
+            if not self.fixed_mode:
+                ds.resize((high,) + ds.shape[1:])
+            ds[low:high] = data[group]
+        self.num_written += batch_size

ftag/labeller.py CHANGED Viewed

@@ -30,7 +30,7 @@ class Labeller:
     def __post_init__(self) -> None:
         if isinstance(self.labels, LabelContainer):
             self.labels = list(self.labels)
-        self.labels = sorted([Flavours[label] for label in self.labels])
+        self.labels = [Flavours[label] for label in self.labels]
     @property
     def variables(self) -> list[str]:

atlas-ftag-tools 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

atlas-ftag-tools 0.2.10py3-none-any.whl → 0.2.12py3-none-any.whl