legend-pydataobj 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lgdo/lh5/store.py CHANGED
@@ -5,13 +5,16 @@ HDF5 files.
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
+ import bisect
8
9
  import logging
9
10
  import os
10
11
  import sys
11
12
  from collections.abc import Mapping, Sequence
13
+ from inspect import signature
12
14
  from typing import Any
13
15
 
14
16
  import h5py
17
+ import numpy as np
15
18
  from numpy.typing import ArrayLike
16
19
 
17
20
  from .. import types
@@ -34,7 +37,9 @@ class LH5Store:
34
37
  lgdo.waveformtable.WaveformTable
35
38
  """
36
39
 
37
- def __init__(self, base_path: str = "", keep_open: bool = False) -> None:
40
+ def __init__(
41
+ self, base_path: str = "", keep_open: bool = False, locking: bool = False
42
+ ) -> None:
38
43
  """
39
44
  Parameters
40
45
  ----------
@@ -43,12 +48,21 @@ class LH5Store:
43
48
  keep_open
44
49
  whether to keep files open by storing the :mod:`h5py` objects as
45
50
  class attributes.
51
+ locking
52
+ whether to lock files when reading
46
53
  """
47
54
  self.base_path = "" if base_path == "" else utils.expand_path(base_path)
48
55
  self.keep_open = keep_open
56
+ self.locking = locking
49
57
  self.files = {}
50
58
 
51
- def gimme_file(self, lh5_file: str | h5py.File, mode: str = "r") -> h5py.File:
59
+ def gimme_file(
60
+ self,
61
+ lh5_file: str | h5py.File,
62
+ mode: str = "r",
63
+ page_buffer: int = 0,
64
+ **file_kwargs,
65
+ ) -> h5py.File:
52
66
  """Returns a :mod:`h5py` file object from the store or creates a new one.
53
67
 
54
68
  Parameters
@@ -57,12 +71,20 @@ class LH5Store:
57
71
  LH5 file name.
58
72
  mode
59
73
  mode in which to open file. See :class:`h5py.File` documentation.
74
+ page_buffer
75
+ enable paged aggregation with a buffer of this size in bytes.
76
+ Only used when creating a new file. Useful when writing a file
77
+ with a large number of small datasets. This is a short-hand for
78
+ ``(fs_strategy="page", fs_page_size=page_buffer)``
79
+ file_kwargs
80
+ Keyword arguments for :class:`h5py.File`
60
81
  """
61
82
  if isinstance(lh5_file, h5py.File):
62
83
  return lh5_file
63
84
 
64
85
  if mode == "r":
65
86
  lh5_file = utils.expand_path(lh5_file, base_path=self.base_path)
87
+ file_kwargs["locking"] = self.locking
66
88
 
67
89
  if lh5_file in self.files:
68
90
  return self.files[lh5_file]
@@ -72,20 +94,30 @@ class LH5Store:
72
94
  else:
73
95
  full_path = lh5_file
74
96
 
97
+ file_exists = os.path.exists(full_path)
75
98
  if mode != "r":
76
99
  directory = os.path.dirname(full_path)
77
100
  if directory != "" and not os.path.exists(directory):
78
101
  log.debug(f"making path {directory}")
79
102
  os.makedirs(directory)
80
103
 
81
- if mode == "r" and not os.path.exists(full_path):
104
+ if mode == "r" and not file_exists:
82
105
  msg = f"file {full_path} not found"
83
106
  raise FileNotFoundError(msg)
107
+ if not file_exists:
108
+ mode = "w"
84
109
 
85
- if mode != "r" and os.path.exists(full_path):
110
+ if mode != "r" and file_exists:
86
111
  log.debug(f"opening existing file {full_path} in mode '{mode}'")
87
112
 
88
- h5f = h5py.File(full_path, mode)
113
+ if mode == "w":
114
+ file_kwargs.update(
115
+ {
116
+ "fs_strategy": "page",
117
+ "fs_page_size": page_buffer,
118
+ }
119
+ )
120
+ h5f = h5py.File(full_path, mode, **file_kwargs)
89
121
 
90
122
  if self.keep_open:
91
123
  self.files[lh5_file] = h5f
@@ -135,6 +167,7 @@ class LH5Store:
135
167
  obj_buf: types.LGDO = None,
136
168
  obj_buf_start: int = 0,
137
169
  decompress: bool = True,
170
+ **file_kwargs,
138
171
  ) -> tuple[types.LGDO, int]:
139
172
  """Read LH5 object data from a file in the store.
140
173
 
@@ -143,13 +176,62 @@ class LH5Store:
143
176
  .lh5.core.read
144
177
  """
145
178
  # grab files from store
146
- if not isinstance(lh5_file, (str, h5py.File)):
147
- lh5_obj = [self.gimme_file(f, "r")[name] for f in list(lh5_file)]
179
+ if isinstance(lh5_file, (str, h5py.File)):
180
+ lh5_obj = self.gimme_file(lh5_file, "r", **file_kwargs)[name]
148
181
  else:
149
- lh5_obj = self.gimme_file(lh5_file, "r")[name]
150
-
182
+ lh5_files = list(lh5_file)
183
+ n_rows_read = 0
184
+
185
+ for i, h5f in enumerate(lh5_files):
186
+ if (
187
+ isinstance(idx, (list, tuple))
188
+ and len(idx) > 0
189
+ and not np.isscalar(idx[0])
190
+ ):
191
+ # a list of lists: must be one per file
192
+ idx_i = idx[i]
193
+ elif idx is not None:
194
+ # make idx a proper tuple if it's not one already
195
+ if not (isinstance(idx, tuple) and len(idx) == 1):
196
+ idx = (idx,)
197
+ # idx is a long continuous array
198
+ n_rows_i = utils.read_n_rows(name, h5f)
199
+ # find the length of the subset of idx that contains indices
200
+ # that are less than n_rows_i
201
+ n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
202
+ # now split idx into idx_i and the remainder
203
+ idx_i = np.array(idx[0])[:n_rows_to_read_i]
204
+ idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
205
+ else:
206
+ idx_i = None
207
+ n_rows_i = n_rows - n_rows_read
208
+
209
+ obj_buf, n_rows_read_i = self.read(
210
+ name,
211
+ h5f,
212
+ start_row,
213
+ n_rows_i,
214
+ idx_i,
215
+ use_h5idx,
216
+ field_mask,
217
+ obj_buf,
218
+ obj_buf_start,
219
+ decompress,
220
+ )
221
+
222
+ n_rows_read += n_rows_read_i
223
+ if n_rows_read >= n_rows or obj_buf is None:
224
+ return obj_buf, n_rows_read
225
+ start_row = 0
226
+ obj_buf_start += n_rows_read_i
227
+ return obj_buf, n_rows_read
228
+
229
+ if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
230
+ idx = idx[0]
151
231
  return _serializers._h5_read_lgdo(
152
- lh5_obj,
232
+ lh5_obj.id,
233
+ lh5_obj.file.filename,
234
+ lh5_obj.name,
153
235
  start_row=start_row,
154
236
  n_rows=n_rows,
155
237
  idx=idx,
@@ -170,6 +252,7 @@ class LH5Store:
170
252
  n_rows: int | None = None,
171
253
  wo_mode: str = "append",
172
254
  write_start: int = 0,
255
+ page_buffer: int = 0,
173
256
  **h5py_kwargs,
174
257
  ) -> None:
175
258
  """Write an LGDO into an LH5 file.
@@ -199,10 +282,17 @@ class LH5Store:
199
282
  # write_object:overwrite.
200
283
  mode = "w" if wo_mode == "of" else "a"
201
284
 
285
+ file_kwargs = {
286
+ k: h5py_kwargs[k]
287
+ for k in h5py_kwargs & signature(h5py.File).parameters.keys()
288
+ }
289
+
202
290
  return _serializers._h5_write_lgdo(
203
291
  obj,
204
292
  name,
205
- self.gimme_file(lh5_file, mode=mode),
293
+ self.gimme_file(
294
+ lh5_file, mode=mode, page_buffer=page_buffer, **file_kwargs
295
+ ),
206
296
  group=group,
207
297
  start_row=start_row,
208
298
  n_rows=n_rows,
lgdo/lh5/tools.py CHANGED
@@ -128,7 +128,7 @@ def show(
128
128
 
129
129
  # open file
130
130
  if isinstance(lh5_file, str):
131
- lh5_file = h5py.File(utils.expand_path(lh5_file), "r")
131
+ lh5_file = h5py.File(utils.expand_path(lh5_file), "r", locking=False)
132
132
 
133
133
  # go to group
134
134
  if lh5_group != "/":
lgdo/lh5/utils.py CHANGED
@@ -125,7 +125,12 @@ def get_h5_group(
125
125
  else:
126
126
  group = base_group.create_group(group)
127
127
  if grp_attrs is not None:
128
- group.attrs.update(grp_attrs)
128
+ group.attrs.update(
129
+ {
130
+ k: v.encode("utf-8") if isinstance(v, str) else v
131
+ for k, v in grp_attrs.items()
132
+ }
133
+ )
129
134
  return group
130
135
  if (
131
136
  grp_attrs is not None
@@ -141,7 +146,13 @@ def get_h5_group(
141
146
  log.debug(f"overwriting {group}.attrs...")
142
147
  for key in group.attrs:
143
148
  group.attrs.pop(key)
144
- group.attrs.update(grp_attrs)
149
+
150
+ group.attrs.update(
151
+ {
152
+ k: v.encode("utf-8") if isinstance(v, str) else v
153
+ for k, v in grp_attrs.items()
154
+ }
155
+ )
145
156
 
146
157
  return group
147
158
 
lgdo/types/histogram.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  from collections.abc import Iterable
4
5
  from typing import Any
5
6
 
@@ -12,6 +13,8 @@ from .lgdo import LGDO
12
13
  from .scalar import Scalar
13
14
  from .struct import Struct
14
15
 
16
+ log = logging.getLogger(__name__)
17
+
15
18
 
16
19
  class Histogram(Struct):
17
20
  class Axis(Struct):
@@ -197,6 +200,7 @@ class Histogram(Struct):
197
200
  isdensity: bool = False,
198
201
  attrs: dict[str, Any] | None = None,
199
202
  binedge_attrs: dict[str, Any] | None = None,
203
+ flow: bool = True,
200
204
  ) -> None:
201
205
  """A special struct to contain histogrammed data.
202
206
 
@@ -221,6 +225,16 @@ class Histogram(Struct):
221
225
  as binning.
222
226
  attrs
223
227
  a set of user attributes to be carried along with this LGDO.
228
+ flow
229
+ If ``False``, discard counts in over-/underflow bins of the passed
230
+ :class:`hist.Hist` instance. If ``True``, this data will also be discarded,
231
+ but a warning is emitted.
232
+
233
+ .. note ::
234
+
235
+ :class:`Histogram` does not support storing counts in overflow or
236
+ underflow bins. This parameter just controls whether a warning will
237
+ be emitted.
224
238
  """
225
239
  if isinstance(weights, hist.Hist):
226
240
  if binning is not None:
@@ -230,9 +244,10 @@ class Histogram(Struct):
230
244
  msg = "not allowed to pass isdensity=True if constructing from hist.Hist instance"
231
245
  raise ValueError(msg)
232
246
 
233
- if weights.sum(flow=True) != weights.sum(flow=False):
234
- msg = "flow bins of hist.Hist cannot be represented"
235
- raise ValueError(msg)
247
+ if weights.sum(flow=True) != weights.sum(flow=False) and flow:
248
+ log.warning(
249
+ "flow bins of hist.Hist cannot be represented, their counts are discarded"
250
+ )
236
251
  weights_view = weights.view(flow=False)
237
252
  if type(weights_view) is not np.ndarray:
238
253
  msg = "only simple numpy-backed storages can be used in a hist.Hist"