legend-pydataobj 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,17 +4,21 @@ import logging
 import sys
 from bisect import bisect_left
 
+import h5py
 import numpy as np
 
 from ....types import Array
 from ... import datatype
 from ...exceptions import LH5DecodeError
+from .utils import read_attrs
 
 log = logging.getLogger(__name__)
 
 
 def _h5_read_ndarray(
     h5d,
+    fname,
+    oname,
     start_row=0,
     n_rows=sys.maxsize,
     idx=None,
@@ -24,48 +28,49 @@ def _h5_read_ndarray(
 ):
     if obj_buf is not None and not isinstance(obj_buf, Array):
         msg = "object buffer is not an Array"
-        raise LH5DecodeError(msg, h5d)
+        raise LH5DecodeError(msg, fname, oname)
 
     # compute the number of rows to read
     # we culled idx above for start_row and n_rows, now we have to apply
     # the constraint of the length of the dataset
     try:
-        ds_n_rows = h5d.shape[0]
+        fspace = h5d.get_space()
+        ds_n_rows = fspace.shape[0]
     except AttributeError as e:
         msg = "does not seem to be an HDF5 dataset"
-        raise LH5DecodeError(msg, h5d) from e
+        raise LH5DecodeError(msg, fname, oname) from e
 
     if idx is not None:
-        if len(idx[0]) > 0 and idx[0][-1] >= ds_n_rows:
+        if len(idx) > 0 and idx[-1] >= ds_n_rows:
             log.warning("idx indexed past the end of the array in the file. Culling...")
-            n_rows_to_read = bisect_left(idx[0], ds_n_rows)
-            idx = (idx[0][:n_rows_to_read],)
-            if len(idx[0]) == 0:
+            n_rows_to_read = bisect_left(idx, ds_n_rows)
+            idx = idx[:n_rows_to_read]
+            if len(idx) == 0:
                 log.warning("idx empty after culling.")
-        n_rows_to_read = len(idx[0])
+        n_rows_to_read = len(idx)
     else:
         n_rows_to_read = ds_n_rows - start_row
     if n_rows_to_read > n_rows:
         n_rows_to_read = n_rows
 
-    # if idx is passed, check if we can make it a slice instead (faster)
-    change_idx_to_slice = False
-
-    # prepare the selection for the read. Use idx if available
-    if idx is not None:
-        # check if idx is empty and convert to slice instead
-        if len(idx[0]) == 0:
-            source_sel = np.s_[0:0]
-            change_idx_to_slice = True
-        # check if idx is contiguous and increasing
-        # if so, convert it to a slice instead (faster)
-        elif np.all(np.diff(idx[0]) == 1):
-            source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
-            change_idx_to_slice = True
-        else:
-            source_sel = idx
-    else:
-        source_sel = np.s_[start_row : start_row + n_rows_to_read]
+    if idx is None:
+        fspace.select_hyperslab(
+            (start_row,) + (0,) * (h5d.rank - 1),
+            (1,) * h5d.rank,
+            None,
+            (n_rows_to_read,) + fspace.shape[1:],
+        )
+    elif use_h5idx:
+        # Note that h5s will automatically merge adjacent elements into a range
+        fspace.select_none()
+        for i in idx:
+            fspace.select_hyperslab(
+                (i,) + (0,) * (h5d.rank - 1),
+                (1,) * h5d.rank,
+                None,
+                (1,) + fspace.shape[1:],
+                h5py.h5s.SELECT_OR,
+            )
 
     # Now read the array
     if obj_buf is not None and n_rows_to_read > 0:
@@ -74,26 +79,35 @@ def _h5_read_ndarray(
         obj_buf.resize(buf_size)
         dest_sel = np.s_[obj_buf_start:buf_size]
 
-        # this is required to make the read of multiple files faster
-        # until a better solution found.
-        if change_idx_to_slice or idx is None or use_h5idx:
-            h5d.read_direct(obj_buf.nda, source_sel, dest_sel)
+        if idx is None or use_h5idx:
+            mspace = h5py.h5s.create_simple(obj_buf.nda.shape)
+            mspace.select_hyperslab(
+                (obj_buf_start,) + (0,) * (h5d.rank - 1),
+                (1,) * h5d.rank,
+                None,
+                (n_rows_to_read,) + fspace.shape[1:],
+            )
+            h5d.read(mspace, fspace, obj_buf.nda)
         else:
-            # it is faster to read the whole object and then do fancy indexing
-            obj_buf.nda[dest_sel] = h5d[...][source_sel]
-
+            tmp = np.empty(fspace.shape, h5d.dtype)
+            h5d.read(fspace, fspace, tmp)
+            obj_buf.nda[dest_sel, ...] = tmp[idx, ...]
         nda = obj_buf.nda
     elif n_rows == 0:
         tmp_shape = (0,) + h5d.shape[1:]
         nda = np.empty(tmp_shape, h5d.dtype)
-    elif change_idx_to_slice or idx is None or use_h5idx:
-        nda = h5d[source_sel]
     else:
-        # it is faster to read the whole object and then do fancy indexing
-        nda = h5d[...][source_sel]
+        mspace = h5py.h5s.create_simple((n_rows_to_read,) + fspace.shape[1:])
+        nda = np.empty(mspace.shape, h5d.dtype)
+        if idx is None or use_h5idx:
+            h5d.read(mspace, fspace, nda)
+        else:
+            tmp = np.empty(fspace.shape, h5d.dtype)
+            h5d.read(fspace, fspace, tmp)
+            nda[:, ...] = tmp[idx, ...]
 
     # Finally, set attributes and return objects
-    attrs = dict(h5d.attrs)
+    attrs = read_attrs(h5d, fname, oname)
 
     # special handling for bools
     # (c and Julia store as uint8 so cast to bool)
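
The hunks above replace h5py's high-level slicing (`h5d[...]`, `read_direct`) with the low-level dataspace API: a hyperslab or point selection is built on the file dataspace and read straight into a preallocated buffer. A minimal sketch of the same pattern, with the file name, dataset name, and row range chosen for illustration:

```python
import h5py
import numpy as np

# read rows [start, start+count) of an n-dimensional dataset via the
# low-level API, mirroring the selection logic in the diff above
with h5py.File("data.lh5", "r") as f:      # hypothetical file
    h5d = f["dataset"].id                  # low-level h5py.h5d.DatasetID
    fspace = h5d.get_space()
    start, count = 10, 5
    # select one block of shape (count, *trailing_dims) starting at `start`
    fspace.select_hyperslab(
        (start,) + (0,) * (h5d.rank - 1),
        (1,) * h5d.rank,
        None,
        (count,) + fspace.shape[1:],
    )
    out = np.empty((count,) + fspace.shape[1:], h5d.dtype)
    mspace = h5py.h5s.create_simple(out.shape)
    h5d.read(mspace, fspace, out)  # reads directly into `out`
```
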
@@ -2,20 +2,27 @@ from __future__ import annotations
 
 import logging
 
+import h5py
 import numpy as np
 
 from ....types import Scalar
 from ...exceptions import LH5DecodeError
+from .utils import read_attrs
 
 log = logging.getLogger(__name__)
 
 
 def _h5_read_scalar(
     h5d,
+    fname,
+    oname,
     obj_buf=None,
 ):
-    value = h5d[()]
-    attrs = dict(h5d.attrs)
+    value = np.empty((), h5d.dtype)
+    sp = h5py.h5s.create(h5py.h5s.SCALAR)
+    h5d.read(sp, sp, value)
+    value = value[()]
+    attrs = read_attrs(h5d, fname, oname)
 
     # special handling for bools
     # (c and Julia store as uint8 so cast to bool)
@@ -25,7 +32,7 @@ def _h5_read_scalar(
     if obj_buf is not None:
         if not isinstance(obj_buf, Scalar):
             msg = "object buffer is not a Scalar"
-            raise LH5DecodeError(msg, h5d)
+            raise LH5DecodeError(msg, fname, oname)
 
         obj_buf.value = value
         obj_buf.attrs.update(attrs)
@@ -1,12 +1,35 @@
 from __future__ import annotations
 
+import h5py
+import numpy as np
+
 from ...exceptions import LH5DecodeError
 
 
-def check_obj_buf_attrs(attrs, new_attrs, obj):
+def check_obj_buf_attrs(attrs, new_attrs, fname, oname):
     if set(attrs.keys()) != set(new_attrs.keys()):
         msg = (
             f"existing buffer and new data chunk have different attributes: "
-            f"obj_buf.attrs={attrs} != {obj.file.filename}[{obj.name}].attrs={new_attrs}"
+            f"obj_buf.attrs={attrs} != {fname}[{oname}].attrs={new_attrs}"
         )
-        raise LH5DecodeError(msg, obj)
+        raise LH5DecodeError(msg, fname, oname)
+
+
+def read_attrs(h5o, fname, oname):
+    """Read all attributes of an HDF5 dataset or group using the low-level
+    API and return them as a dict. Assumes all are strings or scalar types."""
+    attrs = {}
+    for i_attr in range(h5py.h5a.get_num_attrs(h5o)):
+        h5a = h5py.h5a.open(h5o, index=i_attr)
+        name = h5a.get_name().decode()
+        if h5a.shape != ():
+            msg = f"attribute {name} is not a string or scalar"
+            raise LH5DecodeError(msg, fname, oname)
+        val = np.empty((), h5a.dtype)
+        h5a.read(val)
+        if h5a.get_type().get_class() == h5py.h5t.STRING:
+            attrs[name] = val.item().decode()
+        else:
+            attrs[name] = val.item()
+        h5a.close()
+    return attrs
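
A usage sketch for the new `read_attrs` helper. The module path is inferred from the relative imports in the hunks above, and the file and object names are illustrative:

```python
import h5py

# path inferred from `from .utils import read_attrs` in the read serializers
from lgdo.lh5._serializers.read.utils import read_attrs

with h5py.File("data.lh5", "r") as f:  # hypothetical file
    h5o = f["table"].id  # low-level identifier; works for groups and datasets
    attrs = read_attrs(h5o, f.filename, "/table")
    print(attrs.get("datatype"))
```
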
@@ -3,6 +3,7 @@ from __future__ import annotations
 import logging
 import sys
 
+import h5py
 import numba
 import numpy as np
 
@@ -15,12 +16,15 @@ from ...exceptions import LH5DecodeError
 from .array import (
     _h5_read_array,
 )
+from .utils import read_attrs
 
 log = logging.getLogger(__name__)
 
 
 def _h5_read_vector_of_vectors(
     h5g,
+    fname,
+    oname,
     start_row=0,
     n_rows=sys.maxsize,
     idx=None,
@@ -30,12 +34,15 @@ def _h5_read_vector_of_vectors(
 ):
     if obj_buf is not None and not isinstance(obj_buf, VectorOfVectors):
         msg = "object buffer is not a VectorOfVectors"
-        raise LH5DecodeError(msg, h5g)
+        raise LH5DecodeError(msg, fname, oname)
 
     # read out cumulative_length
     cumulen_buf = None if obj_buf is None else obj_buf.cumulative_length
+    h5d_cl = h5py.h5d.open(h5g, b"cumulative_length")
     cumulative_length, n_rows_read = _h5_read_array(
-        h5g["cumulative_length"],
+        h5d_cl,
+        fname,
+        f"{oname}/cumulative_length",
         start_row=start_row,
         n_rows=n_rows,
         idx=idx,
@@ -51,17 +58,19 @@ def _h5_read_vector_of_vectors(
     if idx is not None and n_rows_read > 0:
         # get the starting indices for each array in flattened data:
         # the starting index for array[i] is cumulative_length[i-1]
-        idx2 = (np.asarray(idx[0]).copy() - 1,)
+        idx2 = np.asarray(idx).copy() - 1
 
         # re-read cumulative_length with these indices
         # note this will allocate memory for fd_starts!
         fd_start = None
-        if idx2[0][0] == -1:
-            idx2 = (idx2[0][1:],)
+        if idx2[0] == -1:
+            idx2 = idx2[1:]
             fd_start = 0  # this variable avoids an ndarray append
 
         fd_starts, fds_n_rows_read = _h5_read_array(
-            h5g["cumulative_length"],
+            h5d_cl,
+            fname,
+            f"{oname}/cumulative_length",
             start_row=start_row,
             n_rows=n_rows,
             idx=idx2,
@@ -98,7 +107,11 @@ def _h5_read_vector_of_vectors(
             # need to read out the cumulen sample -before- the first sample
             # read above in order to get the starting row of the first
            # vector to read out in flattened_data
-            fd_start = h5g["cumulative_length"][start_row - 1]
+            fspace = h5d_cl.get_space()
+            fspace.select_elements([[start_row - 1]])
+            mspace = h5py.h5s.create(h5py.h5s.SCALAR)
+            fd_start = np.empty((), h5d_cl.dtype)
+            h5d_cl.read(mspace, fspace, fd_start)
 
         # check limits for values that will be used subsequently
         if this_cumulen_nda[-1] < fd_start:
@@ -112,7 +125,7 @@ def _h5_read_vector_of_vectors(
                 f"cumulative_length non-increasing between entries "
                 f"{start_row} and {start_row+n_rows_read}"
             )
-            raise LH5DecodeError(msg, h5g)
+            raise LH5DecodeError(msg, fname, oname)
 
     # determine the number of rows for the flattened_data readout
     fd_n_rows = this_cumulen_nda[-1] if n_rows_read > 0 else 0
@@ -126,6 +139,8 @@ def _h5_read_vector_of_vectors(
     # read for flattened_data
     this_cumulen_nda -= fd_start
 
+    h5d_cl.close()
+
     # If we started with a partially-filled buffer, add the
     # appropriate offset for the start of the in-memory flattened
     # data for this read.
@@ -144,17 +159,23 @@ def _h5_read_vector_of_vectors(
         fd_buf.resize(fdb_size)
 
     # now read
-    lgdotype = dtypeutils.datatype(h5g["flattened_data"].attrs["datatype"])
+    h5o = h5py.h5o.open(h5g, b"flattened_data")
+    h5a_dtype = h5py.h5a.open(h5o, b"datatype")
+    val = np.empty((), "O")
+    h5a_dtype.read(val)
+    lgdotype = dtypeutils.datatype(val.item().decode())
     if lgdotype is Array:
         _func = _h5_read_array
     elif lgdotype is VectorOfVectors:
         _func = _h5_read_vector_of_vectors
     else:
         msg = f"type {lgdotype.__name__} is not supported"
-        raise LH5DecodeError(msg, h5g, "flattened_data")
+        raise LH5DecodeError(msg, fname, f"{oname}/flattened_data")
 
     flattened_data, _ = _func(
-        h5g["flattened_data"],
+        h5o,
+        fname,
+        f"{oname}/flattened_data",
        start_row=fd_start,
        n_rows=fd_n_rows,
        idx=fd_idx,
@@ -162,6 +183,7 @@ def _h5_read_vector_of_vectors(
         obj_buf=fd_buf,
         obj_buf_start=fd_buf_start,
     )
+    h5o.close()
 
     if obj_buf is not None:
         # if the buffer is partially filled, cumulative_length will be invalid
@@ -176,7 +198,7 @@ def _h5_read_vector_of_vectors(
         VectorOfVectors(
             flattened_data=flattened_data,
             cumulative_length=cumulative_length,
-            attrs=dict(h5g.attrs),
+            attrs=read_attrs(h5g, fname, oname),
         ),
         n_rows_read,
     )
@@ -194,4 +216,4 @@ def _make_fd_idx(starts, stops, idx):
        for i in range(starts[j], stops[j]):
            idx[k] = i
            k += 1
-    return (idx,)
+    return idx
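
For context, a `VectorOfVectors` is stored as two flat datasets: `flattened_data` holds every element back to back, and `cumulative_length[i]` is one past the end of vector `i`. That is why `fd_start` is recovered from `cumulative_length[start_row - 1]` in the hunk above. A pure-NumPy illustration of the layout:

```python
import numpy as np

flattened_data = np.array([1, 2, 3, 4, 5, 6])
cumulative_length = np.array([2, 3, 6])

# vector i spans flattened_data[cumulative_length[i-1]:cumulative_length[i]]
starts = np.concatenate(([0], cumulative_length[:-1]))
vectors = [flattened_data[a:b] for a, b in zip(starts, cumulative_length)]
# -> [array([1, 2]), array([3]), array([4, 5, 6])]
```
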
@@ -71,7 +71,12 @@ def _h5_write_array(
     _attrs = obj.getattrs(datatype=True)
     _attrs.pop("compression", None)
     _attrs.pop("hdf5_settings", None)
-    ds.attrs.update(_attrs)
+    ds.attrs.update(
+        {
+            k: v.encode("utf-8") if isinstance(v, str) else v
+            for k, v in _attrs.items()
+        }
+    )
 
     return
 
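
Encoding string attributes to UTF-8 bytes before handing them to h5py changes how they are stored: NumPy turns `bytes` into a fixed-length `S` dtype, which HDF5 stores as a fixed-length string, whereas a Python `str` becomes a variable-length string. Presumably this keeps written attributes compatible with the low-level `read_attrs` above. A quick demonstration (file name illustrative):

```python
import h5py
import numpy as np

with h5py.File("attrs-demo.h5", "w") as f:
    ds = f.create_dataset("d", data=np.arange(3))
    ds.attrs["vlen"] = "array<1>{real}"                   # variable-length string
    ds.attrs["fixed"] = "array<1>{real}".encode("utf-8")  # fixed-length string
    # h5py reads these back as str and bytes, respectively
    print(type(ds.attrs["vlen"]), type(ds.attrs["fixed"]))
```
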
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import logging
+import os
+from inspect import signature
 
 import h5py
 
@@ -27,6 +29,10 @@ def _h5_write_lgdo(
 ):
     assert isinstance(obj, types.LGDO)
 
+    file_kwargs = {
+        k: h5py_kwargs[k]
+        for k in h5py_kwargs.keys() & signature(h5py.File).parameters.keys()
+    }
+    h5py_kwargs = {k: h5py_kwargs[k] for k in h5py_kwargs.keys() - file_kwargs.keys()}
     if wo_mode == "write_safe":
         wo_mode = "w"
     if wo_mode == "append":
@@ -46,10 +52,9 @@ def _h5_write_lgdo(
     # In hdf5, 'a' is really "modify" -- in addition to appending, you can
     # change any object in the file. So we use file:append for
     # write_object:overwrite.
-    mode = "w" if wo_mode == "of" else "a"
-
     if not isinstance(lh5_file, h5py.File):
-        lh5_file = h5py.File(lh5_file, mode=mode)
+        mode = "w" if wo_mode == "of" or not os.path.exists(lh5_file) else "a"
+        lh5_file = h5py.File(lh5_file, mode=mode, **file_kwargs)
 
     log.debug(
         f"writing {obj!r}[{start_row}:{n_rows}] as "
@@ -65,8 +70,12 @@ def _h5_write_lgdo(
 
     # struct, table, waveform table or histogram.
     if isinstance(obj, types.Struct):
-        if isinstance(obj, types.Histogram) and wo_mode not in ["w", "o", "of"]:
-            msg = f"can't append-write histogram in wo_mode '{wo_mode}'"
+        if (
+            isinstance(obj, types.Histogram)
+            and wo_mode not in ["w", "o", "of"]
+            and name in group
+        ):
+            msg = f"can't append-write to histogram in wo_mode '{wo_mode}'"
             raise LH5EncodeError(msg, lh5_file, group, name)
         if isinstance(obj, types.Histogram) and write_start != 0:
             msg = f"can't write histogram in wo_mode '{wo_mode}' with write_start != 0"
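
The new `file_kwargs` split uses `inspect.signature` to route keyword arguments accepted by `h5py.File` (for example the paged file-space settings) away from the dataset-creation kwargs. The idiom in isolation, with a stand-in function instead of `h5py.File`:

```python
from inspect import signature

def open_file(path, mode="r", locking=False):  # stand-in for h5py.File
    ...

kwargs = {"locking": True, "compression": "gzip"}
# keys that match the callee's parameters go to the file constructor
file_kwargs = {
    k: kwargs[k] for k in kwargs.keys() & signature(open_file).parameters.keys()
}
rest = {k: kwargs[k] for k in kwargs.keys() - file_kwargs.keys()}
# file_kwargs == {"locking": True}; rest == {"compression": "gzip"}
```
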
@@ -20,4 +20,9 @@ def _h5_write_scalar(obj, name, lh5_file, group="/", wo_mode="append"):
         raise LH5EncodeError(msg, lh5_file, group, name)
 
     ds = group.create_dataset(name, shape=(), data=obj.value)
-    ds.attrs.update(obj.attrs)
+    ds.attrs.update(
+        {
+            k: v.encode("utf-8") if isinstance(v, str) else v
+            for k, v in obj.attrs.items()
+        }
+    )
lgdo/lh5/core.py CHANGED
@@ -1,15 +1,18 @@
 from __future__ import annotations
 
+import bisect
 import inspect
 import sys
 from collections.abc import Mapping, Sequence
 from typing import Any
 
 import h5py
+import numpy as np
 from numpy.typing import ArrayLike
 
 from .. import types
 from . import _serializers
+from .utils import read_n_rows
 
 
 def read(
@@ -23,6 +26,7 @@ def read(
     obj_buf: types.LGDO = None,
     obj_buf_start: int = 0,
     decompress: bool = True,
+    locking: bool = False,
 ) -> types.LGDO | tuple[types.LGDO, int]:
     """Read LH5 object data from a file.
 
@@ -97,6 +101,8 @@ def read(
         Decompress data encoded with LGDO's compression routines right
         after reading. The option has no effect on data encoded with HDF5
         built-in filters, which is always decompressed upstream by HDF5.
+    locking
+        Lock the HDF5 file while reading.
 
     Returns
     -------
@@ -110,17 +116,69 @@ def read(
     if isinstance(lh5_file, h5py.File):
         lh5_obj = lh5_file[name]
     elif isinstance(lh5_file, str):
-        lh5_file = h5py.File(lh5_file, mode="r")
+        lh5_file = h5py.File(lh5_file, mode="r", locking=locking)
         lh5_obj = lh5_file[name]
     else:
-        lh5_obj = []
-        for h5f in lh5_file:
-            if isinstance(h5f, str):
-                h5f = h5py.File(h5f, mode="r")  # noqa: PLW2901
-            lh5_obj += [h5f[name]]
+        lh5_files = list(lh5_file)
+        n_rows_read = 0
+        obj_buf_is_new = False
 
+        for i, h5f in enumerate(lh5_files):
+            if (
+                isinstance(idx, (list, tuple))
+                and len(idx) > 0
+                and not np.isscalar(idx[0])
+            ):
+                # a list of lists: must be one per file
+                idx_i = idx[i]
+            elif idx is not None:
+                # make idx a proper tuple if it's not one already
+                if not (isinstance(idx, tuple) and len(idx) == 1):
+                    idx = (idx,)
+                # idx is a long continuous array
+                n_rows_i = read_n_rows(name, h5f)
+                # find the length of the subset of idx that contains indices
+                # that are less than n_rows_i
+                n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
+                # now split idx into idx_i and the remainder
+                idx_i = np.array(idx[0])[:n_rows_to_read_i]
+                idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
+            else:
+                idx_i = None
+            n_rows_i = n_rows - n_rows_read
+
+            obj_ret = read(
+                name,
+                h5f,
+                start_row,
+                n_rows_i,
+                idx_i,
+                use_h5idx,
+                field_mask,
+                obj_buf,
+                obj_buf_start,
+                decompress,
+            )
+            if isinstance(obj_ret, tuple):
+                obj_buf, n_rows_read_i = obj_ret
+                obj_buf_is_new = True
+            else:
+                obj_buf = obj_ret
+                n_rows_read_i = len(obj_buf)
+
+            n_rows_read += n_rows_read_i
+            if n_rows_read >= n_rows or obj_buf is None:
+                return obj_buf, n_rows_read
+            start_row = 0
+            obj_buf_start += n_rows_read_i
+        return obj_buf if obj_buf_is_new else (obj_buf, n_rows_read)
+
+    if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
+        idx = idx[0]
     obj, n_rows_read = _serializers._h5_read_lgdo(
-        lh5_obj,
+        lh5_obj.id,
+        lh5_obj.file.filename,
+        lh5_obj.name,
         start_row=start_row,
         n_rows=n_rows,
         idx=idx,
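
With this change, reading from a sequence of files no longer builds a list of open h5py objects up front; `read` recurses file by file, splitting a flat `idx` across files via `read_n_rows` and `bisect`. A usage sketch, with file and object names chosen for illustration:

```python
from lgdo import lh5

# a flat idx is split across files internally: indices past the end of
# run0.lh5 are shifted down and applied to run1.lh5
obj = lh5.read("geds/raw/energy", ["run0.lh5", "run1.lh5"], idx=[0, 5, 100_000])

# alternatively, pass one index list per file
obj = lh5.read("geds/raw/energy", ["run0.lh5", "run1.lh5"], idx=[[0, 5], [17]])
```
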
@@ -143,6 +201,7 @@ def write(
     n_rows: int | None = None,
     wo_mode: str = "append",
     write_start: int = 0,
+    page_buffer: int = 0,
     **h5py_kwargs,
 ) -> None:
     """Write an LGDO into an LH5 file.
@@ -218,6 +277,11 @@ def write(
     write_start
         row in the output file (if already existing) to start overwriting
         from.
+    page_buffer
+        enable paged aggregation with a buffer of this size in bytes.
+        Only used when creating a new file. Useful when writing a file
+        with a large number of small datasets. This is a short-hand for
+        ``fs_strategy="page", fs_page_size=page_buffer``.
     **h5py_kwargs
         additional keyword arguments forwarded to
         :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5
@@ -225,6 +289,13 @@ def write(
         datasets. **Note: `compression` is ignored if compression is specified
         as an `obj` attribute.**
     """
+    if wo_mode in ("w", "write", "of", "overwrite_file"):
+        h5py_kwargs.update(
+            {
+                "fs_strategy": "page",
+                "fs_page_size": page_buffer,
+            }
+        )
     return _serializers._h5_write_lgdo(
         obj,
         name,
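
A sketch of the new `page_buffer` option; per the hunk above, it translates into h5py's `fs_strategy`/`fs_page_size` file-creation parameters when a file is (over)written. The object and file names are illustrative:

```python
import lgdo
from lgdo import lh5

# create a fresh file with paged file-space aggregation, useful when the
# file will hold many small datasets
scalar = lgdo.Scalar(42)
lh5.write(scalar, "answer", "out.lh5", wo_mode="of", page_buffer=16 * 1024)
```
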
lgdo/lh5/exceptions.py CHANGED
@@ -4,11 +4,11 @@ import h5py
 
 
 class LH5DecodeError(Exception):
-    def __init__(self, message: str, obj: h5py.Dataset | h5py.Group) -> None:
+    def __init__(self, message: str, fname: str, oname: str) -> None:
         super().__init__(message)
 
-        self.file = obj.file.filename
-        self.obj = obj.name
+        self.file = fname
+        self.obj = oname
 
     def __str__(self) -> str:
         return (
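
The exception now carries the file and object names directly instead of holding a live h5py object, which lets the low-level serializers above raise it without a high-level `Dataset` or `Group` in hand. A minimal sketch of the new constructor, with illustrative names:

```python
from lgdo.lh5.exceptions import LH5DecodeError

try:
    raise LH5DecodeError("unsupported datatype", "data.lh5", "/table/col")
except LH5DecodeError as e:
    print(e.file, e.obj)  # -> data.lh5 /table/col
```
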