legend-pydataobj 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {legend_pydataobj-1.3.0.dist-info → legend_pydataobj-1.4.0.dist-info}/METADATA +1 -1
- {legend_pydataobj-1.3.0.dist-info → legend_pydataobj-1.4.0.dist-info}/RECORD +8 -8
- lgdo/_version.py +2 -2
- lgdo/lh5_store.py +150 -46
- {legend_pydataobj-1.3.0.dist-info → legend_pydataobj-1.4.0.dist-info}/LICENSE +0 -0
- {legend_pydataobj-1.3.0.dist-info → legend_pydataobj-1.4.0.dist-info}/WHEEL +0 -0
- {legend_pydataobj-1.3.0.dist-info → legend_pydataobj-1.4.0.dist-info}/entry_points.txt +0 -0
- {legend_pydataobj-1.3.0.dist-info → legend_pydataobj-1.4.0.dist-info}/top_level.txt +0 -0
{legend_pydataobj-1.3.0.dist-info → legend_pydataobj-1.4.0.dist-info}/RECORD CHANGED
@@ -1,8 +1,8 @@
 lgdo/__init__.py,sha256=mY6pUNy2yJ2MYzq_ZdhVZZ7xItBW1KJ8h9qA29bOECU,2878
-lgdo/_version.py,sha256=
+lgdo/_version.py,sha256=R8-T9fmURjcuoxYpHTAjyNAhgJPDtI2jogCjqYYkfCU,411
 lgdo/cli.py,sha256=6o2vGwEq0Fq1y67RTxOHjkVNmN9XGhIBnb8DFFm8ANQ,1428
 lgdo/lgdo_utils.py,sha256=LvqE_eQZjKOuLrocbxc21rvWvh7NA4BSiaJh-jhlxVs,5598
-lgdo/lh5_store.py,sha256
+lgdo/lh5_store.py,sha256=-ceGURWdu0jKTsDTL9bqQLxQ0T8USeeSXAk2cqd-2UU,74246
 lgdo/logging.py,sha256=Nu3wgIoWN7cyUxuzPom5rMwFvTlBu8p8d9uONHDquRg,965
 lgdo/compression/__init__.py,sha256=oT9OXiDDxC7BZciWrQVfHZNkOxXfj4p8EpF2tF04w84,1091
 lgdo/compression/base.py,sha256=ujQY2kYF4z3ZdAy7gXaoDPXFbG2Av1IQ1Nnx6UGLjmk,896
@@ -21,9 +21,9 @@ lgdo/types/struct.py,sha256=UxV0wnCHoQM5rSmzEC9EIKWYV6drHVyK5Ab7UQztuj4,2984
 lgdo/types/table.py,sha256=kgJtI4Ea6jNhsQWS_R-9Ilt7Xm9n5B97sv4Cq6m5q7E,12667
 lgdo/types/vectorofvectors.py,sha256=1oxKJDX8VVWpmvUUDHHEzEYw0RRWJrMjOB-jHRY12N4,21859
 lgdo/types/waveform_table.py,sha256=52vqjGudX5_ZR1-b087jx3vuTxJ_yEPO-dO8Dpi0ceg,9407
-legend_pydataobj-1.
-legend_pydataobj-1.
-legend_pydataobj-1.
-legend_pydataobj-1.
-legend_pydataobj-1.
-legend_pydataobj-1.
+legend_pydataobj-1.4.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+legend_pydataobj-1.4.0.dist-info/METADATA,sha256=NIX9NkqNDbKopKc0EgUEzy7fBdALFOfiGRJjHh1b01w,3577
+legend_pydataobj-1.4.0.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
+legend_pydataobj-1.4.0.dist-info/entry_points.txt,sha256=j22HoS-1cVhTtKJkDnKB49uNH0nEVER2Tpw-lVh1aws,41
+legend_pydataobj-1.4.0.dist-info/top_level.txt,sha256=KyR-EUloqiXcQ62IWnzBmtInDtvsHl4q2ZJAZgTcLXE,5
+legend_pydataobj-1.4.0.dist-info/RECORD,,
lgdo/_version.py CHANGED

lgdo/lh5_store.py CHANGED
@@ -38,7 +38,7 @@ LGDO = Union[Array, Scalar, Struct, VectorOfVectors]
 
 log = logging.getLogger(__name__)
 
-
+DEFAULT_HDF5_SETTINGS: dict[str, ...] = {"shuffle": True, "compression": "gzip"}
 
 
 class LH5Store:
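The new module-level `DEFAULT_HDF5_SETTINGS` dictionary holds the baseline keyword arguments (byte shuffling plus gzip) applied to every dataset that `LH5Store` writes, unless the caller or the object's attributes override them. A minimal sketch of how such defaults combine with user options in a plain `h5py` call; the file name and array are invented for illustration:

```python
import h5py
import numpy as np

# baseline dataset options, mirroring the new module-level constant
DEFAULT_HDF5_SETTINGS = {"shuffle": True, "compression": "gzip"}

data = np.arange(1000, dtype="float32")  # stand-in for an LGDO payload

with h5py.File("example.lh5", "w") as f:  # hypothetical output file
    kwargs = {"compression_opts": 4}      # user-supplied options win
    for k, v in DEFAULT_HDF5_SETTINGS.items():
        kwargs.setdefault(k, v)           # defaults only fill the gaps
    f.create_dataset("data", data=data, maxshape=(None,), **kwargs)
```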
@@ -169,6 +169,7 @@ class LH5Store:
         start_row: int = 0,
         n_rows: int = sys.maxsize,
         idx: np.ndarray | list | tuple | list[np.ndarray | list | tuple] = None,
+        use_h5idx: bool = False,
         field_mask: dict[str, bool] | list[str] | tuple[str] = None,
         obj_buf: LGDO = None,
         obj_buf_start: int = 0,
@@ -176,6 +177,14 @@ class LH5Store:
     ) -> tuple[LGDO, int]:
         """Read LH5 object data from a file.
 
+        Use the ``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag
+        controls whether *only* those rows are read from disk or if the rows are indexed after reading
+        the entire object. Reading individual rows can be orders of magnitude slower than reading
+        the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
+        is to use slightly more memory for a much faster read. See
+        `legend-pydataobj #29 <https://github.com/legend-exp/legend-pydataobj/issues/29>`_
+        for additional information.
+
         Parameters
         ----------
         name
@@ -192,16 +201,27 @@ class LH5Store:
             actual number of rows read will be returned as one of the return
             values (see below).
         idx
-            For NumPy-style "fancying indexing" for the read
-            rows
-            axis is supported, so tuple arguments
-            is not false, `idx` will be truncated to
-            with a list of files, can pass in a list of
-            file) or use a long contiguous list (e.g. built from a previous
+            For NumPy-style "fancying indexing" for the read to select only some
+            rows, e.g. after applying some cuts to particular columns.
+            Only selection along the first axis is supported, so tuple arguments
+            must be one-tuples. If `n_rows` is not false, `idx` will be truncated to
+            `n_rows` before reading. To use with a list of files, can pass in a list of
+            `idx`'s (one for each file) or use a long contiguous list (e.g. built from a previous
             identical read). If used in conjunction with `start_row` and `n_rows`,
             will be sliced to obey those constraints, where `n_rows` is
             interpreted as the (max) number of *selected* values (in `idx`) to be
-            read out.
+            read out. Note that the ``use_h5idx`` parameter controls some behaviour of the
+            read and that the default behavior (``use_h5idx=False``) prioritizes speed over
+            a small memory penalty.
+        use_h5idx
+            ``True`` will directly pass the ``idx`` parameter to the underlying
+            ``h5py`` call such that only the selected rows are read directly into memory,
+            which conserves memory at the cost of speed. There can be a significant penalty
+            to speed for larger files (1 - 2 orders of magnitude longer time).
+            ``False`` (default) will read the entire object into memory before
+            performing the indexing. The default is much faster but requires additional memory,
+            though a relatively small amount in the typical use case. It is recommended to
+            leave this parameter as its default.
         field_mask
             For tables and structs, determines which fields get written out.
             Only applies to immediate fields of the requested objects. If a dict
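A usage sketch of the new keyword, based on the signature above; the file and dataset names are invented and not taken from the diff:

```python
import numpy as np
from lgdo import LH5Store

store = LH5Store()
rows = np.array([0, 5, 42, 1000])  # rows surviving some hypothetical cut

# default (use_h5idx=False): read the whole object, then index it in memory (fast)
obj, n_read = store.read_object("geds/raw/energy", "run0.lh5", idx=rows)

# memory-lean alternative: let h5py resolve the selection directly on disk
obj, n_read = store.read_object(
    "geds/raw/energy", "run0.lh5", idx=rows, use_h5idx=True
)
```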
@@ -223,6 +243,7 @@ class LH5Store:
            after reading. The option has no effect on data encoded with HDF5
            built-in filters, which is always decompressed upstream by HDF5.
 
+
        Returns
        -------
        (object, n_rows_read)
@@ -236,6 +257,14 @@ class LH5Store:
        if not isinstance(lh5_file, (str, h5py.File)):
            lh5_file = list(lh5_file)
            n_rows_read = 0
+
+            # to know whether we are reading in a list of files.
+            # this is part of the fix for reading data by idx
+            # (see https://github.com/legend-exp/legend-pydataobj/issues/29)
+            # so that we only make a copy of the data if absolutely necessary
+            # or if we can read the data from file without having to make a copy
+            self.in_file_loop = True
+
            for i, h5f in enumerate(lh5_file):
                if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]):
                    # a list of lists: must be one per file
@@ -255,22 +284,32 @@ class LH5Store:
                else:
                    idx_i = None
                n_rows_i = n_rows - n_rows_read
+
+                # maybe someone passed in a list of len==1?
+                if i == (len(lh5_file) - 1):
+                    self.in_file_loop = False
+
                obj_buf, n_rows_read_i = self.read_object(
                    name,
                    lh5_file[i],
                    start_row=start_row,
                    n_rows=n_rows_i,
                    idx=idx_i,
+                    use_h5idx=use_h5idx,
                    field_mask=field_mask,
                    obj_buf=obj_buf,
                    obj_buf_start=obj_buf_start,
                    decompress=decompress,
                )
+
                n_rows_read += n_rows_read_i
                if n_rows_read >= n_rows or obj_buf is None:
                    return obj_buf, n_rows_read
                start_row = 0
                obj_buf_start += n_rows_read_i
+
+            self.in_file_loop = False
+
            return obj_buf, n_rows_read
 
        # get the file from the store
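When `lh5_file` is a list, the loop above reads each file in turn and keeps `self.in_file_loop` set so the low-level reader knows whether a copy of the data is required. A sketch of the corresponding call pattern, with invented file names and indices:

```python
from lgdo import LH5Store

store = LH5Store()
files = ["run0.lh5", "run1.lh5"]    # hypothetical file list
idx_per_file = [[0, 3, 7], [1, 2]]  # one index list per file

obj, n_read = store.read_object("geds/raw/energy", files, idx=idx_per_file)

# a single long contiguous index list spanning all files also works,
# e.g. one built from a previous identical read
```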
@@ -358,6 +397,7 @@ class LH5Store:
                start_row=start_row,
                n_rows=n_rows,
                idx=idx,
+                use_h5idx=use_h5idx,
                decompress=decompress,
            )
            # modify datatype in attrs if a field_mask was used
@@ -404,6 +444,7 @@ class LH5Store:
                start_row=start_row,
                n_rows=n_rows,
                idx=idx,
+                use_h5idx=use_h5idx,
                obj_buf=fld_buf,
                obj_buf_start=obj_buf_start,
                decompress=decompress,
@@ -497,6 +538,7 @@ class LH5Store:
                start_row=start_row,
                n_rows=n_rows,
                idx=idx,
+                use_h5idx=use_h5idx,
                obj_buf=None if decompress else decoded_size_buf,
                obj_buf_start=0 if decompress else obj_buf_start,
            )
@@ -508,6 +550,7 @@ class LH5Store:
                start_row=start_row,
                n_rows=n_rows,
                idx=idx,
+                use_h5idx=use_h5idx,
                obj_buf=None if decompress else encoded_data_buf,
                obj_buf_start=0 if decompress else obj_buf_start,
            )
@@ -573,6 +616,7 @@ class LH5Store:
                start_row=start_row,
                n_rows=n_rows,
                idx=idx,
+                use_h5idx=use_h5idx,
                obj_buf=cumulen_buf,
                obj_buf_start=obj_buf_start,
            )
@@ -597,6 +641,7 @@ class LH5Store:
                start_row=start_row,
                n_rows=n_rows,
                idx=idx2,
+                use_h5idx=use_h5idx,
            )
            fd_starts = fd_starts.nda # we just need the nda
            if fd_start is None:
@@ -679,6 +724,7 @@ class LH5Store:
                start_row=fd_start,
                n_rows=fd_n_rows,
                idx=fd_idx,
+                use_h5idx=use_h5idx,
                obj_buf=fd_buf,
                obj_buf_start=fd_buf_start,
            )
@@ -722,9 +768,22 @@ class LH5Store:
        if n_rows_to_read > n_rows:
            n_rows_to_read = n_rows
 
+        # if idx is passed, check if we can make it a slice instead (faster)
+        change_idx_to_slice = False
+
        # prepare the selection for the read. Use idx if available
        if idx is not None:
-
+            # check if idx is empty and convert to slice instead
+            if len(idx[0]) == 0:
+                source_sel = np.s_[0:0]
+                change_idx_to_slice = True
+            # check if idx is contiguous and increasing
+            # if so, convert it to a slice instead (faster)
+            elif np.all(np.diff(idx[0]) == 1):
+                source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
+                change_idx_to_slice = True
+            else:
+                source_sel = idx
        else:
            source_sel = np.s_[start_row : start_row + n_rows_to_read]
 
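The new block recognizes an empty or contiguous, increasing index array and replaces it with a slice, since slice selections are far cheaper for `h5py` than point selections. The same test in isolation (a standalone sketch, not code from the package):

```python
import numpy as np

def index_to_selection(idx: np.ndarray):
    """Return a slice equivalent to `idx` when possible, else `idx` itself."""
    if len(idx) == 0:
        return np.s_[0:0]                   # empty selection
    if np.all(np.diff(idx) == 1):           # contiguous and increasing?
        return np.s_[idx[0] : idx[-1] + 1]  # cheap slice selection
    return idx                              # fall back to fancy indexing

index_to_selection(np.array([3, 4, 5]))  # returns a slice covering rows 3..5
index_to_selection(np.array([1, 4, 9]))  # returns the original fancy index
```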
@@ -734,14 +793,34 @@ class LH5Store:
            if len(obj_buf) < buf_size:
                obj_buf.resize(buf_size)
            dest_sel = np.s_[obj_buf_start:buf_size]
-
+
+            # this is required to make the read of multiple files faster
+            # until a better solution found.
+            if change_idx_to_slice or idx is None or use_h5idx:
+                h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
+            else:
+                # it is faster to read the whole object and then do fancy indexing
+                obj_buf.nda[dest_sel] = h5f[name][...][source_sel]
+
            nda = obj_buf.nda
        else:
            if n_rows == 0:
                tmp_shape = (0,) + h5f[name].shape[1:]
                nda = np.empty(tmp_shape, h5f[name].dtype)
            else:
-
+                if change_idx_to_slice or idx is None or use_h5idx:
+                    nda = h5f[name][source_sel]
+                else:
+                    # it is faster to read the whole object and then do fancy indexing
+                    nda = h5f[name][...][source_sel]
+
+                # if reading a list of files recursively, this is given to obj_buf on
+                # the first file read. obj_buf needs to be resized and therefore
+                # it needs to hold the data itself (not a view of the data).
+                # a view is returned by the source_sel indexing, which cannot be resized
+                # by ndarray.resize().
+                if hasattr(self, "in_file_loop") and self.in_file_loop:
+                    nda = np.copy(nda)
 
        # special handling for bools
        # (c and Julia store as uint8 so cast to bool)
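These two branches implement the speed/memory trade-off documented above: passing the selection to `h5py` avoids loading the full dataset, while reading everything with `[...]` and indexing in NumPy is usually much faster. The same two paths in plain `h5py` (file and dataset names are illustrative):

```python
import h5py
import numpy as np

idx = np.array([10, 500, 12345])  # must be increasing for h5py fancy indexing

with h5py.File("run0.lh5", "r") as f:  # hypothetical file
    dset = f["geds/raw/energy"]

    # use_h5idx=True style: only the selected rows travel from disk (memory-lean, slow)
    lean = dset[idx]

    # use_h5idx=False style: read the whole dataset once, then index in memory (fast)
    fast = dset[...][idx]
```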
@@ -781,7 +860,7 @@ class LH5Store:
        n_rows: int = None,
        wo_mode: str = "append",
        write_start: int = 0,
-
+        **h5py_kwargs,
    ) -> None:
        """Write an LGDO into an LH5 file.
 
@@ -796,20 +875,30 @@ class LH5Store:
            passed directly to :meth:`h5py.Group.create_dataset`.
 
        :class:`.WaveformCodec` object
-            If `obj` is a :class:`.WaveformTable
-            this algorithm. More
-            compression algorithms at
+            If `obj` is a :class:`.WaveformTable` and ``obj.values`` holds the
+            attribute, compress ``values`` using this algorithm. More
+            documentation about the supported waveform compression algorithms at
+            :mod:`.lgdo.compression`.
+
+        If the `obj` :class:`.LGDO` has a `hdf5_settings` attribute holding a
+        dictionary, it is interpreted as a list of keyword arguments to be
+        forwarded directly to :meth:`h5py.Group.create_dataset` (exactly like
+        the first format of `compression` above). This is the preferred way to
+        specify HDF5 dataset options such as chunking etc. If compression
+        options are specified, they take precedence over those set with the
+        `compression` attribute.
 
        Note
        ----
-        The `compression` attribute takes precedence over the
-
+        The `compression` LGDO attribute takes precedence over the default HDF5
+        compression settings. The `hdf5_settings` attribute takes precedence
+        over `compression`. These attributes are not written to disk.
 
        Note
        ----
-        HDF5 compression is skipped for the `encoded_data`
-        :class:`.VectorOfEncodedVectors` and
-        :class
+        HDF5 compression is skipped for the `encoded_data.flattened_data`
+        dataset of :class:`.VectorOfEncodedVectors` and
+        :class:`.ArrayOfEncodedEqualSizedArrays`.
 
        Parameters
        ----------
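Per-object HDF5 options can therefore be attached to the LGDO itself before writing. A hedged sketch using the attribute names documented above; the array content and output file are invented:

```python
import numpy as np
from lgdo import Array, LH5Store

arr = Array(np.arange(100_000, dtype="uint16"))

# preferred: a dict of h5py.Group.create_dataset() keyword arguments
arr.attrs["hdf5_settings"] = {"compression": "gzip", "compression_opts": 9, "shuffle": True}

# simpler alternative: just name a compression filter
# arr.attrs["compression"] = "lzf"

store = LH5Store()
store.write_object(arr, "energies", "out.lh5")
```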
@@ -845,15 +934,17 @@ class LH5Store:
        write_start
            row in the output file (if already existing) to start overwriting
            from.
-
-
-
-
+        **h5py_kwargs
+            additional keyword arguments forwarded to
+            :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5
+            compression filter to be applied before writing non-scalar
+            datasets. **Note: `compression` Ignored if compression is specified
+            as an `obj` attribute.**
        """
        log.debug(
            f"writing {repr(obj)}[{start_row}:{n_rows}] as "
            f"{lh5_file}:{group}/{name}[{write_start}:], "
-            f"mode = {wo_mode},
+            f"mode = {wo_mode}, h5py_kwargs = {h5py_kwargs}"
        )
 
        if wo_mode == "write_safe":
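The same options can instead be given per call; anything `write_object` does not consume itself is forwarded to `h5py.Group.create_dataset`. A sketch with invented names (object-level `compression`/`hdf5_settings` attributes would override these, per the note above):

```python
import numpy as np
from lgdo import Array, LH5Store

store = LH5Store()
wf = Array(np.zeros((1000, 2048), dtype="float32"))

# per-call HDF5 settings: LZF compression and explicit chunking
store.write_object(
    wf,
    "waveforms",
    "out.lh5",
    compression="lzf",
    chunks=(100, 2048),
)
```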
@@ -926,8 +1017,8 @@ class LH5Store:
        for field in obj.keys():
            # eventually compress waveform table values with LGDO's
            # custom codecs before writing
-            # if waveformtable.values.attrs["compression"] is a
-            #
+            # if waveformtable.values.attrs["compression"] is NOT a
+            # WaveformCodec, just leave it there
            obj_fld = None
            if (
                isinstance(obj, WaveformTable)
@@ -953,7 +1044,7 @@ class LH5Store:
                n_rows=n_rows,
                wo_mode=wo_mode,
                write_start=write_start,
-
+                **h5py_kwargs,
            )
            return
 
@@ -977,6 +1068,9 @@ class LH5Store:
            name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
        )
 
+        # ask not to further compress flattened_data, it is already compressed!
+        obj.encoded_data.flattened_data.attrs["compression"] = None
+
        self.write_object(
            obj.encoded_data,
            "encoded_data",
@@ -986,7 +1080,7 @@ class LH5Store:
            n_rows=n_rows,
            wo_mode=wo_mode,
            write_start=write_start,
-
+            **h5py_kwargs,
        )
 
        self.write_object(
@@ -998,7 +1092,7 @@ class LH5Store:
            n_rows=n_rows,
            wo_mode=wo_mode,
            write_start=write_start,
-
+            **h5py_kwargs,
        )
 
        # vector of vectors
@@ -1034,7 +1128,7 @@ class LH5Store:
                n_rows=fd_n_rows,
                wo_mode=wo_mode,
                write_start=offset,
-
+                **h5py_kwargs,
            )
 
            # now offset is used to give appropriate in-file values for
@@ -1057,7 +1151,7 @@ class LH5Store:
                n_rows=n_rows,
                wo_mode=wo_mode,
                write_start=write_start,
-
+                **h5py_kwargs,
            )
            obj.cumulative_length.nda -= cl_dtype(offset)
 
@@ -1077,29 +1171,39 @@ class LH5Store:
        # need to create dataset from ndarray the first time for speed
        # creating an empty dataset and appending to that is super slow!
        if (wo_mode != "a" and write_start == 0) or name not in group:
+            # this is needed in order to have a resizable (in the first
+            # axis) data set, i.e. rows can be appended later
+            # NOTE: this automatically turns chunking on!
            maxshape = (None,) + nda.shape[1:]
+            h5py_kwargs.setdefault("maxshape", maxshape)
+
            if wo_mode == "o" and name in group:
                log.debug(f"overwriting {name} in {group}")
                del group[name]
 
+            # set default compression options
+            for k, v in DEFAULT_HDF5_SETTINGS.items():
+                h5py_kwargs.setdefault(k, v)
+
+            # compress using the 'compression' LGDO attribute, if available
+            if "compression" in obj.attrs:
+                comp_algo = obj.attrs["compression"]
+                if isinstance(comp_algo, dict):
+                    h5py_kwargs |= obj.attrs["compression"]
+                else:
+                    h5py_kwargs["compression"] = obj.attrs["compression"]
+
+            # and even the 'hdf5_settings' one, preferred
+            if "hdf5_settings" in obj.attrs:
+                h5py_kwargs |= obj.attrs["hdf5_settings"]
+
            # create HDF5 dataset
-
-            # available
-            # - otherwise use "hdf5_compression"
-            # - attach HDF5 dataset attributes, but not "compression"!
-            comp_algo = obj.attrs.get("compression", hdf5_compression)
-            comp_kwargs = {}
-            if isinstance(comp_algo, str):
-                comp_kwargs = {"compression": comp_algo}
-            elif comp_algo is not None:
-                comp_kwargs = comp_algo
-
-            ds = group.create_dataset(
-                name, data=nda, maxshape=maxshape, **comp_kwargs
-            )
+            ds = group.create_dataset(name, data=nda, **h5py_kwargs)
 
+            # attach HDF5 dataset attributes, but not "compression"!
            _attrs = obj.getattrs(datatype=True)
            _attrs.pop("compression", None)
+            _attrs.pop("hdf5_settings", None)
            ds.attrs.update(_attrs)
            return
 
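Taken together, the block above establishes a clear precedence for dataset options: `DEFAULT_HDF5_SETTINGS` fills gaps left by the caller's keyword arguments, the object's `compression` attribute overrides both, and `hdf5_settings` wins over everything. A condensed, standalone sketch of that merging order (the helper function and example dicts are invented):

```python
DEFAULT_HDF5_SETTINGS = {"shuffle": True, "compression": "gzip"}

def merge_settings(call_kwargs: dict, obj_attrs: dict) -> dict:
    kwargs = dict(call_kwargs)
    # 1. defaults only fill gaps left by the caller
    for k, v in DEFAULT_HDF5_SETTINGS.items():
        kwargs.setdefault(k, v)
    # 2. the 'compression' LGDO attribute overrides the above
    if "compression" in obj_attrs:
        comp = obj_attrs["compression"]
        kwargs |= comp if isinstance(comp, dict) else {"compression": comp}
    # 3. the 'hdf5_settings' attribute wins over everything else
    if "hdf5_settings" in obj_attrs:
        kwargs |= obj_attrs["hdf5_settings"]
    return kwargs

print(merge_settings({"compression": "lzf"}, {"hdf5_settings": {"compression": None}}))
# {'compression': None, 'shuffle': True}
```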
{legend_pydataobj-1.3.0.dist-info → legend_pydataobj-1.4.0.dist-info}/LICENSE: File without changes
{legend_pydataobj-1.3.0.dist-info → legend_pydataobj-1.4.0.dist-info}/WHEEL: File without changes
{legend_pydataobj-1.3.0.dist-info → legend_pydataobj-1.4.0.dist-info}/entry_points.txt: File without changes
{legend_pydataobj-1.3.0.dist-info → legend_pydataobj-1.4.0.dist-info}/top_level.txt: File without changes