legend-pydataobj 1.2.1__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,7 +11,7 @@ from numpy import int32, ubyte, uint32
  from numpy.typing import NDArray

  from .. import types as lgdo
- from .base import WaveformCodec
+ from .base import WaveformCodec, numba_defaults

  log = logging.getLogger(__name__)

@@ -30,7 +30,11 @@ class ULEB128ZigZagDiff(WaveformCodec):
  def encode(
  sig_in: NDArray | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays,
  sig_out: NDArray[ubyte] = None,
- ) -> (NDArray[ubyte], NDArray[uint32]) | lgdo.VectorOfEncodedVectors:
+ ) -> (
+ (NDArray[ubyte], NDArray[uint32])
+ | lgdo.VectorOfEncodedVectors
+ | lgdo.ArrayOfEncodedEqualSizedArrays
+ ):
  """Compress digital signal(s) with a variable-length encoding of its derivative.

  Wraps :func:`uleb128_zigzag_diff_array_encode` and adds support for encoding
@@ -41,8 +45,9 @@ def encode(
  If `sig_in` is a NumPy array, no resizing of `sig_out` is performed, not
  even of the internally allocated one.

- Because of the current implementation, providing a pre-allocated
- :class:`.VectorOfEncodedVectors` as `sig_out` is not possible.
+ Because of the current (hardware vectorized) implementation, providing a
+ pre-allocated :class:`.VectorOfEncodedVectors` or
+ :class:`.ArrayOfEncodedEqualSizedArrays` as `sig_out` is not possible.

  Parameters
  ----------
@@ -54,11 +59,12 @@ def encode(

  Returns
  -------
- sig_out, nbytes
+ sig_out, nbytes | LGDO
  given pre-allocated `sig_out` structure or new structure of unsigned
  8-bit integers, plus the number of bytes (length) of the encoded
  signal. If `sig_in` is an :class:`.LGDO`, only a newly allocated
- :class:`.VectorOfEncodedVectors` is returned.
+ :class:`.VectorOfEncodedVectors` or
+ :class:`.ArrayOfEncodedEqualSizedArrays` is returned.

  See Also
  --------
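Taken together with the widened return annotation, `encode` now mirrors its input: plain NumPy arrays come back as a `(sig_out, nbytes)` pair, while LGDO inputs yield the matching encoded LGDO container. A rough round-trip sketch for the NumPy path (the module path `lgdo.compression.varlen` and the exact call shapes are assumptions based on this diff, not verified against the released API):

    import numpy as np
    from lgdo.compression import varlen  # assumed module name for this codec

    # a fake digitized waveform with a small derivative (compresses well)
    rng = np.random.default_rng(0)
    sig = np.cumsum(rng.integers(-8, 8, size=1000)).astype("int32")

    enc, nbytes = varlen.encode(sig)  # ubyte buffer plus encoded length
    dec, siglen = varlen.decode((enc[:nbytes], np.uint32(nbytes)))
    assert np.array_equal(dec[:siglen], sig)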
@@ -88,7 +94,7 @@ def encode(
  return sig_out, nbytes

  elif isinstance(sig_in, lgdo.VectorOfVectors):
- if sig_out:
+ if sig_out is not None:
  log.warning(
  "a pre-allocated VectorOfEncodedVectors was given "
  "to hold an encoded ArrayOfEqualSizedArrays. "
@@ -142,9 +148,11 @@ def encode(


  def decode(
- sig_in: (NDArray[ubyte], NDArray[uint32]) | lgdo.VectorOfEncodedVectors,
- sig_out: NDArray | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays = None,
- ) -> NDArray | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays:
+ sig_in: (NDArray[ubyte], NDArray[uint32])
+ | lgdo.VectorOfEncodedVectors
+ | lgdo.ArrayOfEncodedEqualSizedArrays,
+ sig_out: NDArray | lgdo.ArrayOfEqualSizedArrays = None,
+ ) -> (NDArray, NDArray[uint32]) | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays:
  """Decompress digital signal(s) with a variable-length encoding of its derivative.

  Wraps :func:`uleb128_zigzag_diff_array_decode` and adds support for decoding
@@ -159,8 +167,8 @@ def decode(
  :class:`.ArrayOfEqualSizedArrays` `sig_out` instead always has the correct
  size.

- Because of the current implementation, providing a pre-allocated
- :class:`.VectorOfVectors` as `sig_out` is not possible.
+ Because of the current (hardware vectorized) implementation, providing a
+ pre-allocated :class:`.VectorOfVectors` as `sig_out` is not possible.

  Parameters
  ----------
@@ -173,8 +181,9 @@ def decode(

  Returns
  -------
- sig_out
- given pre-allocated structure or new structure of 32-bit integers.
+ sig_out, nbytes | LGDO
+ given pre-allocated structure or new structure of 32-bit integers, plus
+ the number of bytes (length) of the decoded signal.

  See Also
  --------
@@ -199,7 +208,7 @@ def decode(
  return sig_out, siglen

  elif isinstance(sig_in, lgdo.ArrayOfEncodedEqualSizedArrays):
- if not sig_out:
+ if sig_out is None:
  # initialize output structure with decoded_size
  sig_out = lgdo.ArrayOfEqualSizedArrays(
  dims=(1, 1),
@@ -257,7 +266,7 @@ def decode(

  @numba.vectorize(
  ["uint64(int64)", "uint32(int32)", "uint16(int16)"],
- nopython=True,
+ **numba_defaults,
  )
  def zigzag_encode(x: int | NDArray[int]) -> int | NDArray[int]:
  """ZigZag-encode [#WikiZZ]_ signed integer numbers."""
@@ -266,14 +275,14 @@ def zigzag_encode(x: int | NDArray[int]) -> int | NDArray[int]:

  @numba.vectorize(
  ["int64(uint64)", "int32(uint32)", "int16(uint16)"],
- nopython=True,
+ **numba_defaults,
  )
  def zigzag_decode(x: int | NDArray[int]) -> int | NDArray[int]:
  """ZigZag-decode [#WikiZZ]_ signed integer numbers."""
  return (x >> 1) ^ -(x & 1)


- @numba.jit(["uint32(int64, byte[:])"], nopython=True)
+ @numba.jit(["uint32(int64, byte[:])"], **numba_defaults)
  def uleb128_encode(x: int, encx: NDArray[ubyte]) -> int:
  """Compute a variable-length representation of an unsigned integer.

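For reference, ZigZag interleaves signed integers into unsigned ones so that values of small magnitude, of either sign, stay small before the ULEB128 stage. A pure-Python sketch of the round trip (the decode line matches the formula in the hunk above; the encode formula is the standard ZigZag counterpart, written here for 64-bit inputs):

    def zz_encode(x: int, bits: int = 64) -> int:
        # 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
        return (x << 1) ^ (x >> (bits - 1))

    def zz_decode(u: int) -> int:
        # inverse transform, same expression as zigzag_decode above
        return (u >> 1) ^ -(u & 1)

    for x in (0, -1, 1, -2, 2, -64):
        assert zz_decode(zz_encode(x)) == x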
@@ -306,7 +315,7 @@ def uleb128_encode(x: int, encx: NDArray[ubyte]) -> int:
  return i + 1


- @numba.jit(["UniTuple(uint32, 2)(byte[:])"], nopython=True)
+ @numba.jit(["UniTuple(uint32, 2)(byte[:])"], **numba_defaults)
  def uleb128_decode(encx: NDArray[ubyte]) -> (int, int):
  """Decode a variable-length integer into an unsigned integer.

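ULEB128 emits an unsigned integer seven bits at a time, least-significant group first, setting the top bit of every byte except the last. A self-contained sketch of both directions (the classic test value 624485 encodes to the bytes 0xE5 0x8E 0x26):

    def uleb128_enc(x: int) -> bytes:
        out = bytearray()
        while True:
            byte = x & 0x7F
            x >>= 7
            if x:
                out.append(byte | 0x80)  # continuation bit: more bytes follow
            else:
                out.append(byte)
                return bytes(out)

    def uleb128_dec(enc: bytes) -> tuple[int, int]:
        x = shift = 0
        for i, byte in enumerate(enc):
            x |= (byte & 0x7F) << shift
            shift += 7
            if not byte & 0x80:
                return x, i + 1  # decoded value and bytes consumed

    assert uleb128_enc(624485) == bytes([0xE5, 0x8E, 0x26])
    assert uleb128_dec(bytes([0xE5, 0x8E, 0x26])) == (624485, 3)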
@@ -351,7 +360,7 @@ def uleb128_decode(encx: NDArray[ubyte]) -> (int, int):
  "void(int64[:], byte[:], uint32[:])",
  ],
  "(n),(m),()",
- nopython=True,
+ **numba_defaults,
  )
  def uleb128_zigzag_diff_array_encode(
  sig_in: NDArray[int], sig_out: NDArray[ubyte], nbytes: int
@@ -401,7 +410,7 @@ def uleb128_zigzag_diff_array_encode(
  "void(byte[:], uint32[:], int64[:], uint32[:])",
  ],
  "(n),(),(m),()",
- nopython=True,
+ **numba_defaults,
  )
  def uleb128_zigzag_diff_array_decode(
  sig_in: NDArray[ubyte],
lgdo/lh5_store.py CHANGED
@@ -38,7 +38,7 @@ LGDO = Union[Array, Scalar, Struct, VectorOfVectors]

  log = logging.getLogger(__name__)

- DEFAULT_HDF5_COMPRESSION = None
+ DEFAULT_HDF5_SETTINGS: dict[str, ...] = {"shuffle": True, "compression": "gzip"}


  class LH5Store:
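The old default (`None`, i.e. no filtering) becomes an opinionated one: byte-shuffle plus GZIP for every non-scalar dataset. In plain h5py terms the new defaults expand to something like:

    import h5py
    import numpy as np

    with h5py.File("example.lh5", "w") as f:
        # what DEFAULT_HDF5_SETTINGS contributes to each create_dataset call
        f.create_dataset("data", data=np.arange(1000), shuffle=True, compression="gzip")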
@@ -169,6 +169,7 @@ class LH5Store:
  start_row: int = 0,
  n_rows: int = sys.maxsize,
  idx: np.ndarray | list | tuple | list[np.ndarray | list | tuple] = None,
+ use_h5idx: bool = False,
  field_mask: dict[str, bool] | list[str] | tuple[str] = None,
  obj_buf: LGDO = None,
  obj_buf_start: int = 0,
@@ -176,6 +177,14 @@ class LH5Store:
  ) -> tuple[LGDO, int]:
  """Read LH5 object data from a file.

+ Use the ``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag
+ controls whether *only* those rows are read from disk or if the rows are indexed after reading
+ the entire object. Reading individual rows can be orders of magnitude slower than reading
+ the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
+ is to use slightly more memory for a much faster read. See
+ `legend-pydataobj #29 <https://github.com/legend-exp/legend-pydataobj/issues/29>`_
+ for additional information.
+
  Parameters
  ----------
  name
@@ -192,16 +201,27 @@ class LH5Store:
  actual number of rows read will be returned as one of the return
  values (see below).
  idx
- For NumPy-style "fancying indexing" for the read. Used to read out
- rows that pass some selection criteria. Only selection along the first
- axis is supported, so tuple arguments must be one-tuples. If `n_rows`
- is not false, `idx` will be truncated to `n_rows` before reading. To use
- with a list of files, can pass in a list of `idx`'s (one for each
- file) or use a long contiguous list (e.g. built from a previous
+ For NumPy-style "fancy indexing" for the read to select only some
+ rows, e.g. after applying some cuts to particular columns.
+ Only selection along the first axis is supported, so tuple arguments
+ must be one-tuples. If `n_rows` is not false, `idx` will be truncated to
+ `n_rows` before reading. To use with a list of files, can pass in a list of
+ `idx`'s (one for each file) or use a long contiguous list (e.g. built from a previous
  identical read). If used in conjunction with `start_row` and `n_rows`,
  will be sliced to obey those constraints, where `n_rows` is
  interpreted as the (max) number of *selected* values (in `idx`) to be
- read out.
+ read out. Note that the ``use_h5idx`` parameter controls some behavior of the
+ read and that the default behavior (``use_h5idx=False``) prioritizes speed over
+ a small memory penalty.
+ use_h5idx
+ ``True`` will directly pass the ``idx`` parameter to the underlying
+ ``h5py`` call such that only the selected rows are read directly into memory,
+ which conserves memory at the cost of speed. There can be a significant penalty
+ to speed for larger files (1-2 orders of magnitude longer read times).
+ ``False`` (default) will read the entire object into memory before
+ performing the indexing. The default is much faster but requires additional memory,
+ though a relatively small amount in the typical use case. It is recommended to
+ leave this parameter as its default.
  field_mask
  For tables and structs, determines which fields get written out.
  Only applies to immediate fields of the requested objects. If a dict
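A hypothetical read illustrating the two modes just documented (store construction per this file; the dataset name and file name are made up):

    import numpy as np
    from lgdo.lh5_store import LH5Store

    store = LH5Store()
    idx = np.array([0, 2, 3, 7])

    # default: read the whole object, then index in memory (fast)
    obj, n = store.read_object("geds/raw/energy", "run0.lh5", idx=idx)

    # let h5py apply idx on disk (low memory, potentially much slower)
    obj, n = store.read_object("geds/raw/energy", "run0.lh5", idx=idx, use_h5idx=True)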
@@ -223,6 +243,7 @@ class LH5Store:
  after reading. The option has no effect on data encoded with HDF5
  built-in filters, which is always decompressed upstream by HDF5.

+
  Returns
  -------
  (object, n_rows_read)
@@ -236,6 +257,14 @@ class LH5Store:
  if not isinstance(lh5_file, (str, h5py.File)):
  lh5_file = list(lh5_file)
  n_rows_read = 0
+
+ # flag whether we are reading a list of files.
+ # part of the fix for reading data by idx
+ # (see https://github.com/legend-exp/legend-pydataobj/issues/29):
+ # we only make a copy of the data when absolutely necessary,
+ # i.e. when it cannot be read from file without copying
+ self.in_file_loop = True
+
  for i, h5f in enumerate(lh5_file):
  if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]):
  # a list of lists: must be one per file
@@ -255,22 +284,32 @@ class LH5Store:
  else:
  idx_i = None
  n_rows_i = n_rows - n_rows_read
+
+ # maybe someone passed in a list of len==1?
+ if i == (len(lh5_file) - 1):
+ self.in_file_loop = False
+
  obj_buf, n_rows_read_i = self.read_object(
  name,
  lh5_file[i],
  start_row=start_row,
  n_rows=n_rows_i,
  idx=idx_i,
+ use_h5idx=use_h5idx,
  field_mask=field_mask,
  obj_buf=obj_buf,
  obj_buf_start=obj_buf_start,
  decompress=decompress,
  )
+
  n_rows_read += n_rows_read_i
  if n_rows_read >= n_rows or obj_buf is None:
  return obj_buf, n_rows_read
  start_row = 0
  obj_buf_start += n_rows_read_i
+
+ self.in_file_loop = False
+
  return obj_buf, n_rows_read

  # get the file from the store
@@ -358,6 +397,7 @@ class LH5Store:
  start_row=start_row,
  n_rows=n_rows,
  idx=idx,
+ use_h5idx=use_h5idx,
  decompress=decompress,
  )
  # modify datatype in attrs if a field_mask was used
@@ -404,6 +444,7 @@ class LH5Store:
  start_row=start_row,
  n_rows=n_rows,
  idx=idx,
+ use_h5idx=use_h5idx,
  obj_buf=fld_buf,
  obj_buf_start=obj_buf_start,
  decompress=decompress,
@@ -497,6 +538,7 @@ class LH5Store:
  start_row=start_row,
  n_rows=n_rows,
  idx=idx,
+ use_h5idx=use_h5idx,
  obj_buf=None if decompress else decoded_size_buf,
  obj_buf_start=0 if decompress else obj_buf_start,
  )
@@ -508,6 +550,7 @@ class LH5Store:
  start_row=start_row,
  n_rows=n_rows,
  idx=idx,
+ use_h5idx=use_h5idx,
  obj_buf=None if decompress else encoded_data_buf,
  obj_buf_start=0 if decompress else obj_buf_start,
  )
@@ -531,26 +574,31 @@ class LH5Store:
  elif obj_buf is None and decompress:
  return compress.decode(rawdata), n_rows_read

+ # expand the provided obj_buf if it is too short
+ buf_size = obj_buf_start + n_rows_read
+ if len(obj_buf) < buf_size:
+ obj_buf.resize(buf_size)
+
  # use the (decoded object type) buffer otherwise
- if enc_lgdo == VectorOfEncodedVectors and not isinstance(
- obj_buf, VectorOfVectors
- ):
- raise ValueError(
- f"obj_buf for decoded '{name}' not a VectorOfVectors"
- )
- elif enc_lgdo == ArrayOfEncodedEqualSizedArrays and not isinstance(
- obj_buf, ArrayOfEqualSizedArrays
- ):
- raise ValueError(
- f"obj_buf for decoded '{name}' not an ArrayOfEqualSizedArrays"
- )
+ if enc_lgdo == ArrayOfEncodedEqualSizedArrays:
+ if not isinstance(obj_buf, ArrayOfEqualSizedArrays):
+ raise ValueError(
+ f"obj_buf for decoded '{name}' not an ArrayOfEqualSizedArrays"
+ )
+
+ compress.decode(rawdata, obj_buf[obj_buf_start:buf_size])
+
+ elif enc_lgdo == VectorOfEncodedVectors:
+ if not isinstance(obj_buf, VectorOfVectors):
+ raise ValueError(
+ f"obj_buf for decoded '{name}' not a VectorOfVectors"
+ )

- # FIXME: not a good idea. an in place decoding version
- # of decode would be needed to avoid extra memory
- # allocations
- # FIXME: obj_buf_start??? Write a unit test
- for i, wf in enumerate(compress.decode(rawdata)):
- obj_buf[i] = wf
+ # FIXME: not a good idea. an in place decoding version
+ # of decode would be needed to avoid extra memory
+ # allocations
+ for i, wf in enumerate(compress.decode(rawdata)):
+ obj_buf[obj_buf_start + i] = wf

  return obj_buf, n_rows_read

@@ -568,6 +616,7 @@ class LH5Store:
  start_row=start_row,
  n_rows=n_rows,
  idx=idx,
+ use_h5idx=use_h5idx,
  obj_buf=cumulen_buf,
  obj_buf_start=obj_buf_start,
  )
@@ -592,6 +641,7 @@ class LH5Store:
  start_row=start_row,
  n_rows=n_rows,
  idx=idx2,
+ use_h5idx=use_h5idx,
  )
  fd_starts = fd_starts.nda # we just need the nda
  if fd_start is None:
@@ -674,6 +724,7 @@ class LH5Store:
  start_row=fd_start,
  n_rows=fd_n_rows,
  idx=fd_idx,
+ use_h5idx=use_h5idx,
  obj_buf=fd_buf,
  obj_buf_start=fd_buf_start,
  )
@@ -717,9 +768,22 @@ class LH5Store:
  if n_rows_to_read > n_rows:
  n_rows_to_read = n_rows

+ # if idx is passed, check if we can make it a slice instead (faster)
+ change_idx_to_slice = False
+
  # prepare the selection for the read. Use idx if available
  if idx is not None:
- source_sel = idx
+ # check if idx is empty and convert to slice instead
+ if len(idx[0]) == 0:
+ source_sel = np.s_[0:0]
+ change_idx_to_slice = True
+ # check if idx is contiguous and increasing
+ # if so, convert it to a slice instead (faster)
+ elif np.all(np.diff(idx[0]) == 1):
+ source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
+ change_idx_to_slice = True
+ else:
+ source_sel = idx
  else:
  source_sel = np.s_[start_row : start_row + n_rows_to_read]

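The motivation for the new branch: h5py serves slice selections far faster than fancy-index selections, and a contiguous increasing index array is fully described by its end points. The same test in isolation:

    import numpy as np

    def idx_to_selection(idx: np.ndarray):
        if len(idx) == 0:
            return np.s_[0:0]                   # empty selection -> zero-length slice
        if np.all(np.diff(idx) == 1):
            return np.s_[idx[0] : idx[-1] + 1]  # contiguous -> equivalent slice
        return idx                              # genuine fancy indexing

    print(idx_to_selection(np.array([3, 4, 5, 6])))  # slice(3, 7, None)
    print(idx_to_selection(np.array([1, 4, 5])))     # [1 4 5]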
@@ -729,14 +793,34 @@ class LH5Store:
  if len(obj_buf) < buf_size:
  obj_buf.resize(buf_size)
  dest_sel = np.s_[obj_buf_start:buf_size]
- h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
+
+ # this is required to make the read of multiple files faster
+ # until a better solution is found.
+ if change_idx_to_slice or idx is None or use_h5idx:
+ h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
+ else:
+ # it is faster to read the whole object and then do fancy indexing
+ obj_buf.nda[dest_sel] = h5f[name][...][source_sel]
+
  nda = obj_buf.nda
  else:
  if n_rows == 0:
  tmp_shape = (0,) + h5f[name].shape[1:]
  nda = np.empty(tmp_shape, h5f[name].dtype)
  else:
- nda = h5f[name][source_sel]
+ if change_idx_to_slice or idx is None or use_h5idx:
+ nda = h5f[name][source_sel]
+ else:
+ # it is faster to read the whole object and then do fancy indexing
+ nda = h5f[name][...][source_sel]
+
+ # if reading a list of files recursively, this is given to obj_buf on
+ # the first file read. obj_buf needs to be resized and therefore
+ # it needs to hold the data itself (not a view of the data).
+ # a view is returned by the source_sel indexing, which cannot be resized
+ # by ndarray.resize().
+ if hasattr(self, "in_file_loop") and self.in_file_loop:
+ nda = np.copy(nda)

  # special handling for bools
  # (c and Julia store as uint8 so cast to bool)
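The trailing `np.copy` exists because basic slicing returns a view, and `ndarray.resize` refuses to operate on an array that does not own its data; when reading a list of files, the first result becomes `obj_buf` and must be resizable. The failure mode in isolation:

    import numpy as np

    a = np.arange(10)
    v = a[2:5]            # a view into a's buffer
    try:
        v.resize(8)       # in-place resize is impossible for views
    except ValueError as err:
        print(err)        # "cannot resize this array: it does not own its data"

    c = np.copy(v)        # an owning copy can be resized
    c.resize(8, refcheck=False)  # skip the reference-count check for brevity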
@@ -776,7 +860,7 @@ class LH5Store:
  n_rows: int = None,
  wo_mode: str = "append",
  write_start: int = 0,
- hdf5_compression: str | h5py.filters.FilterRefBase = DEFAULT_HDF5_COMPRESSION,
+ **h5py_kwargs,
  ) -> None:
  """Write an LGDO into an LH5 file.

@@ -791,20 +875,30 @@ class LH5Store:
  passed directly to :meth:`h5py.Group.create_dataset`.

  :class:`.WaveformCodec` object
- If `obj` is a :class:`.WaveformTable`, compress its `values` using
- this algorithm. More documentation about the supported waveform
- compression algorithms at :mod:`.lgdo.compression`.
+ If `obj` is a :class:`.WaveformTable` and ``obj.values`` holds this
+ attribute, compress ``values`` using this algorithm. More
+ documentation about the supported waveform compression algorithms at
+ :mod:`.lgdo.compression`.
+
+ If the `obj` :class:`.LGDO` has an `hdf5_settings` attribute holding a
+ dictionary, it is interpreted as a set of keyword arguments to be
+ forwarded directly to :meth:`h5py.Group.create_dataset` (exactly like
+ the first format of `compression` above). This is the preferred way to
+ specify HDF5 dataset options such as chunking. If compression
+ options are specified, they take precedence over those set with the
+ `compression` attribute.

  Note
  ----
- The `compression` attribute takes precedence over the
- `hdf5_compression` argument and is not written to disk.
+ The `compression` LGDO attribute takes precedence over the default HDF5
+ compression settings. The `hdf5_settings` attribute takes precedence
+ over `compression`. These attributes are not written to disk.

  Note
  ----
- HDF5 compression is skipped for the `encoded_data` dataset of
- :class:`.VectorOfEncodedVectors` and
- :class`.ArrayOfEncodedEqualSizedArrays`.
+ HDF5 compression is skipped for the `encoded_data.flattened_data`
+ dataset of :class:`.VectorOfEncodedVectors` and
+ :class:`.ArrayOfEncodedEqualSizedArrays`.

  Parameters
  ----------
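With this scheme an LGDO can carry its own HDF5 dataset options. A hedged sketch (import paths and the `Array` constructor are assumptions based on the names in this diff):

    import numpy as np
    from lgdo.lh5_store import LH5Store
    from lgdo.types import Array  # assumed import location

    arr = Array(nda=np.arange(100_000, dtype="float32"))
    # forwarded verbatim to create_dataset; wins over DEFAULT_HDF5_SETTINGS
    # and over a plain "compression" attribute
    arr.attrs["hdf5_settings"] = {"compression": "lzf", "shuffle": False}

    LH5Store().write_object(arr, "arr", "settings-demo.lh5")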
@@ -840,15 +934,17 @@ class LH5Store:
  write_start
  row in the output file (if already existing) to start overwriting
  from.
- hdf5_compression
- HDF5 compression filter to be applied before writing non-scalar
- datasets. **Ignored if compression is specified as an `obj` attribute.**
+ **h5py_kwargs
+ additional keyword arguments forwarded to
+ :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5
+ compression filter to be applied before writing non-scalar
+ datasets. **Note: the `compression` keyword argument is ignored if
+ compression is specified as an `obj` attribute.**
  """
  log.debug(
  f"writing {repr(obj)}[{start_row}:{n_rows}] as "
  f"{lh5_file}:{group}/{name}[{write_start}:], "
- f"mode = {wo_mode}, hdf5_compression = {hdf5_compression}"
+ f"mode = {wo_mode}, h5py_kwargs = {h5py_kwargs}"
  )

  if wo_mode == "write_safe":
@@ -921,8 +1017,8 @@ class LH5Store:
  for field in obj.keys():
  # compress waveform table values with LGDO's custom
  # codecs before writing, where applicable
- # if waveformtable.values.attrs["compression"] is a string,
- # interpret it as an HDF5 built-in filter
+ # if waveformtable.values.attrs["compression"] is NOT a
+ # WaveformCodec, just leave it there
  obj_fld = None
  if (
  isinstance(obj, WaveformTable)
@@ -948,7 +1044,7 @@ class LH5Store:
  n_rows=n_rows,
  wo_mode=wo_mode,
  write_start=write_start,
- hdf5_compression=hdf5_compression,
+ **h5py_kwargs,
  )
  return

@@ -972,6 +1068,9 @@ class LH5Store:
  name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
  )

+ # ask not to further compress flattened_data, it is already compressed!
+ obj.encoded_data.flattened_data.attrs["compression"] = None
+
  self.write_object(
  obj.encoded_data,
  "encoded_data",
@@ -981,7 +1080,7 @@ class LH5Store:
  n_rows=n_rows,
  wo_mode=wo_mode,
  write_start=write_start,
- hdf5_compression=None, # data is already compressed!
+ **h5py_kwargs,
  )

  self.write_object(
@@ -993,7 +1092,7 @@ class LH5Store:
  n_rows=n_rows,
  wo_mode=wo_mode,
  write_start=write_start,
- hdf5_compression=hdf5_compression,
+ **h5py_kwargs,
  )

  # vector of vectors
@@ -1029,7 +1128,7 @@ class LH5Store:
  n_rows=fd_n_rows,
  wo_mode=wo_mode,
  write_start=offset,
- hdf5_compression=hdf5_compression,
+ **h5py_kwargs,
  )

  # now offset is used to give appropriate in-file values for
@@ -1052,7 +1151,7 @@ class LH5Store:
  n_rows=n_rows,
  wo_mode=wo_mode,
  write_start=write_start,
- hdf5_compression=hdf5_compression,
+ **h5py_kwargs,
  )
  obj.cumulative_length.nda -= cl_dtype(offset)

@@ -1072,29 +1171,39 @@ class LH5Store:
  # need to create dataset from ndarray the first time for speed
  # creating an empty dataset and appending to that is super slow!
  if (wo_mode != "a" and write_start == 0) or name not in group:
+ # this is needed in order to have a resizable (in the first
+ # axis) data set, i.e. rows can be appended later
+ # NOTE: this automatically turns chunking on!
  maxshape = (None,) + nda.shape[1:]
+ h5py_kwargs.setdefault("maxshape", maxshape)
+
  if wo_mode == "o" and name in group:
  log.debug(f"overwriting {name} in {group}")
  del group[name]

+ # set default compression options
+ for k, v in DEFAULT_HDF5_SETTINGS.items():
+ h5py_kwargs.setdefault(k, v)
+
+ # compress using the 'compression' LGDO attribute, if available
+ if "compression" in obj.attrs:
+ comp_algo = obj.attrs["compression"]
+ if isinstance(comp_algo, dict):
+ h5py_kwargs |= obj.attrs["compression"]
+ else:
+ h5py_kwargs["compression"] = obj.attrs["compression"]
+
+ # the 'hdf5_settings' attribute, if present, takes precedence
+ if "hdf5_settings" in obj.attrs:
+ h5py_kwargs |= obj.attrs["hdf5_settings"]
+
  # create HDF5 dataset
- # - compress using the 'compression' LGDO attribute, if
- # available
- # - otherwise use "hdf5_compression"
- # - attach HDF5 dataset attributes, but not "compression"!
- comp_algo = obj.attrs.get("compression", hdf5_compression)
- comp_kwargs = {}
- if isinstance(comp_algo, str):
- comp_kwargs = {"compression": comp_algo}
- elif comp_algo is not None:
- comp_kwargs = comp_algo
-
- ds = group.create_dataset(
- name, data=nda, maxshape=maxshape, **comp_kwargs
- )
+ ds = group.create_dataset(name, data=nda, **h5py_kwargs)

+ # attach HDF5 dataset attributes, but not "compression"!
  _attrs = obj.getattrs(datatype=True)
  _attrs.pop("compression", None)
+ _attrs.pop("hdf5_settings", None)
  ds.attrs.update(_attrs)
  return

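The keyword-resolution logic above reduces to three layers, later ones winning. A standalone restatement of the merge order:

    DEFAULT_HDF5_SETTINGS = {"shuffle": True, "compression": "gzip"}

    def resolve_h5py_kwargs(attrs: dict, **h5py_kwargs) -> dict:
        # 1. explicit keyword arguments, with gaps filled by the defaults
        for k, v in DEFAULT_HDF5_SETTINGS.items():
            h5py_kwargs.setdefault(k, v)
        # 2. the 'compression' attribute (filter name or dict of options)
        if "compression" in attrs:
            c = attrs["compression"]
            h5py_kwargs |= c if isinstance(c, dict) else {"compression": c}
        # 3. 'hdf5_settings' takes precedence over everything else
        if "hdf5_settings" in attrs:
            h5py_kwargs |= attrs["hdf5_settings"]
        return h5py_kwargs

    print(resolve_h5py_kwargs({"compression": "lzf"}))
    # {'shuffle': True, 'compression': 'lzf'}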
lgdo/types/scalar.py CHANGED
@@ -18,7 +18,7 @@ class Scalar(LGDO):

  # TODO: do scalars need proper numpy dtypes?

- def __init__(self, value: int | float, attrs: dict[str, Any] = None) -> None:
+ def __init__(self, value: int | float | str, attrs: dict[str, Any] = None) -> None:
  """
  Parameters
  ----------
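The widened hint makes string scalars (units, labels and the like) explicitly legal:

    from lgdo.types import Scalar  # assumed import path

    n = Scalar(42)
    unit = Scalar("keV")  # now covered by the annotation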
lgdo/types/table.py CHANGED
@@ -225,7 +225,10 @@ class Table(Struct):
  if not hasattr(column, "nda"):
  raise ValueError(f"column {col} does not have an nda")
  else:
- df[prefix + str(col)] = column.nda.tolist()
+ if len(column.nda.shape) == 1:
+ df[prefix + str(col)] = column.nda
+ else:
+ df[prefix + str(col)] = column.nda.tolist()

  return df

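The branch avoids the costly `tolist()` detour when a column is already one-dimensional; only multi-dimensional columns still need a list-of-rows so pandas stores one object per cell. In isolation:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame()
    df["energy"] = np.arange(4)                           # 1-D: assign the ndarray directly
    df["waveform"] = np.arange(8).reshape(4, 2).tolist()  # 2-D: one list per row
    print(df)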
@@ -248,8 +251,8 @@ class Table(Struct):
  "O1": {
  "expression": "p1 + p2 * a**2",
  "parameters": {
- "p1": "2",
- "p2": "3"
+ "p1": 2,
+ "p2": 3
  }
  },
  "O2": {