PyPI - legend-pydataobj - Versions diffs - 1.0.0__py3-none-any.whl - Mend

legend-pydataobj 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

legend_pydataobj-1.0.0.dist-info/LICENSE +674 -0
legend_pydataobj-1.0.0.dist-info/METADATA +63 -0
legend_pydataobj-1.0.0.dist-info/RECORD +26 -0
legend_pydataobj-1.0.0.dist-info/WHEEL +5 -0
legend_pydataobj-1.0.0.dist-info/top_level.txt +1 -0
lgdo/__init__.py +75 -0
lgdo/_version.py +4 -0
lgdo/compression/__init__.py +36 -0
lgdo/compression/base.py +29 -0
lgdo/compression/generic.py +77 -0
lgdo/compression/radware.py +579 -0
lgdo/compression/utils.py +34 -0
lgdo/compression/varlen.py +449 -0
lgdo/lgdo_utils.py +196 -0
lgdo/lh5_store.py +1711 -0
lgdo/types/__init__.py +30 -0
lgdo/types/array.py +140 -0
lgdo/types/arrayofequalsizedarrays.py +133 -0
lgdo/types/encoded.py +390 -0
lgdo/types/fixedsizearray.py +43 -0
lgdo/types/lgdo.py +51 -0
lgdo/types/scalar.py +59 -0
lgdo/types/struct.py +108 -0
lgdo/types/table.py +349 -0
lgdo/types/vectorofvectors.py +627 -0
lgdo/types/waveform_table.py +264 -0

lgdo/compression/varlen.py ADDED Viewed

@@ -0,0 +1,449 @@
+"""Variable-length code compression algorithms."""
+from __future__ import annotations
+import logging
+from dataclasses import dataclass
+import numba
+import numpy as np
+from numpy import int32, ubyte, uint32
+from numpy.typing import NDArray
+from .. import types as lgdo
+from .base import WaveformCodec
+log = logging.getLogger(__name__)
+@dataclass(frozen=True)
+class ULEB128ZigZagDiff(WaveformCodec):
+    """ZigZag [#WikiZZ]_ encoding followed by Unsigned Little Endian Base 128 (ULEB128) [#WikiULEB128]_ encoding of array differences.
+    Immutable (frozen dataclass) descriptor of the codec implemented by the
+    :func:`encode` and :func:`decode` functions of this module.
+    .. [#WikiZZ] https://wikipedia.org/wiki/Variable-length_quantity#Zigzag_encoding
+    .. [#WikiULEB128] https://wikipedia.org/wiki/LEB128#Unsigned_LEB128
+    """
+    # string name of this codec
+    codec: str = "uleb128_zigzag_diff"
+def encode(
+    sig_in: NDArray | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays,
+    sig_out: NDArray[ubyte] = None,
+) -> (NDArray[ubyte], NDArray[uint32]) | lgdo.VectorOfEncodedVectors:
+    """Compress digital signal(s) with a variable-length encoding of its derivative.
+    Wraps :func:`uleb128_zigzag_diff_array_encode` and adds support for encoding
+    LGDOs.
+    Note
+    ----
+    If `sig_in` is a NumPy array, no resizing of `sig_out` is performed. Not
+    even of the internally allocated one.
+    Because of the current implementation, providing a pre-allocated
+    :class:`.VectorOfEncodedVectors` as `sig_out` is not possible.
+    Parameters
+    ----------
+    sig_in
+        array(s) holding the input signal(s).
+    sig_out
+        pre-allocated unsigned 8-bit integer array(s) for the compressed
+        signal(s). If not provided, a new one will be allocated.
+    Returns
+    -------
+    sig_out, nbytes
+        given pre-allocated `sig_out` structure or new structure of unsigned
+        8-bit integers, plus the number of bytes (length) of the encoded
+        signal. If `sig_in` is an :class:`.LGDO`, only a newly allocated
+        :class:`.VectorOfEncodedVectors` is returned.
+    See Also
+    --------
+    uleb128_zigzag_diff_array_encode
+    """
+    if isinstance(sig_in, np.ndarray):
+        s = sig_in.shape
+        if len(sig_in) == 0:
+            return np.empty(s[:-1] + (0,), dtype=ubyte), np.empty(0, dtype=uint32)
+        if sig_out is None:
+            # the encoded signal is an array of bytes
+            # pre-allocate ubyte (uint8) array with a generous (but safe) size
+            max_b = int(np.ceil(np.iinfo(sig_in.dtype).bits / 16) * 5)
+            # expand last dimension
+            sig_out = np.empty(s[:-1] + (s[-1] * max_b,), dtype=ubyte)
+        if sig_out.dtype != ubyte:
+            raise ValueError("sig_out must be of type ubyte")
+        # nbytes has one dimension less (the last one)
+        nbytes = np.empty(s[:-1], dtype=uint32)
+        uleb128_zigzag_diff_array_encode(sig_in, sig_out, nbytes)
+        # return without resizing
+        return sig_out, nbytes
+    elif isinstance(sig_in, lgdo.VectorOfVectors):
+        if sig_out:
+            log.warning(
+                "a pre-allocated VectorOfEncodedVectors was given "
+                "to hold an encoded ArrayOfEqualSizedArrays. "
+                "This is not supported at the moment, so a new one "
+                "will be allocated to replace it"
+            )
+        # convert VectorOfVectors to ArrayOfEqualSizedArrays so it can be
+        # directly passed to the low-level encoding routine
+        sig_out_nda, nbytes = encode(sig_in.to_aoesa())
+        # build the encoded LGDO
+        encoded_data = lgdo.ArrayOfEqualSizedArrays(nda=sig_out_nda).to_vov(
+            cumulative_length=np.cumsum(nbytes, dtype=uint32)
+        )
+        # decoded_size is an array, compute it by diff'ing the original VOV
+        decoded_size = np.diff(sig_in.cumulative_length, prepend=uint32(0))
+        sig_out = lgdo.VectorOfEncodedVectors(encoded_data, decoded_size)
+        return sig_out
+    elif isinstance(sig_in, lgdo.ArrayOfEqualSizedArrays):
+        if sig_out:
+            log.warning(
+                "a pre-allocated VectorOfEncodedVectors was given "
+                "to hold an encoded ArrayOfEqualSizedArrays. "
+                "This is not supported at the moment, so a new one "
+                "will be allocated to replace it"
+            )
+        # encode the internal numpy array
+        sig_out_nda, nbytes = encode(sig_in.nda)
+        # build the encoded LGDO
+        encoded_data = lgdo.ArrayOfEqualSizedArrays(nda=sig_out_nda).to_vov(
+            cumulative_length=np.cumsum(nbytes, dtype=uint32)
+        )
+        sig_out = lgdo.ArrayOfEncodedEqualSizedArrays(
+            encoded_data, decoded_size=sig_in.nda.shape[1]
+        )
+        return sig_out
+    elif isinstance(sig_in, lgdo.Array):
+        # encode the internal numpy array
+        sig_out_nda, nbytes = encode(sig_in.nda, sig_out)
+        return lgdo.Array(sig_out_nda), nbytes
+    else:
+        raise ValueError(f"unsupported input signal type ({type(sig_in)})")
+def decode(
+    sig_in: (NDArray[ubyte], NDArray[uint32]) | lgdo.VectorOfEncodedVectors,
+    sig_out: NDArray | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays = None,
+) -> NDArray | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays:
+    """Deompress digital signal(s) with a variable-length encoding of its derivative.
+    Wraps :func:`uleb128_zigzag_diff_array_decode` and adds support for decoding
+    LGDOs.
+    Note
+    ----
+    If `sig_in` is a NumPy array, no resizing (along the last dimension) of
+    `sig_out` to its actual length is performed. Not even of the internally
+    allocated one. If a pre-allocated :class:`.ArrayOfEqualSizedArrays` is
+    provided, it won't be resized too. The internally allocated
+    :class:`.ArrayOfEqualSizedArrays` `sig_out` has instead always the correct
+    size.
+    Because of the current implementation, providing a pre-allocated
+    :class:`.VectorOfVectors` as `sig_out` is not possible.
+    Parameters
+    ----------
+    sig_in
+        array(s) holding the input, compressed signal(s). Output of
+        :func:`.encode`.
+    sig_out
+        pre-allocated array(s) for the decompressed signal(s).  If not
+        provided, will allocate a 32-bit integer array(s) structure.
+    Returns
+    -------
+    sig_out
+        given pre-allocated structure or new structure of 32-bit integers.
+    See Also
+    --------
+    uleb128_zigzag_diff_array_decode
+    """
+    # expect the output of encode()
+    if isinstance(sig_in, tuple):
+        if sig_out is None:
+            # allocate output array of the same shape (generous)
+            sig_out = np.empty_like(sig_in[0], dtype=int32)
+        # siglen has one dimension less (the last)
+        s = sig_in[0].shape
+        siglen = np.empty(s[:-1], dtype=uint32)
+        if len(sig_in[0]) == 0:
+            return sig_out, siglen
+        # call low-level routine
+        uleb128_zigzag_diff_array_decode(sig_in[0], sig_in[1], sig_out, siglen)
+        return sig_out, siglen
+    elif isinstance(sig_in, lgdo.ArrayOfEncodedEqualSizedArrays):
+        if not sig_out:
+            # initialize output structure with decoded_size
+            sig_out = lgdo.ArrayOfEqualSizedArrays(
+                dims=(1, 1),
+                shape=(len(sig_in), sig_in.decoded_size.value),
+                dtype=int32,
+                attrs=sig_in.getattrs(),
+            )
+        siglen = np.empty(len(sig_in), dtype=uint32)
+        # save original encoded vector lengths
+        nbytes = np.diff(sig_in.encoded_data.cumulative_length.nda, prepend=uint32(0))
+        if len(sig_in) == 0:
+            return sig_out
+        # convert vector of vectors to array of equal sized arrays
+        # can now decode on the 2D matrix together with number of bytes to read per row
+        _, siglen = decode(
+            (sig_in.encoded_data.to_aoesa(preserve_dtype=True).nda, nbytes), sig_out.nda
+        )
+        # sanity check
+        assert np.all(sig_in.decoded_size.value == siglen)
+        return sig_out
+    elif isinstance(sig_in, lgdo.VectorOfEncodedVectors):
+        if sig_out:
+            log.warning(
+                "a pre-allocated VectorOfVectors was given "
+                "to hold an encoded VectorOfVectors. "
+                "This is not supported at the moment, so a new one "
+                "will be allocated to replace it"
+            )
+        siglen = np.empty(len(sig_in), dtype=uint32)
+        # save original encoded vector lengths
+        nbytes = np.diff(sig_in.encoded_data.cumulative_length.nda, prepend=uint32(0))
+        # convert vector of vectors to array of equal sized arrays
+        # can now decode on the 2D matrix together with number of bytes to read per row
+        sig_out, siglen = decode(
+            (sig_in.encoded_data.to_aoesa(preserve_dtype=True).nda, nbytes)
+        )
+        # sanity check
+        assert np.array_equal(sig_in.decoded_size, siglen)
+        # converto to VOV before returning
+        return sig_out.to_vov(np.cumsum(siglen, dtype=uint32))
+    else:
+        raise ValueError("unsupported input signal type")
+@numba.vectorize(
+    ["uint64(int64)", "uint32(int32)", "uint16(int16)"],
+    nopython=True,
+)
+def zigzag_encode(x: int | NDArray[int]) -> int | NDArray[int]:
+    """ZigZag-encode [#WikiZZ]_ signed integer numbers."""
+    return (x >> 31) ^ (x << 1)
+@numba.vectorize(
+    ["int64(uint64)", "int32(uint32)", "int16(uint16)"],
+    nopython=True,
+)
+def zigzag_decode(x: int | NDArray[int]) -> int | NDArray[int]:
+    """ZigZag-decode [#WikiZZ]_ signed integer numbers.
+    Inverse of the ZigZag mapping: even codes decode to non-negative numbers
+    (``x / 2``), odd codes to negative ones (``-(x + 1) / 2``).
+    Parameters
+    ----------
+    x
+        unsigned 16-, 32- or 64-bit ZigZag code(s).
+    Returns
+    -------
+    decoded
+        signed integer(s) of the same width as `x`.
+    """
+    # (x >> 1) drops the lowest bit, which carries the sign; -(x & 1) is an
+    # all-ones mask when that bit is set, so the XOR flips every bit of the
+    # magnitude for negative numbers
+    return (x >> 1) ^ -(x & 1)
+@numba.jit(["uint32(int64, byte[:])"], nopython=True)
+def uleb128_encode(x: int, encx: NDArray[ubyte]) -> int:
+    """Compute a variable-length representation of an unsigned integer.
+    Implements the Unsigned Little Endian Base-128 encoding [#WikiULEB128]_.
+    Only positive numbers are expected, as no *two’s complement* is applied.
+    Parameters
+    ----------
+    x
+        the number to be encoded.
+    encx
+        the encoded varint as a NumPy array of bytes.
+    Returns
+    -------
+    nbytes
+        size of varint in bytes
+    """
+    i = 0
+    bits = x & 0x7F
+    x >>= 7
+    while x:
+        encx[i] = 0x80 | bits
+        bits = x & 0x7F
+        i += 1
+        x >>= 7
+    encx[i] = bits
+    # return size of varint in bytes
+    return i + 1
+@numba.jit(["UniTuple(uint32, 2)(byte[:])"], nopython=True)
+def uleb128_decode(encx: NDArray[ubyte]) -> (int, int):
+    """Decode a variable-length integer into an unsigned integer.
+    Implements the Unsigned Little Endian Base-128 decoding [#WikiULEB128]_.
+    Only encoded positive numbers are expected, as no *two’s complement* is
+    applied. Decoding stops at the first byte whose most significant bit is
+    not set, any following byte in `encx` is ignored.
+    Parameters
+    ----------
+    encx
+        the encoded varint as a NumPy array of bytes.
+    Returns
+    -------
+    x, nread
+        the decoded value and the number of bytes read from the input array.
+    Raises
+    ------
+    ValueError
+        if `encx` is empty.
+    OverflowError
+        if the bit offset reaches 64 before a terminating byte is found.
+    RuntimeError
+        if `encx` ends before a terminating byte is found.
+    """
+    if len(encx) <= 0:
+        raise ValueError("input bytes array is empty")
+    # x accumulates the decoded value, pos is the bit offset of the next group
+    x = pos = uint32(0)
+    for b in encx:
+        # the lowest 7 bits of each byte are payload
+        x = x | ((b & 0x7F) << pos)
+        # a cleared most significant bit marks the last byte of the varint
+        if (b & 0x80) == 0:
+            # pos is a multiple of 7: pos / 7 + 1 is the number of bytes read
+            return (x, int(pos / 7 + 1))
+        else:
+            pos += 7
+        # NOTE(review): the declared return type is uint32 while this guard
+        # only triggers at 64 bits -- confirm that varints wider than 32 bits
+        # are not expected here
+        if pos >= 64:
+            raise OverflowError("overflow during decoding of varint encoded number")
+    raise RuntimeError("malformed varint")
+@numba.guvectorize(
+    [
+        "void(uint16[:], byte[:], uint32[:])",
+        "void(uint32[:], byte[:], uint32[:])",
+        "void(uint64[:], byte[:], uint32[:])",
+        "void(int16[:], byte[:], uint32[:])",
+        "void(int32[:], byte[:], uint32[:])",
+        "void(int64[:], byte[:], uint32[:])",
+    ],
+    "(n),(m),()",
+    nopython=True,
+)
+def uleb128_zigzag_diff_array_encode(
+    sig_in: NDArray[int], sig_out: NDArray[ubyte], nbytes: int
+) -> None:
+    """Encode an array of integer numbers.
+    The algorithm computes the derivative (prepending 0 first) of `sig_in`,
+    maps it to positive numbers by applying :func:`zigzag_encode` and finally
+    computes its variable-length binary representation with
+    :func:`uleb128_encode`.
+    The encoded data is stored in `sig_out` as an array of bytes. The number of
+    bytes written is stored in `nbytes`. The actual encoded data can therefore
+    be found in ``sig_out[:nbytes]``.
+    Parameters
+    ----------
+    sig_in
+        the input array of integers.
+    sig_out
+        pre-allocated bytes array for the output encoded data. No size check
+        is performed in this routine, the caller must provide enough room.
+    nbytes
+        pre-allocated output array holding the number of bytes written (stored
+        in the first index).
+    See Also
+    --------
+    .uleb128_zigzag_diff_array_decode
+    """
+    # pos: write offset into sig_out; last: previous sample (0 for the first)
+    pos = uint32(0)
+    last = int32(0)
+    for s in sig_in:
+        # the difference is cast to int32 before being ZigZag-encoded
+        # NOTE(review): for 64-bit (or unsigned 32-bit) inputs, differences
+        # outside the int32 range presumably do not survive this cast --
+        # confirm the supported value range
+        zzdiff = zigzag_encode(int32(s - last))
+        # append the varint right after the previously written bytes
+        pos += uleb128_encode(zzdiff, sig_out[pos:])
+        last = s
+    nbytes[0] = pos
+@numba.guvectorize(
+    [
+        "void(byte[:], uint32[:], uint16[:], uint32[:])",
+        "void(byte[:], uint32[:], uint32[:], uint32[:])",
+        "void(byte[:], uint32[:], uint64[:], uint32[:])",
+        "void(byte[:], uint32[:], int16[:], uint32[:])",
+        "void(byte[:], uint32[:], int32[:], uint32[:])",
+        "void(byte[:], uint32[:], int64[:], uint32[:])",
+    ],
+    "(n),(),(m),()",
+    nopython=True,
+)
+def uleb128_zigzag_diff_array_decode(
+    sig_in: NDArray[ubyte],
+    nbytes: int,
+    sig_out: NDArray[int],
+    siglen: int,
+) -> None:
+    """Decode an array of variable-length integers.
+    The algorithm inverts :func:`.uleb128_zigzag_diff_array_encode` by decoding
+    the variable-length binary data in `sig_in` with :func:`uleb128_decode`,
+    then reconstructing the original signal derivative with
+    :func:`zigzag_decode` and finally computing its cumulative (i.e. the
+    original signal).
+    Parameters
+    ----------
+    sig_in
+        the array of bytes encoding the variable-length integers.
+    nbytes
+        the number of bytes to read from `sig_in` (stored in the first index of
+        this array). Values larger than the length of `sig_in` are clipped to
+        it.
+    sig_out
+        pre-allocated array for the output decoded signal. No size check is
+        performed in this routine, the caller must provide enough room.
+    siglen
+        the length of the decoded signal, (stored in the first index of this
+        array).
+    Raises
+    ------
+    ValueError
+        if `sig_in` is empty.
+    See Also
+    --------
+    .uleb128_zigzag_diff_array_encode
+    """
+    if len(sig_in) <= 0:
+        raise ValueError("input bytes array is empty")
+    # never read more bytes than sig_in actually holds
+    _nbytes = min(nbytes[0], len(sig_in))
+    # pos: read offset into sig_in; i: write index into sig_out
+    pos = i = uint32(0)
+    # running sum of the decoded differences (0 before the first sample)
+    last = int32(0)
+    while pos < _nbytes:
+        # the slice handed over extends to the end of sig_in, not to _nbytes:
+        # uleb128_decode() stops by itself at the end of each varint
+        x, nread = uleb128_decode(sig_in[pos:])
+        # undo the differentiation by accumulating the decoded differences
+        sig_out[i] = last = zigzag_decode(x) + last
+        i += 1
+        pos += nread
+    siglen[0] = i

lgdo/lgdo_utils.py ADDED Viewed

@@ -0,0 +1,196 @@
+"""Implements utilities for LEGEND Data Objects."""
+from __future__ import annotations
+import glob
+import logging
+import os
+import string
+import numpy as np
+from . import types as lgdo
+log = logging.getLogger(__name__)
+def get_element_type(obj: object) -> str:
+    """Get the LGDO element type of a scalar or array.
+    For use in LGDO datatype attributes.
+    Parameters
+    ----------
+    obj
+        if a ``str``, will automatically return ``string`` if the object has
+        a :class:`numpy.dtype`, that will be used for determining the element
+        type otherwise will attempt to case the type of the object to a
+        :class:`numpy.dtype`.
+    Returns
+    -------
+    element_type
+        A string stating the determined element type of the object.
+    """
+    # special handling for strings
+    if isinstance(obj, str):
+        return "string"
+    # the rest use dtypes
+    dt = obj.dtype if hasattr(obj, "dtype") else np.dtype(type(obj))
+    kind = dt.kind
+    if kind == "b":
+        return "bool"
+    if kind == "V":
+        return "blob"
+    if kind in ["i", "u", "f"]:
+        return "real"
+    if kind == "c":
+        return "complex"
+    if kind in ["S", "U"]:
+        return "string"
+    # couldn't figure it out
+    raise ValueError(
+        "cannot determine lgdo element_type for object of type", type(obj).__name__
+    )
+def copy(obj: lgdo.LGDO, dtype: np.dtype = None) -> lgdo.LGDO:
+    """Return a copy of an LGDO.
+    Parameters
+    ----------
+    obj
+        the LGDO to be copied.
+    dtype
+        NumPy dtype to be used for the copied object.
+    """
+    if dtype is None:
+        dtype = obj.dtype
+    if isinstance(obj, lgdo.Array):
+        return lgdo.Array(
+            np.array(obj.nda, dtype=dtype, copy=True), attrs=dict(obj.attrs)
+        )
+    if isinstance(obj, lgdo.VectorOfVectors):
+        return lgdo.VectorOfVectors(
+            flattened_data=copy(obj.flattened_data, dtype=dtype),
+            cumulative_length=copy(obj.cumulative_length),
+            attrs=dict(obj.attrs),
+        )
+    else:
+        raise ValueError(f"copy of {type(obj)} not supported")
+def parse_datatype(datatype: str) -> tuple[str, tuple[int, ...], str | list[str]]:
+    """Parse datatype string and return type, dimensions and elements.
+    Parameters
+    ----------
+    datatype
+        a LGDO-formatted datatype string.
+    Returns
+    -------
+    element_type
+        the datatype name dims if not ``None``, a tuple of dimensions for the
+        LGDO. Note this is not the same as the NumPy shape of the underlying
+        data object. See the LGDO specification for more information. Also see
+        :class:`~.types.ArrayOfEqualSizedArrays` and
+        :meth:`.lh5_store.LH5Store.read_object` for example code elements for
+        numeric objects, the element type for struct-like  objects, the list of
+        fields in the struct.
+    """
+    if "{" not in datatype:
+        return "scalar", None, datatype
+    # for other datatypes, need to parse the datatype string
+    from parse import parse
+    datatype, element_description = parse("{}{{{}}}", datatype)
+    if datatype.endswith(">"):
+        datatype, dims = parse("{}<{}>", datatype)
+        dims = [int(i) for i in dims.split(",")]
+        return datatype, tuple(dims), element_description
+    else:
+        return datatype, None, element_description.split(",")
+def expand_vars(expr: str, substitute: dict[str, str] = None) -> str:
+    """Expand (environment) variables.
+    Note
+    ----
+    Malformed variable names and references to non-existing variables are left
+    unchanged.
+    Parameters
+    ----------
+    expr
+        string expression, which may include (environment) variables prefixed by
+        ``$``.
+    substitute
+        use this dictionary to substitute variables. Environment variables take
+        precedence.
+    """
+    if substitute is None:
+        substitute = {}
+    # expand env variables first
+    # then try using provided mapping
+    return string.Template(os.path.expandvars(expr)).safe_substitute(substitute)
+def expand_path(
+    path: str,
+    substitute: dict[str, str] = None,
+    list: bool = False,
+    base_path: str = None,
+) -> str | list:
+    """Expand (environment) variables and wildcards to return absolute paths.
+    Parameters
+    ----------
+    path
+        name of path, which may include environment variables and wildcards.
+    list
+        if ``True``, return a list. If ``False``, return a string; if ``False``
+        and a unique file is not found, raise an exception.
+    substitute
+        use this dictionary to substitute variables. Environment variables take
+        precedence.
+    base_path
+        name of base path. Returned paths will be relative to base.
+    Returns
+    -------
+    path or list of paths
+        Unique absolute path, or list of all absolute paths
+    """
+    if base_path is not None and base_path != "":
+        base_path = os.path.expanduser(os.path.expandvars(base_path))
+        path = os.path.join(base_path, path)
+    # first expand variables
+    _path = expand_vars(path, substitute)
+    # then expand wildcards
+    paths = glob.glob(os.path.expanduser(_path))
+    if base_path is not None and base_path != "":
+        paths = [os.path.relpath(p, base_path) for p in paths]
+    if not list:
+        if len(paths) == 0:
+            raise FileNotFoundError(f"could not find path matching {path}")
+        elif len(paths) > 1:
+            raise FileNotFoundError(f"found multiple paths matching {path}")
+        else:
+            return paths[0]
+    else:
+        return paths