PyPI - legend-pydataobj - Versions diffs - 1.5.1__py3-none-any.whl → 1.6.1__py3-none-any.whl - Mend

legend-pydataobj 1.5.1py3-none-any.whl → 1.6.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

{legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/METADATA +1 -1
legend_pydataobj-1.6.1.dist-info/RECORD +54 -0
{legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/WHEEL +1 -1
{legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/entry_points.txt +1 -0
lgdo/__init__.py +7 -4
lgdo/_version.py +2 -2
lgdo/cli.py +237 -12
lgdo/compression/__init__.py +1 -0
lgdo/lh5/__init__.py +9 -1
lgdo/lh5/_serializers/__init__.py +43 -0
lgdo/lh5/_serializers/read/__init__.py +0 -0
lgdo/lh5/_serializers/read/array.py +34 -0
lgdo/lh5/_serializers/read/composite.py +405 -0
lgdo/lh5/_serializers/read/encoded.py +129 -0
lgdo/lh5/_serializers/read/ndarray.py +104 -0
lgdo/lh5/_serializers/read/scalar.py +34 -0
lgdo/lh5/_serializers/read/utils.py +12 -0
lgdo/lh5/_serializers/read/vector_of_vectors.py +201 -0
lgdo/lh5/_serializers/write/__init__.py +0 -0
lgdo/lh5/_serializers/write/array.py +92 -0
lgdo/lh5/_serializers/write/composite.py +259 -0
lgdo/lh5/_serializers/write/scalar.py +23 -0
lgdo/lh5/_serializers/write/vector_of_vectors.py +95 -0
lgdo/lh5/core.py +272 -0
lgdo/lh5/datatype.py +46 -0
lgdo/lh5/exceptions.py +34 -0
lgdo/lh5/iterator.py +1 -1
lgdo/lh5/store.py +69 -1160
lgdo/lh5/tools.py +27 -53
lgdo/lh5/utils.py +130 -27
lgdo/lh5_store.py +11 -2
lgdo/logging.py +1 -0
lgdo/types/__init__.py +1 -0
lgdo/types/array.py +1 -0
lgdo/types/arrayofequalsizedarrays.py +1 -0
lgdo/types/encoded.py +3 -8
lgdo/types/fixedsizearray.py +1 -0
lgdo/types/struct.py +1 -0
lgdo/types/table.py +46 -5
lgdo/types/vectorofvectors.py +314 -458
lgdo/types/vovutils.py +320 -0
lgdo/types/waveformtable.py +1 -0
lgdo/utils.py +1 -32
legend_pydataobj-1.5.1.dist-info/RECORD +0 -36
{legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/LICENSE +0 -0
{legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/top_level.txt +0 -0

lgdo/types/vectorofvectors.py CHANGED Viewed

@@ -2,23 +2,22 @@
 Implements a LEGEND Data Object representing a variable-length array of
 variable-length arrays and corresponding utilities.
 """
 from __future__ import annotations
-import itertools
 import logging
-from collections.abc import Iterator
+from collections.abc import Iterator, Mapping, Sequence
 from typing import Any
 import awkward as ak
 import awkward_pandas as akpd
-import numba
 import numpy as np
 import pandas as pd
-from numpy.typing import DTypeLike, NDArray
+from numpy.typing import ArrayLike, DTypeLike, NDArray
 from .. import utils
-from ..utils import numba_defaults_kwargs as nb_kwargs
 from . import arrayofequalsizedarrays as aoesa
+from . import vovutils
 from .array import Array
 from .lgdo import LGDO
@@ -26,30 +25,56 @@ log = logging.getLogger(__name__)
 class VectorOfVectors(LGDO):
-    """A variable-length array of variable-length arrays.
+    """An n-dimensional variable-length 1D array of variable-length 1D arrays.
+    If the vector is 2-dimensional, the internal representation is as two NumPy
+    arrays, one to store the flattened data contiguously
+    (:attr:`flattened_data`) and one to store the cumulative sum of lengths of
+    each vector (:attr:`cumulative_length`). When the dimension is more than 2,
+    :attr:`flattened_data` is a :class:`VectorOfVectors` itself.
-    For now only a 1D vector of 1D vectors is supported. Internal representation
-    is as two NumPy arrays, one to store the flattened data contiguosly and one
-    to store the cumulative sum of lengths of each vector.
+    Examples
+    --------
+    >>> from lgdo import VectorOfVectors
+    >>> data = VectorOfVectors(
+    ...   [[[1, 2], [3, 4, 5]], [[2], [4, 8, 9, 7]], [[5, 3, 1]]],
+    ...   attrs={"units": "m"}
+    ... )
+    >>> print(data)
+    [[[1, 2], [3, 4, 5]],
+     [[2], [4, 8, 9, 7]],
+     [[5, 3, 1]]
+    ] with attrs={'units': 'm'}
+    >>> data.view_as("ak")
+    <Array [[[1, 2], [3, 4, 5]], ..., [[5, ..., 1]]] type='3 * var * var * int64'>
+    Note
+    ----
+    Many class methods are currently implemented only for 2D vectors and will
+    raise an exception on higher dimensional data.
     """
     def __init__(
         self,
-        array: ak.Array | list[list[int | float]] = None,
-        flattened_data: Array | NDArray = None,
-        cumulative_length: Array | NDArray = None,
-        shape_guess: tuple[int, int] | None = None,
-        dtype: DTypeLike = None,
+        data: ArrayLike | None = None,
+        flattened_data: ArrayLike | None = None,
+        cumulative_length: ArrayLike | VectorOfVectors | None = None,
+        shape_guess: Sequence[int, ...] | None = None,
+        dtype: DTypeLike | None = None,
         fill_val: int | float | None = None,
-        attrs: dict[str, Any] | None = None,
+        attrs: Mapping[str, Any] | None = None,
     ) -> None:
         """
         Parameters
         ----------
-        array
-            create a ``VectorOfVectors`` out of a Python list of lists or an
-            :class:`ak.Array`. Takes priority over `flattened_data` and
-            `cumulative_length`.
+        data
+            Any array-like structure accepted by the :class:`ak.Array`
+            constructor, with the exception that elements cannot be of type
+            ``OptionType``, ``UnionType`` or ``RecordType``.  Takes priority
+            over `flattened_data` and `cumulative_length`. The serialization of
+            the :class:`ak.Array` is performed through :func:`ak.to_buffers`.
+            Since the latter returns non-data-owning NumPy arrays, which would
+            prevent later modifications like resizing, a copy is performed.
         flattened_data
             if not ``None``, used as the internal array for
             `self.flattened_data`.  Otherwise, an internal `flattened_data` is
@@ -74,124 +99,181 @@ class VectorOfVectors(LGDO):
         attrs
             a set of user attributes to be carried along with this LGDO.
         """
-        if array is not None:
-            if isinstance(array, ak.Array):
-                if array.ndim != 2:
-                    msg = (
-                        "cannot initialize a VectorOfVectors with "
-                        f"{array.ndim}-dimensional data"
-                    )
-                    raise ValueError(msg)
-                form, length, container = ak.to_buffers(array)
-                self.__init__(
-                    flattened_data=container["node1-data"],
-                    cumulative_length=container["node0-offsets"][1:],
+        # sanitize
+        if cumulative_length is not None and not isinstance(cumulative_length, Array):
+            cumulative_length = Array(cumulative_length)
+        if flattened_data is not None and not isinstance(
+            flattened_data, (Array, VectorOfVectors)
+        ):
+            flattened_data = Array(flattened_data)
+        if data is not None:
+            if not isinstance(data, ak.Array):
+                data = ak.Array(data)
+            if data.ndim < 2:
+                msg = (
+                    "cannot initialize a VectorOfVectors with "
+                    f"{data.ndim}-dimensional data"
                 )
+                raise ValueError(msg)
-            else:
-                cl_nda = np.cumsum([len(ll) for ll in array])
-                if dtype is None:
-                    if len(cl_nda) == 0 or cl_nda[-1] == 0:
-                        msg = "array can't be empty with dtype=None!"
-                        raise ValueError(msg)
+            # make sure it's not a record array
+            if not vovutils._ak_is_valid(data):
+                msg = "input array type is not supported!"
+                raise ValueError(msg)
-                    # Set dtype from the first element in the list
-                    # Find it efficiently, allowing for zero-length lists as some of the entries
-                    first_element = next(itertools.chain.from_iterable(array))
-                    dtype = type(first_element)
+            # array might be non-jagged! ('container' will hold a ndim NumPy array)
+            if not vovutils._ak_is_jagged(data):
+                data = ak.from_regular(data, axis=None)
+            # ak.to_buffers helps in de-serialization
+            # NOTE: ak.to_packed() needed?
+            form, length, container = ak.to_buffers(ak.to_packed(data))
+            # NOTE: node#-data is not even in the dict if the awkward array is empty
+            # NOTE: if the data arg was a numpy array, to_buffers() preserves
+            # the original dtype
+            # FIXME: have to copy the buffers, otherwise self will not own the
+            # data and self.resize() will fail. Is it possible to avoid this?
+            flattened_data = np.copy(
+                container.pop(f"node{data.ndim-1}-data", np.empty(0, dtype=dtype))
+            )
-                self.dtype = np.dtype(dtype)
-                self.cumulative_length = Array(cl_nda)
-                self.flattened_data = Array(
-                    np.fromiter(itertools.chain.from_iterable(array), dtype=self.dtype)
-                )
+            # if user-provided dtype is different than dtype from Awkward, cast
+            # NOTE: makes a copy only if needed
+            flattened_data = np.asarray(flattened_data, dtype=dtype)
+            # start from innermost VoV and build nested structure
+            for i in range(data.ndim - 2, -1, -1):
+                # NOTE: remember, omit the leading 0 from ak.Array offsets
+                cumulative_length = np.copy(container[f"node{i}-offsets"][1:])
+                if i != 0:
+                    # at the beginning of the loop: initialize innermost
+                    # flattened_data and replace current flattened_data
+                    # reference. in the following iterations flattened_data is
+                    # a VectorOfVectors
+                    flattened_data = VectorOfVectors(
+                        flattened_data=flattened_data,
+                        cumulative_length=cumulative_length,
+                    )
+                else:
+                    # at end we need to initialize self with the latest flattened_data
+                    self.__init__(
+                        flattened_data=flattened_data,
+                        cumulative_length=cumulative_length,
+                    )
         else:
+            self.flattened_data = None
+            self.cumulative_length = None
+            # let's first setup cumulative_length...
             if cumulative_length is None:
-                if shape_guess is None:
-                    # just make an empty vector
-                    self.cumulative_length = Array(np.empty((0,), dtype="uint32"))
                 # initialize based on shape_guess
-                elif shape_guess[1] <= 0:
-                    self.cumulative_length = Array(
-                        shape=(shape_guess[0],), dtype="uint32", fill_val=0
+                if shape_guess is None:
+                    # just make an empty 2D vector
+                    shape_guess = (0, 0)
+                # sanity check
+                if len(shape_guess) < 2:
+                    msg = "shape_guess must be a sequence of 2 integers or more"
+                    raise ValueError(msg)
+                # let Awkward do the job here, we're lazy
+                if fill_val is not None:
+                    self.__init__(
+                        np.full(shape=shape_guess, fill_value=fill_val, dtype=dtype)
                     )
                 else:
-                    self.cumulative_length = Array(
-                        np.arange(
-                            shape_guess[1],
-                            np.prod(shape_guess) + 1,
-                            shape_guess[1],
-                            dtype="uint32",
-                        )
-                    )
+                    self.__init__(np.empty(shape=shape_guess, dtype=dtype))
             else:
-                self.cumulative_length = Array(cumulative_length)
-            if flattened_data is None:
+                # if it's user provided just use it
+                self.cumulative_length = cumulative_length
+            # ...then flattened_data
+            # NOTE: self.flattened_data might have already been initialized
+            # above
+            if flattened_data is None and self.flattened_data is None:
+                # this happens when the cumulative_length arg is not None
                 if dtype is None:
                     msg = "flattened_data and dtype cannot both be None!"
                     raise ValueError(msg)
-                length = 0
-                if cumulative_length is None:
-                    # just make an empty vector or use shape_guess
-                    length = 0 if shape_guess is None else np.prod(shape_guess)
-                else:
-                    # use cumulative_length
-                    length = cumulative_length[-1]
+                # now ready to initialize the object!
                 self.flattened_data = Array(
-                    shape=(length,), dtype=dtype, fill_val=fill_val
+                    shape=(self.cumulative_length[-1],), dtype=dtype, fill_val=fill_val
                 )
-            else:
-                self.flattened_data = Array(flattened_data)
+            elif self.flattened_data is None:
+                self.flattened_data = flattened_data
             # finally set dtype
             self.dtype = self.flattened_data.dtype
+        # set ndim
+        self.ndim = 2
+        pointer = self.flattened_data
+        while True:
+            if isinstance(pointer, Array):
+                break
+            self.ndim += 1
+            pointer = pointer.flattened_data
         super().__init__(attrs)
     def datatype_name(self) -> str:
         return "array"
     def form_datatype(self) -> str:
-        et = utils.get_element_type(self)
-        return "array<1>{array<1>{" + et + "}}"
+        eltype = (
+            "array<1>{" + utils.get_element_type(self) + "}"
+            if self.ndim == 2
+            else self.flattened_data.form_datatype()
+        )
+        return "array<1>{" + eltype + "}"
     def __len__(self) -> int:
-        """Return the number of stored vectors."""
+        """Return the number of stored vectors along the first axis (0)."""
         return len(self.cumulative_length)
     def __eq__(self, other: VectorOfVectors) -> bool:
         if isinstance(other, VectorOfVectors):
+            if self.ndim == 2 and len(self.cumulative_length) != 0:
+                fldata_eq = np.array_equal(
+                    self.flattened_data[: self.cumulative_length[-1]],
+                    other.flattened_data[: other.cumulative_length[-1]],
+                )
+            else:
+                fldata_eq = self.flattened_data == other.flattened_data
             return (
                 self.cumulative_length == other.cumulative_length
-                and (
-                    len(self.cumulative_length) == 0
-                    or np.all(
-                        self.flattened_data[: self.cumulative_length[-1]]
-                        == other.flattened_data[: other.cumulative_length[-1]]
-                    )
-                )
+                and fldata_eq
                 and self.dtype == other.dtype
                 and self.attrs == other.attrs
             )
         return False
-    def __getitem__(self, i: int) -> list:
-        """Return vector at index `i`."""
-        stop = self.cumulative_length[i]
-        if i in (0, -len(self)):
-            return self.flattened_data[0:stop]
+    def __getitem__(self, i: int) -> NDArray:
+        """Return a view of the vector at index `i` along the first axis."""
+        if self.ndim == 2:
+            stop = self.cumulative_length[i]
+            if i in (0, -len(self)):
+                return self.flattened_data[0:stop]
+            return self.flattened_data[self.cumulative_length[i - 1] : stop]
-        return self.flattened_data[self.cumulative_length[i - 1] : stop]
+        raise NotImplementedError
     def __setitem__(self, i: int, new: NDArray) -> None:
-        self.__getitem__(i)[:] = new
+        if self.ndim == 2:
+            self.__getitem__(i)[:] = new
+        else:
+            raise NotImplementedError
     def resize(self, new_size: int) -> None:
         """Resize vector along the first axis.
@@ -220,24 +302,26 @@ class VectorOfVectors(LGDO):
          [3],
         ]
         """
-        vidx = self.cumulative_length
-        old_s = len(self)
-        dlen = new_size - old_s
-        csum = vidx[-1] if len(self) > 0 else 0
-        # first resize the cumulative length
-        self.cumulative_length.resize(new_size)
-        # if new_size > size, new elements are filled with zeros, let's fix
-        # that
-        if dlen > 0:
-            self.cumulative_length[old_s:] = csum
-        # then resize the data array
-        # if dlen > 0 this has no effect
-        if len(self.cumulative_length) > 0:
-            self.flattened_data.resize(self.cumulative_length[-1])
+        if self.ndim == 2:
+            vidx = self.cumulative_length
+            old_s = len(self)
+            dlen = new_size - old_s
+            csum = vidx[-1] if len(self) > 0 else 0
+            # first resize the cumulative length
+            self.cumulative_length.resize(new_size)
+            # if new_size > size, new elements are filled with zeros, let's fix
+            # that
+            if dlen > 0:
+                self.cumulative_length[old_s:] = csum
+            # then resize the data array
+            # if dlen > 0 this has no effect
+            if len(self.cumulative_length) > 0:
+                self.flattened_data.resize(self.cumulative_length[-1])
+        else:
+            raise NotImplementedError
     def append(self, new: NDArray) -> None:
         """Append a 1D vector `new` at the end.
@@ -252,15 +336,20 @@ class VectorOfVectors(LGDO):
          [8 9],
         ]
         """
-        # first extend cumulative_length by +1
-        self.cumulative_length.resize(len(self) + 1)
-        # set it at the right value
-        newlen = self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
-        self.cumulative_length[-1] = newlen
-        # then resize flattened_data to accommodate the new vector
-        self.flattened_data.resize(len(self.flattened_data) + len(new))
-        # finally set it
-        self[-1] = new
+        if self.ndim == 2:
+            # first extend cumulative_length by +1
+            self.cumulative_length.resize(len(self) + 1)
+            # set it at the right value
+            newlen = (
+                self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
+            )
+            self.cumulative_length[-1] = newlen
+            # then resize flattened_data to accommodate the new vector
+            self.flattened_data.resize(len(self.flattened_data) + len(new))
+            # finally set it
+            self[-1] = new
+        else:
+            raise NotImplementedError
     def insert(self, i: int, new: NDArray) -> None:
         """Insert a vector at index `i`.
@@ -283,17 +372,20 @@ class VectorOfVectors(LGDO):
         This method involves a significant amount of memory re-allocation and
         is expected to perform poorly on large vectors.
         """
-        if i >= len(self):
-            msg = f"index {i} is out of bounds for vector owith size {len(self)}"
-            raise IndexError(msg)
+        if self.ndim == 2:
+            if i >= len(self):
+                msg = f"index {i} is out of bounds for vector owith size {len(self)}"
+                raise IndexError(msg)
-        self.flattened_data = Array(
-            np.insert(self.flattened_data, self.cumulative_length[i - 1], new)
-        )
-        self.cumulative_length = Array(
-            np.insert(self.cumulative_length, i, self.cumulative_length[i - 1])
-        )
-        self.cumulative_length[i:] += np.uint32(len(new))
+            self.flattened_data = Array(
+                np.insert(self.flattened_data, self.cumulative_length[i - 1], new)
+            )
+            self.cumulative_length = Array(
+                np.insert(self.cumulative_length, i, self.cumulative_length[i - 1])
+            )
+            self.cumulative_length[i:] += np.uint32(len(new))
+        else:
+            raise NotImplementedError
     def replace(self, i: int, new: NDArray) -> None:
         """Replace the vector at index `i` with `new`.
@@ -316,36 +408,41 @@ class VectorOfVectors(LGDO):
         This method involves a significant amount of memory re-allocation and
         is expected to perform poorly on large vectors.
         """
-        if i >= len(self):
-            msg = f"index {i} is out of bounds for vector with size {len(self)}"
-            raise IndexError(msg)
-        vidx = self.cumulative_length
-        dlen = len(new) - len(self[i])
-        if dlen == 0:
-            # don't waste resources
-            self[i] = new
-        elif dlen < 0:
-            start = vidx[i - 1]
-            stop = start + len(new)
-            # set the already allocated indices
-            self.flattened_data[start:stop] = new
-            # then delete the extra indices
-            self.flattened_data = Array(
-                np.delete(self.flattened_data, np.s_[stop : vidx[i]])
-            )
-        else:
-            # set the already allocated indices
-            self.flattened_data[vidx[i - 1] : vidx[i]] = new[: len(self[i])]
-            # then insert the remaining
-            self.flattened_data = Array(
-                np.insert(self.flattened_data, vidx[i], new[len(self[i]) :])
-            )
+        if self.ndim == 2:
+            if i >= len(self):
+                msg = f"index {i} is out of bounds for vector with size {len(self)}"
+                raise IndexError(msg)
+            vidx = self.cumulative_length
+            dlen = len(new) - len(self[i])
+            if dlen == 0:
+                # don't waste resources
+                self[i] = new
+            elif dlen < 0:
+                start = vidx[i - 1]
+                stop = start + len(new)
+                # set the already allocated indices
+                self.flattened_data[start:stop] = new
+                # then delete the extra indices
+                self.flattened_data = Array(
+                    np.delete(self.flattened_data, np.s_[stop : vidx[i]])
+                )
+            else:
+                # set the already allocated indices
+                self.flattened_data[vidx[i - 1] : vidx[i]] = new[: len(self[i])]
+                # then insert the remaining
+                self.flattened_data = Array(
+                    np.insert(self.flattened_data, vidx[i], new[len(self[i]) :])
+                )
-        vidx[i:] = vidx[i:] + dlen
+            vidx[i:] = vidx[i:] + dlen
+        else:
+            raise NotImplementedError
-    def _set_vector_unsafe(self, i: int, vec: NDArray, lens: NDArray = None) -> None:
+    def _set_vector_unsafe(
+        self, i: int, vec: NDArray, lens: ArrayLike | None = None
+    ) -> None:
         r"""Insert vector `vec` at position `i`.
         Assumes that ``j = self.cumulative_length[i-1]`` is the index (in
@@ -357,9 +454,9 @@ class VectorOfVectors(LGDO):
         behavior. This method is typically used for fast sequential fill of a
         pre-allocated vector of vectors.
-        If vec is 1D array and lens is None, set using full array. If vec
-        is 2D, require lens to be included, and fill each array only up to
-        lengths in lens.
+        If `vec` is a 1D array and `lens` is ``None``, set using full array. If
+        `vec` is 2D, require `lens` to be included, and fill each array only up
+        to lengths in `lens`.
         Danger
         ------
@@ -370,39 +467,47 @@ class VectorOfVectors(LGDO):
         --------
         append, replace, insert
         """
-        start = 0 if i == 0 else self.cumulative_length[i - 1]
-        if len(vec.shape) == 1:
-            vec = np.expand_dims(vec, axis=0)
-            if lens is None:
-                lens = np.array([vec.shape[1]], dtype="u4")
-        if not isinstance(lens, np.ndarray):
-            lens = np.array([lens], dtype="u4")
-        cum_lens = start + lens.cumsum()
-        _nb_fill(vec, lens, self.flattened_data.nda[start : cum_lens[-1]])
-        self.cumulative_length[i : i + len(lens)] = cum_lens
+        if self.ndim == 2:
+            # check if current vector is empty and get the start index in
+            # flattened_data
+            start = 0 if i == 0 else self.cumulative_length[i - 1]
-    def __iter__(self) -> Iterator[NDArray]:
-        for j, stop in enumerate(self.cumulative_length):
-            if j == 0:
-                yield self.flattened_data[0:stop]
-            else:
-                yield self.flattened_data[self.cumulative_length[j - 1] : stop]
+            # if the new element is 1D, convert to dummy 2D
+            if len(vec.shape) == 1:
+                vec = np.expand_dims(vec, axis=0)
+                if lens is None:
+                    lens = np.array([vec.shape[1]], dtype="u4")
-    def __str__(self) -> str:
-        string = ""
-        pos = 0
-        for vec in self:
-            if pos != 0:
-                string += " "
+            # in case lens is 0D (a scalar), convert to 1D
+            if not isinstance(lens, np.ndarray):
+                lens = np.array([lens], dtype="u4")
-            string += np.array2string(vec, prefix=" ")
+            # calculate stop index in flattened_data
+            cum_lens = start + lens.cumsum()
-            if pos < len(self.cumulative_length):
-                string += ",\n"
+            # fill with fast vectorized routine
+            vovutils._nb_fill(vec, lens, self.flattened_data.nda[start : cum_lens[-1]])
-            pos += 1
+            # add new vector(s) length to cumulative_length
+            self.cumulative_length[i : i + len(lens)] = cum_lens
+        else:
+            raise NotImplementedError
-        string = f"[{string}]"
+    def __iter__(self) -> Iterator[NDArray]:
+        if self.ndim == 2:
+            for j, stop in enumerate(self.cumulative_length):
+                if j == 0:
+                    yield self.flattened_data[0:stop]
+                else:
+                    yield self.flattened_data[self.cumulative_length[j - 1] : stop]
+        else:
+            raise NotImplementedError
+    def __str__(self) -> str:
+        string = self.view_as("ak").show(stream=None)
+        string = string.strip().removesuffix("]")
+        string += "\n]"
         tmp_attrs = self.attrs.copy()
         tmp_attrs.pop("datatype")
@@ -457,19 +562,22 @@ class VectorOfVectors(LGDO):
             original vector of vectors. The type `fill_val` must be a
             compatible one.
         """
-        ak_arr = self.view_as("ak")
+        if self.ndim == 2:
+            ak_arr = self.view_as("ak")
-        if max_len is None:
-            max_len = int(ak.max(ak.count(ak_arr, axis=-1)))
+            if max_len is None:
+                max_len = int(ak.max(ak.count(ak_arr, axis=-1)))
-        nda = ak.fill_none(ak.pad_none(ak_arr, max_len, clip=True), fill_val).to_numpy(
-            allow_missing=False
-        )
+            nda = ak.fill_none(
+                ak.pad_none(ak_arr, max_len, clip=True), fill_val
+            ).to_numpy(allow_missing=False)
-        if preserve_dtype:
-            nda = nda.astype(self.flattened_data.dtype, copy=False)
+            if preserve_dtype:
+                nda = nda.astype(self.flattened_data.dtype, copy=False)
-        return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs())
+            return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs())
+        raise NotImplementedError
     def view_as(
         self,
@@ -519,6 +627,8 @@ class VectorOfVectors(LGDO):
                 msg = "Pint does not support Awkward yet, you must view the data with_units=False"
                 raise ValueError(msg)
+            # see https://github.com/scikit-hep/awkward/discussions/2848
             # cannot avoid making a copy here. we should add the leading 0 to
             # cumulative_length inside VectorOfVectors at some point in the
             # future
@@ -528,9 +638,15 @@ class VectorOfVectors(LGDO):
             offsets[1:] = self.cumulative_length
             offsets[0] = 0
+            content = (
+                ak.contents.NumpyArray(self.flattened_data.nda)
+                if self.ndim == 2
+                else self.flattened_data.view_as(library, with_units=with_units).layout
+            )
             layout = ak.contents.ListOffsetArray(
                 offsets=ak.index.Index(offsets),
-                content=ak.contents.NumpyArray(self.flattened_data.nda),
+                content=content,
             )
             return ak.Array(layout)
@@ -551,263 +667,3 @@ class VectorOfVectors(LGDO):
         msg = f"{library} is not a supported third-party format."
         raise ValueError(msg)
-def build_cl(
-    sorted_array_in: NDArray, cumulative_length_out: NDArray = None
-) -> NDArray:
-    """Build a cumulative length array from an array of sorted data.
-    Examples
-    --------
-    >>> build_cl(np.array([3, 3, 3, 4])
-    array([3., 4.])
-    For a `sorted_array_in` of indices, this is the inverse of
-    :func:`.explode_cl`, in the sense that doing
-    ``build_cl(explode_cl(cumulative_length))`` would recover the original
-    `cumulative_length`.
-    Parameters
-    ----------
-    sorted_array_in
-        array of data already sorted; each N matching contiguous entries will
-        be converted into a new row of `cumulative_length_out`.
-    cumulative_length_out
-        a pre-allocated array for the output `cumulative_length`. It will
-        always have length <= `sorted_array_in`, so giving them the same length
-        is safe if there is not a better guess.
-    Returns
-    -------
-    cumulative_length_out
-        the output cumulative length array. If the user provides a
-        `cumulative_length_out` that is too long, this return value is sliced
-        to contain only the used portion of the allocated memory.
-    """
-    if len(sorted_array_in) == 0:
-        return None
-    sorted_array_in = np.asarray(sorted_array_in)
-    if cumulative_length_out is None:
-        cumulative_length_out = np.zeros(len(sorted_array_in), dtype=np.uint64)
-    else:
-        cumulative_length_out.fill(0)
-    if len(cumulative_length_out) == 0 and len(sorted_array_in) > 0:
-        msg = "cumulative_length_out too short ({len(cumulative_length_out)})"
-        raise ValueError(msg)
-    return _nb_build_cl(sorted_array_in, cumulative_length_out)
-@numba.njit(**nb_kwargs)
-def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> NDArray:
-    """numbified inner loop for build_cl"""
-    ii = 0
-    last_val = sorted_array_in[0]
-    for val in sorted_array_in:
-        if val != last_val:
-            ii += 1
-            cumulative_length_out[ii] = cumulative_length_out[ii - 1]
-            if ii >= len(cumulative_length_out):
-                msg = "cumulative_length_out too short"
-                raise RuntimeError(msg)
-            last_val = val
-        cumulative_length_out[ii] += 1
-    ii += 1
-    return cumulative_length_out[:ii]
-@numba.guvectorize(
-    [
-        f"{data_type}[:,:],{size_type}[:],{data_type}[:]"
-        for data_type in [
-            "b1",
-            "i1",
-            "i2",
-            "i4",
-            "i8",
-            "u1",
-            "u2",
-            "u4",
-            "u8",
-            "f4",
-            "f8",
-            "c8",
-            "c16",
-        ]
-        for size_type in ["i4", "i8", "u4", "u8"]
-    ],
-    "(l,m),(l),(n)",
-    **nb_kwargs,
-)
-def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
-    """Vectorized function to fill flattened array from array of arrays and
-    lengths. Values in aoa_in past lengths will not be copied.
-    Parameters
-    ----------
-    aoa_in
-        array of arrays containing values to be copied
-    len_in
-        array of vector lengths for each row of aoa_in
-    flattened_array_out
-        flattened array to copy values into. Must be longer than sum of
-        lengths in len_in
-    """
-    if len(flattened_array_out) < len_in.sum():
-        msg = "flattened array not large enough to hold values"
-        raise ValueError(msg)
-    start = 0
-    for i, ll in enumerate(len_in):
-        stop = start + ll
-        flattened_array_out[start:stop] = aoa_in[i, :ll]
-        start = stop
-def explode_cl(cumulative_length: NDArray, array_out: NDArray = None) -> NDArray:
-    """Explode a `cumulative_length` array.
-    Examples
-    --------
-    >>> explode_cl(np.array([2, 3]))
-    array([0., 0., 1.])
-    This is the inverse of :func:`.build_cl`, in the sense that doing
-    ``build_cl(explode_cl(cumulative_length))`` would recover the original
-    `cumulative_length`.
-    Parameters
-    ----------
-    cumulative_length
-        the cumulative length array to be exploded.
-    array_out
-        a pre-allocated array to hold the exploded cumulative length array.
-        The length should be equal to ``cumulative_length[-1]``.
-    Returns
-    -------
-    array_out
-        the exploded cumulative length array.
-    """
-    cumulative_length = np.asarray(cumulative_length)
-    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
-    if array_out is None:
-        array_out = np.empty(int(out_len), dtype=np.uint64)
-    if len(array_out) != out_len:
-        msg = f"bad lengths: cl[-1] ({cumulative_length[-1]}) != out ({len(array_out)})"
-        raise ValueError(msg)
-    return _nb_explode_cl(cumulative_length, array_out)
-@numba.njit(**nb_kwargs)
-def _nb_explode_cl(cumulative_length: NDArray, array_out: NDArray) -> NDArray:
-    """numbified inner loop for explode_cl"""
-    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
-    if len(array_out) != out_len:
-        msg = "bad lengths"
-        raise ValueError(msg)
-    start = 0
-    for ii in range(len(cumulative_length)):
-        nn = int(cumulative_length[ii] - start)
-        for jj in range(nn):
-            array_out[int(start + jj)] = ii
-        start = cumulative_length[ii]
-    return array_out
-def explode(
-    cumulative_length: NDArray, array_in: NDArray, array_out: NDArray = None
-) -> NDArray:
-    """Explode a data array using a `cumulative_length` array.
-    This is identical to :func:`.explode_cl`, except `array_in` gets exploded
-    instead of `cumulative_length`.
-    Examples
-    --------
-    >>> explode(np.array([2, 3]), np.array([3, 4]))
-    array([3., 3., 4.])
-    Parameters
-    ----------
-    cumulative_length
-        the cumulative length array to use for exploding.
-    array_in
-        the data to be exploded. Must have same length as `cumulative_length`.
-    array_out
-        a pre-allocated array to hold the exploded data. The length should be
-        equal to ``cumulative_length[-1]``.
-    Returns
-    -------
-    array_out
-        the exploded cumulative length array.
-    """
-    cumulative_length = np.asarray(cumulative_length)
-    array_in = np.asarray(array_in)
-    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
-    if array_out is None:
-        array_out = np.empty(out_len, dtype=array_in.dtype)
-    if len(cumulative_length) != len(array_in) or len(array_out) != out_len:
-        msg = (
-            f"bad lengths: cl ({len(cumulative_length)}) != in ({len(array_in)}) "
-            f"and cl[-1] ({cumulative_length[-1]}) != out ({len(array_out)})"
-        )
-        raise ValueError(msg)
-    return nb_explode(cumulative_length, array_in, array_out)
-@numba.njit(**nb_kwargs)
-def nb_explode(
-    cumulative_length: NDArray, array_in: NDArray, array_out: NDArray
-) -> NDArray:
-    """Numbified inner loop for :func:`.explode`."""
-    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
-    if len(cumulative_length) != len(array_in) or len(array_out) != out_len:
-        msg = "bad lengths"
-        raise ValueError(msg)
-    ii = 0
-    for jj in range(len(array_out)):
-        while ii < len(cumulative_length) and jj >= cumulative_length[ii]:
-            ii += 1
-        array_out[jj] = array_in[ii]
-    return array_out
-def explode_arrays(
-    cumulative_length: Array,
-    arrays: list[NDArray],
-    arrays_out: list[NDArray] | None = None,
-) -> list:
-    """Explode a set of arrays using a `cumulative_length` array.
-    Parameters
-    ----------
-    cumulative_length
-        the cumulative length array to use for exploding.
-    arrays
-        the data arrays to be exploded. Each array must have same length as
-        `cumulative_length`.
-    arrays_out
-        a list of pre-allocated arrays to hold the exploded data. The length of
-        the list should be equal to the length of `arrays`, and each entry in
-        arrays_out should have length ``cumulative_length[-1]``. If not
-        provided, output arrays are allocated for the user.
-    Returns
-    -------
-    arrays_out
-        the list of exploded cumulative length arrays.
-    """
-    cumulative_length = np.asarray(cumulative_length)
-    for ii in range(len(arrays)):
-        arrays[ii] = np.asarray(arrays[ii])
-    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
-    if arrays_out is None:
-        arrays_out = []
-        for array in arrays:
-            arrays_out.append(np.empty(out_len, dtype=array.dtype))
-    for ii in range(len(arrays)):
-        explode(cumulative_length, arrays[ii], arrays_out[ii])
-    return arrays_out

legend-pydataobj 1.5.1__py3-none-any.whl → 1.6.1__py3-none-any.whl

legend-pydataobj 1.5.1py3-none-any.whl → 1.6.1py3-none-any.whl