legend-pydataobj 1.11.6__py3-none-any.whl → 1.12.0a1__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.12.0a1.dist-info}/METADATA +3 -2
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.12.0a1.dist-info}/RECORD +23 -22
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.12.0a1.dist-info}/WHEEL +1 -1
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.12.0a1.dist-info}/entry_points.txt +1 -1
- lgdo/_version.py +9 -4
- lgdo/cli.py +10 -155
- lgdo/lh5/__init__.py +1 -0
- lgdo/lh5/_serializers/read/composite.py +1 -3
- lgdo/lh5/_serializers/read/utils.py +1 -1
- lgdo/lh5/_serializers/read/vector_of_vectors.py +1 -1
- lgdo/lh5/concat.py +219 -0
- lgdo/lh5/core.py +21 -30
- lgdo/lh5/iterator.py +48 -27
- lgdo/lh5/store.py +15 -68
- lgdo/types/array.py +74 -13
- lgdo/types/encoded.py +25 -20
- lgdo/types/histogram.py +1 -1
- lgdo/types/lgdo.py +50 -0
- lgdo/types/table.py +49 -28
- lgdo/types/vectorofvectors.py +70 -77
- lgdo/types/vovutils.py +14 -4
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.12.0a1.dist-info/licenses}/LICENSE +0 -0
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.12.0a1.dist-info}/top_level.txt +0 -0
lgdo/types/table.py
CHANGED
```diff
@@ -19,7 +19,7 @@ from pandas.io.formats import format as fmt
 
 from .array import Array
 from .arrayofequalsizedarrays import ArrayOfEqualSizedArrays
-from .lgdo import LGDO
+from .lgdo import LGDO, LGDOCollection
 from .scalar import Scalar
 from .struct import Struct
 from .vectorofvectors import VectorOfVectors
@@ -27,13 +27,9 @@ from .vectorofvectors import VectorOfVectors
 log = logging.getLogger(__name__)
 
 
-class Table(Struct):
+class Table(Struct, LGDOCollection):
     """A special struct of arrays or subtable columns of equal length.
 
-    Holds onto an internal read/write location ``loc`` that is useful in
-    managing table I/O using functions like :meth:`push_row`, :meth:`is_full`,
-    and :meth:`clear`.
-
     Note
     ----
     If you write to a table and don't fill it up to its total size, be sure to
```
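`Table` now derives from `LGDOCollection`, which this release adds to `lgdo/types/lgdo.py` (+50 lines, collapsed above). The sketch below is not taken from that file; it is a rough reconstruction of the contract implied by the call sites visible in this diff (`reserve_capacity`, `get_capacity`, `trim_capacity`, `resize`, `insert`, and an `attrs`-taking constructor), and the real class may differ.

```python
# Hypothetical sketch of the LGDOCollection contract implied by this diff;
# the real definition lives in lgdo/types/lgdo.py and is not shown here.
from __future__ import annotations

from abc import abstractmethod
from collections.abc import Mapping
from typing import Any

from lgdo.types.lgdo import LGDO  # pre-existing base class


class LGDOCollection(LGDO):
    """An LGDO whose length can change: resizable, insertable, and with
    explicit control over the capacity of its internal buffers."""

    def __init__(self, attrs: Mapping[str, Any] | None = None) -> None:
        super().__init__(attrs)

    @abstractmethod
    def __len__(self) -> int:
        """Number of elements (rows) currently stored."""

    @abstractmethod
    def reserve_capacity(self, capacity) -> None:
        """Pre-allocate internal memory without changing the visible size."""

    @abstractmethod
    def get_capacity(self):
        """Return the currently allocated capacity."""

    @abstractmethod
    def trim_capacity(self) -> None:
        """Shrink the allocation to the minimum needed for the current size."""

    @abstractmethod
    def resize(self, new_size: int, trim: bool = False) -> None:
        """Change the visible size, optionally trimming capacity to match."""

    @abstractmethod
    def insert(self, i: int, val) -> None:
        """Insert a value at index `i`, growing the collection by one."""
```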
```diff
@@ -49,7 +45,7 @@ class Table(Struct):
 
     def __init__(
         self,
-        col_dict: Mapping[str,
+        col_dict: Mapping[str, LGDOCollection] | pd.DataFrame | ak.Array | None = None,
         size: int | None = None,
         attrs: Mapping[str, Any] | None = None,
     ) -> None:
@@ -65,7 +61,7 @@ class Table(Struct):
         col_dict
             instantiate this table using the supplied mapping of column names
             and array-like objects. Supported input types are: mapping of
-            strings to
+            strings to LGDOCollections, :class:`pd.DataFrame` and :class:`ak.Array`.
             Note 1: no copy is performed, the objects are used directly (unless
             :class:`ak.Array` is provided). Note 2: if `size` is not ``None``,
             all arrays will be resized to match it. Note 3: if the arrays have
@@ -85,7 +81,8 @@ class Table(Struct):
             col_dict = _ak_to_lgdo_or_col_dict(col_dict)
 
         # call Struct constructor
-
+        Struct.__init__(self, obj_dict=col_dict)
+        LGDOCollection.__init__(self, attrs=attrs)
 
         # if col_dict is not empty, set size according to it
         # if size is also supplied, resize all fields to match it
@@ -93,13 +90,10 @@ class Table(Struct):
         if col_dict is not None and len(col_dict) > 0:
             self.resize(new_size=size, do_warn=(size is None))
 
-        # if no col_dict, just set the size
+        # if no col_dict, just set the size
         else:
             self.size = size if size is not None else None
 
-        # always start at loc=0
-        self.loc = 0
-
     def datatype_name(self) -> str:
         return "table"
 
@@ -107,7 +101,31 @@ class Table(Struct):
         """Provides ``__len__`` for this array-like class."""
         return self.size
 
-    def
+    def reserve_capacity(self, capacity: int | list) -> None:
+        "Set size (number of rows) of internal memory buffer"
+        if isinstance(capacity, int):
+            for obj in self.values():
+                obj.reserve_capacity(capacity)
+        else:
+            if len(capacity) != len(self.keys()):
+                msg = "List of capacities must have same length as number of keys"
+                raise ValueError(msg)
+
+            for obj, cap in zip(self.values(), capacity):
+                obj.reserve_capacity(cap)
+
+    def get_capacity(self) -> int:
+        "Get list of capacities for each key"
+        return [v.get_capacity() for v in self.values()]
+
+    def trim_capacity(self) -> int:
+        "Set capacity to be minimum needed to support Array size"
+        for v in self.values():
+            v.trim_capacity()
+
+    def resize(
+        self, new_size: int | None = None, do_warn: bool = False, trim: bool = False
+    ) -> None:
         # if new_size = None, use the size from the first field
         for field, obj in self.items():
             if new_size is None:
```
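A minimal usage sketch of the new capacity API on `Table`. The column names and sizes are invented for illustration, and it assumes `Array` gains the matching `reserve_capacity`/`get_capacity`/`trim_capacity` methods (its diff, `lgdo/types/array.py`, is collapsed above).

```python
import numpy as np

from lgdo import Array, Table

# two columns of 100 rows; no copies are made, the Arrays are used directly
tbl = Table(
    col_dict={
        "energy": Array(np.zeros(100)),
        "channel": Array(np.zeros(100, dtype="uint32")),
    }
)

# pre-allocate room for 100000 rows in every column (a list with one capacity
# per column is also accepted); the visible size stays at 100
tbl.reserve_capacity(100_000)
print(tbl.get_capacity())  # one capacity per column, e.g. [100000, 100000]

# growing within the reserved capacity reuses the existing allocation
tbl.resize(50_000)

# give the unused memory back
tbl.trim_capacity()
```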
```diff
@@ -119,21 +137,20 @@ class Table(Struct):
                     f"with size {len(obj)} != {new_size}"
                 )
             if isinstance(obj, Table):
-                obj.resize(new_size)
+                obj.resize(new_size, trim)
             else:
-                obj.resize(new_size)
+                obj.resize(new_size, trim)
         self.size = new_size
 
-    def
-
-
-
-
-
-    def clear(self) -> None:
-        self.loc = 0
+    def insert(self, i: int, vals: dict) -> None:
+        "Insert vals into table at row i. Vals is a mapping from table key to val"
+        for k, ar in self.items():
+            ar.insert(i, vals[k])
+        self.size += 1
 
-    def add_field(
+    def add_field(
+        self, name: str, obj: LGDOCollection, use_obj_size: bool = False
+    ) -> None:
         """Add a field (column) to the table.
 
         Use the name "field" here to match the terminology used in
```
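The old `loc` / `push_row` / `clear` bookkeeping is replaced by `insert`. A short, hypothetical sketch of adding rows with the new method (column names and values are made up, and it assumes the column type, `Array` here, implements the `LGDOCollection.insert` used above).

```python
import numpy as np

from lgdo import Array, Table

tbl = Table(
    col_dict={
        "a": Array(np.array([1, 2, 4])),
        "b": Array(np.array([10.0, 20.0, 40.0])),
    }
)

# insert a row at index 2; `vals` maps every column name to its new value
tbl.insert(2, {"a": 3, "b": 30.0})

# appending a row is just inserting at the end
tbl.insert(len(tbl), {"a": 5, "b": 50.0})

print(len(tbl))  # 5
```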
```diff
@@ -170,7 +187,9 @@ class Table(Struct):
         new_size = len(obj) if use_obj_size else self.size
         self.resize(new_size=new_size)
 
-    def add_column(
+    def add_column(
+        self, name: str, obj: LGDOCollection, use_obj_size: bool = False
+    ) -> None:
         """Alias for :meth:`.add_field` using table terminology 'column'."""
         self.add_field(name, obj, use_obj_size=use_obj_size)
 
@@ -201,8 +220,10 @@ class Table(Struct):
             set to ``False`` to turn off warnings associated with mismatched
             `loc` parameter or :meth:`add_column` warnings.
         """
-        if other_table
-            log.warning(
+        if len(other_table) != len(self) and do_warn:
+            log.warning(
+                f"len(other_table) ({len(other_table)}) != len(self) ({len(self)})"
+            )
         if cols is None:
             cols = other_table.keys()
         for name in cols:
```
lgdo/types/vectorofvectors.py
CHANGED
```diff
@@ -20,12 +20,12 @@ from .. import utils
 from . import arrayofequalsizedarrays as aoesa
 from . import vovutils
 from .array import Array
-from .lgdo import
+from .lgdo import LGDOCollection
 
 log = logging.getLogger(__name__)
 
 
-class VectorOfVectors(
+class VectorOfVectors(LGDOCollection):
     """A n-dimensional variable-length 1D array of variable-length 1D arrays.
 
     If the vector is 2-dimensional, the internal representation is as two NumPy
@@ -138,7 +138,7 @@ class VectorOfVectors(LGDO):
         # FIXME: have to copy the buffers, otherwise self will not own the
         # data and self.resize() will fail. Is it possible to avoid this?
         flattened_data = np.copy(
-            container.pop(f"node{data.ndim-1}-data", np.empty(0, dtype=dtype))
+            container.pop(f"node{data.ndim - 1}-data", np.empty(0, dtype=dtype))
         )
 
         # if user-provided dtype is different than dtype from Awkward, cast
@@ -210,20 +210,17 @@ class VectorOfVectors(LGDO):
         elif self.flattened_data is None:
             self.flattened_data = flattened_data
 
-
-        self.dtype = self.flattened_data.dtype
-
-        # set ndim
-        self.ndim = 2
-        pointer = self.flattened_data
-        while True:
-            if isinstance(pointer, Array):
-                break
+        super().__init__(attrs)
 
-
-
+    @property
+    def ndim(self):
+        return 1 + (
+            1 if isinstance(self.flattened_data, Array) else self.flattened_data.ndim
+        )
 
-
+    @property
+    def dtype(self) -> np.dtype:
+        return self.flattened_data.dtype
 
     def datatype_name(self) -> str:
         return "array"
```
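`ndim` and `dtype` are no longer set in the constructor but are computed on the fly from `flattened_data`. A small illustration of the behavior implied by the property definitions above:

```python
from lgdo import VectorOfVectors

vov = VectorOfVectors([[1, 2, 3], [4, 5]])

# flattened_data is a plain Array for a 2-dimensional vector, so ndim is 1 + 1
print(vov.ndim)   # 2

# dtype is now read through flattened_data instead of being stored separately
print(vov.dtype)  # the dtype of the flattened data, e.g. int64 here
```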
```diff
@@ -276,7 +273,30 @@ class VectorOfVectors(LGDO):
         else:
             raise NotImplementedError
 
-    def
+    def reserve_capacity(self, cap_cl, *cap_args) -> None:
+        """Set capacity of internal data arrays. Expect number of args to
+        equal `self.n_dim`. First arg is capacity of cumulative length array.
+        If `self.n_dim` is 2, second argument is capacity of flattened data,
+        otherwise arguments are fed recursively to remaining dimensions.
+        """
+        self.cumulative_length.reserve_capacity(cap_cl)
+        self.flattened_data.reserve_capacity(*cap_args)
+
+    def get_capacity(self) -> tuple[int]:
+        """Get tuple containing capacity of each dimension. First dimension
+        is cumulative length array. Last dimension is flattened data.
+        """
+        fd_cap = self.flattened_data.get_capacity()
+        if isinstance(fd_cap, int):
+            return (self.cumulative_length.get_capacity(), fd_cap)
+        return (self.cumulative_length.get_capacity(), *fd_cap)
+
+    def trim_capacity(self) -> None:
+        "Set capacity for all dimensions to minimum needed to hold data"
+        self.cumulative_length.trim_capacity()
+        self.flattened_data.trim_capacity()
+
+    def resize(self, new_size: int, trim: bool = False) -> None:
         """Resize vector along the first axis.
 
         `self.flattened_data` is resized only if `new_size` is smaller than the
```
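A brief sketch of the new capacity controls on a 2-dimensional `VectorOfVectors`: the first argument sizes the `cumulative_length` buffer, the second the `flattened_data` buffer. It assumes `Array.reserve_capacity` (added in the collapsed `array.py` diff) allocates at least the requested number of elements.

```python
from lgdo import VectorOfVectors

vov = VectorOfVectors([[1, 2, 3], [4, 5]])

# room for 1000 vectors holding 10000 elements in total
vov.reserve_capacity(1_000, 10_000)
print(vov.get_capacity())  # (cumulative_length, flattened_data), e.g. (1000, 10000)

# shrink both buffers back to the minimum needed for the current contents
vov.trim_capacity()
```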
```diff
@@ -286,6 +306,8 @@ class VectorOfVectors(LGDO):
         `self.cumulative_length` is padded with its last element. This
         corresponds to appending empty vectors.
 
+        If `trim` is ``True``, resize capacity to match new size
+
         Examples
         --------
         >>> vov = VectorOfVectors([[1, 2, 3], [4, 5]])
@@ -303,23 +325,22 @@ class VectorOfVectors(LGDO):
          [3],
         ]
         """
-        vidx = self.cumulative_length
         old_s = len(self)
-        dlen = new_size - old_s
-        csum = vidx[-1] if len(self) > 0 else 0
 
         # first resize the cumulative length
-        self.cumulative_length.resize(new_size)
+        self.cumulative_length.resize(new_size, trim)
 
         # if new_size > size, new elements are filled with zeros, let's fix
         # that
-        if
-            self.cumulative_length[old_s:] =
+        if new_size > old_s:
+            self.cumulative_length[old_s:] = self.cumulative_length[old_s - 1]
 
         # then resize the data array
         # if dlen > 0 this has no effect
         if len(self.cumulative_length) > 0:
-            self.flattened_data.resize(self.cumulative_length[-1])
+            self.flattened_data.resize(self.cumulative_length[-1], trim)
+        else:
+            self.flattened_data.resize(0, trim)
 
     def append(self, new: NDArray) -> None:
         """Append a 1D vector `new` at the end.
```
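With the new `trim` flag, shrinking a vector can also release the now-unused buffer memory instead of only moving the end marker. A quick illustration under the same assumptions as above (the exact capacities reported depend on `Array`'s allocation policy):

```python
from lgdo import VectorOfVectors

vov = VectorOfVectors([[1, 2, 3], [4, 5], [6], [7, 8]])

# keep only the first two vectors and hand the freed capacity back
vov.resize(2, trim=True)

print(len(vov))            # 2, i.e. [1 2 3] and [4 5] remain
print(vov.get_capacity())  # roughly (2, 5) after trimming
```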
```diff
@@ -334,20 +355,7 @@ class VectorOfVectors(LGDO):
          [8 9],
         ]
         """
-
-            # first extend cumulative_length by +1
-            self.cumulative_length.resize(len(self) + 1)
-            # set it at the right value
-            newlen = (
-                self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
-            )
-            self.cumulative_length[-1] = newlen
-            # then resize flattened_data to accommodate the new vector
-            self.flattened_data.resize(len(self.flattened_data) + len(new))
-            # finally set it
-            self[-1] = new
-        else:
-            raise NotImplementedError
+        self.insert(len(self), new)
 
     def insert(self, i: int, new: NDArray) -> None:
         """Insert a vector at index `i`.
@@ -364,23 +372,15 @@ class VectorOfVectors(LGDO):
          [8 9],
          [4 5],
         ]
-
-        Warning
-        -------
-        This method involves a significant amount of memory re-allocation and
-        is expected to perform poorly on large vectors.
         """
         if self.ndim == 2:
-            if i
-                msg = f"index {i} is out of bounds for vector
+            if i > len(self):
+                msg = f"index {i} is out of bounds for vector with size {len(self)}"
                 raise IndexError(msg)
 
-            self.
-
-            )
-            self.cumulative_length = Array(
-                np.insert(self.cumulative_length, i, self.cumulative_length[i - 1])
-            )
+            i_start = 0 if i == 0 else self.cumulative_length[i - 1]
+            self.flattened_data.insert(i_start, new)
+            self.cumulative_length.insert(i, i_start)
             self.cumulative_length[i:] += np.uint32(len(new))
         else:
             raise NotImplementedError
@@ -400,11 +400,6 @@ class VectorOfVectors(LGDO):
         [[8 9],
          [4 5],
         ]
-
-        Warning
-        -------
-        This method involves a significant amount of memory re-allocation and
-        is expected to perform poorly on large vectors.
         """
         if self.ndim == 2:
             if i >= len(self):
@@ -414,27 +409,17 @@ class VectorOfVectors(LGDO):
             vidx = self.cumulative_length
             dlen = len(new) - len(self[i])
 
-            if dlen
-                #
-
-
-
-
-
-
-
-
-                )
-            else:
-                # set the already allocated indices
-                self.flattened_data[vidx[i - 1] : vidx[i]] = new[: len(self[i])]
-                # then insert the remaining
-                self.flattened_data = Array(
-                    np.insert(self.flattened_data, vidx[i], new[len(self[i]) :])
-                )
-
-            vidx[i:] = vidx[i:] + dlen
+            if dlen != 0:
+                # move the subsequent entries
+                vidx[i:] += dlen
+                self.flattened_data.resize(vidx[-1])
+                self.flattened_data._nda[vidx[i] : vidx[-1]] = self.flattened_data._nda[
+                    vidx[i] - dlen : vidx[-1] - dlen
+                ]
+
+            # set the already allocated indices
+            start = vidx[i - 1] if i > 0 else 0
+            self.flattened_data[start : vidx[i]] = new
         else:
             raise NotImplementedError
 
@@ -484,7 +469,15 @@ class VectorOfVectors(LGDO):
         cum_lens = np.add(start, lens.cumsum(), dtype=int)
 
         # fill with fast vectorized routine
-
+        if np.issubdtype(self.flattened_data.dtype, np.unsignedinteger):
+            nan_val = np.iinfo(self.flattened_data.dtype).max
+        if np.issubdtype(self.flattened_data.dtype, np.integer):
+            nan_val = np.iinfo(self.flattened_data.dtype).min
+        else:
+            nan_val = np.nan
+        vovutils._nb_fill(
+            vec, lens, nan_val, self.flattened_data.nda[start : cum_lens[-1]]
+        )
 
         # add new vector(s) length to cumulative_length
         self.cumulative_length[i : i + len(lens)] = cum_lens
```
lgdo/types/vovutils.py
CHANGED
```diff
@@ -81,7 +81,7 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND
 
 @numba.guvectorize(
     [
-        f"{data_type}[:,:],{size_type}[:],{data_type}[:]"
+        f"{data_type}[:,:],{size_type}[:],{data_type},{data_type}[:]"
         for data_type in [
             "b1",
             "i1",
@@ -99,10 +99,12 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND
         ]
         for size_type in ["i4", "i8", "u4", "u8"]
     ],
-    "(l,m),(l),(n)",
+    "(l,m),(l),(),(n)",
    **nb_kwargs,
 )
-def _nb_fill(
+def _nb_fill(
+    aoa_in: NDArray, len_in: NDArray, nan_val: int | float, flattened_array_out: NDArray
+):
     """Vectorized function to fill flattened array from array of arrays and
     lengths. Values in aoa_in past lengths will not be copied.
 
@@ -112,6 +114,9 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
         array of arrays containing values to be copied
     len_in
         array of vector lengths for each row of aoa_in
+    nan_val
+        value to use when len_in is longer than aoa_in. Should use
+        np.nan for floating point, and 0xfff... for integer types
     flattened_array_out
         flattened array to copy values into. Must be longer than sum of
         lengths in len_in
@@ -122,9 +127,14 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
         raise ValueError(msg)
 
     start = 0
+    max_len = aoa_in.shape[1]
    for i, ll in enumerate(len_in):
        stop = start + ll
-
+        if ll > max_len:
+            flattened_array_out[start : start + max_len] = aoa_in[i, :]
+            flattened_array_out[start + max_len : stop] = nan_val
+        else:
+            flattened_array_out[start:stop] = aoa_in[i, :ll]
        start = stop
 
 
```
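`_nb_fill` now takes an explicit padding value for rows whose requested length exceeds the width of `aoa_in`, mirroring the call added in `vectorofvectors.py` above. A small illustrative call of this internal helper (array contents are made up; this is not part of the public API):

```python
import numpy as np

from lgdo.types import vovutils

aoa_in = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]])  # 2 rows, width 3
len_in = np.array([3, 5], dtype="uint32")              # second row asks for 5 values
out = np.empty(int(len_in.sum()), dtype=aoa_in.dtype)

# rows longer than aoa_in's width are padded with nan_val (np.nan for floats,
# an extreme integer value for integer dtypes)
vovutils._nb_fill(aoa_in, len_in, np.nan, out)
print(out)  # [ 1.  2.  3.  4.  5.  0. nan nan]
```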