legend-pydataobj 1.11.7__py3-none-any.whl → 1.12.0a1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
legend_pydataobj-1.11.7.dist-info/METADATA → legend_pydataobj-1.12.0a1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: legend_pydataobj
-Version: 1.11.7
+Version: 1.12.0a1
 Summary: LEGEND Python Data Objects
 Author: The LEGEND Collaboration
 Maintainer: The LEGEND Collaboration
legend_pydataobj-1.11.7.dist-info/RECORD → legend_pydataobj-1.12.0a1.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
-legend_pydataobj-1.11.7.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+legend_pydataobj-1.12.0a1.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
 lgdo/__init__.py,sha256=QMYK9HhoMi0pbahPN8mPD18gyTxscFgo7QKfCxVhy-0,3196
-lgdo/_version.py,sha256=WYo6AtimYOvXEEB_DEJYUqS-yeVHGFoR5t7JM_9dSwo,513
+lgdo/_version.py,sha256=kTYHwRhTzZEJHpwJeVgXBi4yFTeQDpnR6MYkvCMA06Q,515
 lgdo/cli.py,sha256=s_EWTBWW76l7zWb6gaTSTjiT-0RzzcYEmjeFEQCVxfk,4647
 lgdo/lgdo_utils.py,sha256=6a2YWEwpyEMXlAyTHZMO01aqxy6SxJzPZkGNWKNWuS0,2567
 lgdo/lh5_store.py,sha256=5BzbJA9sLcqjp8bJDc2olwOiw0VS6rmfg3cfh1kQkRY,8512
@@ -14,18 +14,18 @@ lgdo/compression/radware.py,sha256=GcNTtjuyL7VBBqziUBmSqNXuhqy1bJJgvcyvyumPtrc,2
 lgdo/compression/utils.py,sha256=W2RkBrxPpXlat84dnU9Ad7d_tTws0irtGl7O1dNWjnk,1140
 lgdo/compression/varlen.py,sha256=6ZZUItyoOfygDdE0DyoISeFZfqdbH6xl7T0eclfarzg,15127
 lgdo/lh5/__init__.py,sha256=y1XE_mpFWwamrl7WVjAVSVB25X4PrEfdVXSneSQEmlQ,825
-lgdo/lh5/concat.py,sha256=5nO7dNSb0UEP9rZiWGTKH5Cfwsm5LSm3tBJM4Kd70u0,6336
-lgdo/lh5/core.py,sha256=__-A6Abctzfwfo4-xJi68xs2e4vfzONEQTJVrUCOw-I,13922
+lgdo/lh5/concat.py,sha256=BZCgK7TWPKK8fMmha8K83d3bC31FVO1b5LOW7x-Ru1s,6186
+lgdo/lh5/core.py,sha256=GjosZGUp4GSO5FtWV9eXUt_6DGU_OwJXODlj5K1j93M,13320
 lgdo/lh5/datatype.py,sha256=O_7BqOlX8PFMyG0ppkfUT5aps5HEqX0bpuKcJO3jhu0,1691
 lgdo/lh5/exceptions.py,sha256=3kj8avXl4eBGvebl3LG12gJEmw91W0T8PYR0AfvUAyM,1211
-lgdo/lh5/iterator.py,sha256=ZaBBnmuNIjinwO0JUY55wLxX8Om9rVRRzXBC5uHmSKM,19772
-lgdo/lh5/store.py,sha256=3wAaQDd1Zmo0_bQ9DbB-FbKS4Uy_Tb642qKHXtZpSw4,10643
+lgdo/lh5/iterator.py,sha256=1ob9B7Bf3ioGCtZkUZoL6ibTxAwLf4ld8_33ghVVEa4,20498
+lgdo/lh5/store.py,sha256=MYbMt-Mc7izELxuyLlSrrYrylCIzxc2CLzZYIVbZ33w,8455
 lgdo/lh5/tools.py,sha256=T9CgHA8A3_tVBMtiNJ6hATQKhdqI61m3cX4p2wGKc6c,9937
 lgdo/lh5/utils.py,sha256=ioz8DlyXZsejwnU2qYdIccdHcF12H62jgLkZsiDOLSM,6243
 lgdo/lh5/_serializers/__init__.py,sha256=eZzxMp1SeZWG0PkEXUiCz3XyprQ8EmelHUmJogC8xYE,1263
 lgdo/lh5/_serializers/read/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lgdo/lh5/_serializers/read/array.py,sha256=uWfMCihfAmW2DE2ewip2qCK_kvQC_mb2zvOv26uzijc,1000
-lgdo/lh5/_serializers/read/composite.py,sha256=yTm5dfTgkIL7eG9iZXxhdiRhG04cQLd_hybP4wmxCJE,11809
+lgdo/lh5/_serializers/read/composite.py,sha256=UvkZHEhf0V7SFLxzF52eyP68hU0guGOLqosrfmIfeys,11729
 lgdo/lh5/_serializers/read/encoded.py,sha256=Q98c08d8LkZq2AlY4rThYECVaEqwbv4T2Urn7TGnsyE,4130
 lgdo/lh5/_serializers/read/ndarray.py,sha256=lFCXD6bSzmMOH7cVmvRYXakkfMCI8EoqTPNONRJ1F0s,3690
 lgdo/lh5/_serializers/read/scalar.py,sha256=kwhWm1T91pXf86CqtUUD8_qheSR92gXZrQVtssV5YCg,922
@@ -37,20 +37,20 @@ lgdo/lh5/_serializers/write/composite.py,sha256=I6lH0nWFIpAfZyG4-0rLxzg3mfazZ_FE
 lgdo/lh5/_serializers/write/scalar.py,sha256=JPt_fcdTKOSFp5hfJdcKIfK4hxhcD8vhOlvDF-7btQ8,763
 lgdo/lh5/_serializers/write/vector_of_vectors.py,sha256=puGQX9XF5P_5DVbm_Cc6TvPrsDywgBLSYtkqFNltbB4,3493
 lgdo/types/__init__.py,sha256=DNfOErPiAZg-7Gygkp6ZKAi20Yrm1mfderZHvKo1Y4s,821
-lgdo/types/array.py,sha256=sUxh1CNCaefrnybt5qdjmmMpVQa_RqFxUv1tJ_pyBbc,6537
+lgdo/types/array.py,sha256=e3p93yrfzSmyBgWdGqqtETcKpM7_FxENaAErru15rvo,8904
 lgdo/types/arrayofequalsizedarrays.py,sha256=DOGJiTmc1QCdm7vLbE6uIRXoMPtt8uuCfmwQawgWf5s,4949
-lgdo/types/encoded.py,sha256=JW4U5ow7KLMzhKnmhdnxbC3SZJAs4bOEDZWKG4KY1uU,15293
+lgdo/types/encoded.py,sha256=_e8u_BPfpjJbLnEdyTo9QG3kbNsGj0BN4gjdj3L1ndw,15640
 lgdo/types/fixedsizearray.py,sha256=7RjUwTz1bW0pcrdy27JlfrXPAuOU89Kj7pOuSUCojK8,1527
-lgdo/types/histogram.py,sha256=y6j2VDuGYYnLy7WI4J90ApS0PAwic4kCpouZPX09Nus,19974
-lgdo/types/lgdo.py,sha256=RQ2P70N7IWMBDnLLuJI3sm6zQTIKyOMSsKZtBNzmE90,2928
+lgdo/types/histogram.py,sha256=Jz1lLH56BfYnmcUhxUHK1h2wLDQ0Abgyd-6LznU-3-k,19979
+lgdo/types/lgdo.py,sha256=21YNtJCHnSO3M60rjsAdbMO5crDjL_0BtuFpudZ2xvU,4500
 lgdo/types/scalar.py,sha256=c5Es2vyDqyWTPV6mujzfIzMpC1jNWkEIcvYyWQUxH3Q,1933
 lgdo/types/struct.py,sha256=Q0OWLVd4B0ciLb8t6VsxU3MPbmGLZ7WfQNno1lSQS0Q,4918
-lgdo/types/table.py,sha256=VIHQOPXJHJgiCjMMb_p7EdbcCqLFSObHMdHSxC1Dm5Y,19212
-lgdo/types/vectorofvectors.py,sha256=K8w7CZou857I9YGkeOe2uYB20gbHl4OV9xhnnJPNOjc,24665
-lgdo/types/vovutils.py,sha256=7BWPP0BSj-92ifbCIUBcfqxG5-TS8uxujTyJJuDFI04,10302
+lgdo/types/table.py,sha256=FkWesoEA9bmGGSW8Ewig1Zs77ffUoR_nggfYSmkWpjU,20079
+lgdo/types/vectorofvectors.py,sha256=CtPR2WDBmJmzzfXwH4aUcNMB5LvTiGWmL_qRbFah3to,24756
+lgdo/types/vovutils.py,sha256=WjvPLEJrRNjktnbyfypfgxZX-K_aOvcwPygfzoknsyA,10701
 lgdo/types/waveformtable.py,sha256=f2tS4f1OEoYaTM5ldCX9zmw8iSISCT3t3wS1SrPdu_o,9901
-legend_pydataobj-1.11.7.dist-info/METADATA,sha256=Z0-UFMzWILag78U1HkNpbYwKDb_JZkZ8kZLtW4T8gw0,44443
-legend_pydataobj-1.11.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-legend_pydataobj-1.11.7.dist-info/entry_points.txt,sha256=0KWfnwbuwhNn0vPUqARukjp04Ca6lzfZBSirouRmk7I,76
-legend_pydataobj-1.11.7.dist-info/top_level.txt,sha256=KyR-EUloqiXcQ62IWnzBmtInDtvsHl4q2ZJAZgTcLXE,5
-legend_pydataobj-1.11.7.dist-info/RECORD,,
+legend_pydataobj-1.12.0a1.dist-info/METADATA,sha256=55pMph32j8h4LKGnoVEdvHX27bHr8k__sdT4L9O5dIA,44445
+legend_pydataobj-1.12.0a1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+legend_pydataobj-1.12.0a1.dist-info/entry_points.txt,sha256=0KWfnwbuwhNn0vPUqARukjp04Ca6lzfZBSirouRmk7I,76
+legend_pydataobj-1.12.0a1.dist-info/top_level.txt,sha256=KyR-EUloqiXcQ62IWnzBmtInDtvsHl4q2ZJAZgTcLXE,5
+legend_pydataobj-1.12.0a1.dist-info/RECORD,,
lgdo/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE

-__version__ = version = '1.11.7'
-__version_tuple__ = version_tuple = (1, 11, 7)
+__version__ = version = '1.12.0a1'
+__version_tuple__ = version_tuple = (1, 12, 0)
lgdo/lh5/_serializers/read/composite.py CHANGED
@@ -353,15 +353,13 @@ def _h5_read_table(
        table = Table(col_dict=col_dict, attrs=attrs)

        # set (write) loc to end of tree
-        table.loc = n_rows_read
+        table.resize(do_warn=True)

        return table, n_rows_read

    # We have read all fields into the object buffer. Run
    # checks: All columns should be the same size. So update
    # table's size as necessary, warn if any mismatches are found
    obj_buf.resize(do_warn=True)
-    # set (write) loc to end of tree
-    obj_buf.loc = obj_buf_start + n_rows_read

    # check attributes
    utils.check_obj_buf_attrs(obj_buf.attrs, attrs, fname, oname)
lgdo/lh5/concat.py CHANGED
@@ -76,7 +76,7 @@ def _get_lgdos(file, obj_list):
            continue

        # read as little as possible
-        obj, _ = store.read(current, h5f0, n_rows=1)
+        obj = store.read(current, h5f0, n_rows=1)
        if isinstance(obj, (Table, Array, VectorOfVectors)):
            lgdos.append(current)

@@ -139,12 +139,6 @@ def _remove_nested_fields(lgdos: dict, obj_list: list):
            _inplace_table_filter(key, val, obj_list)


-def _slice(obj, n_rows):
-    ak_obj = obj.view_as("ak")[:n_rows]
-    obj_type = type(obj)
-    return obj_type(ak_obj)
-
-
 def lh5concat(
    lh5_files: list,
    output: str,
@@ -186,8 +180,8 @@ def lh5concat(
    # loop over lgdo objects
    for lgdo in lgdos:
        # iterate over the files
-        for lh5_obj, _, n_rows in LH5Iterator(lh5_files, lgdo):
-            data = {lgdo: _slice(lh5_obj, n_rows)}
+        for lh5_obj in LH5Iterator(lh5_files, lgdo):
+            data = {lgdo: lh5_obj}

            # remove the nested fields
            _remove_nested_fields(data, obj_list)
lgdo/lh5/core.py CHANGED
@@ -4,6 +4,7 @@ import bisect
 import inspect
 import sys
 from collections.abc import Mapping, Sequence
+from contextlib import suppress
 from typing import Any

 import h5py
@@ -92,8 +93,7 @@ def read(
        will be set to ``True``, while the rest will default to ``False``.
    obj_buf
        Read directly into memory provided in `obj_buf`. Note: the buffer
-        will be expanded to accommodate the data requested. To maintain the
-        buffer length, send in ``n_rows = len(obj_buf)``.
+        will be resized to accommodate the data retrieved.
    obj_buf_start
        Start location in ``obj_buf`` for read. For concatenating data to
        array-like objects.
@@ -106,12 +106,8 @@ def read(

    Returns
    -------
-    (object, n_rows_read)
-        `object` is the read-out object, `n_rows_read` is the number of rows
-        successfully read out. Essential for arrays when the amount of data
-        is smaller than the object buffer. For scalars and structs
-        `n_rows_read` will be ``1``. For tables it is redundant with
-        ``table.loc``. If `obj_buf` is ``None``, only `object` is returned.
+    object
+        the read-out object
    """
    if isinstance(lh5_file, h5py.File):
        lh5_obj = lh5_file[name]
@@ -119,12 +115,12 @@ def read(
        lh5_file = h5py.File(lh5_file, mode="r", locking=locking)
        lh5_obj = lh5_file[name]
    else:
-        lh5_files = list(lh5_file)
-
-        n_rows_read = 0
-        obj_buf_is_new = False
+        if obj_buf is not None:
+            obj_buf.resize(obj_buf_start)
+        else:
+            obj_buf_start = 0

-        for i, h5f in enumerate(lh5_files):
+        for i, h5f in enumerate(lh5_file):
            if (
                isinstance(idx, (list, tuple))
                and len(idx) > 0
@@ -146,33 +142,26 @@ def read(
                idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
            else:
                idx_i = None
-                n_rows_i = n_rows - n_rows_read

-            obj_ret = read(
+            obj_buf_start_i = len(obj_buf) if obj_buf else 0
+            n_rows_i = n_rows - (obj_buf_start_i - obj_buf_start)
+
+            obj_buf = read(
                name,
                h5f,
-                start_row,
+                start_row if i == 0 else 0,
                n_rows_i,
                idx_i,
                use_h5idx,
                field_mask,
                obj_buf,
-                obj_buf_start,
+                obj_buf_start_i,
                decompress,
            )
-            if isinstance(obj_ret, tuple):
-                obj_buf, n_rows_read_i = obj_ret
-                obj_buf_is_new = True
-            else:
-                obj_buf = obj_ret
-                n_rows_read_i = len(obj_buf)

-            n_rows_read += n_rows_read_i
-            if n_rows_read >= n_rows or obj_buf is None:
-                return obj_buf, n_rows_read
-            start_row = 0
-            obj_buf_start += n_rows_read_i
-        return obj_buf if obj_buf_is_new else (obj_buf, n_rows_read)
+            if obj_buf is None or (len(obj_buf) - obj_buf_start) >= n_rows:
+                return obj_buf
+        return obj_buf

    if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
        idx = idx[0]
@@ -192,8 +181,10 @@ def read(
        obj_buf_start=obj_buf_start,
        decompress=decompress,
    )
+    with suppress(AttributeError):
+        obj.resize(obj_buf_start + n_rows_read)

-    return obj if obj_buf is None else (obj, n_rows_read)
+    return obj


 def write(
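
The upshot for callers: `read` now always returns just the LGDO, never an `(object, n_rows_read)` tuple, and a supplied `obj_buf` is resized so that the row count is simply its length. A minimal sketch of the migration (file and group names are hypothetical):

    from lgdo import lh5

    # 1.11.x: obj, n_rows_read = lh5.read("geds/raw", "data.lh5")
    # 1.12.0a1: only the object is returned; its length is the row count
    obj = lh5.read("geds/raw", "data.lh5")
    n_rows_read = len(obj)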
lgdo/lh5/iterator.py CHANGED
@@ -24,7 +24,8 @@ class LH5Iterator(typing.Iterator):

    This can be used as an iterator:

-    >>> for lh5_obj, i_entry, n_rows in LH5Iterator(...):
+
+    >>> for lh5_obj in LH5Iterator(...):
    >>>     # do the thing!

    This is intended for if you are reading a large quantity of data. This
@@ -42,6 +43,8 @@ class LH5Iterator(typing.Iterator):
    In addition to accessing requested data via ``lh5_obj``, several
    properties exist to tell you where that data came from:

+    - lh5_it.current_i_entry: get the index within the entry list of the
+      first entry that is currently read
    - lh5_it.current_local_entries: get the entry numbers relative to the
      file the data came from
    - lh5_it.current_global_entries: get the entry number relative to the
@@ -49,9 +52,9 @@
    - lh5_it.current_files: get the file name corresponding to each entry
    - lh5_it.current_groups: get the group name corresponding to each entry

-    This class can also be used either for random access:
+    This class can also be used for random access:

-    >>> lh5_obj, n_rows = lh5_it.read(i_entry)
+    >>> lh5_obj = lh5_it.read(i_entry)

    to read the block of entries starting at i_entry. In case of multiple files
    or the use of an event selection, i_entry refers to a global event index
@@ -65,6 +68,8 @@
        base_path: str = "",
        entry_list: list[int] | list[list[int]] | None = None,
        entry_mask: list[bool] | list[list[bool]] | None = None,
+        i_start: int = 0,
+        n_entries: int | None = None,
        field_mask: dict[str, bool] | list[str] | tuple[str] | None = None,
        buffer_len: int = "100*MB",
        file_cache: int = 10,
@@ -89,6 +94,10 @@
        entry_mask
            mask of entries to read. If a list of arrays is provided, expect
            one for each file. Ignore if a selection list is provided.
+        i_start
+            index of first entry to start at when iterating
+        n_entries
+            number of entries to read before terminating iteration
        field_mask
            mask of which fields to read. See :meth:`LH5Store.read` for
            more details.
@@ -183,7 +192,8 @@
            msg = f"can't open any files from {lh5_files}"
            raise RuntimeError(msg)

-        self.n_rows = 0
+        self.i_start = i_start
+        self.n_entries = n_entries
        self.current_i_entry = 0
        self.next_i_entry = 0
@@ -317,14 +327,21 @@
            )
        return self.global_entry_list

-    def read(self, i_entry: int) -> tuple[LGDO, int]:
-        """Read the next local chunk of events, starting at i_entry. Return the
-        LH5 buffer and number of rows read."""
-        self.n_rows = 0
-        i_file = np.searchsorted(self.entry_map, i_entry, "right")
+    def read(self, i_entry: int, n_entries: int | None = None) -> LGDO:
+        "Read the next local chunk of events, starting at i_entry."
+        self.lh5_buffer.resize(0)
+
+        if n_entries is None:
+            n_entries = self.buffer_len
+        elif n_entries == 0:
+            return self.lh5_buffer
+        elif n_entries > self.buffer_len:
+            msg = "n_entries cannot be larger than buffer_len"
+            raise ValueError(msg)

        # if file hasn't been opened yet, search through files
        # sequentially until we find the right one
+        i_file = np.searchsorted(self.entry_map, i_entry, "right")
        if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("q").max:
            while i_file < len(self.lh5_files) and i_entry >= self._get_file_cumentries(
                i_file
@@ -332,10 +349,10 @@
                i_file += 1

        if i_file == len(self.lh5_files):
-            return (self.lh5_buffer, self.n_rows)
+            return self.lh5_buffer
        local_i_entry = i_entry - self._get_file_cumentries(i_file - 1)

-        while self.n_rows < self.buffer_len and i_file < len(self.file_map):
+        while len(self.lh5_buffer) < n_entries and i_file < len(self.file_map):
            # Loop through files
            local_idx = self.get_file_entrylist(i_file)
            if local_idx is not None and len(local_idx) == 0:
@@ -344,18 +361,17 @@
                continue

            i_local = local_i_entry if local_idx is None else local_idx[local_i_entry]
-            self.lh5_buffer, n_rows = self.lh5_st.read(
+            self.lh5_buffer = self.lh5_st.read(
                self.groups[i_file],
                self.lh5_files[i_file],
                start_row=i_local,
-                n_rows=self.buffer_len - self.n_rows,
+                n_rows=n_entries - len(self.lh5_buffer),
                idx=local_idx,
                field_mask=self.field_mask,
                obj_buf=self.lh5_buffer,
-                obj_buf_start=self.n_rows,
+                obj_buf_start=len(self.lh5_buffer),
            )

-            self.n_rows += n_rows
            i_file += 1
            local_i_entry = 0
@@ -364,7 +380,7 @@
        if self.friend is not None:
            self.friend.read(i_entry)

-        return (self.lh5_buffer, self.n_rows)
+        return self.lh5_buffer

    def reset_field_mask(self, mask):
        """Replaces the field mask of this iterator and any friends with mask"""
@@ -375,7 +391,7 @@
    @property
    def current_local_entries(self) -> NDArray[int]:
        """Return list of local file entries in buffer"""
-        cur_entries = np.zeros(self.n_rows, dtype="int32")
+        cur_entries = np.zeros(len(self.lh5_buffer), dtype="int32")
        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
        file_start = self._get_file_cumentries(i_file - 1)
        i_local = self.current_i_entry - file_start
@@ -402,7 +418,7 @@
    @property
    def current_global_entries(self) -> NDArray[int]:
        """Return list of local file entries in buffer"""
-        cur_entries = np.zeros(self.n_rows, dtype="int32")
+        cur_entries = np.zeros(len(self.lh5_buffer), dtype="int32")
        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
        file_start = self._get_file_cumentries(i_file - 1)
        i_local = self.current_i_entry - file_start
@@ -433,7 +449,7 @@
    @property
    def current_files(self) -> NDArray[str]:
        """Return list of file names for entries in buffer"""
-        cur_files = np.zeros(self.n_rows, dtype=object)
+        cur_files = np.zeros(len(self.lh5_buffer), dtype=object)
        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
        file_start = self._get_file_cumentries(i_file - 1)
        i_local = self.current_i_entry - file_start
@@ -455,7 +471,7 @@
    @property
    def current_groups(self) -> NDArray[str]:
        """Return list of group names for entries in buffer"""
-        cur_groups = np.zeros(self.n_rows, dtype=object)
+        cur_groups = np.zeros(len(self.lh5_buffer), dtype=object)
        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
        file_start = self._get_file_cumentries(i_file - 1)
        i_local = self.current_i_entry - file_start
@@ -485,14 +501,19 @@
    def __iter__(self) -> typing.Iterator:
        """Loop through entries in blocks of size buffer_len."""
        self.current_i_entry = 0
-        self.next_i_entry = 0
+        self.next_i_entry = self.i_start
        return self

    def __next__(self) -> tuple[LGDO, int, int]:
-        """Read next buffer_len entries and return lh5_table, iterator entry
-        and n_rows read."""
-        buf, n_rows = self.read(self.next_i_entry)
-        self.next_i_entry = self.current_i_entry + n_rows
-        if n_rows == 0:
+        """Read next buffer_len entries and return lh5_table."""
+        n_entries = self.n_entries
+        if n_entries is not None:
+            n_entries = min(
+                self.buffer_len, n_entries + self.i_start - self.next_i_entry
+            )
+
+        buf = self.read(self.next_i_entry, n_entries)
+        if len(buf) == 0:
            raise StopIteration
-        return (buf, self.current_i_entry, n_rows)
+        self.next_i_entry = self.current_i_entry + len(buf)
+        return buf
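
Iteration follows the same convention: the loop variable is now just the buffer, with the entry index and row count available as `current_i_entry` and `len(buf)`, and the new `i_start`/`n_entries` arguments bound the iteration window. A sketch (file and group names are hypothetical):

    from lgdo.lh5 import LH5Iterator

    it = LH5Iterator(
        ["run0.lh5", "run1.lh5"], "geds/raw", i_start=1000, n_entries=5000
    )
    for lh5_obj in it:
        n_rows = len(lh5_obj)         # was the third element of the old tuple
        i_entry = it.current_i_entry  # was the second element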
lgdo/lh5/store.py CHANGED
@@ -5,7 +5,6 @@ HDF5 files.

 from __future__ import annotations

-import bisect
 import logging
 import os
 import sys
@@ -15,11 +14,11 @@ from inspect import signature
 from typing import Any

 import h5py
-import numpy as np
 from numpy.typing import ArrayLike

 from .. import types
 from . import _serializers, utils
+from .core import read

 log = logging.getLogger(__name__)

@@ -155,7 +154,7 @@ class LH5Store:
        """Returns an LH5 object appropriate for use as a pre-allocated buffer
        in a read loop. Sets size to `size` if object has a size.
        """
-        obj, n_rows = self.read(name, lh5_file, n_rows=0, field_mask=field_mask)
+        obj = self.read(name, lh5_file, n_rows=0, field_mask=field_mask)
        if hasattr(obj, "resize") and size is not None:
            obj.resize(new_size=size)
        return obj
@@ -182,72 +181,20 @@ class LH5Store:
        """
        # grab files from store
        if isinstance(lh5_file, (str, h5py.File)):
-            lh5_obj = self.gimme_file(lh5_file, "r", **file_kwargs)[name]
+            h5f = self.gimme_file(lh5_file, "r", **file_kwargs)
        else:
-            lh5_files = list(lh5_file)
-            n_rows_read = 0
-
-            for i, h5f in enumerate(lh5_files):
-                if (
-                    isinstance(idx, (list, tuple))
-                    and len(idx) > 0
-                    and not np.isscalar(idx[0])
-                ):
-                    # a list of lists: must be one per file
-                    idx_i = idx[i]
-                elif idx is not None:
-                    # make idx a proper tuple if it's not one already
-                    if not (isinstance(idx, tuple) and len(idx) == 1):
-                        idx = (idx,)
-                    # idx is a long continuous array
-                    n_rows_i = utils.read_n_rows(name, h5f)
-                    # find the length of the subset of idx that contains indices
-                    # that are less than n_rows_i
-                    n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
-                    # now split idx into idx_i and the remainder
-                    idx_i = np.array(idx[0])[:n_rows_to_read_i]
-                    idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
-                else:
-                    idx_i = None
-                    n_rows_i = n_rows - n_rows_read
-
-                obj_buf, n_rows_read_i = self.read(
-                    name,
-                    h5f,
-                    start_row,
-                    n_rows_i,
-                    idx_i,
-                    use_h5idx,
-                    field_mask,
-                    obj_buf,
-                    obj_buf_start,
-                    decompress,
-                )
-
-                n_rows_read += n_rows_read_i
-                if n_rows_read >= n_rows or obj_buf is None:
-                    return obj_buf, n_rows_read
-                start_row = 0
-                obj_buf_start += n_rows_read_i
-            return obj_buf, n_rows_read
-
-        if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
-            idx = idx[0]
-        if isinstance(idx, np.ndarray) and idx.dtype == np.dtype("?"):
-            idx = np.where(idx)[0]
-
-        return _serializers._h5_read_lgdo(
-            lh5_obj.id,
-            lh5_obj.file.filename,
-            lh5_obj.name,
-            start_row=start_row,
-            n_rows=n_rows,
-            idx=idx,
-            use_h5idx=use_h5idx,
-            field_mask=field_mask,
-            obj_buf=obj_buf,
-            obj_buf_start=obj_buf_start,
-            decompress=decompress,
+            h5f = [self.gimme_file(f, "r", **file_kwargs) for f in lh5_file]
+        return read(
+            name,
+            h5f,
+            start_row,
+            n_rows,
+            idx,
+            use_h5idx,
+            field_mask,
+            obj_buf,
+            obj_buf_start,
+            decompress,
        )

    def write(
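
`LH5Store.read` thus becomes a thin wrapper: it resolves (and caches) file handles via `gimme_file` and hands everything, including multi-file lists, to `core.read`. Sketch (file names hypothetical):

    from lgdo.lh5 import LH5Store

    store = LH5Store()
    obj = store.read("geds/raw", ["run0.lh5", "run1.lh5"], n_rows=1000)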
lgdo/types/array.py CHANGED
@@ -17,12 +17,12 @@ import pint_pandas  # noqa: F401

 from .. import utils
 from ..units import default_units_registry as u
-from .lgdo import LGDO
+from .lgdo import LGDOCollection

 log = logging.getLogger(__name__)


-class Array(LGDO):
+class Array(LGDOCollection):
    r"""Holds an :class:`numpy.ndarray` and attributes.

    :class:`Array` (and the other various array types) holds an `nda` instead
@@ -78,11 +78,7 @@ class Array(LGDO):
        elif isinstance(nda, Array):
            nda = nda.nda

-        elif not isinstance(nda, np.ndarray):
-            nda = np.array(nda)
-
        self.nda = nda
-        self.dtype = self.nda.dtype

        super().__init__(attrs)

@@ -96,18 +92,83 @@ class Array(LGDO):
        return dt + "<" + nd + ">{" + et + "}"

    def __len__(self) -> int:
-        return len(self.nda)
+        return self._size
+
+    @property
+    def nda(self):
+        return self._nda[: self._size, ...] if self._nda.shape != () else self._nda
+
+    @nda.setter
+    def nda(self, value):
+        self._nda = value if isinstance(value, np.ndarray) else np.array(value)
+        self._size = len(self._nda) if self._nda.shape != () else 0
+
+    @property
+    def dtype(self):
+        return self._nda.dtype
+
+    @property
+    def shape(self):
+        return (len(self),) + self._nda.shape[1:]
+
+    def reserve_capacity(self, capacity: int) -> None:
+        "Set size (number of rows) of internal memory buffer"
+        if capacity < len(self):
+            msg = "Cannot reduce capacity below Array length"
+            raise ValueError(msg)
+        self._nda.resize((capacity,) + self._nda.shape[1:], refcheck=False)
+
+    def get_capacity(self) -> int:
+        "Get capacity (i.e. max size before memory must be re-allocated)"
+        return len(self._nda)
+
+    def trim_capacity(self) -> None:
+        "Set capacity to be minimum needed to support Array size"
+        self.reserve_capacity(np.prod(self.shape))
+
+    def resize(self, new_size: int, trim=False) -> None:
+        """Set size of Array in rows. Only change capacity if it must be
+        increased to accommodate new rows; in this case double capacity.
+        If trim is True, capacity will be set to match size."""
+
+        self._size = new_size
+
+        if trim and new_size != self.get_capacity():
+            self.reserve_capacity(new_size)

-    def resize(self, new_size: int) -> None:
-        new_shape = (new_size,) + self.nda.shape[1:]
-        return self.nda.resize(new_shape, refcheck=True)
+        # If capacity is not big enough, set to next power of 2 big enough
+        if new_size > self.get_capacity():
+            self.reserve_capacity(int(2 ** (np.ceil(np.log2(new_size)))))

    def append(self, value: np.ndarray) -> None:
-        self.resize(len(self) + 1)
-        self.nda[-1] = value
+        "Append value to end of array (with copy)"
+        self.insert(len(self), value)

    def insert(self, i: int, value: int | float) -> None:
-        self.nda = np.insert(self.nda, i, value)
+        "Insert value into row i (with copy)"
+        if i > len(self):
+            msg = f"index {i} is out of bounds for array with size {len(self)}"
+            raise IndexError(msg)
+
+        value = np.array(value)
+        if value.shape == self.shape[1:]:
+            self.resize(len(self) + 1)
+            self[i + 1 :] = self[i:-1]
+            self[i] = value
+        elif value.shape[1:] == self.shape[1:]:
+            self.resize(len(self) + len(value))
+            self[i + len(value) :] = self[i : -len(value)]
+            self[i : i + len(value)] = value
+        else:
+            msg = f"Could not insert value with shape {value.shape} into Array with shape {self.shape}"
+            raise ValueError(msg)
+
+    def replace(self, i: int, value: int | float) -> None:
+        "Replace value at row i"
+        if i >= len(self):
+            msg = f"index {i} is out of bounds for array with size {len(self)}"
+            raise IndexError(msg)
+        self[i] = value

    def __getitem__(self, key):
        return self.nda[key]
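
`Array` now distinguishes its visible size (`len`) from the capacity of the backing numpy buffer, so growth amortizes reallocation by jumping to the next power of two. A minimal sketch of the new semantics:

    import numpy as np
    from lgdo import Array

    a = Array(np.arange(5))
    a.resize(3)                   # visible size shrinks ...
    assert len(a) == 3
    assert a.get_capacity() == 5  # ... but the buffer keeps its 5 slots
    a.append(99)                  # reuses spare capacity, no reallocation
    a.trim_capacity()             # shrink the buffer to fit the data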
lgdo/types/encoded.py CHANGED
@@ -11,12 +11,12 @@ from numpy.typing import NDArray

 from .. import utils
 from .array import Array
-from .lgdo import LGDO
+from .lgdo import LGDOCollection
 from .scalar import Scalar
 from .vectorofvectors import VectorOfVectors


-class VectorOfEncodedVectors(LGDO):
+class VectorOfEncodedVectors(LGDOCollection):
    """An array of variable-length encoded arrays.

    Used to represent an encoded :class:`.VectorOfVectors`. In addition to an
@@ -92,6 +92,17 @@ class VectorOfEncodedVectors(LGDO):

        return False

+    def reserve_capacity(self, *capacity: int) -> None:
+        self.encoded_data.reserve_capacity(*capacity)
+        self.decoded_size.reserve_capacity(capacity[0])
+
+    def get_capacity(self) -> tuple:
+        return (self.decoded_size.get_capacity(), *self.encoded_data.get_capacity())
+
+    def trim_capacity(self) -> None:
+        self.encoded_data.trim_capacity()
+        self.decoded_size.trim_capacity()
+
    def resize(self, new_size: int) -> None:
        """Resize vector along the first axis.

@@ -102,21 +113,6 @@ class VectorOfEncodedVectors(LGDO):
        self.encoded_data.resize(new_size)
        self.decoded_size.resize(new_size)

-    def append(self, value: tuple[NDArray, int]) -> None:
-        """Append a 1D encoded vector at the end.
-
-        Parameters
-        ----------
-        value
-            a tuple holding the encoded array and its decoded size.
-
-        See Also
-        --------
-        .VectorOfVectors.append
-        """
-        self.encoded_data.append(value[0])
-        self.decoded_size.append(value[1])
-
    def insert(self, i: int, value: tuple[NDArray, int]) -> None:
        """Insert an encoded vector at index `i`.

@@ -282,7 +278,7 @@ class VectorOfEncodedVectors(LGDO):
            raise ValueError(msg)


-class ArrayOfEncodedEqualSizedArrays(LGDO):
+class ArrayOfEncodedEqualSizedArrays(LGDOCollection):
    """An array of encoded arrays with equal decoded size.

    Used to represent an encoded :class:`.ArrayOfEqualSizedArrays`. In addition
@@ -349,14 +345,23 @@ class ArrayOfEncodedEqualSizedArrays(LGDO):

        return False

-    def resize(self, new_size: int) -> None:
+    def reserve_capacity(self, *capacity: int) -> None:
+        self.encoded_data.reserve_capacity(*capacity)
+
+    def get_capacity(self) -> tuple:
+        return self.encoded_data.get_capacity()
+
+    def trim_capacity(self) -> None:
+        self.encoded_data.trim_capacity()
+
+    def resize(self, new_size: int, trim: bool = False) -> None:
        """Resize array along the first axis.

        See Also
        --------
        .VectorOfVectors.resize
        """
-        self.encoded_data.resize(new_size)
+        self.encoded_data.resize(new_size, trim)

    def append(self, value: NDArray) -> None:
        """Append a 1D encoded array at the end.
lgdo/types/histogram.py CHANGED
@@ -424,7 +424,7 @@ class Histogram(Struct):
            dict.__setitem__(self, name, obj)
        else:
            msg = "histogram fields cannot be mutated "
-            raise TypeError(msg)
+            raise AttributeError(msg)

    def __getattr__(self, name: str) -> None:
        # do not allow for new attributes on this
lgdo/types/lgdo.py CHANGED
@@ -92,3 +92,53 @@ class LGDO(ABC):

    def __repr__(self) -> str:
        return self.__class__.__name__ + f"(attrs={self.attrs!r})"
+
+
+class LGDOCollection(LGDO):
+    """Abstract base class representing a LEGEND Collection Object (LGDO).
+    This defines the interface for classes used as table columns.
+    """
+
+    @abstractmethod
+    def __init__(self, attrs: dict[str, Any] | None = None) -> None:
+        super().__init__(attrs)
+
+    @abstractmethod
+    def __len__(self) -> int:
+        """Provides ``__len__`` for this array-like class."""
+
+    @abstractmethod
+    def reserve_capacity(self, capacity: int) -> None:
+        """Reserve capacity (in rows) for later use. Internal memory buffers
+        will have enough entries to store this many rows.
+        """
+
+    @abstractmethod
+    def get_capacity(self) -> int:
+        "get reserved capacity of internal memory buffers in rows"
+
+    @abstractmethod
+    def trim_capacity(self) -> None:
+        """set capacity to only what is required to store current contents
+        of LGDOCollection
+        """
+
+    @abstractmethod
+    def resize(self, new_size: int, trim: bool = False) -> None:
+        """set size of LGDOCollection to new_size rows"""
+
+    def append(self, val) -> None:
+        "append val to end of LGDOCollection"
+        self.insert(len(self), val)
+
+    @abstractmethod
+    def insert(self, i: int, val) -> None:
+        "insert val into LGDOCollection at position i"
+
+    @abstractmethod
+    def replace(self, i: int, val) -> None:
+        "replace item at position i with val in LGDOCollection"
+
+    def clear(self, trim: bool = False) -> None:
+        "set size of LGDOCollection to zero"
+        self.resize(0, trim=trim)
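
`append` and `clear` are the only concrete methods: `append` delegates to `insert` at the end, and `clear` is sugar for `resize(0, trim=trim)`, which by default keeps the capacity around for reuse. Sketch using `Array` as the concrete collection:

    import numpy as np
    from lgdo import Array

    col = Array(np.arange(8))
    col.clear()                  # size -> 0, capacity still 8 (cheap)
    assert len(col) == 0 and col.get_capacity() == 8
    col.clear(trim=True)         # also frees the backing buffer
    assert col.get_capacity() == 0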
lgdo/types/table.py CHANGED
@@ -19,7 +19,7 @@ from pandas.io.formats import format as fmt

 from .array import Array
 from .arrayofequalsizedarrays import ArrayOfEqualSizedArrays
-from .lgdo import LGDO
+from .lgdo import LGDO, LGDOCollection
 from .scalar import Scalar
 from .struct import Struct
 from .vectorofvectors import VectorOfVectors
@@ -27,13 +27,9 @@ from .vectorofvectors import VectorOfVectors

 log = logging.getLogger(__name__)


-class Table(Struct):
+class Table(Struct, LGDOCollection):
    """A special struct of arrays or subtable columns of equal length.

-    Holds onto an internal read/write location ``loc`` that is useful in
-    managing table I/O using functions like :meth:`push_row`, :meth:`is_full`,
-    and :meth:`clear`.
-
    Note
    ----
    If you write to a table and don't fill it up to its total size, be sure to
@@ -49,7 +45,7 @@ class Table(Struct):

    def __init__(
        self,
-        col_dict: Mapping[str, LGDO] | pd.DataFrame | ak.Array | None = None,
+        col_dict: Mapping[str, LGDOCollection] | pd.DataFrame | ak.Array | None = None,
        size: int | None = None,
        attrs: Mapping[str, Any] | None = None,
    ) -> None:
@@ -65,7 +61,7 @@ class Table(Struct):
        col_dict
            instantiate this table using the supplied mapping of column names
            and array-like objects. Supported input types are: mapping of
-            strings to LGDOs, :class:`pd.DataFrame` and :class:`ak.Array`.
+            strings to LGDOCollections, :class:`pd.DataFrame` and :class:`ak.Array`.
            Note 1: no copy is performed, the objects are used directly (unless
            :class:`ak.Array` is provided). Note 2: if `size` is not ``None``,
            all arrays will be resized to match it. Note 3: if the arrays have
@@ -85,7 +81,8 @@ class Table(Struct):
            col_dict = _ak_to_lgdo_or_col_dict(col_dict)

        # call Struct constructor
-        super().__init__(obj_dict=col_dict, attrs=attrs)
+        Struct.__init__(self, obj_dict=col_dict)
+        LGDOCollection.__init__(self, attrs=attrs)

        # if col_dict is not empty, set size according to it
        # if size is also supplied, resize all fields to match it
@@ -93,13 +90,10 @@ class Table(Struct):
        if col_dict is not None and len(col_dict) > 0:
            self.resize(new_size=size, do_warn=(size is None))

-        # if no col_dict, just set the size (default to 1024)
+        # if no col_dict, just set the size
        else:
            self.size = size if size is not None else None

-        # always start at loc=0
-        self.loc = 0
-
    def datatype_name(self) -> str:
        return "table"

@@ -107,7 +101,31 @@ class Table(Struct):
        """Provides ``__len__`` for this array-like class."""
        return self.size

-    def resize(self, new_size: int | None = None, do_warn: bool = False) -> None:
+    def reserve_capacity(self, capacity: int | list) -> None:
+        "Set size (number of rows) of internal memory buffer"
+        if isinstance(capacity, int):
+            for obj in self.values():
+                obj.reserve_capacity(capacity)
+        else:
+            if len(capacity) != len(self.keys()):
+                msg = "List of capacities must have same length as number of keys"
+                raise ValueError(msg)
+
+            for obj, cap in zip(self.values(), capacity):
+                obj.reserve_capacity(cap)
+
+    def get_capacity(self) -> int:
+        "Get list of capacities for each key"
+        return [v.get_capacity() for v in self.values()]
+
+    def trim_capacity(self) -> int:
+        "Set capacity to be minimum needed to support Array size"
+        for v in self.values():
+            v.trim_capacity()
+
+    def resize(
+        self, new_size: int | None = None, do_warn: bool = False, trim: bool = False
+    ) -> None:
        # if new_size = None, use the size from the first field
        for field, obj in self.items():
            if new_size is None:
@@ -119,21 +137,20 @@ class Table(Struct):
                    f"with size {len(obj)} != {new_size}"
                )
            if isinstance(obj, Table):
-                obj.resize(new_size)
+                obj.resize(new_size, trim)
            else:
-                obj.resize(new_size)
+                obj.resize(new_size, trim)
        self.size = new_size

-    def push_row(self) -> None:
-        self.loc += 1
-
-    def is_full(self) -> bool:
-        return self.loc >= self.size
-
-    def clear(self) -> None:
-        self.loc = 0
+    def insert(self, i: int, vals: dict) -> None:
+        "Insert vals into table at row i. Vals is a mapping from table key to val"
+        for k, ar in self.items():
+            ar.insert(i, vals[k])
+        self.size += 1

-    def add_field(self, name: str, obj: LGDO, use_obj_size: bool = False) -> None:
+    def add_field(
+        self, name: str, obj: LGDOCollection, use_obj_size: bool = False
+    ) -> None:
        """Add a field (column) to the table.

        Use the name "field" here to match the terminology used in
@@ -170,7 +187,9 @@ class Table(Struct):
        new_size = len(obj) if use_obj_size else self.size
        self.resize(new_size=new_size)

-    def add_column(self, name: str, obj: LGDO, use_obj_size: bool = False) -> None:
+    def add_column(
+        self, name: str, obj: LGDOCollection, use_obj_size: bool = False
+    ) -> None:
        """Alias for :meth:`.add_field` using table terminology 'column'."""
        self.add_field(name, obj, use_obj_size=use_obj_size)

@@ -201,8 +220,10 @@ class Table(Struct):
        set to ``False`` to turn off warnings associated with mismatched
        `loc` parameter or :meth:`add_column` warnings.
        """
-        if other_table.loc != self.loc and do_warn:
-            log.warning(f"other_table.loc ({other_table.loc}) != self.loc({self.loc})")
+        if len(other_table) != len(self) and do_warn:
+            log.warning(
+                f"len(other_table) ({len(other_table)}) != len(self) ({len(self)})"
+            )
        if cols is None:
            cols = other_table.keys()
        for name in cols:
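
With `loc`, `push_row`, `is_full`, and the old `clear` gone, row-wise filling goes through `insert` with a column-name-to-value mapping (or by resizing up front and assigning). Sketch:

    import numpy as np
    from lgdo import Array, Table

    t = Table(col_dict={"a": Array(np.array([1, 2])), "b": Array(np.array([3.0, 4.0]))})
    # old pattern: write at t.loc, then t.push_row()
    t.insert(len(t), {"a": 5, "b": 6.0})  # new pattern: append a row mapping
    assert len(t) == 3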
lgdo/types/vectorofvectors.py CHANGED
@@ -20,12 +20,12 @@ from .. import utils
 from . import arrayofequalsizedarrays as aoesa
 from . import vovutils
 from .array import Array
-from .lgdo import LGDO
+from .lgdo import LGDOCollection

 log = logging.getLogger(__name__)


-class VectorOfVectors(LGDO):
+class VectorOfVectors(LGDOCollection):
    """A n-dimensional variable-length 1D array of variable-length 1D arrays.

    If the vector is 2-dimensional, the internal representation is as two NumPy
@@ -210,20 +210,17 @@
        elif self.flattened_data is None:
            self.flattened_data = flattened_data

-        # finally set dtype
-        self.dtype = self.flattened_data.dtype
-
-        # set ndim
-        self.ndim = 2
-        pointer = self.flattened_data
-        while True:
-            if isinstance(pointer, Array):
-                break
+        super().__init__(attrs)

-            self.ndim += 1
-            pointer = pointer.flattened_data
+    @property
+    def ndim(self):
+        return 1 + (
+            1 if isinstance(self.flattened_data, Array) else self.flattened_data.ndim
+        )

-        super().__init__(attrs)
+    @property
+    def dtype(self) -> np.dtype:
+        return self.flattened_data.dtype

    def datatype_name(self) -> str:
        return "array"
@@ -276,7 +273,30 @@
        else:
            raise NotImplementedError

-    def resize(self, new_size: int) -> None:
+    def reserve_capacity(self, cap_cl, *cap_args) -> None:
+        """Set capacity of internal data arrays. Expect number of args to
+        equal `self.n_dim`. First arg is capacity of cumulative length array.
+        If `self.n_dim` is 2, second argument is capacity of flattened data,
+        otherwise arguments are fed recursively to remaining dimensions.
+        """
+        self.cumulative_length.reserve_capacity(cap_cl)
+        self.flattened_data.reserve_capacity(*cap_args)
+
+    def get_capacity(self) -> tuple[int]:
+        """Get tuple containing capacity of each dimension. First dimension
+        is cumulative length array. Last dimension is flattened data.
+        """
+        fd_cap = self.flattened_data.get_capacity()
+        if isinstance(fd_cap, int):
+            return (self.cumulative_length.get_capacity(), fd_cap)
+        return (self.cumulative_length.get_capacity(), *fd_cap)
+
+    def trim_capacity(self) -> None:
+        "Set capacity for all dimensions to minimum needed to hold data"
+        self.cumulative_length.trim_capacity()
+        self.flattened_data.trim_capacity()
+
+    def resize(self, new_size: int, trim: bool = False) -> None:
        """Resize vector along the first axis.

        `self.flattened_data` is resized only if `new_size` is smaller than the
@@ -286,6 +306,8 @@
        `self.cumulative_length` is padded with its last element. This
        corresponds to appending empty vectors.

+        If `trim` is ``True``, resize capacity to match new size.
+
        Examples
        --------
        >>> vov = VectorOfVectors([[1, 2, 3], [4, 5]])
@@ -303,23 +325,22 @@
        [3],
        ]
        """
-        vidx = self.cumulative_length
        old_s = len(self)
-        dlen = new_size - old_s
-        csum = vidx[-1] if len(self) > 0 else 0

        # first resize the cumulative length
-        self.cumulative_length.resize(new_size)
+        self.cumulative_length.resize(new_size, trim)

        # if new_size > size, new elements are filled with zeros, let's fix
        # that
-        if dlen > 0:
-            self.cumulative_length[old_s:] = csum
+        if new_size > old_s:
+            self.cumulative_length[old_s:] = self.cumulative_length[old_s - 1]

        # then resize the data array
        # if dlen > 0 this has no effect
        if len(self.cumulative_length) > 0:
-            self.flattened_data.resize(self.cumulative_length[-1])
+            self.flattened_data.resize(self.cumulative_length[-1], trim)
+        else:
+            self.flattened_data.resize(0, trim)

    def append(self, new: NDArray) -> None:
        """Append a 1D vector `new` at the end.
@@ -334,20 +355,7 @@
        [8 9],
        ]
        """
-        if self.ndim == 2:
-            # first extend cumulative_length by +1
-            self.cumulative_length.resize(len(self) + 1)
-            # set it at the right value
-            newlen = (
-                self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
-            )
-            self.cumulative_length[-1] = newlen
-            # then resize flattened_data to accommodate the new vector
-            self.flattened_data.resize(len(self.flattened_data) + len(new))
-            # finally set it
-            self[-1] = new
-        else:
-            raise NotImplementedError
+        self.insert(len(self), new)

    def insert(self, i: int, new: NDArray) -> None:
        """Insert a vector at index `i`.
@@ -364,23 +372,15 @@
        [8 9],
        [4 5],
        ]
-
-        Warning
-        -------
-        This method involves a significant amount of memory re-allocation and
-        is expected to perform poorly on large vectors.
        """
        if self.ndim == 2:
-            if i >= len(self):
-                msg = f"index {i} is out of bounds for vector with size {len(self)}"
+            if i > len(self):
+                msg = f"index {i} is out of bounds for vector with size {len(self)}"
                raise IndexError(msg)

-            self.flattened_data = Array(
-                np.insert(self.flattened_data, self.cumulative_length[i - 1], new)
-            )
-            self.cumulative_length = Array(
-                np.insert(self.cumulative_length, i, self.cumulative_length[i - 1])
-            )
+            i_start = 0 if i == 0 else self.cumulative_length[i - 1]
+            self.flattened_data.insert(i_start, new)
+            self.cumulative_length.insert(i, i_start)
            self.cumulative_length[i:] += np.uint32(len(new))
        else:
            raise NotImplementedError
@@ -400,11 +400,6 @@
        [[8 9],
        [4 5],
        ]
-
-        Warning
-        -------
-        This method involves a significant amount of memory re-allocation and
-        is expected to perform poorly on large vectors.
        """
        if self.ndim == 2:
            if i >= len(self):
@@ -414,27 +409,17 @@
            vidx = self.cumulative_length
            dlen = len(new) - len(self[i])

-            if dlen == 0:
-                # don't waste resources
-                self[i] = new
-            elif dlen < 0:
-                start = vidx[i - 1]
-                stop = start + len(new)
-                # set the already allocated indices
-                self.flattened_data[start:stop] = new
-                # then delete the extra indices
-                self.flattened_data = Array(
-                    np.delete(self.flattened_data, np.s_[stop : vidx[i]])
-                )
-            else:
-                # set the already allocated indices
-                self.flattened_data[vidx[i - 1] : vidx[i]] = new[: len(self[i])]
-                # then insert the remaining
-                self.flattened_data = Array(
-                    np.insert(self.flattened_data, vidx[i], new[len(self[i]) :])
-                )
-
-            vidx[i:] = vidx[i:] + dlen
+            if dlen != 0:
+                # move the subsequent entries
+                vidx[i:] += dlen
+                self.flattened_data.resize(vidx[-1])
+                self.flattened_data._nda[vidx[i] : vidx[-1]] = self.flattened_data._nda[
+                    vidx[i] - dlen : vidx[-1] - dlen
+                ]
+
+            # set the already allocated indices
+            start = vidx[i - 1] if i > 0 else 0
+            self.flattened_data[start : vidx[i]] = new
        else:
            raise NotImplementedError
@@ -484,7 +469,15 @@
        cum_lens = np.add(start, lens.cumsum(), dtype=int)

        # fill with fast vectorized routine
-        vovutils._nb_fill(vec, lens, self.flattened_data.nda[start : cum_lens[-1]])
+        if np.issubdtype(self.flattened_data.dtype, np.unsignedinteger):
+            nan_val = np.iinfo(self.flattened_data.dtype).max
+        elif np.issubdtype(self.flattened_data.dtype, np.integer):
+            nan_val = np.iinfo(self.flattened_data.dtype).min
+        else:
+            nan_val = np.nan
+        vovutils._nb_fill(
+            vec, lens, nan_val, self.flattened_data.nda[start : cum_lens[-1]]
+        )

        # add new vector(s) length to cumulative_length
        self.cumulative_length[i : i + len(lens)] = cum_lens
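
`append` and `insert` now shift entries inside the preallocated buffers instead of rebuilding `flattened_data` with `np.insert`, so element-wise filling no longer reallocates on every call. Sketch:

    import numpy as np
    from lgdo import VectorOfVectors

    vov = VectorOfVectors([[1, 2, 3], [4, 5]])
    vov.append(np.array([6, 7]))   # routed through insert(len(vov), ...)
    vov.insert(1, np.array([8]))   # shifts in place, no np.insert copy
    assert len(vov) == 4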
lgdo/types/vovutils.py CHANGED
@@ -81,7 +81,7 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND

 @numba.guvectorize(
     [
-        f"{data_type}[:,:],{size_type}[:],{data_type}[:]"
+        f"{data_type}[:,:],{size_type}[:],{data_type},{data_type}[:]"
         for data_type in [
             "b1",
             "i1",
@@ -99,10 +99,12 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND
         ]
         for size_type in ["i4", "i8", "u4", "u8"]
     ],
-    "(l,m),(l),(n)",
+    "(l,m),(l),(),(n)",
     **nb_kwargs,
 )
-def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
+def _nb_fill(
+    aoa_in: NDArray, len_in: NDArray, nan_val: int | float, flattened_array_out: NDArray
+):
    """Vectorized function to fill flattened array from array of arrays and
    lengths. Values in aoa_in past lengths will not be copied.
@@ -112,6 +114,9 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
        array of arrays containing values to be copied
    len_in
        array of vector lengths for each row of aoa_in
+    nan_val
+        value to use when len_in is longer than aoa_in. Should use
+        np.nan for floating point, and 0xfff... for integer types
    flattened_array_out
        flattened array to copy values into. Must be longer than sum of
        lengths in len_in
@@ -122,9 +127,14 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
        raise ValueError(msg)

    start = 0
+    max_len = aoa_in.shape[1]
    for i, ll in enumerate(len_in):
        stop = start + ll
-        flattened_array_out[start:stop] = aoa_in[i, :ll]
+        if ll > max_len:
+            flattened_array_out[start : start + max_len] = aoa_in[i, :]
+            flattened_array_out[start + max_len : stop] = nan_val
+        else:
+            flattened_array_out[start:stop] = aoa_in[i, :ll]
        start = stop
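
`_nb_fill` is private API, but the new argument is easy to exercise in isolation: rows of `len_in` that exceed the width of `aoa_in` are padded with `nan_val` instead of reading past the row. Sketch (internal module, subject to change):

    import numpy as np
    from lgdo.types import vovutils

    aoa = np.array([[1, 2, 3], [4, 5, 6]])
    lens = np.array([2, 5])        # second length exceeds the row width (3)
    out = np.empty(lens.sum(), dtype=aoa.dtype)
    vovutils._nb_fill(aoa, lens, np.iinfo(aoa.dtype).min, out)
    # out -> [1, 2, 4, 5, 6, min, min]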