legend-pydataobj 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: legend_pydataobj
-Version: 1.10.0
+Version: 1.10.1
 Summary: LEGEND Python Data Objects
 Author: The LEGEND Collaboration
 Maintainer: The LEGEND Collaboration
RECORD CHANGED
@@ -1,5 +1,5 @@
 lgdo/__init__.py,sha256=1YUuAFQHNrOOkr3ZfrtEJOpYqgzbHRYA81ssbQZitQE,3196
-lgdo/_version.py,sha256=PEdW0PLUrZm2JiH_V3EAqPOK-ZxEDfT2nPsBGV10Pow,413
+lgdo/_version.py,sha256=8G9z72uuzZV_GnX2AJUyUhUAHl8bmD6KQXou4HB100U,413
 lgdo/cli.py,sha256=vB1Oj6kZ5gWaY9HBPBRRRyiepp72hm3bFvQeUUWeMYg,8214
 lgdo/lgdo_utils.py,sha256=6a2YWEwpyEMXlAyTHZMO01aqxy6SxJzPZkGNWKNWuS0,2567
 lgdo/lh5_store.py,sha256=5BzbJA9sLcqjp8bJDc2olwOiw0VS6rmfg3cfh1kQkRY,8512
@@ -9,26 +9,26 @@ lgdo/utils.py,sha256=9t_GYdB8aQhZ4Vz6ujmASzwCgTuP7ZdINtPTVPyIR6E,3661
 lgdo/compression/__init__.py,sha256=gqbdx4NnpCcW-C7kUXV-hVUZFiNlbCwIbs3uzFe4AFE,1127
 lgdo/compression/base.py,sha256=82cQJujfvoAOKBFx761dEcx_xM02TBCBBuBo6i78tuI,838
 lgdo/compression/generic.py,sha256=tF3UhLJbUDcovLxpIzgQRxFSjZ5Fz3uDRy9kI4mFntQ,2515
-lgdo/compression/radware.py,sha256=VbKAvi18h48Fz-ZxMEg64yD1ezaw1NkMZazxurdyMmc,24015
+lgdo/compression/radware.py,sha256=GcNTtjuyL7VBBqziUBmSqNXuhqy1bJJgvcyvyumPtrc,23839
 lgdo/compression/utils.py,sha256=W2RkBrxPpXlat84dnU9Ad7d_tTws0irtGl7O1dNWjnk,1140
 lgdo/compression/varlen.py,sha256=6ZZUItyoOfygDdE0DyoISeFZfqdbH6xl7T0eclfarzg,15127
 lgdo/lh5/__init__.py,sha256=KzWF6HI-6N1NqQUm8LAxMmDbg0rgRY4DAaJ2s7w2tLM,811
-lgdo/lh5/core.py,sha256=k6noKZIW3Aq1JPxV4ogLUgmsFrajMWwrQHc-7OYnVoQ,13769
+lgdo/lh5/core.py,sha256=YVtkTaU3SISDoLqR9UE_BDzsPApEW6_h_ac2NwSZ9zg,13868
 lgdo/lh5/datatype.py,sha256=O_7BqOlX8PFMyG0ppkfUT5aps5HEqX0bpuKcJO3jhu0,1691
 lgdo/lh5/exceptions.py,sha256=43fQ8MnAsylY4aG6GF6hsRclagYaMkUv8957c1uTjWE,962
-lgdo/lh5/iterator.py,sha256=eqH9a_ZjEhgqJUZbMj36jXK_1Xbx86450DVw7LHNB3Y,12369
-lgdo/lh5/store.py,sha256=vrvIbucCdKkAX3Ceo-fCuRJp4X7sofHq1gGKbFdeXyE,9895
+lgdo/lh5/iterator.py,sha256=ZaBBnmuNIjinwO0JUY55wLxX8Om9rVRRzXBC5uHmSKM,19772
+lgdo/lh5/store.py,sha256=3wAaQDd1Zmo0_bQ9DbB-FbKS4Uy_Tb642qKHXtZpSw4,10643
 lgdo/lh5/tools.py,sha256=T9CgHA8A3_tVBMtiNJ6hATQKhdqI61m3cX4p2wGKc6c,9937
-lgdo/lh5/utils.py,sha256=PG_iwLb-AHZgc2jYTdR6WZW_dD8kI_YnSOZsZ3SYFrY,7305
+lgdo/lh5/utils.py,sha256=ioz8DlyXZsejwnU2qYdIccdHcF12H62jgLkZsiDOLSM,6243
 lgdo/lh5/_serializers/__init__.py,sha256=NSH8uOVY3r_Wn3t0nQHhEHhkHT7-GJYlxuS3YTDJa5Y,1263
 lgdo/lh5/_serializers/read/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lgdo/lh5/_serializers/read/array.py,sha256=uWfMCihfAmW2DE2ewip2qCK_kvQC_mb2zvOv26uzijc,1000
-lgdo/lh5/_serializers/read/composite.py,sha256=XepzeY7oh_M3ejBBuCxU6LcJwQAKOvZHvZqDgOYXlIA,12409
+lgdo/lh5/_serializers/read/composite.py,sha256=vQGh6nUQdSuHl0NTF2FeU7QC9UAA-E7XvNvrRJi4uw8,12384
 lgdo/lh5/_serializers/read/encoded.py,sha256=Q98c08d8LkZq2AlY4rThYECVaEqwbv4T2Urn7TGnsyE,4130
-lgdo/lh5/_serializers/read/ndarray.py,sha256=m0uAwuVL00dt0I1weI9nuEYW25wJx6ZJDPTbZHMrqDo,3699
-lgdo/lh5/_serializers/read/scalar.py,sha256=ghw6VsZLGoZ9mmcY7G-NaEioAbocM9JHOqk9ipPE6U0,926
-lgdo/lh5/_serializers/read/utils.py,sha256=bIhz2RSxwYtvDKgqE7yBtF9hcqbMS0e-M8uM8bdvChA,1184
-lgdo/lh5/_serializers/read/vector_of_vectors.py,sha256=98P_XoXE8QWLQeSyBm9QHBF_5WGHKrfpNppLhj3QrbE,7169
+lgdo/lh5/_serializers/read/ndarray.py,sha256=lFCXD6bSzmMOH7cVmvRYXakkfMCI8EoqTPNONRJ1F0s,3690
+lgdo/lh5/_serializers/read/scalar.py,sha256=kwhWm1T91pXf86CqtUUD8_qheSR92gXZrQVtssV5YCg,922
+lgdo/lh5/_serializers/read/utils.py,sha256=0kYUFKiaQ3JUbjhP7tuKas_s80Kou6DhPlVXc40NHRE,5945
+lgdo/lh5/_serializers/read/vector_of_vectors.py,sha256=aCWTMbym7dF2yrhEfQs_GotcDqOKALRxgdJm4CA-bYs,7189
 lgdo/lh5/_serializers/write/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lgdo/lh5/_serializers/write/array.py,sha256=eyVPwwddVOR9TNNyliCNYlS-XYXsdTEA8QoTnnOYJbw,2943
 lgdo/lh5/_serializers/write/composite.py,sha256=I6lH0nWFIpAfZyG4-0rLxzg3mfazZ_FEhQVp1FZ0aA4,9254
@@ -39,17 +39,17 @@ lgdo/types/array.py,sha256=sUxh1CNCaefrnybt5qdjmmMpVQa_RqFxUv1tJ_pyBbc,6537
 lgdo/types/arrayofequalsizedarrays.py,sha256=DOGJiTmc1QCdm7vLbE6uIRXoMPtt8uuCfmwQawgWf5s,4949
 lgdo/types/encoded.py,sha256=JW4U5ow7KLMzhKnmhdnxbC3SZJAs4bOEDZWKG4KY1uU,15293
 lgdo/types/fixedsizearray.py,sha256=7RjUwTz1bW0pcrdy27JlfrXPAuOU89Kj7pOuSUCojK8,1527
-lgdo/types/histogram.py,sha256=-3PXydJK_1Os9mL3TahwbApwVNf9FMp7N234TfbjVt8,15508
+lgdo/types/histogram.py,sha256=XuE81aRXgIY-g-rFgr9Jo7KZ-0tsNpq1lRVRyA4uTSQ,19679
 lgdo/types/lgdo.py,sha256=UnJDi1emQYVgH_H29Vipfs4LelPopxG5pgZUu1eKOlw,2761
 lgdo/types/scalar.py,sha256=c5Es2vyDqyWTPV6mujzfIzMpC1jNWkEIcvYyWQUxH3Q,1933
 lgdo/types/struct.py,sha256=Q0OWLVd4B0ciLb8t6VsxU3MPbmGLZ7WfQNno1lSQS0Q,4918
-lgdo/types/table.py,sha256=w6ESACX6TNvEGIUQfNBtn2ofPNPM-Tl-6m6SITGVvtI,17942
-lgdo/types/vectorofvectors.py,sha256=Q53K8wiHwRHpGw3ARqrLnOXu3kLHptTYMp0ay9KK1vs,24386
+lgdo/types/table.py,sha256=lB_jj6C0C5w8jbo17Lp0P8_uY8jy7opkTJc1OrbCGEI,17956
+lgdo/types/vectorofvectors.py,sha256=fBLI8P0HDe12Ib95eFUJObLa--gxz6wfAmOs_mDsokg,24390
 lgdo/types/vovutils.py,sha256=7BWPP0BSj-92ifbCIUBcfqxG5-TS8uxujTyJJuDFI04,10302
 lgdo/types/waveformtable.py,sha256=f2tS4f1OEoYaTM5ldCX9zmw8iSISCT3t3wS1SrPdu_o,9901
-legend_pydataobj-1.10.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-legend_pydataobj-1.10.0.dist-info/METADATA,sha256=BMxnHVSQ-28DaoWafTCdPTuD-pbII3RoGs71LMNRpyo,44381
-legend_pydataobj-1.10.0.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
-legend_pydataobj-1.10.0.dist-info/entry_points.txt,sha256=Uu5MTlppBZxB4QGlLv-oX8FqACWjAZDNii__TBDJwLQ,72
-legend_pydataobj-1.10.0.dist-info/top_level.txt,sha256=KyR-EUloqiXcQ62IWnzBmtInDtvsHl4q2ZJAZgTcLXE,5
-legend_pydataobj-1.10.0.dist-info/RECORD,,
+legend_pydataobj-1.10.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+legend_pydataobj-1.10.1.dist-info/METADATA,sha256=gY35ifo00rptHQjVQ3BqkGeInupz20DGh1VIjeNEGlY,44381
+legend_pydataobj-1.10.1.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+legend_pydataobj-1.10.1.dist-info/entry_points.txt,sha256=Uu5MTlppBZxB4QGlLv-oX8FqACWjAZDNii__TBDJwLQ,72
+legend_pydataobj-1.10.1.dist-info/top_level.txt,sha256=KyR-EUloqiXcQ62IWnzBmtInDtvsHl4q2ZJAZgTcLXE,5
+legend_pydataobj-1.10.1.dist-info/RECORD,,
WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.2.0)
+Generator: setuptools (75.3.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
lgdo/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '1.10.0'
-__version_tuple__ = version_tuple = (1, 10, 0)
+__version__ = version = '1.10.1'
+__version_tuple__ = version_tuple = (1, 10, 1)
lgdo/compression/radware.py CHANGED
@@ -441,15 +441,11 @@ def _radware_sigcompress_encode(
         while (i < sig_in.size) and (i < j + 48):
             si_i = int16(sig_in[i] + shift)
             si_im1 = int16(sig_in[i - 1] + shift)
-            if max1 < si_i:
-                max1 = si_i
-            if min1 > si_i:
-                min1 = si_i
+            max1 = max(max1, si_i)
+            min1 = min(min1, si_i)
             ds = si_i - si_im1
-            if max2 < ds:
-                max2 = ds
-            if min2 > ds:
-                min2 = ds
+            max2 = max(max2, ds)
+            min2 = min(min2, ds)
             nw += 1
             i += 1
         if max1 - min1 <= max2 - min2:  # use absolute values
@@ -460,15 +456,13 @@ def _radware_sigcompress_encode(
                 i < j + 128
             ):  # FIXME: 128 could be tuned better?
                 si_i = int16(sig_in[i] + shift)
-                if max1 < si_i:
-                    max1 = si_i
+                max1 = max(max1, si_i)
                 dd1 = max1 - min1
                 if min1 > si_i:
                     dd1 = max1 - si_i
                 if dd1 > mask[nb1]:
                     break
-                if min1 > si_i:
-                    min1 = si_i
+                min1 = min(min1, si_i)
                 nw += 1
                 i += 1
         else:  # use difference values
@@ -481,15 +475,13 @@ def _radware_sigcompress_encode(
                 si_i = int16(sig_in[i] + shift)
                 si_im1 = int16(sig_in[i - 1] + shift)
                 ds = si_i - si_im1
-                if max2 < ds:
-                    max2 = ds
+                max2 = max(max2, ds)
                 dd2 = max2 - min2
                 if min2 > ds:
                     dd2 = max2 - ds
                 if dd2 > mask[nb2]:
                     break
-                if min2 > ds:
-                    min2 = ds
+                min2 = min(min2, ds)
                 nw += 1
                 i += 1
 
lgdo/lh5/_serializers/read/composite.py CHANGED
@@ -103,8 +103,7 @@ def _h5_read_lgdo(
     if idx is not None:
         # check if idx is just an ordered list of the integers; if so, it can be ignored
        if (idx == np.arange(0, len(idx), 1)).all():
-            if n_rows > len(idx):
-                n_rows = len(idx)
+            n_rows = min(n_rows, len(idx))
            idx = None
        else:
            # chop off indices < start_row
lgdo/lh5/_serializers/read/ndarray.py CHANGED
@@ -43,15 +43,14 @@ def _h5_read_ndarray(
     if idx is not None:
         if len(idx) > 0 and idx[-1] >= ds_n_rows:
             log.warning("idx indexed past the end of the array in the file. Culling...")
-            n_rows_to_read = bisect_left(idx[0], ds_n_rows)
-            idx = (idx[:n_rows_to_read],)
+            n_rows_to_read = bisect_left(idx, ds_n_rows)
+            idx = idx[:n_rows_to_read]
             if len(idx) == 0:
                 log.warning("idx empty after culling.")
         n_rows_to_read = len(idx)
     else:
         n_rows_to_read = ds_n_rows - start_row
-        if n_rows_to_read > n_rows:
-            n_rows_to_read = n_rows
+        n_rows_to_read = min(n_rows_to_read, n_rows)
 
     if idx is None:
         fspace.select_hyperslab(
@@ -112,6 +111,6 @@ def _h5_read_ndarray(
     # special handling for bools
     # (C and Julia store bools as uint8, so cast to bool)
     if datatype.get_nested_datatype_string(attrs["datatype"]) == "bool":
-        nda = nda.astype(np.bool_)
+        nda = nda.astype(np.bool_, copy=False)
 
     return (nda, attrs, n_rows_to_read)
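
The `copy=False` added here leans on standard NumPy semantics: `astype` returns the input array untouched when the dtype already matches, and only allocates when a real conversion is needed. A minimal standalone check of that behavior:

    import numpy as np

    raw = np.array([0, 1, 1, 0], dtype=np.uint8)  # bools as stored by C/Julia writers

    flags = raw.astype(np.bool_, copy=False)   # converts: uint8 -> bool
    same = flags.astype(np.bool_, copy=False)  # dtype already bool: no copy made
    assert same is flags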
lgdo/lh5/_serializers/read/scalar.py CHANGED
@@ -7,7 +7,7 @@ import numpy as np
 
 from ....types import Scalar
 from ...exceptions import LH5DecodeError
-from .utils import read_attrs
+from . import utils
 
 log = logging.getLogger(__name__)
 
@@ -22,7 +22,7 @@ def _h5_read_scalar(
     sp = h5py.h5s.create(h5py.h5s.SCALAR)
     h5d.read(sp, sp, value)
     value = value[()]
-    attrs = read_attrs(h5d, fname, oname)
+    attrs = utils.read_attrs(h5d, fname, oname)
 
     # special handling for bools
     # (C and Julia store bools as uint8, so cast to bool)
lgdo/lh5/_serializers/read/utils.py CHANGED
@@ -1,9 +1,16 @@
 from __future__ import annotations
 
+import logging
+
 import h5py
 import numpy as np
 
+from .... import types
+from ... import datatype
 from ...exceptions import LH5DecodeError
+from . import scalar
+
+log = logging.getLogger(__name__)
 
 
 def check_obj_buf_attrs(attrs, new_attrs, fname, oname):
@@ -23,7 +30,7 @@ def read_attrs(h5o, fname, oname):
         h5a = h5py.h5a.open(h5o, index=i_attr)
         name = h5a.get_name().decode()
         if h5a.shape != ():
-            msg = f"attribute {name} is not a string or scalar"
+            msg = f"attribute {oname} is not a string or scalar"
             raise LH5DecodeError(msg, fname, oname)
         val = np.empty((), h5a.dtype)
         h5a.read(val)
@@ -33,3 +40,135 @@ def read_attrs(h5o, fname, oname):
         attrs[name] = val.item()
         h5a.close()
     return attrs
+
+
+def read_n_rows(h5o, fname, oname):
+    """Read the number of rows in an LH5 object"""
+    if not h5py.h5a.exists(h5o, b"datatype"):
+        msg = "missing 'datatype' attribute"
+        raise LH5DecodeError(msg, fname, oname)
+
+    h5a = h5py.h5a.open(h5o, b"datatype")
+    type_attr = np.empty((), h5a.dtype)
+    h5a.read(type_attr)
+    type_attr = type_attr.item().decode()
+    lgdotype = datatype.datatype(type_attr)
+
+    # scalars are dim-0 datasets
+    if lgdotype is types.Scalar:
+        return None
+
+    # structs don't have rows
+    if lgdotype is types.Struct:
+        return None
+
+    # tables should have elements with all the same length
+    if lgdotype is types.Table:
+        # read out each of the fields
+        rows_read = None
+        for field in datatype.get_struct_fields(type_attr):
+            obj = h5py.h5o.open(h5o, field.encode())
+            n_rows_read = read_n_rows(obj, fname, field)
+            obj.close()
+            if not rows_read:
+                rows_read = n_rows_read
+            elif rows_read != n_rows_read:
+                log.warning(
+                    f"'{field}' field in table '{oname}' has {rows_read} rows, "
+                    f"{n_rows_read} was expected"
+                )
+
+        return rows_read
+
+    # length of a vector of vectors is the length of its cumulative_length
+    if lgdotype is types.VectorOfVectors:
+        obj = h5py.h5o.open(h5o, b"cumulative_length")
+        n_rows = read_n_rows(obj, fname, "cumulative_length")
+        obj.close()
+        return n_rows
+
+    # length of a vector of encoded vectors is the length of its decoded_size
+    if lgdotype in (types.VectorOfEncodedVectors, types.ArrayOfEncodedEqualSizedArrays):
+        obj = h5py.h5o.open(h5o, b"encoded_data")
+        n_rows = read_n_rows(obj, fname, "encoded_data")
+        obj.close()
+        return n_rows
+
+    # return array length (without reading the array!)
+    if issubclass(lgdotype, types.Array):
+        # compute the number of rows to read
+        return h5o.get_space().shape[0]
+
+    msg = f"don't know how to read rows of LGDO {lgdotype.__name__}"
+    raise LH5DecodeError(msg, fname, oname)
+
+
+def read_size_in_bytes(h5o, fname, oname, field_mask=None):
+    """Read the size of an LH5 object in memory (in B)"""
+    if not h5py.h5a.exists(h5o, b"datatype"):
+        msg = "missing 'datatype' attribute"
+        raise LH5DecodeError(msg, fname, oname)
+
+    h5a = h5py.h5a.open(h5o, b"datatype")
+    type_attr = np.empty((), h5a.dtype)
+    h5a.read(type_attr)
+    type_attr = type_attr.item().decode()
+    lgdotype = datatype.datatype(type_attr)
+
+    # scalars and plain arrays: size of the underlying dataset
+    if lgdotype in (
+        types.Scalar,
+        types.Array,
+        types.ArrayOfEqualSizedArrays,
+        types.FixedSizeArray,
+    ):
+        return int(np.prod(h5o.shape) * h5o.dtype.itemsize)
+
+    # structs and histograms: sum over all members
+    if lgdotype in (types.Struct, types.Histogram, types.Histogram.Axis):
+        size = 0
+        for key in h5o:
+            obj = h5py.h5o.open(h5o, key)
+            size += read_size_in_bytes(obj, fname, oname, field_mask)
+            obj.close()
+        return size
+
+    # tables: sum over the (optionally masked) fields
+    if lgdotype in (types.Table, types.WaveformTable):
+        # read out each of the fields
+        size = 0
+        if not field_mask:
+            field_mask = datatype.get_struct_fields(type_attr)
+        for field in field_mask:
+            obj = h5py.h5o.open(h5o, field.encode())
+            size += read_size_in_bytes(obj, fname, field)
+            obj.close()
+        return size
+
+    # vector of vectors: sum of cumulative_length and flattened_data
+    if lgdotype is types.VectorOfVectors:
+        size = 0
+        obj = h5py.h5o.open(h5o, b"cumulative_length")
+        size += read_size_in_bytes(obj, fname, "cumulative_length")
+        obj.close()
+        obj = h5py.h5o.open(h5o, b"flattened_data")
+        size += read_size_in_bytes(obj, fname, "flattened_data")
+        obj.close()
+        return size
+
+    # encoded arrays: decoded_size times the number of stored vectors
+    if lgdotype is types.ArrayOfEncodedEqualSizedArrays:
+        obj = h5py.h5o.open(h5o, b"decoded_size")
+        size = scalar._h5_read_scalar(obj, fname, "decoded_size")[0].value
+        obj.close()
+
+        obj = h5py.h5o.open(h5o, b"encoded_data")
+        cl = h5py.h5o.open(obj, b"cumulative_length")
+        size *= cl.shape[0]
+        size *= 4  # TODO: UPDATE WHEN CODECS SUPPORT MORE DTYPES
+        obj.close()
+
+        return size
+
+    msg = f"don't know how to read size of LGDO {lgdotype.__name__}"
+    raise LH5DecodeError(msg, fname, oname)
lgdo/lh5/_serializers/read/vector_of_vectors.py CHANGED
@@ -156,7 +156,7 @@ def _h5_read_vector_of_vectors(
     # grow fd_buf if necessary to hold the data
     fdb_size = fd_buf_start + fd_n_rows
     if len(fd_buf) < fdb_size:
-        fd_buf.resize(fdb_size)
+        fd_buf.nda.resize(fdb_size, refcheck=False)
 
     # now read
     h5o = h5py.h5o.open(h5g, b"flattened_data")
lgdo/lh5/core.py CHANGED
@@ -175,6 +175,9 @@ def read(
 
     if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
         idx = idx[0]
+    if isinstance(idx, np.ndarray) and idx.dtype == np.dtype("?"):
+        idx = np.where(idx)[0]
+
    obj, n_rows_read = _serializers._h5_read_lgdo(
        lh5_obj.id,
        lh5_obj.file.filename,
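
With the new branch, a boolean NumPy mask passed as `idx` is reduced to integer indices before the low-level read. A sketch of the equivalence (array values are illustrative):

    import numpy as np

    mask = np.array([True, False, True, True, False])
    assert mask.dtype == np.dtype("?")  # "?" is NumPy's boolean dtype code

    idx = np.where(mask)[0]  # what the reader uses internally
    assert (idx == np.array([0, 2, 3])).all()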
lgdo/lh5/iterator.py CHANGED
@@ -2,11 +2,14 @@ from __future__ import annotations
 
 import logging
 import typing
+from warnings import warn
 
 import numpy as np
 import pandas as pd
+from numpy.typing import NDArray
 
 from ..types import Array, Scalar, Struct, VectorOfVectors
+from ..units import default_units_registry as ureg
 from .store import LH5Store
 from .utils import expand_path
 
@@ -19,35 +22,53 @@ class LH5Iterator(typing.Iterator):
     at a time. This also accepts an entry list/mask to enable event selection,
     and a field mask.
 
-    This class can be used either for random access:
+    This can be used as an iterator:
 
-    >>> lh5_obj, n_rows = lh5_it.read(entry)
-
-    to read the block of entries starting at entry. In case of multiple files
-    or the use of an event selection, entry refers to a global event index
-    across files and does not count events that are excluded by the selection.
-
-    This can also be used as an iterator:
-
-    >>> for lh5_obj, entry, n_rows in LH5Iterator(...):
+    >>> for lh5_obj, i_entry, n_rows in LH5Iterator(...):
     >>>     # do the thing!
 
-    This is intended for if you are reading a large quantity of data but
-    want to limit your memory usage (particularly when reading in waveforms!).
+    This is intended for reading a large quantity of data. It
+    will ensure that you traverse files efficiently to minimize caching time
+    and will limit your memory usage (particularly when reading in waveforms!).
     The ``lh5_obj`` that is read by this class is reused in order to avoid
     reallocation of memory; this means that if you want to hold on to data
     between reads, you will have to copy it somewhere!
+
+    When defining an LH5Iterator, you must give it a list of files and the
+    HDF5 groups containing the data tables you are reading. You may also
+    provide a field mask, and an entry list or mask, specifying which entries
+    to read from the files. You may also pair it with a friend iterator, which
+    contains a parallel group of files that will be read simultaneously.
+    In addition to accessing requested data via ``lh5_obj``, several
+    properties exist to tell you where that data came from:
+
+    - lh5_it.current_local_entries: get the entry numbers relative to the
+      file the data came from
+    - lh5_it.current_global_entries: get the entry number relative to the
+      full dataset
+    - lh5_it.current_files: get the file name corresponding to each entry
+    - lh5_it.current_groups: get the group name corresponding to each entry
+
+    This class can also be used for random access:
+
+    >>> lh5_obj, n_rows = lh5_it.read(i_entry)
+
+    to read the block of entries starting at i_entry. In case of multiple files
+    or the use of an event selection, i_entry refers to a global event index
+    across files and does not count events that are excluded by the selection.
     """
 
     def __init__(
         self,
         lh5_files: str | list[str],
-        groups: str | list[str],
+        groups: str | list[str] | list[list[str]],
         base_path: str = "",
         entry_list: list[int] | list[list[int]] | None = None,
         entry_mask: list[bool] | list[list[bool]] | None = None,
         field_mask: dict[str, bool] | list[str] | tuple[str] | None = None,
-        buffer_len: int = 3200,
+        buffer_len: int | str = "100*MB",
+        file_cache: int = 10,
+        file_map: NDArray[int] | None = None,
         friend: typing.Iterator | None = None,
     ) -> None:
@@ -57,9 +78,10 @@ class LH5Iterator(typing.Iterator):
             file or files to read from. May include wildcards and environment
             variables.
         groups
-            HDF5 group(s) to read. If a list is provided for both lh5_files
-            and group, they must be the same size. If a file is wild-carded,
-            the same group will be assigned to each file found
+            HDF5 group(s) to read. If a list of strings is provided, use the
+            same groups for each file. If a list of lists is provided, the size
+            of the outer list must match the size of the file list, and each
+            inner list will apply to a single file (or set of wildcarded files)
         entry_list
             list of entry numbers to read. If a nested list is provided,
             expect one top-level list for each file, containing a list of
@@ -72,66 +94,98 @@
             more details.
         buffer_len
             number of entries to read at a time while iterating through files.
+        file_cache
+            maximum number of files to keep open at a time
+        file_map
+            cumulative file/group entries. This can be provided on construction
+            to speed up random or sparse access; otherwise, we sequentially
+            read the size of each group. WARNING: no checks for accuracy are
+            performed so only use this if you know what you are doing!
         friend
             a "friend" LH5Iterator that will be read in parallel with this.
             The friend should have the same length and entry list. A single
             LH5 table containing columns from both iterators will be returned.
+            Note that buffer_len will be set to the minimum of the two.
         """
-        self.lh5_st = LH5Store(base_path=base_path, keep_open=True)
+        self.lh5_st = LH5Store(base_path=base_path, keep_open=file_cache)
 
         # List of files, with wildcards and env vars expanded
         if isinstance(lh5_files, str):
             lh5_files = [lh5_files]
-            if isinstance(groups, list):
-                lh5_files *= len(groups)
-        elif not isinstance(lh5_files, list):
+        elif not isinstance(lh5_files, (list, set, tuple)):
             msg = "lh5_files must be a string or list of strings"
             raise ValueError(msg)
 
         if isinstance(groups, str):
-            groups = [groups] * len(lh5_files)
+            groups = [[groups]] * len(lh5_files)
         elif not isinstance(groups, list):
-            msg = "group must be a string or list of strings"
+            msg = "group must be a string or appropriate list"
+            raise ValueError(msg)
+        elif all(isinstance(g, str) for g in groups):
+            groups = [groups] * len(lh5_files)
+        elif len(groups) == len(lh5_files) and all(
+            isinstance(gr_list, (list, set, tuple)) for gr_list in groups
+        ):
+            pass
+        else:
+            msg = "group must be a string or appropriate list"
             raise ValueError(msg)
 
         if len(groups) != len(lh5_files):
             msg = "lh5_files and groups must have same length"
             raise ValueError(msg)
 
+        # make flattened outer-product-like list of files and groups
         self.lh5_files = []
         self.groups = []
         for f, g in zip(lh5_files, groups):
-            f_exp = expand_path(f, list=True, base_path=base_path)
-            self.lh5_files += f_exp
-            self.groups += [g] * len(f_exp)
+            for f_exp in expand_path(f, list=True, base_path=base_path):
+                self.lh5_files += [f_exp] * len(g)
+                self.groups += list(g)
 
         if entry_list is not None and entry_mask is not None:
             msg = "entry_list and entry_mask arguments are mutually exclusive"
             raise ValueError(msg)
 
         # Map to last row in each file
-        self.file_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
+        if file_map is None:
+            self.file_map = np.full(len(self.lh5_files), np.iinfo("q").max, "q")
+        else:
+            self.file_map = np.array(file_map)
+
         # Map to last iterator entry for each file
-        self.entry_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
+        self.entry_map = np.full(len(self.lh5_files), np.iinfo("q").max, "q")
         self.buffer_len = buffer_len
 
         if len(self.lh5_files) > 0:
             f = self.lh5_files[0]
             g = self.groups[0]
+            n_rows = self.lh5_st.read_n_rows(g, f)
+
+            if isinstance(self.buffer_len, str):
+                self.buffer_len = ureg.Quantity(buffer_len)
+            if isinstance(self.buffer_len, ureg.Quantity):
+                self.buffer_len = int(
+                    self.buffer_len
+                    / (self.lh5_st.read_size_in_bytes(g, f) * ureg.B)
+                    * n_rows
+                )
+
             self.lh5_buffer = self.lh5_st.get_buffer(
                 g,
                 f,
                 size=self.buffer_len,
                 field_mask=field_mask,
             )
-            self.file_map[0] = self.lh5_st.read_n_rows(g, f)
+            if file_map is None:
+                self.file_map[0] = n_rows
         else:
             msg = f"can't open any files from {lh5_files}"
             raise RuntimeError(msg)
 
         self.n_rows = 0
-        self.current_entry = 0
-        self.next_entry = 0
+        self.current_i_entry = 0
+        self.next_i_entry = 0
 
         self.field_mask = field_mask
@@ -142,13 +196,13 @@
             entry_list = list(entry_list)
             if isinstance(entry_list[0], int):
                 self.local_entry_list = [None] * len(self.file_map)
-                self.global_entry_list = np.array(entry_list, "i")
+                self.global_entry_list = np.array(entry_list, "q")
                 self.global_entry_list.sort()
 
             else:
                 self.local_entry_list = [[]] * len(self.file_map)
                 for i_file, local_list in enumerate(entry_list):
-                    self.local_entry_list[i_file] = np.array(local_list, "i")
+                    self.local_entry_list[i_file] = np.array(local_list, "q")
                     self.local_entry_list[i_file].sort()
 
         elif entry_mask is not None:
@@ -168,6 +222,15 @@
             if not isinstance(friend, typing.Iterator):
                 msg = "Friend must be an Iterator"
                 raise ValueError(msg)
+
+            # set buffer_lens to be equal
+            if self.buffer_len < friend.buffer_len:
+                friend.buffer_len = self.buffer_len
+                friend.lh5_buffer.resize(self.buffer_len)
+            elif self.buffer_len > friend.buffer_len:
+                self.buffer_len = friend.buffer_len
+                self.lh5_buffer.resize(friend.buffer_len)
+
             self.lh5_buffer.join(friend.lh5_buffer)
             self.friend = friend
@@ -176,33 +239,52 @@
         if i_file < 0:
             return 0
         fcl = self.file_map[i_file]
-        if fcl == np.iinfo("i").max:
-            fcl = self._get_file_cumlen(i_file - 1) + self.lh5_st.read_n_rows(
-                self.groups[i_file], self.lh5_files[i_file]
-            )
-            self.file_map[i_file] = fcl
+
+        # if we haven't already calculated, calculate for all files up to i_file
+        if fcl == np.iinfo("q").max:
+            i_start = np.searchsorted(self.file_map, np.iinfo("q").max)
+            fcl = self.file_map[i_start - 1] if i_start > 0 else 0
+
+            for i in range(i_start, i_file + 1):
+                fcl += self.lh5_st.read_n_rows(self.groups[i], self.lh5_files[i])
+                self.file_map[i] = fcl
         return fcl
 
+    @property
+    def current_entry(self) -> int:
+        "deprecated alias for current_i_entry"
+        warn(
+            "current_entry has been renamed to current_i_entry.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        return self.current_i_entry
+
     def _get_file_cumentries(self, i_file: int) -> int:
         """Helper to get cumulative iterator entries in file"""
         if i_file < 0:
             return 0
         n = self.entry_map[i_file]
-        if n == np.iinfo("i").max:
-            elist = self.get_file_entrylist(i_file)
-            fcl = self._get_file_cumlen(i_file)
-            if elist is None:
-                # no entry list provided
-                n = fcl
-            else:
-                file_entries = self.get_file_entrylist(i_file)
-                n = len(file_entries)
-                # check that file entries fall inside of file
-                if n > 0 and file_entries[-1] >= fcl:
-                    logging.warning(f"Found entries out of range for file {i_file}")
-                    n = np.searchsorted(file_entries, fcl, "right")
-            n += self._get_file_cumentries(i_file - 1)
-            self.entry_map[i_file] = n
+
+        # if we haven't already calculated, calculate for all files up to i_file
+        if n == np.iinfo("q").max:
+            i_start = np.searchsorted(self.entry_map, np.iinfo("q").max)
+            n = self.entry_map[i_start - 1] if i_start > 0 else 0
+
+            for i in range(i_start, i_file + 1):
+                elist = self.get_file_entrylist(i)
+                fcl = self._get_file_cumlen(i)
+                if elist is None:
+                    # no entry list provided
+                    n = fcl
+                else:
+                    n += len(elist)
+                    # check that file entries fall inside of file
+                    if len(elist) > 0 and elist[-1] >= fcl:
+                        logging.warning(f"Found entries out of range for file {i}")
+                        n += np.searchsorted(elist, fcl, "right") - len(elist)
+                self.entry_map[i] = n
         return n
 
     def get_file_entrylist(self, i_file: int) -> np.ndarray:
@@ -218,50 +300,50 @@
             f_end = self._get_file_cumlen(i_file)
             i_start = self._get_file_cumentries(i_file - 1)
             i_stop = np.searchsorted(self.global_entry_list, f_end, "right")
-            elist = np.array(self.global_entry_list[i_start:i_stop], "i") - f_start
+            elist = np.array(self.global_entry_list[i_start:i_stop], "q") - f_start
             self.local_entry_list[i_file] = elist
         return elist
 
     def get_global_entrylist(self) -> np.ndarray:
         """Get global entry list, constructing it if needed"""
         if self.global_entry_list is None and self.local_entry_list is not None:
-            self.global_entry_list = np.zeros(len(self), "i")
+            self.global_entry_list = np.zeros(len(self), "q")
             for i_file in range(len(self.lh5_files)):
-                i_start = self.get_file_cumentries(i_file - 1)
-                i_stop = self.get_file_cumentries(i_file)
-                f_start = self.get_file_cumlen(i_file - 1)
+                i_start = self._get_file_cumentries(i_file - 1)
+                i_stop = self._get_file_cumentries(i_file)
+                f_start = self._get_file_cumlen(i_file - 1)
                 self.global_entry_list[i_start:i_stop] = (
                     self.get_file_entrylist(i_file) + f_start
                 )
         return self.global_entry_list
 
-    def read(self, entry: int) -> tuple[LGDO, int]:
-        """Read the nextlocal chunk of events, starting at entry. Return the
+    def read(self, i_entry: int) -> tuple[LGDO, int]:
+        """Read the next local chunk of events, starting at i_entry. Return the
         LH5 buffer and number of rows read."""
         self.n_rows = 0
-        i_file = np.searchsorted(self.entry_map, entry, "right")
+        i_file = np.searchsorted(self.entry_map, i_entry, "right")
 
         # if file hasn't been opened yet, search through files
         # sequentially until we find the right one
-        if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("i").max:
-            while i_file < len(self.lh5_files) and entry >= self._get_file_cumentries(
+        if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("q").max:
+            while i_file < len(self.lh5_files) and i_entry >= self._get_file_cumentries(
                 i_file
             ):
                 i_file += 1
 
         if i_file == len(self.lh5_files):
             return (self.lh5_buffer, self.n_rows)
-        local_entry = entry - self._get_file_cumentries(i_file - 1)
+        local_i_entry = i_entry - self._get_file_cumentries(i_file - 1)
 
         while self.n_rows < self.buffer_len and i_file < len(self.file_map):
             # Loop through files
             local_idx = self.get_file_entrylist(i_file)
             if local_idx is not None and len(local_idx) == 0:
                 i_file += 1
-                local_entry = 0
+                local_i_entry = 0
                 continue
 
-            i_local = local_idx[local_entry] if local_idx is not None else local_entry
+            i_local = local_i_entry if local_idx is None else local_idx[local_i_entry]
             self.lh5_buffer, n_rows = self.lh5_st.read(
                 self.groups[i_file],
                 self.lh5_files[i_file],
@@ -275,12 +357,12 @@
 
             self.n_rows += n_rows
             i_file += 1
-            local_entry = 0
+            local_i_entry = 0
 
-        self.current_entry = entry
+        self.current_i_entry = i_entry
 
         if self.friend is not None:
-            self.friend.read(entry)
+            self.friend.read(i_entry)
 
         return (self.lh5_buffer, self.n_rows)
 
@@ -290,6 +372,108 @@
         if self.friend is not None:
             self.friend.reset_field_mask(mask)
 
+    @property
+    def current_local_entries(self) -> NDArray[int]:
+        """Return list of local file entries in buffer"""
+        cur_entries = np.zeros(self.n_rows, dtype="int32")
+        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
+        file_start = self._get_file_cumentries(i_file - 1)
+        i_local = self.current_i_entry - file_start
+        i = 0
+
+        while i < len(cur_entries):
+            # number of entries to read from this file
+            file_end = self._get_file_cumentries(i_file)
+            n = min(file_end - file_start - i_local, len(cur_entries) - i)
+            entries = self.get_file_entrylist(i_file)
+
+            if entries is None:
+                cur_entries[i : i + n] = np.arange(i_local, i_local + n)
+            else:
+                cur_entries[i : i + n] = entries[i_local : i_local + n]
+
+            i_file += 1
+            file_start = file_end
+            i_local = 0
+            i += n
+
+        return cur_entries
+
+    @property
+    def current_global_entries(self) -> NDArray[int]:
+        """Return list of global entries in buffer"""
+        cur_entries = np.zeros(self.n_rows, dtype="int32")
+        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
+        file_start = self._get_file_cumentries(i_file - 1)
+        i_local = self.current_i_entry - file_start
+        i = 0
+
+        while i < len(cur_entries):
+            # number of entries to read from this file
+            file_end = self._get_file_cumentries(i_file)
+            n = min(file_end - file_start - i_local, len(cur_entries) - i)
+            entries = self.get_file_entrylist(i_file)
+
+            if entries is None:
+                cur_entries[i : i + n] = self._get_file_cumlen(i_file - 1) + np.arange(
+                    i_local, i_local + n
+                )
+            else:
+                cur_entries[i : i + n] = (
+                    self._get_file_cumlen(i_file - 1) + entries[i_local : i_local + n]
+                )
+
+            i_file += 1
+            file_start = file_end
+            i_local = 0
+            i += n
+
+        return cur_entries
+
+    @property
+    def current_files(self) -> NDArray[str]:
+        """Return list of file names for entries in buffer"""
+        cur_files = np.zeros(self.n_rows, dtype=object)
+        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
+        file_start = self._get_file_cumentries(i_file - 1)
+        i_local = self.current_i_entry - file_start
+        i = 0
+
+        while i < len(cur_files):
+            # number of entries to read from this file
+            file_end = self._get_file_cumentries(i_file)
+            n = min(file_end - file_start - i_local, len(cur_files) - i)
+            cur_files[i : i + n] = self.lh5_files[i_file]
+
+            i_file += 1
+            file_start = file_end
+            i_local = 0
+            i += n
+
+        return cur_files
+
+    @property
+    def current_groups(self) -> NDArray[str]:
+        """Return list of group names for entries in buffer"""
+        cur_groups = np.zeros(self.n_rows, dtype=object)
+        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
+        file_start = self._get_file_cumentries(i_file - 1)
+        i_local = self.current_i_entry - file_start
+        i = 0
+
+        while i < len(cur_groups):
+            # number of entries to read from this file
+            file_end = self._get_file_cumentries(i_file)
+            n = min(file_end - file_start - i_local, len(cur_groups) - i)
+            cur_groups[i : i + n] = self.groups[i_file]
+
+            i_file += 1
+            file_start = file_end
+            i_local = 0
+            i += n
+
+        return cur_groups
+
     def __len__(self) -> int:
         """Return the total number of entries."""
         return (
@@ -300,15 +484,15 @@
 
     def __iter__(self) -> typing.Iterator:
         """Loop through entries in blocks of size buffer_len."""
-        self.current_entry = 0
-        self.next_entry = 0
+        self.current_i_entry = 0
+        self.next_i_entry = 0
         return self
 
     def __next__(self) -> tuple[LGDO, int, int]:
         """Read next buffer_len entries and return lh5_table, iterator entry
         and n_rows read."""
-        buf, n_rows = self.read(self.next_entry)
-        self.next_entry = self.current_entry + n_rows
+        buf, n_rows = self.read(self.next_i_entry)
+        self.next_i_entry = self.current_i_entry + n_rows
         if n_rows == 0:
             raise StopIteration
-        return (buf, self.current_entry, n_rows)
+        return (buf, self.current_i_entry, n_rows)
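
Continuing the hypothetical iterator `it` from the sketch above, the new `current_*` properties expose the provenance of each buffered row:

    for lh5_obj, i_entry, n_rows in it:
        files = it.current_files             # file name per entry
        groups = it.current_groups           # HDF5 group per entry
        local = it.current_local_entries     # row number within its file
        global_ = it.current_global_entries  # row number within the full dataset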
lgdo/lh5/store.py CHANGED
@@ -9,6 +9,7 @@ import bisect
 import logging
 import os
 import sys
+from collections import OrderedDict
 from collections.abc import Mapping, Sequence
 from inspect import signature
 from typing import Any
@@ -47,14 +48,15 @@
             directory path to prepend to LH5 files.
         keep_open
             whether to keep files open by storing the :mod:`h5py` objects as
-            class attributes.
+            class attributes. If ``keep_open`` is an ``int``, keep only the
+            ``n`` most recently opened files; if ``True``, there is no limit.
         locking
             whether to lock files when reading
         """
         self.base_path = "" if base_path == "" else utils.expand_path(base_path)
         self.keep_open = keep_open
         self.locking = locking
-        self.files = {}
+        self.files = OrderedDict()
 
     def gimme_file(
         self,
@@ -87,6 +89,7 @@ class LH5Store:
         file_kwargs["locking"] = self.locking
 
         if lh5_file in self.files:
+            self.files.move_to_end(lh5_file)
             return self.files[lh5_file]
 
         if self.base_path != "":
@@ -120,6 +123,8 @@
         h5f = h5py.File(full_path, mode, **file_kwargs)
 
         if self.keep_open:
+            if isinstance(self.keep_open, int) and len(self.files) >= self.keep_open:
+                self.files.popitem(last=False)
             self.files[lh5_file] = h5f
 
         return h5f
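
The cache logic above is a plain LRU on top of `collections.OrderedDict`: a hit moves the key to the end, and an insert past capacity evicts the oldest entry. A minimal standalone sketch of the same pattern (names are illustrative):

    from collections import OrderedDict

    cache = OrderedDict()
    CAPACITY = 3

    def get_or_open(key, open_fn):
        """Return a cached handle, evicting LRU-style when full."""
        if key in cache:
            cache.move_to_end(key)  # mark as most recently used
            return cache[key]
        handle = open_fn(key)
        if len(cache) >= CAPACITY:
            cache.popitem(last=False)  # drop the least recently used
        cache[key] = handle
        return handle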
@@ -228,6 +233,9 @@
 
         if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
             idx = idx[0]
+        if isinstance(idx, np.ndarray) and idx.dtype == np.dtype("?"):
+            idx = np.where(idx)[0]
+
         return _serializers._h5_read_lgdo(
             lh5_obj.id,
             lh5_obj.file.filename,
@@ -307,3 +315,9 @@
         Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`.
         """
         return utils.read_n_rows(name, self.gimme_file(lh5_file, "r"))
+
+    def read_size_in_bytes(self, name: str, lh5_file: str | h5py.File) -> int:
+        """Look up the size (in B) of the object in memory. Will recursively
+        crawl through all objects in a Struct or Table.
+        """
+        return utils.read_size_in_bytes(name, self.gimme_file(lh5_file, "r"))
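
A hypothetical use of the new method, e.g. to derive a row budget from a memory budget the way LH5Iterator now does internally (object and file names are placeholders):

    from lgdo.lh5 import LH5Store

    store = LH5Store(keep_open=10)

    nbytes = store.read_size_in_bytes("geds/raw", "data.lh5")  # full in-memory size
    n_rows = store.read_n_rows("geds/raw", "data.lh5")

    rows_per_100mb = int(100e6 / nbytes * n_rows)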
lgdo/lh5/utils.py CHANGED
@@ -12,7 +12,7 @@ from typing import Any
 
 import h5py
 
 from .. import types
-from . import _serializers, datatype
+from . import _serializers
 from .exceptions import LH5DecodeError
 
 log = logging.getLogger(__name__)
@@ -44,57 +44,31 @@ def read_n_rows(name: str, h5f: str | h5py.File) -> int | None:
     Return ``None`` if `name` is a :class:`.Scalar` or a :class:`.Struct`.
     """
     if not isinstance(h5f, h5py.File):
-        h5f = h5py.File(h5f, "r")
+        h5f = h5py.File(h5f, "r", locking=False)
 
     try:
-        attrs = h5f[name].attrs
+        h5o = h5f[name].id
     except KeyError as e:
         msg = "not found"
         raise LH5DecodeError(msg, h5f, name) from e
-    except AttributeError as e:
-        msg = "missing 'datatype' attribute"
-        raise LH5DecodeError(msg, h5f, name) from e
 
-    lgdotype = datatype.datatype(attrs["datatype"])
-
-    # scalars are dim-0 datasets
-    if lgdotype is types.Scalar:
-        return None
-
-    # structs don't have rows
-    if lgdotype is types.Struct:
-        return None
-
-    # tables should have elements with all the same length
-    if lgdotype is types.Table:
-        # read out each of the fields
-        rows_read = None
-        for field in datatype.get_struct_fields(attrs["datatype"]):
-            n_rows_read = read_n_rows(name + "/" + field, h5f)
-            if not rows_read:
-                rows_read = n_rows_read
-            elif rows_read != n_rows_read:
-                log.warning(
-                    f"'{field}' field in table '{name}' has {rows_read} rows, "
-                    f"{n_rows_read} was expected"
-                )
-        return rows_read
+    return _serializers.read.utils.read_n_rows(h5o, h5f.name, name)
 
-    # length of vector of vectors is the length of its cumulative_length
-    if lgdotype is types.VectorOfVectors:
-        return read_n_rows(f"{name}/cumulative_length", h5f)
 
-    # length of vector of encoded vectors is the length of its decoded_size
-    if lgdotype in (types.VectorOfEncodedVectors, types.ArrayOfEncodedEqualSizedArrays):
-        return read_n_rows(f"{name}/encoded_data", h5f)
+def read_size_in_bytes(name: str, h5f: str | h5py.File) -> int | None:
+    """Look up the size (in B) of an LGDO object in memory. Will crawl
+    recursively through members of a Struct or Table.
+    """
+    if not isinstance(h5f, h5py.File):
+        h5f = h5py.File(h5f, "r", locking=False)
 
-    # return array length (without reading the array!)
-    if issubclass(lgdotype, types.Array):
-        # compute the number of rows to read
-        return h5f[name].shape[0]
+    try:
+        h5o = h5f[name].id
+    except KeyError as e:
+        msg = "not found"
+        raise LH5DecodeError(msg, h5f, name) from e
 
-    msg = f"don't know how to read rows of LGDO {lgdotype.__name__}"
-    raise LH5DecodeError(msg, h5f, name)
+    return _serializers.read.utils.read_size_in_bytes(h5o, h5f.name, name)
 
 
 def get_h5_group(
lgdo/types/histogram.py CHANGED
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
 import logging
-from collections.abc import Iterable
+from collections.abc import Iterable, Mapping, Sequence
 from typing import Any
 
 import hist
 import numpy as np
+import pandas as pd
 from numpy.typing import NDArray
 
 from .array import Array
@@ -269,10 +270,10 @@
                 b.append(Histogram.Axis.from_edges(ax.edges, binedge_attrs))
         else:
             if binning is None:
-                msg = "need to also pass binning if passing histogram as array"
+                msg = "need to pass binning to construct Histogram"
                 raise ValueError(msg)
-            w = weights if isinstance(weights, Array) else Array(weights)
 
+            # set up binning
             if all(isinstance(ax, Histogram.Axis) for ax in binning):
                 if binedge_attrs is not None:
                     msg = "passed both binedges as Axis instances and binedge_attrs"
@@ -286,6 +287,14 @@
                 msg = "invalid binning object passed"
                 raise ValueError(msg)
 
+            # set up bin weights
+            if isinstance(weights, Array):
+                w = weights
+            elif weights is None:
+                w = Array(shape=[ax.nbins for ax in b], fill_val=0, dtype=np.float32)
+            else:
+                w = Array(weights)
+
             if len(binning) != len(w.nda.shape):
                 msg = "binning and weight dimensions do not match"
                 raise ValueError(msg)
@@ -315,6 +324,98 @@
         assert all(isinstance(v, Histogram.Axis) for k, v in bins)
         return tuple(v for _, v in bins)
 
+    def fill(self, data, w: NDArray = None, keys: Sequence[str] = None) -> None:
+        """Fill histogram by incrementing bins with data points weighted by w
+
+        Parameters
+        ----------
+        data
+            an ndarray with inner dimension equal to the number of axes, or a
+            list of equal-length 1d-arrays containing data for each axis, or a
+            Mapping to 1d-arrays containing data for each axis (requires keys),
+            or a pandas DataFrame (optionally takes a list of keys)
+        w
+            weight to use for incrementing data points. If None, use 1 for all
+        keys
+            list of keys to use if data is a pandas ``DataFrame`` or ``Mapping``
+        """
+        if keys is not None:
+            if isinstance(keys, str):
+                keys = [keys]
+            elif not isinstance(keys, list):
+                keys = list(keys)
+
+        if (
+            isinstance(data, np.ndarray)
+            and len(data.shape) == 1
+            and len(self.binning) == 1
+        ):
+            N = len(data)
+            data = [data]
+        elif (
+            isinstance(data, np.ndarray)
+            and len(data.shape) == 2
+            and data.shape[1] == len(self.binning)
+        ):
+            N = data.shape[0]
+            data = data.T
+        elif isinstance(data, pd.DataFrame) and (
+            (keys is not None and len(keys) == len(self.binning))
+            or data.ndim == len(self.binning)
+        ):
+            if keys is not None:
+                data = data[keys]
+            N = len(data)
+            data = data.values.T
+        elif isinstance(data, Sequence) and len(data) == len(self.binning):
+            data = [d if isinstance(d, np.ndarray) else np.array(d) for d in data]
+            N = len(data[0])
+            if not all(len(d) == N for d in data):
+                msg = "length of all data arrays must be equal"
+                raise ValueError(msg)
+        elif isinstance(data, Mapping):
+            if not isinstance(keys, Sequence) or len(keys) != len(self.binning):
+                msg = "filling hist with Mapping data requires a list of keys with same length as histogram rank"
+                raise ValueError(msg)
+            data = [
+                data[k] if isinstance(data[k], np.ndarray) else np.array(data[k])
+                for k in keys
+            ]
+            N = len(data[0])
+            if not all(len(d) == N for d in data):
+                msg = "length of all data arrays must be equal"
+                raise ValueError(msg)
+        else:
+            msg = "data must be 2D numpy array or list of 1D arrays with length equal to number of axes"
+            raise ValueError(msg)
+
+        idx = np.zeros(N, np.float64)  # bin indices for flattened array
+        oor_mask = np.ones(N, np.bool_)  # mask to remove out-of-range values
+        stride = [s // self.weights.dtype.itemsize for s in self.weights.nda.strides]
+        for col, ax, s in zip(data, self.binning, stride):
+            if ax.is_range:
+                idx += s * np.floor((col - ax.first) / ax.step - int(not ax.closedleft))
+                if ax.closedleft:
+                    oor_mask &= (ax.first <= col) & (col < ax.last)
+                else:
+                    oor_mask &= (ax.first < col) & (col <= ax.last)
+            else:
+                idx += s * (
+                    np.searchsorted(
+                        ax.edges, col, side=("right" if ax.closedleft else "left")
+                    )
+                    - 1
+                )
+                if ax.closedleft:
+                    oor_mask &= (ax.edges[0] <= col) & (col < ax.edges[-1])
+                else:
+                    oor_mask &= (ax.edges[0] < col) & (col <= ax.edges[-1])
+
+        # increment bin contents
+        idx = idx[oor_mask].astype(np.int64)
+        w = w[oor_mask] if w is not None else 1
+        np.add.at(self.weights.nda.reshape(-1), idx, w)
+
     def __setitem__(self, name: str, obj: LGDO) -> None:
         # do not allow for new attributes on this
         msg = "histogram fields cannot be mutated"
lgdo/types/table.py CHANGED
@@ -450,7 +450,7 @@ class Table(Struct):
             cols = self.keys()
 
         if library == "pd":
-            df = pd.DataFrame()
+            df = {}
 
             for col in cols:
                 data = self[col]
@@ -470,7 +470,7 @@
                 )
                 df[f"{prefix}{col}"] = data.view_as("pd", with_units=with_units)
 
-            return df
+            return pd.DataFrame(df, copy=False)
 
         if library == "np":
             msg = f"Format {library!r} is not supported for Tables."
lgdo/types/vectorofvectors.py CHANGED
@@ -632,7 +632,7 @@ class VectorOfVectors(LGDO):
         offsets = np.empty(
             len(self.cumulative_length) + 1, dtype=self.cumulative_length.dtype
         )
-        offsets[1:] = self.cumulative_length
+        offsets[1:] = self.cumulative_length.nda
         offsets[0] = 0
 
         content = (