legend-pydataobj 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: legend_pydataobj
-Version: 1.10.0
+Version: 1.10.1
 Summary: LEGEND Python Data Objects
 Author: The LEGEND Collaboration
 Maintainer: The LEGEND Collaboration
RECORD CHANGED
@@ -1,5 +1,5 @@
 lgdo/__init__.py,sha256=1YUuAFQHNrOOkr3ZfrtEJOpYqgzbHRYA81ssbQZitQE,3196
-lgdo/_version.py,sha256=PEdW0PLUrZm2JiH_V3EAqPOK-ZxEDfT2nPsBGV10Pow,413
+lgdo/_version.py,sha256=8G9z72uuzZV_GnX2AJUyUhUAHl8bmD6KQXou4HB100U,413
 lgdo/cli.py,sha256=vB1Oj6kZ5gWaY9HBPBRRRyiepp72hm3bFvQeUUWeMYg,8214
 lgdo/lgdo_utils.py,sha256=6a2YWEwpyEMXlAyTHZMO01aqxy6SxJzPZkGNWKNWuS0,2567
 lgdo/lh5_store.py,sha256=5BzbJA9sLcqjp8bJDc2olwOiw0VS6rmfg3cfh1kQkRY,8512
@@ -9,26 +9,26 @@ lgdo/utils.py,sha256=9t_GYdB8aQhZ4Vz6ujmASzwCgTuP7ZdINtPTVPyIR6E,3661
 lgdo/compression/__init__.py,sha256=gqbdx4NnpCcW-C7kUXV-hVUZFiNlbCwIbs3uzFe4AFE,1127
 lgdo/compression/base.py,sha256=82cQJujfvoAOKBFx761dEcx_xM02TBCBBuBo6i78tuI,838
 lgdo/compression/generic.py,sha256=tF3UhLJbUDcovLxpIzgQRxFSjZ5Fz3uDRy9kI4mFntQ,2515
-lgdo/compression/radware.py,sha256=VbKAvi18h48Fz-ZxMEg64yD1ezaw1NkMZazxurdyMmc,24015
+lgdo/compression/radware.py,sha256=GcNTtjuyL7VBBqziUBmSqNXuhqy1bJJgvcyvyumPtrc,23839
 lgdo/compression/utils.py,sha256=W2RkBrxPpXlat84dnU9Ad7d_tTws0irtGl7O1dNWjnk,1140
 lgdo/compression/varlen.py,sha256=6ZZUItyoOfygDdE0DyoISeFZfqdbH6xl7T0eclfarzg,15127
 lgdo/lh5/__init__.py,sha256=KzWF6HI-6N1NqQUm8LAxMmDbg0rgRY4DAaJ2s7w2tLM,811
-lgdo/lh5/core.py,sha256=k6noKZIW3Aq1JPxV4ogLUgmsFrajMWwrQHc-7OYnVoQ,13769
+lgdo/lh5/core.py,sha256=YVtkTaU3SISDoLqR9UE_BDzsPApEW6_h_ac2NwSZ9zg,13868
 lgdo/lh5/datatype.py,sha256=O_7BqOlX8PFMyG0ppkfUT5aps5HEqX0bpuKcJO3jhu0,1691
 lgdo/lh5/exceptions.py,sha256=43fQ8MnAsylY4aG6GF6hsRclagYaMkUv8957c1uTjWE,962
-lgdo/lh5/iterator.py,sha256=eqH9a_ZjEhgqJUZbMj36jXK_1Xbx86450DVw7LHNB3Y,12369
-lgdo/lh5/store.py,sha256=vrvIbucCdKkAX3Ceo-fCuRJp4X7sofHq1gGKbFdeXyE,9895
+lgdo/lh5/iterator.py,sha256=ZaBBnmuNIjinwO0JUY55wLxX8Om9rVRRzXBC5uHmSKM,19772
+lgdo/lh5/store.py,sha256=3wAaQDd1Zmo0_bQ9DbB-FbKS4Uy_Tb642qKHXtZpSw4,10643
 lgdo/lh5/tools.py,sha256=T9CgHA8A3_tVBMtiNJ6hATQKhdqI61m3cX4p2wGKc6c,9937
-lgdo/lh5/utils.py,sha256=PG_iwLb-AHZgc2jYTdR6WZW_dD8kI_YnSOZsZ3SYFrY,7305
+lgdo/lh5/utils.py,sha256=ioz8DlyXZsejwnU2qYdIccdHcF12H62jgLkZsiDOLSM,6243
 lgdo/lh5/_serializers/__init__.py,sha256=NSH8uOVY3r_Wn3t0nQHhEHhkHT7-GJYlxuS3YTDJa5Y,1263
 lgdo/lh5/_serializers/read/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lgdo/lh5/_serializers/read/array.py,sha256=uWfMCihfAmW2DE2ewip2qCK_kvQC_mb2zvOv26uzijc,1000
-lgdo/lh5/_serializers/read/composite.py,sha256=XepzeY7oh_M3ejBBuCxU6LcJwQAKOvZHvZqDgOYXlIA,12409
+lgdo/lh5/_serializers/read/composite.py,sha256=vQGh6nUQdSuHl0NTF2FeU7QC9UAA-E7XvNvrRJi4uw8,12384
 lgdo/lh5/_serializers/read/encoded.py,sha256=Q98c08d8LkZq2AlY4rThYECVaEqwbv4T2Urn7TGnsyE,4130
-lgdo/lh5/_serializers/read/ndarray.py,sha256=m0uAwuVL00dt0I1weI9nuEYW25wJx6ZJDPTbZHMrqDo,3699
-lgdo/lh5/_serializers/read/scalar.py,sha256=ghw6VsZLGoZ9mmcY7G-NaEioAbocM9JHOqk9ipPE6U0,926
-lgdo/lh5/_serializers/read/utils.py,sha256=bIhz2RSxwYtvDKgqE7yBtF9hcqbMS0e-M8uM8bdvChA,1184
-lgdo/lh5/_serializers/read/vector_of_vectors.py,sha256=98P_XoXE8QWLQeSyBm9QHBF_5WGHKrfpNppLhj3QrbE,7169
+lgdo/lh5/_serializers/read/ndarray.py,sha256=lFCXD6bSzmMOH7cVmvRYXakkfMCI8EoqTPNONRJ1F0s,3690
+lgdo/lh5/_serializers/read/scalar.py,sha256=kwhWm1T91pXf86CqtUUD8_qheSR92gXZrQVtssV5YCg,922
+lgdo/lh5/_serializers/read/utils.py,sha256=0kYUFKiaQ3JUbjhP7tuKas_s80Kou6DhPlVXc40NHRE,5945
+lgdo/lh5/_serializers/read/vector_of_vectors.py,sha256=aCWTMbym7dF2yrhEfQs_GotcDqOKALRxgdJm4CA-bYs,7189
 lgdo/lh5/_serializers/write/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lgdo/lh5/_serializers/write/array.py,sha256=eyVPwwddVOR9TNNyliCNYlS-XYXsdTEA8QoTnnOYJbw,2943
 lgdo/lh5/_serializers/write/composite.py,sha256=I6lH0nWFIpAfZyG4-0rLxzg3mfazZ_FEhQVp1FZ0aA4,9254
@@ -39,17 +39,17 @@ lgdo/types/array.py,sha256=sUxh1CNCaefrnybt5qdjmmMpVQa_RqFxUv1tJ_pyBbc,6537
 lgdo/types/arrayofequalsizedarrays.py,sha256=DOGJiTmc1QCdm7vLbE6uIRXoMPtt8uuCfmwQawgWf5s,4949
 lgdo/types/encoded.py,sha256=JW4U5ow7KLMzhKnmhdnxbC3SZJAs4bOEDZWKG4KY1uU,15293
 lgdo/types/fixedsizearray.py,sha256=7RjUwTz1bW0pcrdy27JlfrXPAuOU89Kj7pOuSUCojK8,1527
-lgdo/types/histogram.py,sha256=-3PXydJK_1Os9mL3TahwbApwVNf9FMp7N234TfbjVt8,15508
+lgdo/types/histogram.py,sha256=XuE81aRXgIY-g-rFgr9Jo7KZ-0tsNpq1lRVRyA4uTSQ,19679
 lgdo/types/lgdo.py,sha256=UnJDi1emQYVgH_H29Vipfs4LelPopxG5pgZUu1eKOlw,2761
 lgdo/types/scalar.py,sha256=c5Es2vyDqyWTPV6mujzfIzMpC1jNWkEIcvYyWQUxH3Q,1933
 lgdo/types/struct.py,sha256=Q0OWLVd4B0ciLb8t6VsxU3MPbmGLZ7WfQNno1lSQS0Q,4918
-lgdo/types/table.py,sha256=w6ESACX6TNvEGIUQfNBtn2ofPNPM-Tl-6m6SITGVvtI,17942
-lgdo/types/vectorofvectors.py,sha256=Q53K8wiHwRHpGw3ARqrLnOXu3kLHptTYMp0ay9KK1vs,24386
+lgdo/types/table.py,sha256=lB_jj6C0C5w8jbo17Lp0P8_uY8jy7opkTJc1OrbCGEI,17956
+lgdo/types/vectorofvectors.py,sha256=fBLI8P0HDe12Ib95eFUJObLa--gxz6wfAmOs_mDsokg,24390
 lgdo/types/vovutils.py,sha256=7BWPP0BSj-92ifbCIUBcfqxG5-TS8uxujTyJJuDFI04,10302
 lgdo/types/waveformtable.py,sha256=f2tS4f1OEoYaTM5ldCX9zmw8iSISCT3t3wS1SrPdu_o,9901
-legend_pydataobj-1.10.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-legend_pydataobj-1.10.0.dist-info/METADATA,sha256=BMxnHVSQ-28DaoWafTCdPTuD-pbII3RoGs71LMNRpyo,44381
-legend_pydataobj-1.10.0.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
-legend_pydataobj-1.10.0.dist-info/entry_points.txt,sha256=Uu5MTlppBZxB4QGlLv-oX8FqACWjAZDNii__TBDJwLQ,72
-legend_pydataobj-1.10.0.dist-info/top_level.txt,sha256=KyR-EUloqiXcQ62IWnzBmtInDtvsHl4q2ZJAZgTcLXE,5
-legend_pydataobj-1.10.0.dist-info/RECORD,,
+legend_pydataobj-1.10.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+legend_pydataobj-1.10.1.dist-info/METADATA,sha256=gY35ifo00rptHQjVQ3BqkGeInupz20DGh1VIjeNEGlY,44381
+legend_pydataobj-1.10.1.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+legend_pydataobj-1.10.1.dist-info/entry_points.txt,sha256=Uu5MTlppBZxB4QGlLv-oX8FqACWjAZDNii__TBDJwLQ,72
+legend_pydataobj-1.10.1.dist-info/top_level.txt,sha256=KyR-EUloqiXcQ62IWnzBmtInDtvsHl4q2ZJAZgTcLXE,5
+legend_pydataobj-1.10.1.dist-info/RECORD,,
WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.2.0)
+Generator: setuptools (75.3.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
lgdo/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '1.10.0'
-__version_tuple__ = version_tuple = (1, 10, 0)
+__version__ = version = '1.10.1'
+__version_tuple__ = version_tuple = (1, 10, 1)
lgdo/compression/radware.py CHANGED
@@ -441,15 +441,11 @@ def _radware_sigcompress_encode(
         while (i < sig_in.size) and (i < j + 48):
             si_i = int16(sig_in[i] + shift)
             si_im1 = int16(sig_in[i - 1] + shift)
-            if max1 < si_i:
-                max1 = si_i
-            if min1 > si_i:
-                min1 = si_i
+            max1 = max(max1, si_i)
+            min1 = min(min1, si_i)
             ds = si_i - si_im1
-            if max2 < ds:
-                max2 = ds
-            if min2 > ds:
-                min2 = ds
+            max2 = max(max2, ds)
+            min2 = min(min2, ds)
             nw += 1
             i += 1
         if max1 - min1 <= max2 - min2:  # use absolute values
@@ -460,15 +456,13 @@ def _radware_sigcompress_encode(
                 i < j + 128
             ):  # FIXME: 128 could be tuned better?
                 si_i = int16(sig_in[i] + shift)
-                if max1 < si_i:
-                    max1 = si_i
+                max1 = max(max1, si_i)
                 dd1 = max1 - min1
                 if min1 > si_i:
                     dd1 = max1 - si_i
                 if dd1 > mask[nb1]:
                     break
-                if min1 > si_i:
-                    min1 = si_i
+                min1 = min(min1, si_i)
                 nw += 1
                 i += 1
         else:  # use difference values
@@ -481,15 +475,13 @@ def _radware_sigcompress_encode(
                 si_i = int16(sig_in[i] + shift)
                 si_im1 = int16(sig_in[i - 1] + shift)
                 ds = si_i - si_im1
-                if max2 < ds:
-                    max2 = ds
+                max2 = max(max2, ds)
                 dd2 = max2 - min2
                 if min2 > ds:
                     dd2 = max2 - ds
                 if dd2 > mask[nb2]:
                     break
-                if min2 > ds:
-                    min2 = ds
+                min2 = min(min2, ds)
                 nw += 1
                 i += 1
 
lgdo/lh5/_serializers/read/composite.py CHANGED
@@ -103,8 +103,7 @@ def _h5_read_lgdo(
     if idx is not None:
         # check if idx is just an ordered list of the integers; if so, it can be ignored
        if (idx == np.arange(0, len(idx), 1)).all():
-            if n_rows > len(idx):
-                n_rows = len(idx)
+            n_rows = min(n_rows, len(idx))
            idx = None
        else:
            # chop off indices < start_row
lgdo/lh5/_serializers/read/ndarray.py CHANGED
@@ -43,15 +43,14 @@ def _h5_read_ndarray(
     if idx is not None:
         if len(idx) > 0 and idx[-1] >= ds_n_rows:
             log.warning("idx indexed past the end of the array in the file. Culling...")
-            n_rows_to_read = bisect_left(idx[0], ds_n_rows)
-            idx = (idx[:n_rows_to_read],)
+            n_rows_to_read = bisect_left(idx, ds_n_rows)
+            idx = idx[:n_rows_to_read]
             if len(idx) == 0:
                 log.warning("idx empty after culling.")
         n_rows_to_read = len(idx)
     else:
         n_rows_to_read = ds_n_rows - start_row
-        if n_rows_to_read > n_rows:
-            n_rows_to_read = n_rows
+        n_rows_to_read = min(n_rows_to_read, n_rows)
 
     if idx is None:
         fspace.select_hyperslab(
@@ -112,6 +111,6 @@ def _h5_read_ndarray(
     # special handling for bools
     # (C and Julia store bools as uint8, so cast to bool)
     if datatype.get_nested_datatype_string(attrs["datatype"]) == "bool":
-        nda = nda.astype(np.bool_)
+        nda = nda.astype(np.bool_, copy=False)
 
     return (nda, attrs, n_rows_to_read)
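
The `copy=False` added here leans on standard NumPy semantics: `astype` returns the input array untouched when the dtype already matches, and only allocates when a real conversion is needed. A minimal standalone check of that behavior:

    import numpy as np

    raw = np.array([0, 1, 1, 0], dtype=np.uint8)  # bools as stored by C/Julia writers

    flags = raw.astype(np.bool_, copy=False)   # converts: uint8 -> bool
    same = flags.astype(np.bool_, copy=False)  # dtype already bool: no copy made
    assert same is flags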
lgdo/lh5/_serializers/read/scalar.py CHANGED
@@ -7,7 +7,7 @@ import numpy as np
 
 from ....types import Scalar
 from ...exceptions import LH5DecodeError
-from .utils import read_attrs
+from . import utils
 
 log = logging.getLogger(__name__)
 
@@ -22,7 +22,7 @@ def _h5_read_scalar(
     sp = h5py.h5s.create(h5py.h5s.SCALAR)
     h5d.read(sp, sp, value)
     value = value[()]
-    attrs = read_attrs(h5d, fname, oname)
+    attrs = utils.read_attrs(h5d, fname, oname)
 
     # special handling for bools
     # (C and Julia store bools as uint8, so cast to bool)
lgdo/lh5/_serializers/read/utils.py CHANGED
@@ -1,9 +1,16 @@
 from __future__ import annotations
 
+import logging
+
 import h5py
 import numpy as np
 
+from .... import types
+from ... import datatype
 from ...exceptions import LH5DecodeError
+from . import scalar
+
+log = logging.getLogger(__name__)
 
 
 def check_obj_buf_attrs(attrs, new_attrs, fname, oname):
@@ -23,7 +30,7 @@ def read_attrs(h5o, fname, oname):
         h5a = h5py.h5a.open(h5o, index=i_attr)
         name = h5a.get_name().decode()
         if h5a.shape != ():
-            msg = f"attribute {name} is not a string or scalar"
+            msg = f"attribute {oname} is not a string or scalar"
             raise LH5DecodeError(msg, fname, oname)
         val = np.empty((), h5a.dtype)
         h5a.read(val)
@@ -33,3 +40,135 @@ def read_attrs(h5o, fname, oname):
         attrs[name] = val.item()
         h5a.close()
     return attrs
+
+
+def read_n_rows(h5o, fname, oname):
+    """Read the number of rows in an LH5 object"""
+    if not h5py.h5a.exists(h5o, b"datatype"):
+        msg = "missing 'datatype' attribute"
+        raise LH5DecodeError(msg, fname, oname)
+
+    h5a = h5py.h5a.open(h5o, b"datatype")
+    type_attr = np.empty((), h5a.dtype)
+    h5a.read(type_attr)
+    type_attr = type_attr.item().decode()
+    lgdotype = datatype.datatype(type_attr)
+
+    # scalars are dim-0 datasets
+    if lgdotype is types.Scalar:
+        return None
+
+    # structs don't have rows
+    if lgdotype is types.Struct:
+        return None
+
+    # tables should have elements with all the same length
+    if lgdotype is types.Table:
+        # read out each of the fields
+        rows_read = None
+        for field in datatype.get_struct_fields(type_attr):
+            obj = h5py.h5o.open(h5o, field.encode())
+            n_rows_read = read_n_rows(obj, fname, field)
+            obj.close()
+            if not rows_read:
+                rows_read = n_rows_read
+            elif rows_read != n_rows_read:
+                log.warning(
+                    f"'{field}' field in table '{oname}' has {rows_read} rows, "
+                    f"{n_rows_read} was expected"
+                )
+
+        return rows_read
+
+    # length of a vector of vectors is the length of its cumulative_length
+    if lgdotype is types.VectorOfVectors:
+        obj = h5py.h5o.open(h5o, b"cumulative_length")
+        n_rows = read_n_rows(obj, fname, "cumulative_length")
+        obj.close()
+        return n_rows
+
+    # length of a vector of encoded vectors is the length of its decoded_size
+    if lgdotype in (types.VectorOfEncodedVectors, types.ArrayOfEncodedEqualSizedArrays):
+        obj = h5py.h5o.open(h5o, b"encoded_data")
+        n_rows = read_n_rows(obj, fname, "encoded_data")
+        obj.close()
+        return n_rows
+
+    # return array length (without reading the array!)
+    if issubclass(lgdotype, types.Array):
+        # compute the number of rows to read
+        return h5o.get_space().shape[0]
+
+    msg = f"don't know how to read rows of LGDO {lgdotype.__name__}"
+    raise LH5DecodeError(msg, fname, oname)
+
+
+def read_size_in_bytes(h5o, fname, oname, field_mask=None):
+    """Read the size of an LH5 object in memory (in B)"""
+    if not h5py.h5a.exists(h5o, b"datatype"):
+        msg = "missing 'datatype' attribute"
+        raise LH5DecodeError(msg, fname, oname)
+
+    h5a = h5py.h5a.open(h5o, b"datatype")
+    type_attr = np.empty((), h5a.dtype)
+    h5a.read(type_attr)
+    type_attr = type_attr.item().decode()
+    lgdotype = datatype.datatype(type_attr)
+
+    # scalars and plain arrays: size of the underlying dataset
+    if lgdotype in (
+        types.Scalar,
+        types.Array,
+        types.ArrayOfEqualSizedArrays,
+        types.FixedSizeArray,
+    ):
+        return int(np.prod(h5o.shape) * h5o.dtype.itemsize)
+
+    # structs and histograms: sum over all members
+    if lgdotype in (types.Struct, types.Histogram, types.Histogram.Axis):
+        size = 0
+        for key in h5o:
+            obj = h5py.h5o.open(h5o, key)
+            size += read_size_in_bytes(obj, fname, oname, field_mask)
+            obj.close()
+        return size
+
+    # tables: sum over the (optionally masked) fields
+    if lgdotype in (types.Table, types.WaveformTable):
+        # read out each of the fields
+        size = 0
+        if not field_mask:
+            field_mask = datatype.get_struct_fields(type_attr)
+        for field in field_mask:
+            obj = h5py.h5o.open(h5o, field.encode())
+            size += read_size_in_bytes(obj, fname, field)
+            obj.close()
+        return size
+
+    # vector of vectors: sum of cumulative_length and flattened_data
+    if lgdotype is types.VectorOfVectors:
+        size = 0
+        obj = h5py.h5o.open(h5o, b"cumulative_length")
+        size += read_size_in_bytes(obj, fname, "cumulative_length")
+        obj.close()
+        obj = h5py.h5o.open(h5o, b"flattened_data")
+        size += read_size_in_bytes(obj, fname, "flattened_data")
+        obj.close()
+        return size
+
+    # encoded arrays: decoded_size times the number of stored vectors
+    if lgdotype is types.ArrayOfEncodedEqualSizedArrays:
+        obj = h5py.h5o.open(h5o, b"decoded_size")
+        size = scalar._h5_read_scalar(obj, fname, "decoded_size")[0].value
+        obj.close()
+
+        obj = h5py.h5o.open(h5o, b"encoded_data")
+        cl = h5py.h5o.open(obj, b"cumulative_length")
+        size *= cl.shape[0]
+        size *= 4  # TODO: UPDATE WHEN CODECS SUPPORT MORE DTYPES
+        obj.close()
+
+        return size
+
+    msg = f"don't know how to read size of LGDO {lgdotype.__name__}"
+    raise LH5DecodeError(msg, fname, oname)
lgdo/lh5/_serializers/read/vector_of_vectors.py CHANGED
@@ -156,7 +156,7 @@ def _h5_read_vector_of_vectors(
     # grow fd_buf if necessary to hold the data
     fdb_size = fd_buf_start + fd_n_rows
     if len(fd_buf) < fdb_size:
-        fd_buf.resize(fdb_size)
+        fd_buf.nda.resize(fdb_size, refcheck=False)
 
     # now read
     h5o = h5py.h5o.open(h5g, b"flattened_data")
lgdo/lh5/core.py CHANGED
@@ -175,6 +175,9 @@ def read(
 
     if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
         idx = idx[0]
+    if isinstance(idx, np.ndarray) and idx.dtype == np.dtype("?"):
+        idx = np.where(idx)[0]
+
    obj, n_rows_read = _serializers._h5_read_lgdo(
        lh5_obj.id,
        lh5_obj.file.filename,
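
With the new branch, a boolean NumPy mask passed as `idx` is reduced to integer indices before the low-level read. A sketch of the equivalence (array values are illustrative):

    import numpy as np

    mask = np.array([True, False, True, True, False])
    assert mask.dtype == np.dtype("?")  # "?" is NumPy's boolean dtype code

    idx = np.where(mask)[0]  # what the reader uses internally
    assert (idx == np.array([0, 2, 3])).all()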
lgdo/lh5/iterator.py CHANGED
@@ -2,11 +2,14 @@ from __future__ import annotations
 
 import logging
 import typing
+from warnings import warn
 
 import numpy as np
 import pandas as pd
+from numpy.typing import NDArray
 
 from ..types import Array, Scalar, Struct, VectorOfVectors
+from ..units import default_units_registry as ureg
 from .store import LH5Store
 from .utils import expand_path
 
@@ -19,35 +22,53 @@ class LH5Iterator(typing.Iterator):
     at a time. This also accepts an entry list/mask to enable event selection,
     and a field mask.
 
-    This class can be used either for random access:
+    This can be used as an iterator:
 
-    >>> lh5_obj, n_rows = lh5_it.read(entry)
-
-    to read the block of entries starting at entry. In case of multiple files
-    or the use of an event selection, entry refers to a global event index
-    across files and does not count events that are excluded by the selection.
-
-    This can also be used as an iterator:
-
-    >>> for lh5_obj, entry, n_rows in LH5Iterator(...):
+    >>> for lh5_obj, i_entry, n_rows in LH5Iterator(...):
     >>>     # do the thing!
 
-    This is intended for if you are reading a large quantity of data but
-    want to limit your memory usage (particularly when reading in waveforms!).
+    This is intended for reading a large quantity of data. It
+    will ensure that you traverse files efficiently to minimize caching time
+    and will limit your memory usage (particularly when reading in waveforms!).
     The ``lh5_obj`` that is read by this class is reused in order to avoid
     reallocation of memory; this means that if you want to hold on to data
     between reads, you will have to copy it somewhere!
+
+    When defining an LH5Iterator, you must give it a list of files and the
+    HDF5 groups containing the data tables you are reading. You may also
+    provide a field mask, and an entry list or mask, specifying which entries
+    to read from the files. You may also pair it with a friend iterator, which
+    contains a parallel group of files that will be read simultaneously.
+    In addition to accessing requested data via ``lh5_obj``, several
+    properties exist to tell you where that data came from:
+
+    - lh5_it.current_local_entries: get the entry numbers relative to the
+      file the data came from
+    - lh5_it.current_global_entries: get the entry number relative to the
+      full dataset
+    - lh5_it.current_files: get the file name corresponding to each entry
+    - lh5_it.current_groups: get the group name corresponding to each entry
+
+    This class can also be used for random access:
+
+    >>> lh5_obj, n_rows = lh5_it.read(i_entry)
+
+    to read the block of entries starting at i_entry. In case of multiple files
+    or the use of an event selection, i_entry refers to a global event index
+    across files and does not count events that are excluded by the selection.
     """
 
     def __init__(
         self,
         lh5_files: str | list[str],
-        groups: str | list[str],
+        groups: str | list[str] | list[list[str]],
         base_path: str = "",
         entry_list: list[int] | list[list[int]] | None = None,
         entry_mask: list[bool] | list[list[bool]] | None = None,
         field_mask: dict[str, bool] | list[str] | tuple[str] | None = None,
-        buffer_len: int = 3200,
+        buffer_len: int | str = "100*MB",
+        file_cache: int = 10,
+        file_map: NDArray[int] | None = None,
         friend: typing.Iterator | None = None,
     ) -> None:
@@ -57,9 +78,10 @@ class LH5Iterator(typing.Iterator):
             file or files to read from. May include wildcards and environment
             variables.
         groups
-            HDF5 group(s) to read. If a list is provided for both lh5_files
-            and group, they must be the same size. If a file is wild-carded,
-            the same group will be assigned to each file found
+            HDF5 group(s) to read. If a list of strings is provided, use the
+            same groups for each file. If a list of lists is provided, the size
+            of the outer list must match the size of the file list, and each
+            inner list will apply to a single file (or set of wildcarded files)
         entry_list
             list of entry numbers to read. If a nested list is provided,
             expect one top-level list for each file, containing a list of
@@ -72,66 +94,98 @@
             more details.
         buffer_len
             number of entries to read at a time while iterating through files.
+        file_cache
+            maximum number of files to keep open at a time
+        file_map
+            cumulative file/group entries. This can be provided on construction
+            to speed up random or sparse access; otherwise, we sequentially
+            read the size of each group. WARNING: no checks for accuracy are
+            performed so only use this if you know what you are doing!
         friend
             a "friend" LH5Iterator that will be read in parallel with this.
             The friend should have the same length and entry list. A single
             LH5 table containing columns from both iterators will be returned.
+            Note that buffer_len will be set to the minimum of the two.
         """
-        self.lh5_st = LH5Store(base_path=base_path, keep_open=True)
+        self.lh5_st = LH5Store(base_path=base_path, keep_open=file_cache)
 
         # List of files, with wildcards and env vars expanded
         if isinstance(lh5_files, str):
             lh5_files = [lh5_files]
-            if isinstance(groups, list):
-                lh5_files *= len(groups)
-        elif not isinstance(lh5_files, list):
+        elif not isinstance(lh5_files, (list, set, tuple)):
             msg = "lh5_files must be a string or list of strings"
             raise ValueError(msg)
 
         if isinstance(groups, str):
-            groups = [groups] * len(lh5_files)
+            groups = [[groups]] * len(lh5_files)
         elif not isinstance(groups, list):
-            msg = "group must be a string or list of strings"
+            msg = "group must be a string or appropriate list"
+            raise ValueError(msg)
+        elif all(isinstance(g, str) for g in groups):
+            groups = [groups] * len(lh5_files)
+        elif len(groups) == len(lh5_files) and all(
+            isinstance(gr_list, (list, set, tuple)) for gr_list in groups
+        ):
+            pass
+        else:
+            msg = "group must be a string or appropriate list"
             raise ValueError(msg)
 
         if len(groups) != len(lh5_files):
             msg = "lh5_files and groups must have same length"
             raise ValueError(msg)
 
+        # make flattened outer-product-like list of files and groups
         self.lh5_files = []
         self.groups = []
         for f, g in zip(lh5_files, groups):
-            f_exp = expand_path(f, list=True, base_path=base_path)
-            self.lh5_files += f_exp
-            self.groups += [g] * len(f_exp)
+            for f_exp in expand_path(f, list=True, base_path=base_path):
+                self.lh5_files += [f_exp] * len(g)
+                self.groups += list(g)
 
         if entry_list is not None and entry_mask is not None:
             msg = "entry_list and entry_mask arguments are mutually exclusive"
             raise ValueError(msg)
 
         # Map to last row in each file
-        self.file_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
+        if file_map is None:
+            self.file_map = np.full(len(self.lh5_files), np.iinfo("q").max, "q")
+        else:
+            self.file_map = np.array(file_map)
+
         # Map to last iterator entry for each file
-        self.entry_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
+        self.entry_map = np.full(len(self.lh5_files), np.iinfo("q").max, "q")
         self.buffer_len = buffer_len
 
         if len(self.lh5_files) > 0:
             f = self.lh5_files[0]
             g = self.groups[0]
+            n_rows = self.lh5_st.read_n_rows(g, f)
+
+            if isinstance(self.buffer_len, str):
+                self.buffer_len = ureg.Quantity(buffer_len)
+            if isinstance(self.buffer_len, ureg.Quantity):
+                self.buffer_len = int(
+                    self.buffer_len
+                    / (self.lh5_st.read_size_in_bytes(g, f) * ureg.B)
+                    * n_rows
+                )
+
             self.lh5_buffer = self.lh5_st.get_buffer(
                 g,
                 f,
                 size=self.buffer_len,
                 field_mask=field_mask,
             )
-            self.file_map[0] = self.lh5_st.read_n_rows(g, f)
+            if file_map is None:
+                self.file_map[0] = n_rows
         else:
             msg = f"can't open any files from {lh5_files}"
             raise RuntimeError(msg)
 
         self.n_rows = 0
-        self.current_entry = 0
-        self.next_entry = 0
+        self.current_i_entry = 0
+        self.next_i_entry = 0
 
         self.field_mask = field_mask
@@ -142,13 +196,13 @@
             entry_list = list(entry_list)
             if isinstance(entry_list[0], int):
                 self.local_entry_list = [None] * len(self.file_map)
-                self.global_entry_list = np.array(entry_list, "i")
+                self.global_entry_list = np.array(entry_list, "q")
                 self.global_entry_list.sort()
 
             else:
                 self.local_entry_list = [[]] * len(self.file_map)
                 for i_file, local_list in enumerate(entry_list):
-                    self.local_entry_list[i_file] = np.array(local_list, "i")
+                    self.local_entry_list[i_file] = np.array(local_list, "q")
                     self.local_entry_list[i_file].sort()
 
         elif entry_mask is not None:
@@ -168,6 +222,15 @@
             if not isinstance(friend, typing.Iterator):
                 msg = "Friend must be an Iterator"
                 raise ValueError(msg)
+
+            # set buffer_lens to be equal
+            if self.buffer_len < friend.buffer_len:
+                friend.buffer_len = self.buffer_len
+                friend.lh5_buffer.resize(self.buffer_len)
+            elif self.buffer_len > friend.buffer_len:
+                self.buffer_len = friend.buffer_len
+                self.lh5_buffer.resize(friend.buffer_len)
+
             self.lh5_buffer.join(friend.lh5_buffer)
             self.friend = friend
@@ -176,33 +239,52 @@
         if i_file < 0:
             return 0
         fcl = self.file_map[i_file]
-        if fcl == np.iinfo("i").max:
-            fcl = self._get_file_cumlen(i_file - 1) + self.lh5_st.read_n_rows(
-                self.groups[i_file], self.lh5_files[i_file]
-            )
-            self.file_map[i_file] = fcl
+
+        # if we haven't already calculated, calculate for all files up to i_file
+        if fcl == np.iinfo("q").max:
+            i_start = np.searchsorted(self.file_map, np.iinfo("q").max)
+            fcl = self.file_map[i_start - 1] if i_start > 0 else 0
+
+            for i in range(i_start, i_file + 1):
+                fcl += self.lh5_st.read_n_rows(self.groups[i], self.lh5_files[i])
+                self.file_map[i] = fcl
         return fcl
 
+    @property
+    def current_entry(self) -> int:
+        "deprecated alias for current_i_entry"
+        warn(
+            "current_entry has been renamed to current_i_entry.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        return self.current_i_entry
+
     def _get_file_cumentries(self, i_file: int) -> int:
         """Helper to get cumulative iterator entries in file"""
         if i_file < 0:
             return 0
         n = self.entry_map[i_file]
-        if n == np.iinfo("i").max:
-            elist = self.get_file_entrylist(i_file)
-            fcl = self._get_file_cumlen(i_file)
-            if elist is None:
-                # no entry list provided
-                n = fcl
-            else:
-                file_entries = self.get_file_entrylist(i_file)
-                n = len(file_entries)
-                # check that file entries fall inside of file
-                if n > 0 and file_entries[-1] >= fcl:
-                    logging.warning(f"Found entries out of range for file {i_file}")
-                    n = np.searchsorted(file_entries, fcl, "right")
-            n += self._get_file_cumentries(i_file - 1)
-            self.entry_map[i_file] = n
+
+        # if we haven't already calculated, calculate for all files up to i_file
+        if n == np.iinfo("q").max:
+            i_start = np.searchsorted(self.entry_map, np.iinfo("q").max)
+            n = self.entry_map[i_start - 1] if i_start > 0 else 0
+
+            for i in range(i_start, i_file + 1):
+                elist = self.get_file_entrylist(i)
+                fcl = self._get_file_cumlen(i)
+                if elist is None:
+                    # no entry list provided
+                    n = fcl
+                else:
+                    n += len(elist)
+                    # check that file entries fall inside of file
+                    if len(elist) > 0 and elist[-1] >= fcl:
+                        logging.warning(f"Found entries out of range for file {i}")
+                        n += np.searchsorted(elist, fcl, "right") - len(elist)
+                self.entry_map[i] = n
         return n
 
     def get_file_entrylist(self, i_file: int) -> np.ndarray:
@@ -218,50 +300,50 @@
             f_end = self._get_file_cumlen(i_file)
             i_start = self._get_file_cumentries(i_file - 1)
             i_stop = np.searchsorted(self.global_entry_list, f_end, "right")
-            elist = np.array(self.global_entry_list[i_start:i_stop], "i") - f_start
+            elist = np.array(self.global_entry_list[i_start:i_stop], "q") - f_start
             self.local_entry_list[i_file] = elist
         return elist
 
     def get_global_entrylist(self) -> np.ndarray:
         """Get global entry list, constructing it if needed"""
         if self.global_entry_list is None and self.local_entry_list is not None:
-            self.global_entry_list = np.zeros(len(self), "i")
+            self.global_entry_list = np.zeros(len(self), "q")
             for i_file in range(len(self.lh5_files)):
-                i_start = self.get_file_cumentries(i_file - 1)
-                i_stop = self.get_file_cumentries(i_file)
-                f_start = self.get_file_cumlen(i_file - 1)
+                i_start = self._get_file_cumentries(i_file - 1)
+                i_stop = self._get_file_cumentries(i_file)
+                f_start = self._get_file_cumlen(i_file - 1)
                 self.global_entry_list[i_start:i_stop] = (
                     self.get_file_entrylist(i_file) + f_start
                 )
         return self.global_entry_list
 
-    def read(self, entry: int) -> tuple[LGDO, int]:
-        """Read the nextlocal chunk of events, starting at entry. Return the
+    def read(self, i_entry: int) -> tuple[LGDO, int]:
+        """Read the next local chunk of events, starting at i_entry. Return the
         LH5 buffer and number of rows read."""
         self.n_rows = 0
-        i_file = np.searchsorted(self.entry_map, entry, "right")
+        i_file = np.searchsorted(self.entry_map, i_entry, "right")
 
         # if file hasn't been opened yet, search through files
         # sequentially until we find the right one
-        if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("i").max:
-            while i_file < len(self.lh5_files) and entry >= self._get_file_cumentries(
+        if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("q").max:
+            while i_file < len(self.lh5_files) and i_entry >= self._get_file_cumentries(
                 i_file
             ):
                 i_file += 1
 
         if i_file == len(self.lh5_files):
             return (self.lh5_buffer, self.n_rows)
-        local_entry = entry - self._get_file_cumentries(i_file - 1)
+        local_i_entry = i_entry - self._get_file_cumentries(i_file - 1)
 
         while self.n_rows < self.buffer_len and i_file < len(self.file_map):
             # Loop through files
             local_idx = self.get_file_entrylist(i_file)
             if local_idx is not None and len(local_idx) == 0:
                 i_file += 1
-                local_entry = 0
+                local_i_entry = 0
                 continue
 
-            i_local = local_idx[local_entry] if local_idx is not None else local_entry
+            i_local = local_i_entry if local_idx is None else local_idx[local_i_entry]
             self.lh5_buffer, n_rows = self.lh5_st.read(
                 self.groups[i_file],
                 self.lh5_files[i_file],
@@ -275,12 +357,12 @@
 
             self.n_rows += n_rows
             i_file += 1
-            local_entry = 0
+            local_i_entry = 0
 
-        self.current_entry = entry
+        self.current_i_entry = i_entry
 
         if self.friend is not None:
-            self.friend.read(entry)
+            self.friend.read(i_entry)
 
         return (self.lh5_buffer, self.n_rows)
 
@@ -290,6 +372,108 @@
         if self.friend is not None:
             self.friend.reset_field_mask(mask)
 
+    @property
+    def current_local_entries(self) -> NDArray[int]:
+        """Return list of local file entries in buffer"""
+        cur_entries = np.zeros(self.n_rows, dtype="int32")
+        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
+        file_start = self._get_file_cumentries(i_file - 1)
+        i_local = self.current_i_entry - file_start
+        i = 0
+
+        while i < len(cur_entries):
+            # number of entries to read from this file
+            file_end = self._get_file_cumentries(i_file)
+            n = min(file_end - file_start - i_local, len(cur_entries) - i)
+            entries = self.get_file_entrylist(i_file)
+
+            if entries is None:
+                cur_entries[i : i + n] = np.arange(i_local, i_local + n)
+            else:
+                cur_entries[i : i + n] = entries[i_local : i_local + n]
+
+            i_file += 1
+            file_start = file_end
+            i_local = 0
+            i += n
+
+        return cur_entries
+
+    @property
+    def current_global_entries(self) -> NDArray[int]:
+        """Return list of global entries in buffer"""
+        cur_entries = np.zeros(self.n_rows, dtype="int32")
+        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
+        file_start = self._get_file_cumentries(i_file - 1)
+        i_local = self.current_i_entry - file_start
+        i = 0
+
+        while i < len(cur_entries):
+            # number of entries to read from this file
+            file_end = self._get_file_cumentries(i_file)
+            n = min(file_end - file_start - i_local, len(cur_entries) - i)
+            entries = self.get_file_entrylist(i_file)
+
+            if entries is None:
+                cur_entries[i : i + n] = self._get_file_cumlen(i_file - 1) + np.arange(
+                    i_local, i_local + n
+                )
+            else:
+                cur_entries[i : i + n] = (
+                    self._get_file_cumlen(i_file - 1) + entries[i_local : i_local + n]
+                )
+
+            i_file += 1
+            file_start = file_end
+            i_local = 0
+            i += n
+
+        return cur_entries
+
+    @property
+    def current_files(self) -> NDArray[str]:
+        """Return list of file names for entries in buffer"""
+        cur_files = np.zeros(self.n_rows, dtype=object)
+        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
+        file_start = self._get_file_cumentries(i_file - 1)
+        i_local = self.current_i_entry - file_start
+        i = 0
+
+        while i < len(cur_files):
+            # number of entries to read from this file
+            file_end = self._get_file_cumentries(i_file)
+            n = min(file_end - file_start - i_local, len(cur_files) - i)
+            cur_files[i : i + n] = self.lh5_files[i_file]
+
+            i_file += 1
+            file_start = file_end
+            i_local = 0
+            i += n
+
+        return cur_files
+
+    @property
+    def current_groups(self) -> NDArray[str]:
+        """Return list of group names for entries in buffer"""
+        cur_groups = np.zeros(self.n_rows, dtype=object)
+        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
+        file_start = self._get_file_cumentries(i_file - 1)
+        i_local = self.current_i_entry - file_start
+        i = 0
+
+        while i < len(cur_groups):
+            # number of entries to read from this file
+            file_end = self._get_file_cumentries(i_file)
+            n = min(file_end - file_start - i_local, len(cur_groups) - i)
+            cur_groups[i : i + n] = self.groups[i_file]
+
+            i_file += 1
+            file_start = file_end
+            i_local = 0
+            i += n
+
+        return cur_groups
+
     def __len__(self) -> int:
         """Return the total number of entries."""
         return (
@@ -300,15 +484,15 @@
 
     def __iter__(self) -> typing.Iterator:
         """Loop through entries in blocks of size buffer_len."""
-        self.current_entry = 0
-        self.next_entry = 0
+        self.current_i_entry = 0
+        self.next_i_entry = 0
         return self
 
     def __next__(self) -> tuple[LGDO, int, int]:
         """Read next buffer_len entries and return lh5_table, iterator entry
         and n_rows read."""
-        buf, n_rows = self.read(self.next_entry)
-        self.next_entry = self.current_entry + n_rows
+        buf, n_rows = self.read(self.next_i_entry)
+        self.next_i_entry = self.current_i_entry + n_rows
         if n_rows == 0:
             raise StopIteration
-        return (buf, self.current_entry, n_rows)
+        return (buf, self.current_i_entry, n_rows)
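
Continuing the hypothetical iterator `it` from the sketch above, the new `current_*` properties expose the provenance of each buffered row:

    for lh5_obj, i_entry, n_rows in it:
        files = it.current_files             # file name per entry
        groups = it.current_groups           # HDF5 group per entry
        local = it.current_local_entries     # row number within its file
        global_ = it.current_global_entries  # row number within the full dataset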
lgdo/lh5/store.py CHANGED
@@ -9,6 +9,7 @@ import bisect
 import logging
 import os
 import sys
+from collections import OrderedDict
 from collections.abc import Mapping, Sequence
 from inspect import signature
 from typing import Any
@@ -47,14 +48,15 @@
             directory path to prepend to LH5 files.
         keep_open
             whether to keep files open by storing the :mod:`h5py` objects as
-            class attributes.
+            class attributes. If ``keep_open`` is an ``int``, keep only the
+            ``n`` most recently opened files; if ``True``, there is no limit.
         locking
             whether to lock files when reading
         """
         self.base_path = "" if base_path == "" else utils.expand_path(base_path)
         self.keep_open = keep_open
         self.locking = locking
-        self.files = {}
+        self.files = OrderedDict()
 
     def gimme_file(
         self,
@@ -87,6 +89,7 @@ class LH5Store:
         file_kwargs["locking"] = self.locking
 
         if lh5_file in self.files:
+            self.files.move_to_end(lh5_file)
             return self.files[lh5_file]
 
         if self.base_path != "":
@@ -120,6 +123,8 @@
         h5f = h5py.File(full_path, mode, **file_kwargs)
 
         if self.keep_open:
+            if isinstance(self.keep_open, int) and len(self.files) >= self.keep_open:
+                self.files.popitem(last=False)
             self.files[lh5_file] = h5f
 
         return h5f
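
The cache logic above is a plain LRU on top of `collections.OrderedDict`: a hit moves the key to the end, and an insert past capacity evicts the oldest entry. A minimal standalone sketch of the same pattern (names are illustrative):

    from collections import OrderedDict

    cache = OrderedDict()
    CAPACITY = 3

    def get_or_open(key, open_fn):
        """Return a cached handle, evicting LRU-style when full."""
        if key in cache:
            cache.move_to_end(key)  # mark as most recently used
            return cache[key]
        handle = open_fn(key)
        if len(cache) >= CAPACITY:
            cache.popitem(last=False)  # drop the least recently used
        cache[key] = handle
        return handle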
@@ -228,6 +233,9 @@
 
         if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
             idx = idx[0]
+        if isinstance(idx, np.ndarray) and idx.dtype == np.dtype("?"):
+            idx = np.where(idx)[0]
+
         return _serializers._h5_read_lgdo(
             lh5_obj.id,
             lh5_obj.file.filename,
@@ -307,3 +315,9 @@
         Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`.
         """
         return utils.read_n_rows(name, self.gimme_file(lh5_file, "r"))
+
+    def read_size_in_bytes(self, name: str, lh5_file: str | h5py.File) -> int:
+        """Look up the size (in B) of the object in memory. Will recursively
+        crawl through all objects in a Struct or Table.
+        """
+        return utils.read_size_in_bytes(name, self.gimme_file(lh5_file, "r"))
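
A hypothetical use of the new method, e.g. to derive a row budget from a memory budget the way LH5Iterator now does internally (object and file names are placeholders):

    from lgdo.lh5 import LH5Store

    store = LH5Store(keep_open=10)

    nbytes = store.read_size_in_bytes("geds/raw", "data.lh5")  # full in-memory size
    n_rows = store.read_n_rows("geds/raw", "data.lh5")

    rows_per_100mb = int(100e6 / nbytes * n_rows)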
lgdo/lh5/utils.py CHANGED
@@ -12,7 +12,7 @@ from typing import Any
 
 import h5py
 
 from .. import types
-from . import _serializers, datatype
+from . import _serializers
 from .exceptions import LH5DecodeError
 
 log = logging.getLogger(__name__)
@@ -44,57 +44,31 @@ def read_n_rows(name: str, h5f: str | h5py.File) -> int | None:
     Return ``None`` if `name` is a :class:`.Scalar` or a :class:`.Struct`.
     """
     if not isinstance(h5f, h5py.File):
-        h5f = h5py.File(h5f, "r")
+        h5f = h5py.File(h5f, "r", locking=False)
 
     try:
-        attrs = h5f[name].attrs
+        h5o = h5f[name].id
     except KeyError as e:
         msg = "not found"
         raise LH5DecodeError(msg, h5f, name) from e
-    except AttributeError as e:
-        msg = "missing 'datatype' attribute"
-        raise LH5DecodeError(msg, h5f, name) from e
 
-    lgdotype = datatype.datatype(attrs["datatype"])
-
-    # scalars are dim-0 datasets
-    if lgdotype is types.Scalar:
-        return None
-
-    # structs don't have rows
-    if lgdotype is types.Struct:
-        return None
-
-    # tables should have elements with all the same length
-    if lgdotype is types.Table:
-        # read out each of the fields
-        rows_read = None
-        for field in datatype.get_struct_fields(attrs["datatype"]):
-            n_rows_read = read_n_rows(name + "/" + field, h5f)
-            if not rows_read:
-                rows_read = n_rows_read
-            elif rows_read != n_rows_read:
-                log.warning(
-                    f"'{field}' field in table '{name}' has {rows_read} rows, "
-                    f"{n_rows_read} was expected"
-                )
-        return rows_read
+    return _serializers.read.utils.read_n_rows(h5o, h5f.name, name)
 
-    # length of vector of vectors is the length of its cumulative_length
-    if lgdotype is types.VectorOfVectors:
-        return read_n_rows(f"{name}/cumulative_length", h5f)
 
-    # length of vector of encoded vectors is the length of its decoded_size
-    if lgdotype in (types.VectorOfEncodedVectors, types.ArrayOfEncodedEqualSizedArrays):
-        return read_n_rows(f"{name}/encoded_data", h5f)
+def read_size_in_bytes(name: str, h5f: str | h5py.File) -> int | None:
+    """Look up the size (in B) of an LGDO object in memory. Will crawl
+    recursively through members of a Struct or Table.
+    """
+    if not isinstance(h5f, h5py.File):
+        h5f = h5py.File(h5f, "r", locking=False)
 
-    # return array length (without reading the array!)
-    if issubclass(lgdotype, types.Array):
-        # compute the number of rows to read
-        return h5f[name].shape[0]
+    try:
+        h5o = h5f[name].id
+    except KeyError as e:
+        msg = "not found"
+        raise LH5DecodeError(msg, h5f, name) from e
 
-    msg = f"don't know how to read rows of LGDO {lgdotype.__name__}"
-    raise LH5DecodeError(msg, h5f, name)
+    return _serializers.read.utils.read_size_in_bytes(h5o, h5f.name, name)
 
 
 def get_h5_group(
lgdo/types/histogram.py CHANGED
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
 import logging
-from collections.abc import Iterable
+from collections.abc import Iterable, Mapping, Sequence
 from typing import Any
 
 import hist
 import numpy as np
+import pandas as pd
 from numpy.typing import NDArray
 
 from .array import Array
@@ -269,10 +270,10 @@
                 b.append(Histogram.Axis.from_edges(ax.edges, binedge_attrs))
         else:
             if binning is None:
-                msg = "need to also pass binning if passing histogram as array"
+                msg = "need to pass binning to construct Histogram"
                 raise ValueError(msg)
-            w = weights if isinstance(weights, Array) else Array(weights)
 
+            # set up binning
             if all(isinstance(ax, Histogram.Axis) for ax in binning):
                 if binedge_attrs is not None:
                     msg = "passed both binedges as Axis instances and binedge_attrs"
@@ -286,6 +287,14 @@
                 msg = "invalid binning object passed"
                 raise ValueError(msg)
 
+            # set up bin weights
+            if isinstance(weights, Array):
+                w = weights
+            elif weights is None:
+                w = Array(shape=[ax.nbins for ax in b], fill_val=0, dtype=np.float32)
+            else:
+                w = Array(weights)
+
             if len(binning) != len(w.nda.shape):
                 msg = "binning and weight dimensions do not match"
                 raise ValueError(msg)
@@ -315,6 +324,98 @@
         assert all(isinstance(v, Histogram.Axis) for k, v in bins)
         return tuple(v for _, v in bins)
 
+    def fill(self, data, w: NDArray = None, keys: Sequence[str] = None) -> None:
+        """Fill histogram by incrementing bins with data points weighted by w
+
+        Parameters
+        ----------
+        data
+            an ndarray with inner dimension equal to the number of axes, or a
+            list of equal-length 1d-arrays containing data for each axis, or a
+            Mapping to 1d-arrays containing data for each axis (requires keys),
+            or a pandas DataFrame (optionally takes a list of keys)
+        w
+            weight to use for incrementing data points. If None, use 1 for all
+        keys
+            list of keys to use if data is a pandas ``DataFrame`` or ``Mapping``
+        """
+        if keys is not None:
+            if isinstance(keys, str):
+                keys = [keys]
+            elif not isinstance(keys, list):
+                keys = list(keys)
+
+        if (
+            isinstance(data, np.ndarray)
+            and len(data.shape) == 1
+            and len(self.binning) == 1
+        ):
+            N = len(data)
+            data = [data]
+        elif (
+            isinstance(data, np.ndarray)
+            and len(data.shape) == 2
+            and data.shape[1] == len(self.binning)
+        ):
+            N = data.shape[0]
+            data = data.T
+        elif isinstance(data, pd.DataFrame) and (
+            (keys is not None and len(keys) == len(self.binning))
+            or data.ndim == len(self.binning)
+        ):
+            if keys is not None:
+                data = data[keys]
+            N = len(data)
+            data = data.values.T
+        elif isinstance(data, Sequence) and len(data) == len(self.binning):
+            data = [d if isinstance(d, np.ndarray) else np.array(d) for d in data]
+            N = len(data[0])
+            if not all(len(d) == N for d in data):
+                msg = "length of all data arrays must be equal"
+                raise ValueError(msg)
+        elif isinstance(data, Mapping):
+            if not isinstance(keys, Sequence) or len(keys) != len(self.binning):
+                msg = "filling hist with Mapping data requires a list of keys with same length as histogram rank"
+                raise ValueError(msg)
+            data = [
+                data[k] if isinstance(data[k], np.ndarray) else np.array(data[k])
+                for k in keys
+            ]
+            N = len(data[0])
+            if not all(len(d) == N for d in data):
+                msg = "length of all data arrays must be equal"
+                raise ValueError(msg)
+        else:
+            msg = "data must be 2D numpy array or list of 1D arrays with length equal to number of axes"
+            raise ValueError(msg)
+
+        idx = np.zeros(N, np.float64)  # bin indices for flattened array
+        oor_mask = np.ones(N, np.bool_)  # mask to remove out-of-range values
+        stride = [s // self.weights.dtype.itemsize for s in self.weights.nda.strides]
+        for col, ax, s in zip(data, self.binning, stride):
+            if ax.is_range:
+                idx += s * np.floor((col - ax.first) / ax.step - int(not ax.closedleft))
+                if ax.closedleft:
+                    oor_mask &= (ax.first <= col) & (col < ax.last)
+                else:
+                    oor_mask &= (ax.first < col) & (col <= ax.last)
+            else:
+                idx += s * (
+                    np.searchsorted(
+                        ax.edges, col, side=("right" if ax.closedleft else "left")
+                    )
+                    - 1
+                )
+                if ax.closedleft:
+                    oor_mask &= (ax.edges[0] <= col) & (col < ax.edges[-1])
+                else:
+                    oor_mask &= (ax.edges[0] < col) & (col <= ax.edges[-1])
+
+        # increment bin contents
+        idx = idx[oor_mask].astype(np.int64)
+        w = w[oor_mask] if w is not None else 1
+        np.add.at(self.weights.nda.reshape(-1), idx, w)
+
     def __setitem__(self, name: str, obj: LGDO) -> None:
         # do not allow for new attributes on this
         msg = "histogram fields cannot be mutated"
lgdo/types/table.py CHANGED
@@ -450,7 +450,7 @@ class Table(Struct):
             cols = self.keys()
 
         if library == "pd":
-            df = pd.DataFrame()
+            df = {}
 
             for col in cols:
                 data = self[col]
@@ -470,7 +470,7 @@
                 )
                 df[f"{prefix}{col}"] = data.view_as("pd", with_units=with_units)
 
-            return df
+            return pd.DataFrame(df, copy=False)
 
         if library == "np":
             msg = f"Format {library!r} is not supported for Tables."
lgdo/types/vectorofvectors.py CHANGED
@@ -632,7 +632,7 @@ class VectorOfVectors(LGDO):
         offsets = np.empty(
             len(self.cumulative_length) + 1, dtype=self.cumulative_length.dtype
         )
-        offsets[1:] = self.cumulative_length
+        offsets[1:] = self.cumulative_length.nda
         offsets[0] = 0
 
         content = (