legend-pydataobj 1.10.0__py3-none-any.whl → 1.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {legend_pydataobj-1.10.0.dist-info → legend_pydataobj-1.10.2.dist-info}/METADATA +1 -1
- {legend_pydataobj-1.10.0.dist-info → legend_pydataobj-1.10.2.dist-info}/RECORD +20 -20
- {legend_pydataobj-1.10.0.dist-info → legend_pydataobj-1.10.2.dist-info}/WHEEL +1 -1
- lgdo/_version.py +2 -2
- lgdo/compression/radware.py +8 -16
- lgdo/lh5/_serializers/read/composite.py +1 -2
- lgdo/lh5/_serializers/read/ndarray.py +4 -5
- lgdo/lh5/_serializers/read/scalar.py +2 -2
- lgdo/lh5/_serializers/read/utils.py +140 -1
- lgdo/lh5/_serializers/read/vector_of_vectors.py +1 -1
- lgdo/lh5/core.py +7 -0
- lgdo/lh5/iterator.py +258 -74
- lgdo/lh5/store.py +16 -2
- lgdo/lh5/utils.py +16 -42
- lgdo/types/histogram.py +104 -3
- lgdo/types/table.py +2 -2
- lgdo/types/vectorofvectors.py +2 -2
- {legend_pydataobj-1.10.0.dist-info → legend_pydataobj-1.10.2.dist-info}/LICENSE +0 -0
- {legend_pydataobj-1.10.0.dist-info → legend_pydataobj-1.10.2.dist-info}/entry_points.txt +0 -0
- {legend_pydataobj-1.10.0.dist-info → legend_pydataobj-1.10.2.dist-info}/top_level.txt +0 -0
{legend_pydataobj-1.10.0.dist-info → legend_pydataobj-1.10.2.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 lgdo/__init__.py,sha256=1YUuAFQHNrOOkr3ZfrtEJOpYqgzbHRYA81ssbQZitQE,3196
-lgdo/_version.py,sha256=
+lgdo/_version.py,sha256=OsEgLsDG1xRQRG0sI3ZjVZ80k63_OfDzWK-5WpWx-Bs,413
 lgdo/cli.py,sha256=vB1Oj6kZ5gWaY9HBPBRRRyiepp72hm3bFvQeUUWeMYg,8214
 lgdo/lgdo_utils.py,sha256=6a2YWEwpyEMXlAyTHZMO01aqxy6SxJzPZkGNWKNWuS0,2567
 lgdo/lh5_store.py,sha256=5BzbJA9sLcqjp8bJDc2olwOiw0VS6rmfg3cfh1kQkRY,8512
@@ -9,26 +9,26 @@ lgdo/utils.py,sha256=9t_GYdB8aQhZ4Vz6ujmASzwCgTuP7ZdINtPTVPyIR6E,3661
 lgdo/compression/__init__.py,sha256=gqbdx4NnpCcW-C7kUXV-hVUZFiNlbCwIbs3uzFe4AFE,1127
 lgdo/compression/base.py,sha256=82cQJujfvoAOKBFx761dEcx_xM02TBCBBuBo6i78tuI,838
 lgdo/compression/generic.py,sha256=tF3UhLJbUDcovLxpIzgQRxFSjZ5Fz3uDRy9kI4mFntQ,2515
-lgdo/compression/radware.py,sha256=
+lgdo/compression/radware.py,sha256=GcNTtjuyL7VBBqziUBmSqNXuhqy1bJJgvcyvyumPtrc,23839
 lgdo/compression/utils.py,sha256=W2RkBrxPpXlat84dnU9Ad7d_tTws0irtGl7O1dNWjnk,1140
 lgdo/compression/varlen.py,sha256=6ZZUItyoOfygDdE0DyoISeFZfqdbH6xl7T0eclfarzg,15127
 lgdo/lh5/__init__.py,sha256=KzWF6HI-6N1NqQUm8LAxMmDbg0rgRY4DAaJ2s7w2tLM,811
-lgdo/lh5/core.py,sha256=
+lgdo/lh5/core.py,sha256=__-A6Abctzfwfo4-xJi68xs2e4vfzONEQTJVrUCOw-I,13922
 lgdo/lh5/datatype.py,sha256=O_7BqOlX8PFMyG0ppkfUT5aps5HEqX0bpuKcJO3jhu0,1691
 lgdo/lh5/exceptions.py,sha256=43fQ8MnAsylY4aG6GF6hsRclagYaMkUv8957c1uTjWE,962
-lgdo/lh5/iterator.py,sha256=
-lgdo/lh5/store.py,sha256=
+lgdo/lh5/iterator.py,sha256=ZaBBnmuNIjinwO0JUY55wLxX8Om9rVRRzXBC5uHmSKM,19772
+lgdo/lh5/store.py,sha256=3wAaQDd1Zmo0_bQ9DbB-FbKS4Uy_Tb642qKHXtZpSw4,10643
 lgdo/lh5/tools.py,sha256=T9CgHA8A3_tVBMtiNJ6hATQKhdqI61m3cX4p2wGKc6c,9937
-lgdo/lh5/utils.py,sha256=
+lgdo/lh5/utils.py,sha256=ioz8DlyXZsejwnU2qYdIccdHcF12H62jgLkZsiDOLSM,6243
 lgdo/lh5/_serializers/__init__.py,sha256=NSH8uOVY3r_Wn3t0nQHhEHhkHT7-GJYlxuS3YTDJa5Y,1263
 lgdo/lh5/_serializers/read/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lgdo/lh5/_serializers/read/array.py,sha256=uWfMCihfAmW2DE2ewip2qCK_kvQC_mb2zvOv26uzijc,1000
-lgdo/lh5/_serializers/read/composite.py,sha256=
+lgdo/lh5/_serializers/read/composite.py,sha256=vQGh6nUQdSuHl0NTF2FeU7QC9UAA-E7XvNvrRJi4uw8,12384
 lgdo/lh5/_serializers/read/encoded.py,sha256=Q98c08d8LkZq2AlY4rThYECVaEqwbv4T2Urn7TGnsyE,4130
-lgdo/lh5/_serializers/read/ndarray.py,sha256=
-lgdo/lh5/_serializers/read/scalar.py,sha256=
-lgdo/lh5/_serializers/read/utils.py,sha256=
-lgdo/lh5/_serializers/read/vector_of_vectors.py,sha256=
+lgdo/lh5/_serializers/read/ndarray.py,sha256=lFCXD6bSzmMOH7cVmvRYXakkfMCI8EoqTPNONRJ1F0s,3690
+lgdo/lh5/_serializers/read/scalar.py,sha256=kwhWm1T91pXf86CqtUUD8_qheSR92gXZrQVtssV5YCg,922
+lgdo/lh5/_serializers/read/utils.py,sha256=0kYUFKiaQ3JUbjhP7tuKas_s80Kou6DhPlVXc40NHRE,5945
+lgdo/lh5/_serializers/read/vector_of_vectors.py,sha256=aCWTMbym7dF2yrhEfQs_GotcDqOKALRxgdJm4CA-bYs,7189
 lgdo/lh5/_serializers/write/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lgdo/lh5/_serializers/write/array.py,sha256=eyVPwwddVOR9TNNyliCNYlS-XYXsdTEA8QoTnnOYJbw,2943
 lgdo/lh5/_serializers/write/composite.py,sha256=I6lH0nWFIpAfZyG4-0rLxzg3mfazZ_FEhQVp1FZ0aA4,9254
@@ -39,17 +39,17 @@ lgdo/types/array.py,sha256=sUxh1CNCaefrnybt5qdjmmMpVQa_RqFxUv1tJ_pyBbc,6537
 lgdo/types/arrayofequalsizedarrays.py,sha256=DOGJiTmc1QCdm7vLbE6uIRXoMPtt8uuCfmwQawgWf5s,4949
 lgdo/types/encoded.py,sha256=JW4U5ow7KLMzhKnmhdnxbC3SZJAs4bOEDZWKG4KY1uU,15293
 lgdo/types/fixedsizearray.py,sha256=7RjUwTz1bW0pcrdy27JlfrXPAuOU89Kj7pOuSUCojK8,1527
-lgdo/types/histogram.py,sha256
+lgdo/types/histogram.py,sha256=XuE81aRXgIY-g-rFgr9Jo7KZ-0tsNpq1lRVRyA4uTSQ,19679
 lgdo/types/lgdo.py,sha256=UnJDi1emQYVgH_H29Vipfs4LelPopxG5pgZUu1eKOlw,2761
 lgdo/types/scalar.py,sha256=c5Es2vyDqyWTPV6mujzfIzMpC1jNWkEIcvYyWQUxH3Q,1933
 lgdo/types/struct.py,sha256=Q0OWLVd4B0ciLb8t6VsxU3MPbmGLZ7WfQNno1lSQS0Q,4918
-lgdo/types/table.py,sha256=
-lgdo/types/vectorofvectors.py,sha256=
+lgdo/types/table.py,sha256=lB_jj6C0C5w8jbo17Lp0P8_uY8jy7opkTJc1OrbCGEI,17956
+lgdo/types/vectorofvectors.py,sha256=cic9PsZ5EptQ6RMsykYeVHA8T7fh_KBZCcqeTP4i1wU,24395
 lgdo/types/vovutils.py,sha256=7BWPP0BSj-92ifbCIUBcfqxG5-TS8uxujTyJJuDFI04,10302
 lgdo/types/waveformtable.py,sha256=f2tS4f1OEoYaTM5ldCX9zmw8iSISCT3t3wS1SrPdu_o,9901
-legend_pydataobj-1.10.
-legend_pydataobj-1.10.
-legend_pydataobj-1.10.
-legend_pydataobj-1.10.
-legend_pydataobj-1.10.
-legend_pydataobj-1.10.
+legend_pydataobj-1.10.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+legend_pydataobj-1.10.2.dist-info/METADATA,sha256=mipAxDDjt7yxG50BWppw9P612g_4p4sb1IzWjTHLDUw,44381
+legend_pydataobj-1.10.2.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+legend_pydataobj-1.10.2.dist-info/entry_points.txt,sha256=Uu5MTlppBZxB4QGlLv-oX8FqACWjAZDNii__TBDJwLQ,72
+legend_pydataobj-1.10.2.dist-info/top_level.txt,sha256=KyR-EUloqiXcQ62IWnzBmtInDtvsHl4q2ZJAZgTcLXE,5
+legend_pydataobj-1.10.2.dist-info/RECORD,,
lgdo/_version.py
CHANGED
lgdo/compression/radware.py
CHANGED
@@ -441,15 +441,11 @@ def _radware_sigcompress_encode(
         while (i < sig_in.size) and (i < j + 48):
             si_i = int16(sig_in[i] + shift)
             si_im1 = int16(sig_in[i - 1] + shift)
-            if max1 < si_i:
-                max1 = si_i
-            if min1 > si_i:
-                min1 = si_i
+            max1 = max(max1, si_i)
+            min1 = min(min1, si_i)
             ds = si_i - si_im1
-            if max2 < ds:
-                max2 = ds
-            if min2 > ds:
-                min2 = ds
+            max2 = max(max2, ds)
+            min2 = min(min2, ds)
             nw += 1
             i += 1
         if max1 - min1 <= max2 - min2:  # use absolute values
@@ -460,15 +456,13 @@ def _radware_sigcompress_encode(
                 i < j + 128
             ):  # FIXME: 128 could be tuned better?
                 si_i = int16(sig_in[i] + shift)
-                if max1 < si_i:
-                    max1 = si_i
+                max1 = max(max1, si_i)
                 dd1 = max1 - min1
                 if min1 > si_i:
                     dd1 = max1 - si_i
                 if dd1 > mask[nb1]:
                     break
-                if min1 > si_i:
-                    min1 = si_i
+                min1 = min(min1, si_i)
                 nw += 1
                 i += 1
         else:  # use difference values
@@ -481,15 +475,13 @@ def _radware_sigcompress_encode(
                 si_i = int16(sig_in[i] + shift)
                 si_im1 = int16(sig_in[i - 1] + shift)
                 ds = si_i - si_im1
-                if max2 < ds:
-                    max2 = ds
+                max2 = max(max2, ds)
                 dd2 = max2 - min2
                 if min2 > ds:
                     dd2 = max2 - ds
                 if dd2 > mask[nb2]:
                     break
-                if min2 > ds:
-                    min2 = ds
+                min2 = min(min2, ds)
                 nw += 1
                 i += 1
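The three hunks above replace explicit compare-and-assign branches with Python's built-in min/max while tracking the running extrema of the signal and of its first differences. A small self-contained check of the equivalence (the values are invented for illustration):

si_i, max1, min1 = 7, 5, 3

# old pattern (as removed above)
old_max, old_min = max1, min1
if old_max < si_i:
    old_max = si_i
if old_min > si_i:
    old_min = si_i

# new pattern (as added above)
new_max = max(max1, si_i)
new_min = min(min1, si_i)

assert (old_max, old_min) == (new_max, new_min) == (7, 3)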
lgdo/lh5/_serializers/read/composite.py
CHANGED
@@ -103,8 +103,7 @@ def _h5_read_lgdo(
     if idx is not None:
         # check if idx is just an ordered list of the integers if so can ignore
         if (idx == np.arange(0, len(idx), 1)).all():
-            if n_rows > len(idx):
-                n_rows = len(idx)
+            n_rows = min(n_rows, len(idx))
             idx = None
         else:
             # chop off indices < start_row
lgdo/lh5/_serializers/read/ndarray.py
CHANGED
@@ -43,15 +43,14 @@ def _h5_read_ndarray(
     if idx is not None:
         if len(idx) > 0 and idx[-1] >= ds_n_rows:
             log.warning("idx indexed past the end of the array in the file. Culling...")
-            n_rows_to_read = bisect_left(idx
-            idx =
+            n_rows_to_read = bisect_left(idx, ds_n_rows)
+            idx = idx[:n_rows_to_read]
             if len(idx) == 0:
                 log.warning("idx empty after culling.")
         n_rows_to_read = len(idx)
     else:
         n_rows_to_read = ds_n_rows - start_row
-    if n_rows < n_rows_to_read:
-        n_rows_to_read = n_rows
+    n_rows_to_read = min(n_rows_to_read, n_rows)

     if idx is None:
         fspace.select_hyperslab(
@@ -112,6 +111,6 @@ def _h5_read_ndarray(
     # special handling for bools
     # (c and Julia store as uint8 so cast to bool)
     if datatype.get_nested_datatype_string(attrs["datatype"]) == "bool":
-        nda = nda.astype(np.bool_)
+        nda = nda.astype(np.bool_, copy=False)

     return (nda, attrs, n_rows_to_read)
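The culling branch above assumes idx is sorted, so bisect_left(idx, ds_n_rows) is exactly the number of requested indices that still fall inside the dataset. A standalone sketch with made-up numbers:

from bisect import bisect_left

import numpy as np

ds_n_rows = 10                      # rows present in the on-disk dataset
idx = np.array([2, 5, 9, 12, 15])   # sorted request; the last two are past the end

n_rows_to_read = bisect_left(idx, ds_n_rows)   # -> 3
idx = idx[:n_rows_to_read]                     # -> array([2, 5, 9]), out-of-range entries culled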
lgdo/lh5/_serializers/read/scalar.py
CHANGED
@@ -7,7 +7,7 @@ import numpy as np

 from ....types import Scalar
 from ...exceptions import LH5DecodeError
-from .
+from . import utils

 log = logging.getLogger(__name__)

@@ -22,7 +22,7 @@ def _h5_read_scalar(
     sp = h5py.h5s.create(h5py.h5s.SCALAR)
     h5d.read(sp, sp, value)
     value = value[()]
-    attrs = read_attrs(h5d, fname, oname)
+    attrs = utils.read_attrs(h5d, fname, oname)

     # special handling for bools
     # (c and Julia store as uint8 so cast to bool)
lgdo/lh5/_serializers/read/utils.py
CHANGED
@@ -1,9 +1,16 @@
 from __future__ import annotations

+import logging
+
 import h5py
 import numpy as np

+from .... import types
+from ... import datatype
 from ...exceptions import LH5DecodeError
+from . import scalar
+
+log = logging.getLogger(__name__)


 def check_obj_buf_attrs(attrs, new_attrs, fname, oname):
@@ -23,7 +30,7 @@ def read_attrs(h5o, fname, oname):
         h5a = h5py.h5a.open(h5o, index=i_attr)
         name = h5a.get_name().decode()
         if h5a.shape != ():
-            msg = f"attribute {
+            msg = f"attribute {oname} is not a string or scalar"
             raise LH5DecodeError(msg, fname, oname)
         val = np.empty((), h5a.dtype)
         h5a.read(val)
@@ -33,3 +40,135 @@ def read_attrs(h5o, fname, oname):
         attrs[name] = val.item()
         h5a.close()
     return attrs
+
+
+def read_n_rows(h5o, fname, oname):
+    """Read number of rows in LH5 object"""
+    if not h5py.h5a.exists(h5o, b"datatype"):
+        msg = "missing 'datatype' attribute"
+        raise LH5DecodeError(msg, fname, oname)
+
+    h5a = h5py.h5a.open(h5o, b"datatype")
+    type_attr = np.empty((), h5a.dtype)
+    h5a.read(type_attr)
+    type_attr = type_attr.item().decode()
+    lgdotype = datatype.datatype(type_attr)
+
+    # scalars are dim-0 datasets
+    if lgdotype is types.Scalar:
+        return None
+
+    # structs don't have rows
+    if lgdotype is types.Struct:
+        return None
+
+    # tables should have elements with all the same length
+    if lgdotype is types.Table:
+        # read out each of the fields
+        rows_read = None
+        for field in datatype.get_struct_fields(type_attr):
+            obj = h5py.h5o.open(h5o, field.encode())
+            n_rows_read = read_n_rows(obj, fname, field)
+            obj.close()
+            if not rows_read:
+                rows_read = n_rows_read
+            elif rows_read != n_rows_read:
+                log.warning(
+                    f"'{field}' field in table '{oname}' has {rows_read} rows, "
+                    f"{n_rows_read} was expected"
+                )
+
+        return rows_read
+
+    # length of vector of vectors is the length of its cumulative_length
+    if lgdotype is types.VectorOfVectors:
+        obj = h5py.h5o.open(h5o, b"cumulative_length")
+        n_rows = read_n_rows(obj, fname, "cumulative_length")
+        obj.close()
+        return n_rows
+
+    # length of vector of encoded vectors is the length of its decoded_size
+    if lgdotype in (types.VectorOfEncodedVectors, types.ArrayOfEncodedEqualSizedArrays):
+        obj = h5py.h5o.open(h5o, b"encoded_data")
+        n_rows = read_n_rows(obj, fname, "encoded_data")
+        obj.close()
+        return n_rows
+
+    # return array length (without reading the array!)
+    if issubclass(lgdotype, types.Array):
+        # compute the number of rows to read
+        return h5o.get_space().shape[0]
+
+    msg = f"don't know how to read rows of LGDO {lgdotype.__name__}"
+    raise LH5DecodeError(msg, fname, oname)
+
+
+def read_size_in_bytes(h5o, fname, oname, field_mask=None):
+    """Read number size in LH5 object in memory (in B)"""
+    if not h5py.h5a.exists(h5o, b"datatype"):
+        msg = "missing 'datatype' attribute"
+        raise LH5DecodeError(msg, fname, oname)
+
+    h5a = h5py.h5a.open(h5o, b"datatype")
+    type_attr = np.empty((), h5a.dtype)
+    h5a.read(type_attr)
+    type_attr = type_attr.item().decode()
+    lgdotype = datatype.datatype(type_attr)
+
+    # scalars are dim-0 datasets
+    if lgdotype in (
+        types.Scalar,
+        types.Array,
+        types.ArrayOfEqualSizedArrays,
+        types.FixedSizeArray,
+    ):
+        return int(np.prod(h5o.shape) * h5o.dtype.itemsize)
+
+    # structs don't have rows
+    if lgdotype in (types.Struct, types.Histogram, types.Histogram.Axis):
+        size = 0
+        for key in h5o:
+            obj = h5py.h5o.open(h5o, key)
+            size += read_size_in_bytes(obj, fname, oname, field_mask)
+            obj.close()
+        return size
+
+    # tables should have elements with all the same length
+    if lgdotype in (types.Table, types.WaveformTable):
+        # read out each of the fields
+        size = 0
+        if not field_mask:
+            field_mask = datatype.get_struct_fields(type_attr)
+        for field in field_mask:
+            obj = h5py.h5o.open(h5o, field.encode())
+            size += read_size_in_bytes(obj, fname, field)
+            obj.close()
+        return size
+
+    # length of vector of vectors is the length of its cumulative_length
+    if lgdotype is types.VectorOfVectors:
+        size = 0
+        obj = h5py.h5o.open(h5o, b"cumulative_length")
+        size += read_size_in_bytes(obj, fname, "cumulative_length")
+        obj.close()
+        obj = h5py.h5o.open(h5o, b"flattened_data")
+        size += read_size_in_bytes(obj, fname, "flattened_data")
+        obj.close()
+        return size
+
+    # length of vector of encoded vectors is the length of its decoded_size
+    if lgdotype is types.ArrayOfEncodedEqualSizedArrays:
+        obj = h5py.h5o.open(h5o, b"decoded_size")
+        size = scalar._h5_read_scalar(obj, fname, "decoded_size")[0].value
+        obj.close()
+
+        obj = h5py.h5o.open(h5o, b"encoded_data")
+        cl = h5py.h5o.open(obj, b"cumulative_length")
+        size *= cl.shape[0]
+        size *= 4  # TODO: UPDATE WHEN CODECS SUPPORT MORE DTYPES
+        obj.close()
+
+        return size
+
+    msg = f"don't know how to read size of LGDO {lgdotype.__name__}"
+    raise LH5DecodeError(msg, fname, oname)
lgdo/lh5/_serializers/read/vector_of_vectors.py
CHANGED
@@ -156,7 +156,7 @@ def _h5_read_vector_of_vectors(
     # grow fd_buf if necessary to hold the data
     fdb_size = fd_buf_start + fd_n_rows
     if len(fd_buf) < fdb_size:
-        fd_buf.resize(fdb_size)
+        fd_buf.nda.resize(fdb_size, refcheck=False)

     # now read
     h5o = h5py.h5o.open(h5g, b"flattened_data")
lgdo/lh5/core.py
CHANGED
@@ -120,6 +120,7 @@ def read(
         lh5_obj = lh5_file[name]
     else:
         lh5_files = list(lh5_file)
+
         n_rows_read = 0
         obj_buf_is_new = False

@@ -175,6 +176,9 @@ def read(

     if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
         idx = idx[0]
+    if isinstance(idx, np.ndarray) and idx.dtype == np.dtype("?"):
+        idx = np.where(idx)[0]
+
     obj, n_rows_read = _serializers._h5_read_lgdo(
         lh5_obj.id,
         lh5_obj.file.filename,
@@ -350,5 +354,8 @@ def read_as(
     # NOTE: providing a buffer does not make much sense
     obj = read(name, lh5_file, **kwargs1)

+    if isinstance(obj, tuple):
+        obj = obj[0]
+
     # and finally return a view
     return obj.view_as(library, **kwargs2)
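With the new np.dtype("?") check, read() (and the equivalent change in LH5Store.read further below) accepts a boolean mask as idx and converts it to integer positions internally. A sketch of the conversion; the lgdo calls in the comments use hypothetical file and group names:

import numpy as np

mask = np.array([True, False, True, True, False])
positions = np.where(mask)[0]   # -> array([0, 2, 3]); what read() now does with a bool idx

# equivalent requests (hypothetical names, for illustration only):
# lgdo.lh5.read("geds/raw", "data.lh5", idx=mask)
# lgdo.lh5.read("geds/raw", "data.lh5", idx=positions)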
lgdo/lh5/iterator.py
CHANGED
@@ -2,11 +2,14 @@ from __future__ import annotations

 import logging
 import typing
+from warnings import warn

 import numpy as np
 import pandas as pd
+from numpy.typing import NDArray

 from ..types import Array, Scalar, Struct, VectorOfVectors
+from ..units import default_units_registry as ureg
 from .store import LH5Store
 from .utils import expand_path

@@ -19,35 +22,53 @@ class LH5Iterator(typing.Iterator):
     at a time. This also accepts an entry list/mask to enable event selection,
     and a field mask.

-    This
+    This can be used as an iterator:

-    >>> lh5_obj, n_rows
-
-    to read the block of entries starting at entry. In case of multiple files
-    or the use of an event selection, entry refers to a global event index
-    across files and does not count events that are excluded by the selection.
-
-    This can also be used as an iterator:
-
-    >>> for lh5_obj, entry, n_rows in LH5Iterator(...):
+    >>> for lh5_obj, i_entry, n_rows in LH5Iterator(...):
     >>>     # do the thing!

-    This is intended for if you are reading a large quantity of data
-
+    This is intended for if you are reading a large quantity of data. This
+    will ensure that you traverse files efficiently to minimize caching time
+    and will limit your memory usage (particularly when reading in waveforms!).
     The ``lh5_obj`` that is read by this class is reused in order to avoid
     reallocation of memory; this means that if you want to hold on to data
     between reads, you will have to copy it somewhere!
+
+    When defining an LH5Iterator, you must give it a list of files and the
+    hdf5 groups containing the data tables you are reading. You may also
+    provide a field mask, and an entry list or mask, specifying which entries
+    to read from the files. You may also pair it with a friend iterator, which
+    contains a parallel group of files which will be simultaneously read.
+    In addition to accessing requested data via ``lh5_obj``, several
+    properties exist to tell you where that data came from:
+
+    - lh5_it.current_local_entries: get the entry numbers relative to the
+      file the data came from
+    - lh5_it.current_global_entries: get the entry number relative to the
+      full dataset
+    - lh5_it.current_files: get the file name corresponding to each entry
+    - lh5_it.current_groups: get the group name corresponding to each entry
+
+    This class can also be used either for random access:
+
+    >>> lh5_obj, n_rows = lh5_it.read(i_entry)
+
+    to read the block of entries starting at i_entry. In case of multiple files
+    or the use of an event selection, i_entry refers to a global event index
+    across files and does not count events that are excluded by the selection.
     """

     def __init__(
         self,
         lh5_files: str | list[str],
-        groups: str | list[str],
+        groups: str | list[str] | list[list[str]],
         base_path: str = "",
         entry_list: list[int] | list[list[int]] | None = None,
         entry_mask: list[bool] | list[list[bool]] | None = None,
         field_mask: dict[str, bool] | list[str] | tuple[str] | None = None,
-        buffer_len: int =
+        buffer_len: int = "100*MB",
+        file_cache: int = 10,
+        file_map: NDArray[int] = None,
         friend: typing.Iterator | None = None,
     ) -> None:
         """
@@ -57,9 +78,10 @@ class LH5Iterator(typing.Iterator):
            file or files to read from. May include wildcards and environment
            variables.
        groups
-            HDF5 group(s) to read. If a list is provided
-
-
+            HDF5 group(s) to read. If a list of strings is provided, use
+            same groups for each file. If a list of lists is provided, size
+            of outer list must match size of file list, and each inner list
+            will apply to a single file (or set of wildcarded files)
        entry_list
            list of entry numbers to read. If a nested list is provided,
            expect one top-level list for each file, containing a list of
@@ -72,66 +94,98 @@ class LH5Iterator(typing.Iterator):
            more details.
        buffer_len
            number of entries to read at a time while iterating through files.
+        file_cache
+            maximum number of files to keep open at a time
+        file_map
+            cumulative file/group entries. This can be provided on construction
+            to speed up random or sparse access; otherwise, we sequentially
+            read the size of each group. WARNING: no checks for accuracy are
+            performed so only use this if you know what you are doing!
        friend
            a \"friend\" LH5Iterator that will be read in parallel with this.
            The friend should have the same length and entry list. A single
            LH5 table containing columns from both iterators will be returned.
+            Note that buffer_len will be set to the minimum of the two.
        """
-        self.lh5_st = LH5Store(base_path=base_path, keep_open=
+        self.lh5_st = LH5Store(base_path=base_path, keep_open=file_cache)

         # List of files, with wildcards and env vars expanded
         if isinstance(lh5_files, str):
             lh5_files = [lh5_files]
-
-            lh5_files *= len(groups)
-        elif not isinstance(lh5_files, list):
+        elif not isinstance(lh5_files, (list, set, tuple)):
             msg = "lh5_files must be a string or list of strings"
             raise ValueError(msg)

         if isinstance(groups, str):
-            groups = [groups] * len(lh5_files)
+            groups = [[groups]] * len(lh5_files)
         elif not isinstance(groups, list):
-            msg = "group must be a string or list
+            msg = "group must be a string or appropriate list"
+            raise ValueError(msg)
+        elif all(isinstance(g, str) for g in groups):
+            groups = [groups] * len(lh5_files)
+        elif len(groups) == len(lh5_files) and all(
+            isinstance(gr_list, (list, set, tuple)) for gr_list in groups
+        ):
+            pass
+        else:
+            msg = "group must be a string or appropriate list"
             raise ValueError(msg)

         if len(groups) != len(lh5_files):
             msg = "lh5_files and groups must have same length"
             raise ValueError(msg)

+        # make flattened outer-product-like list of files and groups
         self.lh5_files = []
         self.groups = []
         for f, g in zip(lh5_files, groups):
-            f_exp
-
-
+            for f_exp in expand_path(f, list=True, base_path=base_path):
+                self.lh5_files += [f_exp] * len(g)
+                self.groups += list(g)

         if entry_list is not None and entry_mask is not None:
             msg = "entry_list and entry_mask arguments are mutually exclusive"
             raise ValueError(msg)

         # Map to last row in each file
-
+        if file_map is None:
+            self.file_map = np.full(len(self.lh5_files), np.iinfo("q").max, "q")
+        else:
+            self.file_map = np.array(file_map)
+
         # Map to last iterator entry for each file
-        self.entry_map = np.full(len(self.lh5_files), np.iinfo("
+        self.entry_map = np.full(len(self.lh5_files), np.iinfo("q").max, "q")
         self.buffer_len = buffer_len

         if len(self.lh5_files) > 0:
             f = self.lh5_files[0]
             g = self.groups[0]
+            n_rows = self.lh5_st.read_n_rows(g, f)
+
+            if isinstance(self.buffer_len, str):
+                self.buffer_len = ureg.Quantity(buffer_len)
+            if isinstance(self.buffer_len, ureg.Quantity):
+                self.buffer_len = int(
+                    self.buffer_len
+                    / (self.lh5_st.read_size_in_bytes(g, f) * ureg.B)
+                    * n_rows
+                )
+
             self.lh5_buffer = self.lh5_st.get_buffer(
                 g,
                 f,
                 size=self.buffer_len,
                 field_mask=field_mask,
             )
-
+            if file_map is None:
+                self.file_map[0] = n_rows
         else:
             msg = f"can't open any files from {lh5_files}"
             raise RuntimeError(msg)

         self.n_rows = 0
-        self.
-        self.
+        self.current_i_entry = 0
+        self.next_i_entry = 0

         self.field_mask = field_mask

@@ -142,13 +196,13 @@ class LH5Iterator(typing.Iterator):
             entry_list = list(entry_list)
             if isinstance(entry_list[0], int):
                 self.local_entry_list = [None] * len(self.file_map)
-                self.global_entry_list = np.array(entry_list, "
+                self.global_entry_list = np.array(entry_list, "q")
                 self.global_entry_list.sort()

             else:
                 self.local_entry_list = [[]] * len(self.file_map)
                 for i_file, local_list in enumerate(entry_list):
-                    self.local_entry_list[i_file] = np.array(local_list, "
+                    self.local_entry_list[i_file] = np.array(local_list, "q")
                     self.local_entry_list[i_file].sort()

         elif entry_mask is not None:
@@ -168,6 +222,15 @@ class LH5Iterator(typing.Iterator):
             if not isinstance(friend, typing.Iterator):
                 msg = "Friend must be an Iterator"
                 raise ValueError(msg)
+
+            # set buffer_lens to be equal
+            if self.buffer_len < friend.buffer_len:
+                friend.buffer_len = self.buffer_len
+                friend.lh5_buffer.resize(self.buffer_len)
+            elif self.buffer_len > friend.buffer_len:
+                self.buffer_len = friend.buffer_len
+                self.lh5_buffer.resize(friend.buffer_len)
+
             self.lh5_buffer.join(friend.lh5_buffer)
             self.friend = friend

@@ -176,33 +239,52 @@ class LH5Iterator(typing.Iterator):
         if i_file < 0:
             return 0
         fcl = self.file_map[i_file]
-
-
-
-        )
-        self.file_map[
+
+        # if we haven't already calculated, calculate for all files up to i_file
+        if fcl == np.iinfo("q").max:
+            i_start = np.searchsorted(self.file_map, np.iinfo("q").max)
+            fcl = self.file_map[i_start - 1] if i_start > 0 else 0
+
+            for i in range(i_start, i_file + 1):
+                fcl += self.lh5_st.read_n_rows(self.groups[i], self.lh5_files[i])
+                self.file_map[i] = fcl
         return fcl

+    @property
+    def current_entry(self) -> int:
+        "deprecated alias for current_i_entry"
+        warn(
+            "current_entry has been renamed to current_i_entry.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        return self.current_i_entry
+
     def _get_file_cumentries(self, i_file: int) -> int:
         """Helper to get cumulative iterator entries in file"""
         if i_file < 0:
             return 0
         n = self.entry_map[i_file]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        # if we haven't already calculated, calculate for all files up to i_file
+        if n == np.iinfo("q").max:
+            i_start = np.searchsorted(self.entry_map, np.iinfo("q").max)
+            n = self.entry_map[i_start - 1] if i_start > 0 else 0
+
+            for i in range(i_start, i_file + 1):
+                elist = self.get_file_entrylist(i)
+                fcl = self._get_file_cumlen(i)
+                if elist is None:
+                    # no entry list provided
+                    n = fcl
+                else:
+                    n += len(elist)
+                    # check that file entries fall inside of file
+                    if len(elist) > 0 and elist[-1] >= fcl:
+                        logging.warning(f"Found entries out of range for file {i}")
+                        n += np.searchsorted(elist, fcl, "right") - len(elist)
+                self.entry_map[i] = n
         return n

     def get_file_entrylist(self, i_file: int) -> np.ndarray:
@@ -218,50 +300,50 @@ class LH5Iterator(typing.Iterator):
         f_end = self._get_file_cumlen(i_file)
         i_start = self._get_file_cumentries(i_file - 1)
         i_stop = np.searchsorted(self.global_entry_list, f_end, "right")
-        elist = np.array(self.global_entry_list[i_start:i_stop], "
+        elist = np.array(self.global_entry_list[i_start:i_stop], "q") - f_start
         self.local_entry_list[i_file] = elist
         return elist

     def get_global_entrylist(self) -> np.ndarray:
         """Get global entry list, constructing it if needed"""
         if self.global_entry_list is None and self.local_entry_list is not None:
-            self.global_entry_list = np.zeros(len(self), "
+            self.global_entry_list = np.zeros(len(self), "q")
             for i_file in range(len(self.lh5_files)):
-                i_start = self.
-                i_stop = self.
-                f_start = self.
+                i_start = self._get_file_cumentries(i_file - 1)
+                i_stop = self._get_file_cumentries(i_file)
+                f_start = self._get_file_cumlen(i_file - 1)
                 self.global_entry_list[i_start:i_stop] = (
                     self.get_file_entrylist(i_file) + f_start
                 )
         return self.global_entry_list

-    def read(self,
-        """Read the nextlocal chunk of events, starting at
+    def read(self, i_entry: int) -> tuple[LGDO, int]:
+        """Read the nextlocal chunk of events, starting at i_entry. Return the
         LH5 buffer and number of rows read."""
         self.n_rows = 0
-        i_file = np.searchsorted(self.entry_map,
+        i_file = np.searchsorted(self.entry_map, i_entry, "right")

         # if file hasn't been opened yet, search through files
         # sequentially until we find the right one
-        if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("
-            while i_file < len(self.lh5_files) and
+        if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("q").max:
+            while i_file < len(self.lh5_files) and i_entry >= self._get_file_cumentries(
                 i_file
             ):
                 i_file += 1

         if i_file == len(self.lh5_files):
             return (self.lh5_buffer, self.n_rows)
-
+        local_i_entry = i_entry - self._get_file_cumentries(i_file - 1)

         while self.n_rows < self.buffer_len and i_file < len(self.file_map):
             # Loop through files
             local_idx = self.get_file_entrylist(i_file)
             if local_idx is not None and len(local_idx) == 0:
                 i_file += 1
-
+                local_i_entry = 0
                 continue

-            i_local =
+            i_local = local_i_entry if local_idx is None else local_idx[local_i_entry]
             self.lh5_buffer, n_rows = self.lh5_st.read(
                 self.groups[i_file],
                 self.lh5_files[i_file],
@@ -275,12 +357,12 @@ class LH5Iterator(typing.Iterator):

             self.n_rows += n_rows
             i_file += 1
-
+            local_i_entry = 0

-        self.
+        self.current_i_entry = i_entry

         if self.friend is not None:
-            self.friend.read(
+            self.friend.read(i_entry)

         return (self.lh5_buffer, self.n_rows)

@@ -290,6 +372,108 @@ class LH5Iterator(typing.Iterator):
         if self.friend is not None:
             self.friend.reset_field_mask(mask)

+    @property
+    def current_local_entries(self) -> NDArray[int]:
+        """Return list of local file entries in buffer"""
+        cur_entries = np.zeros(self.n_rows, dtype="int32")
+        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
+        file_start = self._get_file_cumentries(i_file - 1)
+        i_local = self.current_i_entry - file_start
+        i = 0
+
+        while i < len(cur_entries):
+            # number of entries to read from this file
+            file_end = self._get_file_cumentries(i_file)
+            n = min(file_end - file_start - i_local, len(cur_entries) - i)
+            entries = self.get_file_entrylist(i_file)
+
+            if entries is None:
+                cur_entries[i : i + n] = np.arange(i_local, i_local + n)
+            else:
+                cur_entries[i : i + n] = entries[i_local : i_local + n]
+
+            i_file += 1
+            file_start = file_end
+            i_local = 0
+            i += n
+
+        return cur_entries
+
+    @property
+    def current_global_entries(self) -> NDArray[int]:
+        """Return list of local file entries in buffer"""
+        cur_entries = np.zeros(self.n_rows, dtype="int32")
+        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
+        file_start = self._get_file_cumentries(i_file - 1)
+        i_local = self.current_i_entry - file_start
+        i = 0
+
+        while i < len(cur_entries):
+            # number of entries to read from this file
+            file_end = self._get_file_cumentries(i_file)
+            n = min(file_end - file_start - i_local, len(cur_entries) - i)
+            entries = self.get_file_entrylist(i_file)
+
+            if entries is None:
+                cur_entries[i : i + n] = self._get_file_cumlen(i_file - 1) + np.arange(
+                    i_local, i_local + n
+                )
+            else:
+                cur_entries[i : i + n] = (
+                    self._get_file_cumlen(i_file - 1) + entries[i_local : i_local + n]
+                )
+
+            i_file += 1
+            file_start = file_end
+            i_local = 0
+            i += n
+
+        return cur_entries
+
+    @property
+    def current_files(self) -> NDArray[str]:
+        """Return list of file names for entries in buffer"""
+        cur_files = np.zeros(self.n_rows, dtype=object)
+        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
+        file_start = self._get_file_cumentries(i_file - 1)
+        i_local = self.current_i_entry - file_start
+        i = 0
+
+        while i < len(cur_files):
+            # number of entries to read from this file
+            file_end = self._get_file_cumentries(i_file)
+            n = min(file_end - file_start - i_local, len(cur_files) - i)
+            cur_files[i : i + n] = self.lh5_files[i_file]
+
+            i_file += 1
+            file_start = file_end
+            i_local = 0
+            i += n
+
+        return cur_files
+
+    @property
+    def current_groups(self) -> NDArray[str]:
+        """Return list of group names for entries in buffer"""
+        cur_groups = np.zeros(self.n_rows, dtype=object)
+        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
+        file_start = self._get_file_cumentries(i_file - 1)
+        i_local = self.current_i_entry - file_start
+        i = 0
+
+        while i < len(cur_groups):
+            # number of entries to read from this file
+            file_end = self._get_file_cumentries(i_file)
+            n = min(file_end - file_start - i_local, len(cur_groups) - i)
+            cur_groups[i : i + n] = self.groups[i_file]
+
+            i_file += 1
+            file_start = file_end
+            i_local = 0
+            i += n
+
+        return cur_groups
+
     def __len__(self) -> int:
         """Return the total number of entries."""
         return (
@@ -300,15 +484,15 @@ class LH5Iterator(typing.Iterator):

     def __iter__(self) -> typing.Iterator:
         """Loop through entries in blocks of size buffer_len."""
-        self.
-        self.
+        self.current_i_entry = 0
+        self.next_i_entry = 0
         return self

     def __next__(self) -> tuple[LGDO, int, int]:
         """Read next buffer_len entries and return lh5_table, iterator entry
         and n_rows read."""
-        buf, n_rows = self.read(self.
-        self.
+        buf, n_rows = self.read(self.next_i_entry)
+        self.next_i_entry = self.current_i_entry + n_rows
         if n_rows == 0:
             raise StopIteration
-        return (buf, self.
+        return (buf, self.current_i_entry, n_rows)
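A hedged usage sketch tying the new constructor arguments and buffer-provenance properties together; the file and group names are invented, and process() is a stand-in for user code:

from lgdo.lh5.iterator import LH5Iterator

it = LH5Iterator(
    ["run1.lh5", "run2.lh5"],  # hypothetical files (wildcards also allowed)
    "geds/raw",                # hypothetical HDF5 group, reused for every file
    buffer_len="100*MB",       # sized in memory, converted to rows internally
    file_cache=10,             # keep at most 10 HDF5 files open at a time
)

for lh5_obj, i_entry, n_rows in it:
    # lh5_obj is reused between iterations, so copy anything you need to keep
    src_files = it.current_files           # provenance of each buffered entry
    local_rows = it.current_local_entries  # entry numbers within each source file
    process(lh5_obj, n_rows)               # placeholder for user processing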
lgdo/lh5/store.py
CHANGED
@@ -9,6 +9,7 @@ import bisect
 import logging
 import os
 import sys
+from collections import OrderedDict
 from collections.abc import Mapping, Sequence
 from inspect import signature
 from typing import Any
@@ -47,14 +48,15 @@ class LH5Store:
            directory path to prepend to LH5 files.
        keep_open
            whether to keep files open by storing the :mod:`h5py` objects as
-            class attributes.
+            class attributes. If ``keep_open`` is an ``int``, keep only the
+            ``n`` most recently opened files; if ``True``, no limit
        locking
            whether to lock files when reading
        """
        self.base_path = "" if base_path == "" else utils.expand_path(base_path)
        self.keep_open = keep_open
        self.locking = locking
-        self.files =
+        self.files = OrderedDict()

     def gimme_file(
         self,
@@ -87,6 +89,7 @@ class LH5Store:
            file_kwargs["locking"] = self.locking

        if lh5_file in self.files:
+            self.files.move_to_end(lh5_file)
            return self.files[lh5_file]

        if self.base_path != "":
@@ -120,6 +123,8 @@ class LH5Store:
        h5f = h5py.File(full_path, mode, **file_kwargs)

        if self.keep_open:
+            if isinstance(self.keep_open, int) and len(self.files) >= self.keep_open:
+                self.files.popitem(last=False)
            self.files[lh5_file] = h5f

        return h5f
@@ -228,6 +233,9 @@ class LH5Store:

        if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
            idx = idx[0]
+        if isinstance(idx, np.ndarray) and idx.dtype == np.dtype("?"):
+            idx = np.where(idx)[0]
+
        return _serializers._h5_read_lgdo(
            lh5_obj.id,
            lh5_obj.file.filename,
@@ -307,3 +315,9 @@ class LH5Store:
        Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`.
        """
        return utils.read_n_rows(name, self.gimme_file(lh5_file, "r"))
+
+    def read_size_in_bytes(self, name: str, lh5_file: str | h5py.File) -> int:
+        """Look up the size (in B) of the object in memory. Will recursively
+        crawl through all objects in a Struct or Table
+        """
+        return utils.read_size_in_bytes(name, self.gimme_file(lh5_file, "r"))
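When keep_open is an integer, the store now behaves like a small LRU cache of open h5py file handles. A minimal standalone sketch of that policy in isolation (strings stand in for real h5py.File objects):

from collections import OrderedDict

files = OrderedDict()
keep_open = 2  # same meaning as LH5Store(keep_open=2)

def get_handle(name):
    if name in files:            # reuse and mark as most recently used
        files.move_to_end(name)
        return files[name]
    if len(files) >= keep_open:  # evict the least recently used handle
        files.popitem(last=False)
    files[name] = f"<h5py.File {name}>"  # stand-in for the real handle
    return files[name]

for f in ["a.lh5", "b.lh5", "a.lh5", "c.lh5"]:
    get_handle(f)

print(list(files))  # ['a.lh5', 'c.lh5'] -- 'b.lh5' was evicted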
lgdo/lh5/utils.py
CHANGED
@@ -12,7 +12,7 @@ from typing import Any
 import h5py

 from .. import types
-from . import _serializers
+from . import _serializers
 from .exceptions import LH5DecodeError

 log = logging.getLogger(__name__)
@@ -44,57 +44,31 @@ def read_n_rows(name: str, h5f: str | h5py.File) -> int | None:
     Return ``None`` if `name` is a :class:`.Scalar` or a :class:`.Struct`.
     """
     if not isinstance(h5f, h5py.File):
-        h5f = h5py.File(h5f, "r")
+        h5f = h5py.File(h5f, "r", locking=False)

     try:
-
+        h5o = h5f[name].id
     except KeyError as e:
         msg = "not found"
         raise LH5DecodeError(msg, h5f, name) from e
-    except AttributeError as e:
-        msg = "missing 'datatype' attribute"
-        raise LH5DecodeError(msg, h5f, name) from e

-
-
-    # scalars are dim-0 datasets
-    if lgdotype is types.Scalar:
-        return None
-
-    # structs don't have rows
-    if lgdotype is types.Struct:
-        return None
-
-    # tables should have elements with all the same length
-    if lgdotype is types.Table:
-        # read out each of the fields
-        rows_read = None
-        for field in datatype.get_struct_fields(attrs["datatype"]):
-            n_rows_read = read_n_rows(name + "/" + field, h5f)
-            if not rows_read:
-                rows_read = n_rows_read
-            elif rows_read != n_rows_read:
-                log.warning(
-                    f"'{field}' field in table '{name}' has {rows_read} rows, "
-                    f"{n_rows_read} was expected"
-                )
-        return rows_read
+    return _serializers.read.utils.read_n_rows(h5o, h5f.name, name)

-    # length of vector of vectors is the length of its cumulative_length
-    if lgdotype is types.VectorOfVectors:
-        return read_n_rows(f"{name}/cumulative_length", h5f)

-
-
-
+def read_size_in_bytes(name: str, h5f: str | h5py.File) -> int | None:
+    """Look up the size (in B) in an LGDO object in memory. Will crawl
+    recursively through members of a Struct or Table
+    """
+    if not isinstance(h5f, h5py.File):
+        h5f = h5py.File(h5f, "r", locking=False)

-
-
-
-
+    try:
+        h5o = h5f[name].id
+    except KeyError as e:
+        msg = "not found"
+        raise LH5DecodeError(msg, h5f, name) from e

-
-    raise LH5DecodeError(msg, h5f, name)
+    return _serializers.read.utils.read_size_in_bytes(h5o, h5f.name, name)


 def get_h5_group(
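Both module-level helpers accept either a path or an already-open h5py.File and now delegate to the serializer-level implementations shown earlier in this diff. A hedged usage sketch (object and file names are invented):

from lgdo.lh5 import utils

n_rows = utils.read_n_rows("geds/raw", "data.lh5")          # rows, without reading the data
n_bytes = utils.read_size_in_bytes("geds/raw", "data.lh5")  # approximate in-memory size

print(n_rows, n_bytes / 1e6, "MB")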
lgdo/types/histogram.py
CHANGED
@@ -1,11 +1,12 @@
 from __future__ import annotations

 import logging
-from collections.abc import Iterable
+from collections.abc import Iterable, Mapping, Sequence
 from typing import Any

 import hist
 import numpy as np
+import pandas as pd
 from numpy.typing import NDArray

 from .array import Array
@@ -269,10 +270,10 @@ class Histogram(Struct):
                 b.append(Histogram.Axis.from_edges(ax.edges, binedge_attrs))
         else:
             if binning is None:
-                msg = "need to
+                msg = "need to pass binning to construct Histogram"
                 raise ValueError(msg)
-            w = weights if isinstance(weights, Array) else Array(weights)

+            # set up binning
             if all(isinstance(ax, Histogram.Axis) for ax in binning):
                 if binedge_attrs is not None:
                     msg = "passed both binedges as Axis instances and binedge_attrs"
@@ -286,6 +287,14 @@ class Histogram(Struct):
                 msg = "invalid binning object passed"
                 raise ValueError(msg)

+            # set up bin weights
+            if isinstance(weights, Array):
+                w = weights
+            elif weights is None:
+                w = Array(shape=[ax.nbins for ax in b], fill_val=0, dtype=np.float32)
+            else:
+                w = Array(weights)
+
             if len(binning) != len(w.nda.shape):
                 msg = "binning and weight dimensions do not match"
                 raise ValueError(msg)
@@ -315,6 +324,98 @@ class Histogram(Struct):
         assert all(isinstance(v, Histogram.Axis) for k, v in bins)
         return tuple(v for _, v in bins)

+    def fill(self, data, w: NDArray = None, keys: Sequence[str] = None) -> None:
+        """Fill histogram by incrementing bins with data points weighted by w
+
+        Parameters
+        ----------
+        data
+            a ndarray with inner dimension equal to number of axes, or a list
+            of equal-length 1d-arrays containing data for each axis, or a
+            Mapping to 1d-arrays containing data for each axis (requires keys),
+            or a Pandas dataframe (optionally takes a list of keys)
+        w
+            weight to use for incrementing data points. If None, use 1 for all
+        keys
+            list of keys to use if data is a pandas ''DataFrame'' or ''Mapping''
+        """
+        if keys is not None:
+            if isinstance(keys, str):
+                keys = [keys]
+            elif not isinstance(keys, list):
+                keys = list(keys)
+
+        if (
+            isinstance(data, np.ndarray)
+            and len(data.shape) == 1
+            and len(self.binning) == 1
+        ):
+            N = len(data)
+            data = [data]
+        elif (
+            isinstance(data, np.ndarray)
+            and len(data.shape) == 2
+            and data.shape[1] == len(self.binning)
+        ):
+            N = data.shape[0]
+            data = data.T
+        elif isinstance(data, pd.DataFrame) and (
+            (keys is not None and len(keys) == len(self.binning))
+            or data.ndim == len(self.binning)
+        ):
+            if keys is not None:
+                data = data[keys]
+            N = len(data)
+            data = data.values.T
+        elif isinstance(data, Sequence) and len(data) == len(self.binning):
+            data = [d if isinstance(d, np.ndarray) else np.array(d) for d in data]
+            N = len(data[0])
+            if not all(len(d) == N for d in data):
+                msg = "length of all data arrays must be equal"
+                raise ValueError(msg)
+        elif isinstance(data, Mapping):
+            if not isinstance(keys, Sequence) or len(keys) != len(self.binning):
+                msg = "filling hist with Mapping data requires a list of keys with same length as histogram rank"
+                raise ValueError(msg)
+            data = [
+                data[k] if isinstance(data[k], np.ndarray) else np.array(data[k])
+                for k in keys
+            ]
+            N = len(data[0])
+            if not all(len(d) == N for d in data):
+                msg = "length of all data arrays must be equal"
+                raise ValueError(msg)
+        else:
+            msg = "data must be 2D numpy array or list of 1D arrays with length equal to number of axes"
+            raise ValueError(msg)
+
+        idx = np.zeros(N, np.float64)  # bin indices for flattened array
+        oor_mask = np.ones(N, np.bool_)  # mask to remove out of range values
+        stride = [s // self.weights.dtype.itemsize for s in self.weights.nda.strides]
+        for col, ax, s in zip(data, self.binning, stride):
+            if ax.is_range:
+                idx += s * np.floor((col - ax.first) / ax.step - int(not ax.closedleft))
+                if ax.closedleft:
+                    oor_mask &= (ax.first <= col) & (col < ax.last)
+                else:
+                    oor_mask &= (ax.first < col) & (col <= ax.last)
+            else:
+                idx += s * (
+                    np.searchsorted(
+                        ax.edges, col, side=("right" if ax.closedleft else "left")
+                    )
+                    - 1
+                )
+                if ax.closedleft:
+                    oor_mask &= (ax.edges[0] <= col) & (col < ax.edges[-1])
+                else:
+                    oor_mask &= (ax.edges[0] < col) & (col <= ax.edges[-1])
+
+        # increment bin contents
+        idx = idx[oor_mask].astype(np.int64)
+        w = w[oor_mask] if w is not None else 1
+        np.add.at(self.weights.nda.reshape(-1), idx, w)
+
     def __setitem__(self, name: str, obj: LGDO) -> None:
         # do not allow for new attributes on this
         msg = "histogram fields cannot be mutated"
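A sketch of the new fill() method for the simplest one-dimensional case; the constructor call is an assumption based on the binning/weights handling visible in this diff, not a documented example:

import numpy as np

from lgdo.types.histogram import Histogram

h = Histogram(binning=[np.arange(0, 11)])  # assumed: one axis with edges 0..10

rng = np.random.default_rng(42)
h.fill(rng.uniform(0, 10, size=1000))                 # unit weight per sample
h.fill(np.array([0.5, 1.5]), w=np.array([2.0, 3.0]))  # weighted fill

print(h.weights.nda.sum())  # 1005.0 if every sample falls inside the edges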
lgdo/types/table.py
CHANGED
@@ -450,7 +450,7 @@ class Table(Struct):
         cols = self.keys()

         if library == "pd":
-            df =
+            df = {}

             for col in cols:
                 data = self[col]
@@ -470,7 +470,7 @@ class Table(Struct):
                 )
                 df[f"{prefix}{col}"] = data.view_as("pd", with_units=with_units)

-            return df
+            return pd.DataFrame(df, copy=False)

         if library == "np":
             msg = f"Format {library!r} is not supported for Tables."
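view_as("pd") now collects the columns in a plain dict and builds the DataFrame in one call with copy=False; the caller-facing behavior is unchanged. A short sketch with invented column names:

import numpy as np

from lgdo.types.array import Array
from lgdo.types.table import Table

tbl = Table(col_dict={"energy": Array(np.array([1.0, 2.0, 3.0])),
                      "channel": Array(np.array([0, 1, 2]))})

df = tbl.view_as("pd")      # internally: pd.DataFrame(col_dict, copy=False)
print(df.columns.tolist())  # ['energy', 'channel']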
lgdo/types/vectorofvectors.py
CHANGED
@@ -503,7 +503,7 @@ class VectorOfVectors(LGDO):
     def __str__(self) -> str:
         string = self.view_as("ak").show(stream=None)

-        string = string.strip().removesuffix("]")
+        string = str(string).strip().removesuffix("]")
         string += "\n]"

         tmp_attrs = self.attrs.copy()
@@ -632,7 +632,7 @@ class VectorOfVectors(LGDO):
         offsets = np.empty(
             len(self.cumulative_length) + 1, dtype=self.cumulative_length.dtype
         )
-        offsets[1:] = self.cumulative_length
+        offsets[1:] = self.cumulative_length.nda
         offsets[0] = 0

         content = (

File without changes
File without changes
File without changes