legend-pydataobj 1.8.1__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {legend_pydataobj-1.8.1.dist-info → legend_pydataobj-1.10.0.dist-info}/METADATA +3 -2
- legend_pydataobj-1.10.0.dist-info/RECORD +55 -0
- {legend_pydataobj-1.8.1.dist-info → legend_pydataobj-1.10.0.dist-info}/WHEEL +1 -1
- lgdo/__init__.py +4 -0
- lgdo/_version.py +2 -2
- lgdo/lh5/_serializers/__init__.py +2 -0
- lgdo/lh5/_serializers/read/array.py +9 -9
- lgdo/lh5/_serializers/read/composite.py +122 -70
- lgdo/lh5/_serializers/read/encoded.py +31 -9
- lgdo/lh5/_serializers/read/ndarray.py +51 -37
- lgdo/lh5/_serializers/read/scalar.py +10 -3
- lgdo/lh5/_serializers/read/utils.py +26 -3
- lgdo/lh5/_serializers/read/vector_of_vectors.py +35 -13
- lgdo/lh5/_serializers/write/array.py +6 -1
- lgdo/lh5/_serializers/write/composite.py +20 -4
- lgdo/lh5/_serializers/write/scalar.py +6 -1
- lgdo/lh5/core.py +78 -7
- lgdo/lh5/datatype.py +1 -0
- lgdo/lh5/exceptions.py +3 -3
- lgdo/lh5/store.py +101 -11
- lgdo/lh5/tools.py +1 -1
- lgdo/lh5/utils.py +13 -2
- lgdo/lh5_store.py +1 -0
- lgdo/types/__init__.py +2 -0
- lgdo/types/histogram.py +419 -0
- lgdo/types/table.py +1 -1
- legend_pydataobj-1.8.1.dist-info/RECORD +0 -54
- {legend_pydataobj-1.8.1.dist-info → legend_pydataobj-1.10.0.dist-info}/LICENSE +0 -0
- {legend_pydataobj-1.8.1.dist-info → legend_pydataobj-1.10.0.dist-info}/entry_points.txt +0 -0
- {legend_pydataobj-1.8.1.dist-info → legend_pydataobj-1.10.0.dist-info}/top_level.txt +0 -0
lgdo/lh5/_serializers/read/ndarray.py
CHANGED
@@ -4,17 +4,21 @@ import logging
 import sys
 from bisect import bisect_left
 
+import h5py
 import numpy as np
 
 from ....types import Array
 from ... import datatype
 from ...exceptions import LH5DecodeError
+from .utils import read_attrs
 
 log = logging.getLogger(__name__)
 
 
 def _h5_read_ndarray(
     h5d,
+    fname,
+    oname,
     start_row=0,
     n_rows=sys.maxsize,
     idx=None,
@@ -24,48 +28,49 @@ def _h5_read_ndarray(
 ):
     if obj_buf is not None and not isinstance(obj_buf, Array):
         msg = "object buffer is not an Array"
-        raise LH5DecodeError(msg,
+        raise LH5DecodeError(msg, fname, oname)
 
     # compute the number of rows to read
     # we culled idx above for start_row and n_rows, now we have to apply
     # the constraint of the length of the dataset
     try:
-
+        fspace = h5d.get_space()
+        ds_n_rows = fspace.shape[0]
     except AttributeError as e:
         msg = "does not seem to be an HDF5 dataset"
-        raise LH5DecodeError(msg,
+        raise LH5DecodeError(msg, fname, oname) from e
 
     if idx is not None:
-        if len(idx
+        if len(idx) > 0 and idx[-1] >= ds_n_rows:
             log.warning("idx indexed past the end of the array in the file. Culling...")
             n_rows_to_read = bisect_left(idx[0], ds_n_rows)
-            idx = (idx[
-            if len(idx
+            idx = (idx[:n_rows_to_read],)
+            if len(idx) == 0:
                 log.warning("idx empty after culling.")
-        n_rows_to_read = len(idx
+        n_rows_to_read = len(idx)
     else:
         n_rows_to_read = ds_n_rows - start_row
     if n_rows_to_read > n_rows:
         n_rows_to_read = n_rows
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if idx is None:
+        fspace.select_hyperslab(
+            (start_row,) + (0,) * (h5d.rank - 1),
+            (1,) * h5d.rank,
+            None,
+            (n_rows_to_read,) + fspace.shape[1:],
+        )
+    elif use_h5idx:
+        # Note that h5s will automatically merge adjacent elements into a range
+        fspace.select_none()
+        for i in idx:
+            fspace.select_hyperslab(
+                (i,) + (0,) * (h5d.rank - 1),
+                (1,) * h5d.rank,
+                None,
+                (1,) + fspace.shape[1:],
+                h5py.h5s.SELECT_OR,
+            )
 
     # Now read the array
     if obj_buf is not None and n_rows_to_read > 0:
@@ -74,26 +79,35 @@ def _h5_read_ndarray(
         obj_buf.resize(buf_size)
         dest_sel = np.s_[obj_buf_start:buf_size]
 
-
-
-
-
+        if idx is None or use_h5idx:
+            mspace = h5py.h5s.create_simple(obj_buf.nda.shape)
+            mspace.select_hyperslab(
+                (obj_buf_start,) + (0,) * (h5d.rank - 1),
+                (1,) * h5d.rank,
+                None,
+                (n_rows_to_read,) + fspace.shape[1:],
+            )
+            h5d.read(mspace, fspace, obj_buf.nda)
         else:
-
-
-
+            tmp = np.empty(fspace.shape, h5d.dtype)
+            h5d.read(fspace, fspace, tmp)
+            obj_buf.nda[dest_sel, ...] = tmp[idx, ...]
         nda = obj_buf.nda
     elif n_rows == 0:
         tmp_shape = (0,) + h5d.shape[1:]
         nda = np.empty(tmp_shape, h5d.dtype)
-    elif change_idx_to_slice or idx is None or use_h5idx:
-        nda = h5d[source_sel]
     else:
-
-        nda = h5d
+        mspace = h5py.h5s.create_simple((n_rows_to_read,) + fspace.shape[1:])
+        nda = np.empty(mspace.shape, h5d.dtype)
+        if idx is None or use_h5idx:
+            h5d.read(mspace, fspace, nda)
+        else:
+            tmp = np.empty(fspace.shape, h5d.dtype)
+            h5d.read(fspace, fspace, tmp)
+            nda[:, ...] = tmp[idx, ...]
 
     # Finally, set attributes and return objects
-    attrs =
+    attrs = read_attrs(h5d, fname, oname)
 
     # special handling for bools
     # (c and Julia store as uint8 so cast to bool)
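The rewritten _h5_read_ndarray swaps h5py's high-level slicing for the low-level h5py.h5d/h5py.h5s interface. A minimal, self-contained sketch of the same hyperslab-selection pattern follows; "demo.h5" and "data" are hypothetical names used only for illustration, not part of the package:

# Standalone sketch of the low-level read pattern above (hypothetical names).
import h5py
import numpy as np

with h5py.File("demo.h5", "w") as f:
    f["data"] = np.arange(100.0)

with h5py.File("demo.h5", "r") as f:
    h5d = h5py.h5d.open(f.id, b"data")  # low-level DatasetID handle
    start_row, n_rows_to_read = 10, 5

    fspace = h5d.get_space()  # dataspace describing the dataset in the file
    # select the block [start_row, start_row + n_rows_to_read);
    # the arguments are start, count, stride, block
    fspace.select_hyperslab((start_row,), (1,), None, (n_rows_to_read,))

    # memory dataspace and destination buffer with the selection's shape
    mspace = h5py.h5s.create_simple((n_rows_to_read,))
    out = np.empty(n_rows_to_read, h5d.dtype)
    h5d.read(mspace, fspace, out)
    print(out)  # [10. 11. 12. 13. 14.]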
lgdo/lh5/_serializers/read/scalar.py
CHANGED
@@ -2,20 +2,27 @@ from __future__ import annotations
 
 import logging
 
+import h5py
 import numpy as np
 
 from ....types import Scalar
 from ...exceptions import LH5DecodeError
+from .utils import read_attrs
 
 log = logging.getLogger(__name__)
 
 
 def _h5_read_scalar(
     h5d,
+    fname,
+    oname,
     obj_buf=None,
 ):
-    value =
-
+    value = np.empty((), h5d.dtype)
+    sp = h5py.h5s.create(h5py.h5s.SCALAR)
+    h5d.read(sp, sp, value)
+    value = value[()]
+    attrs = read_attrs(h5d, fname, oname)
 
     # special handling for bools
     # (c and Julia store as uint8 so cast to bool)
@@ -25,7 +32,7 @@ def _h5_read_scalar(
     if obj_buf is not None:
         if not isinstance(obj_buf, Scalar):
             msg = "object buffer a Scalar"
-            raise LH5DecodeError(msg,
+            raise LH5DecodeError(msg, fname, oname)
 
         obj_buf.value = value
         obj_buf.attrs.update(attrs)
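_h5_read_scalar applies the same low-level pattern to 0-dimensional datasets, pairing a SCALAR dataspace with a 0-d NumPy buffer. A hedged sketch with hypothetical names:

# Sketch of the scalar read above (hypothetical file/dataset names).
import h5py
import numpy as np

with h5py.File("demo.h5", "w") as f:
    f.create_dataset("answer", shape=(), data=42)

with h5py.File("demo.h5", "r") as f:
    h5d = h5py.h5d.open(f.id, b"answer")
    value = np.empty((), h5d.dtype)        # 0-d destination buffer
    sp = h5py.h5s.create(h5py.h5s.SCALAR)  # scalar dataspace for src and dst
    h5d.read(sp, sp, value)
    print(value[()])  # 42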
lgdo/lh5/_serializers/read/utils.py
CHANGED
@@ -1,12 +1,35 @@
 from __future__ import annotations
 
+import h5py
+import numpy as np
+
 from ...exceptions import LH5DecodeError
 
 
-def check_obj_buf_attrs(attrs, new_attrs,
+def check_obj_buf_attrs(attrs, new_attrs, fname, oname):
     if set(attrs.keys()) != set(new_attrs.keys()):
         msg = (
             f"existing buffer and new data chunk have different attributes: "
-            f"obj_buf.attrs={attrs} != {
+            f"obj_buf.attrs={attrs} != {fname}[{oname}].attrs={new_attrs}"
         )
-        raise LH5DecodeError(msg,
+        raise LH5DecodeError(msg, fname, oname)
+
+
+def read_attrs(h5o, fname, oname):
+    """Read all attributes for an hdf5 dataset or group using low level API
+    and return them as a dict. Assume all are strings or scalar types."""
+    attrs = {}
+    for i_attr in range(h5py.h5a.get_num_attrs(h5o)):
+        h5a = h5py.h5a.open(h5o, index=i_attr)
+        name = h5a.get_name().decode()
+        if h5a.shape != ():
+            msg = f"attribute {name} is not a string or scalar"
+            raise LH5DecodeError(msg, fname, oname)
+        val = np.empty((), h5a.dtype)
+        h5a.read(val)
+        if h5a.get_type().get_class() == h5py.h5t.STRING:
+            attrs[name] = val.item().decode()
+        else:
+            attrs[name] = val.item()
+        h5a.close()
+    return attrs
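The new read_attrs helper replaces h5py's high-level AttributeManager with direct h5py.h5a calls. The loop below reproduces its core as a standalone sketch (file and attribute names are hypothetical); note that string attributes come back as bytes and need decoding:

# Standalone version of the attribute loop in read_attrs above.
import h5py
import numpy as np

with h5py.File("demo.h5", "w") as f:
    d = f.create_dataset("data", data=np.arange(4))
    d.attrs["datatype"] = "array<1>{real}"
    d.attrs["units"] = "ns"

with h5py.File("demo.h5", "r") as f:
    h5o = h5py.h5o.open(f.id, b"data")
    attrs = {}
    for i_attr in range(h5py.h5a.get_num_attrs(h5o)):
        h5a = h5py.h5a.open(h5o, index=i_attr)
        name = h5a.get_name().decode()
        val = np.empty((), h5a.dtype)
        h5a.read(val)
        if h5a.get_type().get_class() == h5py.h5t.STRING:
            attrs[name] = val.item().decode()  # bytes -> str
        else:
            attrs[name] = val.item()
        h5a.close()
    print(attrs["datatype"], attrs["units"])  # array<1>{real} ns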
lgdo/lh5/_serializers/read/vector_of_vectors.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 import logging
 import sys
 
+import h5py
 import numba
 import numpy as np
 
@@ -15,12 +16,15 @@ from ...exceptions import LH5DecodeError
 from .array import (
     _h5_read_array,
 )
+from .utils import read_attrs
 
 log = logging.getLogger(__name__)
 
 
 def _h5_read_vector_of_vectors(
     h5g,
+    fname,
+    oname,
     start_row=0,
     n_rows=sys.maxsize,
     idx=None,
@@ -30,12 +34,15 @@ def _h5_read_vector_of_vectors(
 ):
     if obj_buf is not None and not isinstance(obj_buf, VectorOfVectors):
         msg = "object buffer is not a VectorOfVectors"
-        raise LH5DecodeError(msg,
+        raise LH5DecodeError(msg, fname, oname)
 
     # read out cumulative_length
     cumulen_buf = None if obj_buf is None else obj_buf.cumulative_length
+    h5d_cl = h5py.h5d.open(h5g, b"cumulative_length")
     cumulative_length, n_rows_read = _h5_read_array(
-
+        h5d_cl,
+        fname,
+        f"{oname}/cumulative_length",
         start_row=start_row,
         n_rows=n_rows,
        idx=idx,
@@ -51,17 +58,19 @@ def _h5_read_vector_of_vectors(
     if idx is not None and n_rows_read > 0:
         # get the starting indices for each array in flattened data:
         # the starting index for array[i] is cumulative_length[i-1]
-        idx2 =
+        idx2 = np.asarray(idx).copy() - 1
 
         # re-read cumulative_length with these indices
         # note this will allocate memory for fd_starts!
         fd_start = None
-        if idx2[0]
-            idx2 =
+        if idx2[0] == -1:
+            idx2 = idx2[1:]
             fd_start = 0  # this variable avoids an ndarray append
 
         fd_starts, fds_n_rows_read = _h5_read_array(
-
+            h5d_cl,
+            fname,
+            f"{oname}/cumulative_length",
             start_row=start_row,
             n_rows=n_rows,
             idx=idx2,
@@ -98,7 +107,11 @@ def _h5_read_vector_of_vectors(
         # need to read out the cumulen sample -before- the first sample
         # read above in order to get the starting row of the first
         # vector to read out in flattened_data
-
+        fspace = h5d_cl.get_space()
+        fspace.select_elements([[start_row - 1]])
+        mspace = h5py.h5s.create(h5py.h5s.SCALAR)
+        fd_start = np.empty((), h5d_cl.dtype)
+        h5d_cl.read(mspace, fspace, fd_start)
 
     # check limits for values that will be used subsequently
     if this_cumulen_nda[-1] < fd_start:
@@ -112,7 +125,7 @@ def _h5_read_vector_of_vectors(
             f"cumulative_length non-increasing between entries "
             f"{start_row} and {start_row+n_rows_read}"
         )
-        raise LH5DecodeError(msg,
+        raise LH5DecodeError(msg, fname, oname)
 
     # determine the number of rows for the flattened_data readout
     fd_n_rows = this_cumulen_nda[-1] if n_rows_read > 0 else 0
@@ -126,6 +139,8 @@ def _h5_read_vector_of_vectors(
     # read for flattened_data
     this_cumulen_nda -= fd_start
 
+    h5d_cl.close()
+
     # If we started with a partially-filled buffer, add the
     # appropriate offset for the start of the in-memory flattened
     # data for this read.
@@ -144,17 +159,23 @@ def _h5_read_vector_of_vectors(
         fd_buf.resize(fdb_size)
 
     # now read
-
+    h5o = h5py.h5o.open(h5g, b"flattened_data")
+    h5a_dtype = h5py.h5a.open(h5o, b"datatype")
+    val = np.empty((), "O")
+    h5a_dtype.read(val)
+    lgdotype = dtypeutils.datatype(val.item().decode())
     if lgdotype is Array:
         _func = _h5_read_array
     elif lgdotype is VectorOfVectors:
         _func = _h5_read_vector_of_vectors
     else:
         msg = "type {lgdotype.__name__} is not supported"
-        raise LH5DecodeError(msg,
+        raise LH5DecodeError(msg, fname, f"{oname}/flattened_data")
 
     flattened_data, _ = _func(
-
+        h5o,
+        fname,
+        f"{oname}/flattened_data",
         start_row=fd_start,
         n_rows=fd_n_rows,
        idx=fd_idx,
@@ -162,6 +183,7 @@ def _h5_read_vector_of_vectors(
         obj_buf=fd_buf,
         obj_buf_start=fd_buf_start,
     )
+    h5o.close()
 
     if obj_buf is not None:
         # if the buffer is partially filled, cumulative_length will be invalid
@@ -176,7 +198,7 @@ def _h5_read_vector_of_vectors(
             VectorOfVectors(
                 flattened_data=flattened_data,
                 cumulative_length=cumulative_length,
-                attrs=
+                attrs=read_attrs(h5g, fname, oname),
             ),
             n_rows_read,
         )
@@ -194,4 +216,4 @@ def _make_fd_idx(starts, stops, idx):
         for i in range(starts[j], stops[j]):
             idx[k] = i
             k += 1
-    return
+    return idx
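The cumulative_length bookkeeping above is easier to follow in isolation. A pure-NumPy sketch (illustration only, not library code) of how fd_start and fd_n_rows locate the flattened_data window for a partial read:

# Pure-numpy sketch of the cumulative_length arithmetic used above.
import numpy as np

cumulative_length = np.array([3, 5, 9, 10])  # vector i ends at cl[i]
start_row, n_rows = 1, 2                     # read vectors 1 and 2

# start of vector `start_row` in flattened_data is cl[start_row - 1]
fd_start = 0 if start_row == 0 else cumulative_length[start_row - 1]
this_cumulen = cumulative_length[start_row : start_row + n_rows].copy()
fd_n_rows = this_cumulen[-1] - fd_start      # rows of flattened_data to read

# rebase so the in-memory cumulative_length starts at zero
this_cumulen -= fd_start
print(fd_start, fd_n_rows, this_cumulen)     # 3 6 [2 6]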
lgdo/lh5/_serializers/write/array.py
CHANGED
@@ -71,7 +71,12 @@ def _h5_write_array(
     _attrs = obj.getattrs(datatype=True)
     _attrs.pop("compression", None)
     _attrs.pop("hdf5_settings", None)
-    ds.attrs.update(
+    ds.attrs.update(
+        {
+            k: v.encode("utf-8") if isinstance(v, str) else v
+            for k, v in _attrs.items()
+        }
+    )
 
     return
 
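Both write serializers (here and in write/scalar.py below) now encode str attribute values to UTF-8 bytes before handing them to attrs.update, so h5py stores them as fixed-length byte strings rather than variable-length strings. A sketch of the idiom with hypothetical names:

# Sketch of the attribute-encoding idiom above (hypothetical names).
import h5py
import numpy as np

attrs = {"datatype": "array<1>{real}", "units": "ns", "scale": 2.5}

with h5py.File("demo.h5", "w") as f:
    ds = f.create_dataset("data", data=np.arange(8))
    ds.attrs.update(
        {k: v.encode("utf-8") if isinstance(v, str) else v for k, v in attrs.items()}
    )
    # bytes values are stored as fixed-length HDF5 strings
    print(ds.attrs["datatype"])  # b'array<1>{real}'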
lgdo/lh5/_serializers/write/composite.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import logging
+import os
+from inspect import signature
 
 import h5py
 
@@ -27,6 +29,10 @@ def _h5_write_lgdo(
 ):
     assert isinstance(obj, types.LGDO)
 
+    file_kwargs = {
+        k: h5py_kwargs[k] for k in h5py_kwargs & signature(h5py.File).parameters.keys()
+    }
+    h5py_kwargs = {k: h5py_kwargs[k] for k in h5py_kwargs - file_kwargs.keys()}
     if wo_mode == "write_safe":
         wo_mode = "w"
     if wo_mode == "append":
@@ -46,10 +52,9 @@ def _h5_write_lgdo(
     # In hdf5, 'a' is really "modify" -- in addition to appending, you can
     # change any object in the file. So we use file:append for
     # write_object:overwrite.
-    mode = "w" if wo_mode == "of" else "a"
-
     if not isinstance(lh5_file, h5py.File):
-
+        mode = "w" if wo_mode == "of" or not os.path.exists(lh5_file) else "a"
+        lh5_file = h5py.File(lh5_file, mode=mode, **file_kwargs)
 
     log.debug(
         f"writing {obj!r}[{start_row}:{n_rows}] as "
@@ -63,8 +68,19 @@ def _h5_write_lgdo(
         msg = f"can't overwrite '{name}' in wo_mode 'write_safe'"
         raise LH5EncodeError(msg, lh5_file, group, name)
 
-    # struct
+    # struct, table, waveform table or histogram.
     if isinstance(obj, types.Struct):
+        if (
+            isinstance(obj, types.Histogram)
+            and wo_mode not in ["w", "o", "of"]
+            and name in group
+        ):
+            msg = f"can't append-write to histogram in wo_mode '{wo_mode}'"
+            raise LH5EncodeError(msg, lh5_file, group, name)
+        if isinstance(obj, types.Histogram) and write_start != 0:
+            msg = f"can't write histogram in wo_mode '{wo_mode}' with write_start != 0"
+            raise LH5EncodeError(msg, lh5_file, group, name)
+
         return _h5_write_struct(
             obj,
            name,
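_h5_write_lgdo now partitions **h5py_kwargs by introspecting h5py.File's signature: options the File constructor accepts (for example fs_strategy) go to file creation, the rest (for example compression) are forwarded to create_dataset. The same idiom in isolation, with assumed example keys (written with explicit .keys() here for clarity):

# Generic sketch of the kwargs-splitting idiom above.
from inspect import signature

import h5py

h5py_kwargs = {"compression": "gzip", "fs_strategy": "page", "fs_page_size": 65536}

# keys that h5py.File accepts go to file creation ...
file_kwargs = {
    k: h5py_kwargs[k]
    for k in h5py_kwargs.keys() & signature(h5py.File).parameters.keys()
}
# ... the rest are forwarded to create_dataset
ds_kwargs = {k: h5py_kwargs[k] for k in h5py_kwargs.keys() - file_kwargs.keys()}
print(sorted(file_kwargs))  # ['fs_page_size', 'fs_strategy']
print(sorted(ds_kwargs))    # ['compression']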
lgdo/lh5/_serializers/write/scalar.py
CHANGED
@@ -20,4 +20,9 @@ def _h5_write_scalar(obj, name, lh5_file, group="/", wo_mode="append"):
         raise LH5EncodeError(msg, lh5_file, group, name)
 
     ds = group.create_dataset(name, shape=(), data=obj.value)
-    ds.attrs.update(
+    ds.attrs.update(
+        {
+            k: v.encode("utf-8") if isinstance(v, str) else v
+            for k, v in obj.attrs.items()
+        }
+    )
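A hedged round-trip sketch of this path through the public lgdo.lh5 API (the file name is hypothetical): a Scalar's string attributes pass through the encoding above on write and through read_attrs on read.

# Usage sketch (hypothetical file name), relying only on the public API.
from lgdo import Scalar, lh5

s = Scalar(42, attrs={"units": "keV"})
lh5.write(s, "threshold", "scratch.lh5", wo_mode="overwrite_file")

obj = lh5.read("threshold", "scratch.lh5")
print(obj.value, obj.attrs["units"])  # 42 keV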
lgdo/lh5/core.py
CHANGED
@@ -1,15 +1,18 @@
 from __future__ import annotations
 
+import bisect
 import inspect
 import sys
 from collections.abc import Mapping, Sequence
 from typing import Any
 
 import h5py
+import numpy as np
 from numpy.typing import ArrayLike
 
 from .. import types
 from . import _serializers
+from .utils import read_n_rows
 
 
 def read(
@@ -23,6 +26,7 @@ def read(
     obj_buf: types.LGDO = None,
     obj_buf_start: int = 0,
     decompress: bool = True,
+    locking: bool = False,
 ) -> types.LGDO | tuple[types.LGDO, int]:
     """Read LH5 object data from a file.
 
@@ -97,6 +101,8 @@ def read(
         Decompress data encoded with LGDO's compression routines right
         after reading. The option has no effect on data encoded with HDF5
         built-in filters, which is always decompressed upstream by HDF5.
+    locking
+        Lock HDF5 file while reading
 
     Returns
     -------
@@ -110,17 +116,69 @@ def read(
     if isinstance(lh5_file, h5py.File):
         lh5_obj = lh5_file[name]
     elif isinstance(lh5_file, str):
-        lh5_file = h5py.File(lh5_file, mode="r")
+        lh5_file = h5py.File(lh5_file, mode="r", locking=locking)
         lh5_obj = lh5_file[name]
     else:
-
-
-
-        h5f = h5py.File(h5f, mode="r")  # noqa: PLW2901
-        lh5_obj += [h5f[name]]
+        lh5_files = list(lh5_file)
+        n_rows_read = 0
+        obj_buf_is_new = False
 
+        for i, h5f in enumerate(lh5_files):
+            if (
+                isinstance(idx, (list, tuple))
+                and len(idx) > 0
+                and not np.isscalar(idx[0])
+            ):
+                # a list of lists: must be one per file
+                idx_i = idx[i]
+            elif idx is not None:
+                # make idx a proper tuple if it's not one already
+                if not (isinstance(idx, tuple) and len(idx) == 1):
+                    idx = (idx,)
+                # idx is a long continuous array
+                n_rows_i = read_n_rows(name, h5f)
+                # find the length of the subset of idx that contains indices
+                # that are less than n_rows_i
+                n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
+                # now split idx into idx_i and the remainder
+                idx_i = np.array(idx[0])[:n_rows_to_read_i]
+                idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
+            else:
+                idx_i = None
+            n_rows_i = n_rows - n_rows_read
+
+            obj_ret = read(
+                name,
+                h5f,
+                start_row,
+                n_rows_i,
+                idx_i,
+                use_h5idx,
+                field_mask,
+                obj_buf,
+                obj_buf_start,
+                decompress,
+            )
+            if isinstance(obj_ret, tuple):
+                obj_buf, n_rows_read_i = obj_ret
+                obj_buf_is_new = True
+            else:
+                obj_buf = obj_ret
+                n_rows_read_i = len(obj_buf)
+
+            n_rows_read += n_rows_read_i
+            if n_rows_read >= n_rows or obj_buf is None:
+                return obj_buf, n_rows_read
+            start_row = 0
+            obj_buf_start += n_rows_read_i
+        return obj_buf if obj_buf_is_new else (obj_buf, n_rows_read)
+
+    if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
+        idx = idx[0]
     obj, n_rows_read = _serializers._h5_read_lgdo(
-        lh5_obj,
+        lh5_obj.id,
+        lh5_obj.file.filename,
+        lh5_obj.name,
         start_row=start_row,
         n_rows=n_rows,
        idx=idx,
|
n_rows: int | None = None,
|
144
202
|
wo_mode: str = "append",
|
145
203
|
write_start: int = 0,
|
204
|
+
page_buffer: int = 0,
|
146
205
|
**h5py_kwargs,
|
147
206
|
) -> None:
|
148
207
|
"""Write an LGDO into an LH5 file.
|
@@ -218,6 +277,11 @@ def write(
|
|
218
277
|
write_start
|
219
278
|
row in the output file (if already existing) to start overwriting
|
220
279
|
from.
|
280
|
+
page_buffer
|
281
|
+
enable paged aggregation with a buffer of this size in bytes
|
282
|
+
Only used when creating a new file. Useful when writing a file
|
283
|
+
with a large number of small datasets. This is a short-hand for
|
284
|
+
``(fs_stragety="page", fs_pagesize=[page_buffer])``
|
221
285
|
**h5py_kwargs
|
222
286
|
additional keyword arguments forwarded to
|
223
287
|
:meth:`h5py.Group.create_dataset` to specify, for example, an HDF5
|
@@ -225,6 +289,13 @@ def write(
|
|
225
289
|
datasets. **Note: `compression` Ignored if compression is specified
|
226
290
|
as an `obj` attribute.**
|
227
291
|
"""
|
292
|
+
if wo_mode in ("w", "write", "of", "overwrite_file"):
|
293
|
+
h5py_kwargs.update(
|
294
|
+
{
|
295
|
+
"fs_strategy": "page",
|
296
|
+
"fs_page_size": page_buffer,
|
297
|
+
}
|
298
|
+
)
|
228
299
|
return _serializers._h5_write_lgdo(
|
229
300
|
obj,
|
230
301
|
name,
|
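The new page_buffer option is shorthand for h5py's paged file-space strategy; together with the locking option added to read, the direct h5py equivalent looks roughly like this (a sketch assuming h5py >= 3.5; the file name is hypothetical):

# What page_buffer=65536 amounts to: paged file-space strategy at creation.
import h5py
import numpy as np

with h5py.File("paged.lh5", "w", fs_strategy="page", fs_page_size=65536) as f:
    for i in range(100):  # many small datasets aggregate into shared pages
        f[f"ds{i}"] = np.arange(16)

# readers can then use a page buffer (and, as in read() above, no file locking)
with h5py.File("paged.lh5", "r", page_buf_size=1 << 20, locking=False) as f:
    print(f["ds0"][:4])  # [0 1 2 3]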
lgdo/lh5/datatype.py
CHANGED
@@ -14,6 +14,7 @@ _lgdo_datatype_map: dict[str, lgdo.LGDO] = OrderedDict(
         lgdo.ArrayOfEncodedEqualSizedArrays,
         r"^array_of_encoded_equalsized_arrays<1,1>\{.+\}$",
     ),
+    (lgdo.Histogram, r"^struct\{binning,weights,isdensity\}$"),
     (lgdo.Struct, r"^struct\{.*\}$"),
     (lgdo.Table, r"^table\{.*\}$"),
     (lgdo.FixedSizeArray, r"^fixedsize_array<\d+>\{.+\}$"),
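Because _lgdo_datatype_map is an OrderedDict scanned in order, the new Histogram pattern must sit before the generic struct pattern, or histograms would decode as plain structs. A quick regex check:

# Why the Histogram pattern must precede the generic struct pattern above.
import re

hist_re = r"^struct\{binning,weights,isdensity\}$"
struct_re = r"^struct\{.*\}$"

dt = "struct{binning,weights,isdensity}"
print(bool(re.search(hist_re, dt)))    # True  -> matched first: Histogram
print(bool(re.search(struct_re, dt)))  # True  -> would also match plain Struct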
lgdo/lh5/exceptions.py
CHANGED
@@ -4,11 +4,11 @@ import h5py
 
 
 class LH5DecodeError(Exception):
-    def __init__(self, message: str,
+    def __init__(self, message: str, fname: str, oname: str) -> None:
         super().__init__(message)
 
-        self.file =
-        self.obj =
+        self.file = fname
+        self.obj = oname
 
     def __str__(self) -> str:
         return (
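The exception now carries the offending file and object names explicitly, matching the (msg, fname, oname) call sites throughout the serializers above. A minimal sketch of constructing and inspecting it (argument values hypothetical):

# Minimal sketch of the new constructor (hypothetical argument values).
from lgdo.lh5.exceptions import LH5DecodeError

try:
    raise LH5DecodeError("object buffer is not an Array", "run0.lh5", "/geds/data")
except LH5DecodeError as e:
    print(e.file, e.obj)  # run0.lh5 /geds/data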