legend-pydataobj 1.5.1__py3-none-any.whl → 1.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/METADATA +1 -1
- legend_pydataobj-1.6.1.dist-info/RECORD +54 -0
- {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/WHEEL +1 -1
- {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/entry_points.txt +1 -0
- lgdo/__init__.py +7 -4
- lgdo/_version.py +2 -2
- lgdo/cli.py +237 -12
- lgdo/compression/__init__.py +1 -0
- lgdo/lh5/__init__.py +9 -1
- lgdo/lh5/_serializers/__init__.py +43 -0
- lgdo/lh5/_serializers/read/__init__.py +0 -0
- lgdo/lh5/_serializers/read/array.py +34 -0
- lgdo/lh5/_serializers/read/composite.py +405 -0
- lgdo/lh5/_serializers/read/encoded.py +129 -0
- lgdo/lh5/_serializers/read/ndarray.py +104 -0
- lgdo/lh5/_serializers/read/scalar.py +34 -0
- lgdo/lh5/_serializers/read/utils.py +12 -0
- lgdo/lh5/_serializers/read/vector_of_vectors.py +201 -0
- lgdo/lh5/_serializers/write/__init__.py +0 -0
- lgdo/lh5/_serializers/write/array.py +92 -0
- lgdo/lh5/_serializers/write/composite.py +259 -0
- lgdo/lh5/_serializers/write/scalar.py +23 -0
- lgdo/lh5/_serializers/write/vector_of_vectors.py +95 -0
- lgdo/lh5/core.py +272 -0
- lgdo/lh5/datatype.py +46 -0
- lgdo/lh5/exceptions.py +34 -0
- lgdo/lh5/iterator.py +1 -1
- lgdo/lh5/store.py +69 -1160
- lgdo/lh5/tools.py +27 -53
- lgdo/lh5/utils.py +130 -27
- lgdo/lh5_store.py +11 -2
- lgdo/logging.py +1 -0
- lgdo/types/__init__.py +1 -0
- lgdo/types/array.py +1 -0
- lgdo/types/arrayofequalsizedarrays.py +1 -0
- lgdo/types/encoded.py +3 -8
- lgdo/types/fixedsizearray.py +1 -0
- lgdo/types/struct.py +1 -0
- lgdo/types/table.py +46 -5
- lgdo/types/vectorofvectors.py +314 -458
- lgdo/types/vovutils.py +320 -0
- lgdo/types/waveformtable.py +1 -0
- lgdo/utils.py +1 -32
- legend_pydataobj-1.5.1.dist-info/RECORD +0 -36
- {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/LICENSE +0 -0
- {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/top_level.txt +0 -0
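The bulk of this release is an internal refactor: the monolithic read/write logic in `lgdo/lh5/store.py` moves into the new `lgdo/lh5/_serializers/`, `lgdo/lh5/core.py` and `lgdo/lh5/utils.py` modules, while `LH5Store` keeps its public interface. A minimal round-trip sketch of that unchanged interface — the file and dataset names below are invented for illustration, not taken from the package:

```python
import numpy as np

from lgdo import Array, lh5

store = lh5.LH5Store()

# write an LGDO Array, then read it back ("demo.lh5" and "energy" are placeholders)
store.write(Array(np.arange(10)), "energy", "demo.lh5", wo_mode="overwrite_file")
obj, n_rows = store.read("energy", "demo.lh5")
print(n_rows, obj.nda[:3])
```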
lgdo/lh5/store.py
CHANGED
@@ -2,42 +2,23 @@
 This module implements routines from reading and writing LEGEND Data Objects in
 HDF5 files.
 """
+
 from __future__ import annotations
 
 import logging
 import os
 import sys
-from bisect import bisect_left
-from collections import defaultdict
-from typing import Any, Union
+from collections.abc import Mapping, Sequence
+from typing import Any
 
 import h5py
-import numba as nb
-import numpy as np
-
-from .. import compression as compress
-from ..compression import WaveformCodec
-from ..types import (
-    Array,
-    ArrayOfEncodedEqualSizedArrays,
-    ArrayOfEqualSizedArrays,
-    FixedSizeArray,
-    Scalar,
-    Struct,
-    Table,
-    VectorOfEncodedVectors,
-    VectorOfVectors,
-    WaveformTable,
-)
-from .utils import expand_path, parse_datatype
+from numpy.typing import ArrayLike
 
-
+from .. import types
+from . import _serializers, utils
 
 log = logging.getLogger(__name__)
 
-DEFAULT_HDF5_SETTINGS: dict[str, ...] = {"shuffle": True, "compression": "gzip"}
-DEFAULT_HDF5_COMPRESSION = None
-
 
 class LH5Store:
     """
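The module-level `DEFAULT_HDF5_SETTINGS`/`DEFAULT_HDF5_COMPRESSION` constants disappear from `store.py` together with the direct `numpy`/compression imports. Per the (removed) `write` docstring further down, HDF5 filter options are still controlled per object through the `hdf5_settings`/`compression` attributes, or per call through `h5py` keyword arguments. A hedged sketch, with invented file and dataset names:

```python
import numpy as np

from lgdo import Array, lh5

store = lh5.LH5Store()

# per-object settings, forwarded to h5py.Group.create_dataset
arr = Array(np.random.default_rng(0).normal(size=1000))
arr.attrs["hdf5_settings"] = {"compression": "gzip", "shuffle": True}
store.write(arr, "baseline", "demo.lh5", wo_mode="overwrite_file")

# per-call keyword arguments (object attributes, if present, take precedence)
raw = Array(np.arange(1000, dtype="uint16"))
store.write(raw, "adc", "demo.lh5", wo_mode="append", compression="lzf")
```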
@@ -63,7 +44,7 @@ class LH5Store:
             whether to keep files open by storing the :mod:`h5py` objects as
             class attributes.
         """
-        self.base_path = "" if base_path == "" else expand_path(base_path)
+        self.base_path = "" if base_path == "" else utils.expand_path(base_path)
        self.keep_open = keep_open
         self.files = {}
 
@@ -79,27 +60,36 @@ class LH5Store:
         """
         if isinstance(lh5_file, h5py.File):
             return lh5_file
+
         if mode == "r":
-            lh5_file = expand_path(lh5_file, base_path=self.base_path)
+            lh5_file = utils.expand_path(lh5_file, base_path=self.base_path)
+
         if lh5_file in self.files:
             return self.files[lh5_file]
+
         if self.base_path != "":
             full_path = os.path.join(self.base_path, lh5_file)
         else:
             full_path = lh5_file
+
         if mode != "r":
             directory = os.path.dirname(full_path)
             if directory != "" and not os.path.exists(directory):
                 log.debug(f"making path {directory}")
                 os.makedirs(directory)
+
         if mode == "r" and not os.path.exists(full_path):
             msg = f"file {full_path} not found"
             raise FileNotFoundError(msg)
+
         if mode != "r" and os.path.exists(full_path):
             log.debug(f"opening existing file {full_path} in mode '{mode}'")
+
         h5f = h5py.File(full_path, mode)
+
         if self.keep_open:
             self.files[lh5_file] = h5f
+
         return h5f
 
     def gimme_group(
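With `keep_open=True` the store caches the `h5py.File` handles it opens in `self.files`, so repeated accesses to the same file reuse one handle. A small sketch of that behaviour, assuming a pre-existing `run0.lh5` (placeholder name):

```python
from lgdo import lh5

store = lh5.LH5Store(keep_open=True)

f1 = store.gimme_file("run0.lh5", "r")  # opened and cached
f2 = store.gimme_file("run0.lh5", "r")  # served from the cache
assert f1 is f2
```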
@@ -110,51 +100,21 @@ class LH5Store:
         overwrite: bool = False,
     ) -> h5py.Group:
         """
-        Returns an existing :class:`h5py` group from a base group or creates a
-        new one. Can also set (or replace) group attributes.
+        Returns an existing :class:`h5py` group from a base group or creates a new one.
 
-        Parameters
-        ----------
-        group
-            name of the HDF5 group.
-        base_group
-            HDF5 group to be used as a base.
-        grp_attrs
-            HDF5 group attributes.
-        overwrite
-            whether overwrite group attributes, ignored if `grp_attrs` is
-            ``None``.
+        See Also
+        --------
+        .lh5.utils.get_h5_group
         """
-
-        if group in base_group:
-            group = base_group[group]
-        else:
-            group = base_group.create_group(group)
-            if grp_attrs is not None:
-                group.attrs.update(grp_attrs)
-            return group
-        if (
-            grp_attrs is not None
-            and len(set(grp_attrs.items()) ^ set(group.attrs.items())) > 0
-        ):
-            if not overwrite:
-                msg = "grp_attrs != group.attrs but overwrite not set"
-                raise RuntimeError(msg)
-
-            log.debug(f"overwriting {group}.attrs...")
-            for key in group.attrs:
-                group.attrs.pop(key)
-            group.attrs.update(grp_attrs)
-
-        return group
+        return utils.get_h5_group(group, base_group, grp_attrs, overwrite)
 
     def get_buffer(
         self,
         name: str,
-        lh5_file: str | h5py.File |
+        lh5_file: str | h5py.File | Sequence[str | h5py.File],
         size: int | None = None,
-        field_mask:
-    ) -> LGDO:
+        field_mask: Mapping[str, bool] | Sequence[str] | None = None,
+    ) -> types.LGDO:
         """Returns an LH5 object appropriate for use as a pre-allocated buffer
         in a read loop. Sets size to `size` if object has a size.
         """
@@ -166,700 +126,44 @@ class LH5Store:
     def read(
         self,
         name: str,
-        lh5_file: str | h5py.File |
+        lh5_file: str | h5py.File | Sequence[str | h5py.File],
         start_row: int = 0,
         n_rows: int = sys.maxsize,
-        idx:
+        idx: ArrayLike = None,
         use_h5idx: bool = False,
-        field_mask:
-        obj_buf: LGDO = None,
+        field_mask: Mapping[str, bool] | Sequence[str] | None = None,
+        obj_buf: types.LGDO = None,
         obj_buf_start: int = 0,
         decompress: bool = True,
-    ) -> tuple[LGDO, int]:
-        """Read LH5 object data from a file.

[… ~75 deleted docstring lines: they documented `start_row`/`n_rows`, `idx` (NumPy-style fancy indexing along the first axis, truncated to `n_rows`), `use_h5idx` (read selected rows directly from disk vs. index after a full read; see legend-pydataobj #29), `field_mask` (dict/list column selection for tables and structs), `obj_buf`/`obj_buf_start` (pre-allocated buffers), `decompress`, and the `(object, n_rows_read)` return value …]

+    ) -> tuple[types.LGDO, int]:
+        """Read LH5 object data from a file in the store.
 
+        See Also
+        --------
+        .lh5.core.read
         """
-        #
+        # grab files from store
         if not isinstance(lh5_file, (str, h5py.File)):
-            lh5_file = list(lh5_file)

[… ~600 deleted lines: the old in-store read implementation — multi-file concatenation, datatype parsing, the Scalar/Struct/Table readers, encoded-array handling and optional decompression, VectorOfVectors `cumulative_length`/`flattened_data` logic, the fancy-index-to-slice optimization, and attribute checks — now under lgdo/lh5/_serializers/read/ …]

+            lh5_file = [self.gimme_file(f, "r") for f in list(lh5_file)]
+        else:
+            lh5_file = self.gimme_file(lh5_file, "r")
+
+        return _serializers._h5_read_lgdo(
+            name,
+            lh5_file,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            use_h5idx=use_h5idx,
+            field_mask=field_mask,
+            obj_buf=obj_buf,
+            obj_buf_start=obj_buf_start,
+            decompress=decompress,
+        )
 
     def write(
         self,
-        obj: LGDO,
+        obj: types.LGDO,
         name: str,
         lh5_file: str | h5py.File,
         group: str | h5py.Group = "/",
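`LH5Store.read` is now a thin forwarding wrapper, but the selection options spelled out in the removed docstring (`idx`, `field_mask`, `use_h5idx`) still apply at the call site. A hedged sketch against an invented table `geds/raw` in `run0.lh5`:

```python
import numpy as np

from lgdo import lh5

store = lh5.LH5Store()

# fancy indexing along the first axis: read only every 10th row
sel = np.arange(0, 1000, 10)
tbl, n = store.read("geds/raw", "run0.lh5", idx=sel)

# a list field_mask reads only the named table columns ...
tbl, n = store.read("geds/raw", "run0.lh5", field_mask=["energy", "timestamp"])

# ... while a dict with False entries reads everything except those columns
tbl, n = store.read("geds/raw", "run0.lh5", field_mask={"waveform": False})
```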
@@ -871,89 +175,10 @@ class LH5Store:
     ) -> None:
         """Write an LGDO into an LH5 file.
 

[… ~76 deleted docstring lines: they documented the `compression` attribute (an HDF5 filter name, kwargs dict or :mod:`hdf5plugin` filter forwarded to :meth:`h5py.Group.create_dataset`, or a :class:`.WaveformCodec` for waveform values), the preferred `hdf5_settings` attribute and the precedence rules between the two, the `obj`/`name`/`lh5_file`/`group`/`start_row`/`n_rows` parameters, the `wo_mode` values (``write_safe``/``w``, ``append``/``a``, ``overwrite``/``o``, ``overwrite_file``/``of``, ``append_column``/``ac``), `write_start`, and `**h5py_kwargs` …]

+        See Also
+        --------
+        .lh5.core.write
         """
-        log.debug(
-            f"writing {obj!r}[{start_row}:{n_rows}] as "
-            f"{lh5_file}:{group}/{name}[{write_start}:], "
-            f"mode = {wo_mode}, h5py_kwargs = {h5py_kwargs}"
-        )
-
         if wo_mode == "write_safe":
             wo_mode = "w"
         if wo_mode == "append":
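The removed docstring enumerated the `wo_mode` values (`write_safe`/`w`, `append`/`a`, `overwrite`/`o`, `overwrite_file`/`of`, `append_column`/`ac`); they are unchanged and now documented alongside `lgdo.lh5.core.write`. A hedged sketch of the common modes, with invented data and file names:

```python
import numpy as np

from lgdo import Array, Table, lh5

store = lh5.LH5Store()
tbl = Table(col_dict={"energy": Array(np.ones(100)),
                      "channel": Array(np.zeros(100, dtype="uint16"))})

store.write(tbl, "evt", "out.lh5", wo_mode="overwrite_file")  # "of": delete the file first
store.write(tbl, "evt", "out.lh5", wo_mode="append")          # "a": append rows along axis 0

# "ac": add new columns to an existing table with the same number of rows (here 200)
extra = Table(col_dict={"flag": Array(np.zeros(200, dtype="uint8"))})
store.write(extra, "evt", "out.lh5", wo_mode="append_column")
```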
@@ -974,338 +199,22 @@ class LH5Store:
         # change any object in the file. So we use file:append for
         # write_object:overwrite.
         mode = "w" if wo_mode == "of" else "a"
-        lh5_file = self.gimme_file(lh5_file, mode=mode)
-        group = self.gimme_group(group, lh5_file)
-        if wo_mode == "w" and name in group:
-            msg = f"can't overwrite '{name}' in wo_mode 'write_safe'"
-            raise RuntimeError(msg)

[… ~250 deleted lines: the old in-store write implementation — the `append_column` datatype bookkeeping, Struct/Table recursion with optional :class:`.WaveformCodec` compression of waveform `values`, the Scalar, encoded-array and VectorOfVectors writers (with `cumulative_length` offset handling), and the Array writer applying `DEFAULT_HDF5_SETTINGS`, the `compression`/`hdf5_settings` attributes and dataset resizing — now under lgdo/lh5/_serializers/write/ …]

 
+        return _serializers._h5_write_lgdo(
+            obj,
+            name,
+            self.gimme_file(lh5_file, mode=mode),
+            group=group,
+            start_row=start_row,
+            n_rows=n_rows,
+            wo_mode=wo_mode,
+            write_start=write_start,
+            **h5py_kwargs,
+        )
 
     def read_n_rows(self, name: str, lh5_file: str | h5py.File) -> int | None:
-        """Look up the number of rows in an Array-like object called `name` in
-        `lh5_file`.
-
-        Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`."""
+        """Look up the number of rows in an Array-like object called `name` in `lh5_file`.
 

[… ~70 deleted lines: the old per-datatype row counting in `read_n_rows` and the numba-compiled `_make_fd_idx` helper, superseded by `lgdo.lh5.utils` and the new serializers …]

+        Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`.
+        """
+        return utils.read_n_rows(name, self.gimme_file(lh5_file, "r"))