legend_pydataobj-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- legend_pydataobj-1.0.0.dist-info/LICENSE +674 -0
- legend_pydataobj-1.0.0.dist-info/METADATA +63 -0
- legend_pydataobj-1.0.0.dist-info/RECORD +26 -0
- legend_pydataobj-1.0.0.dist-info/WHEEL +5 -0
- legend_pydataobj-1.0.0.dist-info/top_level.txt +1 -0
- lgdo/__init__.py +75 -0
- lgdo/_version.py +4 -0
- lgdo/compression/__init__.py +36 -0
- lgdo/compression/base.py +29 -0
- lgdo/compression/generic.py +77 -0
- lgdo/compression/radware.py +579 -0
- lgdo/compression/utils.py +34 -0
- lgdo/compression/varlen.py +449 -0
- lgdo/lgdo_utils.py +196 -0
- lgdo/lh5_store.py +1711 -0
- lgdo/types/__init__.py +30 -0
- lgdo/types/array.py +140 -0
- lgdo/types/arrayofequalsizedarrays.py +133 -0
- lgdo/types/encoded.py +390 -0
- lgdo/types/fixedsizearray.py +43 -0
- lgdo/types/lgdo.py +51 -0
- lgdo/types/scalar.py +59 -0
- lgdo/types/struct.py +108 -0
- lgdo/types/table.py +349 -0
- lgdo/types/vectorofvectors.py +627 -0
- lgdo/types/waveform_table.py +264 -0
lgdo/lh5_store.py
ADDED
@@ -0,0 +1,1711 @@
"""
This module implements routines for reading and writing LEGEND Data Objects in
HDF5 files.
"""
from __future__ import annotations

import fnmatch
import glob
import logging
import os
import sys
from bisect import bisect_left
from collections import defaultdict
from typing import Any, Iterator, Union

import h5py
import numba as nb
import numpy as np
import pandas as pd

from . import compression as compress
from .compression import WaveformCodec
from .lgdo_utils import expand_path, parse_datatype
from .types import (
    Array,
    ArrayOfEncodedEqualSizedArrays,
    ArrayOfEqualSizedArrays,
    FixedSizeArray,
    Scalar,
    Struct,
    Table,
    VectorOfEncodedVectors,
    VectorOfVectors,
    WaveformTable,
)

LGDO = Union[Array, Scalar, Struct, VectorOfVectors]

log = logging.getLogger(__name__)

DEFAULT_HDF5_COMPRESSION = None


class LH5Store:
    """
    Class to represent a store of LEGEND HDF5 files. The two main methods
    implemented by the class are :meth:`read_object` and :meth:`write_object`.

    Examples
    --------
    >>> from lgdo import LH5Store
    >>> store = LH5Store()
    >>> obj, _ = store.read_object("/geds/waveform", "file.lh5")
    >>> type(obj)
    lgdo.waveform_table.WaveformTable
    """

    def __init__(self, base_path: str = "", keep_open: bool = False) -> None:
        """
        Parameters
        ----------
        base_path
            directory path to prepend to LH5 files.
        keep_open
            whether to keep files open by storing the :mod:`h5py` objects as
            class attributes.
        """
        self.base_path = "" if base_path == "" else expand_path(base_path)
        self.keep_open = keep_open
        self.files = {}

    def gimme_file(self, lh5_file: str | h5py.File, mode: str = "r") -> h5py.File:
        """Returns a :mod:`h5py` file object from the store or creates a new one.

        Parameters
        ----------
        lh5_file
            LH5 file name.
        mode
            mode in which to open file. See :class:`h5py.File` documentation.
        """
        if isinstance(lh5_file, h5py.File):
            return lh5_file
        if mode == "r":
            lh5_file = expand_path(lh5_file, base_path=self.base_path)
        if lh5_file in self.files.keys():
            return self.files[lh5_file]
        if self.base_path != "":
            full_path = os.path.join(self.base_path, lh5_file)
        else:
            full_path = lh5_file
        if mode != "r":
            directory = os.path.dirname(full_path)
            if directory != "" and not os.path.exists(directory):
                log.debug(f"making path {directory}")
                os.makedirs(directory)
        if mode == "r" and not os.path.exists(full_path):
            raise FileNotFoundError(f"file {full_path} not found")
        if mode != "r" and os.path.exists(full_path):
            log.debug(f"opening existing file {full_path} in mode '{mode}'")
        h5f = h5py.File(full_path, mode)
        if self.keep_open:
            self.files[lh5_file] = h5f
        return h5f

    def gimme_group(
        self,
        group: str | h5py.Group,
        base_group: h5py.Group,
        grp_attrs: dict[str, Any] = None,
        overwrite: bool = False,
    ) -> h5py.Group:
        """
        Returns an existing :class:`h5py` group from a base group or creates a
        new one. Can also set (or replace) group attributes.

        Parameters
        ----------
        group
            name of the HDF5 group.
        base_group
            HDF5 group to be used as a base.
        grp_attrs
            HDF5 group attributes.
        overwrite
            whether to overwrite group attributes, ignored if `grp_attrs` is
            ``None``.
        """
        if not isinstance(group, h5py.Group):
            if group in base_group:
                group = base_group[group]
            else:
                group = base_group.create_group(group)
                if grp_attrs is not None:
                    group.attrs.update(grp_attrs)
                return group
        if (
            grp_attrs is not None
            and len(set(grp_attrs.items()) ^ set(group.attrs.items())) > 0
        ):
            if not overwrite:
                raise RuntimeError("grp_attrs != group.attrs but overwrite not set")
            else:
                log.debug(f"overwriting {group}.attrs...")
                for key in group.attrs.keys():
                    group.attrs.pop(key)
                group.attrs.update(grp_attrs)
        return group

    def get_buffer(
        self,
        name: str,
        lh5_file: str | h5py.File | list[str | h5py.File],
        size: int = None,
        field_mask: dict[str, bool] | list[str] | tuple[str] = None,
    ) -> LGDO:
        """Returns an LH5 object appropriate for use as a pre-allocated buffer
        in a read loop. Sets size to `size` if object has a size.
        """
        obj, n_rows = self.read_object(name, lh5_file, n_rows=0, field_mask=field_mask)
        if hasattr(obj, "resize") and size is not None:
            obj.resize(new_size=size)
        return obj

    def read_object(
        self,
        name: str,
        lh5_file: str | h5py.File | list[str | h5py.File],
        start_row: int = 0,
        n_rows: int = sys.maxsize,
        idx: np.ndarray | list | tuple | list[np.ndarray | list | tuple] = None,
        field_mask: dict[str, bool] | list[str] | tuple[str] = None,
        obj_buf: LGDO = None,
        obj_buf_start: int = 0,
        decompress: bool = True,
    ) -> tuple[LGDO, int]:
        """Read LH5 object data from a file.

        Parameters
        ----------
        name
            Name of the LH5 object to be read (including its group path).
        lh5_file
            The file(s) containing the object to be read out. If a list of
            files, array-like object data will be concatenated into the output
            object.
        start_row
            Starting entry for the object read (for array-like objects). For a
            list of files, only applies to the first file.
        n_rows
            The maximum number of rows to read (for array-like objects). The
            actual number of rows read will be returned as one of the return
            values (see below).
        idx
            For NumPy-style "fancy indexing" for the read. Used to read out
            rows that pass some selection criteria. Only selection along the
            first axis is supported, so tuple arguments must be one-tuples. If
            `n_rows` is not false, `idx` will be truncated to `n_rows` before
            reading. To use with a list of files, can pass in a list of `idx`'s
            (one for each file) or use a long contiguous list (e.g. built from
            a previous identical read). If used in conjunction with `start_row`
            and `n_rows`, will be sliced to obey those constraints, where
            `n_rows` is interpreted as the (max) number of *selected* values
            (in `idx`) to be read out.
        field_mask
            For tables and structs, determines which fields get written out.
            Only applies to immediate fields of the requested objects. If a dict
            is used, a default dict will be made with the default set to the
            opposite of the first element in the dict. This way if one specifies
            a few fields at ``False``, all but those fields will be read out,
            while if one specifies just a few fields as ``True``, only those
            fields will be read out. If a list is provided, the listed fields
            will be set to ``True``, while the rest will default to ``False``.
        obj_buf
            Read directly into memory provided in `obj_buf`. Note: the buffer
            will be expanded to accommodate the data requested. To maintain the
            buffer length, send in ``n_rows = len(obj_buf)``.
        obj_buf_start
            Start location in ``obj_buf`` for read. For concatenating data to
            array-like objects.
        decompress
            Decompress data encoded with LGDO's compression routines right
            after reading. The option has no effect on data encoded with HDF5
            built-in filters, which is always decompressed upstream by HDF5.

        Returns
        -------
        (object, n_rows_read)
            `object` is the read-out object and `n_rows_read` is the number of
            rows successfully read out. Essential for arrays when the amount of
            data is smaller than the object buffer. For scalars and structs
            `n_rows_read` will be ``1``. For tables it is redundant with
            ``table.loc``.
        """
        # Handle list-of-files recursively
        if not isinstance(lh5_file, (str, h5py.File)):
            lh5_file = list(lh5_file)
            n_rows_read = 0
            for i, h5f in enumerate(lh5_file):
                if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]):
                    # a list of lists: must be one per file
                    idx_i = idx[i]
                elif idx is not None:
                    # make idx a proper tuple if it's not one already
                    if not (isinstance(idx, tuple) and len(idx) == 1):
                        idx = (idx,)
                    # idx is a long continuous array
                    n_rows_i = self.read_n_rows(name, h5f)
                    # find the length of the subset of idx that contains indices
                    # that are less than n_rows_i
                    n_rows_to_read_i = bisect_left(idx[0], n_rows_i)
                    # now split idx into idx_i and the remainder
                    idx_i = (idx[0][:n_rows_to_read_i],)
                    idx = (idx[0][n_rows_to_read_i:] - n_rows_i,)
                else:
                    idx_i = None
                n_rows_i = n_rows - n_rows_read
                obj_buf, n_rows_read_i = self.read_object(
                    name,
                    lh5_file[i],
                    start_row=start_row,
                    n_rows=n_rows_i,
                    idx=idx_i,
                    field_mask=field_mask,
                    obj_buf=obj_buf,
                    obj_buf_start=obj_buf_start,
                    decompress=decompress,
                )
                n_rows_read += n_rows_read_i
                if n_rows_read >= n_rows or obj_buf is None:
                    return obj_buf, n_rows_read
                start_row = 0
                obj_buf_start += n_rows_read_i
            return obj_buf, n_rows_read

        # get the file from the store
        h5f = self.gimme_file(lh5_file, "r")
        if not h5f or name not in h5f:
            raise KeyError(f"'{name}' not in {h5f.filename}")

        log.debug(
            f"reading {h5f.filename}:{name}[{start_row}:{n_rows}], decompress = {decompress}, "
            + (f" with field mask {field_mask}" if field_mask else "")
        )

        # make idx a proper tuple if it's not one already
        if not (isinstance(idx, tuple) and len(idx) == 1):
            if idx is not None:
                idx = (idx,)

        # get the object's datatype
        if "datatype" not in h5f[name].attrs:
            raise RuntimeError(
                f"'{name}' in file {lh5_file} is missing the datatype attribute"
            )

        datatype = h5f[name].attrs["datatype"]
        datatype, shape, elements = parse_datatype(datatype)

        # check field_mask and make it a default dict
        if datatype == "struct" or datatype == "table":
            if field_mask is None:
                field_mask = defaultdict(lambda: True)
            elif isinstance(field_mask, dict):
                default = True
                if len(field_mask) > 0:
                    default = not field_mask[list(field_mask.keys())[0]]
                field_mask = defaultdict(lambda: default, field_mask)
            elif isinstance(field_mask, (list, tuple)):
                field_mask = defaultdict(
                    lambda: False, {field: True for field in field_mask}
                )
            elif not isinstance(field_mask, defaultdict):
                raise RuntimeError("bad field_mask of type", type(field_mask).__name__)
        elif field_mask is not None:
            raise RuntimeError(f"datatype {datatype} does not accept a field_mask")

        # Scalar
        # scalars are dim-0 datasets
        if datatype == "scalar":
            value = h5f[name][()]
            if elements == "bool":
                value = np.bool_(value)
            if obj_buf is not None:
                obj_buf.value = value
                obj_buf.attrs.update(h5f[name].attrs)
                return obj_buf, 1
            else:
                return Scalar(value=value, attrs=h5f[name].attrs), 1

        # Struct
        # recursively build a struct, return as a dictionary
        if datatype == "struct":
            # ignore obj_buf.
            # TODO: could append new fields or overwrite/concat to existing
            # fields. If implemented, get_buffer() above should probably also
            # (optionally?) prep buffers for each field
            if obj_buf is not None:
                raise NotImplementedError("obj_buf not implemented for LGDO Structs")

            # loop over fields and read
            obj_dict = {}
            for field in elements:
                if not field_mask[field]:
                    continue
                # TODO: it's strange to pass start_row, n_rows, idx to struct
                # fields. If they all had shared indexing, they should be in a
                # table... Maybe should emit a warning? Or allow them to be
                # dicts keyed by field name?
                if "int_keys" in h5f[name].attrs:
                    if dict(h5f[name].attrs)["int_keys"]:
                        f = int(field)
                else:
                    f = str(field)
                obj_dict[f], _ = self.read_object(
                    name + "/" + field,
                    h5f,
                    start_row=start_row,
                    n_rows=n_rows,
                    idx=idx,
                    decompress=decompress,
                )
            # modify datatype in attrs if a field_mask was used
            attrs = dict(h5f[name].attrs)
            if field_mask is not None:
                selected_fields = []
                for field in elements:
                    if field_mask[field]:
                        selected_fields.append(field)
                attrs["datatype"] = "struct" + "{" + ",".join(selected_fields) + "}"
            return Struct(obj_dict=obj_dict, attrs=attrs), 1

        # Below here are all array-like types. So trim idx if needed
        if idx is not None:
            # chop off indices < start_row
            i_first_valid = bisect_left(idx[0], start_row)
            idxa = idx[0][i_first_valid:]
            # don't read out more than n_rows indices
            idx = (idxa[:n_rows],)  # works even if n_rows > len(idxa)

        # Table or WaveformTable
        if datatype == "table":
            col_dict = {}

            # read out each of the fields
            rows_read = []
            for field in elements:
                if not field_mask[field]:
                    continue

                fld_buf = None
                if obj_buf is not None:
                    if not isinstance(obj_buf, Table) or field not in obj_buf:
                        raise ValueError(
                            f"obj_buf for LGDO Table '{name}' not formatted correctly"
                        )

                    else:
                        fld_buf = obj_buf[field]

                col_dict[field], n_rows_read = self.read_object(
                    name + "/" + field,
                    h5f,
                    start_row=start_row,
                    n_rows=n_rows,
                    idx=idx,
                    obj_buf=fld_buf,
                    obj_buf_start=obj_buf_start,
                    decompress=decompress,
                )
                if obj_buf is not None and obj_buf_start + n_rows_read > len(obj_buf):
                    obj_buf.resize(obj_buf_start + n_rows_read)

                rows_read.append(n_rows_read)

            # warn if all columns don't read in the same number of rows
            if len(rows_read) > 0:
                n_rows_read = rows_read[0]
            else:
                n_rows_read = 0
                log.warning(f"Table '{name}' has no subgroups accepted by field mask")

            for n in rows_read[1:]:
                if n != n_rows_read:
                    log.warning(
                        f"Table '{name}' got strange n_rows_read = {n}, {n_rows_read} was expected ({rows_read})"
                    )

            # modify datatype in attrs if a field_mask was used
            attrs = dict(h5f[name].attrs)
            if field_mask is not None:
                selected_fields = []
                for field in elements:
                    if field_mask[field]:
                        selected_fields.append(field)
                attrs["datatype"] = "table" + "{" + ",".join(selected_fields) + "}"

            # fields have been read out, now return a table
            if obj_buf is None:
                # if col_dict contains just 3 objects called t0, dt, and values,
                # return a WaveformTable
                if (
                    len(col_dict) == 3
                    and "t0" in col_dict
                    and "dt" in col_dict
                    and "values" in col_dict
                ):
                    table = WaveformTable(
                        t0=col_dict["t0"], dt=col_dict["dt"], values=col_dict["values"]
                    )
                else:
                    table = Table(col_dict=col_dict, attrs=attrs)

                # set (write) loc to end of tree
                table.loc = n_rows_read
                return table, n_rows_read
            else:
                # We have read all fields into the object buffer. Run
                # checks: All columns should be the same size. So update
                # table's size as necessary, warn if any mismatches are found
                obj_buf.resize(do_warn=True)
                # set (write) loc to end of tree
                obj_buf.loc = obj_buf_start + n_rows_read
                # check attributes
                if set(obj_buf.attrs.keys()) != set(attrs.keys()):
                    raise RuntimeError(
                        f"attrs mismatch. obj_buf.attrs: "
                        f"{obj_buf.attrs}, h5f[{name}].attrs: {attrs}"
                    )
                return obj_buf, n_rows_read

        # ArrayOfEncodedEqualSizedArrays and VectorOfEncodedVectors
        for cond, enc_lgdo in [
            (
                datatype == "array_of_encoded_equalsized_arrays",
                ArrayOfEncodedEqualSizedArrays,
            ),
            (elements.startswith("encoded_array"), VectorOfEncodedVectors),
        ]:
            if cond:
                if (
                    not decompress
                    and obj_buf is not None
                    and not isinstance(obj_buf, enc_lgdo)
                ):
                    raise ValueError(f"obj_buf for '{name}' not a {enc_lgdo}")

                # read out decoded_size, either a Scalar or an Array
                decoded_size_buf = encoded_data_buf = None
                if obj_buf is not None and not decompress:
                    decoded_size_buf = obj_buf.decoded_size
                    encoded_data_buf = obj_buf.encoded_data

                decoded_size, _ = self.read_object(
                    f"{name}/decoded_size",
                    h5f,
                    start_row=start_row,
                    n_rows=n_rows,
                    idx=idx,
                    obj_buf=None if decompress else decoded_size_buf,
                    obj_buf_start=0 if decompress else obj_buf_start,
                )

                # read out encoded_data, a VectorOfVectors
                encoded_data, n_rows_read = self.read_object(
                    f"{name}/encoded_data",
                    h5f,
                    start_row=start_row,
                    n_rows=n_rows,
                    idx=idx,
                    obj_buf=None if decompress else encoded_data_buf,
                    obj_buf_start=0 if decompress else obj_buf_start,
                )

                # return the still encoded data in the buffer object, if there
                if obj_buf is not None and not decompress:
                    return obj_buf, n_rows_read

                # otherwise re-create the encoded LGDO
                rawdata = enc_lgdo(
                    encoded_data=encoded_data,
                    decoded_size=decoded_size,
                    attrs=h5f[name].attrs,
                )

                # already return if no decompression is requested
                if not decompress:
                    return rawdata, n_rows_read

                # if no buffer, decode and return
                elif obj_buf is None and decompress:
                    return compress.decode(rawdata), n_rows_read

                # use the (decoded object type) buffer otherwise
                if enc_lgdo == VectorOfEncodedVectors and not isinstance(
                    obj_buf, VectorOfVectors
                ):
                    raise ValueError(
                        f"obj_buf for decoded '{name}' not a VectorOfVectors"
                    )
                elif enc_lgdo == ArrayOfEncodedEqualSizedArrays and not isinstance(
                    obj_buf, ArrayOfEqualSizedArrays
                ):
                    raise ValueError(
                        f"obj_buf for decoded '{name}' not an ArrayOfEqualSizedArrays"
                    )

                # FIXME: not a good idea. an in place decoding version
                # of decode would be needed to avoid extra memory
                # allocations
                # FIXME: obj_buf_start??? Write a unit test
                for i, wf in enumerate(compress.decode(rawdata)):
                    obj_buf[i] = wf

                return obj_buf, n_rows_read

        # VectorOfVectors
        # read out vector of vectors of different size
        if elements.startswith("array"):
            if obj_buf is not None and not isinstance(obj_buf, VectorOfVectors):
                raise ValueError(f"obj_buf for '{name}' not a LGDO VectorOfVectors")

            # read out cumulative_length
            cumulen_buf = None if obj_buf is None else obj_buf.cumulative_length
            cumulative_length, n_rows_read = self.read_object(
                f"{name}/cumulative_length",
                h5f,
                start_row=start_row,
                n_rows=n_rows,
                idx=idx,
                obj_buf=cumulen_buf,
                obj_buf_start=obj_buf_start,
            )
            # get a view of just what was read out for cleaner code below
            this_cumulen_nda = cumulative_length.nda[
                obj_buf_start : obj_buf_start + n_rows_read
            ]

            if idx is not None and n_rows_read > 0:
                # get the starting indices for each array in flattened data:
                # the starting index for array[i] is cumulative_length[i-1]
                idx2 = (np.asarray(idx[0]).copy() - 1,)
                # re-read cumulative_length with these indices
                # note this will allocate memory for fd_starts!
                fd_start = None
                if idx2[0][0] == -1:
                    idx2 = (idx2[0][1:],)
                    fd_start = 0  # this variable avoids an ndarray append
                fd_starts, fds_n_rows_read = self.read_object(
                    f"{name}/cumulative_length",
                    h5f,
                    start_row=start_row,
                    n_rows=n_rows,
                    idx=idx2,
                )
                fd_starts = fd_starts.nda  # we just need the nda
                if fd_start is None:
                    fd_start = fd_starts[0]

                # compute the length that flattened_data will have after the
                # fancy-indexed read
                fd_n_rows = np.sum(this_cumulen_nda[-len(fd_starts) :] - fd_starts)
                if fd_start == 0:
                    fd_n_rows += this_cumulen_nda[0]

                # now make fd_idx
                fd_idx = np.empty(fd_n_rows, dtype="uint32")
                fd_idx = _make_fd_idx(fd_starts, this_cumulen_nda, fd_idx)

                # Now clean up this_cumulen_nda, to be ready
                # to match the in-memory version of flattened_data. Note: these
                # operations on the view change the original array because they are
                # numpy arrays, not lists.
                this_cumulen_nda[-len(fd_starts) :] -= fd_starts
                np.cumsum(this_cumulen_nda, out=this_cumulen_nda)

            else:
                fd_idx = None

                # determine the start_row and n_rows for the flattened_data readout
                fd_start = 0
                if start_row > 0 and n_rows_read > 0:
                    # need to read out the cumulen sample -before- the first sample
                    # read above in order to get the starting row of the first
                    # vector to read out in flattened_data
                    fd_start = h5f[f"{name}/cumulative_length"][start_row - 1]

                    # check limits for values that will be used subsequently
                    if this_cumulen_nda[-1] < fd_start:
                        log.debug(
                            f"this_cumulen_nda[-1] = {this_cumulen_nda[-1]}, "
                            f"fd_start = {fd_start}, "
                            f"start_row = {start_row}, "
                            f"n_rows_read = {n_rows_read}"
                        )
                        raise RuntimeError(
                            f"cumulative_length non-increasing between entries "
                            f"{start_row} and {start_row+n_rows_read} ??"
                        )

                # determine the number of rows for the flattened_data readout
                fd_n_rows = this_cumulen_nda[-1] if n_rows_read > 0 else 0

                # Now done with this_cumulen_nda, so we can clean it up to be ready
                # to match the in-memory version of flattened_data. Note: these
                # operations on the view change the original array because they are
                # numpy arrays, not lists.
                #
                # First we need to subtract off the in-file offset for the start of
                # read for flattened_data
                this_cumulen_nda -= fd_start

            # If we started with a partially-filled buffer, add the
            # appropriate offset for the start of the in-memory flattened
            # data for this read.
            fd_buf_start = np.uint32(0)
            if obj_buf_start > 0:
                fd_buf_start = cumulative_length.nda[obj_buf_start - 1]
                this_cumulen_nda += fd_buf_start

            # Now prepare the object buffer if necessary
            fd_buf = None
            if obj_buf is not None:
                fd_buf = obj_buf.flattened_data
                # grow fd_buf if necessary to hold the data
                fdb_size = fd_buf_start + fd_n_rows
                if len(fd_buf) < fdb_size:
                    fd_buf.resize(fdb_size)

            # now read
            flattened_data, dummy_rows_read = self.read_object(
                f"{name}/flattened_data",
                h5f,
                start_row=fd_start,
                n_rows=fd_n_rows,
                idx=fd_idx,
                obj_buf=fd_buf,
                obj_buf_start=fd_buf_start,
            )
            if obj_buf is not None:
                return obj_buf, n_rows_read
            return (
                VectorOfVectors(
                    flattened_data=flattened_data,
                    cumulative_length=cumulative_length,
                    attrs=h5f[name].attrs,
                ),
                n_rows_read,
            )

        # Array
        # FixedSizeArray
        # ArrayOfEqualSizedArrays
        # read out all arrays by slicing
        if "array" in datatype:
            if obj_buf is not None:
                if not isinstance(obj_buf, Array):
                    raise ValueError(f"obj_buf for '{name}' not an LGDO Array")
                    obj_buf = None

            # compute the number of rows to read
            # we culled idx above for start_row and n_rows, now we have to apply
            # the constraint of the length of the dataset
            ds_n_rows = h5f[name].shape[0]
            if idx is not None:
                if len(idx[0]) > 0 and idx[0][-1] >= ds_n_rows:
                    log.warning(
                        "idx indexed past the end of the array in the file. Culling..."
                    )
                    n_rows_to_read = bisect_left(idx[0], ds_n_rows)
                    idx = (idx[0][:n_rows_to_read],)
                    if len(idx[0]) == 0:
                        log.warning("idx empty after culling.")
                n_rows_to_read = len(idx[0])
            else:
                n_rows_to_read = ds_n_rows - start_row
            if n_rows_to_read > n_rows:
                n_rows_to_read = n_rows

            # prepare the selection for the read. Use idx if available
            if idx is not None:
                source_sel = idx
            else:
                source_sel = np.s_[start_row : start_row + n_rows_to_read]

            # Now read the array
            if obj_buf is not None and n_rows_to_read > 0:
                buf_size = obj_buf_start + n_rows_to_read
                if len(obj_buf) < buf_size:
                    obj_buf.resize(buf_size)
                dest_sel = np.s_[obj_buf_start:buf_size]
                h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
                nda = obj_buf.nda
            else:
                if n_rows == 0:
                    tmp_shape = (0,) + h5f[name].shape[1:]
                    nda = np.empty(tmp_shape, h5f[name].dtype)
                else:
                    nda = h5f[name][source_sel]

            # special handling for bools
            # (c and Julia store as uint8 so cast to bool)
            if elements == "bool":
                nda = nda.astype(np.bool_)

            # Finally, set attributes and return objects
            attrs = h5f[name].attrs
            if obj_buf is None:
                if datatype == "array":
                    return Array(nda=nda, attrs=attrs), n_rows_to_read
                if datatype == "fixedsize_array":
                    return FixedSizeArray(nda=nda, attrs=attrs), n_rows_to_read
                if datatype == "array_of_equalsized_arrays":
                    return (
                        ArrayOfEqualSizedArrays(nda=nda, dims=shape, attrs=attrs),
                        n_rows_to_read,
                    )
            else:
                if set(obj_buf.attrs.keys()) != set(attrs.keys()):
                    raise RuntimeError(
                        f"attrs mismatch. "
                        f"obj_buf.attrs: {obj_buf.attrs}, "
                        f"h5f[{name}].attrs: {attrs}"
                    )
                return obj_buf, n_rows_to_read

        raise RuntimeError(f"don't know how to read datatype {datatype}")

    def write_object(
        self,
        obj: LGDO,
        name: str,
        lh5_file: str | h5py.File,
        group: str | h5py.Group = "/",
        start_row: int = 0,
        n_rows: int = None,
        wo_mode: str = "append",
        write_start: int = 0,
        hdf5_compression: str | h5py.filters.FilterRefBase = DEFAULT_HDF5_COMPRESSION,
    ) -> None:
        """Write an LGDO into an LH5 file.

        If the `obj` :class:`.LGDO` has a `compression` attribute, its value is
        interpreted as the algorithm to be used to compress `obj` before
        writing to disk. The type of `compression` can be:

        string, kwargs dictionary, hdf5plugin filter
            interpreted as the name of a built-in or custom `HDF5 compression
            filter <https://docs.h5py.org/en/stable/high/dataset.html#filter-pipeline>`_
            (``"gzip"``, ``"lzf"``, :mod:`hdf5plugin` filter object etc.) and
            passed directly to :meth:`h5py.Group.create_dataset`.

        :class:`.WaveformCodec` object
            If `obj` is a :class:`.WaveformTable`, compress its `values` using
            this algorithm. More documentation about the supported waveform
            compression algorithms at :mod:`.lgdo.compression`.

        Note
        ----
        The `compression` attribute takes precedence over the
        `hdf5_compression` argument and is not written to disk.

        Note
        ----
        HDF5 compression is skipped for the `encoded_data` dataset of
        :class:`.VectorOfEncodedVectors` and
        :class:`.ArrayOfEncodedEqualSizedArrays`.

        Parameters
        ----------
        obj
            LH5 object. If object is array-like, writes `n_rows` starting from
            `start_row` in `obj`.
        name
            name of the object in the output HDF5 file.
        lh5_file
            HDF5 file name or :class:`h5py.File` object.
        group
            HDF5 group name or :class:`h5py.Group` object in which `obj` should
            be written.
        start_row
            first row in `obj` to be written.
        n_rows
            number of rows in `obj` to be written.
        wo_mode
            - ``write_safe`` or ``w``: only proceed with writing if the
              object does not already exist in the file.
            - ``append`` or ``a``: append along axis 0 (the first dimension)
              of array-like objects and array-like subfields of structs.
              :class:`~.lgdo.scalar.Scalar` objects get overwritten.
            - ``overwrite`` or ``o``: replace data in the file if present,
              starting from `write_start`. Note: overwriting with `write_start` =
              end of array is the same as ``append``.
            - ``overwrite_file`` or ``of``: delete file if present prior to
              writing to it. `write_start` should be 0 (it's ignored).
            - ``append_column`` or ``ac``: append columns from an :class:`~.lgdo.table.Table`
              `obj` only if there is an existing :class:`~.lgdo.table.Table` in the `lh5_file` with
              the same `name` and :class:`~.lgdo.table.Table.size`. If the sizes don't match,
              or if there are matching fields, it errors out.
        write_start
            row in the output file (if already existing) to start overwriting
            from.
        hdf5_compression
            HDF5 compression filter to be applied before writing non-scalar
            datasets. **Ignored if compression is specified as an `obj`
            attribute.**
        """
        log.debug(
            f"writing {repr(obj)}[{start_row}:{n_rows}] as "
            f"{lh5_file}:{group}/{name}[{write_start}:], "
            f"mode = {wo_mode}, hdf5_compression = {hdf5_compression}"
        )

        if wo_mode == "write_safe":
            wo_mode = "w"
        if wo_mode == "append":
            wo_mode = "a"
        if wo_mode == "overwrite":
            wo_mode = "o"
        if wo_mode == "overwrite_file":
            wo_mode = "of"
            write_start = 0
        if wo_mode == "append_column":
            wo_mode = "ac"
        if wo_mode not in ["w", "a", "o", "of", "ac"]:
            raise ValueError(f"unknown wo_mode '{wo_mode}'")

        # "mode" is for the h5py.File and wo_mode is for this function
        # In hdf5, 'a' is really "modify" -- in addition to appending, you can
        # change any object in the file. So we use file:append for
        # write_object:overwrite.
        mode = "w" if wo_mode == "of" else "a"
        lh5_file = self.gimme_file(lh5_file, mode=mode)
        group = self.gimme_group(group, lh5_file)
        if wo_mode == "w" and name in group:
            raise RuntimeError(f"can't overwrite '{name}' in wo_mode 'write_safe'")

        # struct or table or waveform table
        if isinstance(obj, Struct):
            # In order to append a column, we need to update the `table{old_fields}` value in `group.attrs["datatype"]` to include the new fields.
            # One way to do this is to override `obj.attrs["datatype"]` to include old and new fields. Then we can write the fields to the table as normal.
            if wo_mode == "ac":
                old_group = self.gimme_group(name, group)
                datatype, shape, fields = parse_datatype(old_group.attrs["datatype"])
                if datatype not in ["table", "struct"]:
                    raise RuntimeError(
                        f"Trying to append columns to an object of type {datatype}"
                    )

                # If the mode is `append_column`, make sure we aren't appending a table that has a column of the same name as in the existing table
                # Also make sure that the field we are adding has the same size
                if len(list(set(fields).intersection(set(obj.keys())))) != 0:
                    raise ValueError(
                        f"Can't append {list(set(fields).intersection(set(obj.keys())))} column(s) to a table with the same field(s)"
                    )
                # It doesn't matter what key we access, as all fields in the old table have the same size
                if old_group[list(old_group.keys())[0]].size != obj.size:
                    raise ValueError(
                        f"Table sizes don't match. Trying to append column of size {obj.size} to a table of size {old_group[list(old_group.keys())[0]].size}."
                    )

                # Now we can append the obj.keys() to the old fields, and then update obj.attrs.
                fields.extend(list(obj.keys()))
                obj.attrs.pop("datatype")
                obj.attrs["datatype"] = "table" + "{" + ",".join(fields) + "}"

            group = self.gimme_group(
                name,
                group,
                grp_attrs=obj.attrs,
                overwrite=(wo_mode in ["o", "ac"]),
            )
            # If the mode is overwrite, then we need to peek into the file's table's existing fields
            # If we are writing a new table to the group that does not contain an old field, we should delete that old field from the file
            if wo_mode == "o":
                # Find the old keys in the group that are not present in the new table's keys, then delete them
                for key in list(set(group.keys()) - set(obj.keys())):
                    log.debug(f"{key} is not present in new table, deleting field")
                    del group[key]

            for field in obj.keys():
                # eventually compress waveform table values with LGDO's
                # custom codecs before writing
                # if waveformtable.values.attrs["compression"] is a string,
                # interpret it as an HDF5 built-in filter
                obj_fld = None
                if (
                    isinstance(obj, WaveformTable)
                    and field == "values"
                    and not isinstance(obj.values, VectorOfEncodedVectors)
                    and not isinstance(obj.values, ArrayOfEncodedEqualSizedArrays)
                    and "compression" in obj.values.attrs
                    and isinstance(obj.values.attrs["compression"], WaveformCodec)
                ):
                    codec = obj.values.attrs["compression"]
                    obj_fld = compress.encode(obj.values, codec=codec)
                else:
                    obj_fld = obj[field]

                # Convert keys to string for dataset names
                f = str(field)
                self.write_object(
                    obj_fld,
                    f,
                    lh5_file,
                    group=group,
                    start_row=start_row,
                    n_rows=n_rows,
                    wo_mode=wo_mode,
                    write_start=write_start,
                    hdf5_compression=hdf5_compression,
                )
            return

        # scalars
        elif isinstance(obj, Scalar):
            if name in group:
                if wo_mode in ["o", "a"]:
                    log.debug(f"overwriting {name} in {group}")
                    del group[name]
                else:
                    raise RuntimeError(
                        f"tried to overwrite {name} in {group} for wo_mode {wo_mode}"
                    )
            ds = group.create_dataset(name, shape=(), data=obj.value)
            ds.attrs.update(obj.attrs)
            return

        # vector of encoded vectors
        elif isinstance(obj, (VectorOfEncodedVectors, ArrayOfEncodedEqualSizedArrays)):
            group = self.gimme_group(
                name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
            )

            self.write_object(
                obj.encoded_data,
                "encoded_data",
                lh5_file,
                group=group,
                start_row=start_row,
                n_rows=n_rows,
                wo_mode=wo_mode,
                write_start=write_start,
                hdf5_compression=None,  # data is already compressed!
            )

            self.write_object(
                obj.decoded_size,
                "decoded_size",
                lh5_file,
                group=group,
                start_row=start_row,
                n_rows=n_rows,
                wo_mode=wo_mode,
                write_start=write_start,
                hdf5_compression=hdf5_compression,
            )

        # vector of vectors
        elif isinstance(obj, VectorOfVectors):
            group = self.gimme_group(
                name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
            )
            if (
                n_rows is None
                or n_rows > obj.cumulative_length.nda.shape[0] - start_row
            ):
                n_rows = obj.cumulative_length.nda.shape[0] - start_row

            # if appending we need to add an appropriate offset to the
            # cumulative lengths as appropriate for the in-file object
            offset = 0  # declare here because we have to subtract it off at the end
            if (wo_mode == "a" or wo_mode == "o") and "cumulative_length" in group:
                len_cl = len(group["cumulative_length"])
                if wo_mode == "a":
                    write_start = len_cl
                if len_cl > 0:
                    offset = group["cumulative_length"][write_start - 1]

            # First write flattened_data array. Only write rows with data.
            fd_start = 0 if start_row == 0 else obj.cumulative_length.nda[start_row - 1]
            fd_n_rows = obj.cumulative_length.nda[start_row + n_rows - 1] - fd_start
            self.write_object(
                obj.flattened_data,
                "flattened_data",
                lh5_file,
                group=group,
                start_row=fd_start,
                n_rows=fd_n_rows,
                wo_mode=wo_mode,
                write_start=offset,
                hdf5_compression=hdf5_compression,
            )

            # now offset is used to give appropriate in-file values for
            # cumulative_length. Need to adjust it for start_row
            if start_row > 0:
                offset -= obj.cumulative_length.nda[start_row - 1]

            # Add offset to obj.cumulative_length itself to avoid memory allocation.
            # Then subtract it off after writing! (otherwise it will be changed
            # upon return)
            cl_dtype = obj.cumulative_length.nda.dtype.type
            obj.cumulative_length.nda += cl_dtype(offset)

            self.write_object(
                obj.cumulative_length,
                "cumulative_length",
                lh5_file,
                group=group,
                start_row=start_row,
                n_rows=n_rows,
                wo_mode=wo_mode,
                write_start=write_start,
                hdf5_compression=hdf5_compression,
            )
            obj.cumulative_length.nda -= cl_dtype(offset)

            return

        # if we get this far, must be one of the Array types
        elif isinstance(obj, Array):
            if n_rows is None or n_rows > obj.nda.shape[0] - start_row:
                n_rows = obj.nda.shape[0] - start_row

            nda = obj.nda[start_row : start_row + n_rows]

            # hack to store bools as uint8 for c / Julia compliance
            if nda.dtype.name == "bool":
                nda = nda.astype(np.uint8)

            # need to create dataset from ndarray the first time for speed
            # creating an empty dataset and appending to that is super slow!
            if (wo_mode != "a" and write_start == 0) or name not in group:
                maxshape = (None,) + nda.shape[1:]
                if wo_mode == "o" and name in group:
                    log.debug(f"overwriting {name} in {group}")
                    del group[name]

                # create HDF5 dataset
                # - compress using the 'compression' LGDO attribute, if
                #   available
                # - otherwise use "hdf5_compression"
                # - attach HDF5 dataset attributes, but not "compression"!
                comp_algo = obj.attrs.get("compression", hdf5_compression)
                comp_kwargs = {}
                if isinstance(comp_algo, str):
                    comp_kwargs = {"compression": comp_algo}
                elif comp_algo is not None:
                    comp_kwargs = comp_algo

                ds = group.create_dataset(
                    name, data=nda, maxshape=maxshape, **comp_kwargs
                )

                _attrs = obj.getattrs(datatype=True)
                _attrs.pop("compression", None)
                ds.attrs.update(_attrs)
                return

            # Now append or overwrite
            ds = group[name]
            if not isinstance(ds, h5py.Dataset):
                raise RuntimeError(
                    f"existing HDF5 object '{name}' in group '{group}'"
                    " is not a dataset! Cannot overwrite or append"
                )

            old_len = ds.shape[0]
            if wo_mode == "a":
                write_start = old_len
            add_len = write_start + nda.shape[0] - old_len
            ds.resize(old_len + add_len, axis=0)
            ds[write_start:] = nda
            return

        else:
            raise RuntimeError(
                f"do not know how to write '{name}' of type '{type(obj).__name__}'"
            )

    def read_n_rows(self, name: str, lh5_file: str | h5py.File) -> int | None:
        """Look up the number of rows in an Array-like object called `name` in
        `lh5_file`.

        Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`."""
        # this is basically a stripped down version of read_object
        h5f = self.gimme_file(lh5_file, "r")
        if not h5f or name not in h5f:
            raise KeyError(f"'{name}' not in {lh5_file}")

        # get the datatype
        if "datatype" not in h5f[name].attrs:
            raise RuntimeError(
                f"'{name}' in file {lh5_file} is missing the datatype attribute"
            )

        datatype = h5f[name].attrs["datatype"]
        datatype, shape, elements = parse_datatype(datatype)

        # scalars are dim-0 datasets
        if datatype == "scalar":
            return None

        # structs don't have rows
        if datatype == "struct":
            return None

        # tables should have elements with all the same length
        if datatype == "table":
            # read out each of the fields
            rows_read = None
            for field in elements:
                n_rows_read = self.read_n_rows(name + "/" + field, h5f)
                if not rows_read:
                    rows_read = n_rows_read
                elif rows_read != n_rows_read:
                    log.warning(
                        f"'{field}' field in table '{name}' has {rows_read} rows, "
                        f"{n_rows_read} was expected"
                    )
            return rows_read

        # length of vector of vectors is the length of its cumulative_length
        if elements.startswith("array"):
            return self.read_n_rows(f"{name}/cumulative_length", h5f)

        # length of vector of encoded vectors is the length of its decoded_size
        if (
            elements.startswith("encoded_array")
            or datatype == "array_of_encoded_equalsized_arrays"
        ):
            return self.read_n_rows(f"{name}/encoded_data", h5f)

        # return array length (without reading the array!)
        if "array" in datatype:
            # compute the number of rows to read
            return h5f[name].shape[0]

        raise RuntimeError(f"don't know how to read datatype '{datatype}'")


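# A minimal usage sketch of the LH5Store API defined above, kept as comments so
# it is not executed on import and does not alter the module behavior. The file
# names and object paths ("file.lh5", "out.lh5", "geds/raw", "energy",
# "timestamp") are hypothetical placeholders.
#
#   store = LH5Store(keep_open=True)
#   # read only two columns of a table (see the field_mask docs of read_object)
#   tbl, n = store.read_object(
#       "geds/raw", "file.lh5", field_mask=["energy", "timestamp"]
#   )
#   # chunked reading into a reusable, pre-allocated buffer
#   buf = store.get_buffer("geds/raw", "file.lh5", size=1000)
#   tbl, n = store.read_object("geds/raw", "file.lh5", n_rows=1000, obj_buf=buf)
#   # append what was read to another file
#   store.write_object(tbl, "raw", "out.lh5", group="/geds", n_rows=n, wo_mode="append")

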
def ls(lh5_file: str | h5py.Group, lh5_group: str = "") -> list[str]:
    """Return a list of LH5 groups in the input file and group, similar
    to ``ls`` or ``h5ls``. Supports wildcards in group names.

    Parameters
    ----------
    lh5_file
        name of file.
    lh5_group
        group to search. Add a ``/`` to the end of the group name if you want to
        list all objects inside that group.
    """

    log.debug(
        f"Listing objects in '{lh5_file}'"
        + ("" if lh5_group == "" else f" (and group {lh5_group})")
    )

    lh5_st = LH5Store()
    # To use recursively, make lh5_file a h5group instead of a string
    if isinstance(lh5_file, str):
        lh5_file = lh5_st.gimme_file(lh5_file, "r")
        if lh5_group.startswith("/"):
            lh5_group = lh5_group[1:]

    if lh5_group == "":
        lh5_group = "*"

    splitpath = lh5_group.split("/", 1)
    matchingkeys = fnmatch.filter(lh5_file.keys(), splitpath[0])

    if len(splitpath) == 1:
        return matchingkeys
    else:
        ret = []
        for key in matchingkeys:
            ret.extend([f"{key}/{path}" for path in ls(lh5_file[key], splitpath[1])])
        return ret


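# Sketch (comments only): ls() accepts fnmatch-style wildcards in group names.
# The file name "file.lh5" and the group names below are hypothetical
# placeholders.
#
#   ls("file.lh5")                    # top-level groups, e.g. ["geds", "spms"]
#   ls("file.lh5", "geds/")           # everything directly inside "geds"
#   ls("file.lh5", "geds/raw/wave*")  # wildcard match inside "geds/raw"

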
def show(
    lh5_file: str | h5py.Group,
    lh5_group: str = "/",
    attrs: bool = False,
    indent: str = "",
    header: bool = True,
) -> None:
    """Print a tree of LH5 file contents with LGDO datatype.

    Parameters
    ----------
    lh5_file
        the LH5 file.
    lh5_group
        print only contents of this HDF5 group.
    attrs
        print the HDF5 attributes too.
    indent
        indent the diagram with this string.
    header
        print `lh5_group` at the top of the diagram.

    Examples
    --------
    >>> from lgdo import show
    >>> show("file.lh5", "/geds/raw")
    /geds/raw
    ├── channel · array<1>{real}
    ├── energy · array<1>{real}
    ├── timestamp · array<1>{real}
    ├── waveform · table{t0,dt,values}
    │   ├── dt · array<1>{real}
    │   ├── t0 · array<1>{real}
    │   └── values · array_of_equalsized_arrays<1,1>{real}
    └── wf_std · array<1>{real}
    """
    # open file
    if isinstance(lh5_file, str):
        lh5_file = h5py.File(expand_path(lh5_file), "r")

    # go to group
    if lh5_group != "/":
        lh5_file = lh5_file[lh5_group]

    if header:
        print(f"\033[1m{lh5_group}\033[0m")  # noqa: T201

    # get an iterator over the keys in the group
    it = iter(lh5_file)
    key = None

    # make sure there is actually something in this file/group
    try:
        key = next(it)  # get first key
    except StopIteration:
        print(f"{indent}└── empty")  # noqa: T201
        return

    # loop over keys
    while True:
        val = lh5_file[key]
        # we want to print the LGDO datatype
        dtype = val.attrs.get("datatype", default="no datatype")
        if dtype == "no datatype" and isinstance(val, h5py.Group):
            dtype = "HDF5 group"

        attrs_d = dict(val.attrs)
        attrs_d.pop("datatype", "")
        attrs = "── " + str(attrs_d) if attrs_d else ""

        # is this the last key?
        killme = False
        try:
            k_new = next(it)  # get next key
        except StopIteration:
            char = "└──"
            killme = True  # we'll have to kill this loop later
        else:
            char = "├──"

        print(f"{indent}{char} \033[1m{key}\033[0m · {dtype} {attrs}")  # noqa: T201

        # if it's a group, call this function recursively
        if isinstance(val, h5py.Group):
            show(val, indent=indent + ("    " if killme else "│   "), header=False)

        # break or move to next key
        if killme:
            break
        else:
            key = k_new


def load_nda(
    f_list: str | list[str],
    par_list: list[str],
    lh5_group: str = "",
    idx_list: list[np.ndarray | list | tuple] = None,
) -> dict[str, np.ndarray]:
    r"""Build a dictionary of :class:`numpy.ndarray`\ s from LH5 data.

    Given a list of files, a list of LH5 table parameters, and an optional
    group path, return a NumPy array with all values for each parameter.

    Parameters
    ----------
    f_list
        A list of files. Can contain wildcards.
    par_list
        A list of parameters to read from each file.
    lh5_group
        group path within which to find the specified parameters.
    idx_list
        for fancy-indexed reads. Must be one index array for each file in
        `f_list`.

    Returns
    -------
    par_data
        A dictionary of the parameter data keyed by the elements of `par_list`.
        Each entry contains the data for the specified parameter concatenated
        over all files in `f_list`.
    """
    if isinstance(f_list, str):
        f_list = [f_list]
        if idx_list is not None:
            idx_list = [idx_list]
    if idx_list is not None and len(f_list) != len(idx_list):
        raise ValueError(
            f"f_list length ({len(f_list)}) != idx_list length ({len(idx_list)})!"
        )

    # Expand wildcards
    f_list = [f for f_wc in f_list for f in sorted(glob.glob(os.path.expandvars(f_wc)))]

    sto = LH5Store()
    par_data = {par: [] for par in par_list}
    for ii, f in enumerate(f_list):
        f = sto.gimme_file(f, "r")
        for par in par_list:
            if f"{lh5_group}/{par}" not in f:
                raise RuntimeError(f"'{lh5_group}/{par}' not in file {f_list[ii]}")

            if idx_list is None:
                data, _ = sto.read_object(f"{lh5_group}/{par}", f)
            else:
                data, _ = sto.read_object(f"{lh5_group}/{par}", f, idx=idx_list[ii])
            if not data:
                continue
            par_data[par].append(data.nda)
    par_data = {par: np.concatenate(par_data[par]) for par in par_list}
    return par_data


def load_dfs(
    f_list: str | list[str],
    par_list: list[str],
    lh5_group: str = "",
    idx_list: list[np.ndarray | list | tuple] = None,
) -> pd.DataFrame:
    """Build a :class:`pandas.DataFrame` from LH5 data.

    Given a list of files (can use wildcards), a list of LH5 columns, and
    optionally the group path, return a :class:`pandas.DataFrame` with all
    values for each parameter.

    See Also
    --------
    :func:`load_nda`

    Returns
    -------
    dataframe
        contains columns for each parameter in `par_list`, and rows containing
        all data for the associated parameters concatenated over all files in
        `f_list`.
    """
    return pd.DataFrame(
        load_nda(f_list, par_list, lh5_group=lh5_group, idx_list=idx_list)
    )


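# Sketch (comments only): pulling a few columns out of many files with the
# convenience functions above. The file pattern, group and parameter names are
# hypothetical placeholders.
#
#   arrs = load_nda("run0*.lh5", ["energy", "timestamp"], lh5_group="geds/raw")
#   arrs["energy"]   # numpy.ndarray concatenated over all matched files
#
#   df = load_dfs("run0*.lh5", ["energy", "timestamp"], lh5_group="geds/raw")
#   df.describe()    # one pandas.DataFrame column per requested parameter

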
class LH5Iterator(Iterator):
|
1407
|
+
"""
|
1408
|
+
A class for iterating through one or more LH5 files, one block of entries
|
1409
|
+
at a time. This also accepts an entry list/mask to enable event selection,
|
1410
|
+
and a field mask.
|
1411
|
+
|
1412
|
+
This class can be used either for random access:
|
1413
|
+
|
1414
|
+
>>> lh5_obj, n_rows = lh5_it.read(entry)
|
1415
|
+
|
1416
|
+
to read the block of entries starting at entry. In case of multiple files
|
1417
|
+
or the use of an event selection, entry refers to a global event index
|
1418
|
+
across files and does not count events that are excluded by the selection.
|
1419
|
+
|
1420
|
+
This can also be used as an iterator:
|
1421
|
+
|
1422
|
+
>>> for lh5_obj, entry, n_rows in LH5Iterator(...):
|
1423
|
+
>>> # do the thing!
|
1424
|
+
|
1425
|
+
This is intended for if you are reading a large quantity of data but
|
1426
|
+
want to limit your memory usage (particularly when reading in waveforms!).
|
1427
|
+
The ``lh5_obj`` that is read by this class is reused in order to avoid
|
1428
|
+
reallocation of memory; this means that if you want to hold on to data
|
1429
|
+
between reads, you will have to copy it somewhere!
|
1430
|
+
"""

    def __init__(
        self,
        lh5_files: str | list[str],
        groups: str | list[str],
        base_path: str = "",
        entry_list: list[int] | list[list[int]] = None,
        entry_mask: list[bool] | list[list[bool]] = None,
        field_mask: dict[str, bool] | list[str] | tuple[str] = None,
        buffer_len: int = 3200,
        friend: LH5Iterator = None,
    ) -> None:
        """
        Parameters
        ----------
        lh5_files
            file or files to read from. May include wildcards and environment
            variables.
        groups
            HDF5 group(s) to read. If a list is provided for both `lh5_files`
            and `groups`, they must be the same size. If a file is
            wild-carded, the same group will be assigned to each file found.
        entry_list
            list of entry numbers to read. If a nested list is provided,
            expect one top-level list for each file, containing a list of
            local entries. If a list of ints is provided, use global entries.
        entry_mask
            mask of entries to read. If a list of arrays is provided, expect
            one for each file. Ignored if `entry_list` is provided.
        field_mask
            mask of which fields to read. See :meth:`LH5Store.read_object` for
            more details.
        buffer_len
            number of entries to read at a time while iterating through files.
        friend
            a "friend" LH5Iterator that will be read in parallel with this
            one. The friend should have the same length and entry list. A
            single LH5 table containing columns from both iterators will be
            returned.
        """
        self.lh5_st = LH5Store(base_path=base_path, keep_open=True)

        # List of files, with wildcards and env vars expanded
        if isinstance(lh5_files, str):
            lh5_files = [lh5_files]
            if isinstance(groups, list):
                lh5_files *= len(groups)
        elif not isinstance(lh5_files, list):
            raise ValueError("lh5_files must be a string or list of strings")

        if isinstance(groups, str):
            groups = [groups] * len(lh5_files)
        elif not isinstance(groups, list):
            raise ValueError("group must be a string or list of strings")

        if not len(groups) == len(lh5_files):
            raise ValueError("lh5_files and groups must have same length")

        self.lh5_files = []
        self.groups = []
        for f, g in zip(lh5_files, groups):
            f_exp = expand_path(f, list=True, base_path=base_path)
            self.lh5_files += f_exp
            self.groups += [g] * len(f_exp)

        if entry_list is not None and entry_mask is not None:
            raise ValueError(
                "entry_list and entry_mask arguments are mutually exclusive"
            )

        # Map to last row in each file
        self.file_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
        # Map to last iterator entry for each file
        self.entry_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
        self.buffer_len = buffer_len

        if len(self.lh5_files) > 0:
            f = self.lh5_files[0]
            g = self.groups[0]
            self.lh5_buffer = self.lh5_st.get_buffer(
                g,
                f,
                size=self.buffer_len,
                field_mask=field_mask,
            )
            self.file_map[0] = self.lh5_st.read_n_rows(g, f)
        else:
            raise RuntimeError(f"can't open any files from {lh5_files}")

        self.n_rows = 0
        self.current_entry = 0
        self.next_entry = 0

        self.field_mask = field_mask

        # List of entry indices from each file
        self.local_entry_list = None
        self.global_entry_list = None
        if entry_list is not None:
            entry_list = list(entry_list)
            if isinstance(entry_list[0], int):
                self.local_entry_list = [None] * len(self.file_map)
                self.global_entry_list = np.array(entry_list, "i")
                self.global_entry_list.sort()

            else:
                self.local_entry_list = [[]] * len(self.file_map)
                for i_file, local_list in enumerate(entry_list):
                    self.local_entry_list[i_file] = np.array(local_list, "i")
                    self.local_entry_list[i_file].sort()

        elif entry_mask is not None:
            # Convert entry mask into an entry list
            if isinstance(entry_mask, pd.Series):
                entry_mask = entry_mask.values
            if isinstance(entry_mask, np.ndarray):
                self.local_entry_list = [None] * len(self.file_map)
                self.global_entry_list = np.nonzero(entry_mask)[0]
            else:
                self.local_entry_list = [[]] * len(self.file_map)
                for i_file, local_mask in enumerate(entry_mask):
                    self.local_entry_list[i_file] = np.nonzero(local_mask)[0]

        # Attach the friend
        if friend is not None:
            if not isinstance(friend, LH5Iterator):
                raise ValueError("Friend must be an LH5Iterator")
            self.lh5_buffer.join(friend.lh5_buffer)
        self.friend = friend

    def _get_file_cumlen(self, i_file: int) -> int:
        """Helper to get cumulative file length of file"""
        if i_file < 0:
            return 0
        fcl = self.file_map[i_file]
        if fcl == np.iinfo("i").max:
            fcl = self._get_file_cumlen(i_file - 1) + self.lh5_st.read_n_rows(
                self.groups[i_file], self.lh5_files[i_file]
            )
            self.file_map[i_file] = fcl
        return fcl

    def _get_file_cumentries(self, i_file: int) -> int:
        """Helper to get cumulative iterator entries in file"""
        if i_file < 0:
            return 0
        n = self.entry_map[i_file]
        if n == np.iinfo("i").max:
            elist = self.get_file_entrylist(i_file)
            fcl = self._get_file_cumlen(i_file)
            if elist is None:
                # no entry list provided
                n = fcl
            else:
                file_entries = self.get_file_entrylist(i_file)
                # check that file entries fall inside of file
                if file_entries[-1] >= fcl:
                    logging.warning(f"Found entries out of range for file {i_file}")
                    n = np.searchsorted(file_entries, fcl, "right")
                else:
                    n = len(file_entries)
            n += self._get_file_cumentries(i_file - 1)
            self.entry_map[i_file] = n
        return n

    def get_file_entrylist(self, i_file: int) -> np.ndarray:
        """Helper to get entry list for file"""
        # If no entry list is provided
        if self.local_entry_list is None:
            return None

        elist = self.local_entry_list[i_file]
        if elist is None:
            # Get local entrylist for this file from global entry list
            f_start = self._get_file_cumlen(i_file - 1)
            f_end = self._get_file_cumlen(i_file)
            i_start = self._get_file_cumentries(i_file - 1)
            i_stop = np.searchsorted(self.global_entry_list, f_end, "right")
            elist = np.array(self.global_entry_list[i_start:i_stop], "i") - f_start
            self.local_entry_list[i_file] = elist
        return elist

    def get_global_entrylist(self) -> np.ndarray:
        """Get global entry list, constructing it if needed"""
        if self.global_entry_list is None and self.local_entry_list is not None:
            self.global_entry_list = np.zeros(len(self), "i")
            for i_file in range(len(self.lh5_files)):
                i_start = self._get_file_cumentries(i_file - 1)
                i_stop = self._get_file_cumentries(i_file)
                f_start = self._get_file_cumlen(i_file - 1)
                self.global_entry_list[i_start:i_stop] = (
                    self.get_file_entrylist(i_file) + f_start
                )
        return self.global_entry_list

    def read(self, entry: int) -> tuple[LGDO, int]:
        """Read the next local chunk of events, starting at `entry`. Return
        the LH5 buffer and number of rows read."""
        self.n_rows = 0
        i_file = np.searchsorted(self.entry_map, entry, "right")

        # if file hasn't been opened yet, search through files
        # sequentially until we find the right one
        if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("i").max:
            while i_file < len(self.lh5_files) and entry >= self._get_file_cumentries(
                i_file
            ):
                i_file += 1

        if i_file == len(self.lh5_files):
            return (self.lh5_buffer, self.n_rows)
        local_entry = entry - self._get_file_cumentries(i_file - 1)

        while self.n_rows < self.buffer_len and i_file < len(self.file_map):
            # Loop through files
            local_idx = self.get_file_entrylist(i_file)
            i_local = local_idx[local_entry] if local_idx is not None else local_entry
            self.lh5_buffer, n_rows = self.lh5_st.read_object(
                self.groups[i_file],
                self.lh5_files[i_file],
                start_row=i_local,
                n_rows=self.buffer_len - self.n_rows,
                idx=local_idx,
                field_mask=self.field_mask,
                obj_buf=self.lh5_buffer,
                obj_buf_start=self.n_rows,
            )

            self.n_rows += n_rows
            i_file += 1
            local_entry = 0

        self.current_entry = entry

        if self.friend is not None:
            self.friend.read(entry)

        return (self.lh5_buffer, self.n_rows)

    def reset_field_mask(self, mask):
        """Replaces the field mask of this iterator and any friends with mask"""
        self.field_mask = mask
        if self.friend is not None:
            self.friend.reset_field_mask(mask)

    def __len__(self) -> int:
        """Return the total number of entries."""
        return (
            self._get_file_cumentries(len(self.lh5_files) - 1)
            if len(self.entry_map) > 0
            else 0
        )

    def __iter__(self) -> Iterator:
        """Loop through entries in blocks of size buffer_len."""
        self.current_entry = 0
        self.next_entry = 0
        return self

    def __next__(self) -> tuple[LGDO, int, int]:
        """Read next buffer_len entries and return lh5_table, iterator entry
        and n_rows read."""
        buf, n_rows = self.read(self.next_entry)
        self.next_entry = self.current_entry + n_rows
        if n_rows == 0:
            raise StopIteration
        return (buf, self.current_entry, n_rows)
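A hedged usage sketch for `LH5Iterator` (the files, group, and field names below are hypothetical). Because the returned buffer is reused between reads, anything that should survive the next iteration has to be copied out:

# illustration only -- files, group and fields are made up
import numpy as np
from lgdo.lh5_store import LH5Iterator

it = LH5Iterator(
    "raw_run0*.lh5",
    "geds/raw",
    field_mask=["energy", "timestamp"],
    buffer_len=1000,
)

energies = []
for lh5_obj, entry, n_rows in it:
    # copy out of the reused buffer before the next read overwrites it
    energies.append(np.array(lh5_obj["energy"].nda[:n_rows]))
energies = np.concatenate(energies)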


# Build a flattened fancy-index array from a set of [start, stop) row ranges.
# If `starts` is one element shorter than `stops`, the first range implicitly
# starts at row 0. `idx` must be preallocated to the total number of rows.
@nb.njit(parallel=False, fastmath=True)
def _make_fd_idx(starts, stops, idx):
    k = 0
    if len(starts) < len(stops):
        for i in range(stops[0]):
            idx[k] = i
            k += 1
        stops = stops[1:]
    for j in range(len(starts)):
        for i in range(starts[j], stops[j]):
            idx[k] = i
            k += 1
    return (idx,)
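Finally, an illustration (not part of the file) of what the internal JIT-compiled helper `_make_fd_idx` computes; running it requires numba, since the function is compiled on first call:

# illustration only -- flatten the ranges [2, 4) and [7, 9) into one index array
import numpy as np
from lgdo.lh5_store import _make_fd_idx

starts = np.array([2, 7])
stops = np.array([4, 9])
idx = np.empty((stops - starts).sum(), dtype=np.int64)
_make_fd_idx(starts, stops, idx)
print(idx)  # [2 3 7 8]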