legend-pydataobj 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lgdo/lh5_store.py ADDED
@@ -0,0 +1,1711 @@
1
+ """
2
+ This module implements routines for reading and writing LEGEND Data Objects in
3
+ HDF5 files.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import fnmatch
8
+ import glob
9
+ import logging
10
+ import os
11
+ import sys
12
+ from bisect import bisect_left
13
+ from collections import defaultdict
14
+ from typing import Any, Iterator, Union
15
+
16
+ import h5py
17
+ import numba as nb
18
+ import numpy as np
19
+ import pandas as pd
20
+
21
+ from . import compression as compress
22
+ from .compression import WaveformCodec
23
+ from .lgdo_utils import expand_path, parse_datatype
24
+ from .types import (
25
+ Array,
26
+ ArrayOfEncodedEqualSizedArrays,
27
+ ArrayOfEqualSizedArrays,
28
+ FixedSizeArray,
29
+ Scalar,
30
+ Struct,
31
+ Table,
32
+ VectorOfEncodedVectors,
33
+ VectorOfVectors,
34
+ WaveformTable,
35
+ )
36
+
37
+ LGDO = Union[Array, Scalar, Struct, VectorOfVectors]
38
+
39
+ log = logging.getLogger(__name__)
40
+
41
+ DEFAULT_HDF5_COMPRESSION = None
42
+
43
+
44
+ class LH5Store:
45
+ """
46
+ Class to represent a store of LEGEND HDF5 files. The two main methods
47
+ implemented by the class are :meth:`read_object` and :meth:`write_object`.
48
+
49
+ Examples
50
+ --------
51
+ >>> from lgdo import LH5Store
52
+ >>> store = LH5Store()
53
+ >>> obj, _ = store.read_object("/geds/waveform", "file.lh5")
54
+ >>> type(obj)
55
+ lgdo.waveform_table.WaveformTable
56
+ """
57
+
58
+ def __init__(self, base_path: str = "", keep_open: bool = False) -> None:
59
+ """
60
+ Parameters
61
+ ----------
62
+ base_path
63
+ directory path to prepend to LH5 files.
64
+ keep_open
65
+ whether to keep files open by storing the :mod:`h5py` objects as
66
+ class attributes.
67
+ """
68
+ self.base_path = "" if base_path == "" else expand_path(base_path)
69
+ self.keep_open = keep_open
70
+ self.files = {}
71
+
72
+ def gimme_file(self, lh5_file: str | h5py.File, mode: str = "r") -> h5py.File:
73
+ """Returns a :mod:`h5py` file object from the store or creates a new one.
74
+
75
+ Parameters
76
+ ----------
77
+ lh5_file
78
+ LH5 file name.
79
+ mode
80
+ mode in which to open file. See :class:`h5py.File` documentation.
81
+ """
82
+ if isinstance(lh5_file, h5py.File):
83
+ return lh5_file
84
+ if mode == "r":
85
+ lh5_file = expand_path(lh5_file, base_path=self.base_path)
86
+ if lh5_file in self.files.keys():
87
+ return self.files[lh5_file]
88
+ if self.base_path != "":
89
+ full_path = os.path.join(self.base_path, lh5_file)
90
+ else:
91
+ full_path = lh5_file
92
+ if mode != "r":
93
+ directory = os.path.dirname(full_path)
94
+ if directory != "" and not os.path.exists(directory):
95
+ log.debug(f"making path {directory}")
96
+ os.makedirs(directory)
97
+ if mode == "r" and not os.path.exists(full_path):
98
+ raise FileNotFoundError(f"file {full_path} not found")
99
+ if mode != "r" and os.path.exists(full_path):
100
+ log.debug(f"opening existing file {full_path} in mode '{mode}'")
101
+ h5f = h5py.File(full_path, mode)
102
+ if self.keep_open:
103
+ self.files[lh5_file] = h5f
104
+ return h5f
105
+
106
+ def gimme_group(
107
+ self,
108
+ group: str | h5py.Group,
109
+ base_group: h5py.Group,
110
+ grp_attrs: dict[str, Any] = None,
111
+ overwrite: bool = False,
112
+ ) -> h5py.Group:
113
+ """
114
+ Returns an existing :class:`h5py` group from a base group or creates a
115
+ new one. Can also set (or replace) group attributes.
116
+
117
+ Parameters
118
+ ----------
119
+ group
120
+ name of the HDF5 group.
121
+ base_group
122
+ HDF5 group to be used as a base.
123
+ grp_attrs
124
+ HDF5 group attributes.
125
+ overwrite
126
+ whether to overwrite group attributes, ignored if `grp_attrs` is
127
+ ``None``.
128
+ """
129
+ if not isinstance(group, h5py.Group):
130
+ if group in base_group:
131
+ group = base_group[group]
132
+ else:
133
+ group = base_group.create_group(group)
134
+ if grp_attrs is not None:
135
+ group.attrs.update(grp_attrs)
136
+ return group
137
+ if (
138
+ grp_attrs is not None
139
+ and len(set(grp_attrs.items()) ^ set(group.attrs.items())) > 0
140
+ ):
141
+ if not overwrite:
142
+ raise RuntimeError("grp_attrs != group.attrs but overwrite not set")
143
+ else:
144
+ log.debug(f"overwriting {group}.attrs...")
145
+ for key in group.attrs.keys():
146
+ group.attrs.pop(key)
147
+ group.attrs.update(grp_attrs)
148
+ return group
149
+
150
+ def get_buffer(
151
+ self,
152
+ name: str,
153
+ lh5_file: str | h5py.File | list[str | h5py.File],
154
+ size: int = None,
155
+ field_mask: dict[str, bool] | list[str] | tuple[str] = None,
156
+ ) -> LGDO:
157
+ """Returns an LH5 object appropriate for use as a pre-allocated buffer
158
+ in a read loop. Sets size to `size` if object has a size.
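+
+ A minimal sketch of the intended use (file and group names are
+ illustrative):
+
+ >>> store = LH5Store()
+ >>> buf = store.get_buffer("/geds/raw/energy", "file.lh5", size=1000)
+ >>> buf, n = store.read_object("/geds/raw/energy", "file.lh5",
+ ...                            n_rows=1000, obj_buf=buf)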
159
+ """
160
+ obj, n_rows = self.read_object(name, lh5_file, n_rows=0, field_mask=field_mask)
161
+ if hasattr(obj, "resize") and size is not None:
162
+ obj.resize(new_size=size)
163
+ return obj
164
+
165
+ def read_object(
166
+ self,
167
+ name: str,
168
+ lh5_file: str | h5py.File | list[str | h5py.File],
169
+ start_row: int = 0,
170
+ n_rows: int = sys.maxsize,
171
+ idx: np.ndarray | list | tuple | list[np.ndarray | list | tuple] = None,
172
+ field_mask: dict[str, bool] | list[str] | tuple[str] = None,
173
+ obj_buf: LGDO = None,
174
+ obj_buf_start: int = 0,
175
+ decompress: bool = True,
176
+ ) -> tuple[LGDO, int]:
177
+ """Read LH5 object data from a file.
178
+
179
+ Parameters
180
+ ----------
181
+ name
182
+ Name of the LH5 object to be read (including its group path).
183
+ lh5_file
184
+ The file(s) containing the object to be read out. If a list of
185
+ files, array-like object data will be concatenated into the output
186
+ object.
187
+ start_row
188
+ Starting entry for the object read (for array-like objects). For a
189
+ list of files, only applies to the first file.
190
+ n_rows
191
+ The maximum number of rows to read (for array-like objects). The
192
+ actual number of rows read will be returned as one of the return
193
+ values (see below).
194
+ idx
195
+ For NumPy-style "fancying indexing" for the read. Used to read out
196
+ rows that pass some selection criteria. Only selection along the first
197
+ axis is supported, so tuple arguments must be one-tuples. `idx` will
198
+ be truncated to its first `n_rows` entries before reading. To use
199
+ with a list of files, can pass in a list of `idx`'s (one for each
200
+ file) or use a long contiguous list (e.g. built from a previous
201
+ identical read). If used in conjunction with `start_row` and `n_rows`,
202
+ will be sliced to obey those constraints, where `n_rows` is
203
+ interpreted as the (max) number of *selected* values (in `idx`) to be
204
+ read out.
205
+ field_mask
206
+ For tables and structs, determines which fields get written out.
207
+ Only applies to immediate fields of the requested objects. If a dict
208
+ is used, a default dict will be made with the default set to the
209
+ opposite of the first element in the dict. This way if one specifies
210
+ a few fields at ``False``, all but those fields will be read out,
211
+ while if one specifies just a few fields as ``True``, only those
212
+ fields will be read out. If a list is provided, the listed fields
213
+ will be set to ``True``, while the rest will default to ``False``.
214
+ obj_buf
215
+ Read directly into memory provided in `obj_buf`. Note: the buffer
216
+ will be expanded to accommodate the data requested. To maintain the
217
+ buffer length, send in ``n_rows = len(obj_buf)``.
218
+ obj_buf_start
219
+ Start location in ``obj_buf`` for read. For concatenating data to
220
+ array-like objects.
221
+ decompress
222
+ Decompress data encoded with LGDO's compression routines right
223
+ after reading. The option has no effect on data encoded with HDF5
224
+ built-in filters, which is always decompressed upstream by HDF5.
225
+
226
+ Returns
227
+ -------
228
+ (object, n_rows_read)
229
+ `object` is the read-out object and `n_rows_read` is the number of rows
230
+ successfully read out. Essential for arrays when the amount of data
231
+ is smaller than the object buffer. For scalars and structs
232
+ `n_rows_read` will be ``1``. For tables it is redundant with
233
+ ``table.loc``.
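+
+ Examples
+ --------
+ A minimal sketch of a buffered read with a field mask (file, group and
+ field names are illustrative):
+
+ >>> store = LH5Store()
+ >>> buf = store.get_buffer("/geds/raw", "file.lh5", size=1000,
+ ...                        field_mask=["energy", "timestamp"])
+ >>> tot = store.read_n_rows("/geds/raw", "file.lh5")
+ >>> start = 0
+ >>> while start < tot:
+ ...     buf, n = store.read_object("/geds/raw", "file.lh5",
+ ...                                start_row=start, n_rows=1000,
+ ...                                field_mask=["energy", "timestamp"],
+ ...                                obj_buf=buf)
+ ...     start += n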
234
+ """
235
+ # Handle list-of-files recursively
236
+ if not isinstance(lh5_file, (str, h5py.File)):
237
+ lh5_file = list(lh5_file)
238
+ n_rows_read = 0
239
+ for i, h5f in enumerate(lh5_file):
240
+ if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]):
241
+ # a list of lists: must be one per file
242
+ idx_i = idx[i]
243
+ elif idx is not None:
244
+ # make idx a proper tuple if it's not one already
245
+ if not (isinstance(idx, tuple) and len(idx) == 1):
246
+ idx = (idx,)
247
+ # idx is a long continuous array
248
+ n_rows_i = self.read_n_rows(name, h5f)
249
+ # find the length of the subset of idx that contains indices
250
+ # that are less than n_rows_i
251
+ n_rows_to_read_i = bisect_left(idx[0], n_rows_i)
252
+ # now split idx into idx_i and the remainder
253
+ idx_i = (idx[0][:n_rows_to_read_i],)
254
+ idx = (idx[0][n_rows_to_read_i:] - n_rows_i,)
255
+ else:
256
+ idx_i = None
257
+ n_rows_i = n_rows - n_rows_read
258
+ obj_buf, n_rows_read_i = self.read_object(
259
+ name,
260
+ lh5_file[i],
261
+ start_row=start_row,
262
+ n_rows=n_rows_i,
263
+ idx=idx_i,
264
+ field_mask=field_mask,
265
+ obj_buf=obj_buf,
266
+ obj_buf_start=obj_buf_start,
267
+ decompress=decompress,
268
+ )
269
+ n_rows_read += n_rows_read_i
270
+ if n_rows_read >= n_rows or obj_buf is None:
271
+ return obj_buf, n_rows_read
272
+ start_row = 0
273
+ obj_buf_start += n_rows_read_i
274
+ return obj_buf, n_rows_read
275
+
276
+ # get the file from the store
277
+ h5f = self.gimme_file(lh5_file, "r")
278
+ if not h5f or name not in h5f:
279
+ raise KeyError(f"'{name}' not in {h5f.filename}")
280
+
281
+ log.debug(
282
+ f"reading {h5f.filename}:{name}[{start_row}:{n_rows}], decompress = {decompress}, "
283
+ + (f" with field mask {field_mask}" if field_mask else "")
284
+ )
285
+
286
+ # make idx a proper tuple if it's not one already
287
+ if not (isinstance(idx, tuple) and len(idx) == 1):
288
+ if idx is not None:
289
+ idx = (idx,)
290
+
291
+ # get the object's datatype
292
+ if "datatype" not in h5f[name].attrs:
293
+ raise RuntimeError(
294
+ f"'{name}' in file {lh5_file} is missing the datatype attribute"
295
+ )
296
+
297
+ datatype = h5f[name].attrs["datatype"]
298
+ datatype, shape, elements = parse_datatype(datatype)
299
+
300
+ # check field_mask and make it a default dict
301
+ if datatype == "struct" or datatype == "table":
302
+ if field_mask is None:
303
+ field_mask = defaultdict(lambda: True)
304
+ elif isinstance(field_mask, dict):
305
+ default = True
306
+ if len(field_mask) > 0:
307
+ default = not field_mask[list(field_mask.keys())[0]]
308
+ field_mask = defaultdict(lambda: default, field_mask)
309
+ elif isinstance(field_mask, (list, tuple)):
310
+ field_mask = defaultdict(
311
+ lambda: False, {field: True for field in field_mask}
312
+ )
313
+ elif not isinstance(field_mask, defaultdict):
314
+ raise RuntimeError("bad field_mask of type", type(field_mask).__name__)
315
+ elif field_mask is not None:
316
+ raise RuntimeError(f"datatype {datatype} does not accept a field_mask")
317
+
318
+ # Scalar
319
+ # scalars are dim-0 datasets
320
+ if datatype == "scalar":
321
+ value = h5f[name][()]
322
+ if elements == "bool":
323
+ value = np.bool_(value)
324
+ if obj_buf is not None:
325
+ obj_buf.value = value
326
+ obj_buf.attrs.update(h5f[name].attrs)
327
+ return obj_buf, 1
328
+ else:
329
+ return Scalar(value=value, attrs=h5f[name].attrs), 1
330
+
331
+ # Struct
332
+ # recursively build a struct, return as a dictionary
333
+ if datatype == "struct":
334
+ # ignore obj_buf.
335
+ # TODO: could append new fields or overwrite/concat to existing
336
+ # fields. If implemented, get_buffer() above should probably also
337
+ # (optionally?) prep buffers for each field
338
+ if obj_buf is not None:
339
+ raise NotImplementedError("obj_buf not implemented for LGOD Structs")
340
+
341
+ # loop over fields and read
342
+ obj_dict = {}
343
+ for field in elements:
344
+ if not field_mask[field]:
345
+ continue
346
+ # TODO: it's strange to pass start_row, n_rows, idx to struct
347
+ # fields. If they all had shared indexing, they should be in a
348
+ # table... Maybe should emit a warning? Or allow them to be
349
+ # dicts keyed by field name?
350
+ if "int_keys" in h5f[name].attrs:
351
+ if dict(h5f[name].attrs)["int_keys"]:
352
+ f = int(field)
353
+ else:
354
+ f = str(field)
355
+ obj_dict[f], _ = self.read_object(
356
+ name + "/" + field,
357
+ h5f,
358
+ start_row=start_row,
359
+ n_rows=n_rows,
360
+ idx=idx,
361
+ decompress=decompress,
362
+ )
363
+ # modify datatype in attrs if a field_mask was used
364
+ attrs = dict(h5f[name].attrs)
365
+ if field_mask is not None:
366
+ selected_fields = []
367
+ for field in elements:
368
+ if field_mask[field]:
369
+ selected_fields.append(field)
370
+ attrs["datatype"] = "struct" + "{" + ",".join(selected_fields) + "}"
371
+ return Struct(obj_dict=obj_dict, attrs=attrs), 1
372
+
373
+ # Below here is all array-like types. So trim idx if needed
374
+ if idx is not None:
375
+ # chop off indices < start_row
376
+ i_first_valid = bisect_left(idx[0], start_row)
377
+ idxa = idx[0][i_first_valid:]
378
+ # don't read out more than n_rows indices
379
+ idx = (idxa[:n_rows],) # works even if n_rows > len(idxa)
380
+
381
+ # Table or WaveformTable
382
+ if datatype == "table":
383
+ col_dict = {}
384
+
385
+ # read out each of the fields
386
+ rows_read = []
387
+ for field in elements:
388
+ if not field_mask[field]:
389
+ continue
390
+
391
+ fld_buf = None
392
+ if obj_buf is not None:
393
+ if not isinstance(obj_buf, Table) or field not in obj_buf:
394
+ raise ValueError(
395
+ f"obj_buf for LGDO Table '{name}' not formatted correctly"
396
+ )
397
+
398
+ else:
399
+ fld_buf = obj_buf[field]
400
+
401
+ col_dict[field], n_rows_read = self.read_object(
402
+ name + "/" + field,
403
+ h5f,
404
+ start_row=start_row,
405
+ n_rows=n_rows,
406
+ idx=idx,
407
+ obj_buf=fld_buf,
408
+ obj_buf_start=obj_buf_start,
409
+ decompress=decompress,
410
+ )
411
+ if obj_buf is not None and obj_buf_start + n_rows_read > len(obj_buf):
412
+ obj_buf.resize(obj_buf_start + n_rows_read)
413
+
414
+ rows_read.append(n_rows_read)
415
+
416
+ # warn if all columns don't read in the same number of rows
417
+ if len(rows_read) > 0:
418
+ n_rows_read = rows_read[0]
419
+ else:
420
+ n_rows_read = 0
421
+ log.warning(f"Table '{name}' has no subgroups accepted by field mask")
422
+
423
+ for n in rows_read[1:]:
424
+ if n != n_rows_read:
425
+ log.warning(
426
+ f"Table '{name}' got strange n_rows_read = {n}, {n_rows_read} was expected ({rows_read})"
427
+ )
428
+
429
+ # modify datatype in attrs if a field_mask was used
430
+ attrs = dict(h5f[name].attrs)
431
+ if field_mask is not None:
432
+ selected_fields = []
433
+ for field in elements:
434
+ if field_mask[field]:
435
+ selected_fields.append(field)
436
+ attrs["datatype"] = "table" + "{" + ",".join(selected_fields) + "}"
437
+
438
+ # fields have been read out, now return a table
439
+ if obj_buf is None:
440
+ # if col_dict contains just 3 objects called t0, dt, and values,
441
+ # return a WaveformTable
442
+ if (
443
+ len(col_dict) == 3
444
+ and "t0" in col_dict
445
+ and "dt" in col_dict
446
+ and "values" in col_dict
447
+ ):
448
+ table = WaveformTable(
449
+ t0=col_dict["t0"], dt=col_dict["dt"], values=col_dict["values"]
450
+ )
451
+ else:
452
+ table = Table(col_dict=col_dict, attrs=attrs)
453
+
454
+ # set (write) loc to end of tree
455
+ table.loc = n_rows_read
456
+ return table, n_rows_read
457
+ else:
458
+ # We have read all fields into the object buffer. Run
459
+ # checks: All columns should be the same size. So update
460
+ # table's size as necessary, warn if any mismatches are found
461
+ obj_buf.resize(do_warn=True)
462
+ # set (write) loc to end of tree
463
+ obj_buf.loc = obj_buf_start + n_rows_read
464
+ # check attributes
465
+ if set(obj_buf.attrs.keys()) != set(attrs.keys()):
466
+ raise RuntimeError(
467
+ f"attrs mismatch. obj_buf.attrs: "
468
+ f"{obj_buf.attrs}, h5f[{name}].attrs: {attrs}"
469
+ )
470
+ return obj_buf, n_rows_read
471
+
472
+ # ArrayOfEncodedEqualSizedArrays and VectorOfEncodedVectors
473
+ for cond, enc_lgdo in [
474
+ (
475
+ datatype == "array_of_encoded_equalsized_arrays",
476
+ ArrayOfEncodedEqualSizedArrays,
477
+ ),
478
+ (elements.startswith("encoded_array"), VectorOfEncodedVectors),
479
+ ]:
480
+ if cond:
481
+ if (
482
+ not decompress
483
+ and obj_buf is not None
484
+ and not isinstance(obj_buf, enc_lgdo)
485
+ ):
486
+ raise ValueError(f"obj_buf for '{name}' not a {enc_lgdo}")
487
+
488
+ # read out decoded_size, either a Scalar or an Array
489
+ decoded_size_buf = encoded_data_buf = None
490
+ if obj_buf is not None and not decompress:
491
+ decoded_size_buf = obj_buf.decoded_size
492
+ encoded_data_buf = obj_buf.encoded_data
493
+
494
+ decoded_size, _ = self.read_object(
495
+ f"{name}/decoded_size",
496
+ h5f,
497
+ start_row=start_row,
498
+ n_rows=n_rows,
499
+ idx=idx,
500
+ obj_buf=None if decompress else decoded_size_buf,
501
+ obj_buf_start=0 if decompress else obj_buf_start,
502
+ )
503
+
504
+ # read out encoded_data, a VectorOfVectors
505
+ encoded_data, n_rows_read = self.read_object(
506
+ f"{name}/encoded_data",
507
+ h5f,
508
+ start_row=start_row,
509
+ n_rows=n_rows,
510
+ idx=idx,
511
+ obj_buf=None if decompress else encoded_data_buf,
512
+ obj_buf_start=0 if decompress else obj_buf_start,
513
+ )
514
+
515
+ # return the still encoded data in the buffer object, if there
516
+ if obj_buf is not None and not decompress:
517
+ return obj_buf, n_rows_read
518
+
519
+ # otherwise re-create the encoded LGDO
520
+ rawdata = enc_lgdo(
521
+ encoded_data=encoded_data,
522
+ decoded_size=decoded_size,
523
+ attrs=h5f[name].attrs,
524
+ )
525
+
526
+ # already return if no decompression is requested
527
+ if not decompress:
528
+ return rawdata, n_rows_read
529
+
530
+ # if no buffer, decode and return
531
+ elif obj_buf is None and decompress:
532
+ return compress.decode(rawdata), n_rows_read
533
+
534
+ # use the (decoded object type) buffer otherwise
535
+ if enc_lgdo == VectorOfEncodedVectors and not isinstance(
536
+ obj_buf, VectorOfVectors
537
+ ):
538
+ raise ValueError(
539
+ f"obj_buf for decoded '{name}' not a VectorOfVectors"
540
+ )
541
+ elif enc_lgdo == ArrayOfEncodedEqualSizedArrays and not isinstance(
542
+ obj_buf, ArrayOfEqualSizedArrays
543
+ ):
544
+ raise ValueError(
545
+ f"obj_buf for decoded '{name}' not an ArrayOfEqualSizedArrays"
546
+ )
547
+
548
+ # FIXME: not a good idea. an in place decoding version
549
+ # of decode would be needed to avoid extra memory
550
+ # allocations
551
+ # FIXME: obj_buf_start??? Write a unit test
552
+ for i, wf in enumerate(compress.decode(rawdata)):
553
+ obj_buf[i] = wf
554
+
555
+ return obj_buf, n_rows_read
556
+
557
+ # VectorOfVectors
558
+ # read out vector of vectors of different size
559
+ if elements.startswith("array"):
560
+ if obj_buf is not None and not isinstance(obj_buf, VectorOfVectors):
561
+ raise ValueError(f"obj_buf for '{name}' not a LGDO VectorOfVectors")
562
+
563
+ # read out cumulative_length
564
+ cumulen_buf = None if obj_buf is None else obj_buf.cumulative_length
565
+ cumulative_length, n_rows_read = self.read_object(
566
+ f"{name}/cumulative_length",
567
+ h5f,
568
+ start_row=start_row,
569
+ n_rows=n_rows,
570
+ idx=idx,
571
+ obj_buf=cumulen_buf,
572
+ obj_buf_start=obj_buf_start,
573
+ )
574
+ # get a view of just what was read out for cleaner code below
575
+ this_cumulen_nda = cumulative_length.nda[
576
+ obj_buf_start : obj_buf_start + n_rows_read
577
+ ]
578
+
579
+ if idx is not None and n_rows_read > 0:
580
+ # get the starting indices for each array in flattened data:
581
+ # the starting index for array[i] is cumulative_length[i-1]
582
+ idx2 = (np.asarray(idx[0]).copy() - 1,)
583
+ # re-read cumulative_length with these indices
584
+ # note this will allocate memory for fd_starts!
585
+ fd_start = None
586
+ if idx2[0][0] == -1:
587
+ idx2 = (idx2[0][1:],)
588
+ fd_start = 0 # this variable avoids an ndarray append
589
+ fd_starts, fds_n_rows_read = self.read_object(
590
+ f"{name}/cumulative_length",
591
+ h5f,
592
+ start_row=start_row,
593
+ n_rows=n_rows,
594
+ idx=idx2,
595
+ )
596
+ fd_starts = fd_starts.nda # we just need the nda
597
+ if fd_start is None:
598
+ fd_start = fd_starts[0]
599
+
600
+ # compute the length that flattened_data will have after the
601
+ # fancy-indexed read
602
+ fd_n_rows = np.sum(this_cumulen_nda[-len(fd_starts) :] - fd_starts)
603
+ if fd_start == 0:
604
+ fd_n_rows += this_cumulen_nda[0]
605
+
606
+ # now make fd_idx
607
+ fd_idx = np.empty(fd_n_rows, dtype="uint32")
608
+ fd_idx = _make_fd_idx(fd_starts, this_cumulen_nda, fd_idx)
609
+
610
+ # Now clean up this_cumulen_nda, to be ready
611
+ # to match the in-memory version of flattened_data. Note: these
612
+ # operations on the view change the original array because they are
613
+ # numpy arrays, not lists.
614
+ this_cumulen_nda[-len(fd_starts) :] -= fd_starts
615
+ np.cumsum(this_cumulen_nda, out=this_cumulen_nda)
616
+
617
+ else:
618
+ fd_idx = None
619
+
620
+ # determine the start_row and n_rows for the flattened_data readout
621
+ fd_start = 0
622
+ if start_row > 0 and n_rows_read > 0:
623
+ # need to read out the cumulen sample -before- the first sample
624
+ # read above in order to get the starting row of the first
625
+ # vector to read out in flattened_data
626
+ fd_start = h5f[f"{name}/cumulative_length"][start_row - 1]
627
+
628
+ # check limits for values that will be used subsequently
629
+ if this_cumulen_nda[-1] < fd_start:
630
+ log.debug(
631
+ f"this_cumulen_nda[-1] = {this_cumulen_nda[-1]}, "
632
+ f"fd_start = {fd_start}, "
633
+ f"start_row = {start_row}, "
634
+ f"n_rows_read = {n_rows_read}"
635
+ )
636
+ raise RuntimeError(
637
+ f"cumulative_length non-increasing between entries "
638
+ f"{start_row} and {start_row+n_rows_read} ??"
639
+ )
640
+
641
+ # determine the number of rows for the flattened_data readout
642
+ fd_n_rows = this_cumulen_nda[-1] if n_rows_read > 0 else 0
643
+
644
+ # Now done with this_cumulen_nda, so we can clean it up to be ready
645
+ # to match the in-memory version of flattened_data. Note: these
646
+ # operations on the view change the original array because they are
647
+ # numpy arrays, not lists.
648
+ #
649
+ # First we need to subtract off the in-file offset for the start of
650
+ # read for flattened_data
651
+ this_cumulen_nda -= fd_start
652
+
653
+ # If we started with a partially-filled buffer, add the
654
+ # appropriate offset for the start of the in-memory flattened
655
+ # data for this read.
656
+ fd_buf_start = np.uint32(0)
657
+ if obj_buf_start > 0:
658
+ fd_buf_start = cumulative_length.nda[obj_buf_start - 1]
659
+ this_cumulen_nda += fd_buf_start
660
+
661
+ # Now prepare the object buffer if necessary
662
+ fd_buf = None
663
+ if obj_buf is not None:
664
+ fd_buf = obj_buf.flattened_data
665
+ # grow fd_buf if necessary to hold the data
666
+ fdb_size = fd_buf_start + fd_n_rows
667
+ if len(fd_buf) < fdb_size:
668
+ fd_buf.resize(fdb_size)
669
+
670
+ # now read
671
+ flattened_data, dummy_rows_read = self.read_object(
672
+ f"{name}/flattened_data",
673
+ h5f,
674
+ start_row=fd_start,
675
+ n_rows=fd_n_rows,
676
+ idx=fd_idx,
677
+ obj_buf=fd_buf,
678
+ obj_buf_start=fd_buf_start,
679
+ )
680
+ if obj_buf is not None:
681
+ return obj_buf, n_rows_read
682
+ return (
683
+ VectorOfVectors(
684
+ flattened_data=flattened_data,
685
+ cumulative_length=cumulative_length,
686
+ attrs=h5f[name].attrs,
687
+ ),
688
+ n_rows_read,
689
+ )
690
+
691
+ # Array
692
+ # FixedSizeArray
693
+ # ArrayOfEqualSizedArrays
694
+ # read out all arrays by slicing
695
+ if "array" in datatype:
696
+ if obj_buf is not None:
697
+ if not isinstance(obj_buf, Array):
698
+ raise ValueError(f"obj_buf for '{name}' not an LGDO Array")
699
+ obj_buf = None
700
+
701
+ # compute the number of rows to read
702
+ # we culled idx above for start_row and n_rows, now we have to apply
703
+ # the constraint of the length of the dataset
704
+ ds_n_rows = h5f[name].shape[0]
705
+ if idx is not None:
706
+ if len(idx[0]) > 0 and idx[0][-1] >= ds_n_rows:
707
+ log.warning(
708
+ "idx indexed past the end of the array in the file. Culling..."
709
+ )
710
+ n_rows_to_read = bisect_left(idx[0], ds_n_rows)
711
+ idx = (idx[0][:n_rows_to_read],)
712
+ if len(idx[0]) == 0:
713
+ log.warning("idx empty after culling.")
714
+ n_rows_to_read = len(idx[0])
715
+ else:
716
+ n_rows_to_read = ds_n_rows - start_row
717
+ if n_rows_to_read > n_rows:
718
+ n_rows_to_read = n_rows
719
+
720
+ # prepare the selection for the read. Use idx if available
721
+ if idx is not None:
722
+ source_sel = idx
723
+ else:
724
+ source_sel = np.s_[start_row : start_row + n_rows_to_read]
725
+
726
+ # Now read the array
727
+ if obj_buf is not None and n_rows_to_read > 0:
728
+ buf_size = obj_buf_start + n_rows_to_read
729
+ if len(obj_buf) < buf_size:
730
+ obj_buf.resize(buf_size)
731
+ dest_sel = np.s_[obj_buf_start:buf_size]
732
+ h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
733
+ nda = obj_buf.nda
734
+ else:
735
+ if n_rows == 0:
736
+ tmp_shape = (0,) + h5f[name].shape[1:]
737
+ nda = np.empty(tmp_shape, h5f[name].dtype)
738
+ else:
739
+ nda = h5f[name][source_sel]
740
+
741
+ # special handling for bools
742
+ # (c and Julia store as uint8 so cast to bool)
743
+ if elements == "bool":
744
+ nda = nda.astype(np.bool_)
745
+
746
+ # Finally, set attributes and return objects
747
+ attrs = h5f[name].attrs
748
+ if obj_buf is None:
749
+ if datatype == "array":
750
+ return Array(nda=nda, attrs=attrs), n_rows_to_read
751
+ if datatype == "fixedsize_array":
752
+ return FixedSizeArray(nda=nda, attrs=attrs), n_rows_to_read
753
+ if datatype == "array_of_equalsized_arrays":
754
+ return (
755
+ ArrayOfEqualSizedArrays(nda=nda, dims=shape, attrs=attrs),
756
+ n_rows_to_read,
757
+ )
758
+ else:
759
+ if set(obj_buf.attrs.keys()) != set(attrs.keys()):
760
+ raise RuntimeError(
761
+ f"attrs mismatch. "
762
+ f"obj_buf.attrs: {obj_buf.attrs}, "
763
+ f"h5f[{name}].attrs: {attrs}"
764
+ )
765
+ return obj_buf, n_rows_to_read
766
+
767
+ raise RuntimeError("don't know how to read datatype {datatype}")
768
+
769
+ def write_object(
770
+ self,
771
+ obj: LGDO,
772
+ name: str,
773
+ lh5_file: str | h5py.File,
774
+ group: str | h5py.Group = "/",
775
+ start_row: int = 0,
776
+ n_rows: int = None,
777
+ wo_mode: str = "append",
778
+ write_start: int = 0,
779
+ hdf5_compression: str | h5py.filters.FilterRefBase = DEFAULT_HDF5_COMPRESSION,
780
+ ) -> None:
781
+ """Write an LGDO into an LH5 file.
782
+
783
+ If the `obj` :class:`.LGDO` has a `compression` attribute, its value is
784
+ interpreted as the algorithm to be used to compress `obj` before
785
+ writing to disk. The type of `compression` can be:
786
+
787
+ string, kwargs dictionary, hdf5plugin filter
788
+ interpreted as the name of a built-in or custom `HDF5 compression
789
+ filter <https://docs.h5py.org/en/stable/high/dataset.html#filter-pipeline>`_
790
+ (``"gzip"``, ``"lzf"``, :mod:`hdf5plugin` filter object etc.) and
791
+ passed directly to :meth:`h5py.Group.create_dataset`.
792
+
793
+ :class:`.WaveformCodec` object
794
+ If `obj` is a :class:`.WaveformTable`, compress its `values` using
795
+ this algorithm. More documentation about the supported waveform
796
+ compression algorithms at :mod:`.lgdo.compression`.
797
+
798
+ Note
799
+ ----
800
+ The `compression` attribute takes precedence over the
801
+ `hdf5_compression` argument and is not written to disk.
802
+
803
+ Note
804
+ ----
805
+ HDF5 compression is skipped for the `encoded_data` dataset of
806
+ :class:`.VectorOfEncodedVectors` and
807
+ :class:`.ArrayOfEncodedEqualSizedArrays`.
808
+
809
+ Parameters
810
+ ----------
811
+ obj
812
+ LH5 object. If `obj` is array-like, writes `n_rows` starting from
813
+ `start_row` in `obj`.
814
+ name
815
+ name of the object in the output HDF5 file.
816
+ lh5_file
817
+ HDF5 file name or :class:`h5py.File` object.
818
+ group
819
+ HDF5 group name or :class:`h5py.Group` object in which `obj` should
820
+ be written.
821
+ start_row
822
+ first row in `obj` to be written.
823
+ n_rows
824
+ number of rows in `obj` to be written.
825
+ wo_mode
826
+ - ``write_safe`` or ``w``: only proceed with writing if the
827
+ object does not already exist in the file.
828
+ - ``append`` or ``a``: append along axis 0 (the first dimension)
829
+ of array-like objects and array-like subfields of structs.
830
+ :class:`~.lgdo.scalar.Scalar` objects get overwritten.
831
+ - ``overwrite`` or ``o``: replace data in the file if present,
832
+ starting from `write_start`. Note: overwriting with `write_start` =
833
+ end of array is the same as ``append``.
834
+ - ``overwrite_file`` or ``of``: delete file if present prior to
835
+ writing to it. `write_start` should be 0 (it is ignored).
836
+ - ``append_column`` or ``ac``: append columns from an :class:`~.lgdo.table.Table`
837
+ `obj` only if there is an existing :class:`~.lgdo.table.Table` in the `lh5_file` with
838
+ the same `name` and :class:`~.lgdo.table.Table.size`. If the sizes don't match,
839
+ or if there are matching fields, it errors out.
840
+ write_start
841
+ row in the output file (if already existing) to start overwriting
842
+ from.
843
+ hdf5_compression
844
+ HDF5 compression filter to be applied before writing non-scalar
845
+ datasets. **Ignored if compression is specified as an `obj`
846
+ attribute.**
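+
+ Examples
+ --------
+ A minimal sketch (file and group names are illustrative; the ``gzip``
+ filter is just one valid choice of HDF5 compression):
+
+ >>> import numpy as np
+ >>> from lgdo import Array
+ >>> store = LH5Store()
+ >>> arr = Array(np.arange(10))
+ >>> store.write_object(arr, "arr", "out.lh5", group="/data",
+ ...                    wo_mode="overwrite_file", hdf5_compression="gzip")
+ >>> store.write_object(arr, "arr", "out.lh5", group="/data",
+ ...                    wo_mode="append")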
847
+ """
848
+ log.debug(
849
+ f"writing {repr(obj)}[{start_row}:{n_rows}] as "
850
+ f"{lh5_file}:{group}/{name}[{write_start}:], "
851
+ f"mode = {wo_mode}, hdf5_compression = {hdf5_compression}"
852
+ )
853
+
854
+ if wo_mode == "write_safe":
855
+ wo_mode = "w"
856
+ if wo_mode == "append":
857
+ wo_mode = "a"
858
+ if wo_mode == "overwrite":
859
+ wo_mode = "o"
860
+ if wo_mode == "overwrite_file":
861
+ wo_mode = "of"
862
+ write_start = 0
863
+ if wo_mode == "append_column":
864
+ wo_mode = "ac"
865
+ if wo_mode not in ["w", "a", "o", "of", "ac"]:
866
+ raise ValueError(f"unknown wo_mode '{wo_mode}'")
867
+
868
+ # "mode" is for the h5df.File and wo_mode is for this function
869
+ # In hdf5, 'a' is really "modify" -- in addition to appending, you can
870
+ # change any object in the file. So we use file:append for
871
+ # write_object:overwrite.
872
+ mode = "w" if wo_mode == "of" else "a"
873
+ lh5_file = self.gimme_file(lh5_file, mode=mode)
874
+ group = self.gimme_group(group, lh5_file)
875
+ if wo_mode == "w" and name in group:
876
+ raise RuntimeError(f"can't overwrite '{name}' in wo_mode 'write_safe'")
877
+
878
+ # struct or table or waveform table
879
+ if isinstance(obj, Struct):
880
+ # In order to append a column, we need to update the `table{old_fields}` value in `group.attrs["datatype"]` to include the new fields.
881
+ # One way to do this is to override `obj.attrs["datatype"]` to include old and new fields. Then we can write the fields to the table as normal.
882
+ if wo_mode == "ac":
883
+ old_group = self.gimme_group(name, group)
884
+ datatype, shape, fields = parse_datatype(old_group.attrs["datatype"])
885
+ if datatype not in ["table", "struct"]:
886
+ raise RuntimeError(
887
+ f"Trying to append columns to an object of type {datatype}"
888
+ )
889
+
890
+ # If the mode is `append_column`, make sure we aren't appending a table that has a column of the same name as in the existing table
891
+ # Also make sure that the field we are adding has the same size
892
+ if len(list(set(fields).intersection(set(obj.keys())))) != 0:
893
+ raise ValueError(
894
+ f"Can't append {list(set(fields).intersection(set(obj.keys())))} column(s) to a table with the same field(s)"
895
+ )
896
+ # It doesn't matter what key we access, as all fields in the old table have the same size
897
+ if old_group[list(old_group.keys())[0]].size != obj.size:
898
+ raise ValueError(
899
+ f"Table sizes don't match. Trying to append column of size {obj.size} to a table of size {old_group[list(old_group.keys())[0]].size}."
900
+ )
901
+
902
+ # Now we can append the obj.keys() to the old fields, and then update obj.attrs.
903
+ fields.extend(list(obj.keys()))
904
+ obj.attrs.pop("datatype")
905
+ obj.attrs["datatype"] = "table" + "{" + ",".join(fields) + "}"
906
+
907
+ group = self.gimme_group(
908
+ name,
909
+ group,
910
+ grp_attrs=obj.attrs,
911
+ overwrite=(wo_mode in ["o", "ac"]),
912
+ )
913
+ # If the mode is overwrite, then we need to peek into the file's table's existing fields
914
+ # If we are writing a new table to the group that does not contain an old field, we should delete that old field from the file
915
+ if wo_mode == "o":
916
+ # Find the old keys in the group that are not present in the new table's keys, then delete them
917
+ for key in list(set(group.keys()) - set(obj.keys())):
918
+ log.debug(f"{key} is not present in new table, deleting field")
919
+ del group[key]
920
+
921
+ for field in obj.keys():
922
+ # eventually compress waveform table values with LGDO's
923
+ # custom codecs before writing
924
+ # if waveformtable.values.attrs["compression"] is a string,
925
+ # interpret it as an HDF5 built-in filter
926
+ obj_fld = None
927
+ if (
928
+ isinstance(obj, WaveformTable)
929
+ and field == "values"
930
+ and not isinstance(obj.values, VectorOfEncodedVectors)
931
+ and not isinstance(obj.values, ArrayOfEncodedEqualSizedArrays)
932
+ and "compression" in obj.values.attrs
933
+ and isinstance(obj.values.attrs["compression"], WaveformCodec)
934
+ ):
935
+ codec = obj.values.attrs["compression"]
936
+ obj_fld = compress.encode(obj.values, codec=codec)
937
+ else:
938
+ obj_fld = obj[field]
939
+
940
+ # Convert keys to string for dataset names
941
+ f = str(field)
942
+ self.write_object(
943
+ obj_fld,
944
+ f,
945
+ lh5_file,
946
+ group=group,
947
+ start_row=start_row,
948
+ n_rows=n_rows,
949
+ wo_mode=wo_mode,
950
+ write_start=write_start,
951
+ hdf5_compression=hdf5_compression,
952
+ )
953
+ return
954
+
955
+ # scalars
956
+ elif isinstance(obj, Scalar):
957
+ if name in group:
958
+ if wo_mode in ["o", "a"]:
959
+ log.debug(f"overwriting {name} in {group}")
960
+ del group[name]
961
+ else:
962
+ raise RuntimeError(
963
+ f"tried to overwrite {name} in {group} for wo_mode {wo_mode}"
964
+ )
965
+ ds = group.create_dataset(name, shape=(), data=obj.value)
966
+ ds.attrs.update(obj.attrs)
967
+ return
968
+
969
+ # vector of encoded vectors
970
+ elif isinstance(obj, (VectorOfEncodedVectors, ArrayOfEncodedEqualSizedArrays)):
971
+ group = self.gimme_group(
972
+ name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
973
+ )
974
+
975
+ self.write_object(
976
+ obj.encoded_data,
977
+ "encoded_data",
978
+ lh5_file,
979
+ group=group,
980
+ start_row=start_row,
981
+ n_rows=n_rows,
982
+ wo_mode=wo_mode,
983
+ write_start=write_start,
984
+ hdf5_compression=None, # data is already compressed!
985
+ )
986
+
987
+ self.write_object(
988
+ obj.decoded_size,
989
+ "decoded_size",
990
+ lh5_file,
991
+ group=group,
992
+ start_row=start_row,
993
+ n_rows=n_rows,
994
+ wo_mode=wo_mode,
995
+ write_start=write_start,
996
+ hdf5_compression=hdf5_compression,
997
+ )
998
+
999
+ # vector of vectors
1000
+ elif isinstance(obj, VectorOfVectors):
1001
+ group = self.gimme_group(
1002
+ name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
1003
+ )
1004
+ if (
1005
+ n_rows is None
1006
+ or n_rows > obj.cumulative_length.nda.shape[0] - start_row
1007
+ ):
1008
+ n_rows = obj.cumulative_length.nda.shape[0] - start_row
1009
+
1010
+ # if appending we need to add an appropriate offset to the
1011
+ # cumulative lengths as appropriate for the in-file object
1012
+ offset = 0 # declare here because we have to subtract it off at the end
1013
+ if (wo_mode == "a" or wo_mode == "o") and "cumulative_length" in group:
1014
+ len_cl = len(group["cumulative_length"])
1015
+ if wo_mode == "a":
1016
+ write_start = len_cl
1017
+ if len_cl > 0:
1018
+ offset = group["cumulative_length"][write_start - 1]
1019
+
1020
+ # First write flattened_data array. Only write rows with data.
1021
+ fd_start = 0 if start_row == 0 else obj.cumulative_length.nda[start_row - 1]
1022
+ fd_n_rows = obj.cumulative_length.nda[start_row + n_rows - 1] - fd_start
1023
+ self.write_object(
1024
+ obj.flattened_data,
1025
+ "flattened_data",
1026
+ lh5_file,
1027
+ group=group,
1028
+ start_row=fd_start,
1029
+ n_rows=fd_n_rows,
1030
+ wo_mode=wo_mode,
1031
+ write_start=offset,
1032
+ hdf5_compression=hdf5_compression,
1033
+ )
1034
+
1035
+ # now offset is used to give appropriate in-file values for
1036
+ # cumulative_length. Need to adjust it for start_row
1037
+ if start_row > 0:
1038
+ offset -= obj.cumulative_length.nda[start_row - 1]
1039
+
1040
+ # Add offset to obj.cumulative_length itself to avoid memory allocation.
1041
+ # Then subtract it off after writing! (otherwise it will be changed
1042
+ # upon return)
1043
+ cl_dtype = obj.cumulative_length.nda.dtype.type
1044
+ obj.cumulative_length.nda += cl_dtype(offset)
1045
+
1046
+ self.write_object(
1047
+ obj.cumulative_length,
1048
+ "cumulative_length",
1049
+ lh5_file,
1050
+ group=group,
1051
+ start_row=start_row,
1052
+ n_rows=n_rows,
1053
+ wo_mode=wo_mode,
1054
+ write_start=write_start,
1055
+ hdf5_compression=hdf5_compression,
1056
+ )
1057
+ obj.cumulative_length.nda -= cl_dtype(offset)
1058
+
1059
+ return
1060
+
1061
+ # if we get this far, must be one of the Array types
1062
+ elif isinstance(obj, Array):
1063
+ if n_rows is None or n_rows > obj.nda.shape[0] - start_row:
1064
+ n_rows = obj.nda.shape[0] - start_row
1065
+
1066
+ nda = obj.nda[start_row : start_row + n_rows]
1067
+
1068
+ # hack to store bools as uint8 for c / Julia compliance
1069
+ if nda.dtype.name == "bool":
1070
+ nda = nda.astype(np.uint8)
1071
+
1072
+ # need to create dataset from ndarray the first time for speed
1073
+ # creating an empty dataset and appending to that is super slow!
1074
+ if (wo_mode != "a" and write_start == 0) or name not in group:
1075
+ maxshape = (None,) + nda.shape[1:]
1076
+ if wo_mode == "o" and name in group:
1077
+ log.debug(f"overwriting {name} in {group}")
1078
+ del group[name]
1079
+
1080
+ # create HDF5 dataset
1081
+ # - compress using the 'compression' LGDO attribute, if
1082
+ # available
1083
+ # - otherwise use "hdf5_compression"
1084
+ # - attach HDF5 dataset attributes, but not "compression"!
1085
+ comp_algo = obj.attrs.get("compression", hdf5_compression)
1086
+ comp_kwargs = {}
1087
+ if isinstance(comp_algo, str):
1088
+ comp_kwargs = {"compression": comp_algo}
1089
+ elif comp_algo is not None:
1090
+ comp_kwargs = comp_algo
1091
+
1092
+ ds = group.create_dataset(
1093
+ name, data=nda, maxshape=maxshape, **comp_kwargs
1094
+ )
1095
+
1096
+ _attrs = obj.getattrs(datatype=True)
1097
+ _attrs.pop("compression", None)
1098
+ ds.attrs.update(_attrs)
1099
+ return
1100
+
1101
+ # Now append or overwrite
1102
+ ds = group[name]
1103
+ if not isinstance(ds, h5py.Dataset):
1104
+ raise RuntimeError(
1105
+ f"existing HDF5 object '{name}' in group '{group}'"
1106
+ " is not a dataset! Cannot overwrite or append"
1107
+ )
1108
+
1109
+ old_len = ds.shape[0]
1110
+ if wo_mode == "a":
1111
+ write_start = old_len
1112
+ add_len = write_start + nda.shape[0] - old_len
1113
+ ds.resize(old_len + add_len, axis=0)
1114
+ ds[write_start:] = nda
1115
+ return
1116
+
1117
+ else:
1118
+ raise RuntimeError(
1119
+ f"do not know how to write '{name}' of type '{type(obj).__name__}'"
1120
+ )
1121
+
1122
+ def read_n_rows(self, name: str, lh5_file: str | h5py.File) -> int | None:
1123
+ """Look up the number of rows in an Array-like object called `name` in
1124
+ `lh5_file`.
1125
+
1126
+ Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`."""
1127
+ # this is basically a stripped down version of read_object
1128
+ h5f = self.gimme_file(lh5_file, "r")
1129
+ if not h5f or name not in h5f:
1130
+ raise KeyError(f"'{name}' not in {lh5_file}")
1131
+
1132
+ # get the datatype
1133
+ if "datatype" not in h5f[name].attrs:
1134
+ raise RuntimeError(
1135
+ f"'{name}' in file {lh5_file} is missing the datatype attribute"
1136
+ )
1137
+
1138
+ datatype = h5f[name].attrs["datatype"]
1139
+ datatype, shape, elements = parse_datatype(datatype)
1140
+
1141
+ # scalars are dim-0 datasets
1142
+ if datatype == "scalar":
1143
+ return None
1144
+
1145
+ # structs don't have rows
1146
+ if datatype == "struct":
1147
+ return None
1148
+
1149
+ # tables should have elements with all the same length
1150
+ if datatype == "table":
1151
+ # read out each of the fields
1152
+ rows_read = None
1153
+ for field in elements:
1154
+ n_rows_read = self.read_n_rows(name + "/" + field, h5f)
1155
+ if not rows_read:
1156
+ rows_read = n_rows_read
1157
+ elif rows_read != n_rows_read:
1158
+ log.warning(
1159
+ f"'{field}' field in table '{name}' has {rows_read} rows, "
1160
+ f"{n_rows_read} was expected"
1161
+ )
1162
+ return rows_read
1163
+
1164
+ # length of vector of vectors is the length of its cumulative_length
1165
+ if elements.startswith("array"):
1166
+ return self.read_n_rows(f"{name}/cumulative_length", h5f)
1167
+
1168
+ # length of vector of encoded vectors is the length of its decoded_size
1169
+ if (
1170
+ elements.startswith("encoded_array")
1171
+ or datatype == "array_of_encoded_equalsized_arrays"
1172
+ ):
1173
+ return self.read_n_rows(f"{name}/encoded_data", h5f)
1174
+
1175
+ # return array length (without reading the array!)
1176
+ if "array" in datatype:
1177
+ # compute the number of rows to read
1178
+ return h5f[name].shape[0]
1179
+
1180
+ raise RuntimeError(f"don't know how to read datatype '{datatype}'")
1181
+
1182
+
1183
+ def ls(lh5_file: str | h5py.Group, lh5_group: str = "") -> list[str]:
1184
+ """Return a list of LH5 groups in the input file and group, similar
1185
+ to ``ls`` or ``h5ls``. Supports wildcards in group names.
1186
+
1188
+ Parameters
1189
+ ----------
1190
+ lh5_file
1191
+ name of file.
1192
+ lh5_group
1193
+ group to search. Add a ``/`` to the end of the group name if you want to
1194
+ list all objects inside that group.
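+
+ Examples
+ --------
+ A sketch, assuming a file ``file.lh5`` containing a ``geds/raw`` group
+ (names are illustrative):
+
+ >>> from lgdo import ls
+ >>> ls("file.lh5")
+ ['geds']
+ >>> ls("file.lh5", "geds/")
+ ['geds/raw']
+ >>> ls("file.lh5", "geds/raw/e*")
+ ['geds/raw/energy']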
1195
+ """
1196
+
1197
+ log.debug(
1198
+ f"Listing objects in '{lh5_file}'"
1199
+ + ("" if lh5_group == "" else f" (and group {lh5_group})")
1200
+ )
1201
+
1202
+ lh5_st = LH5Store()
1203
+ # To use recursively, make lh5_file a h5group instead of a string
1204
+ if isinstance(lh5_file, str):
1205
+ lh5_file = lh5_st.gimme_file(lh5_file, "r")
1206
+ if lh5_group.startswith("/"):
1207
+ lh5_group = lh5_group[1:]
1208
+
1209
+ if lh5_group == "":
1210
+ lh5_group = "*"
1211
+
1212
+ splitpath = lh5_group.split("/", 1)
1213
+ matchingkeys = fnmatch.filter(lh5_file.keys(), splitpath[0])
1214
+
1215
+ if len(splitpath) == 1:
1216
+ return matchingkeys
1217
+ else:
1218
+ ret = []
1219
+ for key in matchingkeys:
1220
+ ret.extend([f"{key}/{path}" for path in ls(lh5_file[key], splitpath[1])])
1221
+ return ret
1222
+
1223
+
1224
+ def show(
1225
+ lh5_file: str | h5py.Group,
1226
+ lh5_group: str = "/",
1227
+ attrs: bool = False,
1228
+ indent: str = "",
1229
+ header: bool = True,
1230
+ ) -> None:
1231
+ """Print a tree of LH5 file contents with LGDO datatype.
1232
+
1233
+ Parameters
1234
+ ----------
1235
+ lh5_file
1236
+ the LH5 file.
1237
+ lh5_group
1238
+ print only contents of this HDF5 group.
1239
+ attrs
1240
+ print the HDF5 attributes too.
1241
+ indent
1242
+ indent the diagram with this string.
1243
+ header
1244
+ print `lh5_group` at the top of the diagram.
1245
+
1246
+ Examples
1247
+ --------
1248
+ >>> from lgdo import show
1249
+ >>> show("file.lh5", "/geds/raw")
1250
+ /geds/raw
1251
+ ├── channel · array<1>{real}
1252
+ ├── energy · array<1>{real}
1253
+ ├── timestamp · array<1>{real}
1254
+ ├── waveform · table{t0,dt,values}
1255
+ │ ├── dt · array<1>{real}
1256
+ │ ├── t0 · array<1>{real}
1257
+ │ └── values · array_of_equalsized_arrays<1,1>{real}
1258
+ └── wf_std · array<1>{real}
1259
+ """
1260
+ # open file
1261
+ if isinstance(lh5_file, str):
1262
+ lh5_file = h5py.File(expand_path(lh5_file), "r")
1263
+
1264
+ # go to group
1265
+ if lh5_group != "/":
1266
+ lh5_file = lh5_file[lh5_group]
1267
+
1268
+ if header:
1269
+ print(f"\033[1m{lh5_group}\033[0m") # noqa: T201
1270
+
1271
+ # get an iterator over the keys in the group
1272
+ it = iter(lh5_file)
1273
+ key = None
1274
+
1275
+ # make sure there is actually something in this file/group
1276
+ try:
1277
+ key = next(it) # get first key
1278
+ except StopIteration:
1279
+ print(f"{indent}└── empty") # noqa: T201
1280
+ return
1281
+
1282
+ # loop over keys
1283
+ while True:
1284
+ val = lh5_file[key]
1285
+ # we want to print the LGDO datatype
1286
+ dtype = val.attrs.get("datatype", default="no datatype")
1287
+ if dtype == "no datatype" and isinstance(val, h5py.Group):
1288
+ dtype = "HDF5 group"
1289
+
1290
+ attrs_d = dict(val.attrs)
1291
+ attrs_d.pop("datatype", "")
1292
+ attrs = "── " + str(attrs_d) if attrs_d else ""
1293
+
1294
+ # is this the last key?
1295
+ killme = False
1296
+ try:
1297
+ k_new = next(it) # get next key
1298
+ except StopIteration:
1299
+ char = "└──"
1300
+ killme = True # we'll have to kill this loop later
1301
+ else:
1302
+ char = "├──"
1303
+
1304
+ print(f"{indent}{char} \033[1m{key}\033[0m · {dtype} {attrs}") # noqa: T201
1305
+
1306
+ # if it's a group, call this function recursively
1307
+ if isinstance(val, h5py.Group):
1308
+ show(val, indent=indent + (" " if killme else "│ "), header=False)
1309
+
1310
+ # break or move to next key
1311
+ if killme:
1312
+ break
1313
+ else:
1314
+ key = k_new
1315
+
1316
+
1317
+ def load_nda(
1318
+ f_list: str | list[str],
1319
+ par_list: list[str],
1320
+ lh5_group: str = "",
1321
+ idx_list: list[np.ndarray | list | tuple] = None,
1322
+ ) -> dict[str, np.ndarray]:
1323
+ r"""Build a dictionary of :class:`numpy.ndarray`\ s from LH5 data.
1324
+
1325
+ Given a list of files, a list of LH5 table parameters, and an optional
1326
+ group path, return a NumPy array with all values for each parameter.
1327
+
1328
+ Parameters
1329
+ ----------
1330
+ f_list
1331
+ A list of files. Can contain wildcards.
1332
+ par_list
1333
+ A list of parameters to read from each file.
1334
+ lh5_group
1335
+ group path within which to find the specified parameters.
1336
+ idx_list
1337
+ for fancy-indexed reads. Must be one index array for each file in
1338
+ `f_list`.
1339
+
1340
+ Returns
1341
+ -------
1342
+ par_data
1343
+ A dictionary of the parameter data keyed by the elements of `par_list`.
1344
+ Each entry contains the data for the specified parameter concatenated
1345
+ over all files in `f_list`.
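+
+ Examples
+ --------
+ A sketch (file names, group and parameters are illustrative):
+
+ >>> data = load_nda(["f1.lh5", "f2.lh5"], ["energy", "timestamp"],
+ ...                 lh5_group="geds/raw")
+ >>> data["energy"]  # one concatenated ndarray per parameter
+ array([...])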
1346
+ """
1347
+ if isinstance(f_list, str):
1348
+ f_list = [f_list]
1349
+ if idx_list is not None:
1350
+ idx_list = [idx_list]
1351
+ if idx_list is not None and len(f_list) != len(idx_list):
1352
+ raise ValueError(
1353
+ f"f_list length ({len(f_list)}) != idx_list length ({len(idx_list)})!"
1354
+ )
1355
+
1356
+ # Expand wildcards
1357
+ f_list = [f for f_wc in f_list for f in sorted(glob.glob(os.path.expandvars(f_wc)))]
1358
+
1359
+ sto = LH5Store()
1360
+ par_data = {par: [] for par in par_list}
1361
+ for ii, f in enumerate(f_list):
1362
+ f = sto.gimme_file(f, "r")
1363
+ for par in par_list:
1364
+ if f"{lh5_group}/{par}" not in f:
1365
+ raise RuntimeError(f"'{lh5_group}/{par}' not in file {f_list[ii]}")
1366
+
1367
+ if idx_list is None:
1368
+ data, _ = sto.read_object(f"{lh5_group}/{par}", f)
1369
+ else:
1370
+ data, _ = sto.read_object(f"{lh5_group}/{par}", f, idx=idx_list[ii])
1371
+ if not data:
1372
+ continue
1373
+ par_data[par].append(data.nda)
1374
+ par_data = {par: np.concatenate(par_data[par]) for par in par_list}
1375
+ return par_data
1376
+
1377
+
1378
+ def load_dfs(
1379
+ f_list: str | list[str],
1380
+ par_list: list[str],
1381
+ lh5_group: str = "",
1382
+ idx_list: list[np.ndarray | list | tuple] = None,
1383
+ ) -> pd.DataFrame:
1384
+ """Build a :class:`pandas.DataFrame` from LH5 data.
1385
+
1386
+ Given a list of files (can use wildcards), a list of LH5 columns, and
1387
+ optionally the group path, return a :class:`pandas.DataFrame` with all
1388
+ values for each parameter.
1389
+
1390
+ See Also
1391
+ --------
1392
+ :func:`load_nda`
1393
+
1394
+ Returns
1395
+ -------
1396
+ dataframe
1397
+ contains columns for each parameter in `par_list`, and rows containing
1398
+ all data for the associated parameters concatenated over all files in
1399
+ `f_list`.
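+
+ Examples
+ --------
+ A sketch (names are illustrative):
+
+ >>> df = load_dfs("f*.lh5", ["energy", "timestamp"], lh5_group="geds/raw")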
1400
+ """
1401
+ return pd.DataFrame(
1402
+ load_nda(f_list, par_list, lh5_group=lh5_group, idx_list=idx_list)
1403
+ )
1404
+
1405
+
1406
+ class LH5Iterator(Iterator):
1407
+ """
1408
+ A class for iterating through one or more LH5 files, one block of entries
1409
+ at a time. This also accepts an entry list/mask to enable event selection,
1410
+ and a field mask.
1411
+
1412
+ This class can be used either for random access:
1413
+
1414
+ >>> lh5_obj, n_rows = lh5_it.read(entry)
1415
+
1416
+ to read the block of entries starting at entry. In case of multiple files
1417
+ or the use of an event selection, entry refers to a global event index
1418
+ across files and does not count events that are excluded by the selection.
1419
+
1420
+ This can also be used as an iterator:
1421
+
1422
+ >>> for lh5_obj, entry, n_rows in LH5Iterator(...):
1423
+ ...     # do the thing!
1424
+
1425
+ This is intended for when you are reading a large quantity of data but
1426
+ want to limit your memory usage (particularly when reading in waveforms!).
1427
+ The ``lh5_obj`` that is read by this class is reused in order to avoid
1428
+ reallocation of memory; this means that if you want to hold on to data
1429
+ between reads, you will have to copy it somewhere!
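+
+ A minimal construction sketch (file and group names are illustrative):
+
+ >>> lh5_it = LH5Iterator("data/file*.lh5", "/geds/raw",
+ ...                      field_mask=["energy"], buffer_len=1024)
+ >>> for tbl, entry, n_rows in lh5_it:
+ ...     energy = tbl["energy"].nda[:n_rows].copy()  # copy: buffer is reused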
1430
+ """
1431
+
1432
+ def __init__(
1433
+ self,
1434
+ lh5_files: str | list[str],
1435
+ groups: str | list[str],
1436
+ base_path: str = "",
1437
+ entry_list: list[int] | list[list[int]] = None,
1438
+ entry_mask: list[bool] | list[list[bool]] = None,
1439
+ field_mask: dict[str, bool] | list[str] | tuple[str] = None,
1440
+ buffer_len: int = 3200,
1441
+ friend: LH5Iterator = None,
1442
+ ) -> None:
1443
+ """
1444
+ Parameters
1445
+ ----------
1446
+ lh5_files
1447
+ file or files to read from. May include wildcards and environment
1448
+ variables.
1449
+ groups
1450
+ HDF5 group(s) to read. If a list is provided for both lh5_files
1451
+ and group, they must be the same size. If a file is wild-carded,
1452
+ the same group will be assigned to each file found.
1453
+ entry_list
1454
+ list of entry numbers to read. If a nested list is provided,
1455
+ expect one top-level list for each file, containing a list of
1456
+ local entries. If a list of ints is provided, use global entries.
1457
+ entry_mask
1458
+ mask of entries to read. If a list of arrays is provided, expect
1459
+ one for each file. Ignored if `entry_list` is provided.
1460
+ field_mask
1461
+ mask of which fields to read. See :meth:`LH5Store.read_object` for
1462
+ more details.
1463
+ buffer_len
1464
+ number of entries to read at a time while iterating through files.
1465
+ friend
1466
+ a "friend" LH5Iterator that will be read in parallel with this one.
1467
+ The friend should have the same length and entry list. A single
1468
+ LH5 table containing columns from both iterators will be returned.
1469
+ """
1470
+ self.lh5_st = LH5Store(base_path=base_path, keep_open=True)
1471
+
1472
+ # List of files, with wildcards and env vars expanded
1473
+ if isinstance(lh5_files, str):
1474
+ lh5_files = [lh5_files]
1475
+ if isinstance(groups, list):
1476
+ lh5_files *= len(groups)
1477
+ elif not isinstance(lh5_files, list):
1478
+ raise ValueError("lh5_files must be a string or list of strings")
1479
+
1480
+ if isinstance(groups, str):
1481
+ groups = [groups] * len(lh5_files)
1482
+ elif not isinstance(groups, list):
1483
+ raise ValueError("group must be a string or list of strings")
1484
+
1485
+ if not len(groups) == len(lh5_files):
1486
+ raise ValueError("lh5_files and groups must have same length")
1487
+
1488
+ self.lh5_files = []
1489
+ self.groups = []
1490
+ for f, g in zip(lh5_files, groups):
1491
+ f_exp = expand_path(f, list=True, base_path=base_path)
1492
+ self.lh5_files += f_exp
1493
+ self.groups += [g] * len(f_exp)
1494
+
1495
+ if entry_list is not None and entry_mask is not None:
1496
+ raise ValueError(
1497
+ "entry_list and entry_mask arguments are mutually exclusive"
1498
+ )
1499
+
1500
+ # Map to last row in each file
1501
+ self.file_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
1502
+ # Map to last iterator entry for each file
1503
+ self.entry_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
1504
+ self.buffer_len = buffer_len
1505
+
1506
+ if len(self.lh5_files) > 0:
1507
+ f = self.lh5_files[0]
1508
+ g = self.groups[0]
1509
+ self.lh5_buffer = self.lh5_st.get_buffer(
1510
+ g,
1511
+ f,
1512
+ size=self.buffer_len,
1513
+ field_mask=field_mask,
1514
+ )
1515
+ self.file_map[0] = self.lh5_st.read_n_rows(g, f)
1516
+ else:
1517
+ raise RuntimeError(f"can't open any files from {lh5_files}")
1518
+
1519
+ self.n_rows = 0
1520
+ self.current_entry = 0
1521
+ self.next_entry = 0
1522
+
1523
+ self.field_mask = field_mask
1524
+
1525
+ # List of entry indices from each file
1526
+ self.local_entry_list = None
1527
+ self.global_entry_list = None
1528
+ if entry_list is not None:
1529
+ entry_list = list(entry_list)
1530
+ if isinstance(entry_list[0], int):
1531
+ self.local_entry_list = [None] * len(self.file_map)
1532
+ self.global_entry_list = np.array(entry_list, "i")
1533
+ self.global_entry_list.sort()
1534
+
1535
+ else:
1536
+ self.local_entry_list = [[]] * len(self.file_map)
1537
+ for i_file, local_list in enumerate(entry_list):
1538
+ self.local_entry_list[i_file] = np.array(local_list, "i")
1539
+ self.local_entry_list[i_file].sort()
1540
+
1541
+ elif entry_mask is not None:
1542
+ # Convert entry mask into an entry list
1543
+ if isinstance(entry_mask, pd.Series):
1544
+ entry_mask = entry_mask.values
1545
+ if isinstance(entry_mask, np.ndarray):
1546
+ self.local_entry_list = [None] * len(self.file_map)
1547
+ self.global_entry_list = np.nonzero(entry_mask)[0]
1548
+ else:
1549
+ self.local_entry_list = [[]] * len(self.file_map)
1550
+ for i_file, local_mask in enumerate(entry_mask):
1551
+ self.local_entry_list[i_file] = np.nonzero(local_mask)[0]
1552
+
1553
+ # Attach the friend
1554
+ if friend is not None:
1555
+ if not isinstance(friend, LH5Iterator):
1556
+ raise ValueError("Friend must be an LH5Iterator")
1557
+ self.lh5_buffer.join(friend.lh5_buffer)
1558
+ self.friend = friend
1559
+
1560
+ def _get_file_cumlen(self, i_file: int) -> int:
1561
+ """Helper to get cumulative file length of file"""
1562
+ if i_file < 0:
1563
+ return 0
1564
+ fcl = self.file_map[i_file]
1565
+ if fcl == np.iinfo("i").max:
1566
+ fcl = self._get_file_cumlen(i_file - 1) + self.lh5_st.read_n_rows(
1567
+ self.groups[i_file], self.lh5_files[i_file]
1568
+ )
1569
+ self.file_map[i_file] = fcl
1570
+ return fcl
1571
+
1572
+ def _get_file_cumentries(self, i_file: int) -> int:
1573
+ """Helper to get cumulative iterator entries in file"""
1574
+ if i_file < 0:
1575
+ return 0
1576
+ n = self.entry_map[i_file]
1577
+ if n == np.iinfo("i").max:
1578
+ elist = self.get_file_entrylist(i_file)
1579
+ fcl = self._get_file_cumlen(i_file)
1580
+ if elist is None:
1581
+ # no entry list provided
1582
+ n = fcl
1583
+ else:
1584
+ file_entries = self.get_file_entrylist(i_file)
1585
+ # check that file entries fall inside of file
1586
+ if file_entries[-1] >= fcl:
1587
+ logging.warning(f"Found entries out of range for file {i_file}")
1588
+ n = np.searchsorted(file_entries, fcl, "right")
1589
+ else:
1590
+ n = len(file_entries)
1591
+ n += self._get_file_cumentries(i_file - 1)
1592
+ self.entry_map[i_file] = n
1593
+ return n
1594
+
1595
+ def get_file_entrylist(self, i_file: int) -> np.ndarray:
1596
+ """Helper to get entry list for file"""
1597
+ # If no entry list is provided
1598
+ if self.local_entry_list is None:
1599
+ return None
1600
+
1601
+ elist = self.local_entry_list[i_file]
1602
+ if elist is None:
1603
+ # Get local entrylist for this file from global entry list
1604
+ f_start = self._get_file_cumlen(i_file - 1)
1605
+ f_end = self._get_file_cumlen(i_file)
1606
+ i_start = self._get_file_cumentries(i_file - 1)
1607
+ i_stop = np.searchsorted(self.global_entry_list, f_end, "right")
1608
+ elist = np.array(self.global_entry_list[i_start:i_stop], "i") - f_start
1609
+ self.local_entry_list[i_file] = elist
1610
+ return elist
1611
+
1612
+ def get_global_entrylist(self) -> np.ndarray:
1613
+ """Get global entry list, constructing it if needed"""
1614
+ if self.global_entry_list is None and self.local_entry_list is not None:
1615
+ self.global_entry_list = np.zeros(len(self), "i")
1616
+ for i_file in range(len(self.lh5_files)):
1617
+ i_start = self._get_file_cumentries(i_file - 1)
1618
+ i_stop = self._get_file_cumentries(i_file)
1619
+ f_start = self._get_file_cumlen(i_file - 1)
1620
+ self.global_entry_list[i_start:i_stop] = (
1621
+ self.get_file_entrylist(i_file) + f_start
1622
+ )
1623
+ return self.global_entry_list
1624
+
1625
+ def read(self, entry: int) -> tuple[LGDO, int]:
1626
+ """Read the nextlocal chunk of events, starting at entry. Return the
1627
+ LH5 buffer and number of rows read."""
1628
+ self.n_rows = 0
1629
+ i_file = np.searchsorted(self.entry_map, entry, "right")
1630
+
1631
+ # if file hasn't been opened yet, search through files
1632
+ # sequentially until we find the right one
1633
+ if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("i").max:
1634
+ while i_file < len(self.lh5_files) and entry >= self._get_file_cumentries(
1635
+ i_file
1636
+ ):
1637
+ i_file += 1
1638
+
1639
+ if i_file == len(self.lh5_files):
1640
+ return (self.lh5_buffer, self.n_rows)
1641
+ local_entry = entry - self._get_file_cumentries(i_file - 1)
1642
+
1643
+ while self.n_rows < self.buffer_len and i_file < len(self.file_map):
1644
+ # Loop through files
1645
+ local_idx = self.get_file_entrylist(i_file)
1646
+ i_local = local_idx[local_entry] if local_idx is not None else local_entry
1647
+ self.lh5_buffer, n_rows = self.lh5_st.read_object(
1648
+ self.groups[i_file],
1649
+ self.lh5_files[i_file],
1650
+ start_row=i_local,
1651
+ n_rows=self.buffer_len - self.n_rows,
1652
+ idx=local_idx,
1653
+ field_mask=self.field_mask,
1654
+ obj_buf=self.lh5_buffer,
1655
+ obj_buf_start=self.n_rows,
1656
+ )
1657
+
1658
+ self.n_rows += n_rows
1659
+ i_file += 1
1660
+ local_entry = 0
1661
+
1662
+ self.current_entry = entry
1663
+
1664
+ if self.friend is not None:
1665
+ self.friend.read(entry)
1666
+
1667
+ return (self.lh5_buffer, self.n_rows)
1668
+
1669
+ def reset_field_mask(self, mask):
1670
+ """Replaces the field mask of this iterator and any friends with mask"""
1671
+ self.field_mask = mask
1672
+ if self.friend is not None:
1673
+ self.friend.reset_field_mask(mask)
1674
+
1675
+ def __len__(self) -> int:
1676
+ """Return the total number of entries."""
1677
+ return (
1678
+ self._get_file_cumentries(len(self.lh5_files) - 1)
1679
+ if len(self.entry_map) > 0
1680
+ else 0
1681
+ )
1682
+
1683
+ def __iter__(self) -> Iterator:
1684
+ """Loop through entries in blocks of size buffer_len."""
1685
+ self.current_entry = 0
1686
+ self.next_entry = 0
1687
+ return self
1688
+
1689
+ def __next__(self) -> tuple[LGDO, int, int]:
1690
+ """Read next buffer_len entries and return lh5_table, iterator entry
1691
+ and n_rows read."""
1692
+ buf, n_rows = self.read(self.next_entry)
1693
+ self.next_entry = self.current_entry + n_rows
1694
+ if n_rows == 0:
1695
+ raise StopIteration
1696
+ return (buf, self.current_entry, n_rows)
1697
+
1698
+
1699
+ @nb.njit(parallel=False, fastmath=True)
1700
+ def _make_fd_idx(starts, stops, idx):
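+ # Numba-compiled helper: fills `idx` with the concatenated index ranges
+ # [starts[j], stops[j]) so it can be used as a fancy index into
+ # flattened_data. If `starts` is one element shorter than `stops`, the
+ # first range is taken to begin at row 0.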
1701
+ k = 0
1702
+ if len(starts) < len(stops):
1703
+ for i in range(stops[0]):
1704
+ idx[k] = i
1705
+ k += 1
1706
+ stops = stops[1:]
1707
+ for j in range(len(starts)):
1708
+ for i in range(starts[j], stops[j]):
1709
+ idx[k] = i
1710
+ k += 1
1711
+ return (idx,)