legend-pydataobj 1.2.1__py3-none-any.whl → 1.4.0__py3-none-any.whl
- {legend_pydataobj-1.2.1.dist-info → legend_pydataobj-1.4.0.dist-info}/METADATA +1 -1
- {legend_pydataobj-1.2.1.dist-info → legend_pydataobj-1.4.0.dist-info}/RECORD +15 -14
- lgdo/_version.py +2 -2
- lgdo/cli.py +56 -0
- lgdo/compression/base.py +2 -0
- lgdo/compression/generic.py +8 -2
- lgdo/compression/radware.py +226 -112
- lgdo/compression/varlen.py +30 -21
- lgdo/lh5_store.py +173 -64
- lgdo/types/scalar.py +1 -1
- lgdo/types/table.py +6 -3
- {legend_pydataobj-1.2.1.dist-info → legend_pydataobj-1.4.0.dist-info}/LICENSE +0 -0
- {legend_pydataobj-1.2.1.dist-info → legend_pydataobj-1.4.0.dist-info}/WHEEL +0 -0
- {legend_pydataobj-1.2.1.dist-info → legend_pydataobj-1.4.0.dist-info}/entry_points.txt +0 -0
- {legend_pydataobj-1.2.1.dist-info → legend_pydataobj-1.4.0.dist-info}/top_level.txt +0 -0
lgdo/compression/varlen.py
CHANGED
@@ -11,7 +11,7 @@ from numpy import int32, ubyte, uint32
 from numpy.typing import NDArray
 
 from .. import types as lgdo
-from .base import WaveformCodec
+from .base import WaveformCodec, numba_defaults
 
 log = logging.getLogger(__name__)
 
@@ -30,7 +30,11 @@ class ULEB128ZigZagDiff(WaveformCodec):
 def encode(
     sig_in: NDArray | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays,
     sig_out: NDArray[ubyte] = None,
-) -> (
+) -> (
+    (NDArray[ubyte], NDArray[uint32])
+    | lgdo.VectorOfEncodedVectors
+    | lgdo.ArrayOfEncodedEqualSizedArrays
+):
     """Compress digital signal(s) with a variable-length encoding of its derivative.
 
     Wraps :func:`uleb128_zigzag_diff_array_encode` and adds support for encoding
@@ -41,8 +45,9 @@ def encode(
     If `sig_in` is a NumPy array, no resizing of `sig_out` is performed. Not
     even of the internally allocated one.
 
-    Because of the current implementation, providing a
-    :class:`.VectorOfEncodedVectors`
+    Because of the current (hardware vectorized) implementation, providing a
+    pre-allocated :class:`.VectorOfEncodedVectors` or
+    :class:`.ArrayOfEncodedEqualSizedArrays` as `sig_out` is not possible.
 
     Parameters
     ----------
@@ -54,11 +59,12 @@ def encode(
 
     Returns
     -------
-    sig_out, nbytes
+    sig_out, nbytes | LGDO
         given pre-allocated `sig_out` structure or new structure of unsigned
         8-bit integers, plus the number of bytes (length) of the encoded
         signal. If `sig_in` is an :class:`.LGDO`, only a newly allocated
-        :class:`.VectorOfEncodedVectors`
+        :class:`.VectorOfEncodedVectors` or
+        :class:`.ArrayOfEncodedEqualSizedArrays` is returned.
 
     See Also
     --------
@@ -88,7 +94,7 @@ def encode(
         return sig_out, nbytes
 
     elif isinstance(sig_in, lgdo.VectorOfVectors):
-        if sig_out:
+        if sig_out is not None:
             log.warning(
                 "a pre-allocated VectorOfEncodedVectors was given "
                 "to hold an encoded ArrayOfEqualSizedArrays. "
@@ -142,9 +148,11 @@ def encode(
 
 
 def decode(
-    sig_in: (NDArray[ubyte], NDArray[uint32])
-
-
+    sig_in: (NDArray[ubyte], NDArray[uint32])
+    | lgdo.VectorOfEncodedVectors
+    | lgdo.ArrayOfEncodedEqualSizedArrays,
+    sig_out: NDArray | lgdo.ArrayOfEqualSizedArrays = None,
+) -> (NDArray, NDArray[uint32]) | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays:
     """Deompress digital signal(s) with a variable-length encoding of its derivative.
 
     Wraps :func:`uleb128_zigzag_diff_array_decode` and adds support for decoding
@@ -159,8 +167,8 @@ def decode(
     :class:`.ArrayOfEqualSizedArrays` `sig_out` has instead always the correct
     size.
 
-    Because of the current implementation, providing a
-    :class:`.VectorOfVectors` as `sig_out` is not possible.
+    Because of the current (hardware vectorized) implementation, providing a
+    pre-allocated :class:`.VectorOfVectors` as `sig_out` is not possible.
 
     Parameters
     ----------
@@ -173,8 +181,9 @@ def decode(
 
     Returns
     -------
-    sig_out
-        given pre-allocated structure or new structure of 32-bit integers
+    sig_out, nbytes | LGDO
+        given pre-allocated structure or new structure of 32-bit integers, plus
+        the number of bytes (length) of the decoded signal.
 
     See Also
     --------
@@ -199,7 +208,7 @@ def decode(
         return sig_out, siglen
 
     elif isinstance(sig_in, lgdo.ArrayOfEncodedEqualSizedArrays):
-        if
+        if sig_out is None:
             # initialize output structure with decoded_size
             sig_out = lgdo.ArrayOfEqualSizedArrays(
                 dims=(1, 1),
@@ -257,7 +266,7 @@ def decode(
 
 @numba.vectorize(
     ["uint64(int64)", "uint32(int32)", "uint16(int16)"],
-
+    **numba_defaults,
 )
 def zigzag_encode(x: int | NDArray[int]) -> int | NDArray[int]:
     """ZigZag-encode [#WikiZZ]_ signed integer numbers."""
@@ -266,14 +275,14 @@ def zigzag_encode(x: int | NDArray[int]) -> int | NDArray[int]:
 
 @numba.vectorize(
     ["int64(uint64)", "int32(uint32)", "int16(uint16)"],
-
+    **numba_defaults,
 )
 def zigzag_decode(x: int | NDArray[int]) -> int | NDArray[int]:
     """ZigZag-decode [#WikiZZ]_ signed integer numbers."""
     return (x >> 1) ^ -(x & 1)
 
 
-@numba.jit(["uint32(int64, byte[:])"],
+@numba.jit(["uint32(int64, byte[:])"], **numba_defaults)
 def uleb128_encode(x: int, encx: NDArray[ubyte]) -> int:
     """Compute a variable-length representation of an unsigned integer.
 
@@ -306,7 +315,7 @@ def uleb128_encode(x: int, encx: NDArray[ubyte]) -> int:
     return i + 1
 
 
-@numba.jit(["UniTuple(uint32, 2)(byte[:])"],
+@numba.jit(["UniTuple(uint32, 2)(byte[:])"], **numba_defaults)
 def uleb128_decode(encx: NDArray[ubyte]) -> (int, int):
     """Decode a variable-length integer into an unsigned integer.
 
@@ -351,7 +360,7 @@ def uleb128_decode(encx: NDArray[ubyte]) -> (int, int):
         "void(int64[:], byte[:], uint32[:])",
     ],
     "(n),(m),()",
-
+    **numba_defaults,
 )
 def uleb128_zigzag_diff_array_encode(
     sig_in: NDArray[int], sig_out: NDArray[ubyte], nbytes: int
|
         "void(byte[:], uint32[:], int64[:], uint32[:])",
     ],
     "(n),(),(m),()",
-
+    **numba_defaults,
 )
 def uleb128_zigzag_diff_array_decode(
     sig_in: NDArray[ubyte],
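To make the intent of the hunks above concrete: the codec takes sample-to-sample differences, ZigZag-maps the signed differences to unsigned integers, and packs those with ULEB128. A minimal NumPy sketch of the first two steps (plain Python, not the numba-vectorized functions above; the decode expression is the one visible in the zigzag_decode hunk):

import numpy as np

def zigzag_encode(x):
    # signed -> unsigned: 0, -1, 1, -2, 2, ... -> 0, 1, 2, 3, 4, ...
    nbits = 8 * x.dtype.itemsize
    return (x << 1) ^ (x >> (nbits - 1))

def zigzag_decode(x):
    # inverse mapping, same expression as in the hunk above
    return (x >> 1) ^ -(x & 1)

sig = np.array([100, 102, 103, 101, 101], dtype=np.int64)
deltas = np.diff(sig, prepend=0)  # first sample stored as its difference from zero
codes = zigzag_encode(deltas)     # small non-negative values, cheap to ULEB128-pack
assert np.array_equal(np.cumsum(zigzag_decode(codes)), sig)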
lgdo/lh5_store.py
CHANGED
@@ -38,7 +38,7 @@ LGDO = Union[Array, Scalar, Struct, VectorOfVectors]
 
 log = logging.getLogger(__name__)
 
-
+DEFAULT_HDF5_SETTINGS: dict[str, ...] = {"shuffle": True, "compression": "gzip"}
 
 
 class LH5Store:
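The new DEFAULT_HDF5_SETTINGS dictionary is forwarded as keyword arguments to h5py.Group.create_dataset, so by default written datasets are now gzip-compressed with the byte-shuffle filter enabled. At the plain h5py level this is roughly equivalent to (a sketch; file and dataset names are made up):

import numpy as np
import h5py

with h5py.File("example.lh5", "w") as f:
    # what {"shuffle": True, "compression": "gzip"} amounts to for each dataset
    f.create_dataset("dset", data=np.arange(1000), shuffle=True, compression="gzip")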
@@ -169,6 +169,7 @@ class LH5Store:
         start_row: int = 0,
         n_rows: int = sys.maxsize,
         idx: np.ndarray | list | tuple | list[np.ndarray | list | tuple] = None,
+        use_h5idx: bool = False,
         field_mask: dict[str, bool] | list[str] | tuple[str] = None,
         obj_buf: LGDO = None,
         obj_buf_start: int = 0,
@@ -176,6 +177,14 @@ class LH5Store:
     ) -> tuple[LGDO, int]:
         """Read LH5 object data from a file.
 
+        Use the ``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag
+        controls whether *only* those rows are read from disk or if the rows are indexed after reading
+        the entire object. Reading individual rows can be orders of magnitude slower than reading
+        the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
+        is to use slightly more memory for a much faster read. See
+        `legend-pydataobj #29 <https://github.com/legend-exp/legend-pydataobj/issues/29>`_
+        for additional information.
+
         Parameters
         ----------
         name
@@ -192,16 +201,27 @@ class LH5Store:
             actual number of rows read will be returned as one of the return
             values (see below).
         idx
-            For NumPy-style "fancying indexing" for the read
-            rows
-            axis is supported, so tuple arguments
-            is not false, `idx` will be truncated to
-            with a list of files, can pass in a list of
-            file) or use a long contiguous list (e.g. built from a previous
+            For NumPy-style "fancying indexing" for the read to select only some
+            rows, e.g. after applying some cuts to particular columns.
+            Only selection along the first axis is supported, so tuple arguments
+            must be one-tuples. If `n_rows` is not false, `idx` will be truncated to
+            `n_rows` before reading. To use with a list of files, can pass in a list of
+            `idx`'s (one for each file) or use a long contiguous list (e.g. built from a previous
             identical read). If used in conjunction with `start_row` and `n_rows`,
             will be sliced to obey those constraints, where `n_rows` is
             interpreted as the (max) number of *selected* values (in `idx`) to be
-            read out.
+            read out. Note that the ``use_h5idx`` parameter controls some behaviour of the
+            read and that the default behavior (``use_h5idx=False``) prioritizes speed over
+            a small memory penalty.
+        use_h5idx
+            ``True`` will directly pass the ``idx`` parameter to the underlying
+            ``h5py`` call such that only the selected rows are read directly into memory,
+            which conserves memory at the cost of speed. There can be a significant penalty
+            to speed for larger files (1 - 2 orders of magnitude longer time).
+            ``False`` (default) will read the entire object into memory before
+            performing the indexing. The default is much faster but requires additional memory,
+            though a relatively small amount in the typical use case. It is recommended to
+            leave this parameter as its default.
         field_mask
             For tables and structs, determines which fields get written out.
             Only applies to immediate fields of the requested objects. If a dict
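A short usage sketch of the new flag (dataset path, file name and the cut array are placeholders; the call follows the signature added above):

import numpy as np
from lgdo.lh5_store import LH5Store

store = LH5Store()
cut = np.array([0, 5, 8, 42])  # rows selected by some earlier analysis step

# default: read the whole object, then index in memory (fast, small extra memory)
wfs, n_read = store.read_object("geds/raw/waveform", "data.lh5", idx=cut)

# pass idx straight to h5py: lower memory footprint, potentially much slower
wfs, n_read = store.read_object("geds/raw/waveform", "data.lh5", idx=cut, use_h5idx=True)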
@@ -223,6 +243,7 @@ class LH5Store:
             after reading. The option has no effect on data encoded with HDF5
             built-in filters, which is always decompressed upstream by HDF5.
 
+
         Returns
         -------
         (object, n_rows_read)
@@ -236,6 +257,14 @@ class LH5Store:
         if not isinstance(lh5_file, (str, h5py.File)):
             lh5_file = list(lh5_file)
             n_rows_read = 0
+
+            # to know whether we are reading in a list of files.
+            # this is part of the fix for reading data by idx
+            # (see https://github.com/legend-exp/legend-pydataobj/issues/29)
+            # so that we only make a copy of the data if absolutely necessary
+            # or if we can read the data from file without having to make a copy
+            self.in_file_loop = True
+
             for i, h5f in enumerate(lh5_file):
                 if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]):
                     # a list of lists: must be one per file
@@ -255,22 +284,32 @@ class LH5Store:
                 else:
                     idx_i = None
                 n_rows_i = n_rows - n_rows_read
+
+                # maybe someone passed in a list of len==1?
+                if i == (len(lh5_file) - 1):
+                    self.in_file_loop = False
+
                 obj_buf, n_rows_read_i = self.read_object(
                     name,
                     lh5_file[i],
                     start_row=start_row,
                     n_rows=n_rows_i,
                     idx=idx_i,
+                    use_h5idx=use_h5idx,
                     field_mask=field_mask,
                     obj_buf=obj_buf,
                     obj_buf_start=obj_buf_start,
                     decompress=decompress,
                 )
+
                 n_rows_read += n_rows_read_i
                 if n_rows_read >= n_rows or obj_buf is None:
                     return obj_buf, n_rows_read
                 start_row = 0
                 obj_buf_start += n_rows_read_i
+
+            self.in_file_loop = False
+
             return obj_buf, n_rows_read
 
         # get the file from the store
@@ -358,6 +397,7 @@ class LH5Store:
                 start_row=start_row,
                 n_rows=n_rows,
                 idx=idx,
+                use_h5idx=use_h5idx,
                 decompress=decompress,
             )
             # modify datatype in attrs if a field_mask was used
@@ -404,6 +444,7 @@ class LH5Store:
                     start_row=start_row,
                     n_rows=n_rows,
                     idx=idx,
+                    use_h5idx=use_h5idx,
                     obj_buf=fld_buf,
                     obj_buf_start=obj_buf_start,
                     decompress=decompress,
@@ -497,6 +538,7 @@ class LH5Store:
                     start_row=start_row,
                     n_rows=n_rows,
                     idx=idx,
+                    use_h5idx=use_h5idx,
                     obj_buf=None if decompress else decoded_size_buf,
                     obj_buf_start=0 if decompress else obj_buf_start,
                 )
@@ -508,6 +550,7 @@ class LH5Store:
                     start_row=start_row,
                     n_rows=n_rows,
                     idx=idx,
+                    use_h5idx=use_h5idx,
                     obj_buf=None if decompress else encoded_data_buf,
                     obj_buf_start=0 if decompress else obj_buf_start,
                 )
@@ -531,26 +574,31 @@ class LH5Store:
             elif obj_buf is None and decompress:
                 return compress.decode(rawdata), n_rows_read
 
+            # eventually expand provided obj_buf, if too short
+            buf_size = obj_buf_start + n_rows_read
+            if len(obj_buf) < buf_size:
+                obj_buf.resize(buf_size)
+
             # use the (decoded object type) buffer otherwise
-            if enc_lgdo ==
-                obj_buf,
-
-
-
-
-
-
-
-
-
-
+            if enc_lgdo == ArrayOfEncodedEqualSizedArrays:
+                if not isinstance(obj_buf, ArrayOfEqualSizedArrays):
+                    raise ValueError(
+                        f"obj_buf for decoded '{name}' not an ArrayOfEqualSizedArrays"
+                    )
+
+                compress.decode(rawdata, obj_buf[obj_buf_start:buf_size])
+
+            elif enc_lgdo == VectorOfEncodedVectors:
+                if not isinstance(obj_buf, VectorOfVectors):
+                    raise ValueError(
+                        f"obj_buf for decoded '{name}' not a VectorOfVectors"
+                    )
 
-
-
-
-
-
-                    obj_buf[i] = wf
+                # FIXME: not a good idea. an in place decoding version
+                # of decode would be needed to avoid extra memory
+                # allocations
+                for i, wf in enumerate(compress.decode(rawdata)):
+                    obj_buf[obj_buf_start + i] = wf
 
             return obj_buf, n_rows_read
 
@@ -568,6 +616,7 @@ class LH5Store:
                 start_row=start_row,
                 n_rows=n_rows,
                 idx=idx,
+                use_h5idx=use_h5idx,
                 obj_buf=cumulen_buf,
                 obj_buf_start=obj_buf_start,
             )
@@ -592,6 +641,7 @@ class LH5Store:
                     start_row=start_row,
                     n_rows=n_rows,
                     idx=idx2,
+                    use_h5idx=use_h5idx,
                 )
                 fd_starts = fd_starts.nda  # we just need the nda
                 if fd_start is None:
@@ -674,6 +724,7 @@ class LH5Store:
                 start_row=fd_start,
                 n_rows=fd_n_rows,
                 idx=fd_idx,
+                use_h5idx=use_h5idx,
                 obj_buf=fd_buf,
                 obj_buf_start=fd_buf_start,
             )
@@ -717,9 +768,22 @@ class LH5Store:
             if n_rows_to_read > n_rows:
                 n_rows_to_read = n_rows
 
+            # if idx is passed, check if we can make it a slice instead (faster)
+            change_idx_to_slice = False
+
             # prepare the selection for the read. Use idx if available
             if idx is not None:
-
+                # check if idx is empty and convert to slice instead
+                if len(idx[0]) == 0:
+                    source_sel = np.s_[0:0]
+                    change_idx_to_slice = True
+                # check if idx is contiguous and increasing
+                # if so, convert it to a slice instead (faster)
+                elif np.all(np.diff(idx[0]) == 1):
+                    source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
+                    change_idx_to_slice = True
+                else:
+                    source_sel = idx
             else:
                 source_sel = np.s_[start_row : start_row + n_rows_to_read]
 
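The conversion added above exists because h5py serves a slice much faster than a point-wise fancy selection, and a contiguous increasing index array carries the same information as a slice. The same check, standalone:

import numpy as np

idx = np.array([10, 11, 12, 13, 14])

if idx.size > 0 and np.all(np.diff(idx) == 1):
    # contiguous and increasing: a plain slice is equivalent and much cheaper
    source_sel = np.s_[idx[0] : idx[-1] + 1]
else:
    source_sel = (idx,)  # fall back to fancy indexing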
@@ -729,14 +793,34 @@ class LH5Store:
                 if len(obj_buf) < buf_size:
                     obj_buf.resize(buf_size)
                 dest_sel = np.s_[obj_buf_start:buf_size]
-
+
+                # this is required to make the read of multiple files faster
+                # until a better solution found.
+                if change_idx_to_slice or idx is None or use_h5idx:
+                    h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
+                else:
+                    # it is faster to read the whole object and then do fancy indexing
+                    obj_buf.nda[dest_sel] = h5f[name][...][source_sel]
+
                 nda = obj_buf.nda
             else:
                 if n_rows == 0:
                     tmp_shape = (0,) + h5f[name].shape[1:]
                     nda = np.empty(tmp_shape, h5f[name].dtype)
                 else:
-
+                    if change_idx_to_slice or idx is None or use_h5idx:
+                        nda = h5f[name][source_sel]
+                    else:
+                        # it is faster to read the whole object and then do fancy indexing
+                        nda = h5f[name][...][source_sel]
+
+                    # if reading a list of files recursively, this is given to obj_buf on
+                    # the first file read. obj_buf needs to be resized and therefore
+                    # it needs to hold the data itself (not a view of the data).
+                    # a view is returned by the source_sel indexing, which cannot be resized
+                    # by ndarray.resize().
+                    if hasattr(self, "in_file_loop") and self.in_file_loop:
+                        nda = np.copy(nda)
 
             # special handling for bools
             # (c and Julia store as uint8 so cast to bool)
@@ -776,7 +860,7 @@ class LH5Store:
         n_rows: int = None,
         wo_mode: str = "append",
         write_start: int = 0,
-
+        **h5py_kwargs,
     ) -> None:
         """Write an LGDO into an LH5 file.
 
@@ -791,20 +875,30 @@ class LH5Store:
             passed directly to :meth:`h5py.Group.create_dataset`.
 
         :class:`.WaveformCodec` object
-            If `obj` is a :class:`.WaveformTable
-            this algorithm. More
-            compression algorithms at
+            If `obj` is a :class:`.WaveformTable` and ``obj.values`` holds the
+            attribute, compress ``values`` using this algorithm. More
+            documentation about the supported waveform compression algorithms at
+            :mod:`.lgdo.compression`.
+
+        If the `obj` :class:`.LGDO` has a `hdf5_settings` attribute holding a
+        dictionary, it is interpreted as a list of keyword arguments to be
+        forwarded directly to :meth:`h5py.Group.create_dataset` (exactly like
+        the first format of `compression` above). This is the preferred way to
+        specify HDF5 dataset options such as chunking etc. If compression
+        options are specified, they take precedence over those set with the
+        `compression` attribute.
 
         Note
         ----
-        The `compression` attribute takes precedence over the
-
+        The `compression` LGDO attribute takes precedence over the default HDF5
+        compression settings. The `hdf5_settings` attribute takes precedence
+        over `compression`. These attributes are not written to disk.
 
         Note
         ----
-        HDF5 compression is skipped for the `encoded_data`
-        :class:`.VectorOfEncodedVectors` and
-        :class
+        HDF5 compression is skipped for the `encoded_data.flattened_data`
+        dataset of :class:`.VectorOfEncodedVectors` and
+        :class:`.ArrayOfEncodedEqualSizedArrays`.
 
         Parameters
         ----------
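A sketch of the two ways to pass HDF5 dataset options described above (the import paths, the Array constructor and the exact positional order of the write_object arguments are assumptions, they are not shown in this diff):

import numpy as np
from lgdo.lh5_store import LH5Store
from lgdo.types import Array  # assumed export location

store = LH5Store()
data = Array(np.zeros(100_000))

# per-object settings, forwarded to h5py.Group.create_dataset and not written to disk
data.attrs["hdf5_settings"] = {"compression": "lzf", "chunks": (10_000,)}
store.write_object(data, "baseline", "out.lh5")

# or per-call keyword arguments, overridden by the object attributes above if both are set
store.write_object(data, "baseline2", "out.lh5", compression="gzip", compression_opts=9)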
@@ -840,15 +934,17 @@ class LH5Store:
         write_start
             row in the output file (if already existing) to start overwriting
             from.
-
-
-
-
+        **h5py_kwargs
+            additional keyword arguments forwarded to
+            :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5
+            compression filter to be applied before writing non-scalar
+            datasets. **Note: `compression` Ignored if compression is specified
+            as an `obj` attribute.**
         """
         log.debug(
             f"writing {repr(obj)}[{start_row}:{n_rows}] as "
             f"{lh5_file}:{group}/{name}[{write_start}:], "
-            f"mode = {wo_mode},
+            f"mode = {wo_mode}, h5py_kwargs = {h5py_kwargs}"
         )
 
         if wo_mode == "write_safe":
@@ -921,8 +1017,8 @@ class LH5Store:
         for field in obj.keys():
             # eventually compress waveform table values with LGDO's
             # custom codecs before writing
-            # if waveformtable.values.attrs["compression"] is a
-            #
+            # if waveformtable.values.attrs["compression"] is NOT a
+            # WaveformCodec, just leave it there
             obj_fld = None
             if (
                 isinstance(obj, WaveformTable)
|
                     n_rows=n_rows,
                     wo_mode=wo_mode,
                     write_start=write_start,
-
+                    **h5py_kwargs,
                 )
             return
 
@@ -972,6 +1068,9 @@ class LH5Store:
                 name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
             )
 
+            # ask not to further compress flattened_data, it is already compressed!
+            obj.encoded_data.flattened_data.attrs["compression"] = None
+
             self.write_object(
                 obj.encoded_data,
                 "encoded_data",
|
                 n_rows=n_rows,
                 wo_mode=wo_mode,
                 write_start=write_start,
-
+                **h5py_kwargs,
             )
 
             self.write_object(
|
                 n_rows=n_rows,
                 wo_mode=wo_mode,
                 write_start=write_start,
-
+                **h5py_kwargs,
             )
 
         # vector of vectors
@@ -1029,7 +1128,7 @@ class LH5Store:
                 n_rows=fd_n_rows,
                 wo_mode=wo_mode,
                 write_start=offset,
-
+                **h5py_kwargs,
             )
 
             # now offset is used to give appropriate in-file values for
@@ -1052,7 +1151,7 @@ class LH5Store:
                 n_rows=n_rows,
                 wo_mode=wo_mode,
                 write_start=write_start,
-
+                **h5py_kwargs,
             )
             obj.cumulative_length.nda -= cl_dtype(offset)
 
@@ -1072,29 +1171,39 @@ class LH5Store:
             # need to create dataset from ndarray the first time for speed
             # creating an empty dataset and appending to that is super slow!
             if (wo_mode != "a" and write_start == 0) or name not in group:
+                # this is needed in order to have a resizable (in the first
+                # axis) data set, i.e. rows can be appended later
+                # NOTE: this automatically turns chunking on!
                 maxshape = (None,) + nda.shape[1:]
+                h5py_kwargs.setdefault("maxshape", maxshape)
+
                 if wo_mode == "o" and name in group:
                     log.debug(f"overwriting {name} in {group}")
                     del group[name]
 
+                # set default compression options
+                for k, v in DEFAULT_HDF5_SETTINGS.items():
+                    h5py_kwargs.setdefault(k, v)
+
+                # compress using the 'compression' LGDO attribute, if available
+                if "compression" in obj.attrs:
+                    comp_algo = obj.attrs["compression"]
+                    if isinstance(comp_algo, dict):
+                        h5py_kwargs |= obj.attrs["compression"]
+                    else:
+                        h5py_kwargs["compression"] = obj.attrs["compression"]
+
+                # and even the 'hdf5_settings' one, preferred
+                if "hdf5_settings" in obj.attrs:
+                    h5py_kwargs |= obj.attrs["hdf5_settings"]
+
                 # create HDF5 dataset
-
-                # available
-                # - otherwise use "hdf5_compression"
-                # - attach HDF5 dataset attributes, but not "compression"!
-                comp_algo = obj.attrs.get("compression", hdf5_compression)
-                comp_kwargs = {}
-                if isinstance(comp_algo, str):
-                    comp_kwargs = {"compression": comp_algo}
-                elif comp_algo is not None:
-                    comp_kwargs = comp_algo
-
-                ds = group.create_dataset(
-                    name, data=nda, maxshape=maxshape, **comp_kwargs
-                )
+                ds = group.create_dataset(name, data=nda, **h5py_kwargs)
 
+                # attach HDF5 dataset attributes, but not "compression"!
                 _attrs = obj.getattrs(datatype=True)
                 _attrs.pop("compression", None)
+                _attrs.pop("hdf5_settings", None)
                 ds.attrs.update(_attrs)
                 return
 
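Putting the last hunk together: the keyword arguments for create_dataset are assembled in a fixed precedence order. A small self-contained sketch of that merge logic (not the library code itself):

DEFAULT_HDF5_SETTINGS = {"shuffle": True, "compression": "gzip"}

def build_dataset_kwargs(call_kwargs, attrs):
    # call-site kwargs first, library defaults fill unset keys, then the
    # 'compression' attribute, and 'hdf5_settings' has the final say
    kwargs = dict(call_kwargs)
    for k, v in DEFAULT_HDF5_SETTINGS.items():
        kwargs.setdefault(k, v)
    if "compression" in attrs:
        comp = attrs["compression"]
        kwargs.update(comp if isinstance(comp, dict) else {"compression": comp})
    if "hdf5_settings" in attrs:
        kwargs.update(attrs["hdf5_settings"])
    return kwargs

print(build_dataset_kwargs({}, {"compression": "lzf"}))
# -> {'shuffle': True, 'compression': 'lzf'}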
lgdo/types/scalar.py
CHANGED
@@ -18,7 +18,7 @@ class Scalar(LGDO):
 
     # TODO: do scalars need proper numpy dtypes?
 
-    def __init__(self, value: int | float, attrs: dict[str, Any] = None) -> None:
+    def __init__(self, value: int | float | str, attrs: dict[str, Any] = None) -> None:
         """
         Parameters
         ----------
lgdo/types/table.py
CHANGED
@@ -225,7 +225,10 @@ class Table(Struct):
             if not hasattr(column, "nda"):
                 raise ValueError(f"column {col} does not have an nda")
             else:
-
+                if len(column.nda.shape) == 1:
+                    df[prefix + str(col)] = column.nda
+                else:
+                    df[prefix + str(col)] = column.nda.tolist()
 
         return df
 
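The branch added above is needed because pandas accepts a 1D array directly as a column but wants one Python object per row for anything multi-dimensional, for example:

import numpy as np
import pandas as pd

df = pd.DataFrame()
df["flat"] = np.arange(4)                           # 1D: assigned as-is
df["nested"] = np.arange(8).reshape(4, 2).tolist()  # 2D: one list per row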
@@ -248,8 +251,8 @@ class Table(Struct):
             "O1": {
                 "expression": "p1 + p2 * a**2",
                 "parameters": {
-                    "p1":
-                    "p2":
+                    "p1": 2,
+                    "p2": 3
                 }
             },
             "O2": {
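For reference, the corrected "O1" entry evaluates its expression over a table column with the given parameters; written out by hand for a column a:

import numpy as np

p1, p2 = 2, 3
a = np.array([1.0, 2.0, 3.0])
o1 = p1 + p2 * a**2  # -> array([ 5., 14., 29.])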
LICENSE, WHEEL, entry_points.txt, top_level.txt: files without changes.