legend-pydataobj 1.2.1__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,7 +11,7 @@ from numpy import int32, ubyte, uint32
  from numpy.typing import NDArray

  from .. import types as lgdo
- from .base import WaveformCodec
+ from .base import WaveformCodec, numba_defaults

  log = logging.getLogger(__name__)

@@ -30,7 +30,11 @@ class ULEB128ZigZagDiff(WaveformCodec):
  def encode(
  sig_in: NDArray | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays,
  sig_out: NDArray[ubyte] = None,
- ) -> (NDArray[ubyte], NDArray[uint32]) | lgdo.VectorOfEncodedVectors:
+ ) -> (
+ (NDArray[ubyte], NDArray[uint32])
+ | lgdo.VectorOfEncodedVectors
+ | lgdo.ArrayOfEncodedEqualSizedArrays
+ ):
  """Compress digital signal(s) with a variable-length encoding of its derivative.

  Wraps :func:`uleb128_zigzag_diff_array_encode` and adds support for encoding
@@ -41,8 +45,9 @@ def encode(
  If `sig_in` is a NumPy array, no resizing of `sig_out` is performed, not
  even of the internally allocated one.

- Because of the current implementation, providing a pre-allocated
- :class:`.VectorOfEncodedVectors` as `sig_out` is not possible.
+ Because of the current (hardware vectorized) implementation, providing a
+ pre-allocated :class:`.VectorOfEncodedVectors` or
+ :class:`.ArrayOfEncodedEqualSizedArrays` as `sig_out` is not possible.

  Parameters
  ----------
@@ -54,11 +59,12 @@ def encode(

  Returns
  -------
- sig_out, nbytes
+ sig_out, nbytes | LGDO
  given pre-allocated `sig_out` structure or new structure of unsigned
  8-bit integers, plus the number of bytes (length) of the encoded
  signal. If `sig_in` is an :class:`.LGDO`, only a newly allocated
- :class:`.VectorOfEncodedVectors` is returned.
+ :class:`.VectorOfEncodedVectors` or
+ :class:`.ArrayOfEncodedEqualSizedArrays` is returned.

  See Also
  --------
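Taken together with the widened return annotation, `encode` now mirrors its input: plain NumPy arrays come back as a `(sig_out, nbytes)` pair, while LGDO inputs yield the matching encoded LGDO container. A rough round-trip sketch for the NumPy path (the module path `lgdo.compression.varlen` and the exact call shapes are assumptions based on this diff, not verified against the released API):

    import numpy as np
    from lgdo.compression import varlen  # assumed module name for this codec

    # a fake digitized waveform with a small derivative (compresses well)
    rng = np.random.default_rng(0)
    sig = np.cumsum(rng.integers(-8, 8, size=1000)).astype("int32")

    enc, nbytes = varlen.encode(sig)  # ubyte buffer plus encoded length
    dec, siglen = varlen.decode((enc[:nbytes], np.uint32(nbytes)))
    assert np.array_equal(dec[:siglen], sig)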
@@ -88,7 +94,7 @@ def encode(
  return sig_out, nbytes

  elif isinstance(sig_in, lgdo.VectorOfVectors):
- if sig_out:
+ if sig_out is not None:
  log.warning(
  "a pre-allocated VectorOfEncodedVectors was given "
  "to hold an encoded ArrayOfEqualSizedArrays. "
@@ -142,9 +148,11 @@ def encode(


  def decode(
- sig_in: (NDArray[ubyte], NDArray[uint32]) | lgdo.VectorOfEncodedVectors,
- sig_out: NDArray | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays = None,
- ) -> NDArray | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays:
+ sig_in: (NDArray[ubyte], NDArray[uint32])
+ | lgdo.VectorOfEncodedVectors
+ | lgdo.ArrayOfEncodedEqualSizedArrays,
+ sig_out: NDArray | lgdo.ArrayOfEqualSizedArrays = None,
+ ) -> (NDArray, NDArray[uint32]) | lgdo.VectorOfVectors | lgdo.ArrayOfEqualSizedArrays:
  """Decompress digital signal(s) with a variable-length encoding of its derivative.

  Wraps :func:`uleb128_zigzag_diff_array_decode` and adds support for decoding
@@ -159,8 +167,8 @@ def decode(
  :class:`.ArrayOfEqualSizedArrays` `sig_out` instead always has the correct
  size.

- Because of the current implementation, providing a pre-allocated
- :class:`.VectorOfVectors` as `sig_out` is not possible.
+ Because of the current (hardware vectorized) implementation, providing a
+ pre-allocated :class:`.VectorOfVectors` as `sig_out` is not possible.

  Parameters
  ----------
@@ -173,8 +181,9 @@ def decode(

  Returns
  -------
- sig_out
- given pre-allocated structure or new structure of 32-bit integers.
+ sig_out, nbytes | LGDO
+ given pre-allocated structure or new structure of 32-bit integers, plus
+ the number of bytes (length) of the decoded signal.

  See Also
  --------
@@ -199,7 +208,7 @@ def decode(
  return sig_out, siglen

  elif isinstance(sig_in, lgdo.ArrayOfEncodedEqualSizedArrays):
- if not sig_out:
+ if sig_out is None:
  # initialize output structure with decoded_size
  sig_out = lgdo.ArrayOfEqualSizedArrays(
  dims=(1, 1),
@@ -257,7 +266,7 @@ def decode(

  @numba.vectorize(
  ["uint64(int64)", "uint32(int32)", "uint16(int16)"],
- nopython=True,
+ **numba_defaults,
  )
  def zigzag_encode(x: int | NDArray[int]) -> int | NDArray[int]:
  """ZigZag-encode [#WikiZZ]_ signed integer numbers."""
@@ -266,14 +275,14 @@ def zigzag_encode(x: int | NDArray[int]) -> int | NDArray[int]:

  @numba.vectorize(
  ["int64(uint64)", "int32(uint32)", "int16(uint16)"],
- nopython=True,
+ **numba_defaults,
  )
  def zigzag_decode(x: int | NDArray[int]) -> int | NDArray[int]:
  """ZigZag-decode [#WikiZZ]_ signed integer numbers."""
  return (x >> 1) ^ -(x & 1)


- @numba.jit(["uint32(int64, byte[:])"], nopython=True)
+ @numba.jit(["uint32(int64, byte[:])"], **numba_defaults)
  def uleb128_encode(x: int, encx: NDArray[ubyte]) -> int:
  """Compute a variable-length representation of an unsigned integer.

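For reference, ZigZag interleaves signed integers into unsigned ones so that values of small magnitude, of either sign, stay small before the ULEB128 stage. A pure-Python sketch of the round trip (the decode line matches the formula in the hunk above; the encode formula is the standard ZigZag counterpart, written here for 64-bit inputs):

    def zz_encode(x: int, bits: int = 64) -> int:
        # 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
        return (x << 1) ^ (x >> (bits - 1))

    def zz_decode(u: int) -> int:
        # inverse transform, same expression as zigzag_decode above
        return (u >> 1) ^ -(u & 1)

    for x in (0, -1, 1, -2, 2, -64):
        assert zz_decode(zz_encode(x)) == x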
@@ -306,7 +315,7 @@ def uleb128_encode(x: int, encx: NDArray[ubyte]) -> int:
  return i + 1


- @numba.jit(["UniTuple(uint32, 2)(byte[:])"], nopython=True)
+ @numba.jit(["UniTuple(uint32, 2)(byte[:])"], **numba_defaults)
  def uleb128_decode(encx: NDArray[ubyte]) -> (int, int):
  """Decode a variable-length integer into an unsigned integer.

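ULEB128 emits an unsigned integer seven bits at a time, least-significant group first, setting the top bit of every byte except the last. A self-contained sketch of both directions (the classic test value 624485 encodes to the bytes 0xE5 0x8E 0x26):

    def uleb128_enc(x: int) -> bytes:
        out = bytearray()
        while True:
            byte = x & 0x7F
            x >>= 7
            if x:
                out.append(byte | 0x80)  # continuation bit: more bytes follow
            else:
                out.append(byte)
                return bytes(out)

    def uleb128_dec(enc: bytes) -> tuple[int, int]:
        x = shift = 0
        for i, byte in enumerate(enc):
            x |= (byte & 0x7F) << shift
            shift += 7
            if not byte & 0x80:
                return x, i + 1  # decoded value and bytes consumed

    assert uleb128_enc(624485) == bytes([0xE5, 0x8E, 0x26])
    assert uleb128_dec(bytes([0xE5, 0x8E, 0x26])) == (624485, 3)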
@@ -351,7 +360,7 @@ def uleb128_decode(encx: NDArray[ubyte]) -> (int, int):
  "void(int64[:], byte[:], uint32[:])",
  ],
  "(n),(m),()",
- nopython=True,
+ **numba_defaults,
  )
  def uleb128_zigzag_diff_array_encode(
  sig_in: NDArray[int], sig_out: NDArray[ubyte], nbytes: int
@@ -401,7 +410,7 @@ def uleb128_zigzag_diff_array_encode(
  "void(byte[:], uint32[:], int64[:], uint32[:])",
  ],
  "(n),(),(m),()",
- nopython=True,
+ **numba_defaults,
  )
  def uleb128_zigzag_diff_array_decode(
  sig_in: NDArray[ubyte],
lgdo/lh5_store.py CHANGED
@@ -38,7 +38,7 @@ LGDO = Union[Array, Scalar, Struct, VectorOfVectors]

  log = logging.getLogger(__name__)

- DEFAULT_HDF5_COMPRESSION = None
+ DEFAULT_HDF5_SETTINGS: dict[str, ...] = {"shuffle": True, "compression": "gzip"}


  class LH5Store:
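The old default (`None`, i.e. no filtering) becomes an opinionated one: byte-shuffle plus GZIP for every non-scalar dataset. In plain h5py terms the new defaults expand to something like:

    import h5py
    import numpy as np

    with h5py.File("example.lh5", "w") as f:
        # what DEFAULT_HDF5_SETTINGS contributes to each create_dataset call
        f.create_dataset("data", data=np.arange(1000), shuffle=True, compression="gzip")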
@@ -169,6 +169,7 @@ class LH5Store:
  start_row: int = 0,
  n_rows: int = sys.maxsize,
  idx: np.ndarray | list | tuple | list[np.ndarray | list | tuple] = None,
+ use_h5idx: bool = False,
  field_mask: dict[str, bool] | list[str] | tuple[str] = None,
  obj_buf: LGDO = None,
  obj_buf_start: int = 0,
@@ -176,6 +177,14 @@ class LH5Store:
  ) -> tuple[LGDO, int]:
  """Read LH5 object data from a file.

+ Use the ``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag
+ controls whether *only* those rows are read from disk or if the rows are indexed after reading
+ the entire object. Reading individual rows can be orders of magnitude slower than reading
+ the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
+ is to use slightly more memory for a much faster read. See
+ `legend-pydataobj #29 <https://github.com/legend-exp/legend-pydataobj/issues/29>`_
+ for additional information.
+
  Parameters
  ----------
  name
@@ -192,16 +201,27 @@ class LH5Store:
  actual number of rows read will be returned as one of the return
  values (see below).
  idx
- For NumPy-style "fancying indexing" for the read. Used to read out
- rows that pass some selection criteria. Only selection along the first
- axis is supported, so tuple arguments must be one-tuples. If `n_rows`
- is not false, `idx` will be truncated to `n_rows` before reading. To use
- with a list of files, can pass in a list of `idx`'s (one for each
- file) or use a long contiguous list (e.g. built from a previous
+ For NumPy-style "fancy indexing" for the read to select only some
+ rows, e.g. after applying some cuts to particular columns.
+ Only selection along the first axis is supported, so tuple arguments
+ must be one-tuples. If `n_rows` is not false, `idx` will be truncated to
+ `n_rows` before reading. To use with a list of files, can pass in a list of
+ `idx`'s (one for each file) or use a long contiguous list (e.g. built from a previous
  identical read). If used in conjunction with `start_row` and `n_rows`,
  will be sliced to obey those constraints, where `n_rows` is
  interpreted as the (max) number of *selected* values (in `idx`) to be
- read out.
+ read out. Note that the ``use_h5idx`` parameter controls some behavior of the
+ read and that the default behavior (``use_h5idx=False``) prioritizes speed over
+ a small memory penalty.
+ use_h5idx
+ ``True`` will directly pass the ``idx`` parameter to the underlying
+ ``h5py`` call such that only the selected rows are read directly into memory,
+ which conserves memory at the cost of speed. There can be a significant penalty
+ to speed for larger files (1-2 orders of magnitude longer read times).
+ ``False`` (default) will read the entire object into memory before
+ performing the indexing. The default is much faster but requires additional memory,
+ though a relatively small amount in the typical use case. It is recommended to
+ leave this parameter as its default.
  field_mask
  For tables and structs, determines which fields get written out.
  Only applies to immediate fields of the requested objects. If a dict
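A hypothetical read illustrating the two modes just documented (store construction per this file; the dataset name and file name are made up):

    import numpy as np
    from lgdo.lh5_store import LH5Store

    store = LH5Store()
    idx = np.array([0, 2, 3, 7])

    # default: read the whole object, then index in memory (fast)
    obj, n = store.read_object("geds/raw/energy", "run0.lh5", idx=idx)

    # let h5py apply idx on disk (low memory, potentially much slower)
    obj, n = store.read_object("geds/raw/energy", "run0.lh5", idx=idx, use_h5idx=True)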
@@ -223,6 +243,7 @@ class LH5Store:
  after reading. The option has no effect on data encoded with HDF5
  built-in filters, which is always decompressed upstream by HDF5.

+
  Returns
  -------
  (object, n_rows_read)
@@ -236,6 +257,14 @@ class LH5Store:
  if not isinstance(lh5_file, (str, h5py.File)):
  lh5_file = list(lh5_file)
  n_rows_read = 0
+
+ # flag whether we are reading a list of files.
+ # part of the fix for reading data by idx
+ # (see https://github.com/legend-exp/legend-pydataobj/issues/29):
+ # we only make a copy of the data when absolutely necessary,
+ # i.e. when it cannot be read from file without copying
+ self.in_file_loop = True
+
  for i, h5f in enumerate(lh5_file):
  if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]):
  # a list of lists: must be one per file
@@ -255,22 +284,32 @@ class LH5Store:
  else:
  idx_i = None
  n_rows_i = n_rows - n_rows_read
+
+ # maybe someone passed in a list of len==1?
+ if i == (len(lh5_file) - 1):
+ self.in_file_loop = False
+
  obj_buf, n_rows_read_i = self.read_object(
  name,
  lh5_file[i],
  start_row=start_row,
  n_rows=n_rows_i,
  idx=idx_i,
+ use_h5idx=use_h5idx,
  field_mask=field_mask,
  obj_buf=obj_buf,
  obj_buf_start=obj_buf_start,
  decompress=decompress,
  )
+
  n_rows_read += n_rows_read_i
  if n_rows_read >= n_rows or obj_buf is None:
  return obj_buf, n_rows_read
  start_row = 0
  obj_buf_start += n_rows_read_i
+
+ self.in_file_loop = False
+
  return obj_buf, n_rows_read

  # get the file from the store
@@ -358,6 +397,7 @@ class LH5Store:
  start_row=start_row,
  n_rows=n_rows,
  idx=idx,
+ use_h5idx=use_h5idx,
  decompress=decompress,
  )
  # modify datatype in attrs if a field_mask was used
@@ -404,6 +444,7 @@ class LH5Store:
  start_row=start_row,
  n_rows=n_rows,
  idx=idx,
+ use_h5idx=use_h5idx,
  obj_buf=fld_buf,
  obj_buf_start=obj_buf_start,
  decompress=decompress,
@@ -497,6 +538,7 @@ class LH5Store:
  start_row=start_row,
  n_rows=n_rows,
  idx=idx,
+ use_h5idx=use_h5idx,
  obj_buf=None if decompress else decoded_size_buf,
  obj_buf_start=0 if decompress else obj_buf_start,
  )
@@ -508,6 +550,7 @@ class LH5Store:
  start_row=start_row,
  n_rows=n_rows,
  idx=idx,
+ use_h5idx=use_h5idx,
  obj_buf=None if decompress else encoded_data_buf,
  obj_buf_start=0 if decompress else obj_buf_start,
  )
@@ -531,26 +574,31 @@ class LH5Store:
  elif obj_buf is None and decompress:
  return compress.decode(rawdata), n_rows_read

+ # expand the provided obj_buf if it is too short
+ buf_size = obj_buf_start + n_rows_read
+ if len(obj_buf) < buf_size:
+ obj_buf.resize(buf_size)
+
  # use the (decoded object type) buffer otherwise
- if enc_lgdo == VectorOfEncodedVectors and not isinstance(
- obj_buf, VectorOfVectors
- ):
- raise ValueError(
- f"obj_buf for decoded '{name}' not a VectorOfVectors"
- )
- elif enc_lgdo == ArrayOfEncodedEqualSizedArrays and not isinstance(
- obj_buf, ArrayOfEqualSizedArrays
- ):
- raise ValueError(
- f"obj_buf for decoded '{name}' not an ArrayOfEqualSizedArrays"
- )
+ if enc_lgdo == ArrayOfEncodedEqualSizedArrays:
+ if not isinstance(obj_buf, ArrayOfEqualSizedArrays):
+ raise ValueError(
+ f"obj_buf for decoded '{name}' not an ArrayOfEqualSizedArrays"
+ )
+
+ compress.decode(rawdata, obj_buf[obj_buf_start:buf_size])
+
+ elif enc_lgdo == VectorOfEncodedVectors:
+ if not isinstance(obj_buf, VectorOfVectors):
+ raise ValueError(
+ f"obj_buf for decoded '{name}' not a VectorOfVectors"
+ )

- # FIXME: not a good idea. an in place decoding version
- # of decode would be needed to avoid extra memory
- # allocations
- # FIXME: obj_buf_start??? Write a unit test
- for i, wf in enumerate(compress.decode(rawdata)):
- obj_buf[i] = wf
+ # FIXME: not a good idea. an in place decoding version
+ # of decode would be needed to avoid extra memory
+ # allocations
+ for i, wf in enumerate(compress.decode(rawdata)):
+ obj_buf[obj_buf_start + i] = wf

  return obj_buf, n_rows_read

@@ -568,6 +616,7 @@ class LH5Store:
  start_row=start_row,
  n_rows=n_rows,
  idx=idx,
+ use_h5idx=use_h5idx,
  obj_buf=cumulen_buf,
  obj_buf_start=obj_buf_start,
  )
@@ -592,6 +641,7 @@ class LH5Store:
  start_row=start_row,
  n_rows=n_rows,
  idx=idx2,
+ use_h5idx=use_h5idx,
  )
  fd_starts = fd_starts.nda # we just need the nda
  if fd_start is None:
@@ -674,6 +724,7 @@ class LH5Store:
  start_row=fd_start,
  n_rows=fd_n_rows,
  idx=fd_idx,
+ use_h5idx=use_h5idx,
  obj_buf=fd_buf,
  obj_buf_start=fd_buf_start,
  )
@@ -717,9 +768,22 @@ class LH5Store:
  if n_rows_to_read > n_rows:
  n_rows_to_read = n_rows

+ # if idx is passed, check if we can make it a slice instead (faster)
+ change_idx_to_slice = False
+
  # prepare the selection for the read. Use idx if available
  if idx is not None:
- source_sel = idx
+ # check if idx is empty and convert to slice instead
+ if len(idx[0]) == 0:
+ source_sel = np.s_[0:0]
+ change_idx_to_slice = True
+ # check if idx is contiguous and increasing
+ # if so, convert it to a slice instead (faster)
+ elif np.all(np.diff(idx[0]) == 1):
+ source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
+ change_idx_to_slice = True
+ else:
+ source_sel = idx
  else:
  source_sel = np.s_[start_row : start_row + n_rows_to_read]

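The motivation for the new branch: h5py serves slice selections far faster than fancy-index selections, and a contiguous increasing index array is fully described by its end points. The same test in isolation:

    import numpy as np

    def idx_to_selection(idx: np.ndarray):
        if len(idx) == 0:
            return np.s_[0:0]                   # empty selection -> zero-length slice
        if np.all(np.diff(idx) == 1):
            return np.s_[idx[0] : idx[-1] + 1]  # contiguous -> equivalent slice
        return idx                              # genuine fancy indexing

    print(idx_to_selection(np.array([3, 4, 5, 6])))  # slice(3, 7, None)
    print(idx_to_selection(np.array([1, 4, 5])))     # [1 4 5]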
@@ -729,14 +793,34 @@ class LH5Store:
  if len(obj_buf) < buf_size:
  obj_buf.resize(buf_size)
  dest_sel = np.s_[obj_buf_start:buf_size]
- h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
+
+ # this is required to make the read of multiple files faster
+ # until a better solution is found.
+ if change_idx_to_slice or idx is None or use_h5idx:
+ h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
+ else:
+ # it is faster to read the whole object and then do fancy indexing
+ obj_buf.nda[dest_sel] = h5f[name][...][source_sel]
+
  nda = obj_buf.nda
  else:
  if n_rows == 0:
  tmp_shape = (0,) + h5f[name].shape[1:]
  nda = np.empty(tmp_shape, h5f[name].dtype)
  else:
- nda = h5f[name][source_sel]
+ if change_idx_to_slice or idx is None or use_h5idx:
+ nda = h5f[name][source_sel]
+ else:
+ # it is faster to read the whole object and then do fancy indexing
+ nda = h5f[name][...][source_sel]
+
+ # if reading a list of files recursively, this is given to obj_buf on
+ # the first file read. obj_buf needs to be resized and therefore
+ # it needs to hold the data itself (not a view of the data).
+ # a view is returned by the source_sel indexing, which cannot be resized
+ # by ndarray.resize().
+ if hasattr(self, "in_file_loop") and self.in_file_loop:
+ nda = np.copy(nda)

  # special handling for bools
  # (c and Julia store as uint8 so cast to bool)
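The trailing `np.copy` exists because basic slicing returns a view, and `ndarray.resize` refuses to operate on an array that does not own its data; when reading a list of files, the first result becomes `obj_buf` and must be resizable. The failure mode in isolation:

    import numpy as np

    a = np.arange(10)
    v = a[2:5]            # a view into a's buffer
    try:
        v.resize(8)       # in-place resize is impossible for views
    except ValueError as err:
        print(err)        # "cannot resize this array: it does not own its data"

    c = np.copy(v)        # an owning copy can be resized
    c.resize(8, refcheck=False)  # skip the reference-count check for brevity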
@@ -776,7 +860,7 @@ class LH5Store:
  n_rows: int = None,
  wo_mode: str = "append",
  write_start: int = 0,
- hdf5_compression: str | h5py.filters.FilterRefBase = DEFAULT_HDF5_COMPRESSION,
+ **h5py_kwargs,
  ) -> None:
  """Write an LGDO into an LH5 file.

@@ -791,20 +875,30 @@ class LH5Store:
  passed directly to :meth:`h5py.Group.create_dataset`.

  :class:`.WaveformCodec` object
- If `obj` is a :class:`.WaveformTable`, compress its `values` using
- this algorithm. More documentation about the supported waveform
- compression algorithms at :mod:`.lgdo.compression`.
+ If `obj` is a :class:`.WaveformTable` and ``obj.values`` holds this
+ attribute, compress ``values`` using this algorithm. More
+ documentation about the supported waveform compression algorithms at
+ :mod:`.lgdo.compression`.
+
+ If the `obj` :class:`.LGDO` has an `hdf5_settings` attribute holding a
+ dictionary, it is interpreted as a set of keyword arguments to be
+ forwarded directly to :meth:`h5py.Group.create_dataset` (exactly like
+ the first format of `compression` above). This is the preferred way to
+ specify HDF5 dataset options such as chunking. If compression
+ options are specified, they take precedence over those set with the
+ `compression` attribute.

  Note
  ----
- The `compression` attribute takes precedence over the
- `hdf5_compression` argument and is not written to disk.
+ The `compression` LGDO attribute takes precedence over the default HDF5
+ compression settings. The `hdf5_settings` attribute takes precedence
+ over `compression`. These attributes are not written to disk.

  Note
  ----
- HDF5 compression is skipped for the `encoded_data` dataset of
- :class:`.VectorOfEncodedVectors` and
- :class`.ArrayOfEncodedEqualSizedArrays`.
+ HDF5 compression is skipped for the `encoded_data.flattened_data`
+ dataset of :class:`.VectorOfEncodedVectors` and
+ :class:`.ArrayOfEncodedEqualSizedArrays`.

  Parameters
  ----------
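With this scheme an LGDO can carry its own HDF5 dataset options. A hedged sketch (import paths and the `Array` constructor are assumptions based on the names in this diff):

    import numpy as np
    from lgdo.lh5_store import LH5Store
    from lgdo.types import Array  # assumed import location

    arr = Array(nda=np.arange(100_000, dtype="float32"))
    # forwarded verbatim to create_dataset; wins over DEFAULT_HDF5_SETTINGS
    # and over a plain "compression" attribute
    arr.attrs["hdf5_settings"] = {"compression": "lzf", "shuffle": False}

    LH5Store().write_object(arr, "arr", "settings-demo.lh5")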
@@ -840,15 +934,17 @@ class LH5Store:
  write_start
  row in the output file (if already existing) to start overwriting
  from.
- hdf5_compression
- HDF5 compression filter to be applied before writing non-scalar
- datasets. **Ignored if compression is specified as an `obj` attribute.**
+ **h5py_kwargs
+ additional keyword arguments forwarded to
+ :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5
+ compression filter to be applied before writing non-scalar
+ datasets. **Note: the `compression` keyword argument is ignored if
+ compression is specified as an `obj` attribute.**
  """
  log.debug(
  f"writing {repr(obj)}[{start_row}:{n_rows}] as "
  f"{lh5_file}:{group}/{name}[{write_start}:], "
- f"mode = {wo_mode}, hdf5_compression = {hdf5_compression}"
+ f"mode = {wo_mode}, h5py_kwargs = {h5py_kwargs}"
  )

  if wo_mode == "write_safe":
@@ -921,8 +1017,8 @@ class LH5Store:
  for field in obj.keys():
  # compress waveform table values with LGDO's custom
  # codecs before writing, where applicable
- # if waveformtable.values.attrs["compression"] is a string,
- # interpret it as an HDF5 built-in filter
+ # if waveformtable.values.attrs["compression"] is NOT a
+ # WaveformCodec, just leave it there
  obj_fld = None
  if (
  isinstance(obj, WaveformTable)
@@ -948,7 +1044,7 @@ class LH5Store:
  n_rows=n_rows,
  wo_mode=wo_mode,
  write_start=write_start,
- hdf5_compression=hdf5_compression,
+ **h5py_kwargs,
  )
  return

@@ -972,6 +1068,9 @@ class LH5Store:
  name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
  )

+ # ask not to further compress flattened_data, it is already compressed!
+ obj.encoded_data.flattened_data.attrs["compression"] = None
+
  self.write_object(
  obj.encoded_data,
  "encoded_data",
@@ -981,7 +1080,7 @@ class LH5Store:
  n_rows=n_rows,
  wo_mode=wo_mode,
  write_start=write_start,
- hdf5_compression=None, # data is already compressed!
+ **h5py_kwargs,
  )

  self.write_object(
@@ -993,7 +1092,7 @@ class LH5Store:
  n_rows=n_rows,
  wo_mode=wo_mode,
  write_start=write_start,
- hdf5_compression=hdf5_compression,
+ **h5py_kwargs,
  )

  # vector of vectors
@@ -1029,7 +1128,7 @@ class LH5Store:
  n_rows=fd_n_rows,
  wo_mode=wo_mode,
  write_start=offset,
- hdf5_compression=hdf5_compression,
+ **h5py_kwargs,
  )

  # now offset is used to give appropriate in-file values for
@@ -1052,7 +1151,7 @@ class LH5Store:
  n_rows=n_rows,
  wo_mode=wo_mode,
  write_start=write_start,
- hdf5_compression=hdf5_compression,
+ **h5py_kwargs,
  )
  obj.cumulative_length.nda -= cl_dtype(offset)

@@ -1072,29 +1171,39 @@ class LH5Store:
  # need to create dataset from ndarray the first time for speed
  # creating an empty dataset and appending to that is super slow!
  if (wo_mode != "a" and write_start == 0) or name not in group:
+ # this is needed in order to have a resizable (in the first
+ # axis) data set, i.e. rows can be appended later
+ # NOTE: this automatically turns chunking on!
  maxshape = (None,) + nda.shape[1:]
+ h5py_kwargs.setdefault("maxshape", maxshape)
+
  if wo_mode == "o" and name in group:
  log.debug(f"overwriting {name} in {group}")
  del group[name]

+ # set default compression options
+ for k, v in DEFAULT_HDF5_SETTINGS.items():
+ h5py_kwargs.setdefault(k, v)
+
+ # compress using the 'compression' LGDO attribute, if available
+ if "compression" in obj.attrs:
+ comp_algo = obj.attrs["compression"]
+ if isinstance(comp_algo, dict):
+ h5py_kwargs |= obj.attrs["compression"]
+ else:
+ h5py_kwargs["compression"] = obj.attrs["compression"]
+
+ # the 'hdf5_settings' attribute, if present, takes precedence
+ if "hdf5_settings" in obj.attrs:
+ h5py_kwargs |= obj.attrs["hdf5_settings"]
+
  # create HDF5 dataset
- # - compress using the 'compression' LGDO attribute, if
- # available
- # - otherwise use "hdf5_compression"
- # - attach HDF5 dataset attributes, but not "compression"!
- comp_algo = obj.attrs.get("compression", hdf5_compression)
- comp_kwargs = {}
- if isinstance(comp_algo, str):
- comp_kwargs = {"compression": comp_algo}
- elif comp_algo is not None:
- comp_kwargs = comp_algo
-
- ds = group.create_dataset(
- name, data=nda, maxshape=maxshape, **comp_kwargs
- )
+ ds = group.create_dataset(name, data=nda, **h5py_kwargs)

+ # attach HDF5 dataset attributes, but not "compression"!
  _attrs = obj.getattrs(datatype=True)
  _attrs.pop("compression", None)
+ _attrs.pop("hdf5_settings", None)
  ds.attrs.update(_attrs)
  return

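The keyword-resolution logic above reduces to three layers, later ones winning. A standalone restatement of the merge order:

    DEFAULT_HDF5_SETTINGS = {"shuffle": True, "compression": "gzip"}

    def resolve_h5py_kwargs(attrs: dict, **h5py_kwargs) -> dict:
        # 1. explicit keyword arguments, with gaps filled by the defaults
        for k, v in DEFAULT_HDF5_SETTINGS.items():
            h5py_kwargs.setdefault(k, v)
        # 2. the 'compression' attribute (filter name or dict of options)
        if "compression" in attrs:
            c = attrs["compression"]
            h5py_kwargs |= c if isinstance(c, dict) else {"compression": c}
        # 3. 'hdf5_settings' takes precedence over everything else
        if "hdf5_settings" in attrs:
            h5py_kwargs |= attrs["hdf5_settings"]
        return h5py_kwargs

    print(resolve_h5py_kwargs({"compression": "lzf"}))
    # {'shuffle': True, 'compression': 'lzf'}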
lgdo/types/scalar.py CHANGED
@@ -18,7 +18,7 @@ class Scalar(LGDO):

  # TODO: do scalars need proper numpy dtypes?

- def __init__(self, value: int | float, attrs: dict[str, Any] = None) -> None:
+ def __init__(self, value: int | float | str, attrs: dict[str, Any] = None) -> None:
  """
  Parameters
  ----------
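The widened hint makes string scalars (units, labels and the like) explicitly legal:

    from lgdo.types import Scalar  # assumed import path

    n = Scalar(42)
    unit = Scalar("keV")  # now covered by the annotation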
lgdo/types/table.py CHANGED
@@ -225,7 +225,10 @@ class Table(Struct):
  if not hasattr(column, "nda"):
  raise ValueError(f"column {col} does not have an nda")
  else:
- df[prefix + str(col)] = column.nda.tolist()
+ if len(column.nda.shape) == 1:
+ df[prefix + str(col)] = column.nda
+ else:
+ df[prefix + str(col)] = column.nda.tolist()

  return df

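The branch avoids the costly `tolist()` detour when a column is already one-dimensional; only multi-dimensional columns still need a list-of-rows so pandas stores one object per cell. In isolation:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame()
    df["energy"] = np.arange(4)                           # 1-D: assign the ndarray directly
    df["waveform"] = np.arange(8).reshape(4, 2).tolist()  # 2-D: one list per row
    print(df)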
@@ -248,8 +251,8 @@ class Table(Struct):
  "O1": {
  "expression": "p1 + p2 * a**2",
  "parameters": {
- "p1": "2",
- "p2": "3"
+ "p1": 2,
+ "p2": 3
  }
  },
  "O2": {