legend-pydataobj 1.5.1__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in the public registry. It is provided for informational purposes only.
Files changed (46)
  1. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/METADATA +1 -1
  2. legend_pydataobj-1.6.1.dist-info/RECORD +54 -0
  3. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/WHEEL +1 -1
  4. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/entry_points.txt +1 -0
  5. lgdo/__init__.py +7 -4
  6. lgdo/_version.py +2 -2
  7. lgdo/cli.py +237 -12
  8. lgdo/compression/__init__.py +1 -0
  9. lgdo/lh5/__init__.py +9 -1
  10. lgdo/lh5/_serializers/__init__.py +43 -0
  11. lgdo/lh5/_serializers/read/__init__.py +0 -0
  12. lgdo/lh5/_serializers/read/array.py +34 -0
  13. lgdo/lh5/_serializers/read/composite.py +405 -0
  14. lgdo/lh5/_serializers/read/encoded.py +129 -0
  15. lgdo/lh5/_serializers/read/ndarray.py +104 -0
  16. lgdo/lh5/_serializers/read/scalar.py +34 -0
  17. lgdo/lh5/_serializers/read/utils.py +12 -0
  18. lgdo/lh5/_serializers/read/vector_of_vectors.py +201 -0
  19. lgdo/lh5/_serializers/write/__init__.py +0 -0
  20. lgdo/lh5/_serializers/write/array.py +92 -0
  21. lgdo/lh5/_serializers/write/composite.py +259 -0
  22. lgdo/lh5/_serializers/write/scalar.py +23 -0
  23. lgdo/lh5/_serializers/write/vector_of_vectors.py +95 -0
  24. lgdo/lh5/core.py +272 -0
  25. lgdo/lh5/datatype.py +46 -0
  26. lgdo/lh5/exceptions.py +34 -0
  27. lgdo/lh5/iterator.py +1 -1
  28. lgdo/lh5/store.py +69 -1160
  29. lgdo/lh5/tools.py +27 -53
  30. lgdo/lh5/utils.py +130 -27
  31. lgdo/lh5_store.py +11 -2
  32. lgdo/logging.py +1 -0
  33. lgdo/types/__init__.py +1 -0
  34. lgdo/types/array.py +1 -0
  35. lgdo/types/arrayofequalsizedarrays.py +1 -0
  36. lgdo/types/encoded.py +3 -8
  37. lgdo/types/fixedsizearray.py +1 -0
  38. lgdo/types/struct.py +1 -0
  39. lgdo/types/table.py +46 -5
  40. lgdo/types/vectorofvectors.py +314 -458
  41. lgdo/types/vovutils.py +320 -0
  42. lgdo/types/waveformtable.py +1 -0
  43. lgdo/utils.py +1 -32
  44. legend_pydataobj-1.5.1.dist-info/RECORD +0 -36
  45. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/LICENSE +0 -0
  46. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/top_level.txt +0 -0
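The listing above shows most of the serialization logic leaving lgdo/lh5/store.py for the new lgdo/lh5/core.py and lgdo/lh5/_serializers/ modules, while the LH5Store methods in the diff below become thin wrappers. As a quick orientation, here is a minimal sketch of the round trip those wrappers still provide; the file name and dataset name are made up for illustration.

    import numpy as np
    from lgdo.lh5 import LH5Store
    from lgdo.types import Array

    store = LH5Store(keep_open=True)
    # "overwrite_file" ("of") deletes the file before writing, per the write() docs below
    store.write(Array(np.arange(10)), "arr", "demo.lh5", wo_mode="overwrite_file")
    obj, n_rows = store.read("arr", "demo.lh5")  # -> (Array, 10)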
lgdo/lh5/store.py CHANGED
@@ -2,42 +2,23 @@
  This module implements routines from reading and writing LEGEND Data Objects in
  HDF5 files.
  """
+
  from __future__ import annotations

  import logging
  import os
  import sys
- from bisect import bisect_left
- from collections import defaultdict
- from typing import Any, Union
+ from collections.abc import Mapping, Sequence
+ from typing import Any

  import h5py
- import numba as nb
- import numpy as np
-
- from .. import compression as compress
- from ..compression import WaveformCodec
- from ..types import (
- Array,
- ArrayOfEncodedEqualSizedArrays,
- ArrayOfEqualSizedArrays,
- FixedSizeArray,
- Scalar,
- Struct,
- Table,
- VectorOfEncodedVectors,
- VectorOfVectors,
- WaveformTable,
- )
- from .utils import expand_path, parse_datatype
+ from numpy.typing import ArrayLike

- LGDO = Union[Array, Scalar, Struct, VectorOfVectors]
+ from .. import types
+ from . import _serializers, utils

  log = logging.getLogger(__name__)

- DEFAULT_HDF5_SETTINGS: dict[str, ...] = {"shuffle": True, "compression": "gzip"}
- DEFAULT_HDF5_COMPRESSION = None
-

  class LH5Store:
  """
@@ -63,7 +44,7 @@ class LH5Store:
  whether to keep files open by storing the :mod:`h5py` objects as
  class attributes.
  """
- self.base_path = "" if base_path == "" else expand_path(base_path)
+ self.base_path = "" if base_path == "" else utils.expand_path(base_path)
  self.keep_open = keep_open
  self.files = {}

@@ -79,27 +60,36 @@ class LH5Store:
  """
  if isinstance(lh5_file, h5py.File):
  return lh5_file
+
  if mode == "r":
- lh5_file = expand_path(lh5_file, base_path=self.base_path)
+ lh5_file = utils.expand_path(lh5_file, base_path=self.base_path)
+
  if lh5_file in self.files:
  return self.files[lh5_file]
+
  if self.base_path != "":
  full_path = os.path.join(self.base_path, lh5_file)
  else:
  full_path = lh5_file
+
  if mode != "r":
  directory = os.path.dirname(full_path)
  if directory != "" and not os.path.exists(directory):
  log.debug(f"making path {directory}")
  os.makedirs(directory)
+
  if mode == "r" and not os.path.exists(full_path):
  msg = f"file {full_path} not found"
  raise FileNotFoundError(msg)
+
  if mode != "r" and os.path.exists(full_path):
  log.debug(f"opening existing file {full_path} in mode '{mode}'")
+
  h5f = h5py.File(full_path, mode)
+
  if self.keep_open:
  self.files[lh5_file] = h5f
+
  return h5f

  def gimme_group(
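The hunk above keeps gimme_file's behaviour: paths are resolved against base_path, missing directories are created for write modes, a missing file in read mode raises FileNotFoundError, and open h5py.File handles are cached when keep_open=True. A small sketch with illustrative paths:

    from lgdo.lh5 import LH5Store

    store = LH5Store(base_path="/data/legend", keep_open=True)
    f1 = store.gimme_file("run0.lh5", "r")   # expanded to /data/legend/run0.lh5
    f2 = store.gimme_file("run0.lh5", "r")   # same h5py.File object, served from the cache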
@@ -110,51 +100,21 @@ class LH5Store:
  overwrite: bool = False,
  ) -> h5py.Group:
  """
- Returns an existing :class:`h5py` group from a base group or creates a
- new one. Can also set (or replace) group attributes.
+ Returns an existing :class:`h5py` group from a base group or creates a new one.

- Parameters
- ----------
- group
- name of the HDF5 group.
- base_group
- HDF5 group to be used as a base.
- grp_attrs
- HDF5 group attributes.
- overwrite
- whether overwrite group attributes, ignored if `grp_attrs` is
- ``None``.
+ See Also
+ --------
+ .lh5.utils.get_h5_group
  """
- if not isinstance(group, h5py.Group):
- if group in base_group:
- group = base_group[group]
- else:
- group = base_group.create_group(group)
- if grp_attrs is not None:
- group.attrs.update(grp_attrs)
- return group
- if (
- grp_attrs is not None
- and len(set(grp_attrs.items()) ^ set(group.attrs.items())) > 0
- ):
- if not overwrite:
- msg = "grp_attrs != group.attrs but overwrite not set"
- raise RuntimeError(msg)
-
- log.debug(f"overwriting {group}.attrs...")
- for key in group.attrs:
- group.attrs.pop(key)
- group.attrs.update(grp_attrs)
-
- return group
+ return utils.get_h5_group(group, base_group, grp_attrs, overwrite)

  def get_buffer(
  self,
  name: str,
- lh5_file: str | h5py.File | list[str | h5py.File],
+ lh5_file: str | h5py.File | Sequence[str | h5py.File],
  size: int | None = None,
- field_mask: dict[str, bool] | list[str] | tuple[str] | None = None,
- ) -> LGDO:
+ field_mask: Mapping[str, bool] | Sequence[str] | None = None,
+ ) -> types.LGDO:
  """Returns an LH5 object appropriate for use as a pre-allocated buffer
  in a read loop. Sets size to `size` if object has a size.
  """
@@ -166,700 +126,44 @@ class LH5Store:
  def read(
  self,
  name: str,
- lh5_file: str | h5py.File | list[str | h5py.File],
+ lh5_file: str | h5py.File | Sequence[str | h5py.File],
  start_row: int = 0,
  n_rows: int = sys.maxsize,
- idx: np.ndarray | list | tuple | list[np.ndarray | list | tuple] = None,
+ idx: ArrayLike = None,
  use_h5idx: bool = False,
- field_mask: dict[str, bool] | list[str] | tuple[str] | None = None,
- obj_buf: LGDO = None,
+ field_mask: Mapping[str, bool] | Sequence[str] | None = None,
+ obj_buf: types.LGDO = None,
  obj_buf_start: int = 0,
  decompress: bool = True,
- ) -> tuple[LGDO, int]:
179
- """Read LH5 object data from a file.
180
-
181
- Use the ``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag
182
- controls whether *only* those rows are read from disk or if the rows are indexed after reading
183
- the entire object. Reading individual rows can be orders of magnitude slower than reading
184
- the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
185
- is to use slightly more memory for a much faster read. See
186
- `legend-pydataobj #29 <https://github.com/legend-exp/legend-pydataobj/issues/29>`_
187
- for additional information.
188
-
189
- Parameters
190
- ----------
191
- name
192
- Name of the LH5 object to be read (including its group path).
193
- lh5_file
194
- The file(s) containing the object to be read out. If a list of
195
- files, array-like object data will be concatenated into the output
196
- object.
197
- start_row
198
- Starting entry for the object read (for array-like objects). For a
199
- list of files, only applies to the first file.
200
- n_rows
201
- The maximum number of rows to read (for array-like objects). The
202
- actual number of rows read will be returned as one of the return
203
- values (see below).
204
- idx
205
- For NumPy-style "fancying indexing" for the read to select only some
206
- rows, e.g. after applying some cuts to particular columns.
207
- Only selection along the first axis is supported, so tuple arguments
208
- must be one-tuples. If `n_rows` is not false, `idx` will be truncated to
209
- `n_rows` before reading. To use with a list of files, can pass in a list of
210
- `idx`'s (one for each file) or use a long contiguous list (e.g. built from a previous
211
- identical read). If used in conjunction with `start_row` and `n_rows`,
212
- will be sliced to obey those constraints, where `n_rows` is
213
- interpreted as the (max) number of *selected* values (in `idx`) to be
214
- read out. Note that the ``use_h5idx`` parameter controls some behaviour of the
215
- read and that the default behavior (``use_h5idx=False``) prioritizes speed over
216
- a small memory penalty.
217
- use_h5idx
218
- ``True`` will directly pass the ``idx`` parameter to the underlying
219
- ``h5py`` call such that only the selected rows are read directly into memory,
220
- which conserves memory at the cost of speed. There can be a significant penalty
221
- to speed for larger files (1 - 2 orders of magnitude longer time).
222
- ``False`` (default) will read the entire object into memory before
223
- performing the indexing. The default is much faster but requires additional memory,
224
- though a relatively small amount in the typical use case. It is recommended to
225
- leave this parameter as its default.
226
- field_mask
227
- For tables and structs, determines which fields get written out.
228
- Only applies to immediate fields of the requested objects. If a dict
229
- is used, a default dict will be made with the default set to the
230
- opposite of the first element in the dict. This way if one specifies
231
- a few fields at ``False``, all but those fields will be read out,
232
- while if one specifies just a few fields as ``True``, only those
233
- fields will be read out. If a list is provided, the listed fields
234
- will be set to ``True``, while the rest will default to ``False``.
235
- obj_buf
236
- Read directly into memory provided in `obj_buf`. Note: the buffer
237
- will be expanded to accommodate the data requested. To maintain the
238
- buffer length, send in ``n_rows = len(obj_buf)``.
239
- obj_buf_start
240
- Start location in ``obj_buf`` for read. For concatenating data to
241
- array-like objects.
242
- decompress
243
- Decompress data encoded with LGDO's compression routines right
244
- after reading. The option has no effect on data encoded with HDF5
245
- built-in filters, which is always decompressed upstream by HDF5.
246
-
+ ) -> tuple[types.LGDO, int]:
+ """Read LH5 object data from a file in the store.

- Returns
- -------
- (object, n_rows_read)
- `object` is the read-out object `n_rows_read` is the number of rows
- successfully read out. Essential for arrays when the amount of data
- is smaller than the object buffer. For scalars and structs
- `n_rows_read` will be``1``. For tables it is redundant with
- ``table.loc``.
+ See Also
+ --------
+ .lh5.core.read
  """
- # Handle list-of-files recursively
+ # grab files from store
  if not isinstance(lh5_file, (str, h5py.File)):
259
- lh5_file = list(lh5_file)
260
- n_rows_read = 0
261
-
262
- # to know whether we are reading in a list of files.
263
- # this is part of the fix for reading data by idx
264
- # (see https://github.com/legend-exp/legend-pydataobj/issues/29)
265
- # so that we only make a copy of the data if absolutely necessary
266
- # or if we can read the data from file without having to make a copy
267
- self.in_file_loop = True
268
-
269
- for i, h5f in enumerate(lh5_file):
270
- if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]):
271
- # a list of lists: must be one per file
272
- idx_i = idx[i]
273
- elif idx is not None:
274
- # make idx a proper tuple if it's not one already
275
- if not (isinstance(idx, tuple) and len(idx) == 1):
276
- idx = (idx,)
277
- # idx is a long continuous array
278
- n_rows_i = self.read_n_rows(name, h5f)
279
- # find the length of the subset of idx that contains indices
280
- # that are less than n_rows_i
281
- n_rows_to_read_i = bisect_left(idx[0], n_rows_i)
282
- # now split idx into idx_i and the remainder
283
- idx_i = (idx[0][:n_rows_to_read_i],)
284
- idx = (idx[0][n_rows_to_read_i:] - n_rows_i,)
285
- else:
286
- idx_i = None
287
- n_rows_i = n_rows - n_rows_read
288
-
289
- # maybe someone passed in a list of len==1?
290
- if i == (len(lh5_file) - 1):
291
- self.in_file_loop = False
292
-
293
- obj_buf, n_rows_read_i = self.read(
294
- name,
295
- lh5_file[i],
296
- start_row=start_row,
297
- n_rows=n_rows_i,
298
- idx=idx_i,
299
- use_h5idx=use_h5idx,
300
- field_mask=field_mask,
301
- obj_buf=obj_buf,
302
- obj_buf_start=obj_buf_start,
303
- decompress=decompress,
304
- )
305
-
306
- n_rows_read += n_rows_read_i
307
- if n_rows_read >= n_rows or obj_buf is None:
308
- return obj_buf, n_rows_read
309
- start_row = 0
310
- obj_buf_start += n_rows_read_i
311
-
312
- self.in_file_loop = False
313
-
314
- return obj_buf, n_rows_read
315
-
316
- # get the file from the store
317
- h5f = self.gimme_file(lh5_file, "r")
318
- if not h5f or name not in h5f:
319
- msg = f"'{name}' not in {h5f.filename}"
320
- raise KeyError(msg)
321
-
322
- log.debug(
323
- f"reading {h5f.filename}:{name}[{start_row}:{n_rows}], decompress = {decompress}, "
324
- + (f" with field mask {field_mask}" if field_mask else "")
+ lh5_file = [self.gimme_file(f, "r") for f in list(lh5_file)]
+ else:
+ lh5_file = self.gimme_file(lh5_file, "r")
+
+ return _serializers._h5_read_lgdo(
+ name,
+ lh5_file,
+ start_row=start_row,
+ n_rows=n_rows,
+ idx=idx,
+ use_h5idx=use_h5idx,
+ field_mask=field_mask,
+ obj_buf=obj_buf,
+ obj_buf_start=obj_buf_start,
+ decompress=decompress,
  )

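The store-level read() is now only the file handling shown above plus a call into _serializers._h5_read_lgdo(); the selection arguments keep the semantics spelled out in the removed docstring (idx selects rows along the first axis, field_mask selects table/struct fields, use_h5idx trades memory for speed). A hedged usage sketch, with illustrative object and file names:

    import numpy as np
    from lgdo.lh5 import LH5Store

    store = LH5Store()
    cut = np.array([0, 5, 7, 42])  # monotonically increasing row indices
    wfs, n = store.read("geds/raw/waveform", "run0.lh5", idx=cut)
    tbl, n = store.read("geds/raw", ["run0.lh5", "run1.lh5"],
                        field_mask=["timestamp", "energy"])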
327
- # make idx a proper tuple if it's not one already
328
- if not (isinstance(idx, tuple) and len(idx) == 1) and idx is not None:
329
- idx = (idx,)
330
-
331
- # get the object's datatype
332
- if "datatype" not in h5f[name].attrs:
333
- msg = f"'{name}' in file {lh5_file} is missing the datatype attribute"
334
- raise RuntimeError(msg)
335
-
336
- datatype = h5f[name].attrs["datatype"]
337
- datatype, shape, elements = parse_datatype(datatype)
338
-
339
- # check field_mask and make it a default dict
340
- if datatype in ("struct", "table"):
341
- if field_mask is None:
342
- field_mask = defaultdict(lambda: True)
343
- elif isinstance(field_mask, dict):
344
- default = True
345
- if len(field_mask) > 0:
346
- default = not field_mask[next(iter(field_mask.keys()))]
347
- field_mask = defaultdict(lambda: default, field_mask)
348
- elif isinstance(field_mask, (list, tuple)):
349
- field_mask = defaultdict(bool, {field: True for field in field_mask})
350
- elif not isinstance(field_mask, defaultdict):
351
- msg = "bad field_mask of type"
352
- raise RuntimeError(msg, type(field_mask).__name__)
353
- elif field_mask is not None:
354
- msg = f"datatype {datatype} does not accept a field_mask"
355
- raise RuntimeError(msg)
356
-
357
- # Scalar
358
- # scalars are dim-0 datasets
359
- if datatype == "scalar":
360
- value = h5f[name][()]
361
- if elements == "bool":
362
- value = np.bool_(value)
363
- if obj_buf is not None:
364
- obj_buf.value = value
365
- obj_buf.attrs.update(h5f[name].attrs)
366
- return obj_buf, 1
367
-
368
- return Scalar(value=value, attrs=h5f[name].attrs), 1
369
-
370
- # Struct
371
- # recursively build a struct, return as a dictionary
372
- if datatype == "struct":
373
- # ignore obj_buf.
374
- # TODO: could append new fields or overwrite/concat to existing
375
- # fields. If implemented, get_buffer() above should probably also
376
- # (optionally?) prep buffers for each field
377
- if obj_buf is not None:
378
- msg = "obj_buf not implemented for LGOD Structs"
379
- raise NotImplementedError(msg)
380
-
381
- # loop over fields and read
382
- obj_dict = {}
383
- for field in elements:
384
- if not field_mask[field]:
385
- continue
386
- # TODO: it's strange to pass start_row, n_rows, idx to struct
387
- # fields. If they all had shared indexing, they should be in a
388
- # table... Maybe should emit a warning? Or allow them to be
389
- # dicts keyed by field name?
390
- if "int_keys" in h5f[name].attrs:
391
- if dict(h5f[name].attrs)["int_keys"]:
392
- f = int(field)
393
- else:
394
- f = str(field)
395
- obj_dict[f], _ = self.read(
396
- name + "/" + field,
397
- h5f,
398
- start_row=start_row,
399
- n_rows=n_rows,
400
- idx=idx,
401
- use_h5idx=use_h5idx,
402
- decompress=decompress,
403
- )
404
- # modify datatype in attrs if a field_mask was used
405
- attrs = dict(h5f[name].attrs)
406
- if field_mask is not None:
407
- selected_fields = []
408
- for field in elements:
409
- if field_mask[field]:
410
- selected_fields.append(field)
411
- attrs["datatype"] = "struct" + "{" + ",".join(selected_fields) + "}"
412
- return Struct(obj_dict=obj_dict, attrs=attrs), 1
413
-
414
- # Below here is all array-like types. So trim idx if needed
415
- if idx is not None:
416
- # check if idx is just an ordered list of the integers if so can ignore
417
- if (idx[0] == np.arange(0, len(idx[0]), 1)).all():
418
- if n_rows > len(idx[0]):
419
- n_rows = len(idx[0])
420
- idx = None
421
- else:
422
- # chop off indices < start_row
423
- i_first_valid = bisect_left(idx[0], start_row)
424
- idxa = idx[0][i_first_valid:]
425
- # don't readout more than n_rows indices
426
- idx = (idxa[:n_rows],) # works even if n_rows > len(idxa)
427
-
428
- # Table or WaveformTable
429
- if datatype == "table":
430
- col_dict = {}
431
-
432
- # read out each of the fields
433
- rows_read = []
434
- for field in elements:
435
- if not field_mask[field]:
436
- continue
437
-
438
- fld_buf = None
439
- if obj_buf is not None:
440
- if not isinstance(obj_buf, Table) or field not in obj_buf:
441
- msg = f"obj_buf for LGDO Table '{name}' not formatted correctly"
442
- raise ValueError(msg)
443
-
444
- fld_buf = obj_buf[field]
445
-
446
- col_dict[field], n_rows_read = self.read(
447
- name + "/" + field,
448
- h5f,
449
- start_row=start_row,
450
- n_rows=n_rows,
451
- idx=idx,
452
- use_h5idx=use_h5idx,
453
- obj_buf=fld_buf,
454
- obj_buf_start=obj_buf_start,
455
- decompress=decompress,
456
- )
457
- if obj_buf is not None and obj_buf_start + n_rows_read > len(obj_buf):
458
- obj_buf.resize(obj_buf_start + n_rows_read)
459
-
460
- rows_read.append(n_rows_read)
461
-
462
- # warn if all columns don't read in the same number of rows
463
- if len(rows_read) > 0:
464
- n_rows_read = rows_read[0]
465
- else:
466
- n_rows_read = 0
467
- log.warning(f"Table '{name}' has no subgroups accepted by field mask")
468
-
469
- for n in rows_read[1:]:
470
- if n != n_rows_read:
471
- log.warning(
472
- f"Table '{name}' got strange n_rows_read = {n}, {n_rows_read} was expected ({rows_read})"
473
- )
474
-
475
- # modify datatype in attrs if a field_mask was used
476
- attrs = dict(h5f[name].attrs)
477
- if field_mask is not None:
478
- selected_fields = []
479
- for field in elements:
480
- if field_mask[field]:
481
- selected_fields.append(field)
482
- attrs["datatype"] = "table" + "{" + ",".join(selected_fields) + "}"
483
-
484
- # fields have been read out, now return a table
485
- if obj_buf is None:
486
- # if col_dict contains just 3 objects called t0, dt, and values,
487
- # return a WaveformTable
488
- if (
489
- len(col_dict) == 3
490
- and "t0" in col_dict
491
- and "dt" in col_dict
492
- and "values" in col_dict
493
- ):
494
- table = WaveformTable(
495
- t0=col_dict["t0"], dt=col_dict["dt"], values=col_dict["values"]
496
- )
497
- else:
498
- table = Table(col_dict=col_dict, attrs=attrs)
499
-
500
- # set (write) loc to end of tree
501
- table.loc = n_rows_read
502
- return table, n_rows_read
503
-
504
- # We have read all fields into the object buffer. Run
505
- # checks: All columns should be the same size. So update
506
- # table's size as necessary, warn if any mismatches are found
507
- obj_buf.resize(do_warn=True)
508
- # set (write) loc to end of tree
509
- obj_buf.loc = obj_buf_start + n_rows_read
510
- # check attributes
511
- if set(obj_buf.attrs.keys()) != set(attrs.keys()):
512
- msg = (
513
- f"attrs mismatch. obj_buf.attrs: "
514
- f"{obj_buf.attrs}, h5f[{name}].attrs: {attrs}"
515
- )
516
- raise RuntimeError(msg)
517
- return obj_buf, n_rows_read
518
-
519
- # ArrayOfEncodedEqualSizedArrays and VectorOfEncodedVectors
520
- for cond, enc_lgdo in [
521
- (
522
- datatype == "array_of_encoded_equalsized_arrays",
523
- ArrayOfEncodedEqualSizedArrays,
524
- ),
525
- (elements.startswith("encoded_array"), VectorOfEncodedVectors),
526
- ]:
527
- if cond:
528
- if (
529
- not decompress
530
- and obj_buf is not None
531
- and not isinstance(obj_buf, enc_lgdo)
532
- ):
533
- msg = f"obj_buf for '{name}' not a {enc_lgdo}"
534
- raise ValueError(msg)
535
-
536
- # read out decoded_size, either a Scalar or an Array
537
- decoded_size_buf = encoded_data_buf = None
538
- if obj_buf is not None and not decompress:
539
- decoded_size_buf = obj_buf.decoded_size
540
- encoded_data_buf = obj_buf.encoded_data
541
-
542
- decoded_size, _ = self.read(
543
- f"{name}/decoded_size",
544
- h5f,
545
- start_row=start_row,
546
- n_rows=n_rows,
547
- idx=idx,
548
- use_h5idx=use_h5idx,
549
- obj_buf=None if decompress else decoded_size_buf,
550
- obj_buf_start=0 if decompress else obj_buf_start,
551
- )
552
-
553
- # read out encoded_data, a VectorOfVectors
554
- encoded_data, n_rows_read = self.read(
555
- f"{name}/encoded_data",
556
- h5f,
557
- start_row=start_row,
558
- n_rows=n_rows,
559
- idx=idx,
560
- use_h5idx=use_h5idx,
561
- obj_buf=None if decompress else encoded_data_buf,
562
- obj_buf_start=0 if decompress else obj_buf_start,
563
- )
564
-
565
- # return the still encoded data in the buffer object, if there
566
- if obj_buf is not None and not decompress:
567
- return obj_buf, n_rows_read
568
-
569
- # otherwise re-create the encoded LGDO
570
- rawdata = enc_lgdo(
571
- encoded_data=encoded_data,
572
- decoded_size=decoded_size,
573
- attrs=h5f[name].attrs,
574
- )
575
-
576
- # already return if no decompression is requested
577
- if not decompress:
578
- return rawdata, n_rows_read
579
-
580
- # if no buffer, decode and return
581
- if obj_buf is None and decompress:
582
- return compress.decode(rawdata), n_rows_read
583
-
584
- # eventually expand provided obj_buf, if too short
585
- buf_size = obj_buf_start + n_rows_read
586
- if len(obj_buf) < buf_size:
587
- obj_buf.resize(buf_size)
588
-
589
- # use the (decoded object type) buffer otherwise
590
- if enc_lgdo == ArrayOfEncodedEqualSizedArrays:
591
- if not isinstance(obj_buf, ArrayOfEqualSizedArrays):
592
- msg = f"obj_buf for decoded '{name}' not an ArrayOfEqualSizedArrays"
593
- raise ValueError(msg)
594
-
595
- compress.decode(rawdata, obj_buf[obj_buf_start:buf_size])
596
-
597
- elif enc_lgdo == VectorOfEncodedVectors:
598
- if not isinstance(obj_buf, VectorOfVectors):
599
- msg = f"obj_buf for decoded '{name}' not a VectorOfVectors"
600
- raise ValueError(msg)
601
-
602
- # FIXME: not a good idea. an in place decoding version
603
- # of decode would be needed to avoid extra memory
604
- # allocations
605
- for i, wf in enumerate(compress.decode(rawdata)):
606
- obj_buf[obj_buf_start + i] = wf
607
-
608
- return obj_buf, n_rows_read
609
-
610
- # VectorOfVectors
611
- # read out vector of vectors of different size
612
- if elements.startswith("array"):
613
- if obj_buf is not None and not isinstance(obj_buf, VectorOfVectors):
614
- msg = f"obj_buf for '{name}' not a LGDO VectorOfVectors"
615
- raise ValueError(msg)
616
-
617
- # read out cumulative_length
618
- cumulen_buf = None if obj_buf is None else obj_buf.cumulative_length
619
- cumulative_length, n_rows_read = self.read(
620
- f"{name}/cumulative_length",
621
- h5f,
622
- start_row=start_row,
623
- n_rows=n_rows,
624
- idx=idx,
625
- use_h5idx=use_h5idx,
626
- obj_buf=cumulen_buf,
627
- obj_buf_start=obj_buf_start,
628
- )
629
- # get a view of just what was read out for cleaner code below
630
- this_cumulen_nda = cumulative_length.nda[
631
- obj_buf_start : obj_buf_start + n_rows_read
632
- ]
633
-
634
- if idx is not None and n_rows_read > 0:
635
- # get the starting indices for each array in flattended data:
636
- # the starting index for array[i] is cumulative_length[i-1]
637
- idx2 = (np.asarray(idx[0]).copy() - 1,)
638
- # re-read cumulative_length with these indices
639
- # note this will allocate memory for fd_starts!
640
- fd_start = None
641
- if idx2[0][0] == -1:
642
- idx2 = (idx2[0][1:],)
643
- fd_start = 0 # this variable avoids an ndarray append
644
- fd_starts, fds_n_rows_read = self.read(
645
- f"{name}/cumulative_length",
646
- h5f,
647
- start_row=start_row,
648
- n_rows=n_rows,
649
- idx=idx2,
650
- use_h5idx=use_h5idx,
651
- )
652
- fd_starts = fd_starts.nda # we just need the nda
653
- if fd_start is None:
654
- fd_start = fd_starts[0]
655
-
656
- # compute the length that flattened_data will have after the
657
- # fancy-indexed read
658
- fd_n_rows = np.sum(this_cumulen_nda[-len(fd_starts) :] - fd_starts)
659
- if fd_start == 0:
660
- fd_n_rows += this_cumulen_nda[0]
661
-
662
- # now make fd_idx
663
- fd_idx = np.empty(fd_n_rows, dtype="uint32")
664
- fd_idx = _make_fd_idx(fd_starts, this_cumulen_nda, fd_idx)
665
-
666
- # Now clean up this_cumulen_nda, to be ready
667
- # to match the in-memory version of flattened_data. Note: these
668
- # operations on the view change the original array because they are
669
- # numpy arrays, not lists.
670
- this_cumulen_nda[-len(fd_starts) :] -= fd_starts
671
- np.cumsum(this_cumulen_nda, out=this_cumulen_nda)
672
-
673
- else:
674
- fd_idx = None
675
-
676
- # determine the start_row and n_rows for the flattened_data readout
677
- fd_start = 0
678
- if start_row > 0 and n_rows_read > 0:
679
- # need to read out the cumulen sample -before- the first sample
680
- # read above in order to get the starting row of the first
681
- # vector to read out in flattened_data
682
- fd_start = h5f[f"{name}/cumulative_length"][start_row - 1]
683
-
684
- # check limits for values that will be used subsequently
685
- if this_cumulen_nda[-1] < fd_start:
686
- log.debug(
687
- f"this_cumulen_nda[-1] = {this_cumulen_nda[-1]}, "
688
- f"fd_start = {fd_start}, "
689
- f"start_row = {start_row}, "
690
- f"n_rows_read = {n_rows_read}"
691
- )
692
- msg = (
693
- f"cumulative_length non-increasing between entries "
694
- f"{start_row} and {start_row+n_rows_read} ??"
695
- )
696
- raise RuntimeError(msg)
697
-
698
- # determine the number of rows for the flattened_data readout
699
- fd_n_rows = this_cumulen_nda[-1] if n_rows_read > 0 else 0
700
-
701
- # Now done with this_cumulen_nda, so we can clean it up to be ready
702
- # to match the in-memory version of flattened_data. Note: these
703
- # operations on the view change the original array because they are
704
- # numpy arrays, not lists.
705
- #
706
- # First we need to subtract off the in-file offset for the start of
707
- # read for flattened_data
708
- this_cumulen_nda -= fd_start
709
-
710
- # If we started with a partially-filled buffer, add the
711
- # appropriate offset for the start of the in-memory flattened
712
- # data for this read.
713
- fd_buf_start = np.uint32(0)
714
- if obj_buf_start > 0:
715
- fd_buf_start = cumulative_length.nda[obj_buf_start - 1]
716
- this_cumulen_nda += fd_buf_start
717
-
718
- # Now prepare the object buffer if necessary
719
- fd_buf = None
720
- if obj_buf is not None:
721
- fd_buf = obj_buf.flattened_data
722
- # grow fd_buf if necessary to hold the data
723
- fdb_size = fd_buf_start + fd_n_rows
724
- if len(fd_buf) < fdb_size:
725
- fd_buf.resize(fdb_size)
726
-
727
- # now read
728
- flattened_data, dummy_rows_read = self.read(
729
- f"{name}/flattened_data",
730
- h5f,
731
- start_row=fd_start,
732
- n_rows=fd_n_rows,
733
- idx=fd_idx,
734
- use_h5idx=use_h5idx,
735
- obj_buf=fd_buf,
736
- obj_buf_start=fd_buf_start,
737
- )
738
- if obj_buf is not None:
739
- return obj_buf, n_rows_read
740
- return (
741
- VectorOfVectors(
742
- flattened_data=flattened_data,
743
- cumulative_length=cumulative_length,
744
- attrs=h5f[name].attrs,
745
- ),
746
- n_rows_read,
747
- )
748
-
749
- # Array
750
- # FixedSizeArray
751
- # ArrayOfEqualSizedArrays
752
- # read out all arrays by slicing
753
- if "array" in datatype:
754
- if obj_buf is not None and not isinstance(obj_buf, Array):
755
- msg = f"obj_buf for '{name}' not an LGDO Array"
756
- raise ValueError(msg)
757
- obj_buf = None
758
-
759
- # compute the number of rows to read
760
- # we culled idx above for start_row and n_rows, now we have to apply
761
- # the constraint of the length of the dataset
762
- ds_n_rows = h5f[name].shape[0]
763
- if idx is not None:
764
- if len(idx[0]) > 0 and idx[0][-1] >= ds_n_rows:
765
- log.warning(
766
- "idx indexed past the end of the array in the file. Culling..."
767
- )
768
- n_rows_to_read = bisect_left(idx[0], ds_n_rows)
769
- idx = (idx[0][:n_rows_to_read],)
770
- if len(idx[0]) == 0:
771
- log.warning("idx empty after culling.")
772
- n_rows_to_read = len(idx[0])
773
- else:
774
- n_rows_to_read = ds_n_rows - start_row
775
- if n_rows_to_read > n_rows:
776
- n_rows_to_read = n_rows
777
-
778
- # if idx is passed, check if we can make it a slice instead (faster)
779
- change_idx_to_slice = False
780
-
781
- # prepare the selection for the read. Use idx if available
782
- if idx is not None:
783
- # check if idx is empty and convert to slice instead
784
- if len(idx[0]) == 0:
785
- source_sel = np.s_[0:0]
786
- change_idx_to_slice = True
787
- # check if idx is contiguous and increasing
788
- # if so, convert it to a slice instead (faster)
789
- elif np.all(np.diff(idx[0]) == 1):
790
- source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
791
- change_idx_to_slice = True
792
- else:
793
- source_sel = idx
794
- else:
795
- source_sel = np.s_[start_row : start_row + n_rows_to_read]
796
-
797
- # Now read the array
798
- if obj_buf is not None and n_rows_to_read > 0:
799
- buf_size = obj_buf_start + n_rows_to_read
800
- if len(obj_buf) < buf_size:
801
- obj_buf.resize(buf_size)
802
- dest_sel = np.s_[obj_buf_start:buf_size]
803
-
804
- # this is required to make the read of multiple files faster
805
- # until a better solution found.
806
- if change_idx_to_slice or idx is None or use_h5idx:
807
- h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
808
- else:
809
- # it is faster to read the whole object and then do fancy indexing
810
- obj_buf.nda[dest_sel] = h5f[name][...][source_sel]
811
-
812
- nda = obj_buf.nda
813
- elif n_rows == 0:
814
- tmp_shape = (0,) + h5f[name].shape[1:]
815
- nda = np.empty(tmp_shape, h5f[name].dtype)
816
- elif change_idx_to_slice or idx is None or use_h5idx:
817
- nda = h5f[name][source_sel]
818
- else:
819
- # it is faster to read the whole object and then do fancy indexing
820
- nda = h5f[name][...][source_sel]
821
-
822
- # if reading a list of files recursively, this is given to obj_buf on
823
- # the first file read. obj_buf needs to be resized and therefore
824
- # it needs to hold the data itself (not a view of the data).
825
- # a view is returned by the source_sel indexing, which cannot be resized
826
- # by ndarray.resize().
827
- if hasattr(self, "in_file_loop") and self.in_file_loop:
828
- nda = np.copy(nda)
829
-
830
- # special handling for bools
831
- # (c and Julia store as uint8 so cast to bool)
832
- if elements == "bool":
833
- nda = nda.astype(np.bool_)
834
-
835
- # Finally, set attributes and return objects
836
- attrs = h5f[name].attrs
837
- if obj_buf is None:
838
- if datatype == "array":
839
- return Array(nda=nda, attrs=attrs), n_rows_to_read
840
- if datatype == "fixedsize_array":
841
- return FixedSizeArray(nda=nda, attrs=attrs), n_rows_to_read
842
- if datatype == "array_of_equalsized_arrays":
843
- return (
844
- ArrayOfEqualSizedArrays(nda=nda, dims=shape, attrs=attrs),
845
- n_rows_to_read,
846
- )
847
- else:
848
- if set(obj_buf.attrs.keys()) != set(attrs.keys()):
849
- msg = (
850
- f"attrs mismatch. "
851
- f"obj_buf.attrs: {obj_buf.attrs}, "
852
- f"h5f[{name}].attrs: {attrs}"
853
- )
854
- raise RuntimeError(msg)
855
- return obj_buf, n_rows_to_read
856
-
857
- msg = "don't know how to read datatype {datatype}"
858
- raise RuntimeError(msg)
859
-
860
164
  def write(
861
165
  self,
862
- obj: LGDO,
166
+ obj: types.LGDO,
863
167
  name: str,
864
168
  lh5_file: str | h5py.File,
865
169
  group: str | h5py.Group = "/",
@@ -871,89 +175,10 @@ class LH5Store:
  ) -> None:
  """Write an LGDO into an LH5 file.

874
- If the `obj` :class:`.LGDO` has a `compression` attribute, its value is
875
- interpreted as the algorithm to be used to compress `obj` before
876
- writing to disk. The type of `compression` can be:
877
-
878
- string, kwargs dictionary, hdf5plugin filter
879
- interpreted as the name of a built-in or custom `HDF5 compression
880
- filter <https://docs.h5py.org/en/stable/high/dataset.html#filter-pipeline>`_
881
- (``"gzip"``, ``"lzf"``, :mod:`hdf5plugin` filter object etc.) and
882
- passed directly to :meth:`h5py.Group.create_dataset`.
883
-
884
- :class:`.WaveformCodec` object
885
- If `obj` is a :class:`.WaveformTable` and ``obj.values`` holds the
886
- attribute, compress ``values`` using this algorithm. More
887
- documentation about the supported waveform compression algorithms at
888
- :mod:`.lgdo.compression`.
889
-
890
- If the `obj` :class:`.LGDO` has a `hdf5_settings` attribute holding a
891
- dictionary, it is interpreted as a list of keyword arguments to be
892
- forwarded directly to :meth:`h5py.Group.create_dataset` (exactly like
893
- the first format of `compression` above). This is the preferred way to
894
- specify HDF5 dataset options such as chunking etc. If compression
895
- options are specified, they take precedence over those set with the
896
- `compression` attribute.
897
-
898
- Note
899
- ----------
900
- The `compression` LGDO attribute takes precedence over the default HDF5
901
- compression settings. The `hdf5_settings` attribute takes precedence
902
- over `compression`. These attributes are not written to disk.
903
-
904
- Note
905
- ----------
906
- HDF5 compression is skipped for the `encoded_data.flattened_data`
907
- dataset of :class:`.VectorOfEncodedVectors` and
908
- :class:`.ArrayOfEncodedEqualSizedArrays`.
909
-
910
- Parameters
911
- ----------
912
- obj
913
- LH5 object. if object is array-like, writes `n_rows` starting from
914
- `start_row` in `obj`.
915
- name
916
- name of the object in the output HDF5 file.
917
- lh5_file
918
- HDF5 file name or :class:`h5py.File` object.
919
- group
920
- HDF5 group name or :class:`h5py.Group` object in which `obj` should
921
- be written.
922
- start_row
923
- first row in `obj` to be written.
924
- n_rows
925
- number of rows in `obj` to be written.
926
- wo_mode
927
- - ``write_safe`` or ``w``: only proceed with writing if the
928
- object does not already exist in the file.
929
- - ``append`` or ``a``: append along axis 0 (the first dimension)
930
- of array-like objects and array-like subfields of structs.
931
- :class:`~.lgdo.scalar.Scalar` objects get overwritten.
932
- - ``overwrite`` or ``o``: replace data in the file if present,
933
- starting from `write_start`. Note: overwriting with `write_start` =
934
- end of array is the same as ``append``.
935
- - ``overwrite_file`` or ``of``: delete file if present prior to
936
- writing to it. `write_start` should be 0 (its ignored).
937
- - ``append_column`` or ``ac``: append columns from an :class:`~.lgdo.table.Table`
938
- `obj` only if there is an existing :class:`~.lgdo.table.Table` in the `lh5_file` with
939
- the same `name` and :class:`~.lgdo.table.Table.size`. If the sizes don't match,
940
- or if there are matching fields, it errors out.
941
- write_start
942
- row in the output file (if already existing) to start overwriting
943
- from.
944
- **h5py_kwargs
945
- additional keyword arguments forwarded to
946
- :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5
947
- compression filter to be applied before writing non-scalar
948
- datasets. **Note: `compression` Ignored if compression is specified
949
- as an `obj` attribute.**
178
+ See Also
179
+ --------
180
+ .lh5.core.write
950
181
  """
951
- log.debug(
952
- f"writing {obj!r}[{start_row}:{n_rows}] as "
953
- f"{lh5_file}:{group}/{name}[{write_start}:], "
954
- f"mode = {wo_mode}, h5py_kwargs = {h5py_kwargs}"
955
- )
956
-
957
182
  if wo_mode == "write_safe":
958
183
  wo_mode = "w"
959
184
  if wo_mode == "append":
@@ -974,338 +199,22 @@ class LH5Store:
  # change any object in the file. So we use file:append for
  # write_object:overwrite.
  mode = "w" if wo_mode == "of" else "a"
977
- lh5_file = self.gimme_file(lh5_file, mode=mode)
978
- group = self.gimme_group(group, lh5_file)
979
- if wo_mode == "w" and name in group:
980
- msg = f"can't overwrite '{name}' in wo_mode 'write_safe'"
981
- raise RuntimeError(msg)
982
-
983
- # struct or table or waveform table
984
- if isinstance(obj, Struct):
985
- # In order to append a column, we need to update the `table{old_fields}` value in `group.attrs['datatype"]` to include the new fields.
986
- # One way to do this is to override `obj.attrs["datatype"]` to include old and new fields. Then we can write the fields to the table as normal.
987
- if wo_mode == "ac":
988
- old_group = self.gimme_group(name, group)
989
- datatype, shape, fields = parse_datatype(old_group.attrs["datatype"])
990
- if datatype not in ["table", "struct"]:
991
- msg = f"Trying to append columns to an object of type {datatype}"
992
- raise RuntimeError(msg)
993
-
994
- # If the mode is `append_column`, make sure we aren't appending a table that has a column of the same name as in the existing table
995
- # Also make sure that the field we are adding has the same size
996
- if len(list(set(fields).intersection(set(obj.keys())))) != 0:
997
- msg = f"Can't append {list(set(fields).intersection(set(obj.keys())))} column(s) to a table with the same field(s)"
998
- raise ValueError(msg)
999
- # It doesn't matter what key we access, as all fields in the old table have the same size
1000
- if old_group[next(iter(old_group.keys()))].size != obj.size:
1001
- msg = f"Table sizes don't match. Trying to append column of size {obj.size} to a table of size {old_group[next(iter(old_group.keys()))].size}."
1002
- raise ValueError(msg)
1003
-
1004
- # Now we can append the obj.keys() to the old fields, and then update obj.attrs.
1005
- fields.extend(list(obj.keys()))
1006
- obj.attrs.pop("datatype")
1007
- obj.attrs["datatype"] = "table" + "{" + ",".join(fields) + "}"
1008
-
1009
- group = self.gimme_group(
1010
- name,
1011
- group,
1012
- grp_attrs=obj.attrs,
1013
- overwrite=(wo_mode in ["o", "ac"]),
1014
- )
1015
- # If the mode is overwrite, then we need to peek into the file's table's existing fields
1016
- # If we are writing a new table to the group that does not contain an old field, we should delete that old field from the file
1017
- if wo_mode == "o":
1018
- # Find the old keys in the group that are not present in the new table's keys, then delete them
1019
- for key in list(set(group.keys()) - set(obj.keys())):
1020
- log.debug(f"{key} is not present in new table, deleting field")
1021
- del group[key]
1022
-
1023
- for field in obj:
1024
- # eventually compress waveform table values with LGDO's
1025
- # custom codecs before writing
1026
- # if waveformtable.values.attrs["compression"] is NOT a
1027
- # WaveformCodec, just leave it there
1028
- obj_fld = None
1029
- if (
1030
- isinstance(obj, WaveformTable)
1031
- and field == "values"
1032
- and not isinstance(obj.values, VectorOfEncodedVectors)
1033
- and not isinstance(obj.values, ArrayOfEncodedEqualSizedArrays)
1034
- and "compression" in obj.values.attrs
1035
- and isinstance(obj.values.attrs["compression"], WaveformCodec)
1036
- ):
1037
- codec = obj.values.attrs["compression"]
1038
- obj_fld = compress.encode(obj.values, codec=codec)
1039
- else:
1040
- obj_fld = obj[field]
1041
-
1042
- # Convert keys to string for dataset names
1043
- f = str(field)
1044
- self.write(
1045
- obj_fld,
1046
- f,
1047
- lh5_file,
1048
- group=group,
1049
- start_row=start_row,
1050
- n_rows=n_rows,
1051
- wo_mode=wo_mode,
1052
- write_start=write_start,
1053
- **h5py_kwargs,
1054
- )
1055
- return
1056
-
1057
- # scalars
1058
- if isinstance(obj, Scalar):
1059
- if name in group:
1060
- if wo_mode in ["o", "a"]:
1061
- log.debug(f"overwriting {name} in {group}")
1062
- del group[name]
1063
- else:
1064
- msg = f"tried to overwrite {name} in {group} for wo_mode {wo_mode}"
1065
- raise RuntimeError(msg)
1066
- ds = group.create_dataset(name, shape=(), data=obj.value)
1067
- ds.attrs.update(obj.attrs)
1068
-
1069
- return
1070
202
 
1071
- # vector of encoded vectors
1072
- if isinstance(obj, (VectorOfEncodedVectors, ArrayOfEncodedEqualSizedArrays)):
1073
- group = self.gimme_group(
1074
- name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
1075
- )
1076
-
1077
- # ask not to further compress flattened_data, it is already compressed!
1078
- obj.encoded_data.flattened_data.attrs["compression"] = None
1079
-
1080
- self.write(
1081
- obj.encoded_data,
1082
- "encoded_data",
1083
- lh5_file,
1084
- group=group,
1085
- start_row=start_row,
1086
- n_rows=n_rows,
1087
- wo_mode=wo_mode,
1088
- write_start=write_start,
1089
- **h5py_kwargs,
1090
- )
1091
-
1092
- self.write(
1093
- obj.decoded_size,
1094
- "decoded_size",
1095
- lh5_file,
1096
- group=group,
1097
- start_row=start_row,
1098
- n_rows=n_rows,
1099
- wo_mode=wo_mode,
1100
- write_start=write_start,
1101
- **h5py_kwargs,
1102
- )
1103
-
1104
- # vector of vectors
1105
- elif isinstance(obj, VectorOfVectors):
1106
- group = self.gimme_group(
1107
- name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
1108
- )
1109
- if (
1110
- n_rows is None
1111
- or n_rows > obj.cumulative_length.nda.shape[0] - start_row
1112
- ):
1113
- n_rows = obj.cumulative_length.nda.shape[0] - start_row
1114
-
1115
- # if appending we need to add an appropriate offset to the
1116
- # cumulative lengths as appropriate for the in-file object
1117
- offset = 0 # declare here because we have to subtract it off at the end
1118
- if (wo_mode in ("a", "o")) and "cumulative_length" in group:
1119
- len_cl = len(group["cumulative_length"])
1120
- if wo_mode == "a":
1121
- write_start = len_cl
1122
- if len_cl > 0:
1123
- offset = group["cumulative_length"][write_start - 1]
1124
-
1125
- # First write flattened_data array. Only write rows with data.
1126
- fd_start = 0 if start_row == 0 else obj.cumulative_length.nda[start_row - 1]
1127
- fd_n_rows = obj.cumulative_length.nda[start_row + n_rows - 1] - fd_start
1128
- self.write(
1129
- obj.flattened_data,
1130
- "flattened_data",
1131
- lh5_file,
1132
- group=group,
1133
- start_row=fd_start,
1134
- n_rows=fd_n_rows,
1135
- wo_mode=wo_mode,
1136
- write_start=offset,
1137
- **h5py_kwargs,
1138
- )
1139
-
1140
- # now offset is used to give appropriate in-file values for
1141
- # cumulative_length. Need to adjust it for start_row
1142
- if start_row > 0:
1143
- offset -= obj.cumulative_length.nda[start_row - 1]
1144
-
1145
- # Add offset to obj.cumulative_length itself to avoid memory allocation.
1146
- # Then subtract it off after writing! (otherwise it will be changed
1147
- # upon return)
1148
- cl_dtype = obj.cumulative_length.nda.dtype.type
1149
- obj.cumulative_length.nda += cl_dtype(offset)
1150
-
1151
- self.write(
1152
- obj.cumulative_length,
1153
- "cumulative_length",
1154
- lh5_file,
1155
- group=group,
1156
- start_row=start_row,
1157
- n_rows=n_rows,
1158
- wo_mode=wo_mode,
1159
- write_start=write_start,
1160
- **h5py_kwargs,
1161
- )
1162
- obj.cumulative_length.nda -= cl_dtype(offset)
1163
-
1164
- return
1165
-
1166
- # if we get this far, must be one of the Array types
1167
- elif isinstance(obj, Array):
1168
- if n_rows is None or n_rows > obj.nda.shape[0] - start_row:
1169
- n_rows = obj.nda.shape[0] - start_row
1170
-
1171
- nda = obj.nda[start_row : start_row + n_rows]
1172
-
1173
- # hack to store bools as uint8 for c / Julia compliance
1174
- if nda.dtype.name == "bool":
1175
- nda = nda.astype(np.uint8)
1176
-
1177
- # need to create dataset from ndarray the first time for speed
1178
- # creating an empty dataset and appending to that is super slow!
1179
- if (wo_mode != "a" and write_start == 0) or name not in group:
1180
- # this is needed in order to have a resizable (in the first
1181
- # axis) data set, i.e. rows can be appended later
1182
- # NOTE: this automatically turns chunking on!
1183
- maxshape = (None,) + nda.shape[1:]
1184
- h5py_kwargs.setdefault("maxshape", maxshape)
1185
-
1186
- if wo_mode == "o" and name in group:
1187
- log.debug(f"overwriting {name} in {group}")
1188
- del group[name]
1189
-
1190
- # set default compression options
1191
- for k, v in DEFAULT_HDF5_SETTINGS.items():
1192
- h5py_kwargs.setdefault(k, v)
1193
-
1194
- # compress using the 'compression' LGDO attribute, if available
1195
- if "compression" in obj.attrs:
1196
- comp_algo = obj.attrs["compression"]
1197
- if isinstance(comp_algo, dict):
1198
- h5py_kwargs |= obj.attrs["compression"]
1199
- else:
1200
- h5py_kwargs["compression"] = obj.attrs["compression"]
1201
-
1202
- # and even the 'hdf5_settings' one, preferred
1203
- if "hdf5_settings" in obj.attrs:
1204
- h5py_kwargs |= obj.attrs["hdf5_settings"]
1205
-
1206
- # create HDF5 dataset
1207
- ds = group.create_dataset(name, data=nda, **h5py_kwargs)
1208
-
1209
- # attach HDF5 dataset attributes, but not "compression"!
1210
- _attrs = obj.getattrs(datatype=True)
1211
- _attrs.pop("compression", None)
1212
- _attrs.pop("hdf5_settings", None)
1213
- ds.attrs.update(_attrs)
1214
- return
1215
-
1216
- # Now append or overwrite
1217
- ds = group[name]
1218
- if not isinstance(ds, h5py.Dataset):
1219
- msg = (
1220
- f"existing HDF5 object '{name}' in group '{group}'"
1221
- " is not a dataset! Cannot overwrite or append"
1222
- )
1223
- raise RuntimeError(msg)
1224
-
1225
- old_len = ds.shape[0]
1226
- if wo_mode == "a":
1227
- write_start = old_len
1228
- add_len = write_start + nda.shape[0] - old_len
1229
- ds.resize(old_len + add_len, axis=0)
1230
- ds[write_start:] = nda
1231
- return
1232
-
1233
- else:
1234
- msg = f"do not know how to write '{name}' of type '{type(obj).__name__}'"
1235
- raise RuntimeError(msg)
203
+ return _serializers._h5_write_lgdo(
+ obj,
+ name,
+ self.gimme_file(lh5_file, mode=mode),
+ group=group,
+ start_row=start_row,
+ n_rows=n_rows,
+ wo_mode=wo_mode,
+ write_start=write_start,
+ **h5py_kwargs,
+ )
1236
214
 
1237
215
  def read_n_rows(self, name: str, lh5_file: str | h5py.File) -> int | None:
1238
- """Look up the number of rows in an Array-like object called `name` in
1239
- `lh5_file`.
1240
-
1241
- Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`."""
1242
- # this is basically a stripped down version of read_object
1243
- h5f = self.gimme_file(lh5_file, "r")
1244
- if not h5f or name not in h5f:
1245
- msg = f"'{name}' not in {lh5_file}"
1246
- raise KeyError(msg)
1247
-
1248
- # get the datatype
1249
- if "datatype" not in h5f[name].attrs:
1250
- msg = f"'{name}' in file {lh5_file} is missing the datatype attribute"
1251
- raise RuntimeError(msg)
1252
-
1253
- datatype = h5f[name].attrs["datatype"]
1254
- datatype, shape, elements = parse_datatype(datatype)
216
+ """Look up the number of rows in an Array-like object called `name` in `lh5_file`.
1255
217
 
1256
- # scalars are dim-0 datasets
1257
- if datatype == "scalar":
1258
- return None
1259
-
1260
- # structs don't have rows
1261
- if datatype == "struct":
1262
- return None
1263
-
1264
- # tables should have elements with all the same length
1265
- if datatype == "table":
1266
- # read out each of the fields
1267
- rows_read = None
1268
- for field in elements:
1269
- n_rows_read = self.read_n_rows(name + "/" + field, h5f)
1270
- if not rows_read:
1271
- rows_read = n_rows_read
1272
- elif rows_read != n_rows_read:
1273
- log.warning(
1274
- f"'{field}' field in table '{name}' has {rows_read} rows, "
1275
- f"{n_rows_read} was expected"
1276
- )
1277
- return rows_read
1278
-
1279
- # length of vector of vectors is the length of its cumulative_length
1280
- if elements.startswith("array"):
1281
- return self.read_n_rows(f"{name}/cumulative_length", h5f)
1282
-
1283
- # length of vector of encoded vectors is the length of its decoded_size
1284
- if (
1285
- elements.startswith("encoded_array")
1286
- or datatype == "array_of_encoded_equalsized_arrays"
1287
- ):
1288
- return self.read_n_rows(f"{name}/encoded_data", h5f)
1289
-
1290
- # return array length (without reading the array!)
1291
- if "array" in datatype:
1292
- # compute the number of rows to read
1293
- return h5f[name].shape[0]
1294
-
1295
- msg = f"don't know how to read datatype '{datatype}'"
1296
- raise RuntimeError(msg)
1297
-
1298
-
1299
- @nb.njit(parallel=False, fastmath=True)
1300
- def _make_fd_idx(starts, stops, idx):
1301
- k = 0
1302
- if len(starts) < len(stops):
1303
- for i in range(stops[0]):
1304
- idx[k] = i
1305
- k += 1
1306
- stops = stops[1:]
1307
- for j in range(len(starts)):
1308
- for i in range(starts[j], stops[j]):
1309
- idx[k] = i
1310
- k += 1
1311
- return (idx,)
218
+ Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`.
+ """
+ return utils.read_n_rows(name, self.gimme_file(lh5_file, "r"))
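Taken together with the write-mode table in the removed write() docstring above, the slimmed-down write() and read_n_rows() still support the usual append/overwrite patterns. A final hedged sketch (file and dataset names are illustrative):

    import numpy as np
    from lgdo.lh5 import LH5Store
    from lgdo.types import Array

    store = LH5Store()
    store.write(Array(np.arange(5)), "x", "out.lh5", wo_mode="overwrite_file")
    store.write(Array(np.arange(5)), "x", "out.lh5", wo_mode="append")
    assert store.read_n_rows("x", "out.lh5") == 10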