legend-pydataobj 1.5.1__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/METADATA +1 -1
  2. legend_pydataobj-1.6.1.dist-info/RECORD +54 -0
  3. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/WHEEL +1 -1
  4. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/entry_points.txt +1 -0
  5. lgdo/__init__.py +7 -4
  6. lgdo/_version.py +2 -2
  7. lgdo/cli.py +237 -12
  8. lgdo/compression/__init__.py +1 -0
  9. lgdo/lh5/__init__.py +9 -1
  10. lgdo/lh5/_serializers/__init__.py +43 -0
  11. lgdo/lh5/_serializers/read/__init__.py +0 -0
  12. lgdo/lh5/_serializers/read/array.py +34 -0
  13. lgdo/lh5/_serializers/read/composite.py +405 -0
  14. lgdo/lh5/_serializers/read/encoded.py +129 -0
  15. lgdo/lh5/_serializers/read/ndarray.py +104 -0
  16. lgdo/lh5/_serializers/read/scalar.py +34 -0
  17. lgdo/lh5/_serializers/read/utils.py +12 -0
  18. lgdo/lh5/_serializers/read/vector_of_vectors.py +201 -0
  19. lgdo/lh5/_serializers/write/__init__.py +0 -0
  20. lgdo/lh5/_serializers/write/array.py +92 -0
  21. lgdo/lh5/_serializers/write/composite.py +259 -0
  22. lgdo/lh5/_serializers/write/scalar.py +23 -0
  23. lgdo/lh5/_serializers/write/vector_of_vectors.py +95 -0
  24. lgdo/lh5/core.py +272 -0
  25. lgdo/lh5/datatype.py +46 -0
  26. lgdo/lh5/exceptions.py +34 -0
  27. lgdo/lh5/iterator.py +1 -1
  28. lgdo/lh5/store.py +69 -1160
  29. lgdo/lh5/tools.py +27 -53
  30. lgdo/lh5/utils.py +130 -27
  31. lgdo/lh5_store.py +11 -2
  32. lgdo/logging.py +1 -0
  33. lgdo/types/__init__.py +1 -0
  34. lgdo/types/array.py +1 -0
  35. lgdo/types/arrayofequalsizedarrays.py +1 -0
  36. lgdo/types/encoded.py +3 -8
  37. lgdo/types/fixedsizearray.py +1 -0
  38. lgdo/types/struct.py +1 -0
  39. lgdo/types/table.py +46 -5
  40. lgdo/types/vectorofvectors.py +314 -458
  41. lgdo/types/vovutils.py +320 -0
  42. lgdo/types/waveformtable.py +1 -0
  43. lgdo/utils.py +1 -32
  44. legend_pydataobj-1.5.1.dist-info/RECORD +0 -36
  45. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/LICENSE +0 -0
  46. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/top_level.txt +0 -0
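The bulk of the new code is the set of per-type reader modules under lgdo/lh5/_serializers/read/ shown below, split out of the formerly monolithic lgdo/lh5/store.py (+69 -1160). As a minimal sketch (not part of this diff), the new internal entry point could be driven directly as follows; the file name "data.lh5" and object name "geds/raw" are hypothetical, and most users would go through the public lgdo.lh5 API instead:

```python
from lgdo.lh5._serializers.read.composite import _h5_read_lgdo

# _h5_read_lgdo accepts a file path, an open h5py.File, or a list of files
obj, n_rows_read = _h5_read_lgdo("geds/raw", "data.lh5", start_row=0, n_rows=1000)
print(type(obj), n_rows_read)
```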
lgdo/lh5/_serializers/read/composite.py
@@ -0,0 +1,405 @@
+ from __future__ import annotations
+
+ import bisect
+ import logging
+ import sys
+ from collections import defaultdict
+
+ import h5py
+ import numpy as np
+
+ from ....types import (
+     Array,
+     ArrayOfEncodedEqualSizedArrays,
+     ArrayOfEqualSizedArrays,
+     FixedSizeArray,
+     Scalar,
+     Struct,
+     Table,
+     VectorOfEncodedVectors,
+     VectorOfVectors,
+     WaveformTable,
+ )
+ from ... import datatype as dtypeutils
+ from ...exceptions import LH5DecodeError
+ from ...utils import read_n_rows
+ from . import utils
+ from .array import (
+     _h5_read_array,
+     _h5_read_array_of_equalsized_arrays,
+     _h5_read_fixedsize_array,
+ )
+ from .encoded import (
+     _h5_read_array_of_encoded_equalsized_arrays,
+     _h5_read_vector_of_encoded_vectors,
+ )
+ from .scalar import _h5_read_scalar
+ from .vector_of_vectors import _h5_read_vector_of_vectors
+
+ log = logging.getLogger(__name__)
+
+
+ def _h5_read_lgdo(
+     name,
+     h5f,
+     start_row=0,
+     n_rows=sys.maxsize,
+     idx=None,
+     use_h5idx=False,
+     field_mask=None,
+     obj_buf=None,
+     obj_buf_start=0,
+     decompress=True,
+ ):
+     # Handle list-of-files recursively
+     if not isinstance(h5f, (str, h5py.File)):
+         lh5_file = list(h5f)
+         n_rows_read = 0
+
+         for i, h5f in enumerate(lh5_file):
+             if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]):
+                 # a list of lists: must be one per file
+                 idx_i = idx[i]
+             elif idx is not None:
+                 # make idx a proper tuple if it's not one already
+                 if not (isinstance(idx, tuple) and len(idx) == 1):
+                     idx = (idx,)
+                 # idx is a long continuous array
+                 n_rows_i = read_n_rows(name, h5f)
+                 # find the length of the subset of idx that contains indices
+                 # that are less than n_rows_i
+                 n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
+                 # now split idx into idx_i and the remainder
+                 idx_i = (idx[0][:n_rows_to_read_i],)
+                 idx = (idx[0][n_rows_to_read_i:] - n_rows_i,)
+             else:
+                 idx_i = None
+             n_rows_i = n_rows - n_rows_read
+
+             obj_buf, n_rows_read_i = _h5_read_lgdo(
+                 name,
+                 h5f,
+                 start_row=start_row,
+                 n_rows=n_rows_i,
+                 idx=idx_i,
+                 use_h5idx=use_h5idx,
+                 field_mask=field_mask,
+                 obj_buf=obj_buf,
+                 obj_buf_start=obj_buf_start,
+                 decompress=decompress,
+             )
+
+             n_rows_read += n_rows_read_i
+             if n_rows_read >= n_rows or obj_buf is None:
+                 return obj_buf, n_rows_read
+             start_row = 0
+             obj_buf_start += n_rows_read_i
+
+         return obj_buf, n_rows_read
+
+     if not isinstance(h5f, h5py.File):
+         h5f = h5py.File(h5f, mode="r")
+
+     log.debug(
+         f"reading {h5f.filename}:{name}[{start_row}:{n_rows}], decompress = {decompress}, "
+         + (f" with field mask {field_mask}" if field_mask else "")
+     )
+
+     # make idx a proper tuple if it's not one already
+     if not (isinstance(idx, tuple) and len(idx) == 1) and idx is not None:
+         idx = (idx,)
+
+     try:
+         lgdotype = dtypeutils.datatype(h5f[name].attrs["datatype"])
+     except KeyError as e:
+         msg = "dataset not in file or missing 'datatype' attribute"
+         raise LH5DecodeError(msg, h5f, name) from e
+
+     if lgdotype is Scalar:
+         return _h5_read_scalar(
+             name,
+             h5f,
+             obj_buf=obj_buf,
+         )
+
+     # check field_mask and make it a default dict
+     if field_mask is None:
+         field_mask = defaultdict(lambda: True)
+     elif isinstance(field_mask, dict):
+         default = True
+         if len(field_mask) > 0:
+             default = not field_mask[next(iter(field_mask.keys()))]
+         field_mask = defaultdict(lambda: default, field_mask)
+     elif isinstance(field_mask, (list, tuple)):
+         field_mask = defaultdict(bool, {field: True for field in field_mask})
+     elif not isinstance(field_mask, defaultdict):
+         msg = "bad field_mask type"
+         raise ValueError(msg, type(field_mask).__name__)
+
+     if lgdotype is Struct:
+         return _h5_read_struct(
+             name,
+             h5f,
+             start_row=start_row,
+             n_rows=n_rows,
+             idx=idx,
+             use_h5idx=use_h5idx,
+             field_mask=field_mask,
+             decompress=decompress,
+         )
+
+     # Below here is all array-like types. So trim idx if needed
+     if idx is not None:
+         # check if idx is just an ordered list of the integers; if so, we can ignore it
+         if (idx[0] == np.arange(0, len(idx[0]), 1)).all():
+             if n_rows > len(idx[0]):
+                 n_rows = len(idx[0])
+             idx = None
+         else:
+             # chop off indices < start_row
+             i_first_valid = bisect.bisect_left(idx[0], start_row)
+             idxa = idx[0][i_first_valid:]
+             # don't read out more than n_rows indices
+             idx = (idxa[:n_rows],)  # works even if n_rows > len(idxa)
+
+     if lgdotype is Table:
+         return _h5_read_table(
+             name,
+             h5f,
+             start_row=start_row,
+             n_rows=n_rows,
+             idx=idx,
+             use_h5idx=use_h5idx,
+             field_mask=field_mask,
+             obj_buf=obj_buf,
+             obj_buf_start=obj_buf_start,
+             decompress=decompress,
+         )
+
+     if lgdotype is ArrayOfEncodedEqualSizedArrays:
+         return _h5_read_array_of_encoded_equalsized_arrays(
+             name,
+             h5f,
+             start_row=start_row,
+             n_rows=n_rows,
+             idx=idx,
+             use_h5idx=use_h5idx,
+             obj_buf=obj_buf,
+             obj_buf_start=obj_buf_start,
+             decompress=decompress,
+         )
+
+     if lgdotype is VectorOfEncodedVectors:
+         return _h5_read_vector_of_encoded_vectors(
+             name,
+             h5f,
+             start_row=start_row,
+             n_rows=n_rows,
+             idx=idx,
+             use_h5idx=use_h5idx,
+             obj_buf=obj_buf,
+             obj_buf_start=obj_buf_start,
+             decompress=decompress,
+         )
+
+     if lgdotype is VectorOfVectors:
+         return _h5_read_vector_of_vectors(
+             name,
+             h5f,
+             start_row=start_row,
+             n_rows=n_rows,
+             idx=idx,
+             use_h5idx=use_h5idx,
+             obj_buf=obj_buf,
+             obj_buf_start=obj_buf_start,
+         )
+
+     if lgdotype is FixedSizeArray:
+         return _h5_read_fixedsize_array(
+             name,
+             h5f,
+             start_row=start_row,
+             n_rows=n_rows,
+             idx=idx,
+             use_h5idx=use_h5idx,
+             obj_buf=obj_buf,
+             obj_buf_start=obj_buf_start,
+         )
+
+     if lgdotype is ArrayOfEqualSizedArrays:
+         return _h5_read_array_of_equalsized_arrays(
+             name,
+             h5f,
+             start_row=start_row,
+             n_rows=n_rows,
+             idx=idx,
+             use_h5idx=use_h5idx,
+             obj_buf=obj_buf,
+             obj_buf_start=obj_buf_start,
+         )
+
+     if lgdotype is Array:
+         return _h5_read_array(
+             name,
+             h5f,
+             start_row=start_row,
+             n_rows=n_rows,
+             idx=idx,
+             use_h5idx=use_h5idx,
+             obj_buf=obj_buf,
+             obj_buf_start=obj_buf_start,
+         )
+
+     msg = f"no rule to decode {lgdotype.__name__} from LH5"
+     raise LH5DecodeError(msg, h5f, name)
+
+
+ def _h5_read_struct(
+     name,
+     h5f,
+     start_row=0,
+     n_rows=sys.maxsize,
+     idx=None,
+     use_h5idx=False,
+     field_mask=None,
+     decompress=True,
+ ):
+     # TODO: it's strange to pass start_row, n_rows, idx to struct
+     # fields. If they all had shared indexing, they should be in a
+     # table... Maybe should emit a warning? Or allow them to be
+     # dicts keyed by field name?
+
+     attrs = dict(h5f[name].attrs)
+
+     # determine fields to be read out
+     all_fields = dtypeutils.get_struct_fields(attrs["datatype"])
+     selected_fields = (
+         [field for field in all_fields if field_mask[field]]
+         if field_mask is not None
+         else all_fields
+     )
+
+     # modify datatype in attrs if a field_mask was used
+     attrs["datatype"] = "struct{" + ",".join(selected_fields) + "}"
+
+     # loop over fields and read
+     obj_dict = {}
+     for field in selected_fields:
+         # support for integer keys
+         field_key = int(field) if attrs.get("int_keys") else str(field)
+         obj_dict[field_key], _ = _h5_read_lgdo(
+             f"{name}/{field}",
+             h5f,
+             start_row=start_row,
+             n_rows=n_rows,
+             idx=idx,
+             use_h5idx=use_h5idx,
+             decompress=decompress,
+         )
+
+     return Struct(obj_dict=obj_dict, attrs=attrs), 1
+
+
+ def _h5_read_table(
+     name,
+     h5f,
+     start_row=0,
+     n_rows=sys.maxsize,
+     idx=None,
+     use_h5idx=False,
+     field_mask=None,
+     obj_buf=None,
+     obj_buf_start=0,
+     decompress=True,
+ ):
+     if obj_buf is not None and not isinstance(obj_buf, Table):
+         msg = "provided object buffer is not a Table"
+         raise LH5DecodeError(msg, h5f, name)
+
+     attrs = dict(h5f[name].attrs)
+
+     # determine fields to be read out
+     all_fields = dtypeutils.get_struct_fields(attrs["datatype"])
+     selected_fields = (
+         [field for field in all_fields if field_mask[field]]
+         if field_mask is not None
+         else all_fields
+     )
+
+     # modify datatype in attrs if a field_mask was used
+     attrs["datatype"] = "table{" + ",".join(selected_fields) + "}"
+
+     # read out each of the fields
+     col_dict = {}
+     rows_read = []
+     for field in selected_fields:
+         fld_buf = None
+         if obj_buf is not None:
+             if not isinstance(obj_buf, Table) or field not in obj_buf:
+                 msg = "provided object buffer is not a Table or columns are missing"
+                 raise LH5DecodeError(msg, h5f, name)
+
+             fld_buf = obj_buf[field]
+
+         col_dict[field], n_rows_read = _h5_read_lgdo(
+             f"{name}/{field}",
+             h5f,
+             start_row=start_row,
+             n_rows=n_rows,
+             idx=idx,
+             use_h5idx=use_h5idx,
+             obj_buf=fld_buf,
+             obj_buf_start=obj_buf_start,
+             decompress=decompress,
+         )
+
+         if obj_buf is not None and obj_buf_start + n_rows_read > len(obj_buf):
+             obj_buf.resize(obj_buf_start + n_rows_read)
+
+         rows_read.append(n_rows_read)
+
+     # warn if all columns don't read in the same number of rows
+     if len(rows_read) > 0:
+         n_rows_read = rows_read[0]
+     else:
+         n_rows_read = 0
+         log.warning(f"Table '{name}' has no fields specified by {field_mask=}")
+
+     for n in rows_read[1:]:
+         if n != n_rows_read:
+             log.warning(
+                 f"Table '{name}' got strange n_rows_read = {n}, "
+                 f"{n_rows_read} was expected ({rows_read})"
+             )
+
+     # fields have been read out, now return a table
+     if obj_buf is None:
+         # if col_dict contains just 3 objects called t0, dt, and values,
+         # return a WaveformTable
+         if (
+             len(col_dict) == 3
+             and "t0" in col_dict
+             and "dt" in col_dict
+             and "values" in col_dict
+         ):
+             table = WaveformTable(
+                 t0=col_dict["t0"], dt=col_dict["dt"], values=col_dict["values"]
+             )
+         else:
+             table = Table(col_dict=col_dict, attrs=attrs)
+
+         # set (write) loc to end of tree
+         table.loc = n_rows_read
+         return table, n_rows_read
+
+     # We have read all fields into the object buffer. Run
+     # checks: All columns should be the same size. So update
+     # table's size as necessary, warn if any mismatches are found
+     obj_buf.resize(do_warn=True)
+     # set (write) loc to end of tree
+     obj_buf.loc = obj_buf_start + n_rows_read
+
+     # check attributes
+     utils.check_obj_buf_attrs(obj_buf.attrs, attrs, h5f, name)
+
+     return obj_buf, n_rows_read
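`_h5_read_lgdo` above normalizes several `field_mask` forms into a `defaultdict` before dispatching to the struct and table readers. A hedged sketch of the accepted forms (the file and table names are hypothetical):

```python
from lgdo.lh5._serializers.read.composite import _h5_read_lgdo

# a list or tuple keeps only the listed columns
tbl, n = _h5_read_lgdo("geds/raw", "data.lh5", field_mask=["energy", "timestamp"])

# a dict sets the default to the opposite of its first value, so
# {"waveform": False} reads every column except "waveform"
tbl, n = _h5_read_lgdo("geds/raw", "data.lh5", field_mask={"waveform": False})
```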
lgdo/lh5/_serializers/read/encoded.py
@@ -0,0 +1,129 @@
+ from __future__ import annotations
+
+ import logging
+ import sys
+
+ from .... import compression as compress
+ from ....types import (
+     ArrayOfEncodedEqualSizedArrays,
+     VectorOfEncodedVectors,
+ )
+ from ...exceptions import LH5DecodeError
+ from .array import (
+     _h5_read_array,
+ )
+ from .scalar import _h5_read_scalar
+ from .vector_of_vectors import _h5_read_vector_of_vectors
+
+ log = logging.getLogger(__name__)
+
+
+ def _h5_read_array_of_encoded_equalsized_arrays(
+     name,
+     h5f,
+     **kwargs,
+ ):
+     return _h5_read_encoded_array(ArrayOfEncodedEqualSizedArrays, name, h5f, **kwargs)
+
+
+ def _h5_read_vector_of_encoded_vectors(
+     name,
+     h5f,
+     **kwargs,
+ ):
+     return _h5_read_encoded_array(VectorOfEncodedVectors, name, h5f, **kwargs)
+
+
+ def _h5_read_encoded_array(
+     lgdotype,
+     name,
+     h5f,
+     start_row=0,
+     n_rows=sys.maxsize,
+     idx=None,
+     use_h5idx=False,
+     obj_buf=None,
+     obj_buf_start=0,
+     decompress=True,
+ ):
+     if lgdotype not in (ArrayOfEncodedEqualSizedArrays, VectorOfEncodedVectors):
+         msg = f"unsupported read of encoded type {lgdotype.__name__}"
+         raise LH5DecodeError(msg, h5f, name)
+
+     if not decompress and obj_buf is not None and not isinstance(obj_buf, lgdotype):
+         msg = f"object buffer is not a {lgdotype.__name__}"
+         raise LH5DecodeError(msg, h5f, name)
+
+     # read out decoded_size, either a Scalar or an Array
+     decoded_size_buf = encoded_data_buf = None
+     if obj_buf is not None and not decompress:
+         decoded_size_buf = obj_buf.decoded_size
+         encoded_data_buf = obj_buf.encoded_data
+
+     if lgdotype is VectorOfEncodedVectors:
+         decoded_size, _ = _h5_read_array(
+             f"{name}/decoded_size",
+             h5f,
+             start_row=start_row,
+             n_rows=n_rows,
+             idx=idx,
+             use_h5idx=use_h5idx,
+             obj_buf=None if decompress else decoded_size_buf,
+             obj_buf_start=0 if decompress else obj_buf_start,
+         )
+
+     else:
+         decoded_size, _ = _h5_read_scalar(
+             f"{name}/decoded_size",
+             h5f,
+             obj_buf=None if decompress else decoded_size_buf,
+         )
+
+     # read out encoded_data, a VectorOfVectors
+     encoded_data, n_rows_read = _h5_read_vector_of_vectors(
+         f"{name}/encoded_data",
+         h5f,
+         start_row=start_row,
+         n_rows=n_rows,
+         idx=idx,
+         use_h5idx=use_h5idx,
+         obj_buf=None if decompress else encoded_data_buf,
+         obj_buf_start=0 if decompress else obj_buf_start,
+     )
+
+     # return the still encoded data in the buffer object, if there
+     if obj_buf is not None and not decompress:
+         return obj_buf, n_rows_read
+
+     # otherwise re-create the encoded LGDO
+     rawdata = lgdotype(
+         encoded_data=encoded_data,
+         decoded_size=decoded_size,
+         attrs=h5f[name].attrs,
+     )
+
+     # already return if no decompression is requested
+     if not decompress:
+         return rawdata, n_rows_read
+
+     # if no buffer, decode and return
+     if obj_buf is None and decompress:
+         return compress.decode(rawdata), n_rows_read
+
+     # eventually expand provided obj_buf, if too short
+     buf_size = obj_buf_start + n_rows_read
+     if len(obj_buf) < buf_size:
+         obj_buf.resize(buf_size)
+
+     # use the (decoded object type) buffer otherwise
+     if lgdotype is ArrayOfEncodedEqualSizedArrays:
+         compress.decode(rawdata, obj_buf[obj_buf_start:buf_size])
+
+     elif lgdotype is VectorOfEncodedVectors:
+         # FIXME: not a good idea. an in place decoding version
+         # of decode would be needed to avoid extra memory
+         # allocations
+         for i, wf in enumerate(compress.decode(rawdata)):
+             obj_buf[obj_buf_start + i] = wf
+
+     return obj_buf, n_rows_read
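The encoded-array reader above can either decode on the fly or hand back the still-compressed representation, controlled by the `decompress` flag. A hedged usage sketch (the dataset path and file name are hypothetical):

```python
import h5py

from lgdo.lh5._serializers.read.encoded import (
    _h5_read_array_of_encoded_equalsized_arrays,
)

with h5py.File("data.lh5", "r") as f:
    # default: decode while reading and return the decoded LGDO
    decoded, n = _h5_read_array_of_encoded_equalsized_arrays(
        "geds/raw/waveform/values", f, decompress=True
    )
    # keep the compressed representation (ArrayOfEncodedEqualSizedArrays)
    encoded, n = _h5_read_array_of_encoded_equalsized_arrays(
        "geds/raw/waveform/values", f, decompress=False
    )
```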
lgdo/lh5/_serializers/read/ndarray.py
@@ -0,0 +1,104 @@
+ from __future__ import annotations
+
+ import logging
+ import sys
+ from bisect import bisect_left
+
+ import numpy as np
+
+ from ....types import Array
+ from ... import datatype
+ from ...exceptions import LH5DecodeError
+
+ log = logging.getLogger(__name__)
+
+
+ def _h5_read_ndarray(
+     name,
+     h5f,
+     start_row=0,
+     n_rows=sys.maxsize,
+     idx=None,
+     use_h5idx=False,
+     obj_buf=None,
+     obj_buf_start=0,
+ ):
+     if obj_buf is not None and not isinstance(obj_buf, Array):
+         msg = "object buffer is not an Array"
+         raise LH5DecodeError(msg, h5f, name)
+
+     # compute the number of rows to read
+     # we culled idx above for start_row and n_rows, now we have to apply
+     # the constraint of the length of the dataset
+     try:
+         ds_n_rows = h5f[name].shape[0]
+     except AttributeError as e:
+         msg = "does not seem to be an HDF5 dataset"
+         raise LH5DecodeError(msg, h5f, name) from e
+
+     if idx is not None:
+         if len(idx[0]) > 0 and idx[0][-1] >= ds_n_rows:
+             log.warning("idx indexed past the end of the array in the file. Culling...")
+             n_rows_to_read = bisect_left(idx[0], ds_n_rows)
+             idx = (idx[0][:n_rows_to_read],)
+             if len(idx[0]) == 0:
+                 log.warning("idx empty after culling.")
+         n_rows_to_read = len(idx[0])
+     else:
+         n_rows_to_read = ds_n_rows - start_row
+     if n_rows_to_read > n_rows:
+         n_rows_to_read = n_rows
+
+     # if idx is passed, check if we can make it a slice instead (faster)
+     change_idx_to_slice = False
+
+     # prepare the selection for the read. Use idx if available
+     if idx is not None:
+         # check if idx is empty and convert to slice instead
+         if len(idx[0]) == 0:
+             source_sel = np.s_[0:0]
+             change_idx_to_slice = True
+         # check if idx is contiguous and increasing
+         # if so, convert it to a slice instead (faster)
+         elif np.all(np.diff(idx[0]) == 1):
+             source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
+             change_idx_to_slice = True
+         else:
+             source_sel = idx
+     else:
+         source_sel = np.s_[start_row : start_row + n_rows_to_read]
+
+     # Now read the array
+     if obj_buf is not None and n_rows_to_read > 0:
+         buf_size = obj_buf_start + n_rows_to_read
+         if len(obj_buf) < buf_size:
+             obj_buf.resize(buf_size)
+         dest_sel = np.s_[obj_buf_start:buf_size]
+
+         # this is required to make the read of multiple files faster
+         # until a better solution found.
+         if change_idx_to_slice or idx is None or use_h5idx:
+             h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
+         else:
+             # it is faster to read the whole object and then do fancy indexing
+             obj_buf.nda[dest_sel] = h5f[name][...][source_sel]
+
+         nda = obj_buf.nda
+     elif n_rows == 0:
+         tmp_shape = (0,) + h5f[name].shape[1:]
+         nda = np.empty(tmp_shape, h5f[name].dtype)
+     elif change_idx_to_slice or idx is None or use_h5idx:
+         nda = h5f[name][source_sel]
+     else:
+         # it is faster to read the whole object and then do fancy indexing
+         nda = h5f[name][...][source_sel]
+
+     # Finally, set attributes and return objects
+     attrs = h5f[name].attrs
+
+     # special handling for bools
+     # (c and Julia store as uint8 so cast to bool)
+     if datatype.get_nested_datatype_string(attrs["datatype"]) == "bool":
+         nda = nda.astype(np.bool_)
+
+     return (nda, attrs, n_rows_to_read)
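A standalone illustration of the idx-to-slice shortcut used in `_h5_read_ndarray` above: a contiguous, increasing index array is converted into a slice so h5py can perform a fast hyperslab read instead of fancy indexing (the index values here are made up for the example):

```python
import numpy as np

idx = (np.array([3, 4, 5, 6]),)

if len(idx[0]) == 0:
    source_sel = np.s_[0:0]  # empty selection
elif np.all(np.diff(idx[0]) == 1):
    # contiguous and increasing: use the fast slice path, slice(3, 7)
    source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
else:
    source_sel = idx  # genuine fancy indexing, slower
```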
lgdo/lh5/_serializers/read/scalar.py
@@ -0,0 +1,34 @@
+ from __future__ import annotations
+
+ import logging
+
+ import numpy as np
+
+ from ....types import Scalar
+ from ...exceptions import LH5DecodeError
+
+ log = logging.getLogger(__name__)
+
+
+ def _h5_read_scalar(
+     name,
+     h5f,
+     obj_buf=None,
+ ):
+     value = h5f[name][()]
+
+     # special handling for bools
+     # (c and Julia store as uint8 so cast to bool)
+     if h5f[name].attrs["datatype"] == "bool":
+         value = np.bool_(value)
+
+     if obj_buf is not None:
+         if not isinstance(obj_buf, Scalar):
+             msg = "object buffer is not a Scalar"
+             raise LH5DecodeError(msg, h5f, name)
+
+         obj_buf.value = value
+         obj_buf.attrs.update(h5f[name].attrs)
+         return obj_buf, 1
+
+     return Scalar(value=value, attrs=h5f[name].attrs), 1
lgdo/lh5/_serializers/read/utils.py
@@ -0,0 +1,12 @@
+ from __future__ import annotations
+
+ from ...exceptions import LH5DecodeError
+
+
+ def check_obj_buf_attrs(attrs, new_attrs, file, name):
+     if set(attrs.keys()) != set(new_attrs.keys()):
+         msg = (
+             f"existing buffer and new data chunk have different attributes: "
+             f"obj_buf.attrs={attrs} != {file.filename}[{name}].attrs={new_attrs}"
+         )
+         raise LH5DecodeError(msg, file, name)
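`check_obj_buf_attrs` guards against reusing a read buffer whose attributes do not match the next chunk on disk. A hedged sketch of its behavior (the attribute dicts, file name, and object name are invented for illustration):

```python
import h5py

from lgdo.lh5._serializers.read.utils import check_obj_buf_attrs
from lgdo.lh5.exceptions import LH5DecodeError

buf_attrs = {"datatype": "array<1>{real_32}", "units": "ns"}
new_attrs = {"datatype": "array<1>{real_32}"}

with h5py.File("data.lh5", "r") as f:  # hypothetical file
    try:
        check_obj_buf_attrs(buf_attrs, new_attrs, f, "geds/raw/timestamp")
    except LH5DecodeError as err:
        # the key sets differ ("units" is missing from the new chunk),
        # so the read into the existing buffer is refused
        print(err)
```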