legend_pydataobj-1.5.1-py3-none-any.whl → legend_pydataobj-1.6.1-py3-none-any.whl
- {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/METADATA +1 -1
- legend_pydataobj-1.6.1.dist-info/RECORD +54 -0
- {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/WHEEL +1 -1
- {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/entry_points.txt +1 -0
- lgdo/__init__.py +7 -4
- lgdo/_version.py +2 -2
- lgdo/cli.py +237 -12
- lgdo/compression/__init__.py +1 -0
- lgdo/lh5/__init__.py +9 -1
- lgdo/lh5/_serializers/__init__.py +43 -0
- lgdo/lh5/_serializers/read/__init__.py +0 -0
- lgdo/lh5/_serializers/read/array.py +34 -0
- lgdo/lh5/_serializers/read/composite.py +405 -0
- lgdo/lh5/_serializers/read/encoded.py +129 -0
- lgdo/lh5/_serializers/read/ndarray.py +104 -0
- lgdo/lh5/_serializers/read/scalar.py +34 -0
- lgdo/lh5/_serializers/read/utils.py +12 -0
- lgdo/lh5/_serializers/read/vector_of_vectors.py +201 -0
- lgdo/lh5/_serializers/write/__init__.py +0 -0
- lgdo/lh5/_serializers/write/array.py +92 -0
- lgdo/lh5/_serializers/write/composite.py +259 -0
- lgdo/lh5/_serializers/write/scalar.py +23 -0
- lgdo/lh5/_serializers/write/vector_of_vectors.py +95 -0
- lgdo/lh5/core.py +272 -0
- lgdo/lh5/datatype.py +46 -0
- lgdo/lh5/exceptions.py +34 -0
- lgdo/lh5/iterator.py +1 -1
- lgdo/lh5/store.py +69 -1160
- lgdo/lh5/tools.py +27 -53
- lgdo/lh5/utils.py +130 -27
- lgdo/lh5_store.py +11 -2
- lgdo/logging.py +1 -0
- lgdo/types/__init__.py +1 -0
- lgdo/types/array.py +1 -0
- lgdo/types/arrayofequalsizedarrays.py +1 -0
- lgdo/types/encoded.py +3 -8
- lgdo/types/fixedsizearray.py +1 -0
- lgdo/types/struct.py +1 -0
- lgdo/types/table.py +46 -5
- lgdo/types/vectorofvectors.py +314 -458
- lgdo/types/vovutils.py +320 -0
- lgdo/types/waveformtable.py +1 -0
- lgdo/utils.py +1 -32
- legend_pydataobj-1.5.1.dist-info/RECORD +0 -36
- {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/LICENSE +0 -0
- {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/top_level.txt +0 -0
lgdo/lh5/_serializers/read/composite.py (new file)

@@ -0,0 +1,405 @@
+from __future__ import annotations
+
+import bisect
+import logging
+import sys
+from collections import defaultdict
+
+import h5py
+import numpy as np
+
+from ....types import (
+    Array,
+    ArrayOfEncodedEqualSizedArrays,
+    ArrayOfEqualSizedArrays,
+    FixedSizeArray,
+    Scalar,
+    Struct,
+    Table,
+    VectorOfEncodedVectors,
+    VectorOfVectors,
+    WaveformTable,
+)
+from ... import datatype as dtypeutils
+from ...exceptions import LH5DecodeError
+from ...utils import read_n_rows
+from . import utils
+from .array import (
+    _h5_read_array,
+    _h5_read_array_of_equalsized_arrays,
+    _h5_read_fixedsize_array,
+)
+from .encoded import (
+    _h5_read_array_of_encoded_equalsized_arrays,
+    _h5_read_vector_of_encoded_vectors,
+)
+from .scalar import _h5_read_scalar
+from .vector_of_vectors import _h5_read_vector_of_vectors
+
+log = logging.getLogger(__name__)
+
+
+def _h5_read_lgdo(
+    name,
+    h5f,
+    start_row=0,
+    n_rows=sys.maxsize,
+    idx=None,
+    use_h5idx=False,
+    field_mask=None,
+    obj_buf=None,
+    obj_buf_start=0,
+    decompress=True,
+):
+    # Handle list-of-files recursively
+    if not isinstance(h5f, (str, h5py.File)):
+        lh5_file = list(h5f)
+        n_rows_read = 0
+
+        for i, h5f in enumerate(lh5_file):
+            if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]):
+                # a list of lists: must be one per file
+                idx_i = idx[i]
+            elif idx is not None:
+                # make idx a proper tuple if it's not one already
+                if not (isinstance(idx, tuple) and len(idx) == 1):
+                    idx = (idx,)
+                # idx is a long continuous array
+                n_rows_i = read_n_rows(name, h5f)
+                # find the length of the subset of idx that contains indices
+                # that are less than n_rows_i
+                n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
+                # now split idx into idx_i and the remainder
+                idx_i = (idx[0][:n_rows_to_read_i],)
+                idx = (idx[0][n_rows_to_read_i:] - n_rows_i,)
+            else:
+                idx_i = None
+            n_rows_i = n_rows - n_rows_read
+
+            obj_buf, n_rows_read_i = _h5_read_lgdo(
+                name,
+                h5f,
+                start_row=start_row,
+                n_rows=n_rows_i,
+                idx=idx_i,
+                use_h5idx=use_h5idx,
+                field_mask=field_mask,
+                obj_buf=obj_buf,
+                obj_buf_start=obj_buf_start,
+                decompress=decompress,
+            )
+
+            n_rows_read += n_rows_read_i
+            if n_rows_read >= n_rows or obj_buf is None:
+                return obj_buf, n_rows_read
+            start_row = 0
+            obj_buf_start += n_rows_read_i
+
+        return obj_buf, n_rows_read
+
+    if not isinstance(h5f, h5py.File):
+        h5f = h5py.File(h5f, mode="r")
+
+    log.debug(
+        f"reading {h5f.filename}:{name}[{start_row}:{n_rows}], decompress = {decompress}, "
+        + (f" with field mask {field_mask}" if field_mask else "")
+    )
+
+    # make idx a proper tuple if it's not one already
+    if not (isinstance(idx, tuple) and len(idx) == 1) and idx is not None:
+        idx = (idx,)
+
+    try:
+        lgdotype = dtypeutils.datatype(h5f[name].attrs["datatype"])
+    except KeyError as e:
+        msg = "dataset not in file or missing 'datatype' attribute"
+        raise LH5DecodeError(msg, h5f, name) from e
+
+    if lgdotype is Scalar:
+        return _h5_read_scalar(
+            name,
+            h5f,
+            obj_buf=obj_buf,
+        )
+
+    # check field_mask and make it a default dict
+    if field_mask is None:
+        field_mask = defaultdict(lambda: True)
+    elif isinstance(field_mask, dict):
+        default = True
+        if len(field_mask) > 0:
+            default = not field_mask[next(iter(field_mask.keys()))]
+        field_mask = defaultdict(lambda: default, field_mask)
+    elif isinstance(field_mask, (list, tuple)):
+        field_mask = defaultdict(bool, {field: True for field in field_mask})
+    elif not isinstance(field_mask, defaultdict):
+        msg = "bad field_mask type"
+        raise ValueError(msg, type(field_mask).__name__)
+
+    if lgdotype is Struct:
+        return _h5_read_struct(
+            name,
+            h5f,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            use_h5idx=use_h5idx,
+            field_mask=field_mask,
+            decompress=decompress,
+        )
+
+    # Below here is all array-like types. So trim idx if needed
+    if idx is not None:
+        # check if idx is just an ordered list of the integers if so can ignore
+        if (idx[0] == np.arange(0, len(idx[0]), 1)).all():
+            if n_rows > len(idx[0]):
+                n_rows = len(idx[0])
+            idx = None
+        else:
+            # chop off indices < start_row
+            i_first_valid = bisect.bisect_left(idx[0], start_row)
+            idxa = idx[0][i_first_valid:]
+            # don't readout more than n_rows indices
+            idx = (idxa[:n_rows],)  # works even if n_rows > len(idxa)
+
+    if lgdotype is Table:
+        return _h5_read_table(
+            name,
+            h5f,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            use_h5idx=use_h5idx,
+            field_mask=field_mask,
+            obj_buf=obj_buf,
+            obj_buf_start=obj_buf_start,
+            decompress=decompress,
+        )
+
+    if lgdotype is ArrayOfEncodedEqualSizedArrays:
+        return _h5_read_array_of_encoded_equalsized_arrays(
+            name,
+            h5f,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            use_h5idx=use_h5idx,
+            obj_buf=obj_buf,
+            obj_buf_start=obj_buf_start,
+            decompress=decompress,
+        )
+
+    if lgdotype is VectorOfEncodedVectors:
+        return _h5_read_vector_of_encoded_vectors(
+            name,
+            h5f,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            use_h5idx=use_h5idx,
+            obj_buf=obj_buf,
+            obj_buf_start=obj_buf_start,
+            decompress=decompress,
+        )
+
+    if lgdotype is VectorOfVectors:
+        return _h5_read_vector_of_vectors(
+            name,
+            h5f,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            use_h5idx=use_h5idx,
+            obj_buf=obj_buf,
+            obj_buf_start=obj_buf_start,
+        )
+
+    if lgdotype is FixedSizeArray:
+        return _h5_read_fixedsize_array(
+            name,
+            h5f,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            use_h5idx=use_h5idx,
+            obj_buf=obj_buf,
+            obj_buf_start=obj_buf_start,
+        )
+
+    if lgdotype is ArrayOfEqualSizedArrays:
+        return _h5_read_array_of_equalsized_arrays(
+            name,
+            h5f,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            use_h5idx=use_h5idx,
+            obj_buf=obj_buf,
+            obj_buf_start=obj_buf_start,
+        )
+
+    if lgdotype is Array:
+        return _h5_read_array(
+            name,
+            h5f,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            use_h5idx=use_h5idx,
+            obj_buf=obj_buf,
+            obj_buf_start=obj_buf_start,
+        )
+
+    msg = f"no rule to decode {lgdotype.__name__} from LH5"
+    raise LH5DecodeError(msg, h5f, name)
+
+
+def _h5_read_struct(
+    name,
+    h5f,
+    start_row=0,
+    n_rows=sys.maxsize,
+    idx=None,
+    use_h5idx=False,
+    field_mask=None,
+    decompress=True,
+):
+    # TODO: it's strange to pass start_row, n_rows, idx to struct
+    # fields. If they all had shared indexing, they should be in a
+    # table... Maybe should emit a warning? Or allow them to be
+    # dicts keyed by field name?
+
+    attrs = dict(h5f[name].attrs)
+
+    # determine fields to be read out
+    all_fields = dtypeutils.get_struct_fields(attrs["datatype"])
+    selected_fields = (
+        [field for field in all_fields if field_mask[field]]
+        if field_mask is not None
+        else all_fields
+    )
+
+    # modify datatype in attrs if a field_mask was used
+    attrs["datatype"] = "struct{" + ",".join(selected_fields) + "}"
+
+    # loop over fields and read
+    obj_dict = {}
+    for field in selected_fields:
+        # support for integer keys
+        field_key = int(field) if attrs.get("int_keys") else str(field)
+        obj_dict[field_key], _ = _h5_read_lgdo(
+            f"{name}/{field}",
+            h5f,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            use_h5idx=use_h5idx,
+            decompress=decompress,
+        )
+
+    return Struct(obj_dict=obj_dict, attrs=attrs), 1
+
+
+def _h5_read_table(
+    name,
+    h5f,
+    start_row=0,
+    n_rows=sys.maxsize,
+    idx=None,
+    use_h5idx=False,
+    field_mask=None,
+    obj_buf=None,
+    obj_buf_start=0,
+    decompress=True,
+):
+    if obj_buf is not None and not isinstance(obj_buf, Table):
+        msg = "provided object buffer is not a Table"
+        raise LH5DecodeError(msg, h5f, name)
+
+    attrs = dict(h5f[name].attrs)
+
+    # determine fields to be read out
+    all_fields = dtypeutils.get_struct_fields(attrs["datatype"])
+    selected_fields = (
+        [field for field in all_fields if field_mask[field]]
+        if field_mask is not None
+        else all_fields
+    )
+
+    # modify datatype in attrs if a field_mask was used
+    attrs["datatype"] = "table{" + ",".join(selected_fields) + "}"
+
+    # read out each of the fields
+    col_dict = {}
+    rows_read = []
+    for field in selected_fields:
+        fld_buf = None
+        if obj_buf is not None:
+            if not isinstance(obj_buf, Table) or field not in obj_buf:
+                msg = "provided object buffer is not a Table or columns are missing"
+                raise LH5DecodeError(msg, h5f, name)
+
+            fld_buf = obj_buf[field]
+
+        col_dict[field], n_rows_read = _h5_read_lgdo(
+            f"{name}/{field}",
+            h5f,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            use_h5idx=use_h5idx,
+            obj_buf=fld_buf,
+            obj_buf_start=obj_buf_start,
+            decompress=decompress,
+        )
+
+        if obj_buf is not None and obj_buf_start + n_rows_read > len(obj_buf):
+            obj_buf.resize(obj_buf_start + n_rows_read)
+
+        rows_read.append(n_rows_read)
+
+    # warn if all columns don't read in the same number of rows
+    if len(rows_read) > 0:
+        n_rows_read = rows_read[0]
+    else:
+        n_rows_read = 0
+        log.warning(f"Table '{name}' has no fields specified by {field_mask=}")
+
+    for n in rows_read[1:]:
+        if n != n_rows_read:
+            log.warning(
+                f"Table '{name}' got strange n_rows_read = {n}, "
+                f"{n_rows_read} was expected ({rows_read})"
+            )
+
+    # fields have been read out, now return a table
+    if obj_buf is None:
+        # if col_dict contains just 3 objects called t0, dt, and values,
+        # return a WaveformTable
+        if (
+            len(col_dict) == 3
+            and "t0" in col_dict
+            and "dt" in col_dict
+            and "values" in col_dict
+        ):
+            table = WaveformTable(
+                t0=col_dict["t0"], dt=col_dict["dt"], values=col_dict["values"]
+            )
+        else:
+            table = Table(col_dict=col_dict, attrs=attrs)
+
+        # set (write) loc to end of tree
+        table.loc = n_rows_read
+        return table, n_rows_read
+
+    # We have read all fields into the object buffer. Run
+    # checks: All columns should be the same size. So update
+    # table's size as necessary, warn if any mismatches are found
+    obj_buf.resize(do_warn=True)
+    # set (write) loc to end of tree
+    obj_buf.loc = obj_buf_start + n_rows_read
+
+    # check attributes
+    utils.check_obj_buf_attrs(obj_buf.attrs, attrs, h5f, name)
+
+    return obj_buf, n_rows_read
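The field-mask handling in `_h5_read_lgdo` above accepts `None`, a plain dict, a list or tuple of field names, or an already-built `defaultdict`, and normalizes them all to a `defaultdict` whose default is the opposite of the first explicit entry. A minimal standalone sketch of that normalization logic (standard library only; the helper name is illustrative, not part of the package):

```python
from collections import defaultdict


def normalize_field_mask(field_mask):
    """Illustrative re-statement of the field_mask normalization above."""
    if field_mask is None:
        # no mask: every field is read
        return defaultdict(lambda: True)
    if isinstance(field_mask, defaultdict):
        return field_mask
    if isinstance(field_mask, dict):
        # the polarity of the first entry implies the opposite default for the rest
        default = True
        if len(field_mask) > 0:
            default = not field_mask[next(iter(field_mask.keys()))]
        return defaultdict(lambda: default, field_mask)
    if isinstance(field_mask, (list, tuple)):
        # a plain list of field names means "read only these"
        return defaultdict(bool, {field: True for field in field_mask})
    raise ValueError("bad field_mask type", type(field_mask).__name__)


mask = normalize_field_mask({"energy": False})
assert mask["energy"] is False       # explicitly excluded
assert mask["timestamp"] is True     # everything else defaults to True
```

In other words, `{"energy": False}` reads everything except `energy`, while `["energy"]` reads only `energy`.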
lgdo/lh5/_serializers/read/encoded.py (new file)

@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+import logging
+import sys
+
+from .... import compression as compress
+from ....types import (
+    ArrayOfEncodedEqualSizedArrays,
+    VectorOfEncodedVectors,
+)
+from ...exceptions import LH5DecodeError
+from .array import (
+    _h5_read_array,
+)
+from .scalar import _h5_read_scalar
+from .vector_of_vectors import _h5_read_vector_of_vectors
+
+log = logging.getLogger(__name__)
+
+
+def _h5_read_array_of_encoded_equalsized_arrays(
+    name,
+    h5f,
+    **kwargs,
+):
+    return _h5_read_encoded_array(ArrayOfEncodedEqualSizedArrays, name, h5f, **kwargs)
+
+
+def _h5_read_vector_of_encoded_vectors(
+    name,
+    h5f,
+    **kwargs,
+):
+    return _h5_read_encoded_array(VectorOfEncodedVectors, name, h5f, **kwargs)
+
+
+def _h5_read_encoded_array(
+    lgdotype,
+    name,
+    h5f,
+    start_row=0,
+    n_rows=sys.maxsize,
+    idx=None,
+    use_h5idx=False,
+    obj_buf=None,
+    obj_buf_start=0,
+    decompress=True,
+):
+    if lgdotype not in (ArrayOfEncodedEqualSizedArrays, VectorOfEncodedVectors):
+        msg = f"unsupported read of encoded type {lgdotype.__name__}"
+        raise LH5DecodeError(msg, h5f, name)
+
+    if not decompress and obj_buf is not None and not isinstance(obj_buf, lgdotype):
+        msg = f"object buffer is not a {lgdotype.__name__}"
+        raise LH5DecodeError(msg, h5f, name)
+
+    # read out decoded_size, either a Scalar or an Array
+    decoded_size_buf = encoded_data_buf = None
+    if obj_buf is not None and not decompress:
+        decoded_size_buf = obj_buf.decoded_size
+        encoded_data_buf = obj_buf.encoded_data
+
+    if lgdotype is VectorOfEncodedVectors:
+        decoded_size, _ = _h5_read_array(
+            f"{name}/decoded_size",
+            h5f,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            use_h5idx=use_h5idx,
+            obj_buf=None if decompress else decoded_size_buf,
+            obj_buf_start=0 if decompress else obj_buf_start,
+        )
+
+    else:
+        decoded_size, _ = _h5_read_scalar(
+            f"{name}/decoded_size",
+            h5f,
+            obj_buf=None if decompress else decoded_size_buf,
+        )
+
+    # read out encoded_data, a VectorOfVectors
+    encoded_data, n_rows_read = _h5_read_vector_of_vectors(
+        f"{name}/encoded_data",
+        h5f,
+        start_row=start_row,
+        n_rows=n_rows,
+        idx=idx,
+        use_h5idx=use_h5idx,
+        obj_buf=None if decompress else encoded_data_buf,
+        obj_buf_start=0 if decompress else obj_buf_start,
+    )
+
+    # return the still encoded data in the buffer object, if there
+    if obj_buf is not None and not decompress:
+        return obj_buf, n_rows_read
+
+    # otherwise re-create the encoded LGDO
+    rawdata = lgdotype(
+        encoded_data=encoded_data,
+        decoded_size=decoded_size,
+        attrs=h5f[name].attrs,
+    )
+
+    # already return if no decompression is requested
+    if not decompress:
+        return rawdata, n_rows_read
+
+    # if no buffer, decode and return
+    if obj_buf is None and decompress:
+        return compress.decode(rawdata), n_rows_read
+
+    # eventually expand provided obj_buf, if too short
+    buf_size = obj_buf_start + n_rows_read
+    if len(obj_buf) < buf_size:
+        obj_buf.resize(buf_size)
+
+    # use the (decoded object type) buffer otherwise
+    if lgdotype is ArrayOfEncodedEqualSizedArrays:
+        compress.decode(rawdata, obj_buf[obj_buf_start:buf_size])
+
+    elif lgdotype is VectorOfEncodedVectors:
+        # FIXME: not a good idea. an in place decoding version
+        # of decode would be needed to avoid extra memory
+        # allocations
+        for i, wf in enumerate(compress.decode(rawdata)):
+            obj_buf[obj_buf_start + i] = wf
+
+    return obj_buf, n_rows_read
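The FIXME in the `VectorOfEncodedVectors` branch above points at a familiar trade-off: `compress.decode(rawdata)` allocates a new decoded object that is then copied element by element into `obj_buf`, whereas a decoder with an out-parameter could write straight into the caller's buffer. A generic NumPy sketch of that out-parameter pattern (a toy stand-in for the idea, not the package's `compress.decode` API):

```python
import numpy as np


def decode_into(encoded: np.ndarray, out: np.ndarray) -> np.ndarray:
    """Toy 'decoder' (a cumulative sum) that writes into a caller-provided buffer."""
    np.cumsum(encoded, out=out)  # no intermediate array is allocated
    return out


buf = np.empty(4, dtype=np.int64)
decode_into(np.array([1, 2, 3, 4], dtype=np.int64), buf)
print(buf)  # [ 1  3  6 10]
```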
lgdo/lh5/_serializers/read/ndarray.py (new file)

@@ -0,0 +1,104 @@
+from __future__ import annotations
+
+import logging
+import sys
+from bisect import bisect_left
+
+import numpy as np
+
+from ....types import Array
+from ... import datatype
+from ...exceptions import LH5DecodeError
+
+log = logging.getLogger(__name__)
+
+
+def _h5_read_ndarray(
+    name,
+    h5f,
+    start_row=0,
+    n_rows=sys.maxsize,
+    idx=None,
+    use_h5idx=False,
+    obj_buf=None,
+    obj_buf_start=0,
+):
+    if obj_buf is not None and not isinstance(obj_buf, Array):
+        msg = "object buffer is not an Array"
+        raise LH5DecodeError(msg, h5f, name)
+
+    # compute the number of rows to read
+    # we culled idx above for start_row and n_rows, now we have to apply
+    # the constraint of the length of the dataset
+    try:
+        ds_n_rows = h5f[name].shape[0]
+    except AttributeError as e:
+        msg = "does not seem to be an HDF5 dataset"
+        raise LH5DecodeError(msg, h5f, name) from e
+
+    if idx is not None:
+        if len(idx[0]) > 0 and idx[0][-1] >= ds_n_rows:
+            log.warning("idx indexed past the end of the array in the file. Culling...")
+            n_rows_to_read = bisect_left(idx[0], ds_n_rows)
+            idx = (idx[0][:n_rows_to_read],)
+            if len(idx[0]) == 0:
+                log.warning("idx empty after culling.")
+        n_rows_to_read = len(idx[0])
+    else:
+        n_rows_to_read = ds_n_rows - start_row
+    if n_rows_to_read > n_rows:
+        n_rows_to_read = n_rows
+
+    # if idx is passed, check if we can make it a slice instead (faster)
+    change_idx_to_slice = False
+
+    # prepare the selection for the read. Use idx if available
+    if idx is not None:
+        # check if idx is empty and convert to slice instead
+        if len(idx[0]) == 0:
+            source_sel = np.s_[0:0]
+            change_idx_to_slice = True
+        # check if idx is contiguous and increasing
+        # if so, convert it to a slice instead (faster)
+        elif np.all(np.diff(idx[0]) == 1):
+            source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
+            change_idx_to_slice = True
+        else:
+            source_sel = idx
+    else:
+        source_sel = np.s_[start_row : start_row + n_rows_to_read]
+
+    # Now read the array
+    if obj_buf is not None and n_rows_to_read > 0:
+        buf_size = obj_buf_start + n_rows_to_read
+        if len(obj_buf) < buf_size:
+            obj_buf.resize(buf_size)
+        dest_sel = np.s_[obj_buf_start:buf_size]
+
+        # this is required to make the read of multiple files faster
+        # until a better solution found.
+        if change_idx_to_slice or idx is None or use_h5idx:
+            h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
+        else:
+            # it is faster to read the whole object and then do fancy indexing
+            obj_buf.nda[dest_sel] = h5f[name][...][source_sel]
+
+        nda = obj_buf.nda
+    elif n_rows == 0:
+        tmp_shape = (0,) + h5f[name].shape[1:]
+        nda = np.empty(tmp_shape, h5f[name].dtype)
+    elif change_idx_to_slice or idx is None or use_h5idx:
+        nda = h5f[name][source_sel]
+    else:
+        # it is faster to read the whole object and then do fancy indexing
+        nda = h5f[name][...][source_sel]
+
+    # Finally, set attributes and return objects
+    attrs = h5f[name].attrs
+
+    # special handling for bools
+    # (c and Julia store as uint8 so cast to bool)
+    if datatype.get_nested_datatype_string(attrs["datatype"]) == "bool":
+        nda = nda.astype(np.bool_)
+
+    return (nda, attrs, n_rows_to_read)
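A key optimization in `_h5_read_ndarray` above is replacing a fancy-index selection with a plain slice whenever the requested indices are contiguous and increasing, since slice reads are much cheaper than point selections in h5py. A standalone sketch of that check (NumPy only; the function name is illustrative):

```python
import numpy as np


def index_selection(idx: np.ndarray):
    """Return a slice when idx is contiguous and increasing, else the index array itself."""
    if len(idx) == 0:
        return np.s_[0:0]
    if np.all(np.diff(idx) == 1):
        return np.s_[idx[0] : idx[-1] + 1]
    return idx


data = np.arange(100)
# contiguous request: resolved to the slice 5:9
assert np.array_equal(data[index_selection(np.array([5, 6, 7, 8]))], data[5:9])
# scattered request: falls back to fancy indexing
assert np.array_equal(data[index_selection(np.array([2, 10, 42]))], [2, 10, 42])
```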
lgdo/lh5/_serializers/read/scalar.py (new file)

@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+import logging
+
+import numpy as np
+
+from ....types import Scalar
+from ...exceptions import LH5DecodeError
+
+log = logging.getLogger(__name__)
+
+
+def _h5_read_scalar(
+    name,
+    h5f,
+    obj_buf=None,
+):
+    value = h5f[name][()]
+
+    # special handling for bools
+    # (c and Julia store as uint8 so cast to bool)
+    if h5f[name].attrs["datatype"] == "bool":
+        value = np.bool_(value)
+
+    if obj_buf is not None:
+        if not isinstance(obj_buf, Scalar):
+            msg = "object buffer is not a Scalar"
+            raise LH5DecodeError(msg, h5f, name)
+
+        obj_buf.value = value
+        obj_buf.attrs.update(h5f[name].attrs)
+        return obj_buf, 1
+
+    return Scalar(value=value, attrs=h5f[name].attrs), 1
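The explicit bool cast in `_h5_read_scalar` (and the matching cast in `_h5_read_ndarray`) exists because HDF5 has no native boolean type, so C and Julia writers store booleans as `uint8`. A small h5py sketch of that round-trip, assuming the LH5 convention of a `datatype` attribute on the dataset:

```python
import h5py
import numpy as np

# write: a boolean flag stored as uint8, tagged with its logical datatype
with h5py.File("bools.h5", "w") as f:
    ds = f.create_dataset("flag", data=np.uint8(1))
    ds.attrs["datatype"] = "bool"

# read: cast back to a numpy boolean, as _h5_read_scalar does
with h5py.File("bools.h5", "r") as f:
    value = f["flag"][()]
    if f["flag"].attrs["datatype"] == "bool":
        value = np.bool_(value)

print(value)  # True (a numpy boolean scalar)
```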
lgdo/lh5/_serializers/read/utils.py (new file)

@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+from ...exceptions import LH5DecodeError
+
+
+def check_obj_buf_attrs(attrs, new_attrs, file, name):
+    if set(attrs.keys()) != set(new_attrs.keys()):
+        msg = (
+            f"existing buffer and new data chunk have different attributes: "
+            f"obj_buf.attrs={attrs} != {file.filename}[{name}].attrs={new_attrs}"
+        )
+        raise LH5DecodeError(msg, file, name)