legend-pydataobj 1.11.6__py3-none-any.whl → 1.11.8__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.11.8.dist-info}/METADATA +3 -2
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.11.8.dist-info}/RECORD +28 -28
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.11.8.dist-info}/WHEEL +1 -1
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.11.8.dist-info}/entry_points.txt +1 -1
- lgdo/__init__.py +5 -4
- lgdo/_version.py +9 -4
- lgdo/cli.py +10 -155
- lgdo/lh5/__init__.py +2 -3
- lgdo/lh5/_serializers/read/composite.py +1 -3
- lgdo/lh5/_serializers/read/utils.py +1 -1
- lgdo/lh5/_serializers/read/vector_of_vectors.py +1 -1
- lgdo/lh5/_serializers/write/composite.py +14 -8
- lgdo/lh5/concat.py +219 -0
- lgdo/lh5/core.py +33 -36
- lgdo/lh5/iterator.py +48 -27
- lgdo/lh5/store.py +22 -75
- lgdo/lh5/tools.py +0 -111
- lgdo/lh5/utils.py +6 -4
- lgdo/types/array.py +84 -15
- lgdo/types/encoded.py +25 -20
- lgdo/types/histogram.py +1 -1
- lgdo/types/lgdo.py +50 -0
- lgdo/types/table.py +49 -28
- lgdo/types/vectorofvectors.py +132 -94
- lgdo/types/vovutils.py +14 -4
- lgdo/types/waveformtable.py +19 -21
- lgdo/lh5_store.py +0 -284
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.11.8.dist-info/licenses}/LICENSE +0 -0
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.11.8.dist-info}/top_level.txt +0 -0
lgdo/lh5/concat.py
ADDED
@@ -0,0 +1,219 @@
+from __future__ import annotations
+
+import fnmatch
+import logging
+
+from lgdo.lh5 import LH5Iterator
+
+from .. import Array, Scalar, Struct, Table, VectorOfVectors, lh5
+
+log = logging.getLogger(__name__)
+
+
+def _get_obj_list(
+    lh5_files: list, include_list: list | None = None, exclude_list: list | None = None
+) -> list[str]:
+    """Extract a list of lh5 objects to concatenate.
+
+    Parameters
+    ----------
+    lh5_files
+        list of input files to concatenate.
+    include_list
+        patterns for tables to include.
+    exclude_list
+        patterns for tables to exclude.
+
+    """
+    file0 = lh5_files[0]
+    obj_list_full = set(lh5.ls(file0, recursive=True))
+
+    # let's remove objects with nested LGDOs inside
+    to_remove = set()
+    for name in obj_list_full:
+        if len(fnmatch.filter(obj_list_full, f"{name}/*")) > 1:
+            to_remove.add(name)
+    obj_list_full -= to_remove
+
+    obj_list = set()
+    # now first remove excluded stuff
+    if exclude_list is not None:
+        for exc in exclude_list:
+            obj_list_full -= set(fnmatch.filter(obj_list_full, exc.strip("/")))
+
+    # then make list of included, based on latest list
+    if include_list is not None:
+        for inc in include_list:
+            obj_list |= set(fnmatch.filter(obj_list_full, inc.strip("/")))
+    else:
+        obj_list = obj_list_full
+
+    # sort
+    return sorted(obj_list)
+
+
+def _get_lgdos(file, obj_list):
+    """Get name of LGDO objects."""
+
+    store = lh5.LH5Store()
+    h5f0 = store.gimme_file(file)
+
+    lgdos = []
+    lgdo_structs = {}
+
+    # loop over object list in the first file
+    for name in obj_list:
+        # now loop over groups starting from root
+        current = ""
+        for item in name.split("/"):
+            current = f"{current}/{item}".strip("/")
+
+            if current in lgdos:
+                break
+
+            # not even an LGDO (i.e. a plain HDF5 group)!
+            if "datatype" not in h5f0[current].attrs:
+                continue
+
+            # read as little as possible
+            obj = store.read(current, h5f0, n_rows=1)
+            if isinstance(obj, (Table, Array, VectorOfVectors)):
+                lgdos.append(current)
+
+            elif isinstance(obj, Struct):
+                # structs might be used in a "group-like" fashion (i.e. they might only
+                # contain array-like objects).
+                # note: handle after handling tables, as tables also satisfy this check.
+                lgdo_structs[current] = obj.attrs["datatype"]
+                continue
+
+            elif isinstance(obj, Scalar):
+                msg = f"cannot concat scalar field {current}"
+                log.warning(msg)
+
+            break
+
+    msg = f"first-level, array-like objects: {lgdos}"
+    log.info(msg)
+
+    msg = f"nested structs: {lgdo_structs}"
+    log.info(msg)
+
+    h5f0.close()
+
+    if lgdos == []:
+        msg = "did not find any field to concatenate, exit"
+        raise RuntimeError(msg)
+
+    return lgdos, lgdo_structs
+
+
+def _inplace_table_filter(name, table, obj_list):
+    """filter objects nested in this LGDO"""
+    skm = fnmatch.filter(obj_list, f"{name}/*")
+    kept = {it.removeprefix(name).strip("/").split("/")[0] for it in skm}
+
+    # now remove fields
+    for k in list(table.keys()):
+        if k not in kept:
+            table.remove_column(k)
+
+    msg = f"fields left in table '{name}': {table.keys()}"
+    log.debug(msg)
+
+    # recurse!
+    for k2, v2 in table.items():
+        if not isinstance(v2, Table):
+            continue
+
+        _inplace_table_filter(f"{name}/{k2}", v2, obj_list)
+
+
+def _remove_nested_fields(lgdos: dict, obj_list: list):
+    """Remove (nested) table fields based on obj_list."""
+
+    for key, val in lgdos.items():
+        if not isinstance(val, Table):
+            continue
+
+        _inplace_table_filter(key, val, obj_list)
+
+
+def lh5concat(
+    lh5_files: list,
+    output: str,
+    overwrite: bool = False,
+    *,
+    include_list: list | None = None,
+    exclude_list: list | None = None,
+) -> None:
+    """Concatenate LGDO Arrays, VectorOfVectors and Tables in LH5 files.
+
+    Parameters
+    ----------
+    lh5_files
+        list of input files to concatenate.
+    output
+        path to the output file
+    include_list
+        patterns for tables to include.
+    exclude_list
+        patterns for tables to exclude.
+    """
+
+    if len(lh5_files) < 2:
+        msg = "you must provide at least two input files"
+        raise RuntimeError(msg)
+
+    # determine list of objects by recursively ls'ing first file
+    obj_list = _get_obj_list(
+        lh5_files, include_list=include_list, exclude_list=exclude_list
+    )
+
+    msg = f"objects matching include patterns {include_list} in {lh5_files[0]}: {obj_list}"
+    log.info(msg)
+
+    lgdos, lgdo_structs = _get_lgdos(lh5_files[0], obj_list)
+    first_done = False
+    store = lh5.LH5Store()
+
+    # loop over lgdo objects
+    for lgdo in lgdos:
+        # iterate over the files
+        for lh5_obj in LH5Iterator(lh5_files, lgdo):
+            data = {lgdo: lh5_obj}
+
+            # remove the nested fields
+            _remove_nested_fields(data, obj_list)
+
+            if first_done is False:
+                msg = f"creating output file {output}"
+                log.info(msg)
+
+                store.write(
+                    data[lgdo],
+                    lgdo,
+                    output,
+                    wo_mode="overwrite_file"
+                    if (overwrite and not first_done)
+                    else "write_safe",
+                )
+                first_done = True
+
+            else:
+                msg = f"appending to {output}"
+                log.info(msg)
+
+                if isinstance(data[lgdo], Table):
+                    _inplace_table_filter(lgdo, data[lgdo], obj_list)
+
+                store.write(data[lgdo], lgdo, output, wo_mode="append")
+
+    if lgdo_structs != {}:
+        output_file = store.gimme_file(output, mode="a")
+        for struct, struct_dtype in lgdo_structs.items():
+            msg = f"reset datatype of struct {struct} to {struct_dtype}"
+            log.debug(msg)
+
+            output_file[struct].attrs["datatype"] = struct_dtype
+        output_file.close()
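The new `lh5concat` function appears to take over the concatenation logic previously embedded in `lgdo/cli.py` (note the 155 lines dropped from it in the summary above). A minimal usage sketch of the function as defined in this diff; the file names and the include pattern are hypothetical:

```python
from lgdo.lh5.concat import lh5concat

# concatenate two input files (at least two are required) into one output,
# keeping only objects that match the include pattern
lh5concat(
    ["run0.lh5", "run1.lh5"],     # hypothetical input files
    output="runs.lh5",
    overwrite=True,
    include_list=["geds/raw/*"],  # hypothetical LH5 group pattern
)
```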
lgdo/lh5/core.py
CHANGED
@@ -4,6 +4,7 @@ import bisect
 import inspect
 import sys
 from collections.abc import Mapping, Sequence
+from contextlib import suppress
 from typing import Any
 
 import h5py
@@ -92,8 +93,7 @@ def read(
         will be set to ``True``, while the rest will default to ``False``.
     obj_buf
         Read directly into memory provided in `obj_buf`. Note: the buffer
-        will be
-        buffer length, send in ``n_rows = len(obj_buf)``.
+        will be resized to accommodate the data retrieved.
     obj_buf_start
         Start location in ``obj_buf`` for read. For concatenating data to
         array-like objects.
@@ -106,25 +106,25 @@ def read(
 
     Returns
    -------
-
-
-        successfully read out. Essential for arrays when the amount of data
-        is smaller than the object buffer. For scalars and structs
-        `n_rows_read` will be``1``. For tables it is redundant with
-        ``table.loc``. If `obj_buf` is ``None``, only `object` is returned.
+    object
+        the read-out object
     """
     if isinstance(lh5_file, h5py.File):
         lh5_obj = lh5_file[name]
     elif isinstance(lh5_file, str):
         lh5_file = h5py.File(lh5_file, mode="r", locking=locking)
-
+        try:
+            lh5_obj = lh5_file[name]
+        except KeyError as ke:
+            err = f"Object {name} not found in file {lh5_file.filename}"
+            raise KeyError(err) from ke
     else:
-
-
-
-
+        if obj_buf is not None:
+            obj_buf.resize(obj_buf_start)
+        else:
+            obj_buf_start = 0
 
-        for i, h5f in enumerate(
+        for i, h5f in enumerate(lh5_file):
             if (
                 isinstance(idx, (list, tuple))
                 and len(idx) > 0
@@ -146,33 +146,26 @@ def read(
                 idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
             else:
                 idx_i = None
-                n_rows_i = n_rows - n_rows_read
 
-
+            obj_buf_start_i = len(obj_buf) if obj_buf else 0
+            n_rows_i = n_rows - (obj_buf_start_i - obj_buf_start)
+
+            obj_buf = read(
                 name,
                 h5f,
-                start_row,
+                start_row if i == 0 else 0,
                 n_rows_i,
                 idx_i,
                 use_h5idx,
                 field_mask,
                 obj_buf,
-
+                obj_buf_start_i,
                 decompress,
             )
-            if isinstance(obj_ret, tuple):
-                obj_buf, n_rows_read_i = obj_ret
-                obj_buf_is_new = True
-            else:
-                obj_buf = obj_ret
-                n_rows_read_i = len(obj_buf)
 
-
-
-
-            start_row = 0
-            obj_buf_start += n_rows_read_i
-        return obj_buf if obj_buf_is_new else (obj_buf, n_rows_read)
+            if obj_buf is None or (len(obj_buf) - obj_buf_start) >= n_rows:
+                return obj_buf
+        return obj_buf
 
     if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
         idx = idx[0]
@@ -192,8 +185,10 @@ def read(
         obj_buf_start=obj_buf_start,
         decompress=decompress,
     )
+    with suppress(AttributeError):
+        obj.resize(obj_buf_start + n_rows_read)
 
-    return obj
+    return obj
 
 
 def write(
@@ -273,11 +268,13 @@ def write(
           end of array is the same as ``append``.
         - ``overwrite_file`` or ``of``: delete file if present prior to
           writing to it. `write_start` should be 0 (its ignored).
-        - ``append_column`` or ``ac``: append columns from an
-          :class:`~.lgdo.
-          :class:`~.lgdo.table.Table`
-
-
+        - ``append_column`` or ``ac``: append fields/columns from an
+          :class:`~.lgdo.struct.Struct` `obj` (and derived types such as
+          :class:`~.lgdo.table.Table`) only if there is an existing
+          :class:`~.lgdo.struct.Struct` in the `lh5_file` with the same `name`.
+          If there are matching fields, it errors out. If appending to a
+          ``Table`` and the size of the new column is different from the size
+          of the existing table, it errors out.
     write_start
         row in the output file (if already existing) to start overwriting
        from.
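Taken together, these hunks change the contract of `read`: the `(obj, n_rows_read)` return tuple is gone, multi-file reads recurse per file while growing `obj_buf`, and any resizable object is trimmed to the rows actually read. A sketch of the new calling convention, assuming `read` is re-exported from `lgdo.lh5` (as the `from .core import read` in `store.py` below suggests); file and group names are hypothetical:

```python
import lgdo.lh5 as lh5

# read() now returns only the LGDO; a caller-supplied buffer is resized
# to fit, so len(obj) replaces the old n_rows_read return value
obj = lh5.read("geds/raw", ["run0.lh5", "run1.lh5"])
print(len(obj))  # total rows read across both files
```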
lgdo/lh5/iterator.py
CHANGED
@@ -24,7 +24,8 @@ class LH5Iterator(typing.Iterator):
 
     This can be used as an iterator:
 
-
+
+    >>> for lh5_obj in LH5Iterator(...):
     >>>     # do the thing!
 
     This is intended for if you are reading a large quantity of data. This
@@ -42,6 +43,8 @@ class LH5Iterator(typing.Iterator):
     In addition to accessing requested data via ``lh5_obj``, several
     properties exist to tell you where that data came from:
 
+    - lh5_it.current_i_entry: get the index within the entry list of the
+      first entry that is currently read
     - lh5_it.current_local_entries: get the entry numbers relative to the
       file the data came from
     - lh5_it.current_global_entries: get the entry number relative to the
@@ -49,9 +52,9 @@ class LH5Iterator(typing.Iterator):
     - lh5_it.current_files: get the file name corresponding to each entry
     - lh5_it.current_groups: get the group name corresponding to each entry
 
-    This class can also be used
+    This class can also be used for random access:
 
-    >>> lh5_obj
+    >>> lh5_obj = lh5_it.read(i_entry)
 
     to read the block of entries starting at i_entry. In case of multiple files
     or the use of an event selection, i_entry refers to a global event index
@@ -65,6 +68,8 @@ class LH5Iterator(typing.Iterator):
         base_path: str = "",
         entry_list: list[int] | list[list[int]] | None = None,
         entry_mask: list[bool] | list[list[bool]] | None = None,
+        i_start: int = 0,
+        n_entries: int | None = None,
         field_mask: dict[str, bool] | list[str] | tuple[str] | None = None,
         buffer_len: int = "100*MB",
         file_cache: int = 10,
@@ -89,6 +94,10 @@ class LH5Iterator(typing.Iterator):
         entry_mask
             mask of entries to read. If a list of arrays is provided, expect
             one for each file. Ignore if a selection list is provided.
+        i_start
+            index of first entry to start at when iterating
+        n_entries
+            number of entries to read before terminating iteration
         field_mask
             mask of which fields to read. See :meth:`LH5Store.read` for
             more details.
@@ -183,7 +192,8 @@ class LH5Iterator(typing.Iterator):
             msg = f"can't open any files from {lh5_files}"
             raise RuntimeError(msg)
 
-        self.
+        self.i_start = i_start
+        self.n_entries = n_entries
         self.current_i_entry = 0
         self.next_i_entry = 0
 
@@ -317,14 +327,21 @@ class LH5Iterator(typing.Iterator):
             )
             return self.global_entry_list
 
-    def read(self, i_entry: int) ->
-        "
-
-
-
+    def read(self, i_entry: int, n_entries: int | None = None) -> LGDO:
+        "Read the next local chunk of events, starting at entry."
+        self.lh5_buffer.resize(0)
+
+        if n_entries is None:
+            n_entries = self.buffer_len
+        elif n_entries == 0:
+            return self.lh5_buffer
+        elif n_entries > self.buffer_len:
+            msg = "n_entries cannot be larger than buffer_len"
+            raise ValueError(msg)
 
         # if file hasn't been opened yet, search through files
         # sequentially until we find the right one
+        i_file = np.searchsorted(self.entry_map, i_entry, "right")
         if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("q").max:
             while i_file < len(self.lh5_files) and i_entry >= self._get_file_cumentries(
                 i_file
@@ -332,10 +349,10 @@ class LH5Iterator(typing.Iterator):
                 i_file += 1
 
         if i_file == len(self.lh5_files):
-            return
+            return self.lh5_buffer
         local_i_entry = i_entry - self._get_file_cumentries(i_file - 1)
 
-        while self.
+        while len(self.lh5_buffer) < n_entries and i_file < len(self.file_map):
             # Loop through files
             local_idx = self.get_file_entrylist(i_file)
             if local_idx is not None and len(local_idx) == 0:
@@ -344,18 +361,17 @@ class LH5Iterator(typing.Iterator):
                 continue
 
             i_local = local_i_entry if local_idx is None else local_idx[local_i_entry]
-            self.lh5_buffer
+            self.lh5_buffer = self.lh5_st.read(
                 self.groups[i_file],
                 self.lh5_files[i_file],
                 start_row=i_local,
-                n_rows=
+                n_rows=n_entries - len(self.lh5_buffer),
                 idx=local_idx,
                 field_mask=self.field_mask,
                 obj_buf=self.lh5_buffer,
-                obj_buf_start=self.
+                obj_buf_start=len(self.lh5_buffer),
             )
 
-            self.n_rows += n_rows
             i_file += 1
             local_i_entry = 0
 
@@ -364,7 +380,7 @@ class LH5Iterator(typing.Iterator):
 
         if self.friend is not None:
             self.friend.read(i_entry)
-        return
+        return self.lh5_buffer
 
     def reset_field_mask(self, mask):
         """Replaces the field mask of this iterator and any friends with mask"""
@@ -375,7 +391,7 @@ class LH5Iterator(typing.Iterator):
     @property
     def current_local_entries(self) -> NDArray[int]:
         """Return list of local file entries in buffer"""
-        cur_entries = np.zeros(self.
+        cur_entries = np.zeros(len(self.lh5_buffer), dtype="int32")
         i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
         file_start = self._get_file_cumentries(i_file - 1)
         i_local = self.current_i_entry - file_start
@@ -402,7 +418,7 @@ class LH5Iterator(typing.Iterator):
     @property
     def current_global_entries(self) -> NDArray[int]:
         """Return list of local file entries in buffer"""
-        cur_entries = np.zeros(self.
+        cur_entries = np.zeros(len(self.lh5_buffer), dtype="int32")
         i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
         file_start = self._get_file_cumentries(i_file - 1)
         i_local = self.current_i_entry - file_start
@@ -433,7 +449,7 @@ class LH5Iterator(typing.Iterator):
     @property
     def current_files(self) -> NDArray[str]:
         """Return list of file names for entries in buffer"""
-        cur_files = np.zeros(self.
+        cur_files = np.zeros(len(self.lh5_buffer), dtype=object)
         i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
         file_start = self._get_file_cumentries(i_file - 1)
         i_local = self.current_i_entry - file_start
@@ -455,7 +471,7 @@ class LH5Iterator(typing.Iterator):
     @property
     def current_groups(self) -> NDArray[str]:
         """Return list of group names for entries in buffer"""
-        cur_groups = np.zeros(self.
+        cur_groups = np.zeros(len(self.lh5_buffer), dtype=object)
         i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
         file_start = self._get_file_cumentries(i_file - 1)
         i_local = self.current_i_entry - file_start
@@ -485,14 +501,19 @@ class LH5Iterator(typing.Iterator):
     def __iter__(self) -> typing.Iterator:
         """Loop through entries in blocks of size buffer_len."""
         self.current_i_entry = 0
-        self.next_i_entry =
+        self.next_i_entry = self.i_start
         return self
 
     def __next__(self) -> tuple[LGDO, int, int]:
-        """Read next buffer_len entries and return lh5_table
-
-
-
-
+        """Read next buffer_len entries and return lh5_table and iterator entry."""
+        n_entries = self.n_entries
+        if n_entries is not None:
+            n_entries = min(
+                self.buffer_len, n_entries + self.i_start - self.next_i_entry
+            )
+
+        buf = self.read(self.next_i_entry, n_entries)
+        if len(buf) == 0:
             raise StopIteration
-
+        self.next_i_entry = self.current_i_entry + len(buf)
+        return buf
lgdo/lh5/store.py
CHANGED
@@ -5,21 +5,20 @@ HDF5 files.
 
 from __future__ import annotations
 
-import bisect
 import logging
-import os
 import sys
 from collections import OrderedDict
 from collections.abc import Mapping, Sequence
 from inspect import signature
+from pathlib import Path
 from typing import Any
 
 import h5py
-import numpy as np
 from numpy.typing import ArrayLike
 
 from .. import types
 from . import _serializers, utils
+from .core import read
 
 log = logging.getLogger(__name__)
 
@@ -93,16 +92,16 @@ class LH5Store:
             return self.files[lh5_file]
 
         if self.base_path != "":
-            full_path =
+            full_path = Path(self.base_path) / lh5_file
         else:
-            full_path = lh5_file
+            full_path = Path(lh5_file)
 
-        file_exists =
+        file_exists = full_path.exists()
         if mode != "r":
-            directory =
-            if directory != "" and not
+            directory = full_path.parent
+            if directory != "" and not full_path.parent.exists():
                 log.debug(f"making path {directory}")
-
+                directory.mkdir(parents=True, exist_ok=True)
 
         if mode == "r" and not file_exists:
             msg = f"file {full_path} not found"
@@ -155,7 +154,7 @@ class LH5Store:
         """Returns an LH5 object appropriate for use as a pre-allocated buffer
         in a read loop. Sets size to `size` if object has a size.
         """
-        obj
+        obj = self.read(name, lh5_file, n_rows=0, field_mask=field_mask)
         if hasattr(obj, "resize") and size is not None:
             obj.resize(new_size=size)
         return obj
@@ -182,72 +181,20 @@ class LH5Store:
         """
         # grab files from store
         if isinstance(lh5_file, (str, h5py.File)):
-
+            h5f = self.gimme_file(lh5_file, "r", **file_kwargs)
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-                # make idx a proper tuple if it's not one already
-                if not (isinstance(idx, tuple) and len(idx) == 1):
-                    idx = (idx,)
-                # idx is a long continuous array
-                n_rows_i = utils.read_n_rows(name, h5f)
-                # find the length of the subset of idx that contains indices
-                # that are less than n_rows_i
-                n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
-                # now split idx into idx_i and the remainder
-                idx_i = np.array(idx[0])[:n_rows_to_read_i]
-                idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
-            else:
-                idx_i = None
-                n_rows_i = n_rows - n_rows_read
-
-            obj_buf, n_rows_read_i = self.read(
-                name,
-                h5f,
-                start_row,
-                n_rows_i,
-                idx_i,
-                use_h5idx,
-                field_mask,
-                obj_buf,
-                obj_buf_start,
-                decompress,
-            )
-
-            n_rows_read += n_rows_read_i
-            if n_rows_read >= n_rows or obj_buf is None:
-                return obj_buf, n_rows_read
-            start_row = 0
-            obj_buf_start += n_rows_read_i
-        return obj_buf, n_rows_read
-
-        if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
-            idx = idx[0]
-        if isinstance(idx, np.ndarray) and idx.dtype == np.dtype("?"):
-            idx = np.where(idx)[0]
-
-        return _serializers._h5_read_lgdo(
-            lh5_obj.id,
-            lh5_obj.file.filename,
-            lh5_obj.name,
-            start_row=start_row,
-            n_rows=n_rows,
-            idx=idx,
-            use_h5idx=use_h5idx,
-            field_mask=field_mask,
-            obj_buf=obj_buf,
-            obj_buf_start=obj_buf_start,
-            decompress=decompress,
+            h5f = [self.gimme_file(f, "r", **file_kwargs) for f in lh5_file]
+        return read(
+            name,
+            h5f,
+            start_row,
+            n_rows,
+            idx,
+            use_h5idx,
+            field_mask,
+            obj_buf,
+            obj_buf_start,
+            decompress,
         )
 
     def write(
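`LH5Store.read` is now a thin wrapper that delegates to the module-level `read` from `lgdo.lh5.core`, so it inherits the same single-object return convention, and `gimme_file` builds paths with `pathlib` instead of string manipulation. A sketch of the store-level API after this change; it assumes the buffer helper shown in the middle hunk is `LH5Store.get_buffer`, and all file/group names are hypothetical:

```python
from lgdo.lh5 import LH5Store

store = LH5Store()

# returns the LGDO directly; no (obj, n_rows_read) tuple anymore
obj = store.read("geds/raw", "run0.lh5")

# pre-allocate a read-loop buffer of 1000 rows (reads 0 rows, then resizes,
# per the get-buffer hunk above)
buf = store.get_buffer("geds/raw", "run0.lh5", size=1000)
```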