legend-pydataobj 1.11.7__py3-none-any.whl → 1.12.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {legend_pydataobj-1.11.7.dist-info → legend_pydataobj-1.12.0a1.dist-info}/METADATA +1 -1
- {legend_pydataobj-1.11.7.dist-info → legend_pydataobj-1.12.0a1.dist-info}/RECORD +19 -19
- lgdo/_version.py +2 -2
- lgdo/lh5/_serializers/read/composite.py +1 -3
- lgdo/lh5/concat.py +3 -9
- lgdo/lh5/core.py +21 -30
- lgdo/lh5/iterator.py +48 -27
- lgdo/lh5/store.py +15 -68
- lgdo/types/array.py +74 -13
- lgdo/types/encoded.py +25 -20
- lgdo/types/histogram.py +1 -1
- lgdo/types/lgdo.py +50 -0
- lgdo/types/table.py +49 -28
- lgdo/types/vectorofvectors.py +69 -76
- lgdo/types/vovutils.py +14 -4
- {legend_pydataobj-1.11.7.dist-info → legend_pydataobj-1.12.0a1.dist-info}/WHEEL +0 -0
- {legend_pydataobj-1.11.7.dist-info → legend_pydataobj-1.12.0a1.dist-info}/entry_points.txt +0 -0
- {legend_pydataobj-1.11.7.dist-info → legend_pydataobj-1.12.0a1.dist-info}/licenses/LICENSE +0 -0
- {legend_pydataobj-1.11.7.dist-info → legend_pydataobj-1.12.0a1.dist-info}/top_level.txt +0 -0
{legend_pydataobj-1.11.7.dist-info → legend_pydataobj-1.12.0a1.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
-legend_pydataobj-1.
+legend_pydataobj-1.12.0a1.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
 lgdo/__init__.py,sha256=QMYK9HhoMi0pbahPN8mPD18gyTxscFgo7QKfCxVhy-0,3196
-lgdo/_version.py,sha256=
+lgdo/_version.py,sha256=kTYHwRhTzZEJHpwJeVgXBi4yFTeQDpnR6MYkvCMA06Q,515
 lgdo/cli.py,sha256=s_EWTBWW76l7zWb6gaTSTjiT-0RzzcYEmjeFEQCVxfk,4647
 lgdo/lgdo_utils.py,sha256=6a2YWEwpyEMXlAyTHZMO01aqxy6SxJzPZkGNWKNWuS0,2567
 lgdo/lh5_store.py,sha256=5BzbJA9sLcqjp8bJDc2olwOiw0VS6rmfg3cfh1kQkRY,8512
@@ -14,18 +14,18 @@ lgdo/compression/radware.py,sha256=GcNTtjuyL7VBBqziUBmSqNXuhqy1bJJgvcyvyumPtrc,2
 lgdo/compression/utils.py,sha256=W2RkBrxPpXlat84dnU9Ad7d_tTws0irtGl7O1dNWjnk,1140
 lgdo/compression/varlen.py,sha256=6ZZUItyoOfygDdE0DyoISeFZfqdbH6xl7T0eclfarzg,15127
 lgdo/lh5/__init__.py,sha256=y1XE_mpFWwamrl7WVjAVSVB25X4PrEfdVXSneSQEmlQ,825
-lgdo/lh5/concat.py,sha256=
-lgdo/lh5/core.py,sha256=
+lgdo/lh5/concat.py,sha256=BZCgK7TWPKK8fMmha8K83d3bC31FVO1b5LOW7x-Ru1s,6186
+lgdo/lh5/core.py,sha256=GjosZGUp4GSO5FtWV9eXUt_6DGU_OwJXODlj5K1j93M,13320
 lgdo/lh5/datatype.py,sha256=O_7BqOlX8PFMyG0ppkfUT5aps5HEqX0bpuKcJO3jhu0,1691
 lgdo/lh5/exceptions.py,sha256=3kj8avXl4eBGvebl3LG12gJEmw91W0T8PYR0AfvUAyM,1211
-lgdo/lh5/iterator.py,sha256=
-lgdo/lh5/store.py,sha256=
+lgdo/lh5/iterator.py,sha256=1ob9B7Bf3ioGCtZkUZoL6ibTxAwLf4ld8_33ghVVEa4,20498
+lgdo/lh5/store.py,sha256=MYbMt-Mc7izELxuyLlSrrYrylCIzxc2CLzZYIVbZ33w,8455
 lgdo/lh5/tools.py,sha256=T9CgHA8A3_tVBMtiNJ6hATQKhdqI61m3cX4p2wGKc6c,9937
 lgdo/lh5/utils.py,sha256=ioz8DlyXZsejwnU2qYdIccdHcF12H62jgLkZsiDOLSM,6243
 lgdo/lh5/_serializers/__init__.py,sha256=eZzxMp1SeZWG0PkEXUiCz3XyprQ8EmelHUmJogC8xYE,1263
 lgdo/lh5/_serializers/read/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lgdo/lh5/_serializers/read/array.py,sha256=uWfMCihfAmW2DE2ewip2qCK_kvQC_mb2zvOv26uzijc,1000
-lgdo/lh5/_serializers/read/composite.py,sha256=
+lgdo/lh5/_serializers/read/composite.py,sha256=UvkZHEhf0V7SFLxzF52eyP68hU0guGOLqosrfmIfeys,11729
 lgdo/lh5/_serializers/read/encoded.py,sha256=Q98c08d8LkZq2AlY4rThYECVaEqwbv4T2Urn7TGnsyE,4130
 lgdo/lh5/_serializers/read/ndarray.py,sha256=lFCXD6bSzmMOH7cVmvRYXakkfMCI8EoqTPNONRJ1F0s,3690
 lgdo/lh5/_serializers/read/scalar.py,sha256=kwhWm1T91pXf86CqtUUD8_qheSR92gXZrQVtssV5YCg,922
@@ -37,20 +37,20 @@ lgdo/lh5/_serializers/write/composite.py,sha256=I6lH0nWFIpAfZyG4-0rLxzg3mfazZ_FE
 lgdo/lh5/_serializers/write/scalar.py,sha256=JPt_fcdTKOSFp5hfJdcKIfK4hxhcD8vhOlvDF-7btQ8,763
 lgdo/lh5/_serializers/write/vector_of_vectors.py,sha256=puGQX9XF5P_5DVbm_Cc6TvPrsDywgBLSYtkqFNltbB4,3493
 lgdo/types/__init__.py,sha256=DNfOErPiAZg-7Gygkp6ZKAi20Yrm1mfderZHvKo1Y4s,821
-lgdo/types/array.py,sha256=
+lgdo/types/array.py,sha256=e3p93yrfzSmyBgWdGqqtETcKpM7_FxENaAErru15rvo,8904
 lgdo/types/arrayofequalsizedarrays.py,sha256=DOGJiTmc1QCdm7vLbE6uIRXoMPtt8uuCfmwQawgWf5s,4949
-lgdo/types/encoded.py,sha256=
+lgdo/types/encoded.py,sha256=_e8u_BPfpjJbLnEdyTo9QG3kbNsGj0BN4gjdj3L1ndw,15640
 lgdo/types/fixedsizearray.py,sha256=7RjUwTz1bW0pcrdy27JlfrXPAuOU89Kj7pOuSUCojK8,1527
-lgdo/types/histogram.py,sha256=
-lgdo/types/lgdo.py,sha256=
+lgdo/types/histogram.py,sha256=Jz1lLH56BfYnmcUhxUHK1h2wLDQ0Abgyd-6LznU-3-k,19979
+lgdo/types/lgdo.py,sha256=21YNtJCHnSO3M60rjsAdbMO5crDjL_0BtuFpudZ2xvU,4500
 lgdo/types/scalar.py,sha256=c5Es2vyDqyWTPV6mujzfIzMpC1jNWkEIcvYyWQUxH3Q,1933
 lgdo/types/struct.py,sha256=Q0OWLVd4B0ciLb8t6VsxU3MPbmGLZ7WfQNno1lSQS0Q,4918
-lgdo/types/table.py,sha256=
-lgdo/types/vectorofvectors.py,sha256=
-lgdo/types/vovutils.py,sha256=
+lgdo/types/table.py,sha256=FkWesoEA9bmGGSW8Ewig1Zs77ffUoR_nggfYSmkWpjU,20079
+lgdo/types/vectorofvectors.py,sha256=CtPR2WDBmJmzzfXwH4aUcNMB5LvTiGWmL_qRbFah3to,24756
+lgdo/types/vovutils.py,sha256=WjvPLEJrRNjktnbyfypfgxZX-K_aOvcwPygfzoknsyA,10701
 lgdo/types/waveformtable.py,sha256=f2tS4f1OEoYaTM5ldCX9zmw8iSISCT3t3wS1SrPdu_o,9901
-legend_pydataobj-1.
-legend_pydataobj-1.
-legend_pydataobj-1.
-legend_pydataobj-1.
-legend_pydataobj-1.
+legend_pydataobj-1.12.0a1.dist-info/METADATA,sha256=55pMph32j8h4LKGnoVEdvHX27bHr8k__sdT4L9O5dIA,44445
+legend_pydataobj-1.12.0a1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+legend_pydataobj-1.12.0a1.dist-info/entry_points.txt,sha256=0KWfnwbuwhNn0vPUqARukjp04Ca6lzfZBSirouRmk7I,76
+legend_pydataobj-1.12.0a1.dist-info/top_level.txt,sha256=KyR-EUloqiXcQ62IWnzBmtInDtvsHl4q2ZJAZgTcLXE,5
+legend_pydataobj-1.12.0a1.dist-info/RECORD,,
lgdo/_version.py
CHANGED
lgdo/lh5/_serializers/read/composite.py
CHANGED
@@ -353,15 +353,13 @@ def _h5_read_table(
         table = Table(col_dict=col_dict, attrs=attrs)

         # set (write) loc to end of tree
-        table.
+        table.resize(do_warn=True)
         return table, n_rows_read

     # We have read all fields into the object buffer. Run
     # checks: All columns should be the same size. So update
     # table's size as necessary, warn if any mismatches are found
     obj_buf.resize(do_warn=True)
-    # set (write) loc to end of tree
-    obj_buf.loc = obj_buf_start + n_rows_read

     # check attributes
     utils.check_obj_buf_attrs(obj_buf.attrs, attrs, fname, oname)
lgdo/lh5/concat.py
CHANGED
@@ -76,7 +76,7 @@ def _get_lgdos(file, obj_list):
             continue

         # read as little as possible
-        obj
+        obj = store.read(current, h5f0, n_rows=1)
         if isinstance(obj, (Table, Array, VectorOfVectors)):
             lgdos.append(current)

@@ -139,12 +139,6 @@ def _remove_nested_fields(lgdos: dict, obj_list: list):
         _inplace_table_filter(key, val, obj_list)


-def _slice(obj, n_rows):
-    ak_obj = obj.view_as("ak")[:n_rows]
-    obj_type = type(obj)
-    return obj_type(ak_obj)
-
-
 def lh5concat(
     lh5_files: list,
     output: str,
@@ -186,8 +180,8 @@ def lh5concat(
     # loop over lgdo objects
     for lgdo in lgdos:
        # iterate over the files
-        for lh5_obj
-            data = {lgdo:
+        for lh5_obj in LH5Iterator(lh5_files, lgdo):
+            data = {lgdo: lh5_obj}

            # remove the nested fields
            _remove_nested_fields(data, obj_list)
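The local `_slice` helper and the manual per-file read loop are gone: `lh5concat` now walks each object with `LH5Iterator`, which handles file boundaries and buffer sizing itself. A minimal sketch of the same pattern, where the file names and object path are hypothetical examples:

```python
from lgdo.lh5 import LH5Iterator

lh5_files = ["run0.lh5", "run1.lh5"]  # hypothetical input files
obj = "ch0/raw/energy"                # hypothetical object path

# each iteration yields a buffer holding the next chunk of `obj`,
# read across file boundaries transparently
for lh5_obj in LH5Iterator(lh5_files, obj):
    data = {obj: lh5_obj}  # the same per-chunk mapping lh5concat builds
```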
lgdo/lh5/core.py
CHANGED
@@ -4,6 +4,7 @@ import bisect
 import inspect
 import sys
 from collections.abc import Mapping, Sequence
+from contextlib import suppress
 from typing import Any

 import h5py
@@ -92,8 +93,7 @@ def read(
         will be set to ``True``, while the rest will default to ``False``.
     obj_buf
         Read directly into memory provided in `obj_buf`. Note: the buffer
-        will be
-        buffer length, send in ``n_rows = len(obj_buf)``.
+        will be resized to accommodate the data retrieved.
     obj_buf_start
         Start location in ``obj_buf`` for read. For concatenating data to
         array-like objects.
@@ -106,12 +106,8 @@ def read(

     Returns
     -------
-
-
-        successfully read out. Essential for arrays when the amount of data
-        is smaller than the object buffer. For scalars and structs
-        `n_rows_read` will be``1``. For tables it is redundant with
-        ``table.loc``. If `obj_buf` is ``None``, only `object` is returned.
+    object
+        the read-out object
     """
     if isinstance(lh5_file, h5py.File):
         lh5_obj = lh5_file[name]
@@ -119,12 +115,12 @@ def read(
         lh5_file = h5py.File(lh5_file, mode="r", locking=locking)
         lh5_obj = lh5_file[name]
     else:
-
-
-
-
+        if obj_buf is not None:
+            obj_buf.resize(obj_buf_start)
+        else:
+            obj_buf_start = 0

-        for i, h5f in enumerate(
+        for i, h5f in enumerate(lh5_file):
             if (
                 isinstance(idx, (list, tuple))
                 and len(idx) > 0
@@ -146,33 +142,26 @@ def read(
                 idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
             else:
                 idx_i = None
-                n_rows_i = n_rows - n_rows_read

-
+            obj_buf_start_i = len(obj_buf) if obj_buf else 0
+            n_rows_i = n_rows - (obj_buf_start_i - obj_buf_start)
+
+            obj_buf = read(
                 name,
                 h5f,
-                start_row,
+                start_row if i == 0 else 0,
                 n_rows_i,
                 idx_i,
                 use_h5idx,
                 field_mask,
                 obj_buf,
-
+                obj_buf_start_i,
                 decompress,
             )
-            if isinstance(obj_ret, tuple):
-                obj_buf, n_rows_read_i = obj_ret
-                obj_buf_is_new = True
-            else:
-                obj_buf = obj_ret
-                n_rows_read_i = len(obj_buf)

-
-
-
-            start_row = 0
-            obj_buf_start += n_rows_read_i
-        return obj_buf if obj_buf_is_new else (obj_buf, n_rows_read)
+            if obj_buf is None or (len(obj_buf) - obj_buf_start) >= n_rows:
+                return obj_buf
+        return obj_buf

     if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
         idx = idx[0]
@@ -192,8 +181,10 @@ def read(
         obj_buf_start=obj_buf_start,
         decompress=decompress,
     )
+    with suppress(AttributeError):
+        obj.resize(obj_buf_start + n_rows_read)

-    return obj
+    return obj
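The return convention of `read` changes here: the old `(object, n_rows_read)` tuple described in the removed docstring is dropped, and the buffer is resized to exactly the rows read, so the row count is recoverable with `len()`. A migration sketch, with hypothetical file and object names:

```python
import lgdo.lh5 as lh5

# pre-1.12, reading into a buffer returned a tuple:
#   obj, n_rows_read = lh5.read(name, files, obj_buf=buf)
# now the (possibly buffered) object comes back alone, already resized
obj = lh5.read("ch0/raw/energy", ["run0.lh5", "run1.lh5"])
n_rows_read = len(obj)  # replaces the old second return value
```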
lgdo/lh5/iterator.py
CHANGED
@@ -24,7 +24,8 @@ class LH5Iterator(typing.Iterator):

     This can be used as an iterator:

-
+
+    >>> for lh5_obj in LH5Iterator(...):
     >>>     # do the thing!

     This is intended for if you are reading a large quantity of data. This
@@ -42,6 +43,8 @@ class LH5Iterator(typing.Iterator):
     In addition to accessing requested data via ``lh5_obj``, several
     properties exist to tell you where that data came from:

+    - lh5_it.current_i_entry: get the index within the entry list of the
+      first entry that is currently read
     - lh5_it.current_local_entries: get the entry numbers relative to the
       file the data came from
     - lh5_it.current_global_entries: get the entry number relative to the
@@ -49,9 +52,9 @@ class LH5Iterator(typing.Iterator):
     - lh5_it.current_files: get the file name corresponding to each entry
     - lh5_it.current_groups: get the group name corresponding to each entry

-    This class can also be used
+    This class can also be used for random access:

-    >>> lh5_obj
+    >>> lh5_obj = lh5_it.read(i_entry)

     to read the block of entries starting at i_entry. In case of multiple files
     or the use of an event selection, i_entry refers to a global event index
@@ -65,6 +68,8 @@ class LH5Iterator(typing.Iterator):
         base_path: str = "",
         entry_list: list[int] | list[list[int]] | None = None,
         entry_mask: list[bool] | list[list[bool]] | None = None,
+        i_start: int = 0,
+        n_entries: int | None = None,
         field_mask: dict[str, bool] | list[str] | tuple[str] | None = None,
         buffer_len: int = "100*MB",
         file_cache: int = 10,
@@ -89,6 +94,10 @@ class LH5Iterator(typing.Iterator):
         entry_mask
             mask of entries to read. If a list of arrays is provided, expect
             one for each file. Ignore if a selection list is provided.
+        i_start
+            index of first entry to start at when iterating
+        n_entries
+            number of entries to read before terminating iteration
         field_mask
             mask of which fields to read. See :meth:`LH5Store.read` for
             more details.
@@ -183,7 +192,8 @@ class LH5Iterator(typing.Iterator):
             msg = f"can't open any files from {lh5_files}"
             raise RuntimeError(msg)

-        self.
+        self.i_start = i_start
+        self.n_entries = n_entries
         self.current_i_entry = 0
         self.next_i_entry = 0

@@ -317,14 +327,21 @@ class LH5Iterator(typing.Iterator):
             )
             return self.global_entry_list

-    def read(self, i_entry: int) ->
-        "
-
-
-
+    def read(self, i_entry: int, n_entries: int | None = None) -> LGDO:
+        "Read the next local chunk of events, starting at i_entry."
+        self.lh5_buffer.resize(0)
+
+        if n_entries is None:
+            n_entries = self.buffer_len
+        elif n_entries == 0:
+            return self.lh5_buffer
+        elif n_entries > self.buffer_len:
+            msg = "n_entries cannot be larger than buffer_len"
+            raise ValueError(msg)

         # if file hasn't been opened yet, search through files
         # sequentially until we find the right one
+        i_file = np.searchsorted(self.entry_map, i_entry, "right")
         if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("q").max:
             while i_file < len(self.lh5_files) and i_entry >= self._get_file_cumentries(
                 i_file
@@ -332,10 +349,10 @@ class LH5Iterator(typing.Iterator):
                 i_file += 1

         if i_file == len(self.lh5_files):
-            return
+            return self.lh5_buffer
         local_i_entry = i_entry - self._get_file_cumentries(i_file - 1)

-        while self.
+        while len(self.lh5_buffer) < n_entries and i_file < len(self.file_map):
             # Loop through files
             local_idx = self.get_file_entrylist(i_file)
             if local_idx is not None and len(local_idx) == 0:
@@ -344,18 +361,17 @@ class LH5Iterator(typing.Iterator):
                 continue

             i_local = local_i_entry if local_idx is None else local_idx[local_i_entry]
-            self.lh5_buffer
+            self.lh5_buffer = self.lh5_st.read(
                 self.groups[i_file],
                 self.lh5_files[i_file],
                 start_row=i_local,
-                n_rows=
+                n_rows=n_entries - len(self.lh5_buffer),
                 idx=local_idx,
                 field_mask=self.field_mask,
                 obj_buf=self.lh5_buffer,
-                obj_buf_start=self.
+                obj_buf_start=len(self.lh5_buffer),
             )

-            self.n_rows += n_rows
             i_file += 1
             local_i_entry = 0

@@ -364,7 +380,7 @@ class LH5Iterator(typing.Iterator):
         if self.friend is not None:
             self.friend.read(i_entry)

-        return
+        return self.lh5_buffer

     def reset_field_mask(self, mask):
         """Replaces the field mask of this iterator and any friends with mask"""
@@ -375,7 +391,7 @@ class LH5Iterator(typing.Iterator):
     @property
     def current_local_entries(self) -> NDArray[int]:
         """Return list of local file entries in buffer"""
-        cur_entries = np.zeros(self.
+        cur_entries = np.zeros(len(self.lh5_buffer), dtype="int32")
         i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
         file_start = self._get_file_cumentries(i_file - 1)
         i_local = self.current_i_entry - file_start
@@ -402,7 +418,7 @@ class LH5Iterator(typing.Iterator):
     @property
     def current_global_entries(self) -> NDArray[int]:
         """Return list of local file entries in buffer"""
-        cur_entries = np.zeros(self.
+        cur_entries = np.zeros(len(self.lh5_buffer), dtype="int32")
         i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
         file_start = self._get_file_cumentries(i_file - 1)
         i_local = self.current_i_entry - file_start
@@ -433,7 +449,7 @@ class LH5Iterator(typing.Iterator):
     @property
     def current_files(self) -> NDArray[str]:
         """Return list of file names for entries in buffer"""
-        cur_files = np.zeros(self.
+        cur_files = np.zeros(len(self.lh5_buffer), dtype=object)
         i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
         file_start = self._get_file_cumentries(i_file - 1)
         i_local = self.current_i_entry - file_start
@@ -455,7 +471,7 @@ class LH5Iterator(typing.Iterator):
     @property
     def current_groups(self) -> NDArray[str]:
         """Return list of group names for entries in buffer"""
-        cur_groups = np.zeros(self.
+        cur_groups = np.zeros(len(self.lh5_buffer), dtype=object)
         i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
         file_start = self._get_file_cumentries(i_file - 1)
         i_local = self.current_i_entry - file_start
@@ -485,14 +501,19 @@ class LH5Iterator(typing.Iterator):
     def __iter__(self) -> typing.Iterator:
         """Loop through entries in blocks of size buffer_len."""
         self.current_i_entry = 0
-        self.next_i_entry =
+        self.next_i_entry = self.i_start
         return self

     def __next__(self) -> tuple[LGDO, int, int]:
-        """Read next buffer_len entries and return lh5_table
-
-
-
-
+        """Read next buffer_len entries and return lh5_table and iterator entry."""
+        n_entries = self.n_entries
+        if n_entries is not None:
+            n_entries = min(
+                self.buffer_len, n_entries + self.i_start - self.next_i_entry
+            )
+
+        buf = self.read(self.next_i_entry, n_entries)
+        if len(buf) == 0:
             raise StopIteration
-
+        self.next_i_entry = self.current_i_entry + len(buf)
+        return buf
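Together, the new `i_start`/`n_entries` constructor arguments and the reworked `read` signature let the iterator cover a bounded entry window and serve random access from the same internal buffer. A usage sketch, where the file names and group path are hypothetical:

```python
from lgdo.lh5 import LH5Iterator

it = LH5Iterator(
    ["run0.lh5", "run1.lh5"],  # hypothetical files
    "ch0/raw",                 # hypothetical group
    i_start=5000,              # begin iterating at global entry 5000
    n_entries=1000,            # stop after 1000 entries
)
for lh5_obj in it:
    # the buffer is resized to the chunk actually read
    print(it.current_i_entry, len(lh5_obj))

# random access: read() now returns the buffer instead of (buffer, n_rows)
chunk = it.read(0, n_entries=100)
```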
lgdo/lh5/store.py
CHANGED
@@ -5,7 +5,6 @@ HDF5 files.

 from __future__ import annotations

-import bisect
 import logging
 import os
 import sys
@@ -15,11 +14,11 @@ from inspect import signature
 from typing import Any

 import h5py
-import numpy as np
 from numpy.typing import ArrayLike

 from .. import types
 from . import _serializers, utils
+from .core import read

 log = logging.getLogger(__name__)

@@ -155,7 +154,7 @@ class LH5Store:
         """Returns an LH5 object appropriate for use as a pre-allocated buffer
         in a read loop. Sets size to `size` if object has a size.
         """
-        obj
+        obj = self.read(name, lh5_file, n_rows=0, field_mask=field_mask)
         if hasattr(obj, "resize") and size is not None:
             obj.resize(new_size=size)
         return obj
@@ -182,72 +181,20 @@ class LH5Store:
         """
         # grab files from store
         if isinstance(lh5_file, (str, h5py.File)):
-
+            h5f = self.gimme_file(lh5_file, "r", **file_kwargs)
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-            # make idx a proper tuple if it's not one already
-            if not (isinstance(idx, tuple) and len(idx) == 1):
-                idx = (idx,)
-            # idx is a long continuous array
-            n_rows_i = utils.read_n_rows(name, h5f)
-            # find the length of the subset of idx that contains indices
-            # that are less than n_rows_i
-            n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
-            # now split idx into idx_i and the remainder
-            idx_i = np.array(idx[0])[:n_rows_to_read_i]
-            idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
-        else:
-            idx_i = None
-            n_rows_i = n_rows - n_rows_read
-
-            obj_buf, n_rows_read_i = self.read(
-                name,
-                h5f,
-                start_row,
-                n_rows_i,
-                idx_i,
-                use_h5idx,
-                field_mask,
-                obj_buf,
-                obj_buf_start,
-                decompress,
-            )
-
-            n_rows_read += n_rows_read_i
-            if n_rows_read >= n_rows or obj_buf is None:
-                return obj_buf, n_rows_read
-            start_row = 0
-            obj_buf_start += n_rows_read_i
-        return obj_buf, n_rows_read
-
-        if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
-            idx = idx[0]
-        if isinstance(idx, np.ndarray) and idx.dtype == np.dtype("?"):
-            idx = np.where(idx)[0]
-
-        return _serializers._h5_read_lgdo(
-            lh5_obj.id,
-            lh5_obj.file.filename,
-            lh5_obj.name,
-            start_row=start_row,
-            n_rows=n_rows,
-            idx=idx,
-            use_h5idx=use_h5idx,
-            field_mask=field_mask,
-            obj_buf=obj_buf,
-            obj_buf_start=obj_buf_start,
-            decompress=decompress,
+            h5f = [self.gimme_file(f, "r", **file_kwargs) for f in lh5_file]
+        return read(
+            name,
+            h5f,
+            start_row,
+            n_rows,
+            idx,
+            use_h5idx,
+            field_mask,
+            obj_buf,
+            obj_buf_start,
+            decompress,
         )

     def write(
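`LH5Store.read` is now a thin wrapper: it resolves one file, or a list of files, through its file cache and defers everything else to `lgdo.lh5.core.read`. A sketch with hypothetical paths:

```python
from lgdo.lh5 import LH5Store

store = LH5Store()
# a single file or a list of files both work; multi-file concatenation
# now happens in core.read rather than in the store itself
obj = store.read("ch0/raw/energy", ["run0.lh5", "run1.lh5"])
```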
lgdo/types/array.py
CHANGED
@@ -17,12 +17,12 @@ import pint_pandas  # noqa: F401

 from .. import utils
 from ..units import default_units_registry as u
-from .lgdo import
+from .lgdo import LGDOCollection

 log = logging.getLogger(__name__)


-class Array(
+class Array(LGDOCollection):
     r"""Holds an :class:`numpy.ndarray` and attributes.

     :class:`Array` (and the other various array types) holds an `nda` instead
@@ -78,11 +78,7 @@ class Array(LGDO):
         elif isinstance(nda, Array):
             nda = nda.nda

-        elif not isinstance(nda, np.ndarray):
-            nda = np.array(nda)
-
         self.nda = nda
-        self.dtype = self.nda.dtype

         super().__init__(attrs)

@@ -96,18 +92,83 @@ class Array(LGDO):
         return dt + "<" + nd + ">{" + et + "}"

     def __len__(self) -> int:
-        return
+        return self._size
+
+    @property
+    def nda(self):
+        return self._nda[: self._size, ...] if self._nda.shape != () else self._nda
+
+    @nda.setter
+    def nda(self, value):
+        self._nda = value if isinstance(value, np.ndarray) else np.array(value)
+        self._size = len(self._nda) if self._nda.shape != () else 0
+
+    @property
+    def dtype(self):
+        return self._nda.dtype
+
+    @property
+    def shape(self):
+        return (len(self),) + self._nda.shape[1:]
+
+    def reserve_capacity(self, capacity: int) -> None:
+        "Set size (number of rows) of internal memory buffer"
+        if capacity < len(self):
+            msg = "Cannot reduce capacity below Array length"
+            raise ValueError(msg)
+        self._nda.resize((capacity,) + self._nda.shape[1:], refcheck=False)
+
+    def get_capacity(self) -> int:
+        "Get capacity (i.e. max size before memory must be re-allocated)"
+        return len(self._nda)
+
+    def trim_capacity(self) -> None:
+        "Set capacity to be minimum needed to support Array size"
+        self.reserve_capacity(np.prod(self.shape))
+
+    def resize(self, new_size: int, trim=False) -> None:
+        """Set size of Array in rows. Only change capacity if it must be
+        increased to accommodate new rows; in this case double capacity.
+        If trim is True, capacity will be set to match size."""
+
+        self._size = new_size
+
+        if trim and new_size != self.get_capacity:
+            self.reserve_capacity(new_size)

-
-
-
+        # If capacity is not big enough, set to next power of 2 big enough
+        if new_size > self.get_capacity():
+            self.reserve_capacity(int(2 ** (np.ceil(np.log2(new_size)))))

     def append(self, value: np.ndarray) -> None:
-
-        self.
+        "Append value to end of array (with copy)"
+        self.insert(len(self), value)

     def insert(self, i: int, value: int | float) -> None:
-
+        "Insert value into row i (with copy)"
+        if i > len(self):
+            msg = f"index {i} is out of bounds for array with size {len(self)}"
+            raise IndexError(msg)
+
+        value = np.array(value)
+        if value.shape == self.shape[1:]:
+            self.resize(len(self) + 1)
+            self[i + 1 :] = self[i:-1]
+            self[i] = value
+        elif value.shape[1:] == self.shape[1:]:
+            self.resize(len(self) + len(value))
+            self[i + len(value) :] = self[i : -len(value)]
+            self[i : i + len(value)] = value
+        else:
+            msg = f"Could not insert value with shape {value.shape} into Array with shape {self.shape}"
+            raise ValueError(msg)
+
+    def replace(self, i: int, value: int | float) -> None:
+        "Replace value at row i"
+        if i >= len(self):
+            msg = f"index {i} is out of bounds for array with size {len(self)}"
+            raise IndexError(msg)
+        self[i] = value

     def __getitem__(self, key):
         return self.nda[key]
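`Array` now separates logical size (`_size`) from allocated capacity (the length of `_nda`), so appends get amortized cost. A small sketch of the new semantics as defined by the code above:

```python
import numpy as np
from lgdo.types import Array

a = Array(np.array([1, 2, 3]))
a.reserve_capacity(100)       # allocate room for 100 rows up front
assert a.get_capacity() == 100
assert len(a) == 3            # logical size is unchanged

a.append(4)                   # fits within capacity: no reallocation
a.resize(150)                 # exceeds capacity: grows to next power of 2 (256)
a.trim_capacity()             # shrink the allocation back to len(a) rows
```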
lgdo/types/encoded.py
CHANGED
@@ -11,12 +11,12 @@ from numpy.typing import NDArray

 from .. import utils
 from .array import Array
-from .lgdo import
+from .lgdo import LGDOCollection
 from .scalar import Scalar
 from .vectorofvectors import VectorOfVectors


-class VectorOfEncodedVectors(
+class VectorOfEncodedVectors(LGDOCollection):
     """An array of variable-length encoded arrays.

     Used to represent an encoded :class:`.VectorOfVectors`. In addition to an
@@ -92,6 +92,17 @@ class VectorOfEncodedVectors(LGDO):

         return False

+    def reserve_capacity(self, *capacity: int) -> None:
+        self.encoded_data.reserve_capacity(*capacity)
+        self.decoded_size.reserve_capacity(capacity[0])
+
+    def get_capacity(self) -> tuple:
+        return (self.decoded_size.get_capacity, *self.encoded_data.get_capacity())
+
+    def trim_capacity(self) -> None:
+        self.encoded_data.trim_capacity()
+        self.decoded_size.trim_capacity()
+
     def resize(self, new_size: int) -> None:
         """Resize vector along the first axis.

@@ -102,21 +113,6 @@ class VectorOfEncodedVectors(LGDO):
         self.encoded_data.resize(new_size)
         self.decoded_size.resize(new_size)

-    def append(self, value: tuple[NDArray, int]) -> None:
-        """Append a 1D encoded vector at the end.
-
-        Parameters
-        ----------
-        value
-            a tuple holding the encoded array and its decoded size.
-
-        See Also
-        --------
-        .VectorOfVectors.append
-        """
-        self.encoded_data.append(value[0])
-        self.decoded_size.append(value[1])
-
     def insert(self, i: int, value: tuple[NDArray, int]) -> None:
         """Insert an encoded vector at index `i`.

@@ -282,7 +278,7 @@ class VectorOfEncodedVectors(LGDO):
             raise ValueError(msg)


-class ArrayOfEncodedEqualSizedArrays(
+class ArrayOfEncodedEqualSizedArrays(LGDOCollection):
     """An array of encoded arrays with equal decoded size.

     Used to represent an encoded :class:`.ArrayOfEqualSizedArrays`. In addition
@@ -349,14 +345,23 @@ class ArrayOfEncodedEqualSizedArrays(LGDO):

         return False

-    def
+    def reserve_capacity(self, *capacity: int) -> None:
+        self.encoded_data.reserve_capacity(capacity)
+
+    def get_capacity(self) -> tuple:
+        return self.encoded_data.get_capacity()
+
+    def trim_capacity(self) -> None:
+        self.encoded_data.trim_capacity()
+
+    def resize(self, new_size: int, trim: bool = False) -> None:
         """Resize array along the first axis.

         See Also
         --------
         .VectorOfVectors.resize
         """
-        self.encoded_data.resize(new_size)
+        self.encoded_data.resize(new_size, trim)

     def append(self, value: NDArray) -> None:
         """Append a 1D encoded array at the end.
lgdo/types/histogram.py
CHANGED
@@ -424,7 +424,7 @@ class Histogram(Struct):
             dict.__setitem__(self, name, obj)
         else:
             msg = "histogram fields cannot be mutated "
-            raise
+            raise AttributeError(msg)

     def __getattr__(self, name: str) -> None:
         # do not allow for new attributes on this
lgdo/types/lgdo.py
CHANGED
@@ -92,3 +92,53 @@ class LGDO(ABC):

     def __repr__(self) -> str:
         return self.__class__.__name__ + f"(attrs={self.attrs!r})"
+
+
+class LGDOCollection(LGDO):
+    """Abstract base class representing a LEGEND Collection Object (LGDO).
+
+    This defines the interface for classes used as table columns.
+    """
+
+    @abstractmethod
+    def __init__(self, attrs: dict[str, Any] | None = None) -> None:
+        super().__init__(attrs)
+
+    @abstractmethod
+    def __len__(self) -> int:
+        """Provides ``__len__`` for this array-like class."""
+
+    @abstractmethod
+    def reserve_capacity(self, capacity: int) -> None:
+        """Reserve capacity (in rows) for later use. Internal memory buffers
+        will have enough entries to store this many rows.
+        """
+
+    @abstractmethod
+    def get_capacity(self) -> int:
+        "get reserved capacity of internal memory buffers in rows"
+
+    @abstractmethod
+    def trim_capacity(self) -> None:
+        """set capacity to only what is required to store current contents
+        of LGDOCollection
+        """
+
+    @abstractmethod
+    def resize(self, new_size: int, trim: bool = False) -> None:
+        """Return this LGDO's datatype attribute string."""
+
+    def append(self, val) -> None:
+        "append val to end of LGDOCollection"
+        self.insert(len(self), val)
+
+    @abstractmethod
+    def insert(self, i: int, val) -> None:
+        "insert val into LGDOCollection at position i"
+
+    @abstractmethod
+    def replace(self, i: int, val) -> None:
+        "replace item at position i with val in LGDOCollection"
+
+    def clear(self, trim: bool = False) -> None:
+        "set size of LGDOCollection to zero"
+        self.resize(0, trim=trim)
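The new base class pins down what a table column must support; note that `append` and `clear` are concrete and ride on the abstract `insert`/`resize`. A sketch of that composition using an existing collection type:

```python
import numpy as np
from lgdo.types import VectorOfVectors

v = VectorOfVectors([[1, 2], [3]])
v.append(np.array([4, 5, 6]))  # append is implemented as v.insert(len(v), ...)
v.clear()                      # inherited from LGDOCollection: v.resize(0)
assert len(v) == 0
```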
lgdo/types/table.py
CHANGED
@@ -19,7 +19,7 @@ from pandas.io.formats import format as fmt

 from .array import Array
 from .arrayofequalsizedarrays import ArrayOfEqualSizedArrays
-from .lgdo import LGDO
+from .lgdo import LGDO, LGDOCollection
 from .scalar import Scalar
 from .struct import Struct
 from .vectorofvectors import VectorOfVectors
@@ -27,13 +27,9 @@ from .vectorofvectors import VectorOfVectors
 log = logging.getLogger(__name__)


-class Table(Struct):
+class Table(Struct, LGDOCollection):
     """A special struct of arrays or subtable columns of equal length.

-    Holds onto an internal read/write location ``loc`` that is useful in
-    managing table I/O using functions like :meth:`push_row`, :meth:`is_full`,
-    and :meth:`clear`.
-
     Note
     ----
     If you write to a table and don't fill it up to its total size, be sure to
@@ -49,7 +45,7 @@ class Table(Struct):

     def __init__(
         self,
-        col_dict: Mapping[str,
+        col_dict: Mapping[str, LGDOCollection] | pd.DataFrame | ak.Array | None = None,
         size: int | None = None,
         attrs: Mapping[str, Any] | None = None,
     ) -> None:
@@ -65,7 +61,7 @@ class Table(Struct):
         col_dict
             instantiate this table using the supplied mapping of column names
             and array-like objects. Supported input types are: mapping of
-            strings to
+            strings to LGDOCollections, :class:`pd.DataFrame` and :class:`ak.Array`.
             Note 1: no copy is performed, the objects are used directly (unless
             :class:`ak.Array` is provided). Note 2: if `size` is not ``None``,
             all arrays will be resized to match it. Note 3: if the arrays have
@@ -85,7 +81,8 @@ class Table(Struct):
             col_dict = _ak_to_lgdo_or_col_dict(col_dict)

         # call Struct constructor
-
+        Struct.__init__(self, obj_dict=col_dict)
+        LGDOCollection.__init__(self, attrs=attrs)

         # if col_dict is not empty, set size according to it
         # if size is also supplied, resize all fields to match it
@@ -93,13 +90,10 @@ class Table(Struct):
         if col_dict is not None and len(col_dict) > 0:
             self.resize(new_size=size, do_warn=(size is None))

-        # if no col_dict, just set the size
+        # if no col_dict, just set the size
         else:
             self.size = size if size is not None else None

-        # always start at loc=0
-        self.loc = 0
-
     def datatype_name(self) -> str:
         return "table"

@@ -107,7 +101,31 @@ class Table(Struct):
         """Provides ``__len__`` for this array-like class."""
         return self.size

-    def
+    def reserve_capacity(self, capacity: int | list) -> None:
+        "Set size (number of rows) of internal memory buffer"
+        if isinstance(capacity, int):
+            for obj in self.values():
+                obj.reserve_capacity(capacity)
+        else:
+            if len(capacity) != len(self.keys()):
+                msg = "List of capacities must have same length as number of keys"
+                raise ValueError(msg)
+
+            for obj, cap in zip(self.values(), capacity):
+                obj.reserve_capacity(cap)
+
+    def get_capacity(self) -> int:
+        "Get list of capacities for each key"
+        return [v.get_capacity() for v in self.values()]
+
+    def trim_capacity(self) -> int:
+        "Set capacity to be minimum needed to support Array size"
+        for v in self.values():
+            v.trim_capacity()
+
+    def resize(
+        self, new_size: int | None = None, do_warn: bool = False, trim: bool = False
+    ) -> None:
         # if new_size = None, use the size from the first field
         for field, obj in self.items():
             if new_size is None:
@@ -119,21 +137,20 @@ class Table(Struct):
                     f"with size {len(obj)} != {new_size}"
                 )
             if isinstance(obj, Table):
-                obj.resize(new_size)
+                obj.resize(new_size, trim)
             else:
-                obj.resize(new_size)
+                obj.resize(new_size, trim)
         self.size = new_size

-    def
-
-
-
-
-
-    def clear(self) -> None:
-        self.loc = 0
+    def insert(self, i: int, vals: dict) -> None:
+        "Insert vals into table at row i. Vals is a mapping from table key to val"
+        for k, ar in self.items():
+            ar.insert(i, vals[k])
+        self.size += 1

-    def add_field(
+    def add_field(
+        self, name: str, obj: LGDOCollection, use_obj_size: bool = False
+    ) -> None:
         """Add a field (column) to the table.

         Use the name "field" here to match the terminology used in
@@ -170,7 +187,9 @@ class Table(Struct):
         new_size = len(obj) if use_obj_size else self.size
         self.resize(new_size=new_size)

-    def add_column(
+    def add_column(
+        self, name: str, obj: LGDOCollection, use_obj_size: bool = False
+    ) -> None:
         """Alias for :meth:`.add_field` using table terminology 'column'."""
         self.add_field(name, obj, use_obj_size=use_obj_size)

@@ -201,8 +220,10 @@ class Table(Struct):
         set to ``False`` to turn off warnings associated with mismatched
         `loc` parameter or :meth:`add_column` warnings.
         """
-        if other_table
-            log.warning(
+        if len(other_table) != len(self) and do_warn:
+            log.warning(
+                f"len(other_table) ({len(other_table)}) != len(self) ({len(self)})"
+            )
         if cols is None:
             cols = other_table.keys()
         for name in cols:
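`Table` drops the `loc`/`push_row` bookkeeping and instead gains row-wise mutation through the collection interface; `insert` takes one value per column, keyed by field name. A sketch of the new method:

```python
from lgdo.types import Array, Table

t = Table(col_dict={"x": Array([1, 2, 3]), "y": Array([1.0, 2.0, 3.0])})
t.insert(1, {"x": 99, "y": 9.9})  # one value per column, keyed by name
assert len(t) == 4                # Table.size is kept in sync
```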
lgdo/types/vectorofvectors.py
CHANGED
@@ -20,12 +20,12 @@ from .. import utils
 from . import arrayofequalsizedarrays as aoesa
 from . import vovutils
 from .array import Array
-from .lgdo import
+from .lgdo import LGDOCollection

 log = logging.getLogger(__name__)


-class VectorOfVectors(
+class VectorOfVectors(LGDOCollection):
     """A n-dimensional variable-length 1D array of variable-length 1D arrays.

     If the vector is 2-dimensional, the internal representation is as two NumPy
@@ -210,20 +210,17 @@ class VectorOfVectors(LGDO):
         elif self.flattened_data is None:
             self.flattened_data = flattened_data

-
-        self.dtype = self.flattened_data.dtype
-
-        # set ndim
-        self.ndim = 2
-        pointer = self.flattened_data
-        while True:
-            if isinstance(pointer, Array):
-                break
+        super().__init__(attrs)

-
-
+    @property
+    def ndim(self):
+        return 1 + (
+            1 if isinstance(self.flattened_data, Array) else self.flattened_data.ndim
+        )

-
+    @property
+    def dtype(self) -> np.dtype:
+        return self.flattened_data.dtype

     def datatype_name(self) -> str:
         return "array"
@@ -276,7 +273,30 @@ class VectorOfVectors(LGDO):
         else:
             raise NotImplementedError

-    def
+    def reserve_capacity(self, cap_cl, *cap_args) -> None:
+        """Set capacity of internal data arrays. Expect number of args to
+        equal `self.n_dim`. First arg is capacity of cumulative length array.
+        If `self.n_dim` is 2, second argument is capacity of flattened data,
+        otherwise arguments are fed recursively to remaining dimensions.
+        """
+        self.cumulative_length.reserve_capacity(cap_cl)
+        self.flattened_data.reserve_capacity(*cap_args)
+
+    def get_capacity(self) -> tuple[int]:
+        """Get tuple containing capacity of each dimension. First dimension
+        is cumulative length array. Last dimension is flattened data.
+        """
+        fd_cap = self.flattened_data.get_capacity()
+        if isinstance(fd_cap, int):
+            return (self.cumulative_length.get_capacity(), fd_cap)
+        return (self.cumulative_length.get_capacity(), *fd_cap)
+
+    def trim_capacity(self) -> None:
+        "Set capacity for all dimensions to minimum needed to hold data"
+        self.cumulative_length.trim_capacity()
+        self.flattened_data.trim_capacity()
+
+    def resize(self, new_size: int, trim: bool = False) -> None:
         """Resize vector along the first axis.

         `self.flattened_data` is resized only if `new_size` is smaller than the
@@ -286,6 +306,8 @@ class VectorOfVectors(LGDO):
         `self.cumulative_length` is padded with its last element. This
         corresponds to appending empty vectors.

+        If `trim` is ``True``, resize capacity to match new size
+
         Examples
         --------
         >>> vov = VectorOfVectors([[1, 2, 3], [4, 5]])
@@ -303,23 +325,22 @@ class VectorOfVectors(LGDO):
             [3],
         ]
         """
-        vidx = self.cumulative_length
         old_s = len(self)
-        dlen = new_size - old_s
-        csum = vidx[-1] if len(self) > 0 else 0

         # first resize the cumulative length
-        self.cumulative_length.resize(new_size)
+        self.cumulative_length.resize(new_size, trim)

         # if new_size > size, new elements are filled with zeros, let's fix
         # that
-        if
-            self.cumulative_length[old_s:] =
+        if new_size > old_s:
+            self.cumulative_length[old_s:] = self.cumulative_length[old_s - 1]

         # then resize the data array
         # if dlen > 0 this has no effect
         if len(self.cumulative_length) > 0:
-            self.flattened_data.resize(self.cumulative_length[-1])
+            self.flattened_data.resize(self.cumulative_length[-1], trim)
+        else:
+            self.flattened_data.resize(0, trim)

     def append(self, new: NDArray) -> None:
         """Append a 1D vector `new` at the end.
@@ -334,20 +355,7 @@ class VectorOfVectors(LGDO):
         [8 9],
         ]
         """
-
-            # first extend cumulative_length by +1
-            self.cumulative_length.resize(len(self) + 1)
-            # set it at the right value
-            newlen = (
-                self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
-            )
-            self.cumulative_length[-1] = newlen
-            # then resize flattened_data to accommodate the new vector
-            self.flattened_data.resize(len(self.flattened_data) + len(new))
-            # finally set it
-            self[-1] = new
-        else:
-            raise NotImplementedError
+        self.insert(len(self), new)

     def insert(self, i: int, new: NDArray) -> None:
         """Insert a vector at index `i`.
@@ -364,23 +372,15 @@ class VectorOfVectors(LGDO):
         [8 9],
         [4 5],
         ]
-
-        Warning
-        -------
-        This method involves a significant amount of memory re-allocation and
-        is expected to perform poorly on large vectors.
         """
         if self.ndim == 2:
-            if i
-                msg = f"index {i} is out of bounds for vector
+            if i > len(self):
+                msg = f"index {i} is out of bounds for vector with size {len(self)}"
                 raise IndexError(msg)

-            self.
-
-            )
-            self.cumulative_length = Array(
-                np.insert(self.cumulative_length, i, self.cumulative_length[i - 1])
-            )
+            i_start = 0 if i == 0 else self.cumulative_length[i - 1]
+            self.flattened_data.insert(i_start, new)
+            self.cumulative_length.insert(i, i_start)
             self.cumulative_length[i:] += np.uint32(len(new))
         else:
             raise NotImplementedError
@@ -400,11 +400,6 @@ class VectorOfVectors(LGDO):
         [[8 9],
         [4 5],
         ]
-
-        Warning
-        -------
-        This method involves a significant amount of memory re-allocation and
-        is expected to perform poorly on large vectors.
         """
         if self.ndim == 2:
             if i >= len(self):
@@ -414,27 +409,17 @@ class VectorOfVectors(LGDO):
             vidx = self.cumulative_length
             dlen = len(new) - len(self[i])

-            if dlen
-                #
-
-
-
-
-
-
-
-
-                )
-            else:
-                # set the already allocated indices
-                self.flattened_data[vidx[i - 1] : vidx[i]] = new[: len(self[i])]
-                # then insert the remaining
-                self.flattened_data = Array(
-                    np.insert(self.flattened_data, vidx[i], new[len(self[i]) :])
-                )
-
-            vidx[i:] = vidx[i:] + dlen
+            if dlen != 0:
+                # move the subsequent entries
+                vidx[i:] += dlen
+                self.flattened_data.resize(vidx[-1])
+                self.flattened_data._nda[vidx[i] : vidx[-1]] = self.flattened_data._nda[
+                    vidx[i] - dlen : vidx[-1] - dlen
+                ]
+
+            # set the already allocated indices
+            start = vidx[i - 1] if i > 0 else 0
+            self.flattened_data[start : vidx[i]] = new
         else:
             raise NotImplementedError
@@ -484,7 +469,15 @@ class VectorOfVectors(LGDO):
         cum_lens = np.add(start, lens.cumsum(), dtype=int)

         # fill with fast vectorized routine
-
+        if np.issubdtype(self.flattened_data.dtype, np.unsignedinteger):
+            nan_val = np.iinfo(self.flattened_data.dtype).max
+        if np.issubdtype(self.flattened_data.dtype, np.integer):
+            nan_val = np.iinfo(self.flattened_data.dtype).min
+        else:
+            nan_val = np.nan
+        vovutils._nb_fill(
+            vec, lens, nan_val, self.flattened_data.nda[start : cum_lens[-1]]
+        )

         # add new vector(s) length to cumulative_length
         self.cumulative_length[i : i + len(lens)] = cum_lens
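`insert` and `replace` no longer rebuild `flattened_data` and `cumulative_length` with `np.insert`; they shift the tail of the flat buffer in place. The observable behavior is unchanged, as in this sketch:

```python
import numpy as np
from lgdo.types import VectorOfVectors

v = VectorOfVectors([[1, 2, 3], [4, 5]])
v.insert(1, np.array([8, 9]))   # -> [[1 2 3], [8 9], [4 5]]
v.replace(0, np.array([8, 9]))  # -> [[8 9], [8 9], [4 5]]
assert len(v) == 3
```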
lgdo/types/vovutils.py
CHANGED
@@ -81,7 +81,7 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND

 @numba.guvectorize(
     [
-        f"{data_type}[:,:],{size_type}[:],{data_type}[:]"
+        f"{data_type}[:,:],{size_type}[:],{data_type},{data_type}[:]"
         for data_type in [
             "b1",
             "i1",
@@ -99,10 +99,12 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND
         ]
         for size_type in ["i4", "i8", "u4", "u8"]
     ],
-    "(l,m),(l),(n)",
+    "(l,m),(l),(),(n)",
     **nb_kwargs,
 )
-def _nb_fill(
+def _nb_fill(
+    aoa_in: NDArray, len_in: NDArray, nan_val: int | float, flattened_array_out: NDArray
+):
     """Vectorized function to fill flattened array from array of arrays and
     lengths. Values in aoa_in past lengths will not be copied.

@@ -112,6 +114,9 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
         array of arrays containing values to be copied
     len_in
         array of vector lengths for each row of aoa_in
+    nan_val
+        value to use when len_in is longer than aoa_in. Should use
+        np.nan for floating point, and 0xfff... for integer types
     flattened_array_out
         flattened array to copy values into. Must be longer than sum of
         lengths in len_in
@@ -122,9 +127,14 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
         raise ValueError(msg)

     start = 0
+    max_len = aoa_in.shape[1]
     for i, ll in enumerate(len_in):
         stop = start + ll
-
+        if ll > max_len:
+            flattened_array_out[start : start + max_len] = aoa_in[i, :]
+            flattened_array_out[start + max_len : stop] = nan_val
+        else:
+            flattened_array_out[start:stop] = aoa_in[i, :ll]
         start = stop

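`_nb_fill` (an internal numba helper, so this call pattern is an assumption about non-public API) now takes the padding value explicitly, matching the new scalar `()` slot in the gufunc layout. A sketch of the padding behavior:

```python
import numpy as np
from lgdo.types import vovutils

aoa = np.array([[1.0, 2.0], [3.0, 4.0]])  # two rows, width 2
lens = np.array([2, 3])                   # second length exceeds the width
out = np.empty(5)

vovutils._nb_fill(aoa, lens, np.nan, out)
# rows are copied up to their stated length; the overhang past the row
# width is filled with nan_val: out == [1., 2., 3., 4., nan]
```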
File without changes
|
File without changes
|
File without changes
|
File without changes
|