legend-pydataobj 1.5.0a5__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {legend_pydataobj-1.5.0a5.dist-info → legend_pydataobj-1.6.0.dist-info}/METADATA +1 -1
- legend_pydataobj-1.6.0.dist-info/RECORD +54 -0
- {legend_pydataobj-1.5.0a5.dist-info → legend_pydataobj-1.6.0.dist-info}/WHEEL +1 -1
- {legend_pydataobj-1.5.0a5.dist-info → legend_pydataobj-1.6.0.dist-info}/entry_points.txt +1 -0
- lgdo/__init__.py +7 -4
- lgdo/_version.py +2 -2
- lgdo/cli.py +237 -12
- lgdo/compression/__init__.py +1 -0
- lgdo/lh5/__init__.py +9 -1
- lgdo/lh5/_serializers/__init__.py +43 -0
- lgdo/lh5/_serializers/read/__init__.py +0 -0
- lgdo/lh5/_serializers/read/array.py +34 -0
- lgdo/lh5/_serializers/read/composite.py +405 -0
- lgdo/lh5/_serializers/read/encoded.py +129 -0
- lgdo/lh5/_serializers/read/ndarray.py +104 -0
- lgdo/lh5/_serializers/read/scalar.py +34 -0
- lgdo/lh5/_serializers/read/utils.py +12 -0
- lgdo/lh5/_serializers/read/vector_of_vectors.py +195 -0
- lgdo/lh5/_serializers/write/__init__.py +0 -0
- lgdo/lh5/_serializers/write/array.py +92 -0
- lgdo/lh5/_serializers/write/composite.py +259 -0
- lgdo/lh5/_serializers/write/scalar.py +23 -0
- lgdo/lh5/_serializers/write/vector_of_vectors.py +95 -0
- lgdo/lh5/core.py +272 -0
- lgdo/lh5/datatype.py +46 -0
- lgdo/lh5/exceptions.py +34 -0
- lgdo/lh5/iterator.py +1 -1
- lgdo/lh5/store.py +69 -1160
- lgdo/lh5/tools.py +27 -53
- lgdo/lh5/utils.py +130 -27
- lgdo/lh5_store.py +59 -2
- lgdo/logging.py +4 -3
- lgdo/types/__init__.py +1 -0
- lgdo/types/array.py +3 -0
- lgdo/types/arrayofequalsizedarrays.py +1 -0
- lgdo/types/encoded.py +3 -8
- lgdo/types/fixedsizearray.py +1 -0
- lgdo/types/struct.py +1 -0
- lgdo/types/table.py +69 -26
- lgdo/types/vectorofvectors.py +314 -458
- lgdo/types/vovutils.py +320 -0
- lgdo/types/waveformtable.py +1 -0
- lgdo/utils.py +1 -32
- legend_pydataobj-1.5.0a5.dist-info/RECORD +0 -36
- {legend_pydataobj-1.5.0a5.dist-info → legend_pydataobj-1.6.0.dist-info}/LICENSE +0 -0
- {legend_pydataobj-1.5.0a5.dist-info → legend_pydataobj-1.6.0.dist-info}/top_level.txt +0 -0
lgdo/types/vectorofvectors.py
CHANGED
@@ -2,23 +2,22 @@
|
|
2
2
|
Implements a LEGEND Data Object representing a variable-length array of
|
3
3
|
variable-length arrays and corresponding utilities.
|
4
4
|
"""
|
5
|
+
|
5
6
|
from __future__ import annotations
|
6
7
|
|
7
|
-
import itertools
|
8
8
|
import logging
|
9
|
-
from collections.abc import Iterator
|
9
|
+
from collections.abc import Iterator, Mapping, Sequence
|
10
10
|
from typing import Any
|
11
11
|
|
12
12
|
import awkward as ak
|
13
13
|
import awkward_pandas as akpd
|
14
|
-
import numba
|
15
14
|
import numpy as np
|
16
15
|
import pandas as pd
|
17
|
-
from numpy.typing import DTypeLike, NDArray
|
16
|
+
from numpy.typing import ArrayLike, DTypeLike, NDArray
|
18
17
|
|
19
18
|
from .. import utils
|
20
|
-
from ..utils import numba_defaults_kwargs as nb_kwargs
|
21
19
|
from . import arrayofequalsizedarrays as aoesa
|
20
|
+
from . import vovutils
|
22
21
|
from .array import Array
|
23
22
|
from .lgdo import LGDO
|
24
23
|
|
@@ -26,30 +25,56 @@ log = logging.getLogger(__name__)
|
|
26
25
|
|
27
26
|
|
28
27
|
class VectorOfVectors(LGDO):
|
29
|
-
"""A variable-length array of variable-length arrays.
|
28
|
+
"""A n-dimensional variable-length 1D array of variable-length 1D arrays.
|
29
|
+
|
30
|
+
If the vector is 2-dimensional, the internal representation is as two NumPy
|
31
|
+
arrays, one to store the flattened data contiguously
|
32
|
+
(:attr:`flattened_data`) and one to store the cumulative sum of lengths of
|
33
|
+
each vector (:attr:`cumulative_length`). When the dimension is more than 2,
|
34
|
+
:attr:`flattened_data` is a :class:`VectorOfVectors` itself.
|
30
35
|
|
31
|
-
|
32
|
-
|
33
|
-
|
36
|
+
Examples
|
37
|
+
--------
|
38
|
+
>>> from lgdo import VectorOfVectors
|
39
|
+
>>> data = VectorOfVectors(
|
40
|
+
... [[[1, 2], [3, 4, 5]], [[2], [4, 8, 9, 7]], [[5, 3, 1]]],
|
41
|
+
... attrs={"units": "m"}
|
42
|
+
... )
|
43
|
+
>>> print(data)
|
44
|
+
[[[1, 2], [3, 4, 5]],
|
45
|
+
[[2], [4, 8, 9, 7]],
|
46
|
+
[[5, 3, 1]]
|
47
|
+
] with attrs={'units': 'm'}
|
48
|
+
>>> data.view_as("ak")
|
49
|
+
<Array [[[1, 2], [3, 4, 5]], ..., [[5, ..., 1]]] type='3 * var * var * int64'>
|
50
|
+
|
51
|
+
Note
|
52
|
+
----
|
53
|
+
Many class methods are currently implemented only for 2D vectors and will
|
54
|
+
raise an exception on higher dimensional data.
|
34
55
|
"""
|
35
56
|
|
36
57
|
def __init__(
|
37
58
|
self,
|
38
|
-
|
39
|
-
flattened_data:
|
40
|
-
cumulative_length:
|
41
|
-
shape_guess:
|
42
|
-
dtype: DTypeLike = None,
|
59
|
+
data: ArrayLike | None = None,
|
60
|
+
flattened_data: ArrayLike | None = None,
|
61
|
+
cumulative_length: ArrayLike | VectorOfVectors | None = None,
|
62
|
+
shape_guess: Sequence[int, ...] | None = None,
|
63
|
+
dtype: DTypeLike | None = None,
|
43
64
|
fill_val: int | float | None = None,
|
44
|
-
attrs:
|
65
|
+
attrs: Mapping[str, Any] | None = None,
|
45
66
|
) -> None:
|
46
67
|
"""
|
47
68
|
Parameters
|
48
69
|
----------
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
70
|
+
data
|
71
|
+
Any array-like structure accepted by the :class:`ak.Array`
|
72
|
+
constructor, with the exception that elements cannot be of type
|
73
|
+
``OptionType``, ``UnionType`` or ``RecordType``. Takes priority
|
74
|
+
over `flattened_data` and `cumulative_length`. The serialization of
|
75
|
+
the :class:`ak.Array` is performed through :func:`ak.to_buffers`.
|
76
|
+
Since the latter returns non-data-owning NumPy arrays, which would
|
77
|
+
prevent later modifications like resizing, a copy is performed.
|
53
78
|
flattened_data
|
54
79
|
if not ``None``, used as the internal array for
|
55
80
|
`self.flattened_data`. Otherwise, an internal `flattened_data` is
|
@@ -74,124 +99,181 @@ class VectorOfVectors(LGDO):
|
|
74
99
|
attrs
|
75
100
|
a set of user attributes to be carried along with this LGDO.
|
76
101
|
"""
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
102
|
+
# sanitize
|
103
|
+
if cumulative_length is not None and not isinstance(cumulative_length, Array):
|
104
|
+
cumulative_length = Array(cumulative_length)
|
105
|
+
if flattened_data is not None and not isinstance(
|
106
|
+
flattened_data, (Array, VectorOfVectors)
|
107
|
+
):
|
108
|
+
flattened_data = Array(flattened_data)
|
109
|
+
|
110
|
+
if data is not None:
|
111
|
+
if not isinstance(data, ak.Array):
|
112
|
+
data = ak.Array(data)
|
113
|
+
|
114
|
+
if data.ndim < 2:
|
115
|
+
msg = (
|
116
|
+
"cannot initialize a VectorOfVectors with "
|
117
|
+
f"{data.ndim}-dimensional data"
|
91
118
|
)
|
119
|
+
raise ValueError(msg)
|
92
120
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
msg = "array can't be empty with dtype=None!"
|
98
|
-
raise ValueError(msg)
|
121
|
+
# make sure it's not a record array
|
122
|
+
if not vovutils._ak_is_valid(data):
|
123
|
+
msg = "input array type is not supported!"
|
124
|
+
raise ValueError(msg)
|
99
125
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
126
|
+
# array might be non-jagged! ('container' will hold a ndim NumPy array)
|
127
|
+
if not vovutils._ak_is_jagged(data):
|
128
|
+
data = ak.from_regular(data, axis=None)
|
129
|
+
|
130
|
+
# ak.to_buffer helps in de-serialization
|
131
|
+
# NOTE: ak.to_packed() needed?
|
132
|
+
form, length, container = ak.to_buffers(ak.to_packed(data))
|
133
|
+
|
134
|
+
# NOTE: node#-data is not even in the dict if the awkward array is empty
|
135
|
+
# NOTE: if the data arg was a numpy array, to_buffers() preserves
|
136
|
+
# the original dtype
|
137
|
+
# FIXME: have to copy the buffers, otherwise self will not own the
|
138
|
+
# data and self.resize() will fail. Is it possible to avoid this?
|
139
|
+
flattened_data = np.copy(
|
140
|
+
container.pop(f"node{data.ndim-1}-data", np.empty(0, dtype=dtype))
|
141
|
+
)
|
104
142
|
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
143
|
+
# if user-provided dtype is different than dtype from Awkward, cast
|
144
|
+
# NOTE: makes a copy only if needed
|
145
|
+
flattened_data = np.asarray(flattened_data, dtype=dtype)
|
146
|
+
|
147
|
+
# start from innermost VoV and build nested structure
|
148
|
+
for i in range(data.ndim - 2, -1, -1):
|
149
|
+
# NOTE: remember, omit the leading 0 from ak.Array offsets
|
150
|
+
cumulative_length = np.copy(container[f"node{i}-offsets"][1:])
|
151
|
+
|
152
|
+
if i != 0:
|
153
|
+
# at the beginning of the loop: initialize innermost
|
154
|
+
# flattened_data and replace current flattened_data
|
155
|
+
# reference. in the following iterations flattened_data is
|
156
|
+
# a VectorOfVectors
|
157
|
+
flattened_data = VectorOfVectors(
|
158
|
+
flattened_data=flattened_data,
|
159
|
+
cumulative_length=cumulative_length,
|
160
|
+
)
|
161
|
+
|
162
|
+
else:
|
163
|
+
# at end we need to initialize self with the latest flattened_data
|
164
|
+
self.__init__(
|
165
|
+
flattened_data=flattened_data,
|
166
|
+
cumulative_length=cumulative_length,
|
167
|
+
)
|
110
168
|
|
111
169
|
else:
|
170
|
+
self.flattened_data = None
|
171
|
+
self.cumulative_length = None
|
172
|
+
|
173
|
+
# let's first setup cumulative_length...
|
112
174
|
if cumulative_length is None:
|
113
|
-
if shape_guess is None:
|
114
|
-
# just make an empty vector
|
115
|
-
self.cumulative_length = Array(np.empty((0,), dtype="uint32"))
|
116
175
|
# initialize based on shape_guess
|
117
|
-
|
118
|
-
|
119
|
-
|
176
|
+
if shape_guess is None:
|
177
|
+
# just make an empty 2D vector
|
178
|
+
shape_guess = (0, 0)
|
179
|
+
|
180
|
+
# sanity check
|
181
|
+
if len(shape_guess) < 2:
|
182
|
+
msg = "shape_guess must be a sequence of 2 integers or more"
|
183
|
+
raise ValueError(msg)
|
184
|
+
|
185
|
+
# let Awkward do the job here, we're lazy
|
186
|
+
if fill_val is not None:
|
187
|
+
self.__init__(
|
188
|
+
np.full(shape=shape_guess, fill_value=fill_val, dtype=dtype)
|
120
189
|
)
|
121
190
|
else:
|
122
|
-
self.
|
123
|
-
np.arange(
|
124
|
-
shape_guess[1],
|
125
|
-
np.prod(shape_guess) + 1,
|
126
|
-
shape_guess[1],
|
127
|
-
dtype="uint32",
|
128
|
-
)
|
129
|
-
)
|
191
|
+
self.__init__(np.empty(shape=shape_guess, dtype=dtype))
|
130
192
|
else:
|
131
|
-
|
132
|
-
|
133
|
-
|
193
|
+
# if it's user provided just use it
|
194
|
+
self.cumulative_length = cumulative_length
|
195
|
+
|
196
|
+
# ...then flattened_data
|
197
|
+
# NOTE: self.flattened_data might have already been initialized
|
198
|
+
# above
|
199
|
+
if flattened_data is None and self.flattened_data is None:
|
200
|
+
# this happens when the cumulative_length arg is not None
|
134
201
|
if dtype is None:
|
135
202
|
msg = "flattened_data and dtype cannot both be None!"
|
136
203
|
raise ValueError(msg)
|
137
204
|
|
138
|
-
|
139
|
-
if cumulative_length is None:
|
140
|
-
# just make an empty vector or use shape_guess
|
141
|
-
length = 0 if shape_guess is None else np.prod(shape_guess)
|
142
|
-
else:
|
143
|
-
# use cumulative_length
|
144
|
-
length = cumulative_length[-1]
|
145
|
-
|
205
|
+
# now ready to initialize the object!
|
146
206
|
self.flattened_data = Array(
|
147
|
-
shape=(
|
207
|
+
shape=(self.cumulative_length[-1],), dtype=dtype, fill_val=fill_val
|
148
208
|
)
|
149
|
-
|
150
|
-
self.flattened_data =
|
209
|
+
elif self.flattened_data is None:
|
210
|
+
self.flattened_data = flattened_data
|
151
211
|
|
152
212
|
# finally set dtype
|
153
213
|
self.dtype = self.flattened_data.dtype
|
154
214
|
|
215
|
+
# set ndim
|
216
|
+
self.ndim = 2
|
217
|
+
pointer = self.flattened_data
|
218
|
+
while True:
|
219
|
+
if isinstance(pointer, Array):
|
220
|
+
break
|
221
|
+
|
222
|
+
self.ndim += 1
|
223
|
+
pointer = pointer.flattened_data
|
224
|
+
|
155
225
|
super().__init__(attrs)
|
156
226
|
|
157
227
|
def datatype_name(self) -> str:
|
158
228
|
return "array"
|
159
229
|
|
160
230
|
def form_datatype(self) -> str:
|
161
|
-
|
162
|
-
|
231
|
+
eltype = (
|
232
|
+
"array<1>{" + utils.get_element_type(self) + "}"
|
233
|
+
if self.ndim == 2
|
234
|
+
else self.flattened_data.form_datatype()
|
235
|
+
)
|
236
|
+
return "array<1>{" + eltype + "}"
|
163
237
|
|
164
238
|
def __len__(self) -> int:
|
165
|
-
"""Return the number of stored vectors."""
|
239
|
+
"""Return the number of stored vectors along the first axis (0)."""
|
166
240
|
return len(self.cumulative_length)
|
167
241
|
|
168
242
|
def __eq__(self, other: VectorOfVectors) -> bool:
|
169
243
|
if isinstance(other, VectorOfVectors):
|
244
|
+
if self.ndim == 2 and len(self.cumulative_length) != 0:
|
245
|
+
fldata_eq = np.array_equal(
|
246
|
+
self.flattened_data[: self.cumulative_length[-1]],
|
247
|
+
other.flattened_data[: other.cumulative_length[-1]],
|
248
|
+
)
|
249
|
+
else:
|
250
|
+
fldata_eq = self.flattened_data == other.flattened_data
|
251
|
+
|
170
252
|
return (
|
171
253
|
self.cumulative_length == other.cumulative_length
|
172
|
-
and
|
173
|
-
len(self.cumulative_length) == 0
|
174
|
-
or np.all(
|
175
|
-
self.flattened_data[: self.cumulative_length[-1]]
|
176
|
-
== other.flattened_data[: other.cumulative_length[-1]]
|
177
|
-
)
|
178
|
-
)
|
254
|
+
and fldata_eq
|
179
255
|
and self.dtype == other.dtype
|
180
256
|
and self.attrs == other.attrs
|
181
257
|
)
|
182
258
|
|
183
259
|
return False
|
184
260
|
|
185
|
-
def __getitem__(self, i: int) ->
|
186
|
-
"""Return vector at index `i
|
187
|
-
|
188
|
-
|
189
|
-
|
261
|
+
def __getitem__(self, i: int) -> NDArray:
|
262
|
+
"""Return a view of the vector at index `i` along the first axis."""
|
263
|
+
if self.ndim == 2:
|
264
|
+
stop = self.cumulative_length[i]
|
265
|
+
if i in (0, -len(self)):
|
266
|
+
return self.flattened_data[0:stop]
|
267
|
+
|
268
|
+
return self.flattened_data[self.cumulative_length[i - 1] : stop]
|
190
269
|
|
191
|
-
|
270
|
+
raise NotImplementedError
|
192
271
|
|
193
272
|
def __setitem__(self, i: int, new: NDArray) -> None:
|
194
|
-
self.
|
273
|
+
if self.ndim == 2:
|
274
|
+
self.__getitem__(i)[:] = new
|
275
|
+
else:
|
276
|
+
raise NotImplementedError
|
195
277
|
|
196
278
|
def resize(self, new_size: int) -> None:
|
197
279
|
"""Resize vector along the first axis.
|
@@ -220,24 +302,26 @@ class VectorOfVectors(LGDO):
|
|
220
302
|
[3],
|
221
303
|
]
|
222
304
|
"""
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
305
|
+
if self.ndim == 2:
|
306
|
+
vidx = self.cumulative_length
|
307
|
+
old_s = len(self)
|
308
|
+
dlen = new_size - old_s
|
309
|
+
csum = vidx[-1] if len(self) > 0 else 0
|
310
|
+
|
311
|
+
# first resize the cumulative length
|
312
|
+
self.cumulative_length.resize(new_size)
|
313
|
+
|
314
|
+
# if new_size > size, new elements are filled with zeros, let's fix
|
315
|
+
# that
|
316
|
+
if dlen > 0:
|
317
|
+
self.cumulative_length[old_s:] = csum
|
318
|
+
|
319
|
+
# then resize the data array
|
320
|
+
# if dlen > 0 this has no effect
|
321
|
+
if len(self.cumulative_length) > 0:
|
322
|
+
self.flattened_data.resize(self.cumulative_length[-1])
|
323
|
+
else:
|
324
|
+
raise NotImplementedError
|
241
325
|
|
242
326
|
def append(self, new: NDArray) -> None:
|
243
327
|
"""Append a 1D vector `new` at the end.
|
@@ -252,15 +336,20 @@ class VectorOfVectors(LGDO):
|
|
252
336
|
[8 9],
|
253
337
|
]
|
254
338
|
"""
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
339
|
+
if self.ndim == 2:
|
340
|
+
# first extend cumulative_length by +1
|
341
|
+
self.cumulative_length.resize(len(self) + 1)
|
342
|
+
# set it at the right value
|
343
|
+
newlen = (
|
344
|
+
self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
|
345
|
+
)
|
346
|
+
self.cumulative_length[-1] = newlen
|
347
|
+
# then resize flattened_data to accommodate the new vector
|
348
|
+
self.flattened_data.resize(len(self.flattened_data) + len(new))
|
349
|
+
# finally set it
|
350
|
+
self[-1] = new
|
351
|
+
else:
|
352
|
+
raise NotImplementedError
|
264
353
|
|
265
354
|
def insert(self, i: int, new: NDArray) -> None:
|
266
355
|
"""Insert a vector at index `i`.
|
@@ -283,17 +372,20 @@ class VectorOfVectors(LGDO):
|
|
283
372
|
This method involves a significant amount of memory re-allocation and
|
284
373
|
is expected to perform poorly on large vectors.
|
285
374
|
"""
|
286
|
-
if
|
287
|
-
|
288
|
-
|
375
|
+
if self.ndim == 2:
|
376
|
+
if i >= len(self):
|
377
|
+
msg = f"index {i} is out of bounds for vector owith size {len(self)}"
|
378
|
+
raise IndexError(msg)
|
289
379
|
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
380
|
+
self.flattened_data = Array(
|
381
|
+
np.insert(self.flattened_data, self.cumulative_length[i - 1], new)
|
382
|
+
)
|
383
|
+
self.cumulative_length = Array(
|
384
|
+
np.insert(self.cumulative_length, i, self.cumulative_length[i - 1])
|
385
|
+
)
|
386
|
+
self.cumulative_length[i:] += np.uint32(len(new))
|
387
|
+
else:
|
388
|
+
raise NotImplementedError
|
297
389
|
|
298
390
|
def replace(self, i: int, new: NDArray) -> None:
|
299
391
|
"""Replace the vector at index `i` with `new`.
|
@@ -316,36 +408,41 @@ class VectorOfVectors(LGDO):
|
|
316
408
|
This method involves a significant amount of memory re-allocation and
|
317
409
|
is expected to perform poorly on large vectors.
|
318
410
|
"""
|
319
|
-
if
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
411
|
+
if self.ndim == 2:
|
412
|
+
if i >= len(self):
|
413
|
+
msg = f"index {i} is out of bounds for vector with size {len(self)}"
|
414
|
+
raise IndexError(msg)
|
415
|
+
|
416
|
+
vidx = self.cumulative_length
|
417
|
+
dlen = len(new) - len(self[i])
|
418
|
+
|
419
|
+
if dlen == 0:
|
420
|
+
# don't waste resources
|
421
|
+
self[i] = new
|
422
|
+
elif dlen < 0:
|
423
|
+
start = vidx[i - 1]
|
424
|
+
stop = start + len(new)
|
425
|
+
# set the already allocated indices
|
426
|
+
self.flattened_data[start:stop] = new
|
427
|
+
# then delete the extra indices
|
428
|
+
self.flattened_data = Array(
|
429
|
+
np.delete(self.flattened_data, np.s_[stop : vidx[i]])
|
430
|
+
)
|
431
|
+
else:
|
432
|
+
# set the already allocated indices
|
433
|
+
self.flattened_data[vidx[i - 1] : vidx[i]] = new[: len(self[i])]
|
434
|
+
# then insert the remaining
|
435
|
+
self.flattened_data = Array(
|
436
|
+
np.insert(self.flattened_data, vidx[i], new[len(self[i]) :])
|
437
|
+
)
|
345
438
|
|
346
|
-
|
439
|
+
vidx[i:] = vidx[i:] + dlen
|
440
|
+
else:
|
441
|
+
raise NotImplementedError
|
347
442
|
|
348
|
-
def _set_vector_unsafe(
|
443
|
+
def _set_vector_unsafe(
|
444
|
+
self, i: int, vec: NDArray, lens: ArrayLike | None = None
|
445
|
+
) -> None:
|
349
446
|
r"""Insert vector `vec` at position `i`.
|
350
447
|
|
351
448
|
Assumes that ``j = self.cumulative_length[i-1]`` is the index (in
|
@@ -357,9 +454,9 @@ class VectorOfVectors(LGDO):
|
|
357
454
|
behavior. This method is typically used for fast sequential fill of a
|
358
455
|
pre-allocated vector of vectors.
|
359
456
|
|
360
|
-
If vec is 1D array and lens is None
|
361
|
-
is 2D, require lens to be included, and fill each array only up
|
362
|
-
lengths in lens
|
457
|
+
If `vec` is a 1D array and `lens` is ``None``, set using full array. If
|
458
|
+
`vec` is 2D, require `lens` to be included, and fill each array only up
|
459
|
+
to lengths in `lens`.
|
363
460
|
|
364
461
|
Danger
|
365
462
|
------
|
@@ -370,39 +467,47 @@ class VectorOfVectors(LGDO):
|
|
370
467
|
--------
|
371
468
|
append, replace, insert
|
372
469
|
"""
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
if
|
377
|
-
lens = np.array([vec.shape[1]], dtype="u4")
|
378
|
-
if not isinstance(lens, np.ndarray):
|
379
|
-
lens = np.array([lens], dtype="u4")
|
380
|
-
cum_lens = start + lens.cumsum()
|
381
|
-
_nb_fill(vec, lens, self.flattened_data.nda[start : cum_lens[-1]])
|
382
|
-
self.cumulative_length[i : i + len(lens)] = cum_lens
|
470
|
+
if self.ndim == 2:
|
471
|
+
# check if current vector is empty and get the start index in
|
472
|
+
# flattened_data
|
473
|
+
start = 0 if i == 0 else self.cumulative_length[i - 1]
|
383
474
|
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
yield self.flattened_data[self.cumulative_length[j - 1] : stop]
|
475
|
+
# if the new element is 1D, convert to dummy 2D
|
476
|
+
if len(vec.shape) == 1:
|
477
|
+
vec = np.expand_dims(vec, axis=0)
|
478
|
+
if lens is None:
|
479
|
+
lens = np.array([vec.shape[1]], dtype="u4")
|
390
480
|
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
for vec in self:
|
395
|
-
if pos != 0:
|
396
|
-
string += " "
|
481
|
+
# this in case lens is 0D, convert to 1D
|
482
|
+
if not isinstance(lens, np.ndarray):
|
483
|
+
lens = np.array([lens], dtype="u4")
|
397
484
|
|
398
|
-
|
485
|
+
# calculate stop index in flattened_data
|
486
|
+
cum_lens = start + lens.cumsum()
|
399
487
|
|
400
|
-
|
401
|
-
|
488
|
+
# fill with fast vectorized routine
|
489
|
+
vovutils._nb_fill(vec, lens, self.flattened_data.nda[start : cum_lens[-1]])
|
402
490
|
|
403
|
-
|
491
|
+
# add new vector(s) length to cumulative_length
|
492
|
+
self.cumulative_length[i : i + len(lens)] = cum_lens
|
493
|
+
else:
|
494
|
+
raise NotImplementedError
|
404
495
|
|
405
|
-
|
496
|
+
def __iter__(self) -> Iterator[NDArray]:
|
497
|
+
if self.ndim == 2:
|
498
|
+
for j, stop in enumerate(self.cumulative_length):
|
499
|
+
if j == 0:
|
500
|
+
yield self.flattened_data[0:stop]
|
501
|
+
else:
|
502
|
+
yield self.flattened_data[self.cumulative_length[j - 1] : stop]
|
503
|
+
else:
|
504
|
+
raise NotImplementedError
|
505
|
+
|
506
|
+
def __str__(self) -> str:
|
507
|
+
string = self.view_as("ak").show(stream=None)
|
508
|
+
|
509
|
+
string = string.strip().removesuffix("]")
|
510
|
+
string += "\n]"
|
406
511
|
|
407
512
|
tmp_attrs = self.attrs.copy()
|
408
513
|
tmp_attrs.pop("datatype")
|
@@ -457,19 +562,22 @@ class VectorOfVectors(LGDO):
|
|
457
562
|
original vector of vectors. The type `fill_val` must be a
|
458
563
|
compatible one.
|
459
564
|
"""
|
460
|
-
|
565
|
+
if self.ndim == 2:
|
566
|
+
ak_arr = self.view_as("ak")
|
461
567
|
|
462
|
-
|
463
|
-
|
568
|
+
if max_len is None:
|
569
|
+
max_len = int(ak.max(ak.count(ak_arr, axis=-1)))
|
464
570
|
|
465
|
-
|
466
|
-
|
467
|
-
|
571
|
+
nda = ak.fill_none(
|
572
|
+
ak.pad_none(ak_arr, max_len, clip=True), fill_val
|
573
|
+
).to_numpy(allow_missing=False)
|
468
574
|
|
469
|
-
|
470
|
-
|
575
|
+
if preserve_dtype:
|
576
|
+
nda = nda.astype(self.flattened_data.dtype, copy=False)
|
471
577
|
|
472
|
-
|
578
|
+
return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs())
|
579
|
+
|
580
|
+
raise NotImplementedError
|
473
581
|
|
474
582
|
def view_as(
|
475
583
|
self,
|
@@ -519,6 +627,8 @@ class VectorOfVectors(LGDO):
|
|
519
627
|
msg = "Pint does not support Awkward yet, you must view the data with_units=False"
|
520
628
|
raise ValueError(msg)
|
521
629
|
|
630
|
+
# see https://github.com/scikit-hep/awkward/discussions/2848
|
631
|
+
|
522
632
|
# cannot avoid making a copy here. we should add the leading 0 to
|
523
633
|
# cumulative_length inside VectorOfVectors at some point in the
|
524
634
|
# future
|
@@ -528,9 +638,15 @@ class VectorOfVectors(LGDO):
|
|
528
638
|
offsets[1:] = self.cumulative_length
|
529
639
|
offsets[0] = 0
|
530
640
|
|
641
|
+
content = (
|
642
|
+
ak.contents.NumpyArray(self.flattened_data.nda)
|
643
|
+
if self.ndim == 2
|
644
|
+
else self.flattened_data.view_as(library, with_units=with_units).layout
|
645
|
+
)
|
646
|
+
|
531
647
|
layout = ak.contents.ListOffsetArray(
|
532
648
|
offsets=ak.index.Index(offsets),
|
533
|
-
content=
|
649
|
+
content=content,
|
534
650
|
)
|
535
651
|
return ak.Array(layout)
|
536
652
|
|
@@ -551,263 +667,3 @@ class VectorOfVectors(LGDO):
|
|
551
667
|
|
552
668
|
msg = f"{library} is not a supported third-party format."
|
553
669
|
raise ValueError(msg)
|
554
|
-
|
555
|
-
|
556
|
-
def build_cl(
|
557
|
-
sorted_array_in: NDArray, cumulative_length_out: NDArray = None
|
558
|
-
) -> NDArray:
|
559
|
-
"""Build a cumulative length array from an array of sorted data.
|
560
|
-
|
561
|
-
Examples
|
562
|
-
--------
|
563
|
-
>>> build_cl(np.array([3, 3, 3, 4]))
|
564
|
-
array([3., 4.])
|
565
|
-
|
566
|
-
For a `sorted_array_in` of indices, this is the inverse of
|
567
|
-
:func:`.explode_cl`, in the sense that doing
|
568
|
-
``build_cl(explode_cl(cumulative_length))`` would recover the original
|
569
|
-
`cumulative_length`.
|
570
|
-
|
571
|
-
Parameters
|
572
|
-
----------
|
573
|
-
sorted_array_in
|
574
|
-
array of data already sorted; each N matching contiguous entries will
|
575
|
-
be converted into a new row of `cumulative_length_out`.
|
576
|
-
cumulative_length_out
|
577
|
-
a pre-allocated array for the output `cumulative_length`. It will
|
578
|
-
always have length <= `sorted_array_in`, so giving them the same length
|
579
|
-
is safe if there is not a better guess.
|
580
|
-
|
581
|
-
Returns
|
582
|
-
-------
|
583
|
-
cumulative_length_out
|
584
|
-
the output cumulative length array. If the user provides a
|
585
|
-
`cumulative_length_out` that is too long, this return value is sliced
|
586
|
-
to contain only the used portion of the allocated memory.
|
587
|
-
"""
|
588
|
-
if len(sorted_array_in) == 0:
|
589
|
-
return None
|
590
|
-
sorted_array_in = np.asarray(sorted_array_in)
|
591
|
-
if cumulative_length_out is None:
|
592
|
-
cumulative_length_out = np.zeros(len(sorted_array_in), dtype=np.uint64)
|
593
|
-
else:
|
594
|
-
cumulative_length_out.fill(0)
|
595
|
-
if len(cumulative_length_out) == 0 and len(sorted_array_in) > 0:
|
596
|
-
msg = "cumulative_length_out too short ({len(cumulative_length_out)})"
|
597
|
-
raise ValueError(msg)
|
598
|
-
return _nb_build_cl(sorted_array_in, cumulative_length_out)
|
599
|
-
|
600
|
-
|
601
|
-
@numba.njit(**nb_kwargs)
|
602
|
-
def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> NDArray:
|
603
|
-
"""numbified inner loop for build_cl"""
|
604
|
-
ii = 0
|
605
|
-
last_val = sorted_array_in[0]
|
606
|
-
for val in sorted_array_in:
|
607
|
-
if val != last_val:
|
608
|
-
ii += 1
|
609
|
-
cumulative_length_out[ii] = cumulative_length_out[ii - 1]
|
610
|
-
if ii >= len(cumulative_length_out):
|
611
|
-
msg = "cumulative_length_out too short"
|
612
|
-
raise RuntimeError(msg)
|
613
|
-
last_val = val
|
614
|
-
cumulative_length_out[ii] += 1
|
615
|
-
ii += 1
|
616
|
-
return cumulative_length_out[:ii]
|
617
|
-
|
618
|
-
|
619
|
-
@numba.guvectorize(
|
620
|
-
[
|
621
|
-
f"{data_type}[:,:],{size_type}[:],{data_type}[:]"
|
622
|
-
for data_type in [
|
623
|
-
"b1",
|
624
|
-
"i1",
|
625
|
-
"i2",
|
626
|
-
"i4",
|
627
|
-
"i8",
|
628
|
-
"u1",
|
629
|
-
"u2",
|
630
|
-
"u4",
|
631
|
-
"u8",
|
632
|
-
"f4",
|
633
|
-
"f8",
|
634
|
-
"c8",
|
635
|
-
"c16",
|
636
|
-
]
|
637
|
-
for size_type in ["i4", "i8", "u4", "u8"]
|
638
|
-
],
|
639
|
-
"(l,m),(l),(n)",
|
640
|
-
**nb_kwargs,
|
641
|
-
)
|
642
|
-
def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
|
643
|
-
"""Vectorized function to fill flattened array from array of arrays and
|
644
|
-
lengths. Values in aoa_in past lengths will not be copied.
|
645
|
-
|
646
|
-
Parameters
|
647
|
-
----------
|
648
|
-
aoa_in
|
649
|
-
array of arrays containing values to be copied
|
650
|
-
len_in
|
651
|
-
array of vector lengths for each row of aoa_in
|
652
|
-
flattened_array_out
|
653
|
-
flattened array to copy values into. Must be longer than sum of
|
654
|
-
lengths in len_in
|
655
|
-
"""
|
656
|
-
|
657
|
-
if len(flattened_array_out) < len_in.sum():
|
658
|
-
msg = "flattened array not large enough to hold values"
|
659
|
-
raise ValueError(msg)
|
660
|
-
|
661
|
-
start = 0
|
662
|
-
for i, ll in enumerate(len_in):
|
663
|
-
stop = start + ll
|
664
|
-
flattened_array_out[start:stop] = aoa_in[i, :ll]
|
665
|
-
start = stop
|
666
|
-
|
667
|
-
|
668
|
-
def explode_cl(cumulative_length: NDArray, array_out: NDArray = None) -> NDArray:
|
669
|
-
"""Explode a `cumulative_length` array.
|
670
|
-
|
671
|
-
Examples
|
672
|
-
--------
|
673
|
-
>>> explode_cl(np.array([2, 3]))
|
674
|
-
array([0., 0., 1.])
|
675
|
-
|
676
|
-
This is the inverse of :func:`.build_cl`, in the sense that doing
|
677
|
-
``build_cl(explode_cl(cumulative_length))`` would recover the original
|
678
|
-
`cumulative_length`.
|
679
|
-
|
680
|
-
Parameters
|
681
|
-
----------
|
682
|
-
cumulative_length
|
683
|
-
the cumulative length array to be exploded.
|
684
|
-
array_out
|
685
|
-
a pre-allocated array to hold the exploded cumulative length array.
|
686
|
-
The length should be equal to ``cumulative_length[-1]``.
|
687
|
-
|
688
|
-
Returns
|
689
|
-
-------
|
690
|
-
array_out
|
691
|
-
the exploded cumulative length array.
|
692
|
-
"""
|
693
|
-
cumulative_length = np.asarray(cumulative_length)
|
694
|
-
out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
|
695
|
-
if array_out is None:
|
696
|
-
array_out = np.empty(int(out_len), dtype=np.uint64)
|
697
|
-
if len(array_out) != out_len:
|
698
|
-
msg = f"bad lengths: cl[-1] ({cumulative_length[-1]}) != out ({len(array_out)})"
|
699
|
-
raise ValueError(msg)
|
700
|
-
return _nb_explode_cl(cumulative_length, array_out)
|
701
|
-
|
702
|
-
|
703
|
-
@numba.njit(**nb_kwargs)
|
704
|
-
def _nb_explode_cl(cumulative_length: NDArray, array_out: NDArray) -> NDArray:
|
705
|
-
"""numbified inner loop for explode_cl"""
|
706
|
-
out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
|
707
|
-
if len(array_out) != out_len:
|
708
|
-
msg = "bad lengths"
|
709
|
-
raise ValueError(msg)
|
710
|
-
start = 0
|
711
|
-
for ii in range(len(cumulative_length)):
|
712
|
-
nn = int(cumulative_length[ii] - start)
|
713
|
-
for jj in range(nn):
|
714
|
-
array_out[int(start + jj)] = ii
|
715
|
-
start = cumulative_length[ii]
|
716
|
-
return array_out
|
717
|
-
|
718
|
-
|
719
|
-
def explode(
|
720
|
-
cumulative_length: NDArray, array_in: NDArray, array_out: NDArray = None
|
721
|
-
) -> NDArray:
|
722
|
-
"""Explode a data array using a `cumulative_length` array.
|
723
|
-
|
724
|
-
This is identical to :func:`.explode_cl`, except `array_in` gets exploded
|
725
|
-
instead of `cumulative_length`.
|
726
|
-
|
727
|
-
Examples
|
728
|
-
--------
|
729
|
-
>>> explode(np.array([2, 3]), np.array([3, 4]))
|
730
|
-
array([3., 3., 4.])
|
731
|
-
|
732
|
-
Parameters
|
733
|
-
----------
|
734
|
-
cumulative_length
|
735
|
-
the cumulative length array to use for exploding.
|
736
|
-
array_in
|
737
|
-
the data to be exploded. Must have same length as `cumulative_length`.
|
738
|
-
array_out
|
739
|
-
a pre-allocated array to hold the exploded data. The length should be
|
740
|
-
equal to ``cumulative_length[-1]``.
|
741
|
-
|
742
|
-
Returns
|
743
|
-
-------
|
744
|
-
array_out
|
745
|
-
the exploded cumulative length array.
|
746
|
-
"""
|
747
|
-
cumulative_length = np.asarray(cumulative_length)
|
748
|
-
array_in = np.asarray(array_in)
|
749
|
-
out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
|
750
|
-
if array_out is None:
|
751
|
-
array_out = np.empty(out_len, dtype=array_in.dtype)
|
752
|
-
if len(cumulative_length) != len(array_in) or len(array_out) != out_len:
|
753
|
-
msg = (
|
754
|
-
f"bad lengths: cl ({len(cumulative_length)}) != in ({len(array_in)}) "
|
755
|
-
f"and cl[-1] ({cumulative_length[-1]}) != out ({len(array_out)})"
|
756
|
-
)
|
757
|
-
raise ValueError(msg)
|
758
|
-
return nb_explode(cumulative_length, array_in, array_out)
|
759
|
-
|
760
|
-
|
761
|
-
@numba.njit(**nb_kwargs)
|
762
|
-
def nb_explode(
|
763
|
-
cumulative_length: NDArray, array_in: NDArray, array_out: NDArray
|
764
|
-
) -> NDArray:
|
765
|
-
"""Numbified inner loop for :func:`.explode`."""
|
766
|
-
out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
|
767
|
-
if len(cumulative_length) != len(array_in) or len(array_out) != out_len:
|
768
|
-
msg = "bad lengths"
|
769
|
-
raise ValueError(msg)
|
770
|
-
ii = 0
|
771
|
-
for jj in range(len(array_out)):
|
772
|
-
while ii < len(cumulative_length) and jj >= cumulative_length[ii]:
|
773
|
-
ii += 1
|
774
|
-
array_out[jj] = array_in[ii]
|
775
|
-
return array_out
|
776
|
-
|
777
|
-
|
778
|
-
def explode_arrays(
|
779
|
-
cumulative_length: Array,
|
780
|
-
arrays: list[NDArray],
|
781
|
-
arrays_out: list[NDArray] | None = None,
|
782
|
-
) -> list:
|
783
|
-
"""Explode a set of arrays using a `cumulative_length` array.
|
784
|
-
|
785
|
-
Parameters
|
786
|
-
----------
|
787
|
-
cumulative_length
|
788
|
-
the cumulative length array to use for exploding.
|
789
|
-
arrays
|
790
|
-
the data arrays to be exploded. Each array must have same length as
|
791
|
-
`cumulative_length`.
|
792
|
-
arrays_out
|
793
|
-
a list of pre-allocated arrays to hold the exploded data. The length of
|
794
|
-
the list should be equal to the length of `arrays`, and each entry in
|
795
|
-
arrays_out should have length ``cumulative_length[-1]``. If not
|
796
|
-
provided, output arrays are allocated for the user.
|
797
|
-
|
798
|
-
Returns
|
799
|
-
-------
|
800
|
-
arrays_out
|
801
|
-
the list of exploded cumulative length arrays.
|
802
|
-
"""
|
803
|
-
cumulative_length = np.asarray(cumulative_length)
|
804
|
-
for ii in range(len(arrays)):
|
805
|
-
arrays[ii] = np.asarray(arrays[ii])
|
806
|
-
out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
|
807
|
-
if arrays_out is None:
|
808
|
-
arrays_out = []
|
809
|
-
for array in arrays:
|
810
|
-
arrays_out.append(np.empty(out_len, dtype=array.dtype))
|
811
|
-
for ii in range(len(arrays)):
|
812
|
-
explode(cumulative_length, arrays[ii], arrays_out[ii])
|
813
|
-
return arrays_out
|