legend-pydataobj 1.5.1__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/METADATA +1 -1
  2. legend_pydataobj-1.6.1.dist-info/RECORD +54 -0
  3. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/WHEEL +1 -1
  4. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/entry_points.txt +1 -0
  5. lgdo/__init__.py +7 -4
  6. lgdo/_version.py +2 -2
  7. lgdo/cli.py +237 -12
  8. lgdo/compression/__init__.py +1 -0
  9. lgdo/lh5/__init__.py +9 -1
  10. lgdo/lh5/_serializers/__init__.py +43 -0
  11. lgdo/lh5/_serializers/read/__init__.py +0 -0
  12. lgdo/lh5/_serializers/read/array.py +34 -0
  13. lgdo/lh5/_serializers/read/composite.py +405 -0
  14. lgdo/lh5/_serializers/read/encoded.py +129 -0
  15. lgdo/lh5/_serializers/read/ndarray.py +104 -0
  16. lgdo/lh5/_serializers/read/scalar.py +34 -0
  17. lgdo/lh5/_serializers/read/utils.py +12 -0
  18. lgdo/lh5/_serializers/read/vector_of_vectors.py +201 -0
  19. lgdo/lh5/_serializers/write/__init__.py +0 -0
  20. lgdo/lh5/_serializers/write/array.py +92 -0
  21. lgdo/lh5/_serializers/write/composite.py +259 -0
  22. lgdo/lh5/_serializers/write/scalar.py +23 -0
  23. lgdo/lh5/_serializers/write/vector_of_vectors.py +95 -0
  24. lgdo/lh5/core.py +272 -0
  25. lgdo/lh5/datatype.py +46 -0
  26. lgdo/lh5/exceptions.py +34 -0
  27. lgdo/lh5/iterator.py +1 -1
  28. lgdo/lh5/store.py +69 -1160
  29. lgdo/lh5/tools.py +27 -53
  30. lgdo/lh5/utils.py +130 -27
  31. lgdo/lh5_store.py +11 -2
  32. lgdo/logging.py +1 -0
  33. lgdo/types/__init__.py +1 -0
  34. lgdo/types/array.py +1 -0
  35. lgdo/types/arrayofequalsizedarrays.py +1 -0
  36. lgdo/types/encoded.py +3 -8
  37. lgdo/types/fixedsizearray.py +1 -0
  38. lgdo/types/struct.py +1 -0
  39. lgdo/types/table.py +46 -5
  40. lgdo/types/vectorofvectors.py +314 -458
  41. lgdo/types/vovutils.py +320 -0
  42. lgdo/types/waveformtable.py +1 -0
  43. lgdo/utils.py +1 -32
  44. legend_pydataobj-1.5.1.dist-info/RECORD +0 -36
  45. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/LICENSE +0 -0
  46. {legend_pydataobj-1.5.1.dist-info → legend_pydataobj-1.6.1.dist-info}/top_level.txt +0 -0
@@ -2,23 +2,22 @@
2
2
  Implements a LEGEND Data Object representing a variable-length array of
3
3
  variable-length arrays and corresponding utilities.
4
4
  """
5
+
5
6
  from __future__ import annotations
6
7
 
7
- import itertools
8
8
  import logging
9
- from collections.abc import Iterator
9
+ from collections.abc import Iterator, Mapping, Sequence
10
10
  from typing import Any
11
11
 
12
12
  import awkward as ak
13
13
  import awkward_pandas as akpd
14
- import numba
15
14
  import numpy as np
16
15
  import pandas as pd
17
- from numpy.typing import DTypeLike, NDArray
16
+ from numpy.typing import ArrayLike, DTypeLike, NDArray
18
17
 
19
18
  from .. import utils
20
- from ..utils import numba_defaults_kwargs as nb_kwargs
21
19
  from . import arrayofequalsizedarrays as aoesa
20
+ from . import vovutils
22
21
  from .array import Array
23
22
  from .lgdo import LGDO
24
23
 
@@ -26,30 +25,56 @@ log = logging.getLogger(__name__)
26
25
 
27
26
 
28
27
  class VectorOfVectors(LGDO):
29
- """A variable-length array of variable-length arrays.
28
+ """An n-dimensional variable-length 1D array of variable-length 1D arrays.
29
+
30
+ If the vector is 2-dimensional, the internal representation is as two NumPy
31
+ arrays, one to store the flattened data contiguously
32
+ (:attr:`flattened_data`) and one to store the cumulative sum of lengths of
33
+ each vector (:attr:`cumulative_length`). When the dimension is more than 2,
34
+ :attr:`flattened_data` is a :class:`VectorOfVectors` itself.
30
35
 
31
- For now only a 1D vector of 1D vectors is supported. Internal representation
32
- is as two NumPy arrays, one to store the flattened data contiguously and one
33
- to store the cumulative sum of lengths of each vector.
36
+ Examples
37
+ --------
38
+ >>> from lgdo import VectorOfVectors
39
+ >>> data = VectorOfVectors(
40
+ ... [[[1, 2], [3, 4, 5]], [[2], [4, 8, 9, 7]], [[5, 3, 1]]],
41
+ ... attrs={"units": "m"}
42
+ ... )
43
+ >>> print(data)
44
+ [[[1, 2], [3, 4, 5]],
45
+ [[2], [4, 8, 9, 7]],
46
+ [[5, 3, 1]]
47
+ ] with attrs={'units': 'm'}
48
+ >>> data.view_as("ak")
49
+ <Array [[[1, 2], [3, 4, 5]], ..., [[5, ..., 1]]] type='3 * var * var * int64'>
50
+
51
+ Note
52
+ ----
53
+ Many class methods are currently implemented only for 2D vectors and will
54
+ raise an exception on higher dimensional data.
34
55
  """
35
56
 
36
57
  def __init__(
37
58
  self,
38
- array: ak.Array | list[list[int | float]] = None,
39
- flattened_data: Array | NDArray = None,
40
- cumulative_length: Array | NDArray = None,
41
- shape_guess: tuple[int, int] | None = None,
42
- dtype: DTypeLike = None,
59
+ data: ArrayLike | None = None,
60
+ flattened_data: ArrayLike | None = None,
61
+ cumulative_length: ArrayLike | VectorOfVectors | None = None,
62
+ shape_guess: Sequence[int, ...] | None = None,
63
+ dtype: DTypeLike | None = None,
43
64
  fill_val: int | float | None = None,
44
- attrs: dict[str, Any] | None = None,
65
+ attrs: Mapping[str, Any] | None = None,
45
66
  ) -> None:
46
67
  """
47
68
  Parameters
48
69
  ----------
49
- array
50
- create a ``VectorOfVectors`` out of a Python list of lists or an
51
- :class:`ak.Array`. Takes priority over `flattened_data` and
52
- `cumulative_length`.
70
+ data
71
+ Any array-like structure accepted by the :class:`ak.Array`
72
+ constructor, with the exception that elements cannot be of type
73
+ ``OptionType``, ``UnionType`` or ``RecordType``. Takes priority
74
+ over `flattened_data` and `cumulative_length`. The serialization of
75
+ the :class:`ak.Array` is performed through :func:`ak.to_buffers`.
76
+ Since the latter returns non-data-owning NumPy arrays, which would
77
+ prevent later modifications like resizing, a copy is performed.
53
78
  flattened_data
54
79
  if not ``None``, used as the internal array for
55
80
  `self.flattened_data`. Otherwise, an internal `flattened_data` is
@@ -74,124 +99,181 @@ class VectorOfVectors(LGDO):
74
99
  attrs
75
100
  a set of user attributes to be carried along with this LGDO.
76
101
  """
77
- if array is not None:
78
- if isinstance(array, ak.Array):
79
- if array.ndim != 2:
80
- msg = (
81
- "cannot initialize a VectorOfVectors with "
82
- f"{array.ndim}-dimensional data"
83
- )
84
- raise ValueError(msg)
85
-
86
- form, length, container = ak.to_buffers(array)
87
-
88
- self.__init__(
89
- flattened_data=container["node1-data"],
90
- cumulative_length=container["node0-offsets"][1:],
102
+ # sanitize
103
+ if cumulative_length is not None and not isinstance(cumulative_length, Array):
104
+ cumulative_length = Array(cumulative_length)
105
+ if flattened_data is not None and not isinstance(
106
+ flattened_data, (Array, VectorOfVectors)
107
+ ):
108
+ flattened_data = Array(flattened_data)
109
+
110
+ if data is not None:
111
+ if not isinstance(data, ak.Array):
112
+ data = ak.Array(data)
113
+
114
+ if data.ndim < 2:
115
+ msg = (
116
+ "cannot initialize a VectorOfVectors with "
117
+ f"{data.ndim}-dimensional data"
91
118
  )
119
+ raise ValueError(msg)
92
120
 
93
- else:
94
- cl_nda = np.cumsum([len(ll) for ll in array])
95
- if dtype is None:
96
- if len(cl_nda) == 0 or cl_nda[-1] == 0:
97
- msg = "array can't be empty with dtype=None!"
98
- raise ValueError(msg)
121
+ # make sure it's not a record array
122
+ if not vovutils._ak_is_valid(data):
123
+ msg = "input array type is not supported!"
124
+ raise ValueError(msg)
99
125
 
100
- # Set dtype from the first element in the list
101
- # Find it efficiently, allowing for zero-length lists as some of the entries
102
- first_element = next(itertools.chain.from_iterable(array))
103
- dtype = type(first_element)
126
+ # array might be non-jagged! ('container' will hold a ndim NumPy array)
127
+ if not vovutils._ak_is_jagged(data):
128
+ data = ak.from_regular(data, axis=None)
129
+
130
+ # ak.to_buffers helps in de-serialization
131
+ # NOTE: ak.to_packed() needed?
132
+ form, length, container = ak.to_buffers(ak.to_packed(data))
133
+
134
+ # NOTE: node#-data is not even in the dict if the awkward array is empty
135
+ # NOTE: if the data arg was a numpy array, to_buffers() preserves
136
+ # the original dtype
137
+ # FIXME: have to copy the buffers, otherwise self will not own the
138
+ # data and self.resize() will fail. Is it possible to avoid this?
139
+ flattened_data = np.copy(
140
+ container.pop(f"node{data.ndim-1}-data", np.empty(0, dtype=dtype))
141
+ )
104
142
 
105
- self.dtype = np.dtype(dtype)
106
- self.cumulative_length = Array(cl_nda)
107
- self.flattened_data = Array(
108
- np.fromiter(itertools.chain.from_iterable(array), dtype=self.dtype)
109
- )
143
+ # if user-provided dtype is different than dtype from Awkward, cast
144
+ # NOTE: makes a copy only if needed
145
+ flattened_data = np.asarray(flattened_data, dtype=dtype)
146
+
147
+ # start from innermost VoV and build nested structure
148
+ for i in range(data.ndim - 2, -1, -1):
149
+ # NOTE: remember, omit the leading 0 from ak.Array offsets
150
+ cumulative_length = np.copy(container[f"node{i}-offsets"][1:])
151
+
152
+ if i != 0:
153
+ # at the beginning of the loop: initialize innermost
154
+ # flattened_data and replace current flattened_data
155
+ # reference. in the following iterations flattened_data is
156
+ # a VectorOfVectors
157
+ flattened_data = VectorOfVectors(
158
+ flattened_data=flattened_data,
159
+ cumulative_length=cumulative_length,
160
+ )
161
+
162
+ else:
163
+ # at end we need to initialize self with the latest flattened_data
164
+ self.__init__(
165
+ flattened_data=flattened_data,
166
+ cumulative_length=cumulative_length,
167
+ )
110
168
 
111
169
  else:
170
+ self.flattened_data = None
171
+ self.cumulative_length = None
172
+
173
+ # let's first setup cumulative_length...
112
174
  if cumulative_length is None:
113
- if shape_guess is None:
114
- # just make an empty vector
115
- self.cumulative_length = Array(np.empty((0,), dtype="uint32"))
116
175
  # initialize based on shape_guess
117
- elif shape_guess[1] <= 0:
118
- self.cumulative_length = Array(
119
- shape=(shape_guess[0],), dtype="uint32", fill_val=0
176
+ if shape_guess is None:
177
+ # just make an empty 2D vector
178
+ shape_guess = (0, 0)
179
+
180
+ # sanity check
181
+ if len(shape_guess) < 2:
182
+ msg = "shape_guess must be a sequence of 2 integers or more"
183
+ raise ValueError(msg)
184
+
185
+ # let's Awkward do the job here, we're lazy
186
+ if fill_val is not None:
187
+ self.__init__(
188
+ np.full(shape=shape_guess, fill_value=fill_val, dtype=dtype)
120
189
  )
121
190
  else:
122
- self.cumulative_length = Array(
123
- np.arange(
124
- shape_guess[1],
125
- np.prod(shape_guess) + 1,
126
- shape_guess[1],
127
- dtype="uint32",
128
- )
129
- )
191
+ self.__init__(np.empty(shape=shape_guess, dtype=dtype))
130
192
  else:
131
- self.cumulative_length = Array(cumulative_length)
132
-
133
- if flattened_data is None:
193
+ # if it's user provided just use it
194
+ self.cumulative_length = cumulative_length
195
+
196
+ # ...then flattened_data
197
+ # NOTE: self.flattened_data might have already been initialized
198
+ # above
199
+ if flattened_data is None and self.flattened_data is None:
200
+ # this happens when the cumulative_length arg is not None
134
201
  if dtype is None:
135
202
  msg = "flattened_data and dtype cannot both be None!"
136
203
  raise ValueError(msg)
137
204
 
138
- length = 0
139
- if cumulative_length is None:
140
- # just make an empty vector or use shape_guess
141
- length = 0 if shape_guess is None else np.prod(shape_guess)
142
- else:
143
- # use cumulative_length
144
- length = cumulative_length[-1]
145
-
205
+ # now ready to initialize the object!
146
206
  self.flattened_data = Array(
147
- shape=(length,), dtype=dtype, fill_val=fill_val
207
+ shape=(self.cumulative_length[-1],), dtype=dtype, fill_val=fill_val
148
208
  )
149
- else:
150
- self.flattened_data = Array(flattened_data)
209
+ elif self.flattened_data is None:
210
+ self.flattened_data = flattened_data
151
211
 
152
212
  # finally set dtype
153
213
  self.dtype = self.flattened_data.dtype
154
214
 
215
+ # set ndim
216
+ self.ndim = 2
217
+ pointer = self.flattened_data
218
+ while True:
219
+ if isinstance(pointer, Array):
220
+ break
221
+
222
+ self.ndim += 1
223
+ pointer = pointer.flattened_data
224
+
155
225
  super().__init__(attrs)
156
226
 
157
227
  def datatype_name(self) -> str:
158
228
  return "array"
159
229
 
160
230
  def form_datatype(self) -> str:
161
- et = utils.get_element_type(self)
162
- return "array<1>{array<1>{" + et + "}}"
231
+ eltype = (
232
+ "array<1>{" + utils.get_element_type(self) + "}"
233
+ if self.ndim == 2
234
+ else self.flattened_data.form_datatype()
235
+ )
236
+ return "array<1>{" + eltype + "}"
163
237
 
164
238
  def __len__(self) -> int:
165
- """Return the number of stored vectors."""
239
+ """Return the number of stored vectors along the first axis (0)."""
166
240
  return len(self.cumulative_length)
167
241
 
168
242
  def __eq__(self, other: VectorOfVectors) -> bool:
169
243
  if isinstance(other, VectorOfVectors):
244
+ if self.ndim == 2 and len(self.cumulative_length) != 0:
245
+ fldata_eq = np.array_equal(
246
+ self.flattened_data[: self.cumulative_length[-1]],
247
+ other.flattened_data[: other.cumulative_length[-1]],
248
+ )
249
+ else:
250
+ fldata_eq = self.flattened_data == other.flattened_data
251
+
170
252
  return (
171
253
  self.cumulative_length == other.cumulative_length
172
- and (
173
- len(self.cumulative_length) == 0
174
- or np.all(
175
- self.flattened_data[: self.cumulative_length[-1]]
176
- == other.flattened_data[: other.cumulative_length[-1]]
177
- )
178
- )
254
+ and fldata_eq
179
255
  and self.dtype == other.dtype
180
256
  and self.attrs == other.attrs
181
257
  )
182
258
 
183
259
  return False
184
260
 
185
- def __getitem__(self, i: int) -> list:
186
- """Return vector at index `i`."""
187
- stop = self.cumulative_length[i]
188
- if i in (0, -len(self)):
189
- return self.flattened_data[0:stop]
261
+ def __getitem__(self, i: int) -> NDArray:
262
+ """Return a view of the vector at index `i` along the first axis."""
263
+ if self.ndim == 2:
264
+ stop = self.cumulative_length[i]
265
+ if i in (0, -len(self)):
266
+ return self.flattened_data[0:stop]
267
+
268
+ return self.flattened_data[self.cumulative_length[i - 1] : stop]
190
269
 
191
- return self.flattened_data[self.cumulative_length[i - 1] : stop]
270
+ raise NotImplementedError
192
271
 
193
272
  def __setitem__(self, i: int, new: NDArray) -> None:
194
- self.__getitem__(i)[:] = new
273
+ if self.ndim == 2:
274
+ self.__getitem__(i)[:] = new
275
+ else:
276
+ raise NotImplementedError
195
277
 
196
278
  def resize(self, new_size: int) -> None:
197
279
  """Resize vector along the first axis.
@@ -220,24 +302,26 @@ class VectorOfVectors(LGDO):
220
302
  [3],
221
303
  ]
222
304
  """
223
-
224
- vidx = self.cumulative_length
225
- old_s = len(self)
226
- dlen = new_size - old_s
227
- csum = vidx[-1] if len(self) > 0 else 0
228
-
229
- # first resize the cumulative length
230
- self.cumulative_length.resize(new_size)
231
-
232
- # if new_size > size, new elements are filled with zeros, let's fix
233
- # that
234
- if dlen > 0:
235
- self.cumulative_length[old_s:] = csum
236
-
237
- # then resize the data array
238
- # if dlen > 0 this has no effect
239
- if len(self.cumulative_length) > 0:
240
- self.flattened_data.resize(self.cumulative_length[-1])
305
+ if self.ndim == 2:
306
+ vidx = self.cumulative_length
307
+ old_s = len(self)
308
+ dlen = new_size - old_s
309
+ csum = vidx[-1] if len(self) > 0 else 0
310
+
311
+ # first resize the cumulative length
312
+ self.cumulative_length.resize(new_size)
313
+
314
+ # if new_size > size, new elements are filled with zeros, let's fix
315
+ # that
316
+ if dlen > 0:
317
+ self.cumulative_length[old_s:] = csum
318
+
319
+ # then resize the data array
320
+ # if dlen > 0 this has no effect
321
+ if len(self.cumulative_length) > 0:
322
+ self.flattened_data.resize(self.cumulative_length[-1])
323
+ else:
324
+ raise NotImplementedError
241
325
 
242
326
  def append(self, new: NDArray) -> None:
243
327
  """Append a 1D vector `new` at the end.
@@ -252,15 +336,20 @@ class VectorOfVectors(LGDO):
252
336
  [8 9],
253
337
  ]
254
338
  """
255
- # first extend cumulative_length by +1
256
- self.cumulative_length.resize(len(self) + 1)
257
- # set it at the right value
258
- newlen = self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
259
- self.cumulative_length[-1] = newlen
260
- # then resize flattened_data to accommodate the new vector
261
- self.flattened_data.resize(len(self.flattened_data) + len(new))
262
- # finally set it
263
- self[-1] = new
339
+ if self.ndim == 2:
340
+ # first extend cumulative_length by +1
341
+ self.cumulative_length.resize(len(self) + 1)
342
+ # set it at the right value
343
+ newlen = (
344
+ self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
345
+ )
346
+ self.cumulative_length[-1] = newlen
347
+ # then resize flattened_data to accommodate the new vector
348
+ self.flattened_data.resize(len(self.flattened_data) + len(new))
349
+ # finally set it
350
+ self[-1] = new
351
+ else:
352
+ raise NotImplementedError
264
353
 
265
354
  def insert(self, i: int, new: NDArray) -> None:
266
355
  """Insert a vector at index `i`.
@@ -283,17 +372,20 @@ class VectorOfVectors(LGDO):
283
372
  This method involves a significant amount of memory re-allocation and
284
373
  is expected to perform poorly on large vectors.
285
374
  """
286
- if i >= len(self):
287
- msg = f"index {i} is out of bounds for vector owith size {len(self)}"
288
- raise IndexError(msg)
375
+ if self.ndim == 2:
376
+ if i >= len(self):
377
+ msg = f"index {i} is out of bounds for vector owith size {len(self)}"
378
+ raise IndexError(msg)
289
379
 
290
- self.flattened_data = Array(
291
- np.insert(self.flattened_data, self.cumulative_length[i - 1], new)
292
- )
293
- self.cumulative_length = Array(
294
- np.insert(self.cumulative_length, i, self.cumulative_length[i - 1])
295
- )
296
- self.cumulative_length[i:] += np.uint32(len(new))
380
+ self.flattened_data = Array(
381
+ np.insert(self.flattened_data, self.cumulative_length[i - 1], new)
382
+ )
383
+ self.cumulative_length = Array(
384
+ np.insert(self.cumulative_length, i, self.cumulative_length[i - 1])
385
+ )
386
+ self.cumulative_length[i:] += np.uint32(len(new))
387
+ else:
388
+ raise NotImplementedError
297
389
 
298
390
  def replace(self, i: int, new: NDArray) -> None:
299
391
  """Replace the vector at index `i` with `new`.
@@ -316,36 +408,41 @@ class VectorOfVectors(LGDO):
316
408
  This method involves a significant amount of memory re-allocation and
317
409
  is expected to perform poorly on large vectors.
318
410
  """
319
- if i >= len(self):
320
- msg = f"index {i} is out of bounds for vector with size {len(self)}"
321
- raise IndexError(msg)
322
-
323
- vidx = self.cumulative_length
324
- dlen = len(new) - len(self[i])
325
-
326
- if dlen == 0:
327
- # don't waste resources
328
- self[i] = new
329
- elif dlen < 0:
330
- start = vidx[i - 1]
331
- stop = start + len(new)
332
- # set the already allocated indices
333
- self.flattened_data[start:stop] = new
334
- # then delete the extra indices
335
- self.flattened_data = Array(
336
- np.delete(self.flattened_data, np.s_[stop : vidx[i]])
337
- )
338
- else:
339
- # set the already allocated indices
340
- self.flattened_data[vidx[i - 1] : vidx[i]] = new[: len(self[i])]
341
- # then insert the remaining
342
- self.flattened_data = Array(
343
- np.insert(self.flattened_data, vidx[i], new[len(self[i]) :])
344
- )
411
+ if self.ndim == 2:
412
+ if i >= len(self):
413
+ msg = f"index {i} is out of bounds for vector with size {len(self)}"
414
+ raise IndexError(msg)
415
+
416
+ vidx = self.cumulative_length
417
+ dlen = len(new) - len(self[i])
418
+
419
+ if dlen == 0:
420
+ # don't waste resources
421
+ self[i] = new
422
+ elif dlen < 0:
423
+ start = vidx[i - 1]
424
+ stop = start + len(new)
425
+ # set the already allocated indices
426
+ self.flattened_data[start:stop] = new
427
+ # then delete the extra indices
428
+ self.flattened_data = Array(
429
+ np.delete(self.flattened_data, np.s_[stop : vidx[i]])
430
+ )
431
+ else:
432
+ # set the already allocated indices
433
+ self.flattened_data[vidx[i - 1] : vidx[i]] = new[: len(self[i])]
434
+ # then insert the remaining
435
+ self.flattened_data = Array(
436
+ np.insert(self.flattened_data, vidx[i], new[len(self[i]) :])
437
+ )
345
438
 
346
- vidx[i:] = vidx[i:] + dlen
439
+ vidx[i:] = vidx[i:] + dlen
440
+ else:
441
+ raise NotImplementedError
347
442
 
348
- def _set_vector_unsafe(self, i: int, vec: NDArray, lens: NDArray = None) -> None:
443
+ def _set_vector_unsafe(
444
+ self, i: int, vec: NDArray, lens: ArrayLike | None = None
445
+ ) -> None:
349
446
  r"""Insert vector `vec` at position `i`.
350
447
 
351
448
  Assumes that ``j = self.cumulative_length[i-1]`` is the index (in
@@ -357,9 +454,9 @@ class VectorOfVectors(LGDO):
357
454
  behavior. This method is typically used for fast sequential fill of a
358
455
  pre-allocated vector of vectors.
359
456
 
360
- If vec is 1D array and lens is None, set using full array. If vec
361
- is 2D, require lens to be included, and fill each array only up to
362
- lengths in lens.
457
+ If `vec` is a 1D array and `lens` is ``None``, set using full array. If
458
+ `vec` is 2D, require `lens` to be included, and fill each array only up
459
+ to lengths in `lens`.
363
460
 
364
461
  Danger
365
462
  ------
@@ -370,39 +467,47 @@ class VectorOfVectors(LGDO):
370
467
  --------
371
468
  append, replace, insert
372
469
  """
373
- start = 0 if i == 0 else self.cumulative_length[i - 1]
374
- if len(vec.shape) == 1:
375
- vec = np.expand_dims(vec, axis=0)
376
- if lens is None:
377
- lens = np.array([vec.shape[1]], dtype="u4")
378
- if not isinstance(lens, np.ndarray):
379
- lens = np.array([lens], dtype="u4")
380
- cum_lens = start + lens.cumsum()
381
- _nb_fill(vec, lens, self.flattened_data.nda[start : cum_lens[-1]])
382
- self.cumulative_length[i : i + len(lens)] = cum_lens
470
+ if self.ndim == 2:
471
+ # check if current vector is empty and get the start index in
472
+ # flattened_data
473
+ start = 0 if i == 0 else self.cumulative_length[i - 1]
383
474
 
384
- def __iter__(self) -> Iterator[NDArray]:
385
- for j, stop in enumerate(self.cumulative_length):
386
- if j == 0:
387
- yield self.flattened_data[0:stop]
388
- else:
389
- yield self.flattened_data[self.cumulative_length[j - 1] : stop]
475
+ # if the new element is 1D, convert to dummy 2D
476
+ if len(vec.shape) == 1:
477
+ vec = np.expand_dims(vec, axis=0)
478
+ if lens is None:
479
+ lens = np.array([vec.shape[1]], dtype="u4")
390
480
 
391
- def __str__(self) -> str:
392
- string = ""
393
- pos = 0
394
- for vec in self:
395
- if pos != 0:
396
- string += " "
481
+ # this is in case lens is 0D (a scalar): convert to 1D
482
+ if not isinstance(lens, np.ndarray):
483
+ lens = np.array([lens], dtype="u4")
397
484
 
398
- string += np.array2string(vec, prefix=" ")
485
+ # calculate stop index in flattened_data
486
+ cum_lens = start + lens.cumsum()
399
487
 
400
- if pos < len(self.cumulative_length):
401
- string += ",\n"
488
+ # fill with fast vectorized routine
489
+ vovutils._nb_fill(vec, lens, self.flattened_data.nda[start : cum_lens[-1]])
402
490
 
403
- pos += 1
491
+ # add new vector(s) length to cumulative_length
492
+ self.cumulative_length[i : i + len(lens)] = cum_lens
493
+ else:
494
+ raise NotImplementedError
404
495
 
405
- string = f"[{string}]"
496
+ def __iter__(self) -> Iterator[NDArray]:
497
+ if self.ndim == 2:
498
+ for j, stop in enumerate(self.cumulative_length):
499
+ if j == 0:
500
+ yield self.flattened_data[0:stop]
501
+ else:
502
+ yield self.flattened_data[self.cumulative_length[j - 1] : stop]
503
+ else:
504
+ raise NotImplementedError
505
+
506
+ def __str__(self) -> str:
507
+ string = self.view_as("ak").show(stream=None)
508
+
509
+ string = string.strip().removesuffix("]")
510
+ string += "\n]"
406
511
 
407
512
  tmp_attrs = self.attrs.copy()
408
513
  tmp_attrs.pop("datatype")
@@ -457,19 +562,22 @@ class VectorOfVectors(LGDO):
457
562
  original vector of vectors. The type `fill_val` must be a
458
563
  compatible one.
459
564
  """
460
- ak_arr = self.view_as("ak")
565
+ if self.ndim == 2:
566
+ ak_arr = self.view_as("ak")
461
567
 
462
- if max_len is None:
463
- max_len = int(ak.max(ak.count(ak_arr, axis=-1)))
568
+ if max_len is None:
569
+ max_len = int(ak.max(ak.count(ak_arr, axis=-1)))
464
570
 
465
- nda = ak.fill_none(ak.pad_none(ak_arr, max_len, clip=True), fill_val).to_numpy(
466
- allow_missing=False
467
- )
571
+ nda = ak.fill_none(
572
+ ak.pad_none(ak_arr, max_len, clip=True), fill_val
573
+ ).to_numpy(allow_missing=False)
468
574
 
469
- if preserve_dtype:
470
- nda = nda.astype(self.flattened_data.dtype, copy=False)
575
+ if preserve_dtype:
576
+ nda = nda.astype(self.flattened_data.dtype, copy=False)
471
577
 
472
- return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs())
578
+ return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs())
579
+
580
+ raise NotImplementedError
473
581
 
474
582
  def view_as(
475
583
  self,
@@ -519,6 +627,8 @@ class VectorOfVectors(LGDO):
519
627
  msg = "Pint does not support Awkward yet, you must view the data with_units=False"
520
628
  raise ValueError(msg)
521
629
 
630
+ # see https://github.com/scikit-hep/awkward/discussions/2848
631
+
522
632
  # cannot avoid making a copy here. we should add the leading 0 to
523
633
  # cumulative_length inside VectorOfVectors at some point in the
524
634
  # future
@@ -528,9 +638,15 @@ class VectorOfVectors(LGDO):
528
638
  offsets[1:] = self.cumulative_length
529
639
  offsets[0] = 0
530
640
 
641
+ content = (
642
+ ak.contents.NumpyArray(self.flattened_data.nda)
643
+ if self.ndim == 2
644
+ else self.flattened_data.view_as(library, with_units=with_units).layout
645
+ )
646
+
531
647
  layout = ak.contents.ListOffsetArray(
532
648
  offsets=ak.index.Index(offsets),
533
- content=ak.contents.NumpyArray(self.flattened_data.nda),
649
+ content=content,
534
650
  )
535
651
  return ak.Array(layout)
536
652
 
@@ -551,263 +667,3 @@ class VectorOfVectors(LGDO):
551
667
 
552
668
  msg = f"{library} is not a supported third-party format."
553
669
  raise ValueError(msg)
554
-
555
-
556
- def build_cl(
557
- sorted_array_in: NDArray, cumulative_length_out: NDArray = None
558
- ) -> NDArray:
559
- """Build a cumulative length array from an array of sorted data.
560
-
561
- Examples
562
- --------
563
- >>> build_cl(np.array([3, 3, 3, 4]))
564
- array([3., 4.])
565
-
566
- For a `sorted_array_in` of indices, this is the inverse of
567
- :func:`.explode_cl`, in the sense that doing
568
- ``build_cl(explode_cl(cumulative_length))`` would recover the original
569
- `cumulative_length`.
570
-
571
- Parameters
572
- ----------
573
- sorted_array_in
574
- array of data already sorted; each N matching contiguous entries will
575
- be converted into a new row of `cumulative_length_out`.
576
- cumulative_length_out
577
- a pre-allocated array for the output `cumulative_length`. It will
578
- always have length <= `sorted_array_in`, so giving them the same length
579
- is safe if there is not a better guess.
580
-
581
- Returns
582
- -------
583
- cumulative_length_out
584
- the output cumulative length array. If the user provides a
585
- `cumulative_length_out` that is too long, this return value is sliced
586
- to contain only the used portion of the allocated memory.
587
- """
588
- if len(sorted_array_in) == 0:
589
- return None
590
- sorted_array_in = np.asarray(sorted_array_in)
591
- if cumulative_length_out is None:
592
- cumulative_length_out = np.zeros(len(sorted_array_in), dtype=np.uint64)
593
- else:
594
- cumulative_length_out.fill(0)
595
- if len(cumulative_length_out) == 0 and len(sorted_array_in) > 0:
596
- msg = "cumulative_length_out too short ({len(cumulative_length_out)})"
597
- raise ValueError(msg)
598
- return _nb_build_cl(sorted_array_in, cumulative_length_out)
599
-
600
-
601
- @numba.njit(**nb_kwargs)
602
- def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> NDArray:
603
- """numbified inner loop for build_cl"""
604
- ii = 0
605
- last_val = sorted_array_in[0]
606
- for val in sorted_array_in:
607
- if val != last_val:
608
- ii += 1
609
- cumulative_length_out[ii] = cumulative_length_out[ii - 1]
610
- if ii >= len(cumulative_length_out):
611
- msg = "cumulative_length_out too short"
612
- raise RuntimeError(msg)
613
- last_val = val
614
- cumulative_length_out[ii] += 1
615
- ii += 1
616
- return cumulative_length_out[:ii]
617
-
618
-
619
- @numba.guvectorize(
620
- [
621
- f"{data_type}[:,:],{size_type}[:],{data_type}[:]"
622
- for data_type in [
623
- "b1",
624
- "i1",
625
- "i2",
626
- "i4",
627
- "i8",
628
- "u1",
629
- "u2",
630
- "u4",
631
- "u8",
632
- "f4",
633
- "f8",
634
- "c8",
635
- "c16",
636
- ]
637
- for size_type in ["i4", "i8", "u4", "u8"]
638
- ],
639
- "(l,m),(l),(n)",
640
- **nb_kwargs,
641
- )
642
- def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
643
- """Vectorized function to fill flattened array from array of arrays and
644
- lengths. Values in aoa_in past lengths will not be copied.
645
-
646
- Parameters
647
- ----------
648
- aoa_in
649
- array of arrays containing values to be copied
650
- len_in
651
- array of vector lengths for each row of aoa_in
652
- flattened_array_out
653
- flattened array to copy values into. Must be longer than sum of
654
- lengths in len_in
655
- """
656
-
657
- if len(flattened_array_out) < len_in.sum():
658
- msg = "flattened array not large enough to hold values"
659
- raise ValueError(msg)
660
-
661
- start = 0
662
- for i, ll in enumerate(len_in):
663
- stop = start + ll
664
- flattened_array_out[start:stop] = aoa_in[i, :ll]
665
- start = stop
666
-
667
-
668
- def explode_cl(cumulative_length: NDArray, array_out: NDArray = None) -> NDArray:
669
- """Explode a `cumulative_length` array.
670
-
671
- Examples
672
- --------
673
- >>> explode_cl(np.array([2, 3]))
674
- array([0., 0., 1.])
675
-
676
- This is the inverse of :func:`.build_cl`, in the sense that doing
677
- ``build_cl(explode_cl(cumulative_length))`` would recover the original
678
- `cumulative_length`.
679
-
680
- Parameters
681
- ----------
682
- cumulative_length
683
- the cumulative length array to be exploded.
684
- array_out
685
- a pre-allocated array to hold the exploded cumulative length array.
686
- The length should be equal to ``cumulative_length[-1]``.
687
-
688
- Returns
689
- -------
690
- array_out
691
- the exploded cumulative length array.
692
- """
693
- cumulative_length = np.asarray(cumulative_length)
694
- out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
695
- if array_out is None:
696
- array_out = np.empty(int(out_len), dtype=np.uint64)
697
- if len(array_out) != out_len:
698
- msg = f"bad lengths: cl[-1] ({cumulative_length[-1]}) != out ({len(array_out)})"
699
- raise ValueError(msg)
700
- return _nb_explode_cl(cumulative_length, array_out)
701
-
702
-
703
- @numba.njit(**nb_kwargs)
704
- def _nb_explode_cl(cumulative_length: NDArray, array_out: NDArray) -> NDArray:
705
- """numbified inner loop for explode_cl"""
706
- out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
707
- if len(array_out) != out_len:
708
- msg = "bad lengths"
709
- raise ValueError(msg)
710
- start = 0
711
- for ii in range(len(cumulative_length)):
712
- nn = int(cumulative_length[ii] - start)
713
- for jj in range(nn):
714
- array_out[int(start + jj)] = ii
715
- start = cumulative_length[ii]
716
- return array_out
717
-
718
-
719
- def explode(
720
- cumulative_length: NDArray, array_in: NDArray, array_out: NDArray = None
721
- ) -> NDArray:
722
- """Explode a data array using a `cumulative_length` array.
723
-
724
- This is identical to :func:`.explode_cl`, except `array_in` gets exploded
725
- instead of `cumulative_length`.
726
-
727
- Examples
728
- --------
729
- >>> explode(np.array([2, 3]), np.array([3, 4]))
730
- array([3., 3., 4.])
731
-
732
- Parameters
733
- ----------
734
- cumulative_length
735
- the cumulative length array to use for exploding.
736
- array_in
737
- the data to be exploded. Must have same length as `cumulative_length`.
738
- array_out
739
- a pre-allocated array to hold the exploded data. The length should be
740
- equal to ``cumulative_length[-1]``.
741
-
742
- Returns
743
- -------
744
- array_out
745
- the exploded data array.
746
- """
747
- cumulative_length = np.asarray(cumulative_length)
748
- array_in = np.asarray(array_in)
749
- out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
750
- if array_out is None:
751
- array_out = np.empty(out_len, dtype=array_in.dtype)
752
- if len(cumulative_length) != len(array_in) or len(array_out) != out_len:
753
- msg = (
754
- f"bad lengths: cl ({len(cumulative_length)}) != in ({len(array_in)}) "
755
- f"and cl[-1] ({cumulative_length[-1]}) != out ({len(array_out)})"
756
- )
757
- raise ValueError(msg)
758
- return nb_explode(cumulative_length, array_in, array_out)
759
-
760
-
761
- @numba.njit(**nb_kwargs)
762
- def nb_explode(
763
- cumulative_length: NDArray, array_in: NDArray, array_out: NDArray
764
- ) -> NDArray:
765
- """Numbified inner loop for :func:`.explode`."""
766
- out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
767
- if len(cumulative_length) != len(array_in) or len(array_out) != out_len:
768
- msg = "bad lengths"
769
- raise ValueError(msg)
770
- ii = 0
771
- for jj in range(len(array_out)):
772
- while ii < len(cumulative_length) and jj >= cumulative_length[ii]:
773
- ii += 1
774
- array_out[jj] = array_in[ii]
775
- return array_out
776
-
777
-
778
- def explode_arrays(
779
- cumulative_length: Array,
780
- arrays: list[NDArray],
781
- arrays_out: list[NDArray] | None = None,
782
- ) -> list:
783
- """Explode a set of arrays using a `cumulative_length` array.
784
-
785
- Parameters
786
- ----------
787
- cumulative_length
788
- the cumulative length array to use for exploding.
789
- arrays
790
- the data arrays to be exploded. Each array must have same length as
791
- `cumulative_length`.
792
- arrays_out
793
- a list of pre-allocated arrays to hold the exploded data. The length of
794
- the list should be equal to the length of `arrays`, and each entry in
795
- arrays_out should have length ``cumulative_length[-1]``. If not
796
- provided, output arrays are allocated for the user.
797
-
798
- Returns
799
- -------
800
- arrays_out
801
- the list of exploded data arrays.
802
- """
803
- cumulative_length = np.asarray(cumulative_length)
804
- for ii in range(len(arrays)):
805
- arrays[ii] = np.asarray(arrays[ii])
806
- out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
807
- if arrays_out is None:
808
- arrays_out = []
809
- for array in arrays:
810
- arrays_out.append(np.empty(out_len, dtype=array.dtype))
811
- for ii in range(len(arrays)):
812
- explode(cumulative_length, arrays[ii], arrays_out[ii])
813
- return arrays_out