legend-pydataobj 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,627 @@
1
+ """
2
+ Implements a LEGEND Data Object representing a variable-length array of
3
+ variable-length arrays and corresponding utilities.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import itertools
8
+ import logging
9
+ from collections.abc import Iterator
10
+ from typing import Any
11
+
12
+ import numba
13
+ import numpy as np
14
+ from numpy.typing import DTypeLike, NDArray
15
+
16
+ from .. import lgdo_utils as utils
17
+ from . import arrayofequalsizedarrays as aoesa
18
+ from .array import Array
19
+ from .lgdo import LGDO
20
+
21
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
22
+
23
+
24
class VectorOfVectors(LGDO):
    """A variable-length array of variable-length arrays.

    For now only a 1D vector of 1D vectors is supported. Internal representation
    is as two NumPy arrays, one to store the flattened data contiguously and one
    to store the cumulative sum of lengths of each vector.
    """

    def __init__(
        self,
        listoflists: list[list[int | float]] = None,
        flattened_data: Array | NDArray = None,
        cumulative_length: Array | NDArray = None,
        shape_guess: tuple[int, int] = None,
        dtype: DTypeLike = None,
        fill_val: int | float = None,
        attrs: dict[str, Any] = None,
    ) -> None:
        """
        Parameters
        ----------
        listoflists
            create a VectorOfVectors out of a Python list of lists. Takes
            priority over `flattened_data` and `cumulative_length`.
        flattened_data
            if not ``None``, used as the internal array for
            `self.flattened_data`. Otherwise, an internal `flattened_data` is
            allocated based on `cumulative_length` (or `shape_guess`) and `dtype`.
        cumulative_length
            if not ``None``, used as the internal array for
            `self.cumulative_length`. Should be `dtype` :any:`numpy.uint32`. If
            `cumulative_length` is ``None``, an internal `cumulative_length` is
            allocated based on the first element of `shape_guess`.
        shape_guess
            a NumPy-format shape specification, required if either of
            `flattened_data` or `cumulative_length` are not supplied. The
            first element should not be a guess and sets the number of vectors
            to be stored. The second element is a guess or approximation of the
            typical length of a stored vector, used to set the initial length
            of `flattened_data` if it was not supplied.
        dtype
            sets the type of data stored in `flattened_data`. Required if
            `flattened_data` and `listoflists` are ``None``.
        fill_val
            fill all of `self.flattened_data` with this value.
        attrs
            a set of user attributes to be carried along with this LGDO.

        Raises
        ------
        ValueError
            if `listoflists` holds no elements and `dtype` is ``None``, or if
            both `flattened_data` and `dtype` are ``None``.
        """
        if listoflists is not None:
            # Request the platform integer explicitly: np.cumsum([]) would
            # otherwise return a float64 array, whose entries cannot be used
            # as slice bounds later on. For non-empty input this is the dtype
            # np.cumsum picks by default anyway.
            cl_nda = np.cumsum([len(ll) for ll in listoflists], dtype=np.int_)
            if dtype is None:
                if len(cl_nda) == 0 or cl_nda[-1] == 0:
                    raise ValueError("listoflists can't be empty with dtype=None!")

                # Set dtype from the first element in the list. The chained
                # iterator finds it lazily and skips zero-length entries.
                first_element = next(itertools.chain.from_iterable(listoflists))
                dtype = type(first_element)

            self.dtype = np.dtype(dtype)
            self.cumulative_length = Array(cl_nda)
            self.flattened_data = Array(
                np.fromiter(
                    itertools.chain.from_iterable(listoflists), dtype=self.dtype
                )
            )

        else:
            if cumulative_length is not None:
                self.cumulative_length = Array(cumulative_length)
            elif shape_guess is None:
                # just make an empty vector
                self.cumulative_length = Array(np.empty((0,), dtype="uint32"))
            elif shape_guess[1] <= 0:
                # no usable length guess: start with all-empty vectors
                self.cumulative_length = Array(
                    shape=(shape_guess[0],), dtype="uint32", fill_val=0
                )
            else:
                # every vector is assumed to be shape_guess[1] long
                self.cumulative_length = Array(
                    np.arange(
                        shape_guess[1],
                        np.prod(shape_guess) + 1,
                        shape_guess[1],
                        dtype="uint32",
                    )
                )

            if flattened_data is not None:
                self.flattened_data = Array(flattened_data)
            else:
                if dtype is None:
                    raise ValueError("flattened_data and dtype cannot both be None!")

                if cumulative_length is not None:
                    # use cumulative_length; an empty one means no data at all
                    length = (
                        cumulative_length[-1] if len(cumulative_length) > 0 else 0
                    )
                elif shape_guess is not None:
                    length = np.prod(shape_guess)
                else:
                    # just make an empty vector
                    length = 0

                self.flattened_data = Array(
                    shape=(length,), dtype=dtype, fill_val=fill_val
                )

            # finally set dtype
            self.dtype = self.flattened_data.dtype

        super().__init__(attrs)

    def datatype_name(self) -> str:
        """Return the datatype name of this object (``"array"``)."""
        return "array"

    def form_datatype(self) -> str:
        """Return the full datatype string, embedding the element type."""
        et = utils.get_element_type(self)
        return "array<1>{array<1>{" + et + "}}"

    def __len__(self) -> int:
        """Return the number of stored vectors."""
        return len(self.cumulative_length)

    def __eq__(self, other: VectorOfVectors) -> bool:
        """Compare data, cumulative lengths, dtype and attributes.

        Any object that is not a :class:`VectorOfVectors` compares unequal.
        """
        if not isinstance(other, VectorOfVectors):
            return False

        return (
            self.flattened_data == other.flattened_data
            and self.cumulative_length == other.cumulative_length
            and self.dtype == other.dtype
            and self.attrs == other.attrs
        )

    def __getitem__(self, i: int) -> NDArray:
        """Return vector at index `i`."""
        stop = self.cumulative_length[i]
        # the first vector has no predecessor: it starts at offset 0
        if i == 0 or i == -len(self):
            return self.flattened_data[0:stop]
        return self.flattened_data[self.cumulative_length[i - 1] : stop]

    def __setitem__(self, i: int, new: NDArray) -> None:
        """Overwrite the vector at index `i` in place with `new`.

        The length of the stored vector is not changed; use :meth:`replace`
        to store a vector of a different length.
        """
        self.__getitem__(i)[:] = new

    def resize(self, new_size: int) -> None:
        """Resize vector along the first axis.

        `self.flattened_data` is resized only if `new_size` is smaller than the
        current vector length.

        If `new_size` is larger than the current vector length,
        `self.cumulative_length` is padded with its last element. This
        corresponds to appending empty vectors.

        Examples
        --------
        >>> vov = VectorOfVectors([[1, 2, 3], [4, 5]])
        >>> vov.resize(3)
        >>> print(vov)
        [[1 2 3],
         [4 5],
         [],
        ]

        >>> vov = VectorOfVectors([[1, 2], [3], [4, 5]])
        >>> vov.resize(2)
        >>> print(vov)
        [[1 2],
         [3],
        ]
        """
        old_s = len(self)
        dlen = new_size - old_s
        # remember the current total length before the array is resized
        csum = self.cumulative_length[-1] if old_s > 0 else 0

        # first resize the cumulative length
        self.cumulative_length.resize(new_size)

        # if new_size > size, new elements are filled with zeros, let's fix
        # that
        if dlen > 0:
            self.cumulative_length[old_s:] = csum

        # then resize the data array to the total length now recorded in
        # cumulative_length (left untouched when no vectors remain)
        if len(self.cumulative_length) > 0:
            self.flattened_data.resize(self.cumulative_length[-1])

    def append(self, new: NDArray) -> None:
        """Append a 1D vector `new` at the end.

        Examples
        --------
        >>> vov = VectorOfVectors([[1, 2, 3], [4, 5]])
        >>> vov.append([8, 9])
        >>> print(vov)
        [[1 2 3],
         [4 5],
         [8 9],
        ]
        """
        # first extend cumulative_length by +1
        self.cumulative_length.resize(len(self) + 1)
        # set it at the right value: len(self) already counts the new slot,
        # so a previous total exists only if there is more than one entry
        newlen = self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
        self.cumulative_length[-1] = newlen
        # then resize flattened_data to accommodate the new vector
        self.flattened_data.resize(len(self.flattened_data) + len(new))
        # finally set it
        self[-1] = new

    def insert(self, i: int, new: NDArray) -> None:
        """Insert a vector at index `i`.

        `self.flattened_data` (and therefore `self.cumulative_length`) is
        resized in order to accommodate the new element. Negative indices
        count from the end, as for Python lists.

        Examples
        --------
        >>> vov = VectorOfVectors([[1, 2, 3], [4, 5]])
        >>> vov.insert(1, [8, 9])
        >>> print(vov)
        [[1 2 3],
         [8 9],
         [4 5],
        ]

        Raises
        ------
        IndexError
            if `i` does not address an existing vector.

        Warning
        -------
        This method involves a significant amount of memory re-allocation and
        is expected to perform poorly on large vectors.
        """
        size = len(self)
        if i >= size or i < -size:
            raise IndexError(
                f"index {i} is out of bounds for vector with size {size}"
            )
        if i < 0:
            i += size

        # Offset in flattened_data where the new vector begins. Index 0 must
        # be special-cased: cumulative_length[-1] is the *total* length, not
        # the end of a preceding vector.
        start = 0 if i == 0 else self.cumulative_length[i - 1]

        self.flattened_data = Array(np.insert(self.flattened_data, start, new))
        self.cumulative_length = Array(np.insert(self.cumulative_length, i, start))
        # the new vector and everything after it now end len(new) later
        self.cumulative_length[i:] += np.uint32(len(new))

    def replace(self, i: int, new: NDArray) -> None:
        """Replace the vector at index `i` with `new`.

        `self.flattened_data` (and therefore `self.cumulative_length`) is
        resized, if the length of `new` is different from the vector currently
        at index `i`. Negative indices count from the end, as for Python
        lists.

        Examples
        --------
        >>> vov = VectorOfVectors([[1, 2, 3], [4, 5]])
        >>> vov.replace(0, [8, 9])
        >>> print(vov)
        [[8 9],
         [4 5],
        ]

        Raises
        ------
        IndexError
            if `i` does not address an existing vector.

        Warning
        -------
        This method involves a significant amount of memory re-allocation and
        is expected to perform poorly on large vectors.
        """
        size = len(self)
        if i >= size or i < -size:
            raise IndexError(
                f"index {i} is out of bounds for vector with size {size}"
            )
        if i < 0:
            i += size

        vidx = self.cumulative_length
        # Bounds of the vector being replaced, as plain Python ints. Index 0
        # must be special-cased: vidx[-1] is the total length, not a start.
        start = 0 if i == 0 else int(vidx[i - 1])
        stop = int(vidx[i])
        old_len = stop - start
        dlen = len(new) - old_len

        if dlen == 0:
            # don't waste resources
            self[i] = new
            return

        if dlen < 0:
            new_stop = start + len(new)
            # set the already allocated indices
            self.flattened_data[start:new_stop] = new
            # then delete the extra indices
            self.flattened_data = Array(
                np.delete(self.flattened_data, np.s_[new_stop:stop])
            )
        else:
            # set the already allocated indices
            self.flattened_data[start:stop] = new[:old_len]
            # then insert the remaining
            self.flattened_data = Array(
                np.insert(self.flattened_data, stop, new[old_len:])
            )

        # Shift this vector's end and all later ones. The sum is computed on
        # a signed 64-bit copy so that a negative dlen is applied correctly
        # when the cumulative lengths are stored unsigned; the assignment
        # casts the result back to the stored dtype.
        vidx[i:] = np.asarray(vidx[i:], dtype=np.int64) + dlen

    def _set_vector_unsafe(self, i: int, vec: NDArray) -> None:
        r"""Insert vector `vec` at position `i`.

        Assumes that ``j = self.cumulative_length[i-1]`` is the index (in
        `self.flattened_data`) of the end of the `(i-1)`\ th vector and copies
        `vec` in ``self.flattened_data[j:j+len(vec)]``. Finally updates
        ``self.cumulative_length[i]`` with the new flattened data array length.

        Vectors stored after index `i` can be overridden, producing unintended
        behavior. This method is typically used for fast sequential fill of a
        pre-allocated vector of vectors.

        Danger
        ------
        This method can lead to undefined behavior or vector invalidation if
        used improperly. Use it only if you know what you are doing.

        See Also
        --------
        append, replace, insert
        """
        start = 0 if i == 0 else self.cumulative_length[i - 1]
        end = start + len(vec)
        self.flattened_data[start:end] = vec
        self.cumulative_length[i] = end

    def __iter__(self) -> Iterator[NDArray]:
        """Yield the stored vectors one by one, in order."""
        start = 0
        for stop in self.cumulative_length:
            yield self.flattened_data[start:stop]
            # the next vector begins where this one ends
            start = stop

    def __str__(self) -> str:
        """Return a multi-line listing with one vector per line.

        Every vector, including the last one, is followed by a comma and a
        newline. User attributes other than ``datatype`` are appended, if any.
        """
        # rows after the first are indented by one space to align with the
        # opening bracket
        string = " ".join(
            np.array2string(vec, prefix=" ") + ",\n" for vec in self
        )
        string = f"[{string}]"

        tmp_attrs = self.attrs.copy()
        tmp_attrs.pop("datatype", None)
        if len(tmp_attrs) > 0:
            string += f" with attrs={tmp_attrs}"

        return string

    def __repr__(self) -> str:
        """Return a compact representation with abbreviated array contents."""
        # The context manager restores the caller's print options even if
        # one of the nested repr() calls raises.
        with np.printoptions(threshold=5, edgeitems=2, linewidth=100):
            return (
                "VectorOfVectors(flattened_data="
                + repr(self.flattened_data)
                + ", cumulative_length="
                + repr(self.cumulative_length)
                + ", attrs="
                + repr(self.attrs)
                + ")"
            )

    def to_aoesa(self, preserve_dtype: bool = False) -> aoesa.ArrayOfEqualSizedArrays:
        """Convert to :class:`ArrayOfEqualSizedArrays`.

        If `preserve_dtype` is False, the output array will have dtype
        :class:`numpy.float64` and is padded with :class:`numpy.nan`.
        Otherwise, the dtype of the original :class:`VectorOfVectors` is
        preserved; in that case the padding entries are left uninitialized.

        The second dimension of the output equals the length of the longest
        stored vector (0 if no vectors are stored).
        """
        n_vectors = len(self.cumulative_length)
        ind_lengths = np.diff(self.cumulative_length.nda, prepend=0)
        # np.max() raises on an empty array, so handle "no vectors" explicitly
        arr_len = np.max(ind_lengths) if n_vectors > 0 else 0

        if not preserve_dtype:
            nda = np.empty((n_vectors, arr_len))
            nda.fill(np.nan)
        else:
            nda = np.empty((n_vectors, arr_len), dtype=self.dtype)

        for i, vec_len in enumerate(ind_lengths):
            nda[i, :vec_len] = self[i]

        return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs())
421
+
422
+
423
def build_cl(
    sorted_array_in: NDArray, cumulative_length_out: NDArray = None
) -> NDArray:
    """Build a cumulative length array from an array of sorted data.

    Examples
    --------
    >>> build_cl(np.array([3, 3, 3, 4]))
    array([3, 4], dtype=uint64)

    For a `sorted_array_in` of indices, this is the inverse of
    :func:`.explode_cl`, in the sense that doing
    ``build_cl(explode_cl(cumulative_length))`` would recover the original
    `cumulative_length`.

    Parameters
    ----------
    sorted_array_in
        array of data already sorted; each N matching contiguous entries will
        be converted into a new row of `cumulative_length_out`.
    cumulative_length_out
        a pre-allocated array for the output `cumulative_length`. It will
        always have length <= `sorted_array_in`, so giving them the same length
        is safe if there is not a better guess. Its contents are reset to
        zero before use.

    Returns
    -------
    cumulative_length_out
        the output cumulative length array. If the user provides a
        `cumulative_length_out` that is too long, this return value is sliced
        to contain only the used portion of the allocated memory. ``None`` is
        returned if `sorted_array_in` is empty.

    Raises
    ------
    ValueError
        if a zero-length `cumulative_length_out` is supplied for non-empty
        input.
    """
    if len(sorted_array_in) == 0:
        return None

    sorted_array_in = np.asarray(sorted_array_in)

    if cumulative_length_out is None:
        # one slot per input element is always enough
        cumulative_length_out = np.zeros(len(sorted_array_in), dtype=np.uint64)
    else:
        cumulative_length_out.fill(0)

    # the input is known to be non-empty here, so at least one slot is needed
    if len(cumulative_length_out) == 0:
        raise ValueError(
            f"cumulative_length_out too short ({len(cumulative_length_out)})"
        )

    return _nb_build_cl(sorted_array_in, cumulative_length_out)
467
+
468
+
469
@numba.njit
def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> NDArray:
    """Numba-compiled inner loop for :func:`build_cl`.

    Counts runs of equal consecutive values in `sorted_array_in` and stores
    the running total of run lengths in `cumulative_length_out`, which is
    expected to be zero-initialized. `sorted_array_in` must not be empty
    (its first element is read unconditionally).

    Returns the used leading portion of `cumulative_length_out`.

    Raises
    ------
    RuntimeError
        if `cumulative_length_out` has fewer slots than there are runs.
    """
    ii = 0
    last_val = sorted_array_in[0]
    for val in sorted_array_in:
        if val != last_val:
            # a new run starts: move to the next output slot
            ii += 1
            # validate the slot *before* writing to it, so that an
            # undersized output array is never written out of bounds
            if ii >= len(cumulative_length_out):
                raise RuntimeError("cumulative_length_out too short")
            # carry over the total accumulated so far
            cumulative_length_out[ii] = cumulative_length_out[ii - 1]
            last_val = val
        cumulative_length_out[ii] += 1
    # ii was the index of the last used slot; convert it to a length
    ii += 1
    return cumulative_length_out[:ii]
484
+
485
+
486
def explode_cl(cumulative_length: NDArray, array_out: NDArray = None) -> NDArray:
    """Explode a `cumulative_length` array.

    Examples
    --------
    >>> explode_cl(np.array([2, 3]))
    array([0, 0, 1], dtype=uint64)

    This is the inverse of :func:`.build_cl`, in the sense that doing
    ``build_cl(explode_cl(cumulative_length))`` would recover the original
    `cumulative_length`.

    Parameters
    ----------
    cumulative_length
        the cumulative length array to be exploded.
    array_out
        a pre-allocated array to hold the exploded cumulative length array.
        The length should be equal to ``cumulative_length[-1]`` (0 if
        `cumulative_length` is empty).

    Returns
    -------
    array_out
        the exploded cumulative length array.

    Raises
    ------
    ValueError
        if the length of `array_out` does not match the required length.
    """
    cumulative_length = np.asarray(cumulative_length)
    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0

    if array_out is None:
        array_out = np.empty(int(out_len), dtype=np.uint64)

    if len(array_out) != out_len:
        # report out_len rather than indexing cumulative_length again, which
        # would fail with IndexError when cumulative_length is empty
        raise ValueError(
            f"bad lengths: cl[-1] ({out_len}) != out ({len(array_out)})"
        )

    return _nb_explode_cl(cumulative_length, array_out)
520
+
521
+
522
@numba.njit
def _nb_explode_cl(cumulative_length: NDArray, array_out: NDArray) -> NDArray:
    """Numba-compiled inner loop for :func:`explode_cl`.

    For every index ``ii`` of `cumulative_length`, writes ``ii`` into the
    positions of `array_out` that lie between the previous cumulative length
    (0 for the first entry) and ``cumulative_length[ii]``.

    Raises :class:`ValueError` if `array_out` does not have length
    ``cumulative_length[-1]`` (0 for an empty `cumulative_length`).
    Returns `array_out`.
    """
    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
    if len(array_out) != out_len:
        raise ValueError("bad lengths")
    # offset in array_out where the current vector begins
    start = 0
    for ii in range(len(cumulative_length)):
        # number of elements in vector ii
        # NOTE(review): the int() casts here and below presumably keep the
        # loop bound and index integral under Numba's type inference, since
        # `start` is rebound from a literal to an array element -- confirm
        nn = int(cumulative_length[ii] - start)
        for jj in range(nn):
            array_out[int(start + jj)] = ii
        start = cumulative_length[ii]
    return array_out
535
+
536
+
537
def explode(
    cumulative_length: NDArray, array_in: NDArray, array_out: NDArray = None
) -> NDArray:
    """Explode a data array using a `cumulative_length` array.

    This is identical to :func:`.explode_cl`, except `array_in` gets exploded
    instead of `cumulative_length`.

    Examples
    --------
    >>> explode(np.array([2, 3]), np.array([3, 4]))
    array([3, 3, 4])

    Parameters
    ----------
    cumulative_length
        the cumulative length array to use for exploding.
    array_in
        the data to be exploded. Must have same length as `cumulative_length`.
    array_out
        a pre-allocated array to hold the exploded data. The length should be
        equal to ``cumulative_length[-1]`` (0 if `cumulative_length` is
        empty). If not provided, an array with the dtype of `array_in` is
        allocated.

    Returns
    -------
    array_out
        the exploded data array.

    Raises
    ------
    ValueError
        if `array_in` and `cumulative_length` differ in length, or if the
        length of `array_out` does not match the required length.
    """
    cumulative_length = np.asarray(cumulative_length)
    array_in = np.asarray(array_in)
    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0

    if array_out is None:
        # int() mirrors explode_cl and guarantees a valid shape argument
        array_out = np.empty(int(out_len), dtype=array_in.dtype)

    if len(cumulative_length) != len(array_in) or len(array_out) != out_len:
        # report out_len rather than indexing cumulative_length again, which
        # would fail with IndexError when cumulative_length is empty
        raise ValueError(
            f"bad lengths: cl ({len(cumulative_length)}) != in ({len(array_in)}) "
            f"and cl[-1] ({out_len}) != out ({len(array_out)})"
        )

    return nb_explode(cumulative_length, array_in, array_out)
576
+
577
+
578
@numba.njit
def nb_explode(
    cumulative_length: NDArray, array_in: NDArray, array_out: NDArray
) -> NDArray:
    """Numba-compiled inner loop for :func:`.explode`.

    Fills each position of `array_out` with the element of `array_in` that
    belongs to the vector covering that position according to
    `cumulative_length`, and returns `array_out`.

    Raises :class:`ValueError` if `array_in` and `cumulative_length` differ
    in length, or if `array_out` does not have length
    ``cumulative_length[-1]`` (0 for an empty `cumulative_length`).
    """
    n_vectors = len(cumulative_length)
    expected_len = cumulative_length[-1] if n_vectors > 0 else 0
    if n_vectors != len(array_in) or len(array_out) != expected_len:
        raise ValueError("bad lengths")

    # index of the vector that owns the current output position
    vec = 0
    for pos in range(len(array_out)):
        # skip over vectors that end at or before this position
        while vec < n_vectors and pos >= cumulative_length[vec]:
            vec += 1
        array_out[pos] = array_in[vec]
    return array_out
592
+
593
+
594
def explode_arrays(
    cumulative_length: Array, arrays: list[NDArray], arrays_out: list[NDArray] = None
) -> list:
    """Explode a set of arrays using a `cumulative_length` array.

    Parameters
    ----------
    cumulative_length
        the cumulative length array to use for exploding.
    arrays
        the data arrays to be exploded. Each array must have same length as
        `cumulative_length`. The sequence itself is not modified.
    arrays_out
        a list of pre-allocated arrays to hold the exploded data. The length of
        the list should be equal to the length of `arrays`, and each entry in
        arrays_out should have length ``cumulative_length[-1]``. If not
        provided, output arrays are allocated for the user.

    Returns
    -------
    arrays_out
        the list of exploded data arrays.
    """
    cumulative_length = np.asarray(cumulative_length)
    # Convert into a private list instead of overwriting the caller's
    # entries, so that immutable sequences (e.g. tuples) are accepted and
    # the argument is left untouched.
    arrays_in = [np.asarray(array) for array in arrays]
    out_len = int(cumulative_length[-1]) if len(cumulative_length) > 0 else 0

    if arrays_out is None:
        arrays_out = [np.empty(out_len, dtype=array.dtype) for array in arrays_in]

    # explicit indexing keeps the IndexError for a too-short arrays_out
    for ii, array_in in enumerate(arrays_in):
        explode(cumulative_length, array_in, arrays_out[ii])

    return arrays_out