legend-pydataobj 1.11.6__py3-none-any.whl → 1.12.0a1__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.12.0a1.dist-info}/METADATA +3 -2
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.12.0a1.dist-info}/RECORD +23 -22
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.12.0a1.dist-info}/WHEEL +1 -1
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.12.0a1.dist-info}/entry_points.txt +1 -1
- lgdo/_version.py +9 -4
- lgdo/cli.py +10 -155
- lgdo/lh5/__init__.py +1 -0
- lgdo/lh5/_serializers/read/composite.py +1 -3
- lgdo/lh5/_serializers/read/utils.py +1 -1
- lgdo/lh5/_serializers/read/vector_of_vectors.py +1 -1
- lgdo/lh5/concat.py +219 -0
- lgdo/lh5/core.py +21 -30
- lgdo/lh5/iterator.py +48 -27
- lgdo/lh5/store.py +15 -68
- lgdo/types/array.py +74 -13
- lgdo/types/encoded.py +25 -20
- lgdo/types/histogram.py +1 -1
- lgdo/types/lgdo.py +50 -0
- lgdo/types/table.py +49 -28
- lgdo/types/vectorofvectors.py +70 -77
- lgdo/types/vovutils.py +14 -4
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.12.0a1.dist-info/licenses}/LICENSE +0 -0
- {legend_pydataobj-1.11.6.dist-info → legend_pydataobj-1.12.0a1.dist-info}/top_level.txt +0 -0
lgdo/types/table.py
CHANGED
```diff
@@ -19,7 +19,7 @@ from pandas.io.formats import format as fmt
 
 from .array import Array
 from .arrayofequalsizedarrays import ArrayOfEqualSizedArrays
-from .lgdo import LGDO
+from .lgdo import LGDO, LGDOCollection
 from .scalar import Scalar
 from .struct import Struct
 from .vectorofvectors import VectorOfVectors
@@ -27,13 +27,9 @@ from .vectorofvectors import VectorOfVectors
 log = logging.getLogger(__name__)
 
 
-class Table(Struct):
+class Table(Struct, LGDOCollection):
     """A special struct of arrays or subtable columns of equal length.
 
-    Holds onto an internal read/write location ``loc`` that is useful in
-    managing table I/O using functions like :meth:`push_row`, :meth:`is_full`,
-    and :meth:`clear`.
-
     Note
     ----
     If you write to a table and don't fill it up to its total size, be sure to
```
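`Table` now derives from `LGDOCollection`, which this release adds to `lgdo/types/lgdo.py` (+50 lines, collapsed above). The sketch below is not taken from that file; it is a rough reconstruction of the contract implied by the call sites visible in this diff (`reserve_capacity`, `get_capacity`, `trim_capacity`, `resize`, `insert`, and an `attrs`-taking constructor), and the real class may differ.

```python
# Hypothetical sketch of the LGDOCollection contract implied by this diff;
# the real definition lives in lgdo/types/lgdo.py and is not shown here.
from __future__ import annotations

from abc import abstractmethod
from collections.abc import Mapping
from typing import Any

from lgdo.types.lgdo import LGDO  # pre-existing base class


class LGDOCollection(LGDO):
    """An LGDO whose length can change: resizable, insertable, and with
    explicit control over the capacity of its internal buffers."""

    def __init__(self, attrs: Mapping[str, Any] | None = None) -> None:
        super().__init__(attrs)

    @abstractmethod
    def __len__(self) -> int:
        """Number of elements (rows) currently stored."""

    @abstractmethod
    def reserve_capacity(self, capacity) -> None:
        """Pre-allocate internal memory without changing the visible size."""

    @abstractmethod
    def get_capacity(self):
        """Return the currently allocated capacity."""

    @abstractmethod
    def trim_capacity(self) -> None:
        """Shrink the allocation to the minimum needed for the current size."""

    @abstractmethod
    def resize(self, new_size: int, trim: bool = False) -> None:
        """Change the visible size, optionally trimming capacity to match."""

    @abstractmethod
    def insert(self, i: int, val) -> None:
        """Insert a value at index `i`, growing the collection by one."""
```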
```diff
@@ -49,7 +45,7 @@ class Table(Struct):
 
     def __init__(
         self,
-        col_dict: Mapping[str,
+        col_dict: Mapping[str, LGDOCollection] | pd.DataFrame | ak.Array | None = None,
         size: int | None = None,
         attrs: Mapping[str, Any] | None = None,
     ) -> None:
@@ -65,7 +61,7 @@ class Table(Struct):
         col_dict
             instantiate this table using the supplied mapping of column names
             and array-like objects. Supported input types are: mapping of
-            strings to
+            strings to LGDOCollections, :class:`pd.DataFrame` and :class:`ak.Array`.
             Note 1: no copy is performed, the objects are used directly (unless
             :class:`ak.Array` is provided). Note 2: if `size` is not ``None``,
             all arrays will be resized to match it. Note 3: if the arrays have
@@ -85,7 +81,8 @@ class Table(Struct):
             col_dict = _ak_to_lgdo_or_col_dict(col_dict)
 
         # call Struct constructor
-
+        Struct.__init__(self, obj_dict=col_dict)
+        LGDOCollection.__init__(self, attrs=attrs)
 
         # if col_dict is not empty, set size according to it
         # if size is also supplied, resize all fields to match it
@@ -93,13 +90,10 @@ class Table(Struct):
         if col_dict is not None and len(col_dict) > 0:
             self.resize(new_size=size, do_warn=(size is None))
 
-        # if no col_dict, just set the size
+        # if no col_dict, just set the size
         else:
             self.size = size if size is not None else None
 
-        # always start at loc=0
-        self.loc = 0
-
     def datatype_name(self) -> str:
         return "table"
 
@@ -107,7 +101,31 @@ class Table(Struct):
         """Provides ``__len__`` for this array-like class."""
         return self.size
 
-    def
+    def reserve_capacity(self, capacity: int | list) -> None:
+        "Set size (number of rows) of internal memory buffer"
+        if isinstance(capacity, int):
+            for obj in self.values():
+                obj.reserve_capacity(capacity)
+        else:
+            if len(capacity) != len(self.keys()):
+                msg = "List of capacities must have same length as number of keys"
+                raise ValueError(msg)
+
+            for obj, cap in zip(self.values(), capacity):
+                obj.reserve_capacity(cap)
+
+    def get_capacity(self) -> int:
+        "Get list of capacities for each key"
+        return [v.get_capacity() for v in self.values()]
+
+    def trim_capacity(self) -> int:
+        "Set capacity to be minimum needed to support Array size"
+        for v in self.values():
+            v.trim_capacity()
+
+    def resize(
+        self, new_size: int | None = None, do_warn: bool = False, trim: bool = False
+    ) -> None:
         # if new_size = None, use the size from the first field
         for field, obj in self.items():
             if new_size is None:
```
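A minimal usage sketch of the new capacity API on `Table`. The column names and sizes are invented for illustration, and it assumes `Array` gains the matching `reserve_capacity`/`get_capacity`/`trim_capacity` methods (its diff, `lgdo/types/array.py`, is collapsed above).

```python
import numpy as np

from lgdo import Array, Table

# two columns of 100 rows; no copies are made, the Arrays are used directly
tbl = Table(
    col_dict={
        "energy": Array(np.zeros(100)),
        "channel": Array(np.zeros(100, dtype="uint32")),
    }
)

# pre-allocate room for 100000 rows in every column (a list with one capacity
# per column is also accepted); the visible size stays at 100
tbl.reserve_capacity(100_000)
print(tbl.get_capacity())  # one capacity per column, e.g. [100000, 100000]

# growing within the reserved capacity reuses the existing allocation
tbl.resize(50_000)

# give the unused memory back
tbl.trim_capacity()
```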
```diff
@@ -119,21 +137,20 @@ class Table(Struct):
                     f"with size {len(obj)} != {new_size}"
                 )
             if isinstance(obj, Table):
-                obj.resize(new_size)
+                obj.resize(new_size, trim)
             else:
-                obj.resize(new_size)
+                obj.resize(new_size, trim)
         self.size = new_size
 
-    def
-
-
-
-
-
-    def clear(self) -> None:
-        self.loc = 0
+    def insert(self, i: int, vals: dict) -> None:
+        "Insert vals into table at row i. Vals is a mapping from table key to val"
+        for k, ar in self.items():
+            ar.insert(i, vals[k])
+        self.size += 1
 
-    def add_field(
+    def add_field(
+        self, name: str, obj: LGDOCollection, use_obj_size: bool = False
+    ) -> None:
         """Add a field (column) to the table.
 
         Use the name "field" here to match the terminology used in
```
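The old `loc` / `push_row` / `clear` bookkeeping is replaced by `insert`. A short, hypothetical sketch of adding rows with the new method (column names and values are made up, and it assumes the column type, `Array` here, implements the `LGDOCollection.insert` used above).

```python
import numpy as np

from lgdo import Array, Table

tbl = Table(
    col_dict={
        "a": Array(np.array([1, 2, 4])),
        "b": Array(np.array([10.0, 20.0, 40.0])),
    }
)

# insert a row at index 2; `vals` maps every column name to its new value
tbl.insert(2, {"a": 3, "b": 30.0})

# appending a row is just inserting at the end
tbl.insert(len(tbl), {"a": 5, "b": 50.0})

print(len(tbl))  # 5
```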
```diff
@@ -170,7 +187,9 @@ class Table(Struct):
         new_size = len(obj) if use_obj_size else self.size
         self.resize(new_size=new_size)
 
-    def add_column(
+    def add_column(
+        self, name: str, obj: LGDOCollection, use_obj_size: bool = False
+    ) -> None:
         """Alias for :meth:`.add_field` using table terminology 'column'."""
         self.add_field(name, obj, use_obj_size=use_obj_size)
 
@@ -201,8 +220,10 @@ class Table(Struct):
             set to ``False`` to turn off warnings associated with mismatched
             `loc` parameter or :meth:`add_column` warnings.
         """
-        if other_table
-            log.warning(
+        if len(other_table) != len(self) and do_warn:
+            log.warning(
+                f"len(other_table) ({len(other_table)}) != len(self) ({len(self)})"
+            )
         if cols is None:
             cols = other_table.keys()
         for name in cols:
```
lgdo/types/vectorofvectors.py
CHANGED
```diff
@@ -20,12 +20,12 @@ from .. import utils
 from . import arrayofequalsizedarrays as aoesa
 from . import vovutils
 from .array import Array
-from .lgdo import
+from .lgdo import LGDOCollection
 
 log = logging.getLogger(__name__)
 
 
-class VectorOfVectors(
+class VectorOfVectors(LGDOCollection):
     """A n-dimensional variable-length 1D array of variable-length 1D arrays.
 
     If the vector is 2-dimensional, the internal representation is as two NumPy
@@ -138,7 +138,7 @@ class VectorOfVectors(LGDO):
         # FIXME: have to copy the buffers, otherwise self will not own the
         # data and self.resize() will fail. Is it possible to avoid this?
         flattened_data = np.copy(
-            container.pop(f"node{data.ndim-1}-data", np.empty(0, dtype=dtype))
+            container.pop(f"node{data.ndim - 1}-data", np.empty(0, dtype=dtype))
         )
 
         # if user-provided dtype is different than dtype from Awkward, cast
@@ -210,20 +210,17 @@ class VectorOfVectors(LGDO):
         elif self.flattened_data is None:
             self.flattened_data = flattened_data
 
-
-        self.dtype = self.flattened_data.dtype
-
-        # set ndim
-        self.ndim = 2
-        pointer = self.flattened_data
-        while True:
-            if isinstance(pointer, Array):
-                break
+        super().__init__(attrs)
 
-
-
+    @property
+    def ndim(self):
+        return 1 + (
+            1 if isinstance(self.flattened_data, Array) else self.flattened_data.ndim
+        )
 
-
+    @property
+    def dtype(self) -> np.dtype:
+        return self.flattened_data.dtype
 
     def datatype_name(self) -> str:
         return "array"
```
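`ndim` and `dtype` are no longer set in the constructor but are computed on the fly from `flattened_data`. A small illustration of the behavior implied by the property definitions above:

```python
from lgdo import VectorOfVectors

vov = VectorOfVectors([[1, 2, 3], [4, 5]])

# flattened_data is a plain Array for a 2-dimensional vector, so ndim is 1 + 1
print(vov.ndim)   # 2

# dtype is now read through flattened_data instead of being stored separately
print(vov.dtype)  # the dtype of the flattened data, e.g. int64 here
```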
```diff
@@ -276,7 +273,30 @@ class VectorOfVectors(LGDO):
         else:
             raise NotImplementedError
 
-    def
+    def reserve_capacity(self, cap_cl, *cap_args) -> None:
+        """Set capacity of internal data arrays. Expect number of args to
+        equal `self.n_dim`. First arg is capacity of cumulative length array.
+        If `self.n_dim` is 2, second argument is capacity of flattened data,
+        otherwise arguments are fed recursively to remaining dimensions.
+        """
+        self.cumulative_length.reserve_capacity(cap_cl)
+        self.flattened_data.reserve_capacity(*cap_args)
+
+    def get_capacity(self) -> tuple[int]:
+        """Get tuple containing capacity of each dimension. First dimension
+        is cumulative length array. Last dimension is flattened data.
+        """
+        fd_cap = self.flattened_data.get_capacity()
+        if isinstance(fd_cap, int):
+            return (self.cumulative_length.get_capacity(), fd_cap)
+        return (self.cumulative_length.get_capacity(), *fd_cap)
+
+    def trim_capacity(self) -> None:
+        "Set capacity for all dimensions to minimum needed to hold data"
+        self.cumulative_length.trim_capacity()
+        self.flattened_data.trim_capacity()
+
+    def resize(self, new_size: int, trim: bool = False) -> None:
         """Resize vector along the first axis.
 
         `self.flattened_data` is resized only if `new_size` is smaller than the
```
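A brief sketch of the new capacity controls on a 2-dimensional `VectorOfVectors`: the first argument sizes the `cumulative_length` buffer, the second the `flattened_data` buffer. It assumes `Array.reserve_capacity` (added in the collapsed `array.py` diff) allocates at least the requested number of elements.

```python
from lgdo import VectorOfVectors

vov = VectorOfVectors([[1, 2, 3], [4, 5]])

# room for 1000 vectors holding 10000 elements in total
vov.reserve_capacity(1_000, 10_000)
print(vov.get_capacity())  # (cumulative_length, flattened_data), e.g. (1000, 10000)

# shrink both buffers back to the minimum needed for the current contents
vov.trim_capacity()
```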
```diff
@@ -286,6 +306,8 @@ class VectorOfVectors(LGDO):
         `self.cumulative_length` is padded with its last element. This
         corresponds to appending empty vectors.
 
+        If `trim` is ``True``, resize capacity to match new size
+
         Examples
         --------
         >>> vov = VectorOfVectors([[1, 2, 3], [4, 5]])
@@ -303,23 +325,22 @@ class VectorOfVectors(LGDO):
          [3],
         ]
         """
-        vidx = self.cumulative_length
         old_s = len(self)
-        dlen = new_size - old_s
-        csum = vidx[-1] if len(self) > 0 else 0
 
         # first resize the cumulative length
-        self.cumulative_length.resize(new_size)
+        self.cumulative_length.resize(new_size, trim)
 
         # if new_size > size, new elements are filled with zeros, let's fix
         # that
-        if
-            self.cumulative_length[old_s:] =
+        if new_size > old_s:
+            self.cumulative_length[old_s:] = self.cumulative_length[old_s - 1]
 
         # then resize the data array
         # if dlen > 0 this has no effect
         if len(self.cumulative_length) > 0:
-            self.flattened_data.resize(self.cumulative_length[-1])
+            self.flattened_data.resize(self.cumulative_length[-1], trim)
+        else:
+            self.flattened_data.resize(0, trim)
 
     def append(self, new: NDArray) -> None:
         """Append a 1D vector `new` at the end.
```
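With the new `trim` flag, shrinking a vector can also release the now-unused buffer memory instead of only moving the end marker. A quick illustration under the same assumptions as above (the exact capacities reported depend on `Array`'s allocation policy):

```python
from lgdo import VectorOfVectors

vov = VectorOfVectors([[1, 2, 3], [4, 5], [6], [7, 8]])

# keep only the first two vectors and hand the freed capacity back
vov.resize(2, trim=True)

print(len(vov))            # 2, i.e. [1 2 3] and [4 5] remain
print(vov.get_capacity())  # roughly (2, 5) after trimming
```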
```diff
@@ -334,20 +355,7 @@ class VectorOfVectors(LGDO):
          [8 9],
         ]
         """
-
-            # first extend cumulative_length by +1
-            self.cumulative_length.resize(len(self) + 1)
-            # set it at the right value
-            newlen = (
-                self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
-            )
-            self.cumulative_length[-1] = newlen
-            # then resize flattened_data to accommodate the new vector
-            self.flattened_data.resize(len(self.flattened_data) + len(new))
-            # finally set it
-            self[-1] = new
-        else:
-            raise NotImplementedError
+        self.insert(len(self), new)
 
     def insert(self, i: int, new: NDArray) -> None:
         """Insert a vector at index `i`.
@@ -364,23 +372,15 @@ class VectorOfVectors(LGDO):
          [8 9],
          [4 5],
         ]
-
-        Warning
-        -------
-        This method involves a significant amount of memory re-allocation and
-        is expected to perform poorly on large vectors.
         """
         if self.ndim == 2:
-            if i
-                msg = f"index {i} is out of bounds for vector
+            if i > len(self):
+                msg = f"index {i} is out of bounds for vector with size {len(self)}"
                 raise IndexError(msg)
 
-            self.
-
-            )
-            self.cumulative_length = Array(
-                np.insert(self.cumulative_length, i, self.cumulative_length[i - 1])
-            )
+            i_start = 0 if i == 0 else self.cumulative_length[i - 1]
+            self.flattened_data.insert(i_start, new)
+            self.cumulative_length.insert(i, i_start)
             self.cumulative_length[i:] += np.uint32(len(new))
         else:
             raise NotImplementedError
@@ -400,11 +400,6 @@ class VectorOfVectors(LGDO):
         [[8 9],
          [4 5],
         ]
-
-        Warning
-        -------
-        This method involves a significant amount of memory re-allocation and
-        is expected to perform poorly on large vectors.
         """
         if self.ndim == 2:
             if i >= len(self):
@@ -414,27 +409,17 @@ class VectorOfVectors(LGDO):
             vidx = self.cumulative_length
             dlen = len(new) - len(self[i])
 
-            if dlen
-                #
-
-
-
-
-
-
-
-
-                )
-            else:
-                # set the already allocated indices
-                self.flattened_data[vidx[i - 1] : vidx[i]] = new[: len(self[i])]
-                # then insert the remaining
-                self.flattened_data = Array(
-                    np.insert(self.flattened_data, vidx[i], new[len(self[i]) :])
-                )
-
-            vidx[i:] = vidx[i:] + dlen
+            if dlen != 0:
+                # move the subsequent entries
+                vidx[i:] += dlen
+                self.flattened_data.resize(vidx[-1])
+                self.flattened_data._nda[vidx[i] : vidx[-1]] = self.flattened_data._nda[
+                    vidx[i] - dlen : vidx[-1] - dlen
+                ]
+
+            # set the already allocated indices
+            start = vidx[i - 1] if i > 0 else 0
+            self.flattened_data[start : vidx[i]] = new
         else:
             raise NotImplementedError
 
@@ -484,7 +469,15 @@ class VectorOfVectors(LGDO):
         cum_lens = np.add(start, lens.cumsum(), dtype=int)
 
         # fill with fast vectorized routine
-
+        if np.issubdtype(self.flattened_data.dtype, np.unsignedinteger):
+            nan_val = np.iinfo(self.flattened_data.dtype).max
+        if np.issubdtype(self.flattened_data.dtype, np.integer):
+            nan_val = np.iinfo(self.flattened_data.dtype).min
+        else:
+            nan_val = np.nan
+        vovutils._nb_fill(
+            vec, lens, nan_val, self.flattened_data.nda[start : cum_lens[-1]]
+        )
 
         # add new vector(s) length to cumulative_length
         self.cumulative_length[i : i + len(lens)] = cum_lens
```
lgdo/types/vovutils.py
CHANGED
```diff
@@ -81,7 +81,7 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND
 
 @numba.guvectorize(
     [
-        f"{data_type}[:,:],{size_type}[:],{data_type}[:]"
+        f"{data_type}[:,:],{size_type}[:],{data_type},{data_type}[:]"
         for data_type in [
             "b1",
             "i1",
@@ -99,10 +99,12 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND
         ]
         for size_type in ["i4", "i8", "u4", "u8"]
     ],
-    "(l,m),(l),(n)",
+    "(l,m),(l),(),(n)",
    **nb_kwargs,
 )
-def _nb_fill(
+def _nb_fill(
+    aoa_in: NDArray, len_in: NDArray, nan_val: int | float, flattened_array_out: NDArray
+):
     """Vectorized function to fill flattened array from array of arrays and
     lengths. Values in aoa_in past lengths will not be copied.
 
@@ -112,6 +114,9 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
         array of arrays containing values to be copied
     len_in
         array of vector lengths for each row of aoa_in
+    nan_val
+        value to use when len_in is longer than aoa_in. Should use
+        np.nan for floating point, and 0xfff... for integer types
     flattened_array_out
         flattened array to copy values into. Must be longer than sum of
         lengths in len_in
@@ -122,9 +127,14 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
         raise ValueError(msg)
 
     start = 0
+    max_len = aoa_in.shape[1]
    for i, ll in enumerate(len_in):
        stop = start + ll
-
+        if ll > max_len:
+            flattened_array_out[start : start + max_len] = aoa_in[i, :]
+            flattened_array_out[start + max_len : stop] = nan_val
+        else:
+            flattened_array_out[start:stop] = aoa_in[i, :ll]
        start = stop
 
 
```
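`_nb_fill` now takes an explicit padding value for rows whose requested length exceeds the width of `aoa_in`, mirroring the call added in `vectorofvectors.py` above. A small illustrative call of this internal helper (array contents are made up; this is not part of the public API):

```python
import numpy as np

from lgdo.types import vovutils

aoa_in = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]])  # 2 rows, width 3
len_in = np.array([3, 5], dtype="uint32")              # second row asks for 5 values
out = np.empty(int(len_in.sum()), dtype=aoa_in.dtype)

# rows longer than aoa_in's width are padded with nan_val (np.nan for floats,
# an extreme integer value for integer dtypes)
vovutils._nb_fill(aoa_in, len_in, np.nan, out)
print(out)  # [ 1.  2.  3.  4.  5.  0. nan nan]
```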