legend-pydataobj 1.11.6__py3-none-any.whl → 1.12.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lgdo/types/table.py CHANGED
@@ -19,7 +19,7 @@ from pandas.io.formats import format as fmt
19
19
 
20
20
  from .array import Array
21
21
  from .arrayofequalsizedarrays import ArrayOfEqualSizedArrays
22
- from .lgdo import LGDO
22
+ from .lgdo import LGDO, LGDOCollection
23
23
  from .scalar import Scalar
24
24
  from .struct import Struct
25
25
  from .vectorofvectors import VectorOfVectors
@@ -27,13 +27,9 @@ from .vectorofvectors import VectorOfVectors
27
27
  log = logging.getLogger(__name__)
28
28
 
29
29
 
30
- class Table(Struct):
30
+ class Table(Struct, LGDOCollection):
31
31
  """A special struct of arrays or subtable columns of equal length.
32
32
 
33
- Holds onto an internal read/write location ``loc`` that is useful in
34
- managing table I/O using functions like :meth:`push_row`, :meth:`is_full`,
35
- and :meth:`clear`.
36
-
37
33
  Note
38
34
  ----
39
35
  If you write to a table and don't fill it up to its total size, be sure to
@@ -49,7 +45,7 @@ class Table(Struct):
49
45
 
50
46
  def __init__(
51
47
  self,
52
- col_dict: Mapping[str, LGDO] | pd.DataFrame | ak.Array | None = None,
48
+ col_dict: Mapping[str, LGDOCollection] | pd.DataFrame | ak.Array | None = None,
53
49
  size: int | None = None,
54
50
  attrs: Mapping[str, Any] | None = None,
55
51
  ) -> None:
@@ -65,7 +61,7 @@ class Table(Struct):
65
61
  col_dict
66
62
  instantiate this table using the supplied mapping of column names
67
63
  and array-like objects. Supported input types are: mapping of
68
- strings to LGDOs, :class:`pd.DataFrame` and :class:`ak.Array`.
64
+ strings to LGDOCollections, :class:`pd.DataFrame` and :class:`ak.Array`.
69
65
  Note 1: no copy is performed, the objects are used directly (unless
70
66
  :class:`ak.Array` is provided). Note 2: if `size` is not ``None``,
71
67
  all arrays will be resized to match it. Note 3: if the arrays have
@@ -85,7 +81,8 @@ class Table(Struct):
85
81
  col_dict = _ak_to_lgdo_or_col_dict(col_dict)
86
82
 
87
83
  # call Struct constructor
88
- super().__init__(obj_dict=col_dict, attrs=attrs)
84
+ Struct.__init__(self, obj_dict=col_dict)
85
+ LGDOCollection.__init__(self, attrs=attrs)
89
86
 
90
87
  # if col_dict is not empty, set size according to it
91
88
  # if size is also supplied, resize all fields to match it
@@ -93,13 +90,10 @@ class Table(Struct):
93
90
  if col_dict is not None and len(col_dict) > 0:
94
91
  self.resize(new_size=size, do_warn=(size is None))
95
92
 
96
- # if no col_dict, just set the size (default to 1024)
93
+ # if no col_dict, just set the size
97
94
  else:
98
95
  self.size = size if size is not None else None
99
96
 
100
- # always start at loc=0
101
- self.loc = 0
102
-
103
97
  def datatype_name(self) -> str:
104
98
  return "table"
105
99
 
@@ -107,7 +101,31 @@ class Table(Struct):
107
101
  """Provides ``__len__`` for this array-like class."""
108
102
  return self.size
109
103
 
110
- def resize(self, new_size: int | None = None, do_warn: bool = False) -> None:
104
+ def reserve_capacity(self, capacity: int | list) -> None:
105
+ "Set capacity (number of rows) of internal memory buffers"
106
+ if isinstance(capacity, int):
107
+ for obj in self.values():
108
+ obj.reserve_capacity(capacity)
109
+ else:
110
+ if len(capacity) != len(self.keys()):
111
+ msg = "List of capacities must have same length as number of keys"
112
+ raise ValueError(msg)
113
+
114
+ for obj, cap in zip(self.values(), capacity):
115
+ obj.reserve_capacity(cap)
116
+
117
+ def get_capacity(self) -> int:
118
+ "Get list of capacities for each key"
119
+ return [v.get_capacity() for v in self.values()]
120
+
121
+ def trim_capacity(self) -> int:
122
+ "Set capacity to be minimum needed to support Array size"
123
+ for v in self.values():
124
+ v.trim_capacity()
125
+
126
+ def resize(
127
+ self, new_size: int | None = None, do_warn: bool = False, trim: bool = False
128
+ ) -> None:
111
129
  # if new_size = None, use the size from the first field
112
130
  for field, obj in self.items():
113
131
  if new_size is None:
@@ -119,21 +137,20 @@ class Table(Struct):
119
137
  f"with size {len(obj)} != {new_size}"
120
138
  )
121
139
  if isinstance(obj, Table):
122
- obj.resize(new_size)
140
+ obj.resize(new_size, trim)
123
141
  else:
124
- obj.resize(new_size)
142
+ obj.resize(new_size, trim)
125
143
  self.size = new_size
126
144
 
127
- def push_row(self) -> None:
128
- self.loc += 1
129
-
130
- def is_full(self) -> bool:
131
- return self.loc >= self.size
132
-
133
- def clear(self) -> None:
134
- self.loc = 0
145
+ def insert(self, i: int, vals: dict) -> None:
146
+ "Insert vals into table at row i. vals is a mapping from table key to value"
147
+ for k, ar in self.items():
148
+ ar.insert(i, vals[k])
149
+ self.size += 1
135
150
 
136
- def add_field(self, name: str, obj: LGDO, use_obj_size: bool = False) -> None:
151
+ def add_field(
152
+ self, name: str, obj: LGDOCollection, use_obj_size: bool = False
153
+ ) -> None:
137
154
  """Add a field (column) to the table.
138
155
 
139
156
  Use the name "field" here to match the terminology used in
@@ -170,7 +187,9 @@ class Table(Struct):
170
187
  new_size = len(obj) if use_obj_size else self.size
171
188
  self.resize(new_size=new_size)
172
189
 
173
- def add_column(self, name: str, obj: LGDO, use_obj_size: bool = False) -> None:
190
+ def add_column(
191
+ self, name: str, obj: LGDOCollection, use_obj_size: bool = False
192
+ ) -> None:
174
193
  """Alias for :meth:`.add_field` using table terminology 'column'."""
175
194
  self.add_field(name, obj, use_obj_size=use_obj_size)
176
195
 
@@ -201,8 +220,10 @@ class Table(Struct):
201
220
  set to ``False`` to turn off warnings associated with mismatched
202
221
  `loc` parameter or :meth:`add_column` warnings.
203
222
  """
204
- if other_table.loc != self.loc and do_warn:
205
- log.warning(f"other_table.loc ({other_table.loc}) != self.loc({self.loc})")
223
+ if len(other_table) != len(self) and do_warn:
224
+ log.warning(
225
+ f"len(other_table) ({len(other_table)}) != len(self) ({len(self)})"
226
+ )
206
227
  if cols is None:
207
228
  cols = other_table.keys()
208
229
  for name in cols:
@@ -20,12 +20,12 @@ from .. import utils
20
20
  from . import arrayofequalsizedarrays as aoesa
21
21
  from . import vovutils
22
22
  from .array import Array
23
- from .lgdo import LGDO
23
+ from .lgdo import LGDOCollection
24
24
 
25
25
  log = logging.getLogger(__name__)
26
26
 
27
27
 
28
- class VectorOfVectors(LGDO):
28
+ class VectorOfVectors(LGDOCollection):
29
29
  """A n-dimensional variable-length 1D array of variable-length 1D arrays.
30
30
 
31
31
  If the vector is 2-dimensional, the internal representation is as two NumPy
@@ -138,7 +138,7 @@ class VectorOfVectors(LGDO):
138
138
  # FIXME: have to copy the buffers, otherwise self will not own the
139
139
  # data and self.resize() will fail. Is it possible to avoid this?
140
140
  flattened_data = np.copy(
141
- container.pop(f"node{data.ndim-1}-data", np.empty(0, dtype=dtype))
141
+ container.pop(f"node{data.ndim - 1}-data", np.empty(0, dtype=dtype))
142
142
  )
143
143
 
144
144
  # if user-provided dtype is different than dtype from Awkward, cast
@@ -210,20 +210,17 @@ class VectorOfVectors(LGDO):
210
210
  elif self.flattened_data is None:
211
211
  self.flattened_data = flattened_data
212
212
 
213
- # finally set dtype
214
- self.dtype = self.flattened_data.dtype
215
-
216
- # set ndim
217
- self.ndim = 2
218
- pointer = self.flattened_data
219
- while True:
220
- if isinstance(pointer, Array):
221
- break
213
+ super().__init__(attrs)
222
214
 
223
- self.ndim += 1
224
- pointer = pointer.flattened_data
215
+ @property
216
+ def ndim(self):
217
+ return 1 + (
218
+ 1 if isinstance(self.flattened_data, Array) else self.flattened_data.ndim
219
+ )
225
220
 
226
- super().__init__(attrs)
221
+ @property
222
+ def dtype(self) -> np.dtype:
223
+ return self.flattened_data.dtype
227
224
 
228
225
  def datatype_name(self) -> str:
229
226
  return "array"
@@ -276,7 +273,30 @@ class VectorOfVectors(LGDO):
276
273
  else:
277
274
  raise NotImplementedError
278
275
 
279
- def resize(self, new_size: int) -> None:
276
+ def reserve_capacity(self, cap_cl, *cap_args) -> None:
277
+ """Set capacity of internal data arrays. Expect number of args to
278
+ equal `self.ndim`. First arg is capacity of cumulative length array.
279
+ If `self.ndim` is 2, second argument is capacity of flattened data,
280
+ otherwise arguments are fed recursively to remaining dimensions.
281
+ """
282
+ self.cumulative_length.reserve_capacity(cap_cl)
283
+ self.flattened_data.reserve_capacity(*cap_args)
284
+
285
+ def get_capacity(self) -> tuple[int]:
286
+ """Get tuple containing capacity of each dimension. First dimension
287
+ is cumulative length array. Last dimension is flattened data.
288
+ """
289
+ fd_cap = self.flattened_data.get_capacity()
290
+ if isinstance(fd_cap, int):
291
+ return (self.cumulative_length.get_capacity(), fd_cap)
292
+ return (self.cumulative_length.get_capacity(), *fd_cap)
293
+
294
+ def trim_capacity(self) -> None:
295
+ "Set capacity for all dimensions to minimum needed to hold data"
296
+ self.cumulative_length.trim_capacity()
297
+ self.flattened_data.trim_capacity()
298
+
299
+ def resize(self, new_size: int, trim: bool = False) -> None:
280
300
  """Resize vector along the first axis.
281
301
 
282
302
  `self.flattened_data` is resized only if `new_size` is smaller than the
@@ -286,6 +306,8 @@ class VectorOfVectors(LGDO):
286
306
  `self.cumulative_length` is padded with its last element. This
287
307
  corresponds to appending empty vectors.
288
308
 
309
+ If `trim` is ``True``, resize capacity to match new size
310
+
289
311
  Examples
290
312
  --------
291
313
  >>> vov = VectorOfVectors([[1, 2, 3], [4, 5]])
@@ -303,23 +325,22 @@ class VectorOfVectors(LGDO):
303
325
  [3],
304
326
  ]
305
327
  """
306
- vidx = self.cumulative_length
307
328
  old_s = len(self)
308
- dlen = new_size - old_s
309
- csum = vidx[-1] if len(self) > 0 else 0
310
329
 
311
330
  # first resize the cumulative length
312
- self.cumulative_length.resize(new_size)
331
+ self.cumulative_length.resize(new_size, trim)
313
332
 
314
333
  # if new_size > size, new elements are filled with zeros, let's fix
315
334
  # that
316
- if dlen > 0:
317
- self.cumulative_length[old_s:] = csum
335
+ if new_size > old_s:
336
+ self.cumulative_length[old_s:] = self.cumulative_length[old_s - 1]
318
337
 
319
338
  # then resize the data array
320
339
  # if dlen > 0 this has no effect
321
340
  if len(self.cumulative_length) > 0:
322
- self.flattened_data.resize(self.cumulative_length[-1])
341
+ self.flattened_data.resize(self.cumulative_length[-1], trim)
342
+ else:
343
+ self.flattened_data.resize(0, trim)
323
344
 
324
345
  def append(self, new: NDArray) -> None:
325
346
  """Append a 1D vector `new` at the end.
@@ -334,20 +355,7 @@ class VectorOfVectors(LGDO):
334
355
  [8 9],
335
356
  ]
336
357
  """
337
- if self.ndim == 2:
338
- # first extend cumulative_length by +1
339
- self.cumulative_length.resize(len(self) + 1)
340
- # set it at the right value
341
- newlen = (
342
- self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
343
- )
344
- self.cumulative_length[-1] = newlen
345
- # then resize flattened_data to accommodate the new vector
346
- self.flattened_data.resize(len(self.flattened_data) + len(new))
347
- # finally set it
348
- self[-1] = new
349
- else:
350
- raise NotImplementedError
358
+ self.insert(len(self), new)
351
359
 
352
360
  def insert(self, i: int, new: NDArray) -> None:
353
361
  """Insert a vector at index `i`.
@@ -364,23 +372,15 @@ class VectorOfVectors(LGDO):
364
372
  [8 9],
365
373
  [4 5],
366
374
  ]
367
-
368
- Warning
369
- -------
370
- This method involves a significant amount of memory re-allocation and
371
- is expected to perform poorly on large vectors.
372
375
  """
373
376
  if self.ndim == 2:
374
- if i >= len(self):
375
- msg = f"index {i} is out of bounds for vector owith size {len(self)}"
377
+ if i > len(self):
378
+ msg = f"index {i} is out of bounds for vector with size {len(self)}"
376
379
  raise IndexError(msg)
377
380
 
378
- self.flattened_data = Array(
379
- np.insert(self.flattened_data, self.cumulative_length[i - 1], new)
380
- )
381
- self.cumulative_length = Array(
382
- np.insert(self.cumulative_length, i, self.cumulative_length[i - 1])
383
- )
381
+ i_start = 0 if i == 0 else self.cumulative_length[i - 1]
382
+ self.flattened_data.insert(i_start, new)
383
+ self.cumulative_length.insert(i, i_start)
384
384
  self.cumulative_length[i:] += np.uint32(len(new))
385
385
  else:
386
386
  raise NotImplementedError
@@ -400,11 +400,6 @@ class VectorOfVectors(LGDO):
400
400
  [[8 9],
401
401
  [4 5],
402
402
  ]
403
-
404
- Warning
405
- -------
406
- This method involves a significant amount of memory re-allocation and
407
- is expected to perform poorly on large vectors.
408
403
  """
409
404
  if self.ndim == 2:
410
405
  if i >= len(self):
@@ -414,27 +409,17 @@ class VectorOfVectors(LGDO):
414
409
  vidx = self.cumulative_length
415
410
  dlen = len(new) - len(self[i])
416
411
 
417
- if dlen == 0:
418
- # don't waste resources
419
- self[i] = new
420
- elif dlen < 0:
421
- start = vidx[i - 1]
422
- stop = start + len(new)
423
- # set the already allocated indices
424
- self.flattened_data[start:stop] = new
425
- # then delete the extra indices
426
- self.flattened_data = Array(
427
- np.delete(self.flattened_data, np.s_[stop : vidx[i]])
428
- )
429
- else:
430
- # set the already allocated indices
431
- self.flattened_data[vidx[i - 1] : vidx[i]] = new[: len(self[i])]
432
- # then insert the remaining
433
- self.flattened_data = Array(
434
- np.insert(self.flattened_data, vidx[i], new[len(self[i]) :])
435
- )
436
-
437
- vidx[i:] = vidx[i:] + dlen
412
+ if dlen != 0:
413
+ # move the subsequent entries
414
+ vidx[i:] += dlen
415
+ self.flattened_data.resize(vidx[-1])
416
+ self.flattened_data._nda[vidx[i] : vidx[-1]] = self.flattened_data._nda[
417
+ vidx[i] - dlen : vidx[-1] - dlen
418
+ ]
419
+
420
+ # set the already allocated indices
421
+ start = vidx[i - 1] if i > 0 else 0
422
+ self.flattened_data[start : vidx[i]] = new
438
423
  else:
439
424
  raise NotImplementedError
440
425
 
@@ -484,7 +469,15 @@ class VectorOfVectors(LGDO):
484
469
  cum_lens = np.add(start, lens.cumsum(), dtype=int)
485
470
 
486
471
  # fill with fast vectorized routine
487
- vovutils._nb_fill(vec, lens, self.flattened_data.nda[start : cum_lens[-1]])
472
+ if np.issubdtype(self.flattened_data.dtype, np.unsignedinteger):
473
+ nan_val = np.iinfo(self.flattened_data.dtype).max
474
+ if np.issubdtype(self.flattened_data.dtype, np.integer):
475
+ nan_val = np.iinfo(self.flattened_data.dtype).min
476
+ else:
477
+ nan_val = np.nan
478
+ vovutils._nb_fill(
479
+ vec, lens, nan_val, self.flattened_data.nda[start : cum_lens[-1]]
480
+ )
488
481
 
489
482
  # add new vector(s) length to cumulative_length
490
483
  self.cumulative_length[i : i + len(lens)] = cum_lens
lgdo/types/vovutils.py CHANGED
@@ -81,7 +81,7 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND
81
81
 
82
82
  @numba.guvectorize(
83
83
  [
84
- f"{data_type}[:,:],{size_type}[:],{data_type}[:]"
84
+ f"{data_type}[:,:],{size_type}[:],{data_type},{data_type}[:]"
85
85
  for data_type in [
86
86
  "b1",
87
87
  "i1",
@@ -99,10 +99,12 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND
99
99
  ]
100
100
  for size_type in ["i4", "i8", "u4", "u8"]
101
101
  ],
102
- "(l,m),(l),(n)",
102
+ "(l,m),(l),(),(n)",
103
103
  **nb_kwargs,
104
104
  )
105
- def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
105
+ def _nb_fill(
106
+ aoa_in: NDArray, len_in: NDArray, nan_val: int | float, flattened_array_out: NDArray
107
+ ):
106
108
  """Vectorized function to fill flattened array from array of arrays and
107
109
  lengths. Values in aoa_in past lengths will not be copied.
108
110
 
@@ -112,6 +114,9 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
112
114
  array of arrays containing values to be copied
113
115
  len_in
114
116
  array of vector lengths for each row of aoa_in
117
+ nan_val
118
+ value to use when an entry of len_in exceeds the row length of aoa_in. Should use
119
+ np.nan for floating point, and the max/min representable value for integer types
115
120
  flattened_array_out
116
121
  flattened array to copy values into. Must be longer than sum of
117
122
  lengths in len_in
@@ -122,9 +127,14 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
122
127
  raise ValueError(msg)
123
128
 
124
129
  start = 0
130
+ max_len = aoa_in.shape[1]
125
131
  for i, ll in enumerate(len_in):
126
132
  stop = start + ll
127
- flattened_array_out[start:stop] = aoa_in[i, :ll]
133
+ if ll > max_len:
134
+ flattened_array_out[start : start + max_len] = aoa_in[i, :]
135
+ flattened_array_out[start + max_len : stop] = nan_val
136
+ else:
137
+ flattened_array_out[start:stop] = aoa_in[i, :ll]
128
138
  start = stop
129
139
 
130
140