legend-pydataobj 1.11.7__py3-none-any.whl → 1.12.0a1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
legend_pydataobj-1.11.7.dist-info/METADATA → legend_pydataobj-1.12.0a1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: legend_pydataobj
-Version: 1.11.7
+Version: 1.12.0a1
 Summary: LEGEND Python Data Objects
 Author: The LEGEND Collaboration
 Maintainer: The LEGEND Collaboration
legend_pydataobj-1.11.7.dist-info/RECORD → legend_pydataobj-1.12.0a1.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
-legend_pydataobj-1.11.7.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+legend_pydataobj-1.12.0a1.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
 lgdo/__init__.py,sha256=QMYK9HhoMi0pbahPN8mPD18gyTxscFgo7QKfCxVhy-0,3196
-lgdo/_version.py,sha256=WYo6AtimYOvXEEB_DEJYUqS-yeVHGFoR5t7JM_9dSwo,513
+lgdo/_version.py,sha256=kTYHwRhTzZEJHpwJeVgXBi4yFTeQDpnR6MYkvCMA06Q,515
 lgdo/cli.py,sha256=s_EWTBWW76l7zWb6gaTSTjiT-0RzzcYEmjeFEQCVxfk,4647
 lgdo/lgdo_utils.py,sha256=6a2YWEwpyEMXlAyTHZMO01aqxy6SxJzPZkGNWKNWuS0,2567
 lgdo/lh5_store.py,sha256=5BzbJA9sLcqjp8bJDc2olwOiw0VS6rmfg3cfh1kQkRY,8512
@@ -14,18 +14,18 @@ lgdo/compression/radware.py,sha256=GcNTtjuyL7VBBqziUBmSqNXuhqy1bJJgvcyvyumPtrc,2
 lgdo/compression/utils.py,sha256=W2RkBrxPpXlat84dnU9Ad7d_tTws0irtGl7O1dNWjnk,1140
 lgdo/compression/varlen.py,sha256=6ZZUItyoOfygDdE0DyoISeFZfqdbH6xl7T0eclfarzg,15127
 lgdo/lh5/__init__.py,sha256=y1XE_mpFWwamrl7WVjAVSVB25X4PrEfdVXSneSQEmlQ,825
-lgdo/lh5/concat.py,sha256=5nO7dNSb0UEP9rZiWGTKH5Cfwsm5LSm3tBJM4Kd70u0,6336
-lgdo/lh5/core.py,sha256=__-A6Abctzfwfo4-xJi68xs2e4vfzONEQTJVrUCOw-I,13922
+lgdo/lh5/concat.py,sha256=BZCgK7TWPKK8fMmha8K83d3bC31FVO1b5LOW7x-Ru1s,6186
+lgdo/lh5/core.py,sha256=GjosZGUp4GSO5FtWV9eXUt_6DGU_OwJXODlj5K1j93M,13320
 lgdo/lh5/datatype.py,sha256=O_7BqOlX8PFMyG0ppkfUT5aps5HEqX0bpuKcJO3jhu0,1691
 lgdo/lh5/exceptions.py,sha256=3kj8avXl4eBGvebl3LG12gJEmw91W0T8PYR0AfvUAyM,1211
-lgdo/lh5/iterator.py,sha256=ZaBBnmuNIjinwO0JUY55wLxX8Om9rVRRzXBC5uHmSKM,19772
-lgdo/lh5/store.py,sha256=3wAaQDd1Zmo0_bQ9DbB-FbKS4Uy_Tb642qKHXtZpSw4,10643
+lgdo/lh5/iterator.py,sha256=1ob9B7Bf3ioGCtZkUZoL6ibTxAwLf4ld8_33ghVVEa4,20498
+lgdo/lh5/store.py,sha256=MYbMt-Mc7izELxuyLlSrrYrylCIzxc2CLzZYIVbZ33w,8455
 lgdo/lh5/tools.py,sha256=T9CgHA8A3_tVBMtiNJ6hATQKhdqI61m3cX4p2wGKc6c,9937
 lgdo/lh5/utils.py,sha256=ioz8DlyXZsejwnU2qYdIccdHcF12H62jgLkZsiDOLSM,6243
 lgdo/lh5/_serializers/__init__.py,sha256=eZzxMp1SeZWG0PkEXUiCz3XyprQ8EmelHUmJogC8xYE,1263
 lgdo/lh5/_serializers/read/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lgdo/lh5/_serializers/read/array.py,sha256=uWfMCihfAmW2DE2ewip2qCK_kvQC_mb2zvOv26uzijc,1000
-lgdo/lh5/_serializers/read/composite.py,sha256=yTm5dfTgkIL7eG9iZXxhdiRhG04cQLd_hybP4wmxCJE,11809
+lgdo/lh5/_serializers/read/composite.py,sha256=UvkZHEhf0V7SFLxzF52eyP68hU0guGOLqosrfmIfeys,11729
 lgdo/lh5/_serializers/read/encoded.py,sha256=Q98c08d8LkZq2AlY4rThYECVaEqwbv4T2Urn7TGnsyE,4130
 lgdo/lh5/_serializers/read/ndarray.py,sha256=lFCXD6bSzmMOH7cVmvRYXakkfMCI8EoqTPNONRJ1F0s,3690
 lgdo/lh5/_serializers/read/scalar.py,sha256=kwhWm1T91pXf86CqtUUD8_qheSR92gXZrQVtssV5YCg,922
@@ -37,20 +37,20 @@ lgdo/lh5/_serializers/write/composite.py,sha256=I6lH0nWFIpAfZyG4-0rLxzg3mfazZ_FE
 lgdo/lh5/_serializers/write/scalar.py,sha256=JPt_fcdTKOSFp5hfJdcKIfK4hxhcD8vhOlvDF-7btQ8,763
 lgdo/lh5/_serializers/write/vector_of_vectors.py,sha256=puGQX9XF5P_5DVbm_Cc6TvPrsDywgBLSYtkqFNltbB4,3493
 lgdo/types/__init__.py,sha256=DNfOErPiAZg-7Gygkp6ZKAi20Yrm1mfderZHvKo1Y4s,821
-lgdo/types/array.py,sha256=sUxh1CNCaefrnybt5qdjmmMpVQa_RqFxUv1tJ_pyBbc,6537
+lgdo/types/array.py,sha256=e3p93yrfzSmyBgWdGqqtETcKpM7_FxENaAErru15rvo,8904
 lgdo/types/arrayofequalsizedarrays.py,sha256=DOGJiTmc1QCdm7vLbE6uIRXoMPtt8uuCfmwQawgWf5s,4949
-lgdo/types/encoded.py,sha256=JW4U5ow7KLMzhKnmhdnxbC3SZJAs4bOEDZWKG4KY1uU,15293
+lgdo/types/encoded.py,sha256=_e8u_BPfpjJbLnEdyTo9QG3kbNsGj0BN4gjdj3L1ndw,15640
 lgdo/types/fixedsizearray.py,sha256=7RjUwTz1bW0pcrdy27JlfrXPAuOU89Kj7pOuSUCojK8,1527
-lgdo/types/histogram.py,sha256=y6j2VDuGYYnLy7WI4J90ApS0PAwic4kCpouZPX09Nus,19974
-lgdo/types/lgdo.py,sha256=RQ2P70N7IWMBDnLLuJI3sm6zQTIKyOMSsKZtBNzmE90,2928
+lgdo/types/histogram.py,sha256=Jz1lLH56BfYnmcUhxUHK1h2wLDQ0Abgyd-6LznU-3-k,19979
+lgdo/types/lgdo.py,sha256=21YNtJCHnSO3M60rjsAdbMO5crDjL_0BtuFpudZ2xvU,4500
 lgdo/types/scalar.py,sha256=c5Es2vyDqyWTPV6mujzfIzMpC1jNWkEIcvYyWQUxH3Q,1933
 lgdo/types/struct.py,sha256=Q0OWLVd4B0ciLb8t6VsxU3MPbmGLZ7WfQNno1lSQS0Q,4918
-lgdo/types/table.py,sha256=VIHQOPXJHJgiCjMMb_p7EdbcCqLFSObHMdHSxC1Dm5Y,19212
-lgdo/types/vectorofvectors.py,sha256=K8w7CZou857I9YGkeOe2uYB20gbHl4OV9xhnnJPNOjc,24665
-lgdo/types/vovutils.py,sha256=7BWPP0BSj-92ifbCIUBcfqxG5-TS8uxujTyJJuDFI04,10302
+lgdo/types/table.py,sha256=FkWesoEA9bmGGSW8Ewig1Zs77ffUoR_nggfYSmkWpjU,20079
+lgdo/types/vectorofvectors.py,sha256=CtPR2WDBmJmzzfXwH4aUcNMB5LvTiGWmL_qRbFah3to,24756
+lgdo/types/vovutils.py,sha256=WjvPLEJrRNjktnbyfypfgxZX-K_aOvcwPygfzoknsyA,10701
 lgdo/types/waveformtable.py,sha256=f2tS4f1OEoYaTM5ldCX9zmw8iSISCT3t3wS1SrPdu_o,9901
-legend_pydataobj-1.11.7.dist-info/METADATA,sha256=Z0-UFMzWILag78U1HkNpbYwKDb_JZkZ8kZLtW4T8gw0,44443
-legend_pydataobj-1.11.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-legend_pydataobj-1.11.7.dist-info/entry_points.txt,sha256=0KWfnwbuwhNn0vPUqARukjp04Ca6lzfZBSirouRmk7I,76
-legend_pydataobj-1.11.7.dist-info/top_level.txt,sha256=KyR-EUloqiXcQ62IWnzBmtInDtvsHl4q2ZJAZgTcLXE,5
-legend_pydataobj-1.11.7.dist-info/RECORD,,
+legend_pydataobj-1.12.0a1.dist-info/METADATA,sha256=55pMph32j8h4LKGnoVEdvHX27bHr8k__sdT4L9O5dIA,44445
+legend_pydataobj-1.12.0a1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+legend_pydataobj-1.12.0a1.dist-info/entry_points.txt,sha256=0KWfnwbuwhNn0vPUqARukjp04Ca6lzfZBSirouRmk7I,76
+legend_pydataobj-1.12.0a1.dist-info/top_level.txt,sha256=KyR-EUloqiXcQ62IWnzBmtInDtvsHl4q2ZJAZgTcLXE,5
+legend_pydataobj-1.12.0a1.dist-info/RECORD,,
lgdo/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE

-__version__ = version = '1.11.7'
-__version_tuple__ = version_tuple = (1, 11, 7)
+__version__ = version = '1.12.0a1'
+__version_tuple__ = version_tuple = (1, 12, 0)
lgdo/lh5/_serializers/read/composite.py CHANGED
@@ -353,15 +353,13 @@ def _h5_read_table(
        table = Table(col_dict=col_dict, attrs=attrs)

        # set (write) loc to end of tree
-        table.loc = n_rows_read
+        table.resize(do_warn=True)

        return table, n_rows_read

    # We have read all fields into the object buffer. Run
    # checks: All columns should be the same size. So update
    # table's size as necessary, warn if any mismatches are found
    obj_buf.resize(do_warn=True)
-    # set (write) loc to end of tree
-    obj_buf.loc = obj_buf_start + n_rows_read

    # check attributes
    utils.check_obj_buf_attrs(obj_buf.attrs, attrs, fname, oname)
lgdo/lh5/concat.py CHANGED
@@ -76,7 +76,7 @@ def _get_lgdos(file, obj_list):
            continue

        # read as little as possible
-        obj, _ = store.read(current, h5f0, n_rows=1)
+        obj = store.read(current, h5f0, n_rows=1)
        if isinstance(obj, (Table, Array, VectorOfVectors)):
            lgdos.append(current)

@@ -139,12 +139,6 @@ def _remove_nested_fields(lgdos: dict, obj_list: list):
            _inplace_table_filter(key, val, obj_list)


-def _slice(obj, n_rows):
-    ak_obj = obj.view_as("ak")[:n_rows]
-    obj_type = type(obj)
-    return obj_type(ak_obj)
-
-
 def lh5concat(
    lh5_files: list,
    output: str,
@@ -186,8 +180,8 @@ def lh5concat(
    # loop over lgdo objects
    for lgdo in lgdos:
        # iterate over the files
-        for lh5_obj, _, n_rows in LH5Iterator(lh5_files, lgdo):
-            data = {lgdo: _slice(lh5_obj, n_rows)}
+        for lh5_obj in LH5Iterator(lh5_files, lgdo):
+            data = {lgdo: lh5_obj}

            # remove the nested fields
            _remove_nested_fields(data, obj_list)
lgdo/lh5/core.py CHANGED
@@ -4,6 +4,7 @@ import bisect
 import inspect
 import sys
 from collections.abc import Mapping, Sequence
+from contextlib import suppress
 from typing import Any

 import h5py
@@ -92,8 +93,7 @@ def read(
        will be set to ``True``, while the rest will default to ``False``.
    obj_buf
        Read directly into memory provided in `obj_buf`. Note: the buffer
-        will be expanded to accommodate the data requested. To maintain the
-        buffer length, send in ``n_rows = len(obj_buf)``.
+        will be resized to accommodate the data retrieved.
    obj_buf_start
        Start location in ``obj_buf`` for read. For concatenating data to
        array-like objects.
@@ -106,12 +106,8 @@ def read(

    Returns
    -------
-    (object, n_rows_read)
-        `object` is the read-out object, `n_rows_read` is the number of rows
-        successfully read out. Essential for arrays when the amount of data
-        is smaller than the object buffer. For scalars and structs
-        `n_rows_read` will be ``1``. For tables it is redundant with
-        ``table.loc``. If `obj_buf` is ``None``, only `object` is returned.
+    object
+        the read-out object
    """
    if isinstance(lh5_file, h5py.File):
        lh5_obj = lh5_file[name]
@@ -119,12 +115,12 @@ def read(
        lh5_file = h5py.File(lh5_file, mode="r", locking=locking)
        lh5_obj = lh5_file[name]
    else:
-        lh5_files = list(lh5_file)
-
-        n_rows_read = 0
-        obj_buf_is_new = False
+        if obj_buf is not None:
+            obj_buf.resize(obj_buf_start)
+        else:
+            obj_buf_start = 0

-        for i, h5f in enumerate(lh5_files):
+        for i, h5f in enumerate(lh5_file):
            if (
                isinstance(idx, (list, tuple))
                and len(idx) > 0
@@ -146,33 +142,26 @@ def read(
                idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
            else:
                idx_i = None
-                n_rows_i = n_rows - n_rows_read

-            obj_ret = read(
+            obj_buf_start_i = len(obj_buf) if obj_buf else 0
+            n_rows_i = n_rows - (obj_buf_start_i - obj_buf_start)
+
+            obj_buf = read(
                name,
                h5f,
-                start_row,
+                start_row if i == 0 else 0,
                n_rows_i,
                idx_i,
                use_h5idx,
                field_mask,
                obj_buf,
-                obj_buf_start,
+                obj_buf_start_i,
                decompress,
            )
-            if isinstance(obj_ret, tuple):
-                obj_buf, n_rows_read_i = obj_ret
-                obj_buf_is_new = True
-            else:
-                obj_buf = obj_ret
-                n_rows_read_i = len(obj_buf)

-            n_rows_read += n_rows_read_i
-            if n_rows_read >= n_rows or obj_buf is None:
-                return obj_buf, n_rows_read
-            start_row = 0
-            obj_buf_start += n_rows_read_i
-        return obj_buf if obj_buf_is_new else (obj_buf, n_rows_read)
+            if obj_buf is None or (len(obj_buf) - obj_buf_start) >= n_rows:
+                return obj_buf
+        return obj_buf

    if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
        idx = idx[0]
@@ -192,8 +181,10 @@ def read(
        obj_buf_start=obj_buf_start,
        decompress=decompress,
    )
+    with suppress(AttributeError):
+        obj.resize(obj_buf_start + n_rows_read)

-    return obj if obj_buf is None else (obj, n_rows_read)
+    return obj


 def write(
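
The upshot for callers: `read` now always returns just the LGDO, never an `(object, n_rows_read)` tuple, and a supplied `obj_buf` is resized so that the row count is simply its length. A minimal sketch of the migration (file and group names are hypothetical):

    from lgdo import lh5

    # 1.11.x: obj, n_rows_read = lh5.read("geds/raw", "data.lh5")
    # 1.12.0a1: only the object is returned; its length is the row count
    obj = lh5.read("geds/raw", "data.lh5")
    n_rows_read = len(obj)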
lgdo/lh5/iterator.py CHANGED
@@ -24,7 +24,8 @@ class LH5Iterator(typing.Iterator):

    This can be used as an iterator:

-    >>> for lh5_obj, i_entry, n_rows in LH5Iterator(...):
+
+    >>> for lh5_obj in LH5Iterator(...):
    >>>     # do the thing!

    This is intended for if you are reading a large quantity of data. This
@@ -42,6 +43,8 @@ class LH5Iterator(typing.Iterator):
    In addition to accessing requested data via ``lh5_obj``, several
    properties exist to tell you where that data came from:

+    - lh5_it.current_i_entry: get the index within the entry list of the
+      first entry that is currently read
    - lh5_it.current_local_entries: get the entry numbers relative to the
      file the data came from
    - lh5_it.current_global_entries: get the entry number relative to the
@@ -49,9 +52,9 @@
    - lh5_it.current_files: get the file name corresponding to each entry
    - lh5_it.current_groups: get the group name corresponding to each entry

-    This class can also be used either for random access:
+    This class can also be used for random access:

-    >>> lh5_obj, n_rows = lh5_it.read(i_entry)
+    >>> lh5_obj = lh5_it.read(i_entry)

    to read the block of entries starting at i_entry. In case of multiple files
    or the use of an event selection, i_entry refers to a global event index
@@ -65,6 +68,8 @@
        base_path: str = "",
        entry_list: list[int] | list[list[int]] | None = None,
        entry_mask: list[bool] | list[list[bool]] | None = None,
+        i_start: int = 0,
+        n_entries: int | None = None,
        field_mask: dict[str, bool] | list[str] | tuple[str] | None = None,
        buffer_len: int = "100*MB",
        file_cache: int = 10,
@@ -89,6 +94,10 @@
        entry_mask
            mask of entries to read. If a list of arrays is provided, expect
            one for each file. Ignore if a selection list is provided.
+        i_start
+            index of first entry to start at when iterating
+        n_entries
+            number of entries to read before terminating iteration
        field_mask
            mask of which fields to read. See :meth:`LH5Store.read` for
            more details.
@@ -183,7 +192,8 @@
            msg = f"can't open any files from {lh5_files}"
            raise RuntimeError(msg)

-        self.n_rows = 0
+        self.i_start = i_start
+        self.n_entries = n_entries
        self.current_i_entry = 0
        self.next_i_entry = 0
@@ -317,14 +327,21 @@
            )
        return self.global_entry_list

-    def read(self, i_entry: int) -> tuple[LGDO, int]:
-        """Read the next local chunk of events, starting at i_entry. Return the
-        LH5 buffer and number of rows read."""
-        self.n_rows = 0
-        i_file = np.searchsorted(self.entry_map, i_entry, "right")
+    def read(self, i_entry: int, n_entries: int | None = None) -> LGDO:
+        "Read the next local chunk of events, starting at i_entry."
+        self.lh5_buffer.resize(0)
+
+        if n_entries is None:
+            n_entries = self.buffer_len
+        elif n_entries == 0:
+            return self.lh5_buffer
+        elif n_entries > self.buffer_len:
+            msg = "n_entries cannot be larger than buffer_len"
+            raise ValueError(msg)

        # if file hasn't been opened yet, search through files
        # sequentially until we find the right one
+        i_file = np.searchsorted(self.entry_map, i_entry, "right")
        if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("q").max:
            while i_file < len(self.lh5_files) and i_entry >= self._get_file_cumentries(
                i_file
@@ -332,10 +349,10 @@
                i_file += 1

        if i_file == len(self.lh5_files):
-            return (self.lh5_buffer, self.n_rows)
+            return self.lh5_buffer
        local_i_entry = i_entry - self._get_file_cumentries(i_file - 1)

-        while self.n_rows < self.buffer_len and i_file < len(self.file_map):
+        while len(self.lh5_buffer) < n_entries and i_file < len(self.file_map):
            # Loop through files
            local_idx = self.get_file_entrylist(i_file)
            if local_idx is not None and len(local_idx) == 0:
@@ -344,18 +361,17 @@
                continue

            i_local = local_i_entry if local_idx is None else local_idx[local_i_entry]
-            self.lh5_buffer, n_rows = self.lh5_st.read(
+            self.lh5_buffer = self.lh5_st.read(
                self.groups[i_file],
                self.lh5_files[i_file],
                start_row=i_local,
-                n_rows=self.buffer_len - self.n_rows,
+                n_rows=n_entries - len(self.lh5_buffer),
                idx=local_idx,
                field_mask=self.field_mask,
                obj_buf=self.lh5_buffer,
-                obj_buf_start=self.n_rows,
+                obj_buf_start=len(self.lh5_buffer),
            )

-            self.n_rows += n_rows
            i_file += 1
            local_i_entry = 0
@@ -364,7 +380,7 @@
        if self.friend is not None:
            self.friend.read(i_entry)

-        return (self.lh5_buffer, self.n_rows)
+        return self.lh5_buffer

    def reset_field_mask(self, mask):
        """Replaces the field mask of this iterator and any friends with mask"""
@@ -375,7 +391,7 @@
    @property
    def current_local_entries(self) -> NDArray[int]:
        """Return list of local file entries in buffer"""
-        cur_entries = np.zeros(self.n_rows, dtype="int32")
+        cur_entries = np.zeros(len(self.lh5_buffer), dtype="int32")
        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
        file_start = self._get_file_cumentries(i_file - 1)
        i_local = self.current_i_entry - file_start
@@ -402,7 +418,7 @@
    @property
    def current_global_entries(self) -> NDArray[int]:
        """Return list of local file entries in buffer"""
-        cur_entries = np.zeros(self.n_rows, dtype="int32")
+        cur_entries = np.zeros(len(self.lh5_buffer), dtype="int32")
        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
        file_start = self._get_file_cumentries(i_file - 1)
        i_local = self.current_i_entry - file_start
@@ -433,7 +449,7 @@
    @property
    def current_files(self) -> NDArray[str]:
        """Return list of file names for entries in buffer"""
-        cur_files = np.zeros(self.n_rows, dtype=object)
+        cur_files = np.zeros(len(self.lh5_buffer), dtype=object)
        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
        file_start = self._get_file_cumentries(i_file - 1)
        i_local = self.current_i_entry - file_start
@@ -455,7 +471,7 @@
    @property
    def current_groups(self) -> NDArray[str]:
        """Return list of group names for entries in buffer"""
-        cur_groups = np.zeros(self.n_rows, dtype=object)
+        cur_groups = np.zeros(len(self.lh5_buffer), dtype=object)
        i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
        file_start = self._get_file_cumentries(i_file - 1)
        i_local = self.current_i_entry - file_start
@@ -485,14 +501,19 @@
    def __iter__(self) -> typing.Iterator:
        """Loop through entries in blocks of size buffer_len."""
        self.current_i_entry = 0
-        self.next_i_entry = 0
+        self.next_i_entry = self.i_start
        return self

    def __next__(self) -> tuple[LGDO, int, int]:
-        """Read next buffer_len entries and return lh5_table, iterator entry
-        and n_rows read."""
-        buf, n_rows = self.read(self.next_i_entry)
-        self.next_i_entry = self.current_i_entry + n_rows
-        if n_rows == 0:
+        """Read next buffer_len entries and return lh5_table."""
+        n_entries = self.n_entries
+        if n_entries is not None:
+            n_entries = min(
+                self.buffer_len, n_entries + self.i_start - self.next_i_entry
+            )
+
+        buf = self.read(self.next_i_entry, n_entries)
+        if len(buf) == 0:
            raise StopIteration
-        return (buf, self.current_i_entry, n_rows)
+        self.next_i_entry = self.current_i_entry + len(buf)
+        return buf
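
Iteration follows the same convention: the loop variable is now just the buffer, with the entry index and row count available as `current_i_entry` and `len(buf)`, and the new `i_start`/`n_entries` arguments bound the iteration window. A sketch (file and group names are hypothetical):

    from lgdo.lh5 import LH5Iterator

    it = LH5Iterator(
        ["run0.lh5", "run1.lh5"], "geds/raw", i_start=1000, n_entries=5000
    )
    for lh5_obj in it:
        n_rows = len(lh5_obj)         # was the third element of the old tuple
        i_entry = it.current_i_entry  # was the second element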
lgdo/lh5/store.py CHANGED
@@ -5,7 +5,6 @@ HDF5 files.

 from __future__ import annotations

-import bisect
 import logging
 import os
 import sys
@@ -15,11 +14,11 @@ from inspect import signature
 from typing import Any

 import h5py
-import numpy as np
 from numpy.typing import ArrayLike

 from .. import types
 from . import _serializers, utils
+from .core import read

 log = logging.getLogger(__name__)

@@ -155,7 +154,7 @@ class LH5Store:
        """Returns an LH5 object appropriate for use as a pre-allocated buffer
        in a read loop. Sets size to `size` if object has a size.
        """
-        obj, n_rows = self.read(name, lh5_file, n_rows=0, field_mask=field_mask)
+        obj = self.read(name, lh5_file, n_rows=0, field_mask=field_mask)
        if hasattr(obj, "resize") and size is not None:
            obj.resize(new_size=size)
        return obj
@@ -182,72 +181,20 @@ class LH5Store:
        """
        # grab files from store
        if isinstance(lh5_file, (str, h5py.File)):
-            lh5_obj = self.gimme_file(lh5_file, "r", **file_kwargs)[name]
+            h5f = self.gimme_file(lh5_file, "r", **file_kwargs)
        else:
-            lh5_files = list(lh5_file)
-            n_rows_read = 0
-
-            for i, h5f in enumerate(lh5_files):
-                if (
-                    isinstance(idx, (list, tuple))
-                    and len(idx) > 0
-                    and not np.isscalar(idx[0])
-                ):
-                    # a list of lists: must be one per file
-                    idx_i = idx[i]
-                elif idx is not None:
-                    # make idx a proper tuple if it's not one already
-                    if not (isinstance(idx, tuple) and len(idx) == 1):
-                        idx = (idx,)
-                    # idx is a long continuous array
-                    n_rows_i = utils.read_n_rows(name, h5f)
-                    # find the length of the subset of idx that contains indices
-                    # that are less than n_rows_i
-                    n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
-                    # now split idx into idx_i and the remainder
-                    idx_i = np.array(idx[0])[:n_rows_to_read_i]
-                    idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
-                else:
-                    idx_i = None
-                    n_rows_i = n_rows - n_rows_read
-
-                obj_buf, n_rows_read_i = self.read(
-                    name,
-                    h5f,
-                    start_row,
-                    n_rows_i,
-                    idx_i,
-                    use_h5idx,
-                    field_mask,
-                    obj_buf,
-                    obj_buf_start,
-                    decompress,
-                )
-
-                n_rows_read += n_rows_read_i
-                if n_rows_read >= n_rows or obj_buf is None:
-                    return obj_buf, n_rows_read
-                start_row = 0
-                obj_buf_start += n_rows_read_i
-            return obj_buf, n_rows_read
-
-        if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
-            idx = idx[0]
-        if isinstance(idx, np.ndarray) and idx.dtype == np.dtype("?"):
-            idx = np.where(idx)[0]
-
-        return _serializers._h5_read_lgdo(
-            lh5_obj.id,
-            lh5_obj.file.filename,
-            lh5_obj.name,
-            start_row=start_row,
-            n_rows=n_rows,
-            idx=idx,
-            use_h5idx=use_h5idx,
-            field_mask=field_mask,
-            obj_buf=obj_buf,
-            obj_buf_start=obj_buf_start,
-            decompress=decompress,
+            h5f = [self.gimme_file(f, "r", **file_kwargs) for f in lh5_file]
+        return read(
+            name,
+            h5f,
+            start_row,
+            n_rows,
+            idx,
+            use_h5idx,
+            field_mask,
+            obj_buf,
+            obj_buf_start,
+            decompress,
        )

    def write(
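
`LH5Store.read` thus becomes a thin wrapper: it resolves (and caches) file handles via `gimme_file` and hands everything, including multi-file lists, to `core.read`. Sketch (file names hypothetical):

    from lgdo.lh5 import LH5Store

    store = LH5Store()
    obj = store.read("geds/raw", ["run0.lh5", "run1.lh5"], n_rows=1000)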
lgdo/types/array.py CHANGED
@@ -17,12 +17,12 @@ import pint_pandas  # noqa: F401

 from .. import utils
 from ..units import default_units_registry as u
-from .lgdo import LGDO
+from .lgdo import LGDOCollection

 log = logging.getLogger(__name__)


-class Array(LGDO):
+class Array(LGDOCollection):
    r"""Holds an :class:`numpy.ndarray` and attributes.

    :class:`Array` (and the other various array types) holds an `nda` instead
@@ -78,11 +78,7 @@ class Array(LGDO):
        elif isinstance(nda, Array):
            nda = nda.nda

-        elif not isinstance(nda, np.ndarray):
-            nda = np.array(nda)
-
        self.nda = nda
-        self.dtype = self.nda.dtype

        super().__init__(attrs)

@@ -96,18 +92,83 @@ class Array(LGDO):
        return dt + "<" + nd + ">{" + et + "}"

    def __len__(self) -> int:
-        return len(self.nda)
+        return self._size
+
+    @property
+    def nda(self):
+        return self._nda[: self._size, ...] if self._nda.shape != () else self._nda
+
+    @nda.setter
+    def nda(self, value):
+        self._nda = value if isinstance(value, np.ndarray) else np.array(value)
+        self._size = len(self._nda) if self._nda.shape != () else 0
+
+    @property
+    def dtype(self):
+        return self._nda.dtype
+
+    @property
+    def shape(self):
+        return (len(self),) + self._nda.shape[1:]
+
+    def reserve_capacity(self, capacity: int) -> None:
+        "Set size (number of rows) of internal memory buffer"
+        if capacity < len(self):
+            msg = "Cannot reduce capacity below Array length"
+            raise ValueError(msg)
+        self._nda.resize((capacity,) + self._nda.shape[1:], refcheck=False)
+
+    def get_capacity(self) -> int:
+        "Get capacity (i.e. max size before memory must be re-allocated)"
+        return len(self._nda)
+
+    def trim_capacity(self) -> None:
+        "Set capacity to be minimum needed to support Array size"
+        self.reserve_capacity(np.prod(self.shape))
+
+    def resize(self, new_size: int, trim=False) -> None:
+        """Set size of Array in rows. Only change capacity if it must be
+        increased to accommodate new rows; in this case double capacity.
+        If trim is True, capacity will be set to match size."""
+
+        self._size = new_size
+
+        if trim and new_size != self.get_capacity():
+            self.reserve_capacity(new_size)

-    def resize(self, new_size: int) -> None:
-        new_shape = (new_size,) + self.nda.shape[1:]
-        return self.nda.resize(new_shape, refcheck=True)
+        # If capacity is not big enough, set to next power of 2 big enough
+        if new_size > self.get_capacity():
+            self.reserve_capacity(int(2 ** (np.ceil(np.log2(new_size)))))

    def append(self, value: np.ndarray) -> None:
-        self.resize(len(self) + 1)
-        self.nda[-1] = value
+        "Append value to end of array (with copy)"
+        self.insert(len(self), value)

    def insert(self, i: int, value: int | float) -> None:
-        self.nda = np.insert(self.nda, i, value)
+        "Insert value into row i (with copy)"
+        if i > len(self):
+            msg = f"index {i} is out of bounds for array with size {len(self)}"
+            raise IndexError(msg)
+
+        value = np.array(value)
+        if value.shape == self.shape[1:]:
+            self.resize(len(self) + 1)
+            self[i + 1 :] = self[i:-1]
+            self[i] = value
+        elif value.shape[1:] == self.shape[1:]:
+            self.resize(len(self) + len(value))
+            self[i + len(value) :] = self[i : -len(value)]
+            self[i : i + len(value)] = value
+        else:
+            msg = f"Could not insert value with shape {value.shape} into Array with shape {self.shape}"
+            raise ValueError(msg)
+
+    def replace(self, i: int, value: int | float) -> None:
+        "Replace value at row i"
+        if i >= len(self):
+            msg = f"index {i} is out of bounds for array with size {len(self)}"
+            raise IndexError(msg)
+        self[i] = value

    def __getitem__(self, key):
        return self.nda[key]
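
`Array` now distinguishes its visible size (`len`) from the capacity of the backing numpy buffer, so growth amortizes reallocation by jumping to the next power of two. A minimal sketch of the new semantics:

    import numpy as np
    from lgdo import Array

    a = Array(np.arange(5))
    a.resize(3)                   # visible size shrinks ...
    assert len(a) == 3
    assert a.get_capacity() == 5  # ... but the buffer keeps its 5 slots
    a.append(99)                  # reuses spare capacity, no reallocation
    a.trim_capacity()             # shrink the buffer to fit the data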
lgdo/types/encoded.py CHANGED
@@ -11,12 +11,12 @@ from numpy.typing import NDArray

 from .. import utils
 from .array import Array
-from .lgdo import LGDO
+from .lgdo import LGDOCollection
 from .scalar import Scalar
 from .vectorofvectors import VectorOfVectors


-class VectorOfEncodedVectors(LGDO):
+class VectorOfEncodedVectors(LGDOCollection):
    """An array of variable-length encoded arrays.

    Used to represent an encoded :class:`.VectorOfVectors`. In addition to an
@@ -92,6 +92,17 @@ class VectorOfEncodedVectors(LGDO):

        return False

+    def reserve_capacity(self, *capacity: int) -> None:
+        self.encoded_data.reserve_capacity(*capacity)
+        self.decoded_size.reserve_capacity(capacity[0])
+
+    def get_capacity(self) -> tuple:
+        return (self.decoded_size.get_capacity(), *self.encoded_data.get_capacity())
+
+    def trim_capacity(self) -> None:
+        self.encoded_data.trim_capacity()
+        self.decoded_size.trim_capacity()
+
    def resize(self, new_size: int) -> None:
        """Resize vector along the first axis.

@@ -102,21 +113,6 @@ class VectorOfEncodedVectors(LGDO):
        self.encoded_data.resize(new_size)
        self.decoded_size.resize(new_size)

-    def append(self, value: tuple[NDArray, int]) -> None:
-        """Append a 1D encoded vector at the end.
-
-        Parameters
-        ----------
-        value
-            a tuple holding the encoded array and its decoded size.
-
-        See Also
-        --------
-        .VectorOfVectors.append
-        """
-        self.encoded_data.append(value[0])
-        self.decoded_size.append(value[1])
-
    def insert(self, i: int, value: tuple[NDArray, int]) -> None:
        """Insert an encoded vector at index `i`.

@@ -282,7 +278,7 @@ class VectorOfEncodedVectors(LGDO):
            raise ValueError(msg)


-class ArrayOfEncodedEqualSizedArrays(LGDO):
+class ArrayOfEncodedEqualSizedArrays(LGDOCollection):
    """An array of encoded arrays with equal decoded size.

    Used to represent an encoded :class:`.ArrayOfEqualSizedArrays`. In addition
@@ -349,14 +345,23 @@ class ArrayOfEncodedEqualSizedArrays(LGDO):

        return False

-    def resize(self, new_size: int) -> None:
+    def reserve_capacity(self, *capacity: int) -> None:
+        self.encoded_data.reserve_capacity(*capacity)
+
+    def get_capacity(self) -> tuple:
+        return self.encoded_data.get_capacity()
+
+    def trim_capacity(self) -> None:
+        self.encoded_data.trim_capacity()
+
+    def resize(self, new_size: int, trim: bool = False) -> None:
        """Resize array along the first axis.

        See Also
        --------
        .VectorOfVectors.resize
        """
-        self.encoded_data.resize(new_size)
+        self.encoded_data.resize(new_size, trim)

    def append(self, value: NDArray) -> None:
        """Append a 1D encoded array at the end.
lgdo/types/histogram.py CHANGED
@@ -424,7 +424,7 @@ class Histogram(Struct):
            dict.__setitem__(self, name, obj)
        else:
            msg = "histogram fields cannot be mutated "
-            raise TypeError(msg)
+            raise AttributeError(msg)

    def __getattr__(self, name: str) -> None:
        # do not allow for new attributes on this
lgdo/types/lgdo.py CHANGED
@@ -92,3 +92,53 @@ class LGDO(ABC):

    def __repr__(self) -> str:
        return self.__class__.__name__ + f"(attrs={self.attrs!r})"
+
+
+class LGDOCollection(LGDO):
+    """Abstract base class representing a LEGEND Collection Object (LGDO).
+    This defines the interface for classes used as table columns.
+    """
+
+    @abstractmethod
+    def __init__(self, attrs: dict[str, Any] | None = None) -> None:
+        super().__init__(attrs)
+
+    @abstractmethod
+    def __len__(self) -> int:
+        """Provides ``__len__`` for this array-like class."""
+
+    @abstractmethod
+    def reserve_capacity(self, capacity: int) -> None:
+        """Reserve capacity (in rows) for later use. Internal memory buffers
+        will have enough entries to store this many rows.
+        """
+
+    @abstractmethod
+    def get_capacity(self) -> int:
+        "get reserved capacity of internal memory buffers in rows"
+
+    @abstractmethod
+    def trim_capacity(self) -> None:
+        """set capacity to only what is required to store current contents
+        of LGDOCollection
+        """
+
+    @abstractmethod
+    def resize(self, new_size: int, trim: bool = False) -> None:
+        """set size of LGDOCollection to new_size rows"""
+
+    def append(self, val) -> None:
+        "append val to end of LGDOCollection"
+        self.insert(len(self), val)
+
+    @abstractmethod
+    def insert(self, i: int, val) -> None:
+        "insert val into LGDOCollection at position i"
+
+    @abstractmethod
+    def replace(self, i: int, val) -> None:
+        "replace item at position i with val in LGDOCollection"
+
+    def clear(self, trim: bool = False) -> None:
+        "set size of LGDOCollection to zero"
+        self.resize(0, trim=trim)
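
`append` and `clear` are the only concrete methods: `append` delegates to `insert` at the end, and `clear` is sugar for `resize(0, trim=trim)`, which by default keeps the capacity around for reuse. Sketch using `Array` as the concrete collection:

    import numpy as np
    from lgdo import Array

    col = Array(np.arange(8))
    col.clear()                  # size -> 0, capacity still 8 (cheap)
    assert len(col) == 0 and col.get_capacity() == 8
    col.clear(trim=True)         # also frees the backing buffer
    assert col.get_capacity() == 0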
lgdo/types/table.py CHANGED
@@ -19,7 +19,7 @@ from pandas.io.formats import format as fmt

 from .array import Array
 from .arrayofequalsizedarrays import ArrayOfEqualSizedArrays
-from .lgdo import LGDO
+from .lgdo import LGDO, LGDOCollection
 from .scalar import Scalar
 from .struct import Struct
 from .vectorofvectors import VectorOfVectors
@@ -27,13 +27,9 @@ from .vectorofvectors import VectorOfVectors

 log = logging.getLogger(__name__)


-class Table(Struct):
+class Table(Struct, LGDOCollection):
    """A special struct of arrays or subtable columns of equal length.

-    Holds onto an internal read/write location ``loc`` that is useful in
-    managing table I/O using functions like :meth:`push_row`, :meth:`is_full`,
-    and :meth:`clear`.
-
    Note
    ----
    If you write to a table and don't fill it up to its total size, be sure to
@@ -49,7 +45,7 @@ class Table(Struct):

    def __init__(
        self,
-        col_dict: Mapping[str, LGDO] | pd.DataFrame | ak.Array | None = None,
+        col_dict: Mapping[str, LGDOCollection] | pd.DataFrame | ak.Array | None = None,
        size: int | None = None,
        attrs: Mapping[str, Any] | None = None,
    ) -> None:
@@ -65,7 +61,7 @@ class Table(Struct):
        col_dict
            instantiate this table using the supplied mapping of column names
            and array-like objects. Supported input types are: mapping of
-            strings to LGDOs, :class:`pd.DataFrame` and :class:`ak.Array`.
+            strings to LGDOCollections, :class:`pd.DataFrame` and :class:`ak.Array`.
            Note 1: no copy is performed, the objects are used directly (unless
            :class:`ak.Array` is provided). Note 2: if `size` is not ``None``,
            all arrays will be resized to match it. Note 3: if the arrays have
@@ -85,7 +81,8 @@ class Table(Struct):
            col_dict = _ak_to_lgdo_or_col_dict(col_dict)

        # call Struct constructor
-        super().__init__(obj_dict=col_dict, attrs=attrs)
+        Struct.__init__(self, obj_dict=col_dict)
+        LGDOCollection.__init__(self, attrs=attrs)

        # if col_dict is not empty, set size according to it
        # if size is also supplied, resize all fields to match it
@@ -93,13 +90,10 @@ class Table(Struct):
        if col_dict is not None and len(col_dict) > 0:
            self.resize(new_size=size, do_warn=(size is None))

-        # if no col_dict, just set the size (default to 1024)
+        # if no col_dict, just set the size
        else:
            self.size = size if size is not None else None

-        # always start at loc=0
-        self.loc = 0
-
    def datatype_name(self) -> str:
        return "table"

@@ -107,7 +101,31 @@ class Table(Struct):
        """Provides ``__len__`` for this array-like class."""
        return self.size

-    def resize(self, new_size: int | None = None, do_warn: bool = False) -> None:
+    def reserve_capacity(self, capacity: int | list) -> None:
+        "Set size (number of rows) of internal memory buffer"
+        if isinstance(capacity, int):
+            for obj in self.values():
+                obj.reserve_capacity(capacity)
+        else:
+            if len(capacity) != len(self.keys()):
+                msg = "List of capacities must have same length as number of keys"
+                raise ValueError(msg)
+
+            for obj, cap in zip(self.values(), capacity):
+                obj.reserve_capacity(cap)
+
+    def get_capacity(self) -> int:
+        "Get list of capacities for each key"
+        return [v.get_capacity() for v in self.values()]
+
+    def trim_capacity(self) -> int:
+        "Set capacity to be minimum needed to support Array size"
+        for v in self.values():
+            v.trim_capacity()
+
+    def resize(
+        self, new_size: int | None = None, do_warn: bool = False, trim: bool = False
+    ) -> None:
        # if new_size = None, use the size from the first field
        for field, obj in self.items():
            if new_size is None:
@@ -119,21 +137,20 @@ class Table(Struct):
                    f"with size {len(obj)} != {new_size}"
                )
            if isinstance(obj, Table):
-                obj.resize(new_size)
+                obj.resize(new_size, trim)
            else:
-                obj.resize(new_size)
+                obj.resize(new_size, trim)
        self.size = new_size

-    def push_row(self) -> None:
-        self.loc += 1
-
-    def is_full(self) -> bool:
-        return self.loc >= self.size
-
-    def clear(self) -> None:
-        self.loc = 0
+    def insert(self, i: int, vals: dict) -> None:
+        "Insert vals into table at row i. Vals is a mapping from table key to val"
+        for k, ar in self.items():
+            ar.insert(i, vals[k])
+        self.size += 1

-    def add_field(self, name: str, obj: LGDO, use_obj_size: bool = False) -> None:
+    def add_field(
+        self, name: str, obj: LGDOCollection, use_obj_size: bool = False
+    ) -> None:
        """Add a field (column) to the table.

        Use the name "field" here to match the terminology used in
@@ -170,7 +187,9 @@ class Table(Struct):
        new_size = len(obj) if use_obj_size else self.size
        self.resize(new_size=new_size)

-    def add_column(self, name: str, obj: LGDO, use_obj_size: bool = False) -> None:
+    def add_column(
+        self, name: str, obj: LGDOCollection, use_obj_size: bool = False
+    ) -> None:
        """Alias for :meth:`.add_field` using table terminology 'column'."""
        self.add_field(name, obj, use_obj_size=use_obj_size)

@@ -201,8 +220,10 @@ class Table(Struct):
        set to ``False`` to turn off warnings associated with mismatched
        `loc` parameter or :meth:`add_column` warnings.
        """
-        if other_table.loc != self.loc and do_warn:
-            log.warning(f"other_table.loc ({other_table.loc}) != self.loc({self.loc})")
+        if len(other_table) != len(self) and do_warn:
+            log.warning(
+                f"len(other_table) ({len(other_table)}) != len(self) ({len(self)})"
+            )
        if cols is None:
            cols = other_table.keys()
        for name in cols:
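
With `loc`, `push_row`, `is_full`, and the old `clear` gone, row-wise filling goes through `insert` with a column-name-to-value mapping (or by resizing up front and assigning). Sketch:

    import numpy as np
    from lgdo import Array, Table

    t = Table(col_dict={"a": Array(np.array([1, 2])), "b": Array(np.array([3.0, 4.0]))})
    # old pattern: write at t.loc, then t.push_row()
    t.insert(len(t), {"a": 5, "b": 6.0})  # new pattern: append a row mapping
    assert len(t) == 3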
lgdo/types/vectorofvectors.py CHANGED
@@ -20,12 +20,12 @@ from .. import utils
 from . import arrayofequalsizedarrays as aoesa
 from . import vovutils
 from .array import Array
-from .lgdo import LGDO
+from .lgdo import LGDOCollection

 log = logging.getLogger(__name__)


-class VectorOfVectors(LGDO):
+class VectorOfVectors(LGDOCollection):
    """A n-dimensional variable-length 1D array of variable-length 1D arrays.

    If the vector is 2-dimensional, the internal representation is as two NumPy
@@ -210,20 +210,17 @@
        elif self.flattened_data is None:
            self.flattened_data = flattened_data

-        # finally set dtype
-        self.dtype = self.flattened_data.dtype
-
-        # set ndim
-        self.ndim = 2
-        pointer = self.flattened_data
-        while True:
-            if isinstance(pointer, Array):
-                break
+        super().__init__(attrs)

-            self.ndim += 1
-            pointer = pointer.flattened_data
+    @property
+    def ndim(self):
+        return 1 + (
+            1 if isinstance(self.flattened_data, Array) else self.flattened_data.ndim
+        )

-        super().__init__(attrs)
+    @property
+    def dtype(self) -> np.dtype:
+        return self.flattened_data.dtype

    def datatype_name(self) -> str:
        return "array"
@@ -276,7 +273,30 @@
        else:
            raise NotImplementedError

-    def resize(self, new_size: int) -> None:
+    def reserve_capacity(self, cap_cl, *cap_args) -> None:
+        """Set capacity of internal data arrays. Expect number of args to
+        equal `self.n_dim`. First arg is capacity of cumulative length array.
+        If `self.n_dim` is 2, second argument is capacity of flattened data,
+        otherwise arguments are fed recursively to remaining dimensions.
+        """
+        self.cumulative_length.reserve_capacity(cap_cl)
+        self.flattened_data.reserve_capacity(*cap_args)
+
+    def get_capacity(self) -> tuple[int]:
+        """Get tuple containing capacity of each dimension. First dimension
+        is cumulative length array. Last dimension is flattened data.
+        """
+        fd_cap = self.flattened_data.get_capacity()
+        if isinstance(fd_cap, int):
+            return (self.cumulative_length.get_capacity(), fd_cap)
+        return (self.cumulative_length.get_capacity(), *fd_cap)
+
+    def trim_capacity(self) -> None:
+        "Set capacity for all dimensions to minimum needed to hold data"
+        self.cumulative_length.trim_capacity()
+        self.flattened_data.trim_capacity()
+
+    def resize(self, new_size: int, trim: bool = False) -> None:
        """Resize vector along the first axis.

        `self.flattened_data` is resized only if `new_size` is smaller than the
@@ -286,6 +306,8 @@
        `self.cumulative_length` is padded with its last element. This
        corresponds to appending empty vectors.

+        If `trim` is ``True``, resize capacity to match new size.
+
        Examples
        --------
        >>> vov = VectorOfVectors([[1, 2, 3], [4, 5]])
@@ -303,23 +325,22 @@
        [3],
        ]
        """
-        vidx = self.cumulative_length
        old_s = len(self)
-        dlen = new_size - old_s
-        csum = vidx[-1] if len(self) > 0 else 0

        # first resize the cumulative length
-        self.cumulative_length.resize(new_size)
+        self.cumulative_length.resize(new_size, trim)

        # if new_size > size, new elements are filled with zeros, let's fix
        # that
-        if dlen > 0:
-            self.cumulative_length[old_s:] = csum
+        if new_size > old_s:
+            self.cumulative_length[old_s:] = self.cumulative_length[old_s - 1]

        # then resize the data array
        # if dlen > 0 this has no effect
        if len(self.cumulative_length) > 0:
-            self.flattened_data.resize(self.cumulative_length[-1])
+            self.flattened_data.resize(self.cumulative_length[-1], trim)
+        else:
+            self.flattened_data.resize(0, trim)

    def append(self, new: NDArray) -> None:
        """Append a 1D vector `new` at the end.
@@ -334,20 +355,7 @@
        [8 9],
        ]
        """
-        if self.ndim == 2:
-            # first extend cumulative_length by +1
-            self.cumulative_length.resize(len(self) + 1)
-            # set it at the right value
-            newlen = (
-                self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
-            )
-            self.cumulative_length[-1] = newlen
-            # then resize flattened_data to accommodate the new vector
-            self.flattened_data.resize(len(self.flattened_data) + len(new))
-            # finally set it
-            self[-1] = new
-        else:
-            raise NotImplementedError
+        self.insert(len(self), new)

    def insert(self, i: int, new: NDArray) -> None:
        """Insert a vector at index `i`.
@@ -364,23 +372,15 @@
        [8 9],
        [4 5],
        ]
-
-        Warning
-        -------
-        This method involves a significant amount of memory re-allocation and
-        is expected to perform poorly on large vectors.
        """
        if self.ndim == 2:
-            if i >= len(self):
-                msg = f"index {i} is out of bounds for vector with size {len(self)}"
+            if i > len(self):
+                msg = f"index {i} is out of bounds for vector with size {len(self)}"
                raise IndexError(msg)

-            self.flattened_data = Array(
-                np.insert(self.flattened_data, self.cumulative_length[i - 1], new)
-            )
-            self.cumulative_length = Array(
-                np.insert(self.cumulative_length, i, self.cumulative_length[i - 1])
-            )
+            i_start = 0 if i == 0 else self.cumulative_length[i - 1]
+            self.flattened_data.insert(i_start, new)
+            self.cumulative_length.insert(i, i_start)
            self.cumulative_length[i:] += np.uint32(len(new))
        else:
            raise NotImplementedError
@@ -400,11 +400,6 @@
        [[8 9],
        [4 5],
        ]
-
-        Warning
-        -------
-        This method involves a significant amount of memory re-allocation and
-        is expected to perform poorly on large vectors.
        """
        if self.ndim == 2:
            if i >= len(self):
@@ -414,27 +409,17 @@
            vidx = self.cumulative_length
            dlen = len(new) - len(self[i])

-            if dlen == 0:
-                # don't waste resources
-                self[i] = new
-            elif dlen < 0:
-                start = vidx[i - 1]
-                stop = start + len(new)
-                # set the already allocated indices
-                self.flattened_data[start:stop] = new
-                # then delete the extra indices
-                self.flattened_data = Array(
-                    np.delete(self.flattened_data, np.s_[stop : vidx[i]])
-                )
-            else:
-                # set the already allocated indices
-                self.flattened_data[vidx[i - 1] : vidx[i]] = new[: len(self[i])]
-                # then insert the remaining
-                self.flattened_data = Array(
-                    np.insert(self.flattened_data, vidx[i], new[len(self[i]) :])
-                )
-
-            vidx[i:] = vidx[i:] + dlen
+            if dlen != 0:
+                # move the subsequent entries
+                vidx[i:] += dlen
+                self.flattened_data.resize(vidx[-1])
+                self.flattened_data._nda[vidx[i] : vidx[-1]] = self.flattened_data._nda[
+                    vidx[i] - dlen : vidx[-1] - dlen
+                ]
+
+            # set the already allocated indices
+            start = vidx[i - 1] if i > 0 else 0
+            self.flattened_data[start : vidx[i]] = new
        else:
            raise NotImplementedError
@@ -484,7 +469,15 @@
        cum_lens = np.add(start, lens.cumsum(), dtype=int)

        # fill with fast vectorized routine
-        vovutils._nb_fill(vec, lens, self.flattened_data.nda[start : cum_lens[-1]])
+        if np.issubdtype(self.flattened_data.dtype, np.unsignedinteger):
+            nan_val = np.iinfo(self.flattened_data.dtype).max
+        elif np.issubdtype(self.flattened_data.dtype, np.integer):
+            nan_val = np.iinfo(self.flattened_data.dtype).min
+        else:
+            nan_val = np.nan
+        vovutils._nb_fill(
+            vec, lens, nan_val, self.flattened_data.nda[start : cum_lens[-1]]
+        )

        # add new vector(s) length to cumulative_length
        self.cumulative_length[i : i + len(lens)] = cum_lens
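
`append` and `insert` now shift entries inside the preallocated buffers instead of rebuilding `flattened_data` with `np.insert`, so element-wise filling no longer reallocates on every call. Sketch:

    import numpy as np
    from lgdo import VectorOfVectors

    vov = VectorOfVectors([[1, 2, 3], [4, 5]])
    vov.append(np.array([6, 7]))   # routed through insert(len(vov), ...)
    vov.insert(1, np.array([8]))   # shifts in place, no np.insert copy
    assert len(vov) == 4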
lgdo/types/vovutils.py CHANGED
@@ -81,7 +81,7 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND

 @numba.guvectorize(
     [
-        f"{data_type}[:,:],{size_type}[:],{data_type}[:]"
+        f"{data_type}[:,:],{size_type}[:],{data_type},{data_type}[:]"
         for data_type in [
             "b1",
             "i1",
@@ -99,10 +99,12 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND
         ]
         for size_type in ["i4", "i8", "u4", "u8"]
     ],
-    "(l,m),(l),(n)",
+    "(l,m),(l),(),(n)",
     **nb_kwargs,
 )
-def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
+def _nb_fill(
+    aoa_in: NDArray, len_in: NDArray, nan_val: int | float, flattened_array_out: NDArray
+):
    """Vectorized function to fill flattened array from array of arrays and
    lengths. Values in aoa_in past lengths will not be copied.
@@ -112,6 +114,9 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
        array of arrays containing values to be copied
    len_in
        array of vector lengths for each row of aoa_in
+    nan_val
+        value to use when len_in is longer than aoa_in. Should use
+        np.nan for floating point, and 0xfff... for integer types
    flattened_array_out
        flattened array to copy values into. Must be longer than sum of
        lengths in len_in
@@ -122,9 +127,14 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray):
        raise ValueError(msg)

    start = 0
+    max_len = aoa_in.shape[1]
    for i, ll in enumerate(len_in):
        stop = start + ll
-        flattened_array_out[start:stop] = aoa_in[i, :ll]
+        if ll > max_len:
+            flattened_array_out[start : start + max_len] = aoa_in[i, :]
+            flattened_array_out[start + max_len : stop] = nan_val
+        else:
+            flattened_array_out[start:stop] = aoa_in[i, :ll]
        start = stop
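
`_nb_fill` is private API, but the new argument is easy to exercise in isolation: rows of `len_in` that exceed the width of `aoa_in` are padded with `nan_val` instead of reading past the row. Sketch (internal module, subject to change):

    import numpy as np
    from lgdo.types import vovutils

    aoa = np.array([[1, 2, 3], [4, 5, 6]])
    lens = np.array([2, 5])        # second length exceeds the row width (3)
    out = np.empty(lens.sum(), dtype=aoa.dtype)
    vovutils._nb_fill(aoa, lens, np.iinfo(aoa.dtype).min, out)
    # out -> [1, 2, 4, 5, 6, min, min]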