legend-pydataobj 1.9.0__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lgdo/lh5/iterator.py CHANGED
@@ -2,11 +2,14 @@ from __future__ import annotations
2
2
 
3
3
  import logging
4
4
  import typing
5
+ from warnings import warn
5
6
 
6
7
  import numpy as np
7
8
  import pandas as pd
9
+ from numpy.typing import NDArray
8
10
 
9
11
  from ..types import Array, Scalar, Struct, VectorOfVectors
12
+ from ..units import default_units_registry as ureg
10
13
  from .store import LH5Store
11
14
  from .utils import expand_path
12
15
 
@@ -19,35 +22,53 @@ class LH5Iterator(typing.Iterator):
19
22
  at a time. This also accepts an entry list/mask to enable event selection,
20
23
  and a field mask.
21
24
 
22
- This class can be used either for random access:
25
+ This can be used as an iterator:
23
26
 
24
- >>> lh5_obj, n_rows = lh5_it.read(entry)
25
-
26
- to read the block of entries starting at entry. In case of multiple files
27
- or the use of an event selection, entry refers to a global event index
28
- across files and does not count events that are excluded by the selection.
29
-
30
- This can also be used as an iterator:
31
-
32
- >>> for lh5_obj, entry, n_rows in LH5Iterator(...):
27
+ >>> for lh5_obj, i_entry, n_rows in LH5Iterator(...):
33
28
  >>> # do the thing!
34
29
 
35
- This is intended for if you are reading a large quantity of data but
36
- want to limit your memory usage (particularly when reading in waveforms!).
30
+ This is intended for reading a large quantity of data. This
31
+ will ensure that you traverse files efficiently to minimize caching time
32
+ and will limit your memory usage (particularly when reading in waveforms!).
37
33
  The ``lh5_obj`` that is read by this class is reused in order to avoid
38
34
  reallocation of memory; this means that if you want to hold on to data
39
35
  between reads, you will have to copy it somewhere!
36
+
37
+ When defining an LH5Iterator, you must give it a list of files and the
38
+ hdf5 groups containing the data tables you are reading. You may also
39
+ provide a field mask, and an entry list or mask, specifying which entries
40
+ to read from the files. You may also pair it with a friend iterator, which
41
+ contains a parallel group of files which will be simultaneously read.
42
+ In addition to accessing requested data via ``lh5_obj``, several
43
+ properties exist to tell you where that data came from:
44
+
45
+ - lh5_it.current_local_entries: get the entry numbers relative to the
46
+ file the data came from
47
+ - lh5_it.current_global_entries: get the entry numbers relative to the
48
+ full dataset
49
+ - lh5_it.current_files: get the file name corresponding to each entry
50
+ - lh5_it.current_groups: get the group name corresponding to each entry
51
+
52
+ This class can also be used for random access:
53
+
54
+ >>> lh5_obj, n_rows = lh5_it.read(i_entry)
55
+
56
+ to read the block of entries starting at i_entry. In case of multiple files
57
+ or the use of an event selection, i_entry refers to a global event index
58
+ across files and does not count events that are excluded by the selection.
40
59
  """
41
60
 
42
61
  def __init__(
43
62
  self,
44
63
  lh5_files: str | list[str],
45
- groups: str | list[str],
64
+ groups: str | list[str] | list[list[str]],
46
65
  base_path: str = "",
47
66
  entry_list: list[int] | list[list[int]] | None = None,
48
67
  entry_mask: list[bool] | list[list[bool]] | None = None,
49
68
  field_mask: dict[str, bool] | list[str] | tuple[str] | None = None,
50
- buffer_len: int = 3200,
69
+ buffer_len: int = "100*MB",
70
+ file_cache: int = 10,
71
+ file_map: NDArray[int] = None,
51
72
  friend: typing.Iterator | None = None,
52
73
  ) -> None:
53
74
  """
@@ -57,9 +78,10 @@ class LH5Iterator(typing.Iterator):
57
78
  file or files to read from. May include wildcards and environment
58
79
  variables.
59
80
  groups
60
- HDF5 group(s) to read. If a list is provided for both lh5_files
61
- and group, they must be the same size. If a file is wild-carded,
62
- the same group will be assigned to each file found
81
+ HDF5 group(s) to read. If a list of strings is provided, use
82
+ same groups for each file. If a list of lists is provided, size
83
+ of outer list must match size of file list, and each inner list
84
+ will apply to a single file (or set of wildcarded files)
63
85
  entry_list
64
86
  list of entry numbers to read. If a nested list is provided,
65
87
  expect one top-level list for each file, containing a list of
@@ -72,66 +94,98 @@ class LH5Iterator(typing.Iterator):
72
94
  more details.
73
95
  buffer_len
74
96
  number of entries to read at a time while iterating through files.
97
+ file_cache
98
+ maximum number of files to keep open at a time
99
+ file_map
100
+ cumulative file/group entries. This can be provided on construction
101
+ to speed up random or sparse access; otherwise, we sequentially
102
+ read the size of each group. WARNING: no checks for accuracy are
103
+ performed so only use this if you know what you are doing!
75
104
  friend
76
105
  a \"friend\" LH5Iterator that will be read in parallel with this.
77
106
  The friend should have the same length and entry list. A single
78
107
  LH5 table containing columns from both iterators will be returned.
108
+ Note that buffer_len will be set to the minimum of the two.
79
109
  """
80
- self.lh5_st = LH5Store(base_path=base_path, keep_open=True)
110
+ self.lh5_st = LH5Store(base_path=base_path, keep_open=file_cache)
81
111
 
82
112
  # List of files, with wildcards and env vars expanded
83
113
  if isinstance(lh5_files, str):
84
114
  lh5_files = [lh5_files]
85
- if isinstance(groups, list):
86
- lh5_files *= len(groups)
87
- elif not isinstance(lh5_files, list):
115
+ elif not isinstance(lh5_files, (list, set, tuple)):
88
116
  msg = "lh5_files must be a string or list of strings"
89
117
  raise ValueError(msg)
90
118
 
91
119
  if isinstance(groups, str):
92
- groups = [groups] * len(lh5_files)
120
+ groups = [[groups]] * len(lh5_files)
93
121
  elif not isinstance(groups, list):
94
- msg = "group must be a string or list of strings"
122
+ msg = "group must be a string or appropriate list"
123
+ raise ValueError(msg)
124
+ elif all(isinstance(g, str) for g in groups):
125
+ groups = [groups] * len(lh5_files)
126
+ elif len(groups) == len(lh5_files) and all(
127
+ isinstance(gr_list, (list, set, tuple)) for gr_list in groups
128
+ ):
129
+ pass
130
+ else:
131
+ msg = "group must be a string or appropriate list"
95
132
  raise ValueError(msg)
96
133
 
97
134
  if len(groups) != len(lh5_files):
98
135
  msg = "lh5_files and groups must have same length"
99
136
  raise ValueError(msg)
100
137
 
138
+ # make flattened outer-product-like list of files and groups
101
139
  self.lh5_files = []
102
140
  self.groups = []
103
141
  for f, g in zip(lh5_files, groups):
104
- f_exp = expand_path(f, list=True, base_path=base_path)
105
- self.lh5_files += f_exp
106
- self.groups += [g] * len(f_exp)
142
+ for f_exp in expand_path(f, list=True, base_path=base_path):
143
+ self.lh5_files += [f_exp] * len(g)
144
+ self.groups += list(g)
107
145
 
108
146
  if entry_list is not None and entry_mask is not None:
109
147
  msg = "entry_list and entry_mask arguments are mutually exclusive"
110
148
  raise ValueError(msg)
111
149
 
112
150
  # Map to last row in each file
113
- self.file_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
151
+ if file_map is None:
152
+ self.file_map = np.full(len(self.lh5_files), np.iinfo("q").max, "q")
153
+ else:
154
+ self.file_map = np.array(file_map)
155
+
114
156
  # Map to last iterator entry for each file
115
- self.entry_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
157
+ self.entry_map = np.full(len(self.lh5_files), np.iinfo("q").max, "q")
116
158
  self.buffer_len = buffer_len
117
159
 
118
160
  if len(self.lh5_files) > 0:
119
161
  f = self.lh5_files[0]
120
162
  g = self.groups[0]
163
+ n_rows = self.lh5_st.read_n_rows(g, f)
164
+
165
+ if isinstance(self.buffer_len, str):
166
+ self.buffer_len = ureg.Quantity(buffer_len)
167
+ if isinstance(self.buffer_len, ureg.Quantity):
168
+ self.buffer_len = int(
169
+ self.buffer_len
170
+ / (self.lh5_st.read_size_in_bytes(g, f) * ureg.B)
171
+ * n_rows
172
+ )
173
+
121
174
  self.lh5_buffer = self.lh5_st.get_buffer(
122
175
  g,
123
176
  f,
124
177
  size=self.buffer_len,
125
178
  field_mask=field_mask,
126
179
  )
127
- self.file_map[0] = self.lh5_st.read_n_rows(g, f)
180
+ if file_map is None:
181
+ self.file_map[0] = n_rows
128
182
  else:
129
183
  msg = f"can't open any files from {lh5_files}"
130
184
  raise RuntimeError(msg)
131
185
 
132
186
  self.n_rows = 0
133
- self.current_entry = 0
134
- self.next_entry = 0
187
+ self.current_i_entry = 0
188
+ self.next_i_entry = 0
135
189
 
136
190
  self.field_mask = field_mask
137
191
 
@@ -142,13 +196,13 @@ class LH5Iterator(typing.Iterator):
142
196
  entry_list = list(entry_list)
143
197
  if isinstance(entry_list[0], int):
144
198
  self.local_entry_list = [None] * len(self.file_map)
145
- self.global_entry_list = np.array(entry_list, "i")
199
+ self.global_entry_list = np.array(entry_list, "q")
146
200
  self.global_entry_list.sort()
147
201
 
148
202
  else:
149
203
  self.local_entry_list = [[]] * len(self.file_map)
150
204
  for i_file, local_list in enumerate(entry_list):
151
- self.local_entry_list[i_file] = np.array(local_list, "i")
205
+ self.local_entry_list[i_file] = np.array(local_list, "q")
152
206
  self.local_entry_list[i_file].sort()
153
207
 
154
208
  elif entry_mask is not None:
@@ -168,6 +222,15 @@ class LH5Iterator(typing.Iterator):
168
222
  if not isinstance(friend, typing.Iterator):
169
223
  msg = "Friend must be an Iterator"
170
224
  raise ValueError(msg)
225
+
226
+ # set buffer_lens to be equal
227
+ if self.buffer_len < friend.buffer_len:
228
+ friend.buffer_len = self.buffer_len
229
+ friend.lh5_buffer.resize(self.buffer_len)
230
+ elif self.buffer_len > friend.buffer_len:
231
+ self.buffer_len = friend.buffer_len
232
+ self.lh5_buffer.resize(friend.buffer_len)
233
+
171
234
  self.lh5_buffer.join(friend.lh5_buffer)
172
235
  self.friend = friend
173
236
 
@@ -176,33 +239,52 @@ class LH5Iterator(typing.Iterator):
176
239
  if i_file < 0:
177
240
  return 0
178
241
  fcl = self.file_map[i_file]
179
- if fcl == np.iinfo("i").max:
180
- fcl = self._get_file_cumlen(i_file - 1) + self.lh5_st.read_n_rows(
181
- self.groups[i_file], self.lh5_files[i_file]
182
- )
183
- self.file_map[i_file] = fcl
242
+
243
+ # if we haven't already calculated, calculate for all files up to i_file
244
+ if fcl == np.iinfo("q").max:
245
+ i_start = np.searchsorted(self.file_map, np.iinfo("q").max)
246
+ fcl = self.file_map[i_start - 1] if i_start > 0 else 0
247
+
248
+ for i in range(i_start, i_file + 1):
249
+ fcl += self.lh5_st.read_n_rows(self.groups[i], self.lh5_files[i])
250
+ self.file_map[i] = fcl
184
251
  return fcl
185
252
 
253
+ @property
254
+ def current_entry(self) -> int:
255
+ "deprecated alias for current_i_entry"
256
+ warn(
257
+ "current_entry has been renamed to current_i_entry.",
258
+ DeprecationWarning,
259
+ stacklevel=2,
260
+ )
261
+
262
+ return self.current_i_entry
263
+
186
264
  def _get_file_cumentries(self, i_file: int) -> int:
187
265
  """Helper to get cumulative iterator entries in file"""
188
266
  if i_file < 0:
189
267
  return 0
190
268
  n = self.entry_map[i_file]
191
- if n == np.iinfo("i").max:
192
- elist = self.get_file_entrylist(i_file)
193
- fcl = self._get_file_cumlen(i_file)
194
- if elist is None:
195
- # no entry list provided
196
- n = fcl
197
- else:
198
- file_entries = self.get_file_entrylist(i_file)
199
- n = len(file_entries)
200
- # check that file entries fall inside of file
201
- if n > 0 and file_entries[-1] >= fcl:
202
- logging.warning(f"Found entries out of range for file {i_file}")
203
- n = np.searchsorted(file_entries, fcl, "right")
204
- n += self._get_file_cumentries(i_file - 1)
205
- self.entry_map[i_file] = n
269
+
270
+ # if we haven't already calculated, calculate for all files up to i_file
271
+ if n == np.iinfo("q").max:
272
+ i_start = np.searchsorted(self.entry_map, np.iinfo("q").max)
273
+ n = self.entry_map[i_start - 1] if i_start > 0 else 0
274
+
275
+ for i in range(i_start, i_file + 1):
276
+ elist = self.get_file_entrylist(i)
277
+ fcl = self._get_file_cumlen(i)
278
+ if elist is None:
279
+ # no entry list provided
280
+ n = fcl
281
+ else:
282
+ n += len(elist)
283
+ # check that file entries fall inside of file
284
+ if len(elist) > 0 and elist[-1] >= fcl:
285
+ logging.warning(f"Found entries out of range for file {i}")
286
+ n += np.searchsorted(elist, fcl, "right") - len(elist)
287
+ self.entry_map[i] = n
206
288
  return n
207
289
 
208
290
  def get_file_entrylist(self, i_file: int) -> np.ndarray:
@@ -218,50 +300,50 @@ class LH5Iterator(typing.Iterator):
218
300
  f_end = self._get_file_cumlen(i_file)
219
301
  i_start = self._get_file_cumentries(i_file - 1)
220
302
  i_stop = np.searchsorted(self.global_entry_list, f_end, "right")
221
- elist = np.array(self.global_entry_list[i_start:i_stop], "i") - f_start
303
+ elist = np.array(self.global_entry_list[i_start:i_stop], "q") - f_start
222
304
  self.local_entry_list[i_file] = elist
223
305
  return elist
224
306
 
225
307
  def get_global_entrylist(self) -> np.ndarray:
226
308
  """Get global entry list, constructing it if needed"""
227
309
  if self.global_entry_list is None and self.local_entry_list is not None:
228
- self.global_entry_list = np.zeros(len(self), "i")
310
+ self.global_entry_list = np.zeros(len(self), "q")
229
311
  for i_file in range(len(self.lh5_files)):
230
- i_start = self.get_file_cumentries(i_file - 1)
231
- i_stop = self.get_file_cumentries(i_file)
232
- f_start = self.get_file_cumlen(i_file - 1)
312
+ i_start = self._get_file_cumentries(i_file - 1)
313
+ i_stop = self._get_file_cumentries(i_file)
314
+ f_start = self._get_file_cumlen(i_file - 1)
233
315
  self.global_entry_list[i_start:i_stop] = (
234
316
  self.get_file_entrylist(i_file) + f_start
235
317
  )
236
318
  return self.global_entry_list
237
319
 
238
- def read(self, entry: int) -> tuple[LGDO, int]:
239
- """Read the nextlocal chunk of events, starting at entry. Return the
320
+ def read(self, i_entry: int) -> tuple[LGDO, int]:
321
+ """Read the next local chunk of events, starting at i_entry. Return the
240
322
  LH5 buffer and number of rows read."""
241
323
  self.n_rows = 0
242
- i_file = np.searchsorted(self.entry_map, entry, "right")
324
+ i_file = np.searchsorted(self.entry_map, i_entry, "right")
243
325
 
244
326
  # if file hasn't been opened yet, search through files
245
327
  # sequentially until we find the right one
246
- if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("i").max:
247
- while i_file < len(self.lh5_files) and entry >= self._get_file_cumentries(
328
+ if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("q").max:
329
+ while i_file < len(self.lh5_files) and i_entry >= self._get_file_cumentries(
248
330
  i_file
249
331
  ):
250
332
  i_file += 1
251
333
 
252
334
  if i_file == len(self.lh5_files):
253
335
  return (self.lh5_buffer, self.n_rows)
254
- local_entry = entry - self._get_file_cumentries(i_file - 1)
336
+ local_i_entry = i_entry - self._get_file_cumentries(i_file - 1)
255
337
 
256
338
  while self.n_rows < self.buffer_len and i_file < len(self.file_map):
257
339
  # Loop through files
258
340
  local_idx = self.get_file_entrylist(i_file)
259
341
  if local_idx is not None and len(local_idx) == 0:
260
342
  i_file += 1
261
- local_entry = 0
343
+ local_i_entry = 0
262
344
  continue
263
345
 
264
- i_local = local_idx[local_entry] if local_idx is not None else local_entry
346
+ i_local = local_i_entry if local_idx is None else local_idx[local_i_entry]
265
347
  self.lh5_buffer, n_rows = self.lh5_st.read(
266
348
  self.groups[i_file],
267
349
  self.lh5_files[i_file],
@@ -275,12 +357,12 @@ class LH5Iterator(typing.Iterator):
275
357
 
276
358
  self.n_rows += n_rows
277
359
  i_file += 1
278
- local_entry = 0
360
+ local_i_entry = 0
279
361
 
280
- self.current_entry = entry
362
+ self.current_i_entry = i_entry
281
363
 
282
364
  if self.friend is not None:
283
- self.friend.read(entry)
365
+ self.friend.read(i_entry)
284
366
 
285
367
  return (self.lh5_buffer, self.n_rows)
286
368
 
@@ -290,6 +372,108 @@ class LH5Iterator(typing.Iterator):
290
372
  if self.friend is not None:
291
373
  self.friend.reset_field_mask(mask)
292
374
 
375
+ @property
376
+ def current_local_entries(self) -> NDArray[int]:
377
+ """Return list of local file entries in buffer"""
378
+ cur_entries = np.zeros(self.n_rows, dtype="int32")
379
+ i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
380
+ file_start = self._get_file_cumentries(i_file - 1)
381
+ i_local = self.current_i_entry - file_start
382
+ i = 0
383
+
384
+ while i < len(cur_entries):
385
+ # number of entries to read from this file
386
+ file_end = self._get_file_cumentries(i_file)
387
+ n = min(file_end - file_start - i_local, len(cur_entries) - i)
388
+ entries = self.get_file_entrylist(i_file)
389
+
390
+ if entries is None:
391
+ cur_entries[i : i + n] = np.arange(i_local, i_local + n)
392
+ else:
393
+ cur_entries[i : i + n] = entries[i_local : i_local + n]
394
+
395
+ i_file += 1
396
+ file_start = file_end
397
+ i_local = 0
398
+ i += n
399
+
400
+ return cur_entries
401
+
402
+ @property
403
+ def current_global_entries(self) -> NDArray[int]:
404
+ """Return list of global entries (relative to the full dataset) in buffer"""
405
+ cur_entries = np.zeros(self.n_rows, dtype="int32")
406
+ i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
407
+ file_start = self._get_file_cumentries(i_file - 1)
408
+ i_local = self.current_i_entry - file_start
409
+ i = 0
410
+
411
+ while i < len(cur_entries):
412
+ # number of entries to read from this file
413
+ file_end = self._get_file_cumentries(i_file)
414
+ n = min(file_end - file_start - i_local, len(cur_entries) - i)
415
+ entries = self.get_file_entrylist(i_file)
416
+
417
+ if entries is None:
418
+ cur_entries[i : i + n] = self._get_file_cumlen(i_file - 1) + np.arange(
419
+ i_local, i_local + n
420
+ )
421
+ else:
422
+ cur_entries[i : i + n] = (
423
+ self._get_file_cumlen(i_file - 1) + entries[i_local : i_local + n]
424
+ )
425
+
426
+ i_file += 1
427
+ file_start = file_end
428
+ i_local = 0
429
+ i += n
430
+
431
+ return cur_entries
432
+
433
+ @property
434
+ def current_files(self) -> NDArray[str]:
435
+ """Return list of file names for entries in buffer"""
436
+ cur_files = np.zeros(self.n_rows, dtype=object)
437
+ i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
438
+ file_start = self._get_file_cumentries(i_file - 1)
439
+ i_local = self.current_i_entry - file_start
440
+ i = 0
441
+
442
+ while i < len(cur_files):
443
+ # number of entries to read from this file
444
+ file_end = self._get_file_cumentries(i_file)
445
+ n = min(file_end - file_start - i_local, len(cur_files) - i)
446
+ cur_files[i : i + n] = self.lh5_files[i_file]
447
+
448
+ i_file += 1
449
+ file_start = file_end
450
+ i_local = 0
451
+ i += n
452
+
453
+ return cur_files
454
+
455
+ @property
456
+ def current_groups(self) -> NDArray[str]:
457
+ """Return list of group names for entries in buffer"""
458
+ cur_groups = np.zeros(self.n_rows, dtype=object)
459
+ i_file = np.searchsorted(self.entry_map, self.current_i_entry, "right")
460
+ file_start = self._get_file_cumentries(i_file - 1)
461
+ i_local = self.current_i_entry - file_start
462
+ i = 0
463
+
464
+ while i < len(cur_groups):
465
+ # number of entries to read from this file
466
+ file_end = self._get_file_cumentries(i_file)
467
+ n = min(file_end - file_start - i_local, len(cur_groups) - i)
468
+ cur_groups[i : i + n] = self.groups[i_file]
469
+
470
+ i_file += 1
471
+ file_start = file_end
472
+ i_local = 0
473
+ i += n
474
+
475
+ return cur_groups
476
+
293
477
  def __len__(self) -> int:
294
478
  """Return the total number of entries."""
295
479
  return (
@@ -300,15 +484,15 @@ class LH5Iterator(typing.Iterator):
300
484
 
301
485
  def __iter__(self) -> typing.Iterator:
302
486
  """Loop through entries in blocks of size buffer_len."""
303
- self.current_entry = 0
304
- self.next_entry = 0
487
+ self.current_i_entry = 0
488
+ self.next_i_entry = 0
305
489
  return self
306
490
 
307
491
  def __next__(self) -> tuple[LGDO, int, int]:
308
492
  """Read next buffer_len entries and return lh5_table, iterator entry
309
493
  and n_rows read."""
310
- buf, n_rows = self.read(self.next_entry)
311
- self.next_entry = self.current_entry + n_rows
494
+ buf, n_rows = self.read(self.next_i_entry)
495
+ self.next_i_entry = self.current_i_entry + n_rows
312
496
  if n_rows == 0:
313
497
  raise StopIteration
314
- return (buf, self.current_entry, n_rows)
498
+ return (buf, self.current_i_entry, n_rows)