legend-pydataobj 1.9.0__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lgdo/lh5/core.py CHANGED
@@ -1,15 +1,18 @@
  from __future__ import annotations

+ import bisect
  import inspect
  import sys
  from collections.abc import Mapping, Sequence
  from typing import Any

  import h5py
+ import numpy as np
  from numpy.typing import ArrayLike

  from .. import types
  from . import _serializers
+ from .utils import read_n_rows


  def read(
@@ -23,6 +26,7 @@ def read(
      obj_buf: types.LGDO = None,
      obj_buf_start: int = 0,
      decompress: bool = True,
+     locking: bool = False,
  ) -> types.LGDO | tuple[types.LGDO, int]:
      """Read LH5 object data from a file.

@@ -97,6 +101,8 @@ def read(
          Decompress data encoded with LGDO's compression routines right
          after reading. The option has no effect on data encoded with HDF5
          built-in filters, which is always decompressed upstream by HDF5.
+     locking
+         Lock HDF5 file while reading

      Returns
      -------
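A minimal usage sketch of the new keyword (not part of the diff; it assumes `read` is exposed as `lgdo.lh5.read`, and the file and object names are placeholders):

    from lgdo import lh5

    # locking=False (the new default) opens the file with HDF5 file locking
    # disabled, which avoids lock errors on filesystems without lock support
    obj = lh5.read("geds/raw", "data.lh5", locking=False)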
@@ -110,17 +116,72 @@ def read(
      if isinstance(lh5_file, h5py.File):
          lh5_obj = lh5_file[name]
      elif isinstance(lh5_file, str):
-         lh5_file = h5py.File(lh5_file, mode="r")
+         lh5_file = h5py.File(lh5_file, mode="r", locking=locking)
          lh5_obj = lh5_file[name]
      else:
-         lh5_obj = []
-         for h5f in lh5_file:
-             if isinstance(h5f, str):
-                 h5f = h5py.File(h5f, mode="r")  # noqa: PLW2901
-             lh5_obj += [h5f[name]]
+         lh5_files = list(lh5_file)
+         n_rows_read = 0
+         obj_buf_is_new = False
+
+         for i, h5f in enumerate(lh5_files):
+             if (
+                 isinstance(idx, (list, tuple))
+                 and len(idx) > 0
+                 and not np.isscalar(idx[0])
+             ):
+                 # a list of lists: must be one per file
+                 idx_i = idx[i]
+             elif idx is not None:
+                 # make idx a proper tuple if it's not one already
+                 if not (isinstance(idx, tuple) and len(idx) == 1):
+                     idx = (idx,)
+                 # idx is a long continuous array
+                 n_rows_i = read_n_rows(name, h5f)
+                 # find the length of the subset of idx that contains indices
+                 # that are less than n_rows_i
+                 n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
+                 # now split idx into idx_i and the remainder
+                 idx_i = np.array(idx[0])[:n_rows_to_read_i]
+                 idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
+             else:
+                 idx_i = None
+             n_rows_i = n_rows - n_rows_read
+
+             obj_ret = read(
+                 name,
+                 h5f,
+                 start_row,
+                 n_rows_i,
+                 idx_i,
+                 use_h5idx,
+                 field_mask,
+                 obj_buf,
+                 obj_buf_start,
+                 decompress,
+             )
+             if isinstance(obj_ret, tuple):
+                 obj_buf, n_rows_read_i = obj_ret
+                 obj_buf_is_new = True
+             else:
+                 obj_buf = obj_ret
+                 n_rows_read_i = len(obj_buf)
+
+             n_rows_read += n_rows_read_i
+             if n_rows_read >= n_rows or obj_buf is None:
+                 return obj_buf, n_rows_read
+             start_row = 0
+             obj_buf_start += n_rows_read_i
+         return obj_buf if obj_buf_is_new else (obj_buf, n_rows_read)
+
+     if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
+         idx = idx[0]
+     if isinstance(idx, np.ndarray) and idx.dtype == np.dtype("?"):
+         idx = np.where(idx)[0]

      obj, n_rows_read = _serializers._h5_read_lgdo(
-         lh5_obj,
+         lh5_obj.id,
+         lh5_obj.file.filename,
+         lh5_obj.name,
          start_row=start_row,
          n_rows=n_rows,
          idx=idx,
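The new per-file handling above is what lets `read` accept a sequence of files directly. It assumes `idx` is a sorted array of global row indices: for each file, `bisect.bisect_left` counts how many of the remaining indices fall below that file's row count (from `read_n_rows`), those are read from the current file, and the remainder is shifted down by the file's length before moving to the next file. A self-contained sketch of the same splitting logic, with made-up row counts and indices:

    import bisect

    import numpy as np

    rows_per_file = [10, 5, 8]          # hypothetical per-file row counts
    idx = np.array([2, 7, 11, 12, 20])  # sorted global row indices to read

    for n_rows_i in rows_per_file:
        # number of remaining indices that live in the current file
        n_take = bisect.bisect_left(idx, n_rows_i)
        idx_i, idx = idx[:n_take], idx[n_take:] - n_rows_i
        print(idx_i)  # -> [2 7], then [1 2], then [5]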
@@ -143,6 +204,7 @@ def write(
      n_rows: int | None = None,
      wo_mode: str = "append",
      write_start: int = 0,
+     page_buffer: int = 0,
      **h5py_kwargs,
  ) -> None:
      """Write an LGDO into an LH5 file.
@@ -218,6 +280,11 @@ def write(
      write_start
          row in the output file (if already existing) to start overwriting
          from.
+     page_buffer
+         enable paged aggregation with a buffer of this size in bytes.
+         Only used when creating a new file. Useful when writing a file
+         with a large number of small datasets. This is a shorthand for
+         ``fs_strategy="page", fs_page_size=page_buffer``.
      **h5py_kwargs
          additional keyword arguments forwarded to
          :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5
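A hypothetical call exercising the new keyword (assuming the top-level `lgdo.lh5.write` and `lgdo.types.Array` APIs; the output path and data are placeholders):

    import numpy as np

    from lgdo import lh5
    from lgdo.types import Array

    data = Array(np.arange(1000, dtype="float32"))
    # page aggregation only applies when the file is created, so use a
    # file-creating write mode together with page_buffer
    lh5.write(data, "data", "out.lh5", wo_mode="overwrite_file", page_buffer=65536)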
@@ -225,6 +292,13 @@ def write(
          datasets. **Note: `compression` is ignored if compression is specified
          as an `obj` attribute.**
      """
+     if wo_mode in ("w", "write", "of", "overwrite_file"):
+         h5py_kwargs.update(
+             {
+                 "fs_strategy": "page",
+                 "fs_page_size": page_buffer,
+             }
+         )
      return _serializers._h5_write_lgdo(
          obj,
          name,
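The branch above only pre-populates `h5py_kwargs`; assuming those two keys eventually reach the `h5py.File` constructor when the output file is created (an assumption, since the forwarding happens in `_h5_write_lgdo`, which this diff does not show), the effect at the HDF5 level would be roughly:

    import h5py

    # rough equivalent of page_buffer=65536 on a newly created file
    with h5py.File("out.lh5", "w", fs_strategy="page", fs_page_size=65536) as f:
        pass  # datasets written here share page-aligned file-space allocations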
lgdo/lh5/exceptions.py CHANGED
@@ -4,11 +4,11 @@ import h5py
  
  
  class LH5DecodeError(Exception):
-     def __init__(self, message: str, obj: h5py.Dataset | h5py.Group) -> None:
+     def __init__(self, message: str, fname: str, oname: str) -> None:
          super().__init__(message)

-         self.file = obj.file.filename
-         self.obj = obj.name
+         self.file = fname
+         self.obj = oname

      def __str__(self) -> str:
          return (
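With the new signature the exception carries the file and object names as plain strings, so it can be constructed without a live `h5py` handle. A small sketch (names are placeholders):

    from lgdo.lh5.exceptions import LH5DecodeError

    err = LH5DecodeError("unsupported datatype", "data.lh5", "/geds/raw")
    print(err.file, err.obj)  # -> data.lh5 /geds/raw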