legend-pydataobj 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,17 +4,21 @@ import logging
 import sys
 from bisect import bisect_left
 
+import h5py
 import numpy as np
 
 from ....types import Array
 from ... import datatype
 from ...exceptions import LH5DecodeError
+from .utils import read_attrs
 
 log = logging.getLogger(__name__)
 
 
 def _h5_read_ndarray(
     h5d,
+    fname,
+    oname,
     start_row=0,
     n_rows=sys.maxsize,
     idx=None,
@@ -24,48 +28,49 @@ def _h5_read_ndarray(
 ):
     if obj_buf is not None and not isinstance(obj_buf, Array):
         msg = "object buffer is not an Array"
-        raise LH5DecodeError(msg, h5d)
+        raise LH5DecodeError(msg, fname, oname)
 
     # compute the number of rows to read
     # we culled idx above for start_row and n_rows, now we have to apply
     # the constraint of the length of the dataset
     try:
-        ds_n_rows = h5d.shape[0]
+        fspace = h5d.get_space()
+        ds_n_rows = fspace.shape[0]
     except AttributeError as e:
         msg = "does not seem to be an HDF5 dataset"
-        raise LH5DecodeError(msg, h5d) from e
+        raise LH5DecodeError(msg, fname, oname) from e
 
     if idx is not None:
-        if len(idx[0]) > 0 and idx[0][-1] >= ds_n_rows:
+        if len(idx) > 0 and idx[-1] >= ds_n_rows:
             log.warning("idx indexed past the end of the array in the file. Culling...")
-            n_rows_to_read = bisect_left(idx[0], ds_n_rows)
-            idx = (idx[0][:n_rows_to_read],)
-            if len(idx[0]) == 0:
+            n_rows_to_read = bisect_left(idx, ds_n_rows)
+            idx = idx[:n_rows_to_read]
+            if len(idx) == 0:
                 log.warning("idx empty after culling.")
-        n_rows_to_read = len(idx[0])
+        n_rows_to_read = len(idx)
     else:
         n_rows_to_read = ds_n_rows - start_row
     if n_rows_to_read > n_rows:
         n_rows_to_read = n_rows
 
-    # if idx is passed, check if we can make it a slice instead (faster)
-    change_idx_to_slice = False
-
-    # prepare the selection for the read. Use idx if available
-    if idx is not None:
-        # check if idx is empty and convert to slice instead
-        if len(idx[0]) == 0:
-            source_sel = np.s_[0:0]
-            change_idx_to_slice = True
-        # check if idx is contiguous and increasing
-        # if so, convert it to a slice instead (faster)
-        elif np.all(np.diff(idx[0]) == 1):
-            source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
-            change_idx_to_slice = True
-        else:
-            source_sel = idx
-    else:
-        source_sel = np.s_[start_row : start_row + n_rows_to_read]
+    if idx is None:
+        fspace.select_hyperslab(
+            (start_row,) + (0,) * (h5d.rank - 1),
+            (1,) * h5d.rank,
+            None,
+            (n_rows_to_read,) + fspace.shape[1:],
+        )
+    elif use_h5idx:
+        # Note that h5s will automatically merge adjacent elements into a range
+        fspace.select_none()
+        for i in idx:
+            fspace.select_hyperslab(
+                (i,) + (0,) * (h5d.rank - 1),
+                (1,) * h5d.rank,
+                None,
+                (1,) + fspace.shape[1:],
+                h5py.h5s.SELECT_OR,
+            )
 
     # Now read the array
     if obj_buf is not None and n_rows_to_read > 0:
@@ -74,26 +79,35 @@ def _h5_read_ndarray(
         obj_buf.resize(buf_size)
         dest_sel = np.s_[obj_buf_start:buf_size]
 
-        # this is required to make the read of multiple files faster
-        # until a better solution found.
-        if change_idx_to_slice or idx is None or use_h5idx:
-            h5d.read_direct(obj_buf.nda, source_sel, dest_sel)
+        if idx is None or use_h5idx:
+            mspace = h5py.h5s.create_simple(obj_buf.nda.shape)
+            mspace.select_hyperslab(
+                (obj_buf_start,) + (0,) * (h5d.rank - 1),
+                (1,) * h5d.rank,
+                None,
+                (n_rows_to_read,) + fspace.shape[1:],
+            )
+            h5d.read(mspace, fspace, obj_buf.nda)
         else:
-            # it is faster to read the whole object and then do fancy indexing
-            obj_buf.nda[dest_sel] = h5d[...][source_sel]
-
+            tmp = np.empty(fspace.shape, h5d.dtype)
+            h5d.read(fspace, fspace, tmp)
+            obj_buf.nda[dest_sel, ...] = tmp[idx, ...]
         nda = obj_buf.nda
     elif n_rows == 0:
         tmp_shape = (0,) + h5d.shape[1:]
         nda = np.empty(tmp_shape, h5d.dtype)
-    elif change_idx_to_slice or idx is None or use_h5idx:
-        nda = h5d[source_sel]
     else:
-        # it is faster to read the whole object and then do fancy indexing
-        nda = h5d[...][source_sel]
+        mspace = h5py.h5s.create_simple((n_rows_to_read,) + fspace.shape[1:])
+        nda = np.empty(mspace.shape, h5d.dtype)
+        if idx is None or use_h5idx:
+            h5d.read(mspace, fspace, nda)
+        else:
+            tmp = np.empty(fspace.shape, h5d.dtype)
+            h5d.read(fspace, fspace, tmp)
+            nda[:, ...] = tmp[idx, ...]
 
     # Finally, set attributes and return objects
-    attrs = dict(h5d.attrs)
+    attrs = read_attrs(h5d, fname, oname)
 
     # special handling for bools
     # (c and Julia store as uint8 so cast to bool)
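
The hunks above replace h5py's high-level slicing (`h5d[...]`, `read_direct`) with the low-level dataspace API: a hyperslab or point selection is built on the file dataspace and read straight into a preallocated buffer. A minimal sketch of the same pattern, with the file name, dataset name, and row range chosen for illustration:

```python
import h5py
import numpy as np

# read rows [start, start+count) of an n-dimensional dataset via the
# low-level API, mirroring the selection logic in the diff above
with h5py.File("data.lh5", "r") as f:      # hypothetical file
    h5d = f["dataset"].id                  # low-level h5py.h5d.DatasetID
    fspace = h5d.get_space()
    start, count = 10, 5
    # select one block of shape (count, *trailing_dims) starting at `start`
    fspace.select_hyperslab(
        (start,) + (0,) * (h5d.rank - 1),
        (1,) * h5d.rank,
        None,
        (count,) + fspace.shape[1:],
    )
    out = np.empty((count,) + fspace.shape[1:], h5d.dtype)
    mspace = h5py.h5s.create_simple(out.shape)
    h5d.read(mspace, fspace, out)  # reads directly into `out`
```
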
@@ -2,20 +2,27 @@ from __future__ import annotations
 
 import logging
 
+import h5py
 import numpy as np
 
 from ....types import Scalar
 from ...exceptions import LH5DecodeError
+from .utils import read_attrs
 
 log = logging.getLogger(__name__)
 
 
 def _h5_read_scalar(
     h5d,
+    fname,
+    oname,
     obj_buf=None,
 ):
-    value = h5d[()]
-    attrs = dict(h5d.attrs)
+    value = np.empty((), h5d.dtype)
+    sp = h5py.h5s.create(h5py.h5s.SCALAR)
+    h5d.read(sp, sp, value)
+    value = value[()]
+    attrs = read_attrs(h5d, fname, oname)
 
     # special handling for bools
     # (c and Julia store as uint8 so cast to bool)
@@ -25,7 +32,7 @@ def _h5_read_scalar(
     if obj_buf is not None:
         if not isinstance(obj_buf, Scalar):
             msg = "object buffer is not a Scalar"
-            raise LH5DecodeError(msg, h5d)
+            raise LH5DecodeError(msg, fname, oname)
 
         obj_buf.value = value
         obj_buf.attrs.update(attrs)
@@ -1,12 +1,35 @@
 from __future__ import annotations
 
+import h5py
+import numpy as np
+
 from ...exceptions import LH5DecodeError
 
 
-def check_obj_buf_attrs(attrs, new_attrs, obj):
+def check_obj_buf_attrs(attrs, new_attrs, fname, oname):
     if set(attrs.keys()) != set(new_attrs.keys()):
         msg = (
             f"existing buffer and new data chunk have different attributes: "
-            f"obj_buf.attrs={attrs} != {obj.file.filename}[{obj.name}].attrs={new_attrs}"
+            f"obj_buf.attrs={attrs} != {fname}[{oname}].attrs={new_attrs}"
         )
-        raise LH5DecodeError(msg, obj)
+        raise LH5DecodeError(msg, fname, oname)
+
+
+def read_attrs(h5o, fname, oname):
+    """Read all attributes of an HDF5 dataset or group using the low-level
+    API and return them as a dict. Assumes all are strings or scalar types."""
+    attrs = {}
+    for i_attr in range(h5py.h5a.get_num_attrs(h5o)):
+        h5a = h5py.h5a.open(h5o, index=i_attr)
+        name = h5a.get_name().decode()
+        if h5a.shape != ():
+            msg = f"attribute {name} is not a string or scalar"
+            raise LH5DecodeError(msg, fname, oname)
+        val = np.empty((), h5a.dtype)
+        h5a.read(val)
+        if h5a.get_type().get_class() == h5py.h5t.STRING:
+            attrs[name] = val.item().decode()
+        else:
+            attrs[name] = val.item()
+        h5a.close()
+    return attrs
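
A usage sketch for the new `read_attrs` helper. The module path is inferred from the relative imports in the hunks above, and the file and object names are illustrative:

```python
import h5py

# path inferred from `from .utils import read_attrs` in the read serializers
from lgdo.lh5._serializers.read.utils import read_attrs

with h5py.File("data.lh5", "r") as f:  # hypothetical file
    h5o = f["table"].id  # low-level identifier; works for groups and datasets
    attrs = read_attrs(h5o, f.filename, "/table")
    print(attrs.get("datatype"))
```
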
@@ -3,6 +3,7 @@ from __future__ import annotations
 import logging
 import sys
 
+import h5py
 import numba
 import numpy as np
 
@@ -15,12 +16,15 @@ from ...exceptions import LH5DecodeError
 from .array import (
     _h5_read_array,
 )
+from .utils import read_attrs
 
 log = logging.getLogger(__name__)
 
 
 def _h5_read_vector_of_vectors(
     h5g,
+    fname,
+    oname,
     start_row=0,
     n_rows=sys.maxsize,
     idx=None,
@@ -30,12 +34,15 @@ def _h5_read_vector_of_vectors(
 ):
     if obj_buf is not None and not isinstance(obj_buf, VectorOfVectors):
         msg = "object buffer is not a VectorOfVectors"
-        raise LH5DecodeError(msg, h5g)
+        raise LH5DecodeError(msg, fname, oname)
 
     # read out cumulative_length
     cumulen_buf = None if obj_buf is None else obj_buf.cumulative_length
+    h5d_cl = h5py.h5d.open(h5g, b"cumulative_length")
     cumulative_length, n_rows_read = _h5_read_array(
-        h5g["cumulative_length"],
+        h5d_cl,
+        fname,
+        f"{oname}/cumulative_length",
         start_row=start_row,
         n_rows=n_rows,
         idx=idx,
@@ -51,17 +58,19 @@ def _h5_read_vector_of_vectors(
     if idx is not None and n_rows_read > 0:
         # get the starting indices for each array in flattened data:
         # the starting index for array[i] is cumulative_length[i-1]
-        idx2 = (np.asarray(idx[0]).copy() - 1,)
+        idx2 = np.asarray(idx).copy() - 1
 
         # re-read cumulative_length with these indices
         # note this will allocate memory for fd_starts!
         fd_start = None
-        if idx2[0][0] == -1:
-            idx2 = (idx2[0][1:],)
+        if idx2[0] == -1:
+            idx2 = idx2[1:]
             fd_start = 0  # this variable avoids an ndarray append
 
         fd_starts, fds_n_rows_read = _h5_read_array(
-            h5g["cumulative_length"],
+            h5d_cl,
+            fname,
+            f"{oname}/cumulative_length",
             start_row=start_row,
             n_rows=n_rows,
             idx=idx2,
@@ -98,7 +107,11 @@ def _h5_read_vector_of_vectors(
             # need to read out the cumulen sample -before- the first sample
             # read above in order to get the starting row of the first
            # vector to read out in flattened_data
-            fd_start = h5g["cumulative_length"][start_row - 1]
+            fspace = h5d_cl.get_space()
+            fspace.select_elements([[start_row - 1]])
+            mspace = h5py.h5s.create(h5py.h5s.SCALAR)
+            fd_start = np.empty((), h5d_cl.dtype)
+            h5d_cl.read(mspace, fspace, fd_start)
 
         # check limits for values that will be used subsequently
         if this_cumulen_nda[-1] < fd_start:
@@ -112,7 +125,7 @@ def _h5_read_vector_of_vectors(
                 f"cumulative_length non-increasing between entries "
                 f"{start_row} and {start_row+n_rows_read}"
             )
-            raise LH5DecodeError(msg, h5g)
+            raise LH5DecodeError(msg, fname, oname)
 
     # determine the number of rows for the flattened_data readout
     fd_n_rows = this_cumulen_nda[-1] if n_rows_read > 0 else 0
@@ -126,6 +139,8 @@ def _h5_read_vector_of_vectors(
     # read for flattened_data
     this_cumulen_nda -= fd_start
 
+    h5d_cl.close()
+
     # If we started with a partially-filled buffer, add the
     # appropriate offset for the start of the in-memory flattened
     # data for this read.
@@ -144,17 +159,23 @@ def _h5_read_vector_of_vectors(
         fd_buf.resize(fdb_size)
 
     # now read
-    lgdotype = dtypeutils.datatype(h5g["flattened_data"].attrs["datatype"])
+    h5o = h5py.h5o.open(h5g, b"flattened_data")
+    h5a_dtype = h5py.h5a.open(h5o, b"datatype")
+    val = np.empty((), "O")
+    h5a_dtype.read(val)
+    lgdotype = dtypeutils.datatype(val.item().decode())
     if lgdotype is Array:
         _func = _h5_read_array
     elif lgdotype is VectorOfVectors:
         _func = _h5_read_vector_of_vectors
     else:
         msg = f"type {lgdotype.__name__} is not supported"
-        raise LH5DecodeError(msg, h5g, "flattened_data")
+        raise LH5DecodeError(msg, fname, f"{oname}/flattened_data")
 
     flattened_data, _ = _func(
-        h5g["flattened_data"],
+        h5o,
+        fname,
+        f"{oname}/flattened_data",
        start_row=fd_start,
        n_rows=fd_n_rows,
        idx=fd_idx,
@@ -162,6 +183,7 @@ def _h5_read_vector_of_vectors(
         obj_buf=fd_buf,
         obj_buf_start=fd_buf_start,
     )
+    h5o.close()
 
     if obj_buf is not None:
         # if the buffer is partially filled, cumulative_length will be invalid
@@ -176,7 +198,7 @@ def _h5_read_vector_of_vectors(
         VectorOfVectors(
             flattened_data=flattened_data,
             cumulative_length=cumulative_length,
-            attrs=dict(h5g.attrs),
+            attrs=read_attrs(h5g, fname, oname),
         ),
         n_rows_read,
     )
@@ -194,4 +216,4 @@ def _make_fd_idx(starts, stops, idx):
        for i in range(starts[j], stops[j]):
            idx[k] = i
            k += 1
-    return (idx,)
+    return idx
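
For context, a `VectorOfVectors` is stored as two flat datasets: `flattened_data` holds every element back to back, and `cumulative_length[i]` is one past the end of vector `i`. That is why `fd_start` is recovered from `cumulative_length[start_row - 1]` in the hunk above. A pure-NumPy illustration of the layout:

```python
import numpy as np

flattened_data = np.array([1, 2, 3, 4, 5, 6])
cumulative_length = np.array([2, 3, 6])

# vector i spans flattened_data[cumulative_length[i-1]:cumulative_length[i]]
starts = np.concatenate(([0], cumulative_length[:-1]))
vectors = [flattened_data[a:b] for a, b in zip(starts, cumulative_length)]
# -> [array([1, 2]), array([3]), array([4, 5, 6])]
```
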
@@ -71,7 +71,12 @@ def _h5_write_array(
     _attrs = obj.getattrs(datatype=True)
     _attrs.pop("compression", None)
     _attrs.pop("hdf5_settings", None)
-    ds.attrs.update(_attrs)
+    ds.attrs.update(
+        {
+            k: v.encode("utf-8") if isinstance(v, str) else v
+            for k, v in _attrs.items()
+        }
+    )
 
     return
 
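
Encoding string attributes to UTF-8 bytes before handing them to h5py changes how they are stored: NumPy turns `bytes` into a fixed-length `S` dtype, which HDF5 stores as a fixed-length string, whereas a Python `str` becomes a variable-length string. Presumably this keeps written attributes compatible with the low-level `read_attrs` above. A quick demonstration (file name illustrative):

```python
import h5py
import numpy as np

with h5py.File("attrs-demo.h5", "w") as f:
    ds = f.create_dataset("d", data=np.arange(3))
    ds.attrs["vlen"] = "array<1>{real}"                   # variable-length string
    ds.attrs["fixed"] = "array<1>{real}".encode("utf-8")  # fixed-length string
    # h5py reads these back as str and bytes, respectively
    print(type(ds.attrs["vlen"]), type(ds.attrs["fixed"]))
```
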
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import logging
+import os
+from inspect import signature
 
 import h5py
 
@@ -27,6 +29,10 @@ def _h5_write_lgdo(
 ):
     assert isinstance(obj, types.LGDO)
 
+    file_kwargs = {
+        k: h5py_kwargs[k]
+        for k in h5py_kwargs.keys() & signature(h5py.File).parameters.keys()
+    }
+    h5py_kwargs = {k: h5py_kwargs[k] for k in h5py_kwargs.keys() - file_kwargs.keys()}
     if wo_mode == "write_safe":
         wo_mode = "w"
     if wo_mode == "append":
@@ -46,10 +52,9 @@ def _h5_write_lgdo(
     # In hdf5, 'a' is really "modify" -- in addition to appending, you can
     # change any object in the file. So we use file:append for
     # write_object:overwrite.
-    mode = "w" if wo_mode == "of" else "a"
-
     if not isinstance(lh5_file, h5py.File):
-        lh5_file = h5py.File(lh5_file, mode=mode)
+        mode = "w" if wo_mode == "of" or not os.path.exists(lh5_file) else "a"
+        lh5_file = h5py.File(lh5_file, mode=mode, **file_kwargs)
 
     log.debug(
         f"writing {obj!r}[{start_row}:{n_rows}] as "
@@ -65,8 +70,12 @@ def _h5_write_lgdo(
 
     # struct, table, waveform table or histogram.
     if isinstance(obj, types.Struct):
-        if isinstance(obj, types.Histogram) and wo_mode not in ["w", "o", "of"]:
-            msg = f"can't append-write histogram in wo_mode '{wo_mode}'"
+        if (
+            isinstance(obj, types.Histogram)
+            and wo_mode not in ["w", "o", "of"]
+            and name in group
+        ):
+            msg = f"can't append-write to histogram in wo_mode '{wo_mode}'"
             raise LH5EncodeError(msg, lh5_file, group, name)
         if isinstance(obj, types.Histogram) and write_start != 0:
             msg = f"can't write histogram in wo_mode '{wo_mode}' with write_start != 0"
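
The new `file_kwargs` split uses `inspect.signature` to route keyword arguments accepted by `h5py.File` (for example the paged file-space settings) away from the dataset-creation kwargs. The idiom in isolation, with a stand-in function instead of `h5py.File`:

```python
from inspect import signature

def open_file(path, mode="r", locking=False):  # stand-in for h5py.File
    ...

kwargs = {"locking": True, "compression": "gzip"}
# keys that match the callee's parameters go to the file constructor
file_kwargs = {
    k: kwargs[k] for k in kwargs.keys() & signature(open_file).parameters.keys()
}
rest = {k: kwargs[k] for k in kwargs.keys() - file_kwargs.keys()}
# file_kwargs == {"locking": True}; rest == {"compression": "gzip"}
```
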
@@ -20,4 +20,9 @@ def _h5_write_scalar(obj, name, lh5_file, group="/", wo_mode="append"):
         raise LH5EncodeError(msg, lh5_file, group, name)
 
     ds = group.create_dataset(name, shape=(), data=obj.value)
-    ds.attrs.update(obj.attrs)
+    ds.attrs.update(
+        {
+            k: v.encode("utf-8") if isinstance(v, str) else v
+            for k, v in obj.attrs.items()
+        }
+    )
lgdo/lh5/core.py CHANGED
@@ -1,15 +1,18 @@
 from __future__ import annotations
 
+import bisect
 import inspect
 import sys
 from collections.abc import Mapping, Sequence
 from typing import Any
 
 import h5py
+import numpy as np
 from numpy.typing import ArrayLike
 
 from .. import types
 from . import _serializers
+from .utils import read_n_rows
 
 
 def read(
@@ -23,6 +26,7 @@ def read(
     obj_buf: types.LGDO = None,
     obj_buf_start: int = 0,
     decompress: bool = True,
+    locking: bool = False,
 ) -> types.LGDO | tuple[types.LGDO, int]:
     """Read LH5 object data from a file.
 
@@ -97,6 +101,8 @@ def read(
         Decompress data encoded with LGDO's compression routines right
         after reading. The option has no effect on data encoded with HDF5
         built-in filters, which is always decompressed upstream by HDF5.
+    locking
+        Lock the HDF5 file while reading.
 
     Returns
     -------
@@ -110,17 +116,69 @@ def read(
     if isinstance(lh5_file, h5py.File):
         lh5_obj = lh5_file[name]
     elif isinstance(lh5_file, str):
-        lh5_file = h5py.File(lh5_file, mode="r")
+        lh5_file = h5py.File(lh5_file, mode="r", locking=locking)
         lh5_obj = lh5_file[name]
     else:
-        lh5_obj = []
-        for h5f in lh5_file:
-            if isinstance(h5f, str):
-                h5f = h5py.File(h5f, mode="r")  # noqa: PLW2901
-            lh5_obj += [h5f[name]]
+        lh5_files = list(lh5_file)
+        n_rows_read = 0
+        obj_buf_is_new = False
 
+        for i, h5f in enumerate(lh5_files):
+            if (
+                isinstance(idx, (list, tuple))
+                and len(idx) > 0
+                and not np.isscalar(idx[0])
+            ):
+                # a list of lists: must be one per file
+                idx_i = idx[i]
+            elif idx is not None:
+                # make idx a proper tuple if it's not one already
+                if not (isinstance(idx, tuple) and len(idx) == 1):
+                    idx = (idx,)
+                # idx is a long continuous array
+                n_rows_i = read_n_rows(name, h5f)
+                # find the length of the subset of idx that contains indices
+                # that are less than n_rows_i
+                n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
+                # now split idx into idx_i and the remainder
+                idx_i = np.array(idx[0])[:n_rows_to_read_i]
+                idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
+            else:
+                idx_i = None
+            n_rows_i = n_rows - n_rows_read
+
+            obj_ret = read(
+                name,
+                h5f,
+                start_row,
+                n_rows_i,
+                idx_i,
+                use_h5idx,
+                field_mask,
+                obj_buf,
+                obj_buf_start,
+                decompress,
+            )
+            if isinstance(obj_ret, tuple):
+                obj_buf, n_rows_read_i = obj_ret
+                obj_buf_is_new = True
+            else:
+                obj_buf = obj_ret
+                n_rows_read_i = len(obj_buf)
+
+            n_rows_read += n_rows_read_i
+            if n_rows_read >= n_rows or obj_buf is None:
+                return obj_buf, n_rows_read
+            start_row = 0
+            obj_buf_start += n_rows_read_i
+        return obj_buf if obj_buf_is_new else (obj_buf, n_rows_read)
+
+    if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
+        idx = idx[0]
     obj, n_rows_read = _serializers._h5_read_lgdo(
-        lh5_obj,
+        lh5_obj.id,
+        lh5_obj.file.filename,
+        lh5_obj.name,
         start_row=start_row,
         n_rows=n_rows,
         idx=idx,
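
With this change, reading from a sequence of files no longer builds a list of open h5py objects up front; `read` recurses file by file, splitting a flat `idx` across files via `read_n_rows` and `bisect`. A usage sketch, with file and object names chosen for illustration:

```python
from lgdo import lh5

# a flat idx is split across files internally: indices past the end of
# run0.lh5 are shifted down and applied to run1.lh5
obj = lh5.read("geds/raw/energy", ["run0.lh5", "run1.lh5"], idx=[0, 5, 100_000])

# alternatively, pass one index list per file
obj = lh5.read("geds/raw/energy", ["run0.lh5", "run1.lh5"], idx=[[0, 5], [17]])
```
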
@@ -143,6 +201,7 @@ def write(
     n_rows: int | None = None,
     wo_mode: str = "append",
     write_start: int = 0,
+    page_buffer: int = 0,
     **h5py_kwargs,
 ) -> None:
     """Write an LGDO into an LH5 file.
@@ -218,6 +277,11 @@ def write(
     write_start
         row in the output file (if already existing) to start overwriting
         from.
+    page_buffer
+        enable paged aggregation with a buffer of this size in bytes.
+        Only used when creating a new file. Useful when writing a file
+        with a large number of small datasets. This is a short-hand for
+        ``fs_strategy="page", fs_page_size=page_buffer``.
     **h5py_kwargs
         additional keyword arguments forwarded to
         :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5
@@ -225,6 +289,13 @@ def write(
         datasets. **Note: `compression` is ignored if compression is specified
         as an `obj` attribute.**
     """
+    if wo_mode in ("w", "write", "of", "overwrite_file"):
+        h5py_kwargs.update(
+            {
+                "fs_strategy": "page",
+                "fs_page_size": page_buffer,
+            }
+        )
     return _serializers._h5_write_lgdo(
         obj,
         name,
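
A sketch of the new `page_buffer` option; per the hunk above, it translates into h5py's `fs_strategy`/`fs_page_size` file-creation parameters when a file is (over)written. The object and file names are illustrative:

```python
import lgdo
from lgdo import lh5

# create a fresh file with paged file-space aggregation, useful when the
# file will hold many small datasets
scalar = lgdo.Scalar(42)
lh5.write(scalar, "answer", "out.lh5", wo_mode="of", page_buffer=16 * 1024)
```
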
lgdo/lh5/exceptions.py CHANGED
@@ -4,11 +4,11 @@ import h5py
 
 
 class LH5DecodeError(Exception):
-    def __init__(self, message: str, obj: h5py.Dataset | h5py.Group) -> None:
+    def __init__(self, message: str, fname: str, oname: str) -> None:
         super().__init__(message)
 
-        self.file = obj.file.filename
-        self.obj = obj.name
+        self.file = fname
+        self.obj = oname
 
     def __str__(self) -> str:
         return (
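
The exception now carries the file and object names directly instead of holding a live h5py object, which lets the low-level serializers above raise it without a high-level `Dataset` or `Group` in hand. A minimal sketch of the new constructor, with illustrative names:

```python
from lgdo.lh5.exceptions import LH5DecodeError

try:
    raise LH5DecodeError("unsupported datatype", "data.lh5", "/table/col")
except LH5DecodeError as e:
    print(e.file, e.obj)  # -> data.lh5 /table/col
```
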