legend-pydataobj 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lgdo/lh5/store.py CHANGED
@@ -5,13 +5,16 @@ HDF5 files.
5
5
 
6
6
  from __future__ import annotations
7
7
 
8
+ import bisect
8
9
  import logging
9
10
  import os
10
11
  import sys
11
12
  from collections.abc import Mapping, Sequence
13
+ from inspect import signature
12
14
  from typing import Any
13
15
 
14
16
  import h5py
17
+ import numpy as np
15
18
  from numpy.typing import ArrayLike
16
19
 
17
20
  from .. import types
@@ -34,7 +37,9 @@ class LH5Store:
34
37
  lgdo.waveformtable.WaveformTable
35
38
  """
36
39
 
37
- def __init__(self, base_path: str = "", keep_open: bool = False) -> None:
40
+ def __init__(
41
+ self, base_path: str = "", keep_open: bool = False, locking: bool = False
42
+ ) -> None:
38
43
  """
39
44
  Parameters
40
45
  ----------
@@ -43,12 +48,21 @@ class LH5Store:
43
48
  keep_open
44
49
  whether to keep files open by storing the :mod:`h5py` objects as
45
50
  class attributes.
51
+ locking
52
+ whether to lock files when reading
46
53
  """
47
54
  self.base_path = "" if base_path == "" else utils.expand_path(base_path)
48
55
  self.keep_open = keep_open
56
+ self.locking = locking
49
57
  self.files = {}
50
58
 
51
- def gimme_file(self, lh5_file: str | h5py.File, mode: str = "r") -> h5py.File:
59
+ def gimme_file(
60
+ self,
61
+ lh5_file: str | h5py.File,
62
+ mode: str = "r",
63
+ page_buffer: int = 0,
64
+ **file_kwargs,
65
+ ) -> h5py.File:
52
66
  """Returns a :mod:`h5py` file object from the store or creates a new one.
53
67
 
54
68
  Parameters
@@ -57,12 +71,20 @@ class LH5Store:
57
71
  LH5 file name.
58
72
  mode
59
73
  mode in which to open file. See :class:`h5py.File` documentation.
74
+ page_buffer
75
+ enable paged aggregation with a buffer of this size in bytes.
76
+ Only used when creating a new file. Useful when writing a file
77
+ with a large number of small datasets. This is a short-hand for
78
+ ``(fs_strategy="page", fs_page_size=page_buffer)``
79
+ file_kwargs
80
+ Keyword arguments for :class:`h5py.File`
60
81
  """
61
82
  if isinstance(lh5_file, h5py.File):
62
83
  return lh5_file
63
84
 
64
85
  if mode == "r":
65
86
  lh5_file = utils.expand_path(lh5_file, base_path=self.base_path)
87
+ file_kwargs["locking"] = self.locking
66
88
 
67
89
  if lh5_file in self.files:
68
90
  return self.files[lh5_file]
@@ -72,20 +94,30 @@ class LH5Store:
72
94
  else:
73
95
  full_path = lh5_file
74
96
 
97
+ file_exists = os.path.exists(full_path)
75
98
  if mode != "r":
76
99
  directory = os.path.dirname(full_path)
77
100
  if directory != "" and not os.path.exists(directory):
78
101
  log.debug(f"making path {directory}")
79
102
  os.makedirs(directory)
80
103
 
81
- if mode == "r" and not os.path.exists(full_path):
104
+ if mode == "r" and not file_exists:
82
105
  msg = f"file {full_path} not found"
83
106
  raise FileNotFoundError(msg)
107
+ if not file_exists:
108
+ mode = "w"
84
109
 
85
- if mode != "r" and os.path.exists(full_path):
110
+ if mode != "r" and file_exists:
86
111
  log.debug(f"opening existing file {full_path} in mode '{mode}'")
87
112
 
88
- h5f = h5py.File(full_path, mode)
113
+ if mode == "w":
114
+ file_kwargs.update(
115
+ {
116
+ "fs_strategy": "page",
117
+ "fs_page_size": page_buffer,
118
+ }
119
+ )
120
+ h5f = h5py.File(full_path, mode, **file_kwargs)
89
121
 
90
122
  if self.keep_open:
91
123
  self.files[lh5_file] = h5f
@@ -135,6 +167,7 @@ class LH5Store:
135
167
  obj_buf: types.LGDO = None,
136
168
  obj_buf_start: int = 0,
137
169
  decompress: bool = True,
170
+ **file_kwargs,
138
171
  ) -> tuple[types.LGDO, int]:
139
172
  """Read LH5 object data from a file in the store.
140
173
 
@@ -143,13 +176,62 @@ class LH5Store:
143
176
  .lh5.core.read
144
177
  """
145
178
  # grab files from store
146
- if not isinstance(lh5_file, (str, h5py.File)):
147
- lh5_obj = [self.gimme_file(f, "r")[name] for f in list(lh5_file)]
179
+ if isinstance(lh5_file, (str, h5py.File)):
180
+ lh5_obj = self.gimme_file(lh5_file, "r", **file_kwargs)[name]
148
181
  else:
149
- lh5_obj = self.gimme_file(lh5_file, "r")[name]
150
-
182
+ lh5_files = list(lh5_file)
183
+ n_rows_read = 0
184
+
185
+ for i, h5f in enumerate(lh5_files):
186
+ if (
187
+ isinstance(idx, (list, tuple))
188
+ and len(idx) > 0
189
+ and not np.isscalar(idx[0])
190
+ ):
191
+ # a list of lists: must be one per file
192
+ idx_i = idx[i]
193
+ elif idx is not None:
194
+ # make idx a proper tuple if it's not one already
195
+ if not (isinstance(idx, tuple) and len(idx) == 1):
196
+ idx = (idx,)
197
+ # idx is a long continuous array
198
+ n_rows_i = utils.read_n_rows(name, h5f)
199
+ # find the length of the subset of idx that contains indices
200
+ # that are less than n_rows_i
201
+ n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
202
+ # now split idx into idx_i and the remainder
203
+ idx_i = np.array(idx[0])[:n_rows_to_read_i]
204
+ idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
205
+ else:
206
+ idx_i = None
207
+ n_rows_i = n_rows - n_rows_read
208
+
209
+ obj_buf, n_rows_read_i = self.read(
210
+ name,
211
+ h5f,
212
+ start_row,
213
+ n_rows_i,
214
+ idx_i,
215
+ use_h5idx,
216
+ field_mask,
217
+ obj_buf,
218
+ obj_buf_start,
219
+ decompress,
220
+ )
221
+
222
+ n_rows_read += n_rows_read_i
223
+ if n_rows_read >= n_rows or obj_buf is None:
224
+ return obj_buf, n_rows_read
225
+ start_row = 0
226
+ obj_buf_start += n_rows_read_i
227
+ return obj_buf, n_rows_read
228
+
229
+ if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
230
+ idx = idx[0]
151
231
  return _serializers._h5_read_lgdo(
152
- lh5_obj,
232
+ lh5_obj.id,
233
+ lh5_obj.file.filename,
234
+ lh5_obj.name,
153
235
  start_row=start_row,
154
236
  n_rows=n_rows,
155
237
  idx=idx,
@@ -170,6 +252,7 @@ class LH5Store:
170
252
  n_rows: int | None = None,
171
253
  wo_mode: str = "append",
172
254
  write_start: int = 0,
255
+ page_buffer: int = 0,
173
256
  **h5py_kwargs,
174
257
  ) -> None:
175
258
  """Write an LGDO into an LH5 file.
@@ -199,10 +282,17 @@ class LH5Store:
199
282
  # write_object:overwrite.
200
283
  mode = "w" if wo_mode == "of" else "a"
201
284
 
285
+ file_kwargs = {
286
+ k: h5py_kwargs[k]
287
+ for k in h5py_kwargs & signature(h5py.File).parameters.keys()
288
+ }
289
+
202
290
  return _serializers._h5_write_lgdo(
203
291
  obj,
204
292
  name,
205
- self.gimme_file(lh5_file, mode=mode),
293
+ self.gimme_file(
294
+ lh5_file, mode=mode, page_buffer=page_buffer, **file_kwargs
295
+ ),
206
296
  group=group,
207
297
  start_row=start_row,
208
298
  n_rows=n_rows,
lgdo/lh5/tools.py CHANGED
@@ -128,7 +128,7 @@ def show(
128
128
 
129
129
  # open file
130
130
  if isinstance(lh5_file, str):
131
- lh5_file = h5py.File(utils.expand_path(lh5_file), "r")
131
+ lh5_file = h5py.File(utils.expand_path(lh5_file), "r", locking=False)
132
132
 
133
133
  # go to group
134
134
  if lh5_group != "/":
lgdo/lh5/utils.py CHANGED
@@ -125,7 +125,12 @@ def get_h5_group(
125
125
  else:
126
126
  group = base_group.create_group(group)
127
127
  if grp_attrs is not None:
128
- group.attrs.update(grp_attrs)
128
+ group.attrs.update(
129
+ {
130
+ k: v.encode("utf-8") if isinstance(v, str) else v
131
+ for k, v in grp_attrs.items()
132
+ }
133
+ )
129
134
  return group
130
135
  if (
131
136
  grp_attrs is not None
@@ -141,7 +146,13 @@ def get_h5_group(
141
146
  log.debug(f"overwriting {group}.attrs...")
142
147
  for key in group.attrs:
143
148
  group.attrs.pop(key)
144
- group.attrs.update(grp_attrs)
149
+
150
+ group.attrs.update(
151
+ {
152
+ k: v.encode("utf-8") if isinstance(v, str) else v
153
+ for k, v in grp_attrs.items()
154
+ }
155
+ )
145
156
 
146
157
  return group
147
158
 
lgdo/types/histogram.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  from collections.abc import Iterable
4
5
  from typing import Any
5
6
 
@@ -12,6 +13,8 @@ from .lgdo import LGDO
12
13
  from .scalar import Scalar
13
14
  from .struct import Struct
14
15
 
16
+ log = logging.getLogger(__name__)
17
+
15
18
 
16
19
  class Histogram(Struct):
17
20
  class Axis(Struct):
@@ -197,6 +200,7 @@ class Histogram(Struct):
197
200
  isdensity: bool = False,
198
201
  attrs: dict[str, Any] | None = None,
199
202
  binedge_attrs: dict[str, Any] | None = None,
203
+ flow: bool = True,
200
204
  ) -> None:
201
205
  """A special struct to contain histogrammed data.
202
206
 
@@ -221,6 +225,16 @@ class Histogram(Struct):
221
225
  as binning.
222
226
  attrs
223
227
  a set of user attributes to be carried along with this LGDO.
228
+ flow
229
+ If ``False``, discard counts in over-/underflow bins of the passed
230
+ :class:`hist.Hist` instance. If ``True``, this data will also be discarded,
231
+ but a warning is emitted.
232
+
233
+ .. note ::
234
+
235
+ :class:`Histogram` does not support storing counts in overflow or
236
+ underflow bins. This parameter just controls whether a warning will
237
+ be emitted.
224
238
  """
225
239
  if isinstance(weights, hist.Hist):
226
240
  if binning is not None:
@@ -230,9 +244,10 @@ class Histogram(Struct):
230
244
  msg = "not allowed to pass isdensity=True if constructing from hist.Hist instance"
231
245
  raise ValueError(msg)
232
246
 
233
- if weights.sum(flow=True) != weights.sum(flow=False):
234
- msg = "flow bins of hist.Hist cannot be represented"
235
- raise ValueError(msg)
247
+ if weights.sum(flow=True) != weights.sum(flow=False) and flow:
248
+ log.warning(
249
+ "flow bins of hist.Hist cannot be represented, their counts are discarded"
250
+ )
236
251
  weights_view = weights.view(flow=False)
237
252
  if type(weights_view) is not np.ndarray:
238
253
  msg = "only simple numpy-backed storages can be used in a hist.Hist"