legend-pydataobj 1.5.0a5__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {legend_pydataobj-1.5.0a5.dist-info → legend_pydataobj-1.6.0.dist-info}/METADATA +1 -1
  2. legend_pydataobj-1.6.0.dist-info/RECORD +54 -0
  3. {legend_pydataobj-1.5.0a5.dist-info → legend_pydataobj-1.6.0.dist-info}/WHEEL +1 -1
  4. {legend_pydataobj-1.5.0a5.dist-info → legend_pydataobj-1.6.0.dist-info}/entry_points.txt +1 -0
  5. lgdo/__init__.py +7 -4
  6. lgdo/_version.py +2 -2
  7. lgdo/cli.py +237 -12
  8. lgdo/compression/__init__.py +1 -0
  9. lgdo/lh5/__init__.py +9 -1
  10. lgdo/lh5/_serializers/__init__.py +43 -0
  11. lgdo/lh5/_serializers/read/__init__.py +0 -0
  12. lgdo/lh5/_serializers/read/array.py +34 -0
  13. lgdo/lh5/_serializers/read/composite.py +405 -0
  14. lgdo/lh5/_serializers/read/encoded.py +129 -0
  15. lgdo/lh5/_serializers/read/ndarray.py +104 -0
  16. lgdo/lh5/_serializers/read/scalar.py +34 -0
  17. lgdo/lh5/_serializers/read/utils.py +12 -0
  18. lgdo/lh5/_serializers/read/vector_of_vectors.py +195 -0
  19. lgdo/lh5/_serializers/write/__init__.py +0 -0
  20. lgdo/lh5/_serializers/write/array.py +92 -0
  21. lgdo/lh5/_serializers/write/composite.py +259 -0
  22. lgdo/lh5/_serializers/write/scalar.py +23 -0
  23. lgdo/lh5/_serializers/write/vector_of_vectors.py +95 -0
  24. lgdo/lh5/core.py +272 -0
  25. lgdo/lh5/datatype.py +46 -0
  26. lgdo/lh5/exceptions.py +34 -0
  27. lgdo/lh5/iterator.py +1 -1
  28. lgdo/lh5/store.py +69 -1160
  29. lgdo/lh5/tools.py +27 -53
  30. lgdo/lh5/utils.py +130 -27
  31. lgdo/lh5_store.py +59 -2
  32. lgdo/logging.py +4 -3
  33. lgdo/types/__init__.py +1 -0
  34. lgdo/types/array.py +3 -0
  35. lgdo/types/arrayofequalsizedarrays.py +1 -0
  36. lgdo/types/encoded.py +3 -8
  37. lgdo/types/fixedsizearray.py +1 -0
  38. lgdo/types/struct.py +1 -0
  39. lgdo/types/table.py +69 -26
  40. lgdo/types/vectorofvectors.py +314 -458
  41. lgdo/types/vovutils.py +320 -0
  42. lgdo/types/waveformtable.py +1 -0
  43. lgdo/utils.py +1 -32
  44. legend_pydataobj-1.5.0a5.dist-info/RECORD +0 -36
  45. {legend_pydataobj-1.5.0a5.dist-info → legend_pydataobj-1.6.0.dist-info}/LICENSE +0 -0
  46. {legend_pydataobj-1.5.0a5.dist-info → legend_pydataobj-1.6.0.dist-info}/top_level.txt +0 -0
lgdo/lh5/core.py ADDED
@@ -0,0 +1,272 @@
+ from __future__ import annotations
+
+ import inspect
+ import sys
+ from collections.abc import Mapping, Sequence
+ from typing import Any
+
+ import h5py
+ from numpy.typing import ArrayLike
+
+ from .. import types
+ from . import _serializers
+
+
+ def read(
+     name: str,
+     lh5_file: str | h5py.File | Sequence[str | h5py.File],
+     start_row: int = 0,
+     n_rows: int = sys.maxsize,
+     idx: ArrayLike = None,
+     use_h5idx: bool = False,
+     field_mask: Mapping[str, bool] | Sequence[str] | None = None,
+     obj_buf: types.LGDO = None,
+     obj_buf_start: int = 0,
+     decompress: bool = True,
+ ) -> types.LGDO | tuple[types.LGDO, int]:
+     """Read LH5 object data from a file.
+
+     Note
+     ----
+     Use the ``idx`` parameter to read out particular rows of the data. The
+     ``use_h5idx`` flag controls whether *only* those rows are read from
+     disk or if the rows are indexed after reading the entire object.
+     Reading individual rows can be orders of magnitude slower than reading
+     the whole object and then indexing the desired rows. The default
+     behavior (``use_h5idx=False``) is to use slightly more memory for a
+     much faster read. See `legend-pydataobj/issues/#29
+     <https://github.com/legend-exp/legend-pydataobj/issues/29>`_ for
+     additional information.
+
+     Parameters
+     ----------
+     name
+         Name of the LH5 object to be read (including its group path).
+     lh5_file
+         The file(s) containing the object to be read out. If a list of
+         files, array-like object data will be concatenated into the output
+         object.
+     start_row
+         Starting entry for the object read (for array-like objects). For a
+         list of files, only applies to the first file.
+     n_rows
+         The maximum number of rows to read (for array-like objects). The
+         actual number of rows read will be returned as one of the return
+         values (see below).
+     idx
+         For NumPy-style "fancy indexing" for the read to select only some
+         rows, e.g. after applying some cuts to particular columns. Only
+         selection along the first axis is supported, so tuple arguments
+         must be one-tuples. If `n_rows` is not false, `idx` will be
+         truncated to `n_rows` before reading. To use with a list of files,
+         can pass in a list of `idx`'s (one for each file) or use a long
+         contiguous list (e.g. built from a previous identical read). If
+         used in conjunction with `start_row` and `n_rows`, will be sliced
+         to obey those constraints, where `n_rows` is interpreted as the
+         (max) number of *selected* values (in `idx`) to be read out. Note
+         that the ``use_h5idx`` parameter controls some behavior of the
+         read and that the default behavior (``use_h5idx=False``)
+         prioritizes speed over a small memory penalty.
+     use_h5idx
+         ``True`` will directly pass the ``idx`` parameter to the underlying
+         :mod:`h5py` call such that only the selected rows are read directly
+         into memory, which conserves memory at the cost of speed. There can
+         be a significant penalty to speed for larger files (1-2 orders of
+         magnitude longer read time). ``False`` (default) will read the
+         entire object into memory before performing the indexing. The
+         default is much faster but requires additional memory, though a
+         relatively small amount in the typical use case. It is recommended
+         to leave this parameter at its default.
+     field_mask
+         For tables and structs, determines which fields get read out.
+         Only applies to immediate fields of the requested objects. If a
+         dict is used, a default dict will be made with the default set to
+         the opposite of the first element in the dict. This way, if one
+         specifies a few fields as ``False``, all but those fields will be
+         read out, while if one specifies just a few fields as ``True``,
+         only those fields will be read out. If a list is provided, the
+         listed fields will be set to ``True``, while the rest will default
+         to ``False``.
+     obj_buf
+         Read directly into memory provided in `obj_buf`. Note: the buffer
+         will be expanded to accommodate the data requested. To maintain the
+         buffer length, send in ``n_rows = len(obj_buf)``.
+     obj_buf_start
+         Start location in ``obj_buf`` for read. For concatenating data to
+         array-like objects.
+     decompress
+         Decompress data encoded with LGDO's compression routines right
+         after reading. The option has no effect on data encoded with HDF5
+         built-in filters, which is always decompressed upstream by HDF5.
+
+     Returns
+     -------
+     (object, n_rows_read)
+         `object` is the read-out object and `n_rows_read` is the number of
+         rows successfully read out. Essential for arrays when the amount
+         of data is smaller than the object buffer. For scalars and structs
+         `n_rows_read` will be ``1``. For tables it is redundant with
+         ``table.loc``. If `obj_buf` is ``None``, only `object` is
+         returned.
+     """
+     obj, n_rows_read = _serializers._h5_read_lgdo(
+         name,
+         lh5_file,
+         start_row=start_row,
+         n_rows=n_rows,
+         idx=idx,
+         use_h5idx=use_h5idx,
+         field_mask=field_mask,
+         obj_buf=obj_buf,
+         obj_buf_start=obj_buf_start,
+         decompress=decompress,
+     )
+
+     return obj if obj_buf is None else (obj, n_rows_read)
+
+
+ def write(
+     obj: types.LGDO,
+     name: str,
+     lh5_file: str | h5py.File,
+     group: str | h5py.Group = "/",
+     start_row: int = 0,
+     n_rows: int | None = None,
+     wo_mode: str = "append",
+     write_start: int = 0,
+     **h5py_kwargs,
+ ) -> None:
+     """Write an LGDO into an LH5 file.
+
+     If the `obj` :class:`.LGDO` has a `compression` attribute, its value is
+     interpreted as the algorithm to be used to compress `obj` before
+     writing to disk. The type of `compression` can be:
+
+     string, kwargs dictionary, hdf5plugin filter
+         interpreted as the name of a built-in or custom `HDF5 compression
+         filter <https://docs.h5py.org/en/stable/high/dataset.html#filter-pipeline>`_
+         (``"gzip"``, ``"lzf"``, :mod:`hdf5plugin` filter object, etc.) and
+         passed directly to :meth:`h5py.Group.create_dataset`.
+
+     :class:`.WaveformCodec` object
+         If `obj` is a :class:`.WaveformTable` and ``obj.values`` holds the
+         attribute, compress ``values`` using this algorithm. More
+         documentation about the supported waveform compression algorithms
+         can be found in :mod:`.lgdo.compression`.
+
+     If the `obj` :class:`.LGDO` has a `hdf5_settings` attribute holding a
+     dictionary, it is interpreted as keyword arguments to be forwarded
+     directly to :meth:`h5py.Group.create_dataset` (exactly like the first
+     format of `compression` above). This is the preferred way to specify
+     HDF5 dataset options such as chunking. If compression options are
+     specified this way, they take precedence over those set with the
+     `compression` attribute.
+
+     Note
+     ----
+     The `compression` LGDO attribute takes precedence over the default HDF5
+     compression settings. The `hdf5_settings` attribute takes precedence
+     over `compression`. These attributes are not written to disk.
+
+     Note
+     ----
+     HDF5 compression is skipped for the `encoded_data.flattened_data`
+     dataset of :class:`.VectorOfEncodedVectors` and
+     :class:`.ArrayOfEncodedEqualSizedArrays`.
+
+     Parameters
+     ----------
+     obj
+         LH5 object. If `obj` is array-like, writes `n_rows` starting from
+         `start_row` in `obj`.
+     name
+         name of the object in the output HDF5 file.
+     lh5_file
+         HDF5 file name or :class:`h5py.File` object.
+     group
+         HDF5 group name or :class:`h5py.Group` object in which `obj` should
+         be written.
+     start_row
+         first row in `obj` to be written.
+     n_rows
+         number of rows in `obj` to be written.
+     wo_mode
+         - ``write_safe`` or ``w``: only proceed with writing if the
+           object does not already exist in the file.
+         - ``append`` or ``a``: append along axis 0 (the first dimension)
+           of array-like objects and array-like subfields of structs.
+           :class:`~.lgdo.scalar.Scalar` objects get overwritten.
+         - ``overwrite`` or ``o``: replace data in the file if present,
+           starting from `write_start`. Note: overwriting with `write_start`
+           equal to the end of the array is the same as ``append``.
+         - ``overwrite_file`` or ``of``: delete the file if present prior to
+           writing to it. `write_start` should be 0 (it's ignored).
+         - ``append_column`` or ``ac``: append columns from an
+           :class:`~.lgdo.table.Table` `obj` only if there is an existing
+           :class:`~.lgdo.table.Table` in the `lh5_file` with the same
+           `name` and :class:`~.lgdo.table.Table.size`. If the sizes don't
+           match, or if there are matching fields, it errors out.
+     write_start
+         row in the output file (if already existing) to start overwriting
+         from.
+     **h5py_kwargs
+         additional keyword arguments forwarded to
+         :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5
+         compression filter to be applied before writing non-scalar
+         datasets. **Note: ignored if compression is specified as an `obj`
+         attribute.**
+     """
+     return _serializers._h5_write_lgdo(
+         obj,
+         name,
+         lh5_file,
+         group=group,
+         start_row=start_row,
+         n_rows=n_rows,
+         wo_mode=wo_mode,
+         write_start=write_start,
+         **h5py_kwargs,
+     )
+
+
+ def read_as(
+     name: str,
+     lh5_file: str | h5py.File | Sequence[str | h5py.File],
+     library: str,
+     **kwargs,
+ ) -> Any:
+     """Read LH5 data from disk straight into a third-party data format view.
+
+     This function is nothing more than a shortcut chained call to
+     :func:`.read` and to :meth:`.LGDO.view_as`.
+
+     Parameters
+     ----------
+     name
+         LH5 object name on disk.
+     lh5_file
+         LH5 file name.
+     library
+         string ID of the third-party data format library (``np``, ``pd``,
+         ``ak``, etc.).
+
+     See Also
+     --------
+     .read, .LGDO.view_as
+     """
+     # determine which keyword arguments should be forwarded to read() and
+     # which should be forwarded to view_as()
+     read_kwargs = inspect.signature(read).parameters.keys()
+
+     kwargs1 = {}
+     kwargs2 = {}
+     for k, v in kwargs.items():
+         if k in read_kwargs:
+             kwargs1[k] = v
+         else:
+             kwargs2[k] = v
+
+     # read the LGDO from disk
+     # NOTE: providing a buffer does not make much sense here
+     obj = read(name, lh5_file, **kwargs1)
+
+     # and finally return a view
+     return obj.view_as(library, **kwargs2)
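
The read() docstring above packs a lot of behavior into field_mask and idx. A minimal usage sketch of the new module-level API follows; the file, group, and column names are hypothetical placeholders, not part of the package:

import lgdo.lh5 as lh5

# Read two columns of a table, selecting rows with NumPy-style fancy
# indexing (selection is along the first axis only).
tbl = lh5.read(
    "geds/raw",                          # object name, including group path
    "data.lh5",                          # hypothetical input file
    field_mask=["energy", "timestamp"],  # listed fields read, rest skipped
    idx=[0, 5, 42],                      # rows indexed after the full read,
)                                        # since use_h5idx=False by default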
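Likewise for write(), a short sketch of two wo_mode options; the dataset and file names are again made up for illustration:

import numpy as np
from lgdo import Array, lh5

# start from a fresh file, then append more rows to the same dataset
arr = Array(np.arange(10))
lh5.write(arr, "my_array", "scratch.lh5", wo_mode="overwrite_file")
lh5.write(Array(np.arange(10, 20)), "my_array", "scratch.lh5", wo_mode="append")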
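And read_as(), which, as the code above shows, splits its keyword arguments between read() and view_as() by inspecting read()'s signature (names below are hypothetical):

import lgdo.lh5 as lh5

# "pd" requests a pandas DataFrame view of the table
df = lh5.read_as(
    "geds/raw",
    "data.lh5",
    "pd",
    field_mask=["energy"],  # recognized as a read() kwarg and forwarded there
)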
lgdo/lh5/datatype.py ADDED
@@ -0,0 +1,46 @@
+ from __future__ import annotations
+
+ import re
+ from collections import OrderedDict
+
+ from .. import types as lgdo
+
+ _lgdo_datatype_map: dict[type, str] = OrderedDict(
+     [
+         (lgdo.Scalar, r"^real$|^bool$|^complex$|^string$"),
+         (lgdo.VectorOfVectors, r"^array<1>\{array<1>\{.+\}\}$"),
+         (lgdo.VectorOfEncodedVectors, r"^array<1>\{encoded_array<1>\{.+\}\}$"),
+         (
+             lgdo.ArrayOfEncodedEqualSizedArrays,
+             r"^array_of_encoded_equalsized_arrays<1,1>\{.+\}$",
+         ),
+         (lgdo.Struct, r"^struct\{.*\}$"),
+         (lgdo.Table, r"^table\{.*\}$"),
+         (lgdo.FixedSizeArray, r"^fixedsize_array<1>\{.+\}$"),
+         (lgdo.ArrayOfEqualSizedArrays, r"^array_of_equalsized_arrays<1,1>\{.+\}$"),
+         (lgdo.Array, r"^array<1>\{.+\}$"),
+     ]
+ )
+ """Mapping between LGDO types and regular expressions defining the corresponding datatype strings."""
+
+
+ def datatype(expr: str) -> type:
+     """Return the LGDO type corresponding to a datatype string."""
+     expr = expr.strip()
+     for type_, regex in _lgdo_datatype_map.items():
+         if re.search(regex, expr):
+             return type_
+
+     msg = f"unknown datatype '{expr}'"
+     raise RuntimeError(msg)
+
+
+ def get_nested_datatype_string(expr: str) -> str:
+     """Matches the content of the outermost curly brackets."""
+     return re.search(r"\{(.*)\}$", expr).group(1)
+
+
+ def get_struct_fields(expr: str) -> list[str]:
+     """Returns the list of :class:`~.lgdo.types.struct.Struct` fields, given its datatype string."""
+     fields = get_nested_datatype_string(expr).split(",")
+     return [] if fields == [""] else fields
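
A quick sketch of how these helpers behave, assuming the module lands at lgdo.lh5.datatype as the file path suggests. Note that the OrderedDict matching order matters: the VectorOfVectors pattern is tried before the plain Array one, so nested datatype strings resolve correctly.

from lgdo.lh5.datatype import datatype, get_struct_fields

assert datatype("array<1>{real}").__name__ == "Array"
assert datatype("array<1>{array<1>{real}}").__name__ == "VectorOfVectors"
assert get_struct_fields("struct{energy,timestamp}") == ["energy", "timestamp"]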
lgdo/lh5/exceptions.py ADDED
@@ -0,0 +1,34 @@
+ from __future__ import annotations
+
+ import h5py
+
+
+ class LH5DecodeError(Exception):
+     def __init__(self, message: str, file: str | h5py.File, obj: str) -> None:
+         super().__init__(message)
+
+         self.file = file.filename if isinstance(file, h5py.File) else file
+         self.obj = obj
+
+     def __str__(self) -> str:
+         return (
+             f"while reading object '{self.obj}' in file {self.file}: "
+             + super().__str__()
+         )
+
+
+ class LH5EncodeError(Exception):
+     def __init__(
+         self, message: str, file: str | h5py.File, group: str | h5py.Group, name: str
+     ) -> None:
+         super().__init__(message)
+
+         self.file = file.filename if isinstance(file, h5py.File) else file
+         self.group = (group.name if isinstance(group, h5py.Group) else group).rstrip("/")
+         self.name = name.lstrip("/")
+
+     def __str__(self) -> str:
+         return (
+             f"while writing object {self.group}/{self.name} to file {self.file}: "
+             + super().__str__()
+         )
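
A sketch of what these exception types buy the caller, assuming the read path raises LH5DecodeError on failure (the file and object names below are placeholders):

from lgdo import lh5
from lgdo.lh5.exceptions import LH5DecodeError

try:
    obj = lh5.read("not/there", "data.lh5")
except LH5DecodeError as e:
    # the message includes the object and file, e.g.
    # "while reading object 'not/there' in file data.lh5: ..."
    print(e)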
lgdo/lh5/iterator.py CHANGED
@@ -73,7 +73,7 @@ class LH5Iterator(typing.Iterator):
      buffer_len
          number of entries to read at a time while iterating through files.
      friend
-         a ''friend'' LH5Iterator that will be read in parallel with this.
+         a "friend" LH5Iterator that will be read in parallel with this.
          The friend should have the same length and entry list. A single
          LH5 table containing columns from both iterators will be returned.
      """