legend-pydataobj 1.11.13__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
lgdo/lh5/store.py CHANGED
@@ -5,21 +5,20 @@ HDF5 files.
 
 from __future__ import annotations
 
-import bisect
 import logging
-import os
 import sys
 from collections import OrderedDict
 from collections.abc import Mapping, Sequence
 from inspect import signature
+from pathlib import Path
 from typing import Any
 
 import h5py
-import numpy as np
 from numpy.typing import ArrayLike
 
 from .. import types
 from . import _serializers, utils
+from .core import read
 
 log = logging.getLogger(__name__)
 
@@ -93,16 +92,16 @@ class LH5Store:
             return self.files[lh5_file]
 
         if self.base_path != "":
-            full_path = os.path.join(self.base_path, lh5_file)
+            full_path = Path(self.base_path) / lh5_file
         else:
-            full_path = lh5_file
+            full_path = Path(lh5_file)
 
-        file_exists = os.path.exists(full_path)
+        file_exists = full_path.exists()
 
         if mode != "r":
-            directory = os.path.dirname(full_path)
-            if directory != "" and not os.path.exists(directory):
+            directory = full_path.parent
+            if directory != "" and not full_path.parent.exists():
                 log.debug(f"making path {directory}")
-                os.makedirs(directory)
+                directory.mkdir(parents=True, exist_ok=True)
 
         if mode == "r" and not file_exists:
             msg = f"file {full_path} not found"
@@ -155,7 +154,7 @@ class LH5Store:
         """Returns an LH5 object appropriate for use as a pre-allocated buffer
        in a read loop. Sets size to `size` if object has a size.
        """
-        obj, n_rows = self.read(name, lh5_file, n_rows=0, field_mask=field_mask)
+        obj = self.read(name, lh5_file, n_rows=0, field_mask=field_mask)
         if hasattr(obj, "resize") and size is not None:
             obj.resize(new_size=size)
         return obj
@@ -182,72 +181,20 @@ class LH5Store:
         """
         # grab files from store
         if isinstance(lh5_file, (str, h5py.File)):
-            lh5_obj = self.gimme_file(lh5_file, "r", **file_kwargs)[name]
+            h5f = self.gimme_file(lh5_file, "r", **file_kwargs)
         else:
-            lh5_files = list(lh5_file)
-            n_rows_read = 0
-
-            for i, h5f in enumerate(lh5_files):
-                if (
-                    isinstance(idx, (list, tuple))
-                    and len(idx) > 0
-                    and not np.isscalar(idx[0])
-                ):
-                    # a list of lists: must be one per file
-                    idx_i = idx[i]
-                elif idx is not None:
-                    # make idx a proper tuple if it's not one already
-                    if not (isinstance(idx, tuple) and len(idx) == 1):
-                        idx = (idx,)
-                    # idx is a long continuous array
-                    n_rows_i = utils.read_n_rows(name, h5f)
-                    # find the length of the subset of idx that contains indices
-                    # that are less than n_rows_i
-                    n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i)
-                    # now split idx into idx_i and the remainder
-                    idx_i = np.array(idx[0])[:n_rows_to_read_i]
-                    idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i
-                else:
-                    idx_i = None
-                n_rows_i = n_rows - n_rows_read
-
-                obj_buf, n_rows_read_i = self.read(
-                    name,
-                    h5f,
-                    start_row,
-                    n_rows_i,
-                    idx_i,
-                    use_h5idx,
-                    field_mask,
-                    obj_buf,
-                    obj_buf_start,
-                    decompress,
-                )
-
-                n_rows_read += n_rows_read_i
-                if n_rows_read >= n_rows or obj_buf is None:
-                    return obj_buf, n_rows_read
-                start_row = 0
-                obj_buf_start += n_rows_read_i
-            return obj_buf, n_rows_read
-
-        if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]):
-            idx = idx[0]
-        if isinstance(idx, np.ndarray) and idx.dtype == np.dtype("?"):
-            idx = np.where(idx)[0]
-
-        return _serializers._h5_read_lgdo(
-            lh5_obj.id,
-            lh5_obj.file.filename,
-            lh5_obj.name,
-            start_row=start_row,
-            n_rows=n_rows,
-            idx=idx,
-            use_h5idx=use_h5idx,
-            field_mask=field_mask,
-            obj_buf=obj_buf,
-            obj_buf_start=obj_buf_start,
-            decompress=decompress,
+            h5f = [self.gimme_file(f, "r", **file_kwargs) for f in lh5_file]
+        return read(
+            name,
+            h5f,
+            start_row,
+            n_rows,
+            idx,
+            use_h5idx,
+            field_mask,
+            obj_buf,
+            obj_buf_start,
+            decompress,
         )
 
     def write(
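
With the multi-file loop deleted, `LH5Store.read` is now a thin wrapper that collects open file handles and defers to `lgdo.lh5.core.read`, returning the LGDO object directly rather than an `(obj, n_rows)` tuple (note how `get_buffer` above now unpacks a single value). A minimal sketch of the new call pattern; file and dataset names are illustrative:

```python
from lgdo.lh5 import LH5Store

store = LH5Store()

# 1.11.13: obj, n_rows = store.read(...)
# 1.12.0:  the object comes back alone; reading across several files is
# handled by lgdo.lh5.core.read behind the scenes
obj = store.read("geds/raw/energy", ["run0.lh5", "run1.lh5"])
print(len(obj))
```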
lgdo/lh5/tools.py CHANGED
@@ -1,16 +1,10 @@
 from __future__ import annotations
 
 import fnmatch
-import glob
 import logging
-import os
 from copy import copy
-from warnings import warn
 
 import h5py
-import numpy as np
-import pandas as pd
-from numpy.typing import NDArray
 
 from . import utils
 from .store import LH5Store
@@ -223,108 +217,3 @@ def show(
             break
 
         key = k_new
-
-
-def load_nda(
-    f_list: str | list[str],
-    par_list: list[str],
-    lh5_group: str = "",
-    idx_list: list[NDArray | list | tuple] | None = None,
-) -> dict[str, NDArray]:
-    r"""Build a dictionary of :class:`numpy.ndarray`\ s from LH5 data.
-
-    Given a list of files, a list of LH5 table parameters, and an optional
-    group path, return a NumPy array with all values for each parameter.
-
-    Parameters
-    ----------
-    f_list
-        A list of files. Can contain wildcards.
-    par_list
-        A list of parameters to read from each file.
-    lh5_group
-        group path within which to find the specified parameters.
-    idx_list
-        for fancy-indexed reads. Must be one index array for each file in
-        `f_list`.
-
-    Returns
-    -------
-    par_data
-        A dictionary of the parameter data keyed by the elements of `par_list`.
-        Each entry contains the data for the specified parameter concatenated
-        over all files in `f_list`.
-    """
-    warn(
-        "load_nda() is deprecated. "
-        "Please replace it with LH5Store.read(...).view_as('np'), "
-        "or just read_as(..., 'np'). "
-        "load_nda() will be removed in a future release.",
-        DeprecationWarning,
-        stacklevel=2,
-    )
-
-    if isinstance(f_list, str):
-        f_list = [f_list]
-        if idx_list is not None:
-            idx_list = [idx_list]
-    if idx_list is not None and len(f_list) != len(idx_list):
-        msg = f"f_list length ({len(f_list)}) != idx_list length ({len(idx_list)})!"
-        raise ValueError(msg)
-
-    # Expand wildcards
-    f_list = [f for f_wc in f_list for f in sorted(glob.glob(os.path.expandvars(f_wc)))]
-
-    sto = LH5Store()
-    par_data = {par: [] for par in par_list}
-    for ii, ff in enumerate(f_list):
-        f = sto.gimme_file(ff, "r")
-        for par in par_list:
-            if f"{lh5_group}/{par}" not in f:
-                msg = f"'{lh5_group}/{par}' not in file {ff}"
-                raise RuntimeError(msg)
-
-            if idx_list is None:
-                data, _ = sto.read(f"{lh5_group}/{par}", f)
-            else:
-                data, _ = sto.read(f"{lh5_group}/{par}", f, idx=idx_list[ii])
-            if not data:
-                continue
-            par_data[par].append(data.nda)
-    return {par: np.concatenate(par_data[par]) for par in par_list}
-
-
-def load_dfs(
-    f_list: str | list[str],
-    par_list: list[str],
-    lh5_group: str = "",
-    idx_list: list[NDArray | list | tuple] | None = None,
-) -> pd.DataFrame:
-    """Build a :class:`pandas.DataFrame` from LH5 data.
-
-    Given a list of files (can use wildcards), a list of LH5 columns, and
-    optionally the group path, return a :class:`pandas.DataFrame` with all
-    values for each parameter.
-
-    See Also
-    --------
-    :func:`load_nda`
-
-    Returns
-    -------
-    dataframe
-        contains columns for each parameter in `par_list`, and rows containing
-        all data for the associated parameters concatenated over all files in
-        `f_list`.
-    """
-    warn(
-        "load_dfs() is deprecated. "
-        "Please replace it with LH5Store.read(...).view_as('pd'), "
-        "or just read_as(..., 'pd'). "
-        "load_dfs() will be removed in a future release.",
-        DeprecationWarning,
-        stacklevel=2,
-    )
-    return pd.DataFrame(
-        load_nda(f_list, par_list, lh5_group=lh5_group, idx_list=idx_list)
-    )
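
The deleted `load_nda()` and `load_dfs()` were already emitting `DeprecationWarning`s that name their replacements, so the migration is mechanical. A sketch based on those warning messages; group and file names are illustrative:

```python
from lgdo import lh5

# was: load_nda(files, ["energy"], "geds/raw")
energy = lh5.read_as("geds/raw/energy", "run0.lh5", "np")  # numpy array

# was: load_dfs(files, ["energy", "timestamp"], "geds/raw")
df = lh5.read_as("geds/raw", "run0.lh5", "pd")  # pandas DataFrame
```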
lgdo/lh5/utils.py CHANGED
@@ -7,6 +7,7 @@ import logging
 import os
 import string
 from collections.abc import Mapping, Sequence
+from pathlib import Path
 from typing import Any
 
 import h5py
@@ -153,7 +154,7 @@ def expand_vars(expr: str, substitute: dict[str, str] | None = None) -> str:
 
     # use provided mapping
     # then expand env variables
-    return os.path.expandvars(string.Template(expr).safe_substitute(substitute))
+    return os.path.expandvars(string.Template(str(expr)).safe_substitute(substitute))
 
 
 def expand_path(
@@ -183,14 +184,15 @@ def expand_path(
         Unique absolute path, or list of all absolute paths
     """
     if base_path is not None and base_path != "":
-        base_path = os.path.expanduser(os.path.expandvars(base_path))
-        path = os.path.join(base_path, path)
+        base_path = Path(os.path.expandvars(base_path)).expanduser()
+        path = base_path / path
 
     # first expand variables
     _path = expand_vars(path, substitute)
 
     # then expand wildcards
-    paths = sorted(glob.glob(os.path.expanduser(_path)))
+    # pathlib glob works differently so use glob for now
+    paths = sorted(glob.glob(str(Path(_path).expanduser())))  # noqa: PTH207
 
     if base_path is not None and base_path != "":
         paths = [os.path.relpath(p, base_path) for p in paths]
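
The `str(expr)` cast means `expand_vars`, and therefore `expand_path`, now accepts `pathlib.Path` inputs instead of crashing inside `string.Template`. A small sketch, assuming the `list` keyword of `expand_path` behaves as in earlier releases:

```python
from pathlib import Path

from lgdo.lh5.utils import expand_path, expand_vars

# Path objects are now cast to str before variable substitution
p = expand_vars(Path("$DATADIR/cal.lh5"), substitute={"DATADIR": "/data"})

# wildcard expansion still goes through glob.glob, per the noqa comment above
files = expand_path("/data/*.lh5", list=True)
```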
lgdo/types/array.py CHANGED
@@ -6,7 +6,7 @@ corresponding utilities.
 from __future__ import annotations
 
 import logging
-from collections.abc import Iterator
+from collections.abc import Collection, Iterator
 from typing import Any
 
 import awkward as ak
@@ -17,12 +17,12 @@ import pint_pandas  # noqa: F401
 
 from .. import utils
 from ..units import default_units_registry as u
-from .lgdo import LGDO
+from .lgdo import LGDOCollection
 
 log = logging.getLogger(__name__)
 
 
-class Array(LGDO):
+class Array(LGDOCollection):
     r"""Holds an :class:`numpy.ndarray` and attributes.
 
     :class:`Array` (and the other various array types) holds an `nda` instead
@@ -78,11 +78,7 @@ class Array(LGDO):
         elif isinstance(nda, Array):
             nda = nda.nda
 
-        elif not isinstance(nda, np.ndarray):
-            nda = np.array(nda)
-
         self.nda = nda
-        self.dtype = self.nda.dtype
 
         super().__init__(attrs)
 
@@ -96,18 +92,91 @@ class Array(LGDO):
         return dt + "<" + nd + ">{" + et + "}"
 
     def __len__(self) -> int:
-        return len(self.nda)
-
-    def resize(self, new_size: int) -> None:
-        new_shape = (new_size,) + self.nda.shape[1:]
-        return self.nda.resize(new_shape, refcheck=True)
+        return self._size
+
+    @property
+    def nda(self):
+        return self._nda[: self._size, ...] if self._nda.shape != () else self._nda
+
+    @nda.setter
+    def nda(self, value):
+        self._nda = value if isinstance(value, np.ndarray) else np.array(value)
+        self._size = len(self._nda) if self._nda.shape != () else 0
+
+    @property
+    def dtype(self):
+        return self._nda.dtype
+
+    @property
+    def shape(self):
+        return (len(self),) + self._nda.shape[1:]
+
+    def reserve_capacity(self, capacity: int) -> None:
+        "Set size (number of rows) of internal memory buffer"
+        if capacity < len(self):
+            msg = "Cannot reduce capacity below Array length"
+            raise ValueError(msg)
+        self._nda.resize((capacity,) + self._nda.shape[1:], refcheck=False)
+
+    def get_capacity(self) -> int:
+        "Get capacity (i.e. max size before memory must be re-allocated)"
+        return len(self._nda)
+
+    def trim_capacity(self) -> None:
+        "Set capacity to be minimum needed to support Array size"
+        self.reserve_capacity(np.prod(self.shape))
+
+    def resize(self, new_size: int | Collection[int], trim=False) -> None:
+        """Set size of Array in rows. Only change capacity if it must be
+        increased to accommodate new rows; in this case double capacity.
+        If trim is True, capacity will be set to match size. If new_size
+        is an int, do not change size of inner dimensions.
+
+        If new_size is a collection, internal memory will be re-allocated, so
+        this should be done only rarely!"""
+
+        if isinstance(new_size, Collection):
+            self._size = new_size[0]
+            self._nda.resize(new_size)
+        else:
+            self._size = new_size
+
+            if trim and new_size != self.get_capacity:
+                self.reserve_capacity(new_size)
+
+            # If capacity is not big enough, set to next power of 2 big enough
+            if new_size > self.get_capacity():
+                self.reserve_capacity(int(2 ** (np.ceil(np.log2(new_size)))))
 
     def append(self, value: np.ndarray) -> None:
-        self.resize(len(self) + 1)
-        self.nda[-1] = value
+        "Append value to end of array (with copy)"
+        self.insert(len(self), value)
 
     def insert(self, i: int, value: int | float) -> None:
-        self.nda = np.insert(self.nda, i, value)
+        "Insert value into row i (with copy)"
+        if i > len(self):
+            msg = f"index {i} is out of bounds for array with size {len(self)}"
+            raise IndexError(msg)
+
+        value = np.array(value)
+        if value.shape == self.shape[1:]:
+            self.resize(len(self) + 1)
+            self[i + 1 :] = self[i:-1]
+            self[i] = value
+        elif value.shape[1:] == self.shape[1:]:
+            self.resize(len(self) + len(value))
+            self[i + len(value) :] = self[i : -len(value)]
+            self[i : i + len(value)] = value
+        else:
+            msg = f"Could not insert value with shape {value.shape} into Array with shape {self.shape}"
+            raise ValueError(msg)
+
+    def replace(self, i: int, value: int | float) -> None:
+        "Replace value at row i"
+        if i >= len(self):
+            msg = f"index {i} is out of bounds for array with size {len(self)}"
+            raise IndexError(msg)
+        self[i] = value
 
     def __getitem__(self, key):
         return self.nda[key]
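
`Array` now tracks a logical size (`_size`) separately from the capacity of its backing buffer (`_nda`), so repeated `append`s reallocate only when the buffer is full, growing it to the next power of two, instead of resizing on every row. A sketch of the new semantics, grounded in the hunk above:

```python
import numpy as np

from lgdo import Array

a = Array(np.array([1, 2, 3]))
a.append(4)                      # insert at the end; capacity grows 3 -> 4
print(len(a), a.get_capacity())  # 4 4
a.insert(1, 99)                  # shifts rows right by one, then writes row 1
a.replace(0, 42)                 # bounds-checked overwrite of row 0
a.resize(2, trim=True)           # shrink the logical size, drop spare capacity
print(a.nda)                     # [42 99]; nda is a view truncated to len(a)
```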
lgdo/types/encoded.py CHANGED
@@ -11,12 +11,12 @@ from numpy.typing import NDArray
 
 from .. import utils
 from .array import Array
-from .lgdo import LGDO
+from .lgdo import LGDOCollection
 from .scalar import Scalar
 from .vectorofvectors import VectorOfVectors
 
 
-class VectorOfEncodedVectors(LGDO):
+class VectorOfEncodedVectors(LGDOCollection):
     """An array of variable-length encoded arrays.
 
     Used to represent an encoded :class:`.VectorOfVectors`. In addition to an
@@ -92,6 +92,17 @@ class VectorOfEncodedVectors(LGDO):
 
         return False
 
+    def reserve_capacity(self, *capacity: int) -> None:
+        self.encoded_data.reserve_capacity(*capacity)
+        self.decoded_size.reserve_capacity(capacity[0])
+
+    def get_capacity(self) -> tuple:
+        return (self.decoded_size.get_capacity, *self.encoded_data.get_capacity())
+
+    def trim_capacity(self) -> None:
+        self.encoded_data.trim_capacity()
+        self.decoded_size.trim_capacity()
+
     def resize(self, new_size: int) -> None:
         """Resize vector along the first axis.
 
@@ -102,21 +113,6 @@ class VectorOfEncodedVectors(LGDO):
         self.encoded_data.resize(new_size)
         self.decoded_size.resize(new_size)
 
-    def append(self, value: tuple[NDArray, int]) -> None:
-        """Append a 1D encoded vector at the end.
-
-        Parameters
-        ----------
-        value
-            a tuple holding the encoded array and its decoded size.
-
-        See Also
-        --------
-        .VectorOfVectors.append
-        """
-        self.encoded_data.append(value[0])
-        self.decoded_size.append(value[1])
-
     def insert(self, i: int, value: tuple[NDArray, int]) -> None:
         """Insert an encoded vector at index `i`.
 
@@ -282,7 +278,7 @@ class VectorOfEncodedVectors(LGDO):
         raise ValueError(msg)
 
 
-class ArrayOfEncodedEqualSizedArrays(LGDO):
+class ArrayOfEncodedEqualSizedArrays(LGDOCollection):
     """An array of encoded arrays with equal decoded size.
 
     Used to represent an encoded :class:`.ArrayOfEqualSizedArrays`. In addition
@@ -349,14 +345,23 @@ class ArrayOfEncodedEqualSizedArrays(LGDO):
 
         return False
 
-    def resize(self, new_size: int) -> None:
+    def reserve_capacity(self, *capacity: int) -> None:
+        self.encoded_data.reserve_capacity(capacity)
+
+    def get_capacity(self) -> tuple:
+        return self.encoded_data.get_capacity()
+
+    def trim_capacity(self) -> None:
+        self.encoded_data.trim_capacity()
+
+    def resize(self, new_size: int, trim: bool = False) -> None:
         """Resize array along the first axis.
 
         See Also
         --------
         .VectorOfVectors.resize
         """
-        self.encoded_data.resize(new_size)
+        self.encoded_data.resize(new_size, trim)
 
     def append(self, value: NDArray) -> None:
         """Append a 1D encoded array at the end.
lgdo/types/histogram.py CHANGED
@@ -424,7 +424,7 @@ class Histogram(Struct):
             dict.__setitem__(self, name, obj)
         else:
             msg = "histogram fields cannot be mutated "
-            raise TypeError(msg)
+            raise AttributeError(msg)
 
     def __getattr__(self, name: str) -> None:
         # do not allow for new attributes on this
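
Switching from `TypeError` to `AttributeError` aligns the guard with Python's convention for read-only attributes, so generic attribute-handling code (e.g. `getattr` with a default, or a `try`/`except AttributeError`) behaves as expected. A toy reproduction of the guard, not the real `Histogram` class:

```python
class _FrozenFields:
    """Mimics the Histogram.__setattr__ guard from the hunk above."""

    def __setattr__(self, name, value):
        msg = "histogram fields cannot be mutated "
        raise AttributeError(msg)  # 1.11.13 raised TypeError here


h = _FrozenFields()
try:
    h.weights = None
except AttributeError:  # the conventional error type for read-only attributes
    pass
```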
lgdo/types/lgdo.py CHANGED
@@ -92,3 +92,53 @@ class LGDO(ABC):
 
     def __repr__(self) -> str:
         return self.__class__.__name__ + f"(attrs={self.attrs!r})"
+
+
+class LGDOCollection(LGDO):
+    """Abstract base class representing a LEGEND Collection Object (LGDO).
+    This defines the interface for classes used as table columns.
+    """
+
+    @abstractmethod
+    def __init__(self, attrs: dict[str, Any] | None = None) -> None:
+        super().__init__(attrs)
+
+    @abstractmethod
+    def __len__(self) -> int:
+        """Provides ``__len__`` for this array-like class."""
+
+    @abstractmethod
+    def reserve_capacity(self, capacity: int) -> None:
+        """Reserve capacity (in rows) for later use. Internal memory buffers
+        will have enough entries to store this many rows.
+        """
+
+    @abstractmethod
+    def get_capacity(self) -> int:
+        "get reserved capacity of internal memory buffers in rows"
+
+    @abstractmethod
+    def trim_capacity(self) -> None:
+        """set capacity to only what is required to store current contents
+        of LGDOCollection
+        """
+
+    @abstractmethod
+    def resize(self, new_size: int, trim: bool = False) -> None:
+        """Return this LGDO's datatype attribute string."""
+
+    def append(self, val) -> None:
+        "append val to end of LGDOCollection"
+        self.insert(len(self), val)
+
+    @abstractmethod
+    def insert(self, i: int, val) -> None:
+        "insert val into LGDOCollection at position i"
+
+    @abstractmethod
+    def replace(self, i: int, val) -> None:
+        "replace item at position i with val in LGDOCollection"
+
+    def clear(self, trim: bool = False) -> None:
+        "set size of LGDOCollection to zero"
+        self.resize(0, trim=trim)
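
`LGDOCollection` factors the table-column interface out of `LGDO`: subclasses must implement `insert`, `replace`, `resize`, and the capacity methods, while `append` and `clear` come for free as the defaults shown above. A quick sketch using the concrete `Array` implementation:

```python
import numpy as np

from lgdo import Array

col = Array(np.arange(5))
col.append(5)          # default append(): insert(len(col), 5)
col.clear()            # default clear(): resize(0, trim=False)
print(len(col), col.get_capacity())  # 0 8: buffer is kept for reuse
col.clear(trim=True)   # resize(0, trim=True) also releases the buffer
```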