asyncmd 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,20 +12,36 @@
12
12
  #
13
13
  # You should have received a copy of the GNU General Public License
14
14
  # along with asyncmd. If not, see <https://www.gnu.org/licenses/>.
15
- import io
16
- import os
17
- import copy
18
- import typing
15
+ """
16
+ This module contains the implementation of the asyncmd.Trajectory class.
17
+
18
+ It also contains some helper functions related to the global Trajectory registry
19
+ used for trajectory function value caching.
20
+ The actual :class:`TrajectoryFunctionValueCache` classes can be found in the
21
+ ``trajectory_cache`` module.
22
+ """
19
23
  import asyncio
24
+ import collections
25
+ import dataclasses
20
26
  import hashlib
27
+ import io
21
28
  import logging
22
- import zipfile
23
- import collections
24
- import numpy as np
25
- import MDAnalysis as mda
29
+ import os
30
+ import typing
26
31
 
32
+ import MDAnalysis as mda
33
+ import numpy as np
27
34
 
28
35
  from .._config import _GLOBALS
36
+ from .trajectory_cache import (TrajectoryFunctionValueCache,
37
+ TrajectoryFunctionValueCacheInH5PY,
38
+ TrajectoryFunctionValueCacheInMemory,
39
+ TrajectoryFunctionValueCacheInNPZ,
40
+ ValuesAlreadyStoredError)
41
+
42
+ if typing.TYPE_CHECKING: # pragma: no cover
43
+ # only import for typing to avoid circular imports
44
+ from .functionwrapper import TrajectoryFunctionWrapper
29
45
 
30
46
 
31
47
  logger = logging.getLogger(__name__)
@@ -34,20 +50,58 @@ logger = logging.getLogger(__name__)
34
50
  # dictionary in which we keep track of trajectory objects
35
51
  # we use it to always return the *same* object for the same trajectory (by hash)
36
52
  # this makes it easy to ensure that we never calculate CV functions twice
37
- _TRAJECTORIES_BY_HASH = {}
53
+ _TRAJECTORIES_BY_HASH: dict[int, "Trajectory"] = {}
54
+
55
+
56
+ def clear_all_cache_values_for_all_trajectories() -> None:
57
+ """
58
+ Clear all function values cached for each :class:`Trajectory` currently in existence.
59
+
60
+ For file-based caches, this also removes the associated cache files.
61
+ """
62
+ for traj in _TRAJECTORIES_BY_HASH.values():
63
+ traj.clear_all_cache_values()
64
+
65
+
66
+ def _update_cache_type_for_all_trajectories(copy_content: bool = True,
67
+ clear_old_cache: bool = False,
68
+ ) -> None:
69
+ """
70
+ Update the cache type for each :class:`Trajectory` currently in existence.
71
+
72
+ By default the content of the current caches is copied to the new caches.
73
+ This will only have an effect if the globally set ``cache_type`` differs
74
+ from what each `Trajectory` currently uses.
75
+ See :func:`asyncmd.config.set_trajectory_cache_type` to set the ``cache_type``.
76
+ To clear the old/previously set caches (after copying their values), pass
77
+ ``clear_old_cache=True``.
78
+
79
+ Parameters
80
+ ----------
81
+ copy_content : bool, optional
82
+ Whether to copy the current cache content to the new cache,
83
+ by default True
84
+ clear_old_cache : bool, optional
85
+ Whether to clear the old/previously set cache, by default False.
86
+ """
87
+ for traj in _TRAJECTORIES_BY_HASH.values():
88
+ traj.update_cache_type(copy_content=copy_content,
89
+ clear_old_cache=clear_old_cache,
90
+ )
38
91
 
39
92
 
40
93
  def _forget_all_trajectories() -> None:
41
94
  """
42
95
  Forget about the existence of all :class:`Trajectory` objects.
43
96
 
44
- This will result in new :class:`Trajectory` objects beeing created even for
45
- the same underlying trajectory_files. Usualy you do not want this as it
46
- results in unecessary calculations if the same wrapped and cached function
97
+ This will result in new :class:`Trajectory` objects being created even for
98
+ the same underlying trajectory_files. Usually you do not want this as it
99
+ results in unnecessary calculations if the same wrapped and cached function
47
100
  is applied to both objects. This function exists as a hidden function as it
48
101
  is used in the tests and it might be helpful under certain circumstances.
49
102
  Use only if you know why you are using it!
50
103
  """
104
+ # pylint: disable-next=global-variable-not-assigned
51
105
  global _TRAJECTORIES_BY_HASH
52
106
  all_keys = set(_TRAJECTORIES_BY_HASH.keys())
53
107
  for key in all_keys:
@@ -58,9 +112,9 @@ def _forget_trajectory(traj_hash: int) -> None:
58
112
  """
59
113
  Forget about the existence of a given :class:`Trajectory` object.
60
114
 
61
- This will result in new :class:`Trajectory` objects beeing created even for
62
- the same underlying trajectory_files. Usualy you do not want this as it
63
- results in unecessary calculations if the same wrapped and cached function
115
+ This will result in new :class:`Trajectory` objects being created even for
116
+ the same underlying trajectory_files. Usually you do not want this as it
117
+ results in unnecessary calculations if the same wrapped and cached function
64
118
  is applied to both objects. This function exists as a hidden function as it
65
119
  is used when deleting a :class:`Trajectory` (i.e. calling its `__del__`
66
120
  method) and it might be helpful under certain circumstances. Use only if
@@ -71,6 +125,7 @@ def _forget_trajectory(traj_hash: int) -> None:
71
125
  traj_hash : int
72
126
  The hash of the :class:`Trajectory` to forget about.
73
127
  """
128
+ # pylint: disable-next=global-variable-not-assigned
74
129
  global _TRAJECTORIES_BY_HASH
75
130
  try:
76
131
  del _TRAJECTORIES_BY_HASH[traj_hash]
@@ -79,35 +134,89 @@ def _forget_trajectory(traj_hash: int) -> None:
79
134
  pass
80
135
 
81
136
 
137
+ @dataclasses.dataclass(frozen=True)
138
+ class _TrajectoryPropertyData:
139
+ """
140
+ Dataclass to store/bundle all information that is read from the trajectory
141
+ and made available as :class:`Trajectory` properties.
142
+
143
+ All data are immutable (we use ``frozen=True``), because the data are read
144
+ from the underlying trajectory file(s) only once and if they change the hash
145
+ (i.e. the :class:`Trajectory` object the data is tied to) will also change.
146
+ """
147
+ length: int
148
+ dt: float
149
+ first_time: float
150
+ last_time: float
151
+ first_step: int | None
152
+ last_step: int | None
153
+
154
+
155
+ @dataclasses.dataclass(frozen=True)
156
+ class _TrajectoryFileData:
157
+ """
158
+ Dataclass to store/bundle all information related to the file-paths and
159
+ trajectory hash for :class:`Trajectory` objects.
160
+
161
+ All of this is set in :meth:`Trajectory.__new__` and must not be overridden
162
+ or set again in :meth:`Trajectory.__init__`!
163
+ """
164
+ trajectory_files: list[str]
165
+ structure_file: str
166
+ workdir: str
167
+ trajectory_hash: int
168
+
169
+
82
170
  class Trajectory:
83
171
  """
84
172
  Represent a trajectory.
85
173
 
86
174
  Keep track of the paths of the trajectory and the structure files.
87
175
  Caches values for (wrapped) functions acting on the trajectory.
88
- Supports pickling and unpickling with the cached values restored, the
89
- values will be written to a hidden numpy npz file next to the trajectory.
176
+ Supports pickling and unpickling with the cached values restored, if a
177
+ non-persistent cache is used when pickling, the values will be written to a
178
+ hidden numpy npz file next to the trajectory and will be read at unpickling.
90
179
  Supports equality checks with other :class:`Trajectory`.
91
180
  Also makes available (and caches) a number of useful attributes, e.g.
92
- ``first_step`` and ``last_step`` (the first and last intergation step in
93
- the trajectory), ``dt``, ``first_time``, ``last_time``,
94
- ``length`` (in frames) and ``nstout``.
181
+ ``first_step`` and ``last_step`` (the first and last integration step in
182
+ the trajectory), ``dt``, ``first_time``, ``last_time``, and ``length`` (in
183
+ frames). All properties are read-only (for the simple reason that they
184
+ depend only on the underlying trajectory files).
185
+ A special case is ``nstout``, the output frequency in integration steps.
186
+ Since it can not be reliably read/inferred from the trajectory files alone,
187
+ it can be set by the user (at initialization or later via the property).
95
188
 
96
189
  Notes
97
190
  -----
98
191
  ``first_step`` and ``last_step`` is only useful for trajectories that come
99
192
  directly from a :class:`asyncmd.mdengine.MDEngine`.
100
- As soon as the trajecory has been concatenated using MDAnalysis (e.g. with
193
+ As soon as the trajectory has been concatenated using MDAnalysis (e.g. with
101
194
  the ``TrajectoryConcatenator``) the step information is just the frame
102
195
  number in the trajectory part that became first/last frame in the
103
196
  concatenated trajectory.
104
197
  """
105
198
 
106
- def __init__(self, trajectory_files: typing.Union[list[str], str],
107
- structure_file: str,
108
- nstout: typing.Optional[int] = None,
109
- cache_type: typing.Optional[str] = None,
110
- **kwargs):
199
+ _CACHE_CLASS_FOR_TYPE: dict[str, type[TrajectoryFunctionValueCache]] = {
200
+ "h5py": TrajectoryFunctionValueCacheInH5PY,
201
+ "npz": TrajectoryFunctionValueCacheInNPZ,
202
+ "memory": TrajectoryFunctionValueCacheInMemory,
203
+ }
204
+ _file_data: _TrajectoryFileData # type annotation for stuff we set in __new__
205
+
206
+ # Note: We want __init__ and __new__ to have the same call signature
207
+ # (at least for users, __new__ takes `old_workdir`...).
208
+ # So we will have unused arguments in __init__ (for the stuff we set
209
+ # in __new__) and we will have unused arguments in __new__ (for the
210
+ # stuff we set in __init__).
211
+ # The __new__/__init__ implementation is needed to get the global
212
+ # trajectory registry to work (to make each traj unique for the same
213
+ # hash), but pylint can not know that, so we silence its unused-argument warnings.
214
+ def __init__(
215
+ self,
216
+ # pylint: disable-next=unused-argument
217
+ trajectory_files: list[str] | str, structure_file: str,
218
+ nstout: int | None = None,
219
+ ) -> None:
111
220
  """
112
221
  Initialize a :class:`Trajectory`.
113
222
 
@@ -121,12 +230,6 @@ class Trajectory:
121
230
  nstout : int or None, optional
122
231
  The output frequency used when creating the trajectory,
123
232
  by default None
124
- cache_type : str or None, optional
125
- The cache type for the CV values cached for this trajectory,
126
- must be one of 'h5py', 'npz' or 'memory'.
127
- If None we will use 'h5py' if a h5py cache has been registered and
128
- if not fallback to 'npz'.
129
- See also the ``asyncmd.config.register_h5py_cache()`` function.
130
233
 
131
234
  Raises
132
235
  ------
@@ -134,65 +237,35 @@ class Trajectory:
134
237
  If the ``trajectory_files`` or the ``structure_file`` are not
135
238
  accessible.
136
239
  """
137
- # NOTE: we assume tra = trr and struct = tpr
138
- # but we also expect that anything which works for mdanalysis as
139
- # tra and struct should also work here as tra and struct
140
- # TODO: currently we do not use kwargs?!
141
- #dval = object()
142
- #for kwarg, value in kwargs.items():
143
- # cval = getattr(self, kwarg, dval)
144
- # if cval is not dval:
145
- # if isinstance(value, type(cval)):
146
- # # value is of same type as default so set it
147
- # setattr(self, kwarg, value)
148
- # else:
149
- # logger.warn(f"Setting attribute {kwarg} with "
150
- # + f"mismatching type ({type(value)}). "
151
- # + f" Default type is {type(cval)}."
152
- # )
153
- # else:
154
- # # not previously defined, so warn that we ignore it
155
- # logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
156
- # NOTE: self._trajectory_files is set in __new__ because we otherwise
157
- # would sanitize the files twice, but we need to check in __new__
158
- # to make pickling work
159
- # self._structure_file is also set in __new__ together with the
160
- # trajectory_files as we also sanitize its path
161
- # self._traj_hash and self._workdir are also set by __new__!
162
- # self._trajectory_files
163
- # self._structure_file
164
- # self._workdir
165
- # self._traj_hash
240
+ # NOTE: We expect that anything which works for mdanalysis as
241
+ # traj and struct should also work here as traj and struct
242
+ # NOTE: self._file_data is set in __new__ because we otherwise would:
243
+ # - calculate the hash twice (need it in __new__),
244
+ # - sanitize the files twice, but we need to check in __new__
245
+ # to make pickling work
246
+ # The _TrajectoryFileData dataclass therefore contains everything
247
+ # (and only those things) we need in __new__
248
+ # self._file_data
166
249
  # properties
167
250
  self.nstout = nstout # use the setter to make basic sanity checks
168
- self._len = None
169
- self._first_step = None
170
- self._last_step = None
171
- self._dt = None
172
- self._first_time = None
173
- self._last_time = None
174
- # stuff for caching of functions applied to this traj
175
- self._memory_cache = None
176
- self._npz_cache = None
177
- self._h5py_cache = None
178
- self._cache_type = None
179
- # remember if we use the global default value,
180
- # if yes we use the (possibly changed) global default when unpickling
181
- self._using_default_cache_type = True
182
- # use our property logic for checking the value
183
- # (Note that self._trajectory_hash has already been set by __new__)
184
- self.cache_type = cache_type
251
+ # store for all (immutable) properties we read from the trajectory files
252
+ self._property_data: None | _TrajectoryPropertyData = None
253
+ # setup cache for functions applied to this traj
254
+ self._cache = self._setup_cache()
185
255
  # Locking mechanism such that only one application of a specific
186
256
  # CV func can run at any given time on this trajectory
187
- self._semaphores_by_func_id = collections.defaultdict(
188
- asyncio.BoundedSemaphore
189
- )
190
-
191
- def __new__(cls, trajectory_files: typing.Union[list[str], str],
192
- structure_file: str,
193
- nstout: typing.Optional[int] = None,
194
- cache_type: typing.Optional[str] = None,
195
- **kwargs):
257
+ self._semaphores_by_func_id: collections.defaultdict[
258
+ str,
259
+ asyncio.BoundedSemaphore,
260
+ ] = collections.defaultdict(asyncio.BoundedSemaphore)
261
+
262
+ def __new__(cls,
263
+ trajectory_files: list[str] | str, structure_file: str,
264
+ # (see above note for __init__ why its ok to ignore this)
265
+ # pylint: disable-next=unused-argument
266
+ nstout: int | None = None,
267
+ **kwargs) -> "Trajectory":
268
+ # pylint: disable-next=global-variable-not-assigned
196
269
  global _TRAJECTORIES_BY_HASH # our global traj registry
197
270
  # see if old_workdir is given to sanitize file paths
198
271
  old_workdir = kwargs.get("old_workdir", None)
@@ -208,13 +281,6 @@ class Trajectory:
208
281
  try:
209
282
  # see if we (i.e. a traj with the same hash) are already existing
210
283
  other_traj = _TRAJECTORIES_BY_HASH[traj_hash]
211
- # if yes return 'ourself'
212
- # (but make sure that the filepaths match even after a potential
213
- # change of workdir)
214
- other_traj._trajectory_files = trajectory_files
215
- other_traj._structure_file = structure_file
216
- other_traj._workdir = current_workdir
217
- return other_traj
218
284
  except KeyError:
219
285
  # not yet in there, so need to create us
220
286
  # we just create cls so that we will be "created" by init or
@@ -222,100 +288,144 @@ class Trajectory:
222
288
  # NOTE: we need to make sure that every attribute we set
223
289
  # below is not overwritten by setstate and/or init!
224
290
  obj = super().__new__(cls)
225
- # but set self._traj_hash so we dont recalculate it
226
- obj._traj_hash = traj_hash
227
- # and set self._trajectory_files so we dont sanitize twice
228
- obj._trajectory_files = trajectory_files
229
- # also set self._structure_file
230
- obj._structure_file = structure_file
231
- # and set self._workdir to the new value
291
+ # we directly set hash, files and friends so we dont recalculate
292
+ # the hash and dont sanitize the file paths twice
232
293
  # Note:
233
294
  # we remember the current workdir to be able to unpickle as long as
234
295
  # either the relpath between traj and old/new workdir does not change
235
296
  # or the trajectory did not change its location but we changed workdir
236
297
  # (we need the workdir only for the second option)
237
- obj._workdir = current_workdir
298
+ obj._file_data = _TrajectoryFileData(
299
+ trajectory_files=trajectory_files,
300
+ structure_file=structure_file,
301
+ workdir=current_workdir,
302
+ trajectory_hash=traj_hash,
303
+ )
238
304
  # and add us to the global trajectory registry
239
305
  _TRAJECTORIES_BY_HASH[traj_hash] = obj
240
306
  return obj
241
307
 
242
- #def __del__(self):
243
- # TODO: running 'del traj' does not call this function,
244
- # it only decreases the reference count by one,
245
- # but since we still have the traj in the traj by hash dictionary
246
- # i.e. we still have a reference, it will not call __del__ which
247
- # is only called when the reference count reaches zero
308
+ # we already exist (a traj object for the same traj files/hash),
309
+ # so return 'ourself'
310
+ # (but make sure that the filepaths match even after a potential
311
+ # change of workdir)
312
+ other_traj._file_data = _TrajectoryFileData(
313
+ trajectory_files=trajectory_files,
314
+ structure_file=structure_file,
315
+ workdir=current_workdir,
316
+ trajectory_hash=traj_hash,
317
+ )
318
+ return other_traj
319
+
320
+ # def __del__(self):
321
+ # NOTE: Running 'del traj' does not call this function,
322
+ # it only decreases the reference count by one.
323
+ # But since we still have the traj in the traj by hash dictionary
324
+ # i.e. we still have a reference, it will not call __del__ which
325
+ # is only called when the reference count reaches zero.
326
+ # So implementing it is quite pointless and misleading!
248
327
  # _forget_trajectory(traj_hash=self.trajectory_hash)
249
328
 
250
329
  @classmethod
251
- def _sanitize_file_paths(cls,
252
- trajectory_files: typing.Union[list[str], str],
330
+ def _sanitize_file_paths(cls, *,
331
+ trajectory_files: list[str] | str,
253
332
  structure_file: str,
254
- current_workdir: typing.Optional[str] = None,
255
- old_workdir: typing.Optional[str] = None,
256
- ) -> typing.Tuple[list[str], str]:
257
- # NOTE: this returns relpath if no old_workdir is given and the traj
258
- # is accessible
259
- # if old_workdir is given (and the traj not accesible) it (tries)
260
- # to find the traj by assuming the traj did not change place and
261
- # we just need to add the "path_diff" from old to new workdir to
262
- # the path, if the file is then still not there it raises a
263
- # FileNotFoundError
264
- # NOTE: (for pickling and aimmd storage behavior):
265
- # The above makes it possible to either change the workdir of the
266
- # python session OR change the location of the trajectories as
267
- # as long as the relative path between trajectory and python
268
- # workdir does not change!
333
+ current_workdir: str,
334
+ old_workdir: str | None = None,
335
+ ) -> tuple[list[str], str]:
336
+ """
337
+ Return relpath for all files if no old_workdir is given and the trajectory
338
+ and structure files are accessible.
339
+
340
+ If old_workdir is given (and the traj not accessible) it (tries) to find
341
+ the trajs/struct by assuming the files did not change place and we just
342
+ need to add the "path_diff" from old to new workdir to the path, if the
343
+ file is then still not there it raises a FileNotFoundError.
344
+
345
+ Note: The file-path treatment here makes it possible to either change
346
+ the workdir of the python session OR change the location of the
347
+ trajectories as long as the relative path between trajectory
348
+ and python workdir does not change!
349
+
350
+ Parameters
351
+ ----------
352
+ trajectory_files : list[str] | str
353
+ Absolute or relative path(s) to the trajectory file(s),
354
+ e.g. trr, xtc, dcd, ...
355
+ Can be one str (one file) or a list of str (multiple traj files).
356
+ structure_file : str
357
+ Absolute or relative path to the structure file (e.g. tpr, gro).
358
+ current_workdir : str
359
+ The current working directory to use for "path_diff" calculations.
360
+ old_workdir : str | None, optional
361
+ The old working directory (e.g. at pickling time), by default None.
362
+ If None, no "path_diff" calculations will be performed, i.e. it is
363
+ assumed the working directory did not change or we are not unpickling.
364
+
365
+ Returns
366
+ -------
367
+ tuple[list[str], str]
368
+ trajectory_files, structure_file
369
+ Sanitized file-paths if the files exist, trajectory_files is always
370
+ a list[str], even if it is only one file.
371
+
372
+ Raises
373
+ ------
374
+ FileNotFoundError
375
+ When the trajectory or structure files can not be found.
376
+ """
269
377
  def sanitize_path(f, pathdiff=None):
270
378
  if os.path.isfile(f):
271
379
  return os.path.relpath(f)
272
- elif pathdiff is not None:
380
+ if pathdiff is not None:
273
381
  f_diff = os.path.join(pathdiff, f)
274
382
  if os.path.isfile(f_diff):
275
383
  return os.path.relpath(f_diff)
276
384
  # if we get until here we cant find the file
277
385
  err_msg = f"File {f} is not accessible"
278
- if pathdiff is not None:
279
- err_msg += f" (we also tried {f_diff})."
280
- else:
281
- err_msg += "."
386
+ err_msg += f" (we also tried {f_diff})." if pathdiff is not None else "."
282
387
  raise FileNotFoundError(err_msg)
283
388
 
284
389
  if old_workdir is not None:
285
- if current_workdir is None:
286
- raise ValueError("'old_workdir' given but 'current_workdir' "
287
- "was None.")
288
390
  path_diff = os.path.relpath(old_workdir, current_workdir)
289
391
  else:
290
392
  path_diff = None
291
-
292
393
  if isinstance(trajectory_files, str):
293
394
  trajectory_files = [trajectory_files]
294
-
295
395
  traj_files_sanitized = [sanitize_path(f=traj_f, pathdiff=path_diff)
296
396
  for traj_f in trajectory_files
297
397
  ]
298
- struct_file_sanitized = sanitize_path(f=structure_file,
299
- pathdiff=path_diff,
300
- )
301
-
398
+ struct_file_sanitized = sanitize_path(f=structure_file, pathdiff=path_diff)
302
399
  return traj_files_sanitized, struct_file_sanitized
303
400
 
304
401
  @classmethod
305
- def _calc_traj_hash(cls, trajectory_files):
306
- # calculate a hash over the first and last part of the traj files
307
- # (we use it to make sure the cached CV values match the traj)
308
- # note that we do not include the structure file on purpose because
309
- # that allows for changing .gro <-> .tpr or similar
310
- # (which we expect to not change the calculated CV values)
311
- # TODO: how much should we read?
402
+ def _calc_traj_hash(cls, trajectory_files: list[str]) -> int:
403
+ """
404
+ Calculate a hash over the first and last part of the traj files.
405
+
406
+ We use it to make sure the cached CV values match the traj.
407
+ Note that we do not include the structure file on purpose because
408
+ that allows for changing .gro <-> .tpr or similar (which we expect to
409
+ not change the calculated CV values).
410
+
411
+ Parameters
412
+ ----------
413
+ trajectory_files : list[str]
414
+ Path(s) to the trajectory file(s).
415
+
416
+ Returns
417
+ -------
418
+ int
419
+ The hash calculated over the trajectory files.
420
+ """
421
+ # TODO: how much should we read to calculate the hash?
312
422
  # (I [hejung] think the first and last .5 MB are enough)
313
423
  data = bytes()
314
424
  for traj_f in trajectory_files:
315
- #data += traj_f.encode("utf-8") # DONT include filepaths!...
425
+ # data += traj_f.encode("utf-8") # DONT include filepaths!...
316
426
  fsize = os.stat(traj_f).st_size
317
427
  data += str(fsize).encode("utf-8")
318
- if fsize == 0:
428
+ if not fsize:
319
429
  # Note: we could also just warn as long as we do not do the
320
430
  # negative seek below if filesize == 0. However,
321
431
  # mdanalysis throws errors for empty trajectories anyway
@@ -326,7 +436,7 @@ class Trajectory:
326
436
  # read the first bit of each file
327
437
  data += traj_file.read(max_to_read)
328
438
  # and read the last bit of each file
329
- # Note that the last bit potentially overlapps with the first
439
+ # Note that the last bit potentially overlaps with the first
330
440
  traj_file.seek(-max_to_read, io.SEEK_END)
331
441
  data += traj_file.read(max_to_read)
332
442
  # calculate one hash over all traj_files
@@ -339,171 +449,161 @@ class Trajectory:
339
449
  )
340
450
  return traj_hash
341
451
 
342
- @property
343
- def cache_type(self):
452
+ def _setup_cache(self) -> TrajectoryFunctionValueCache:
344
453
  """
345
- String indicating the currently used cache type. Can also be (re)set.
454
+ Initialize and return a cache with the cache type/class set by _GLOBALS/config.
455
+
456
+ If the initialized cache is empty, this also checks for any npz cache
457
+ files and tries to append them to the new cache (irrespective of the
458
+ cache type).
346
459
  """
347
- return copy.copy(self._cache_type)
460
+ cache = self._CACHE_CLASS_FOR_TYPE[
461
+ _GLOBALS["TRAJECTORY_FUNCTION_CACHE_TYPE"]
462
+ ](traj_hash=self.trajectory_hash,
463
+ traj_files=self.trajectory_files,
464
+ )
465
+ # only try to read npz files if our cache is empty and not already npz
466
+ if not cache and _GLOBALS["TRAJECTORY_FUNCTION_CACHE_TYPE"] != "npz":
467
+ # cache is empty at initialization
468
+ # check if we can find a npz-cache to populate from
469
+ if os.path.isfile(
470
+ TrajectoryFunctionValueCacheInNPZ.get_cache_filename(
471
+ traj_files=self.trajectory_files
472
+ )
473
+ ):
474
+ logger.info("Initialized %s with an empty cache, but found "
475
+ "a (probably) matching npz cache file. Populating "
476
+ "our cache with the values stored there.",
477
+ self,
478
+ )
479
+ cache_to_copy = TrajectoryFunctionValueCacheInNPZ(
480
+ traj_hash=self.trajectory_hash,
481
+ traj_files=self.trajectory_files,
482
+ )
483
+ for func_id, values in cache_to_copy.items():
484
+ cache.append(func_id=func_id, values=values)
485
+ return cache
348
486
 
349
- @cache_type.setter
350
- def cache_type(self, value: typing.Optional[str]):
487
+ def update_cache_type(self, copy_content: bool = True,
488
+ clear_old_cache: bool = False) -> None:
351
489
  """
352
- Set the cache type.
490
+ Update the :class:`TrajectoryFunctionValueCache` this :class:`Trajectory` uses.
491
+
492
+ By default the content of the current cache is copied to the new cache.
493
+ This will only have an effect if the globally set ``cache_type`` differs
494
+ from what this `Trajectory` currently uses.
495
+ See :func:`asyncmd.config.set_trajectory_cache_type` to set the ``cache_type``.
496
+ To clear the old/previously set cache (after copying its values), pass
497
+ ``clear_old_cache=True``.
353
498
 
354
499
  Parameters
355
500
  ----------
356
- value : str or None
357
- Either a string indicating the type or None to choose the preferred
358
- cache type from the available ones.
359
- If a string it must be one of 'h5py', 'npz' or 'memory'.
360
-
361
- Raises
362
- ------
363
- ValueError
364
- Raised if value is not one of the available cache types.
501
+ copy_content : bool, optional
502
+ Whether to copy the current cache content to the new cache,
503
+ by default True
504
+ clear_old_cache : bool, optional
505
+ Whether to clear the old/previously set cache, by default False.
365
506
  """
366
- if value is None:
367
- use_default_cache_type = True
368
- # find preferred cache type that is available
369
- try:
370
- value = _GLOBALS["TRAJECTORY_FUNCTION_CACHE_TYPE"]
371
- except KeyError:
372
- # no default cache type set
373
- # default to numpy npz
374
- value = "npz"
375
- else:
376
- use_default_cache_type = False
377
- value = value.lower()
378
- allowed_values = ["h5py", "npz", "memory"]
379
- if value not in allowed_values:
380
- raise ValueError("Given cache type must be `None` or one of "
381
- + f"{allowed_values}. Was: {value}.")
382
- self._cache_type = value
383
- self._using_default_cache_type = use_default_cache_type
384
- self._setup_cache()
385
-
386
- def _setup_cache(self) -> None:
387
- # set up the cache indicated by self.cache_type and all others to None
388
- # also makes sure that all previously cached values are transfered
389
- # to the newly setup cache
390
- # NOTE: we setup an npz cache to see if there are any saved values
391
- # that we would want to add to the newly setup cache
392
- # We do this because upon pickling we save everything to npz
393
- # Note that we can just set self._npz to this cache because it is
394
- # stateless (in the sense that if it existed it be exactly the same)
395
- self._npz_cache = TrajectoryFunctionValueCacheNPZ(
396
- fname_trajs=self.trajectory_files,
397
- hash_traj=self._traj_hash,
398
- )
399
- if self._cache_type == "memory":
400
- if self._memory_cache is None:
401
- self._memory_cache = TrajectoryFunctionValueCacheMEMORY()
402
- else:
403
- # we already have a mem cache so just try to use it
404
- pass
405
- if self._h5py_cache is not None:
406
- self._cache_content_to_new_cache(
407
- old_cache=self._h5py_cache,
408
- new_cache=self._memory_cache,
409
- )
410
- self._h5py_cache = None
411
- self._cache_content_to_new_cache(
412
- old_cache=self._npz_cache,
413
- new_cache=self._memory_cache,
414
- )
415
- self._npz_cache = None
416
- elif self._cache_type == "h5py":
417
- try:
418
- h5py_cache = _GLOBALS["H5PY_CACHE"]
419
- except KeyError as exc:
420
- raise ValueError(
421
- "No h5py cache file registered yet. Try calling "
422
- + "``asyncmd.config.register_h5py_cache_file()``"
423
- + " with the appropriate arguments first") from exc
424
- if self._h5py_cache is None:
425
- # dont have one yet so setup the cache
426
- self._h5py_cache = TrajectoryFunctionValueCacheH5PY(
427
- h5py_cache=h5py_cache,
428
- hash_traj=self._traj_hash,
429
- )
430
- else:
431
- # we already have a h5py cache...
432
- if self._h5py_cache.h5py_cache is h5py_cache:
433
- # and it is in the same file/group location
434
- # so we do nothing but making sure that all values from
435
- # other caches are transfered
507
+ cache_type = _GLOBALS["TRAJECTORY_FUNCTION_CACHE_TYPE"]
508
+ if isinstance(self._cache, self._CACHE_CLASS_FOR_TYPE[cache_type]):
509
+ logger.info("Cache type is already %s. Not doing anything.", cache_type)
510
+ return
511
+ # init the new cache
512
+ cache = self._CACHE_CLASS_FOR_TYPE[cache_type](
513
+ traj_hash=self.trajectory_hash,
514
+ traj_files=self.trajectory_files,
515
+ )
516
+ if copy_content:
517
+ # and copy/append everything from current cache to the new one
518
+ for func_id, values in self._cache.items():
519
+ try:
520
+ cache.append(func_id=func_id, values=values)
521
+ except ValuesAlreadyStoredError:
522
+ # if we just initialized a non-empty cache we might already
523
+ # have some of the values cached there, ignore them
436
524
  pass
437
- else:
438
- # lets copy the stuff from the old to the new h5py cache
439
- old_h5py_cache = self._h5py_cache
440
- self._h5py_cache = TrajectoryFunctionValueCacheH5PY(
441
- h5py_cache=h5py_cache,
442
- hash_traj=self._traj_hash,
443
- )
444
- self._cache_content_to_new_cache(
445
- old_cache=old_h5py_cache,
446
- new_cache=self._h5py_cache,
447
- )
448
- # transfer all values from other cache types and empty them
449
- if self._memory_cache is not None:
450
- self._cache_content_to_new_cache(
451
- old_cache=self._memory_cache,
452
- new_cache=self._h5py_cache,
453
- )
454
- self._memory_cache = None
455
- self._cache_content_to_new_cache(
456
- old_cache=self._npz_cache,
457
- new_cache=self._h5py_cache,
458
- )
459
- self._npz_cache = None
460
- elif self._cache_type == "npz":
461
- if self._h5py_cache is not None:
462
- self._cache_content_to_new_cache(
463
- old_cache=self._h5py_cache,
464
- new_cache=self._npz_cache,
465
- )
466
- self._h5py_cache = None
467
- if self._memory_cache is not None:
468
- self._cache_content_to_new_cache(
469
- old_cache=self._memory_cache,
470
- new_cache=self._npz_cache,
471
- )
472
- self._memory_cache = None
473
- else:
474
- raise RuntimeError("This should never happen. self._cache_type "
475
- + "must be one of 'memory', 'h5py', 'npz' when "
476
- + "self._setup_cache is called. "
477
- + f"Was {self._cache_type}.")
525
+ if clear_old_cache:
526
+ self._cache.clear_all_values()
527
+ self._cache = cache
478
528
 
479
- def _populate_properties(self) -> None:
529
+ def clear_all_cache_values(self) -> None:
480
530
  """
481
- Populate cached properties from the underlying trajectory.
531
+ Clear all function values cached for this :class:`Trajectory`.
532
+
533
+ For file-based caches, this also removes the associated cache files.
534
+ Note that this just calls the underlying :class:`TrajectoryFunctionValueCache`
535
+ class's ``clear_all_values`` method.
536
+ """
537
+ self._cache.clear_all_values()
538
+
539
+ def _retrieve_cached_values(self, func_wrapper: "TrajectoryFunctionWrapper",
540
+ ) -> np.ndarray | None:
541
+ """
542
+ Retrieve values cached for given :class:`TrajectoryFunctionWrapper`.
543
+
544
+ Return ``None`` if no values are cached (yet).
545
+
546
+ Parameters
547
+ ----------
548
+ func_wrapper : TrajectoryFunctionWrapper
549
+ The TrajectoryFunctionWrapper for which we (try to) retrieve cached values.
550
+
551
+ Returns
552
+ -------
553
+ np.ndarray | None
554
+ Cached function values or None if none are found.
555
+ """
556
+ try:
557
+ values = self._cache[func_wrapper.id]
558
+ except KeyError:
559
+ values = None
560
+ return values
561
+
562
+ def _register_cached_values(self, values: np.ndarray,
563
+ func_wrapper: "TrajectoryFunctionWrapper",
564
+ ) -> None:
565
+ """
566
+ Add values to cache for given TrajectoryFunctionWrapper.
567
+
568
+ Parameters
569
+ ----------
570
+ values : np.ndarray
571
+ The values to add.
572
+ func_wrapper : TrajectoryFunctionWrapper
573
+ The TrajectoryFunctionWrapper these values belong to.
574
+ """
575
+ self._cache.append(func_id=func_wrapper.id, values=values)
576
+
577
+ def _populate_property_data(self) -> _TrajectoryPropertyData:
578
+ """
579
+ Populate and return cached properties from the underlying trajectory.
580
+
581
+ Returns a :class:`_TrajectoryPropertyData` instance.
482
582
  """
483
583
  # create/open a mdanalysis universe to get...
484
584
  u = mda.Universe(self.structure_file, *self.trajectory_files)
485
585
  # ...the number of frames
486
- self._len = len(u.trajectory)
586
+ length = len(u.trajectory)
487
587
  # ...the first integration step and time
488
588
  ts = u.trajectory[0]
489
- # FIXME: using None here means we will try to repopulate the properties
490
- # every time we access step property for a traj-format which
491
- # does not have step data!
492
- # TODO: which traj formats have step data set in MDAnalysis?
493
- # XTC and TRR have it for sure (with the wraparound issue)
494
- self._first_step = ts.data.get("step", None)
495
- self._first_time = ts.time
589
+ first_step = ts.data.get("step", None)
590
+ first_time = ts.time
496
591
  # ...the time diff between subsequent **frames** (not steps)
497
- self._dt = ts.dt
592
+ dt = ts.dt
498
593
  # ...the last integration step and time
499
594
  ts = u.trajectory[-1]
500
- # TODO: which traj formats have step data set in MDAnalysis?
501
- # XTC and TRR have it for sure (with the wraparound issue)
502
- self._last_step = ts.data.get("step", None)
503
- self._last_time = ts.time
504
- if all([t.lower().endswith((".xtc", ".trr"))
505
- for t in self.trajectory_files]):
506
- self._fix_trr_xtc_step_wraparound(universe=u)
595
+ last_step = ts.data.get("step", None)
596
+ last_time = ts.time
597
+ # See if we apply the wraparound issue fix
598
+ # Note: we are using some of the info we just read here (all explicitly passed)!
599
+ if all(
600
+ t.lower().endswith((".xtc", ".trr")) for t in self.trajectory_files
601
+ ):
602
+ first_step, last_step = self._fix_trr_xtc_step_wraparound(
603
+ universe=u,
604
+ first_time=first_time, last_time=last_time,
605
+ first_step=first_step, last_step=last_step,
606
+ )
507
607
  else:
508
608
  # bail out if traj is not an XTC or TRR
509
609
  logger.info("%s is not of type XTC or TRR. Not applying "
@@ -511,9 +611,23 @@ class Trajectory:
511
611
  # make sure the trajectory is closed by MDAnalysis
512
612
  u.trajectory.close()
513
613
  del u
514
-
515
- def _fix_trr_xtc_step_wraparound(self, universe: mda.Universe) -> None:
614
+ # finally populate and return the dataclass with what we just read
615
+ # (and possibly corrected)
616
+ return _TrajectoryPropertyData(
617
+ length=length, dt=dt,
618
+ first_time=first_time, last_time=last_time,
619
+ first_step=first_step, last_step=last_step,
620
+ )
621
+
622
+ def _fix_trr_xtc_step_wraparound(self, *,
623
+ universe: mda.Universe,
624
+ first_time: float, last_time: float,
625
+ first_step: int, last_step: int,
626
+ ) -> tuple[int, int]:
516
627
  # check/correct for wraparounds in the integration step numbers
628
+ # return (corrected or not) first_step, last_step
629
+ # I.e. it is safe to always set first_step, last_step with the return
630
+ # of this method.
517
631
  # NOTE: fails if the trajectory has length = 1!
518
632
  # NOTE: strictly spoken we should not assume wraparound behavior,
519
633
  # but it seems reasonable for the stepnum,
@@ -525,52 +639,46 @@ class Trajectory:
525
639
  # dividing the times by integrator_dt, this should be reasonably
526
640
  # save for normal MD settings where integrator_dt should be on the
527
641
  # order of 1-10 fs
528
- if self._len == 1:
642
+ if (n_frames := len(universe.trajectory)) == 1:
529
643
  # bail out if the trajectory has length=1
530
644
  # as we can not calculate dt if we only have one frame
531
645
  logger.info("%s has only one frame. Can not correct for "
532
646
  "potential wraparound of the integration step.",
533
647
  self)
534
- return # bail out
648
+ return first_step, last_step # bail out
535
649
  # get the time offset for first and last frame, they need to match for
536
650
  # our wraparound fix to work
537
- ts = universe.trajectory[0]
538
- time_offset = ts.data.get("time_offset", 0)
539
- ts = universe.trajectory[-1]
540
- if ts.data.get("time_offset", 0) != time_offset:
651
+ time_offset = universe.trajectory[0].data.get("time_offset", 0)
652
+ if universe.trajectory[-1].data.get("time_offset", 0) != time_offset:
541
653
  logger.info("Time offset of the first and last time in "
542
654
  "%s do not match. Not correcting for potential "
543
655
  "wraparound of the integration step.",
544
656
  self)
545
- return # bail out
546
- delta_s = self._last_step - self._first_step
547
- delta_t = round(self._last_time - self._first_time, ndigits=6)
548
- # first make sure traj is continous (i.e. not a concatenation where we
549
- # carried over the time and step data from the original trajs)
550
- n_frames = len(universe.trajectory)
551
- n_max_samples = 100 # use at most 100 frames to see if it is continous
552
- if n_frames > n_max_samples:
553
- skip = n_frames // n_max_samples
554
- else:
555
- skip = 1
556
- step_nums = [ts.data["step"] for ts in universe.trajectory[::skip]]
557
- step_diffs = np.diff(step_nums)
558
- first_diff = step_diffs[0]
559
- if first_diff < 0:
657
+ return first_step, last_step # bail out
658
+ delta_s = last_step - first_step
659
+ delta_t = round(last_time - first_time, ndigits=6)
660
+ # first make sure traj is continuous (i.e. not a concatenation where we
661
+ # carried over the time and step data from the original trajs).
662
+ # Use at most 100 (equally spaced) frames to see if it is continuous.
663
+ skip = n_frames // 100 if n_frames > 100 else 1
664
+ step_diffs = np.diff([ts.data["step"]
665
+ for ts in universe.trajectory[::skip]]
666
+ )
667
+ if (first_diff := step_diffs[0]) < 0:
560
668
  # we possibly wrapped around at the first step
561
669
  first_diff += 2**32
562
670
  for diff in step_diffs[1:]:
563
671
  if diff != first_diff:
564
- # bail out because traj is not continous in time
565
- logger.debug("%s is not from one continous propagation, i.e. "
672
+ # bail out because traj is not continuous in time
673
+ logger.debug("%s is not from one continuous propagation, i.e. "
566
674
  "the step difference between subsequent steps is "
567
675
  "not constant. Not applying TRR/XTC step "
568
676
  "wraparound fix and using step as read from the "
569
677
  "underlying trajectory.",
570
678
  self)
571
- return
679
+ return first_step, last_step
572
680
  # now the actual fix
573
- if delta_s != 0:
681
+ if delta_s: # delta_s != 0
574
682
  if delta_s > 0:
575
683
  # both (last and first) wrapped around the same number of times
576
684
  integrator_dt = round(delta_t / delta_s, ndigits=6)
@@ -580,17 +688,16 @@ class Trajectory:
580
688
  # NOTE: should we round or floor? I (hejung) think round is what we
581
689
  # want, it will get us to the nearest int, which is good if
582
690
  # we e.g. have 0.99999999999 instead of 1
583
- first_step = round((self._first_time - time_offset) / integrator_dt)
584
- last_step = round((self._last_time - time_offset) / integrator_dt)
585
- self._first_step = first_step
586
- self._last_step = last_step
587
- else: # delta_s == 0
588
- # can only end up here if we have more than one frame in trajectory
589
- # **and** the first and last frame have the same integration step
590
- # which should be very rare and we can not correct anyway as the
591
- # trajectory can not be from a continous propagation, so we can not
592
- # end up here at all?
593
- raise RuntimeError("This should not be possible?!")
691
+ first_step = round((first_time - time_offset) / integrator_dt)
692
+ last_step = round((last_time - time_offset) / integrator_dt)
693
+ return first_step, last_step
694
+ # delta_s == 0
695
+ # can only end up here if we have more than one frame in trajectory
696
+ # **and** the first and last frame have the same integration step
697
+ # which should be very rare and we can not correct anyway as the
698
+ # trajectory can not be from a continuous propagation, so we can not
699
+ # end up here at all?
700
+ raise RuntimeError("This should not be possible?!")
594
701
 
595
702
  def __len__(self) -> int:
596
703
  """
@@ -601,9 +708,9 @@ class Trajectory:
601
708
  int
602
709
  The number of frames in the trajectory.
603
710
  """
604
- if self._len is None:
605
- self._populate_properties()
606
- return self._len
711
+ if self._property_data is None:
712
+ self._property_data = self._populate_property_data()
713
+ return self._property_data.length
607
714
 
608
715
  def __repr__(self) -> str:
609
716
  if len(self.trajectory_files) == 1:
@@ -614,6 +721,9 @@ class Trajectory:
614
721
  + f" structure_file={self.structure_file})"
615
722
  )
616
723
 
724
+ def __hash__(self) -> int:
725
+ return self.trajectory_hash
726
+
617
727
  def __eq__(self, other: object) -> bool:
618
728
  if not isinstance(other, Trajectory):
619
729
  # if its not a trajectory it cant be equal
@@ -621,37 +731,35 @@ class Trajectory:
621
731
  if self.trajectory_hash != other.trajectory_hash:
622
732
  # if it has a different hash it cant be equal
623
733
  return False
624
- # TODO: check for cached CV values? I (hejung) think it does not really
625
- # make sense...
626
734
 
627
- # if we got until here the two trajs are equal
735
+ # if we got until here the two trajectories are equal
628
736
  return True
629
737
 
630
738
  def __ne__(self, other: object) -> bool:
631
- return not self.__eq__(other=other)
739
+ return not self.__eq__(other)
632
740
 
633
741
  @property
634
742
  def structure_file(self) -> str:
635
743
  """Return relative path to the structure file."""
636
- return copy.copy(self._structure_file)
744
+ return self._file_data.structure_file
637
745
 
638
746
  @property
639
- def trajectory_files(self) -> str:
747
+ def trajectory_files(self) -> list[str]:
640
748
  """Return relative path to the trajectory files."""
641
- return copy.copy(self._trajectory_files)
749
+ return self._file_data.trajectory_files
642
750
 
643
751
  @property
644
752
  def trajectory_hash(self) -> int:
645
- """Return hash over the trajecory files"""
646
- return copy.copy(self._traj_hash)
753
+ """Return hash over the trajectory files"""
754
+ return self._file_data.trajectory_hash
647
755
 
648
756
  @property
649
- def nstout(self) -> typing.Union[int, None]:
757
+ def nstout(self) -> int | None:
650
758
  """Output frequency between subsequent frames in integration steps."""
651
759
  return self._nstout
652
760
 
653
761
  @nstout.setter
654
- def nstout(self, val: typing.Union[int, None]) -> None:
762
+ def nstout(self, val: int | None) -> None:
655
763
  if val is not None:
656
764
  # ensure that it is an int
657
765
  val = int(val)
@@ -659,445 +767,80 @@ class Trajectory:
659
767
  self._nstout = val
660
768
 
661
769
  @property
662
- def first_step(self) -> int:
770
+ def first_step(self) -> int | None:
663
771
  """Return the integration step of the first frame in the trajectory."""
664
- if self._first_step is None:
665
- self._populate_properties()
666
- return self._first_step
772
+ if self._property_data is None:
773
+ self._property_data = self._populate_property_data()
774
+ return self._property_data.first_step
667
775
 
668
776
  @property
669
- def last_step(self) -> int:
777
+ def last_step(self) -> int | None:
670
778
  """Return the integration step of the last frame in the trajectory."""
671
- if self._last_step is None:
672
- self._populate_properties()
673
- return self._last_step
779
+ if self._property_data is None:
780
+ self._property_data = self._populate_property_data()
781
+ return self._property_data.last_step
674
782
 
675
783
  @property
676
784
  def dt(self) -> float:
677
- """The time intervall between subsequent *frames* (not steps) in ps."""
678
- if self._dt is None:
679
- self._populate_properties()
680
- return self._dt
785
+ """The time interval between subsequent *frames* (not steps) in ps."""
786
+ if self._property_data is None:
787
+ self._property_data = self._populate_property_data()
788
+ return self._property_data.dt
681
789
 
682
790
  @property
683
791
  def first_time(self) -> float:
684
792
  """Return the integration timestep of the first frame in ps."""
685
- if self._first_time is None:
686
- self._populate_properties()
687
- return self._first_time
793
+ if self._property_data is None:
794
+ self._property_data = self._populate_property_data()
795
+ return self._property_data.first_time
688
796
 
689
797
  @property
690
798
  def last_time(self) -> float:
691
799
  """Return the integration timestep of the last frame in ps."""
692
- if self._last_time is None:
693
- self._populate_properties()
694
- return self._last_time
695
-
696
- async def _apply_wrapped_func(self, func_id, wrapped_func):
697
- async with self._semaphores_by_func_id[func_id]:
698
- # sort out which cache we use
699
- # NOTE: only one cache should ever be not None, so order should not
700
- # matter here
701
- # anyway I (hejung) think this order is even what we want:
702
- # 1.) use h5py cache if registered
703
- # 2.) use npz cache (the default since h5py is not registered
704
- # if not set by the user)
705
- # 3.) use memory/local cache (only if set on traj creation
706
- # or if set as default cache)
707
- if self._h5py_cache is not None:
708
- return await self._apply_wrapped_func_cached(
709
- func_id=func_id,
710
- wrapped_func=wrapped_func,
711
- cache=self._h5py_cache,
712
- )
713
- if self._npz_cache is not None:
714
- return await self._apply_wrapped_func_cached(
715
- func_id=func_id,
716
- wrapped_func=wrapped_func,
717
- cache=self._npz_cache
718
- )
719
- if self._memory_cache is not None:
720
- return await self._apply_wrapped_func_cached(
721
- func_id=func_id,
722
- wrapped_func=wrapped_func,
723
- cache=self._memory_cache,
724
- )
725
- # if we get until here we have no cache!
726
- logger.warning("No cache associated with %s. Returning calculated "
727
- "function values anyway but no caching can/will be "
728
- "performed!",
729
- self,
730
- )
731
- return await wrapped_func.get_values_for_trajectory(self)
732
-
733
- async def _apply_wrapped_func_cached(
734
- self, func_id: str, wrapped_func,
735
- cache: collections.abc.Mapping[str, np.ndarray],
736
- ):
737
- try:
738
- # see if it is in cache
739
- return copy.copy(cache[func_id])
740
- except KeyError:
741
- # if not calculate, store and return
742
- # send function application to seperate process and wait
743
- # until it finishes
744
- vals = await wrapped_func.get_values_for_trajectory(self)
745
- cache.append(func_id=func_id, vals=vals)
746
- return vals
747
-
748
- def _cache_content_to_new_cache(
749
- self,
750
- old_cache: collections.abc.Mapping[str, np.ndarray],
751
- new_cache: collections.abc.Mapping[str, np.ndarray],
752
- ):
753
- for func_id, values in old_cache.items():
754
- if func_id in new_cache:
755
- continue # dont try to add what is already in there
756
- new_cache.append(func_id=func_id, vals=values)
757
-
758
- def __getstate__(self):
800
+ if self._property_data is None:
801
+ self._property_data = self._populate_property_data()
802
+ return self._property_data.last_time
803
+
804
+ def __getstate__(self) -> dict[str, typing.Any]:
759
805
  # enable pickling of Trajectory
760
806
  # this should make it possible to pass it into a ProcessPoolExecutor
761
- # and lets us calculate TrajectoryFunction values asyncronously
807
+ # and lets us calculate TrajectoryFunction values asynchronously
762
808
  state = self.__dict__.copy()
763
- # NOTE: we always save to npz here and then we check for npz always
764
- # when initializing a `new` trajectory and add all values to
765
- # the then preferred cache
766
- if self._npz_cache is None:
767
- self._npz_cache = TrajectoryFunctionValueCacheNPZ(
768
- fname_trajs=self.trajectory_files,
769
- hash_traj=self._traj_hash,
770
- )
771
- if self._memory_cache is not None:
772
- self._cache_content_to_new_cache(old_cache=self._memory_cache,
773
- new_cache=self._npz_cache,
774
- )
775
- if self._h5py_cache is not None:
776
- self._cache_content_to_new_cache(old_cache=self._h5py_cache,
777
- new_cache=self._npz_cache,
778
- )
779
- # and set npz cache back to None since we have not been using it
780
- self._npz_cache = None
781
- state["_h5py_cache"] = None
782
- state["_npz_cache"] = None
783
- state["_memory_cache"] = None
809
+ # special handling for case of function values cached in memory
810
+ if isinstance(self._cache, TrajectoryFunctionValueCacheInMemory):
811
+ # write it to npz so we can unpickle with values for any cache type
812
+ # (if we unpickle with an empty cache we will [try to] read the npz)
813
+ npz_cache = TrajectoryFunctionValueCacheInNPZ(
814
+ traj_hash=self.trajectory_hash,
815
+ traj_files=self.trajectory_files,
816
+ )
817
+ for func_id, values in self._cache.items():
818
+ try:
819
+ npz_cache.append(func_id=func_id, values=values)
820
+ except ValuesAlreadyStoredError:
821
+ # ignore if we already have them
822
+ pass
823
+ state["_cache"] = None
784
824
  state["_semaphores_by_func_id"] = collections.defaultdict(
785
825
  asyncio.BoundedSemaphore
786
826
  )
787
827
  return state
788
828
 
789
- def __setstate__(self, d: dict):
829
+ def __setstate__(self, d: dict) -> None:
790
830
  # remove the attributes we set in __new__ from dict
791
831
  # (otherwise we would overwrite what we set in __new__)
792
- del d["_trajectory_files"]
793
- del d["_structure_file"]
794
- del d["_traj_hash"]
795
- try:
796
- del d["_workdir"]
797
- except KeyError:
798
- # 'old' trajectory objects dont have a _workdir attribute
799
- pass
800
- # now we can update without overwritting what we set in __new__
832
+ del d["_file_data"]
833
+ # now we can update without overwriting what we set in __new__
801
834
  self.__dict__.update(d)
802
- # sort out which cache we were using (and which we will use now)
803
- if self._using_default_cache_type:
804
- # if we were using the global default when pickling use it now too
805
- # Note that this will raise the ValueError from _setup_cache if
806
- # no h5py cache has been registered but it is set as default
807
- # (which is intended because it is the same behavior as when
808
- # initializing a new trajectory in the same situation)
809
- self.cache_type = None # this calls _setup_cache
810
- return # get out of here, no need to setup the cache twice
811
- if self.cache_type == "h5py":
812
- # make sure h5py cache is set before trying to unpickle with it
813
- try:
814
- _ = _GLOBALS["H5PY_CACHE"]
815
- except KeyError:
816
- # this will (probably) fallback to npz but I (hejung) think it
817
- # is nice if we use the possibly set global default?
818
- # Note that this will not err but just emit the warning to log
819
- # when we change the cache but it will err when the global
820
- # default cache is set to h5py (as above)
821
- logger.warning("Trying to unpickle %s with cache_type "
822
- "'h5py' not possible without a registered "
823
- "cache. Falling back to global default type. "
824
- "See 'asyncmd.config.register_h5py_cache' and "
825
- "'asyncmd.config.set_default_cache_type'.",
826
- self
827
- )
828
- self.cache_type = None # this calls _setup_cache
829
- return # get out of here, no need to setup the cache twice
830
- # setup the cache for all cases where we are not using default cache
831
- # (or had "h5py" but could not unpickle with "h5py" now [and are
832
- # therefore also using the default])
833
- self._setup_cache()
834
-
835
- def __getnewargs_ex__(self):
835
+ # and finally setup the cache according to what the global config says
836
+ self._cache = self._setup_cache()
837
+
838
+ def __getnewargs_ex__(self) -> tuple[tuple, dict[str, typing.Any]]:
836
839
  # new needs the trajectory_files to be able to calculate the traj_hash
837
840
  # and since we want __new__ to have the same call signature as __init__
838
841
  # we also add all the init args here too
839
842
  return ((), {"trajectory_files": self.trajectory_files,
840
843
  "structure_file": self.structure_file,
841
844
  "nstout": self.nstout,
842
- "cache_type": self.cache_type,
843
- "old_workdir": self._workdir,
845
+ "old_workdir": self._file_data.workdir,
844
846
  })
845
-
846
-
847
- class TrajectoryFunctionValueCacheMEMORY(collections.abc.Mapping):
848
- """
849
- Interface for caching trajectory function values in memory in a dict.
850
- """
851
-
852
- def __init__(self, *args, **kwargs) -> None:
853
- """Initialize a `TrajectoryFunctionValueCacheMEMORY`."""
854
- self._func_values_by_id = {}
855
-
856
- def __len__(self) -> int:
857
- return len(self._func_values_by_id)
858
-
859
- def __iter__(self):
860
- return self._func_values_by_id.__iter__()
861
-
862
- def __getitem__(self, key: str) -> np.ndarray:
863
- if not isinstance(key, str):
864
- raise TypeError("Keys must be of type str.")
865
- return self._func_values_by_id[key]
866
-
867
- def append(self, func_id: str, vals: np.ndarray) -> None:
868
- if not isinstance(func_id, str):
869
- raise TypeError("func_id must be of type str.")
870
- if func_id in self._func_values_by_id:
871
- # first check if it already in there
872
- raise ValueError("There are already values stored for func_id "
873
- + f"{func_id}. Changing the stored values is not "
874
- + "supported.")
875
- self._func_values_by_id[func_id] = vals
876
-
877
-
878
- class TrajectoryFunctionValueCacheNPZ(collections.abc.Mapping):
879
- """
880
- Interface for caching trajectory function values in a numpy npz file.
881
-
882
- Drop-in replacement for the dictionary that is used for in-memory caching.
883
- """
884
-
885
- _hash_traj_npz_key = "hash_of_trajs" # key of hash_traj in npz file
886
-
887
- # NOTE: this is written with the assumption that stored trajectories are
888
- # immutable (except for adding additional stored function values)
889
- # but we assume that the actual underlying trajectory stays the same,
890
- # i.e. it is not extended after first storing it
891
- # If it changes between two npz-cache initializiations, it will have
892
- # a different traj-hash and all cached CV values will be recalculated
893
-
894
- # NOTE: npz appending inspired by: https://stackoverflow.com/a/66618141
895
-
896
- # NOTE/FIXME: It would be nice to use the MAX_FILES_OPEN semaphore
897
- # but then we need async/await and then we need to go to a 'create'
898
- # classmethod that is async and required for initialization
899
- # (because __init__ cant be async)
900
- # but since we (have to) open the npz file in the other magic methods
901
- # too it does not really matter (as they can not be async either)?
902
- # ...and as we also leave some room for non-semaphored file openings anyway
903
-
904
- def __init__(self, fname_trajs: list[str], hash_traj: int) -> None:
905
- """
906
- Initialize a `TrajectoryFunctionValueCacheNPZ`.
907
-
908
- Parameters
909
- ----------
910
- fname_trajs : list[str]
911
- Absolute filenames to the trajectories for which we cache CV values.
912
- hash_traj : int
913
- Hash over the first part of the trajectory file,
914
- used to make sure we cache only for the right trajectory
915
- (and not any trajectories with the same filename).
916
- """
917
- self.fname_npz = self._get_cache_filename(fname_trajs=fname_trajs,
918
- trajectory_hash=hash_traj,
919
- )
920
- self._hash_traj = hash_traj
921
- self._func_ids = []
922
- # sort out if we have an associated npz file already
923
- # and if it is from/for the "right" trajectory file
924
- self._ensure_consistent_npz()
925
-
926
- def _ensure_consistent_npz(self):
927
- # next line makes sure we only remember func_ids from the current npz
928
- self._func_ids = []
929
- if not os.path.isfile(self.fname_npz):
930
- # no npz so nothing to do except making sure we have no func_ids
931
- return
932
- existing_npz_matches = False
933
- with np.load(self.fname_npz, allow_pickle=False) as npzfile:
934
- try:
935
- saved_hash_traj = npzfile[self._hash_traj_npz_key][0]
936
- except KeyError:
937
- # we probably tripped over an old formatted npz
938
- # so we will just rewrite it completely with hash
939
- pass
940
- else:
941
- # old hash found, lets compare the two hashes
942
- existing_npz_matches = (self._hash_traj == saved_hash_traj)
943
- if existing_npz_matches:
944
- # if they do populate self with the func_ids we have
945
- # cached values for
946
- for k in npzfile.keys():
947
- if k != self._hash_traj_npz_key:
948
- self._func_ids.append(str(k))
949
- # now if the old npz did not match we should remove it
950
- # then we will rewrite it with the first cached CV values
951
- if not existing_npz_matches:
952
- logger.debug("Found existing npz file (%s) but the "
953
- "trajectory hash does not match. "
954
- "Recreating the npz cache from scratch.",
955
- self.fname_npz
956
- )
957
- os.unlink(self.fname_npz)
958
-
959
- @classmethod
960
- def _get_cache_filename(cls, fname_trajs: list[str],
961
- trajectory_hash: int) -> str:
962
- """
963
- Construct cachefilename from trajectory fname.
964
-
965
- Parameters
966
- ----------
967
- fname_trajs : list[str]
968
- Path to the trajectory for which we cache.
969
- trajectory_hash : int
970
- Hash of the trajectory (files).
971
-
972
- Returns
973
- -------
974
- str
975
- Path to the cachefile associated with trajectory.
976
- """
977
- head, tail = os.path.split(fname_trajs[0])
978
- return os.path.join(head,
979
- f".{tail}{'_MULTIPART' if len(fname_trajs) > 1 else ''}_asyncmd_cv_cache.npz"
980
- )
981
-
982
- def __len__(self) -> int:
983
- return len(self._func_ids)
984
-
985
- def __iter__(self):
986
- for func_id in self._func_ids:
987
- yield func_id
988
-
989
- def __getitem__(self, key: str) -> np.ndarray:
990
- if not isinstance(key, str):
991
- raise TypeError("Keys must be of type str.")
992
- if key in self._func_ids:
993
- with np.load(self.fname_npz, allow_pickle=False) as npzfile:
994
- return npzfile[key]
995
- else:
996
- raise KeyError(f"No values for {key} cached (yet).")
997
-
998
- def append(self, func_id: str, vals: np.ndarray) -> None:
999
- """
1000
- Append values for given func_id.
1001
-
1002
- Parameters
1003
- ----------
1004
- func_id : str
1005
- Function identifier.
1006
- vals : np.ndarray
1007
- Values of application of function with given func_id.
1008
-
1009
- Raises
1010
- ------
1011
- TypeError
1012
- If ``func_id`` is not a string.
1013
- ValueError
1014
- If there are already values stored for ``func_id`` in self.
1015
- """
1016
- if not isinstance(func_id, str):
1017
- raise TypeError("func_id must be of type str.")
1018
- if func_id in self._func_ids:
1019
- # first check if it already in there
1020
- raise ValueError("There are already values stored for func_id "
1021
- + f"{func_id}. Changing the stored values is not "
1022
- + "supported.")
1023
- if len(self) == 0:
1024
- # these are the first cached CV values for this traj
1025
- # so we just create the (empty) npz file
1026
- np.savez(self.fname_npz)
1027
- # and write the trajectory hash
1028
- self._append_data_to_npz(name=self._hash_traj_npz_key,
1029
- value=np.array([self._hash_traj]),
1030
- )
1031
- # now we can append either way
1032
- # either already something cached, or freshly created empty file
1033
- self._append_data_to_npz(name=func_id, value=vals)
1034
- # add func_id to list of func_ids that we know are cached in npz
1035
- self._func_ids.append(func_id)
1036
-
1037
- def _append_data_to_npz(self, name: str, value: np.ndarray) -> None:
1038
- # npz files are just zipped together collections of npy files
1039
- # so we just make a npy file saved into a BytesIO and then write that
1040
- # to the end of the npz file
1041
- bio = io.BytesIO()
1042
- np.save(bio, value)
1043
- with zipfile.ZipFile(file=self.fname_npz,
1044
- mode="a", # append!
1045
- # uncompressed (but) zip archive member
1046
- compression=zipfile.ZIP_STORED,
1047
- ) as zfile:
1048
- zfile.writestr(f"{name}.npy", data=bio.getvalue())
1049
-
1050
-
1051
- class TrajectoryFunctionValueCacheH5PY(collections.abc.Mapping):
1052
- """
1053
- Interface for caching trajectory function values in a given h5py group.
1054
-
1055
- Drop-in replacement for the dictionary that is used for in-memory caching.
1056
- """
1057
-
1058
- # NOTE: this is written with the assumption that stored trajectories are
1059
- # immutable (except for adding additional stored function values)
1060
- # but we assume that the actual underlying trajectory stays the same,
1061
- # i.e. it is not extended after first storing it
1062
-
1063
- def __init__(self, h5py_cache, hash_traj: int):
1064
- self.h5py_cache = h5py_cache
1065
- self._hash_traj = hash_traj
1066
- self._h5py_paths = {"ids": "FunctionIDs",
1067
- "vals": "FunctionValues"
1068
- }
1069
- self._root_grp = h5py_cache.require_group(
1070
- "asyncmd/"
1071
- + "TrajectoryFunctionValueCache/"
1072
- + f"{self._hash_traj}"
1073
- )
1074
- self._ids_grp = self._root_grp.require_group(self._h5py_paths["ids"])
1075
- self._vals_grp = self._root_grp.require_group(self._h5py_paths["vals"])
1076
-
1077
- def __len__(self):
1078
- return len(self._ids_grp.keys())
1079
-
1080
- def __iter__(self):
1081
- for idx in range(len(self)):
1082
- yield self._ids_grp[str(idx)].asstr()[()]
1083
-
1084
- def __getitem__(self, key):
1085
- if not isinstance(key, str):
1086
- raise TypeError("Keys must be of type str.")
1087
- for idx, k_val in enumerate(self):
1088
- if key == k_val:
1089
- return self._vals_grp[str(idx)][:]
1090
- # if we got until here the key is not in there
1091
- raise KeyError("Key not found.")
1092
-
1093
- def append(self, func_id, vals):
1094
- if not isinstance(func_id, str):
1095
- raise TypeError("Keys (func_id) must be of type str.")
1096
- if func_id in self:
1097
- raise ValueError("There are already values stored for func_id "
1098
- + f"{func_id}. Changing the stored values is not "
1099
- + "supported.")
1100
- # TODO: do we also want to check vals for type?
1101
- name = str(len(self))
1102
- _ = self._ids_grp.create_dataset(name, data=func_id)
1103
- _ = self._vals_grp.create_dataset(name, data=vals)