asyncmd 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1103 @@
+ # This file is part of asyncmd.
+ #
+ # asyncmd is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # asyncmd is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with asyncmd. If not, see <https://www.gnu.org/licenses/>.
+ import io
+ import os
+ import copy
+ import typing
+ import asyncio
+ import hashlib
+ import logging
+ import zipfile
+ import collections
+ import numpy as np
+ import MDAnalysis as mda
+
+
+ from .._config import _GLOBALS
+
+
+ logger = logging.getLogger(__name__)
+
+
+ # dictionary in which we keep track of trajectory objects
+ # we use it to always return the *same* object for the same trajectory (by hash)
+ # this makes it easy to ensure that we never calculate CV functions twice
+ _TRAJECTORIES_BY_HASH = {}
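+
+ # A minimal sketch of what this registry buys us (hypothetical file names,
+ # assuming "traj.trr" and "conf.tpr" exist): constructing a second
+ # Trajectory for the same underlying files returns the already existing
+ # object, so cached CV values are shared automatically.
+ #
+ # >>> t1 = Trajectory(trajectory_files="traj.trr", structure_file="conf.tpr")
+ # >>> t2 = Trajectory(trajectory_files="traj.trr", structure_file="conf.tpr")
+ # >>> t1 is t2  # same hash -> same object from _TRAJECTORIES_BY_HASH
+ # True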
+
+
+ def _forget_all_trajectories() -> None:
+     """
+     Forget about the existence of all :class:`Trajectory` objects.
+
+     This will result in new :class:`Trajectory` objects being created even
+     for the same underlying trajectory_files. Usually you do not want this as
+     it results in unnecessary calculations if the same wrapped and cached
+     function is applied to both objects. This function exists as a hidden
+     function as it is used in the tests and it might be helpful under certain
+     circumstances. Use only if you know why you are using it!
+     """
+     global _TRAJECTORIES_BY_HASH
+     all_keys = set(_TRAJECTORIES_BY_HASH.keys())
+     for key in all_keys:
+         del _TRAJECTORIES_BY_HASH[key]
+
+
+ def _forget_trajectory(traj_hash: int) -> None:
+     """
+     Forget about the existence of a given :class:`Trajectory` object.
+
+     This will result in a new :class:`Trajectory` object being created even
+     for the same underlying trajectory_files. Usually you do not want this as
+     it results in unnecessary calculations if the same wrapped and cached
+     function is applied to both objects. This function exists as a hidden
+     function as it is used when deleting a :class:`Trajectory` (i.e. calling
+     its `__del__` method) and it might be helpful under certain
+     circumstances. Use only if you know why you are using it!
+
+     Parameters
+     ----------
+     traj_hash : int
+         The hash of the :class:`Trajectory` to forget about.
+     """
+     global _TRAJECTORIES_BY_HASH
+     try:
+         del _TRAJECTORIES_BY_HASH[traj_hash]
+     except KeyError:
+         # not in there, do nothing
+         pass
+
+
+ class Trajectory:
+     """
+     Represent a trajectory.
+
+     Keep track of the paths of the trajectory and the structure files.
+     Caches values for (wrapped) functions acting on the trajectory.
+     Supports pickling and unpickling with the cached values restored, the
+     values will be written to a hidden numpy npz file next to the trajectory.
+     Supports equality checks with other :class:`Trajectory`.
+     Also makes available (and caches) a number of useful attributes, e.g.
+     ``first_step`` and ``last_step`` (the first and last integration step in
+     the trajectory), ``dt``, ``first_time``, ``last_time``,
+     ``length`` (in frames) and ``nstout``.
+
+     Notes
+     -----
+     ``first_step`` and ``last_step`` are only useful for trajectories that
+     come directly from a :class:`asyncmd.mdengine.MDEngine`.
+     As soon as the trajectory has been concatenated using MDAnalysis (e.g.
+     with the ``TrajectoryConcatenator``) the step information is just the
+     frame number in the trajectory part that became the first/last frame in
+     the concatenated trajectory.
+     """
+
+     def __init__(self, trajectory_files: typing.Union[list[str], str],
+                  structure_file: str,
+                  nstout: typing.Optional[int] = None,
+                  cache_type: typing.Optional[str] = None,
+                  **kwargs):
+         """
+         Initialize a :class:`Trajectory`.
+
+         Parameters
+         ----------
+         trajectory_files : list[str] or str
+             Absolute or relative path(s) to the trajectory file(s),
+             e.g. trr, xtc, dcd, ...
+         structure_file : str
+             Absolute or relative path to the structure file (e.g. tpr, gro).
+         nstout : int or None, optional
+             The output frequency used when creating the trajectory,
+             by default None.
+         cache_type : str or None, optional
+             The cache type for the CV values cached for this trajectory,
+             must be one of 'h5py', 'npz' or 'memory'.
+             If None we will use 'h5py' if a h5py cache has been registered
+             and if not fall back to 'npz'.
+             See also the ``asyncmd.config.register_h5py_cache()`` function.
+
+         Raises
+         ------
+         FileNotFoundError
+             If the ``trajectory_files`` or the ``structure_file`` are not
+             accessible.
+         """
+         # NOTE: we assume tra = trr and struct = tpr
+         #       but we also expect that anything which works for mdanalysis
+         #       as tra and struct should also work here as tra and struct
+         # TODO: currently we do not use kwargs?!
+         #dval = object()
+         #for kwarg, value in kwargs.items():
+         #    cval = getattr(self, kwarg, dval)
+         #    if cval is not dval:
+         #        if isinstance(value, type(cval)):
+         #            # value is of same type as default so set it
+         #            setattr(self, kwarg, value)
+         #        else:
+         #            logger.warn(f"Setting attribute {kwarg} with "
+         #                        + f"mismatching type ({type(value)}). "
+         #                        + f" Default type is {type(cval)}."
+         #                        )
+         #    else:
+         #        # not previously defined, so warn that we ignore it
+         #        logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
+         # NOTE: self._trajectory_files is set in __new__ because we otherwise
+         #       would sanitize the files twice, but we need to check in
+         #       __new__ to make pickling work
+         #       self._structure_file is also set in __new__ together with the
+         #       trajectory_files as we also sanitize its path
+         #       self._traj_hash and self._workdir are also set by __new__!
+         # self._trajectory_files
+         # self._structure_file
+         # self._workdir
+         # self._traj_hash
+         # properties
+         self.nstout = nstout  # use the setter to make basic sanity checks
+         self._len = None
+         self._first_step = None
+         self._last_step = None
+         self._dt = None
+         self._first_time = None
+         self._last_time = None
+         # stuff for caching of functions applied to this traj
+         self._memory_cache = None
+         self._npz_cache = None
+         self._h5py_cache = None
+         self._cache_type = None
+         # remember if we use the global default value,
+         # if yes we use the (possibly changed) global default when unpickling
+         self._using_default_cache_type = True
+         # use our property logic for checking the value
+         # (Note that self._traj_hash has already been set by __new__)
+         self.cache_type = cache_type
+         # Locking mechanism such that only one application of a specific
+         # CV func can run at any given time on this trajectory
+         self._semaphores_by_func_id = collections.defaultdict(
+             asyncio.BoundedSemaphore
+         )
+
+     def __new__(cls, trajectory_files: typing.Union[list[str], str],
+                 structure_file: str,
+                 nstout: typing.Optional[int] = None,
+                 cache_type: typing.Optional[str] = None,
+                 **kwargs):
+         global _TRAJECTORIES_BY_HASH  # our global traj registry
+         # see if old_workdir is given to sanitize file paths
+         old_workdir = kwargs.get("old_workdir", None)
+         # get cwd to get (and set) it only once for init and unpickle
+         current_workdir = os.path.abspath(os.getcwd())
+         trajectory_files, structure_file = Trajectory._sanitize_file_paths(
+             trajectory_files=trajectory_files,
+             structure_file=structure_file,
+             current_workdir=current_workdir,
+             old_workdir=old_workdir,
+         )
+         traj_hash = Trajectory._calc_traj_hash(trajectory_files)
+         try:
+             # see if we (i.e. a traj with the same hash) already exist
+             other_traj = _TRAJECTORIES_BY_HASH[traj_hash]
+             # if yes return 'ourself'
+             # (but make sure that the filepaths match even after a potential
+             #  change of workdir)
+             other_traj._trajectory_files = trajectory_files
+             other_traj._structure_file = structure_file
+             other_traj._workdir = current_workdir
+             return other_traj
+         except KeyError:
+             # not yet in there, so need to create us
+             # we just create cls so that we will be "created" by init or
+             # unpickled by setstate
+             # NOTE: we need to make sure that every attribute we set
+             #       below is not overwritten by setstate and/or init!
+             obj = super().__new__(cls)
+             # but set self._traj_hash so we don't recalculate it
+             obj._traj_hash = traj_hash
+             # and set self._trajectory_files so we don't sanitize twice
+             obj._trajectory_files = trajectory_files
+             # also set self._structure_file
+             obj._structure_file = structure_file
+             # and set self._workdir to the new value
+             # Note:
+             # we remember the current workdir to be able to unpickle as long
+             # as either the relpath between traj and old/new workdir does not
+             # change or the trajectory did not change its location but we
+             # changed workdir (we need the workdir only for the second option)
+             obj._workdir = current_workdir
+             # and add us to the global trajectory registry
+             _TRAJECTORIES_BY_HASH[traj_hash] = obj
+             return obj
+
+     #def __del__(self):
+     #    # TODO: running 'del traj' does not call this function,
+     #    #       it only decreases the reference count by one,
+     #    #       but since we still have the traj in the traj by hash
+     #    #       dictionary, i.e. we still have a reference, it will not
+     #    #       call __del__ which is only called when the reference count
+     #    #       reaches zero
+     #    _forget_trajectory(traj_hash=self.trajectory_hash)
+
+     @classmethod
+     def _sanitize_file_paths(cls,
+                              trajectory_files: typing.Union[list[str], str],
+                              structure_file: str,
+                              current_workdir: typing.Optional[str] = None,
+                              old_workdir: typing.Optional[str] = None,
+                              ) -> typing.Tuple[list[str], str]:
+         # NOTE: this returns relpath if no old_workdir is given and the traj
+         #       is accessible
+         #       if old_workdir is given (and the traj not accessible) it
+         #       (tries) to find the traj by assuming the traj did not change
+         #       place and we just need to add the "path_diff" from old to
+         #       new workdir to the path, if the file is then still not there
+         #       it raises a FileNotFoundError
+         # NOTE: (for pickling and aimmd storage behavior):
+         #       The above makes it possible to either change the workdir of
+         #       the python session OR change the location of the trajectories
+         #       as long as the relative path between trajectory and python
+         #       workdir does not change!
+         def sanitize_path(f, pathdiff=None):
+             if os.path.isfile(f):
+                 return os.path.relpath(f)
+             elif pathdiff is not None:
+                 f_diff = os.path.join(pathdiff, f)
+                 if os.path.isfile(f_diff):
+                     return os.path.relpath(f_diff)
+             # if we get here we could not find the file
+             err_msg = f"File {f} is not accessible"
+             if pathdiff is not None:
+                 err_msg += f" (we also tried {f_diff})."
+             else:
+                 err_msg += "."
+             raise FileNotFoundError(err_msg)
+
+         if old_workdir is not None:
+             if current_workdir is None:
+                 raise ValueError("'old_workdir' given but 'current_workdir' "
+                                  "was None.")
+             path_diff = os.path.relpath(old_workdir, current_workdir)
+         else:
+             path_diff = None
+
+         if isinstance(trajectory_files, str):
+             trajectory_files = [trajectory_files]
+
+         traj_files_sanitized = [sanitize_path(f=traj_f, pathdiff=path_diff)
+                                 for traj_f in trajectory_files
+                                 ]
+         struct_file_sanitized = sanitize_path(f=structure_file,
+                                               pathdiff=path_diff,
+                                               )
+
+         return traj_files_sanitized, struct_file_sanitized
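+
+     # Worked example for the path sanitization above (hypothetical paths):
+     # if the python session moved from old_workdir="/home/user/project" to
+     # current_workdir="/home/user", then
+     # path_diff = os.path.relpath("/home/user/project", "/home/user")
+     #           = "project",
+     # so a stored relative path "data/traj.trr" that is no longer found is
+     # retried as "project/data/traj.trr", which points at the unmoved file.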
+
+     @classmethod
+     def _calc_traj_hash(cls, trajectory_files):
+         # calculate a hash over the first and last part of the traj files
+         # (we use it to make sure the cached CV values match the traj)
+         # note that we do not include the structure file on purpose because
+         # that allows for changing .gro <-> .tpr or similar
+         # (which we expect to not change the calculated CV values)
+         # TODO: how much should we read?
+         #       (I [hejung] think the first and last .5 MB are enough)
+         data = bytes()
+         for traj_f in trajectory_files:
+             #data += traj_f.encode("utf-8")  # DONT include filepaths!...
+             fsize = os.stat(traj_f).st_size
+             data += str(fsize).encode("utf-8")
+             if fsize == 0:
+                 # Note: we could also just warn as long as we do not do the
+                 #       negative seek below if filesize == 0. However,
+                 #       mdanalysis throws errors for empty trajectories anyway
+                 raise ValueError(f"Trajectory file {traj_f} is of size 0.")
+             # read (at most) the first and last 512 bytes of each file
+             max_to_read = min((512, fsize))
+             with open(traj_f, "rb") as traj_file:
+                 # read the first bit of each file
+                 data += traj_file.read(max_to_read)
+                 # and read the last bit of each file
+                 # Note that the last bit potentially overlaps with the first
+                 traj_file.seek(-max_to_read, io.SEEK_END)
+                 data += traj_file.read(max_to_read)
+         # calculate one hash over all traj_files
+         traj_hash = int(hashlib.blake2b(data,
+                                         # digest size 8 bytes = 64 bit
+                                         # to make sure the hash fits into
+                                         # the npz as int64 and not object
+                                         digest_size=8).hexdigest(),
+                         base=16,
+                         )
+         return traj_hash
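+
+     # A standalone sketch of the hashing scheme above for a single file
+     # (hypothetical file name, file must be non-empty; for illustration
+     # only):
+     #
+     # >>> import io, os, hashlib
+     # >>> def sketch_hash(path):
+     # ...     fsize = os.stat(path).st_size
+     # ...     data = str(fsize).encode("utf-8")
+     # ...     n = min(512, fsize)
+     # ...     with open(path, "rb") as f:
+     # ...         data += f.read(n)        # first bytes
+     # ...         f.seek(-n, io.SEEK_END)
+     # ...         data += f.read(n)        # last bytes
+     # ...     return int(hashlib.blake2b(data, digest_size=8).hexdigest(), 16)
+     # >>> sketch_hash("run.xtc")  # stable under renames, changes with content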
+
+     @property
+     def cache_type(self):
+         """
+         String indicating the currently used cache type. Can also be (re)set.
+         """
+         return copy.copy(self._cache_type)
+
+     @cache_type.setter
+     def cache_type(self, value: typing.Optional[str]):
+         """
+         Set the cache type.
+
+         Parameters
+         ----------
+         value : str or None
+             Either a string indicating the type or None to choose the
+             preferred cache type from the available ones.
+             If a string it must be one of 'h5py', 'npz' or 'memory'.
+
+         Raises
+         ------
+         ValueError
+             Raised if value is not one of the available cache types.
+         """
+         if value is None:
+             use_default_cache_type = True
+             # find preferred cache type that is available
+             try:
+                 value = _GLOBALS["TRAJECTORY_FUNCTION_CACHE_TYPE"]
+             except KeyError:
+                 # no default cache type set
+                 # default to numpy npz
+                 value = "npz"
+         else:
+             use_default_cache_type = False
+             value = value.lower()
+         allowed_values = ["h5py", "npz", "memory"]
+         if value not in allowed_values:
+             raise ValueError("Given cache type must be `None` or one of "
+                              + f"{allowed_values}. Was: {value}.")
+         self._cache_type = value
+         self._using_default_cache_type = use_default_cache_type
+         self._setup_cache()
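+
+     # Switching caches at runtime is just a property assignment; previously
+     # cached values are transferred to the newly chosen cache by
+     # _setup_cache(). A short sketch (assuming a `traj` object as above):
+     #
+     # >>> traj.cache_type             # e.g. 'npz' (the fallback default)
+     # >>> traj.cache_type = "memory"  # move cached CV values into memory
+     # >>> traj.cache_type = "h5py"    # raises ValueError unless a h5py
+     # ...                             # cache was registered via asyncmd.config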
+
+     def _setup_cache(self) -> None:
+         # set up the cache indicated by self.cache_type and all others to
+         # None, also makes sure that all previously cached values are
+         # transferred to the newly set up cache
+         # NOTE: we set up an npz cache to see if there are any saved values
+         #       that we would want to add to the newly set up cache
+         #       We do this because upon pickling we save everything to npz
+         #       Note that we can just set self._npz_cache to this cache
+         #       because it is stateless (in the sense that if it existed it
+         #       would be exactly the same)
+         self._npz_cache = TrajectoryFunctionValueCacheNPZ(
+             fname_trajs=self.trajectory_files,
+             hash_traj=self._traj_hash,
+         )
+         if self._cache_type == "memory":
+             if self._memory_cache is None:
+                 self._memory_cache = TrajectoryFunctionValueCacheMEMORY()
+             else:
+                 # we already have a mem cache so just try to use it
+                 pass
+             if self._h5py_cache is not None:
+                 self._cache_content_to_new_cache(
+                     old_cache=self._h5py_cache,
+                     new_cache=self._memory_cache,
+                 )
+                 self._h5py_cache = None
+             self._cache_content_to_new_cache(
+                 old_cache=self._npz_cache,
+                 new_cache=self._memory_cache,
+             )
+             self._npz_cache = None
+         elif self._cache_type == "h5py":
+             try:
+                 h5py_cache = _GLOBALS["H5PY_CACHE"]
+             except KeyError as exc:
+                 raise ValueError(
+                     "No h5py cache file registered yet. Try calling "
+                     + "``asyncmd.config.register_h5py_cache_file()``"
+                     + " with the appropriate arguments first") from exc
+             if self._h5py_cache is None:
+                 # don't have one yet so set up the cache
+                 self._h5py_cache = TrajectoryFunctionValueCacheH5PY(
+                     h5py_cache=h5py_cache,
+                     hash_traj=self._traj_hash,
+                 )
+             else:
+                 # we already have a h5py cache...
+                 if self._h5py_cache.h5py_cache is h5py_cache:
+                     # and it is in the same file/group location
+                     # so we do nothing but making sure that all values from
+                     # other caches are transferred
+                     pass
+                 else:
+                     # let's copy the stuff from the old to the new h5py cache
+                     old_h5py_cache = self._h5py_cache
+                     self._h5py_cache = TrajectoryFunctionValueCacheH5PY(
+                         h5py_cache=h5py_cache,
+                         hash_traj=self._traj_hash,
+                     )
+                     self._cache_content_to_new_cache(
+                         old_cache=old_h5py_cache,
+                         new_cache=self._h5py_cache,
+                     )
+             # transfer all values from other cache types and empty them
+             if self._memory_cache is not None:
+                 self._cache_content_to_new_cache(
+                     old_cache=self._memory_cache,
+                     new_cache=self._h5py_cache,
+                 )
+                 self._memory_cache = None
+             self._cache_content_to_new_cache(
+                 old_cache=self._npz_cache,
+                 new_cache=self._h5py_cache,
+             )
+             self._npz_cache = None
+         elif self._cache_type == "npz":
+             if self._h5py_cache is not None:
+                 self._cache_content_to_new_cache(
+                     old_cache=self._h5py_cache,
+                     new_cache=self._npz_cache,
+                 )
+                 self._h5py_cache = None
+             if self._memory_cache is not None:
+                 self._cache_content_to_new_cache(
+                     old_cache=self._memory_cache,
+                     new_cache=self._npz_cache,
+                 )
+                 self._memory_cache = None
+         else:
+             raise RuntimeError("This should never happen. self._cache_type "
+                                + "must be one of 'memory', 'h5py', 'npz' when "
+                                + "self._setup_cache is called. "
+                                + f"Was {self._cache_type}.")
+
+     def _populate_properties(self) -> None:
+         """
+         Populate cached properties from the underlying trajectory.
+         """
+         # create/open a mdanalysis universe to get...
+         u = mda.Universe(self.structure_file, *self.trajectory_files)
+         # ...the number of frames
+         self._len = len(u.trajectory)
+         # ...the first integration step and time
+         ts = u.trajectory[0]
+         # FIXME: using None here means we will try to repopulate the
+         #        properties every time we access the step property for a
+         #        traj-format which does not have step data!
+         # TODO: which traj formats have step data set in MDAnalysis?
+         #       XTC and TRR have it for sure (with the wraparound issue)
+         self._first_step = ts.data.get("step", None)
+         self._first_time = ts.time
+         # ...the time diff between subsequent **frames** (not steps)
+         self._dt = ts.dt
+         # ...the last integration step and time
+         ts = u.trajectory[-1]
+         # TODO: which traj formats have step data set in MDAnalysis?
+         #       XTC and TRR have it for sure (with the wraparound issue)
+         self._last_step = ts.data.get("step", None)
+         self._last_time = ts.time
+         if all([t.lower().endswith((".xtc", ".trr"))
+                 for t in self.trajectory_files]):
+             self._fix_trr_xtc_step_wraparound(universe=u)
+         else:
+             # bail out if traj is not an XTC or TRR
+             logger.info("%s is not of type XTC or TRR. Not applying "
+                         "wraparound fix.", self)
+         # make sure the trajectory is closed by MDAnalysis
+         u.trajectory.close()
+         del u
+
+     def _fix_trr_xtc_step_wraparound(self, universe: mda.Universe) -> None:
+         # check/correct for wraparounds in the integration step numbers
+         # NOTE: fails if the trajectory has length = 1!
+         # NOTE: strictly speaking we should not assume wraparound behavior,
+         #       but it seems reasonable for the stepnum,
+         #       see e.g. https://www.airs.com/blog/archives/120
+         # all times are in picoseconds (as this is the MDAnalysis unit of
+         # time), we round integrator_dt and delta_t to a precision of
+         # 0.000001 ps = 0.001 fs = 1 as
+         # we do this to avoid accumulating floating point inaccuracies when
+         # dividing the times by integrator_dt, this should be reasonably
+         # safe for normal MD settings where integrator_dt should be on the
+         # order of 1-10 fs
+         if self._len == 1:
+             # bail out if the trajectory has length=1
+             # as we can not calculate dt if we only have one frame
+             logger.info("%s has only one frame. Can not correct for "
+                         "potential wraparound of the integration step.",
+                         self)
+             return  # bail out
+         # get the time offset for first and last frame, they need to match
+         # for our wraparound fix to work
+         ts = universe.trajectory[0]
+         time_offset = ts.data.get("time_offset", 0)
+         ts = universe.trajectory[-1]
+         if ts.data.get("time_offset", 0) != time_offset:
+             logger.info("Time offset of the first and last time in "
+                         "%s do not match. Not correcting for potential "
+                         "wraparound of the integration step.",
+                         self)
+             return  # bail out
+         delta_s = self._last_step - self._first_step
+         delta_t = round(self._last_time - self._first_time, ndigits=6)
+         # first make sure traj is continuous (i.e. not a concatenation where
+         # we carried over the time and step data from the original trajs)
+         n_frames = len(universe.trajectory)
+         n_max_samples = 100  # use at most 100 frames to check continuity
+         if n_frames > n_max_samples:
+             skip = n_frames // n_max_samples
+         else:
+             skip = 1
+         step_nums = [ts.data["step"] for ts in universe.trajectory[::skip]]
+         step_diffs = np.diff(step_nums)
+         first_diff = step_diffs[0]
+         if first_diff < 0:
+             # we possibly wrapped around at the first step
+             first_diff += 2**32
+         for diff in step_diffs[1:]:
+             if diff != first_diff:
+                 # bail out because traj is not continuous in time
+                 logger.debug("%s is not from one continuous propagation, "
+                              "i.e. the step difference between subsequent "
+                              "steps is not constant. Not applying TRR/XTC "
+                              "step wraparound fix and using step as read "
+                              "from the underlying trajectory.",
+                              self)
+                 return
+         # now the actual fix
+         if delta_s != 0:
+             if delta_s > 0:
+                 # both (last and first) wrapped around the same number of
+                 # times
+                 integrator_dt = round(delta_t / delta_s, ndigits=6)
+             else:  # delta_s < 0
+                 # last wrapped around one time more than first
+                 integrator_dt = round(delta_t / (delta_s + 2**32), ndigits=6)
+             # NOTE: should we round or floor? I (hejung) think round is what
+             #       we want, it will get us to the nearest int, which is good
+             #       if we e.g. have 0.99999999999 instead of 1
+             first_step = round((self._first_time - time_offset)
+                                / integrator_dt)
+             last_step = round((self._last_time - time_offset)
+                               / integrator_dt)
+             self._first_step = first_step
+             self._last_step = last_step
+         else:  # delta_s == 0
+             # we can only end up here if we have more than one frame in the
+             # trajectory **and** the first and last frame have the same
+             # integration step, which should be very rare and we can not
+             # correct anyway as the trajectory can not be from a continuous
+             # propagation, so we should not be able to end up here at all?
+             raise RuntimeError("This should not be possible?!")
+
+     def __len__(self) -> int:
+         """
+         Return the number of frames in the trajectory.
+
+         Returns
+         -------
+         int
+             The number of frames in the trajectory.
+         """
+         if self._len is None:
+             self._populate_properties()
+         return self._len
+
+     def __repr__(self) -> str:
+         if len(self.trajectory_files) == 1:
+             return (f"Trajectory(trajectory_files={self.trajectory_files[0]},"
+                     + f" structure_file={self.structure_file})"
+                     )
+         return (f"Trajectory(trajectory_files={self.trajectory_files},"
+                 + f" structure_file={self.structure_file})"
+                 )
+
+     def __eq__(self, other: object) -> bool:
+         if not isinstance(other, Trajectory):
+             # if it's not a trajectory it can't be equal
+             return False
+         if self.trajectory_hash != other.trajectory_hash:
+             # if it has a different hash it can't be equal
+             return False
+         # TODO: check for cached CV values? I (hejung) think it does not
+         #       really make sense...
+
+         # if we got until here the two trajs are equal
+         return True
+
+     def __ne__(self, other: object) -> bool:
+         return not self.__eq__(other=other)
+
+     @property
+     def structure_file(self) -> str:
+         """Return relative path to the structure file."""
+         return copy.copy(self._structure_file)
+
+     @property
+     def trajectory_files(self) -> list[str]:
+         """Return relative paths to the trajectory files."""
+         return copy.copy(self._trajectory_files)
+
+     @property
+     def trajectory_hash(self) -> int:
+         """Return the hash over the trajectory files."""
+         return copy.copy(self._traj_hash)
+
+     @property
+     def nstout(self) -> typing.Union[int, None]:
+         """Output frequency between subsequent frames in integration steps."""
+         return self._nstout
+
+     @nstout.setter
+     def nstout(self, val: typing.Union[int, None]) -> None:
+         if val is not None:
+             # ensure that it is an int
+             val = int(val)
+         # enable setting to None
+         self._nstout = val
+
+     @property
+     def first_step(self) -> int:
+         """Return the integration step of the first frame in the trajectory."""
+         if self._first_step is None:
+             self._populate_properties()
+         return self._first_step
+
+     @property
+     def last_step(self) -> int:
+         """Return the integration step of the last frame in the trajectory."""
+         if self._last_step is None:
+             self._populate_properties()
+         return self._last_step
+
+     @property
+     def dt(self) -> float:
+         """The time interval between subsequent *frames* (not steps) in ps."""
+         if self._dt is None:
+             self._populate_properties()
+         return self._dt
+
+     @property
+     def first_time(self) -> float:
+         """Return the time of the first frame in ps."""
+         if self._first_time is None:
+             self._populate_properties()
+         return self._first_time
+
+     @property
+     def last_time(self) -> float:
+         """Return the time of the last frame in ps."""
+         if self._last_time is None:
+             self._populate_properties()
+         return self._last_time
+
+     async def _apply_wrapped_func(self, func_id, wrapped_func):
+         async with self._semaphores_by_func_id[func_id]:
+             # sort out which cache we use
+             # NOTE: only one cache should ever be not None, so the order
+             #       should not matter here
+             #       anyway I (hejung) think this order is even what we want:
+             #       1.) use h5py cache if registered
+             #       2.) use npz cache (the default since h5py is not
+             #           registered if not set by the user)
+             #       3.) use memory/local cache (only if set on traj creation
+             #           or if set as default cache)
+             if self._h5py_cache is not None:
+                 return await self._apply_wrapped_func_cached(
+                     func_id=func_id,
+                     wrapped_func=wrapped_func,
+                     cache=self._h5py_cache,
+                 )
+             if self._npz_cache is not None:
+                 return await self._apply_wrapped_func_cached(
+                     func_id=func_id,
+                     wrapped_func=wrapped_func,
+                     cache=self._npz_cache,
+                 )
+             if self._memory_cache is not None:
+                 return await self._apply_wrapped_func_cached(
+                     func_id=func_id,
+                     wrapped_func=wrapped_func,
+                     cache=self._memory_cache,
+                 )
+             # if we get here we have no cache!
+             logger.warning("No cache associated with %s. Returning "
+                            "calculated function values anyway but no caching "
+                            "can/will be performed!",
+                            self,
+                            )
+             return await wrapped_func.get_values_for_trajectory(self)
+
+     async def _apply_wrapped_func_cached(
+         self, func_id: str, wrapped_func,
+         cache: collections.abc.Mapping[str, np.ndarray],
+     ):
+         try:
+             # see if it is in cache
+             return copy.copy(cache[func_id])
+         except KeyError:
+             # if not, calculate, store and return
+             # send the function application to a separate process and wait
+             # until it finishes
+             vals = await wrapped_func.get_values_for_trajectory(self)
+             cache.append(func_id=func_id, vals=vals)
+             return vals
747
+
748
+ def _cache_content_to_new_cache(
749
+ self,
750
+ old_cache: collections.abc.Mapping[str, np.ndarray],
751
+ new_cache: collections.abc.Mapping[str, np.ndarray],
752
+ ):
753
+ for func_id, values in old_cache.items():
754
+ if func_id in new_cache:
755
+ continue # dont try to add what is already in there
756
+ new_cache.append(func_id=func_id, vals=values)
757
+
758
+ def __getstate__(self):
759
+ # enable pickling of Trajectory
760
+ # this should make it possible to pass it into a ProcessPoolExecutor
761
+ # and lets us calculate TrajectoryFunction values asyncronously
762
+ state = self.__dict__.copy()
763
+ # NOTE: we always save to npz here and then we check for npz always
764
+ # when initializing a `new` trajectory and add all values to
765
+ # the then preferred cache
766
+ if self._npz_cache is None:
767
+ self._npz_cache = TrajectoryFunctionValueCacheNPZ(
768
+ fname_trajs=self.trajectory_files,
769
+ hash_traj=self._traj_hash,
770
+ )
771
+ if self._memory_cache is not None:
772
+ self._cache_content_to_new_cache(old_cache=self._memory_cache,
773
+ new_cache=self._npz_cache,
774
+ )
775
+ if self._h5py_cache is not None:
776
+ self._cache_content_to_new_cache(old_cache=self._h5py_cache,
777
+ new_cache=self._npz_cache,
778
+ )
779
+ # and set npz cache back to None since we have not been using it
780
+ self._npz_cache = None
781
+ state["_h5py_cache"] = None
782
+ state["_npz_cache"] = None
783
+ state["_memory_cache"] = None
784
+ state["_semaphores_by_func_id"] = collections.defaultdict(
785
+ asyncio.BoundedSemaphore
786
+ )
787
+ return state
788
+
789
+ def __setstate__(self, d: dict):
790
+ # remove the attributes we set in __new__ from dict
791
+ # (otherwise we would overwrite what we set in __new__)
792
+ del d["_trajectory_files"]
793
+ del d["_structure_file"]
794
+ del d["_traj_hash"]
795
+ try:
796
+ del d["_workdir"]
797
+ except KeyError:
798
+ # 'old' trajectory objects dont have a _workdir attribute
799
+ pass
800
+ # now we can update without overwritting what we set in __new__
801
+ self.__dict__.update(d)
802
+ # sort out which cache we were using (and which we will use now)
803
+ if self._using_default_cache_type:
804
+ # if we were using the global default when pickling use it now too
805
+ # Note that this will raise the ValueError from _setup_cache if
806
+ # no h5py cache has been registered but it is set as default
807
+ # (which is intended because it is the same behavior as when
808
+ # initializing a new trajectory in the same situation)
809
+ self.cache_type = None # this calls _setup_cache
810
+ return # get out of here, no need to setup the cache twice
811
+ if self.cache_type == "h5py":
812
+ # make sure h5py cache is set before trying to unpickle with it
813
+ try:
814
+ _ = _GLOBALS["H5PY_CACHE"]
815
+ except KeyError:
816
+ # this will (probably) fallback to npz but I (hejung) think it
817
+ # is nice if we use the possibly set global default?
818
+ # Note that this will not err but just emit the warning to log
819
+ # when we change the cache but it will err when the global
820
+ # default cache is set to h5py (as above)
821
+ logger.warning("Trying to unpickle %s with cache_type "
822
+ "'h5py' not possible without a registered "
823
+ "cache. Falling back to global default type."
824
+ "See 'asyncmd.config.register_h5py_cache' and"
825
+ " 'asyncmd.config.set_default_cache_type'.",
826
+ self
827
+ )
828
+ self.cache_type = None # this calls _setup_cache
829
+ return # get out of here, no need to setup the cache twice
830
+ # setup the cache for all cases where we are not using default cache
831
+ # (or had "h5py" but could not unpickle with "h5py" now [and are
832
+ # therefore also using the default])
833
+ self._setup_cache()
834
+
835
+ def __getnewargs_ex__(self):
836
+ # new needs the trajectory_files to be able to calculate the traj_hash
837
+ # and since we want __new__ to have the same call signature as __init__
838
+ # we also add all the init args here too
839
+ return ((), {"trajectory_files": self.trajectory_files,
840
+ "structure_file": self.structure_file,
841
+ "nstout": self.nstout,
842
+ "cache_type": self.cache_type,
843
+ "old_workdir": self._workdir,
844
+ })
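+
+     # Pickling round-trip sketch (for illustration; assumes a `traj` object
+     # as above): cached function values survive because __getstate__ dumps
+     # them to the hidden npz file next to the trajectory, and
+     # __getnewargs_ex__/__new__ find the same registered object again via
+     # the trajectory hash (within the same python session).
+     #
+     # >>> import pickle
+     # >>> data = pickle.dumps(traj)
+     # >>> restored = pickle.loads(data)
+     # >>> restored is traj  # same hash -> same object from the registry
+     # True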
+
+
+ class TrajectoryFunctionValueCacheMEMORY(collections.abc.Mapping):
+     """
+     Interface for caching trajectory function values in memory in a dict.
+     """
+
+     def __init__(self, *args, **kwargs) -> None:
+         """Initialize a `TrajectoryFunctionValueCacheMEMORY`."""
+         self._func_values_by_id = {}
+
+     def __len__(self) -> int:
+         return len(self._func_values_by_id)
+
+     def __iter__(self):
+         return self._func_values_by_id.__iter__()
+
+     def __getitem__(self, key: str) -> np.ndarray:
+         if not isinstance(key, str):
+             raise TypeError("Keys must be of type str.")
+         return self._func_values_by_id[key]
+
+     def append(self, func_id: str, vals: np.ndarray) -> None:
+         if not isinstance(func_id, str):
+             raise TypeError("func_id must be of type str.")
+         if func_id in self._func_values_by_id:
+             # first check if it is already in there
+             raise ValueError("There are already values stored for func_id "
+                              + f"{func_id}. Changing the stored values is "
+                              + "not supported.")
+         self._func_values_by_id[func_id] = vals
876
+
877
+
878
+ class TrajectoryFunctionValueCacheNPZ(collections.abc.Mapping):
879
+ """
880
+ Interface for caching trajectory function values in a numpy npz file.
881
+
882
+ Drop-in replacement for the dictionary that is used for in-memory caching.
883
+ """
884
+
885
+ _hash_traj_npz_key = "hash_of_trajs" # key of hash_traj in npz file
886
+
887
+ # NOTE: this is written with the assumption that stored trajectories are
888
+ # immutable (except for adding additional stored function values)
889
+ # but we assume that the actual underlying trajectory stays the same,
890
+ # i.e. it is not extended after first storing it
891
+ # If it changes between two npz-cache initializiations, it will have
892
+ # a different traj-hash and all cached CV values will be recalculated
893
+
894
+ # NOTE: npz appending inspired by: https://stackoverflow.com/a/66618141
895
+
896
+ # NOTE/FIXME: It would be nice to use the MAX_FILES_OPEN semaphore
897
+ # but then we need async/await and then we need to go to a 'create'
898
+ # classmethod that is async and required for initialization
899
+ # (because __init__ cant be async)
900
+ # but since we (have to) open the npz file in the other magic methods
901
+ # too it does not really matter (as they can not be async either)?
902
+ # ...and as we also leave some room for non-semaphored file openings anyway
903
+
904
+ def __init__(self, fname_trajs: list[str], hash_traj: int) -> None:
905
+ """
906
+ Initialize a `TrajectoryFunctionValueCacheNPZ`.
907
+
908
+ Parameters
909
+ ----------
910
+ fname_trajs : list[str]
911
+ Absolute filenames to the trajectories for which we cache CV values.
912
+ hash_traj : int
913
+ Hash over the first part of the trajectory file,
914
+ used to make sure we cache only for the right trajectory
915
+ (and not any trajectories with the same filename).
916
+ """
917
+ self.fname_npz = self._get_cache_filename(fname_trajs=fname_trajs,
918
+ trajectory_hash=hash_traj,
919
+ )
920
+ self._hash_traj = hash_traj
921
+ self._func_ids = []
922
+ # sort out if we have an associated npz file already
923
+ # and if it is from/for the "right" trajectory file
924
+ self._ensure_consistent_npz()
925
+
926
+ def _ensure_consistent_npz(self):
927
+ # next line makes sure we only remember func_ids from the current npz
928
+ self._func_ids = []
929
+ if not os.path.isfile(self.fname_npz):
930
+ # no npz so nothing to do except making sure we have no func_ids
931
+ return
932
+ existing_npz_matches = False
933
+ with np.load(self.fname_npz, allow_pickle=False) as npzfile:
934
+ try:
935
+ saved_hash_traj = npzfile[self._hash_traj_npz_key][0]
936
+ except KeyError:
937
+ # we probably tripped over an old formatted npz
938
+ # so we will just rewrite it completely with hash
939
+ pass
940
+ else:
941
+ # old hash found, lets compare the two hashes
942
+ existing_npz_matches = (self._hash_traj == saved_hash_traj)
943
+ if existing_npz_matches:
944
+ # if they do populate self with the func_ids we have
945
+ # cached values for
946
+ for k in npzfile.keys():
947
+ if k != self._hash_traj_npz_key:
948
+ self._func_ids.append(str(k))
949
+ # now if the old npz did not match we should remove it
950
+ # then we will rewrite it with the first cached CV values
951
+ if not existing_npz_matches:
952
+ logger.debug("Found existing npz file (%s) but the"
953
+ " trajectory hash does not match."
954
+ " Recreating the npz cache from scratch.",
955
+ self.fname_npz
956
+ )
957
+ os.unlink(self.fname_npz)
958
+
959
+ @classmethod
960
+ def _get_cache_filename(cls, fname_trajs: list[str],
961
+ trajectory_hash: int) -> str:
962
+ """
963
+ Construct cachefilename from trajectory fname.
964
+
965
+ Parameters
966
+ ----------
967
+ fname_trajs : list[str]
968
+ Path to the trajectory for which we cache.
969
+ trajectory_hash : int
970
+ Hash of the trajectory (files).
971
+
972
+ Returns
973
+ -------
974
+ str
975
+ Path to the cachefile associated with trajectory.
976
+ """
977
+ head, tail = os.path.split(fname_trajs[0])
978
+ return os.path.join(head,
979
+ f".{tail}{'_MULTIPART' if len(fname_trajs) > 1 else ''}_asyncmd_cv_cache.npz"
980
+ )
981
+
982
+ def __len__(self) -> int:
983
+ return len(self._func_ids)
984
+
985
+ def __iter__(self):
986
+ for func_id in self._func_ids:
987
+ yield func_id
988
+
989
+ def __getitem__(self, key: str) -> np.ndarray:
990
+ if not isinstance(key, str):
991
+ raise TypeError("Keys must be of type str.")
992
+ if key in self._func_ids:
993
+ with np.load(self.fname_npz, allow_pickle=False) as npzfile:
994
+ return npzfile[key]
995
+ else:
996
+ raise KeyError(f"No values for {key} cached (yet).")
997
+
998
+ def append(self, func_id: str, vals: np.ndarray) -> None:
999
+ """
1000
+ Append values for given func_id.
1001
+
1002
+ Parameters
1003
+ ----------
1004
+ func_id : str
1005
+ Function identifier.
1006
+ vals : np.ndarray
1007
+ Values of application of function with given func_id.
1008
+
1009
+ Raises
1010
+ ------
1011
+ TypeError
1012
+ If ``func_id`` is not a string.
1013
+ ValueError
1014
+ If there are already values stored for ``func_id`` in self.
1015
+ """
1016
+ if not isinstance(func_id, str):
1017
+ raise TypeError("func_id must be of type str.")
1018
+ if func_id in self._func_ids:
1019
+ # first check if it already in there
1020
+ raise ValueError("There are already values stored for func_id "
1021
+ + f"{func_id}. Changing the stored values is not "
1022
+ + "supported.")
1023
+ if len(self) == 0:
1024
+ # these are the first cached CV values for this traj
1025
+ # so we just create the (empty) npz file
1026
+ np.savez(self.fname_npz)
1027
+ # and write the trajectory hash
1028
+ self._append_data_to_npz(name=self._hash_traj_npz_key,
1029
+ value=np.array([self._hash_traj]),
1030
+ )
1031
+ # now we can append either way
1032
+ # either already something cached, or freshly created empty file
1033
+ self._append_data_to_npz(name=func_id, value=vals)
1034
+ # add func_id to list of func_ids that we know are cached in npz
1035
+ self._func_ids.append(func_id)
1036
+
1037
+ def _append_data_to_npz(self, name: str, value: np.ndarray) -> None:
1038
+ # npz files are just zipped together collections of npy files
1039
+ # so we just make a npy file saved into a BytesIO and then write that
1040
+ # to the end of the npz file
1041
+ bio = io.BytesIO()
1042
+ np.save(bio, value)
1043
+ with zipfile.ZipFile(file=self.fname_npz,
1044
+ mode="a", # append!
1045
+ # uncompressed (but) zip archive member
1046
+ compression=zipfile.ZIP_STORED,
1047
+ ) as zfile:
1048
+ zfile.writestr(f"{name}.npy", data=bio.getvalue())
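+
+     # A standalone sketch of the npz-append trick used above (hypothetical
+     # file name, for illustration): an .npz is a zip of .npy members, so
+     # new arrays can be appended without rewriting the existing ones.
+     #
+     # >>> import io, zipfile
+     # >>> import numpy as np
+     # >>> np.savez("cache.npz", a=np.arange(3))  # an existing npz file
+     # >>> bio = io.BytesIO()
+     # >>> np.save(bio, np.ones(2))               # serialize one npy member
+     # >>> with zipfile.ZipFile("cache.npz", mode="a",
+     # ...                      compression=zipfile.ZIP_STORED) as zf:
+     # ...     zf.writestr("b.npy", data=bio.getvalue())
+     # >>> sorted(np.load("cache.npz").keys())
+     # ['a', 'b']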
+
+
+ class TrajectoryFunctionValueCacheH5PY(collections.abc.Mapping):
+     """
+     Interface for caching trajectory function values in a given h5py group.
+
+     Drop-in replacement for the dictionary that is used for in-memory
+     caching.
+     """
+
+     # NOTE: this is written with the assumption that stored trajectories are
+     #       immutable (except for adding additional stored function values)
+     #       but we assume that the actual underlying trajectory stays the
+     #       same, i.e. it is not extended after first storing it
+
+     def __init__(self, h5py_cache, hash_traj: int):
+         self.h5py_cache = h5py_cache
+         self._hash_traj = hash_traj
+         self._h5py_paths = {"ids": "FunctionIDs",
+                             "vals": "FunctionValues",
+                             }
+         self._root_grp = h5py_cache.require_group(
+             "asyncmd/"
+             + "TrajectoryFunctionValueCache/"
+             + f"{self._hash_traj}"
+         )
+         self._ids_grp = self._root_grp.require_group(self._h5py_paths["ids"])
+         self._vals_grp = self._root_grp.require_group(self._h5py_paths["vals"])
+
+     def __len__(self):
+         return len(self._ids_grp.keys())
+
+     def __iter__(self):
+         for idx in range(len(self)):
+             yield self._ids_grp[str(idx)].asstr()[()]
+
+     def __getitem__(self, key):
+         if not isinstance(key, str):
+             raise TypeError("Keys must be of type str.")
+         for idx, k_val in enumerate(self):
+             if key == k_val:
+                 return self._vals_grp[str(idx)][:]
+         # if we got until here the key is not in there
+         raise KeyError("Key not found.")
+
+     def append(self, func_id, vals):
+         if not isinstance(func_id, str):
+             raise TypeError("Keys (func_id) must be of type str.")
+         if func_id in self:
+             raise ValueError("There are already values stored for func_id "
+                              + f"{func_id}. Changing the stored values is "
+                              + "not supported.")
+         # TODO: do we also want to check vals for type?
+         name = str(len(self))
+         _ = self._ids_grp.create_dataset(name, data=func_id)
+         _ = self._vals_grp.create_dataset(name, data=vals)