asyncmd-0.3.2-py3-none-any.whl
- asyncmd/__init__.py +18 -0
- asyncmd/_config.py +26 -0
- asyncmd/_version.py +75 -0
- asyncmd/config.py +203 -0
- asyncmd/gromacs/__init__.py +16 -0
- asyncmd/gromacs/mdconfig.py +351 -0
- asyncmd/gromacs/mdengine.py +1127 -0
- asyncmd/gromacs/utils.py +197 -0
- asyncmd/mdconfig.py +440 -0
- asyncmd/mdengine.py +100 -0
- asyncmd/slurm.py +1199 -0
- asyncmd/tools.py +86 -0
- asyncmd/trajectory/__init__.py +25 -0
- asyncmd/trajectory/convert.py +577 -0
- asyncmd/trajectory/functionwrapper.py +556 -0
- asyncmd/trajectory/propagate.py +937 -0
- asyncmd/trajectory/trajectory.py +1103 -0
- asyncmd/utils.py +148 -0
- asyncmd-0.3.2.dist-info/LICENSE +232 -0
- asyncmd-0.3.2.dist-info/METADATA +179 -0
- asyncmd-0.3.2.dist-info/RECORD +23 -0
- asyncmd-0.3.2.dist-info/WHEEL +5 -0
- asyncmd-0.3.2.dist-info/top_level.txt +1 -0
asyncmd/trajectory/trajectory.py
@@ -0,0 +1,1103 @@
# This file is part of asyncmd.
#
# asyncmd is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# asyncmd is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with asyncmd. If not, see <https://www.gnu.org/licenses/>.
import io
import os
import copy
import typing
import asyncio
import hashlib
import logging
import zipfile
import collections
import numpy as np
import MDAnalysis as mda


from .._config import _GLOBALS


logger = logging.getLogger(__name__)


# dictionary in which we keep track of trajectory objects
# we use it to always return the *same* object for the same trajectory (by hash)
# this makes it easy to ensure that we never calculate CV functions twice
_TRAJECTORIES_BY_HASH = {}


def _forget_all_trajectories() -> None:
    """
    Forget about the existence of all :class:`Trajectory` objects.

    This will result in new :class:`Trajectory` objects being created even for
    the same underlying trajectory_files. Usually you do not want this as it
    results in unnecessary calculations if the same wrapped and cached function
    is applied to both objects. This function exists as a hidden function as it
    is used in the tests and it might be helpful under certain circumstances.
    Use only if you know why you are using it!
    """
    global _TRAJECTORIES_BY_HASH
    all_keys = set(_TRAJECTORIES_BY_HASH.keys())
    for key in all_keys:
        del _TRAJECTORIES_BY_HASH[key]


def _forget_trajectory(traj_hash: int) -> None:
    """
    Forget about the existence of a given :class:`Trajectory` object.

    This will result in new :class:`Trajectory` objects being created even for
    the same underlying trajectory_files. Usually you do not want this as it
    results in unnecessary calculations if the same wrapped and cached function
    is applied to both objects. This function exists as a hidden function as it
    is used when deleting a :class:`Trajectory` (i.e. calling its `__del__`
    method) and it might be helpful under certain circumstances. Use only if
    you know why you are using it!

    Parameters
    ----------
    traj_hash : int
        The hash of the :class:`Trajectory` to forget about.
    """
    global _TRAJECTORIES_BY_HASH
    try:
        del _TRAJECTORIES_BY_HASH[traj_hash]
    except KeyError:
        # not in there, do nothing
        pass

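# Usage sketch of the registry above (illustrative only; "traj.xtc" and
# "top.tpr" are hypothetical files assumed to exist): constructing the same
# trajectory twice returns the identical, interned object, and forgetting
# resets this.
#
#   t1 = Trajectory(trajectory_files="traj.xtc", structure_file="top.tpr")
#   t2 = Trajectory(trajectory_files="traj.xtc", structure_file="top.tpr")
#   assert t1 is t2             # same hash -> same object from the registry
#   _forget_all_trajectories()
#   t3 = Trajectory(trajectory_files="traj.xtc", structure_file="top.tpr")
#   assert t3 is not t1         # registry cleared -> a new object
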
class Trajectory:
    """
    Represent a trajectory.

    Keep track of the paths of the trajectory and the structure files.
    Caches values for (wrapped) functions acting on the trajectory.
    Supports pickling and unpickling with the cached values restored, the
    values will be written to a hidden numpy npz file next to the trajectory.
    Supports equality checks with other :class:`Trajectory`.
    Also makes available (and caches) a number of useful attributes, e.g.
    ``first_step`` and ``last_step`` (the first and last integration step in
    the trajectory), ``dt``, ``first_time``, ``last_time``,
    ``length`` (in frames) and ``nstout``.

    Notes
    -----
    ``first_step`` and ``last_step`` are only useful for trajectories that
    come directly from a :class:`asyncmd.mdengine.MDEngine`.
    As soon as the trajectory has been concatenated using MDAnalysis (e.g.
    with the ``TrajectoryConcatenator``) the step information is just the
    frame number in the trajectory part that became first/last frame in the
    concatenated trajectory.
    """

    def __init__(self, trajectory_files: typing.Union[list[str], str],
                 structure_file: str,
                 nstout: typing.Optional[int] = None,
                 cache_type: typing.Optional[str] = None,
                 **kwargs):
        """
        Initialize a :class:`Trajectory`.

        Parameters
        ----------
        trajectory_files : list[str] or str
            Absolute or relative path(s) to the trajectory file(s),
            e.g. trr, xtc, dcd, ...
        structure_file : str
            Absolute or relative path to the structure file (e.g. tpr, gro).
        nstout : int or None, optional
            The output frequency used when creating the trajectory,
            by default None.
        cache_type : str or None, optional
            The cache type for the CV values cached for this trajectory,
            must be one of 'h5py', 'npz' or 'memory'.
            If None we will use 'h5py' if a h5py cache has been registered
            and if not fall back to 'npz'.
            See also the ``asyncmd.config.register_h5py_cache()`` function.

        Raises
        ------
        FileNotFoundError
            If the ``trajectory_files`` or the ``structure_file`` are not
            accessible.
        """
        # NOTE: we assume tra = trr and struct = tpr
        #       but we also expect that anything which works for mdanalysis
        #       as tra and struct should also work here as tra and struct
        # TODO: currently we do not use kwargs?!
        #dval = object()
        #for kwarg, value in kwargs.items():
        #    cval = getattr(self, kwarg, dval)
        #    if cval is not dval:
        #        if isinstance(value, type(cval)):
        #            # value is of same type as default so set it
        #            setattr(self, kwarg, value)
        #        else:
        #            logger.warn(f"Setting attribute {kwarg} with "
        #                        + f"mismatching type ({type(value)}). "
        #                        + f" Default type is {type(cval)}."
        #                        )
        #    else:
        #        # not previously defined, so warn that we ignore it
        #        logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
        # NOTE: self._trajectory_files is set in __new__ because we otherwise
        #       would sanitize the files twice, but we need to check in
        #       __new__ to make pickling work
        #       self._structure_file is also set in __new__ together with the
        #       trajectory_files as we also sanitize its path
        #       self._traj_hash and self._workdir are also set by __new__!
        # self._trajectory_files
        # self._structure_file
        # self._workdir
        # self._traj_hash
        # properties
        self.nstout = nstout  # use the setter to make basic sanity checks
        self._len = None
        self._first_step = None
        self._last_step = None
        self._dt = None
        self._first_time = None
        self._last_time = None
        # stuff for caching of functions applied to this traj
        self._memory_cache = None
        self._npz_cache = None
        self._h5py_cache = None
        self._cache_type = None
        # remember if we use the global default value,
        # if yes we use the (possibly changed) global default when unpickling
        self._using_default_cache_type = True
        # use our property logic for checking the value
        # (Note that self._trajectory_hash has already been set by __new__)
        self.cache_type = cache_type
        # Locking mechanism such that only one application of a specific
        # CV func can run at any given time on this trajectory
        self._semaphores_by_func_id = collections.defaultdict(
            asyncio.BoundedSemaphore
        )

    def __new__(cls, trajectory_files: typing.Union[list[str], str],
                structure_file: str,
                nstout: typing.Optional[int] = None,
                cache_type: typing.Optional[str] = None,
                **kwargs):
        global _TRAJECTORIES_BY_HASH  # our global traj registry
        # see if old_workdir is given to sanitize file paths
        old_workdir = kwargs.get("old_workdir", None)
        # get cwd to get (and set) it only once for init and unpickle
        current_workdir = os.path.abspath(os.getcwd())
        trajectory_files, structure_file = Trajectory._sanitize_file_paths(
            trajectory_files=trajectory_files,
            structure_file=structure_file,
            current_workdir=current_workdir,
            old_workdir=old_workdir,
        )
        traj_hash = Trajectory._calc_traj_hash(trajectory_files)
        try:
            # see if we (i.e. a traj with the same hash) are already existing
            other_traj = _TRAJECTORIES_BY_HASH[traj_hash]
            # if yes return 'ourself'
            # (but make sure that the filepaths match even after a potential
            #  change of workdir)
            other_traj._trajectory_files = trajectory_files
            other_traj._structure_file = structure_file
            other_traj._workdir = current_workdir
            return other_traj
        except KeyError:
            # not yet in there, so need to create us
            # we just create cls so that we will be "created" by init or
            # unpickled by setstate
            # NOTE: we need to make sure that every attribute we set
            #       below is not overwritten by setstate and/or init!
            obj = super().__new__(cls)
            # but set self._traj_hash so we don't recalculate it
            obj._traj_hash = traj_hash
            # and set self._trajectory_files so we don't sanitize twice
            obj._trajectory_files = trajectory_files
            # also set self._structure_file
            obj._structure_file = structure_file
            # and set self._workdir to the new value
            # Note:
            # we remember the current workdir to be able to unpickle as long
            # as either the relpath between traj and old/new workdir does not
            # change, or the trajectory did not change its location but we
            # changed workdir (we need the workdir only for the second option)
            obj._workdir = current_workdir
            # and add us to the global trajectory registry
            _TRAJECTORIES_BY_HASH[traj_hash] = obj
            return obj

    #def __del__(self):
    #    TODO: running 'del traj' does not call this function,
    #          it only decreases the reference count by one,
    #          but since we still have the traj in the traj by hash
    #          dictionary, i.e. we still have a reference, it will not call
    #          __del__ which is only called when the reference count reaches
    #          zero
    #    _forget_trajectory(traj_hash=self.trajectory_hash)

    @classmethod
    def _sanitize_file_paths(cls,
                             trajectory_files: typing.Union[list[str], str],
                             structure_file: str,
                             current_workdir: typing.Optional[str] = None,
                             old_workdir: typing.Optional[str] = None,
                             ) -> typing.Tuple[list[str], str]:
        # NOTE: this returns relpath if no old_workdir is given and the traj
        #       is accessible
        #       if old_workdir is given (and the traj not accessible) it
        #       (tries) to find the traj by assuming the traj did not change
        #       place and we just need to add the "path_diff" from old to new
        #       workdir to the path, if the file is then still not there it
        #       raises a FileNotFoundError
        # NOTE: (for pickling and aimmd storage behavior):
        #       The above makes it possible to either change the workdir of
        #       the python session OR change the location of the trajectories
        #       as long as the relative path between trajectory and python
        #       workdir does not change!
        def sanitize_path(f, pathdiff=None):
            if os.path.isfile(f):
                return os.path.relpath(f)
            elif pathdiff is not None:
                f_diff = os.path.join(pathdiff, f)
                if os.path.isfile(f_diff):
                    return os.path.relpath(f_diff)
            # if we get until here we can't find the file
            err_msg = f"File {f} is not accessible"
            if pathdiff is not None:
                err_msg += f" (we also tried {f_diff})."
            else:
                err_msg += "."
            raise FileNotFoundError(err_msg)

        if old_workdir is not None:
            if current_workdir is None:
                raise ValueError("'old_workdir' given but 'current_workdir' "
                                 "was None.")
            path_diff = os.path.relpath(old_workdir, current_workdir)
        else:
            path_diff = None

        if isinstance(trajectory_files, str):
            trajectory_files = [trajectory_files]

        traj_files_sanitized = [sanitize_path(f=traj_f, pathdiff=path_diff)
                                for traj_f in trajectory_files
                                ]
        struct_file_sanitized = sanitize_path(f=structure_file,
                                              pathdiff=path_diff,
                                              )

        return traj_files_sanitized, struct_file_sanitized

    @classmethod
    def _calc_traj_hash(cls, trajectory_files):
        # calculate a hash over the first and last part of the traj files
        # (we use it to make sure the cached CV values match the traj)
        # note that we do not include the structure file on purpose because
        # that allows for changing .gro <-> .tpr or similar
        # (which we expect to not change the calculated CV values)
        # TODO: how much should we read?
        #       (I [hejung] think the first and last .5 MB are enough)
        data = bytes()
        for traj_f in trajectory_files:
            #data += traj_f.encode("utf-8")  # DONT include filepaths!...
            fsize = os.stat(traj_f).st_size
            data += str(fsize).encode("utf-8")
            if fsize == 0:
                # Note: we could also just warn as long as we do not do the
                #       negative seek below if filesize == 0. However,
                #       mdanalysis throws errors for empty trajectories anyway
                raise ValueError(f"Trajectory file {traj_f} is of size 0.")
            # read (at most) the first and last bit of each file
            max_to_read = min((512, fsize))
            with open(traj_f, "rb") as traj_file:
                # read the first bit of each file
                data += traj_file.read(max_to_read)
                # and read the last bit of each file
                # Note that the last bit potentially overlaps with the first
                traj_file.seek(-max_to_read, io.SEEK_END)
                data += traj_file.read(max_to_read)
        # calculate one hash over all traj_files
        traj_hash = int(hashlib.blake2b(data,
                                        # digest size 8 bytes = 64 bit
                                        # to make sure the hash fits into
                                        # the npz as int64 and not object
                                        digest_size=8).hexdigest(),
                        base=16,
                        )
        return traj_hash

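    # Hash-identity sketch (illustrative only; "a.xtc", "b.xtc" and the
    # structure files are hypothetical, with a.xtc and b.xtc assumed to be
    # byte-identical copies). Because the hash covers only file size plus
    # the first/last bytes of the trajectory files and excludes the
    # structure file, copies of the same data are interned to one object:
    #
    #   t_a = Trajectory("a.xtc", "top.tpr")
    #   t_b = Trajectory("b.xtc", "top.gro")  # other name/structure format
    #   assert t_a is t_b  # same traj_hash -> same object from the registry
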
    @property
    def cache_type(self):
        """
        String indicating the currently used cache type. Can also be (re)set.
        """
        return copy.copy(self._cache_type)

    @cache_type.setter
    def cache_type(self, value: typing.Optional[str]):
        """
        Set the cache type.

        Parameters
        ----------
        value : str or None
            Either a string indicating the type or None to choose the
            preferred cache type from the available ones.
            If a string it must be one of 'h5py', 'npz' or 'memory'.

        Raises
        ------
        ValueError
            Raised if value is not one of the available cache types.
        """
        if value is None:
            use_default_cache_type = True
            # find preferred cache type that is available
            try:
                value = _GLOBALS["TRAJECTORY_FUNCTION_CACHE_TYPE"]
            except KeyError:
                # no default cache type set
                # default to numpy npz
                value = "npz"
        else:
            use_default_cache_type = False
        value = value.lower()
        allowed_values = ["h5py", "npz", "memory"]
        if value not in allowed_values:
            raise ValueError("Given cache type must be `None` or one of "
                             + f"{allowed_values}. Was: {value}.")
        self._cache_type = value
        self._using_default_cache_type = use_default_cache_type
        self._setup_cache()

    def _setup_cache(self) -> None:
        # set up the cache indicated by self.cache_type and all others to None
        # also makes sure that all previously cached values are transferred
        # to the newly setup cache
        # NOTE: we setup an npz cache to see if there are any saved values
        #       that we would want to add to the newly setup cache
        #       We do this because upon pickling we save everything to npz
        #       Note that we can just set self._npz to this cache because it
        #       is stateless (in the sense that if it existed it would be
        #       exactly the same)
        self._npz_cache = TrajectoryFunctionValueCacheNPZ(
            fname_trajs=self.trajectory_files,
            hash_traj=self._traj_hash,
        )
        if self._cache_type == "memory":
            if self._memory_cache is None:
                self._memory_cache = TrajectoryFunctionValueCacheMEMORY()
            else:
                # we already have a mem cache so just try to use it
                pass
            if self._h5py_cache is not None:
                self._cache_content_to_new_cache(
                    old_cache=self._h5py_cache,
                    new_cache=self._memory_cache,
                )
                self._h5py_cache = None
            self._cache_content_to_new_cache(
                old_cache=self._npz_cache,
                new_cache=self._memory_cache,
            )
            self._npz_cache = None
        elif self._cache_type == "h5py":
            try:
                h5py_cache = _GLOBALS["H5PY_CACHE"]
            except KeyError as exc:
                raise ValueError(
                    "No h5py cache file registered yet. Try calling "
                    + "``asyncmd.config.register_h5py_cache_file()``"
                    + " with the appropriate arguments first") from exc
            if self._h5py_cache is None:
                # don't have one yet so set up the cache
                self._h5py_cache = TrajectoryFunctionValueCacheH5PY(
                    h5py_cache=h5py_cache,
                    hash_traj=self._traj_hash,
                )
            else:
                # we already have a h5py cache...
                if self._h5py_cache.h5py_cache is h5py_cache:
                    # and it is in the same file/group location
                    # so we do nothing but making sure that all values from
                    # other caches are transferred
                    pass
                else:
                    # let's copy the stuff from the old to the new h5py cache
                    old_h5py_cache = self._h5py_cache
                    self._h5py_cache = TrajectoryFunctionValueCacheH5PY(
                        h5py_cache=h5py_cache,
                        hash_traj=self._traj_hash,
                    )
                    self._cache_content_to_new_cache(
                        old_cache=old_h5py_cache,
                        new_cache=self._h5py_cache,
                    )
            # transfer all values from other cache types and empty them
            if self._memory_cache is not None:
                self._cache_content_to_new_cache(
                    old_cache=self._memory_cache,
                    new_cache=self._h5py_cache,
                )
                self._memory_cache = None
            self._cache_content_to_new_cache(
                old_cache=self._npz_cache,
                new_cache=self._h5py_cache,
            )
            self._npz_cache = None
        elif self._cache_type == "npz":
            if self._h5py_cache is not None:
                self._cache_content_to_new_cache(
                    old_cache=self._h5py_cache,
                    new_cache=self._npz_cache,
                )
                self._h5py_cache = None
            if self._memory_cache is not None:
                self._cache_content_to_new_cache(
                    old_cache=self._memory_cache,
                    new_cache=self._npz_cache,
                )
                self._memory_cache = None
        else:
            raise RuntimeError("This should never happen. self._cache_type "
                               + "must be one of 'memory', 'h5py', 'npz' when "
                               + "self._setup_cache is called. "
                               + f"Was {self._cache_type}.")

    def _populate_properties(self) -> None:
        """
        Populate cached properties from the underlying trajectory.
        """
        # create/open a mdanalysis universe to get...
        u = mda.Universe(self.structure_file, *self.trajectory_files)
        # ...the number of frames
        self._len = len(u.trajectory)
        # ...the first integration step and time
        ts = u.trajectory[0]
        # FIXME: using None here means we will try to repopulate the
        #        properties every time we access the step property for a
        #        traj-format which does not have step data!
        # TODO: which traj formats have step data set in MDAnalysis?
        #       XTC and TRR have it for sure (with the wraparound issue)
        self._first_step = ts.data.get("step", None)
        self._first_time = ts.time
        # ...the time diff between subsequent **frames** (not steps)
        self._dt = ts.dt
        # ...the last integration step and time
        ts = u.trajectory[-1]
        # TODO: which traj formats have step data set in MDAnalysis?
        #       XTC and TRR have it for sure (with the wraparound issue)
        self._last_step = ts.data.get("step", None)
        self._last_time = ts.time
        if all([t.lower().endswith((".xtc", ".trr"))
                for t in self.trajectory_files]):
            self._fix_trr_xtc_step_wraparound(universe=u)
        else:
            # bail out if traj is not an XTC or TRR
            logger.info("%s is not of type XTC or TRR. Not applying "
                        "wraparound fix.", self)
        # make sure the trajectory is closed by MDAnalysis
        u.trajectory.close()
        del u

    def _fix_trr_xtc_step_wraparound(self, universe: mda.Universe) -> None:
        # check/correct for wraparounds in the integration step numbers
        # NOTE: fails if the trajectory has length = 1!
        # NOTE: strictly speaking we should not assume wraparound behavior,
        #       but it seems reasonable for the stepnum,
        #       see e.g. https://www.airs.com/blog/archives/120
        # all times are in pico second (as this is MDAnalysis unit of time)
        # we round integrator_dt and delta_t to precision of
        # 0.000001 ps = 0.001 fs = 1 as
        # we do this to avoid accumulating floating point inaccuracies when
        # dividing the times by integrator_dt, this should be reasonably
        # safe for normal MD settings where integrator_dt should be on the
        # order of 1-10 fs
        if self._len == 1:
            # bail out if the trajectory has length=1
            # as we can not calculate dt if we only have one frame
            logger.info("%s has only one frame. Can not correct for "
                        "potential wraparound of the integration step.",
                        self)
            return  # bail out
        # get the time offset for first and last frame, they need to match
        # for our wraparound fix to work
        ts = universe.trajectory[0]
        time_offset = ts.data.get("time_offset", 0)
        ts = universe.trajectory[-1]
        if ts.data.get("time_offset", 0) != time_offset:
            logger.info("Time offset of the first and last time in "
                        "%s do not match. Not correcting for potential "
                        "wraparound of the integration step.",
                        self)
            return  # bail out
        delta_s = self._last_step - self._first_step
        delta_t = round(self._last_time - self._first_time, ndigits=6)
        # first make sure traj is continuous (i.e. not a concatenation where
        # we carried over the time and step data from the original trajs)
        n_frames = len(universe.trajectory)
        n_max_samples = 100  # use at most 100 frames to see if it is continuous
        if n_frames > n_max_samples:
            skip = n_frames // n_max_samples
        else:
            skip = 1
        step_nums = [ts.data["step"] for ts in universe.trajectory[::skip]]
        step_diffs = np.diff(step_nums)
        first_diff = step_diffs[0]
        if first_diff < 0:
            # we possibly wrapped around at the first step
            first_diff += 2**32
        for diff in step_diffs[1:]:
            if diff != first_diff:
                # bail out because traj is not continuous in time
                logger.debug("%s is not from one continuous propagation, i.e. "
                             "the step difference between subsequent steps is "
                             "not constant. Not applying TRR/XTC step "
                             "wraparound fix and using step as read from the "
                             "underlying trajectory.",
                             self)
                return
        # now the actual fix
        if delta_s != 0:
            if delta_s > 0:
                # both (last and first) wrapped around the same number of times
                integrator_dt = round(delta_t / delta_s, ndigits=6)
            else:  # delta_s < 0
                # last wrapped one time more than first
                integrator_dt = round(delta_t / (delta_s + 2**32), ndigits=6)
            # NOTE: should we round or floor? I (hejung) think round is what
            #       we want, it will get us to the nearest int, which is good
            #       if we e.g. have 0.99999999999 instead of 1
            first_step = round((self._first_time - time_offset) / integrator_dt)
            last_step = round((self._last_time - time_offset) / integrator_dt)
            self._first_step = first_step
            self._last_step = last_step
        else:  # delta_s == 0
            # can only end up here if we have more than one frame in the
            # trajectory **and** the first and last frame have the same
            # integration step, which should be very rare and we can not
            # correct anyway as the trajectory can not be from a continuous
            # propagation, so we can not end up here at all?
            raise RuntimeError("This should not be possible?!")

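    # Numeric sketch of the wraparound fix above (illustrative values):
    # XTC/TRR store the step as a 32 bit integer, so a continuous run that
    # starts at step 4294966000 and ends 3000 integration steps later reads
    # back as first_step = 4294966000 and last_step = 1704 (wrapped), i.e.
    # delta_s = 1704 - 4294966000 < 0. With an integrator timestep of
    # 0.002 ps the time difference is delta_t = 3000 * 0.002 = 6 ps, and
    #   integrator_dt = delta_t / (delta_s + 2**32) = 6 / 3000 = 0.002 ps
    #   last_step = round((last_time - time_offset) / integrator_dt)
    # recovers the true (unwrapped) step count.
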
    def __len__(self) -> int:
        """
        Return the number of frames in the trajectory.

        Returns
        -------
        int
            The number of frames in the trajectory.
        """
        if self._len is None:
            self._populate_properties()
        return self._len

    def __repr__(self) -> str:
        if len(self.trajectory_files) == 1:
            return (f"Trajectory(trajectory_files={self.trajectory_files[0]},"
                    + f" structure_file={self.structure_file})"
                    )
        return (f"Trajectory(trajectory_files={self.trajectory_files},"
                + f" structure_file={self.structure_file})"
                )

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Trajectory):
            # if it's not a trajectory it can't be equal
            return False
        if self.trajectory_hash != other.trajectory_hash:
            # if it has a different hash it can't be equal
            return False
        # TODO: check for cached CV values? I (hejung) think it does not
        #       really make sense...

        # if we got until here the two trajs are equal
        return True

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other=other)

    @property
    def structure_file(self) -> str:
        """Return relative path to the structure file."""
        return copy.copy(self._structure_file)

    @property
    def trajectory_files(self) -> str:
        """Return relative path to the trajectory files."""
        return copy.copy(self._trajectory_files)

    @property
    def trajectory_hash(self) -> int:
        """Return hash over the trajectory files."""
        return copy.copy(self._traj_hash)

    @property
    def nstout(self) -> typing.Union[int, None]:
        """Output frequency between subsequent frames in integration steps."""
        return self._nstout

    @nstout.setter
    def nstout(self, val: typing.Union[int, None]) -> None:
        if val is not None:
            # ensure that it is an int
            val = int(val)
        # enable setting to None
        self._nstout = val

    @property
    def first_step(self) -> int:
        """Return the integration step of the first frame in the trajectory."""
        if self._first_step is None:
            self._populate_properties()
        return self._first_step

    @property
    def last_step(self) -> int:
        """Return the integration step of the last frame in the trajectory."""
        if self._last_step is None:
            self._populate_properties()
        return self._last_step

    @property
    def dt(self) -> float:
        """The time interval between subsequent *frames* (not steps) in ps."""
        if self._dt is None:
            self._populate_properties()
        return self._dt

    @property
    def first_time(self) -> float:
        """Return the integration time of the first frame in ps."""
        if self._first_time is None:
            self._populate_properties()
        return self._first_time

    @property
    def last_time(self) -> float:
        """Return the integration time of the last frame in ps."""
        if self._last_time is None:
            self._populate_properties()
        return self._last_time

    async def _apply_wrapped_func(self, func_id, wrapped_func):
        async with self._semaphores_by_func_id[func_id]:
            # sort out which cache we use
            # NOTE: only one cache should ever be not None, so order should
            #       not matter here
            #       anyway I (hejung) think this order is even what we want:
            #       1.) use h5py cache if registered
            #       2.) use npz cache (the default since h5py is not
            #           registered if not set by the user)
            #       3.) use memory/local cache (only if set on traj creation
            #           or if set as default cache)
            if self._h5py_cache is not None:
                return await self._apply_wrapped_func_cached(
                    func_id=func_id,
                    wrapped_func=wrapped_func,
                    cache=self._h5py_cache,
                )
            if self._npz_cache is not None:
                return await self._apply_wrapped_func_cached(
                    func_id=func_id,
                    wrapped_func=wrapped_func,
                    cache=self._npz_cache,
                )
            if self._memory_cache is not None:
                return await self._apply_wrapped_func_cached(
                    func_id=func_id,
                    wrapped_func=wrapped_func,
                    cache=self._memory_cache,
                )
            # if we get until here we have no cache!
            logger.warning("No cache associated with %s. Returning calculated "
                           "function values anyway but no caching can/will be "
                           "performed!",
                           self,
                           )
            return await wrapped_func.get_values_for_trajectory(self)

    async def _apply_wrapped_func_cached(
            self, func_id: str, wrapped_func,
            cache: collections.abc.Mapping[str, np.ndarray],
    ):
        try:
            # see if it is in cache
            return copy.copy(cache[func_id])
        except KeyError:
            # if not: calculate, store and return
            # send function application to a separate process and wait
            # until it finishes
            vals = await wrapped_func.get_values_for_trajectory(self)
            cache.append(func_id=func_id, vals=vals)
            return vals

    def _cache_content_to_new_cache(
            self,
            old_cache: collections.abc.Mapping[str, np.ndarray],
            new_cache: collections.abc.Mapping[str, np.ndarray],
    ):
        for func_id, values in old_cache.items():
            if func_id in new_cache:
                continue  # don't try to add what is already in there
            new_cache.append(func_id=func_id, vals=values)

    def __getstate__(self):
        # enable pickling of Trajectory
        # this should make it possible to pass it into a ProcessPoolExecutor
        # and lets us calculate TrajectoryFunction values asynchronously
        state = self.__dict__.copy()
        # NOTE: we always save to npz here and then we check for npz always
        #       when initializing a `new` trajectory and add all values to
        #       the then preferred cache
        if self._npz_cache is None:
            self._npz_cache = TrajectoryFunctionValueCacheNPZ(
                fname_trajs=self.trajectory_files,
                hash_traj=self._traj_hash,
            )
            if self._memory_cache is not None:
                self._cache_content_to_new_cache(old_cache=self._memory_cache,
                                                 new_cache=self._npz_cache,
                                                 )
            if self._h5py_cache is not None:
                self._cache_content_to_new_cache(old_cache=self._h5py_cache,
                                                 new_cache=self._npz_cache,
                                                 )
            # and set npz cache back to None since we have not been using it
            self._npz_cache = None
        state["_h5py_cache"] = None
        state["_npz_cache"] = None
        state["_memory_cache"] = None
        state["_semaphores_by_func_id"] = collections.defaultdict(
            asyncio.BoundedSemaphore
        )
        return state

    def __setstate__(self, d: dict):
        # remove the attributes we set in __new__ from dict
        # (otherwise we would overwrite what we set in __new__)
        del d["_trajectory_files"]
        del d["_structure_file"]
        del d["_traj_hash"]
        try:
            del d["_workdir"]
        except KeyError:
            # 'old' trajectory objects don't have a _workdir attribute
            pass
        # now we can update without overwriting what we set in __new__
        self.__dict__.update(d)
        # sort out which cache we were using (and which we will use now)
        if self._using_default_cache_type:
            # if we were using the global default when pickling use it now too
            # Note that this will raise the ValueError from _setup_cache if
            # no h5py cache has been registered but it is set as default
            # (which is intended because it is the same behavior as when
            # initializing a new trajectory in the same situation)
            self.cache_type = None  # this calls _setup_cache
            return  # get out of here, no need to set up the cache twice
        if self.cache_type == "h5py":
            # make sure h5py cache is set before trying to unpickle with it
            try:
                _ = _GLOBALS["H5PY_CACHE"]
            except KeyError:
                # this will (probably) fall back to npz but I (hejung) think
                # it is nice if we use the possibly set global default?
                # Note that this will not err but just emit the warning to
                # log when we change the cache, but it will err when the
                # global default cache is set to h5py (as above)
                logger.warning("Trying to unpickle %s with cache_type "
                               "'h5py' not possible without a registered "
                               "cache. Falling back to global default type. "
                               "See 'asyncmd.config.register_h5py_cache' and"
                               " 'asyncmd.config.set_default_cache_type'.",
                               self
                               )
                self.cache_type = None  # this calls _setup_cache
                return  # get out of here, no need to set up the cache twice
        # set up the cache for all cases where we are not using the default
        # cache (or had "h5py" but could not unpickle with "h5py" now [and
        # are therefore also using the default])
        self._setup_cache()

    def __getnewargs_ex__(self):
        # new needs the trajectory_files to be able to calculate the traj_hash
        # and since we want __new__ to have the same call signature as
        # __init__ we also add all the init args here too
        return ((), {"trajectory_files": self.trajectory_files,
                     "structure_file": self.structure_file,
                     "nstout": self.nstout,
                     "cache_type": self.cache_type,
                     "old_workdir": self._workdir,
                     })


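# Pickle round-trip sketch for the Trajectory class above (hypothetical
# filenames; assumes the files exist):
#
#   import pickle
#   t = Trajectory(trajectory_files="traj.xtc", structure_file="top.tpr")
#   data = pickle.dumps(t)   # __getstate__ flushes cached values to the npz
#   t2 = pickle.loads(data)  # __getnewargs_ex__ + __new__ look up the hash
#   assert t2 is t           # same session -> same object from the registry
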
class TrajectoryFunctionValueCacheMEMORY(collections.abc.Mapping):
    """
    Interface for caching trajectory function values in memory in a dict.
    """

    def __init__(self, *args, **kwargs) -> None:
        """Initialize a `TrajectoryFunctionValueCacheMEMORY`."""
        self._func_values_by_id = {}

    def __len__(self) -> int:
        return len(self._func_values_by_id)

    def __iter__(self):
        return self._func_values_by_id.__iter__()

    def __getitem__(self, key: str) -> np.ndarray:
        if not isinstance(key, str):
            raise TypeError("Keys must be of type str.")
        return self._func_values_by_id[key]

    def append(self, func_id: str, vals: np.ndarray) -> None:
        if not isinstance(func_id, str):
            raise TypeError("func_id must be of type str.")
        if func_id in self._func_values_by_id:
            # first check if it is already in there
            raise ValueError("There are already values stored for func_id "
                             + f"{func_id}. Changing the stored values is not "
                             + "supported.")
        self._func_values_by_id[func_id] = vals


class TrajectoryFunctionValueCacheNPZ(collections.abc.Mapping):
    """
    Interface for caching trajectory function values in a numpy npz file.

    Drop-in replacement for the dictionary that is used for in-memory caching.
    """

    _hash_traj_npz_key = "hash_of_trajs"  # key of hash_traj in npz file

    # NOTE: this is written with the assumption that stored trajectories are
    #       immutable (except for adding additional stored function values),
    #       i.e. we assume that the actual underlying trajectory stays the
    #       same and is not extended after first storing it.
    #       If it changes between two npz-cache initializations, it will have
    #       a different traj-hash and all cached CV values will be
    #       recalculated.

    # NOTE: npz appending inspired by: https://stackoverflow.com/a/66618141

    # NOTE/FIXME: It would be nice to use the MAX_FILES_OPEN semaphore,
    #             but then we need async/await and then we need to go to a
    #             'create' classmethod that is async and required for
    #             initialization (because __init__ can't be async),
    #             but since we (have to) open the npz file in the other magic
    #             methods too it does not really matter (as they can not be
    #             async either)?
    #             ...and we also leave some room for non-semaphored file
    #             openings anyway

    def __init__(self, fname_trajs: list[str], hash_traj: int) -> None:
        """
        Initialize a `TrajectoryFunctionValueCacheNPZ`.

        Parameters
        ----------
        fname_trajs : list[str]
            Absolute filenames of the trajectories for which we cache CV
            values.
        hash_traj : int
            Hash over the first part of the trajectory file,
            used to make sure we cache only for the right trajectory
            (and not any trajectories with the same filename).
        """
        self.fname_npz = self._get_cache_filename(fname_trajs=fname_trajs,
                                                  trajectory_hash=hash_traj,
                                                  )
        self._hash_traj = hash_traj
        self._func_ids = []
        # sort out if we have an associated npz file already
        # and if it is from/for the "right" trajectory file
        self._ensure_consistent_npz()

    def _ensure_consistent_npz(self):
        # next line makes sure we only remember func_ids from the current npz
        self._func_ids = []
        if not os.path.isfile(self.fname_npz):
            # no npz so nothing to do except making sure we have no func_ids
            return
        existing_npz_matches = False
        with np.load(self.fname_npz, allow_pickle=False) as npzfile:
            try:
                saved_hash_traj = npzfile[self._hash_traj_npz_key][0]
            except KeyError:
                # we probably tripped over an old formatted npz
                # so we will just rewrite it completely with hash
                pass
            else:
                # old hash found, let's compare the two hashes
                existing_npz_matches = (self._hash_traj == saved_hash_traj)
                if existing_npz_matches:
                    # if they do, populate self with the func_ids we have
                    # cached values for
                    for k in npzfile.keys():
                        if k != self._hash_traj_npz_key:
                            self._func_ids.append(str(k))
        # now if the old npz did not match we should remove it,
        # then we will rewrite it with the first cached CV values
        if not existing_npz_matches:
            logger.debug("Found existing npz file (%s) but the"
                         " trajectory hash does not match."
                         " Recreating the npz cache from scratch.",
                         self.fname_npz
                         )
            os.unlink(self.fname_npz)

    @classmethod
    def _get_cache_filename(cls, fname_trajs: list[str],
                            trajectory_hash: int) -> str:
        """
        Construct the cache filename from the trajectory fname.

        Parameters
        ----------
        fname_trajs : list[str]
            Path(s) to the trajectory (files) for which we cache.
        trajectory_hash : int
            Hash of the trajectory (files).

        Returns
        -------
        str
            Path to the cachefile associated with trajectory.
        """
        head, tail = os.path.split(fname_trajs[0])
        return os.path.join(
            head,
            f".{tail}{'_MULTIPART' if len(fname_trajs) > 1 else ''}_asyncmd_cv_cache.npz"
        )

    def __len__(self) -> int:
        return len(self._func_ids)

    def __iter__(self):
        for func_id in self._func_ids:
            yield func_id

    def __getitem__(self, key: str) -> np.ndarray:
        if not isinstance(key, str):
            raise TypeError("Keys must be of type str.")
        if key in self._func_ids:
            with np.load(self.fname_npz, allow_pickle=False) as npzfile:
                return npzfile[key]
        else:
            raise KeyError(f"No values for {key} cached (yet).")

    def append(self, func_id: str, vals: np.ndarray) -> None:
        """
        Append values for given func_id.

        Parameters
        ----------
        func_id : str
            Function identifier.
        vals : np.ndarray
            Values of application of function with given func_id.

        Raises
        ------
        TypeError
            If ``func_id`` is not a string.
        ValueError
            If there are already values stored for ``func_id`` in self.
        """
        if not isinstance(func_id, str):
            raise TypeError("func_id must be of type str.")
        if func_id in self._func_ids:
            # first check if it is already in there
            raise ValueError("There are already values stored for func_id "
                             + f"{func_id}. Changing the stored values is not "
                             + "supported.")
        if len(self) == 0:
            # these are the first cached CV values for this traj
            # so we just create the (empty) npz file
            np.savez(self.fname_npz)
            # and write the trajectory hash
            self._append_data_to_npz(name=self._hash_traj_npz_key,
                                     value=np.array([self._hash_traj]),
                                     )
        # now we can append either way
        # (either already something cached, or freshly created empty file)
        self._append_data_to_npz(name=func_id, value=vals)
        # add func_id to list of func_ids that we know are cached in npz
        self._func_ids.append(func_id)

    def _append_data_to_npz(self, name: str, value: np.ndarray) -> None:
        # npz files are just zipped together collections of npy files,
        # so we just make a npy file saved into a BytesIO and then write that
        # to the end of the npz file
        bio = io.BytesIO()
        np.save(bio, value)
        with zipfile.ZipFile(file=self.fname_npz,
                             mode="a",  # append!
                             # uncompressed (but) zip archive member
                             compression=zipfile.ZIP_STORED,
                             ) as zfile:
            zfile.writestr(f"{name}.npy", data=bio.getvalue())


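# Minimal standalone sketch of the npz-append trick used by
# TrajectoryFunctionValueCacheNPZ above ("cache.npz" and the key are
# hypothetical). An .npz file is just a zip archive of .npy members, so a
# new array can be appended without rewriting the existing file:
#
#   bio = io.BytesIO()
#   np.save(bio, np.arange(10))
#   with zipfile.ZipFile("cache.npz", mode="a",
#                        compression=zipfile.ZIP_STORED) as zf:
#       zf.writestr("my_func_id.npy", data=bio.getvalue())
#   with np.load("cache.npz") as npz:
#       assert (npz["my_func_id"] == np.arange(10)).all()
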
class TrajectoryFunctionValueCacheH5PY(collections.abc.Mapping):
    """
    Interface for caching trajectory function values in a given h5py group.

    Drop-in replacement for the dictionary that is used for in-memory caching.
    """

    # NOTE: this is written with the assumption that stored trajectories are
    #       immutable (except for adding additional stored function values)
    #       but we assume that the actual underlying trajectory stays the
    #       same, i.e. it is not extended after first storing it

    def __init__(self, h5py_cache, hash_traj: int):
        self.h5py_cache = h5py_cache
        self._hash_traj = hash_traj
        self._h5py_paths = {"ids": "FunctionIDs",
                            "vals": "FunctionValues"
                            }
        self._root_grp = h5py_cache.require_group(
            "asyncmd/"
            + "TrajectoryFunctionValueCache/"
            + f"{self._hash_traj}"
        )
        self._ids_grp = self._root_grp.require_group(self._h5py_paths["ids"])
        self._vals_grp = self._root_grp.require_group(self._h5py_paths["vals"])

    def __len__(self):
        return len(self._ids_grp.keys())

    def __iter__(self):
        for idx in range(len(self)):
            yield self._ids_grp[str(idx)].asstr()[()]

    def __getitem__(self, key):
        if not isinstance(key, str):
            raise TypeError("Keys must be of type str.")
        for idx, k_val in enumerate(self):
            if key == k_val:
                return self._vals_grp[str(idx)][:]
        # if we got until here the key is not in there
        raise KeyError("Key not found.")

    def append(self, func_id, vals):
        if not isinstance(func_id, str):
            raise TypeError("Keys (func_id) must be of type str.")
        if func_id in self:
            raise ValueError("There are already values stored for func_id "
                             + f"{func_id}. Changing the stored values is not "
                             + "supported.")
        # TODO: do we also want to check vals for type?
        name = str(len(self))
        _ = self._ids_grp.create_dataset(name, data=func_id)
        _ = self._vals_grp.create_dataset(name, data=vals)