PyPI - asyncmd - Versions diffs - 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

asyncmd 0.3.2py3-none-any.whl → 0.4.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

asyncmd/__init__.py +7 -0
asyncmd/_config.py +16 -9
asyncmd/_version.py +22 -36
asyncmd/config.py +66 -33
asyncmd/gromacs/__init__.py +3 -0
asyncmd/gromacs/mdconfig.py +7 -17
asyncmd/gromacs/mdengine.py +448 -424
asyncmd/gromacs/utils.py +40 -23
asyncmd/mdconfig.py +55 -165
asyncmd/mdengine.py +120 -39
asyncmd/slurm.py +210 -77
asyncmd/tools.py +284 -5
asyncmd/trajectory/__init__.py +19 -1
asyncmd/trajectory/convert.py +133 -97
asyncmd/trajectory/functionwrapper.py +211 -159
asyncmd/trajectory/propagate.py +308 -260
asyncmd/trajectory/trajectory.py +498 -755
asyncmd/trajectory/trajectory_cache.py +365 -0
asyncmd/utils.py +18 -13
asyncmd-0.4.0.dist-info/METADATA +90 -0
asyncmd-0.4.0.dist-info/RECORD +24 -0
{asyncmd-0.3.2.dist-info → asyncmd-0.4.0.dist-info}/WHEEL +1 -1
asyncmd-0.3.2.dist-info/METADATA +0 -179
asyncmd-0.3.2.dist-info/RECORD +0 -23
{asyncmd-0.3.2.dist-info → asyncmd-0.4.0.dist-info/licenses}/LICENSE +0 -0
{asyncmd-0.3.2.dist-info → asyncmd-0.4.0.dist-info}/top_level.txt +0 -0

asyncmd/trajectory/trajectory.py CHANGED Viewed

@@ -12,20 +12,36 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with asyncmd. If not, see <https://www.gnu.org/licenses/>.
-import io
-import os
-import copy
-import typing
+"""
+This module contains the implementation of the asyncmd.Trajectory class.
+It also contains some helper functions related to the global Trajectory registry
+used for trajectory function value caching.
+The actual :class:`TrajectoryFunctionValueCache` classes can be found in the
+``trajectory_cache`` module.
+"""
 import asyncio
+import collections
+import dataclasses
 import hashlib
+import io
 import logging
-import zipfile
-import collections
-import numpy as np
-import MDAnalysis as mda
+import os
+import typing
+import MDAnalysis as mda
+import numpy as np
 from .._config import _GLOBALS
+from .trajectory_cache import (TrajectoryFunctionValueCache,
+                               TrajectoryFunctionValueCacheInH5PY,
+                               TrajectoryFunctionValueCacheInMemory,
+                               TrajectoryFunctionValueCacheInNPZ,
+                               ValuesAlreadyStoredError)
+if typing.TYPE_CHECKING:  # pragma: no cover
+    # only import for typing to avoid circular imports
+    from .functionwrapper import TrajectoryFunctionWrapper
 logger = logging.getLogger(__name__)
@@ -34,20 +50,58 @@ logger = logging.getLogger(__name__)
 # dictionary in which we keep track of trajectory objects
 # we use it to always return the *same* object for the same trajectory (by hash)
 # this makes it easy to ensure that we never calculate CV functions twice
-_TRAJECTORIES_BY_HASH = {}
+_TRAJECTORIES_BY_HASH: dict[int, "Trajectory"] = {}
+def clear_all_cache_values_for_all_trajectories() -> None:
+    """
+    Clear all function values cached for each :class:`Trajectory` currently in existence.
+    For file-based caches, this also removes the associated cache files.
+    """
+    for traj in _TRAJECTORIES_BY_HASH.values():
+        traj.clear_all_cache_values()
+def _update_cache_type_for_all_trajectories(copy_content: bool = True,
+                                            clear_old_cache: bool = False,
+                                            ) -> None:
+    """
+    Update the cache type for each :class:`Trajectory` currently in existence.
+    By default the content of the current caches is copied to the new caches.
+    This will only have an effect if the globally set ``cache_type`` differs
+    from what each `Trajectory` currently uses.
+    See :func:`asyncmd.config.set_trajectory_cache_type` to set the ``cache_type``.
+    To clear the old/previously set caches (after copying their values), pass
+    ``clear_old_cache=True``.
+    Parameters
+    ----------
+    copy_content : bool, optional
+        Whether to copy the current cache content to the new cache,
+        by default True
+    clear_old_cache : bool, optional
+        Whether to clear the old/previously set cache, by default False.
+    """
+    for traj in _TRAJECTORIES_BY_HASH.values():
+        traj.update_cache_type(copy_content=copy_content,
+                               clear_old_cache=clear_old_cache,
+                               )
 def _forget_all_trajectories() -> None:
     """
     Forget about the existence of all :class:`Trajectory` objects.
-    This will result in new :class:`Trajectory` objects beeing created even for
-    the same underlying trajectory_files. Usualy you do not want this as it
-    results in unecessary calculations if the same wrapped and cached function
+    This will result in new :class:`Trajectory` objects being created even for
+    the same underlying trajectory_files. Usually you do not want this as it
+    results in unnecessary calculations if the same wrapped and cached function
     is applied to both objects. This function exists as a hidden function as it
     is used in the tests and it might be helpful under certain circumstances.
     Use only if you know why you are using it!
     """
+    # pylint: disable-next=global-variable-not-assigned
     global _TRAJECTORIES_BY_HASH
     all_keys = set(_TRAJECTORIES_BY_HASH.keys())
     for key in all_keys:
@@ -58,9 +112,9 @@ def _forget_trajectory(traj_hash: int) -> None:
     """
     Forget about the existence of a given :class:`Trajectory` object.
-    This will result in new :class:`Trajectory` objects beeing created even for
-    the same underlying trajectory_files. Usualy you do not want this as it
-    results in unecessary calculations if the same wrapped and cached function
+    This will result in new :class:`Trajectory` objects being created even for
+    the same underlying trajectory_files. Usually you do not want this as it
+    results in unnecessary calculations if the same wrapped and cached function
     is applied to both objects. This function exists as a hidden function as it
     is used when deleting a :class:`Trajectory` (i.e. calling its `__del__`
     method) and it might be helpful under certain circumstances. Use only if
@@ -71,6 +125,7 @@ def _forget_trajectory(traj_hash: int) -> None:
     traj_hash : int
         The hash of the :class:`Trajectory` to forget about.
     """
+    # pylint: disable-next=global-variable-not-assigned
     global _TRAJECTORIES_BY_HASH
     try:
         del _TRAJECTORIES_BY_HASH[traj_hash]
@@ -79,35 +134,89 @@ def _forget_trajectory(traj_hash: int) -> None:
         pass
+@dataclasses.dataclass(frozen=True)
+class _TrajectoryPropertyData:
+    """
+    Dataclass to store/bundle all information that is read from the trajectory
+    and made available as :class:`Trajectory` properties.
+    All data are immutable (we use ``frozen=True``), because the data are read
+    from the underlying trajectory file(s) only once and if they change the hash
+    (i.e. the :class:`Trajectory` object the data is tied to) will also change.
+    """
+    length: int
+    dt: float
+    first_time: float
+    last_time: float
+    first_step: int | None
+    last_step: int | None
+@dataclasses.dataclass(frozen=True)
+class _TrajectoryFileData:
+    """
+    Dataclass to store/bundle all information related to the file-paths and
+    trajectory hash for :class:`Trajectory` objects.
+    All of this is set in :meth:`Trajectory.__new__` and must not be overridden
+    or set again in :meth:`Trajectory.__init__`!
+    """
+    trajectory_files: list[str]
+    structure_file: str
+    workdir: str
+    trajectory_hash: int
 class Trajectory:
     """
     Represent a trajectory.
     Keep track of the paths of the trajectory and the structure files.
     Caches values for (wrapped) functions acting on the trajectory.
-    Supports pickling and unpickling with the cached values restored, the
-    values will be written to a hidden numpy npz file next to the trajectory.
+    Supports pickling and unpickling with the cached values restored, if a
+    non-persistent cache is used when pickling, the values will be written to a
+    hidden numpy npz file next to the trajectory and will be read at unpickling.
     Supports equality checks with other :class:`Trajectory`.
     Also makes available (and caches) a number of useful attributes, e.g.
-    ``first_step`` and ``last_step`` (the first and last intergation step in
-    the trajectory), ``dt``, ``first_time``, ``last_time``,
-    ``length`` (in frames) and ``nstout``.
+    ``first_step`` and ``last_step`` (the first and last integration step in
+    the trajectory), ``dt``, ``first_time``, ``last_time``, and ``length`` (in
+    frames). All properties are read-only (for the simple reason that they
+    depend only on the underlying trajectory files).
+    A special case is ``nstout``, the output frequency in integration steps.
+    Since it can not be reliably read/inferred from the trajectory files alone,
+    it can be set by the user (at initialization or later via the property).
     Notes
     -----
     ``first_step`` and ``last_step`` is only useful for trajectories that come
     directly from a :class:`asyncmd.mdengine.MDEngine`.
-    As soon as the trajecory has been concatenated using MDAnalysis (e.g. with
+    As soon as the trajectory has been concatenated using MDAnalysis (e.g. with
     the ``TrajectoryConcatenator``) the step information is just the frame
     number in the trajectory part that became first/last frame in the
     concatenated trajectory.
     """
-    def __init__(self, trajectory_files: typing.Union[list[str], str],
-                 structure_file: str,
-                 nstout: typing.Optional[int] = None,
-                 cache_type: typing.Optional[str] = None,
-                 **kwargs):
+    _CACHE_CLASS_FOR_TYPE: dict[str, type[TrajectoryFunctionValueCache]] = {
+           "h5py": TrajectoryFunctionValueCacheInH5PY,
+           "npz": TrajectoryFunctionValueCacheInNPZ,
+           "memory": TrajectoryFunctionValueCacheInMemory,
+    }
+    _file_data: _TrajectoryFileData  # type annotation for stuff we set in __new__
+    # Note: We want __init__ and __new__ to have the same call signature
+    #       (at least for users, __new__ takes `old_workdir`...).
+    #       So we will have unused arguments in __init__ (for the stuff we set
+    #       in __new__) and we will have unused arguments in __new__ (for the
+    #       stuff we set in __init__).
+    #       The __new__/__init__ implementation is needed to get the global
+    #       trajectory registry to work (to make each traj unique for the same
+    #       hash), but pylint can not know that, so we silence unused-argument.
+    def __init__(
+            self,
+            # pylint: disable-next=unused-argument
+            trajectory_files: list[str] | str, structure_file: str,
+            nstout: int | None = None,
+                 ) -> None:
         """
         Initialize a :class:`Trajectory`.
@@ -121,12 +230,6 @@ class Trajectory:
         nstout : int or None, optional
             The output frequency used when creating the trajectory,
             by default None
-        cache_type : str or None, optional
-            The cache type for the CV values cached for this trajectory,
-            must be one of 'h5py', 'npz' or 'memory'.
-            If None we will use 'h5py' if a h5py cache has been registered and
-            if not fallback to 'npz'.
-            See also the ``asyncmd.config.register_h5py_cache()`` function.
         Raises
         ------
@@ -134,65 +237,35 @@ class Trajectory:
             If the ``trajectory_files`` or the ``structure_file`` are not
             accessible.
         """
-        # NOTE: we assume tra = trr and struct = tpr
-        #       but we also expect that anything which works for mdanalysis as
-        #       tra and struct should also work here as tra and struct
-        # TODO: currently we do not use kwargs?!
-        #dval = object()
-        #for kwarg, value in kwargs.items():
-        #    cval = getattr(self, kwarg, dval)
-        #    if cval is not dval:
-        #        if isinstance(value, type(cval)):
-        #            # value is of same type as default so set it
-        #            setattr(self, kwarg, value)
-        #        else:
-        #            logger.warn(f"Setting attribute {kwarg} with "
-        #                        + f"mismatching type ({type(value)}). "
-        #                        + f" Default type is {type(cval)}."
-        #                        )
-        #    else:
-        #        # not previously defined, so warn that we ignore it
-        #        logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
-        # NOTE: self._trajectory_files is set in __new__ because we otherwise
-        #       would sanitize the files twice, but we need to check in __new__
-        #       to make pickling work
-        #       self._structure_file is also set in __new__ together with the
-        #       trajectory_files as we also sanitize its path
-        #       self._traj_hash and self._workdir are also set by __new__!
-        # self._trajectory_files
-        # self._structure_file
-        # self._workdir
-        # self._traj_hash
+        # NOTE: We expect that anything which works for mdanalysis as
+        #       traj and struct should also work here as traj and struct
+        # NOTE: self._file_data is set in __new__ because we otherwise would:
+        #       - calculate the hash twice (need it in __new__),
+        #       - sanitize the files twice, but we need to check in __new__
+        #         to make pickling work
+        #       The _TrajectoryFileData dataclass therefore contains everything
+        #       (and only those things) we need in __new__
+        # self._file_data
         # properties
         self.nstout = nstout  # use the setter to make basic sanity checks
-        self._len = None
-        self._first_step = None
-        self._last_step = None
-        self._dt = None
-        self._first_time = None
-        self._last_time = None
-        # stuff for caching of functions applied to this traj
-        self._memory_cache = None
-        self._npz_cache = None
-        self._h5py_cache = None
-        self._cache_type = None
-        # remember if we use the global default value,
-        # if yes we use the (possibly changed) global default when unpickling
-        self._using_default_cache_type = True
-        # use our property logic for checking the value
-        # (Note that self._trajectory_hash has already been set by __new__)
-        self.cache_type = cache_type
+        # store for all (immutable) properties we read from the trajectory files
+        self._property_data: None | _TrajectoryPropertyData = None
+        # setup cache for functions applied to this traj
+        self._cache = self._setup_cache()
         # Locking mechanism such that only one application of a specific
         # CV func can run at any given time on this trajectory
-        self._semaphores_by_func_id = collections.defaultdict(
-                                                    asyncio.BoundedSemaphore
-                                                              )
-    def __new__(cls, trajectory_files: typing.Union[list[str], str],
-                structure_file: str,
-                nstout: typing.Optional[int] = None,
-                cache_type: typing.Optional[str] = None,
-                **kwargs):
+        self._semaphores_by_func_id: collections.defaultdict[
+            str,
+            asyncio.BoundedSemaphore,
+        ] = collections.defaultdict(asyncio.BoundedSemaphore)
+    def __new__(cls,
+                trajectory_files: list[str] | str, structure_file: str,
+                # (see above note for __init__ why its ok to ignore this)
+                # pylint: disable-next=unused-argument
+                nstout: int | None = None,
+                **kwargs) -> "Trajectory":
+        # pylint: disable-next=global-variable-not-assigned
         global _TRAJECTORIES_BY_HASH  # our global traj registry
         # see if old_workdir is given to sanitize file paths
         old_workdir = kwargs.get("old_workdir", None)
@@ -208,13 +281,6 @@ class Trajectory:
         try:
             # see if we (i.e. a traj with the same hash) are already existing
             other_traj = _TRAJECTORIES_BY_HASH[traj_hash]
-            # if yes return 'ourself'
-            # (but make sure that the filepaths match even after a potential
-            #   change of workdir)
-            other_traj._trajectory_files = trajectory_files
-            other_traj._structure_file = structure_file
-            other_traj._workdir = current_workdir
-            return other_traj
         except KeyError:
             # not yet in there, so need to create us
             # we just create cls so that we will be "created" by init or
@@ -222,100 +288,144 @@ class Trajectory:
             # NOTE: we need to make sure that every attribute we set
             #       below is not overwritten by setstate and/or init!
             obj = super().__new__(cls)
-            # but set self._traj_hash so we dont recalculate it
-            obj._traj_hash = traj_hash
-            # and set self._trajectory_files so we dont sanitize twice
-            obj._trajectory_files = trajectory_files
-            # also set self._structure_file
-            obj._structure_file = structure_file
-            # and set self._workdir to the new value
+            # we directly set hash, files and friends so we dont recalculate
+            # the hash and dont sanitize the file paths twice
             # Note:
             # we remember the current workdir to be able to unpickle as long as
             # either the relpath between traj and old/new workdir does not change
             # or the trajectory did not change its location but we changed workdir
             # (we need the workdir only for the second option)
-            obj._workdir = current_workdir
+            obj._file_data = _TrajectoryFileData(
+                                    trajectory_files=trajectory_files,
+                                    structure_file=structure_file,
+                                    workdir=current_workdir,
+                                    trajectory_hash=traj_hash,
+                                    )
             # and add us to the global trajectory registry
             _TRAJECTORIES_BY_HASH[traj_hash] = obj
             return obj
-    #def __del__(self):
-        # TODO: running 'del traj' does not call this function,
-        #       it only decreases the reference count by one,
-        #       but since we still have the traj in the traj by hash dictionary
-        #       i.e. we still have a reference, it will not call __del__ which
-        #       is only called when the reference count reaches zero
+        # we already exist (a traj object for the same traj files/hash),
+        # so return 'ourself'
+        # (but make sure that the filepaths match even after a potential
+        #  change of workdir)
+        other_traj._file_data = _TrajectoryFileData(
+                                    trajectory_files=trajectory_files,
+                                    structure_file=structure_file,
+                                    workdir=current_workdir,
+                                    trajectory_hash=traj_hash,
+                                    )
+        return other_traj
+    # def __del__(self):
+    # NOTE: Running 'del traj' does not call this function,
+    #       it only decreases the reference count by one.
+    #       But since we still have the traj in the traj by hash dictionary
+    #       i.e. we still have a reference, it will not call __del__ which
+    #       is only called when the reference count reaches zero.
+    #       So implementing it is quite pointless and misleading!
     #    _forget_trajectory(traj_hash=self.trajectory_hash)
     @classmethod
-    def _sanitize_file_paths(cls,
-                             trajectory_files: typing.Union[list[str], str],
+    def _sanitize_file_paths(cls, *,
+                             trajectory_files: list[str] | str,
                              structure_file: str,
-                             current_workdir: typing.Optional[str] = None,
-                             old_workdir: typing.Optional[str] = None,
-                             ) -> typing.Tuple[list[str], str]:
-        # NOTE: this returns relpath if no old_workdir is given and the traj
-        #       is accessible
-        #       if old_workdir is given (and the traj not accesible) it (tries)
-        #       to find the traj by assuming the traj did not change place and
-        #       we just need to add the "path_diff" from old to new workdir to
-        #       the path, if the file is then still not there it raises a
-        #       FileNotFoundError
-        # NOTE: (for pickling and aimmd storage behavior):
-        #       The above makes it possible to either change the workdir of the
-        #       python session OR change the location of the trajectories as
-        #       as long as the relative path between trajectory and python
-        #       workdir does not change!
+                             current_workdir: str,
+                             old_workdir: str | None = None,
+                             ) -> tuple[list[str], str]:
+        """
+        Return relpath for all files if no old_workdir is given and the trajectory
+        and structure files are accessible.
+        If old_workdir is given (and the traj not accessible) it (tries) to find
+        the trajs/struct by assuming the files did not change place and we just
+        need to add the "path_diff" from old to new workdir to the path, if the
+        file is then still not there it raises a FileNotFoundError.
+        Note: The file-path treatment here makes it possible to either change
+              the workdir of the python session OR change the location of the
+              trajectories as long as the relative path between trajectory
+              and python workdir does not change!
+        Parameters
+        ----------
+        trajectory_files : list[str] | str
+            Absolute or relative path(s) to the trajectory file(s),
+            e.g. trr, xtc, dcd, ...
+            Can be one str (one file) or a list of str (multiple traj files).
+        structure_file : str
+            Absolute or relative path to the structure file (e.g. tpr, gro).
+        current_workdir : str
+            The current working directory to use for "path_diff" calculations.
+        old_workdir : str | None, optional
+            The old working directory (e.g. at pickling time), by default None.
+            If None, no "path_diff" calculations will be performed, i.e. it is
+            assumed the working directory did not change or we are not unpickling.
+        Returns
+        -------
+        tuple[list[str], str]
+            trajectory_files, structure_file
+            Sanitized file-paths if the files exist, trajectory_files is always
+            a list[str], even if it is only one file.
+        Raises
+        ------
+        FileNotFoundError
+            When the trajectory or structure files can not be found.
+        """
         def sanitize_path(f, pathdiff=None):
             if os.path.isfile(f):
                 return os.path.relpath(f)
-            elif pathdiff is not None:
+            if pathdiff is not None:
                 f_diff = os.path.join(pathdiff, f)
                 if os.path.isfile(f_diff):
                     return os.path.relpath(f_diff)
             # if we get until here we cant find the file
             err_msg = f"File {f} is not accessible"
-            if pathdiff is not None:
-                err_msg += f" (we also tried {f_diff})."
-            else:
-                err_msg += "."
+            err_msg += f" (we also tried {f_diff})." if pathdiff is not None else "."
             raise FileNotFoundError(err_msg)
         if old_workdir is not None:
-            if current_workdir is None:
-                raise ValueError("'old_workdir' given but 'current_workdir' "
-                                 "was None.")
             path_diff = os.path.relpath(old_workdir, current_workdir)
         else:
             path_diff = None
         if isinstance(trajectory_files, str):
             trajectory_files = [trajectory_files]
         traj_files_sanitized = [sanitize_path(f=traj_f, pathdiff=path_diff)
                                 for traj_f in trajectory_files
                                 ]
-        struct_file_sanitized = sanitize_path(f=structure_file,
-                                              pathdiff=path_diff,
-                                              )
+        struct_file_sanitized = sanitize_path(f=structure_file, pathdiff=path_diff)
         return traj_files_sanitized, struct_file_sanitized
     @classmethod
-    def _calc_traj_hash(cls, trajectory_files):
-        # calculate a hash over the first and last part of the traj files
-        # (we use it to make sure the cached CV values match the traj)
-        # note that we do not include the structure file on purpose because
-        # that allows for changing .gro <-> .tpr or similar
-        # (which we expect to not change the calculated CV values)
-        # TODO: how much should we read?
+    def _calc_traj_hash(cls, trajectory_files: list[str]) -> int:
+        """
+        Calculate a hash over the first and last part of the traj files.
+        We use it to make sure the cached CV values match the traj.
+        Note that we do not include the structure file on purpose because
+        that allows for changing .gro <-> .tpr or similar (which we expect to
+        not change the calculated CV values).
+        Parameters
+        ----------
+        trajectory_files : list[str]
+            Path(s) to the trajectory file(s).
+        Returns
+        -------
+        int
+            The hash calculated over the trajectory files.
+        """
+        # TODO: how much should we read to calculate the hash?
         #      (I [hejung] think the first and last .5 MB are enough)
         data = bytes()
         for traj_f in trajectory_files:
-            #data += traj_f.encode("utf-8")  # DONT include filepaths!...
+            # data += traj_f.encode("utf-8")  # DONT include filepaths!...
             fsize = os.stat(traj_f).st_size
             data += str(fsize).encode("utf-8")
-            if fsize == 0:
+            if not fsize:
                 # Note: we could also just warn as long as we do not do the
                 #       negative seek below if filesize == 0. However,
                 #       mdanalysis throws errors for empty trajectories anyway
@@ -326,7 +436,7 @@ class Trajectory:
                 # read the first bit of each file
                 data += traj_file.read(max_to_read)
                 # and read the last bit of each file
-                # Note that the last bit potentially overlapps with the first
+                # Note that the last bit potentially overlaps with the first
                 traj_file.seek(-max_to_read, io.SEEK_END)
                 data += traj_file.read(max_to_read)
         # calculate one hash over all traj_files
@@ -339,171 +449,161 @@ class Trajectory:
                         )
         return traj_hash
-    @property
-    def cache_type(self):
+    def _setup_cache(self) -> TrajectoryFunctionValueCache:
         """
-        String indicating the currently used cache type. Can also be (re)set.
+        Initialize and return a cache with the cache type/class set by _GLOBALS/config.
+        If the initialized cache is empty, this also checks for any npz cache
+        files and tries to append them to the new cache (irrespective of the
+        cache type).
         """
-        return copy.copy(self._cache_type)
+        cache = self._CACHE_CLASS_FOR_TYPE[
+                                _GLOBALS["TRAJECTORY_FUNCTION_CACHE_TYPE"]
+                                           ](traj_hash=self.trajectory_hash,
+                                             traj_files=self.trajectory_files,
+                                             )
+        # only try to read npz files if our cache is empty and not already npz
+        if not cache and _GLOBALS["TRAJECTORY_FUNCTION_CACHE_TYPE"] != "npz":
+            # cache is empty at initialization
+            # check if we can find a npz-cache to populate from
+            if os.path.isfile(
+                TrajectoryFunctionValueCacheInNPZ.get_cache_filename(
+                    traj_files=self.trajectory_files
+                )
+            ):
+                logger.info("Initialized %s with an empty cache, but found "
+                            "a (probably) matching npz cache file. Populating "
+                            "our cache with the values stored there.",
+                            self,
+                            )
+                cache_to_copy = TrajectoryFunctionValueCacheInNPZ(
+                                        traj_hash=self.trajectory_hash,
+                                        traj_files=self.trajectory_files,
+                                                                  )
+                for func_id, values in cache_to_copy.items():
+                    cache.append(func_id=func_id, values=values)
+        return cache
-    @cache_type.setter
-    def cache_type(self, value: typing.Optional[str]):
+    def update_cache_type(self, copy_content: bool = True,
+                          clear_old_cache: bool = False) -> None:
         """
-        Set the cache type.
+        Update the :class:`TrajectoryFunctionValueCache` this :class:`Trajectory` uses.
+        By default the content of the current cache is copied to the new cache.
+        This will only have an effect if the globally set ``cache_type`` differs
+        from what this `Trajectory` currently uses.
+        See :func:`asyncmd.config.set_trajectory_cache_type` to set the ``cache_type``.
+        To clear the old/previously set cache (after copying its values), pass
+        ``clear_old_cache=True``.
         Parameters
         ----------
-        value : str or None
-            Either a string indicating the type or None to choose the preferred
-            cache type from the available ones.
-            If a string it must be one of 'h5py', 'npz' or 'memory'.
-        Raises
-        ------
-        ValueError
-            Raised if value is not one of the available cache types.
+        copy_content : bool, optional
+            Whether to copy the current cache content to the new cache,
+            by default True
+        clear_old_cache : bool, optional
+            Whether to clear the old/previously set cache, by default False.
         """
-        if value is None:
-            use_default_cache_type = True
-            # find preferred cache type that is available
-            try:
-                value = _GLOBALS["TRAJECTORY_FUNCTION_CACHE_TYPE"]
-            except KeyError:
-                # no default cache type set
-                # default to numpy npz
-                value = "npz"
-        else:
-            use_default_cache_type = False
-        value = value.lower()
-        allowed_values = ["h5py", "npz", "memory"]
-        if value not in allowed_values:
-            raise ValueError("Given cache type must be `None` or one of "
-                             + f"{allowed_values}. Was: {value}.")
-        self._cache_type = value
-        self._using_default_cache_type = use_default_cache_type
-        self._setup_cache()
-    def _setup_cache(self) -> None:
-        # set up the cache indicated by self.cache_type and all others to None
-        # also makes sure that all previously cached values are transfered
-        # to the newly setup cache
-        # NOTE: we setup an npz cache to see if there are any saved values
-        #       that we would want to add to the newly setup cache
-        #       We do this because upon pickling we save everything to npz
-        # Note that we can just set self._npz to this cache because it is
-        # stateless (in the sense that if it existed it be exactly the same)
-        self._npz_cache = TrajectoryFunctionValueCacheNPZ(
-                                        fname_trajs=self.trajectory_files,
-                                        hash_traj=self._traj_hash,
-                                                          )
-        if self._cache_type == "memory":
-            if self._memory_cache is None:
-                self._memory_cache = TrajectoryFunctionValueCacheMEMORY()
-            else:
-                # we already have a mem cache so just try to use it
-                pass
-            if self._h5py_cache is not None:
-                self._cache_content_to_new_cache(
-                                        old_cache=self._h5py_cache,
-                                        new_cache=self._memory_cache,
-                                                 )
-                self._h5py_cache = None
-            self._cache_content_to_new_cache(
-                                        old_cache=self._npz_cache,
-                                        new_cache=self._memory_cache,
-                                             )
-            self._npz_cache = None
-        elif self._cache_type == "h5py":
-            try:
-                h5py_cache = _GLOBALS["H5PY_CACHE"]
-            except KeyError as exc:
-                raise ValueError(
-                    "No h5py cache file registered yet. Try calling "
-                    + "``asyncmd.config.register_h5py_cache_file()``"
-                    + " with the appropriate arguments first") from exc
-            if self._h5py_cache is None:
-                # dont have one yet so setup the cache
-                self._h5py_cache = TrajectoryFunctionValueCacheH5PY(
-                                                h5py_cache=h5py_cache,
-                                                hash_traj=self._traj_hash,
-                                                                    )
-            else:
-                # we already have a h5py cache...
-                if self._h5py_cache.h5py_cache is h5py_cache:
-                    # and it is in the same file/group location
-                    # so we do nothing but making sure that all values from
-                    # other caches are transfered
+        cache_type = _GLOBALS["TRAJECTORY_FUNCTION_CACHE_TYPE"]
+        if isinstance(self._cache, self._CACHE_CLASS_FOR_TYPE[cache_type]):
+            logger.info("Cache type is already %s. Not doing anything.", cache_type)
+            return
+        # init the new cache
+        cache = self._CACHE_CLASS_FOR_TYPE[cache_type](
+                                            traj_hash=self.trajectory_hash,
+                                            traj_files=self.trajectory_files,
+                                            )
+        if copy_content:
+            # and copy/append everything from current cache to the new one
+            for func_id, values in self._cache.items():
+                try:
+                    cache.append(func_id=func_id, values=values)
+                except ValuesAlreadyStoredError:
+                    # if we just initialized a non-empty cache we might already
+                    # have some of the values cached there, ignore them
                     pass
-                else:
-                    # lets copy the stuff from the old to the new h5py cache
-                    old_h5py_cache = self._h5py_cache
-                    self._h5py_cache = TrajectoryFunctionValueCacheH5PY(
-                                                h5py_cache=h5py_cache,
-                                                hash_traj=self._traj_hash,
-                                                                    )
-                    self._cache_content_to_new_cache(
-                                        old_cache=old_h5py_cache,
-                                        new_cache=self._h5py_cache,
-                                                     )
-            # transfer all values from other cache types and empty them
-            if self._memory_cache is not None:
-                self._cache_content_to_new_cache(
-                                        old_cache=self._memory_cache,
-                                        new_cache=self._h5py_cache,
-                                                 )
-                self._memory_cache = None
-            self._cache_content_to_new_cache(
-                                        old_cache=self._npz_cache,
-                                        new_cache=self._h5py_cache,
-                                             )
-            self._npz_cache = None
-        elif self._cache_type == "npz":
-            if self._h5py_cache is not None:
-                self._cache_content_to_new_cache(
-                                        old_cache=self._h5py_cache,
-                                        new_cache=self._npz_cache,
-                                                 )
-                self._h5py_cache = None
-            if self._memory_cache is not None:
-                self._cache_content_to_new_cache(
-                                        old_cache=self._memory_cache,
-                                        new_cache=self._npz_cache,
-                                                 )
-                self._memory_cache = None
-        else:
-            raise RuntimeError("This should never happen. self._cache_type "
-                               + "must be one of 'memory', 'h5py', 'npz' when "
-                               + "self._setup_cache is called. "
-                               + f"Was {self._cache_type}.")
+        if clear_old_cache:
+            self._cache.clear_all_values()
+        self._cache = cache
-    def _populate_properties(self) -> None:
+    def clear_all_cache_values(self) -> None:
         """
-        Populate cached properties from the underlying trajectory.
+        Clear all function values cached for this :class:`Trajectory`.
+        For file-based caches, this also removes the associated cache files.
+        Note that this just calls the ``clear_all_values`` method of the
+        underlying :class:`TrajectoryFunctionValueCache`.
+        """
+        self._cache.clear_all_values()
+    def _retrieve_cached_values(self, func_wrapper: "TrajectoryFunctionWrapper",
+                                ) -> np.ndarray | None:
+        """
+        Retrieve values cached for given :class:`TrajectoryFunctionWrapper`.
+
+        The lookup key is ``func_wrapper.id``. Return ``None`` if no values
+        are cached (yet), i.e. if the cache raises a ``KeyError`` for that key.
+
+        Parameters
+        ----------
+        func_wrapper : TrajectoryFunctionWrapper
+            The TrajectoryFunctionWrapper for which we (try to) retrieve cached values.
+
+        Returns
+        -------
+        np.ndarray | None
+            Cached function values or None if none are found.
+        """
+        try:
+            values = self._cache[func_wrapper.id]
+        except KeyError:
+            # nothing stored for this function id (yet), signal it with None
+            values = None
+        return values
+    def _register_cached_values(self, values: np.ndarray,
+                                func_wrapper: "TrajectoryFunctionWrapper",
+                                ) -> None:
+        """
+        Add values to cache for given TrajectoryFunctionWrapper.
+
+        The values are appended to the cache under the key ``func_wrapper.id``.
+
+        Parameters
+        ----------
+        values : np.ndarray
+            The values to add.
+        func_wrapper : TrajectoryFunctionWrapper
+            The TrajectoryFunctionWrapper these values belong to.
+        """
+        # NOTE(review): other callers of ``append`` in this class catch
+        #               ``ValuesAlreadyStoredError``; it is not handled here, so
+        #               it would presumably propagate to the caller -- confirm.
+        self._cache.append(func_id=func_wrapper.id, values=values)
+    def _populate_property_data(self) -> _TrajectoryPropertyData:
+        """
+        Populate and return cached properties from the underlying trajectory.
+        Returns a :class:`_TrajectoryPropertyData` class.
         """
         # create/open a mdanalysis universe to get...
         u = mda.Universe(self.structure_file, *self.trajectory_files)
         # ...the number of frames
-        self._len = len(u.trajectory)
+        length = len(u.trajectory)
         # ...the first integration step and time
         ts = u.trajectory[0]
-        # FIXME: using None here means we will try to repopulate the properties
-        #        every time we access step property for a traj-format which
-        #        does not have step data!
-        # TODO: which traj formats have step data set in MDAnalysis?
-        #       XTC and TRR have it for sure (with the wraparound issue)
-        self._first_step = ts.data.get("step", None)
-        self._first_time = ts.time
+        first_step = ts.data.get("step", None)
+        first_time = ts.time
         # ...the time diff between subsequent **frames** (not steps)
-        self._dt = ts.dt
+        dt = ts.dt
         # ...the last integration step and time
         ts = u.trajectory[-1]
-        # TODO: which traj formats have step data set in MDAnalysis?
-        #       XTC and TRR have it for sure (with the wraparound issue)
-        self._last_step = ts.data.get("step", None)
-        self._last_time = ts.time
-        if all([t.lower().endswith((".xtc", ".trr"))
-                for t in self.trajectory_files]):
-            self._fix_trr_xtc_step_wraparound(universe=u)
+        last_step = ts.data.get("step", None)
+        last_time = ts.time
+        # See if we apply the wraparound issue fix
+        # Note: we are using some of the info we just read here (all explicitly passed)!
+        if all(
+            t.lower().endswith((".xtc", ".trr")) for t in self.trajectory_files
+        ):
+            first_step, last_step = self._fix_trr_xtc_step_wraparound(
+                                        universe=u,
+                                        first_time=first_time, last_time=last_time,
+                                        first_step=first_step, last_step=last_step,
+                                        )
         else:
             # bail out if traj is not an XTC or TRR
             logger.info("%s is not of type XTC or TRR. Not applying "
@@ -511,9 +611,23 @@ class Trajectory:
         # make sure the trajectory is closed by MDAnalysis
         u.trajectory.close()
         del u
-    def _fix_trr_xtc_step_wraparound(self, universe: mda.Universe) -> None:
+        # finally populate and return the dataclass with what we just read
+        # (and possibly corrected)
+        return _TrajectoryPropertyData(
+                                length=length, dt=dt,
+                                first_time=first_time, last_time=last_time,
+                                first_step=first_step, last_step=last_step,
+                                )
+    def _fix_trr_xtc_step_wraparound(self, *,
+                                     universe: mda.Universe,
+                                     first_time: float, last_time: float,
+                                     first_step: int, last_step: int,
+                                     ) -> tuple[int, int]:
         # check/correct for wraparounds in the integration step numbers
+        # return (corrected or not) first_step, last_step
+        # I.e. it is safe to always set first_step, last_step from the return
+        # value of this method.
         # NOTE: fails if the trajectory has length = 1!
         # NOTE: strictly speaking we should not assume wraparound behavior,
         #       but it seems reasonable for the stepnum,
@@ -525,52 +639,46 @@ class Trajectory:
         # dividing the times by integrator_dt, this should be reasonably
         # safe for normal MD settings where integrator_dt should be on the
         # order of 1-10 fs
-        if self._len == 1:
+        if (n_frames := len(universe.trajectory)) == 1:
             # bail out if the trajectory has length=1
             # as we can not calculate dt if we only have one frame
             logger.info("%s has only one frame. Can not correct for "
                         "potential wraparound of the integration step.",
                         self)
-            return  # bail out
+            return first_step, last_step  # bail out
         # get the time offset for first and last frame, they need to match for
         # our wraparound fix to work
-        ts = universe.trajectory[0]
-        time_offset = ts.data.get("time_offset", 0)
-        ts = universe.trajectory[-1]
-        if ts.data.get("time_offset", 0) != time_offset:
+        time_offset = universe.trajectory[0].data.get("time_offset", 0)
+        if universe.trajectory[-1].data.get("time_offset", 0) != time_offset:
             logger.info("Time offset of the first and last time in "
                         "%s do not match. Not correcting for potential "
                         "wraparound of the integration step.",
                         self)
-            return  # bail out
-        delta_s = self._last_step - self._first_step
-        delta_t = round(self._last_time - self._first_time, ndigits=6)
-        # first make sure traj is continous (i.e. not a concatenation where we
-        #  carried over the time and step data from the original trajs)
-        n_frames = len(universe.trajectory)
-        n_max_samples = 100  # use at most 100 frames to see if it is continous
-        if n_frames > n_max_samples:
-            skip = n_frames // n_max_samples
-        else:
-            skip = 1
-        step_nums = [ts.data["step"] for ts in universe.trajectory[::skip]]
-        step_diffs = np.diff(step_nums)
-        first_diff = step_diffs[0]
-        if first_diff < 0:
+            return first_step, last_step  # bail out
+        delta_s = last_step - first_step
+        delta_t = round(last_time - first_time, ndigits=6)
+        # first make sure traj is continuous (i.e. not a concatenation where we
+        # carried over the time and step data from the original trajs).
+        # Use at most 100 (equally spaced) frames to see if it is continuous.
+        skip = n_frames // 100 if n_frames > 100 else 1
+        step_diffs = np.diff([ts.data["step"]
+                              for ts in universe.trajectory[::skip]]
+                             )
+        if (first_diff := step_diffs[0]) < 0:
             # we possibly wrapped around at the first step
             first_diff += 2**32
         for diff in step_diffs[1:]:
             if diff != first_diff:
-                # bail out because traj is not continous in time
-                logger.debug("%s is not from one continous propagation, i.e. "
+                # bail out because traj is not continuous in time
+                logger.debug("%s is not from one continuous propagation, i.e. "
                              "the step difference between subsequent steps is "
                              "not constant. Not applying TRR/XTC step "
                              "wraparound fix and using step as read from the "
                              "underlying trajectory.",
                              self)
-            return
+            return first_step, last_step
         # now the actual fix
-        if delta_s != 0:
+        if delta_s:  # delta_s != 0
             if delta_s > 0:
                 # both (last and first) wrapped around the same number of times
                 integrator_dt = round(delta_t / delta_s, ndigits=6)
@@ -580,17 +688,16 @@ class Trajectory:
             # NOTE: should we round or floor? I (hejung) think round is what we
             #       want, it will get us to the nearest int, which is good if
             #       we e.g. have 0.99999999999 instead of 1
-            first_step = round((self._first_time - time_offset) / integrator_dt)
-            last_step = round((self._last_time - time_offset) / integrator_dt)
-            self._first_step = first_step
-            self._last_step = last_step
-        else:  # delta_s == 0
-            # can only end up here if we have more than one frame in trajectory
-            # **and** the first and last frame have the same integration step
-            # which should be very rare and we can not correct anyway as the
-            # trajectory can not be from a continous propagation, so we can not
-            # end up here at all?
-            raise RuntimeError("This should not be possible?!")
+            first_step = round((first_time - time_offset) / integrator_dt)
+            last_step = round((last_time - time_offset) / integrator_dt)
+            return first_step, last_step
+        # delta_s == 0
+        # can only end up here if we have more than one frame in trajectory
+        # **and** the first and last frame have the same integration step
+        # which should be very rare and we can not correct anyway as the
+        # trajectory can not be from a continuous propagation, so we can not
+        # end up here at all?
+        raise RuntimeError("This should not be possible?!")
     def __len__(self) -> int:
         """
@@ -601,9 +708,9 @@ class Trajectory:
         int
             The number of frames in the trajectory.
         """
-        if self._len is None:
-            self._populate_properties()
-        return self._len
+        if self._property_data is None:
+            self._property_data = self._populate_property_data()
+        return self._property_data.length
     def __repr__(self) -> str:
         if len(self.trajectory_files) == 1:
@@ -614,6 +721,9 @@ class Trajectory:
                 + f" structure_file={self.structure_file})"
                 )
+    def __hash__(self) -> int:
+        """Return the hash of the trajectory files (``trajectory_hash``)."""
+        # use the same value ``__eq__`` compares to decide on equality
+        return self.trajectory_hash
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, Trajectory):
             # if its not a trajectory it cant be equal
@@ -621,37 +731,35 @@ class Trajectory:
         if self.trajectory_hash != other.trajectory_hash:
             # if it has a different hash it cant be equal
             return False
-        # TODO: check for cached CV values? I (hejung) think it does not really
-        #       make sense...
-        # if we got until here the two trajs are equal
+        # if we got until here the two trajectories are equal
         return True
     def __ne__(self, other: object) -> bool:
-        return not self.__eq__(other=other)
+        return not self.__eq__(other)
     @property
     def structure_file(self) -> str:
         """Return relative path to the structure file."""
-        return copy.copy(self._structure_file)
+        return self._file_data.structure_file
     @property
-    def trajectory_files(self) -> str:
+    def trajectory_files(self) -> list[str]:
         """Return relative path to the trajectory files."""
-        return copy.copy(self._trajectory_files)
+        return self._file_data.trajectory_files
     @property
     def trajectory_hash(self) -> int:
-        """Return hash over the trajecory files"""
-        return copy.copy(self._traj_hash)
+        """Return hash over the trajectory files"""
+        return self._file_data.trajectory_hash
     @property
-    def nstout(self) -> typing.Union[int, None]:
+    def nstout(self) -> int | None:
         """Output frequency between subsequent frames in integration steps."""
         return self._nstout
     @nstout.setter
-    def nstout(self, val: typing.Union[int, None]) -> None:
+    def nstout(self, val: int | None) -> None:
         if val is not None:
             # ensure that it is an int
             val = int(val)
@@ -659,445 +767,80 @@ class Trajectory:
         self._nstout = val
     @property
-    def first_step(self) -> int:
+    def first_step(self) -> int | None:
         """Return the integration step of the first frame in the trajectory."""
-        if self._first_step is None:
-            self._populate_properties()
-        return self._first_step
+        if self._property_data is None:
+            self._property_data = self._populate_property_data()
+        return self._property_data.first_step
     @property
-    def last_step(self) -> int:
+    def last_step(self) -> int | None:
         """Return the integration step of the last frame in the trajectory."""
-        if self._last_step is None:
-            self._populate_properties()
-        return self._last_step
+        if self._property_data is None:
+            self._property_data = self._populate_property_data()
+        return self._property_data.last_step
     @property
     def dt(self) -> float:
-        """The time intervall between subsequent *frames* (not steps) in ps."""
-        if self._dt is None:
-            self._populate_properties()
-        return self._dt
+        """The time interval between subsequent *frames* (not steps) in ps."""
+        if self._property_data is None:
+            self._property_data = self._populate_property_data()
+        return self._property_data.dt
     @property
     def first_time(self) -> float:
         """Return the integration timestep of the first frame in ps."""
-        if self._first_time is None:
-            self._populate_properties()
-        return self._first_time
+        if self._property_data is None:
+            self._property_data = self._populate_property_data()
+        return self._property_data.first_time
     @property
     def last_time(self) -> float:
         """Return the integration timestep of the last frame in ps."""
-        if self._last_time is None:
-            self._populate_properties()
-        return self._last_time
-    async def _apply_wrapped_func(self, func_id, wrapped_func):
-        async with self._semaphores_by_func_id[func_id]:
-            # sort out which cache we use
-            # NOTE: only one cache should ever be not None, so order should not
-            #       matter here
-            #       anyway I (hejung) think this order is even what we want:
-            #       1.) use h5py cache if registered
-            #       2.) use npz cache (the default since h5py is not registered
-            #                          if not set by the user)
-            #       3.) use memory/local cache (only if set on traj creation
-            #                                   or if set as default cache)
-            if self._h5py_cache is not None:
-                return await self._apply_wrapped_func_cached(
-                                                    func_id=func_id,
-                                                    wrapped_func=wrapped_func,
-                                                    cache=self._h5py_cache,
-                                                             )
-            if self._npz_cache is not None:
-                return await self._apply_wrapped_func_cached(
-                                                    func_id=func_id,
-                                                    wrapped_func=wrapped_func,
-                                                    cache=self._npz_cache
-                                                             )
-            if self._memory_cache is not None:
-                return await self._apply_wrapped_func_cached(
-                                                    func_id=func_id,
-                                                    wrapped_func=wrapped_func,
-                                                    cache=self._memory_cache,
-                                                             )
-            # if we get until here we have no cache!
-            logger.warning("No cache associated with %s. Returning calculated "
-                           "function values anyway but no caching can/will be "
-                           "performed!",
-                           self,
-                           )
-            return await wrapped_func.get_values_for_trajectory(self)
-    async def _apply_wrapped_func_cached(
-                            self, func_id: str, wrapped_func,
-                            cache: collections.abc.Mapping[str, np.ndarray],
-                                         ):
-        try:
-            # see if it is in cache
-            return copy.copy(cache[func_id])
-        except KeyError:
-            # if not calculate, store and return
-            # send function application to seperate process and wait
-            # until it finishes
-            vals = await wrapped_func.get_values_for_trajectory(self)
-            cache.append(func_id=func_id, vals=vals)
-            return vals
-    def _cache_content_to_new_cache(
-                        self,
-                        old_cache: collections.abc.Mapping[str, np.ndarray],
-                        new_cache: collections.abc.Mapping[str, np.ndarray],
-                                    ):
-        for func_id, values in old_cache.items():
-            if func_id in new_cache:
-                continue  # dont try to add what is already in there
-            new_cache.append(func_id=func_id, vals=values)
-    def __getstate__(self):
+        if self._property_data is None:
+            self._property_data = self._populate_property_data()
+        return self._property_data.last_time
+    def __getstate__(self) -> dict[str, typing.Any]:
         # enable pickling of Trajectory
         # this should make it possible to pass it into a ProcessPoolExecutor
-        # and lets us calculate TrajectoryFunction values asyncronously
+        # and lets us calculate TrajectoryFunction values asynchronously
         state = self.__dict__.copy()
-        # NOTE: we always save to npz here and then we check for npz always
-        #       when initializing a `new` trajectory and add all values to
-        #       the then preferred cache
-        if self._npz_cache is None:
-            self._npz_cache = TrajectoryFunctionValueCacheNPZ(
-                                        fname_trajs=self.trajectory_files,
-                                        hash_traj=self._traj_hash,
-                                                             )
-            if self._memory_cache is not None:
-                self._cache_content_to_new_cache(old_cache=self._memory_cache,
-                                                 new_cache=self._npz_cache,
-                                                 )
-            if self._h5py_cache is not None:
-                self._cache_content_to_new_cache(old_cache=self._h5py_cache,
-                                                 new_cache=self._npz_cache,
-                                                 )
-            # and set npz cache back to None since we have not been using it
-            self._npz_cache = None
-        state["_h5py_cache"] = None
-        state["_npz_cache"] = None
-        state["_memory_cache"] = None
+        # special handling for case of function values cached in memory
+        if isinstance(self._cache, TrajectoryFunctionValueCacheInMemory):
+            # write it to npz so we can unpickle with values for any cache type
+            # (if we unpickle with an empty cache we will [try to] read the npz)
+            npz_cache = TrajectoryFunctionValueCacheInNPZ(
+                                    traj_hash=self.trajectory_hash,
+                                    traj_files=self.trajectory_files,
+                                                          )
+            for func_id, values in self._cache.items():
+                try:
+                    npz_cache.append(func_id=func_id, values=values)
+                except ValuesAlreadyStoredError:
+                    # ignore if we already have them
+                    pass
+        state["_cache"] = None
         state["_semaphores_by_func_id"] = collections.defaultdict(
                                                     asyncio.BoundedSemaphore
                                                                   )
         return state
-    def __setstate__(self, d: dict):
+    def __setstate__(self, d: dict) -> None:
         # remove the attributes we set in __new__ from dict
         # (otherwise we would overwrite what we set in __new__)
-        del d["_trajectory_files"]
-        del d["_structure_file"]
-        del d["_traj_hash"]
-        try:
-            del d["_workdir"]
-        except KeyError:
-            # 'old' trajectory objects dont have a _workdir attribute
-            pass
-        # now we can update without overwritting what we set in __new__
+        del d["_file_data"]
+        # now we can update without overwriting what we set in __new__
         self.__dict__.update(d)
-        # sort out which cache we were using (and which we will use now)
-        if self._using_default_cache_type:
-            # if we were using the global default when pickling use it now too
-            # Note that this will raise the ValueError from _setup_cache if
-            # no h5py cache has been registered but it is set as default
-            # (which is intended because it is the same behavior as when
-            #  initializing a new trajectory in the same situation)
-            self.cache_type = None  # this calls _setup_cache
-            return  # get out of here, no need to setup the cache twice
-        if self.cache_type == "h5py":
-            # make sure h5py cache is set before trying to unpickle with it
-            try:
-                _ = _GLOBALS["H5PY_CACHE"]
-            except KeyError:
-                # this will (probably) fallback to npz but I (hejung) think it
-                # is nice if we use the possibly set global default?
-                # Note that this will not err but just emit the warning to log
-                # when we change the cache but it will err when the global
-                # default cache is set to h5py (as above)
-                logger.warning("Trying to unpickle %s with cache_type "
-                               "'h5py' not possible without a registered "
-                               "cache. Falling back to global default type."
-                               "See 'asyncmd.config.register_h5py_cache' and"
-                               " 'asyncmd.config.set_default_cache_type'.",
-                                self
-                               )
-                self.cache_type = None  # this calls _setup_cache
-                return  # get out of here, no need to setup the cache twice
-        # setup the cache for all cases where we are not using default cache
-        # (or had "h5py" but could not unpickle with "h5py" now [and are
-        #  therefore also using the default])
-        self._setup_cache()
-    def __getnewargs_ex__(self):
+        # and finally setup the cache according to what the global config says
+        self._cache = self._setup_cache()
+    def __getnewargs_ex__(self) -> tuple[tuple, dict[str, typing.Any]]:
         # new needs the trajectory_files to be able to calculate the traj_hash
         # and since we want __new__ to have the same call signature as __init__
         # we also add all the init args here too
         return ((), {"trajectory_files": self.trajectory_files,
                      "structure_file": self.structure_file,
                      "nstout": self.nstout,
-                     "cache_type": self.cache_type,
-                     "old_workdir": self._workdir,
+                     "old_workdir": self._file_data.workdir,
                      })
-class TrajectoryFunctionValueCacheMEMORY(collections.abc.Mapping):
-    """
-    Interface for caching trajectory function values in memory in a dict.
-    """
-    def __init__(self, *args, **kwargs) -> None:
-        """Initialize a `TrajectoryFunctionValueCacheMEMORY`."""
-        self._func_values_by_id = {}
-    def __len__(self) -> int:
-        return len(self._func_values_by_id)
-    def __iter__(self):
-        return self._func_values_by_id.__iter__()
-    def __getitem__(self, key: str) -> np.ndarray:
-        if not isinstance(key, str):
-            raise TypeError("Keys must be of type str.")
-        return self._func_values_by_id[key]
-    def append(self, func_id: str, vals: np.ndarray) -> None:
-        if not isinstance(func_id, str):
-            raise TypeError("func_id must be of type str.")
-        if func_id in self._func_values_by_id:
-            # first check if it already in there
-            raise ValueError("There are already values stored for func_id "
-                             + f"{func_id}. Changing the stored values is not "
-                             + "supported.")
-        self._func_values_by_id[func_id] = vals
-class TrajectoryFunctionValueCacheNPZ(collections.abc.Mapping):
-    """
-    Interface for caching trajectory function values in a numpy npz file.
-    Drop-in replacement for the dictionary that is used for in-memory caching.
-    """
-    _hash_traj_npz_key = "hash_of_trajs"  # key of hash_traj in npz file
-    # NOTE: this is written with the assumption that stored trajectories are
-    #       immutable (except for adding additional stored function values)
-    #       but we assume that the actual underlying trajectory stays the same,
-    #       i.e. it is not extended after first storing it
-    #       If it changes between two npz-cache initializiations, it will have
-    #       a different traj-hash and all cached CV values will be recalculated
-    # NOTE: npz appending inspired by: https://stackoverflow.com/a/66618141
-    # NOTE/FIXME: It would be nice to use the MAX_FILES_OPEN semaphore
-    #    but then we need async/await and then we need to go to a 'create'
-    #    classmethod that is async and required for initialization
-    #    (because __init__ cant be async)
-    #    but since we (have to) open the npz file in the other magic methods
-    #    too it does not really matter (as they can not be async either)?
-    # ...and as we also leave some room for non-semaphored file openings anyway
-    def __init__(self, fname_trajs: list[str], hash_traj: int) -> None:
-        """
-        Initialize a `TrajectoryFunctionValueCacheNPZ`.
-        Parameters
-        ----------
-        fname_trajs : list[str]
-            Absolute filenames to the trajectories for which we cache CV values.
-        hash_traj : int
-            Hash over the first part of the trajectory file,
-            used to make sure we cache only for the right trajectory
-            (and not any trajectories with the same filename).
-        """
-        self.fname_npz = self._get_cache_filename(fname_trajs=fname_trajs,
-                                                  trajectory_hash=hash_traj,
-                                                  )
-        self._hash_traj = hash_traj
-        self._func_ids = []
-        # sort out if we have an associated npz file already
-        # and if it is from/for the "right" trajectory file
-        self._ensure_consistent_npz()
-    def _ensure_consistent_npz(self):
-        # next line makes sure we only remember func_ids from the current npz
-        self._func_ids = []
-        if not os.path.isfile(self.fname_npz):
-            # no npz so nothing to do except making sure we have no func_ids
-            return
-        existing_npz_matches = False
-        with np.load(self.fname_npz, allow_pickle=False) as npzfile:
-            try:
-                saved_hash_traj = npzfile[self._hash_traj_npz_key][0]
-            except KeyError:
-                # we probably tripped over an old formatted npz
-                # so we will just rewrite it completely with hash
-                pass
-            else:
-                # old hash found, lets compare the two hashes
-                existing_npz_matches = (self._hash_traj == saved_hash_traj)
-                if existing_npz_matches:
-                    # if they do populate self with the func_ids we have
-                    # cached values for
-                    for k in npzfile.keys():
-                        if k != self._hash_traj_npz_key:
-                            self._func_ids.append(str(k))
-        # now if the old npz did not match we should remove it
-        # then we will rewrite it with the first cached CV values
-        if not existing_npz_matches:
-            logger.debug("Found existing npz file (%s) but the"
-                         " trajectory hash does not match."
-                         " Recreating the npz cache from scratch.",
-                         self.fname_npz
-                         )
-            os.unlink(self.fname_npz)
-    @classmethod
-    def _get_cache_filename(cls, fname_trajs: list[str],
-                            trajectory_hash: int) -> str:
-        """
-        Construct cachefilename from trajectory fname.
-        Parameters
-        ----------
-        fname_trajs : list[str]
-            Path to the trajectory for which we cache.
-        trajectory_hash : int
-            Hash of the trajectory (files).
-        Returns
-        -------
-        str
-            Path to the cachefile associated with trajectory.
-        """
-        head, tail = os.path.split(fname_trajs[0])
-        return os.path.join(head,
-            f".{tail}{'_MULTIPART' if len(fname_trajs) > 1 else ''}_asyncmd_cv_cache.npz"
-                            )
-    def __len__(self) -> int:
-        return len(self._func_ids)
-    def __iter__(self):
-        for func_id in self._func_ids:
-            yield func_id
-    def __getitem__(self, key: str) -> np.ndarray:
-        if not isinstance(key, str):
-            raise TypeError("Keys must be of type str.")
-        if key in self._func_ids:
-            with np.load(self.fname_npz, allow_pickle=False) as npzfile:
-                return npzfile[key]
-        else:
-            raise KeyError(f"No values for {key} cached (yet).")
-    def append(self, func_id: str, vals: np.ndarray) -> None:
-        """
-        Append values for given func_id.
-        Parameters
-        ----------
-        func_id : str
-            Function identifier.
-        vals : np.ndarray
-            Values of application of function with given func_id.
-        Raises
-        ------
-        TypeError
-            If ``func_id`` is not a string.
-        ValueError
-            If there are already values stored for ``func_id`` in self.
-        """
-        if not isinstance(func_id, str):
-            raise TypeError("func_id must be of type str.")
-        if func_id in self._func_ids:
-            # first check if it already in there
-            raise ValueError("There are already values stored for func_id "
-                             + f"{func_id}. Changing the stored values is not "
-                             + "supported.")
-        if len(self) == 0:
-            # these are the first cached CV values for this traj
-            # so we just create the (empty) npz file
-            np.savez(self.fname_npz)
-            # and write the trajectory hash
-            self._append_data_to_npz(name=self._hash_traj_npz_key,
-                                     value=np.array([self._hash_traj]),
-                                     )
-        # now we can append either way
-        # either already something cached, or freshly created empty file
-        self._append_data_to_npz(name=func_id, value=vals)
-        # add func_id to list of func_ids that we know are cached in npz
-        self._func_ids.append(func_id)
-    def _append_data_to_npz(self, name: str, value: np.ndarray) -> None:
-        # npz files are just zipped together collections of npy files
-        # so we just make a npy file saved into a BytesIO and then write that
-        # to the end of the npz file
-        bio = io.BytesIO()
-        np.save(bio, value)
-        with zipfile.ZipFile(file=self.fname_npz,
-                             mode="a",  # append!
-                             # uncompressed (but) zip archive member
-                             compression=zipfile.ZIP_STORED,
-                             ) as zfile:
-            zfile.writestr(f"{name}.npy", data=bio.getvalue())
-class TrajectoryFunctionValueCacheH5PY(collections.abc.Mapping):
-    """
-    Interface for caching trajectory function values in a given h5py group.
-    Drop-in replacement for the dictionary that is used for in-memory caching.
-    """
-    # NOTE: this is written with the assumption that stored trajectories are
-    #       immutable (except for adding additional stored function values)
-    #       but we assume that the actual underlying trajectory stays the same,
-    #       i.e. it is not extended after first storing it
-    def __init__(self, h5py_cache, hash_traj: int):
-        self.h5py_cache = h5py_cache
-        self._hash_traj = hash_traj
-        self._h5py_paths = {"ids": "FunctionIDs",
-                            "vals": "FunctionValues"
-                            }
-        self._root_grp = h5py_cache.require_group(
-                                            "asyncmd/"
-                                            + "TrajectoryFunctionValueCache/"
-                                            + f"{self._hash_traj}"
-                                                  )
-        self._ids_grp = self._root_grp.require_group(self._h5py_paths["ids"])
-        self._vals_grp = self._root_grp.require_group(self._h5py_paths["vals"])
-    def __len__(self):
-        return len(self._ids_grp.keys())
-    def __iter__(self):
-        for idx in range(len(self)):
-            yield self._ids_grp[str(idx)].asstr()[()]
-    def __getitem__(self, key):
-        if not isinstance(key, str):
-            raise TypeError("Keys must be of type str.")
-        for idx, k_val in enumerate(self):
-            if key == k_val:
-                return self._vals_grp[str(idx)][:]
-        # if we got until here the key is not in there
-        raise KeyError("Key not found.")
-    def append(self, func_id, vals):
-        if not isinstance(func_id, str):
-            raise TypeError("Keys (func_id) must be of type str.")
-        if func_id in self:
-            raise ValueError("There are already values stored for func_id "
-                             + f"{func_id}. Changing the stored values is not "
-                             + "supported.")
-        # TODO: do we also want to check vals for type?
-        name = str(len(self))
-        _ = self._ids_grp.create_dataset(name, data=func_id)
-        _ = self._vals_grp.create_dataset(name, data=vals)

asyncmd 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

asyncmd 0.3.2py3-none-any.whl → 0.4.0py3-none-any.whl