adaptivepy-sampling 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adaptivepy/__init__.py +7 -0
- adaptivepy/api.py +229 -0
- adaptivepy/cli/__init__.py +5 -0
- adaptivepy/cli/run.py +68 -0
- adaptivepy/clustering/__init__.py +103 -0
- adaptivepy/clustering/base.py +73 -0
- adaptivepy/clustering/regular_space.py +135 -0
- adaptivepy/clustering/sklearn_kmeans.py +93 -0
- adaptivepy/clustering/sklearn_minibatch.py +94 -0
- adaptivepy/config/__init__.py +17 -0
- adaptivepy/config/schema.py +196 -0
- adaptivepy/io/__init__.py +27 -0
- adaptivepy/io/loader.py +267 -0
- adaptivepy/io/trajectory.py +151 -0
- adaptivepy/models.py +83 -0
- adaptivepy/output/__init__.py +23 -0
- adaptivepy/output/pdb_writer.py +59 -0
- adaptivepy/output/writer.py +229 -0
- adaptivepy/policies/__init__.py +21 -0
- adaptivepy/policies/base.py +105 -0
- adaptivepy/policies/least_counts.py +43 -0
- adaptivepy/policies/random.py +53 -0
- adaptivepy/selection/__init__.py +5 -0
- adaptivepy/selection/frame_selector.py +132 -0
- adaptivepy/stats/__init__.py +15 -0
- adaptivepy/stats/cluster_stats.py +118 -0
- adaptivepy/utils/__init__.py +6 -0
- adaptivepy/utils/io_utils.py +49 -0
- adaptivepy/utils/logging.py +55 -0
- adaptivepy_sampling-0.1.0.dist-info/METADATA +52 -0
- adaptivepy_sampling-0.1.0.dist-info/RECORD +34 -0
- adaptivepy_sampling-0.1.0.dist-info/WHEEL +5 -0
- adaptivepy_sampling-0.1.0.dist-info/entry_points.txt +2 -0
- adaptivepy_sampling-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""KMeans clustering via scikit-learn."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, Optional
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from sklearn.cluster import KMeans
|
|
9
|
+
|
|
10
|
+
from adaptivepy.clustering.base import Clusterer
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SklearnKMeansClusterer(Clusterer):
    """Wrap ``sklearn.cluster.KMeans`` for AdaptivePy.

    Parameters
    ----------
    n_clusters : int
        Number of clusters.
    random_state : int or None
        Random seed passed to KMeans.
    **kwargs
        Additional keyword arguments forwarded to ``KMeans``. ``n_init``
        defaults to 10 and may be overridden here.
    """

    def __init__(
        self,
        n_clusters: int,
        random_state: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        self.n_clusters = n_clusters
        self.random_state = random_state
        self._extra_kwargs = kwargs
        # Stays ``None`` until ``fit`` builds the underlying estimator.
        self._model: Optional[KMeans] = None

    def fit(self, X: np.ndarray) -> "SklearnKMeansClusterer":
        """Fit KMeans on the provided feature matrix.

        Parameters
        ----------
        X : np.ndarray
            Feature matrix of shape ``(n_samples, n_features)``.

        Returns
        -------
        SklearnKMeansClusterer
            Fitted clusterer.
        """
        # Merge the default into one dict: passing ``n_init=10`` next to
        # ``**self._extra_kwargs`` raises TypeError (duplicate keyword) as
        # soon as the caller supplies their own ``n_init``.
        model_kwargs = {"n_init": 10, **self._extra_kwargs}
        self._model = KMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
            **model_kwargs,
        )
        self._model.fit(X)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict cluster labels for ``X``.

        Parameters
        ----------
        X : np.ndarray
            Feature matrix.

        Returns
        -------
        np.ndarray
            Cluster labels.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        """
        if self._model is None:
            raise RuntimeError("Clusterer must be fitted before predict.")
        return self._model.predict(X)

    @property
    def cluster_centers_(self) -> Optional[np.ndarray]:
        """Return KMeans cluster centers, or ``None`` before ``fit``."""
        if self._model is None:
            return None
        return self._model.cluster_centers_

    @property
    def model(self) -> Any:
        """Return the fitted ``KMeans`` instance.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        """
        if self._model is None:
            raise RuntimeError("Clusterer must be fitted before accessing model.")
        return self._model
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""MiniBatchKMeans clustering via scikit-learn."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from sklearn.cluster import MiniBatchKMeans
|
|
9
|
+
|
|
10
|
+
from adaptivepy.clustering.base import Clusterer
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SklearnMiniBatchClusterer(Clusterer):
    """Wrap ``sklearn.cluster.MiniBatchKMeans`` for large datasets.

    Parameters
    ----------
    n_clusters : int
        Number of clusters.
    random_state : int or None
        Random seed passed to MiniBatchKMeans.
    **kwargs
        Additional keyword arguments forwarded to ``MiniBatchKMeans``.
        ``n_init`` (default 3) and ``batch_size`` (default derived from the
        number of samples) may be overridden here.
    """

    def __init__(
        self,
        n_clusters: int,
        random_state: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        self.n_clusters = n_clusters
        self.random_state = random_state
        self._extra_kwargs = kwargs
        # Stays ``None`` until ``fit`` builds the underlying estimator.
        self._model: Optional[MiniBatchKMeans] = None

    def fit(self, X: np.ndarray) -> "SklearnMiniBatchClusterer":
        """Fit MiniBatchKMeans on the provided feature matrix.

        Parameters
        ----------
        X : np.ndarray
            Feature matrix of shape ``(n_samples, n_features)``.

        Returns
        -------
        SklearnMiniBatchClusterer
            Fitted clusterer.
        """
        # Defaults are merged with the user kwargs so that a caller-supplied
        # ``n_init`` or ``batch_size`` overrides them instead of triggering a
        # duplicate-keyword TypeError.
        model_kwargs = {
            "n_init": 3,
            # One tenth of the samples, clamped to the range [256, 1024].
            "batch_size": min(1024, max(256, X.shape[0] // 10)),
            **self._extra_kwargs,
        }
        self._model = MiniBatchKMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
            **model_kwargs,
        )
        self._model.fit(X)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict cluster labels for ``X``.

        Parameters
        ----------
        X : np.ndarray
            Feature matrix.

        Returns
        -------
        np.ndarray
            Cluster labels.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        """
        if self._model is None:
            raise RuntimeError("Clusterer must be fitted before predict.")
        return self._model.predict(X)

    @property
    def cluster_centers_(self) -> Optional[np.ndarray]:
        """Return MiniBatchKMeans cluster centers, or ``None`` before ``fit``."""
        if self._model is None:
            return None
        return self._model.cluster_centers_

    @property
    def model(self) -> Any:
        """Return the fitted ``MiniBatchKMeans`` instance.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        """
        if self._model is None:
            raise RuntimeError("Clusterer must be fitted before accessing model.")
        return self._model
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Configuration package for AdaptivePy."""
|
|
2
|
+
|
|
3
|
+
from adaptivepy.config.schema import (
|
|
4
|
+
ClusteringConfig,
|
|
5
|
+
RunConfig,
|
|
6
|
+
SeedSelectionConfig,
|
|
7
|
+
config_to_dict,
|
|
8
|
+
load_config,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"ClusteringConfig",
|
|
13
|
+
"RunConfig",
|
|
14
|
+
"SeedSelectionConfig",
|
|
15
|
+
"config_to_dict",
|
|
16
|
+
"load_config",
|
|
17
|
+
]
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""Configuration schema and validation for AdaptivePy runs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Fallback values applied when a run configuration omits the matching key.
DEFAULT_CLUSTERING_METHOD = "kmeans"  # clustering backend name
DEFAULT_SEED_SELECTION = "nearest_center"  # frame selection method
DEFAULT_N_SEEDS = 10  # seed frames selected per policy
DEFAULT_RANDOM_SEED = 42  # global random seed
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class ClusteringConfig:
    """Settings for the clustering backend.

    Attributes
    ----------
    method : str
        Name of the clustering method; one of ``kmeans``,
        ``minibatch_kmeans`` or ``regular_space``.
    n_clusters : int
        How many clusters to fit.
    params : dict
        Extra keyword arguments handed to the clusterer.
    """

    method: str = DEFAULT_CLUSTERING_METHOD
    n_clusters: int = 10
    # A factory is used so every instance gets its own dict.
    params: Dict[str, Any] = field(default_factory=dict)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class SeedSelectionConfig:
    """Settings for picking seed frames inside the chosen clusters.

    Attributes
    ----------
    method : str
        Name of the selection method; ``nearest_center`` or
        ``random_frame``.
    """

    method: str = DEFAULT_SEED_SELECTION
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class RunConfig:
    """Complete set of options for one adaptive sampling run.

    Attributes
    ----------
    features_dir : Path
        Directory holding the ``*.npy`` feature files.
    output_dir : Path
        Directory that receives the results.
    trajectories_dir : Path or None
        Directory with coordinate trajectories, if any.
    topology : Path or None
        Topology file; needed whenever trajectories are supplied.
    clustering : ClusteringConfig
        Clustering settings.
    policies : list of str
        Names of the policies to evaluate in parallel.
    n_seeds : int
        How many seed frames each policy selects.
    seed_selection : SeedSelectionConfig
        How frames are chosen within the selected clusters.
    random_seed : int
        Global random seed, for reproducibility.
    write_pdbs : bool
        Whether PDB files are written when trajectories are available.
    """

    # Required fields come first (no defaults).
    features_dir: Path
    output_dir: Path

    # Optional inputs.
    trajectories_dir: Optional[Path] = None
    topology: Optional[Path] = None

    # Mutable defaults are built per instance through factories.
    clustering: ClusteringConfig = field(default_factory=ClusteringConfig)
    policies: List[str] = field(default_factory=lambda: ["least_counts"])
    n_seeds: int = DEFAULT_N_SEEDS
    seed_selection: SeedSelectionConfig = field(default_factory=SeedSelectionConfig)
    random_seed: int = DEFAULT_RANDOM_SEED
    write_pdbs: bool = True
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def load_config(path: str | Path) -> RunConfig:
    """Load and parse a YAML run configuration file.

    Parameters
    ----------
    path : str or Path
        Path to the YAML configuration file.

    Returns
    -------
    RunConfig
        Parsed and validated configuration object.

    Raises
    ------
    FileNotFoundError
        If the configuration file does not exist.
    ValueError
        If required fields are missing or invalid, including a top-level
        document or section that is not a mapping.
    """
    config_path = Path(path)
    if not config_path.is_file():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    with config_path.open("r", encoding="utf-8") as handle:
        # An empty file parses to ``None``; treat it as an empty mapping.
        raw = yaml.safe_load(handle) or {}

    # A YAML list or scalar at the top level would otherwise fail later with
    # an AttributeError/TypeError instead of the documented ValueError.
    if not isinstance(raw, dict):
        raise ValueError("Configuration file must contain a top-level mapping.")

    # A key that is present but null (``features_dir:``) counts as missing.
    if raw.get("features_dir") is None:
        raise ValueError("Configuration must specify 'features_dir'.")
    if raw.get("output_dir") is None:
        raise ValueError("Configuration must specify 'output_dir'.")

    features_dir = Path(raw["features_dir"])
    output_dir = Path(raw["output_dir"])
    trajectories_dir = (
        Path(raw["trajectories_dir"]) if raw.get("trajectories_dir") else None
    )
    topology = Path(raw["topology"]) if raw.get("topology") else None

    if trajectories_dir is not None and topology is None:
        raise ValueError(
            "When 'trajectories_dir' is provided, 'topology' must also be set."
        )

    clustering_raw = _config_section(raw, "clustering")
    clustering = ClusteringConfig(
        method=clustering_raw.get("method", DEFAULT_CLUSTERING_METHOD),
        n_clusters=int(clustering_raw.get("n_clusters", 10)),
        # ``params:`` left empty in YAML yields None; fall back to no params.
        params=dict(clustering_raw.get("params") or {}),
    )

    seed_raw = _config_section(raw, "seed_selection")
    seed_selection = SeedSelectionConfig(
        method=seed_raw.get("method", DEFAULT_SEED_SELECTION),
    )

    policies = raw.get("policies")
    if policies is None:
        policies = ["least_counts"]
    elif isinstance(policies, str):
        # A single policy may be written as a bare string.
        policies = [policies]
    elif isinstance(policies, (list, tuple)):
        policies = list(policies)
    else:
        raise ValueError(
            "'policies' must be a policy name or a list of policy names."
        )

    return RunConfig(
        features_dir=features_dir,
        output_dir=output_dir,
        trajectories_dir=trajectories_dir,
        topology=topology,
        clustering=clustering,
        policies=policies,
        n_seeds=int(raw.get("n_seeds", DEFAULT_N_SEEDS)),
        seed_selection=seed_selection,
        random_seed=int(raw.get("random_seed", DEFAULT_RANDOM_SEED)),
        write_pdbs=bool(raw.get("write_pdbs", True)),
    )


def _config_section(raw: Dict[str, Any], key: str) -> Dict[str, Any]:
    """Return the mapping stored under ``key``; empty dict if absent or null."""
    section = raw.get(key)
    if section is None:
        return {}
    if not isinstance(section, dict):
        raise ValueError(f"Configuration section '{key}' must be a mapping.")
    return section
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def config_to_dict(config: RunConfig) -> Dict[str, Any]:
    """Convert a :class:`RunConfig` to a plain dictionary for serialization.

    Paths are rendered as strings. ``trajectories_dir`` and ``topology`` are
    only included when they are set. Mutable members (``policies`` and the
    clustering ``params``) are copied so that editing the returned dictionary
    does not modify ``config``.

    Parameters
    ----------
    config : RunConfig
        Configuration object to serialize.

    Returns
    -------
    dict
        YAML-serializable configuration dictionary.
    """
    result: Dict[str, Any] = {
        "features_dir": str(config.features_dir),
        "output_dir": str(config.output_dir),
        "clustering": {
            "method": config.clustering.method,
            "n_clusters": config.clustering.n_clusters,
            # Shallow copy, mirroring ``list(config.policies)`` below.
            "params": dict(config.clustering.params),
        },
        "policies": list(config.policies),
        "n_seeds": config.n_seeds,
        "seed_selection": {"method": config.seed_selection.method},
        "random_seed": config.random_seed,
        "write_pdbs": config.write_pdbs,
    }
    if config.trajectories_dir is not None:
        result["trajectories_dir"] = str(config.trajectories_dir)
    if config.topology is not None:
        result["topology"] = str(config.topology)
    return result
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Input/output utilities for AdaptivePy."""
|
|
2
|
+
|
|
3
|
+
from adaptivepy.io.loader import (
|
|
4
|
+
list_feature_files,
|
|
5
|
+
list_trajectory_files,
|
|
6
|
+
load_features,
|
|
7
|
+
validate_dataset,
|
|
8
|
+
validate_feature_trajectory_mapping,
|
|
9
|
+
)
|
|
10
|
+
from adaptivepy.io.trajectory import (
|
|
11
|
+
build_trajectory_map,
|
|
12
|
+
extract_frame,
|
|
13
|
+
load_trajectory,
|
|
14
|
+
validate_trajectory_frame_counts,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"build_trajectory_map",
|
|
19
|
+
"extract_frame",
|
|
20
|
+
"list_feature_files",
|
|
21
|
+
"list_trajectory_files",
|
|
22
|
+
"load_features",
|
|
23
|
+
"load_trajectory",
|
|
24
|
+
"validate_dataset",
|
|
25
|
+
"validate_feature_trajectory_mapping",
|
|
26
|
+
"validate_trajectory_frame_counts",
|
|
27
|
+
]
|
adaptivepy/io/loader.py
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
"""Feature loading and dataset validation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Dict, List, Optional, Tuple
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from adaptivepy.models import Dataset, FrameRecord
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def list_feature_files(features_dir: Path) -> List[Path]:
    """List ``*.npy`` feature files in a directory, sorted by path.

    Only regular files are returned; a sub-directory whose name happens to end
    in ``.npy`` is ignored. The search is not recursive.

    Parameters
    ----------
    features_dir : Path
        Directory containing feature arrays.

    Returns
    -------
    list of Path
        Sorted paths to feature files.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist.
    ValueError
        If no ``*.npy`` files are found.
    """
    features_dir = Path(features_dir)
    if not features_dir.is_dir():
        raise FileNotFoundError(f"Features directory not found: {features_dir}")

    # ``glob`` also matches directories; skip them so callers never try to
    # load a directory as an array.
    files = sorted(p for p in features_dir.glob("*.npy") if p.is_file())
    if not files:
        raise ValueError(f"No .npy feature files found in {features_dir}")
    return files
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def list_trajectory_files(trajectories_dir: Path) -> List[Path]:
    """List coordinate trajectory files found directly in a directory.

    Files are matched by extension (``.xtc``, ``.dcd``, ``.trr``, ``.nc``,
    ``.pdb``). Only regular files are returned, and the search is not
    recursive.

    Parameters
    ----------
    trajectories_dir : Path
        Directory containing trajectory files.

    Returns
    -------
    list of Path
        Paths to the matching trajectory files, sorted by file name.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist.
    ValueError
        If no supported trajectory files are found.
    """
    trajectories_dir = Path(trajectories_dir)
    if not trajectories_dir.is_dir():
        raise FileNotFoundError(
            f"Trajectories directory not found: {trajectories_dir}"
        )

    extensions = ("*.xtc", "*.dcd", "*.trr", "*.nc", "*.pdb")
    # ``glob`` also matches directories; keep regular files only. The set
    # guards against a path being collected by more than one pattern.
    matches = {
        candidate
        for pattern in extensions
        for candidate in trajectories_dir.glob(pattern)
        if candidate.is_file()
    }
    files = sorted(matches, key=lambda p: p.name)

    if not files:
        raise ValueError(
            f"No supported trajectory files found in {trajectories_dir}"
        )
    return files
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _stem(path: Path) -> str:
|
|
86
|
+
"""Return the filename stem without extension."""
|
|
87
|
+
return path.stem
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def validate_feature_trajectory_mapping(
    feature_files: List[Path],
    trajectory_files: Optional[List[Path]] = None,
) -> None:
    """Ensure feature and trajectory filenames match one-to-one.

    Files are paired by filename stem (name without the last suffix). The
    check is skipped entirely when ``trajectory_files`` is ``None``.

    Parameters
    ----------
    feature_files : list of Path
        Feature ``.npy`` file paths.
    trajectory_files : list of Path or None
        Optional coordinate trajectory file paths.

    Raises
    ------
    ValueError
        If stems do not match exactly between features and trajectories, or
        if a stem occurs more than once on either side (for example
        ``a.xtc`` and ``a.dcd``), which would make the pairing ambiguous.
    """
    if trajectory_files is None:
        return

    feature_names = [f.stem for f in feature_files]
    traj_names = [t.stem for t in trajectory_files]
    feature_stems = set(feature_names)
    traj_stems = set(traj_names)

    messages = []

    missing_traj = feature_stems - traj_stems
    if missing_traj:
        messages.append(
            f"Features without matching trajectories: {sorted(missing_traj)}"
        )

    missing_features = traj_stems - feature_stems
    if missing_features:
        messages.append(
            f"Trajectories without matching features: {sorted(missing_features)}"
        )

    # Set comparison alone cannot see repeated stems, so a directory holding
    # both ``a.xtc`` and ``a.dcd`` would pass while not being one-to-one.
    duplicate_features = _duplicate_stems(feature_names)
    if duplicate_features:
        messages.append(f"Duplicate feature stems: {duplicate_features}")

    duplicate_traj = _duplicate_stems(traj_names)
    if duplicate_traj:
        messages.append(f"Duplicate trajectory stems: {duplicate_traj}")

    if messages:
        raise ValueError("; ".join(messages))


def _duplicate_stems(names: List[str]) -> List[str]:
    """Return the sorted stems that appear more than once in ``names``."""
    seen = set()
    repeated = set()
    for name in names:
        if name in seen:
            repeated.add(name)
        else:
            seen.add(name)
    return sorted(repeated)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def load_features(features_dir: Path) -> Dataset:
    """Read every feature file in a directory into one :class:`Dataset`.

    Files are visited in the order returned by ``list_feature_files``; the
    position of a file in that order becomes its ``traj_id``. Each array must
    be 2D, ``(n_frames, n_features)``, and all files must share the same
    number of features. One ``FrameRecord`` is created per row.

    Parameters
    ----------
    features_dir : Path
        Directory containing feature files.

    Returns
    -------
    Dataset
        Frame records, the row-wise stacked feature matrix, the
        ``traj_id -> (start, end)`` row span of each file in that matrix
        (``end`` exclusive), and the file stems as trajectory names.

    Raises
    ------
    ValueError
        If an array is not 2D or its feature dimension differs from the
        first file's.
    """
    records: List[FrameRecord] = []
    blocks: List[np.ndarray] = []
    spans: Dict[int, Tuple[int, int]] = {}
    names: List[str] = []
    expected_dim: Optional[int] = None
    offset = 0  # row index in the stacked matrix where the next file starts

    for traj_id, feature_path in enumerate(list_feature_files(features_dir)):
        block = np.load(feature_path)
        if block.ndim != 2:
            raise ValueError(
                f"Feature file {feature_path} must be 2D (n_frames, n_features), "
                f"got shape {block.shape}"
            )

        width = block.shape[1]
        if expected_dim is None:
            # The first file fixes the feature dimension for all others.
            expected_dim = width
        elif width != expected_dim:
            raise ValueError(
                f"Inconsistent feature dimension in {feature_path}: "
                f"expected {expected_dim}, got {width}"
            )

        n_frames = block.shape[0]
        stop = offset + n_frames

        records.extend(
            FrameRecord(
                traj_id=traj_id,
                frame_id=row,
                features=block[row],
                global_index=offset + row,
            )
            for row in range(n_frames)
        )

        blocks.append(block)
        spans[traj_id] = (offset, stop)
        names.append(_stem(feature_path))
        offset = stop

        logger.info(
            "Loaded %s: %d frames, %d features",
            feature_path.name,
            n_frames,
            expected_dim,
        )

    if blocks:
        feature_matrix = np.vstack(blocks)
    else:
        feature_matrix = np.empty((0, 0))

    return Dataset(
        frames=records,
        feature_matrix=feature_matrix,
        traj_index_map=spans,
        traj_names=names,
    )
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def validate_dataset(
    dataset: Dataset,
    trajectory_files: Optional[List[Path]] = None,
) -> None:
    """Check that a loaded dataset is internally consistent.

    Verifies that the dataset is non-empty, that the feature matrix has one
    row per frame record, and that every record's feature vector has the
    matrix's column count. When ``trajectory_files`` is given, the file stems
    must equal the dataset's trajectory names and the counts must agree.

    Parameters
    ----------
    dataset : Dataset
        Dataset to validate.
    trajectory_files : list of Path or None
        Optional trajectory files for cross-validation.

    Raises
    ------
    ValueError
        If any of the consistency checks fails.
    """
    if dataset.feature_matrix is None or len(dataset.frames) == 0:
        raise ValueError("Dataset is empty.")

    n_rows, n_cols = dataset.feature_matrix.shape
    if len(dataset.frames) != n_rows:
        raise ValueError(
            "Feature matrix row count does not match number of frame records."
        )

    expected_shape = (n_cols,)
    for rec in dataset.frames:
        if rec.features.shape != expected_shape:
            raise ValueError(
                f"Frame ({rec.traj_id}, {rec.frame_id}) has invalid "
                f"feature shape {rec.features.shape}."
            )

    # Nothing further to check without trajectory files.
    if trajectory_files is None:
        return

    known_names = set(dataset.traj_names)
    file_stems = {path.stem for path in trajectory_files}
    missing_traj = known_names - file_stems
    missing_features = file_stems - known_names

    problems = []
    if missing_traj:
        problems.append(
            f"Features without matching trajectories: {sorted(missing_traj)}"
        )
    if missing_features:
        problems.append(
            f"Trajectories without matching features: {sorted(missing_features)}"
        )
    if problems:
        raise ValueError("; ".join(problems))

    # Equal stem sets can still hide duplicates (e.g. ``a.xtc`` and
    # ``a.dcd``), so the raw counts are compared as well.
    if len(trajectory_files) != len(dataset.traj_names):
        raise ValueError(
            "Number of trajectory files must match number of feature files."
        )
|