adaptivepy-sampling 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,93 @@
1
+ """KMeans clustering via scikit-learn."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, Optional
6
+
7
+ import numpy as np
8
+ from sklearn.cluster import KMeans
9
+
10
+ from adaptivepy.clustering.base import Clusterer
11
+
12
+
13
class SklearnKMeansClusterer(Clusterer):
    """Wrap ``sklearn.cluster.KMeans`` for AdaptivePy.

    Parameters
    ----------
    n_clusters : int
        Number of clusters.
    random_state : int or None
        Random seed passed to KMeans.
    **kwargs
        Additional keyword arguments forwarded to ``KMeans``. A value given
        here for ``n_init`` replaces the wrapper's default of ``10``.
    """

    def __init__(
        self,
        n_clusters: int,
        random_state: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        self.n_clusters = n_clusters
        self.random_state = random_state
        self._extra_kwargs = kwargs
        # Populated by ``fit``; ``None`` means "not fitted yet".
        self._model: Optional[KMeans] = None

    def fit(self, X: np.ndarray) -> "SklearnKMeansClusterer":
        """Fit KMeans on the provided feature matrix.

        Parameters
        ----------
        X : np.ndarray
            Feature matrix of shape ``(n_samples, n_features)``.

        Returns
        -------
        SklearnKMeansClusterer
            Fitted clusterer.
        """
        # Merge instead of passing ``n_init=10`` next to ``**kwargs``: the
        # latter raises ``TypeError`` (duplicate keyword) as soon as the
        # caller supplies their own ``n_init``.
        params = {"n_init": 10}
        params.update(self._extra_kwargs)
        self._model = KMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
            **params,
        )
        self._model.fit(X)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict cluster labels for ``X``.

        Parameters
        ----------
        X : np.ndarray
            Feature matrix.

        Returns
        -------
        np.ndarray
            Cluster labels.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        """
        if self._model is None:
            raise RuntimeError("Clusterer must be fitted before predict.")
        return self._model.predict(X)

    @property
    def cluster_centers_(self) -> Optional[np.ndarray]:
        """Return KMeans cluster centers, or ``None`` before ``fit``."""
        if self._model is None:
            return None
        return self._model.cluster_centers_

    @property
    def model(self) -> Any:
        """Return the fitted ``KMeans`` instance.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        """
        if self._model is None:
            raise RuntimeError("Clusterer must be fitted before accessing model.")
        return self._model
@@ -0,0 +1,94 @@
1
+ """MiniBatchKMeans clustering via scikit-learn."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Optional
6
+
7
+ import numpy as np
8
+ from sklearn.cluster import MiniBatchKMeans
9
+
10
+ from adaptivepy.clustering.base import Clusterer
11
+
12
+
13
class SklearnMiniBatchClusterer(Clusterer):
    """Wrap ``sklearn.cluster.MiniBatchKMeans`` for large datasets.

    Parameters
    ----------
    n_clusters : int
        Number of clusters.
    random_state : int or None
        Random seed passed to MiniBatchKMeans.
    **kwargs
        Additional keyword arguments forwarded to ``MiniBatchKMeans``.
        Values given here for ``n_init`` or ``batch_size`` replace the
        wrapper's defaults.
    """

    def __init__(
        self,
        n_clusters: int,
        random_state: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        self.n_clusters = n_clusters
        self.random_state = random_state
        self._extra_kwargs = kwargs
        # Populated by ``fit``; ``None`` means "not fitted yet".
        self._model: Optional[MiniBatchKMeans] = None

    def fit(self, X: np.ndarray) -> "SklearnMiniBatchClusterer":
        """Fit MiniBatchKMeans on the provided feature matrix.

        Parameters
        ----------
        X : np.ndarray
            Feature matrix of shape ``(n_samples, n_features)``.

        Returns
        -------
        SklearnMiniBatchClusterer
            Fitted clusterer.
        """
        # Wrapper defaults: 3 initialisations and a batch size of one tenth
        # of the sample count, clamped to the range [256, 1024].
        params = {
            "n_init": 3,
            "batch_size": min(1024, max(256, X.shape[0] // 10)),
        }
        # Caller-supplied kwargs win. Passing the defaults as explicit
        # keywords next to ``**kwargs`` would raise ``TypeError`` (duplicate
        # keyword) whenever the caller sets ``n_init`` or ``batch_size``.
        params.update(self._extra_kwargs)
        self._model = MiniBatchKMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
            **params,
        )
        self._model.fit(X)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict cluster labels for ``X``.

        Parameters
        ----------
        X : np.ndarray
            Feature matrix.

        Returns
        -------
        np.ndarray
            Cluster labels.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        """
        if self._model is None:
            raise RuntimeError("Clusterer must be fitted before predict.")
        return self._model.predict(X)

    @property
    def cluster_centers_(self) -> Optional[np.ndarray]:
        """Return MiniBatchKMeans cluster centers, or ``None`` before ``fit``."""
        if self._model is None:
            return None
        return self._model.cluster_centers_

    @property
    def model(self) -> Any:
        """Return the fitted ``MiniBatchKMeans`` instance.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        """
        if self._model is None:
            raise RuntimeError("Clusterer must be fitted before accessing model.")
        return self._model
@@ -0,0 +1,17 @@
1
+ """Configuration package for AdaptivePy."""
2
+
3
+ from adaptivepy.config.schema import (
4
+ ClusteringConfig,
5
+ RunConfig,
6
+ SeedSelectionConfig,
7
+ config_to_dict,
8
+ load_config,
9
+ )
10
+
11
+ __all__ = [
12
+ "ClusteringConfig",
13
+ "RunConfig",
14
+ "SeedSelectionConfig",
15
+ "config_to_dict",
16
+ "load_config",
17
+ ]
@@ -0,0 +1,196 @@
1
+ """Configuration schema and validation for AdaptivePy runs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ import yaml
10
+
11
+
12
+ DEFAULT_CLUSTERING_METHOD = "kmeans"
13
+ DEFAULT_SEED_SELECTION = "nearest_center"
14
+ DEFAULT_N_SEEDS = 10
15
+ DEFAULT_RANDOM_SEED = 42
16
+
17
+
18
@dataclass
class ClusteringConfig:
    """Settings that select and parameterize the clustering backend.

    Attributes
    ----------
    method : str
        Name of the clustering method. Documented choices are ``kmeans``,
        ``minibatch_kmeans`` and ``regular_space``; defaults to
        ``DEFAULT_CLUSTERING_METHOD``.
    n_clusters : int
        How many clusters to fit (default ``10``).
    params : dict
        Extra keyword arguments handed to the clusterer. Each instance gets
        its own empty dict by default.
    """

    method: str = DEFAULT_CLUSTERING_METHOD
    n_clusters: int = 10
    # default_factory avoids sharing one mutable dict between instances.
    params: Dict[str, Any] = field(default_factory=dict)
36
+
37
+
38
@dataclass
class SeedSelectionConfig:
    """Settings for choosing a seed frame inside a selected cluster.

    Attributes
    ----------
    method : str
        Name of the selection method. Documented choices are
        ``nearest_center`` and ``random_frame``; defaults to
        ``DEFAULT_SEED_SELECTION``.
    """

    method: str = DEFAULT_SEED_SELECTION
49
+
50
+
51
@dataclass
class RunConfig:
    """Complete set of options describing one adaptive sampling run.

    Attributes
    ----------
    features_dir : Path
        Directory holding the ``*.npy`` feature files (required).
    output_dir : Path
        Directory that receives the results (required).
    trajectories_dir : Path or None
        Directory with coordinate trajectories, if any.
    topology : Path or None
        Topology file; needed whenever trajectories are supplied.
    clustering : ClusteringConfig
        Clustering settings; a default ``ClusteringConfig`` when omitted.
    policies : list of str
        Names of the policies to evaluate in parallel; defaults to
        ``["least_counts"]``.
    n_seeds : int
        Seed frames to pick per policy; defaults to ``DEFAULT_N_SEEDS``.
    seed_selection : SeedSelectionConfig
        How a frame is picked inside each chosen cluster.
    random_seed : int
        Global seed for reproducibility; defaults to ``DEFAULT_RANDOM_SEED``.
    write_pdbs : bool
        Whether PDB files are written when trajectories are available.
    """

    # Required locations.
    features_dir: Path
    output_dir: Path

    # Optional coordinate data.
    trajectories_dir: Optional[Path] = None
    topology: Optional[Path] = None

    # Algorithm settings; factories give every instance its own objects.
    clustering: ClusteringConfig = field(default_factory=ClusteringConfig)
    policies: List[str] = field(default_factory=lambda: ["least_counts"])
    n_seeds: int = DEFAULT_N_SEEDS
    seed_selection: SeedSelectionConfig = field(default_factory=SeedSelectionConfig)

    # Run-wide switches.
    random_seed: int = DEFAULT_RANDOM_SEED
    write_pdbs: bool = True
89
+
90
+
91
def _config_section(raw: Dict[str, Any], name: str) -> Dict[str, Any]:
    """Return the mapping stored under ``name``, treating a missing or null entry as empty."""
    section = raw.get(name)
    if section is None:
        return {}
    if not isinstance(section, dict):
        raise ValueError(f"Configuration section '{name}' must be a mapping.")
    return section


def _config_int(mapping: Dict[str, Any], key: str, default: int) -> int:
    """Return ``mapping[key]`` as an int, using ``default`` when missing or null."""
    value = mapping.get(key)
    if value is None:
        return default
    try:
        return int(value)
    except (TypeError, ValueError) as exc:
        raise ValueError(
            f"Configuration value '{key}' must be an integer, got {value!r}."
        ) from exc


def load_config(path: str | Path) -> RunConfig:
    """Load and parse a YAML run configuration file.

    Optional sections and values that are present but null (for example a
    bare ``clustering:`` line) are treated as if they were omitted.

    Parameters
    ----------
    path : str or Path
        Path to the YAML configuration file.

    Returns
    -------
    RunConfig
        Parsed and validated configuration object.

    Raises
    ------
    FileNotFoundError
        If the configuration file does not exist.
    ValueError
        If required fields are missing or invalid.
    """
    config_path = Path(path)
    if not config_path.is_file():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    with config_path.open("r", encoding="utf-8") as handle:
        # An empty file parses to ``None``; treat it as an empty mapping.
        raw = yaml.safe_load(handle) or {}

    # A top-level list or scalar would otherwise fail later with a
    # confusing ``TypeError`` instead of the documented ``ValueError``.
    if not isinstance(raw, dict):
        raise ValueError("Configuration file must contain a YAML mapping.")

    # ``raw.get(...) is None`` rejects both a missing key and an explicit
    # null, which ``Path(None)`` could not handle.
    for required in ("features_dir", "output_dir"):
        if raw.get(required) is None:
            raise ValueError(f"Configuration must specify '{required}'.")

    features_dir = Path(raw["features_dir"])
    output_dir = Path(raw["output_dir"])
    trajectories_dir = (
        Path(raw["trajectories_dir"]) if raw.get("trajectories_dir") else None
    )
    topology = Path(raw["topology"]) if raw.get("topology") else None

    if trajectories_dir is not None and topology is None:
        raise ValueError(
            "When 'trajectories_dir' is provided, 'topology' must also be set."
        )

    clustering_raw = _config_section(raw, "clustering")
    clustering = ClusteringConfig(
        method=clustering_raw.get("method", DEFAULT_CLUSTERING_METHOD),
        n_clusters=_config_int(clustering_raw, "n_clusters", 10),
        params=dict(_config_section(clustering_raw, "params")),
    )

    seed_raw = _config_section(raw, "seed_selection")
    seed_selection = SeedSelectionConfig(
        method=seed_raw.get("method", DEFAULT_SEED_SELECTION),
    )

    policies = raw.get("policies")
    if policies is None:
        policies = ["least_counts"]
    elif isinstance(policies, str):
        # Accept a single policy name written as a scalar.
        policies = [policies]

    return RunConfig(
        features_dir=features_dir,
        output_dir=output_dir,
        trajectories_dir=trajectories_dir,
        topology=topology,
        clustering=clustering,
        policies=policies,
        n_seeds=_config_int(raw, "n_seeds", DEFAULT_N_SEEDS),
        seed_selection=seed_selection,
        random_seed=_config_int(raw, "random_seed", DEFAULT_RANDOM_SEED),
        write_pdbs=bool(raw.get("write_pdbs", True)),
    )
163
+
164
+
165
def config_to_dict(config: RunConfig) -> Dict[str, Any]:
    """Convert a :class:`RunConfig` to a plain dictionary for serialization.

    Paths are rendered as strings. ``trajectories_dir`` and ``topology`` are
    only included when they are set. The ``policies`` list and the clustering
    ``params`` dict are shallow copies, so editing the result does not modify
    ``config``.

    Parameters
    ----------
    config : RunConfig
        Configuration object to serialize.

    Returns
    -------
    dict
        YAML-serializable configuration dictionary.
    """
    result: Dict[str, Any] = {
        "features_dir": str(config.features_dir),
        "output_dir": str(config.output_dir),
        "clustering": {
            "method": config.clustering.method,
            "n_clusters": config.clustering.n_clusters,
            # Copy, like ``policies`` below, so the caller cannot mutate the
            # config through the returned dictionary.
            "params": dict(config.clustering.params),
        },
        "policies": list(config.policies),
        "n_seeds": config.n_seeds,
        "seed_selection": {"method": config.seed_selection.method},
        "random_seed": config.random_seed,
        "write_pdbs": config.write_pdbs,
    }
    if config.trajectories_dir is not None:
        result["trajectories_dir"] = str(config.trajectories_dir)
    if config.topology is not None:
        result["topology"] = str(config.topology)
    return result
@@ -0,0 +1,27 @@
1
+ """Input/output utilities for AdaptivePy."""
2
+
3
+ from adaptivepy.io.loader import (
4
+ list_feature_files,
5
+ list_trajectory_files,
6
+ load_features,
7
+ validate_dataset,
8
+ validate_feature_trajectory_mapping,
9
+ )
10
+ from adaptivepy.io.trajectory import (
11
+ build_trajectory_map,
12
+ extract_frame,
13
+ load_trajectory,
14
+ validate_trajectory_frame_counts,
15
+ )
16
+
17
+ __all__ = [
18
+ "build_trajectory_map",
19
+ "extract_frame",
20
+ "list_feature_files",
21
+ "list_trajectory_files",
22
+ "load_features",
23
+ "load_trajectory",
24
+ "validate_dataset",
25
+ "validate_feature_trajectory_mapping",
26
+ "validate_trajectory_frame_counts",
27
+ ]
@@ -0,0 +1,267 @@
1
+ """Feature loading and dataset validation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional, Tuple
8
+
9
+ import numpy as np
10
+
11
+ from adaptivepy.models import Dataset, FrameRecord
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def list_feature_files(features_dir: Path) -> List[Path]:
    """List ``*.npy`` feature files in a directory, sorted by name.

    Only regular files are returned; a sub-directory whose name happens to
    end in ``.npy`` is ignored. The search is not recursive.

    Parameters
    ----------
    features_dir : Path
        Directory containing feature arrays.

    Returns
    -------
    list of Path
        Sorted paths to feature files.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist.
    ValueError
        If no ``*.npy`` files are found.
    """
    features_dir = Path(features_dir)
    if not features_dir.is_dir():
        raise FileNotFoundError(f"Features directory not found: {features_dir}")

    # ``glob`` matches any directory entry, so filter out non-files that
    # could not be loaded as arrays anyway.
    files = sorted(p for p in features_dir.glob("*.npy") if p.is_file())
    if not files:
        raise ValueError(f"No .npy feature files found in {features_dir}")
    return files
44
+
45
+
46
def list_trajectory_files(trajectories_dir: Path) -> List[Path]:
    """List coordinate trajectory files found directly in a directory.

    Only regular files are returned; directories with a matching name are
    ignored. The search is not recursive.

    Parameters
    ----------
    trajectories_dir : Path
        Directory containing trajectory files.

    Returns
    -------
    list of Path
        Paths to trajectory files (``.xtc``, ``.dcd``, ``.trr``, ``.nc``,
        ``.pdb``), sorted by file name.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist.
    ValueError
        If no supported trajectory files are found.
    """
    trajectories_dir = Path(trajectories_dir)
    if not trajectories_dir.is_dir():
        raise FileNotFoundError(
            f"Trajectories directory not found: {trajectories_dir}"
        )

    extensions = ("*.xtc", "*.dcd", "*.trr", "*.nc", "*.pdb")
    # A set guards against the same path being reported by two patterns.
    matches = {
        candidate
        for pattern in extensions
        for candidate in trajectories_dir.glob(pattern)
        if candidate.is_file()
    }
    files = sorted(matches, key=lambda p: p.name)

    if not files:
        raise ValueError(
            f"No supported trajectory files found in {trajectories_dir}"
        )
    return files
83
+
84
+
85
def _stem(path: Path) -> str:
    """Return the final path component with its last suffix removed."""
    stem = path.stem
    return stem
88
+
89
+
90
def _repeated_stems(stems: List[str]) -> List[str]:
    """Return the sorted stems that occur more than once in ``stems``."""
    seen = set()
    repeated = set()
    for stem in stems:
        if stem in seen:
            repeated.add(stem)
        else:
            seen.add(stem)
    return sorted(repeated)


def validate_feature_trajectory_mapping(
    feature_files: List[Path],
    trajectory_files: Optional[List[Path]] = None,
) -> None:
    """Ensure feature and trajectory filenames match one-to-one.

    Files are paired by filename stem (the name without its last suffix).
    Nothing is checked when ``trajectory_files`` is ``None``.

    Parameters
    ----------
    feature_files : list of Path
        Feature ``.npy`` file paths.
    trajectory_files : list of Path or None
        Optional coordinate trajectory file paths.

    Raises
    ------
    ValueError
        If stems do not match exactly between features and trajectories, or
        if several files on one side share the same stem.
    """
    if trajectory_files is None:
        return

    feature_names = [f.stem for f in feature_files]
    traj_names = [t.stem for t in trajectory_files]
    feature_stems = set(feature_names)
    traj_stems = set(traj_names)

    missing_traj = feature_stems - traj_stems
    missing_features = traj_stems - feature_stems

    messages = []
    if missing_traj:
        messages.append(
            f"Features without matching trajectories: {sorted(missing_traj)}"
        )
    if missing_features:
        messages.append(
            f"Trajectories without matching features: {sorted(missing_features)}"
        )

    # Set comparison alone cannot see duplicates: ``a.xtc`` and ``a.pdb``
    # both map to stem ``a`` and would make the pairing ambiguous.
    repeated_features = _repeated_stems(feature_names)
    if repeated_features:
        messages.append(
            f"Multiple feature files share the same stem: {repeated_features}"
        )
    repeated_traj = _repeated_stems(traj_names)
    if repeated_traj:
        messages.append(
            f"Multiple trajectory files share the same stem: {repeated_traj}"
        )

    if messages:
        raise ValueError("; ".join(messages))
128
+
129
+
130
def load_features(features_dir: Path) -> Dataset:
    """Read every feature file in a directory into a single :class:`Dataset`.

    Files are visited in the order returned by ``list_feature_files``; the
    position in that order becomes the trajectory id. Each file must hold a
    2D array ``(n_frames, n_features)``, and all files must agree on
    ``n_features``. One ``FrameRecord`` is created per row, carrying its
    trajectory id, its row number within the file and its row number in the
    stacked matrix.

    Parameters
    ----------
    features_dir : Path
        Directory containing feature files.

    Returns
    -------
    Dataset
        Frame records, the vertically stacked feature matrix, the half-open
        ``(start, end)`` row range of each trajectory, and the trajectory
        names (file stems).

    Raises
    ------
    ValueError
        If an array is not 2D or the feature dimension differs between files.
    """
    records: List[FrameRecord] = []
    blocks: List[np.ndarray] = []
    row_ranges: Dict[int, Tuple[int, int]] = {}
    names: List[str] = []
    expected_width: Optional[int] = None
    cursor = 0  # Row of the stacked matrix where the next file starts.

    for traj_id, npy_path in enumerate(list_feature_files(features_dir)):
        block = np.load(npy_path)
        if block.ndim != 2:
            raise ValueError(
                f"Feature file {npy_path} must be 2D (n_frames, n_features), "
                f"got shape {block.shape}"
            )

        n_frames, width = block.shape
        if expected_width is None:
            # The first file fixes the feature dimension for all others.
            expected_width = width
        elif width != expected_width:
            raise ValueError(
                f"Inconsistent feature dimension in {npy_path}: "
                f"expected {expected_width}, got {width}"
            )

        stop = cursor + n_frames
        records.extend(
            FrameRecord(
                traj_id=traj_id,
                frame_id=row,
                features=block[row],
                global_index=cursor + row,
            )
            for row in range(n_frames)
        )

        blocks.append(block)
        row_ranges[traj_id] = (cursor, stop)
        names.append(_stem(npy_path))
        cursor = stop

        logger.info(
            "Loaded %s: %d frames, %d features",
            npy_path.name,
            n_frames,
            expected_width,
        )

    if blocks:
        stacked = np.vstack(blocks)
    else:
        stacked = np.empty((0, 0))

    return Dataset(
        frames=records,
        feature_matrix=stacked,
        traj_index_map=row_ranges,
        traj_names=names,
    )
212
+
213
+
214
def validate_dataset(
    dataset: Dataset,
    trajectory_files: Optional[List[Path]] = None,
) -> None:
    """Check that a loaded dataset is internally consistent.

    The checks run in this order: the dataset is non-empty, the feature
    matrix has one row per frame record, every record's feature vector has
    the matrix's column count, and, when ``trajectory_files`` is given, the
    trajectory file stems and count agree with the dataset's trajectory
    names.

    Parameters
    ----------
    dataset : Dataset
        Dataset to validate.
    trajectory_files : list of Path or None
        Optional trajectory files for cross-validation.

    Raises
    ------
    ValueError
        If internal consistency checks fail.
    """
    matrix = dataset.feature_matrix
    if matrix is None or not len(dataset.frames):
        raise ValueError("Dataset is empty.")

    n_rows, n_cols = matrix.shape
    if len(dataset.frames) != n_rows:
        raise ValueError(
            "Feature matrix row count does not match number of frame records."
        )

    expected_shape = (n_cols,)
    for record in dataset.frames:
        if record.features.shape == expected_shape:
            continue
        raise ValueError(
            f"Frame ({record.traj_id}, {record.frame_id}) has invalid "
            f"feature shape {record.features.shape}."
        )

    if trajectory_files is None:
        return

    names_from_features = set(dataset.traj_names)
    names_from_trajs = {path.stem for path in trajectory_files}
    only_features = names_from_features - names_from_trajs
    only_trajs = names_from_trajs - names_from_features

    problems = []
    if only_features:
        problems.append(
            f"Features without matching trajectories: {sorted(only_features)}"
        )
    if only_trajs:
        problems.append(
            f"Trajectories without matching features: {sorted(only_trajs)}"
        )
    if problems:
        raise ValueError("; ".join(problems))

    # Equal stem sets can still hide duplicates, so compare the counts too.
    if len(trajectory_files) != len(dataset.traj_names):
        raise ValueError(
            "Number of trajectory files must match number of feature files."
        )