PyPI - adaptivepy-sampling - Versions diffs - 0.1.0__py3-none-any.whl - Mend

adaptivepy-sampling 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

adaptivepy/__init__.py +7 -0
adaptivepy/api.py +229 -0
adaptivepy/cli/__init__.py +5 -0
adaptivepy/cli/run.py +68 -0
adaptivepy/clustering/__init__.py +103 -0
adaptivepy/clustering/base.py +73 -0
adaptivepy/clustering/regular_space.py +135 -0
adaptivepy/clustering/sklearn_kmeans.py +93 -0
adaptivepy/clustering/sklearn_minibatch.py +94 -0
adaptivepy/config/__init__.py +17 -0
adaptivepy/config/schema.py +196 -0
adaptivepy/io/__init__.py +27 -0
adaptivepy/io/loader.py +267 -0
adaptivepy/io/trajectory.py +151 -0
adaptivepy/models.py +83 -0
adaptivepy/output/__init__.py +23 -0
adaptivepy/output/pdb_writer.py +59 -0
adaptivepy/output/writer.py +229 -0
adaptivepy/policies/__init__.py +21 -0
adaptivepy/policies/base.py +105 -0
adaptivepy/policies/least_counts.py +43 -0
adaptivepy/policies/random.py +53 -0
adaptivepy/selection/__init__.py +5 -0
adaptivepy/selection/frame_selector.py +132 -0
adaptivepy/stats/__init__.py +15 -0
adaptivepy/stats/cluster_stats.py +118 -0
adaptivepy/utils/__init__.py +6 -0
adaptivepy/utils/io_utils.py +49 -0
adaptivepy/utils/logging.py +55 -0
adaptivepy_sampling-0.1.0.dist-info/METADATA +52 -0
adaptivepy_sampling-0.1.0.dist-info/RECORD +34 -0
adaptivepy_sampling-0.1.0.dist-info/WHEEL +5 -0
adaptivepy_sampling-0.1.0.dist-info/entry_points.txt +2 -0
adaptivepy_sampling-0.1.0.dist-info/top_level.txt +1 -0

adaptivepy/io/trajectory.py ADDED Viewed

@@ -0,0 +1,151 @@
+"""Coordinate trajectory loading and frame extraction via mdtraj."""
+from __future__ import annotations
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional
+import mdtraj as md
+from adaptivepy.io.loader import list_trajectory_files
+logger = logging.getLogger(__name__)
+def build_trajectory_map(
+    trajectories_dir: Path,
+    traj_names: List[str],
+) -> Dict[int, Path]:
+    """Map trajectory IDs to coordinate file paths by matching stems.
+    Parameters
+    ----------
+    trajectories_dir : Path
+        Directory containing coordinate trajectories.
+    traj_names : list of str
+        Basenames corresponding to feature files (e.g. ``traj_0``).
+    Returns
+    -------
+    dict
+        Mapping from ``traj_id`` to trajectory file path.
+    Raises
+    ------
+    ValueError
+        If a trajectory file cannot be found for any ``traj_name``.
+    """
+    traj_files = list_trajectory_files(trajectories_dir)
+    stem_to_path = {path.stem: path for path in traj_files}
+    mapping: Dict[int, Path] = {}
+    for traj_id, name in enumerate(traj_names):
+        if name not in stem_to_path:
+            raise ValueError(
+                f"No trajectory file found matching feature stem '{name}' "
+                f"in {trajectories_dir}"
+            )
+        mapping[traj_id] = stem_to_path[name]
+    return mapping
+def load_trajectory(topology: Path, trajectory_path: Path) -> md.Trajectory:
+    """Load a single trajectory using mdtraj.
+    Parameters
+    ----------
+    topology : Path
+        Topology file (PDB, parm7, etc.).
+    trajectory_path : Path
+        Coordinate trajectory file.
+    Returns
+    -------
+    mdtraj.Trajectory
+        Loaded trajectory object.
+    """
+    logger.info("Loading trajectory %s with topology %s", trajectory_path, topology)
+    return md.load(str(trajectory_path), top=str(topology))
+def extract_frame(
+    topology: Path,
+    trajectory_path: Path,
+    frame_id: int,
+) -> md.Trajectory:
+    """Load a trajectory and return a single-frame subset.
+    Parameters
+    ----------
+    topology : Path
+        Topology file path.
+    trajectory_path : Path
+        Coordinate trajectory file path.
+    frame_id : int
+        Zero-based frame index to extract.
+    Returns
+    -------
+    mdtraj.Trajectory
+        Single-frame trajectory suitable for PDB export.
+    """
+    traj = load_trajectory(topology, trajectory_path)
+    if frame_id < 0 or frame_id >= traj.n_frames:
+        raise IndexError(
+            f"frame_id {frame_id} out of range for trajectory with "
+            f"{traj.n_frames} frames ({trajectory_path})"
+        )
+    return traj[frame_id]
+def get_trajectory_frame_count(topology: Path, trajectory_path: Path) -> int:
+    """Return the number of frames in a trajectory without loading all coordinates.
+    Parameters
+    ----------
+    topology : Path
+        Topology file path.
+    trajectory_path : Path
+        Coordinate trajectory file path.
+    Returns
+    -------
+    int
+        Number of frames in the trajectory.
+    """
+    traj = md.load(str(trajectory_path), top=str(topology))
+    return traj.n_frames
+def validate_trajectory_frame_counts(
+    topology: Path,
+    trajectory_map: Dict[int, Path],
+    expected_counts: Dict[int, int],
+) -> None:
+    """Verify trajectory frame counts match feature frame counts.
+    Parameters
+    ----------
+    topology : Path
+        Topology file path.
+    trajectory_map : dict
+        Mapping from ``traj_id`` to trajectory file.
+    expected_counts : dict
+        Expected frame count per ``traj_id`` from features.
+    Raises
+    ------
+    ValueError
+        If any trajectory has a different number of frames than its features.
+    """
+    for traj_id, traj_path in trajectory_map.items():
+        n_traj_frames = get_trajectory_frame_count(topology, traj_path)
+        n_feature_frames = expected_counts.get(traj_id)
+        if n_feature_frames is None:
+            continue
+        if n_traj_frames != n_feature_frames:
+            raise ValueError(
+                f"Frame count mismatch for traj_id {traj_id} ({traj_path.name}): "
+                f"trajectory has {n_traj_frames}, features have {n_feature_frames}"
+            )

adaptivepy/models.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""Core data models for AdaptivePy."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+import numpy as np
+@dataclass
+class FrameRecord:
+    """A single frame tracked through the adaptive sampling pipeline.
+    ``cluster_id`` and ``global_index`` default to ``None``, so a record can
+    be created before those values are known.
+    Attributes
+    ----------
+    traj_id : int
+        Index of the source trajectory.
+    frame_id : int
+        Frame index within the source trajectory.
+    features : np.ndarray
+        Feature vector for this frame, shape ``(n_features,)``.
+    cluster_id : int or None
+        Assigned cluster label after clustering.
+    global_index : int or None
+        Row index in the concatenated feature matrix.
+    """
+    # Required fields come first; their order defines the positional
+    # arguments of the generated ``__init__``.
+    traj_id: int
+    frame_id: int
+    features: np.ndarray
+    # Optional fields, ``None`` until assigned.
+    cluster_id: Optional[int] = None
+    global_index: Optional[int] = None
+@dataclass
+class Dataset:
+    """Internal representation of loaded trajectory features.
+    Every field has a default, so ``Dataset()`` yields an empty container
+    with ``feature_matrix`` set to ``None``.
+    Attributes
+    ----------
+    frames : list of FrameRecord
+        One record per frame across all trajectories.
+    feature_matrix : np.ndarray or None
+        Concatenated features, shape ``(n_total_frames, n_features)``;
+        ``None`` until a matrix is assigned.
+    traj_index_map : dict
+        Maps ``traj_id`` to ``(start_index, end_index)`` in ``feature_matrix``.
+        NOTE(review): whether ``end_index`` is inclusive is not visible
+        here; confirm against the code that fills this map.
+    traj_names : list of str
+        Basenames of feature files (without extension), e.g. ``traj_0``.
+    """
+    # ``default_factory`` gives each instance its own list/dict instead of a
+    # shared mutable default.
+    frames: List[FrameRecord] = field(default_factory=list)
+    feature_matrix: Optional[np.ndarray] = None
+    traj_index_map: Dict[int, tuple[int, int]] = field(default_factory=dict)
+    traj_names: List[str] = field(default_factory=list)
+@dataclass
+class SeedResult:
+    """A selected seed frame produced by a policy.
+    All six fields are required; none has a default value.
+    Attributes
+    ----------
+    seed_id : int
+        Sequential identifier within a policy run.
+    policy : str
+        Name of the policy that selected this seed.
+    traj_id : int
+        Source trajectory index.
+    frame_id : int
+        Frame index within the source trajectory.
+    cluster_id : int
+        Cluster from which the seed was drawn.
+    global_index : int
+        Row index in the concatenated feature matrix.
+    """
+    # Field order defines the positional arguments of the generated
+    # ``__init__``.
+    seed_id: int
+    policy: str
+    traj_id: int
+    frame_id: int
+    cluster_id: int
+    global_index: int

adaptivepy/output/__init__.py ADDED Viewed

@@ -0,0 +1,23 @@
+"""Output writers for AdaptivePy."""
+from adaptivepy.output.pdb_writer import write_seed_pdbs
+from adaptivepy.output.writer import (
+    write_assignments,
+    write_cluster_model,
+    write_cluster_statistics,
+    write_combined_metadata,
+    write_policy_outputs,
+    write_run_config,
+    write_seeds_csv,
+)
+__all__ = [
+    "write_assignments",
+    "write_cluster_model",
+    "write_cluster_statistics",
+    "write_combined_metadata",
+    "write_policy_outputs",
+    "write_run_config",
+    "write_seed_pdbs",
+    "write_seeds_csv",
+]

adaptivepy/output/pdb_writer.py ADDED Viewed

@@ -0,0 +1,59 @@
+"""PDB export for selected seed frames using mdtraj."""
+from __future__ import annotations
+import logging
+from pathlib import Path
+from typing import Dict, List
+from adaptivepy.io.trajectory import extract_frame
+from adaptivepy.models import SeedResult
+from adaptivepy.utils.io_utils import ensure_dir
+logger = logging.getLogger(__name__)
+def write_seed_pdbs(
+    seeds: List[SeedResult],
+    topology: Path,
+    trajectory_map: Dict[int, Path],
+    output_dir: Path,
+) -> List[Path]:
+    """Extract coordinate frames and save each seed as a PDB file.
+    Parameters
+    ----------
+    seeds : list of SeedResult
+        Selected seed frames.
+    topology : Path
+        Topology file for mdtraj loading.
+    trajectory_map : dict
+        Mapping from ``traj_id`` to trajectory file path.
+    output_dir : Path
+        Directory where PDB files are written (typically ``pdbs/``).
+    Returns
+    -------
+    list of Path
+        Paths to written PDB files.
+    """
+    pdb_dir = ensure_dir(output_dir / "pdbs")
+    written: List[Path] = []
+    for seed in seeds:
+        traj_path = trajectory_map[seed.traj_id]
+        frame = extract_frame(topology, traj_path, seed.frame_id)
+        pdb_path = pdb_dir / (
+            f"seed_{seed.seed_id}_traj{seed.traj_id}_frame{seed.frame_id}.pdb"
+        )
+        frame.save_pdb(str(pdb_path))
+        written.append(pdb_path)
+        logger.info(
+            "Wrote PDB for seed %d: traj=%d frame=%d -> %s",
+            seed.seed_id,
+            seed.traj_id,
+            seed.frame_id,
+            pdb_path.name,
+        )
+    return written

adaptivepy/output/writer.py ADDED Viewed

@@ -0,0 +1,229 @@
+"""Write run outputs: CSV metadata, numpy arrays, and serialized models."""
+from __future__ import annotations
+import csv
+import logging
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+import joblib
+import numpy as np
+import yaml
+from adaptivepy.config.schema import RunConfig, config_to_dict
+from adaptivepy.models import SeedResult
+from adaptivepy.stats.cluster_stats import ClusterStats, cluster_stats_to_rows
+from adaptivepy.utils.io_utils import copy_file, ensure_dir
+logger = logging.getLogger(__name__)
+def write_run_config(config: RunConfig, output_dir: Path, source_path: Path) -> Path:
+    """Save a copy of the run configuration to the output directory.
+    Parameters
+    ----------
+    config : RunConfig
+        Parsed configuration object.
+    output_dir : Path
+        Run output directory.
+    source_path : Path
+        Original YAML file path (copied verbatim when available).
+    Returns
+    -------
+    Path
+        Path to the saved configuration file.
+    """
+    dst = ensure_dir(output_dir) / "run_config.yaml"
+    if source_path.is_file():
+        copy_file(source_path, dst)
+    else:
+        with dst.open("w", encoding="utf-8") as handle:
+            yaml.safe_dump(config_to_dict(config), handle, sort_keys=False)
+    return dst
+def write_assignments(assignments: np.ndarray, output_dir: Path) -> Path:
+    """Save per-frame cluster assignments as a numpy array.
+    Parameters
+    ----------
+    assignments : np.ndarray
+        Cluster label per frame.
+    output_dir : Path
+        Run output directory.
+    Returns
+    -------
+    Path
+        Path to ``assignments.npy``.
+    """
+    path = ensure_dir(output_dir) / "assignments.npy"
+    np.save(path, assignments)
+    return path
+def write_cluster_model(model: Any, output_dir: Path) -> Path:
+    """Serialize the fitted clustering model with joblib.
+    Parameters
+    ----------
+    model : object
+        Fitted clustering model.
+    output_dir : Path
+        Run output directory.
+    Returns
+    -------
+    Path
+        Path to ``cluster_model.pkl``.
+    """
+    path = ensure_dir(output_dir) / "cluster_model.pkl"
+    joblib.dump(model, path)
+    return path
+def write_cluster_statistics(
+    cluster_stats: ClusterStats,
+    output_dir: Path,
+) -> Path:
+    """Write cluster population statistics to ``metadata.csv``.
+    Parameters
+    ----------
+    cluster_stats : dict
+        Per-cluster statistics.
+    output_dir : Path
+        Run output directory.
+    Returns
+    -------
+    Path
+        Path to ``metadata.csv``.
+    """
+    path = ensure_dir(output_dir) / "metadata.csv"
+    rows = cluster_stats_to_rows(cluster_stats)
+    with path.open("w", newline="", encoding="utf-8") as handle:
+        writer = csv.DictWriter(handle, fieldnames=["cluster_id", "population"])
+        writer.writeheader()
+        writer.writerows(rows)
+    return path
+def write_seeds_csv(seeds: Iterable[SeedResult], output_dir: Path) -> Path:
+    """Write selected seeds to ``seeds.csv``.
+    Parameters
+    ----------
+    seeds : iterable of SeedResult
+        Seed records for one policy.
+    output_dir : Path
+        Policy-specific output directory.
+    Returns
+    -------
+    Path
+        Path to ``seeds.csv``.
+    """
+    path = ensure_dir(output_dir) / "seeds.csv"
+    fieldnames = [
+        "seed_id",
+        "policy",
+        "traj_id",
+        "frame_id",
+        "cluster_id",
+        "global_index",
+    ]
+    with path.open("w", newline="", encoding="utf-8") as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames)
+        writer.writeheader()
+        for seed in seeds:
+            writer.writerow(
+                {
+                    "seed_id": seed.seed_id,
+                    "policy": seed.policy,
+                    "traj_id": seed.traj_id,
+                    "frame_id": seed.frame_id,
+                    "cluster_id": seed.cluster_id,
+                    "global_index": seed.global_index,
+                }
+            )
+    return path
+def write_combined_metadata(
+    policy_seeds: Dict[str, List[SeedResult]],
+    output_dir: Path,
+) -> Path:
+    """Write a combined seed table across all policies.
+    Parameters
+    ----------
+    policy_seeds : dict
+        Mapping from policy name to seed lists.
+    output_dir : Path
+        Top-level results directory.
+    Returns
+    -------
+    Path
+        Path to ``combined_metadata.csv``.
+    """
+    path = ensure_dir(output_dir) / "combined_metadata.csv"
+    fieldnames = [
+        "seed_id",
+        "policy",
+        "traj_id",
+        "frame_id",
+        "cluster_id",
+        "global_index",
+    ]
+    with path.open("w", newline="", encoding="utf-8") as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames)
+        writer.writeheader()
+        for _policy, seeds in policy_seeds.items():
+            for seed in seeds:
+                writer.writerow(
+                    {
+                        "seed_id": seed.seed_id,
+                        "policy": seed.policy,
+                        "traj_id": seed.traj_id,
+                        "frame_id": seed.frame_id,
+                        "cluster_id": seed.cluster_id,
+                        "global_index": seed.global_index,
+                    }
+                )
+    logger.info("Wrote combined metadata to %s", path)
+    return path
+def write_policy_outputs(
+    policy_name: str,
+    seeds: List[SeedResult],
+    cluster_stats: ClusterStats,
+    results_dir: Path,
+) -> Path:
+    """Write all outputs for a single policy into its subdirectory.
+    Parameters
+    ----------
+    policy_name : str
+        Policy identifier used as subdirectory name.
+    seeds : list of SeedResult
+        Seeds selected by the policy.
+    cluster_stats : dict
+        Global cluster statistics (same for all policies).
+    results_dir : Path
+        Top-level results directory.
+    Returns
+    -------
+    Path
+        Policy output directory path.
+    """
+    policy_dir = ensure_dir(results_dir / policy_name)
+    write_seeds_csv(seeds, policy_dir)
+    write_cluster_statistics(cluster_stats, policy_dir)
+    return policy_dir

adaptivepy/policies/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""Adaptive sampling policies for AdaptivePy."""
+from adaptivepy.policies.base import (
+    POLICY_REGISTRY,
+    Policy,
+    get_policy,
+    list_policies,
+    register_policy,
+)
+# Import concrete policies so they self-register.
+from adaptivepy.policies import least_counts  # noqa: F401
+from adaptivepy.policies import random  # noqa: F401
+__all__ = [
+    "POLICY_REGISTRY",
+    "Policy",
+    "get_policy",
+    "list_policies",
+    "register_policy",
+]

adaptivepy/policies/base.py ADDED Viewed

@@ -0,0 +1,105 @@
+"""Adaptive sampling policy base class and registry."""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import Dict, List, Type
+from adaptivepy.stats.cluster_stats import ClusterStats
+POLICY_REGISTRY: Dict[str, Type["Policy"]] = {}
+def register_policy(cls: Type["Policy"]) -> Type["Policy"]:
+    """Register a policy class in :data:`POLICY_REGISTRY`.
+    Parameters
+    ----------
+    cls : type
+        Policy subclass with a ``name`` class attribute.
+    Returns
+    -------
+    type
+        The registered policy class (unchanged).
+    Raises
+    ------
+    ValueError
+        If the policy name is missing or already registered.
+    """
+    if not getattr(cls, "name", None):
+        raise ValueError(f"Policy {cls.__name__} must define a 'name' attribute.")
+    if cls.name in POLICY_REGISTRY:
+        raise ValueError(f"Policy '{cls.name}' is already registered.")
+    POLICY_REGISTRY[cls.name] = cls
+    return cls
+class Policy(ABC):
+    """Base class for cluster selection policies.
+    Subclasses implement :meth:`select_clusters` to choose which clusters
+    should contribute seed frames. Because :meth:`select_clusters` is
+    abstract, this class cannot be instantiated directly.
+    """
+    # Key under which ``register_policy`` stores the class in
+    # ``POLICY_REGISTRY``; the empty default is rejected there, so concrete
+    # policies must override it.
+    name: str = ""
+    @abstractmethod
+    def select_clusters(
+        self,
+        cluster_stats: ClusterStats,
+        n_seeds: int,
+    ) -> List[int]:
+        """Select cluster IDs from which to draw seed frames.
+        Parameters
+        ----------
+        cluster_stats : ClusterStats
+            Per-cluster population and frame lists.
+        n_seeds : int
+            Maximum number of clusters (seeds) to select.
+        Returns
+        -------
+        list of int
+            Selected cluster IDs.
+        """
+        ...
+def get_policy(name: str, **kwargs) -> Policy:
+    """Instantiate a registered policy by name.
+    Parameters
+    ----------
+    name : str
+        Registered policy name.
+    **kwargs
+        Constructor arguments forwarded to the policy class.
+    Returns
+    -------
+    Policy
+        Policy instance.
+    Raises
+    ------
+    ValueError
+        If the policy name is unknown.
+    """
+    if name not in POLICY_REGISTRY:
+        available = ", ".join(sorted(POLICY_REGISTRY))
+        raise ValueError(f"Unknown policy '{name}'. Available: {available}")
+    return POLICY_REGISTRY[name](**kwargs)
+def list_policies() -> List[str]:
+    """Return names of all registered policies.
+    Returns
+    -------
+    list of str
+        Sorted policy names.
+    """
+    return sorted(POLICY_REGISTRY.keys())