PyPI - variantfold - Versions diffs - 0.1.0__py3-none-any.whl - Mend

variantfold 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

variantfold/__init__.py +41 -0
variantfold/cli.py +161 -0
variantfold/config.py +104 -0
variantfold/graphs.py +224 -0
variantfold/model.py +316 -0
variantfold/pipeline.py +254 -0
variantfold/structure.py +256 -0
variantfold/variants.py +283 -0
variantfold-0.1.0.dist-info/METADATA +151 -0
variantfold-0.1.0.dist-info/RECORD +13 -0
variantfold-0.1.0.dist-info/WHEEL +5 -0
variantfold-0.1.0.dist-info/entry_points.txt +2 -0
variantfold-0.1.0.dist-info/top_level.txt +1 -0

variantfold/__init__.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""
+VariantFold — Classify variants of uncertain significance using
+AlphaFold-predicted protein structures and graph neural networks.
+Workflow
+--------
+1. Parse ClinVar variant data
+2. Mutate reference protein sequences
+3. Predict 3-D structures with ColabFold / AlphaFold
+4. Convert PDB structures to residue-level graphs
+5. Train a GCN classifier (benign vs pathogenic)
+6. Classify VUS with the trained model
+"""
+# Package version string; the CLI imports it for its ``--version`` flag.
+__version__ = "0.1.0"
+# Re-export the main entry points so they can be imported straight from
+# the ``variantfold`` package instead of its submodules.
+from variantfold.config import VariantFoldConfig
+from variantfold.variants import (
+    parse_clinvar_variant,
+    load_clinvar_table,
+    swap_amino_acid,
+    generate_mutant_sequences,
+)
+from variantfold.graphs import pdb_to_graph, load_pdb_directory
+from variantfold.model import VariantGCN, train_model, evaluate_model, predict_vus
+from variantfold.pipeline import VariantFoldPipeline
+# Names exposed by ``from variantfold import *``; mirrors the imports above.
+__all__ = [
+    "VariantFoldConfig",
+    "parse_clinvar_variant",
+    "load_clinvar_table",
+    "swap_amino_acid",
+    "generate_mutant_sequences",
+    "pdb_to_graph",
+    "load_pdb_directory",
+    "VariantGCN",
+    "train_model",
+    "evaluate_model",
+    "predict_vus",
+    "VariantFoldPipeline",
+]

variantfold/cli.py ADDED Viewed

@@ -0,0 +1,161 @@
+"""
+Command-line interface for VariantFold.
+Usage
+-----
+    variantfold run --gene VHL --email me@example.com --steps 1,3,4,5
+    variantfold predict --model model.pt --pdb-dir ./vus_library/
+"""
+from __future__ import annotations
+import argparse
+import logging
+import sys
+from variantfold import __version__
+def _build_parser() -> argparse.ArgumentParser:
+    """Assemble the ``variantfold`` argument parser.
+    Two sub-commands are registered: ``run`` (pipeline execution) and
+    ``predict`` (inference with an already-trained model).  The chosen
+    sub-command name is stored in ``command`` on the parsed namespace.
+    """
+    parser = argparse.ArgumentParser(
+        prog="variantfold",
+        description="Classify VUS using AlphaFold structures and GNNs.",
+    )
+    parser.add_argument(
+        "--version", action="version", version=f"%(prog)s {__version__}",
+    )
+    commands = parser.add_subparsers(dest="command")
+    # ---- run ----------------------------------------------------------------
+    run_cmd = commands.add_parser("run", help="Run the full or partial pipeline.")
+    run_cmd.add_argument("--gene", required=True, help="HGNC gene symbol.")
+    run_cmd.add_argument("--email", required=True, help="Email for NCBI Entrez.")
+    run_cmd.add_argument(
+        "--work-dir",
+        default=None,
+        help="Working directory (default: ./variantfold_<gene>).",
+    )
+    steps_help = (
+        "Comma-separated step numbers to run.  "
+        "1=parse, 2=predict structures, 3=collect models, "
+        "4=train, 5=classify VUS.  "
+        "Default: 1,3,4,5 (skip structure prediction)."
+    )
+    run_cmd.add_argument("--steps", default="1,3,4,5", help=steps_help)
+    run_cmd.add_argument(
+        "--accession",
+        default=None,
+        help="Protein accession number (auto-fetched if omitted).",
+    )
+    # Plain typed options without help text, registered in display order.
+    typed_options = (
+        ("--epochs", int, 200),
+        ("--lr", float, 0.01),
+        ("--batch-size", int, 32),
+        ("--distance-threshold", float, 6.5),
+        ("--hidden-dim", int, 64),
+        ("--num-layers", int, 3),
+    )
+    for flag, caster, fallback in typed_options:
+        run_cmd.add_argument(flag, type=caster, default=fallback)
+    run_cmd.add_argument(
+        "--legacy-features",
+        action="store_true",
+        help="Use pLDDT-only features (1-dim) instead of rich features (24-dim).",
+    )
+    run_cmd.add_argument("--seed", type=int, default=42)
+    run_cmd.add_argument("-v", "--verbose", action="store_true")
+    # ---- predict (standalone inference) -------------------------------------
+    predict_cmd = commands.add_parser(
+        "predict", help="Classify PDBs with a trained model.",
+    )
+    predict_cmd.add_argument("--model", required=True, help="Path to .pt model file.")
+    predict_cmd.add_argument(
+        "--pdb-dir", required=True, help="Directory of VUS PDB files.",
+    )
+    predict_cmd.add_argument("--output", default="vus_predictions.csv")
+    predict_cmd.add_argument("--distance-threshold", type=float, default=6.5)
+    predict_cmd.add_argument("--batch-size", type=int, default=32)
+    predict_cmd.add_argument(
+        "--legacy-features",
+        action="store_true",
+        help="Use pLDDT-only features (must match training mode).",
+    )
+    predict_cmd.add_argument("-v", "--verbose", action="store_true")
+    return parser
+def main(argv=None) -> None:
+    """Entry point: parse *argv*, configure logging, run the sub-command.
+    With no sub-command the help text is printed and the process exits
+    with status 1.  ``-v/--verbose`` switches logging from INFO to DEBUG.
+    """
+    cli = _build_parser()
+    namespace = cli.parse_args(argv)
+    if namespace.command is None:
+        cli.print_help()
+        sys.exit(1)
+    verbose = getattr(namespace, "verbose", False)
+    logging.basicConfig(
+        level=logging.DEBUG if verbose else logging.INFO,
+        format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
+    )
+    # Map each sub-command name to the function that implements it.
+    handlers = {"run": _cmd_run, "predict": _cmd_predict}
+    handler = handlers.get(namespace.command)
+    if handler is not None:
+        handler(namespace)
+# Step numbers understood by ``variantfold run --steps``.
+_VALID_STEPS = frozenset({1, 2, 3, 4, 5})
+def _parse_steps(spec: str) -> set[int]:
+    """Turn a comma-separated ``--steps`` value into a set of step numbers.
+    Blank tokens (for example from a trailing comma) are ignored.
+    Raises
+    ------
+    ValueError
+        If a token is not an integer, names a step outside 1-5, or no
+        step is left after dropping blank tokens.
+    """
+    selected: set[int] = set()
+    for token in (part.strip() for part in spec.split(",")):
+        if not token:
+            continue
+        try:
+            step = int(token)
+        except ValueError:
+            raise ValueError(f"invalid step {token!r}: expected an integer") from None
+        if step not in _VALID_STEPS:
+            raise ValueError(f"unknown step {step}: valid steps are 1-5")
+        selected.add(step)
+    if not selected:
+        raise ValueError("no steps selected")
+    return selected
+def _cmd_run(args) -> None:
+    """Execute the ``run`` sub-command.
+    Builds a ``VariantFoldConfig`` from the parsed arguments, creates the
+    pipeline and runs the requested steps in ascending order.  After
+    step 4 the test accuracy and confusion matrix are printed; after
+    step 5 the VUS prediction table is printed.
+    A malformed ``--steps`` value terminates the process with an error
+    message and a non-zero exit status before any pipeline object is
+    created.
+    """
+    # Validate the step list first so a typo fails fast, before the
+    # configuration and pipeline are constructed.
+    try:
+        steps = _parse_steps(args.steps)
+    except ValueError as exc:
+        sys.exit(f"variantfold run: error: argument --steps: {exc}")
+    from variantfold.config import VariantFoldConfig
+    from variantfold.pipeline import VariantFoldPipeline
+    cfg = VariantFoldConfig(
+        gene_symbol=args.gene,
+        entrez_email=args.email,
+        work_dir=args.work_dir,
+        accession_number=args.accession,
+        epochs=args.epochs,
+        learning_rate=args.lr,
+        batch_size=args.batch_size,
+        distance_threshold=args.distance_threshold,
+        gcn_hidden_dim=args.hidden_dim,
+        gcn_num_layers=args.num_layers,
+        # The CLI flag is the negation of the config switch.
+        use_residue_features=not args.legacy_features,
+        random_seed=args.seed,
+    )
+    pipe = VariantFoldPipeline(cfg)
+    if 1 in steps:
+        pipe.step1_parse_variants()
+    if 2 in steps:
+        pipe.step2_predict_structures()
+    if 3 in steps:
+        pipe.step3_collect_models()
+    if 4 in steps:
+        metrics = pipe.step4_train()
+        print(f"\nTest accuracy: {metrics['accuracy']:.4f}")
+        print(f"Confusion matrix:\n{metrics['confusion_matrix']}")
+    if 5 in steps:
+        df = pipe.step5_classify_vus()
+        print(f"\nVUS predictions:\n{df.to_string(index=False)}")
+def _cmd_predict(args) -> None:
+    """Execute the ``predict`` sub-command.
+    Loads the model at ``args.model``, converts the PDB files under
+    ``args.pdb_dir`` into unlabelled graphs, runs ``predict_vus`` on them
+    and writes the result table to ``args.output`` as CSV (also echoed to
+    stdout).  Exits with status 1 when no graph could be loaded.
+    """
+    import pandas as pd
+    from variantfold.graphs import load_pdb_directory
+    from variantfold.model import load_model, predict_vus
+    trained = load_model(args.model)
+    vus_graphs = load_pdb_directory(
+        args.pdb_dir,
+        label=None,
+        distance_threshold=args.distance_threshold,
+        # The CLI flag is the negation of the graph-builder switch.
+        use_residue_features=not args.legacy_features,
+    )
+    if not vus_graphs:
+        print(f"No PDB files found in {args.pdb_dir}", file=sys.stderr)
+        sys.exit(1)
+    predictions = predict_vus(trained, vus_graphs, batch_size=args.batch_size)
+    table = pd.DataFrame(predictions)
+    table.to_csv(args.output, index=False)
+    print(f"Predictions saved to {args.output}")
+    print(table.to_string(index=False))
+if __name__ == "__main__":
+    main()

variantfold/config.py ADDED Viewed

@@ -0,0 +1,104 @@
+"""
+Central configuration for a VariantFold run.
+All paths, thresholds, and hyper-parameters live here so that nothing
+is hard-coded to Google Drive or Colab.
+"""
+from __future__ import annotations
+import logging
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+logger = logging.getLogger(__name__)
+@dataclass
+class VariantFoldConfig:
+    """All tunable settings for a single VariantFold analysis.
+    Only ``gene_symbol`` and ``entrez_email`` are mandatory.  The six
+    directory attributes at the bottom are not constructor arguments;
+    they are computed from ``work_dir`` in ``__post_init__``.
+    """
+    # ---- Required identity --------------------------------------------------
+    gene_symbol: str
+    entrez_email: str  # NCBI Entrez requires a contact address
+    # ---- Root directory (falls back to ./variantfold_<gene>) ----------------
+    work_dir: Optional[str] = None
+    # ---- ClinVar input file names -------------------------------------------
+    clinvar_benign_file: str = "clinvar_result_bng.txt"
+    clinvar_pathogenic_file: str = "clinvar_result_ptg.txt"
+    clinvar_vus_file: str = "clinvar_result_vus.txt"
+    # ---- Reference protein --------------------------------------------------
+    accession_number: Optional[str] = None  # None means "fetch automatically"
+    # ---- ColabFold structure prediction -------------------------------------
+    num_models: int = 5
+    num_relax: int = 0
+    msa_mode: str = "mmseqs2_uniref_env"
+    pair_mode: str = "unpaired_paired"
+    model_type: str = "auto"
+    num_recycles: Optional[int] = None  # None means "auto"
+    recycle_early_stop_tolerance: Optional[float] = None
+    num_seeds: int = 1
+    use_dropout: bool = False
+    use_templates: bool = False
+    # ---- Residue graph ------------------------------------------------------
+    distance_threshold: float = 6.5  # contact cutoff in Å
+    use_residue_features: bool = True  # one-hot AA + coords + pLDDT
+    # ---- GCN hyper-parameters -----------------------------------------------
+    gcn_hidden_dim: int = 64
+    gcn_num_layers: int = 3
+    gcn_dropout: float = 0.5
+    learning_rate: float = 0.01
+    epochs: int = 200
+    batch_size: int = 32
+    train_fraction: float = 0.8
+    random_seed: int = 42
+    # ---- Paths derived from work_dir (filled in by __post_init__) -----------
+    benign_dir: str = field(init=False, repr=False)
+    pathogenic_dir: str = field(init=False, repr=False)
+    vus_dir: str = field(init=False, repr=False)
+    benign_library: str = field(init=False, repr=False)
+    pathogenic_library: str = field(init=False, repr=False)
+    vus_library: str = field(init=False, repr=False)
+    def __post_init__(self) -> None:
+        """Apply the default ``work_dir`` and derive the class directories."""
+        if self.work_dir is None:
+            self.work_dir = os.path.join(".", f"variantfold_{self.gene_symbol}")
+        base = Path(self.work_dir)
+        benign_root = base / "Benign"
+        pathogenic_root = base / "Pathogenic"
+        vus_root = base / "VUS"
+        self.benign_dir = str(benign_root)
+        self.pathogenic_dir = str(pathogenic_root)
+        self.vus_dir = str(vus_root)
+        # Each class directory holds a "library" sub-folder.
+        self.benign_library = str(benign_root / "library_bng")
+        self.pathogenic_library = str(pathogenic_root / "library_ptg")
+        self.vus_library = str(vus_root / "library_vus")
+    # ---- Helpers ------------------------------------------------------------
+    def ensure_directories(self) -> None:
+        """Create the work directory and every derived directory if missing."""
+        targets = (
+            self.work_dir,
+            self.benign_dir,
+            self.pathogenic_dir,
+            self.vus_dir,
+            self.benign_library,
+            self.pathogenic_library,
+            self.vus_library,
+        )
+        for path in targets:
+            os.makedirs(path, exist_ok=True)
+            logger.debug("Ensured directory: %s", path)
+    @property
+    def num_node_features(self) -> int:
+        """Width of the per-node feature vector.
+        24 in rich mode (20 one-hot AA + 3 coords + 1 pLDDT), 1 in legacy
+        pLDDT-only mode.
+        """
+        return 24 if self.use_residue_features else 1

variantfold/graphs.py ADDED Viewed

@@ -0,0 +1,224 @@
+"""
+Convert PDB structure files to PyTorch Geometric residue-level graphs.
+Fixes from audit
+-----------------
+- BUG-7  : Labels are set correctly per-graph (no more hardcoded y=1).
+- BUG-8  : VUS samples carry label=None rather than fake class 2.
+- DESIGN-3: Rich node features (one-hot AA + 3-D coords + pLDDT).
+"""
+from __future__ import annotations
+import logging
+from pathlib import Path
+from typing import List, Optional, Tuple
+import numpy as np
+import torch
+from torch_geometric.data import Data
+logger = logging.getLogger(__name__)
+# ---- Amino-acid one-hot encoding -------------------------------------------
+# Column order of the one-hot vector: one slot per standard amino acid.
+_AA_ORDER = "ACDEFGHIKLMNPQRSTVWY"
+_AA_INDEX = dict(zip(_AA_ORDER, range(len(_AA_ORDER))))
+# Three-letter residue names (as found in PDB ATOM records) to one-letter codes.
+_AA3_TO_1 = {
+    "ALA": "A", "CYS": "C", "ASP": "D", "GLU": "E",
+    "PHE": "F", "GLY": "G", "HIS": "H", "ILE": "I",
+    "LYS": "K", "LEU": "L", "MET": "M", "ASN": "N",
+    "PRO": "P", "GLN": "Q", "ARG": "R", "SER": "S",
+    "THR": "T", "VAL": "V", "TRP": "W", "TYR": "Y",
+}
+def _one_hot_aa(resname_3: str) -> np.ndarray:
+    """Encode a three-letter residue name as a length-20 float32 vector.
+    Surrounding whitespace and letter case are ignored.  Names that are
+    not one of the 20 standard residues yield an all-zero vector.
+    """
+    encoding = np.zeros(20, dtype=np.float32)
+    letter = _AA3_TO_1.get(resname_3.strip().upper())
+    # An unknown residue gives letter=None, which is not a key of _AA_INDEX.
+    slot = _AA_INDEX.get(letter)
+    if slot is not None:
+        encoding[slot] = 1.0
+    return encoding
+# ---- Distance matrix -------------------------------------------------------
+def _distance_matrix(coords: np.ndarray) -> np.ndarray:
+    """Return the matrix of Euclidean distances between all row pairs of *coords*."""
+    points = np.asanyarray(coords)
+    # Broadcasting (n, 1, ...) against (1, n, ...) yields every pairwise offset.
+    offsets = points[:, np.newaxis] - points[np.newaxis]
+    squared = np.sum(offsets ** 2, axis=-1)
+    return np.sqrt(squared)
+# ---- Core graph builder -----------------------------------------------------
+def pdb_to_graph(
+    pdb_path: str,
+    distance_threshold: float = 6.5,
+    use_residue_features: bool = True,
+    label: Optional[int] = None,
+) -> Data:
+    """Convert a PDB file to a PyTorch Geometric ``Data`` object.
+    Nodes correspond to residues.  Each residue is placed at the mean
+    position of its ATOM records (not at its Cα atom), and a pair of
+    directed edges is added between two residues whose mean positions
+    are closer than *distance_threshold* Å.  Self-loops are excluded.
+    Residues are keyed by chain, residue number and insertion code, so
+    equally numbered residues in different chains stay separate nodes.
+    Parameters
+    ----------
+    pdb_path : str
+        Path to a ``.pdb`` file.
+    distance_threshold : float
+        Contact distance cutoff in Ångströms.
+    use_residue_features : bool
+        If True, each node carries 24 features: one-hot amino acid (20),
+        zero-centred x/y/z (3) and the mean B-factor divided by 100 (1).
+        If False, each node carries only the raw mean B-factor
+        (legacy 1-feature mode).  The B-factor column is treated as pLDDT.
+    label : int or None
+        Graph-level class label (0 = benign, 1 = pathogenic, None = VUS).
+    Returns
+    -------
+    torch_geometric.data.Data
+        Graph with ``x``, ``edge_index`` and ``y`` (``None`` when *label*
+        is None), plus ``pdb_path`` and ``num_residues`` attributes.
+    Raises
+    ------
+    ValueError
+        If the file contains no ATOM records.
+    """
+    from biopandas.pdb import PandasPdb
+    atom_df = PandasPdb().read_pdb(str(pdb_path)).df["ATOM"]
+    if atom_df.empty:
+        # A zero-node graph would only fail later, and less clearly.
+        raise ValueError(f"No ATOM records found in {pdb_path}")
+    # Group on chain and insertion code as well as the residue number;
+    # the number alone is not unique in multi-chain files.
+    residue_keys = [
+        col
+        for col in ("chain_id", "residue_number", "insertion")
+        if col in atom_df.columns
+    ]
+    # A single aggregation keeps coordinates, B-factors and residue names
+    # aligned row by row; sort=True orders the rows by the grouping keys.
+    residue_df = atom_df.groupby(residue_keys, as_index=False, sort=True).agg(
+        {
+            "x_coord": "mean",
+            "y_coord": "mean",
+            "z_coord": "mean",
+            "b_factor": "mean",
+            "residue_name": "first",
+        }
+    )
+    coords = residue_df[["x_coord", "y_coord", "z_coord"]].values
+    plddt = residue_df["b_factor"].values
+    resnames = residue_df["residue_name"].values
+    n_residues = len(coords)
+    # --- Build node features -------------------------------------------------
+    if use_residue_features:
+        # 20 one-hot AA + 3 normalised coords + 1 pLDDT
+        one_hot = np.array([_one_hot_aa(r) for r in resnames], dtype=np.float32)
+        # Zero-centre the coordinates so features do not depend on where
+        # the structure sits in space.
+        normed_coords = (coords - coords.mean(axis=0)).astype(np.float32)
+        # Scale the B-factor (treated as pLDDT, 0-100) down to [0, 1].
+        plddt_norm = (plddt / 100.0).reshape(-1, 1).astype(np.float32)
+        features = np.concatenate([one_hot, normed_coords, plddt_norm], axis=1)
+    else:
+        features = plddt.reshape(-1, 1).astype(np.float32)
+    x = torch.from_numpy(features)
+    # --- Build adjacency (contact map) ---------------------------------------
+    dist_mat = _distance_matrix(coords)
+    adj = dist_mat < distance_threshold
+    np.fill_diagonal(adj, False)  # no self-loops
+    src, dst = np.nonzero(adj)
+    edge_index = torch.tensor(
+        np.stack([src, dst]), dtype=torch.long,
+    )
+    # --- Construct Data object -----------------------------------------------
+    y = torch.tensor([label], dtype=torch.long) if label is not None else None
+    data = Data(x=x, edge_index=edge_index, y=y)
+    data.pdb_path = str(pdb_path)
+    data.num_residues = n_residues
+    return data
+# ---- Directory loader -------------------------------------------------------
+def load_pdb_directory(
+    directory: str,
+    label: Optional[int] = None,
+    distance_threshold: float = 6.5,
+    use_residue_features: bool = True,
+    filename_pattern: str = "*.pdb",
+) -> List[Data]:
+    """Build one graph per PDB file found directly inside *directory*.
+    Files are processed in sorted order.  A file that fails to convert is
+    logged as a warning and skipped, so the result may be shorter than
+    the number of matching files.
+    Parameters
+    ----------
+    directory : str
+        Folder to scan for structure files.
+    label : int or None
+        Class label given to every graph built from this folder.
+    distance_threshold : float
+        Contact cutoff (Å) passed on to ``pdb_to_graph``.
+    use_residue_features : bool
+        Rich (24-dim) features when True, minimal (1-dim) when False.
+    filename_pattern : str
+        Glob pattern selecting the files to load.
+    Returns
+    -------
+    list of Data
+        Empty when no file matches the pattern.
+    Raises
+    ------
+    FileNotFoundError
+        If *directory* is not an existing directory.
+    """
+    folder = Path(directory)
+    if not folder.is_dir():
+        raise FileNotFoundError(f"Directory not found: {directory}")
+    matches = sorted(folder.glob(filename_pattern))
+    if len(matches) == 0:
+        logger.warning("No PDB files matching %r in %s", filename_pattern, directory)
+        return []
+    build_options = {
+        "distance_threshold": distance_threshold,
+        "use_residue_features": use_residue_features,
+        "label": label,
+    }
+    converted: list[Data] = []
+    for path in matches:
+        try:
+            graph = pdb_to_graph(str(path), **build_options)
+        except Exception as exc:
+            # Best effort: one unreadable file must not abort the whole batch.
+            logger.warning("Failed to convert %s: %s", path.name, exc)
+            continue
+        converted.append(graph)
+    logger.info(
+        "Loaded %d graphs from %s (label=%s)", len(converted), directory, label,
+    )
+    return converted
+def collect_best_models(
+    source_dir: str,
+    dest_dir: str,
+    pattern: str = "*model_1_seed_000.pdb",
+) -> List[Path]:
+    """Copy every PDB matching *pattern* under *source_dir* into *dest_dir*.
+    This replaces the notebook's ``search_and_move_files`` function.
+    Files are *copied* (not moved) to avoid destructive side-effects.
+    *dest_dir* may lie inside *source_dir*: files already located under
+    *dest_dir* are skipped, so the function can be re-run safely.  Files
+    are processed in sorted path order; if two sources share a file name
+    the later one overwrites the earlier copy.
+    Parameters
+    ----------
+    source_dir : str
+        Directory searched recursively for matching files.
+    dest_dir : str
+        Directory that receives the copies; created if missing.
+    pattern : str
+        Glob pattern identifying the model file to take from each job.
+    Returns
+    -------
+    list of Path
+        Destination paths of the files copied by this call.
+    """
+    import shutil
+    src = Path(source_dir)
+    dst = Path(dest_dir)
+    dst.mkdir(parents=True, exist_ok=True)
+    dst_resolved = dst.resolve()
+    # Snapshot the matches before copying anything, and drop files that
+    # already live under the destination.  Copying such a file onto
+    # itself would raise shutil.SameFileError.
+    candidates = sorted(
+        pdb
+        for pdb in src.rglob(pattern)
+        if dst_resolved not in pdb.resolve().parents
+    )
+    copied: list[Path] = []
+    for pdb in candidates:
+        dest_path = dst / pdb.name
+        shutil.copy2(pdb, dest_path)
+        copied.append(dest_path)
+        logger.debug("Copied %s → %s", pdb, dest_path)
+    logger.info("Collected %d PDB models into %s", len(copied), dest_dir)
+    return copied