PyPI - shape-complementarity - Versions diffs - 0.1.0__cp310-abi3-win_amd64.whl - Mend

shape-complementarity 0.1.0__cp310-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

shape_complementarity/__init__.py +20 -0
shape_complementarity/_core.pyd +0 -0
shape_complementarity/batch.py +80 -0
shape_complementarity/io.py +402 -0
shape_complementarity-0.1.0.dist-info/METADATA +12 -0
shape_complementarity-0.1.0.dist-info/RECORD +9 -0
shape_complementarity-0.1.0.dist-info/WHEEL +4 -0
shape_complementarity-0.1.0.dist-info/licenses/LICENSE +21 -0
shape_complementarity-0.1.0.dist-info/sboms/shape-complementarity.cyclonedx.json +1380 -0

shape_complementarity/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+from shape_complementarity._core import ScResult, compute_sc
+from shape_complementarity.batch import score_many
+from shape_complementarity.io import (
+    from_biotite,
+    from_boltzgen_refold,
+    from_boltzgen_structure,
+    from_pdb,
+    from_structure,
+)
+__all__ = [
+    "compute_sc",
+    "ScResult",
+    "from_pdb",
+    "from_structure",
+    "from_biotite",
+    "from_boltzgen_structure",
+    "from_boltzgen_refold",
+    "score_many",
+]

shape_complementarity/_core.pyd ADDED Viewed

Binary file

shape_complementarity/batch.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""Multiprocessing batch scoring of PDB files.
+Uses ProcessPoolExecutor with the 'spawn' start method (required on macOS).
+Rust-side Rayon parallelism is disabled by default to avoid oversubscription
+when many worker processes are already running concurrently.
+"""
+from __future__ import annotations
+import concurrent.futures
+import multiprocessing as mp
+from pathlib import Path
+def _score_one(args: tuple) -> dict:
+    """Top-level worker function (must be picklable — no closures)."""
+    pdb_path, chains_a, chains_b, kwargs = args
+    try:
+        from shape_complementarity.io import from_pdb
+        result = from_pdb(pdb_path, chains_a, chains_b, **kwargs)
+        return {
+            "path": str(pdb_path),
+            "sc": result.sc,
+            "median_distance": result.median_distance,
+            "trimmed_area": result.trimmed_area,
+            "atoms_a": result.atoms_a,
+            "atoms_b": result.atoms_b,
+            "status": "ok",
+            "error": None,
+        }
+    except Exception as exc:  # noqa: BLE001
+        return {
+            "path": str(pdb_path),
+            "sc": float("nan"),
+            "median_distance": float("nan"),
+            "trimmed_area": float("nan"),
+            "atoms_a": 0,
+            "atoms_b": 0,
+            "status": "error",
+            "error": str(exc),
+        }
+def score_many(
+    pdb_paths: list,
+    chains_a: list[str],
+    chains_b: list[str] | None = None,
+    n_workers: int = 8,
+    parallel: bool = False,
+    **kwargs,
+) -> "pd.DataFrame":
+    """Score many PDB files in parallel.
+    Args:
+        pdb_paths:  list of file paths
+        chains_a:   chain IDs for molecule A
+        chains_b:   chain IDs for molecule B (None = complement of chains_a)
+        n_workers:  number of worker processes
+        parallel:   enable Rayon parallelism inside each worker (default False
+                    to avoid oversubscription with multiple processes)
+        **kwargs:   forwarded to from_pdb (model, include_hetatm, etc.)
+    Returns:
+        DataFrame with columns:
+            path, sc, median_distance, trimmed_area, atoms_a, atoms_b,
+            status ('ok' or 'error'), error (None or message string)
+    """
+    import pandas as pd
+    kwargs["parallel"] = parallel
+    args_list = [(str(p), chains_a, chains_b, kwargs) for p in pdb_paths]
+    ctx = mp.get_context("spawn")
+    with concurrent.futures.ProcessPoolExecutor(
+        max_workers=n_workers, mp_context=ctx
+    ) as executor:
+        rows = list(executor.map(_score_one, args_list))
+    return pd.DataFrame(rows)

shape_complementarity/io.py ADDED Viewed

@@ -0,0 +1,402 @@
+"""PDB/CIF parsing for pysc.
+Parsing logic mirrors src/bin/sc.rs in sc-rs exactly so that parity tests pass:
+- ATOM records only (HETATM skipped unless include_hetatm=True)
+- Alternate locations: keep ' ' and 'A', skip all others
+- Hydrogens: excluded by default using the same heuristic as the CLI
+BoltzGen integration (from_biotite, from_boltzgen_structure, from_boltzgen_refold)
+is appended at the bottom. The recommended entry point for BoltzGen output is
+from_boltzgen_refold() on the refold_cif/*.cif files, which contain full all-atom
+coordinates validated by Boltz. from_pdb() also works for the same files.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import TYPE_CHECKING
+import numpy as np
+if TYPE_CHECKING:
+    from Bio.PDB.Structure import Structure as BioStructure
+from shape_complementarity._core import ScResult, compute_sc
+# ── Hydrogen detection (mirrors sc-rs bin/sc.rs) ────────────────────────────
+def _is_hydrogen(atom_name: str, element: str) -> bool:
+    """Mirror the hydrogen-detection logic in sc-rs bin/sc.rs."""
+    elem = element.strip().upper()
+    name = atom_name.strip()
+    if elem == "H":
+        return True
+    if name.startswith("H"):
+        return True
+    if name.endswith("H"):
+        return True
+    # Catch names like "1H", "2HB" (digit-prefixed hydrogen names)
+    if "H" in name and name[:1].isdigit():
+        return True
+    return False
+# ── biopython helpers ────────────────────────────────────────────────────────
+def _select_real_atom(atom):
+    """Return a concrete (non-disordered) Atom for altloc ' ' or 'A'.
+    Returns None if no acceptable altloc exists.
+    """
+    if atom.is_disordered():
+        child_dict = atom.child_dict
+        for altloc in ("A", " "):
+            if altloc in child_dict:
+                return child_dict[altloc]
+        return None
+    altloc = atom.altloc
+    if altloc not in (" ", "A"):
+        return None
+    return atom
+def _extract_atom_arrays(
+    model,
+    chains: list[str],
+    include_hetatm: bool,
+    include_hydrogens: bool,
+) -> tuple[list[list[float]], list[str], list[str]]:
+    coords: list[list[float]] = []
+    atom_names: list[str] = []
+    res_names: list[str] = []
+    for chain in model.get_chains():
+        if chain.id not in chains:
+            continue
+        for residue in chain.get_residues():
+            # residue.id[0] == ' ' for standard ATOM records
+            het = residue.id[0]
+            if not include_hetatm and het != " ":
+                continue
+            for disordered_or_atom in residue.get_atoms():
+                real = _select_real_atom(disordered_or_atom)
+                if real is None:
+                    continue
+                atom_name = real.name.strip()
+                element = (real.element or "").strip()
+                if not include_hydrogens and _is_hydrogen(atom_name, element):
+                    continue
+                c = real.coord
+                coords.append([float(c[0]), float(c[1]), float(c[2])])
+                atom_names.append(atom_name)
+                res_names.append(residue.resname.strip())
+    return coords, atom_names, res_names
+def _load_structure(path: str | Path):
+    from Bio.PDB import MMCIFParser, PDBParser
+    path = Path(path)
+    suffix = path.suffix.lower()
+    if suffix in (".cif", ".mmcif"):
+        parser = MMCIFParser(QUIET=True)
+    else:
+        parser = PDBParser(QUIET=True)
+    return parser.get_structure(path.stem, str(path))
+# ── Public biopython-based API ───────────────────────────────────────────────
+def from_structure(
+    structure: "BioStructure",
+    chains_a: list[str],
+    chains_b: list[str] | None = None,
+    model: int = 0,
+    include_hetatm: bool = False,
+    include_hydrogens: bool = False,
+    parallel: bool = True,
+) -> ScResult:
+    """Compute SC from a biopython Structure object.
+    Args:
+        structure:          biopython Structure (any source)
+        chains_a:           chain IDs for molecule A
+        chains_b:           chain IDs for molecule B; None = all chains not in chains_a
+        model:              model index (0-based)
+        include_hetatm:     include HETATM residues (default False, matches sc-rs)
+        include_hydrogens:  include hydrogen atoms (default False, matches sc-rs)
+        parallel:           enable Rayon parallelism inside sc-rs
+    """
+    models = list(structure.get_models())
+    if model >= len(models):
+        raise ValueError(
+            f"model index {model} out of range (structure has {len(models)} model(s))"
+        )
+    m = models[model]
+    if chains_b is None:
+        all_chain_ids = {ch.id for ch in m.get_chains()}
+        chains_b = sorted(all_chain_ids - set(chains_a))
+    coords_a, names_a, res_a = _extract_atom_arrays(m, chains_a, include_hetatm, include_hydrogens)
+    coords_b, names_b, res_b = _extract_atom_arrays(m, chains_b, include_hetatm, include_hydrogens)
+    return compute_sc(coords_a, names_a, res_a, coords_b, names_b, res_b, parallel)
+def from_pdb(
+    pdb_path: str | Path,
+    chains_a: list[str],
+    chains_b: list[str] | None = None,
+    model: int = 0,
+    include_hetatm: bool = False,
+    include_hydrogens: bool = False,
+    parallel: bool = True,
+) -> ScResult:
+    """Compute SC from a PDB or mmCIF file.
+    Args:
+        pdb_path:           path to .pdb, .ent, or .cif file
+        chains_a:           chain IDs for molecule A
+        chains_b:           chain IDs for molecule B; None = all chains not in chains_a
+        model:              model index (0-based, default first model)
+        include_hetatm:     include HETATM residues (default False)
+        include_hydrogens:  include hydrogen atoms (default False)
+        parallel:           enable Rayon parallelism inside sc-rs
+    """
+    structure = _load_structure(pdb_path)
+    return from_structure(
+        structure,
+        chains_a,
+        chains_b,
+        model=model,
+        include_hetatm=include_hetatm,
+        include_hydrogens=include_hydrogens,
+        parallel=parallel,
+    )
+# ── BoltzGen integration ─────────────────────────────────────────────────────
+#
+# BoltzGen pipeline output layout (relative to output_dir):
+#
+#   intermediate_designs/<id>.npz            – Structure NPZ, backbone only
+#                                              (sidechains are [0,0,0])
+#   intermediate_designs_inverse_folded/
+#     <id>.npz                               – Structure NPZ, full all-atom
+#     fold_out_npz/<id>.npz                  – raw fold tensors (coords + confidences)
+#     refold_cif/<id>.cif                    – ← USE THIS for SC scoring
+#     refold_design_cif/<id>.cif             – binder-only refold
+#
+# The refold_cif files are full-atom mmCIF with pLDDT in B-factors. They are
+# the Boltz-validated structures and the right input for SC. Both from_pdb()
+# and from_boltzgen_refold() accept them.
+#
+# Chain naming in BoltzGen output: the binder chain is typically the last chain
+# (e.g. "B" when the target is "A"), but verify from the CIF or the Record JSON
+# rather than assuming.
+def _extract_boltzgen_chains(
+    structure,
+    target_chains: list[str],
+    include_hydrogens: bool,
+) -> tuple[list[list[float]], list[str], list[str]]:
+    """Extract atom arrays from a BoltzGen Structure object for the given chains.
+    Iterates chains → residues → atoms using the absolute-index layout of
+    Structure.chains / .residues / .atoms (all indices are into the global arrays).
+    Skips atoms where is_present is False.
+    """
+    all_chain_names = [str(n) for n in structure.chains["name"]]
+    target_set = set(target_chains)
+    coords: list[list[float]] = []
+    atom_names: list[str] = []
+    res_names: list[str] = []
+    for ci, chain in enumerate(structure.chains):
+        if str(chain["name"]) not in target_set:
+            continue
+        res_start = int(chain["res_idx"])
+        res_count = int(chain["res_num"])
+        for ri in range(res_start, res_start + res_count):
+            res = structure.residues[ri]
+            res_name = str(res["name"])
+            a_start = int(res["atom_idx"])
+            a_count = int(res["atom_num"])
+            for ai in range(a_start, a_start + a_count):
+                atom = structure.atoms[ai]
+                if not bool(atom["is_present"]):
+                    continue
+                atom_name = str(atom["name"])
+                if not include_hydrogens and _is_hydrogen(atom_name, ""):
+                    continue
+                c = atom["coords"]
+                coords.append([float(c[0]), float(c[1]), float(c[2])])
+                atom_names.append(atom_name)
+                res_names.append(res_name)
+    missing = target_set - set(all_chain_names)
+    if missing:
+        raise ValueError(
+            f"Chain(s) {sorted(missing)} not found in Structure. "
+            f"Available: {list(dict.fromkeys(all_chain_names))}"
+        )
+    return coords, atom_names, res_names
+def from_boltzgen_structure(
+    structure,
+    chains_a: list[str],
+    chains_b: list[str] | None = None,
+    include_hydrogens: bool = False,
+    parallel: bool = True,
+) -> ScResult:
+    """Compute SC from an in-memory BoltzGen Structure object.
+    Accepts any object with .atoms / .residues / .chains numpy structured arrays
+    matching the BoltzGen dtype layout (boltzgen.data.data.Structure).
+    Args:
+        structure:          BoltzGen Structure (or duck-typed equivalent)
+        chains_a:           chain IDs for molecule A (e.g. ["B"] for binder)
+        chains_b:           chain IDs for molecule B; None = all chains not in chains_a
+        include_hydrogens:  include hydrogen atoms (default False)
+        parallel:           enable Rayon parallelism inside sc-rs
+    Important: use post-refold structures for meaningful SC scores. Structures
+    from intermediate_designs/ have zeroed sidechain coordinates and will give
+    unreliable results. Prefer from_boltzgen_refold() on the refold_cif files,
+    or load the Structure NPZ from intermediate_designs_inverse_folded/ after
+    refolding completes.
+    """
+    all_chain_names = [str(n) for n in structure.chains["name"]]
+    if chains_b is None:
+        chains_a_set = set(chains_a)
+        chains_b = list(dict.fromkeys(
+            n for n in all_chain_names if n not in chains_a_set
+        ))
+    coords_a, names_a, res_a = _extract_boltzgen_chains(structure, chains_a, include_hydrogens)
+    coords_b, names_b, res_b = _extract_boltzgen_chains(structure, chains_b, include_hydrogens)
+    return compute_sc(coords_a, names_a, res_a, coords_b, names_b, res_b, parallel)
+def from_biotite(
+    atom_array,
+    chains_a: list[str],
+    chains_b: list[str] | None = None,
+    include_hetatm: bool = False,
+    include_hydrogens: bool = False,
+    parallel: bool = True,
+) -> ScResult:
+    """Compute SC from a biotite AtomArray or AtomArrayStack (first model used).
+    BoltzGen's analysis stack (analyze_utils.py) works with biotite AtomArrays.
+    This function is the natural bridge when you already have an AtomArray loaded.
+    Example — scoring a BoltzGen refold CIF with biotite directly:
+        import biotite.structure.io.pdbx as pdbx
+        cif = pdbx.CIFFile.read("refold_cif/design_0.cif")
+        atoms = pdbx.get_structure(cif, model=1, use_author_fields=False)
+        result = pysc.from_biotite(atoms, chains_a=["B"], chains_b=["A"])
+    Args:
+        atom_array:         biotite AtomArray (or first-model slice of AtomArrayStack)
+        chains_a:           chain IDs for molecule A
+        chains_b:           chain IDs for molecule B; None = all chains not in chains_a
+        include_hetatm:     include hetero atoms (default False)
+        include_hydrogens:  include hydrogen atoms (default False)
+        parallel:           enable Rayon parallelism inside sc-rs
+    """
+    # Support AtomArrayStack by taking the first model
+    try:
+        import biotite.structure as struc
+        if isinstance(atom_array, struc.AtomArrayStack):
+            atom_array = atom_array[0]
+    except ImportError:
+        pass  # duck-typing fallback: assume it already behaves like an AtomArray
+    all_chains = list(dict.fromkeys(str(c) for c in atom_array.chain_id))
+    if chains_b is None:
+        chains_a_set = set(chains_a)
+        chains_b = [c for c in all_chains if c not in chains_a_set]
+    def _extract(chain_ids: list[str]) -> tuple[list, list, list]:
+        mask = np.zeros(len(atom_array), dtype=bool)
+        for ch in chain_ids:
+            mask |= atom_array.chain_id == ch
+        if not include_hetatm:
+            mask &= ~atom_array.hetero
+        sub = atom_array[mask]
+        coords: list[list[float]] = []
+        anames: list[str] = []
+        rnames: list[str] = []
+        for i in range(len(sub)):
+            name = str(sub.atom_name[i])
+            elem = str(sub.element[i]) if hasattr(sub, "element") else ""
+            if not include_hydrogens and _is_hydrogen(name, elem):
+                continue
+            c = sub.coord[i]
+            coords.append([float(c[0]), float(c[1]), float(c[2])])
+            anames.append(name)
+            rnames.append(str(sub.res_name[i]))
+        return coords, anames, rnames
+    coords_a, names_a, res_a = _extract(chains_a)
+    coords_b, names_b, res_b = _extract(chains_b)
+    return compute_sc(coords_a, names_a, res_a, coords_b, names_b, res_b, parallel)
+def from_boltzgen_refold(
+    refold_cif_path: str | Path,
+    chains_a: list[str],
+    chains_b: list[str] | None = None,
+    include_hydrogens: bool = False,
+    parallel: bool = True,
+) -> ScResult:
+    """Compute SC from a BoltzGen refold_cif/*.cif file using biotite.
+    This is the recommended entry point when scoring BoltzGen designs.
+    It mirrors the CIF-loading pattern used by BoltzGen's own analyze_utils.py.
+    Args:
+        refold_cif_path:    path to refold_cif/<id>.cif or refold_design_cif/<id>.cif
+        chains_a:           chain IDs for molecule A (typically the binder)
+        chains_b:           chain IDs for molecule B; None = all chains not in chains_a
+        include_hydrogens:  include hydrogen atoms (default False)
+        parallel:           enable Rayon parallelism inside sc-rs
+    Raises:
+        ImportError: if biotite is not installed. Install with: pip install biotite
+    """
+    try:
+        import biotite.structure.io.pdbx as pdbx
+    except ImportError as exc:
+        raise ImportError(
+            "biotite is required for from_boltzgen_refold(). "
+            "Install it with: pip install biotite"
+        ) from exc
+    cif_file = pdbx.CIFFile.read(str(refold_cif_path))
+    # model=1 is 1-based in biotite; use_author_fields=False uses label_* fields
+    # (consistent with how BoltzGen writes chain IDs)
+    atom_array = pdbx.get_structure(cif_file, model=1, use_author_fields=False)
+    return from_biotite(
+        atom_array,
+        chains_a,
+        chains_b,
+        include_hetatm=False,
+        include_hydrogens=include_hydrogens,
+        parallel=parallel,
+    )

shape_complementarity-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,12 @@
+Metadata-Version: 2.4
+Name: shape-complementarity
+Version: 0.1.0
+Requires-Dist: numpy>=1.24
+Requires-Dist: biopython>=1.83
+License-File: LICENSE
+Summary: PyO3 bindings to sc-rs for Lawrence-Colman Shape Complementarity
+License: MIT
+Requires-Python: >=3.10
+Project-URL: Bug Tracker, https://github.com/aarteixeira/shape-complementarity/issues
+Project-URL: Homepage, https://github.com/aarteixeira/shape-complementarity
+Project-URL: Repository, https://github.com/aarteixeira/shape-complementarity

shape_complementarity-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+shape_complementarity/__init__.py,sha256=2oRvOqa5HGZdzwCnbcfOP3EVJTjYYSa-eDgKhMdKNfg,464
+shape_complementarity/_core.pyd,sha256=LONuhCzz0OXGI3K4zSrooHfsUre_wVzocVRuxE4pz0Y,671744
+shape_complementarity/batch.py,sha256=WGcowAbk1o-kvNMJljLK0Y33zkta5W0JEB31ccgdW8E,2657
+shape_complementarity/io.py,sha256=WTaoF8JraQy-0J_xyFTiKcwEk4NaSKnzzqQcXu4c1AY,15839
+shape_complementarity-0.1.0.dist-info/METADATA,sha256=JbHV5DCxUPnx8T4wevEaf_EFT-VVqRGM6BqfsEDuR5w,496
+shape_complementarity-0.1.0.dist-info/WHEEL,sha256=OUT0XP5TL9Hq-6CIgsb5m6BAU8pfcNqYjx0xnFDWhNs,96
+shape_complementarity-0.1.0.dist-info/licenses/LICENSE,sha256=XKKSDU9WlUEAyPNlRhq6e2xhVNpJc097JwPZJ1rUnRE,1077
+shape_complementarity-0.1.0.dist-info/sboms/shape-complementarity.cyclonedx.json,sha256=NoXa_punHoyjufzla--bJWmuH99mafVybrK-39pDXys,42436
+shape_complementarity-0.1.0.dist-info/RECORD,,

shape_complementarity-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: maturin (1.13.3)
+Root-Is-Purelib: false
+Tag: cp310-abi3-win_amd64

shape_complementarity-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.