PyPI - hxprobe - Versions diffs - 0.1.0__py3-none-any.whl - Mend

hxprobe 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

hxprobe/__init__.py +29 -0
hxprobe/cli.py +101 -0
hxprobe/data/ubiquitin_dgopen.csv +42 -0
hxprobe/data/ubiquitin_ensemble.pdb.gz +0 -0
hxprobe/diff.py +68 -0
hxprobe/ensemble.py +60 -0
hxprobe/operator.py +252 -0
hxprobe/probe.py +165 -0
hxprobe/protonate.py +70 -0
hxprobe-0.1.0.dist-info/METADATA +165 -0
hxprobe-0.1.0.dist-info/RECORD +15 -0
hxprobe-0.1.0.dist-info/WHEEL +5 -0
hxprobe-0.1.0.dist-info/entry_points.txt +2 -0
hxprobe-0.1.0.dist-info/licenses/LICENSE +21 -0
hxprobe-0.1.0.dist-info/top_level.txt +1 -0

hxprobe/__init__.py ADDED Viewed

@@ -0,0 +1,29 @@
+"""hxprobe -- the 50-conformer probe.
+Read residue-resolved hydrogen-exchange opening free energies out of a
+conformational ensemble with a white-box, two-parameter physical operator.
+"""
+from .operator import (BETA_C, BETA_H, CUT_NC_NM, CUT_NH_NM, IJ_NC, IJ_NH,
+                       R_KCAL, T_REF, ProtectionResult, compute, nc_nh_frame)
+from .ensemble import load_ensemble, optionally_protonate
+from .probe import (convergence, example_ensemble_path, global_unfolding,
+                    load_experimental, score_ensemble, spearman)
+# Package version; matches the Version field of the distribution metadata.
+__version__ = "0.1.0"
+# Names exported by ``from hxprobe import *``: the high-level probe functions,
+# the low-level operator, the ensemble helpers, the bundled-data accessors and
+# the operator constants imported above.
+__all__ = [
+    "__version__",
+    "score_ensemble",
+    "convergence",
+    "global_unfolding",
+    "compute",
+    "nc_nh_frame",
+    "ProtectionResult",
+    "load_ensemble",
+    "optionally_protonate",
+    "example_ensemble_path",
+    "load_experimental",
+    "spearman",
+    "BETA_C", "BETA_H", "CUT_NC_NM", "CUT_NH_NM", "IJ_NC", "IJ_NH",
+    "R_KCAL", "T_REF",
+]

hxprobe/cli.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""Command-line interface for hxprobe."""
+from __future__ import annotations
+import argparse
+import sys
+def _add_common(p):
+    """Register the ensemble input and operator options shared by subcommands.
+    The three numeric options default to ``None`` so that the caller can tell
+    "not given" apart from an explicit value.
+    """
+    p.add_argument("ensemble", help="multi-model PDB (.pdb/.pdb.gz) or trajectory file")
+    p.add_argument("--top", default=None, help="topology file (for trajectory inputs)")
+    p.add_argument("--protonate", default="auto",
+                   choices=["auto", "none", "pdbfixer"],
+                   help="how to obtain backbone amide hydrogens (default: auto)")
+    float_options = (
+        ("--betaC", "contact coefficient"),
+        ("--betaH", "H-bond coefficient"),
+        ("--temperature", "temperature (K)"),
+    )
+    for flag, description in float_options:
+        p.add_argument(flag, type=float, default=None, help=description)
+def _operator_kw(args):
+    """Turn parsed CLI options into keyword arguments for the scoring functions.
+    Numeric options left at ``None`` are replaced by the package defaults
+    ``BETA_C``, ``BETA_H`` and ``T_REF``.
+    """
+    from . import BETA_C, BETA_H, T_REF
+    def _or_default(value, fallback):
+        return fallback if value is None else value
+    return {
+        "protonate": args.protonate,
+        "betaC": _or_default(args.betaC, BETA_C),
+        "betaH": _or_default(args.betaH, BETA_H),
+        "temperature": _or_default(args.temperature, T_REF),
+    }
+def _cmd_score(args):
+    """Handle ``hxprobe score``: score the ensemble, then write a CSV or print a table.
+    With ``--out`` the result is written to that path and a one-line summary is
+    printed; otherwise the per-residue table is printed.  Returns 0.
+    """
+    from . import score_ensemble
+    res = score_ensemble(args.ensemble, top=args.top, **_operator_kw(args))
+    if args.out:
+        res.to_csv(args.out)
+        print(f"wrote {len(res)} residues to {args.out}")
+    else:
+        # Build the table only on the branch that prints it; ``to_csv`` builds
+        # its own, so constructing one up front was wasted work with ``--out``.
+        print(res.to_dataframe().to_string(index=False))
+    return 0
+def _cmd_converge(args):
+    """Handle ``hxprobe converge``: print the convergence table.  Returns 0.
+    ``convergence`` may hand back either an object with ``to_string`` or a
+    plain iterable of rows; the latter is printed one row per line.
+    """
+    from . import convergence
+    out = convergence(args.ensemble, top=args.top, **_operator_kw(args))
+    try:
+        table = out.to_string(index=False)
+    except AttributeError:
+        for row in out:
+            print(row)
+    else:
+        print(table)
+    return 0
+def _cmd_example(args):
+    """Handle ``hxprobe example``: run the bundled ubiquitin demonstration.
+    Scores the bundled ensemble, prints the Spearman correlation between the
+    per-residue ln PF and the bundled experimental values, then prints the
+    convergence table.  Returns 0.
+    """
+    from . import (convergence, example_ensemble_path, load_experimental,
+                   score_ensemble, spearman)
+    missing = float("nan")
+    path = example_ensemble_path()
+    exp = load_experimental()
+    print(f"bundled example: 50-conformer leakage-free ubiquitin ensemble\n  {path}")
+    res = score_ensemble(path, protonate="none")  # already protonated
+    # One entry per scored residue; None where no experimental value exists.
+    measured = [exp.get(int(rs)) for rs in res.resSeq]
+    rho = spearman(res.lnPF, [missing if v is None else v for v in measured])
+    n_overlap = len([v for v in measured if v is not None])
+    print("\nper-residue ln PF vs experimental dG_open (native-state HX):")
+    print(f"  Spearman rho = {rho:+.3f}  over {n_overlap} measured residues")
+    print("\nconvergence with ensemble size:")
+    conv = convergence(path, protonate="none", reference=exp)
+    try:
+        table = conv.to_string(index=False)
+    except AttributeError:
+        for row in conv:
+            print("  ", row)
+    else:
+        print(table)
+    return 0
+def build_parser():
+    """Construct the ``hxprobe`` argument parser and its three subcommands.
+    ``score`` and ``converge`` take the shared ensemble/operator options
+    (``score`` additionally takes ``--out``); ``example`` takes none.  Each
+    subparser stores its handler under ``func``.
+    """
+    parser = argparse.ArgumentParser(
+        prog="hxprobe",
+        description="The 50-conformer probe: residue-resolved hydrogen-exchange "
+                    "opening free energies from conformational ensembles.")
+    commands = parser.add_subparsers(dest="command", required=True)
+    score = commands.add_parser("score", help="per-residue opening free energies for an ensemble")
+    _add_common(score)
+    score.add_argument("--out", default=None, help="write a CSV instead of printing")
+    score.set_defaults(func=_cmd_score)
+    converge = commands.add_parser("converge", help="convergence of the readout with ensemble size")
+    _add_common(converge)
+    converge.set_defaults(func=_cmd_converge)
+    example = commands.add_parser("example", help="run the bundled ubiquitin example")
+    example.set_defaults(func=_cmd_example)
+    return parser
+def main(argv=None):
+    """Parse ``argv`` and run the selected subcommand, returning its exit status."""
+    namespace = build_parser().parse_args(argv)
+    handler = namespace.func
+    return handler(namespace)
+# Script entry point: the handler's return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())

hxprobe/data/ubiquitin_dgopen.csv ADDED Viewed

@@ -0,0 +1,42 @@
+resi,dGopen_kcal,grade,resn
+2,1.831,B,
+3,6.335,B,
+4,6.307,B,
+5,5.897,B,
+6,4.929,B,
+7,6.108,B,
+12,2.969,B,
+13,5.202,B,
+15,2.526,B,
+16,3.436,B,
+17,5.565,B,
+22,2.171,B,
+23,5.771,B,
+25,4.292,B,
+26,7.699,B,
+28,5.513,B,
+29,5.897,B,
+30,7.169,B,
+31,4.703,B,
+32,2.015,B,
+36,2.837,B,
+40,2.679,B,
+41,4.093,B,
+42,4.503,B,
+44,6.307,B,
+45,4.333,B,
+48,4.586,B,
+49,2.482,B,
+50,3.855,B,
+55,4.818,B,
+56,6.045,B,
+57,3.436,B,
+58,2.969,B,
+59,5.612,B,
+60,2.426,B,
+61,4.611,B,
+65,4.201,B,
+67,3.93,B,
+68,4.407,B,
+69,5.339,B,
+70,4.724,B,

hxprobe/data/ubiquitin_ensemble.pdb.gz ADDED Viewed

Binary file

hxprobe/diff.py ADDED Viewed

@@ -0,0 +1,68 @@
+"""Differentiable Best--Vendruscolo operator (optional, requires PyTorch).
+The hard contact and hydrogen-bond counts are replaced by smooth sigmoidal
+switching functions controlled by a temperature ``tau``; the discrete operator
+is recovered as ``tau -> 0``.  Because protection then becomes a differentiable
+function of atomic coordinates, the residue-level readout can in principle
+provide gradients to steer a generator toward the rare openings it
+under-populates.
+Install with ``pip install hxprobe[diff]``.
+"""
+from __future__ import annotations
+from .operator import BETA_C, BETA_H, CUT_NC_NM, CUT_NH_NM, IJ_NC, IJ_NH, R_KCAL, T_REF
+def soft_nc_nh(xyz, amideN_idx, amideH_idx, heavy_idx, O_idx, resid,
+               cut_Nc=CUT_NC_NM, cut_Nh=CUT_NH_NM, ij_Nc=IJ_NC, ij_Nh=IJ_NH,
+               tau=0.02):
+    """Differentiable per-residue (N_C, N_H) for one conformer.
+    Each hard count is replaced by a sum of ``sigmoid((cutoff - distance) / tau)``
+    terms over the atoms that pass the sequence-separation mask.
+    Parameters
+    ----------
+    xyz : torch.Tensor ``[n_atoms, 3]`` (nanometres, requires_grad as needed)
+    amideN_idx, amideH_idx : list[int]
+        Per-residue amide N / amide H atom indices (``-1`` if absent).
+    heavy_idx, O_idx : list[int]
+        Heavy-atom and backbone-carbonyl-oxygen atom indices.
+    resid : sequence[int]
+        Residue index of every atom (used for the sequence-separation mask).
+    cut_Nc, cut_Nh : float
+        Distance cut-offs for contacts / hydrogen bonds, in the units of ``xyz``.
+    ij_Nc, ij_Nh : int
+        Minimum absolute residue-index separation for a contact / hydrogen bond.
+    tau : float
+        Switching temperature; smaller is sharper (recovers the hard operator).
+        Must be positive.
+    Returns
+    -------
+    (NC, NH) : torch.Tensor, torch.Tensor    each ``[n_res]``, in the dtype of ``xyz``
+        Residues without an amide N keep 0 in both; residues without an amide
+        H keep 0 in ``NH``.
+    Raises
+    ------
+    ValueError
+        If ``tau`` is not positive.
+    """
+    import torch
+    if tau <= 0:
+        raise ValueError(f"tau must be positive, got {tau!r}")
+    dev = xyz.device
+    R = len(amideN_idx)
+    # Allocate in the dtype of ``xyz``: with the default float32 buffers,
+    # float64 coordinates were silently downcast when results were written back.
+    NC = torch.zeros(R, device=dev, dtype=xyz.dtype)
+    NH = torch.zeros(R, device=dev, dtype=xyz.dtype)
+    heavy = torch.as_tensor(heavy_idx, device=dev, dtype=torch.long)
+    hres = torch.as_tensor([resid[i] for i in heavy_idx], device=dev)
+    Os = torch.as_tensor(O_idx, device=dev, dtype=torch.long)
+    Ores = torch.as_tensor([resid[i] for i in O_idx], device=dev)
+    # Loop-invariant gathers, hoisted out of the per-residue loop.
+    heavy_xyz = xyz[heavy]
+    O_xyz = xyz[Os]
+    for r in range(R):
+        ni = amideN_idx[r]
+        if ni < 0:
+            continue
+        ri = resid[ni]
+        d = torch.norm(heavy_xyz - xyz[ni], dim=1)
+        mask_c = (torch.abs(hres - ri) >= ij_Nc).to(xyz.dtype)
+        NC[r] = torch.sum(torch.sigmoid((cut_Nc - d) / tau) * mask_c)
+        hi = amideH_idx[r]
+        if hi >= 0:
+            dh = torch.norm(O_xyz - xyz[hi], dim=1)
+            mask_h = (torch.abs(Ores - ri) >= ij_Nh).to(xyz.dtype)
+            NH[r] = torch.sum(torch.sigmoid((cut_Nh - dh) / tau) * mask_h)
+    return NC, NH
+def soft_lnpf(xyz, amideN_idx, amideH_idx, heavy_idx, O_idx, resid,
+              betaC=BETA_C, betaH=BETA_H, **kw):
+    """Differentiable per-residue ln PF for one conformer.
+    Combines the soft counts from :func:`soft_nc_nh` as
+    ``betaC * N_C + betaH * N_H``; extra keyword arguments are forwarded to
+    :func:`soft_nc_nh`.
+    """
+    contacts, hbonds = soft_nc_nh(
+        xyz, amideN_idx, amideH_idx, heavy_idx, O_idx, resid, **kw)
+    contact_term = betaC * contacts
+    hbond_term = betaH * hbonds
+    return contact_term + hbond_term

hxprobe/ensemble.py ADDED Viewed

@@ -0,0 +1,60 @@
+"""Loading conformational ensembles.
+Thin wrappers over MDTraj that accept the common ways an ensemble is stored:
+a multi-model PDB (optionally gzipped), or a trajectory file plus a topology.
+"""
+from __future__ import annotations
+from typing import Optional
+def load_ensemble(path: str, top: Optional[str] = None):
+    """Load a conformational ensemble as an ``mdtraj.Trajectory``.
+    Parameters
+    ----------
+    path : str
+        A multi-model PDB (``.pdb`` / ``.pdb.gz``) or a trajectory file
+        (``.xtc``, ``.dcd``, ``.h5`` ...).
+    top : str, optional
+        Topology file (e.g. a ``.pdb``); required for trajectory formats that
+        do not embed topology.  Only forwarded to MDTraj when given.
+    Returns
+    -------
+    mdtraj.Trajectory
+    """
+    import mdtraj as md
+    load_kw = {} if top is None else {"top": top}
+    return md.load(path, **load_kw)
+def optionally_protonate(traj, method: str = "auto"):
+    """Return an ensemble guaranteed to be scorable for the H-bond term.
+    ``method``:
+    * ``"none"``   -- score as-is (geometric amide-H placement is used inside
+      the operator when explicit hydrogens are absent).
+    * ``"pdbfixer"`` -- repair missing heavy atoms and add real hydrogens with
+      PDBFixer/OpenMM (the ``hxprobe[fix]`` extra). Most faithful for raw
+      crystal or heavy-atom generated structures.
+    * ``"auto"`` (default) -- use PDBFixer if it is installed and hydrogens are
+      missing, otherwise fall back to ``"none"``.
+    With ``"pdbfixer"`` or ``"auto"`` an ensemble that already carries explicit
+    amide hydrogens is returned unchanged.
+    Raises
+    ------
+    ValueError
+        If ``method`` is not one of the three names above.
+    """
+    from .protonate import has_explicit_hydrogens, pdbfixer_protonate
+    # Validate first: previously a misspelt method was silently accepted
+    # whenever the ensemble already had hydrogens.
+    if method not in ("none", "pdbfixer", "auto"):
+        raise ValueError(f"unknown protonation method: {method!r}")
+    if method == "none":
+        return traj
+    if has_explicit_hydrogens(traj):
+        return traj
+    if method == "pdbfixer":
+        return pdbfixer_protonate(traj)
+    # method == "auto": best effort, never fatal.
+    try:
+        return pdbfixer_protonate(traj)
+    except ImportError:
+        # PDBFixer/OpenMM not installed: the documented quiet fallback.
+        return traj
+    except Exception as exc:
+        # PDBFixer is installed but failed; keep the fallback but say so,
+        # because the result now relies on geometric amide-H placement.
+        import warnings
+        warnings.warn(
+            f"PDBFixer protonation failed ({exc!r}); scoring the ensemble as-is",
+            RuntimeWarning, stacklevel=2)
+        return traj

hxprobe/operator.py ADDED Viewed

@@ -0,0 +1,252 @@
+"""The white-box Best--Vendruscolo forward operator.
+For each backbone amide *i* the log protection factor is the ensemble average
+    ln PF_i = beta_C * <N_C,i> + beta_H * <N_H,i>
+where ``N_C`` is the number of heavy atoms within ``cut_Nc`` of the amide
+nitrogen (sequence separation >= ``ij_Nc``) and ``N_H`` is the number of
+backbone carbonyl oxygens within ``cut_Nh`` of the amide hydrogen (sequence
+separation >= ``ij_Nh``).  Under the EX2 regime the protection factor converts
+to a per-residue opening free energy
+    dG_open,i = RT * ln PF_i.
+The two coefficients are fixed to their classical Best--Vendruscolo values
+(0.35 and 2.0) and are *not* fitted to stability data, so any residue-level
+signal the operator recovers originates in the ensemble itself.
+This is a faithful, dependency-light re-implementation of the operator used in
+the accompanying study; geometry is computed with MDTraj and NumPy only.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Optional
+import numpy as np
+# Best--Vendruscolo classical coefficients (never fitted to stability labels).
+BETA_C = 0.35
+BETA_H = 2.0
+# Geometric cut-offs in nanometres (6.5 Angstrom contacts, 2.6 Angstrom H-bond).
+CUT_NC_NM = 0.65
+CUT_NH_NM = 0.26
+# Minimum absolute residue-index separation for an atom to count as a contact
+# (IJ_NC) or as a hydrogen-bond acceptor (IJ_NH).
+IJ_NC = 3
+IJ_NH = 2
+# Gas constant (kcal / mol / K) and reference temperature.
+R_KCAL = 0.0019872041
+T_REF = 298.15
+# Three-letter -> one-letter codes of the 20 standard amino acids; residues
+# whose name is not a key here are treated as non-standard and not scored.
+_THREE_TO_ONE = {
+    "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C", "GLN": "Q",
+    "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I", "LEU": "L", "LYS": "K",
+    "MET": "M", "PHE": "F", "PRO": "P", "SER": "S", "THR": "T", "TRP": "W",
+    "TYR": "Y", "VAL": "V",
+}
+# Atom names accepted as the backbone amide hydrogen of a residue.
+_AMIDE_H_NAMES = ("H", "HN", "H1", "HT1")
+_N_H_BOND_NM = 0.101  # ~1.01 Angstrom N-H bond length, for geometric placement.
+@dataclass
+class ProtectionResult:
+    """Container for the per-residue readout of one scored ensemble.
+    The first six arrays have one entry per scored residue; ``NC`` and ``NH``
+    keep the per-conformer counts from which the means were taken.
+    """
+    resSeq: np.ndarray           # residue numbers (author/topology numbering)
+    resn: np.ndarray             # one-letter residue names
+    NC_mean: np.ndarray          # ensemble-averaged heavy-atom contacts
+    NH_mean: np.ndarray          # ensemble-averaged amide hydrogen bonds
+    lnPF: np.ndarray             # ln protection factor
+    dGopen_kcal: np.ndarray      # opening free energy (kcal/mol)
+    NC: np.ndarray = field(repr=False)   # per-conformer contacts  [n_res, n_frames]
+    NH: np.ndarray = field(repr=False)   # per-conformer H-bonds   [n_res, n_frames]
+    weights: np.ndarray = field(repr=False)   # normalised per-conformer weights
+    temperature: float = T_REF
+    n_frames: int = 0
+    @property
+    def log10PF(self) -> np.ndarray:
+        """The protection factor on a base-10 logarithmic scale."""
+        ln10 = np.log(10.0)
+        return self.lnPF / ln10
+    def to_dataframe(self):
+        """Return the per-residue columns as a ``pandas.DataFrame``."""
+        import pandas as pd
+        columns = ("resSeq", "resn", "NC_mean", "NH_mean",
+                   "lnPF", "log10PF", "dGopen_kcal")
+        return pd.DataFrame({name: getattr(self, name) for name in columns})
+    def to_csv(self, path: str) -> None:
+        """Write the per-residue table to ``path`` as CSV without an index column."""
+        frame = self.to_dataframe()
+        frame.to_csv(path, index=False)
+    def __len__(self) -> int:
+        """Number of scored residues."""
+        return len(self.resSeq)
+def _largest_protein_chain(traj):
+    """Atom-slice ``traj`` down to the chain with the most standard residues.
+    Ties go to the chain that comes first in the topology.  A topology without
+    any chain is returned unchanged.
+    """
+    def n_standard(chain):
+        return sum(1 for res in chain.residues if res.name in _THREE_TO_ONE)
+    # ``max`` returns the first maximal element, which matches a strict ``>`` scan.
+    winner = max(traj.topology.chains, key=n_standard, default=None)
+    if winner is None:
+        return traj
+    return traj.atom_slice([atom.index for atom in winner.atoms])
+def _geometric_amide_h(xyz, top):
+    """Geometric backbone amide-H position per residue, in the residue plane.
+    Used only when explicit hydrogens are absent.  H is placed ``_N_H_BOND_NM``
+    from N, pointing opposite to the sum of the unit vectors N->CA and
+    N->C(prev) (the external bisector of the C(prev)-N-CA angle).  Prolines,
+    non-standard residues, residues lacking N or CA, and residues without a
+    preceding residue in the same chain that carries a ``C`` atom are skipped.
+    Returns ``{residue.index: h_xyz_nm}``.
+    """
+    residues = list(top.residues)
+    placed = {}
+    for res in residues:
+        if res.name == "PRO" or res.name not in _THREE_TO_ONE:
+            continue
+        by_name = {atom.name: atom.index for atom in res.atoms}
+        if not ("N" in by_name and "CA" in by_name):
+            continue
+        # The carbonyl carbon must come from the previous residue of the same chain.
+        if res.index <= 0:
+            continue
+        before = residues[res.index - 1]
+        if before.chain.index != res.chain.index:
+            continue
+        prev_c = next((atom.index for atom in before.atoms if atom.name == "C"), None)
+        if prev_c is None:
+            continue
+        n_pos = xyz[by_name["N"]]
+        to_ca = xyz[by_name["CA"]] - n_pos
+        to_c = xyz[prev_c] - n_pos
+        len_ca = np.linalg.norm(to_ca)
+        len_c = np.linalg.norm(to_c)
+        # Degenerate (coincident) atoms give no usable direction.
+        if len_ca < 1e-6 or len_c < 1e-6:
+            continue
+        bisector = to_ca / len_ca + to_c / len_c
+        len_bis = np.linalg.norm(bisector)
+        if len_bis < 1e-6:
+            continue
+        placed[res.index] = n_pos - _N_H_BOND_NM * (bisector / len_bis)
+    return placed
+def nc_nh_frame(traj_single, cut_Nc=CUT_NC_NM, cut_Nh=CUT_NH_NM,
+                ij_Nc=IJ_NC, ij_Nh=IJ_NH, place_h_if_missing=True):
+    """Per-residue (N_C, N_H) for a single-frame MDTraj trajectory.
+    Returns ``{resSeq: (one_letter, N_C, N_H)}``.  Prolines and non-standard
+    residues are skipped (no exchangeable backbone amide), as are residues
+    without an ``N`` atom.
+    ``N_C`` counts heavy atoms closer than ``cut_Nc`` to the amide N whose
+    residue index differs by at least ``ij_Nc``.  ``N_H`` counts atoms named
+    ``O`` (water excluded) closer than ``cut_Nh`` to the amide H whose residue
+    index differs by at least ``ij_Nh``.  When the frame has no explicit amide
+    hydrogens and ``place_h_if_missing`` is true, the H position is placed
+    geometrically; a residue with no H position gets ``N_H = 0``.
+    Because the result is keyed by ``resSeq``, residues sharing a number
+    (several chains in one frame) overwrite each other.
+    """
+    top = traj_single.topology
+    xyz = traj_single.xyz[0]
+    # NOTE(review): heavy atoms of solvent/ligands are still counted as
+    # contacts when present -- confirm this is intended.
+    heavy = np.array([a.index for a in top.atoms
+                      if a.element is not None and a.element.symbol != "H"],
+                     dtype=int)
+    hv_res = np.array([top.atom(i).residue.index for i in heavy], dtype=int)
+    # Water oxygens are also named "O" but are not backbone carbonyl acceptors.
+    o_idx = np.array([a.index for a in top.atoms
+                      if a.name == "O" and not a.residue.is_water], dtype=int)
+    o_res = np.array([top.atom(i).residue.index for i in o_idx], dtype=int)
+    # Water hydrogens are commonly named H1/H2; ignore them so that explicit
+    # solvent does not switch off geometric placement for a heavy-atom protein.
+    has_explicit_h = any(a.name in _AMIDE_H_NAMES and a.element is not None
+                         and a.element.symbol == "H"
+                         and not a.residue.is_water for a in top.atoms)
+    geo_h = {} if has_explicit_h else (
+        _geometric_amide_h(xyz, top) if place_h_if_missing else {})
+    out = {}
+    for res in top.residues:
+        if res.name == "PRO" or res.name not in _THREE_TO_ONE:
+            continue
+        ns = [a for a in res.atoms if a.name == "N"]
+        if not ns:
+            continue
+        ri = res.index
+        npos = xyz[ns[0].index]
+        if heavy.size:
+            d = np.linalg.norm(xyz[heavy] - npos, axis=1)
+            nc = int(((d < cut_Nc) & (np.abs(hv_res - ri) >= ij_Nc)).sum())
+        else:
+            nc = 0
+        # Prefer an explicit amide H; otherwise use the geometric one if available.
+        h_pos = None
+        hs = [a for a in res.atoms if a.name in _AMIDE_H_NAMES]
+        if hs:
+            h_pos = xyz[hs[0].index]
+        elif ri in geo_h:
+            h_pos = geo_h[ri]
+        nh = 0
+        if h_pos is not None and o_idx.size:
+            dh = np.linalg.norm(xyz[o_idx] - h_pos, axis=1)
+            nh = int(((dh < cut_Nh) & (np.abs(o_res - ri) >= ij_Nh)).sum())
+        out[res.resSeq] = (_THREE_TO_ONE[res.name], nc, nh)
+    return out
+def compute(traj, weights=None, betaC=BETA_C, betaH=BETA_H,
+            cut_Nc=CUT_NC_NM, cut_Nh=CUT_NH_NM, ij_Nc=IJ_NC, ij_Nh=IJ_NH,
+            temperature=T_REF, select_largest_chain=True,
+            place_h_if_missing=True) -> ProtectionResult:
+    """Score a conformational ensemble into per-residue opening free energies.
+    Parameters
+    ----------
+    traj : mdtraj.Trajectory
+        The conformational ensemble (one or more frames).
+    weights : array-like, optional
+        Per-conformer Boltzmann weights, one per frame (defaults to uniform).
+        They are normalised to sum to 1.
+    betaC, betaH : float
+        Operator coefficients (default to the classical 0.35 / 2.0).
+    cut_Nc, cut_Nh, ij_Nc, ij_Nh
+        Distance cut-offs and sequence separations, passed to :func:`nc_nh_frame`.
+    temperature : float
+        Temperature (K) for the RT * ln PF conversion.
+    select_largest_chain : bool
+        Slice ``traj`` to the chain with the most standard residues first.
+    place_h_if_missing : bool
+        Passed to :func:`nc_nh_frame`.
+    Returns
+    -------
+    ProtectionResult
+        Residues are ordered by ``resSeq``.  Frames in which a residue is
+        absent hold NaN in ``NC`` / ``NH`` and are left out of its average.
+    Raises
+    ------
+    ValueError
+        If ``weights`` does not have exactly one entry per frame, or does not
+        sum to a positive finite value.
+    """
+    if select_largest_chain:
+        traj = _largest_protein_chain(traj)
+    F = traj.n_frames
+    if weights is None:
+        w = np.ones(F) / F
+    else:
+        w = np.asarray(weights, float)
+        # Fail early with a clear message instead of an opaque boolean-index
+        # error (wrong length) or NaN results (zero sum) further down.
+        if w.shape != (F,):
+            raise ValueError(
+                f"weights must have one entry per frame: expected shape ({F},), "
+                f"got {w.shape}")
+        total = w.sum()
+        if not np.isfinite(total) or total <= 0:
+            raise ValueError("weights must sum to a positive, finite value")
+        w = w / total
+    perframe = [nc_nh_frame(traj[k], cut_Nc, cut_Nh, ij_Nc, ij_Nh,
+                            place_h_if_missing) for k in range(F)]
+    # Union of residue numbers seen in any frame (empty when there are no frames).
+    all_res = sorted(set().union(*perframe))
+    n_res = len(all_res)
+    resn = [next(fr[rs][0] for fr in perframe if rs in fr) for rs in all_res]
+    # [n_res, n_frames] count matrices; NaN marks "residue absent in this frame".
+    NC = np.array([[fr[rs][1] if rs in fr else np.nan for fr in perframe]
+                   for rs in all_res], float).reshape(n_res, F)
+    NH = np.array([[fr[rs][2] if rs in fr else np.nan for fr in perframe]
+                   for rs in all_res], float).reshape(n_res, F)
+    # weighted ensemble average over the frames in which each residue is present
+    nc_mean = np.full(n_res, np.nan)
+    nh_mean = np.full(n_res, np.nan)
+    for i, (nc_row, nh_row) in enumerate(zip(NC, NH)):
+        present = ~np.isnan(nc_row)
+        if not present.any():
+            continue
+        ww = w[present] / w[present].sum()
+        nc_mean[i] = np.sum(ww * nc_row[present])
+        nh_mean[i] = np.sum(ww * nh_row[present])
+    lnPF = betaC * nc_mean + betaH * nh_mean
+    dG = R_KCAL * temperature * lnPF
+    return ProtectionResult(
+        resSeq=np.array(all_res), resn=np.array(resn),
+        NC_mean=nc_mean, NH_mean=nh_mean, lnPF=lnPF, dGopen_kcal=dG,
+        NC=NC, NH=NH, weights=w, temperature=temperature, n_frames=F,
+    )

hxprobe/probe.py ADDED Viewed

@@ -0,0 +1,165 @@
+"""High-level interface: the 50-conformer probe.
+* :func:`score_ensemble` -- per-residue opening free energies for an ensemble.
+* :func:`convergence`    -- how the readout converges with ensemble size.
+* :func:`global_unfolding` -- the most-open-conformer global stability proxy.
+* :func:`example_ensemble_path` / :func:`load_experimental` -- bundled data.
+"""
+from __future__ import annotations
+from importlib import resources
+from typing import Optional, Sequence, Union
+import numpy as np
+from .operator import (BETA_C, BETA_H, R_KCAL, T_REF, ProtectionResult,
+                       _largest_protein_chain, compute)
+from .ensemble import load_ensemble, optionally_protonate
+# --------------------------------------------------------------------------- #
+# numpy-only Spearman (avoids a SciPy dependency)
+# --------------------------------------------------------------------------- #
+def _rankdata(x: np.ndarray) -> np.ndarray:
+    """Return 1-based ranks of ``x``, giving tied values their average rank."""
+    values = np.asarray(x, float)
+    count = len(values)
+    perm = np.argsort(values, kind="mergesort")
+    ranks = np.empty(count, float)
+    ranks[perm] = np.arange(1, count + 1, dtype=float)
+    ordered = values[perm]
+    # Positions in the sorted order where a new run of equal values begins.
+    breaks = np.flatnonzero(ordered[1:] != ordered[:-1]) + 1
+    starts = np.concatenate(([0], breaks))
+    stops = np.concatenate((breaks, [count]))
+    for lo, hi in zip(starts, stops):
+        if hi - lo > 1:
+            # Mean of the ordinal ranks lo+1 .. hi shared by the tied run.
+            ranks[perm[lo:hi]] = (lo + 1 + hi) / 2.0
+    return ranks
+def spearman(a: Sequence[float], b: Sequence[float]) -> float:
+    """Spearman rank correlation (NumPy only).
+    Pairs in which either value is not finite are dropped; with fewer than
+    three remaining pairs the result is NaN.
+    """
+    first = np.asarray(a, float)
+    second = np.asarray(b, float)
+    usable = np.isfinite(first) & np.isfinite(second)
+    if np.count_nonzero(usable) < 3:
+        return float("nan")
+    rank_corr = np.corrcoef(_rankdata(first[usable]), _rankdata(second[usable]))
+    return float(rank_corr[0, 1])
+def _as_traj(ensemble, top=None):
+    """Return a loaded trajectory for ``ensemble``.
+    Strings and ``os.PathLike`` objects (e.g. ``pathlib.Path``) are treated as
+    file paths and loaded with :func:`load_ensemble`, forwarding ``top``.
+    Anything else is assumed to be an already-loaded trajectory and is
+    returned unchanged.
+    """
+    import os
+    # Path objects used to fall through as if they were trajectories.
+    if isinstance(ensemble, (str, os.PathLike)):
+        return load_ensemble(os.fspath(ensemble), top=top)
+    return ensemble
+def score_ensemble(ensemble, top: Optional[str] = None, protonate: str = "auto",
+                   betaC: float = BETA_C, betaH: float = BETA_H,
+                   temperature: float = T_REF, **kw) -> ProtectionResult:
+    """Score a conformational ensemble into per-residue opening free energies.
+    The ensemble is loaded if needed, reduced to its largest protein chain,
+    protonated according to ``protonate`` and then passed to
+    :func:`hxprobe.operator.compute`.
+    Parameters
+    ----------
+    ensemble : str or mdtraj.Trajectory
+        Path to a multi-model PDB / trajectory, or a loaded trajectory.
+    top : str, optional
+        Topology file when ``ensemble`` is a trajectory path.
+    protonate : {"auto", "none", "pdbfixer"}
+        How to obtain backbone amide hydrogens for the H-bond term.  ``"auto"``
+        uses explicit hydrogens if present, else PDBFixer if installed, else a
+        geometric placement.  See :func:`hxprobe.ensemble.optionally_protonate`.
+    betaC, betaH, temperature : float
+        Forwarded to ``compute``.
+    **kw
+        Further keyword arguments for ``compute``.
+    """
+    loaded = _as_traj(ensemble, top)
+    chain_only = _largest_protein_chain(loaded)
+    scorable = optionally_protonate(chain_only, method=protonate)
+    # The chain was selected above, so ``compute`` must not select again.
+    fixed_kw = dict(betaC=betaC, betaH=betaH, temperature=temperature,
+                    select_largest_chain=False)
+    return compute(scorable, **fixed_kw, **kw)
+def _lnpf_from_first_n(res: ProtectionResult, n: int, betaC: float, betaH: float):
+    """Per-residue ln PF recomputed from the first ``n`` conformers only.
+    Uses an unweighted, NaN-ignoring mean of the stored per-conformer counts.
+    """
+    head = slice(None, n)
+    mean_contacts = np.nanmean(res.NC[:, head], axis=1)
+    mean_hbonds = np.nanmean(res.NH[:, head], axis=1)
+    return betaC * mean_contacts + betaH * mean_hbonds
+def convergence(ensemble, ns: Optional[Sequence[int]] = None,
+                reference: Optional[dict] = None, top: Optional[str] = None,
+                protonate: str = "auto", betaC: float = BETA_C,
+                betaH: float = BETA_H, **kw):
+    """Convergence of the readout with ensemble size.
+    Returns a ``pandas.DataFrame`` with one row per ``n`` (or the same rows as
+    a list of dicts if pandas cannot be used).  Each row holds ``n``, the
+    Spearman correlation of the first-``n``-conformer ln PF against the
+    full-ensemble ln PF (``self_spearman``) and, if ``reference`` is given,
+    against the experimental opening free energies (``ref_spearman``).
+    Parameters
+    ----------
+    ns : sequence of int, optional
+        Ensemble sizes to evaluate; values above the number of frames are
+        clamped to it.  Defaults to those of 5, 10, 25, 50, 100, 200 that do
+        not exceed the number of frames, plus the full ensemble.
+    reference : dict, optional
+        Maps ``resSeq -> experimental dG_open`` (see :func:`load_experimental`).
+    Raises
+    ------
+    ValueError
+        If a user-supplied ``n`` is smaller than 1.
+    """
+    res = score_ensemble(ensemble, top=top, protonate=protonate,
+                         betaC=betaC, betaH=betaH, **kw)
+    F = res.n_frames
+    if ns is None:
+        ns = [n for n in (5, 10, 25, 50, 100, 200) if n <= F] or [F]
+        if F not in ns:
+            ns = list(ns) + [F]
+    else:
+        ns = [int(n) for n in ns]
+        # A negative n would slice frames from the end and n == 0 gives an
+        # all-NaN row; both are silently wrong, so reject them.
+        bad = [n for n in ns if n < 1]
+        if bad:
+            raise ValueError(f"ensemble sizes in ns must be >= 1, got {bad}")
+    full = res.lnPF
+    ref_vec = None
+    if reference is not None:
+        ref_vec = np.array([reference.get(int(rs), np.nan) for rs in res.resSeq], float)
+    rows = []
+    for n in ns:
+        n = int(min(n, F))
+        lnpf_n = _lnpf_from_first_n(res, n, betaC, betaH)
+        row = {"n": n, "self_spearman": spearman(lnpf_n, full)}
+        if ref_vec is not None:
+            row["ref_spearman"] = spearman(lnpf_n, ref_vec)
+        rows.append(row)
+    # Deliberate best effort: fall back to the plain rows if pandas is unusable.
+    try:
+        import pandas as pd
+        return pd.DataFrame(rows)
+    except Exception:
+        return rows
+def global_unfolding(ensemble, top: Optional[str] = None, protonate: str = "auto",
+                     betaC: float = BETA_C, betaH: float = BETA_H,
+                     temperature: float = T_REF, **kw) -> float:
+    """Most-open-conformer protection: a global fold-stability proxy.
+    The ln PF of every residue is evaluated per conformer and averaged over
+    residues (ignoring NaN); the smallest of these per-conformer means,
+    multiplied by ``RT``, is returned.  That conformer is taken as a proxy for
+    the unfolded state, so larger values track higher global stability
+    (dG_fold).
+    """
+    scored = score_ensemble(ensemble, top=top, protonate=protonate,
+                            betaC=betaC, betaH=betaH, temperature=temperature, **kw)
+    # [n_res, n_frames] -> one residue-averaged value per conformer.
+    lnpf_by_conformer = betaC * scored.NC + betaH * scored.NH
+    conformer_means = np.nanmean(lnpf_by_conformer, axis=0)
+    most_open = np.nanmin(conformer_means)
+    return float(R_KCAL * temperature * most_open)
+# --------------------------------------------------------------------------- #
+# bundled example data (a 50-conformer leakage-free ubiquitin ensemble)
+# --------------------------------------------------------------------------- #
+def example_ensemble_path() -> str:
+    """Return the location of the bundled 50-conformer ubiquitin ensemble.
+    The file is a gzipped multi-model PDB shipped inside the ``hxprobe`` package.
+    """
+    package_root = resources.files("hxprobe")
+    bundled = package_root.joinpath("data/ubiquitin_ensemble.pdb.gz")
+    return str(bundled)
+def load_experimental(path: Optional[str] = None) -> dict:
+    """Load experimental per-residue opening free energies as ``{resSeq: dG}``.
+    With no argument, returns the bundled ubiquitin native-state HX dataset.
+    The CSV must have a header row with the columns ``resi`` (integer residue
+    number) and ``dGopen_kcal`` (float); other columns are ignored.
+    """
+    import csv
+    if path is None:
+        path = str(resources.files("hxprobe").joinpath("data/ubiquitin_dgopen.csv"))
+    # ``newline=""`` is what the csv module requires of its input file, and
+    # ``utf-8-sig`` drops a leading byte-order mark that would otherwise turn
+    # the first header into "\ufeffresi" and break the lookup below.
+    with open(path, newline="", encoding="utf-8-sig") as fh:
+        return {int(row["resi"]): float(row["dGopen_kcal"])
+                for row in csv.DictReader(fh)}

hxprobe/protonate.py ADDED Viewed

@@ -0,0 +1,70 @@
+"""Protonation / repair backends.
+The contact term ``N_C`` needs no hydrogens, but the hydrogen-bond term
+``N_H`` needs a backbone amide hydrogen.  Three options are available:
+* explicit hydrogens already in the ensemble -- used directly;
+* geometric placement inside the operator (NumPy only, no extra deps);
+* PDBFixer/OpenMM repair + real hydrogen addition (``hxprobe[fix]``), which
+  also rebuilds missing heavy atoms and is the most faithful for raw crystal
+  or heavy-atom generated structures.
+"""
+from __future__ import annotations
+# Atom names accepted as a backbone amide hydrogen (same tuple as in operator.py).
+_AMIDE_H_NAMES = ("H", "HN", "H1", "HT1")
+def has_explicit_hydrogens(traj) -> bool:
+    """True if the topology contains backbone amide hydrogens.
+    An atom counts when its name is one of ``_AMIDE_H_NAMES``, its element is
+    hydrogen and it does not belong to a water residue.  Water hydrogens are
+    commonly named ``H1``/``H2`` and would otherwise make a heavy-atom-only
+    protein in explicit solvent look protonated.
+    """
+    return any(
+        a.name in _AMIDE_H_NAMES
+        and a.element is not None
+        and a.element.symbol == "H"
+        and not a.residue.is_water
+        for a in traj.topology.atoms
+    )
+def _fix_one_frame(traj1, add_missing_atoms=True, ph=7.0):
+    """Repair and protonate one frame with PDBFixer via a PDB round trip.
+    The frame is written to a temporary PDB, processed by PDBFixer (missing
+    atoms added unless ``add_missing_atoms`` is false, hydrogens added for pH
+    ``ph``; whole missing residues are never modelled), written back out and
+    reloaded with MDTraj.  Returns the reloaded single-frame trajectory.
+    """
+    import os
+    import tempfile
+    import mdtraj as md
+    from pdbfixer import PDBFixer
+    from openmm.app import PDBFile
+    # Prefer the in-memory filesystem when it exists.
+    shm = "/dev/shm" if os.path.isdir("/dev/shm") else None
+    # A private temporary directory replaces ``tempfile.mktemp`` (race-prone
+    # and deprecated) and is removed even if any step below raises.
+    with tempfile.TemporaryDirectory(dir=shm) as workdir:
+        tmp = os.path.join(workdir, "input.pdb")
+        out = os.path.join(workdir, "fixed.pdb")
+        traj1.save_pdb(tmp)
+        fixer = PDBFixer(filename=tmp)
+        fixer.findMissingResidues()
+        fixer.missingResidues = {}              # do not model whole missing residues
+        fixer.findNonstandardResidues()
+        fixer.findMissingAtoms()
+        if not add_missing_atoms:
+            fixer.missingAtoms = {}
+            fixer.missingTerminals = {}
+        fixer.addMissingAtoms()
+        fixer.addMissingHydrogens(ph)
+        with open(out, "w") as fh:
+            PDBFile.writeFile(fixer.topology, fixer.positions, fh)
+        # Load while the file still exists; the directory goes away on exit.
+        return md.load(out)
+def pdbfixer_protonate(traj, add_missing_atoms=True, ph=7.0):
+    """Repair + protonate every frame with PDBFixer; return a single trajectory.
+    Requires the ``hxprobe[fix]`` extra (``pdbfixer``, ``openmm``).  Frames whose
+    repaired atom count differs from the modal count are dropped so the result
+    can be stacked into one trajectory; a ``RuntimeWarning`` reports how many
+    were dropped.  The topology of the first kept frame is used for all frames.
+    Raises
+    ------
+    ValueError
+        If ``traj`` has no frames.
+    """
+    import warnings
+    from collections import Counter
+    import numpy as np
+    import mdtraj as md
+    if traj.n_frames == 0:
+        raise ValueError("cannot protonate an empty trajectory (0 frames)")
+    fixed = [_fix_one_frame(traj[k], add_missing_atoms, ph)
+             for k in range(traj.n_frames)]
+    # Most common atom count; ties go to the count seen first.
+    modal = Counter(f.n_atoms for f in fixed).most_common(1)[0][0]
+    keep = [f for f in fixed if f.n_atoms == modal]
+    n_dropped = len(fixed) - len(keep)
+    if n_dropped:
+        # Dropping frames changes the ensemble, so do not do it silently.
+        warnings.warn(
+            f"pdbfixer_protonate dropped {n_dropped} of {len(fixed)} frames whose "
+            f"repaired atom count differs from the modal count ({modal})",
+            RuntimeWarning, stacklevel=2)
+    xyz = np.concatenate([f.xyz for f in keep], axis=0)
+    return md.Trajectory(xyz, keep[0].topology)

hxprobe-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,165 @@
+Metadata-Version: 2.4
+Name: hxprobe
+Version: 0.1.0
+Summary: The 50-conformer probe: residue-resolved hydrogen-exchange opening free energies from conformational ensembles
+Author: hxprobe authors
+License: MIT
+Project-URL: Homepage, https://github.com/woshuizhaol/hxprobe
+Project-URL: Source, https://github.com/woshuizhaol/hxprobe
+Project-URL: Issues, https://github.com/woshuizhaol/hxprobe/issues
+Keywords: hydrogen-deuterium exchange,protection factor,conformational ensemble,generative models,protein dynamics,free energy,structural biology
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Classifier: Topic :: Scientific/Engineering :: Chemistry
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy>=1.21
+Requires-Dist: pandas>=1.3
+Requires-Dist: mdtraj>=1.9
+Provides-Extra: fix
+Requires-Dist: pdbfixer>=1.8; extra == "fix"
+Requires-Dist: openmm>=7.6; extra == "fix"
+Provides-Extra: diff
+Requires-Dist: torch>=1.10; extra == "diff"
+Provides-Extra: dev
+Requires-Dist: pytest>=7; extra == "dev"
+Requires-Dist: build; extra == "dev"
+Requires-Dist: twine; extra == "dev"
+Dynamic: license-file
+# hxprobe — the 50-conformer probe
+**Read residue-resolved hydrogen-exchange opening free energies out of a conformational ensemble.**
+`hxprobe` turns a conformational ensemble (from a generative model, molecular
+dynamics, or any source) into per-residue protection factors and opening free
+energies (ΔG_open, in kcal/mol) using a white-box, two-parameter physical
+operator. It is an inexpensive, physically interpretable probe of how well an
+ensemble reproduces the *near-equilibrium local opening* that hydrogen–deuterium
+exchange measures — and it converges within roughly **50 conformers**, which is
+where the name comes from.
+```
+ln PF_i = β_C · ⟨N_C,i⟩ + β_H · ⟨N_H,i⟩            (ensemble average)
+ΔG_open,i = RT · ln PF_i                            (EX2 regime)
+```
+`N_C` counts heavy atoms near each backbone amide nitrogen and `N_H` counts the
+amide's backbone hydrogen bonds, averaged over the ensemble. The two
+coefficients are **fixed to their classical Best–Vendruscolo values** (0.35 and
+2.0) and are *not* fitted to stability data, so any residue-level signal the
+probe recovers comes from the ensemble, not from a tuned scoring function.
+## Install
+```bash
+pip install hxprobe
+```
+This pulls in `numpy`, `pandas`, and `mdtraj`. Two optional extras:
+```bash
+pip install "hxprobe[fix]"   # PDBFixer/OpenMM: repair + protonate raw structures
+pip install "hxprobe[diff]"  # PyTorch: differentiable operator for steering
+```
+## Quickstart
+```python
+import hxprobe
+# Score the bundled 50-conformer ubiquitin ensemble (already protonated).
+res = hxprobe.score_ensemble(hxprobe.example_ensemble_path(), protonate="none")
+print(res.to_dataframe().head())          # resSeq, resn, NC_mean, NH_mean, lnPF, dGopen_kcal
+# Compare to experimental native-state HX opening free energies.
+exp = hxprobe.load_experimental()          # {resSeq: dG_open}
+ref = [exp.get(int(r), float("nan")) for r in res.resSeq]
+print("Spearman vs experiment:", round(hxprobe.spearman(res.lnPF, ref), 3))
+```
+Score *your own* ensemble — a multi-model PDB, or a trajectory plus topology:
+```python
+res = hxprobe.score_ensemble("my_ensemble.pdb")               # multi-model PDB
+res = hxprobe.score_ensemble("traj.xtc", top="topology.pdb")  # trajectory + topology
+res.to_csv("opening_free_energies.csv")
+```
+If your structures are raw heavy-atom coordinates without hydrogens, the H-bond
+term is obtained either by a geometric amide-H placement (default, no extra
+dependencies) or, more faithfully, with PDBFixer:
+```python
+res = hxprobe.score_ensemble("raw_heavy_atom.pdb", protonate="pdbfixer")  # needs hxprobe[fix]
+```
+## Command line
+```bash
+hxprobe example                       # run the bundled ubiquitin demo
+hxprobe score my_ensemble.pdb         # print the per-residue table
+hxprobe score traj.xtc --top top.pdb --out dG.csv
+hxprobe converge my_ensemble.pdb      # show convergence with ensemble size
+```
+## What you get back
+`score_ensemble` returns a `ProtectionResult` holding NumPy arrays, with
+`.to_dataframe()` and `.to_csv()` helpers:
+| field | meaning |
+|---|---|
+| `resSeq`, `resn` | residue number and one-letter code |
+| `NC_mean`, `NH_mean` | ensemble-averaged contacts / hydrogen bonds |
+| `lnPF`, `log10PF` | log protection factor |
+| `dGopen_kcal` | opening free energy ΔG_open (kcal/mol) |
+Two further entry points:
+* **`convergence(ensemble)`** — Spearman correlation of the `n`-conformer
+  readout against the full-ensemble readout (and, optionally, against an
+  experimental reference), showing the plateau near ~50 conformers.
+* **`global_unfolding(ensemble)`** — `RT · min_c ⟨ln PF⟩_residues`, the
+  protection of the most-open conformer, a bounded proxy for global fold
+  stability (the unfolded-state limit of the ensemble).
+## How it works
+For each backbone amide (prolines and non-standard residues are skipped):
+* **`N_C`** — heavy atoms within **6.5 Å** of the amide nitrogen, sequence
+  separation `|i − j| ≥ 3`, hydrogens excluded.
+* **`N_H`** — backbone carbonyl oxygens within **2.6 Å** of the amide hydrogen,
+  sequence separation `|i − j| ≥ 2`.
+Counts are computed per conformer and **averaged over the ensemble before** the
+linear combination is formed, so a residue that is buried in most conformers but
+exposed in a rare open state receives a correspondingly reduced mean contact
+count, and therefore a lower predicted protection. The contact term dominates,
+so the readout is robust even when hydrogens are placed geometrically rather
+than with a full protonation step.
+## Reproducing the bundled example
+`hxprobe example` scores a 50-conformer leakage-free ubiquitin ensemble and
+recovers the experimental native-state opening free energies at Spearman
+ρ ≈ 0.58, with the correlation plateauing by ~25–50 conformers — the behaviour
+that motivates the probe.
+## Citing
+If you use `hxprobe`, please cite the accompanying study on residue-resolved
+hydrogen-exchange free energies as a benchmark for generative conformational
+ensembles. (Reference to be added on publication.)
+## License
+MIT.

hxprobe-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+hxprobe/__init__.py,sha256=3eGKz1beI297aF91CGuJovNlJOFNAIZXgmLenGQggD8,929
+hxprobe/cli.py,sha256=QPo_Z8CLKI6b4Gb8epBK6rAhECEgrEO5IEQQBrRWRMI,3568
+hxprobe/diff.py,sha256=LEXvAX1SHNDQrDS0W0e-I9JW6cwh_BJ0Va7fpyTmoIw,2737
+hxprobe/ensemble.py,sha256=P7xKxjbSHRD6gPKDIE3ocJfUS1VqN5UEsCfqA_LlO0w,1908
+hxprobe/operator.py,sha256=-xBF-H3aKamPgAFKx46_pCUypq7xQeWjj-o5_vRR56A,9298
+hxprobe/probe.py,sha256=5CxjViokZVGLH1hZq5JdXRIc5OGzv6_bUxxNwi1pYSA,6575
+hxprobe/protonate.py,sha256=ahNldxZP_92Vt44vaGiB5jN-doN6LVyhKMw97pu8xNk,2533
+hxprobe/data/ubiquitin_dgopen.csv,sha256=8S53ApPbBjwqiEc4jjJZvhIQBvaO7ByvifOZ6YCnowo,513
+hxprobe/data/ubiquitin_ensemble.pdb.gz,sha256=p1Q0iIK-2_gwz5ydbs3-_RkW1lT28vmAbb0Dn2TaKsA,992264
+hxprobe-0.1.0.dist-info/licenses/LICENSE,sha256=EMVzXBEI7U0YNZRIGU2A73z5wMbPNz4AOBPNHEnMrw0,1072
+hxprobe-0.1.0.dist-info/METADATA,sha256=9tbagX6n0HhKrIkXyC26h_3ZOzjeek-xMx2d0MwxICs,6622
+hxprobe-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+hxprobe-0.1.0.dist-info/entry_points.txt,sha256=nCLOTyzpNlgf2UoTEdGFfhJhc1Ad7YPDuEVs5Nj0gb4,45
+hxprobe-0.1.0.dist-info/top_level.txt,sha256=zk2Sk2LzTFrZMHcer6AVGPoGOWKK2wxm22H1vdrcZv0,8
+hxprobe-0.1.0.dist-info/RECORD,,

hxprobe-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

hxprobe-0.1.0.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ hxprobe = hxprobe.cli:main

hxprobe-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 hxprobe authors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

hxprobe-0.1.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ hxprobe