mhcmatch 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mhcmatch/__init__.py +44 -0
- mhcmatch/cli.py +148 -0
- mhcmatch/data/PROVENANCE.md +34 -0
- mhcmatch/data/mhci_pseudo.fa +8286 -0
- mhcmatch/data/mhcii_pseudo.fa +4418 -0
- mhcmatch/data/structural_pockets_mhc1.tsv +6 -0
- mhcmatch/data/structural_pockets_mhc2.tsv +5 -0
- mhcmatch/diffusion.py +127 -0
- mhcmatch/logo.py +65 -0
- mhcmatch/proteome.py +89 -0
- mhcmatch/pseudoseq.py +300 -0
- mhcmatch/search.py +50 -0
- mhcmatch/store.py +356 -0
- mhcmatch-0.1.0.dist-info/METADATA +128 -0
- mhcmatch-0.1.0.dist-info/RECORD +18 -0
- mhcmatch-0.1.0.dist-info/WHEEL +4 -0
- mhcmatch-0.1.0.dist-info/entry_points.txt +2 -0
- mhcmatch-0.1.0.dist-info/licenses/LICENSE +674 -0
mhcmatch/__init__.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""mhcmatch: peptide-MHC presentation, cross-reactivity, and motif tools on the seqtree substrate.
|
|
2
|
+
|
|
3
|
+
- :class:`Store` -- MHC restriction / presentation prediction, protein scanning, anchor/TCR-facing
|
|
4
|
+
decomposition, from a reference epitope panel (isalgo/pmhc_data).
|
|
5
|
+
- :mod:`search` -- large-scale similarity search (TCR-facing recognition vs same-MHC presentation)
|
|
6
|
+
and neoantigen molecular mimicry (:func:`search.find_mimics`).
|
|
7
|
+
- :class:`Proteome` -- near-exact source-peptide lookup (neoantigen -> parent protein).
|
|
8
|
+
- :class:`Pseudoseq` -- pseudosequence allele similarity & cross-allele diffusion (rare-allele rescue).
|
|
9
|
+
- :func:`logo.motif` -- per-allele motif logos + length distributions.
|
|
10
|
+
|
|
11
|
+
Theory: ``appendix/mhcmatch.tex``. Roadmap: ``ROADMAP.md``.
|
|
12
|
+
"""
|
|
13
|
+
from importlib.metadata import PackageNotFoundError, version as _version
|
|
14
|
+
|
|
15
|
+
from . import logo, search
|
|
16
|
+
from .diffusion import AnchorModel
|
|
17
|
+
from .proteome import Proteome, SourceHit
|
|
18
|
+
from .pseudoseq import (Pseudoseq, learn_anchor_weights, load_pseudo, normalize_allele,
|
|
19
|
+
resolve_allele)
|
|
20
|
+
from .store import Decomposition, Restriction, Store, anchor_indices, infer_class
|
|
21
|
+
|
|
22
|
+
# Public API of the package, listed in the order the names are imported above.
__all__ = [
    # from .store
    "Store",
    "Restriction",
    "Decomposition",
    "infer_class",
    "anchor_indices",
    # submodule
    "search",
    # from .diffusion
    "AnchorModel",
    # from .proteome
    "Proteome",
    "SourceHit",
    # from .pseudoseq
    "Pseudoseq",
    "learn_anchor_weights",
    "load_pseudo",
    "normalize_allele",
    "resolve_allele",
    # submodule
    "logo",
    # set at the bottom of this module
    "__version__",
]
|
|
40
|
+
|
|
41
|
+
# Prefer the version recorded in the installed distribution's metadata; when the package is run
# from a source tree without an install, fall back to the hard-coded release string.
try:
    _installed_version = _version("mhcmatch")
except PackageNotFoundError:
    __version__ = "0.1.0"
else:
    __version__ = _installed_version
|
mhcmatch/cli.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""Command-line interface for mhcmatch: ``mhcmatch <command> ...``.
|
|
2
|
+
|
|
3
|
+
Commands: ``decompose`` (no data needed), ``restriction``, ``scan``, ``logo`` (need a pmhc_data
|
|
4
|
+
table via ``--pmhc`` or ``$MHCMATCH_PMHC``), and ``source`` (needs a proteome FASTA).
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
from . import Proteome, Store
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _add_store_opts(p):
|
|
15
|
+
p.add_argument("--pmhc", help="pmhc_data TSV(.gz); else $MHCMATCH_PMHC/pmhc_<tier>.tsv.gz")
|
|
16
|
+
p.add_argument("--tier", default="full", choices=("full", "shortlist"))
|
|
17
|
+
p.add_argument("--species", default="human", choices=("human", "mouse"))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _store(a):
    """Build the :class:`Store` selected by the parsed ``--pmhc`` / ``--tier`` / ``--species``."""
    table = a.pmhc
    panel_opts = {"tier": a.tier, "species": a.species}
    return Store.from_pmhc(table, **panel_opts)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _read_seq(arg):
|
|
25
|
+
"""A raw sequence, or the concatenated sequences of a FASTA file path."""
|
|
26
|
+
if os.path.exists(arg):
|
|
27
|
+
from .proteome import read_fasta
|
|
28
|
+
seqs = read_fasta(arg)
|
|
29
|
+
if seqs:
|
|
30
|
+
return "".join(seqs.values())
|
|
31
|
+
return arg.strip()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def cmd_decompose(a):
    """``decompose``: print the anchor / TCR-facing split of ``a.peptide``.

    Uses a default-constructed :class:`Store`, so no panel table is loaded. Anchor indices are
    printed with 1 added to each (presumably 0-based index -> 1-based position).
    """
    parts = Store().decompose(a.peptide, cls=a.cls)
    print(f"peptide {parts.peptide}")
    one_based = [str(idx + 1) for idx in parts.anchors]
    print(f"anchors {','.join(one_based)}")
    print(f"tcr_facing {parts.tcr_facing}")
    print(f"presentation {parts.presentation}")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _resolve_panel_allele(store, name, cls):
|
|
43
|
+
"""Map a user-typed allele to a panel allele (exact, else prefix on punctuation-stripped name)."""
|
|
44
|
+
pool = (cls,) if cls else ("mhc1", "mhc2")
|
|
45
|
+
panel = {al for c in pool for al in store.alleles(c)}
|
|
46
|
+
if name in panel:
|
|
47
|
+
return name
|
|
48
|
+
key = name.replace("*", "").replace(":", "")
|
|
49
|
+
hits = sorted(a for a in panel if a.replace("*", "").replace(":", "").startswith(key))
|
|
50
|
+
if hits:
|
|
51
|
+
print(f"# resolved '{name}' -> '{hits[0]}'")
|
|
52
|
+
return hits[0]
|
|
53
|
+
print(f"# allele '{name}' not found in panel")
|
|
54
|
+
return name
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def cmd_restriction(a):
    """``restriction``: print a fixed-width table of presenting alleles for ``a.peptide``.

    ``--allele`` is first resolved against the panel and limits the query to that one allele;
    without it all alleles are queried. Columns are allele, vote, enr, binder, plus a score column
    (the row's ``anchor_score``, shown as 0.00 when it is falsy) only when ``--diffuse`` is set.
    """
    store = _store(a)
    allele = None
    if a.allele:
        allele = _resolve_panel_allele(store, a.allele, a.cls)
    wanted = [allele] if allele else "all"
    ranked = store.restriction(a.peptide, cls=a.cls, alleles=wanted, top=a.top,
                               diffuse=a.diffuse)
    if not ranked:
        print("no presenting allele (no presentation-signature neighbours)")
        return

    header = f"{'allele':<18}{'vote':>7}{'enr':>7}"
    if a.diffuse:
        header += "{:>8}".format("score")
    header += f"{'binder':>8}"
    print(header)

    for row in ranked:
        cells = [f"{row.allele:<18}{row.vote:>7.2f}{row.enrichment:>7.1f}"]
        if a.diffuse:
            cells.append(f"{(row.anchor_score or 0.0):>8.2f}")
        cells.append(f"{'yes' if row.binder else 'no':>8}")
        print("".join(cells))
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def cmd_scan(a):
    """``scan``: list the presented windows of a protein (raw sequence or FASTA path).

    The class defaults to ``"mhc1"`` when ``--cls`` is not given. ``--allele`` is resolved against
    the panel with ``_resolve_panel_allele`` -- the same lookup ``restriction`` applies -- so the
    same shorthand works in both sub-commands; without it all alleles are scanned.

    Output is a ``#`` summary line (noting the multiple-testing correction when one was requested)
    followed by one line per hit: position, peptide, comma-separated binding alleles.
    """
    store = _store(a)
    cls = a.cls or "mhc1"
    alleles = [_resolve_panel_allele(store, a.allele, cls)] if a.allele else "all"
    hits = store.scan_protein(_read_seq(a.protein), cls=cls, alleles=alleles, top=a.top,
                              correction=a.correction)
    label = f" ({a.correction} FWER/FDR)" if a.correction else ""
    print(f"# {len(hits)} presented window(s){label}")
    for pos, pep, binders in hits:
        print(f"{pos:>5} {pep:<14} {','.join(b.allele for b in binders)}")
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def cmd_source(a):
    """``source``: print the proteome peptides within ``--max-subs`` substitutions of ``a.peptide``.

    One tab-separated line per hit: protein, position, substitution count, reference peptide and
    the mutation list (``exact`` when the hit has no mutations). Each mutation ``(i, q, r)`` is
    written as ``q``, then ``i + 1``, then ``r``.
    """
    proteome = Proteome.from_fasta(a.proteome)
    matches = proteome.find_source(a.peptide, max_subs=a.max_subs)
    if not matches:
        print("# no source within max_subs")
        return
    for hit in matches:
        changes = [f"{q}{i + 1}{r}" for i, q, r in hit.mutations]
        muts = ",".join(changes) or "exact"
        print(f"{hit.protein}\tpos {hit.position}\tsubs {hit.n_subs}\t{hit.ref_peptide}\t{muts}")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def cmd_logo(a):
    """``logo``: print a text summary of the motif of ``a.allele`` (class defaults to ``"mhc1"``).

    A ``#`` header line gives the motif width, ``n`` and the length histogram sorted by key; then
    one line per position shows its ``bits`` value and the three highest-valued entries of that
    position's ``pwm`` column.
    """
    from . import logo

    motif = logo.motif(_store(a), a.allele, a.cls or "mhc1")
    width, n_total = motif["width"], motif["n"]
    lengths = dict(sorted(motif["length_hist"].items()))
    print(f"# {a.allele} width={width} n={n_total} lengths={lengths}")

    for position, (bits, column) in enumerate(zip(motif["bits"], motif["pwm"]), start=1):
        # Descending by value; ties keep the column's own iteration order.
        ranked = sorted(column.items(), key=lambda item: -item[1])
        summary = " ".join(f"{aa}:{p:.2f}" for aa, p in ranked[:3])
        print(f" pos {position:>2} {bits:4.2f} bits " + summary)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def main(argv=None):
    """Parse *argv* and run the selected sub-command.

    Args:
        argv: argument list without the program name; ``None`` lets argparse read ``sys.argv``.

    A sub-command is required. Each sub-parser stores its handler as ``fn``, which is then called
    with the parsed namespace.
    """
    parser = argparse.ArgumentParser(prog="mhcmatch", description="peptide-MHC presentation tools")
    commands = parser.add_subparsers(dest="cmd", required=True)
    mhc_classes = ("mhc1", "mhc2")

    # decompose
    decompose = commands.add_parser(
        "decompose", help="split a peptide into anchor / TCR-facing parts (X masks)")
    decompose.add_argument("peptide")
    decompose.add_argument("--cls", choices=mhc_classes)
    decompose.set_defaults(fn=cmd_decompose)

    # restriction
    restriction = commands.add_parser("restriction", help="rank presenting alleles for a peptide")
    restriction.add_argument("peptide")
    restriction.add_argument("--allele", help="restrict to a single allele")
    restriction.add_argument("--cls", choices=mhc_classes)
    restriction.add_argument("--diffuse", action="store_true",
                             help="rare-allele-aware (diffusion-shrunk anchors)")
    restriction.add_argument("--top", type=int, default=10)
    _add_store_opts(restriction)
    restriction.set_defaults(fn=cmd_restriction)

    # scan
    scan = commands.add_parser(
        "scan", help="find presented peptides in a protein (sequence or FASTA path)")
    scan.add_argument("protein")
    scan.add_argument("--allele")
    scan.add_argument("--cls", choices=mhc_classes)
    scan.add_argument("--top", type=int, default=3)
    scan.add_argument("--correction", choices=("bonferroni", "bh"),
                      help="multiple-testing control over windows x alleles (FWER / BH-FDR)")
    _add_store_opts(scan)
    scan.set_defaults(fn=cmd_scan)

    # source
    source = commands.add_parser("source", help="find the self peptide a neoantigen derives from")
    source.add_argument("peptide")
    source.add_argument("--proteome", required=True, help="reference proteome FASTA(.gz)")
    source.add_argument("--max-subs", type=int, default=1)
    source.set_defaults(fn=cmd_source)

    # logo
    logo_cmd = commands.add_parser(
        "logo", help="motif logo (information content) + length distribution")
    logo_cmd.add_argument("allele")
    logo_cmd.add_argument("--cls", choices=mhc_classes)
    _add_store_opts(logo_cmd)
    logo_cmd.set_defaults(fn=cmd_logo)

    args = parser.parse_args(argv)
    args.fn(args)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# Run the CLI when this module is executed rather than imported.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Vendored data provenance
|
|
2
|
+
|
|
3
|
+
## `mhci_pseudo.fa` / `mhcii_pseudo.fa`
|
|
4
|
+
|
|
5
|
+
NetMHCpan-style **34-residue MHC pseudosequences** — the polymorphic groove positions that contact
|
|
6
|
+
the peptide (class I: α1/α2 of the MHC heavy chain; class II: α1 of the α-chain + β1 of the
|
|
7
|
+
β-chain). Header format `>ALLELE|n=<count>`; `X` marks an ambiguous/unresolved position.
|
|
8
|
+
|
|
9
|
+
- **MHC-I:** 4143 alleles (human HLA-A/B/C, mouse H-2, and other species).
|
|
10
|
+
- **MHC-II:** 2209 alleles (human HLA-DR/DQ/DP, mouse H-2 I-A/I-E, others).
|
|
11
|
+
|
|
12
|
+
Copied verbatim from the sibling `antigenomics/tcren` repository
|
|
13
|
+
(`tcren-ms/src/tcren/data/{mhci,mhcii}_pseudo.fa`, built by its `scripts/build_pseudo_fasta.py`),
|
|
14
|
+
which derives them from the NetMHCpan pseudosequence definition. Used by `mhcmatch.pseudoseq` as the
|
|
15
|
+
allele-similarity alphabet for the cross-allele diffusion model (see `appendix/mhcmatch.tex` §4).
|
|
16
|
+
|
|
17
|
+
These files are static reference data and small (~340 KB total), so they are vendored rather than
|
|
18
|
+
fetched. Re-sync from `tcren` if the pseudosequence definition is updated upstream.
|
|
19
|
+
|
|
20
|
+
## `structural_pockets_mhc1.tsv` / `structural_pockets_mhc2.tsv`
|
|
21
|
+
|
|
22
|
+
Per-anchor **structural pocket weights**: for each peptide anchor (MHC-I P1/P2/P3/PΩ-1/PΩ; MHC-II
|
|
23
|
+
P1/P4/P6/P9), the frequency with which each of the 34 groove pseudosequence positions makes a
|
|
24
|
+
heavy-atom contact (<5 Å) with that anchor residue, measured over pMHC crystal structures. Used as a
|
|
25
|
+
data-independent alternative to the learned-MI groove weights via `AnchorModel(weights="structural")`.
|
|
26
|
+
|
|
27
|
+
Measured by `bench/structural_pockets.py` from the **Canonical2026** TCR:pMHC structure set
|
|
28
|
+
(`antigenomics/tcren`, 372 usable structures): the 34-mer pseudosequence is threaded onto each groove
|
|
29
|
+
with tcren's C++ fitting aligner (`tcren._align`; no mmseqs/arda). Class per structure is assigned by
|
|
30
|
+
best pseudosequence fit (MHC-I single chain vs MHC-II α1+β1 chain-pair), giving **279 MHC-I** and
|
|
31
|
+
**93 MHC-II** structures. Regenerate with:
|
|
32
|
+
|
|
33
|
+
conda run -n tcren-nb python bench/structural_pockets.py \
|
|
34
|
+
--structures ../tcren-ms/data/Canonical2026 --out src/mhcmatch/data
|