mhcmatch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mhcmatch/__init__.py ADDED
@@ -0,0 +1,44 @@
1
+ """mhcmatch: peptide-MHC presentation, cross-reactivity, and motif tools on the seqtree substrate.
2
+
3
+ - :class:`Store` -- MHC restriction / presentation prediction, protein scanning, anchor/TCR-facing
4
+ decomposition, from a reference epitope panel (isalgo/pmhc_data).
5
+ - :mod:`search` -- large-scale similarity search (TCR-facing recognition vs same-MHC presentation)
6
+ and neoantigen molecular mimicry (:func:`search.find_mimics`).
7
+ - :class:`Proteome` -- near-exact source-peptide lookup (neoantigen -> parent protein).
8
+ - :class:`Pseudoseq` -- pseudosequence allele similarity & cross-allele diffusion (rare-allele rescue).
9
+ - :func:`logo.motif` -- per-allele motif logos + length distributions.
10
+
11
+ Theory: ``appendix/mhcmatch.tex``. Roadmap: ``ROADMAP.md``.
12
+ """
13
+ from importlib.metadata import PackageNotFoundError, version as _version
14
+
15
+ from . import logo, search
16
+ from .diffusion import AnchorModel
17
+ from .proteome import Proteome, SourceHit
18
+ from .pseudoseq import (Pseudoseq, learn_anchor_weights, load_pseudo, normalize_allele,
19
+ resolve_allele)
20
+ from .store import Decomposition, Restriction, Store, anchor_indices, infer_class
21
+
22
# Names re-exported as the package's public API (classes, helper functions and the
# ``search`` / ``logo`` submodules imported above, plus ``__version__``).
__all__ = [
    "Store",
    "Restriction",
    "Decomposition",
    "infer_class",
    "anchor_indices",
    "search",
    "AnchorModel",
    "Proteome",
    "SourceHit",
    "Pseudoseq",
    "learn_anchor_weights",
    "load_pseudo",
    "normalize_allele",
    "resolve_allele",
    "logo",
    "__version__",
]

# Ask the installed distribution metadata for the version first.
try:
    __version__ = _version("mhcmatch")
except PackageNotFoundError:  # running from a source tree without an install
    # NOTE(review): literal fallback -- presumably has to be bumped by hand together
    # with the packaged version; confirm against the build configuration.
    __version__ = "0.1.0"
mhcmatch/cli.py ADDED
@@ -0,0 +1,148 @@
1
+ """Command-line interface for mhcmatch: ``mhcmatch <command> ...``.
2
+
3
+ Commands: ``decompose`` (no data needed), ``restriction``, ``scan``, ``logo`` (need a pmhc_data
4
+ table via ``--pmhc`` or ``$MHCMATCH_PMHC``), and ``source`` (needs a proteome FASTA).
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import os
10
+
11
+ from . import Proteome, Store
12
+
13
+
14
def _add_store_opts(p):
    """Register the ``--pmhc``, ``--tier`` and ``--species`` options on parser ``p``.

    These are the options ``_store`` later reads to build the reference store, so every
    store-backed subcommand attaches them through this helper.
    """
    p.add_argument("--pmhc", help="pmhc_data TSV(.gz); else $MHCMATCH_PMHC/pmhc_<tier>.tsv.gz")
    # (flag, default, permitted values) for the two closed-choice options, in help order.
    choice_options = (
        ("--tier", "full", ("full", "shortlist")),
        ("--species", "human", ("human", "mouse")),
    )
    for flag, default, allowed in choice_options:
        p.add_argument(flag, default=default, choices=allowed)
18
+
19
+
20
def _store(a):
    """Build a ``Store`` with ``Store.from_pmhc`` from the parsed store options.

    ``a`` is the argparse namespace; its ``pmhc`` value is passed positionally and
    ``tier`` / ``species`` as keywords.
    """
    table = a.pmhc
    return Store.from_pmhc(table, tier=a.tier, species=a.species)
22
+
23
+
24
def _read_seq(arg):
    """Return a sequence from ``arg``: a raw sequence, or the path of a FASTA file.

    When ``arg`` is an existing path, the file is parsed with ``read_fasta`` and the values
    of the returned mapping are concatenated. If the path does not exist, or parsing yields
    nothing, ``arg`` itself is returned with surrounding whitespace stripped.
    """
    if not os.path.exists(arg):
        return arg.strip()
    from .proteome import read_fasta  # imported lazily: only needed for file input
    records = read_fasta(arg)
    if not records:
        # Existing path without any parsed record: fall back to treating arg as a sequence.
        return arg.strip()
    return "".join(records.values())
32
+
33
+
34
def cmd_decompose(a):
    """Print the decomposition of ``a.peptide`` as four labelled lines.

    Uses a default-constructed ``Store`` and calls its ``decompose`` with the optional
    class from ``a.cls``. Anchor indices are printed 1-based and comma-separated.
    """
    d = Store().decompose(a.peptide, cls=a.cls)
    rows = (
        ("peptide", d.peptide),
        ("anchors", ",".join(str(idx + 1) for idx in d.anchors)),
        ("tcr_facing", d.tcr_facing),
        ("presentation", d.presentation),
    )
    for label, value in rows:
        print(f"{label} {value}")
40
+
41
+
42
def _resolve_panel_allele(store, name, cls):
    """Map a user-typed allele to a panel allele (exact, else prefix on punctuation-stripped name).

    Args:
        store: object whose ``alleles(cls)`` returns the panel alleles of one class.
        name: allele as typed by the user.
        cls: ``"mhc1"`` / ``"mhc2"`` to search one class, or a falsy value to search both.

    Returns:
        ``name`` if it is in the panel; otherwise the alphabetically first panel allele
        whose name, with ``*`` and ``:`` removed, starts with the equally stripped ``name``;
        otherwise ``name`` unchanged. A ``# ...`` note is printed whenever the exact
        lookup fails.
    """
    pool = (cls,) if cls else ("mhc1", "mhc2")
    panel = {al for c in pool for al in store.alleles(c)}
    if name in panel:
        return name
    strip_punct = str.maketrans("", "", "*:")
    key = name.translate(strip_punct)
    # An empty key (name was "" or only '*' / ':') is a prefix of every allele; without
    # this guard it would silently resolve to the alphabetically first panel allele.
    hits = sorted(al for al in panel if al.translate(strip_punct).startswith(key)) if key else []
    if hits:
        print(f"# resolved '{name}' -> '{hits[0]}'")
        return hits[0]
    print(f"# allele '{name}' not found in panel")
    return name
55
+
56
+
57
def cmd_restriction(a):
    """Rank presenting alleles for ``a.peptide`` and print them as a fixed-width table.

    A supplied ``--allele`` is first mapped onto the panel with ``_resolve_panel_allele``
    and the query is limited to it; otherwise ``"all"`` is passed. With ``--diffuse`` an
    extra ``score`` column is printed (a missing anchor score is shown as 0.00).
    """
    store = _store(a)
    allele = None
    if a.allele:
        allele = _resolve_panel_allele(store, a.allele, a.cls)
    wanted = [allele] if allele else "all"
    res = store.restriction(a.peptide, cls=a.cls, alleles=wanted, top=a.top, diffuse=a.diffuse)
    if not res:
        print("no presenting allele (no presentation-signature neighbours)")
        return

    header = f"{'allele':<18}{'vote':>7}{'enr':>7}"
    if a.diffuse:
        header += "{:>8}".format("score")
    print(header + f"{'binder':>8}")

    for r in res:
        cells = [f"{r.allele:<18}", f"{r.vote:>7.2f}", f"{r.enrichment:>7.1f}"]
        if a.diffuse:
            cells.append(f"{(r.anchor_score or 0.0):>8.2f}")
        cells.append(f"{'yes' if r.binder else 'no':>8}")
        print("".join(cells))
72
+
73
+
74
def cmd_scan(a):
    """Scan a protein for presented windows and print one line per hit.

    ``a.protein`` is read through ``_read_seq`` (raw sequence or FASTA path). The class
    defaults to ``"mhc1"`` when ``--cls`` is not given. A supplied ``--allele`` is mapped
    onto the panel with ``_resolve_panel_allele`` before scanning, so the flag accepts the
    same shorthand here as it does for ``restriction``; without it ``"all"`` is passed.
    Output is a ``#`` summary line followed by position, peptide and the comma-separated
    alleles of each hit.
    """
    store = _store(a)
    cls = a.cls or "mhc1"
    # Resolve within the class actually being scanned.
    alleles = [_resolve_panel_allele(store, a.allele, cls)] if a.allele else "all"
    hits = store.scan_protein(_read_seq(a.protein), cls=cls, alleles=alleles, top=a.top,
                              correction=a.correction)
    label = f" ({a.correction} FWER/FDR)" if a.correction else ""
    print(f"# {len(hits)} presented window(s){label}")
    for pos, pep, binders in hits:
        print(f"{pos:>5} {pep:<14} {','.join(b.allele for b in binders)}")
82
+
83
+
84
def cmd_source(a):
    """Look up the source peptide(s) of ``a.peptide`` in a proteome and print them.

    The proteome is loaded with ``Proteome.from_fasta(a.proteome)`` and queried with
    ``find_source`` using ``a.max_subs``. Each hit is printed as one tab-separated line;
    its mutations are rendered as ``<q><1-based index><r>`` joined by commas, or ``exact``
    when the hit has none.
    """
    proteome = Proteome.from_fasta(a.proteome)
    hits = proteome.find_source(a.peptide, max_subs=a.max_subs)
    if not hits:
        print("# no source within max_subs")
        return
    for h in hits:
        subs = [f"{q}{i + 1}{r}" for i, q, r in h.mutations]
        muts = ",".join(subs) if subs else "exact"
        print(f"{h.protein}\tpos {h.position}\tsubs {h.n_subs}\t{h.ref_peptide}\t{muts}")
92
+
93
+
94
def cmd_logo(a):
    """Print the motif of one allele: a summary line, then the top residues per position.

    The class defaults to ``"mhc1"`` when ``--cls`` is not given. The allele is mapped
    onto the panel with ``_resolve_panel_allele`` before the motif is built, so it accepts
    the same shorthand as ``restriction --allele``; the summary line reports the allele
    actually used. For every position the information content and the three most probable
    residues are printed.
    """
    from . import logo
    store = _store(a)
    cls = a.cls or "mhc1"
    allele = _resolve_panel_allele(store, a.allele, cls)
    m = logo.motif(store, allele, cls)
    print(f"# {allele} width={m['width']} n={m['n']} lengths={dict(sorted(m['length_hist'].items()))}")
    for i, (bits, col) in enumerate(zip(m["bits"], m["pwm"]), 1):
        # Stable sort by descending probability; ties keep the column's own order.
        top = sorted(col.items(), key=lambda x: -x[1])[:3]
        print(f" pos {i:>2} {bits:4.2f} bits " + " ".join(f"{aa}:{p:.2f}" for aa, p in top))
101
+
102
+
103
def main(argv=None):
    """Parse ``argv`` and dispatch to the selected subcommand handler.

    Defines the ``decompose``, ``restriction``, ``scan``, ``source`` and ``logo``
    subcommands; each one stores its handler as ``fn`` on the parsed namespace, which is
    then called with that namespace. ``argv=None`` is handed to argparse unchanged.
    """
    classes = ("mhc1", "mhc2")
    parser = argparse.ArgumentParser(prog="mhcmatch", description="peptide-MHC presentation tools")
    commands = parser.add_subparsers(dest="cmd", required=True)

    decompose = commands.add_parser(
        "decompose", help="split a peptide into anchor / TCR-facing parts (X masks)")
    decompose.add_argument("peptide")
    decompose.add_argument("--cls", choices=classes)
    decompose.set_defaults(fn=cmd_decompose)

    restriction = commands.add_parser("restriction", help="rank presenting alleles for a peptide")
    restriction.add_argument("peptide")
    restriction.add_argument("--allele", help="restrict to a single allele")
    restriction.add_argument("--cls", choices=classes)
    restriction.add_argument("--diffuse", action="store_true",
                             help="rare-allele-aware (diffusion-shrunk anchors)")
    restriction.add_argument("--top", type=int, default=10)
    _add_store_opts(restriction)
    restriction.set_defaults(fn=cmd_restriction)

    scan = commands.add_parser(
        "scan", help="find presented peptides in a protein (sequence or FASTA path)")
    scan.add_argument("protein")
    scan.add_argument("--allele")
    scan.add_argument("--cls", choices=classes)
    scan.add_argument("--top", type=int, default=3)
    scan.add_argument("--correction", choices=("bonferroni", "bh"),
                      help="multiple-testing control over windows x alleles (FWER / BH-FDR)")
    _add_store_opts(scan)
    scan.set_defaults(fn=cmd_scan)

    source = commands.add_parser("source", help="find the self peptide a neoantigen derives from")
    source.add_argument("peptide")
    source.add_argument("--proteome", required=True, help="reference proteome FASTA(.gz)")
    source.add_argument("--max-subs", type=int, default=1)
    source.set_defaults(fn=cmd_source)

    logo_cmd = commands.add_parser(
        "logo", help="motif logo (information content) + length distribution")
    logo_cmd.add_argument("allele")
    logo_cmd.add_argument("--cls", choices=classes)
    _add_store_opts(logo_cmd)
    logo_cmd.set_defaults(fn=cmd_logo)

    args = parser.parse_args(argv)
    args.fn(args)


if __name__ == "__main__":
    main()
@@ -0,0 +1,34 @@
1
+ # Vendored data provenance
2
+
3
+ ## `mhci_pseudo.fa` / `mhcii_pseudo.fa`
4
+
5
+ NetMHCpan-style **34-residue MHC pseudosequences** — the polymorphic groove positions that contact
6
+ the peptide (class I: α1/α2 of the MHC heavy chain; class II: α1 of the α-chain + β1 of the
7
+ β-chain). Header format `>ALLELE|n=<count>`; `X` marks an ambiguous/unresolved position.
8
+
9
+ - **MHC-I:** 4143 alleles (human HLA-A/B/C, mouse H-2, and other species).
10
+ - **MHC-II:** 2209 alleles (human HLA-DR/DQ/DP, mouse H-2 I-A/I-E, others).
11
+
12
+ Copied verbatim from the sibling `antigenomics/tcren` repository
13
+ (`tcren-ms/src/tcren/data/{mhci,mhcii}_pseudo.fa`, built by its `scripts/build_pseudo_fasta.py`),
14
+ which derives them from the NetMHCpan pseudosequence definition. Used by `mhcmatch.pseudoseq` as the
15
+ allele-similarity alphabet for the cross-allele diffusion model (see `appendix/mhcmatch.tex` §4).
16
+
17
+ These files are static reference data and small (~340 KB total), so they are vendored rather than
18
+ fetched. Re-sync from `tcren` if the pseudosequence definition is updated upstream.
19
+
20
+ ## `structural_pockets_mhc1.tsv` / `structural_pockets_mhc2.tsv`
21
+
22
+ Per-anchor **structural pocket weights**: for each peptide anchor (MHC-I P1/P2/P3/PΩ-1/PΩ; MHC-II
23
+ P1/P4/P6/P9), the frequency with which each of the 34 groove pseudosequence positions makes a
24
+ heavy-atom contact (<5 Å) with that anchor residue, measured over pMHC crystal structures. Used as a
25
+ data-independent alternative to the learned-MI groove weights via `AnchorModel(weights="structural")`.
26
+
27
+ Measured by `bench/structural_pockets.py` from the **Canonical2026** TCR:pMHC structure set
28
+ (`antigenomics/tcren`, 372 usable structures): the 34-mer pseudosequence is threaded onto each groove
29
+ with tcren's C++ fitting aligner (`tcren._align`; no mmseqs/arda). Class per structure is assigned by
30
+ best pseudosequence fit (MHC-I single chain vs MHC-II α1+β1 chain-pair), giving **279 MHC-I** and
31
+ **93 MHC-II** structures. Regenerate with:
32
+
33
+ conda run -n tcren-nb python bench/structural_pockets.py \
34
+ --structures ../tcren-ms/data/Canonical2026 --out src/mhcmatch/data