seqtree 0.0.2__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {seqtree-0.0.2 → seqtree-0.0.3}/CMakeLists.txt +4 -0
  2. {seqtree-0.0.2 → seqtree-0.0.3}/PKG-INFO +34 -9
  3. {seqtree-0.0.2 → seqtree-0.0.3}/README.md +31 -8
  4. seqtree-0.0.3/ROADMAP.md +172 -0
  5. {seqtree-0.0.2 → seqtree-0.0.3}/appendix/.gitignore +1 -1
  6. seqtree-0.0.3/appendix/.latexmkrc +2 -0
  7. seqtree-0.0.3/appendix/Makefile +17 -0
  8. seqtree-0.0.3/appendix/epitope_detection.pdf +0 -0
  9. seqtree-0.0.3/appendix/evalue.pdf +0 -0
  10. {seqtree-0.0.2 → seqtree-0.0.3}/appendix/evalue.tex +283 -31
  11. seqtree-0.0.3/appendix/evalue_matrix.pdf +0 -0
  12. seqtree-0.0.3/appendix/mhc1_rocpr.pdf +0 -0
  13. seqtree-0.0.3/appendix/mhc2_rocpr.pdf +0 -0
  14. {seqtree-0.0.2 → seqtree-0.0.3}/appendix/refs.bib +63 -0
  15. seqtree-0.0.3/include/seqtree/kmer_index.hpp +59 -0
  16. {seqtree-0.0.2 → seqtree-0.0.3}/include/seqtree/seqtree.hpp +29 -0
  17. {seqtree-0.0.2 → seqtree-0.0.3}/include/seqtree/types.hpp +8 -1
  18. {seqtree-0.0.2 → seqtree-0.0.3}/pyproject.toml +2 -1
  19. {seqtree-0.0.2 → seqtree-0.0.3}/python/seqtree/__init__.py +12 -0
  20. seqtree-0.0.3/python/seqtree/layout.py +131 -0
  21. seqtree-0.0.3/python/seqtree/pmhc.py +229 -0
  22. seqtree-0.0.3/python/seqtree/pmhc_evalue.py +33 -0
  23. {seqtree-0.0.2 → seqtree-0.0.3}/src/_bindings.cpp +94 -2
  24. {seqtree-0.0.2 → seqtree-0.0.3}/src/engine_seqtm.cpp +12 -6
  25. {seqtree-0.0.2 → seqtree-0.0.3}/src/engines.hpp +1 -0
  26. seqtree-0.0.3/src/kmer_index.cpp +219 -0
  27. seqtree-0.0.3/src/positional_matrix.cpp +42 -0
  28. {seqtree-0.0.2 → seqtree-0.0.3}/src/searcher.cpp +5 -3
  29. seqtree-0.0.2/appendix/Makefile +0 -7
  30. {seqtree-0.0.2 → seqtree-0.0.3}/.gitattributes +0 -0
  31. {seqtree-0.0.2 → seqtree-0.0.3}/.gitignore +0 -0
  32. {seqtree-0.0.2 → seqtree-0.0.3}/LICENSE +0 -0
  33. {seqtree-0.0.2 → seqtree-0.0.3}/python/seqtree/control.py +0 -0
  34. {seqtree-0.0.2 → seqtree-0.0.3}/python/seqtree/data/control_human_trb_aa.txt.gz +0 -0
  35. {seqtree-0.0.2 → seqtree-0.0.3}/python/seqtree/evalue.py +0 -0
  36. {seqtree-0.0.2 → seqtree-0.0.3}/python/seqtree/py.typed +0 -0
  37. {seqtree-0.0.2 → seqtree-0.0.3}/src/blosum62.inc +0 -0
  38. {seqtree-0.0.2 → seqtree-0.0.3}/src/codec.cpp +0 -0
  39. {seqtree-0.0.2 → seqtree-0.0.3}/src/engine_seqtrie.cpp +0 -0
  40. {seqtree-0.0.2 → seqtree-0.0.3}/src/index.cpp +0 -0
  41. {seqtree-0.0.2 → seqtree-0.0.3}/src/pam50.inc +0 -0
  42. {seqtree-0.0.2 → seqtree-0.0.3}/src/substitution_matrix.cpp +0 -0
  43. {seqtree-0.0.2 → seqtree-0.0.3}/src/trie.cpp +0 -0
  44. {seqtree-0.0.2 → seqtree-0.0.3}/src/trie.hpp +0 -0
@@ -20,6 +20,8 @@ find_package(Threads REQUIRED)
20
20
  add_library(seqtree_core STATIC
21
21
  src/codec.cpp
22
22
  src/substitution_matrix.cpp
23
+ src/positional_matrix.cpp
24
+ src/kmer_index.cpp
23
25
  src/trie.cpp
24
26
  src/index.cpp
25
27
  src/engine_seqtm.cpp
@@ -45,6 +47,8 @@ if(SEQTREE_TESTS)
45
47
  tests/cpp/test_engines.cpp
46
48
  tests/cpp/test_edge.cpp
47
49
  tests/cpp/test_serialize.cpp
50
+ tests/cpp/test_positional.cpp
51
+ tests/cpp/test_kmer_index.cpp
48
52
  )
49
53
  target_include_directories(seqtree_tests PRIVATE tests/cpp src)
50
54
  target_link_libraries(seqtree_tests PRIVATE seqtree_core)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: seqtree
3
- Version: 0.0.2
3
+ Version: 0.0.3
4
4
  Summary: Fast fuzzy search over biological sequences (C++ core, Python bindings)
5
5
  Keywords: sequence-search,fuzzy-matching,CDR3,immunology,bioinformatics,trie
6
6
  Author-Email: ISALGO laboratory <mikhail.shugay@gmail.com>
@@ -27,6 +27,8 @@ Requires-Dist: pydata-sphinx-theme; extra == "docs"
27
27
  Requires-Dist: nbsphinx; extra == "docs"
28
28
  Provides-Extra: control
29
29
  Requires-Dist: huggingface_hub; extra == "control"
30
+ Provides-Extra: pmhc
31
+ Requires-Dist: huggingface_hub; extra == "pmhc"
30
32
  Description-Content-Type: text/markdown
31
33
 
32
34
  # seqtree
@@ -54,12 +56,24 @@ Two search engines over one trie:
54
56
  `(ref_id, score, n_subs, n_ins, n_dels)`. Downstream libraries map `ref_id` back
55
57
  to their own payloads (V gene, MHC, counts) and filter.
56
58
 
59
+ Beyond search, seqtree ships:
60
+
61
+ - **Substitution matrices** — built-in `BLOSUM62` and `PAM50`, plus custom matrices via
62
+ `SubstitutionMatrix.from_similarity` (Gram-distance penalty `s(a,a)+s(b,b)−2·s(a,b)`).
63
+ - **E-values / significance** — calibrate hit counts against a background control repertoire
64
+ (`load_control` + `evalues`), the TCRNET approach on a finite-sample footing. See the
65
+ [E-value guide](https://antigenomics.github.io/seqtree/evalue.html).
66
+
57
67
  ## Install
58
68
 
59
69
  ```fish
60
- pip install seqtree # prebuilt wheels for CPython 3.10–3.13 (Linux/macOS/Windows)
70
+ pip install seqtree # prebuilt wheels for CPython 3.10–3.13
61
71
  ```
62
72
 
73
+ Prebuilt wheels cover **Linux x86-64**, **macOS arm64 (Apple Silicon)**, and **Windows x86-64**.
74
+ There are **no Intel/x86-64 macOS wheels** — Intel Macs build from source (see below), which just
75
+ needs a C++17 compiler; CMake is pulled in automatically by the build.
76
+
63
77
  ## Build from source
64
78
 
65
79
  ```fish
@@ -92,6 +106,13 @@ print(aln.aligned_query, aln.aligned_ref, aln.ops)
92
106
 
93
107
  # batch-vs-batch (auto-indexes the larger set)
94
108
  pairs = seqtree.pairwise_batch(query_set, db_set, p, alphabet="aa")
109
+
110
+ # E-values against a background control repertoire (TCRNET-style significance)
111
+ control = seqtree.load_control("human_trb_aa", size=1_000_000)
112
+ target = seqtree.Index.build(vdjdb_cdr3s, alphabet="aa")
113
+ for q, r in zip(queries, seqtree.evalues(target, control, queries, p)):
114
+ if r["p_enrichment"] < 1e-3:
115
+ print(q, r["E"], r["n_target"], r["n_control"])
95
116
  ```
96
117
 
97
118
  ## Tests
@@ -106,13 +127,16 @@ pytest tests/python # Python tests
106
127
  ## Benchmarks
107
128
 
108
129
  ```fish
109
- python bench/bench.py # recall vs ground truth (real VDJdb data)
110
- python bench/bench_gnuplot.py # max-edit-3 throughput SVG figures (needs gnuplot)
111
- env RUN_BENCHMARK=1 python bench/bench.py --sizes 1000000 --queries 1000000 --threads 16
130
+ python bench/bench_gnuplot.py # throughput / scaling / matrix / collisions → SVG (needs gnuplot)
131
+ python bench/bench.py # recall vs ground truth (real VDJdb data)
132
+ python bench/bench_evalue.py # true E-value benchmark (target vs background control)
133
+ python bench/bench_evalue_matrix.py # significance across reference/control/query/scope grid
134
+ python bench/bench_epitope.py # epitope detection-complexity (GIL vs NLV)
112
135
  ```
113
136
 
114
- `bench/bench_gnuplot.py` renders queries/ms vs reference-set size (both engines), peak RSS, and
115
- alignment-fetch cost. See [docs/benchmarks.rst](docs/benchmarks.rst).
137
+ Figures (throughput, scaling, matrix-scoring overhead, collisions, E-value matrix, epitope
138
+ detection) and the full methodology are in the [benchmarks docs](https://antigenomics.github.io/seqtree/benchmarks.html).
139
+ Set `RUN_BENCHMARK=1` for the large tiers.
116
140
 
117
141
  ## Development
118
142
 
@@ -122,5 +146,6 @@ This repo follows **git-flow**:
122
146
  - `dev` — integration branch for day-to-day work.
123
147
  - feature branches branch off `dev` and merge back via PR; releases merge `dev` → `master`.
124
148
 
125
- Roadmap (affine gaps, position-specific matrices, e-value / significance via
126
- control-set and tf-idf, succinct memory packing) lives in [docs/roadmap.rst](docs/roadmap.rst).
149
+ Roadmap (affine gaps, position-specific matrices, succinct memory packing) lives in
150
+ [docs/roadmap.rst](docs/roadmap.rst). Control-set E-values already ship — see the
151
+ [E-value guide](https://antigenomics.github.io/seqtree/evalue.html).
@@ -23,12 +23,24 @@ Two search engines over one trie:
23
23
  `(ref_id, score, n_subs, n_ins, n_dels)`. Downstream libraries map `ref_id` back
24
24
  to their own payloads (V gene, MHC, counts) and filter.
25
25
 
26
+ Beyond search, seqtree ships:
27
+
28
+ - **Substitution matrices** — built-in `BLOSUM62` and `PAM50`, plus custom matrices via
29
+ `SubstitutionMatrix.from_similarity` (Gram-distance penalty `s(a,a)+s(b,b)−2·s(a,b)`).
30
+ - **E-values / significance** — calibrate hit counts against a background control repertoire
31
+ (`load_control` + `evalues`), the TCRNET approach on a finite-sample footing. See the
32
+ [E-value guide](https://antigenomics.github.io/seqtree/evalue.html).
33
+
26
34
  ## Install
27
35
 
28
36
  ```fish
29
- pip install seqtree # prebuilt wheels for CPython 3.10–3.13 (Linux/macOS/Windows)
37
+ pip install seqtree # prebuilt wheels for CPython 3.10–3.13
30
38
  ```
31
39
 
40
+ Prebuilt wheels cover **Linux x86-64**, **macOS arm64 (Apple Silicon)**, and **Windows x86-64**.
41
+ There are **no Intel/x86-64 macOS wheels** — Intel Macs build from source (see below), which just
42
+ needs a C++17 compiler; CMake is pulled in automatically by the build.
43
+
32
44
  ## Build from source
33
45
 
34
46
  ```fish
@@ -61,6 +73,13 @@ print(aln.aligned_query, aln.aligned_ref, aln.ops)
61
73
 
62
74
  # batch-vs-batch (auto-indexes the larger set)
63
75
  pairs = seqtree.pairwise_batch(query_set, db_set, p, alphabet="aa")
76
+
77
+ # E-values against a background control repertoire (TCRNET-style significance)
78
+ control = seqtree.load_control("human_trb_aa", size=1_000_000)
79
+ target = seqtree.Index.build(vdjdb_cdr3s, alphabet="aa")
80
+ for q, r in zip(queries, seqtree.evalues(target, control, queries, p)):
81
+ if r["p_enrichment"] < 1e-3:
82
+ print(q, r["E"], r["n_target"], r["n_control"])
64
83
  ```
65
84
 
66
85
  ## Tests
@@ -75,13 +94,16 @@ pytest tests/python # Python tests
75
94
  ## Benchmarks
76
95
 
77
96
  ```fish
78
- python bench/bench.py # recall vs ground truth (real VDJdb data)
79
- python bench/bench_gnuplot.py # max-edit-3 throughput SVG figures (needs gnuplot)
80
- env RUN_BENCHMARK=1 python bench/bench.py --sizes 1000000 --queries 1000000 --threads 16
97
+ python bench/bench_gnuplot.py # throughput / scaling / matrix / collisions → SVG (needs gnuplot)
98
+ python bench/bench.py # recall vs ground truth (real VDJdb data)
99
+ python bench/bench_evalue.py # true E-value benchmark (target vs background control)
100
+ python bench/bench_evalue_matrix.py # significance across reference/control/query/scope grid
101
+ python bench/bench_epitope.py # epitope detection-complexity (GIL vs NLV)
81
102
  ```
82
103
 
83
- `bench/bench_gnuplot.py` renders queries/ms vs reference-set size (both engines), peak RSS, and
84
- alignment-fetch cost. See [docs/benchmarks.rst](docs/benchmarks.rst).
104
+ Figures (throughput, scaling, matrix-scoring overhead, collisions, E-value matrix, epitope
105
+ detection) and the full methodology are in the [benchmarks docs](https://antigenomics.github.io/seqtree/benchmarks.html).
106
+ Set `RUN_BENCHMARK=1` for the large tiers.
85
107
 
86
108
  ## Development
87
109
 
@@ -91,5 +113,6 @@ This repo follows **git-flow**:
91
113
  - `dev` — integration branch for day-to-day work.
92
114
  - feature branches branch off `dev` and merge back via PR; releases merge `dev` → `master`.
93
115
 
94
- Roadmap (affine gaps, position-specific matrices, e-value / significance via
95
- control-set and tf-idf, succinct memory packing) lives in [docs/roadmap.rst](docs/roadmap.rst).
116
+ Roadmap (affine gaps, position-specific matrices, succinct memory packing) lives in
117
+ [docs/roadmap.rst](docs/roadmap.rst). Control-set E-values already ship — see the
118
+ [E-value guide](https://antigenomics.github.io/seqtree/evalue.html).
@@ -0,0 +1,172 @@
1
+ # Downstream wrappers roadmap: `vdjmatch` and `mhcmatch`
2
+
3
+ **Status:** living draft. Owner: @mikessh. Extend freely — sections marked _(TBD: owner)_ await
4
+ more detail.
5
+
6
+ **Purpose.** `seqtree` is the shared substrate — a fast, payload-agnostic fuzzy-search core (C++ +
7
+ Python) with a control-calibrated E-value theory (see `appendix/evalue.tex`). It deliberately stops
8
+ at the reference-implementation level: the *applied* tools — `vdjmatch` (TCR antigen specificity) and
9
+ `mhcmatch` (peptide–MHC) — are separate packages built on top of it. This file is the contract
10
+ between them, written so an agent developing either package can pick it up cold: it states what
11
+ `seqtree` already provides, where each wrapper plugs in, and which design decisions are still open.
12
+
13
+ **How to use this doc (for agents).** Before implementing a feature in `vdjmatch`/`mhcmatch`:
14
+ 1. Check **§1 Substrate** for the `seqtree` primitive that already covers it — do not reimplement
15
+ search, E-values, anchor masking, or k-mer indexing.
16
+ 2. Read the matching subsection in **§2** (vdjmatch) or **§3** (mhcmatch) for the intended design and
17
+ the relevant `appendix/evalue.tex` section.
18
+ 3. Treat the math in the appendix as the spec; treat formulas here marked _(design)_ as direction,
19
+ not final — confirm with the owner before committing to one.
20
+
21
+ ---
22
+
23
+ ## 1. Substrate: what `seqtree` provides
24
+
25
+ | Capability | Module / symbol | Notes |
26
+ |---|---|---|
27
+ | Fuzzy fixed-length search | `seqtree.Index`, engines `seqtm` / `seqtrie` | per-type edit caps + Hamming fast path; or banded DP |
28
+ | Local / best-window mode | `Mode::Local` (C++) | class-II register, general local match |
29
+ | Substitution scoring | `SubstitutionMatrix` (Gram → squared-distance penalty `s_aa+s_bb−2·s_ab`) | payload-agnostic |
30
+ | Per-position scoring | `PositionalMatrix` (`penalty(pos,a,b)` = base × per-position weight; weight 0 = free/anchor) | the hook for PSSMs and anchor masking |
31
+ | k-mer seed index | `KmerIndex` (C++): unique-k-mer trie + CSR postings + per-peptide allele tag; `seed_and_gather` (GIL-released, parallel) | million-scale candidate generation |
32
+ | Control-calibrated E-values | `seqtree.evalues`, `seqtree.load_control` | `Ê = (N/M)·n_C`, Poisson tail, `exclude_exact` |
33
+ | Anchor / layout model | `seqtree.layout`: `AnchorSpec`, `DEFAULTS`, `mask_anchors`, `kmers`, `presentation_features`, `weight_profile` | parametrized anchors; class-II register trick |
34
+ | pMHC homology + reverse | `seqtree.pmhc`: `PMHCStore`, `search_homologs`, `assign_allele`, `find_mimics`, `build_kmer_index` | reference impl; mhcmatch productionizes |
35
+ | Presentation-aware E-values | `seqtree.pmhc_evalue.homolog_evalue` | per-allele null |
36
+
37
+ **E-value theory** lives in `appendix/evalue.tex`: the empirical-control null (§Setup, §Null), the
38
+ Poisson/Chen–Stein bound (§Poisson), multiple testing (§E-value), the closest-hit Gumbel law
39
+ (§Gumbel), Karlin–Altschul as the i.i.d. special case (§KA), the pMHC presentation-aware extension
40
+ (§Epitopes, §Reverse problem), and elementary applications — UMI/barcode birthday-collision and
41
+ CDR3-nt error clustering (§Related applications).
42
+
43
+ **Datasets.** `isalgo/pmhc_data` ships two tiers (see `appendix` Table "Scope of the two pmhc_data
44
+ tiers"): **full** (every IEDB-positive epitope–allele assay) and **shortlist** (epitope–allele pairs
45
+ with ≥2 supporting publications). `mhcmatch` should expose tier choice; the benchmark uses full.
46
+ TCR side: VDJdb (specificity-labelled) + `isalgo/airr_control` (matched control repertoire).
47
+
48
+ ---
49
+
50
+ ## 2. `vdjmatch` — TCR antigen-specificity
51
+
52
+ **Goal.** Given a query TCR (CDR3, optionally V/J, optionally paired α/β), score its similarity to
53
+ known antigen-specific TCRs and return a control-calibrated E-value per candidate epitope —
54
+ generalizing TCRNET-style neighbour counting (appendix §Intro) to a usable annotation tool.
55
+
56
+ ### 2.1 Train / test / validation splits
57
+ - **Split by epitope, not by sequence.** Held-out *epitopes* test generalization to unseen
58
+ specificities; held-out *sequences within an epitope* test within-specificity recall. Never let the
59
+ same clonotype leak across splits (dedup to unique clonotypes first — appendix Prop. on collapse).
60
+ - **Three-way:** train (fit substitution/positional weights, §2.2), validation (tune scope/budget and
61
+ E-value thresholds), test (report ROC/PR per epitope, as the MHC-guess benchmark does per allele).
62
+ - **Null/control** is independent of the split: `airr_control` supplies the background `P₀`; size it
63
+ per appendix §"How large must the control be?".
64
+ - _(TBD: owner)_ exact epitope-level CV scheme, minimum cluster size, negative sampling.
65
+
66
+ ### 2.2 Custom substitution matrices (epitope–paratope interaction-aware)
67
+ - **Hook:** `seqtree.PositionalMatrix` (per-position `penalty(pos,a,b)`) and/or a bespoke
68
+ `SubstitutionMatrix`. seqtree already wires `PositionalMatrix` into the seqtm Hamming path.
69
+ - **Intent:** replace BLOSUM with a CDR3-position-weighted matrix reflecting which residues actually
70
+ contact the peptide–MHC (paratope), learned from structural contacts or from specificity data
71
+ (residues whose substitution most changes specificity get the most weight; framework positions →
72
+ weight 0). Mirrors how `layout.weight_profile` builds anchor/TCR-facing weights for pMHC.
73
+ - _(TBD: owner)_ source of contact statistics (structures vs. learned), per-V-gene vs. global matrix.
74
+
75
+ ### 2.3 Single-chain E-value (have today)
76
+ - Per chain (β, or α), this is exactly `seqtree.evalues`: count neighbours of the query among the
77
+ epitope-specific set vs. the control, report `Ê` and `p_enrich` (returned as the `E` and `p_enrichment` fields of each result). No new theory needed.
78
+
79
+ ### 2.4 Paired α/β E-value (both chains known)
80
+ - _(design)_ Under chain independence in the null, the **joint** null ball-mass factorizes:
81
+ `π₀^{αβ} ≈ π₀^α · π₀^β`, so the paired intensity is `λ_αβ = N · π₀^α π₀^β` and the paired enrichment
82
+ is a Poisson tail on the count of references matching **both** chains within budget. Equivalent to
83
+ Fisher-combining the per-chain `p_enrich` when independence holds; the appendix `b₂` co-occupancy
84
+ term measures the dependence to correct for. Implement as a joint `seed_and_gather` keyed on a
85
+ paired peptide id.
86
+ - This is strictly more specific than either chain alone (joint null mass is tiny) → far smaller
87
+ E-values for true pairs.
88
+
89
+ ### 2.5 Paired estimate for a single chain (rarity of the unknown chain)
90
+ - _(design)_ When only one chain (say β) is observed, marginalize the paired E-value over the unknown
91
+ α: weight the single-chain evidence by the **rarity** of the partner that would complete a known
92
+ pair — a generation-probability / abundance prior on α (`Pgen`, appendix §thymic/selection). A β
93
+ matching a rare-α-paired reference is more informative than one matching a common-α pairing. Yields
94
+ a "paired-equivalent" E-value from one chain without observing the other.
95
+ - _(TBD: owner)_ the exact prior (OLGA/Murugan `Pgen` vs. empirical α-abundance), and calibration.
96
+
97
+ ### 2.6 API sketch _(non-binding)_
98
+ ```python
99
+ vm = vdjmatch.Annotator.from_vdjdb(chains=("beta",), matrix=paratope_matrix, control=airr_control)
100
+ hits = vm.annotate(cdr3b="CASS...", v="TRBV...", scope=2) # -> [(epitope, Ê, p_enrich), ...]
101
+ hits = vm.annotate_paired(cdr3a=..., cdr3b=...) # §2.4
102
+ hits = vm.annotate(cdr3b=..., partner_prior="pgen") # §2.5
103
+ ```
104
+
105
+ ---
106
+
107
+ ## 3. `mhcmatch` — peptide–MHC
108
+
109
+ **Goal.** Productionize the `seqtree.pmhc` reference layer: epitope homology / cross-reactivity,
110
+ molecular mimicry, allele guessing, and non-binder filtering — with tuned thresholds and the
111
+ additions below.
112
+
113
+ ### 3.1 Reuse from seqtree (have today)
114
+ - Homology / mimicry: `search_homologs`, `find_mimics` (anchor-masked TCR-facing k-mers; per-allele
115
+ presentation-aware E-values via `pmhc_evalue.homolog_evalue`). Positive control: the Dolton et al.
116
+ A\*02:01 cross-reactive trio.
117
+ - Allele guessing (reverse problem): `assign_allele` + the vote-fraction ranking / register trick
118
+ validated in `bench/bench_mhc_guess.py` (appendix §Reverse problem). ROC-AUC 0.90–0.98.
119
+ - **Non-binder filter** (appendix §Reverse problem, "Filtering non-binders"): _binds no MHC_ →
120
+ best-over-panel E-value high / no allele scores above background; _doesn't bind allele a_ →
121
+ per-allele `E_a > α`. mhcmatch exposes both thresholds.
122
+ - Tier choice (full vs shortlist, §1 Datasets).
123
+
124
+ ### 3.2 Motif logos
125
+ - For each allele (and each class-II core register), render a sequence logo of the anchor / pocket
126
+ residues from its presented set — the visual counterpart of `presentation_features`. Use a standard
127
+ information-content logo (bits per position). Class II: build the logo over the register-aligned
128
+ 9-mer cores (the register trick already picks the core), so anchors P1/P4/P6/P9 are columns.
129
+ - _(TBD: owner)_ logo library choice; whether to weight by `n_references` (shortlist confidence).
130
+
131
+ ### 3.3 MHC pseudosequence → clustering & promiscuity
132
+ - **Pseudosequence:** represent each allele by the polymorphic residues that line the peptide-binding
133
+ groove (the pseudosequence of peptide-contacting positions, as used for per-allele MHC binding
134
+ prediction by Glynn, Ghersi & Singh, *PNAS* 2025, [doi:10.1073/pnas.2405106122](https://doi.org/10.1073/pnas.2405106122)).
135
+ This turns "allele" from an opaque label into a short sequence the **same seqtree engine** can compare.
136
+ - **Cluster MHCs** by pseudosequence distance (seqtree fuzzy search on the pseudosequence "alphabet"
137
+ of groove residues) → groups of functionally similar alleles. This is the principled way to express
138
+ **cross-allele similarity**, which `seqtree.pmhc` deliberately does *not* model (appendix §Impl.
139
+ limitation: "distinct alleles are distinct nulls"). mhcmatch is where that limitation is lifted.
140
+ - **Promiscuity** (esp. class II — appendix §"Class-II promiscuity"): a peptide presented across a
141
+ pseudosequence-cluster of alleles is promiscuous; quantify as the spread of its positive alleles
142
+ over the MHC clustering, and use the cluster to pool nulls for related alleles when data are thin.
143
+ - _(TBD: owner)_ pseudosequence position set per locus (HLA-A/B/C, DR/DQ/DP, mouse H-2), distance
144
+ metric, cluster cut.
145
+
146
+ ### 3.4 API sketch _(non-binding)_
147
+ ```python
148
+ mm = mhcmatch.Store.from_pmhc(tier="shortlist")
149
+ mm.logo("HLA-A*02:01") # §3.2
150
+ clusters = mm.cluster_alleles(class_="mhc1") # §3.3, via pseudosequence
151
+ mm.is_binder("SIINFEKL", allele="H-2Kb", alpha=0.05) # §3.1 non-binder filter
152
+ mm.promiscuity("PKYVKQNTLKLAT") # §3.3
153
+ ```
154
+
155
+ ---
156
+
157
+ ## 4. Shared conventions
158
+ - **seqtree is upstream and stays generic.** New general-purpose primitives (a learned matrix loader,
159
+ a logo helper, a pseudosequence comparator) may land in seqtree if reusable; tuned thresholds,
160
+ predictors, and domain glue stay in the wrappers.
161
+ - **Anchors** are parametrized in `seqtree.layout` (presets per class, overridable) — wrappers pass
162
+ `AnchorSpec`, they don't hardcode positions.
163
+ - **Citations:** never fabricate. Verify every DOI via a tool before adding it (PubMed/arXiv).
164
+ - **Versioning / gitflow:** feature branch → `dev` → `master`; end commit messages with the
165
+ `Co-Authored-By` trailer; don't publish to PyPI without an explicit release.
166
+
167
+ ## 5. Pointers
168
+ - E-value theory & all derivations: `appendix/evalue.tex` (compiled `appendix/evalue.pdf`).
169
+ - pMHC usage & limitations: `docs/pmhc.rst`. Benchmarks & figures: `docs/benchmarks.rst`,
170
+ `bench/bench_mhc_guess.py`.
171
+ - seqtree internal roadmap (PSSM-graded d_TCR, native local align, Flashback build, predictor proteome
172
+ scans): `docs/roadmap.rst`.
@@ -7,4 +7,4 @@
7
7
  *.log
8
8
  *.out
9
9
  *.toc
10
- evalue.pdf
10
+ # evalue.pdf is committed (linked from the docs); regenerate with `make`.
@@ -0,0 +1,2 @@
1
+ # OldStandard OpenType text+math requires lualatex (see Makefile).
2
+ $pdf_mode = 4;
@@ -0,0 +1,17 @@
1
+ # Compile the E-value derivation. Requires a TeX distribution (latexmk + lualatex + bibtex)
2
+ # and rsvg-convert (librsvg) to convert the referenced SVG plots to PDF for \includegraphics.
3
+ # lualatex (not pdflatex) is required for the OldStandard OpenType text+math fonts; the
4
+ # compiler is pinned in .latexmkrc ($pdf_mode = 4) so a bare `latexmk` also picks it up.
5
+ FIGS = epitope_detection.pdf evalue_matrix.pdf mhc1_rocpr.pdf mhc2_rocpr.pdf
6
+
7
+ evalue.pdf: evalue.tex refs.bib $(FIGS)
8
+ latexmk -lualatex -interaction=nonstopmode -halt-on-error evalue.tex
9
+
10
+ # Benchmark SVGs (bench/figures) -> PDF for inclusion as figures.
11
+ %.pdf: ../bench/figures/%.svg
12
+ rsvg-convert -f pdf -o $@ $<
13
+
14
+ .PHONY: clean
15
+ clean:
16
+ latexmk -C evalue.tex
17
+ rm -f $(FIGS)
Binary file