seqtree 0.0.1__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {seqtree-0.0.1 → seqtree-0.0.3}/.gitignore +1 -0
  2. {seqtree-0.0.1 → seqtree-0.0.3}/CMakeLists.txt +9 -1
  3. {seqtree-0.0.1 → seqtree-0.0.3}/PKG-INFO +46 -6
  4. {seqtree-0.0.1 → seqtree-0.0.3}/README.md +41 -5
  5. seqtree-0.0.3/ROADMAP.md +172 -0
  6. seqtree-0.0.3/appendix/.gitignore +10 -0
  7. seqtree-0.0.3/appendix/.latexmkrc +2 -0
  8. seqtree-0.0.3/appendix/Makefile +17 -0
  9. seqtree-0.0.3/appendix/epitope_detection.pdf +0 -0
  10. seqtree-0.0.3/appendix/evalue.pdf +0 -0
  11. seqtree-0.0.3/appendix/evalue.tex +758 -0
  12. seqtree-0.0.3/appendix/evalue_matrix.pdf +0 -0
  13. seqtree-0.0.3/appendix/mhc1_rocpr.pdf +0 -0
  14. seqtree-0.0.3/appendix/mhc2_rocpr.pdf +0 -0
  15. seqtree-0.0.3/appendix/refs.bib +204 -0
  16. seqtree-0.0.3/include/seqtree/kmer_index.hpp +59 -0
  17. {seqtree-0.0.1 → seqtree-0.0.3}/include/seqtree/seqtree.hpp +44 -0
  18. {seqtree-0.0.1 → seqtree-0.0.3}/include/seqtree/types.hpp +8 -1
  19. {seqtree-0.0.1 → seqtree-0.0.3}/pyproject.toml +3 -1
  20. seqtree-0.0.3/python/seqtree/__init__.py +44 -0
  21. seqtree-0.0.3/python/seqtree/control.py +96 -0
  22. seqtree-0.0.3/python/seqtree/data/control_human_trb_aa.txt.gz +0 -0
  23. seqtree-0.0.3/python/seqtree/evalue.py +79 -0
  24. seqtree-0.0.3/python/seqtree/layout.py +131 -0
  25. seqtree-0.0.3/python/seqtree/pmhc.py +229 -0
  26. seqtree-0.0.3/python/seqtree/pmhc_evalue.py +33 -0
  27. seqtree-0.0.3/src/_bindings.cpp +421 -0
  28. {seqtree-0.0.1 → seqtree-0.0.3}/src/engine_seqtm.cpp +18 -8
  29. {seqtree-0.0.1 → seqtree-0.0.3}/src/engines.hpp +2 -0
  30. {seqtree-0.0.1 → seqtree-0.0.3}/src/index.cpp +117 -1
  31. seqtree-0.0.3/src/kmer_index.cpp +219 -0
  32. seqtree-0.0.3/src/pam50.inc +32 -0
  33. seqtree-0.0.3/src/positional_matrix.cpp +42 -0
  34. {seqtree-0.0.1 → seqtree-0.0.3}/src/searcher.cpp +8 -3
  35. {seqtree-0.0.1 → seqtree-0.0.3}/src/substitution_matrix.cpp +10 -4
  36. seqtree-0.0.1/python/seqtree/__init__.py +0 -10
  37. seqtree-0.0.1/src/_bindings.cpp +0 -235
  38. {seqtree-0.0.1 → seqtree-0.0.3}/.gitattributes +0 -0
  39. {seqtree-0.0.1 → seqtree-0.0.3}/LICENSE +0 -0
  40. {seqtree-0.0.1 → seqtree-0.0.3}/python/seqtree/py.typed +0 -0
  41. {seqtree-0.0.1 → seqtree-0.0.3}/src/blosum62.inc +0 -0
  42. {seqtree-0.0.1 → seqtree-0.0.3}/src/codec.cpp +0 -0
  43. {seqtree-0.0.1 → seqtree-0.0.3}/src/engine_seqtrie.cpp +0 -0
  44. {seqtree-0.0.1 → seqtree-0.0.3}/src/trie.cpp +0 -0
  45. {seqtree-0.0.1 → seqtree-0.0.3}/src/trie.hpp +0 -0
@@ -10,3 +10,4 @@ __pycache__/
10
10
  _skbuild/
11
11
  docs/_build/
12
12
  bench/figures/
13
+ bench/cache/
@@ -7,7 +7,9 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
7
7
  if(NOT CMAKE_BUILD_TYPE)
8
8
  set(CMAKE_BUILD_TYPE Release)
9
9
  endif()
10
- set(CMAKE_CXX_FLAGS_RELEASE "-O3")
10
+ if(NOT MSVC)
11
+ set(CMAKE_CXX_FLAGS_RELEASE "-O3")
12
+ endif() # MSVC Release already uses /O2; -O3 is not a valid MSVC flag
11
13
 
12
14
  option(SEQTREE_TESTS "Build C++ tests" OFF)
13
15
  option(SEQTREE_BENCH "Build C++ benchmarks" OFF)
@@ -18,6 +20,8 @@ find_package(Threads REQUIRED)
18
20
  add_library(seqtree_core STATIC
19
21
  src/codec.cpp
20
22
  src/substitution_matrix.cpp
23
+ src/positional_matrix.cpp
24
+ src/kmer_index.cpp
21
25
  src/trie.cpp
22
26
  src/index.cpp
23
27
  src/engine_seqtm.cpp
@@ -41,6 +45,10 @@ if(SEQTREE_TESTS)
41
45
  tests/cpp/test_matrix.cpp
42
46
  tests/cpp/test_trie.cpp
43
47
  tests/cpp/test_engines.cpp
48
+ tests/cpp/test_edge.cpp
49
+ tests/cpp/test_serialize.cpp
50
+ tests/cpp/test_positional.cpp
51
+ tests/cpp/test_kmer_index.cpp
44
52
  )
45
53
  target_include_directories(seqtree_tests PRIVATE tests/cpp src)
46
54
  target_link_libraries(seqtree_tests PRIVATE seqtree_core)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: seqtree
3
- Version: 0.0.1
3
+ Version: 0.0.3
4
4
  Summary: Fast fuzzy search over biological sequences (C++ core, Python bindings)
5
5
  Keywords: sequence-search,fuzzy-matching,CDR3,immunology,bioinformatics,trie
6
6
  Author-Email: ISALGO laboratory <mikhail.shugay@gmail.com>
@@ -25,10 +25,17 @@ Provides-Extra: docs
25
25
  Requires-Dist: sphinx; extra == "docs"
26
26
  Requires-Dist: pydata-sphinx-theme; extra == "docs"
27
27
  Requires-Dist: nbsphinx; extra == "docs"
28
+ Provides-Extra: control
29
+ Requires-Dist: huggingface_hub; extra == "control"
30
+ Provides-Extra: pmhc
31
+ Requires-Dist: huggingface_hub; extra == "pmhc"
28
32
  Description-Content-Type: text/markdown
29
33
 
30
34
  # seqtree
31
35
 
36
+ [![PyPI](https://img.shields.io/pypi/v/seqtree.svg)](https://pypi.org/project/seqtree/)
37
+ [![Python](https://img.shields.io/pypi/pyversions/seqtree.svg)](https://pypi.org/project/seqtree/)
38
+ [![License](https://img.shields.io/pypi/l/seqtree.svg)](LICENSE)
32
39
  [![CI](https://github.com/antigenomics/seqtree/actions/workflows/ci.yml/badge.svg)](https://github.com/antigenomics/seqtree/actions/workflows/ci.yml)
33
40
  [![Docs](https://github.com/antigenomics/seqtree/actions/workflows/docs.yml/badge.svg)](https://antigenomics.github.io/seqtree/)
34
41
 
@@ -49,7 +56,25 @@ Two search engines over one trie:
49
56
  `(ref_id, score, n_subs, n_ins, n_dels)`. Downstream libraries map `ref_id` back
50
57
  to their own payloads (V gene, MHC, counts) and filter.
51
58
 
52
- ## Build
59
+ Beyond search, seqtree ships:
60
+
61
+ - **Substitution matrices** — built-in `BLOSUM62` and `PAM50`, plus custom matrices via
62
+ `SubstitutionMatrix.from_similarity` (Gram-distance penalty `s(a,a)+s(b,b)−2·s(a,b)`).
63
+ - **E-values / significance** — calibrate hit counts against a background control repertoire
64
+ (`load_control` + `evalues`), the TCRNET approach on a finite-sample footing. See the
65
+ [E-value guide](https://antigenomics.github.io/seqtree/evalue.html).
66
+
67
+ ## Install
68
+
69
+ ```fish
70
+ pip install seqtree # prebuilt wheels for CPython 3.10–3.13
71
+ ```
72
+
73
+ Prebuilt wheels cover **Linux x86-64**, **macOS arm64 (Apple Silicon)**, and **Windows x86-64**.
74
+ There are **no Intel/x86-64 macOS wheels** — Intel Macs build from source (see below), which just
75
+ needs a C++17 compiler and CMake (pulled in automatically by the build).
76
+
77
+ ## Build from source
53
78
 
54
79
  ```fish
55
80
  bash setup.sh # repo-local .venv + editable install
@@ -81,6 +106,13 @@ print(aln.aligned_query, aln.aligned_ref, aln.ops)
81
106
 
82
107
  # batch-vs-batch (auto-indexes the larger set)
83
108
  pairs = seqtree.pairwise_batch(query_set, db_set, p, alphabet="aa")
109
+
110
+ # E-values against a background control repertoire (TCRNET-style significance)
111
+ control = seqtree.load_control("human_trb_aa", size=1_000_000)
112
+ target = seqtree.Index.build(vdjdb_cdr3s, alphabet="aa")
113
+ for q, r in zip(queries, seqtree.evalues(target, control, queries, p)):
114
+ if r["p_enrichment"] < 1e-3:
115
+ print(q, r["E"], r["n_target"], r["n_control"])
84
116
  ```
85
117
 
86
118
  ## Tests
@@ -95,10 +127,17 @@ pytest tests/python # Python tests
95
127
  ## Benchmarks
96
128
 
97
129
  ```fish
98
- python bench/bench.py # fast tier (real VDJdb data)
99
- env RUN_BENCHMARK=1 python bench/bench.py --sizes 1000000 --queries 1000000 --threads 16
130
+ python bench/bench_gnuplot.py # throughput / scaling / matrix / collisions → SVG (needs gnuplot)
131
+ python bench/bench.py # recall vs ground truth (real VDJdb data)
132
+ python bench/bench_evalue.py # true E-value benchmark (target vs background control)
133
+ python bench/bench_evalue_matrix.py # significance across reference/control/query/scope grid
134
+ python bench/bench_epitope.py # epitope detection-complexity (GIL vs NLV)
100
135
  ```
101
136
 
137
+ Figures (throughput, scaling, matrix-scoring overhead, collisions, E-value matrix, epitope
138
+ detection) and the full methodology are in the [benchmarks docs](https://antigenomics.github.io/seqtree/benchmarks.html).
139
+ Set `RUN_BENCHMARK=1` for the large tiers.
140
+
102
141
  ## Development
103
142
 
104
143
  This repo follows **git-flow**:
@@ -107,5 +146,6 @@ This repo follows **git-flow**:
107
146
  - `dev` — integration branch for day-to-day work.
108
147
  - feature branches branch off `dev` and merge back via PR; releases merge `dev` → `master`.
109
148
 
110
- Roadmap (affine gaps, position-specific matrices, e-value / significance via
111
- control-set and tf-idf, succinct memory packing) lives in [docs/roadmap.rst](docs/roadmap.rst).
149
+ Roadmap (affine gaps, position-specific matrices, succinct memory packing) lives in
150
+ [docs/roadmap.rst](docs/roadmap.rst). Control-set E-values already ship — see the
151
+ [E-value guide](https://antigenomics.github.io/seqtree/evalue.html).
@@ -1,5 +1,8 @@
1
1
  # seqtree
2
2
 
3
+ [![PyPI](https://img.shields.io/pypi/v/seqtree.svg)](https://pypi.org/project/seqtree/)
4
+ [![Python](https://img.shields.io/pypi/pyversions/seqtree.svg)](https://pypi.org/project/seqtree/)
5
+ [![License](https://img.shields.io/pypi/l/seqtree.svg)](LICENSE)
3
6
  [![CI](https://github.com/antigenomics/seqtree/actions/workflows/ci.yml/badge.svg)](https://github.com/antigenomics/seqtree/actions/workflows/ci.yml)
4
7
  [![Docs](https://github.com/antigenomics/seqtree/actions/workflows/docs.yml/badge.svg)](https://antigenomics.github.io/seqtree/)
5
8
 
@@ -20,7 +23,25 @@ Two search engines over one trie:
20
23
  `(ref_id, score, n_subs, n_ins, n_dels)`. Downstream libraries map `ref_id` back
21
24
  to their own payloads (V gene, MHC, counts) and filter.
22
25
 
23
- ## Build
26
+ Beyond search, seqtree ships:
27
+
28
+ - **Substitution matrices** — built-in `BLOSUM62` and `PAM50`, plus custom matrices via
29
+ `SubstitutionMatrix.from_similarity` (Gram-distance penalty `s(a,a)+s(b,b)−2·s(a,b)`).
30
+ - **E-values / significance** — calibrate hit counts against a background control repertoire
31
+ (`load_control` + `evalues`), the TCRNET approach on a finite-sample footing. See the
32
+ [E-value guide](https://antigenomics.github.io/seqtree/evalue.html).
33
+
34
+ ## Install
35
+
36
+ ```fish
37
+ pip install seqtree # prebuilt wheels for CPython 3.10–3.13
38
+ ```
39
+
40
+ Prebuilt wheels cover **Linux x86-64**, **macOS arm64 (Apple Silicon)**, and **Windows x86-64**.
41
+ There are **no Intel/x86-64 macOS wheels** — Intel Macs build from source (see below), which just
42
+ needs a C++17 compiler and CMake (pulled in automatically by the build).
43
+
44
+ ## Build from source
24
45
 
25
46
  ```fish
26
47
  bash setup.sh # repo-local .venv + editable install
@@ -52,6 +73,13 @@ print(aln.aligned_query, aln.aligned_ref, aln.ops)
52
73
 
53
74
  # batch-vs-batch (auto-indexes the larger set)
54
75
  pairs = seqtree.pairwise_batch(query_set, db_set, p, alphabet="aa")
76
+
77
+ # E-values against a background control repertoire (TCRNET-style significance)
78
+ control = seqtree.load_control("human_trb_aa", size=1_000_000)
79
+ target = seqtree.Index.build(vdjdb_cdr3s, alphabet="aa")
80
+ for q, r in zip(queries, seqtree.evalues(target, control, queries, p)):
81
+ if r["p_enrichment"] < 1e-3:
82
+ print(q, r["E"], r["n_target"], r["n_control"])
55
83
  ```
56
84
 
57
85
  ## Tests
@@ -66,10 +94,17 @@ pytest tests/python # Python tests
66
94
  ## Benchmarks
67
95
 
68
96
  ```fish
69
- python bench/bench.py # fast tier (real VDJdb data)
70
- env RUN_BENCHMARK=1 python bench/bench.py --sizes 1000000 --queries 1000000 --threads 16
97
+ python bench/bench_gnuplot.py # throughput / scaling / matrix / collisions → SVG (needs gnuplot)
98
+ python bench/bench.py # recall vs ground truth (real VDJdb data)
99
+ python bench/bench_evalue.py # true E-value benchmark (target vs background control)
100
+ python bench/bench_evalue_matrix.py # significance across reference/control/query/scope grid
101
+ python bench/bench_epitope.py # epitope detection-complexity (GIL vs NLV)
71
102
  ```
72
103
 
104
+ Figures (throughput, scaling, matrix-scoring overhead, collisions, E-value matrix, epitope
105
+ detection) and the full methodology are in the [benchmarks docs](https://antigenomics.github.io/seqtree/benchmarks.html).
106
+ Set `RUN_BENCHMARK=1` for the large tiers.
107
+
73
108
  ## Development
74
109
 
75
110
  This repo follows **git-flow**:
@@ -78,5 +113,6 @@ This repo follows **git-flow**:
78
113
  - `dev` — integration branch for day-to-day work.
79
114
  - feature branches branch off `dev` and merge back via PR; releases merge `dev` → `master`.
80
115
 
81
- Roadmap (affine gaps, position-specific matrices, e-value / significance via
82
- control-set and tf-idf, succinct memory packing) lives in [docs/roadmap.rst](docs/roadmap.rst).
116
+ Roadmap (affine gaps, position-specific matrices, succinct memory packing) lives in
117
+ [docs/roadmap.rst](docs/roadmap.rst). Control-set E-values already ship — see the
118
+ [E-value guide](https://antigenomics.github.io/seqtree/evalue.html).
@@ -0,0 +1,172 @@
1
+ # Downstream wrappers roadmap: `vdjmatch` and `mhcmatch`
2
+
3
+ **Status:** living draft. Owner: @mikessh. Extend freely — sections marked _(TBD: owner)_ await
4
+ more detail.
5
+
6
+ **Purpose.** `seqtree` is the shared substrate — a fast, payload-agnostic fuzzy-search core (C++ +
7
+ Python) with a control-calibrated E-value theory (see `appendix/evalue.tex`). It deliberately stops
8
+ at the reference-implementation level: the *applied* tools — `vdjmatch` (TCR antigen specificity) and
9
+ `mhcmatch` (peptide–MHC) — are separate packages built on top of it. This file is the contract
10
+ between them, written so an agent developing either package can pick it up cold: it states what
11
+ `seqtree` already provides, where each wrapper plugs in, and which design decisions are still open.
12
+
13
+ **How to use this doc (for agents).** Before implementing a feature in `vdjmatch`/`mhcmatch`:
14
+ 1. Check **§1 Substrate** for the `seqtree` primitive that already covers it — do not reimplement
15
+ search, E-values, anchor masking, or k-mer indexing.
16
+ 2. Read the matching subsection in **§2** (vdjmatch) or **§3** (mhcmatch) for the intended design and
17
+ the relevant `appendix/evalue.tex` section.
18
+ 3. Treat the math in the appendix as the spec; treat formulas here marked _(design)_ as direction,
19
+ not final — confirm with the owner before committing to one.
20
+
21
+ ---
22
+
23
+ ## 1. Substrate: what `seqtree` provides
24
+
25
+ | Capability | Module / symbol | Notes |
26
+ |---|---|---|
27
+ | Fuzzy fixed-length search | `seqtree.Index`, engines `seqtm` / `seqtrie` | per-type edit caps + Hamming fast path; or banded DP |
28
+ | Local / best-window mode | `Mode::Local` (C++) | class-II register, general local match |
29
+ | Substitution scoring | `SubstitutionMatrix` (Gram → squared-distance penalty `s_aa+s_bb−2·s_ab`) | payload-agnostic |
30
+ | Per-position scoring | `PositionalMatrix` (`penalty(pos,a,b)` = base × per-position weight; weight 0 = free/anchor) | the hook for PSSMs and anchor masking |
31
+ | k-mer seed index | `KmerIndex` (C++): unique-k-mer trie + CSR postings + per-peptide allele tag; `seed_and_gather` (GIL-released, parallel) | million-scale candidate generation |
32
+ | Control-calibrated E-values | `seqtree.evalues`, `seqtree.load_control` | `Ê = (N/M)·n_C`, Poisson tail, `exclude_exact` |
33
+ | Anchor / layout model | `seqtree.layout`: `AnchorSpec`, `DEFAULTS`, `mask_anchors`, `kmers`, `presentation_features`, `weight_profile` | parametrized anchors; class-II register trick |
34
+ | pMHC homology + reverse | `seqtree.pmhc`: `PMHCStore`, `search_homologs`, `assign_allele`, `find_mimics`, `build_kmer_index` | reference impl; mhcmatch productionizes |
35
+ | Presentation-aware E-values | `seqtree.pmhc_evalue.homolog_evalue` | per-allele null |
36
+
37
+ **E-value theory** lives in `appendix/evalue.tex`: the empirical-control null (§Setup, §Null), the
38
+ Poisson/Chen–Stein bound (§Poisson), multiple testing (§E-value), the closest-hit Gumbel law
39
+ (§Gumbel), Karlin–Altschul as the i.i.d. special case (§KA), the pMHC presentation-aware extension
40
+ (§Epitopes, §Reverse problem), and elementary applications — UMI/barcode birthday-collision and
41
+ CDR3-nt error clustering (§Related applications).
42
+
43
+ **Datasets.** `isalgo/pmhc_data` ships two tiers (see the `appendix/evalue.tex` table "Scope of the two pmhc_data
44
+ tiers"): **full** (every IEDB-positive epitope–allele assay) and **shortlist** (epitope–allele pairs
45
+ with ≥2 supporting publications). `mhcmatch` should expose tier choice; the benchmark uses full.
46
+ TCR side: VDJdb (specificity-labelled) + `isalgo/airr_control` (matched control repertoire).
47
+
48
+ ---
49
+
50
+ ## 2. `vdjmatch` — TCR antigen-specificity
51
+
52
+ **Goal.** Given a query TCR (CDR3, optionally V/J, optionally paired α/β), score its similarity to
53
+ known antigen-specific TCRs and return a control-calibrated E-value per candidate epitope —
54
+ generalizing TCRNET-style neighbour counting (appendix §Intro) to a usable annotation tool.
55
+
56
+ ### 2.1 Train / test / validation splits
57
+ - **Split by epitope, not by sequence.** Held-out *epitopes* test generalization to unseen
58
+ specificities; held-out *sequences within an epitope* test within-specificity recall. Never let the
59
+ same clonotype leak across splits (dedup to unique clonotypes first — appendix Prop. on collapse).
60
+ - **Three-way:** train (fit substitution/positional weights, §2.2), validation (tune scope/budget and
61
+ E-value thresholds), test (report ROC/PR per epitope, as the MHC-guess benchmark does per allele).
62
+ - **Null/control** is independent of the split: `airr_control` supplies the background `P₀`; size it
63
+ per appendix §"How large must the control be?".
64
+ - _(TBD: owner)_ exact epitope-level CV scheme, minimum cluster size, negative sampling.
65
+
66
+ ### 2.2 Custom substitution matrices (epitope–paratope interaction-aware)
67
+ - **Hook:** `seqtree.PositionalMatrix` (per-position `penalty(pos,a,b)`) and/or a bespoke
68
+ `SubstitutionMatrix`. seqtree already wires `PositionalMatrix` into the seqtm Hamming path.
69
+ - **Intent:** replace BLOSUM with a CDR3-position-weighted matrix reflecting which residues actually
70
+ contact the peptide–MHC (paratope), learned from structural contacts or from specificity data
71
+ (residues whose substitution most changes specificity get the most weight; framework positions →
72
+ weight 0). Mirrors how `layout.weight_profile` builds anchor/TCR-facing weights for pMHC.
73
+ - _(TBD: owner)_ source of contact statistics (structures vs. learned), per-V-gene vs. global matrix.
74
+
75
+ ### 2.3 Single-chain E-value (have today)
76
+ - Per chain (β, or α), this is exactly `seqtree.evalues`: count neighbours of the query among the
77
+ epitope-specific set vs. the control, report `Ê` and `p_enrich` (the `p_enrichment` result field). No new theory needed.
78
+
79
+ ### 2.4 Paired α/β E-value (both chains known)
80
+ - _(design)_ Under chain independence in the null, the **joint** null ball-mass factorizes:
81
+ `π₀^{αβ} ≈ π₀^α · π₀^β`, so the paired intensity is `λ_αβ = N · π₀^α π₀^β` and the paired enrichment
82
+ is a Poisson tail on the count of references matching **both** chains within budget. Equivalent to
83
+ Fisher-combining the per-chain `p_enrich` when independence holds; the appendix `b₂` co-occupancy
84
+ term measures the dependence to correct for. Implement as a joint `seed_and_gather` keyed on a
85
+ paired peptide id.
86
+ - This is strictly more specific than either chain alone (joint null mass is tiny) → far smaller
87
+ E-values for true pairs.
88
+
89
+ ### 2.5 Paired estimate for a single chain (rarity of the unknown chain)
90
+ - _(design)_ When only one chain (say β) is observed, marginalize the paired E-value over the unknown
91
+ α: weight the single-chain evidence by the **rarity** of the partner that would complete a known
92
+ pair — a generation-probability / abundance prior on α (`Pgen`, appendix §thymic/selection). A β
93
+ matching a rare-α-paired reference is more informative than one matching a common-α pairing. Yields
94
+ a "paired-equivalent" E-value from one chain without observing the other.
95
+ - _(TBD: owner)_ the exact prior (OLGA/Murugan `Pgen` vs. empirical α-abundance), and calibration.
96
+
97
+ ### 2.6 API sketch _(non-binding)_
98
+ ```python
99
+ vm = vdjmatch.Annotator.from_vdjdb(chains=("beta",), matrix=paratope_matrix, control=airr_control)
100
+ hits = vm.annotate(cdr3b="CASS...", v="TRBV...", scope=2) # -> [(epitope, Ê, p_enrich), ...]
101
+ hits = vm.annotate_paired(cdr3a=..., cdr3b=...) # §2.4
102
+ hits = vm.annotate(cdr3b=..., partner_prior="pgen") # §2.5
103
+ ```
104
+
105
+ ---
106
+
107
+ ## 3. `mhcmatch` — peptide–MHC
108
+
109
+ **Goal.** Productionize the `seqtree.pmhc` reference layer: epitope homology / cross-reactivity,
110
+ molecular mimicry, allele guessing, and non-binder filtering — with tuned thresholds and the
111
+ additions below.
112
+
113
+ ### 3.1 Reuse from seqtree (have today)
114
+ - Homology / mimicry: `search_homologs`, `find_mimics` (anchor-masked TCR-facing k-mers; per-allele
115
+ presentation-aware E-values via `pmhc_evalue.homolog_evalue`). Positive control: the Dolton et al.
116
+ A\*02:01 cross-reactive trio.
117
+ - Allele guessing (reverse problem): `assign_allele` + the vote-fraction ranking / register trick
118
+ validated in `bench/bench_mhc_guess.py` (appendix §Reverse problem). ROC-AUC 0.90–0.98.
119
+ - **Non-binder filter** (appendix §Reverse problem, "Filtering non-binders"): _binds no MHC_ →
120
+ best-over-panel E-value high / no allele scores above background; _doesn't bind allele a_ →
121
+ per-allele `E_a > α`. mhcmatch exposes both thresholds.
122
+ - Tier choice (full vs shortlist, §1 Datasets).
123
+
124
+ ### 3.2 Motif logos
125
+ - For each allele (and each class-II core register), render a sequence logo of the anchor / pocket
126
+ residues from its presented set — the visual counterpart of `presentation_features`. Use a standard
127
+ information-content logo (bits per position). Class II: build the logo over the register-aligned
128
+ 9-mer cores (the register trick already picks the core), so anchors P1/P4/P6/P9 are columns.
129
+ - _(TBD: owner)_ logo library choice; whether to weight by `n_references` (shortlist confidence).
130
+
131
+ ### 3.3 MHC pseudosequence → clustering & promiscuity
132
+ - **Pseudosequence:** represent each allele by the polymorphic residues that line the peptide-binding
133
+ groove (the pseudosequence of peptide-contacting positions, as used for per-allele MHC binding
134
+ prediction by Glynn, Ghersi & Singh, *PNAS* 2025, [doi:10.1073/pnas.2405106122](https://doi.org/10.1073/pnas.2405106122)).
135
+ This turns "allele" from an opaque label into a short sequence the **same seqtree engine** can compare.
136
+ - **Cluster MHCs** by pseudosequence distance (seqtree fuzzy search on the pseudosequence "alphabet"
137
+ of groove residues) → groups of functionally similar alleles. This is the principled way to express
138
+ **cross-allele similarity**, which `seqtree.pmhc` deliberately does *not* model (appendix §Impl.
139
+ limitation: "distinct alleles are distinct nulls"). mhcmatch is where that limitation is lifted.
140
+ - **Promiscuity** (esp. class II — appendix §"Class-II promiscuity"): a peptide presented across a
141
+ pseudosequence-cluster of alleles is promiscuous; quantify as the spread of its positive alleles
142
+ over the MHC clustering, and use the cluster to pool nulls for related alleles when data are thin.
143
+ - _(TBD: owner)_ pseudosequence position set per locus (HLA-A/B/C, DR/DQ/DP, mouse H-2), distance
144
+ metric, cluster cut.
145
+
146
+ ### 3.4 API sketch _(non-binding)_
147
+ ```python
148
+ mm = mhcmatch.Store.from_pmhc(tier="shortlist")
149
+ mm.logo("HLA-A*02:01") # §3.2
150
+ clusters = mm.cluster_alleles(class_="mhc1") # §3.3, via pseudosequence
151
+ mm.is_binder("SIINFEKL", allele="H-2Kb", alpha=0.05) # §3.1 non-binder filter
152
+ mm.promiscuity("PKYVKQNTLKLAT") # §3.3
153
+ ```
154
+
155
+ ---
156
+
157
+ ## 4. Shared conventions
158
+ - **seqtree is upstream and stays generic.** New general-purpose primitives (a learned matrix loader,
159
+ a logo helper, a pseudosequence comparator) may land in seqtree if reusable; tuned thresholds,
160
+ predictors, and domain glue stay in the wrappers.
161
+ - **Anchors** are parametrized in `seqtree.layout` (presets per class, overridable) — wrappers pass
162
+ `AnchorSpec`, they don't hardcode positions.
163
+ - **Citations:** never fabricate. Verify every DOI via a tool before adding it (PubMed/arXiv).
164
+ - **Versioning / gitflow:** feature branch → `dev` → `master`; end commit messages with the
165
+ `Co-Authored-By` trailer; don't publish to PyPI without an explicit release.
166
+
167
+ ## 5. Pointers
168
+ - E-value theory & all derivations: `appendix/evalue.tex` (compiled `appendix/evalue.pdf`).
169
+ - pMHC usage & limitations: `docs/pmhc.rst`. Benchmarks & figures: `docs/benchmarks.rst`,
170
+ `bench/bench_mhc_guess.py`.
171
+ - seqtree internal roadmap (PSSM-graded d_TCR, native local align, Flashback build, predictor proteome
172
+ scans): `docs/roadmap.rst`.
@@ -0,0 +1,10 @@
1
+ # LaTeX build artifacts
2
+ *.aux
3
+ *.bbl
4
+ *.blg
5
+ *.fdb_latexmk
6
+ *.fls
7
+ *.log
8
+ *.out
9
+ *.toc
10
+ # evalue.pdf is committed (linked from the docs); regenerate with `make`.
@@ -0,0 +1,2 @@
1
+ # OldStandard OpenType text+math requires lualatex (see Makefile).
2
+ $pdf_mode = 4;
@@ -0,0 +1,17 @@
1
+ # Compile the E-value derivation. Requires a TeX distribution (latexmk + lualatex + bibtex)
2
+ # and rsvg-convert (librsvg) to convert the referenced SVG plots to PDF for \includegraphics.
3
+ # lualatex (not pdflatex) is required for the OldStandard OpenType text+math fonts; the
4
+ # compiler is pinned in .latexmkrc ($pdf_mode = 4) so a bare `latexmk` also picks it up.
5
+ FIGS = epitope_detection.pdf evalue_matrix.pdf mhc1_rocpr.pdf mhc2_rocpr.pdf
6
+
7
+ evalue.pdf: evalue.tex refs.bib $(FIGS)
8
+ latexmk -lualatex -interaction=nonstopmode -halt-on-error evalue.tex
9
+
10
+ # Benchmark SVGs (bench/figures) -> PDF for inclusion as figures.
11
+ %.pdf: ../bench/figures/%.svg
12
+ rsvg-convert -f pdf -o $@ $<
13
+
14
+ .PHONY: clean
15
+ clean:
16
+ latexmk -C evalue.tex
17
+ rm -f $(FIGS)
Binary file