seqtree 0.0.1__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {seqtree-0.0.1 → seqtree-0.0.3}/.gitignore +1 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/CMakeLists.txt +9 -1
- {seqtree-0.0.1 → seqtree-0.0.3}/PKG-INFO +46 -6
- {seqtree-0.0.1 → seqtree-0.0.3}/README.md +41 -5
- seqtree-0.0.3/ROADMAP.md +172 -0
- seqtree-0.0.3/appendix/.gitignore +10 -0
- seqtree-0.0.3/appendix/.latexmkrc +2 -0
- seqtree-0.0.3/appendix/Makefile +17 -0
- seqtree-0.0.3/appendix/epitope_detection.pdf +0 -0
- seqtree-0.0.3/appendix/evalue.pdf +0 -0
- seqtree-0.0.3/appendix/evalue.tex +758 -0
- seqtree-0.0.3/appendix/evalue_matrix.pdf +0 -0
- seqtree-0.0.3/appendix/mhc1_rocpr.pdf +0 -0
- seqtree-0.0.3/appendix/mhc2_rocpr.pdf +0 -0
- seqtree-0.0.3/appendix/refs.bib +204 -0
- seqtree-0.0.3/include/seqtree/kmer_index.hpp +59 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/include/seqtree/seqtree.hpp +44 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/include/seqtree/types.hpp +8 -1
- {seqtree-0.0.1 → seqtree-0.0.3}/pyproject.toml +3 -1
- seqtree-0.0.3/python/seqtree/__init__.py +44 -0
- seqtree-0.0.3/python/seqtree/control.py +96 -0
- seqtree-0.0.3/python/seqtree/data/control_human_trb_aa.txt.gz +0 -0
- seqtree-0.0.3/python/seqtree/evalue.py +79 -0
- seqtree-0.0.3/python/seqtree/layout.py +131 -0
- seqtree-0.0.3/python/seqtree/pmhc.py +229 -0
- seqtree-0.0.3/python/seqtree/pmhc_evalue.py +33 -0
- seqtree-0.0.3/src/_bindings.cpp +421 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/src/engine_seqtm.cpp +18 -8
- {seqtree-0.0.1 → seqtree-0.0.3}/src/engines.hpp +2 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/src/index.cpp +117 -1
- seqtree-0.0.3/src/kmer_index.cpp +219 -0
- seqtree-0.0.3/src/pam50.inc +32 -0
- seqtree-0.0.3/src/positional_matrix.cpp +42 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/src/searcher.cpp +8 -3
- {seqtree-0.0.1 → seqtree-0.0.3}/src/substitution_matrix.cpp +10 -4
- seqtree-0.0.1/python/seqtree/__init__.py +0 -10
- seqtree-0.0.1/src/_bindings.cpp +0 -235
- {seqtree-0.0.1 → seqtree-0.0.3}/.gitattributes +0 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/LICENSE +0 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/python/seqtree/py.typed +0 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/src/blosum62.inc +0 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/src/codec.cpp +0 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/src/engine_seqtrie.cpp +0 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/src/trie.cpp +0 -0
- {seqtree-0.0.1 → seqtree-0.0.3}/src/trie.hpp +0 -0
|
@@ -7,7 +7,9 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
|
|
7
7
|
if(NOT CMAKE_BUILD_TYPE)
|
|
8
8
|
set(CMAKE_BUILD_TYPE Release)
|
|
9
9
|
endif()
|
|
10
|
-
|
|
10
|
+
if(NOT MSVC)
|
|
11
|
+
set(CMAKE_CXX_FLAGS_RELEASE "-O3")
|
|
12
|
+
endif() # MSVC Release already uses /O2; -O3 is not a valid MSVC flag
|
|
11
13
|
|
|
12
14
|
option(SEQTREE_TESTS "Build C++ tests" OFF)
|
|
13
15
|
option(SEQTREE_BENCH "Build C++ benchmarks" OFF)
|
|
@@ -18,6 +20,8 @@ find_package(Threads REQUIRED)
|
|
|
18
20
|
add_library(seqtree_core STATIC
|
|
19
21
|
src/codec.cpp
|
|
20
22
|
src/substitution_matrix.cpp
|
|
23
|
+
src/positional_matrix.cpp
|
|
24
|
+
src/kmer_index.cpp
|
|
21
25
|
src/trie.cpp
|
|
22
26
|
src/index.cpp
|
|
23
27
|
src/engine_seqtm.cpp
|
|
@@ -41,6 +45,10 @@ if(SEQTREE_TESTS)
|
|
|
41
45
|
tests/cpp/test_matrix.cpp
|
|
42
46
|
tests/cpp/test_trie.cpp
|
|
43
47
|
tests/cpp/test_engines.cpp
|
|
48
|
+
tests/cpp/test_edge.cpp
|
|
49
|
+
tests/cpp/test_serialize.cpp
|
|
50
|
+
tests/cpp/test_positional.cpp
|
|
51
|
+
tests/cpp/test_kmer_index.cpp
|
|
44
52
|
)
|
|
45
53
|
target_include_directories(seqtree_tests PRIVATE tests/cpp src)
|
|
46
54
|
target_link_libraries(seqtree_tests PRIVATE seqtree_core)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: seqtree
|
|
3
|
-
Version: 0.0.1
|
|
3
|
+
Version: 0.0.3
|
|
4
4
|
Summary: Fast fuzzy search over biological sequences (C++ core, Python bindings)
|
|
5
5
|
Keywords: sequence-search,fuzzy-matching,CDR3,immunology,bioinformatics,trie
|
|
6
6
|
Author-Email: ISALGO laboratory <mikhail.shugay@gmail.com>
|
|
@@ -25,10 +25,17 @@ Provides-Extra: docs
|
|
|
25
25
|
Requires-Dist: sphinx; extra == "docs"
|
|
26
26
|
Requires-Dist: pydata-sphinx-theme; extra == "docs"
|
|
27
27
|
Requires-Dist: nbsphinx; extra == "docs"
|
|
28
|
+
Provides-Extra: control
|
|
29
|
+
Requires-Dist: huggingface_hub; extra == "control"
|
|
30
|
+
Provides-Extra: pmhc
|
|
31
|
+
Requires-Dist: huggingface_hub; extra == "pmhc"
|
|
28
32
|
Description-Content-Type: text/markdown
|
|
29
33
|
|
|
30
34
|
# seqtree
|
|
31
35
|
|
|
36
|
+
[](https://pypi.org/project/seqtree/)
|
|
37
|
+
[](https://pypi.org/project/seqtree/)
|
|
38
|
+
[](LICENSE)
|
|
32
39
|
[](https://github.com/antigenomics/seqtree/actions/workflows/ci.yml)
|
|
33
40
|
[](https://antigenomics.github.io/seqtree/)
|
|
34
41
|
|
|
@@ -49,7 +56,25 @@ Two search engines over one trie:
|
|
|
49
56
|
`(ref_id, score, n_subs, n_ins, n_dels)`. Downstream libraries map `ref_id` back
|
|
50
57
|
to their own payloads (V gene, MHC, counts) and filter.
|
|
51
58
|
|
|
52
|
-
|
|
59
|
+
Beyond search, seqtree ships:
|
|
60
|
+
|
|
61
|
+
- **Substitution matrices** — built-in `BLOSUM62` and `PAM50`, plus custom matrices via
|
|
62
|
+
`SubstitutionMatrix.from_similarity` (Gram-distance penalty `s(a,a)+s(b,b)−2·s(a,b)`).
|
|
63
|
+
- **E-values / significance** — calibrate hit counts against a background control repertoire
|
|
64
|
+
(`load_control` + `evalues`), the TCRNET approach on a finite-sample footing. See the
|
|
65
|
+
[E-value guide](https://antigenomics.github.io/seqtree/evalue.html).
|
|
66
|
+
|
|
67
|
+
## Install
|
|
68
|
+
|
|
69
|
+
```fish
|
|
70
|
+
pip install seqtree # prebuilt wheels for CPython 3.10–3.13
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Prebuilt wheels cover **Linux x86-64**, **macOS arm64 (Apple Silicon)**, and **Windows x86-64**.
|
|
74
|
+
There are **no Intel/x86-64 macOS wheels** — Intel Macs build from source (see below), which just
|
|
75
|
+
needs a C++17 compiler and CMake (pulled in automatically by the build).
|
|
76
|
+
|
|
77
|
+
## Build from source
|
|
53
78
|
|
|
54
79
|
```fish
|
|
55
80
|
bash setup.sh # repo-local .venv + editable install
|
|
@@ -81,6 +106,13 @@ print(aln.aligned_query, aln.aligned_ref, aln.ops)
|
|
|
81
106
|
|
|
82
107
|
# batch-vs-batch (auto-indexes the larger set)
|
|
83
108
|
pairs = seqtree.pairwise_batch(query_set, db_set, p, alphabet="aa")
|
|
109
|
+
|
|
110
|
+
# E-values against a background control repertoire (TCRNET-style significance)
|
|
111
|
+
control = seqtree.load_control("human_trb_aa", size=1_000_000)
|
|
112
|
+
target = seqtree.Index.build(vdjdb_cdr3s, alphabet="aa")
|
|
113
|
+
for q, r in zip(queries, seqtree.evalues(target, control, queries, p)):
|
|
114
|
+
if r["p_enrichment"] < 1e-3:
|
|
115
|
+
print(q, r["E"], r["n_target"], r["n_control"])
|
|
84
116
|
```
|
|
85
117
|
|
|
86
118
|
## Tests
|
|
@@ -95,10 +127,17 @@ pytest tests/python # Python tests
|
|
|
95
127
|
## Benchmarks
|
|
96
128
|
|
|
97
129
|
```fish
|
|
98
|
-
python bench/
|
|
99
|
-
|
|
130
|
+
python bench/bench_gnuplot.py # throughput / scaling / matrix / collisions → SVG (needs gnuplot)
|
|
131
|
+
python bench/bench.py # recall vs ground truth (real VDJdb data)
|
|
132
|
+
python bench/bench_evalue.py # true E-value benchmark (target vs background control)
|
|
133
|
+
python bench/bench_evalue_matrix.py # significance across reference/control/query/scope grid
|
|
134
|
+
python bench/bench_epitope.py # epitope detection-complexity (GIL vs NLV)
|
|
100
135
|
```
|
|
101
136
|
|
|
137
|
+
Figures (throughput, scaling, matrix-scoring overhead, collisions, E-value matrix, epitope
|
|
138
|
+
detection) and the full methodology are in the [benchmarks docs](https://antigenomics.github.io/seqtree/benchmarks.html).
|
|
139
|
+
Set `RUN_BENCHMARK=1` for the large tiers.
|
|
140
|
+
|
|
102
141
|
## Development
|
|
103
142
|
|
|
104
143
|
This repo follows **git-flow**:
|
|
@@ -107,5 +146,6 @@ This repo follows **git-flow**:
|
|
|
107
146
|
- `dev` — integration branch for day-to-day work.
|
|
108
147
|
- feature branches branch off `dev` and merge back via PR; releases merge `dev` → `master`.
|
|
109
148
|
|
|
110
|
-
Roadmap (affine gaps, position-specific matrices,
|
|
111
|
-
|
|
149
|
+
Roadmap (affine gaps, position-specific matrices, succinct memory packing) lives in
|
|
150
|
+
[docs/roadmap.rst](docs/roadmap.rst). Control-set E-values already ship — see the
|
|
151
|
+
[E-value guide](https://antigenomics.github.io/seqtree/evalue.html).
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
# seqtree
|
|
2
2
|
|
|
3
|
+
[](https://pypi.org/project/seqtree/)
|
|
4
|
+
[](https://pypi.org/project/seqtree/)
|
|
5
|
+
[](LICENSE)
|
|
3
6
|
[](https://github.com/antigenomics/seqtree/actions/workflows/ci.yml)
|
|
4
7
|
[](https://antigenomics.github.io/seqtree/)
|
|
5
8
|
|
|
@@ -20,7 +23,25 @@ Two search engines over one trie:
|
|
|
20
23
|
`(ref_id, score, n_subs, n_ins, n_dels)`. Downstream libraries map `ref_id` back
|
|
21
24
|
to their own payloads (V gene, MHC, counts) and filter.
|
|
22
25
|
|
|
23
|
-
|
|
26
|
+
Beyond search, seqtree ships:
|
|
27
|
+
|
|
28
|
+
- **Substitution matrices** — built-in `BLOSUM62` and `PAM50`, plus custom matrices via
|
|
29
|
+
`SubstitutionMatrix.from_similarity` (Gram-distance penalty `s(a,a)+s(b,b)−2·s(a,b)`).
|
|
30
|
+
- **E-values / significance** — calibrate hit counts against a background control repertoire
|
|
31
|
+
(`load_control` + `evalues`), the TCRNET approach on a finite-sample footing. See the
|
|
32
|
+
[E-value guide](https://antigenomics.github.io/seqtree/evalue.html).
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
```fish
|
|
37
|
+
pip install seqtree # prebuilt wheels for CPython 3.10–3.13
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Prebuilt wheels cover **Linux x86-64**, **macOS arm64 (Apple Silicon)**, and **Windows x86-64**.
|
|
41
|
+
There are **no Intel/x86-64 macOS wheels** — Intel Macs build from source (see below), which just
|
|
42
|
+
needs a C++17 compiler and CMake (pulled in automatically by the build).
|
|
43
|
+
|
|
44
|
+
## Build from source
|
|
24
45
|
|
|
25
46
|
```fish
|
|
26
47
|
bash setup.sh # repo-local .venv + editable install
|
|
@@ -52,6 +73,13 @@ print(aln.aligned_query, aln.aligned_ref, aln.ops)
|
|
|
52
73
|
|
|
53
74
|
# batch-vs-batch (auto-indexes the larger set)
|
|
54
75
|
pairs = seqtree.pairwise_batch(query_set, db_set, p, alphabet="aa")
|
|
76
|
+
|
|
77
|
+
# E-values against a background control repertoire (TCRNET-style significance)
|
|
78
|
+
control = seqtree.load_control("human_trb_aa", size=1_000_000)
|
|
79
|
+
target = seqtree.Index.build(vdjdb_cdr3s, alphabet="aa")
|
|
80
|
+
for q, r in zip(queries, seqtree.evalues(target, control, queries, p)):
|
|
81
|
+
if r["p_enrichment"] < 1e-3:
|
|
82
|
+
print(q, r["E"], r["n_target"], r["n_control"])
|
|
55
83
|
```
|
|
56
84
|
|
|
57
85
|
## Tests
|
|
@@ -66,10 +94,17 @@ pytest tests/python # Python tests
|
|
|
66
94
|
## Benchmarks
|
|
67
95
|
|
|
68
96
|
```fish
|
|
69
|
-
python bench/
|
|
70
|
-
|
|
97
|
+
python bench/bench_gnuplot.py # throughput / scaling / matrix / collisions → SVG (needs gnuplot)
|
|
98
|
+
python bench/bench.py # recall vs ground truth (real VDJdb data)
|
|
99
|
+
python bench/bench_evalue.py # true E-value benchmark (target vs background control)
|
|
100
|
+
python bench/bench_evalue_matrix.py # significance across reference/control/query/scope grid
|
|
101
|
+
python bench/bench_epitope.py # epitope detection-complexity (GIL vs NLV)
|
|
71
102
|
```
|
|
72
103
|
|
|
104
|
+
Figures (throughput, scaling, matrix-scoring overhead, collisions, E-value matrix, epitope
|
|
105
|
+
detection) and the full methodology are in the [benchmarks docs](https://antigenomics.github.io/seqtree/benchmarks.html).
|
|
106
|
+
Set `RUN_BENCHMARK=1` for the large tiers.
|
|
107
|
+
|
|
73
108
|
## Development
|
|
74
109
|
|
|
75
110
|
This repo follows **git-flow**:
|
|
@@ -78,5 +113,6 @@ This repo follows **git-flow**:
|
|
|
78
113
|
- `dev` — integration branch for day-to-day work.
|
|
79
114
|
- feature branches branch off `dev` and merge back via PR; releases merge `dev` → `master`.
|
|
80
115
|
|
|
81
|
-
Roadmap (affine gaps, position-specific matrices,
|
|
82
|
-
|
|
116
|
+
Roadmap (affine gaps, position-specific matrices, succinct memory packing) lives in
|
|
117
|
+
[docs/roadmap.rst](docs/roadmap.rst). Control-set E-values already ship — see the
|
|
118
|
+
[E-value guide](https://antigenomics.github.io/seqtree/evalue.html).
|
seqtree-0.0.3/ROADMAP.md
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# Downstream wrappers roadmap: `vdjmatch` and `mhcmatch`
|
|
2
|
+
|
|
3
|
+
**Status:** living draft. Owner: @mikessh. Extend freely — sections marked _(TBD: owner)_ await
|
|
4
|
+
more detail.
|
|
5
|
+
|
|
6
|
+
**Purpose.** `seqtree` is the shared substrate — a fast, payload-agnostic fuzzy-search core (C++ +
|
|
7
|
+
Python) with a control-calibrated E-value theory (see `appendix/evalue.tex`). It deliberately stops
|
|
8
|
+
at the reference-implementation level: the *applied* tools — `vdjmatch` (TCR antigen specificity) and
|
|
9
|
+
`mhcmatch` (peptide–MHC) — are separate packages built on top of it. This file is the contract
|
|
10
|
+
between them, written so an agent developing either package can pick it up cold: it states what
|
|
11
|
+
`seqtree` already provides, where each wrapper plugs in, and which design decisions are still open.
|
|
12
|
+
|
|
13
|
+
**How to use this doc (for agents).** Before implementing a feature in `vdjmatch`/`mhcmatch`:
|
|
14
|
+
1. Check **§1 Substrate** for the `seqtree` primitive that already covers it — do not reimplement
|
|
15
|
+
search, E-values, anchor masking, or k-mer indexing.
|
|
16
|
+
2. Read the matching subsection in **§2** (vdjmatch) or **§3** (mhcmatch) for the intended design and
|
|
17
|
+
the relevant `appendix/evalue.tex` section.
|
|
18
|
+
3. Treat the math in the appendix as the spec; treat formulas here marked _(design)_ as direction,
|
|
19
|
+
not final — confirm with the owner before committing to one.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## 1. Substrate: what `seqtree` provides
|
|
24
|
+
|
|
25
|
+
| Capability | Module / symbol | Notes |
|
|
26
|
+
|---|---|---|
|
|
27
|
+
| Fuzzy fixed-length search | `seqtree.Index`, engines `seqtm` / `seqtrie` | per-type edit caps + Hamming fast path; or banded DP |
|
|
28
|
+
| Local / best-window mode | `Mode::Local` (C++) | class-II register, general local match |
|
|
29
|
+
| Substitution scoring | `SubstitutionMatrix` (Gram → squared-distance penalty `s_aa+s_bb−2·s_ab`) | payload-agnostic |
|
|
30
|
+
| Per-position scoring | `PositionalMatrix` (`penalty(pos,a,b)` = base × per-position weight; weight 0 = free/anchor) | the hook for PSSMs and anchor masking |
|
|
31
|
+
| k-mer seed index | `KmerIndex` (C++): unique-k-mer trie + CSR postings + per-peptide allele tag; `seed_and_gather` (GIL-released, parallel) | million-scale candidate generation |
|
|
32
|
+
| Control-calibrated E-values | `seqtree.evalues`, `seqtree.load_control` | `Ê = (N/M)·n_C`, Poisson tail, `exclude_exact` |
|
|
33
|
+
| Anchor / layout model | `seqtree.layout`: `AnchorSpec`, `DEFAULTS`, `mask_anchors`, `kmers`, `presentation_features`, `weight_profile` | parametrized anchors; class-II register trick |
|
|
34
|
+
| pMHC homology + reverse | `seqtree.pmhc`: `PMHCStore`, `search_homologs`, `assign_allele`, `find_mimics`, `build_kmer_index` | reference impl; mhcmatch productionizes |
|
|
35
|
+
| Presentation-aware E-values | `seqtree.pmhc_evalue.homolog_evalue` | per-allele null |
|
|
36
|
+
|
|
37
|
+
**E-value theory** lives in `appendix/evalue.tex`: the empirical-control null (§Setup, §Null), the
|
|
38
|
+
Poisson/Chen–Stein bound (§Poisson), multiple testing (§E-value), the closest-hit Gumbel law
|
|
39
|
+
(§Gumbel), Karlin–Altschul as the i.i.d. special case (§KA), the pMHC presentation-aware extension
|
|
40
|
+
(§Epitopes, §Reverse problem), and elementary applications — UMI/barcode birthday-collision and
|
|
41
|
+
CDR3-nt error clustering (§Related applications).
|
|
42
|
+
|
|
43
|
+
**Datasets.** `isalgo/pmhc_data` ships two tiers (see `appendix` Table "Scope of the two pmhc_data
|
|
44
|
+
tiers"): **full** (every IEDB-positive epitope–allele assay) and **shortlist** (epitope–allele pairs
|
|
45
|
+
with ≥2 supporting publications). `mhcmatch` should expose tier choice; the benchmark uses full.
|
|
46
|
+
TCR side: VDJdb (specificity-labelled) + `isalgo/airr_control` (matched control repertoire).
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## 2. `vdjmatch` — TCR antigen-specificity
|
|
51
|
+
|
|
52
|
+
**Goal.** Given a query TCR (CDR3, optionally V/J, optionally paired α/β), score its similarity to
|
|
53
|
+
known antigen-specific TCRs and return a control-calibrated E-value per candidate epitope —
|
|
54
|
+
generalizing TCRNET-style neighbour counting (appendix §Intro) to a usable annotation tool.
|
|
55
|
+
|
|
56
|
+
### 2.1 Train / test / validation splits
|
|
57
|
+
- **Split by epitope, not by sequence.** Held-out *epitopes* test generalization to unseen
|
|
58
|
+
specificities; held-out *sequences within an epitope* test within-specificity recall. Never let the
|
|
59
|
+
same clonotype leak across splits (dedup to unique clonotypes first — appendix Prop. on collapse).
|
|
60
|
+
- **Three-way:** train (fit substitution/positional weights, §2.2), validation (tune scope/budget and
|
|
61
|
+
E-value thresholds), test (report ROC/PR per epitope, as the MHC-guess benchmark does per allele).
|
|
62
|
+
- **Null/control** is independent of the split: `airr_control` supplies the background `P₀`; size it
|
|
63
|
+
per appendix §"How large must the control be?".
|
|
64
|
+
- _(TBD: owner)_ exact epitope-level CV scheme, minimum cluster size, negative sampling.
|
|
65
|
+
|
|
66
|
+
### 2.2 Custom substitution matrices (epitope–paratope interaction-aware)
|
|
67
|
+
- **Hook:** `seqtree.PositionalMatrix` (per-position `penalty(pos,a,b)`) and/or a bespoke
|
|
68
|
+
`SubstitutionMatrix`. seqtree already wires `PositionalMatrix` into the seqtm Hamming path.
|
|
69
|
+
- **Intent:** replace BLOSUM with a CDR3-position-weighted matrix reflecting which residues actually
|
|
70
|
+
contact the peptide–MHC (paratope), learned from structural contacts or from specificity data
|
|
71
|
+
(residues whose substitution most changes specificity get the most weight; framework positions →
|
|
72
|
+
weight 0). Mirrors how `layout.weight_profile` builds anchor/TCR-facing weights for pMHC.
|
|
73
|
+
- _(TBD: owner)_ source of contact statistics (structures vs. learned), per-V-gene vs. global matrix.
|
|
74
|
+
|
|
75
|
+
### 2.3 Single-chain E-value (have today)
|
|
76
|
+
- Per chain (β, or α), this is exactly `seqtree.evalues`: count neighbours of the query among the
|
|
77
|
+
epitope-specific set vs. the control, report `Ê` and `p_enrich`. No new theory needed.
|
|
78
|
+
|
|
79
|
+
### 2.4 Paired α/β E-value (both chains known)
|
|
80
|
+
- _(design)_ Under chain independence in the null, the **joint** null ball-mass factorizes:
|
|
81
|
+
`π₀^{αβ} ≈ π₀^α · π₀^β`, so the paired intensity is `λ_αβ = N · π₀^α π₀^β` and the paired enrichment
|
|
82
|
+
is a Poisson tail on the count of references matching **both** chains within budget. Equivalent to
|
|
83
|
+
Fisher-combining the per-chain `p_enrich` when independence holds; the appendix `b₂` co-occupancy
|
|
84
|
+
term measures the dependence to correct for. Implement as a joint `seed_and_gather` keyed on a
|
|
85
|
+
paired peptide id.
|
|
86
|
+
- This is strictly more specific than either chain alone (joint null mass is tiny) → far smaller
|
|
87
|
+
E-values for true pairs.
|
|
88
|
+
|
|
89
|
+
### 2.5 Paired estimate for a single chain (rarity of the unknown chain)
|
|
90
|
+
- _(design)_ When only one chain (say β) is observed, marginalize the paired E-value over the unknown
|
|
91
|
+
α: weight the single-chain evidence by the **rarity** of the partner that would complete a known
|
|
92
|
+
pair — a generation-probability / abundance prior on α (`Pgen`, appendix §thymic/selection). A β
|
|
93
|
+
matching a rare-α-paired reference is more informative than one matching a common-α pairing. Yields
|
|
94
|
+
a "paired-equivalent" E-value from one chain without observing the other.
|
|
95
|
+
- _(TBD: owner)_ the exact prior (OLGA/Murugan `Pgen` vs. empirical α-abundance), and calibration.
|
|
96
|
+
|
|
97
|
+
### 2.6 API sketch _(non-binding)_
|
|
98
|
+
```python
|
|
99
|
+
vm = vdjmatch.Annotator.from_vdjdb(chains=("beta",), matrix=paratope_matrix, control=airr_control)
|
|
100
|
+
hits = vm.annotate(cdr3b="CASS...", v="TRBV...", scope=2) # -> [(epitope, Ê, p_enrich), ...]
|
|
101
|
+
hits = vm.annotate_paired(cdr3a=..., cdr3b=...) # §2.4
|
|
102
|
+
hits = vm.annotate(cdr3b=..., partner_prior="pgen") # §2.5
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## 3. `mhcmatch` — peptide–MHC
|
|
108
|
+
|
|
109
|
+
**Goal.** Productionize the `seqtree.pmhc` reference layer: epitope homology / cross-reactivity,
|
|
110
|
+
molecular mimicry, allele guessing, and non-binder filtering — with tuned thresholds and the
|
|
111
|
+
additions below.
|
|
112
|
+
|
|
113
|
+
### 3.1 Reuse from seqtree (have today)
|
|
114
|
+
- Homology / mimicry: `search_homologs`, `find_mimics` (anchor-masked TCR-facing k-mers; per-allele
|
|
115
|
+
presentation-aware E-values via `pmhc_evalue.homolog_evalue`). Positive control: the Dolton et al.
|
|
116
|
+
A\*02:01 cross-reactive trio.
|
|
117
|
+
- Allele guessing (reverse problem): `assign_allele` + the vote-fraction ranking / register trick
|
|
118
|
+
validated in `bench/bench_mhc_guess.py` (appendix §Reverse problem). ROC-AUC 0.90–0.98.
|
|
119
|
+
- **Non-binder filter** (appendix §Reverse problem, "Filtering non-binders"): _binds no MHC_ →
|
|
120
|
+
best-over-panel E-value high / no allele scores above background; _doesn't bind allele a_ →
|
|
121
|
+
per-allele `E_a > α`. mhcmatch exposes both thresholds.
|
|
122
|
+
- Tier choice (full vs shortlist, §1 Datasets).
|
|
123
|
+
|
|
124
|
+
### 3.2 Motif logos
|
|
125
|
+
- For each allele (and each class-II core register), render a sequence logo of the anchor / pocket
|
|
126
|
+
residues from its presented set — the visual counterpart of `presentation_features`. Use a standard
|
|
127
|
+
information-content logo (bits per position). Class II: build the logo over the register-aligned
|
|
128
|
+
9-mer cores (the register trick already picks the core), so anchors P1/P4/P6/P9 are columns.
|
|
129
|
+
- _(TBD: owner)_ logo library choice; whether to weight by `n_references` (shortlist confidence).
|
|
130
|
+
|
|
131
|
+
### 3.3 MHC pseudosequence → clustering & promiscuity
|
|
132
|
+
- **Pseudosequence:** represent each allele by the polymorphic residues that line the peptide-binding
|
|
133
|
+
groove (the pseudosequence of peptide-contacting positions, as used for per-allele MHC binding
|
|
134
|
+
prediction by Glynn, Ghersi & Singh, *PNAS* 2025, [doi:10.1073/pnas.2405106122](https://doi.org/10.1073/pnas.2405106122)).
|
|
135
|
+
This turns "allele" from an opaque label into a short sequence the **same seqtree engine** can compare.
|
|
136
|
+
- **Cluster MHCs** by pseudosequence distance (seqtree fuzzy search on the pseudosequence "alphabet"
|
|
137
|
+
of groove residues) → groups of functionally similar alleles. This is the principled way to express
|
|
138
|
+
**cross-allele similarity**, which `seqtree.pmhc` deliberately does *not* model (appendix §Impl.
|
|
139
|
+
limitation: "distinct alleles are distinct nulls"). mhcmatch is where that limitation is lifted.
|
|
140
|
+
- **Promiscuity** (esp. class II — appendix §"Class-II promiscuity"): a peptide presented across a
|
|
141
|
+
pseudosequence-cluster of alleles is promiscuous; quantify as the spread of its positive alleles
|
|
142
|
+
over the MHC clustering, and use the cluster to pool nulls for related alleles when data are thin.
|
|
143
|
+
- _(TBD: owner)_ pseudosequence position set per locus (HLA-A/B/C, DR/DQ/DP, mouse H-2), distance
|
|
144
|
+
metric, cluster cut.
|
|
145
|
+
|
|
146
|
+
### 3.4 API sketch _(non-binding)_
|
|
147
|
+
```python
|
|
148
|
+
mm = mhcmatch.Store.from_pmhc(tier="shortlist")
|
|
149
|
+
mm.logo("HLA-A*02:01") # §3.2
|
|
150
|
+
clusters = mm.cluster_alleles(class_="mhc1") # §3.3, via pseudosequence
|
|
151
|
+
mm.is_binder("SIINFEKL", allele="H-2Kb", alpha=0.05) # §3.1 non-binder filter
|
|
152
|
+
mm.promiscuity("PKYVKQNTLKLAT") # §3.3
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## 4. Shared conventions
|
|
158
|
+
- **seqtree is upstream and stays generic.** New general-purpose primitives (a learned matrix loader,
|
|
159
|
+
a logo helper, a pseudosequence comparator) may land in seqtree if reusable; tuned thresholds,
|
|
160
|
+
predictors, and domain glue stay in the wrappers.
|
|
161
|
+
- **Anchors** are parametrized in `seqtree.layout` (presets per class, overridable) — wrappers pass
|
|
162
|
+
`AnchorSpec`, they don't hardcode positions.
|
|
163
|
+
- **Citations:** never fabricate. Verify every DOI via a tool before adding it (PubMed/arXiv).
|
|
164
|
+
- **Versioning / gitflow:** feature branch → `dev` → `master`; end commit messages with the
|
|
165
|
+
`Co-Authored-By` trailer; don't publish to PyPI without an explicit release.
|
|
166
|
+
|
|
167
|
+
## 5. Pointers
|
|
168
|
+
- E-value theory & all derivations: `appendix/evalue.tex` (compiled `appendix/evalue.pdf`).
|
|
169
|
+
- pMHC usage & limitations: `docs/pmhc.rst`. Benchmarks & figures: `docs/benchmarks.rst`,
|
|
170
|
+
`bench/bench_mhc_guess.py`.
|
|
171
|
+
- seqtree internal roadmap (PSSM-graded d_TCR, native local align, Flashback build, predictor proteome
|
|
172
|
+
scans): `docs/roadmap.rst`.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Compile the E-value derivation. Requires a TeX distribution (latexmk + lualatex + bibtex)
|
|
2
|
+
# and rsvg-convert (librsvg) to convert the referenced SVG plots to PDF for \includegraphics.
|
|
3
|
+
# lualatex (not pdflatex) is required for the OldStandard OpenType text+math fonts; the
|
|
4
|
+
# compiler is pinned in .latexmkrc ($pdf_mode = 4) so a bare `latexmk` also picks it up.
|
|
5
|
+
FIGS = epitope_detection.pdf evalue_matrix.pdf mhc1_rocpr.pdf mhc2_rocpr.pdf
|
|
6
|
+
|
|
7
|
+
evalue.pdf: evalue.tex refs.bib $(FIGS)
|
|
8
|
+
latexmk -lualatex -interaction=nonstopmode -halt-on-error evalue.tex
|
|
9
|
+
|
|
10
|
+
# Benchmark SVGs (bench/figures) -> PDF for inclusion as figures.
|
|
11
|
+
%.pdf: ../bench/figures/%.svg
|
|
12
|
+
rsvg-convert -f pdf -o $@ $<
|
|
13
|
+
|
|
14
|
+
.PHONY: clean
|
|
15
|
+
clean:
|
|
16
|
+
latexmk -C evalue.tex
|
|
17
|
+
rm -f $(FIGS)
|
|
Binary file
|
|
Binary file
|