seqtree 0.0.1__tar.gz → 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {seqtree-0.0.1 → seqtree-0.0.2}/.gitignore +1 -0
  2. {seqtree-0.0.1 → seqtree-0.0.2}/CMakeLists.txt +5 -1
  3. {seqtree-0.0.1 → seqtree-0.0.2}/PKG-INFO +18 -3
  4. {seqtree-0.0.1 → seqtree-0.0.2}/README.md +15 -2
  5. seqtree-0.0.2/appendix/.gitignore +10 -0
  6. seqtree-0.0.2/appendix/Makefile +7 -0
  7. seqtree-0.0.2/appendix/evalue.tex +506 -0
  8. seqtree-0.0.2/appendix/refs.bib +141 -0
  9. {seqtree-0.0.1 → seqtree-0.0.2}/include/seqtree/seqtree.hpp +15 -0
  10. {seqtree-0.0.1 → seqtree-0.0.2}/pyproject.toml +2 -1
  11. seqtree-0.0.2/python/seqtree/__init__.py +32 -0
  12. seqtree-0.0.2/python/seqtree/control.py +96 -0
  13. seqtree-0.0.2/python/seqtree/data/control_human_trb_aa.txt.gz +0 -0
  14. seqtree-0.0.2/python/seqtree/evalue.py +79 -0
  15. {seqtree-0.0.1 → seqtree-0.0.2}/src/_bindings.cpp +111 -17
  16. {seqtree-0.0.1 → seqtree-0.0.2}/src/engine_seqtm.cpp +7 -3
  17. {seqtree-0.0.1 → seqtree-0.0.2}/src/engines.hpp +1 -0
  18. {seqtree-0.0.1 → seqtree-0.0.2}/src/index.cpp +117 -1
  19. seqtree-0.0.2/src/pam50.inc +32 -0
  20. {seqtree-0.0.1 → seqtree-0.0.2}/src/searcher.cpp +3 -0
  21. {seqtree-0.0.1 → seqtree-0.0.2}/src/substitution_matrix.cpp +10 -4
  22. seqtree-0.0.1/python/seqtree/__init__.py +0 -10
  23. {seqtree-0.0.1 → seqtree-0.0.2}/.gitattributes +0 -0
  24. {seqtree-0.0.1 → seqtree-0.0.2}/LICENSE +0 -0
  25. {seqtree-0.0.1 → seqtree-0.0.2}/include/seqtree/types.hpp +0 -0
  26. {seqtree-0.0.1 → seqtree-0.0.2}/python/seqtree/py.typed +0 -0
  27. {seqtree-0.0.1 → seqtree-0.0.2}/src/blosum62.inc +0 -0
  28. {seqtree-0.0.1 → seqtree-0.0.2}/src/codec.cpp +0 -0
  29. {seqtree-0.0.1 → seqtree-0.0.2}/src/engine_seqtrie.cpp +0 -0
  30. {seqtree-0.0.1 → seqtree-0.0.2}/src/trie.cpp +0 -0
  31. {seqtree-0.0.1 → seqtree-0.0.2}/src/trie.hpp +0 -0
@@ -10,3 +10,4 @@ __pycache__/
10
10
  _skbuild/
11
11
  docs/_build/
12
12
  bench/figures/
13
+ bench/cache/
@@ -7,7 +7,9 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
7
7
  if(NOT CMAKE_BUILD_TYPE)
8
8
  set(CMAKE_BUILD_TYPE Release)
9
9
  endif()
10
- set(CMAKE_CXX_FLAGS_RELEASE "-O3")
10
+ if(NOT MSVC)
11
+ set(CMAKE_CXX_FLAGS_RELEASE "-O3")
12
+ endif() # MSVC Release already uses /O2; -O3 is not a valid MSVC flag
11
13
 
12
14
  option(SEQTREE_TESTS "Build C++ tests" OFF)
13
15
  option(SEQTREE_BENCH "Build C++ benchmarks" OFF)
@@ -41,6 +43,8 @@ if(SEQTREE_TESTS)
41
43
  tests/cpp/test_matrix.cpp
42
44
  tests/cpp/test_trie.cpp
43
45
  tests/cpp/test_engines.cpp
46
+ tests/cpp/test_edge.cpp
47
+ tests/cpp/test_serialize.cpp
44
48
  )
45
49
  target_include_directories(seqtree_tests PRIVATE tests/cpp src)
46
50
  target_link_libraries(seqtree_tests PRIVATE seqtree_core)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: seqtree
3
- Version: 0.0.1
3
+ Version: 0.0.2
4
4
  Summary: Fast fuzzy search over biological sequences (C++ core, Python bindings)
5
5
  Keywords: sequence-search,fuzzy-matching,CDR3,immunology,bioinformatics,trie
6
6
  Author-Email: ISALGO laboratory <mikhail.shugay@gmail.com>
@@ -25,10 +25,15 @@ Provides-Extra: docs
25
25
  Requires-Dist: sphinx; extra == "docs"
26
26
  Requires-Dist: pydata-sphinx-theme; extra == "docs"
27
27
  Requires-Dist: nbsphinx; extra == "docs"
28
+ Provides-Extra: control
29
+ Requires-Dist: huggingface_hub; extra == "control"
28
30
  Description-Content-Type: text/markdown
29
31
 
30
32
  # seqtree
31
33
 
34
+ [![PyPI](https://img.shields.io/pypi/v/seqtree.svg)](https://pypi.org/project/seqtree/)
35
+ [![Python](https://img.shields.io/pypi/pyversions/seqtree.svg)](https://pypi.org/project/seqtree/)
36
+ [![License](https://img.shields.io/pypi/l/seqtree.svg)](LICENSE)
32
37
  [![CI](https://github.com/antigenomics/seqtree/actions/workflows/ci.yml/badge.svg)](https://github.com/antigenomics/seqtree/actions/workflows/ci.yml)
33
38
  [![Docs](https://github.com/antigenomics/seqtree/actions/workflows/docs.yml/badge.svg)](https://antigenomics.github.io/seqtree/)
34
39
 
@@ -49,7 +54,13 @@ Two search engines over one trie:
49
54
  `(ref_id, score, n_subs, n_ins, n_dels)`. Downstream libraries map `ref_id` back
50
55
  to their own payloads (V gene, MHC, counts) and filter.
51
56
 
52
- ## Build
57
+ ## Install
58
+
59
+ ```fish
60
+ pip install seqtree # prebuilt wheels for CPython 3.10–3.13 (Linux/macOS/Windows)
61
+ ```
62
+
63
+ ## Build from source
53
64
 
54
65
  ```fish
55
66
  bash setup.sh # repo-local .venv + editable install
@@ -95,10 +106,14 @@ pytest tests/python # Python tests
95
106
  ## Benchmarks
96
107
 
97
108
  ```fish
98
- python bench/bench.py # fast tier (real VDJdb data)
109
+ python bench/bench.py # recall vs ground truth (real VDJdb data)
110
+ python bench/bench_gnuplot.py # max-edit-3 throughput → SVG figures (needs gnuplot)
99
111
  env RUN_BENCHMARK=1 python bench/bench.py --sizes 1000000 --queries 1000000 --threads 16
100
112
  ```
101
113
 
114
+ `bench/bench_gnuplot.py` renders queries/ms vs reference-set size (both engines), peak RSS, and
115
+ alignment-fetch cost. See [docs/benchmarks.rst](docs/benchmarks.rst).
116
+
102
117
  ## Development
103
118
 
104
119
  This repo follows **git-flow**:
@@ -1,5 +1,8 @@
1
1
  # seqtree
2
2
 
3
+ [![PyPI](https://img.shields.io/pypi/v/seqtree.svg)](https://pypi.org/project/seqtree/)
4
+ [![Python](https://img.shields.io/pypi/pyversions/seqtree.svg)](https://pypi.org/project/seqtree/)
5
+ [![License](https://img.shields.io/pypi/l/seqtree.svg)](LICENSE)
3
6
  [![CI](https://github.com/antigenomics/seqtree/actions/workflows/ci.yml/badge.svg)](https://github.com/antigenomics/seqtree/actions/workflows/ci.yml)
4
7
  [![Docs](https://github.com/antigenomics/seqtree/actions/workflows/docs.yml/badge.svg)](https://antigenomics.github.io/seqtree/)
5
8
 
@@ -20,7 +23,13 @@ Two search engines over one trie:
20
23
  `(ref_id, score, n_subs, n_ins, n_dels)`. Downstream libraries map `ref_id` back
21
24
  to their own payloads (V gene, MHC, counts) and filter.
22
25
 
23
- ## Build
26
+ ## Install
27
+
28
+ ```fish
29
+ pip install seqtree # prebuilt wheels for CPython 3.10–3.13 (Linux/macOS/Windows)
30
+ ```
31
+
32
+ ## Build from source
24
33
 
25
34
  ```fish
26
35
  bash setup.sh # repo-local .venv + editable install
@@ -66,10 +75,14 @@ pytest tests/python # Python tests
66
75
  ## Benchmarks
67
76
 
68
77
  ```fish
69
- python bench/bench.py # fast tier (real VDJdb data)
78
+ python bench/bench.py # recall vs ground truth (real VDJdb data)
79
+ python bench/bench_gnuplot.py # max-edit-3 throughput → SVG figures (needs gnuplot)
70
80
  env RUN_BENCHMARK=1 python bench/bench.py --sizes 1000000 --queries 1000000 --threads 16
71
81
  ```
72
82
 
83
+ `bench/bench_gnuplot.py` renders queries/ms vs reference-set size (both engines), peak RSS, and
84
+ alignment-fetch cost. See [docs/benchmarks.rst](docs/benchmarks.rst).
85
+
73
86
  ## Development
74
87
 
75
88
  This repo follows **git-flow**:
@@ -0,0 +1,10 @@
1
+ # LaTeX build artifacts
2
+ *.aux
3
+ *.bbl
4
+ *.blg
5
+ *.fdb_latexmk
6
+ *.fls
7
+ *.log
8
+ *.out
9
+ *.toc
10
+ evalue.pdf
@@ -0,0 +1,7 @@
1
+ # Compile the E-value derivation. Requires a TeX distribution (latexmk + pdflatex + bibtex).
2
+ evalue.pdf: evalue.tex refs.bib
3
+ latexmk -pdf -interaction=nonstopmode -halt-on-error evalue.tex
4
+
5
+ .PHONY: clean
6
+ clean:
7
+ latexmk -C evalue.tex
@@ -0,0 +1,506 @@
1
+ \documentclass[11pt]{article}
2
+
3
+ \usepackage[margin=1in]{geometry}
4
+ \usepackage{amsmath,amssymb,amsthm}
5
+ \usepackage{mathtools}
6
+ \usepackage[hidelinks]{hyperref}
7
+
8
+ \newtheorem{theorem}{Theorem}
9
+ \newtheorem{lemma}{Lemma}
10
+ \newtheorem{proposition}{Proposition}
11
+ \newtheorem{corollary}{Corollary}
12
+ \theoremstyle{definition}
13
+ \newtheorem{definition}{Definition}
14
+ \newtheorem{assumption}{Assumption}
15
+ \theoremstyle{remark}
16
+ \newtheorem{remark}{Remark}
17
+
18
+ \DeclareMathOperator{\Poisson}{Poisson}
19
+ \DeclareMathOperator{\Binomial}{Binomial}
20
+ \DeclareMathOperator{\Var}{Var}
21
+ \DeclareMathOperator{\dTV}{d_{\mathrm{TV}}}
22
+ \newcommand{\Pgen}{P_{\mathrm{gen}}}
23
+ \newcommand{\Pzero}{P_{0}}
24
+ \newcommand{\Pdata}{P_{\mathrm{data}}}
25
+ \newcommand{\ball}[2]{B_{#1}(#2)}
26
+ \newcommand{\R}{\mathbb{R}}
27
+ \newcommand{\E}{\mathbb{E}}
28
+ \newcommand{\Prob}{\mathbb{P}}
29
+ \newcommand{\one}{\mathbf{1}}
30
+
31
+ \title{A control-calibrated E-value for fuzzy TCR sequence search\\ over biologically redundant reference sets}
32
+ \author{seqtree --- technical appendix}
33
+ \date{}
34
+
35
+ \begin{document}
36
+ \maketitle
37
+
38
+ \begin{abstract}
39
+ We derive a BLAST-style E-value~\cite{Karlin1990,Altschul1990} for ``hits'' returned by fuzzy
40
+ search over T-cell receptor (TCR) CDR3 sequences, adapted to the defining difficulty of immune
41
+ repertoires: the reference set is highly \emph{redundant}, and the redundancy is biological
42
+ (convergent V(D)J recombination, public clones, clonal expansion) rather than statistical noise.
43
+ The classical Karlin--Altschul theory assumes a database of independent, identically distributed
44
+ letters; under that null, redundancy-driven near-matches are absurdly significant. We replace the
45
+ i.i.d.-letter null with an \emph{empirical background} $\Pzero$ estimated from a matched control
46
+ repertoire, retain the Poisson/Gumbel limit superstructure with an explicit non-asymptotic error
47
+ bound (Chen--Stein / Le~Cam~\cite{Arratia1990,LeCam1960}), and handle clonal over-dispersion by
48
+ collapsing to unique clonotypes. The resulting significance is automatically deflated for hits that the
49
+ generation process alone explains and is high only for antigen-driven convergence. This puts the TCRNET approach---counting
50
+ sequence neighbours against a real-world control repertoire, first introduced by
51
+ Ritvo~et~al.~\cite{Ritvo2018} and formalized as an annotation framework
52
+ by~\cite{Pogorelyy2019}---on a rigorous, finite-sample footing, and we show the classical
53
+ Karlin--Altschul E-value is its product-measure, ungapped special case.
54
+ \end{abstract}
55
+
56
+ \section{Introduction: the redundancy problem}
57
+
58
+ Given a query CDR3 $q$ and a target set $D$ (e.g.\ VDJdb), fuzzy search returns the neighbours of
59
+ $q$ within a fixed scope/budget $\theta$. We want a significance value for such hits. BLAST answers
60
+ this for protein search with the Karlin--Altschul E-value
61
+ \begin{equation}\label{eq:ka}
62
+ E = K\,m\,n\,e^{-\lambda^\ast S},
63
+ \end{equation}
64
+ where $m$ is the query length and $n$ the total database length (both in residues), $S$ is the
65
+ alignment score of the hit under a substitution matrix with entries $s_{ij}$, $\lambda^\ast$ is the
66
+ unique positive root of $\sum_{ij} p_i p_j e^{\lambda^\ast s_{ij}} = 1$ (the natural scale that turns
67
+ scores into log-probabilities for i.i.d.\ letters with background frequencies $p_i$), and $K>0$ is a
68
+ prefactor---the ``clumping'' or edge-effect constant---fixed by the score distribution. $E$ is the
69
+ expected number of distinct alignments scoring at least $S$ by chance; the number of such alignments
70
+ is asymptotically Poisson, so $\Pr(\text{at least one}) = 1-e^{-E}$~\cite{Karlin1990,Karlin1993}.
71
+ The whole construction rests on the database being a string of i.i.d.\ letters.
72
+
73
+ Immune repertoires violate the i.i.d.\ assumption catastrophically. CDR3s are produced by V(D)J
74
+ recombination, whose generation probability $\Pgen$, first derived and inferred from sequence
75
+ repertoires by Murugan~et~al.~\cite{Murugan2012}, is sharply non-uniform; convergent recombination
76
+ makes some sequences enormously over-represented; clonal
77
+ expansion and public clones create exact and near duplicates. A query in a common, high-$\Pgen$
78
+ region of sequence space has many neighbours \emph{for purely generative reasons}. An i.i.d.\ null
79
+ would flag these as wildly significant, which is biologically meaningless. The signal we actually
80
+ want is the opposite: \emph{more} neighbours than the background generative process predicts, the
81
+ hallmark of antigen-driven selection.
82
+
83
+ Our approach: define the null by an empirical background distribution $\Pzero$ that carries the
84
+ generative and baseline-sharing redundancy but no antigen-driven enrichment, estimate the
85
+ null neighbourhood mass from a matched control repertoire (the user-supplied
86
+ \texttt{isalgo/airr\_control} set), and calibrate the E-value against it. This is a rigorous,
87
+ finite-sample generalization of Karlin--Altschul to a non-i.i.d., biologically structured null, and
88
+ the statistical formalization of TCRNET-style neighbour counting against a real
89
+ control~\cite{Ritvo2018,Pogorelyy2019}.
90
+
91
+ \section{Setup and notation}\label{sec:setup}
92
+
93
+ Let $\Sigma$ be the amino-acid alphabet and $\mathcal{X} = \bigcup_{L\ge 0}\Sigma^L$ the space of
94
+ CDR3 sequences. The search engine defines, for a query $q$ and budget $\theta\ge 0$, a non-negative
95
+ score $s(q,x)$ and a \emph{ball}
96
+ \begin{equation}\label{eq:ball}
97
+ \ball{\theta}{q} = \{x \in \mathcal{X} : s(q,x) \le \theta\}, \qquad s(q,q)=0,\; s\ge 0.
98
+ \end{equation}
99
+ The score need not be a metric: with a substitution matrix it is the squared-distance penalty
100
+ $\mathrm{pen}(a,b)=s_{aa}+s_{bb}-2s_{ab}$ summed along the optimal alignment (plus gap costs); in
101
+ unit-cost mode it is an edit count. Both define a legitimate ball. We work with two background laws
102
+ on $\mathcal{X}$: the \emph{realized-repertoire background} $\Pzero$ (what a healthy, unselected
103
+ repertoire instantiates) and the \emph{generation law} $\Pgen$ (the V(D)J model). Let
104
+ \begin{equation}\label{eq:pi0}
105
+ \pi_0(q,\theta) = \Pzero\!\left(\ball{\theta}{q}\right) = \sum_{x \in \ball{\theta}{q}} \Pzero(x),
106
+ \qquad
107
+ \pi_{\mathrm{gen}}(q,\theta) = \Pgen\!\left(\ball{\theta}{q}\right).
108
+ \end{equation}
109
+ A control sample $C=(C_1,\dots,C_M)$ and a target set $D$ are given. Crucially, all counts are over
110
+ \emph{distinct clonotypes}: the engine deduplicates hits by reference id, so
111
+ \begin{equation}
112
+ n_S(q,\theta) = \#\{\,x \in S \text{ distinct} : x \in \ball{\theta}{q}\,\}, \qquad S\in\{C,D\}.
113
+ \end{equation}
114
+ Write $N=|D|$ (unique clonotypes; see \S\ref{sec:overdispersion}).
115
+
116
+ \begin{lemma}[Scope monotonicity]\label{lem:monotone}
117
+ The balls nest, $\ball{\theta}{q}\subseteq\ball{\theta'}{q}$ for $\theta\le\theta'$ (since $s\ge0$ and
118
+ the cut is by a single threshold). Hence $\pi_0(q,\cdot)$, $n_S(q,\cdot)$, the intensity
119
+ $\lambda(q,\cdot)$ and the E-value $E(q,\cdot)$ are all non-decreasing in the scope/budget $\theta$,
120
+ and the closest-hit score $S_{\min}(q)$ of \S\ref{sec:gumbel} is the smallest $\theta$ with
121
+ $n_D(q,\theta)>0$. This justifies sweeping $\theta$ to trace an E-value curve per query.
122
+ \end{lemma}
123
+
124
+ \begin{assumption}[Exchangeability under $H_0$]\label{ass:exch}
125
+ Under the null, the unique clonotypes of $D$ are exchangeable with marginal law $\Pzero$.
126
+ \end{assumption}
127
+ \begin{assumption}[Independent control draws]\label{ass:indep}
128
+ The unique clonotypes of $C$ are i.i.d.\ (or exchangeable) $\sim \Pzero$.
129
+ \end{assumption}
130
+ \begin{assumption}[Background match]\label{ass:match}
131
+ $C$ and $D$ share the background $\Pzero$ (same generation $+$ sampling process, matched chain,
132
+ species and length composition). All validity is conditional on Assumption~\ref{ass:match}.
133
+ \end{assumption}
134
+
135
+ \section{Null hypothesis and estimator hierarchy}\label{sec:null}
136
+
137
+ \begin{definition}[Per-query null]
138
+ $H_0(q)$: the neighbours of $q$ in $D$ arise from $\Pzero$ with no antigen-driven excess, i.e.\
139
+ each $x\in D$ satisfies $\E[\one(x\in\ball{\theta}{q})] = \pi_0(q,\theta)$. The alternative
140
+ $H_1(q)$ posits excess mass $\pi_D(q,\theta) > \pi_0(q,\theta)$.
141
+ \end{definition}
142
+
143
+ \begin{lemma}[Self-match exclusion / punctured null]\label{lem:selfmatch}
144
+ When the query is itself a database member ($q\in D$, as in a VDJdb-vs-VDJdb scan), the count
145
+ $n_D(q,\theta)$ contains the exact self-match (and any exact duplicates of $q$), which are
146
+ \emph{deterministic} identity hits, not random draws from $\Pzero$. Including them biases both the
147
+ observed count and the null. The correct neighbour statistic is the \emph{punctured} count over the
148
+ distance-positive ball,
149
+ \begin{equation}\label{eq:punctured}
150
+ n_D^{>}(q,\theta) = \#\{x\in D : 0 < s(q,x) \le \theta\},
151
+ \end{equation}
152
+ with null intensity $\lambda^{>}(q,\theta) = (N-m_q)\,\pi_0^{>}(q,\theta)$, where $m_q$ is the
153
+ multiplicity of $q$ in $D$ and $\pi_0^{>} = \Pzero(\ball{\theta}{q}) - \Pzero(\{x:s(q,x)=0\})$ removes
154
+ the point mass at exact matches. The control estimator is punctured identically,
155
+ $\hat\pi^{>} = n_C^{>}(q,\theta)/M$, so the deterministic identity term cancels in the calibrated
156
+ E-value. (For $q\notin D$ the puncture is vacuous and $n_D^{>}=n_D$.)
157
+ \end{lemma}
158
+
159
+ \begin{remark}[Consistency of the puncture, and when \emph{not} to use it]\label{rem:punct-app}
160
+ The puncture is valid \emph{only if applied to both sides}: the E-value $E=(N/M)\,n_C$ estimates
161
+ $N\pi_0$ for one and the same ball, so dropping the $s=0$ point mass from the target count requires
162
+ estimating the punctured mass $\pi_0^{>}$ from the \emph{punctured} control count $n_C^{>}$. Doing so
163
+ does change the numeric E-value (it shrinks by the removed exact-match mass, $E^{>}=(N/M)n_C^{>}\le
164
+ E$), but it leaves the \emph{inference} unbiased: the exact-match term is deterministic and enters
165
+ observed count and null intensity identically, so it carries no signal and its removal neither
166
+ creates nor destroys significance for the genuine neighbours. Puncturing only one side
167
+ (target but not control, or vice versa) \emph{does} bias the test and must be avoided.
168
+
169
+ This exclusion is a \textbf{benchmark device}, not a default for applications. In the
170
+ VDJdb-vs-VDJdb benchmark the queries are drawn from the target, so every query carries a guaranteed
171
+ trivial self-hit that would otherwise inflate every count uniformly; puncturing removes it. In a real
172
+ annotation task the query is a \emph{novel} sequence scored against a reference database
173
+ ($q\notin D$), where an exact database match is the strongest and most informative hit and must be
174
+ kept. Hence \texttt{seqtree.evalues} leaves \texttt{exclude\_exact=False} by default and the benchmark
175
+ sets it \texttt{True}.
176
+ \end{remark}
177
+
178
+ The estimand is the per-query Poisson intensity $\lambda(q,\theta) = N\,\pi_0(q,\theta)$ (read as
179
+ $\lambda^{>}$ with the puncture of Lemma~\ref{lem:selfmatch} whenever $q\in D$). Two
180
+ estimators of $\pi_0$ target \emph{different} nulls and must not be conflated.
181
+ \begin{itemize}
182
+ \item \textbf{Control / Monte-Carlo (primary):} $\hat\pi(q,\theta) = n_C(q,\theta)/M$, unbiased
183
+ for $\Pzero(\ball{\theta}{q})$, with $M\hat\pi \sim \Binomial(M,\pi_0)$ under
184
+ Assumption~\ref{ass:indep}. It captures the \emph{realized} background, including public-clone
185
+ sharing and finite-repertoire convergence.
186
+ \item \textbf{Generation / analytic (cross-check):} $\hat\pi_{\mathrm{gen}}(q,\theta) =
187
+ \sum_{x\in\ball{\theta}{q}} \Pgen(x)$, computed by enumerating the (small, for small $\theta$)
188
+ ball with the engine and weighting by the V(D)J generation probability of the Murugan~et~al.\
189
+ model~\cite{Murugan2012}. It targets the pure generation null $\Pgen(\ball{\theta}{q})$, which
190
+ omits selection and sampling.
191
+ \end{itemize}
192
+
193
+ \begin{remark}[Selection factor and the thymic correction]\label{rem:thymic}
194
+ $\Pgen$ is a \emph{pre-selection} law; only a fraction of generated receptors survive thymic and
195
+ peripheral selection. Elhanati~et~al.~\cite{Elhanati2014} model this with a per-sequence
196
+ \emph{selection factor} $Q(\sigma)\ge 0$ on the recombination outcome $\sigma=(\vec a,V,J)$, inferred
197
+ by maximum likelihood, giving the post-selection law
198
+ \begin{equation}\label{eq:qfactor}
199
+ \Pzero(\sigma) = \tfrac1Z\,Q(\sigma)\,\Pgen(\sigma),\qquad
200
+ Z=\sum_\sigma Q(\sigma)\Pgen(\sigma)=1 \;\;(\langle Q\rangle_{\Pgen}=1).
201
+ \end{equation}
202
+ The normalization $\langle Q\rangle=1$ means $Q$ \emph{redistributes} mass without a global rescale;
203
+ the structured part (selection reinforces recombination biases, with the observed
204
+ $\Pdata(Q)/\Pgen(Q)$ saturating at $\approx 7$~\cite{Elhanati2014}) reshapes the ball mass per
205
+ sequence. Separately, the \emph{physical} thymic acceptance fraction---the fraction of recombined
206
+ cells that survive to the naive repertoire---is $\alpha\lesssim 15\%$ (consistent with $10$--$30\%$
207
+ for positive selection and $\approx5\%$ for full selection)~\cite{Elhanati2014}, and selection cuts
208
+ repertoire diversity by $\approx 6$ bits ($\sim 50$-fold). Two consequences for the E-value.
209
+ (i) The empirical control $\Pzero$ already \emph{is} the post-selection law of~\eqref{eq:qfactor}, so
210
+ the control estimator $\hat\pi$ needs no $Q$ and no $\alpha$; $Q$ enters only the analytic estimator,
211
+ where one uses $Q\,\Pgen$ in place of $\Pgen$. (ii) The global acceptance fraction $\alpha$ is
212
+ sequence-independent and \emph{cancels} in every ratio and in $\hat\pi$ (which calibrates against the
213
+ control's own size $M$); it would matter only for an \emph{absolute} naive-frequency
214
+ estimate $f(\sigma)=\alpha\,Q(\sigma)\Pgen(\sigma)$, e.g.\ when the $\hat\pi_{\mathrm{gen}}$ fallback
215
+ for a rare query (\S\ref{sec:precision}) is read as an expected count of cells rather than a
216
+ probability.
217
+ \end{remark}
218
+
219
+ \begin{lemma}[The two nulls differ]\label{lem:hierarchy}
220
+ In general $\Pzero \ne \Pgen$: thymic and peripheral selection deplete some motifs while
221
+ finite-sample public-clone sharing enriches others, so neither $\pi_0 \le \pi_{\mathrm{gen}}$ nor
222
+ the reverse holds universally. Hence $\hat\pi_{\mathrm{gen}}$ is used as a variance-reducing control
223
+ variate and as a fallback for queries too rare for the control (\S\ref{sec:precision}), not as a
224
+ substitute for $\hat\pi$.
225
+ \end{lemma}
226
+
227
+ \section{Poisson approximation with an explicit error bound}\label{sec:poisson}
228
+
229
+ Fix $q,\theta$. For the unique clonotypes $x_1,\dots,x_N$ of $D$ set
230
+ $X_i=\one(x_i\in\ball{\theta}{q})$, $p_i=\E X_i=\pi_0$, $W=\sum_i X_i$, $\lambda=\sum_i p_i = N\pi_0$,
231
+ and let $Z\sim\Poisson(\lambda)$. We use the following standard objects. $\mathcal{L}(W)$ denotes the
232
+ \emph{law} (probability distribution) of $W$. The \emph{total-variation distance} between two laws
233
+ $\mu,\nu$ on $\mathbb{Z}_{\ge 0}$ is
234
+ \begin{equation}\label{eq:dtv}
235
+ \dTV(\mu,\nu) = \sup_{A\subseteq\mathbb{Z}_{\ge0}} |\mu(A)-\nu(A)| = \tfrac12\sum_{k\ge0}|\mu(k)-\nu(k)|,
236
+ \end{equation}
237
+ so a bound on $\dTV$ bounds the error of \emph{every} event probability simultaneously. A family of
238
+ \emph{dependency neighbourhoods} is a choice, for each index $i$, of a set $B_i\ni i$ such that $X_i$
239
+ is independent of (or nearly independent of) $\{X_j : j\notin B_i\}$; intuitively $B_i$ collects the
240
+ clonotypes whose ball-membership is statistically coupled to $x_i$'s (here, those sharing a motif).
241
+ The residual $b_3$ below measures exactly how far that near-independence falls short.
242
+
243
+ \begin{theorem}[Chen--Stein bound \cite{Arratia1989,Arratia1990}]\label{thm:cs}
244
+ For any dependency neighbourhoods $\{B_i \ni i\}$,
245
+ \begin{equation}\label{eq:cs}
246
+ \dTV\!\big(\mathcal{L}(W),\Poisson(\lambda)\big) \le b_1+b_2+b_3,
247
+ \end{equation}
248
+ with $b_1=\sum_i\sum_{j\in B_i}p_ip_j$, $b_2=\sum_i\sum_{j\in B_i, j\ne i}\E[X_iX_j]$, and
249
+ $b_3=\sum_i \E\big|\,\E[X_i-p_i\mid \sigma(X_j: j\notin B_i)]\,\big|$.
250
+ \end{theorem}
251
+
252
+ \begin{corollary}[Le Cam regime \cite{LeCam1960}]\label{cor:lecam}
253
+ If the collapsed clonotypes are independent under $H_0$, take $B_i=\{i\}$; then $b_2=b_3=0$ and
254
+ \begin{equation}\label{eq:lecam}
255
+ \dTV\!\big(\mathcal{L}(W),\Poisson(\lambda)\big) \le \sum_i p_i^2 = N\pi_0^2 = \lambda\,\pi_0 .
256
+ \end{equation}
257
+ The bound is small precisely in the regime of interest: a rare ball $\pi_0\ll 1$ with moderate
258
+ $\lambda$ gives error $\le \lambda\pi_0 \to 0$.
259
+ \end{corollary}
260
+
261
+ \begin{corollary}[Void and tail probabilities]\label{cor:void}
262
+ With $w=n_D(q,\theta)$ observed,
263
+ \begin{align}
264
+ p_{\mathrm{any}}(q,\theta) &= \Prob(W\ge 1) = 1-e^{-\lambda} + O(N\pi_0^2), \label{eq:pany}\\
265
+ p(q,\theta) &= \Prob(Z \ge w) = 1 - \sum_{k<w} \frac{e^{-\lambda}\lambda^k}{k!},
266
+ \quad |\,\Prob(W\ge w) - p(q,\theta)\,| \le b_1+b_2+b_3. \label{eq:ptail}
267
+ \end{align}
268
+ Both follow from Theorem~\ref{thm:cs}: the void probability is the event $A=\{0\}$ and the tail is
269
+ $A=\{w,w+1,\dots\}$, and by~\eqref{eq:dtv} the error on any single event is at most $\dTV\le
270
+ b_1+b_2+b_3$ (so $O(N\pi_0^2)$ in the independent regime, Corollary~\ref{cor:lecam}).
271
+ \end{corollary}
272
+
273
+ \begin{remark}[Where biology enters]
274
+ Convergent recombination makes the dependency neighbourhood $B_i$ nontrivial: $j\in B_i$ when $x_i$
275
+ and $x_j$ share a high-$\Pgen$ motif. The term $b_2=\sum_i\sum_{j\in B_i,\, j\ne i}\E[X_iX_j]$ is the excess
276
+ \emph{pairwise} ball co-occupancy. Under $H_0$ it is controlled by the pairwise ball mass, estimable
277
+ from the control by counting \emph{pairs} of control sequences both in $\ball{\theta}{q}$; the
278
+ Poisson regime holds when this estimate is $\ll\lambda$. The very same $b_2$ is inflated under $H_1$
279
+ (antigen-driven clusters co-occupy the ball), so it is simultaneously the null error term and the
280
+ quantity carrying the signal.
281
+ \end{remark}
282
+
283
+ \section{Clonal redundancy and over-dispersion}\label{sec:overdispersion}
284
+
285
+ \begin{proposition}[Collapsing restores Poisson]\label{prop:collapse}
286
+ Let the raw target carry clonotypes with multiplicities (clone sizes) $m_x$. Counting reads/cells in
287
+ the ball gives a compound-Poisson total $T=\sum_{k=1}^{K} m_k$ with $K\sim\Poisson(\lambda)$ and
288
+ $m_k$ i.i.d.\ $\sim G$, so $\E T=\lambda\mu_G$ and $\Var T=\lambda\,\E[m^2]$, with over-dispersion
289
+ index $\Var T/\E T=\E[m^2]/\mu_G\ge 1$. Collapsing to unique clonotypes is the projection $G\equiv1$,
290
+ which removes multiplicity-driven over-dispersion and returns the Poisson count $W$ of
291
+ \S\ref{sec:poisson}. We therefore deduplicate $C$ and $D$ to unique clonotypes by default.
292
+ \end{proposition}
293
+
294
+ \begin{proposition}[Negative-binomial robustness check]\label{prop:nb}
295
+ If multiplicities must be modelled (e.g.\ read-level tests) and $G$ is logarithmic (log-series), $T$ is
296
+ negative-binomial; report the NB tail $\Prob(\mathrm{NB}\ge w)$ with mean $\lambda\mu_G$ and
297
+ dispersion estimated from observed clone sizes. Under $H_1$ antigen-driven clones are stochastically
298
+ larger, so power is retained; the collapsed Poisson test remains the assumption-light default.
299
+ \end{proposition}
300
+
301
+ \begin{proposition}[tf--idf is self-information weighting]\label{prop:tfidf}
302
+ Weighting each target hit $x$ by its background self-information $w(x) = -\log\Pzero(\ball{\theta}{x})$
303
+ makes the expected per-hit contribution constant under $H_0$; the inverse-document-frequency weight
304
+ is exactly the inverse background ball mass, and the term frequency is the clone multiplicity. In the
305
+ rare regime the control-set E-value satisfies $E \approx e^{-\sum_{x} \mathrm{idf}(x)}$, so the
306
+ ``control-set'' and ``tf--idf'' approaches to redundancy are one object.
307
+ \end{proposition}
308
+
309
+ \section{The E-value and multiple testing}\label{sec:evalue}
310
+
311
+ \begin{definition}[E-value]\label{def:evalue}
312
+ For a query family $\mathcal{Q}$, the expected number of background hits is
313
+ \begin{equation}\label{eq:evalue}
314
+ E_{\mathrm{tot}}(\theta) = \E_{H_0}[\#\text{hits}] = \sum_{q\in\mathcal{Q}} N\,\pi_0(q,\theta)
315
+ = \sum_{q\in\mathcal{Q}} \lambda(q,\theta).
316
+ \end{equation}
317
+ The per-query specialization $\mathcal{Q}=\{q\}$ gives the BLAST-convention E-value
318
+ $E(q,\theta)=N\pi_0(q,\theta)$, estimated by
319
+ \begin{equation}\label{eq:Ehat}
320
+ \widehat E(q,\theta) = \frac{N}{M}\, n_C(q,\theta), \qquad p_{\mathrm{any}} = 1-e^{-\widehat E}.
321
+ \end{equation}
322
+ \end{definition}
323
+
324
+ \begin{proposition}[Assumption-free expectation]\label{prop:linearity}
325
+ Equation~\eqref{eq:evalue} holds by linearity of expectation regardless of any dependence among
326
+ hits (clonal, convergent, or across $\mathcal{Q}$). Consequently $\Prob(\#\text{false hits}\ge 1)\le
327
+ E_{\mathrm{tot}}$ by Markov's inequality, and $E_{\mathrm{tot}}$ bounds the expected number of false
328
+ discoveries. This robustness---no independence needed for the \emph{mean}---is why the E-value, not
329
+ the Poisson tail, is the primary report.
330
+ \end{proposition}
331
+
332
+ \begin{proposition}[Family-wise and false-discovery control]\label{prop:multiple}
333
+ Two thresholding regimes for a family of $|\mathcal{Q}|$ tested queries:
334
+ \begin{enumerate}
335
+ \item \emph{E-value / Bonferroni (FWER).} Reporting every query with
336
+ $\widehat E(q,\theta)\le\alpha/|\mathcal{Q}|$ controls the family-wise error rate at level $\alpha$:
337
+ by Proposition~\ref{prop:linearity} the expected number of false positives is
338
+ $\sum_q \widehat E \le \alpha$, and $\Prob(\ge 1\text{ false positive})\le\alpha$ by Markov. No
339
+ independence is required. A fixed E-value cutoff (e.g.\ $\widehat E\le 1$, the BLAST default) is the
340
+ $\alpha=|\mathcal{Q}|$ case and bounds the \emph{expected count} of false positives \emph{per query} by $1$.
341
+ \item \emph{p-value / Benjamini--Hochberg (FDR).} Using the per-query enrichment p-values
342
+ $p(q,\theta)=\Prob(Z\ge n_D^{>}(q,\theta))$ from Corollary~\ref{cor:void}, the
343
+ Benjamini--Hochberg procedure~\cite{Benjamini1995}---sort $p_{(1)}\le\dots\le p_{(|\mathcal{Q}|)}$,
344
+ find the largest $k$ with $p_{(k)}\le \tfrac{k}{|\mathcal{Q}|}\alpha$ and reject $p_{(1)},\dots,p_{(k)}$---controls the false
345
+ discovery rate at $\alpha$ under positive dependence of the test statistics, the relevant regime
346
+ here (convergent clusters induce positive correlation).
347
+ \end{enumerate}
348
+ \end{proposition}
349
+
350
+ \begin{proposition}[Detectability / minimum cluster size]\label{prop:power}
351
+ Under $H_1(q)$ let the antigen-driven excess add $k$ neighbours beyond the background mean
352
+ $\lambda=\lambda^{>}(q,\theta)$, so $n_D^{>}\approx\lambda+k$. The enrichment test at significance level
353
+ $\alpha$ rejects when $n_D^{>}\ge w_\alpha$, the smallest $w$ with
354
+ $\Prob(\Poisson(\lambda)\ge w)\le\alpha$. For small $\lambda$ (the typical rare-ball regime),
355
+ $w_\alpha$ grows only logarithmically, $w_\alpha \approx \dfrac{\log(1/\alpha)}{\log\log(1/\alpha)-\log\lambda}$
356
+ by the Poisson right tail, so a cluster of a handful of convergent neighbours is already detectable;
357
+ for moderate $\lambda$ the Gaussian approximation gives the familiar
358
+ $k \gtrsim z_{1-\alpha}\sqrt{\lambda}$. The control size enters only through the resolution of
359
+ $\widehat\lambda$ (\S\ref{sec:precision}): $M$ must be large enough that the sampling noise of
360
+ $\widehat E$ is below the excess $k$ being claimed.
361
+ \end{proposition}
362
+
363
+ \subsection{Epitope detection complexity}\label{sec:epitope-complexity}
364
+
365
+ Proposition~\ref{prop:power} concerns one query; in practice one samples a depth-$n$ repertoire and
366
+ asks how much of an epitope-specific response is recoverable. Let an epitope's TCR repertoire
367
+ $R_e$ have $K$ unique clonotypes and a within-set scope-$\theta$ neighbour graph with degree
368
+ distribution $\{d_x\}_{x\in R_e}$ and neighbour density
369
+ $\rho = \tfrac{1}{K(K-1)}\sum_x d_x = \overline{d}/(K-1)$ (the probability that two random members of
370
+ $R_e$ are within $\theta$).
371
+
372
+ \begin{proposition}[Detection curve from the degree law]\label{prop:epitope}
373
+ Draw $n$ clonotypes uniformly without replacement from $R_e$. A sampled node of full-set degree $d_x$ retains in expectation
374
+ $d_x\,(n-1)/(K-1)$ of its neighbours (hypergeometric sampling). Against the near-empty background
375
+ ball ($\lambda\approx0$, so $w_\alpha$ is $O(1)$, Proposition~\ref{prop:power}), the node is detected
376
+ once this exceeds a level $d_{\min}(\alpha)=O(1)$, i.e.\ at sampling depth
377
+ \begin{equation}\label{eq:nstar}
378
+ n^{*}_x \approx 1 + d_{\min}\,\frac{K-1}{d_x},
379
+ \end{equation}
380
+ and the detectable fraction at depth $n$ is
381
+ \begin{equation}\label{eq:phi}
382
+ \varphi(n) = \frac{1}{K}\,\#\Big\{x : d_x \ge d_{\min}\tfrac{K-1}{n-1}\Big\},
383
+ \end{equation}
384
+ fixed entirely by the degree law. Equivalently the expected number of within-sample neighbour pairs
385
+ is $\binom{n}{2}\rho$, so the first significant pair appears near $n\approx\sqrt{2/\rho}$. The
386
+ \emph{detection complexity} of $R_e$---the depth to recover a target fraction of the response---is
387
+ therefore set by the upper tail of $\{d_x\}$ (equivalently by $\rho$ and the largest cluster): a
388
+ repertoire dominated by one large convergent cluster is detected at small $n$, a diverse repertoire
389
+ of many near-singletons requires deep sampling.
390
+ \end{proposition}
391
+
392
+ \begin{remark}[Worked example: A*02 NLV vs GIL]\label{rem:nlvgil}
393
+ Measured on VDJdb TRB / HLA-A*02 repertoires against a $10^6$-sequence OLGA background at scope
394
+ $\theta=1$ substitution: \textbf{GIL} (GILGFVFTL, influenza M1; $K=5236$) has
395
+ $\rho = 3.4\times10^{-4}$, max degree $52$, and one dominant component of $896$ ($17\%$ of the set);
396
+ \textbf{NLV} (NLVPMVATV, CMV pp65; $K=13044$) has $\rho = 2.8\times10^{-5}$ ($\approx12\times$
397
+ sparser), max degree $22$, and a largest component of only $152$ ($1.2\%$). Equation~\eqref{eq:phi}
398
+ then predicts---and the subsampled Benjamini--Hochberg significant fraction confirms (see
399
+ \texttt{bench/bench\_epitope.py})---that GIL is $\sim$20--30\% recovered by $n\sim10^3$ sampled TCRs
400
+ while NLV stays below $5\%$ even at $n\sim5\times10^3$. The two epitopes have detection complexities
401
+ differing by more than an order of magnitude purely from repertoire structure, with no change to the
402
+ search or the background.
403
+ \end{remark}
404
+
405
+ \section{How large must the control be?}\label{sec:precision}
406
+
407
+ Since $M\hat\pi\sim\Binomial(M,\pi_0)$, $\Var\hat\pi=\pi_0(1-\pi_0)/M$ and the relative error is
408
+ $\mathrm{CV}(\hat\pi)\approx (M\pi_0)^{-1/2}$.
409
+ \begin{proposition}[Resolution]\label{prop:resolution}
410
+ To resolve a target E-value $E^\ast=N\pi_0$ to relative error $\rho$ (a tolerance here, not the neighbour density of \S\ref{sec:epitope-complexity}) requires
411
+ \begin{equation}\label{eq:resolution}
412
+ M \gtrsim \frac{1}{\rho^2\pi_0} = \frac{N}{\rho^2 E^\ast}.
413
+ \end{equation}
414
+ Resolving $E^\ast\sim 1$ to $10\%$ thus needs $M\sim 100\,N$.
415
+ \end{proposition}
416
+ \begin{proposition}[Empty-ball regime]\label{prop:zero}
417
+ If $n_C=0$, the point estimate $\hat\pi=0$ is degenerate; use the rule of three $\pi_0 \lesssim 3/M$
418
+ (95\%), or a $\mathrm{Beta}(n_C+a,\,M-n_C+b)$ posterior, which propagates control uncertainty into a
419
+ Poisson--Gamma (negative-binomial) posterior-predictive p-value for $n_D$. The implementation reports
420
+ the rule-of-three upper bound $\widehat E \le 3N/M$ when $n_C=0$.
421
+ \end{proposition}
422
+ When $M$ is inadequate for a rare $q$, the analytic $\hat\pi_{\mathrm{gen}}$ is exact per query for
423
+ any $M$ and serves as the fallback (Lemma~\ref{lem:hierarchy}).
424
+
425
+ \section{Composition and length are handled automatically}\label{sec:composition}
426
+
427
+ \begin{proposition}\label{prop:composition}
428
+ Because $\pi_0(q,\theta)=\Pzero(\ball{\theta}{q})$ is computed for the \emph{specific} query $q$, the
429
+ control estimator $n_C(q,\theta)/M$ conditions on $q$'s length and composition automatically: the
430
+ same biases that make $q$ common make $n_C$ large. This is the finite-sample, composition-exact
431
+ analogue of the Karlin--Altschul $K\,mn$ length normalization, which is needed precisely because the
432
+ i.i.d.\ background is query-independent. The only caveat is statistical: rare $q$ require adequate
433
+ $M$ (\S\ref{sec:precision}), else fall back to $\hat\pi_{\mathrm{gen}}$.
434
+ \end{proposition}
435
+
436
+ \section{The closest hit: an extreme-value law}\label{sec:gumbel}
437
+
438
+ \begin{theorem}[Poisson $\Rightarrow$ Gumbel]\label{thm:gumbel}
439
+ Let $\lambda(q,t)=N\,\Pzero(\ball{t}{q})$. By the Poisson approximation applied at each radius,
440
+ \begin{equation}
441
+ \Prob\!\big(S_{\min}(q) > t\big) \approx e^{-\lambda(q,t)} = e^{-N\Pzero(\ball{t}{q})}.
442
+ \end{equation}
443
+ If $\log\Pzero(\ball{t}{q}) \approx a + \beta t$ (log-linear ball-mass growth, the generic regime),
444
+ then the best score $Y=-S_{\min}$, centred at $u_N=(\log N + a)/\beta$, obeys
445
+ $\Prob(Y-u_N\le y) \to \exp(-e^{-\beta y})$, a Gumbel law with scale $1/\beta$. Here $\beta$ is the
446
+ \emph{empirical} ball-mass log-slope (regress $\log n_C(q,t)$ on $t$), not the Karlin--Altschul
447
+ $\lambda^\ast$. (For lattice scores the Gumbel carries the usual periodic correction.)
448
+ \end{theorem}
449
+
450
+ \section{Relation to Karlin--Altschul}\label{sec:ka}
451
+
452
+ \begin{theorem}[KA is the product-measure, ungapped case]\label{thm:reduction}
453
+ If $\Pzero=\bigotimes_\ell p$ is a product measure and $s$ is the ungapped additive score, then
454
+ $\Pzero(\ball{\theta}{q})$ factorizes and, by Cram\'er's theorem, $-\frac1{|q|}\log
455
+ \Pzero(\ball{\theta}{q}) \to \lambda^\ast$, the Karlin--Altschul parameter solving $\sum_{ij}p_ip_j
456
+ e^{\lambda^\ast s_{ij}}=1$. The intensity $\lambda(q,\theta)=N\Pzero(\ball{\theta}{q})$ then reduces
457
+ to $E=K\,m\,n\,e^{-\lambda^\ast S}$ with $K$ the Poisson-clumping constant, recovering
458
+ \cite{Karlin1990,Karlin1993,Altschul1997}.
459
+ \end{theorem}
460
+ Thus the present framework generalizes Karlin--Altschul in three ways: (i) the product measure
461
+ $\bigotimes p$ is replaced by the empirical/generative background $\Pzero$; (ii) gaps and
462
+ matrix-weighted balls are admitted via the engine's score; (iii) the asymptotic constants
463
+ $K,\lambda^\ast$ are replaced by a finite-$N$, finite-$M$ non-asymptotic error bound
464
+ (Theorem~\ref{thm:cs}).
465
+
466
+ \section{The epitope case: a limitation}\label{sec:epitope}
467
+
468
+ \begin{remark}
469
+ For TCR CDR3, $\Pzero$ is generation/repertoire-driven and a healthy-donor control instantiates it.
470
+ For \emph{epitopes} (MHC-presented peptides) the relevant background is presentation, not V(D)J
471
+ generation. The machinery of \S\S\ref{sec:setup}--\ref{sec:gumbel} applies verbatim with
472
+ $\Pzero:=\Pzero^{\mathrm{pep}}$ and a presented-peptide control, but: (L1) there is no closed-form
473
+ V(D)J-style generation model $\Pgen$, so only the empirical estimator survives; (L2) presentation is HLA-restricted,
474
+ so $\Pzero^{\mathrm{pep}}$ is allele-conditional and the control must be HLA-matched or marginalized
475
+ over an HLA frequency distribution; (L3) anchor-residue structure argues for position-weighted ball
476
+ geometry. We therefore claim soundness for epitopes only to the extent that a faithful
477
+ presented-peptide control is available; no generation-based null is claimed there.
478
+ \end{remark}
479
+
480
+ \section{Practical defaults and algorithm}\label{sec:practice}
481
+
482
+ \begin{enumerate}
483
+ \item Deduplicate $C$ and $D$ to unique clonotypes (Proposition~\ref{prop:collapse}).
484
+ \item Build a \texttt{seqtree} index of $C$; for each query compute $n_C$ and $n_D$ at scope
485
+ $\theta$ via batched search. When the query may itself be in $D$ or $C$, use the punctured counts
486
+ $n^{>}$ that drop distance-zero (exact/self) hits (Lemma~\ref{lem:selfmatch}).
487
+ \item Report $\widehat E=(N/M)\,n_C^{>}$ (Eq.~\eqref{eq:Ehat}), $p_{\mathrm{any}}=1-e^{-\widehat E}$,
488
+ and $p_{\mathrm{enrich}}=\Prob(\Poisson(\widehat E)\ge n_D^{>})$; use the rule of three when
489
+ $n_C^{>}=0$ (Proposition~\ref{prop:zero}).
490
+ \item Across a query family, threshold on $\widehat E$ for FWER control or apply
491
+ Benjamini--Hochberg to the $p_{\mathrm{enrich}}$ for FDR control (Proposition~\ref{prop:multiple}).
492
+ \item Validate the Poisson regime via the pairwise co-occupancy estimate of $b_2$; if inflated,
493
+ use the negative-binomial check (Proposition~\ref{prop:nb}).
494
+ \item Size the control by Eq.~\eqref{eq:resolution}; fall back to the model-based
495
+ $\hat\pi_{\mathrm{gen}}=\sum_{\ball{\theta}{q}} q\,\Pgen$ (Murugan model, thymic factor
496
+ $q\approx1/2.7$) for rare queries.
497
+ \end{enumerate}
498
+
499
+ This is implemented in \texttt{seqtree.evalues} (with \texttt{exclude\_exact} for the punctured
500
+ counts), a thin layer over batched search; the control loader \texttt{seqtree.load\_control} supplies
501
+ a deduplicated background.
502
+
503
+ \bibliographystyle{plain}
504
+ \bibliography{refs}
505
+
506
+ \end{document}