kmate 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kmate-0.1.0/LICENSE +21 -0
- kmate-0.1.0/PKG-INFO +153 -0
- kmate-0.1.0/README.md +137 -0
- kmate-0.1.0/pyproject.toml +30 -0
- kmate-0.1.0/setup.cfg +4 -0
- kmate-0.1.0/src/kmate/__init__.py +7 -0
- kmate-0.1.0/src/kmate/block_em.py +295 -0
- kmate-0.1.0/src/kmate/block_haplotype_em.py +85 -0
- kmate-0.1.0/src/kmate/build_kmer_db.py +34 -0
- kmate-0.1.0/src/kmate/build_kmer_pa.py +304 -0
- kmate-0.1.0/src/kmate/build_var_pa.py +131 -0
- kmate-0.1.0/src/kmate/cli.py +51 -0
- kmate-0.1.0/src/kmate/data/selftest/kmer_pa_Chr1.kmer_pa.npz +0 -0
- kmate-0.1.0/src/kmate/data/selftest/kmer_pa_Chr1.meta.npz +0 -0
- kmate-0.1.0/src/kmate/data/selftest/reads.fq.gz +0 -0
- kmate-0.1.0/src/kmate/data/selftest/truth_h.tsv +6 -0
- kmate-0.1.0/src/kmate/data/selftest/var_Chr1.meta.npz +0 -0
- kmate-0.1.0/src/kmate/data/selftest/var_Chr1.var_called.npz +0 -0
- kmate-0.1.0/src/kmate/data/selftest/var_Chr1.var_pa.npz +0 -0
- kmate-0.1.0/src/kmate/em_solver.py +125 -0
- kmate-0.1.0/src/kmate/filter_kmer_pa_production.py +98 -0
- kmate-0.1.0/src/kmate/kmer_count.py +272 -0
- kmate-0.1.0/src/kmate/per_sample_per_chrom.py +429 -0
- kmate-0.1.0/src/kmate/selftest.py +111 -0
- kmate-0.1.0/src/kmate.egg-info/PKG-INFO +153 -0
- kmate-0.1.0/src/kmate.egg-info/SOURCES.txt +30 -0
- kmate-0.1.0/src/kmate.egg-info/dependency_links.txt +1 -0
- kmate-0.1.0/src/kmate.egg-info/entry_points.txt +2 -0
- kmate-0.1.0/src/kmate.egg-info/requires.txt +3 -0
- kmate-0.1.0/src/kmate.egg-info/top_level.txt +1 -0
- kmate-0.1.0/tests/test_genomewide_validation.py +181 -0
- kmate-0.1.0/tests/test_kmer_count.py +75 -0
kmate-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Tatiana Bellagio
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
kmate-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kmate
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: k-mer-based founder-mixture frequency estimation for pool-seq
|
|
5
|
+
Author: Tatiana Bellagio
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Tatianabellagio/kMate
|
|
8
|
+
Keywords: pool-seq,k-mer,allele-frequency,pangenome,EM,GrENE-net
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: numpy
|
|
13
|
+
Requires-Dist: scipy
|
|
14
|
+
Requires-Dist: pysam
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
<p align="left">
|
|
18
|
+
<img src="assets/kMate_logo.png" alt="kMate" width="420">
|
|
19
|
+
</p>
|
|
20
|
+
|
|
21
|
+
# kMate
|
|
22
|
+
|
|
23
|
+
[License: MIT](LICENSE)
|
|
24
|
+
|
|
25
|
+
Per-sample, per-record **allele-frequency estimation from pooled sequencing** against a
|
|
26
|
+
multi-founder reference panel. kMate runs a weighted k-mer Poisson EM on the founder
|
|
27
|
+
simplex to estimate founder frequencies (`h`), then projects through a per-record
|
|
28
|
+
presence/absence matrix (`var_pa`, the founder × variant alt-allele matrix $V_\mathrm{pa}$)
|
|
29
|
+
to allele frequencies for **SNPs, indels, and SVs in a single pass**, with no per-variant genotyping.
|
|
30
|
+
|
|
31
|
+
## Overview
|
|
32
|
+
|
|
33
|
+
<p align="center">
|
|
34
|
+
<a href="poster_PEQG/poster_peqg.pdf">
|
|
35
|
+
<img src="assets/poster_peqg.png" alt="kMate PEQG 2026 poster: tracking structural-variant trajectories across climates with alignment-free allele-frequency estimation" width="900">
|
|
36
|
+
</a>
|
|
37
|
+
</p>
|
|
38
|
+
|
|
39
|
+
The picture above (our [PEQG 2026 poster](poster_PEQG/poster_peqg.pdf), click to enlarge) walks through the whole idea: the
|
|
40
|
+
GrENE-Net experiment evolved an equal mixture of **231 *Arabidopsis* founders** at 43 climate
|
|
41
|
+
sites over 3 years, pool-sequencing the surviving populations each generation. kMate takes those
|
|
42
|
+
pooled k-mer counts, solves a Poisson EM for the founder mixture against the panel's
|
|
43
|
+
`kmer_pa`/`var_pa` matrices, and reads out per-record allele frequencies for **SNPs *and* SVs** at
|
|
44
|
+
once. Benchmarked against simulated pools at 10× coverage, estimates track the truth closely, letting
|
|
45
|
+
us follow structural-variant frequency trajectories across climates (e.g. a 181-bp insertion in
|
|
46
|
+
the cold-regulated *COR413-PM2* gene, rising in cold gardens and falling in warm ones).
|
|
47
|
+
|
|
48
|
+
## Install
|
|
49
|
+
|
|
50
|
+
Create the `kmate` environment (mamba or conda) with its dependencies, then install the package:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
git clone https://github.com/Tatianabellagio/kMate.git
|
|
54
|
+
cd kMate
|
|
55
|
+
mamba create -n kmate -c conda-forge -c bioconda python numpy scipy pysam jellyfish samtools
|
|
56
|
+
mamba activate kmate
|
|
57
|
+
pip install -e . # installs the `kmate` command (no compilation step)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Python deps: `numpy`, `scipy`, `pysam`. kMate also calls `jellyfish` (k-mer counting) and `samtools` (read handling), both installed by the `mamba create` above. This gives you a `kmate` command with subcommands (`kmate --help`).
|
|
61
|
+
|
|
62
|
+
### Verify the install
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
kmate selftest
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
This runs the bundled tiny fixture (a real Chr1 panel slice + a simulated 5-founder pool) end-to-end — exercising the full k-mer-count → EM → AF-projection path through `jellyfish`/`samtools` — and checks that the planted founder mixture is recovered. It takes a few seconds, needs no network, and prints `PASS` on a correct install. Run this **before** pointing kMate at your own data.
|
|
69
|
+
|
|
70
|
+
## Usage
|
|
71
|
+
|
|
72
|
+
kMate processes **one pooled sample at a time, per chromosome**.
|
|
73
|
+
|
|
74
|
+
**You need**
|
|
75
|
+
- **Pooled reads**: paired FASTQ (`R1.fq R2.fq`) of one pool/sample.
|
|
76
|
+
- **A reference panel** encoded as per-chromosome matrices: `kmer_pa` (k-mer × founder presence/absence), `var_pa` (founder × variant alt-allele), and record `meta`. Built once from your founders' phased VCF (see [Building a panel](#building-a-panel)). The 231-founder *Arabidopsis thaliana* panel used by GrENE-Net is available on request; the matrix files are large and are not stored in the Git repo.
|
|
77
|
+
|
|
78
|
+
**Run**
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
kmate run \
|
|
82
|
+
--kmer-pa-prefix data/kmer_pa_231_arch3_filt2inv/kmer_pa \
|
|
83
|
+
--var-pa panel/arch3/chr1/var_pa_231_arch3_chr1.var_pa.npz \
|
|
84
|
+
--var-called panel/arch3/chr1/var_pa_231_arch3_chr1.var_called.npz \
|
|
85
|
+
--var-meta panel/arch3/chr1/var_pa_231_arch3_chr1.meta.npz \
|
|
86
|
+
--reads R1.fq R2.fq --sample MYSAMPLE --out MYSAMPLE.tsv \
|
|
87
|
+
--threads 8 --chroms Chr1 --kmer-weight inv_mb --block-mode global
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
(`kmate run --help` lists every flag. Existing scripts that call `python src/per_sample_per_chrom.py ...` still work via thin shims that forward to the package.)
|
|
91
|
+
|
|
92
|
+
**Estimator mode** (`--block-mode`)
|
|
93
|
+
- `global`: one founder mixture per chromosome. Use for **selfing / inbred / founder (F0)** pools.
|
|
94
|
+
- `window`: per-window mixture with HMM smoothing, for **recombinant** pools. `--block-mode window` alone reproduces the production "star2" recipe (10 kb windows, 5 smoothing passes).
|
|
95
|
+
|
|
96
|
+
**Output**: a per-record TSV, one row per panel variant (SNP / indel / SV):
|
|
97
|
+
|
|
98
|
+
| chrom | pos | ref_len | alt_len | alt_freq | info | n_called | se |
|
|
99
|
+
|---|---|---|---|---|---|---|---|
|
|
100
|
+
|
|
101
|
+
`alt_freq` is the estimated alternate-allele frequency in the pool; `n_called` and `se` carry support/uncertainty. (`--var-called` adds a per-record called-mask; `--kmer-db` lets you count k-mers once and query per-chrom instead of re-scanning reads; `--hash-size` tunes the Jellyfish hash, e.g. lower it to `100M` on memory-capped jobs.)
|
|
102
|
+
|
|
103
|
+
## How it works
|
|
104
|
+
|
|
105
|
+
From the read k-mer spectrum, kMate solves a weighted **Poisson EM on the 231-founder
|
|
106
|
+
simplex** for the founder mixture `h`, then projects `h` through the panel's
|
|
107
|
+
presence/absence matrix `var_pa` to a per-record allele frequency for SNPs, indels and SVs
|
|
108
|
+
together, in one pass. Processing one chromosome at a time keeps peak memory ~5× below a
|
|
109
|
+
genome-wide solve (per-chrom `h` agrees to ~0.1%). Full math + code wiring: [`ALGORITHM.md`](ALGORITHM.md).
|
|
110
|
+
|
|
111
|
+
## Building a panel
|
|
112
|
+
|
|
113
|
+
To run kMate on your own founder set you build the panel matrices once from a multi-founder
|
|
114
|
+
**phased VCF**: `var_pa` from the founder genotypes and `kmer_pa` from a k-mer index of the
|
|
115
|
+
founders. The builders live in [`panel/`](panel/) and [`data/`](data/). The
|
|
116
|
+
231-founder *Arabidopsis* panel (used by GrENE-Net) and its exact construction are documented
|
|
117
|
+
in [`docs/PIPELINE_STATE.md`](docs/PIPELINE_STATE.md) §0.
|
|
118
|
+
|
|
119
|
+
## Repository layout
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
src/kmate/ the kMate package (em_solver, kmer_count, block_em, per_sample_per_chrom, cli, selftest)
|
|
123
|
+
pyproject.toml, conda/ packaging: pip-installable `kmate` CLI + conda recipe
|
|
124
|
+
panel/ founder-panel construction (var_pa builders, k-mer index)
|
|
125
|
+
data/ prebuilt panel matrices (kmer_pa_*, var_pa_*) + sample lists
|
|
126
|
+
grenenet/ GrENE-Net application: production scale-out over the evolved cohort
|
|
127
|
+
benchmarks/ end-to-end accuracy benchmarks (p80 control, p231 headline)
|
|
128
|
+
sims/ pool-seq simulation framework (AF truth); see sims/README.md
|
|
129
|
+
docs/ methods + analysis writeups
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Documentation
|
|
133
|
+
|
|
134
|
+
| Doc | What it is |
|
|
135
|
+
|---|---|
|
|
136
|
+
| [`docs/PIPELINE_STATE.md`](docs/PIPELINE_STATE.md) | Production inputs, run recipe, and environment; the project source of truth. |
|
|
137
|
+
| [`ALGORITHM.md`](ALGORITHM.md) | The kMate algorithm, math, and code wiring. |
|
|
138
|
+
| [`BACKGROUND.md`](BACKGROUND.md) | Project framing, known biases, and design decisions. |
|
|
139
|
+
| [`SAVIO_HPC.md`](SAVIO_HPC.md) | Cluster ops (partitions, sbatch recipes). |
|
|
140
|
+
|
|
141
|
+
## Using kMate
|
|
142
|
+
|
|
143
|
+
kMate is not yet published as a standalone method. If you are interested in using kMate
|
|
144
|
+
for your project, or in collaborating, please get in touch:
|
|
145
|
+
|
|
146
|
+
**Tatiana Bellagio** (tatianabellagio@gmail.com)
|
|
147
|
+
|
|
148
|
+
kMate was developed for, and underlies the allele-frequency analyses of, the GrENE-Net
|
|
149
|
+
outdoor evolution experiment in *Arabidopsis thaliana*.
|
|
150
|
+
|
|
151
|
+
## License
|
|
152
|
+
|
|
153
|
+
Released under the [MIT License](LICENSE).
|
kmate-0.1.0/README.md
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
<p align="left">
|
|
2
|
+
<img src="assets/kMate_logo.png" alt="kMate" width="420">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
# kMate
|
|
6
|
+
|
|
7
|
+
[License: MIT](LICENSE)
|
|
8
|
+
|
|
9
|
+
Per-sample, per-record **allele-frequency estimation from pooled sequencing** against a
|
|
10
|
+
multi-founder reference panel. kMate runs a weighted k-mer Poisson EM on the founder
|
|
11
|
+
simplex to estimate founder frequencies (`h`), then projects through a per-record
|
|
12
|
+
presence/absence matrix (`var_pa`, the founder × variant alt-allele matrix $V_\mathrm{pa}$)
|
|
13
|
+
to allele frequencies for **SNPs, indels, and SVs in a single pass**, with no per-variant genotyping.
|
|
14
|
+
|
|
15
|
+
## Overview
|
|
16
|
+
|
|
17
|
+
<p align="center">
|
|
18
|
+
<a href="poster_PEQG/poster_peqg.pdf">
|
|
19
|
+
<img src="assets/poster_peqg.png" alt="kMate PEQG 2026 poster: tracking structural-variant trajectories across climates with alignment-free allele-frequency estimation" width="900">
|
|
20
|
+
</a>
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
The picture above (our [PEQG 2026 poster](poster_PEQG/poster_peqg.pdf), click to enlarge) walks through the whole idea: the
|
|
24
|
+
GrENE-Net experiment evolved an equal mixture of **231 *Arabidopsis* founders** at 43 climate
|
|
25
|
+
sites over 3 years, pool-sequencing the surviving populations each generation. kMate takes those
|
|
26
|
+
pooled k-mer counts, solves a Poisson EM for the founder mixture against the panel's
|
|
27
|
+
`kmer_pa`/`var_pa` matrices, and reads out per-record allele frequencies for **SNPs *and* SVs** at
|
|
28
|
+
once. Benchmarked against simulated pools at 10× coverage, estimates track the truth closely, letting
|
|
29
|
+
us follow structural-variant frequency trajectories across climates (e.g. a 181-bp insertion in
|
|
30
|
+
the cold-regulated *COR413-PM2* gene, rising in cold gardens and falling in warm ones).
|
|
31
|
+
|
|
32
|
+
## Install
|
|
33
|
+
|
|
34
|
+
Create the `kmate` environment (mamba or conda) with its dependencies, then install the package:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
git clone https://github.com/Tatianabellagio/kMate.git
|
|
38
|
+
cd kMate
|
|
39
|
+
mamba create -n kmate -c conda-forge -c bioconda python numpy scipy pysam jellyfish samtools
|
|
40
|
+
mamba activate kmate
|
|
41
|
+
pip install -e . # installs the `kmate` command (no compilation step)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Python deps: `numpy`, `scipy`, `pysam`. kMate also calls `jellyfish` (k-mer counting) and `samtools` (read handling), both installed by the `mamba create` above. This gives you a `kmate` command with subcommands (`kmate --help`).
|
|
45
|
+
|
|
46
|
+
### Verify the install
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
kmate selftest
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
This runs the bundled tiny fixture (a real Chr1 panel slice + a simulated 5-founder pool) end-to-end — exercising the full k-mer-count → EM → AF-projection path through `jellyfish`/`samtools` — and checks that the planted founder mixture is recovered. It takes a few seconds, needs no network, and prints `PASS` on a correct install. Run this **before** pointing kMate at your own data.
|
|
53
|
+
|
|
54
|
+
## Usage
|
|
55
|
+
|
|
56
|
+
kMate processes **one pooled sample at a time, per chromosome**.
|
|
57
|
+
|
|
58
|
+
**You need**
|
|
59
|
+
- **Pooled reads**: paired FASTQ (`R1.fq R2.fq`) of one pool/sample.
|
|
60
|
+
- **A reference panel** encoded as per-chromosome matrices: `kmer_pa` (k-mer × founder presence/absence), `var_pa` (founder × variant alt-allele), and record `meta`. Built once from your founders' phased VCF (see [Building a panel](#building-a-panel)). The 231-founder *Arabidopsis thaliana* panel used by GrENE-Net is available on request; the matrix files are large and are not stored in the Git repo.
|
|
61
|
+
|
|
62
|
+
**Run**
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
kmate run \
|
|
66
|
+
--kmer-pa-prefix data/kmer_pa_231_arch3_filt2inv/kmer_pa \
|
|
67
|
+
--var-pa panel/arch3/chr1/var_pa_231_arch3_chr1.var_pa.npz \
|
|
68
|
+
--var-called panel/arch3/chr1/var_pa_231_arch3_chr1.var_called.npz \
|
|
69
|
+
--var-meta panel/arch3/chr1/var_pa_231_arch3_chr1.meta.npz \
|
|
70
|
+
--reads R1.fq R2.fq --sample MYSAMPLE --out MYSAMPLE.tsv \
|
|
71
|
+
--threads 8 --chroms Chr1 --kmer-weight inv_mb --block-mode global
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
(`kmate run --help` lists every flag. Existing scripts that call `python src/per_sample_per_chrom.py ...` still work via thin shims that forward to the package.)
|
|
75
|
+
|
|
76
|
+
**Estimator mode** (`--block-mode`)
|
|
77
|
+
- `global`: one founder mixture per chromosome. Use for **selfing / inbred / founder (F0)** pools.
|
|
78
|
+
- `window`: per-window mixture with HMM smoothing, for **recombinant** pools. `--block-mode window` alone reproduces the production "star2" recipe (10 kb windows, 5 smoothing passes).
|
|
79
|
+
|
|
80
|
+
**Output**: a per-record TSV, one row per panel variant (SNP / indel / SV):
|
|
81
|
+
|
|
82
|
+
| chrom | pos | ref_len | alt_len | alt_freq | info | n_called | se |
|
|
83
|
+
|---|---|---|---|---|---|---|---|
|
|
84
|
+
|
|
85
|
+
`alt_freq` is the estimated alternate-allele frequency in the pool; `n_called` and `se` carry support/uncertainty. (`--var-called` adds a per-record called-mask; `--kmer-db` lets you count k-mers once and query per-chrom instead of re-scanning reads; `--hash-size` tunes the Jellyfish hash, e.g. lower it to `100M` on memory-capped jobs.)
|
|
86
|
+
|
|
87
|
+
## How it works
|
|
88
|
+
|
|
89
|
+
From the read k-mer spectrum, kMate solves a weighted **Poisson EM on the 231-founder
|
|
90
|
+
simplex** for the founder mixture `h`, then projects `h` through the panel's
|
|
91
|
+
presence/absence matrix `var_pa` to a per-record allele frequency for SNPs, indels and SVs
|
|
92
|
+
together, in one pass. Processing one chromosome at a time keeps peak memory ~5× below a
|
|
93
|
+
genome-wide solve (per-chrom `h` agrees to ~0.1%). Full math + code wiring: [`ALGORITHM.md`](ALGORITHM.md).
|
|
94
|
+
|
|
95
|
+
## Building a panel
|
|
96
|
+
|
|
97
|
+
To run kMate on your own founder set you build the panel matrices once from a multi-founder
|
|
98
|
+
**phased VCF**: `var_pa` from the founder genotypes and `kmer_pa` from a k-mer index of the
|
|
99
|
+
founders. The builders live in [`panel/`](panel/) and [`data/`](data/). The
|
|
100
|
+
231-founder *Arabidopsis* panel (used by GrENE-Net) and its exact construction are documented
|
|
101
|
+
in [`docs/PIPELINE_STATE.md`](docs/PIPELINE_STATE.md) §0.
|
|
102
|
+
|
|
103
|
+
## Repository layout
|
|
104
|
+
|
|
105
|
+
```
|
|
106
|
+
src/kmate/ the kMate package (em_solver, kmer_count, block_em, per_sample_per_chrom, cli, selftest)
|
|
107
|
+
pyproject.toml, conda/ packaging: pip-installable `kmate` CLI + conda recipe
|
|
108
|
+
panel/ founder-panel construction (var_pa builders, k-mer index)
|
|
109
|
+
data/ prebuilt panel matrices (kmer_pa_*, var_pa_*) + sample lists
|
|
110
|
+
grenenet/ GrENE-Net application: production scale-out over the evolved cohort
|
|
111
|
+
benchmarks/ end-to-end accuracy benchmarks (p80 control, p231 headline)
|
|
112
|
+
sims/ pool-seq simulation framework (AF truth); see sims/README.md
|
|
113
|
+
docs/ methods + analysis writeups
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Documentation
|
|
117
|
+
|
|
118
|
+
| Doc | What it is |
|
|
119
|
+
|---|---|
|
|
120
|
+
| [`docs/PIPELINE_STATE.md`](docs/PIPELINE_STATE.md) | Production inputs, run recipe, and environment; the project source of truth. |
|
|
121
|
+
| [`ALGORITHM.md`](ALGORITHM.md) | The kMate algorithm, math, and code wiring. |
|
|
122
|
+
| [`BACKGROUND.md`](BACKGROUND.md) | Project framing, known biases, and design decisions. |
|
|
123
|
+
| [`SAVIO_HPC.md`](SAVIO_HPC.md) | Cluster ops (partitions, sbatch recipes). |
|
|
124
|
+
|
|
125
|
+
## Using kMate
|
|
126
|
+
|
|
127
|
+
kMate is not yet published as a standalone method. If you are interested in using kMate
|
|
128
|
+
for your project, or in collaborating, please get in touch:
|
|
129
|
+
|
|
130
|
+
**Tatiana Bellagio** (tatianabellagio@gmail.com)
|
|
131
|
+
|
|
132
|
+
kMate was developed for, and underlies the allele-frequency analyses of, the GrENE-Net
|
|
133
|
+
outdoor evolution experiment in *Arabidopsis thaliana*.
|
|
134
|
+
|
|
135
|
+
## License
|
|
136
|
+
|
|
137
|
+
Released under the [MIT License](LICENSE).
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "kmate"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "k-mer-based founder-mixture frequency estimation for pool-seq"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Tatiana Bellagio" }]
|
|
13
|
+
keywords = ["pool-seq", "k-mer", "allele-frequency", "pangenome", "EM", "GrENE-net"]
|
|
14
|
+
# Python deps only. Non-Python runtime tools (jellyfish, samtools) are declared
|
|
15
|
+
# in the conda recipe (conda/meta.yaml), since pip cannot install them.
|
|
16
|
+
dependencies = ["numpy", "scipy", "pysam"]
|
|
17
|
+
|
|
18
|
+
[project.scripts]
|
|
19
|
+
kmate = "kmate.cli:main"
|
|
20
|
+
|
|
21
|
+
[project.urls]
|
|
22
|
+
Homepage = "https://github.com/Tatianabellagio/kMate"
|
|
23
|
+
|
|
24
|
+
[tool.setuptools.packages.find]
|
|
25
|
+
where = ["src"]
|
|
26
|
+
include = ["kmate*"]
|
|
27
|
+
|
|
28
|
+
[tool.setuptools.package-data]
|
|
29
|
+
# Ship the tiny end-to-end self-test fixture (`kmate selftest`).
|
|
30
|
+
kmate = ["data/selftest/*"]
|
kmate-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""kMate — k-mer-based founder-mixture frequency estimation for pool-seq.
|
|
2
|
+
|
|
3
|
+
Two estimators (`--block-mode global|window`), optional per-bubble k-mer
|
|
4
|
+
de-replication weighting (`--kmer-weight inv_mb`), and a count-once/query path
|
|
5
|
+
for scale-out. See `kmate --help` and the docs/ directory.
|
|
6
|
+
"""
|
|
7
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Per-LD-block founder-frequency estimation.
|
|
3
|
+
|
|
4
|
+
Architecture (HAFpipe / hapFIRE style):
|
|
5
|
+
For evolved pool samples, chromosomes are mosaic — different genomic
|
|
6
|
+
regions carry different ancestry mixtures because of recombination +
|
|
7
|
+
selection. A single genome-wide h discards this signal. Per-block h
|
|
8
|
+
captures local ancestry, and per-record alt_freq becomes:
|
|
9
|
+
|
|
10
|
+
alt_freq(record) = h_local[block(record)] @ var_pa[:, record]
|
|
11
|
+
|
|
12
|
+
Block definitions:
|
|
13
|
+
- 'window': fixed bp windows (default 200 kb). Simple; ignores LD.
|
|
14
|
+
- 'ld' (TODO): LD-cluster based on founder-genotype correlation
|
|
15
|
+
between adjacent bubbles. Variable-length, biologically correct.
|
|
16
|
+
hapFIRE uses BigLD.
|
|
17
|
+
|
|
18
|
+
For a window of W kb in Arabidopsis (selfing ~97%), expected founder
|
|
19
|
+
haplotypes per window is small (10–50 distinct), so EM is well-conditioned
|
|
20
|
+
even with K_window ~ 100 k-mers/founder.
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
import time
|
|
24
|
+
from dataclasses import dataclass
|
|
25
|
+
|
|
26
|
+
import numpy as np
|
|
27
|
+
from .em_solver import solve_em
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(repr=False)
class BlockSpec:
    """One genomic block on a single chromosome.

    ``start`` and ``end`` are 1-based coordinates. Instances print in the
    compact ``chrom:start-end`` form instead of the dataclass default.
    """
    chrom: str
    start: int
    end: int

    def __repr__(self):
        # Region-style label, e.g. "Chr1:1-200000".
        label = f"{self.chrom}:{self.start}-{self.end}"
        return label
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def define_windows(bubble_chrom, bubble_start, bubble_end,
                   window_bp=200_000, window_step: int | None = None):
    """Define fixed-bp windows covering all bubbles on every chromosome.

    On each chromosome, windows start at position 1 and are laid out every
    ``window_step`` bp until the largest ``bubble_end`` is covered.

    Args:
        bubble_chrom: per-bubble chromosome labels (array-like).
        bubble_start: per-bubble start positions. Not used by this function.
        bubble_end: per-bubble end positions (array-like); the per-chromosome
            maximum sets how far the windows extend.
        window_bp: width of each window in bp.
        window_step: step between window starts. If None or == window_bp,
            windows are disjoint (legacy behaviour). If < window_bp, windows
            overlap by (window_bp - window_step). E.g. window_bp=10000,
            window_step=3000 → 10 kb windows stepping every 3 kb, each k-mer
            in ~3-4 covering windows. Used for Route 2 (overlapping-window
            smoothing).

    Returns:
        List of BlockSpec objects, ordered by (chrom, start).

    Raises:
        ValueError: if ``window_bp`` or ``window_step`` is not > 0.
    """
    if window_step is None:
        window_step = window_bp
    if window_bp <= 0:
        # A non-positive width would give windows whose end precedes their
        # start, so nothing could ever be assigned to them.
        raise ValueError(f"window_bp must be > 0, got {window_bp}")
    if window_step <= 0:
        raise ValueError(f"window_step must be > 0, got {window_step}")

    # Accept plain sequences as well as arrays.
    bubble_chrom = np.asarray(bubble_chrom)
    bubble_end = np.asarray(bubble_end)

    blocks = []
    # np.unique returns the distinct labels already sorted.
    for chrom in np.unique(bubble_chrom):
        chrom_max = int(bubble_end[bubble_chrom == chrom].max())
        # Ceil-divide so the last window start is still <= chrom_max.
        n_w = (chrom_max + window_step - 1) // window_step
        for w in range(n_w):
            start = w * window_step + 1
            blocks.append(BlockSpec(chrom=chrom, start=start,
                                    end=start + window_bp - 1))
    return blocks
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def assign_kmers_to_blocks(kmer_bubble_id, bubble_chrom, bubble_start,
                           bubble_end, blocks):
    """For each k-mer, return its block index (or -1 if no block contains it).

    Every k-mer inherits the block of its bubble. A bubble belongs to the
    block on its chromosome whose inclusive [start, end] range contains the
    bubble centroid, ``(bubble_start + bubble_end) // 2``; a bubble that
    straddles a block boundary therefore goes to the block holding its
    midpoint. If several blocks contain the centroid (overlapping windows),
    the one that sorts first by (start, end) is used, so each k-mer gets
    exactly one index.

    Args:
        kmer_bubble_id: per-k-mer index into the bubble arrays.
        bubble_chrom: per-bubble chromosome labels.
        bubble_start: per-bubble start positions.
        bubble_end: per-bubble end positions.
        blocks: sequence of objects with ``chrom``, ``start`` and ``end``
            attributes; a block's index is its position in this sequence.

    Returns:
        int32 array with one block index per entry of ``kmer_bubble_id``.
    """
    bubble_chrom = np.asarray(bubble_chrom)
    centroid = (np.asarray(bubble_start) + np.asarray(bubble_end)) // 2

    # Group (start, end, block index) triples by chromosome.
    chrom_blocks = {}
    for i, b in enumerate(blocks):
        chrom_blocks.setdefault(b.chrom, []).append((b.start, b.end, i))

    bubble_block = np.full(len(bubble_chrom), -1, dtype=np.int32)
    # Compare chromosome labels as strings.
    chrom_labels = bubble_chrom.astype(str, copy=False)

    for chrom, items in chrom_blocks.items():
        on_chrom = chrom_labels == chrom
        if not on_chrom.any():
            continue
        items.sort()
        starts = np.array([s for s, _, _ in items])
        ends = np.array([e for _, e, _ in items])
        idxs = np.array([i for _, _, i in items])
        pos = centroid[on_chrom]

        if np.all(ends[1:] >= ends[:-1]):
            # Ends are non-decreasing in start order (true for fixed-width
            # windows), so the first block whose end reaches the centroid is
            # the only possible first match: every earlier block ends before
            # it, and every later block starts no earlier.
            first = np.searchsorted(ends, pos, side="left")
            cand = np.minimum(first, len(ends) - 1)
            hit = (first < len(ends)) & (starts[cand] <= pos)
            found = np.where(hit, idxs[cand], -1)
        else:
            # Irregular blocks: apply them in reverse sorted order so the
            # earliest containing block is written last and wins.
            found = np.full(len(pos), -1, dtype=np.int64)
            for s, e, idx in reversed(items):
                found[(pos >= s) & (pos <= e)] = idx
        bubble_block[on_chrom] = found

    # Map k-mers to blocks through their bubble.
    return bubble_block[kmer_bubble_id]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def assign_records_to_blocks(record_chrom, record_pos, blocks):
    """Return the block index of each var_pa record (-1 if none contains it).

    A record belongs to a block on its chromosome when its position lies in
    the block's inclusive [start, end] range. The candidate block is found by
    binary search on the sorted block starts, i.e. the block with the largest
    start that is <= the record position.
    """
    # (start, end, block index) triples, keyed by chromosome.
    per_chrom = {}
    for block_id, blk in enumerate(blocks):
        triple = (blk.start, blk.end, block_id)
        if blk.chrom in per_chrom:
            per_chrom[blk.chrom].append(triple)
        else:
            per_chrom[blk.chrom] = [triple]

    assignment = np.full(len(record_chrom), -1, dtype=np.int32)
    for chrom_name, triples in per_chrom.items():
        on_chrom = record_chrom == chrom_name
        if not on_chrom.any():
            continue
        # Column-wise arrays of the triples, ordered by (start, end, index).
        lo, hi, ids = (np.array(col) for col in zip(*sorted(triples)))
        positions = record_pos[on_chrom]
        # Last block whose start is <= position; clip keeps positions that
        # precede the first block indexable (they fail the range test below).
        cand = np.searchsorted(lo, positions, side="right") - 1
        cand = np.clip(cand, 0, len(lo) - 1)
        contained = (lo[cand] <= positions) & (positions <= hi[cand])
        assignment[on_chrom] = np.where(contained, ids[cand], -1)
    return assignment
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def solve_em_per_block(counts, kmer_pa_dense, kmer_block, n_blocks,
                       coverage, em_max_iter=200, tol=1e-7,
                       min_kmers_per_block=200, verbose=False,
                       n_workers=4,
                       global_anchor_weight: float = 0.0,
                       omega=None):
    """Run EM independently per block, with a global-h fallback.

    A single EM over all k-mers is solved first (``global_h``). Each block is
    then fitted on its own nonzero-count k-mers; blocks with too little data
    reuse ``global_h`` instead.

    Args:
        counts: K-vec of observed counts (already filtered to block coverage)
        kmer_pa_dense: F × K float32 kmer_pa matrix
        kmer_block: K-vec of block index per k-mer (-1 = no block)
        n_blocks: number of blocks
        coverage: scalar coverage estimate
        em_max_iter: passed to solve_em as ``max_iter``.
        tol: passed to solve_em as ``tol``.
        min_kmers_per_block: blocks with fewer NONZERO-count k-mers fall
            back to a global-h estimate (their h is undefined locally).
        verbose: print progress and timing lines.
        n_workers: thread workers for per-block EM (numpy releases GIL during
            BLAS, so threading shares memory. Default 4. Set to 1 for serial.)
            Values below 1 are treated as 1.
        global_anchor_weight: λ ≥ 0. If > 0, per-block EM is solved with
            ``prior_h=global_h`` and ``prior_weight=λ`` (a Dirichlet
            pseudocount centered on the chrom-wide solution). λ = 0 → pure
            per-window MLE (legacy behaviour). 0.05–0.5 = mild anchor.
        omega: optional K-vec of per-k-mer weights ω_k (e.g. 1/m_b). Sliced
            per block and passed to solve_em (omega=None → unweighted MLE).

    Returns:
        h_blocks: (n_blocks, F) float32 — per-block ancestry vectors. Blocks
            that are not fitted locally hold global_h.
        block_status: (n_blocks,) int8 — 0=local fit, 1=global fallback
            (too few nonzero k-mers), 2=excluded (no k-mers).
        global_h: (F,) float32 — the fallback value.
    """
    from concurrent.futures import ThreadPoolExecutor
    try:
        from threadpoolctl import threadpool_limits
    except ImportError:
        # threadpoolctl only bounds BLAS oversubscription across the parallel
        # per-block fits; the EM is still correct without it (just potentially
        # thread-greedy). Degrade gracefully rather than hard-crash a cohort run
        # if the env is missing the package.
        from contextlib import contextmanager

        @contextmanager
        def threadpool_limits(limits=None):
            yield

    # Guard the `8 // n_workers` below and the executor against n_workers < 1.
    n_workers = max(1, int(n_workers))

    F = kmer_pa_dense.shape[0]
    h_blocks = np.zeros((n_blocks, F), dtype=np.float32)
    block_status = np.full(n_blocks, 2, dtype=np.int8)

    # Global fallback: EM on ALL k-mers
    if verbose:
        print(" computing global-h fallback...", flush=True)
    t = time.time()
    global_h, info = solve_em(counts, kmer_pa_dense, coverage,
                              max_iter=em_max_iter, tol=tol,
                              omega=omega)
    global_h = global_h.astype(np.float32)
    if verbose:
        print(f" global EM: {info['iterations']} iters, {time.time()-t:.0f}s",
              flush=True)
        if global_anchor_weight > 0:
            print(f" global_anchor_weight λ={global_anchor_weight} — "
                  f"per-window EM will be anchored toward h_global",
                  flush=True)

    nz = counts > 0
    t0 = time.time()

    # Pre-compute per-block k-mer index lists once. A stable sort groups the
    # k-mers by block while keeping ascending k-mer order inside each group,
    # so one pass replaces a full `kmer_block == b` scan per block.
    kmer_block = np.asarray(kmer_block)
    order = np.argsort(kmer_block, kind="stable")
    bounds = np.searchsorted(kmer_block[order], np.arange(n_blocks + 1),
                             side="left")
    block_kmer_idx = [order[bounds[b]:bounds[b + 1]] for b in range(n_blocks)]

    # Only pass the prior arguments when anchoring is requested.
    if global_anchor_weight > 0:
        prior_kwargs = {"prior_h": global_h,
                        "prior_weight": global_anchor_weight}
    else:
        prior_kwargs = {}

    def _fit_one(b):
        """Fit block b; return (block index, h vector, status code)."""
        idxs = block_kmer_idx[b]
        if len(idxs) == 0:
            return b, global_h, 2
        idxs_nz = idxs[nz[idxs]]
        if len(idxs_nz) < min_kmers_per_block:
            return b, global_h, 1
        cn_b = np.ascontiguousarray(kmer_pa_dense[:, idxs_nz])
        c_b = counts[idxs_nz]
        omega_b = None if omega is None else omega[idxs_nz]
        h_b, _ = solve_em(c_b, cn_b, coverage,
                          max_iter=em_max_iter, tol=tol,
                          omega=omega_b, **prior_kwargs)
        return b, h_b.astype(np.float32), 0

    # Limit per-thread BLAS to avoid oversubscription. n_workers × inner_threads
    # should ≈ total cores (8 is the hard-coded core budget here).
    inner_threads = max(1, 8 // n_workers)
    if n_workers == 1:
        results = [_fit_one(b) for b in range(n_blocks)]
    else:
        with threadpool_limits(limits=inner_threads):
            with ThreadPoolExecutor(max_workers=n_workers) as ex:
                results = list(ex.map(_fit_one, range(n_blocks)))

    n_local = n_global = n_excluded = 0
    for b, h_b, st in results:
        h_blocks[b] = h_b
        block_status[b] = st
        if st == 0:
            n_local += 1
        elif st == 1:
            n_global += 1
        else:
            n_excluded += 1

    if verbose:
        print(f" per-block EM: {n_local} local fits, {n_global} global "
              f"fallbacks, {n_excluded} excluded; {time.time()-t0:.0f}s "
              f"(workers={n_workers}, inner_threads={inner_threads})",
              flush=True)
    return h_blocks, block_status, global_h
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def project_blocks_to_records(h_blocks, global_h, var_pa, record_block,
                              var_called=None):
    """Project per-block h to per-record alt freqs by hard window assignment.

    Each record r uses the h of the window containing it,
    ``h_blocks[record_block[r]]`` (already set to global_h for
    fallback/excluded windows by solve_em_per_block). Records with a negative
    block index (no window, conventionally -1) use ``global_h`` directly.
    With HMM smoothing applied upstream, this hard assignment is the
    production projection.

    With var_called, AF is the missing-aware (h@var_pa)/(h@var_called), the
    denominator being floored at 1e-12 before the division; without it,
    ./. is treated as REF (→ SV AF under-call at high missingness) and the
    AF is simply h@var_pa.

    Parameters
    ----------
    h_blocks : array indexable by block id; row b is the h vector of block b.
    global_h : 1-D array, the h used for records without a block.
    var_pa : sparse matrix exposing ``.shape`` and ``.tocsc()``; one column
        per record (N = ``var_pa.shape[1]``), one row per entry of h.
    record_block : length-N sequence of integer block indices.
    var_called : optional sparse matrix laid out like ``var_pa``; when given
        it supplies the per-record denominator.

    Returns
    -------
    (alt_freq, info), both length-N float64. info[r] is the projection
    denominator — the h-weighted called mass at r (1.0 when no called mask).

    Raises
    ------
    ValueError
        If ``record_block`` is not one-dimensional with exactly N entries.
    """
    N = var_pa.shape[1]
    out = np.zeros(N, dtype=np.float64)
    info = np.ones(N, dtype=np.float64)
    rb = np.asarray(record_block)
    # A mismatched assignment vector would otherwise leave records silently
    # unprojected (or fail deep inside the sparse indexing code).
    if rb.ndim != 1 or rb.shape[0] != N:
        raise ValueError(
            f"record_block must be 1-D with one entry per record "
            f"(expected length {N}, got shape {rb.shape})")

    var_pa_csc = var_pa.tocsc()
    cvc_csc = var_called.tocsc() if var_called is not None else None

    # Group records by block with a single stable sort instead of building a
    # full-length boolean mask per block (O(N log N) rather than O(N * B)).
    # Stability keeps the record indices of each group in ascending order,
    # i.e. the same column order a boolean mask would select.
    order = np.argsort(rb, kind="stable")
    block_ids, starts = np.unique(rb[order], return_index=True)
    bounds = np.append(starts, N)

    for b, lo, hi in zip(block_ids, bounds[:-1], bounds[1:]):
        cols = order[lo:hi]
        # Negative ids mean "no window": fall back to the global estimate.
        h = (h_blocks[b] if b >= 0 else global_h).astype(np.float32)
        num = (h @ var_pa_csc[:, cols].toarray()).astype(np.float64)
        if cvc_csc is not None:
            den = (h @ cvc_csc[:, cols].toarray()).astype(np.float64)
            # Floor the denominator so records with no called mass give a
            # finite AF instead of dividing by zero.
            out[cols] = num / np.maximum(den, 1e-12)
            info[cols] = den
        else:
            out[cols] = num
    return out, info
|