kmate 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. kmate-0.1.0/LICENSE +21 -0
  2. kmate-0.1.0/PKG-INFO +153 -0
  3. kmate-0.1.0/README.md +137 -0
  4. kmate-0.1.0/pyproject.toml +30 -0
  5. kmate-0.1.0/setup.cfg +4 -0
  6. kmate-0.1.0/src/kmate/__init__.py +7 -0
  7. kmate-0.1.0/src/kmate/block_em.py +295 -0
  8. kmate-0.1.0/src/kmate/block_haplotype_em.py +85 -0
  9. kmate-0.1.0/src/kmate/build_kmer_db.py +34 -0
  10. kmate-0.1.0/src/kmate/build_kmer_pa.py +304 -0
  11. kmate-0.1.0/src/kmate/build_var_pa.py +131 -0
  12. kmate-0.1.0/src/kmate/cli.py +51 -0
  13. kmate-0.1.0/src/kmate/data/selftest/kmer_pa_Chr1.kmer_pa.npz +0 -0
  14. kmate-0.1.0/src/kmate/data/selftest/kmer_pa_Chr1.meta.npz +0 -0
  15. kmate-0.1.0/src/kmate/data/selftest/reads.fq.gz +0 -0
  16. kmate-0.1.0/src/kmate/data/selftest/truth_h.tsv +6 -0
  17. kmate-0.1.0/src/kmate/data/selftest/var_Chr1.meta.npz +0 -0
  18. kmate-0.1.0/src/kmate/data/selftest/var_Chr1.var_called.npz +0 -0
  19. kmate-0.1.0/src/kmate/data/selftest/var_Chr1.var_pa.npz +0 -0
  20. kmate-0.1.0/src/kmate/em_solver.py +125 -0
  21. kmate-0.1.0/src/kmate/filter_kmer_pa_production.py +98 -0
  22. kmate-0.1.0/src/kmate/kmer_count.py +272 -0
  23. kmate-0.1.0/src/kmate/per_sample_per_chrom.py +429 -0
  24. kmate-0.1.0/src/kmate/selftest.py +111 -0
  25. kmate-0.1.0/src/kmate.egg-info/PKG-INFO +153 -0
  26. kmate-0.1.0/src/kmate.egg-info/SOURCES.txt +30 -0
  27. kmate-0.1.0/src/kmate.egg-info/dependency_links.txt +1 -0
  28. kmate-0.1.0/src/kmate.egg-info/entry_points.txt +2 -0
  29. kmate-0.1.0/src/kmate.egg-info/requires.txt +3 -0
  30. kmate-0.1.0/src/kmate.egg-info/top_level.txt +1 -0
  31. kmate-0.1.0/tests/test_genomewide_validation.py +181 -0
  32. kmate-0.1.0/tests/test_kmer_count.py +75 -0
kmate-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Tatiana Bellagio
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
kmate-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,153 @@
1
+ Metadata-Version: 2.4
2
+ Name: kmate
3
+ Version: 0.1.0
4
+ Summary: k-mer-based founder-mixture frequency estimation for pool-seq
5
+ Author: Tatiana Bellagio
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Tatianabellagio/kMate
8
+ Keywords: pool-seq,k-mer,allele-frequency,pangenome,EM,GrENE-net
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: numpy
13
+ Requires-Dist: scipy
14
+ Requires-Dist: pysam
15
+ Dynamic: license-file
16
+
17
+ <p align="left">
18
+ <img src="assets/kMate_logo.png" alt="kMate" width="420">
19
+ </p>
20
+
21
+ # kMate
22
+
23
+ [![license: MIT](https://img.shields.io/github/license/Tatianabellagio/kMate)](LICENSE)
24
+
25
+ Per-sample, per-record **allele-frequency estimation from pooled sequencing** against a
26
+ multi-founder reference panel. kMate runs a weighted k-mer Poisson EM on the founder
27
+ simplex to estimate founder frequencies (`h`), then projects through a per-record
28
+ presence/absence matrix (`var_pa`, the founder × variant alt-allele matrix $V_\mathrm{pa}$)
29
+ to allele frequencies for **SNPs, indels, and SVs in a single pass**, with no per-variant genotyping.
30
+
31
+ ## Overview
32
+
33
+ <p align="center">
34
+ <a href="poster_PEQG/poster_peqg.pdf">
35
+ <img src="assets/poster_peqg.png" alt="kMate PEQG 2026 poster: tracking structural-variant trajectories across climates with alignment-free allele-frequency estimation" width="900">
36
+ </a>
37
+ </p>
38
+
39
+ The picture above (our [PEQG 2026 poster](poster_PEQG/poster_peqg.pdf), click to enlarge) walks through the whole idea: the
40
+ GrENE-Net experiment evolved an equal mixture of **231 *Arabidopsis* founders** at 43 climate
41
+ sites over 3 years, pool-sequencing the surviving populations each generation. kMate takes those
42
+ pooled k-mer counts, solves a Poisson EM for the founder mixture against the panel's
43
+ `kmer_pa`/`var_pa` matrices, and reads out per-record allele frequencies for **SNPs *and* SVs** at
44
+ once. Benchmarked against simulated pools at 10× coverage, estimates track the truth closely, letting
45
+ us follow structural-variant frequency trajectories across climates (e.g. a 181-bp insertion in
46
+ the cold-regulated *COR413-PM2* gene, rising in cold gardens and falling in warm ones).
47
+
48
+ ## Install
49
+
50
+ Create the `kmate` environment (mamba or conda) with its dependencies, then install the package:
51
+
52
+ ```bash
53
+ git clone https://github.com/Tatianabellagio/kMate.git
54
+ cd kMate
55
+ mamba create -n kmate -c conda-forge -c bioconda python numpy scipy pysam jellyfish samtools
56
+ mamba activate kmate
57
+ pip install -e . # installs the `kmate` command (no compilation step)
58
+ ```
59
+
60
+ Python deps: `numpy`, `scipy`, `pysam`. kMate also calls `jellyfish` (k-mer counting) and `samtools` (read handling), both installed by the `mamba create` above. This gives you a `kmate` command with subcommands (`kmate --help`).
61
+
62
+ ### Verify the install
63
+
64
+ ```bash
65
+ kmate selftest
66
+ ```
67
+
68
+ This runs the bundled tiny fixture (a real Chr1 panel slice + a simulated 5-founder pool) end-to-end — exercising the full k-mer-count → EM → AF-projection path through `jellyfish`/`samtools` — and checks that the planted founder mixture is recovered. It takes a few seconds, needs no network, and prints `PASS` on a correct install. Run this **before** pointing kMate at your own data.
69
+
70
+ ## Usage
71
+
72
+ kMate processes **one pooled sample at a time, per chromosome**.
73
+
74
+ **You need**
75
+ - **Pooled reads**: paired FASTQ (`R1.fq R2.fq`) of one pool/sample.
76
+ - **A reference panel** encoded as per-chromosome matrices: `kmer_pa` (k-mer × founder presence/absence), `var_pa` (founder × variant alt-allele), and record `meta`. Built once from your founders' phased VCF (see [Building a panel](#building-a-panel)). The 231-founder *Arabidopsis thaliana* panel used by GrENE-Net is available on request; the matrix files are large and are not stored in the Git repo.
77
+
78
+ **Run**
79
+
80
+ ```bash
81
+ kmate run \
82
+ --kmer-pa-prefix data/kmer_pa_231_arch3_filt2inv/kmer_pa \
83
+ --var-pa panel/arch3/chr1/var_pa_231_arch3_chr1.var_pa.npz \
84
+ --var-called panel/arch3/chr1/var_pa_231_arch3_chr1.var_called.npz \
85
+ --var-meta panel/arch3/chr1/var_pa_231_arch3_chr1.meta.npz \
86
+ --reads R1.fq R2.fq --sample MYSAMPLE --out MYSAMPLE.tsv \
87
+ --threads 8 --chroms Chr1 --kmer-weight inv_mb --block-mode global
88
+ ```
89
+
90
+ (`kmate run --help` lists every flag. Existing scripts that call `python src/per_sample_per_chrom.py ...` still work via thin shims that forward to the package.)
91
+
92
+ **Estimator mode** (`--block-mode`)
93
+ - `global`: one founder mixture per chromosome. Use for **selfing / inbred / founder (F0)** pools.
94
+ - `window`: per-window mixture with HMM smoothing, for **recombinant** pools. `--block-mode window` alone reproduces the production "star2" recipe (10 kb windows, 5 smoothing passes).
95
+
96
+ **Output**: a per-record TSV, one row per panel variant (SNP / indel / SV):
97
+
98
+ | chrom | pos | ref_len | alt_len | alt_freq | info | n_called | se |
99
+ |---|---|---|---|---|---|---|---|
100
+
101
+ `alt_freq` is the estimated alternate-allele frequency in the pool; `n_called` and `se` carry support/uncertainty. (`--var-called` adds a per-record called-mask; `--kmer-db` lets you count k-mers once and query per-chrom instead of re-scanning reads; `--hash-size` tunes the Jellyfish hash, e.g. lower it to `100M` on memory-capped jobs.)
102
+
103
+ ## How it works
104
+
105
+ From the read k-mer spectrum, kMate solves a weighted **Poisson EM on the 231-founder
106
+ simplex** for the founder mixture `h`, then projects `h` through the panel's
107
+ presence/absence matrix `var_pa` to a per-record allele frequency for SNPs, indels and SVs
108
+ together, in one pass. Processing one chromosome at a time keeps peak memory ~5× below a
109
+ genome-wide solve (per-chrom `h` agrees to ~0.1%). Full math + code wiring: [`ALGORITHM.md`](ALGORITHM.md).
110
+
111
+ ## Building a panel
112
+
113
+ To run kMate on your own founder set you build the panel matrices once from a multi-founder
114
+ **phased VCF**: `var_pa` from the founder genotypes and `kmer_pa` from a k-mer index of the
115
+ founders. The builders live in [`panel/`](panel/) and [`data/`](data/). The
116
+ 231-founder *Arabidopsis* panel (used by GrENE-Net) and its exact construction are documented
117
+ in [`docs/PIPELINE_STATE.md`](docs/PIPELINE_STATE.md) §0.
118
+
119
+ ## Repository layout
120
+
121
+ ```
122
+ src/kmate/ the kMate package (em_solver, kmer_count, block_em, per_sample_per_chrom, cli, selftest)
123
+ pyproject.toml, conda/ packaging: pip-installable `kmate` CLI + conda recipe
124
+ panel/ founder-panel construction (var_pa builders, k-mer index)
125
+ data/ prebuilt panel matrices (kmer_pa_*, var_pa_*) + sample lists
126
+ grenenet/ GrENE-Net application: production scale-out over the evolved cohort
127
+ benchmarks/ end-to-end accuracy benchmarks (p80 control, p231 headline)
128
+ sims/ pool-seq simulation framework (AF truth); see sims/README.md
129
+ docs/ methods + analysis writeups
130
+ ```
131
+
132
+ ## Documentation
133
+
134
+ | Doc | What it is |
135
+ |---|---|
136
+ | [`docs/PIPELINE_STATE.md`](docs/PIPELINE_STATE.md) | Production inputs, run recipe, and environment; the project source of truth. |
137
+ | [`ALGORITHM.md`](ALGORITHM.md) | The kMate algorithm, math, and code wiring. |
138
+ | [`BACKGROUND.md`](BACKGROUND.md) | Project framing, known biases, and design decisions. |
139
+ | [`SAVIO_HPC.md`](SAVIO_HPC.md) | Cluster ops (partitions, sbatch recipes). |
140
+
141
+ ## Using kMate
142
+
143
+ kMate is not yet published as a standalone method. If you are interested in using kMate
144
+ for your project, or in collaborating, please get in touch:
145
+
146
+ **Tatiana Bellagio** (tatianabellagio@gmail.com)
147
+
148
+ kMate was developed for, and underlies the allele-frequency analyses of, the GrENE-Net
149
+ outdoor evolution experiment in *Arabidopsis thaliana*.
150
+
151
+ ## License
152
+
153
+ Released under the [MIT License](LICENSE).
kmate-0.1.0/README.md ADDED
@@ -0,0 +1,137 @@
1
+ <p align="left">
2
+ <img src="assets/kMate_logo.png" alt="kMate" width="420">
3
+ </p>
4
+
5
+ # kMate
6
+
7
+ [![license: MIT](https://img.shields.io/github/license/Tatianabellagio/kMate)](LICENSE)
8
+
9
+ Per-sample, per-record **allele-frequency estimation from pooled sequencing** against a
10
+ multi-founder reference panel. kMate runs a weighted k-mer Poisson EM on the founder
11
+ simplex to estimate founder frequencies (`h`), then projects through a per-record
12
+ presence/absence matrix (`var_pa`, the founder × variant alt-allele matrix $V_\mathrm{pa}$)
13
+ to allele frequencies for **SNPs, indels, and SVs in a single pass**, with no per-variant genotyping.
14
+
15
+ ## Overview
16
+
17
+ <p align="center">
18
+ <a href="poster_PEQG/poster_peqg.pdf">
19
+ <img src="assets/poster_peqg.png" alt="kMate PEQG 2026 poster: tracking structural-variant trajectories across climates with alignment-free allele-frequency estimation" width="900">
20
+ </a>
21
+ </p>
22
+
23
+ The picture above (our [PEQG 2026 poster](poster_PEQG/poster_peqg.pdf), click to enlarge) walks through the whole idea: the
24
+ GrENE-Net experiment evolved an equal mixture of **231 *Arabidopsis* founders** at 43 climate
25
+ sites over 3 years, pool-sequencing the surviving populations each generation. kMate takes those
26
+ pooled k-mer counts, solves a Poisson EM for the founder mixture against the panel's
27
+ `kmer_pa`/`var_pa` matrices, and reads out per-record allele frequencies for **SNPs *and* SVs** at
28
+ once. Benchmarked against simulated pools at 10× coverage, estimates track the truth closely, letting
29
+ us follow structural-variant frequency trajectories across climates (e.g. a 181-bp insertion in
30
+ the cold-regulated *COR413-PM2* gene, rising in cold gardens and falling in warm ones).
31
+
32
+ ## Install
33
+
34
+ Create the `kmate` environment (mamba or conda) with its dependencies, then install the package:
35
+
36
+ ```bash
37
+ git clone https://github.com/Tatianabellagio/kMate.git
38
+ cd kMate
39
+ mamba create -n kmate -c conda-forge -c bioconda python numpy scipy pysam jellyfish samtools
40
+ mamba activate kmate
41
+ pip install -e . # installs the `kmate` command (no compilation step)
42
+ ```
43
+
44
+ Python deps: `numpy`, `scipy`, `pysam`. kMate also calls `jellyfish` (k-mer counting) and `samtools` (read handling), both installed by the `mamba create` above. This gives you a `kmate` command with subcommands (`kmate --help`).
45
+
46
+ ### Verify the install
47
+
48
+ ```bash
49
+ kmate selftest
50
+ ```
51
+
52
+ This runs the bundled tiny fixture (a real Chr1 panel slice + a simulated 5-founder pool) end-to-end — exercising the full k-mer-count → EM → AF-projection path through `jellyfish`/`samtools` — and checks that the planted founder mixture is recovered. It takes a few seconds, needs no network, and prints `PASS` on a correct install. Run this **before** pointing kMate at your own data.
53
+
54
+ ## Usage
55
+
56
+ kMate processes **one pooled sample at a time, per chromosome**.
57
+
58
+ **You need**
59
+ - **Pooled reads**: paired FASTQ (`R1.fq R2.fq`) of one pool/sample.
60
+ - **A reference panel** encoded as per-chromosome matrices: `kmer_pa` (k-mer × founder presence/absence), `var_pa` (founder × variant alt-allele), and record `meta`. Built once from your founders' phased VCF (see [Building a panel](#building-a-panel)). The 231-founder *Arabidopsis thaliana* panel used by GrENE-Net is available on request; the matrix files are large and are not stored in the Git repo.
61
+
62
+ **Run**
63
+
64
+ ```bash
65
+ kmate run \
66
+ --kmer-pa-prefix data/kmer_pa_231_arch3_filt2inv/kmer_pa \
67
+ --var-pa panel/arch3/chr1/var_pa_231_arch3_chr1.var_pa.npz \
68
+ --var-called panel/arch3/chr1/var_pa_231_arch3_chr1.var_called.npz \
69
+ --var-meta panel/arch3/chr1/var_pa_231_arch3_chr1.meta.npz \
70
+ --reads R1.fq R2.fq --sample MYSAMPLE --out MYSAMPLE.tsv \
71
+ --threads 8 --chroms Chr1 --kmer-weight inv_mb --block-mode global
72
+ ```
73
+
74
+ (`kmate run --help` lists every flag. Existing scripts that call `python src/per_sample_per_chrom.py ...` still work via thin shims that forward to the package.)
75
+
76
+ **Estimator mode** (`--block-mode`)
77
+ - `global`: one founder mixture per chromosome. Use for **selfing / inbred / founder (F0)** pools.
78
+ - `window`: per-window mixture with HMM smoothing, for **recombinant** pools. `--block-mode window` alone reproduces the production "star2" recipe (10 kb windows, 5 smoothing passes).
79
+
80
+ **Output**: a per-record TSV, one row per panel variant (SNP / indel / SV):
81
+
82
+ | chrom | pos | ref_len | alt_len | alt_freq | info | n_called | se |
83
+ |---|---|---|---|---|---|---|---|
84
+
85
+ `alt_freq` is the estimated alternate-allele frequency in the pool; `n_called` and `se` carry support/uncertainty. (`--var-called` adds a per-record called-mask; `--kmer-db` lets you count k-mers once and query per-chrom instead of re-scanning reads; `--hash-size` tunes the Jellyfish hash, e.g. lower it to `100M` on memory-capped jobs.)
86
+
87
+ ## How it works
88
+
89
+ From the read k-mer spectrum, kMate solves a weighted **Poisson EM on the 231-founder
90
+ simplex** for the founder mixture `h`, then projects `h` through the panel's
91
+ presence/absence matrix `var_pa` to a per-record allele frequency for SNPs, indels and SVs
92
+ together, in one pass. Processing one chromosome at a time keeps peak memory ~5× below a
93
+ genome-wide solve (per-chrom `h` agrees to ~0.1%). Full math + code wiring: [`ALGORITHM.md`](ALGORITHM.md).
94
+
95
+ ## Building a panel
96
+
97
+ To run kMate on your own founder set you build the panel matrices once from a multi-founder
98
+ **phased VCF**: `var_pa` from the founder genotypes and `kmer_pa` from a k-mer index of the
99
+ founders. The builders live in [`panel/`](panel/) and [`data/`](data/). The
100
+ 231-founder *Arabidopsis* panel (used by GrENE-Net) and its exact construction are documented
101
+ in [`docs/PIPELINE_STATE.md`](docs/PIPELINE_STATE.md) §0.
102
+
103
+ ## Repository layout
104
+
105
+ ```
106
+ src/kmate/ the kMate package (em_solver, kmer_count, block_em, per_sample_per_chrom, cli, selftest)
107
+ pyproject.toml, conda/ packaging: pip-installable `kmate` CLI + conda recipe
108
+ panel/ founder-panel construction (var_pa builders, k-mer index)
109
+ data/ prebuilt panel matrices (kmer_pa_*, var_pa_*) + sample lists
110
+ grenenet/ GrENE-Net application: production scale-out over the evolved cohort
111
+ benchmarks/ end-to-end accuracy benchmarks (p80 control, p231 headline)
112
+ sims/ pool-seq simulation framework (AF truth); see sims/README.md
113
+ docs/ methods + analysis writeups
114
+ ```
115
+
116
+ ## Documentation
117
+
118
+ | Doc | What it is |
119
+ |---|---|
120
+ | [`docs/PIPELINE_STATE.md`](docs/PIPELINE_STATE.md) | Production inputs, run recipe, and environment; the project source of truth. |
121
+ | [`ALGORITHM.md`](ALGORITHM.md) | The kMate algorithm, math, and code wiring. |
122
+ | [`BACKGROUND.md`](BACKGROUND.md) | Project framing, known biases, and design decisions. |
123
+ | [`SAVIO_HPC.md`](SAVIO_HPC.md) | Cluster ops (partitions, sbatch recipes). |
124
+
125
+ ## Using kMate
126
+
127
+ kMate is not yet published as a standalone method. If you are interested in using kMate
128
+ for your project, or in collaborating, please get in touch:
129
+
130
+ **Tatiana Bellagio** (tatianabellagio@gmail.com)
131
+
132
+ kMate was developed for, and underlies the allele-frequency analyses of, the GrENE-Net
133
+ outdoor evolution experiment in *Arabidopsis thaliana*.
134
+
135
+ ## License
136
+
137
+ Released under the [MIT License](LICENSE).
kmate-0.1.0/pyproject.toml ADDED
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "kmate"
7
+ version = "0.1.0"
8
+ description = "k-mer-based founder-mixture frequency estimation for pool-seq"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Tatiana Bellagio" }]
13
+ keywords = ["pool-seq", "k-mer", "allele-frequency", "pangenome", "EM", "GrENE-net"]
14
+ # Python deps only. Non-Python runtime tools (jellyfish, samtools) are declared
15
+ # in the conda recipe (conda/meta.yaml), since pip cannot install them.
16
+ dependencies = ["numpy", "scipy", "pysam"]
17
+
18
+ [project.scripts]
19
+ kmate = "kmate.cli:main"
20
+
21
+ [project.urls]
22
+ Homepage = "https://github.com/Tatianabellagio/kMate"
23
+
24
+ [tool.setuptools.packages.find]
25
+ where = ["src"]
26
+ include = ["kmate*"]
27
+
28
+ [tool.setuptools.package-data]
29
+ # Ship the tiny end-to-end self-test fixture (`kmate selftest`).
30
+ kmate = ["data/selftest/*"]
kmate-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
kmate-0.1.0/src/kmate/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """kMate — k-mer-based founder-mixture frequency estimation for pool-seq.
2
+
3
+ Two estimators (`--block-mode global|window`), optional per-bubble k-mer
4
+ de-replication weighting (`--kmer-weight inv_mb`), and a count-once/query path
5
+ for scale-out. See `kmate --help` and the docs/ directory.
6
+ """
7
+ __version__ = "0.1.0"
kmate-0.1.0/src/kmate/block_em.py ADDED
@@ -0,0 +1,295 @@
1
+ """
2
+ Per-LD-block founder-frequency estimation.
3
+
4
+ Architecture (HAFpipe / hapFIRE style):
5
+ For evolved pool samples, chromosomes are mosaic — different genomic
6
+ regions carry different ancestry mixtures because of recombination +
7
+ selection. A single genome-wide h discards this signal. Per-block h
8
+ captures local ancestry, and per-record alt_freq becomes:
9
+
10
+ alt_freq(record) = h_local[block(record)] @ var_pa[:, record]
11
+
12
+ Block definitions:
13
+ - 'window': fixed bp windows (default 200 kb). Simple; ignores LD.
14
+ - 'ld' (TODO): LD-cluster based on founder-genotype correlation
15
+ between adjacent bubbles. Variable-length, biologically correct.
16
+ hapFIRE uses BigLD.
17
+
18
+ For a window of W kb in Arabidopsis (selfing ~97%), the expected number of founder
19
+ haplotypes per window is small (10–50 distinct), so EM is well-conditioned
20
+ even with K_window ~ 100 k-mers/founder.
21
+ """
22
+ from __future__ import annotations
23
+ import time
24
+ from dataclasses import dataclass
25
+
26
+ import numpy as np
27
+ from .em_solver import solve_em
28
+
29
+
30
+ @dataclass
31
+ class BlockSpec:
32
+ """Genomic block: (chrom, start, end) in 1-based coordinates."""
33
+ chrom: str
34
+ start: int
35
+ end: int
36
+
37
+ def __repr__(self):
38
+ return f"{self.chrom}:{self.start}-{self.end}"
39
+
40
+
41
def define_windows(bubble_chrom, bubble_start, bubble_end,
                   window_bp=200_000, window_step: int | None = None):
    """Define fixed-bp windows covering all bubbles.

    For every distinct chromosome label, windows start at
    ``1, 1 + window_step, 1 + 2 * window_step, ...`` and each spans
    ``window_bp`` positions (``end = start + window_bp - 1``). The number of
    windows is ``ceil(max(bubble_end) / window_step)`` for that chromosome, so
    the trailing window(s) may extend past the last bubble.

    Args:
        bubble_chrom: per-bubble chromosome labels (array or sequence).
        bubble_start: per-bubble start positions. Not used by the window
            layout; kept so the signature stays unchanged.
        bubble_end: per-bubble end positions (array or sequence); the
            per-chromosome maximum decides how many windows are emitted.
        window_bp: width of each window in bp. Must be > 0.
        window_step: step between window starts. If None or == window_bp,
            windows are disjoint (legacy behaviour). If < window_bp, windows
            overlap by (window_bp - window_step). E.g. window_bp=10000,
            window_step=3000 -> 10 kb windows stepping every 3 kb, each k-mer
            in ~3-4 covering windows. Used for Route 2 (overlapping-window
            smoothing). A step larger than window_bp leaves uncovered gaps
            between consecutive windows.

    Returns:
        List of BlockSpec objects, ordered by (chrom, start).

    Raises:
        ValueError: if window_bp or window_step is not strictly positive.
    """
    if window_bp <= 0:
        raise ValueError(f"window_bp must be > 0, got {window_bp}")
    if window_step is None:
        window_step = window_bp
    if window_step <= 0:
        raise ValueError(f"window_step must be > 0, got {window_step}")

    # Coerce so plain Python sequences work with the element-wise comparison
    # and boolean indexing below.
    chrom_labels = np.asarray(bubble_chrom)
    ends = np.asarray(bubble_end)

    blocks = []
    # np.unique returns the distinct labels already sorted.
    for chrom in np.unique(chrom_labels):
        chrom_max = int(ends[chrom_labels == chrom].max())
        # Ceil division: enough window starts to reach chrom_max.
        n_windows = (chrom_max + window_step - 1) // window_step
        for w in range(n_windows):
            start = w * window_step + 1
            blocks.append(BlockSpec(chrom=chrom, start=start,
                                    end=start + window_bp - 1))
    return blocks
71
+
72
+
73
def assign_kmers_to_blocks(kmer_bubble_id, bubble_chrom, bubble_start,
                           bubble_end, blocks):
    """For each k-mer, return its block index (or -1 if no block contains it).

    Each bubble is represented by its centroid ``(start + end) // 2`` and is
    assigned to the first block on its chromosome, in (start, end, index)
    order, whose inclusive ``[start, end]`` range contains that centroid. With
    overlapping windows this is the earliest-starting covering window. Every
    k-mer then inherits the block of its bubble.

    Args:
        kmer_bubble_id: per-k-mer index into the bubble arrays.
            NOTE(review): a negative id would index from the end of the
            bubble table (NumPy semantics) - presumably never passed; confirm
            against the caller.
        bubble_chrom: per-bubble chromosome labels.
        bubble_start: per-bubble start positions.
        bubble_end: per-bubble end positions.
        blocks: sequence of objects exposing ``.chrom``, ``.start``, ``.end``;
            the position in this sequence is the block index.

    Returns:
        int32 array with one block index per k-mer; -1 when the bubble's
        chromosome has no blocks or no block contains the centroid.
    """
    chrom_labels = np.asarray(bubble_chrom).astype(str)
    centroid = (np.asarray(bubble_start) + np.asarray(bubble_end)) // 2

    # Group blocks per chromosome as (start, end, global index).
    chrom_blocks = {}
    for i, b in enumerate(blocks):
        chrom_blocks.setdefault(b.chrom, []).append((b.start, b.end, i))

    bubble_block = np.full(len(chrom_labels), -1, dtype=np.int32)
    for chrom, items in chrom_blocks.items():
        on_chrom = np.flatnonzero(chrom_labels == chrom)
        if on_chrom.size == 0:
            continue
        items.sort()
        starts = np.array([s for s, _, _ in items])
        ends = np.array([e for _, e, _ in items])
        idxs = np.array([i for _, _, i in items], dtype=np.int32)

        # reach[i] = max(ends[:i + 1]) is non-decreasing, so a binary search
        # finds the first block whose end is >= the centroid. Every earlier
        # block ends before the centroid and every later block starts no
        # earlier than this one, so this block is the first containing the
        # centroid iff its start is <= the centroid - the same answer as a
        # left-to-right scan, without the per-bubble Python loop.
        reach = np.maximum.accumulate(ends)
        pos = centroid[on_chrom]
        first = np.searchsorted(reach, pos, side="left")
        found = first < len(items)
        first = np.minimum(first, len(items) - 1)
        found &= starts[first] <= pos
        bubble_block[on_chrom[found]] = idxs[first[found]]

    # Map k-mers via their bubble_id.
    return bubble_block[kmer_bubble_id]
111
+
112
+
113
def assign_records_to_blocks(record_chrom, record_pos, blocks):
    """Return the block index of every var_pa record (-1 if unassigned).

    Per chromosome the blocks are sorted by (start, end, index) and each
    record is matched, by binary search on the starts, to the last block
    whose start is <= the record position; the match is kept only if the
    position is also <= that block's end. For disjoint, contiguous windows
    this is the window containing the record. For overlapping windows it is
    the latest-starting window, which differs from assign_kmers_to_blocks
    (that one picks the earliest-starting covering window).

    Args:
        record_chrom: per-record chromosome labels (array or sequence).
        record_pos: per-record positions (array or sequence).
        blocks: sequence of objects exposing ``.chrom``, ``.start``, ``.end``;
            the position in this sequence is the block index.

    Returns:
        int32 array, one block index per record; -1 where the chromosome has
        no blocks or the candidate block does not contain the position.
    """
    # Coerce so plain lists work with the element-wise comparison below.
    record_chrom = np.asarray(record_chrom)
    record_pos = np.asarray(record_pos)
    rec_block = np.full(len(record_chrom), -1, dtype=np.int32)
    if len(record_chrom) == 0:
        return rec_block

    chrom_blocks = {}
    for i, b in enumerate(blocks):
        chrom_blocks.setdefault(b.chrom, []).append((b.start, b.end, i))

    for chrom, items in chrom_blocks.items():
        mask = record_chrom == chrom
        if not mask.any():
            continue
        items.sort()
        starts = np.array([s for s, _, _ in items])
        ends = np.array([e for _, e, _ in items])
        idxs = np.array([i for _, _, i in items])

        pos_c = record_pos[mask]
        # Last block with start <= pos; positions before the first start give
        # -1 and are clipped to block 0, then rejected by the range check.
        cand = np.searchsorted(starts, pos_c, side="right") - 1
        cand = np.clip(cand, 0, len(starts) - 1)
        in_range = (pos_c >= starts[cand]) & (pos_c <= ends[cand])
        rec_block[mask] = np.where(in_range, idxs[cand], -1)
    return rec_block
139
+
140
+
141
def solve_em_per_block(counts, kmer_pa_dense, kmer_block, n_blocks,
                       coverage, em_max_iter=200, tol=1e-7,
                       min_kmers_per_block=200, verbose=False,
                       n_workers=4,
                       global_anchor_weight: float = 0.0,
                       omega=None):
    """Run EM independently per block, with a global fit as fallback.

    A single EM over all k-mers yields ``global_h``. Each block is then fitted
    on its own k-mers with nonzero count; blocks with too few such k-mers, or
    with no k-mers at all, reuse ``global_h``.

    omega: optional K-vec of per-k-mer weights ω_k (e.g. 1/m_b). Sliced per block
    and passed to solve_em (omega=None → unweighted MLE; identical to old behavior).

    Args:
        counts: K-vec of observed counts (already filtered to block coverage)
        kmer_pa_dense: F × K float32 kmer_pa matrix
        kmer_block: K-vec of block index per k-mer (-1 = no block)
        n_blocks: number of blocks
        coverage: scalar coverage estimate
        em_max_iter: forwarded to solve_em as ``max_iter``.
        tol: forwarded to solve_em as ``tol``.
        min_kmers_per_block: blocks with fewer NONZERO-count k-mers fall
            back to a global-h estimate (their h is undefined locally).
        verbose: print progress and timing lines.
        n_workers: thread workers for per-block EM (numpy releases GIL during
            BLAS, so threading shares memory. Default 4. Set to 1 for serial.)
            Values below 1 are treated as 1.
        global_anchor_weight: λ ≥ 0. If > 0, per-block EM is solved with a
            Dirichlet pseudocount centered on `global_h` (the chrom-wide EM
            solution that's already computed for fallback). λ = 0 → pure
            per-window MLE (legacy behaviour). 0.05–0.5 = mild anchor.

    Returns:
        h_blocks: (n_blocks, F) float32 — per-block ancestry vectors. For
            blocks with too few k-mers, falls back to global-h (computed
            once over all k-mers).
        block_status: (n_blocks,) int8 — 0=local fit, 1=global fallback,
            2=excluded (no k-mers; row also holds global-h).
        global_h: (F,) float32 — fallback value
    """
    from concurrent.futures import ThreadPoolExecutor
    try:
        from threadpoolctl import threadpool_limits
    except ImportError:
        # threadpoolctl only bounds BLAS oversubscription across the parallel
        # per-block fits; the EM is still correct without it (just potentially
        # thread-greedy). Degrade gracefully rather than hard-crash a cohort run
        # if the env is missing the package.
        from contextlib import contextmanager

        @contextmanager
        def threadpool_limits(limits=None):
            yield

    # n_workers <= 0 used to raise (ZeroDivisionError for 0, ValueError from
    # the executor for negatives); treat it as a request for serial execution.
    n_workers = max(1, int(n_workers))

    F = kmer_pa_dense.shape[0]
    h_blocks = np.zeros((n_blocks, F), dtype=np.float32)
    block_status = np.full(n_blocks, 2, dtype=np.int8)

    # Global fallback: EM on ALL k-mers
    if verbose:
        print(" computing global-h fallback...", flush=True)
    t = time.time()
    global_h, info = solve_em(counts, kmer_pa_dense, coverage,
                              max_iter=em_max_iter, tol=tol,
                              omega=omega)
    global_h = global_h.astype(np.float32)
    if verbose:
        print(f" global EM: {info['iterations']} iters, {time.time()-t:.0f}s",
              flush=True)
    if global_anchor_weight > 0:
        print(f" global_anchor_weight λ={global_anchor_weight} — "
              f"per-window EM will be anchored toward h_global",
              flush=True)

    nz = counts > 0
    t0 = time.time()

    # Group k-mer indices by block with one stable sort instead of scanning
    # all K k-mers once per block. Stability keeps each group's indices in
    # ascending order; ids outside [0, n_blocks) (e.g. -1) land outside every
    # slice and are ignored.
    kmer_block = np.asarray(kmer_block)
    order = np.argsort(kmer_block, kind="stable")
    bounds = np.searchsorted(kmer_block[order], np.arange(n_blocks + 1),
                             side="left")
    block_kmer_idx = [order[bounds[b]:bounds[b + 1]] for b in range(n_blocks)]

    # The anchoring prior is only handed to solve_em when it is switched on,
    # so λ = 0 keeps the exact legacy call.
    anchor_kwargs = {}
    if global_anchor_weight > 0:
        anchor_kwargs = {"prior_h": global_h,
                         "prior_weight": global_anchor_weight}

    def _fit_one(b):
        idxs = block_kmer_idx[b]
        if len(idxs) == 0:
            return b, global_h, 2
        idxs_nz = idxs[nz[idxs]]
        if len(idxs_nz) < min_kmers_per_block:
            return b, global_h, 1
        cn_b = np.ascontiguousarray(kmer_pa_dense[:, idxs_nz])
        c_b = counts[idxs_nz]
        omega_b = None if omega is None else omega[idxs_nz]
        h_b, _ = solve_em(c_b, cn_b, coverage,
                          max_iter=em_max_iter, tol=tol,
                          omega=omega_b, **anchor_kwargs)
        return b, h_b.astype(np.float32), 0

    # Limit per-thread BLAS to avoid oversubscription. n_workers × inner_threads
    # should ≈ total cores (a budget of 8 cores is hard-coded here).
    inner_threads = max(1, 8 // n_workers)
    if n_workers == 1:
        results = [_fit_one(b) for b in range(n_blocks)]
    else:
        with threadpool_limits(limits=inner_threads):
            with ThreadPoolExecutor(max_workers=n_workers) as ex:
                results = list(ex.map(_fit_one, range(n_blocks)))

    n_local = n_global = n_excluded = 0
    for b, h_b, st in results:
        h_blocks[b] = h_b
        block_status[b] = st
        if st == 0:
            n_local += 1
        elif st == 1:
            n_global += 1
        else:
            n_excluded += 1

    if verbose:
        print(f" per-block EM: {n_local} local fits, {n_global} global "
              f"fallbacks, {n_excluded} excluded; {time.time()-t0:.0f}s "
              f"(workers={n_workers}, inner_threads={inner_threads})",
              flush=True)
    return h_blocks, block_status, global_h
260
+
261
+
262
def project_blocks_to_records(h_blocks, global_h, var_pa, record_block,
                              var_called=None):
    """Project per-block h to per-record alt freqs by hard window assignment.

    Each record uses the h of the window containing it, h_blocks[record_block[r]]
    (already set to global_h for fallback/excluded windows by solve_em_per_block;
    records with no window, record_block < 0, use global_h). With HMM smoothing
    applied upstream, this hard assignment is the production projection.

    With var_called, AF is the missing-aware (h@var_pa)/(h@var_called);
    without it, ./. is treated as REF (→ SV AF under-call at high missingness).

    Args:
        h_blocks: (n_blocks, F) array of per-block ancestry vectors.
        global_h: (F,) fallback ancestry vector.
        var_pa: sparse F × N matrix (must provide ``.tocsc()``).
        record_block: length-N block index per record.
        var_called: optional sparse F × N called mask.

    Returns (alt_freq, info), both length-N float64. info[r] is the projection
    denominator — the h-weighted called mass at r (1.0 when no called mask).

    Raises:
        ValueError: if record_block does not have one entry per record.
    """
    N = var_pa.shape[1]
    out = np.zeros(N, dtype=np.float64)
    info = np.ones(N, dtype=np.float64)
    rb = np.asarray(record_block)
    if rb.size != N:
        raise ValueError(
            f"record_block has {rb.size} entries but var_pa has {N} records")
    global_h = np.asarray(global_h)
    var_pa_csc = var_pa.tocsc()
    cvc_csc = var_called.tocsc() if var_called is not None else None

    # Group record indices by block with one stable sort rather than building
    # an N-long boolean mask per block. Stability keeps each group's column
    # indices ascending, i.e. the same columns in the same order a boolean
    # mask would select.
    order = np.argsort(rb, kind="stable")
    block_ids, first = np.unique(rb[order], return_index=True)
    bounds = np.append(first, rb.size)

    for j, b in enumerate(block_ids):
        cols = order[bounds[j]:bounds[j + 1]]
        h = (h_blocks[b] if b >= 0 else global_h).astype(np.float32)
        num = (h @ var_pa_csc[:, cols].toarray()).astype(np.float64)
        if cvc_csc is not None:
            den = (h @ cvc_csc[:, cols].toarray()).astype(np.float64)
            # Floor the denominator so records nobody called divide safely.
            out[cols] = num / np.maximum(den, 1e-12)
            info[cols] = den
        else:
            out[cols] = num
    return out, info