admixture-cache 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- admixture_cache/__init__.py +77 -0
- admixture_cache/alignment.py +122 -0
- admixture_cache/builder.py +880 -0
- admixture_cache/cli.py +515 -0
- admixture_cache/errors.py +27 -0
- admixture_cache/io.py +129 -0
- admixture_cache/manifest.py +70 -0
- admixture_cache/orchestration.py +130 -0
- admixture_cache/projection.py +100 -0
- admixture_cache/py.typed +0 -0
- admixture_cache/runner.py +69 -0
- admixture_cache-1.0.0.dist-info/METADATA +215 -0
- admixture_cache-1.0.0.dist-info/RECORD +17 -0
- admixture_cache-1.0.0.dist-info/WHEEL +5 -0
- admixture_cache-1.0.0.dist-info/entry_points.txt +2 -0
- admixture_cache-1.0.0.dist-info/licenses/LICENSE +21 -0
- admixture_cache-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""admixture-cache — precomputed-P supervised-ADMIXTURE projection.
|
|
2
|
+
|
|
3
|
+
Split the slow supervised-ADMIXTURE training pass (panel-only,
|
|
4
|
+
~hours, one-time per panel × K × clusters_yaml combo) out of the
|
|
5
|
+
per-target hot path. After building, project a new target's K-vector
|
|
6
|
+
in <2 seconds via SciPy SLSQP against the cached P matrix.
|
|
7
|
+
|
|
8
|
+
Two phases, two APIs:
|
|
9
|
+
|
|
10
|
+
1. **Panel cache build** (operator-facing, slow):
|
|
11
|
+
- :func:`build_panel_cache` runs stock ADMIXTURE × N restarts via
|
|
12
|
+
an injected ToolRunner, validates multimodality, writes the
|
|
13
|
+
canonical cached P + manifest.
|
|
14
|
+
|
|
15
|
+
2. **Per-target projection** (consumer-facing, fast):
|
|
16
|
+
- :func:`project_target` aligns target.bed to cached panel.bim
|
|
17
|
+
and REF/ALT axes (via plink2), reads the target as a dosage vector,
|
|
18
|
+
solves for Q via scipy SLSQP under the binomial admixture
|
|
19
|
+
likelihood.
|
|
20
|
+
|
|
21
|
+
The math is validated to <1e-5 absolute Q-vector match against stock
|
|
22
|
+
ADMIXTURE on representative panels (15K samples × 850K SNPs at K=4).
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from admixture_cache.alignment import (
|
|
28
|
+
align_target_to_panel_bim,
|
|
29
|
+
extract_target_dosage_via_plink2,
|
|
30
|
+
)
|
|
31
|
+
from admixture_cache.builder import build_panel_cache, ld_prune_panel
|
|
32
|
+
from admixture_cache.errors import PanelCacheError, PopAutomationConfigError
|
|
33
|
+
from admixture_cache.io import (
|
|
34
|
+
load_cache_manifest,
|
|
35
|
+
load_cached_p,
|
|
36
|
+
sha256_file,
|
|
37
|
+
verify_cache_matches_current_config,
|
|
38
|
+
)
|
|
39
|
+
from admixture_cache.manifest import PanelCacheManifest
|
|
40
|
+
from admixture_cache.orchestration import project_target
|
|
41
|
+
from admixture_cache.projection import (
|
|
42
|
+
ProjectionResult,
|
|
43
|
+
numpy_supervised_projection,
|
|
44
|
+
)
|
|
45
|
+
from admixture_cache.runner import ToolRunner
|
|
46
|
+
|
|
47
|
+
__version__ = "1.0.0"
|
|
48
|
+
|
|
49
|
+
__all__ = [
|
|
50
|
+
# Public API — cache build (slow, one-time)
|
|
51
|
+
"build_panel_cache",
|
|
52
|
+
"ld_prune_panel", # optional pre-step before build_panel_cache
|
|
53
|
+
# Public API — per-target projection (fast)
|
|
54
|
+
"project_target",
|
|
55
|
+
"numpy_supervised_projection",
|
|
56
|
+
# Public API — alignment + dosage I/O
|
|
57
|
+
"align_target_to_panel_bim",
|
|
58
|
+
"extract_target_dosage_via_plink2",
|
|
59
|
+
# Public API — cache I/O + validation
|
|
60
|
+
"load_cached_p",
|
|
61
|
+
"load_cache_manifest",
|
|
62
|
+
"verify_cache_matches_current_config",
|
|
63
|
+
"sha256_file",
|
|
64
|
+
# Schemas
|
|
65
|
+
"PanelCacheManifest",
|
|
66
|
+
"ProjectionResult",
|
|
67
|
+
# Error type
|
|
68
|
+
"PanelCacheError",
|
|
69
|
+
# Back-compat alias for the upstream source-of-extraction; kept
|
|
70
|
+
# importable for callers mid-migration. Identical to
|
|
71
|
+
# PanelCacheError; safe to delete once no consumer relies on it.
|
|
72
|
+
"PopAutomationConfigError",
|
|
73
|
+
# Runner Protocol (for consumers' type hints)
|
|
74
|
+
"ToolRunner",
|
|
75
|
+
# Version
|
|
76
|
+
"__version__",
|
|
77
|
+
]
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""Target-to-panel alignment and dosage extraction (plink2-backed).
|
|
2
|
+
|
|
3
|
+
Per-target work that runs every projection: filter the target BED to
|
|
4
|
+
the cached panel.bim variant set, flip REF/ALT axes to match the
|
|
5
|
+
panel via ``plink2 --alt1-allele``, and extract genotype dosage as a
|
|
6
|
+
NumPy 1D array.
|
|
7
|
+
|
|
8
|
+
REF/ALT axis mismatch silently produces wrong Q vectors (the binomial
|
|
9
|
+
likelihood inverts every affected SNP's allele count), so axis
|
|
10
|
+
alignment is mandatory — never let the caller skip it.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import TYPE_CHECKING
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
|
|
21
|
+
from admixture_cache.errors import PanelCacheError
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from admixture_cache.runner import ToolRunner
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def align_target_to_panel_bim(
    *, target_bed: Path, panel_bim: Path,
    output_prefix: Path, plink2_runner: ToolRunner,
    log_dir: Path,
    timeout_seconds: int = 600,
) -> Path:
    """Filter target.bed to the cached panel.bim variant set and align
    REF/ALT axes via ``plink2 --alt1-allele``.

    REF/ALT axis mismatch between target and reference panel silently
    produces wrong Q vectors (the binomial likelihood inverts every
    affected SNP's allele count). --alt1-allele forces the target's
    ALT1 column to match the panel's ALT1 column at every overlapping
    variant, flipping dosages where needed.

    Args:
        target_bed: Path to the target ``.bed``; its suffix is stripped
            to form the ``--bfile`` prefix.
        panel_bim: Cached panel ``.bim``; used both as the ``--extract``
            list and as the ``--alt1-allele`` source.
        output_prefix: ``--out`` prefix for the aligned fileset. Its
            parent directory is created if missing. The prefix may
            itself contain dots (e.g. ``target.aligned``).
        plink2_runner: Runner whose ``run`` is called with the plink2
            arguments, ``cwd``, ``log_dir`` and ``timeout_seconds``.
        log_dir: Directory handed to the runner for logs; created if
            missing.
        timeout_seconds: Timeout forwarded to the runner.

    Returns:
        The path to the aligned target .bed file
        (``<output_prefix>.bed``).

    Raises:
        PanelCacheError: If the runner returns but the aligned ``.bed``
            does not exist.
    """
    output_prefix.parent.mkdir(parents=True, exist_ok=True)
    log_dir.mkdir(parents=True, exist_ok=True)

    plink2_runner.run(
        args=[
            "--bfile", str(target_bed.with_suffix("")),
            "--extract", str(panel_bim),
            # --alt1-allele <bim_file> <alt-col> <id-col>
            # bim_file columns are 1-based: 2=ID, 5=ALT, 6=REF
            "--alt1-allele", str(panel_bim), "5", "2",
            "--make-bed",
            "--out", str(output_prefix),
        ],
        cwd=output_prefix.parent,
        log_dir=log_dir,
        timeout_seconds=timeout_seconds,
    )

    # plink2 appends ".bed" to the --out prefix verbatim. Path.with_suffix
    # would instead *replace* a dotted tail of the prefix
    # ("target.aligned" -> "target.bed") and point at a file plink2 never
    # wrote, so append to the final path component explicitly.
    aligned_bed = output_prefix.with_name(f"{output_prefix.name}.bed")
    if not aligned_bed.exists():
        raise PanelCacheError(
            f"align_target_to_panel_bim: plink2 succeeded but "
            f"{aligned_bed} not produced",
        )
    return aligned_bed
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def extract_target_dosage_via_plink2(
    *, target_bed: Path, output_prefix: Path,
    plink2_runner: ToolRunner, log_dir: Path,
    timeout_seconds: int = 600,
) -> np.ndarray:
    """Extract target dosage via ``plink2 --recode A`` (text format),
    then parse to a NumPy 1D array of len M (M = SNPs in target.bim,
    NaN for missing).

    For a single target, this is acceptable (~28 sec on 850K SNPs).
    A future optimization is to replace with ``bed-reader`` library
    for direct binary BED reading (~30× faster).

    Args:
        target_bed: Path to the target ``.bed``; its suffix is stripped
            to form the ``--bfile`` prefix.
        output_prefix: ``--out`` prefix for the ``.raw`` export. Its
            parent directory is created if missing. The prefix may
            itself contain dots (e.g. ``target.dosage``).
        plink2_runner: Runner whose ``run`` is called with the plink2
            arguments, ``cwd``, ``log_dir`` and ``timeout_seconds``.
        log_dir: Directory handed to the runner for logs; created if
            missing.
        timeout_seconds: Timeout forwarded to the runner.

    Returns:
        Dosage as float64 1D array, one entry per dosage column of the
        ``.raw`` file; ``NA`` entries become NaN.

    Raises:
        PanelCacheError: If ``<output_prefix>.raw`` is not produced, or
            if it does not contain exactly one sample row.
    """
    output_prefix.parent.mkdir(parents=True, exist_ok=True)
    log_dir.mkdir(parents=True, exist_ok=True)

    plink2_runner.run(
        args=[
            "--bfile", str(target_bed.with_suffix("")),
            "--recode", "A",
            "--out", str(output_prefix),
        ],
        cwd=output_prefix.parent,
        log_dir=log_dir,
        timeout_seconds=timeout_seconds,
    )

    # plink2 appends ".raw" to the --out prefix verbatim. Path.with_suffix
    # would instead *replace* a dotted tail of the prefix
    # ("target.dosage" -> "target.raw"), so append to the final path
    # component explicitly.
    raw_path = output_prefix.with_name(f"{output_prefix.name}.raw")
    if not raw_path.exists():
        raise PanelCacheError(
            f"extract_target_dosage_via_plink2: {raw_path} not produced",
        )

    # Imported lazily so pandas is only loaded when dosage parsing runs.
    import pandas as pd

    raw = pd.read_csv(raw_path, sep="\t", na_values=["NA"])
    if raw.shape[0] != 1:
        raise PanelCacheError(
            f"extract_target_dosage_via_plink2: expected 1 sample in "
            f"{raw_path}, got {raw.shape[0]}",
        )
    # First 6 columns are FID IID PAT MAT SEX PHENOTYPE; rest are dosages.
    # np.asarray() guarantees a typed ndarray result even when pandas
    # types resolve to Any (strict mypy in CI doesn't ship pandas-stubs).
    return np.asarray(raw.iloc[0, 6:].to_numpy()).astype(np.float64)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
__all__ = ["align_target_to_panel_bim", "extract_target_dosage_via_plink2"]
|