admixture-cache 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ """admixture-cache — precomputed-P supervised-ADMIXTURE projection.
2
+
3
+ Split the slow supervised-ADMIXTURE training pass (panel-only,
4
+ ~hours, one-time per panel × K × clusters_yaml combo) out of the
5
+ per-target hot path. After building, project a new target's K-vector
6
+ in <2 seconds via SciPy SLSQP (on NumPy arrays) against the cached P matrix.
7
+
8
+ Two phases, two APIs:
9
+
10
+ 1. **Panel cache build** (operator-facing, slow):
11
+ - :func:`build_panel_cache` runs stock ADMIXTURE × N restarts via
12
+ an injected ToolRunner, validates multimodality, writes the
13
+ canonical cached P + manifest.
14
+
15
+ 2. **Per-target projection** (consumer-facing, fast):
16
+ - :func:`project_target` aligns target.bed to cached panel.bim
17
+ + axes (via plink2), reads the target as a dosage vector,
18
+ solves for Q via scipy SLSQP under the binomial admixture
19
+ likelihood.
20
+
21
+ The math is validated to <1e-5 absolute Q-vector match against stock
22
+ ADMIXTURE on representative panels (15K samples × 850K SNPs at K=4).
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from admixture_cache.alignment import (
28
+ align_target_to_panel_bim,
29
+ extract_target_dosage_via_plink2,
30
+ )
31
+ from admixture_cache.builder import build_panel_cache, ld_prune_panel
32
+ from admixture_cache.errors import PanelCacheError, PopAutomationConfigError
33
+ from admixture_cache.io import (
34
+ load_cache_manifest,
35
+ load_cached_p,
36
+ sha256_file,
37
+ verify_cache_matches_current_config,
38
+ )
39
+ from admixture_cache.manifest import PanelCacheManifest
40
+ from admixture_cache.orchestration import project_target
41
+ from admixture_cache.projection import (
42
+ ProjectionResult,
43
+ numpy_supervised_projection,
44
+ )
45
+ from admixture_cache.runner import ToolRunner
46
+
47
# Version string exposed to importers as ``<package>.__version__``.
__version__ = "1.0.0"

# Explicit public surface of this module: exactly these names are picked up
# by a star-import. Every entry except ``__version__`` is imported at the top
# of this module from a submodule. Kept as a plain list literal, grouped by
# purpose.
__all__ = [
    # Public API — cache build (slow, one-time)
    "build_panel_cache",
    "ld_prune_panel",  # optional pre-step before build_panel_cache
    # Public API — per-target projection (fast)
    "project_target",
    "numpy_supervised_projection",
    # Public API — alignment + dosage I/O
    "align_target_to_panel_bim",
    "extract_target_dosage_via_plink2",
    # Public API — cache I/O + validation
    "load_cached_p",
    "load_cache_manifest",
    "verify_cache_matches_current_config",
    "sha256_file",
    # Schemas
    "PanelCacheManifest",
    "ProjectionResult",
    # Error type
    "PanelCacheError",
    # Back-compat alias for the upstream source-of-extraction; kept
    # importable for callers mid-migration. Identical to
    # PanelCacheError; safe to delete once no consumer relies on it.
    "PopAutomationConfigError",
    # Runner Protocol (for consumers' type hints)
    "ToolRunner",
    # Version
    "__version__",
]
@@ -0,0 +1,122 @@
1
+ """Target-to-panel alignment and dosage extraction (plink2-backed).
2
+
3
+ Per-target work that runs every projection: filter the target BED to
4
+ the cached panel.bim variant set, flip REF/ALT axes to match the
5
+ panel via ``plink2 --alt1-allele``, and extract genotype dosage as a
6
+ NumPy 1D array.
7
+
8
+ REF/ALT axis mismatch silently produces wrong Q vectors (the binomial
9
+ likelihood inverts every affected SNP's allele count), so axis
10
+ alignment is mandatory — never let the caller skip it.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ from pathlib import Path
17
+ from typing import TYPE_CHECKING
18
+
19
+ import numpy as np
20
+
21
+ from admixture_cache.errors import PanelCacheError
22
+
23
+ if TYPE_CHECKING:
24
+ from admixture_cache.runner import ToolRunner
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
def align_target_to_panel_bim(
    *,
    target_bed: Path,
    panel_bim: Path,
    output_prefix: Path,
    plink2_runner: ToolRunner,
    log_dir: Path,
    timeout_seconds: int = 600,
) -> Path:
    """Filter a target fileset to the panel's variants and align its ALT alleles.

    Runs a single plink2 invocation that (a) keeps only variants listed in
    the cached ``panel_bim`` (``--extract``) and (b) forces the target's ALT1
    allele to the panel's ALT allele at every overlapping variant
    (``--alt1-allele``), writing a new binary fileset at ``output_prefix``.

    REF/ALT axis mismatch between target and reference panel silently
    produces wrong Q vectors (the binomial likelihood inverts every affected
    SNP's allele count), so this step must not be skipped.

    Args:
        target_bed: Path to the target ``.bed`` file; the final suffix is
            stripped to obtain the ``--bfile`` prefix.
        panel_bim: Path to the cached panel ``.bim`` file. Used both as the
            variant list and as the allele source.
        output_prefix: plink2 ``--out`` prefix. May contain dots in its last
            component (e.g. ``sample.v2.aligned``).
        plink2_runner: Object whose ``run(args=, cwd=, log_dir=,
            timeout_seconds=)`` method executes plink2.
        log_dir: Directory handed to the runner for logs; created if missing.
        timeout_seconds: Timeout forwarded to the runner.

    Returns:
        Path to the aligned ``.bed`` file (``<output_prefix>.bed``).

    Raises:
        PanelCacheError: If the runner returned without ``<output_prefix>.bed``
            having been produced.
    """
    output_prefix.parent.mkdir(parents=True, exist_ok=True)
    log_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): paths are forwarded exactly as given while ``cwd`` is set
    # to ``output_prefix.parent``. If the runner honours ``cwd``, relative
    # paths would be resolved against that directory — callers should
    # presumably pass absolute paths. TODO confirm against the runner.
    plink2_runner.run(
        args=[
            "--bfile", str(target_bed.with_suffix("")),
            "--extract", str(panel_bim),
            # --alt1-allele <bim_file> <alt-col> <id-col>
            # bim_file columns are 1-based: 2=ID, 5=ALT, 6=REF
            "--alt1-allele", str(panel_bim), "5", "2",
            "--make-bed",
            "--out", str(output_prefix),
        ],
        cwd=output_prefix.parent,
        log_dir=log_dir,
        timeout_seconds=timeout_seconds,
    )

    # plink2 appends ".bed" to the --out prefix verbatim. Path.with_suffix()
    # would instead *replace* whatever follows the last dot of the prefix
    # ("sample.v2" -> "sample.bed"), so append to the full name.
    aligned_bed = output_prefix.with_name(f"{output_prefix.name}.bed")
    if not aligned_bed.exists():
        raise PanelCacheError(
            "align_target_to_panel_bim: plink2 succeeded but "
            f"{aligned_bed} not produced",
        )
    return aligned_bed
71
+
72
+
73
def extract_target_dosage_via_plink2(
    *,
    target_bed: Path,
    output_prefix: Path,
    plink2_runner: ToolRunner,
    log_dir: Path,
    timeout_seconds: int = 600,
) -> np.ndarray:
    """Export a single-sample target to text dosages and load them as a vector.

    Runs ``plink2 --recode A`` on the target fileset, which writes a
    tab-separated ``<output_prefix>.raw`` file, then parses that file into a
    1D float64 array with one entry per variant column (``NaN`` where the
    genotype is ``NA``).

    The original author notes this text round-trip is acceptable for a single
    target (~28 sec on 850K SNPs) and that reading the binary BED directly
    (e.g. via ``bed-reader``) would be a future optimization.

    Args:
        target_bed: Path to the target ``.bed`` file; the final suffix is
            stripped to obtain the ``--bfile`` prefix.
        output_prefix: plink2 ``--out`` prefix. May contain dots in its last
            component (e.g. ``sample.v2.dosage``).
        plink2_runner: Object whose ``run(args=, cwd=, log_dir=,
            timeout_seconds=)`` method executes plink2.
        log_dir: Directory handed to the runner for logs; created if missing.
        timeout_seconds: Timeout forwarded to the runner.

    Returns:
        Dosage as a float64 1D array (one value per column after the six
        leading sample-metadata columns).

    Raises:
        PanelCacheError: If ``<output_prefix>.raw`` was not produced, or if it
            does not contain exactly one sample row.
    """
    output_prefix.parent.mkdir(parents=True, exist_ok=True)
    log_dir.mkdir(parents=True, exist_ok=True)

    plink2_runner.run(
        args=[
            "--bfile", str(target_bed.with_suffix("")),
            "--recode", "A",
            "--out", str(output_prefix),
        ],
        cwd=output_prefix.parent,
        log_dir=log_dir,
        timeout_seconds=timeout_seconds,
    )

    # plink2 appends ".raw" to the --out prefix verbatim. Path.with_suffix()
    # would instead *replace* whatever follows the last dot of the prefix
    # ("sample.v2" -> "sample.raw"), so append to the full name.
    raw_path = output_prefix.with_name(f"{output_prefix.name}.raw")
    if not raw_path.exists():
        raise PanelCacheError(
            f"extract_target_dosage_via_plink2: {raw_path} not produced",
        )

    # Imported lazily, as in the original: pandas is only needed on this path.
    import pandas as pd

    raw = pd.read_csv(raw_path, sep="\t", na_values=["NA"])
    if raw.shape[0] != 1:
        raise PanelCacheError(
            f"extract_target_dosage_via_plink2: expected 1 sample in "
            f"{raw_path}, got {raw.shape[0]}",
        )
    # First 6 columns are FID IID PAT MAT SEX PHENOTYPE; rest are dosages.
    # np.asarray() guarantees a typed ndarray result even when pandas
    # types resolve to Any (strict mypy in CI doesn't ship pandas-stubs).
    return np.asarray(raw.iloc[0, 6:].to_numpy()).astype(np.float64)
120
+
121
+
122
# Public names of this module: the two plink2-backed helpers defined above
# are the only names a star-import of this module exposes.
__all__ = ["align_target_to_panel_bim", "extract_target_dosage_via_plink2"]