admixture-cache 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- admixture_cache-1.0.0/LICENSE +21 -0
- admixture_cache-1.0.0/PKG-INFO +215 -0
- admixture_cache-1.0.0/README.md +173 -0
- admixture_cache-1.0.0/pyproject.toml +111 -0
- admixture_cache-1.0.0/setup.cfg +4 -0
- admixture_cache-1.0.0/src/admixture_cache/__init__.py +77 -0
- admixture_cache-1.0.0/src/admixture_cache/alignment.py +122 -0
- admixture_cache-1.0.0/src/admixture_cache/builder.py +880 -0
- admixture_cache-1.0.0/src/admixture_cache/cli.py +515 -0
- admixture_cache-1.0.0/src/admixture_cache/errors.py +27 -0
- admixture_cache-1.0.0/src/admixture_cache/io.py +129 -0
- admixture_cache-1.0.0/src/admixture_cache/manifest.py +70 -0
- admixture_cache-1.0.0/src/admixture_cache/orchestration.py +130 -0
- admixture_cache-1.0.0/src/admixture_cache/projection.py +100 -0
- admixture_cache-1.0.0/src/admixture_cache/py.typed +0 -0
- admixture_cache-1.0.0/src/admixture_cache/runner.py +69 -0
- admixture_cache-1.0.0/src/admixture_cache.egg-info/PKG-INFO +215 -0
- admixture_cache-1.0.0/src/admixture_cache.egg-info/SOURCES.txt +20 -0
- admixture_cache-1.0.0/src/admixture_cache.egg-info/dependency_links.txt +1 -0
- admixture_cache-1.0.0/src/admixture_cache.egg-info/entry_points.txt +2 -0
- admixture_cache-1.0.0/src/admixture_cache.egg-info/requires.txt +13 -0
- admixture_cache-1.0.0/src/admixture_cache.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Carsten Erickson
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: admixture-cache
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Precomputed-P supervised-ADMIXTURE projection cache: build slow once, project fast per target.
|
|
5
|
+
Author-email: Carsten Erickson <carstene@mailbox.org>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/carstenerickson/admixture-cache
|
|
8
|
+
Project-URL: Issues, https://github.com/carstenerickson/admixture-cache/issues
|
|
9
|
+
Project-URL: Source, https://github.com/carstenerickson/admixture-cache
|
|
10
|
+
Project-URL: Changelog, https://github.com/carstenerickson/admixture-cache/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: bioinformatics,genetics,admixture,ancestry,population-genetics,supervised-admixture,projection-cache
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
17
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
25
|
+
Classifier: Typing :: Typed
|
|
26
|
+
Requires-Python: <3.15,>=3.11
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: numpy<3,>=1.26
|
|
30
|
+
Requires-Dist: scipy<2,>=1.11
|
|
31
|
+
Requires-Dist: pydantic<3,>=2.5
|
|
32
|
+
Requires-Dist: pandas<3,>=2.0
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
35
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
36
|
+
Requires-Dist: ruff>=0.6; extra == "dev"
|
|
37
|
+
Requires-Dist: mypy>=1.11; extra == "dev"
|
|
38
|
+
Requires-Dist: pandas-stubs>=2.0; extra == "dev"
|
|
39
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
40
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
41
|
+
Dynamic: license-file
|
|
42
|
+
|
|
43
|
+
# admixture-cache
|
|
44
|
+
|
|
45
|
+
Precomputed-P supervised-ADMIXTURE projection cache. Build the slow training pass once per panel × K × clusters_yaml combo; project new targets in ~2 seconds.
|
|
46
|
+
|
|
47
|
+
## Why this exists
|
|
48
|
+
|
|
49
|
+
Supervised ADMIXTURE training on a real-world panel takes hours to days per restart (K=21 regional cache: ~12-14 hr × 5 restarts; K=4 ancestral_cluster: ~5-7 hr × 5 restarts). For consumer pipelines serving many users, re-running this training per target is wasteful — the P matrix is determined almost entirely by the panel, not the target.
|
|
50
|
+
|
|
51
|
+
`admixture-cache` splits the supervised-ADMIXTURE workflow into:
|
|
52
|
+
|
|
53
|
+
1. **Panel cache build** (operator, slow, one-time per panel update): stock ADMIXTURE × N restarts → cache best-LL P matrix + multimodality SD + manifest.
|
|
54
|
+
2. **Per-target projection** (consumer, fast, every run): align target.bed to cached panel variants + axes (plink2), load dosages, solve for Q via scipy SLSQP under the standard binomial admixture likelihood.
|
|
55
|
+
|
|
56
|
+
The projection math matches stock ADMIXTURE Q values to within ~1e-5 absolute on representative workloads (a 15K-sample × 850K-SNP panel at K=4).
|
|
57
|
+
|
|
58
|
+
## Install
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install admixture-cache
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Python 3.11 through 3.14 are supported. End-to-end paths require **ADMIXTURE** (for `build`) and **plink2** (for `project` / `verify`) on `PATH`. Pure-library use without those binaries is fine — only the build/projection orchestrators shell out.
|
|
65
|
+
|
|
66
|
+
## Quickstart — library
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from pathlib import Path
|
|
70
|
+
from admixture_cache import build_panel_cache, project_target
|
|
71
|
+
|
|
72
|
+
# One-time, slow (~hours per restart per cache)
|
|
73
|
+
manifest = build_panel_cache(
|
|
74
|
+
panel_bed=Path("panel.bed"),
|
|
75
|
+
panel_pop_file=Path("panel.pop"),
|
|
76
|
+
clusters_yaml=Path("clusters.yaml"),
|
|
77
|
+
k=21,
|
|
78
|
+
cache_dir=Path("data/regional_k21_cache/"),
|
|
79
|
+
admixture_runner=my_tool_runner, # see ToolRunner Protocol below
|
|
80
|
+
track="regional",
|
|
81
|
+
panel_id="aadr_v66_ho",
|
|
82
|
+
panel_version="v66.0",
|
|
83
|
+
admixture_version="1.4.0",
|
|
84
|
+
seeds=[1, 2, 3, 4, 5],
|
|
85
|
+
sd_threshold=0.02,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
# Per-target, fast (~2 seconds end-to-end)
|
|
89
|
+
result = project_target(
|
|
90
|
+
target_bed=Path("target.bed"),
|
|
91
|
+
cache_dir=Path("data/regional_k21_cache/"),
|
|
92
|
+
plink2_runner=my_plink2_runner,
|
|
93
|
+
work_dir=Path("scratch/projection/"),
|
|
94
|
+
)
|
|
95
|
+
print(result.target_q) # K-vector
|
|
96
|
+
print(result.cluster_order) # K names
|
|
97
|
+
print(result.panel_stability_max_sd) # cached panel restart_sd
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Quickstart — CLI
|
|
101
|
+
|
|
102
|
+
Installing the package registers the `admixture-cache` console script with four subcommands:
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
# 1. Build a panel cache (slow, one-time).
|
|
106
|
+
admixture-cache build \
|
|
107
|
+
--panel-bed panel.bed \
|
|
108
|
+
--panel-pop panel.pop \
|
|
109
|
+
--clusters-yaml clusters.yaml \
|
|
110
|
+
--k 21 \
|
|
111
|
+
--cache-dir data/regional_k21_cache/ \
|
|
112
|
+
--track regional \
|
|
113
|
+
--panel-id aadr_v66_ho \
|
|
114
|
+
--panel-version v66.0 \
|
|
115
|
+
--seeds 1,2,3,4,5
|
|
116
|
+
|
|
117
|
+
# 2. Project a target against an existing cache (fast).
|
|
118
|
+
admixture-cache project \
|
|
119
|
+
--target-bed target.bed \
|
|
120
|
+
--cache-dir data/regional_k21_cache/ \
|
|
121
|
+
--work-dir scratch/projection/
|
|
122
|
+
|
|
123
|
+
# 3. Check whether a cache matches the current panel/YAML/K config.
|
|
124
|
+
admixture-cache verify \
|
|
125
|
+
--panel-bed panel.bed \
|
|
126
|
+
--clusters-yaml clusters.yaml \
|
|
127
|
+
--k 21 \
|
|
128
|
+
--cache-dir data/regional_k21_cache/
|
|
129
|
+
|
|
130
|
+
# 4. (Future) pull a canonical published cache. Not yet functional: a placeholder until the
|
|
131
|
+
# canonical-release artifacts ship.
|
|
132
|
+
admixture-cache download regional-k21-aadr-v66-ho
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
The default `SubprocessToolRunner` runs the local `admixture` / `plink2` binaries on `PATH`; override with `--admixture-binary` / `--plink2-binary` to point at a specific build.
|
|
136
|
+
|
|
137
|
+
`build`, `project`, and `verify` all surface a non-zero exit code on failure with a descriptive `error: …` line on stderr. `project --json` emits machine-readable JSON instead of human-readable text.
|
|
138
|
+
|
|
139
|
+
## ToolRunner Protocol
|
|
140
|
+
|
|
141
|
+
When calling the library from Python (rather than via the CLI), pass any object satisfying the `ToolRunner` Protocol:
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
from collections.abc import Callable
|
|
145
|
+
from pathlib import Path
|
|
146
|
+
|
|
147
|
+
class MyToolRunner:
|
|
148
|
+
def run(
|
|
149
|
+
self,
|
|
150
|
+
*,
|
|
151
|
+
args: list[str],
|
|
152
|
+
cwd: Path,
|
|
153
|
+
log_dir: Path,
|
|
154
|
+
timeout_seconds: int = 600,
|
|
155
|
+
# The two kwargs below are optional for serial builds but REQUIRED for
|
|
156
|
+
# parallel `build_panel_cache` (max_parallel_restarts > 1):
|
|
157
|
+
log_name: str | None = None,
|
|
158
|
+
pid_callback: Callable[[int], None] | None = None,
|
|
159
|
+
) -> object:
|
|
160
|
+
...
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
- `log_name` — admixture-cache passes the per-restart canonical log filename (e.g. `restart_3.out`). Honor it when set; fall back to your own naming scheme when `None`. Required for parallel mode (concurrent restarts share `log_dir` and need disambiguated filenames).
|
|
164
|
+
- `pid_callback` — call with the subprocess PID immediately after spawning. admixture-cache uses this to SIGTERM in-flight restarts on first-failure cancellation. Required for parallel mode.
|
|
165
|
+
- Spawn subprocesses with `start_new_session=True` so each child gets its own process group. The cancellation path signals the pgid (via `os.killpg`) rather than the bare PID — avoids the classic UNIX PID-recycle race when a subprocess exits between PID capture and the cancellation pass.
|
|
166
|
+
|
|
167
|
+
Adapters that forward via `**kwargs` (e.g. `def run(self, **kwargs): return self._inner.run(**kwargs)`) are recognized as supporting both extensions — but the inner runner MUST actually honor them. A `**kwargs` forwarder that silently strips unknown kwargs will pass the parallel-mode guard but produce incoherent logs and broken cancellation.
|
|
168
|
+
|
|
169
|
+
For non-parallel use (`max_parallel_restarts=1`), both extensions are optional — only the four baseline kwargs are required.
|
|
170
|
+
|
|
171
|
+
## Cache directory layout
|
|
172
|
+
|
|
173
|
+
After `build_panel_cache` succeeds, `cache_dir` contains:
|
|
174
|
+
|
|
175
|
+
```
|
|
176
|
+
cache_dir/
|
|
177
|
+
├── panel.K.P # Best-LL restart's allele freqs (M × K)
|
|
178
|
+
├── panel.K.Q # Best-LL restart's non-target Q (N × K)
|
|
179
|
+
├── panel.bim # Variant set + REF/ALT axes (alignment ref)
|
|
180
|
+
├── restart_sd.json # Per-cluster SD across restarts
|
|
181
|
+
├── cluster_order.json # K column → cluster name mapping
|
|
182
|
+
├── manifest.json # Panel SHA + YAML SHA + K + version pins
|
|
183
|
+
└── build_logs/ # ADMIXTURE stdout/stderr per restart
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
Cache validity is determined by `manifest.json` SHAs matching the current config (panel.bim, clusters_yaml, K, optional geo-filter YAMLs). Any mismatch → consumer code can fall back to a full ADMIXTURE training pass or rebuild the cache.
|
|
187
|
+
|
|
188
|
+
## When to use this
|
|
189
|
+
|
|
190
|
+
- **Multi-user services**: cache once, project for every user (~5,000× per-target speedup at scale)
|
|
191
|
+
- **Reproducibility**: published canonical caches (forthcoming via GitHub Releases) give byte-identical P across consumers
|
|
192
|
+
- **CI/CD**: faster integration tests once you have a cache
|
|
193
|
+
|
|
194
|
+
## When NOT to use this
|
|
195
|
+
|
|
196
|
+
- **One-time analyses** with a custom panel that won't be reused — full ADMIXTURE is simpler
|
|
197
|
+
- **Novel methodologies** requiring per-target P refinement — the projection assumes P is fully determined by the panel
|
|
198
|
+
|
|
199
|
+
## Status
|
|
200
|
+
|
|
201
|
+
- **v1.0.0** — first PyPI release. Library + CLI surface frozen at this point; cache directory layout is stable at schema v1. Tracks numerical parity against stock ADMIXTURE; canonical published-cache artifacts to follow as separate GitHub releases.
|
|
202
|
+
|
|
203
|
+
See [CHANGELOG.md](CHANGELOG.md) for the per-release detail.
|
|
204
|
+
|
|
205
|
+
## Contributing
|
|
206
|
+
|
|
207
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for dev setup, the three local validation gates (pytest / ruff / mypy), commit conventions, and the tag → OIDC PyPI release procedure.
|
|
208
|
+
|
|
209
|
+
## Acknowledgments
|
|
210
|
+
|
|
211
|
+
This library was extracted from [ancestry-pipeline](https://github.com/carstenerickson/ancestry-pipeline)'s in-pipeline supervised-ADMIXTURE projection module (`pop_automation/admixture_projection.py`, ~744 LOC, validated against real-world workloads). The split lets sibling projects depend on the cache layer without pulling in the larger orchestrator.
|
|
212
|
+
|
|
213
|
+
## License
|
|
214
|
+
|
|
215
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# admixture-cache
|
|
2
|
+
|
|
3
|
+
Precomputed-P supervised-ADMIXTURE projection cache. Build the slow training pass once per panel × K × clusters_yaml combo; project new targets in ~2 seconds.
|
|
4
|
+
|
|
5
|
+
## Why this exists
|
|
6
|
+
|
|
7
|
+
Supervised ADMIXTURE training on a real-world panel takes hours to days per restart (K=21 regional cache: ~12-14 hr × 5 restarts; K=4 ancestral_cluster: ~5-7 hr × 5 restarts). For consumer pipelines serving many users, re-running this training per target is wasteful — the P matrix is determined almost entirely by the panel, not the target.
|
|
8
|
+
|
|
9
|
+
`admixture-cache` splits the supervised-ADMIXTURE workflow into:
|
|
10
|
+
|
|
11
|
+
1. **Panel cache build** (operator, slow, one-time per panel update): stock ADMIXTURE × N restarts → cache best-LL P matrix + multimodality SD + manifest.
|
|
12
|
+
2. **Per-target projection** (consumer, fast, every run): align target.bed to cached panel variants + axes (plink2), load dosages, solve for Q via scipy SLSQP under the standard binomial admixture likelihood.
|
|
13
|
+
|
|
14
|
+
The projection math matches stock ADMIXTURE Q values to within ~1e-5 absolute on representative workloads (a 15K-sample × 850K-SNP panel at K=4).
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install admixture-cache
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Python 3.11 through 3.14 are supported. End-to-end paths require **ADMIXTURE** (for `build`) and **plink2** (for `project` / `verify`) on `PATH`. Pure-library use without those binaries is fine — only the build/projection orchestrators shell out.
|
|
23
|
+
|
|
24
|
+
## Quickstart — library
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from admixture_cache import build_panel_cache, project_target
|
|
29
|
+
|
|
30
|
+
# One-time, slow (~hours per restart per cache)
|
|
31
|
+
manifest = build_panel_cache(
|
|
32
|
+
panel_bed=Path("panel.bed"),
|
|
33
|
+
panel_pop_file=Path("panel.pop"),
|
|
34
|
+
clusters_yaml=Path("clusters.yaml"),
|
|
35
|
+
k=21,
|
|
36
|
+
cache_dir=Path("data/regional_k21_cache/"),
|
|
37
|
+
admixture_runner=my_tool_runner, # see ToolRunner Protocol below
|
|
38
|
+
track="regional",
|
|
39
|
+
panel_id="aadr_v66_ho",
|
|
40
|
+
panel_version="v66.0",
|
|
41
|
+
admixture_version="1.4.0",
|
|
42
|
+
seeds=[1, 2, 3, 4, 5],
|
|
43
|
+
sd_threshold=0.02,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
# Per-target, fast (~2 seconds end-to-end)
|
|
47
|
+
result = project_target(
|
|
48
|
+
target_bed=Path("target.bed"),
|
|
49
|
+
cache_dir=Path("data/regional_k21_cache/"),
|
|
50
|
+
plink2_runner=my_plink2_runner,
|
|
51
|
+
work_dir=Path("scratch/projection/"),
|
|
52
|
+
)
|
|
53
|
+
print(result.target_q) # K-vector
|
|
54
|
+
print(result.cluster_order) # K names
|
|
55
|
+
print(result.panel_stability_max_sd) # cached panel restart_sd
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Quickstart — CLI
|
|
59
|
+
|
|
60
|
+
Installing the package registers the `admixture-cache` console script with four subcommands:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# 1. Build a panel cache (slow, one-time).
|
|
64
|
+
admixture-cache build \
|
|
65
|
+
--panel-bed panel.bed \
|
|
66
|
+
--panel-pop panel.pop \
|
|
67
|
+
--clusters-yaml clusters.yaml \
|
|
68
|
+
--k 21 \
|
|
69
|
+
--cache-dir data/regional_k21_cache/ \
|
|
70
|
+
--track regional \
|
|
71
|
+
--panel-id aadr_v66_ho \
|
|
72
|
+
--panel-version v66.0 \
|
|
73
|
+
--seeds 1,2,3,4,5
|
|
74
|
+
|
|
75
|
+
# 2. Project a target against an existing cache (fast).
|
|
76
|
+
admixture-cache project \
|
|
77
|
+
--target-bed target.bed \
|
|
78
|
+
--cache-dir data/regional_k21_cache/ \
|
|
79
|
+
--work-dir scratch/projection/
|
|
80
|
+
|
|
81
|
+
# 3. Check whether a cache matches the current panel/YAML/K config.
|
|
82
|
+
admixture-cache verify \
|
|
83
|
+
--panel-bed panel.bed \
|
|
84
|
+
--clusters-yaml clusters.yaml \
|
|
85
|
+
--k 21 \
|
|
86
|
+
--cache-dir data/regional_k21_cache/
|
|
87
|
+
|
|
88
|
+
# 4. (Future) pull a canonical published cache. Not yet functional: a placeholder until the
|
|
89
|
+
# canonical-release artifacts ship.
|
|
90
|
+
admixture-cache download regional-k21-aadr-v66-ho
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
The default `SubprocessToolRunner` runs the local `admixture` / `plink2` binaries on `PATH`; override with `--admixture-binary` / `--plink2-binary` to point at a specific build.
|
|
94
|
+
|
|
95
|
+
`build`, `project`, and `verify` all surface a non-zero exit code on failure with a descriptive `error: …` line on stderr. `project --json` emits machine-readable JSON instead of human-readable text.
|
|
96
|
+
|
|
97
|
+
## ToolRunner Protocol
|
|
98
|
+
|
|
99
|
+
When calling the library from Python (rather than via the CLI), pass any object satisfying the `ToolRunner` Protocol:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from collections.abc import Callable
|
|
103
|
+
from pathlib import Path
|
|
104
|
+
|
|
105
|
+
class MyToolRunner:
|
|
106
|
+
def run(
|
|
107
|
+
self,
|
|
108
|
+
*,
|
|
109
|
+
args: list[str],
|
|
110
|
+
cwd: Path,
|
|
111
|
+
log_dir: Path,
|
|
112
|
+
timeout_seconds: int = 600,
|
|
113
|
+
# The two kwargs below are optional for serial builds but REQUIRED for
|
|
114
|
+
# parallel `build_panel_cache` (max_parallel_restarts > 1):
|
|
115
|
+
log_name: str | None = None,
|
|
116
|
+
pid_callback: Callable[[int], None] | None = None,
|
|
117
|
+
) -> object:
|
|
118
|
+
...
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
- `log_name` — admixture-cache passes the per-restart canonical log filename (e.g. `restart_3.out`). Honor it when set; fall back to your own naming scheme when `None`. Required for parallel mode (concurrent restarts share `log_dir` and need disambiguated filenames).
|
|
122
|
+
- `pid_callback` — call with the subprocess PID immediately after spawning. admixture-cache uses this to SIGTERM in-flight restarts on first-failure cancellation. Required for parallel mode.
|
|
123
|
+
- Spawn subprocesses with `start_new_session=True` so each child gets its own process group. The cancellation path signals the pgid (via `os.killpg`) rather than the bare PID — avoids the classic UNIX PID-recycle race when a subprocess exits between PID capture and the cancellation pass.
|
|
124
|
+
|
|
125
|
+
Adapters that forward via `**kwargs` (e.g. `def run(self, **kwargs): return self._inner.run(**kwargs)`) are recognized as supporting both extensions — but the inner runner MUST actually honor them. A `**kwargs` forwarder that silently strips unknown kwargs will pass the parallel-mode guard but produce incoherent logs and broken cancellation.
|
|
126
|
+
|
|
127
|
+
For non-parallel use (`max_parallel_restarts=1`), both extensions are optional — only the four baseline kwargs are required.
|
|
128
|
+
|
|
129
|
+
## Cache directory layout
|
|
130
|
+
|
|
131
|
+
After `build_panel_cache` succeeds, `cache_dir` contains:
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
cache_dir/
|
|
135
|
+
├── panel.K.P # Best-LL restart's allele freqs (M × K)
|
|
136
|
+
├── panel.K.Q # Best-LL restart's non-target Q (N × K)
|
|
137
|
+
├── panel.bim # Variant set + REF/ALT axes (alignment ref)
|
|
138
|
+
├── restart_sd.json # Per-cluster SD across restarts
|
|
139
|
+
├── cluster_order.json # K column → cluster name mapping
|
|
140
|
+
├── manifest.json # Panel SHA + YAML SHA + K + version pins
|
|
141
|
+
└── build_logs/ # ADMIXTURE stdout/stderr per restart
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Cache validity is determined by `manifest.json` SHAs matching the current config (panel.bim, clusters_yaml, K, optional geo-filter YAMLs). Any mismatch → consumer code can fall back to a full ADMIXTURE training pass or rebuild the cache.
|
|
145
|
+
|
|
146
|
+
## When to use this
|
|
147
|
+
|
|
148
|
+
- **Multi-user services**: cache once, project for every user (~5,000× per-target speedup at scale)
|
|
149
|
+
- **Reproducibility**: published canonical caches (forthcoming via GitHub Releases) give byte-identical P across consumers
|
|
150
|
+
- **CI/CD**: faster integration tests once you have a cache
|
|
151
|
+
|
|
152
|
+
## When NOT to use this
|
|
153
|
+
|
|
154
|
+
- **One-time analyses** with a custom panel that won't be reused — full ADMIXTURE is simpler
|
|
155
|
+
- **Novel methodologies** requiring per-target P refinement — the projection assumes P is fully determined by the panel
|
|
156
|
+
|
|
157
|
+
## Status
|
|
158
|
+
|
|
159
|
+
- **v1.0.0** — first PyPI release. Library + CLI surface frozen at this point; cache directory layout is stable at schema v1. Tracks numerical parity against stock ADMIXTURE; canonical published-cache artifacts to follow as separate GitHub releases.
|
|
160
|
+
|
|
161
|
+
See [CHANGELOG.md](CHANGELOG.md) for the per-release detail.
|
|
162
|
+
|
|
163
|
+
## Contributing
|
|
164
|
+
|
|
165
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for dev setup, the three local validation gates (pytest / ruff / mypy), commit conventions, and the tag → OIDC PyPI release procedure.
|
|
166
|
+
|
|
167
|
+
## Acknowledgments
|
|
168
|
+
|
|
169
|
+
This library was extracted from [ancestry-pipeline](https://github.com/carstenerickson/ancestry-pipeline)'s in-pipeline supervised-ADMIXTURE projection module (`pop_automation/admixture_projection.py`, ~744 LOC, validated against real-world workloads). The split lets sibling projects depend on the cache layer without pulling in the larger orchestrator.
|
|
170
|
+
|
|
171
|
+
## License
|
|
172
|
+
|
|
173
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "admixture-cache"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Precomputed-P supervised-ADMIXTURE projection cache: build slow once, project fast per target."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
requires-python = ">=3.11,<3.15"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Carsten Erickson", email = "carstene@mailbox.org" },
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"bioinformatics",
|
|
17
|
+
"genetics",
|
|
18
|
+
"admixture",
|
|
19
|
+
"ancestry",
|
|
20
|
+
"population-genetics",
|
|
21
|
+
"supervised-admixture",
|
|
22
|
+
"projection-cache",
|
|
23
|
+
]
|
|
24
|
+
classifiers = [
|
|
25
|
+
"Development Status :: 4 - Beta",
|
|
26
|
+
"Environment :: Console",
|
|
27
|
+
"Intended Audience :: Science/Research",
|
|
28
|
+
"License :: OSI Approved :: MIT License",
|
|
29
|
+
"Operating System :: POSIX :: Linux",
|
|
30
|
+
"Operating System :: MacOS :: MacOS X",
|
|
31
|
+
"Programming Language :: Python :: 3",
|
|
32
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
33
|
+
"Programming Language :: Python :: 3.11",
|
|
34
|
+
"Programming Language :: Python :: 3.12",
|
|
35
|
+
"Programming Language :: Python :: 3.13",
|
|
36
|
+
"Programming Language :: Python :: 3.14",
|
|
37
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
38
|
+
"Typing :: Typed",
|
|
39
|
+
]
|
|
40
|
+
dependencies = [
|
|
41
|
+
"numpy>=1.26,<3",
|
|
42
|
+
"scipy>=1.11,<2",
|
|
43
|
+
"pydantic>=2.5,<3",
|
|
44
|
+
# pandas is used by `extract_target_dosage_via_plink2` to parse
|
|
45
|
+
# plink2 `--recode A` text output. Imported inline (not at module
|
|
46
|
+
# load) so the import cost only hits the projection hot path,
|
|
47
|
+
# but it IS required for the default project_target flow.
|
|
48
|
+
"pandas>=2.0,<3",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[project.optional-dependencies]
|
|
52
|
+
dev = [
|
|
53
|
+
"pytest>=8.0",
|
|
54
|
+
"pytest-cov>=5.0",
|
|
55
|
+
"ruff>=0.6",
|
|
56
|
+
"mypy>=1.11",
|
|
57
|
+
# pandas-stubs gives mypy proper types for the pandas calls in
|
|
58
|
+
# alignment.py — without it strict mode reports `no-any-return`
|
|
59
|
+
# on the DataFrame-to-ndarray conversion.
|
|
60
|
+
"pandas-stubs>=2.0",
|
|
61
|
+
"build>=1.2",
|
|
62
|
+
"twine>=5.0",
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
[project.scripts]
|
|
66
|
+
admixture-cache = "admixture_cache.cli:cli"
|
|
67
|
+
|
|
68
|
+
[project.urls]
|
|
69
|
+
Homepage = "https://github.com/carstenerickson/admixture-cache"
|
|
70
|
+
Issues = "https://github.com/carstenerickson/admixture-cache/issues"
|
|
71
|
+
Source = "https://github.com/carstenerickson/admixture-cache"
|
|
72
|
+
Changelog = "https://github.com/carstenerickson/admixture-cache/blob/main/CHANGELOG.md"
|
|
73
|
+
|
|
74
|
+
[tool.setuptools.packages.find]
|
|
75
|
+
where = ["src"]
|
|
76
|
+
include = ["admixture_cache*"]
|
|
77
|
+
|
|
78
|
+
[tool.setuptools.package-data]
|
|
79
|
+
admixture_cache = ["py.typed"]
|
|
80
|
+
|
|
81
|
+
[tool.pytest.ini_options]
|
|
82
|
+
testpaths = ["tests"]
|
|
83
|
+
python_files = ["test_*.py"]
|
|
84
|
+
addopts = "-ra --strict-markers"
|
|
85
|
+
|
|
86
|
+
[tool.ruff]
|
|
87
|
+
target-version = "py311"
|
|
88
|
+
line-length = 100
|
|
89
|
+
|
|
90
|
+
[tool.ruff.lint]
|
|
91
|
+
select = ["E", "F", "W", "I", "B", "UP", "SIM", "RUF"]
|
|
92
|
+
ignore = [
|
|
93
|
+
"E501", # line length handled by ruff format
|
|
94
|
+
"RUF001", # × and – in strings: intentional in scientific docstrings
|
|
95
|
+
"RUF002", # × and – in docstrings: intentional
|
|
96
|
+
"RUF003", # × and – in comments: intentional
|
|
97
|
+
"RUF022", # __all__ semantically grouped, not alphabetical
|
|
98
|
+
]
|
|
99
|
+
|
|
100
|
+
[tool.mypy]
|
|
101
|
+
python_version = "3.11"
|
|
102
|
+
strict = true
|
|
103
|
+
files = ["src/admixture_cache"]
|
|
104
|
+
|
|
105
|
+
[[tool.mypy.overrides]]
|
|
106
|
+
module = "scipy.optimize"
|
|
107
|
+
ignore_missing_imports = true
|
|
108
|
+
|
|
109
|
+
[[tool.mypy.overrides]]
|
|
110
|
+
module = "pandas"
|
|
111
|
+
ignore_missing_imports = true
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""admixture-cache — precomputed-P supervised-ADMIXTURE projection.
|
|
2
|
+
|
|
3
|
+
Split the slow supervised-ADMIXTURE training pass (panel-only,
|
|
4
|
+
~hours, one-time per panel × K × clusters_yaml combo) out of the
|
|
5
|
+
per-target hot path. After building, project a new target's K-vector
|
|
6
|
+
in <2 seconds via SciPy SLSQP against the cached P matrix.
|
|
7
|
+
|
|
8
|
+
Two phases, two APIs:
|
|
9
|
+
|
|
10
|
+
1. **Panel cache build** (operator-facing, slow):
|
|
11
|
+
- :func:`build_panel_cache` runs stock ADMIXTURE × N restarts via
|
|
12
|
+
an injected ToolRunner, validates multimodality, writes the
|
|
13
|
+
canonical cached P + manifest.
|
|
14
|
+
|
|
15
|
+
2. **Per-target projection** (consumer-facing, fast):
|
|
16
|
+
- :func:`project_target` aligns target.bed to cached panel.bim
|
|
17
|
+
+ axes (via plink2), reads the target as a dosage vector,
|
|
18
|
+
solves for Q via scipy SLSQP under the binomial admixture
|
|
19
|
+
likelihood.
|
|
20
|
+
|
|
21
|
+
The math is validated to <1e-5 absolute Q-vector match against stock
|
|
22
|
+
ADMIXTURE on representative panels (15K samples × 850K SNPs at K=4).
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from admixture_cache.alignment import (
|
|
28
|
+
align_target_to_panel_bim,
|
|
29
|
+
extract_target_dosage_via_plink2,
|
|
30
|
+
)
|
|
31
|
+
from admixture_cache.builder import build_panel_cache, ld_prune_panel
|
|
32
|
+
from admixture_cache.errors import PanelCacheError, PopAutomationConfigError
|
|
33
|
+
from admixture_cache.io import (
|
|
34
|
+
load_cache_manifest,
|
|
35
|
+
load_cached_p,
|
|
36
|
+
sha256_file,
|
|
37
|
+
verify_cache_matches_current_config,
|
|
38
|
+
)
|
|
39
|
+
from admixture_cache.manifest import PanelCacheManifest
|
|
40
|
+
from admixture_cache.orchestration import project_target
|
|
41
|
+
from admixture_cache.projection import (
|
|
42
|
+
ProjectionResult,
|
|
43
|
+
numpy_supervised_projection,
|
|
44
|
+
)
|
|
45
|
+
from admixture_cache.runner import ToolRunner
|
|
46
|
+
|
|
47
|
+
__version__ = "1.0.0"
|
|
48
|
+
|
|
49
|
+
__all__ = [
|
|
50
|
+
# Public API — cache build (slow, one-time)
|
|
51
|
+
"build_panel_cache",
|
|
52
|
+
"ld_prune_panel", # optional pre-step before build_panel_cache
|
|
53
|
+
# Public API — per-target projection (fast)
|
|
54
|
+
"project_target",
|
|
55
|
+
"numpy_supervised_projection",
|
|
56
|
+
# Public API — alignment + dosage I/O
|
|
57
|
+
"align_target_to_panel_bim",
|
|
58
|
+
"extract_target_dosage_via_plink2",
|
|
59
|
+
# Public API — cache I/O + validation
|
|
60
|
+
"load_cached_p",
|
|
61
|
+
"load_cache_manifest",
|
|
62
|
+
"verify_cache_matches_current_config",
|
|
63
|
+
"sha256_file",
|
|
64
|
+
# Schemas
|
|
65
|
+
"PanelCacheManifest",
|
|
66
|
+
"ProjectionResult",
|
|
67
|
+
# Error type
|
|
68
|
+
"PanelCacheError",
|
|
69
|
+
# Back-compat alias for the upstream source-of-extraction; kept
|
|
70
|
+
# importable for callers mid-migration. Identical to
|
|
71
|
+
# PanelCacheError; safe to delete once no consumer relies on it.
|
|
72
|
+
"PopAutomationConfigError",
|
|
73
|
+
# Runner Protocol (for consumers' type hints)
|
|
74
|
+
"ToolRunner",
|
|
75
|
+
# Version
|
|
76
|
+
"__version__",
|
|
77
|
+
]
|