admixture-cache 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Carsten Erickson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,215 @@
1
+ Metadata-Version: 2.4
2
+ Name: admixture-cache
3
+ Version: 1.0.0
4
+ Summary: Precomputed-P supervised-ADMIXTURE projection cache: build slow once, project fast per target.
5
+ Author-email: Carsten Erickson <carstene@mailbox.org>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/carstenerickson/admixture-cache
8
+ Project-URL: Issues, https://github.com/carstenerickson/admixture-cache/issues
9
+ Project-URL: Source, https://github.com/carstenerickson/admixture-cache
10
+ Project-URL: Changelog, https://github.com/carstenerickson/admixture-cache/blob/main/CHANGELOG.md
11
+ Keywords: bioinformatics,genetics,admixture,ancestry,population-genetics,supervised-admixture,projection-cache
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Classifier: Operating System :: MacOS :: MacOS X
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3 :: Only
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Programming Language :: Python :: 3.14
24
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
25
+ Classifier: Typing :: Typed
26
+ Requires-Python: <3.15,>=3.11
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: numpy<3,>=1.26
30
+ Requires-Dist: scipy<2,>=1.11
31
+ Requires-Dist: pydantic<3,>=2.5
32
+ Requires-Dist: pandas<3,>=2.0
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest>=8.0; extra == "dev"
35
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
36
+ Requires-Dist: ruff>=0.6; extra == "dev"
37
+ Requires-Dist: mypy>=1.11; extra == "dev"
38
+ Requires-Dist: pandas-stubs>=2.0; extra == "dev"
39
+ Requires-Dist: build>=1.2; extra == "dev"
40
+ Requires-Dist: twine>=5.0; extra == "dev"
41
+ Dynamic: license-file
42
+
43
+ # admixture-cache
44
+
45
+ Precomputed-P supervised-ADMIXTURE projection cache. Build the slow training pass once per panel × K × clusters_yaml combo; project new targets in ~2 seconds.
46
+
47
+ ## Why this exists
48
+
49
+ Supervised ADMIXTURE training on a real-world panel takes hours to days per restart (K=21 regional cache: ~12-14 hr × 5 restarts; K=4 ancestral_cluster: ~5-7 hr × 5 restarts). For consumer pipelines serving many users, re-running this training per target is wasteful — the P matrix is determined almost entirely by the panel, not the target.
50
+
51
+ `admixture-cache` splits the supervised-ADMIXTURE workflow into:
52
+
53
+ 1. **Panel cache build** (operator, slow, one-time per panel update): stock ADMIXTURE × N restarts → cache best-LL P matrix + multimodality SD + manifest.
54
+ 2. **Per-target projection** (consumer, fast, every run): align target.bed to cached panel variants + axes (plink2), load dosages, solve for Q via scipy SLSQP under the standard binomial admixture likelihood.
55
+
56
+ The projection math matches stock ADMIXTURE Q values to within ~1e-5 absolute on representative workloads (15K samples × 850K SNPs at K=4).
57
+
58
+ ## Install
59
+
60
+ ```bash
61
+ pip install admixture-cache
62
+ ```
63
+
64
+ Python 3.11 through 3.14 are supported. End-to-end paths require **ADMIXTURE** (for `build`) and **plink2** (for `project` / `verify`) on `PATH`. Pure-library use without those binaries is fine — only the build/projection orchestrators shell out.
65
+
66
+ ## Quickstart — library
67
+
68
+ ```python
69
+ from pathlib import Path
70
+ from admixture_cache import build_panel_cache, project_target
71
+
72
+ # One-time, slow (~hours per restart per cache)
73
+ manifest = build_panel_cache(
74
+ panel_bed=Path("panel.bed"),
75
+ panel_pop_file=Path("panel.pop"),
76
+ clusters_yaml=Path("clusters.yaml"),
77
+ k=21,
78
+ cache_dir=Path("data/regional_k21_cache/"),
79
+ admixture_runner=my_tool_runner, # see ToolRunner Protocol below
80
+ track="regional",
81
+ panel_id="aadr_v66_ho",
82
+ panel_version="v66.0",
83
+ admixture_version="1.4.0",
84
+ seeds=[1, 2, 3, 4, 5],
85
+ sd_threshold=0.02,
86
+ )
87
+
88
+ # Per-target, fast (~2 seconds end-to-end)
89
+ result = project_target(
90
+ target_bed=Path("target.bed"),
91
+ cache_dir=Path("data/regional_k21_cache/"),
92
+ plink2_runner=my_plink2_runner,
93
+ work_dir=Path("scratch/projection/"),
94
+ )
95
+ print(result.target_q) # K-vector
96
+ print(result.cluster_order) # K names
97
+ print(result.panel_stability_max_sd) # cached panel restart_sd
98
+ ```
99
+
100
+ ## Quickstart — CLI
101
+
102
+ Installing the package registers the `admixture-cache` console script with four subcommands:
103
+
104
+ ```bash
105
+ # 1. Build a panel cache (slow, one-time).
106
+ admixture-cache build \
107
+ --panel-bed panel.bed \
108
+ --panel-pop panel.pop \
109
+ --clusters-yaml clusters.yaml \
110
+ --k 21 \
111
+ --cache-dir data/regional_k21_cache/ \
112
+ --track regional \
113
+ --panel-id aadr_v66_ho \
114
+ --panel-version v66.0 \
115
+ --seeds 1,2,3,4,5
116
+
117
+ # 2. Project a target against an existing cache (fast).
118
+ admixture-cache project \
119
+ --target-bed target.bed \
120
+ --cache-dir data/regional_k21_cache/ \
121
+ --work-dir scratch/projection/
122
+
123
+ # 3. Check whether a cache matches the current panel/YAML/K config.
124
+ admixture-cache verify \
125
+ --panel-bed panel.bed \
126
+ --clusters-yaml clusters.yaml \
127
+ --k 21 \
128
+ --cache-dir data/regional_k21_cache/
129
+
130
+ # 4. (Future) pull a canonical published cache. Placeholder until v1.0
131
+ # canonical-release artifacts ship.
132
+ admixture-cache download regional-k21-aadr-v66-ho
133
+ ```
134
+
135
+ The default `SubprocessToolRunner` runs the local `admixture` / `plink2` binaries on `PATH`; override with `--admixture-binary` / `--plink2-binary` to point at a specific build.
136
+
137
+ `build`, `project`, and `verify` all surface a non-zero exit code on failure with a descriptive `error: …` line on stderr. `project --json` emits machine-readable JSON instead of human-readable text.
138
+
139
+ ## ToolRunner Protocol
140
+
141
+ When calling the library from Python (rather than via the CLI), pass any object satisfying the `ToolRunner` Protocol:
142
+
143
+ ```python
144
+ from collections.abc import Callable
145
+ from pathlib import Path
146
+
147
+ class MyToolRunner:
148
+ def run(
149
+ self,
150
+ *,
151
+ args: list[str],
152
+ cwd: Path,
153
+ log_dir: Path,
154
+ timeout_seconds: int = 600,
155
+ # The two kwargs below are optional for serial builds but REQUIRED for
156
+ # parallel `build_panel_cache` (max_parallel_restarts > 1):
157
+ log_name: str | None = None,
158
+ pid_callback: Callable[[int], None] | None = None,
159
+ ) -> object:
160
+ ...
161
+ ```
162
+
163
+ - `log_name` — admixture-cache passes the per-restart canonical log filename (e.g. `restart_3.out`). Honor it when set; fall back to your own naming scheme when `None`. Required for parallel mode (concurrent restarts share `log_dir` and need disambiguated filenames).
164
+ - `pid_callback` — call with the subprocess PID immediately after spawning. admixture-cache uses this to SIGTERM in-flight restarts on first-failure cancellation. Required for parallel mode.
165
+ - Spawn subprocesses with `start_new_session=True` so each child gets its own process group. The cancellation path signals the pgid (via `os.killpg`) rather than the bare PID — avoids the classic UNIX PID-recycle race when a subprocess exits between PID capture and the cancellation pass.
166
+
167
+ Adapters that forward via `**kwargs` (e.g. `def run(self, **kwargs): return self._inner.run(**kwargs)`) are recognized as supporting both extensions — but the inner runner MUST actually honor them. A `**kwargs` forwarder that silently strips unknown kwargs will pass the parallel-mode guard but produce incoherent logs and broken cancellation.
168
+
169
+ For non-parallel use (`max_parallel_restarts=1`), both extensions are optional — only the four baseline kwargs are required.
170
+
171
+ ## Cache directory layout
172
+
173
+ After `build_panel_cache` succeeds, `cache_dir` contains:
174
+
175
+ ```
176
+ cache_dir/
177
+ ├── panel.K.P # Best-LL restart's allele freqs (M × K)
178
+ ├── panel.K.Q # Best-LL restart's non-target Q (N × K)
179
+ ├── panel.bim # Variant set + REF/ALT axes (alignment ref)
180
+ ├── restart_sd.json # Per-cluster SD across restarts
181
+ ├── cluster_order.json # K column → cluster name mapping
182
+ ├── manifest.json # Panel SHA + YAML SHA + K + version pins
183
+ └── build_logs/ # ADMIXTURE stdout/stderr per restart
184
+ ```
185
+
186
+ Cache validity is determined by `manifest.json` SHAs matching the current config (panel.bim, clusters_yaml, K, optional geo-filter YAMLs). Any mismatch → consumer code can fall back to a full ADMIXTURE training pass or rebuild the cache.
187
+
188
+ ## When to use this
189
+
190
+ - **Multi-user services**: cache once, project for every user (~5,000× per-target speedup at scale)
191
+ - **Reproducibility**: published canonical caches (forthcoming via GitHub Releases) give byte-identical P across consumers
192
+ - **CI/CD**: faster integration tests once you have a cache
193
+
194
+ ## When NOT to use this
195
+
196
+ - **One-time analyses** with a custom panel that won't be reused — full ADMIXTURE is simpler
197
+ - **Novel methodologies** requiring per-target P refinement — the projection assumes P is fully determined by the panel
198
+
199
+ ## Status
200
+
201
+ - **v1.0.0** — first PyPI release. Library + CLI surface frozen at this point; cache directory layout is stable at schema v1. Tracks numerical parity against stock ADMIXTURE; canonical published-cache artifacts to follow as separate GitHub releases.
202
+
203
+ See [CHANGELOG.md](CHANGELOG.md) for the per-release detail.
204
+
205
+ ## Contributing
206
+
207
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for dev setup, the three local validation gates (pytest / ruff / mypy), commit conventions, and the tag → OIDC PyPI release procedure.
208
+
209
+ ## Acknowledgments
210
+
211
+ This library was extracted from [ancestry-pipeline](https://github.com/carstenerickson/ancestry-pipeline)'s in-pipeline supervised-ADMIXTURE projection module (`pop_automation/admixture_projection.py`, ~744 LOC, validated against real-world workloads). The split lets sibling projects depend on the cache layer without pulling in the larger orchestrator.
212
+
213
+ ## License
214
+
215
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,173 @@
1
+ # admixture-cache
2
+
3
+ Precomputed-P supervised-ADMIXTURE projection cache. Build the slow training pass once per panel × K × clusters_yaml combo; project new targets in ~2 seconds.
4
+
5
+ ## Why this exists
6
+
7
+ Supervised ADMIXTURE training on a real-world panel takes hours to days per restart (K=21 regional cache: ~12-14 hr × 5 restarts; K=4 ancestral_cluster: ~5-7 hr × 5 restarts). For consumer pipelines serving many users, re-running this training per target is wasteful — the P matrix is determined almost entirely by the panel, not the target.
8
+
9
+ `admixture-cache` splits the supervised-ADMIXTURE workflow into:
10
+
11
+ 1. **Panel cache build** (operator, slow, one-time per panel update): stock ADMIXTURE × N restarts → cache best-LL P matrix + multimodality SD + manifest.
12
+ 2. **Per-target projection** (consumer, fast, every run): align target.bed to cached panel variants + axes (plink2), load dosages, solve for Q via scipy SLSQP under the standard binomial admixture likelihood.
13
+
14
+ The projection math matches stock ADMIXTURE Q values to within ~1e-5 absolute on representative workloads (15K samples × 850K SNPs at K=4).
15
+
16
+ ## Install
17
+
18
+ ```bash
19
+ pip install admixture-cache
20
+ ```
21
+
22
+ Python 3.11 through 3.14 are supported. End-to-end paths require **ADMIXTURE** (for `build`) and **plink2** (for `project` / `verify`) on `PATH`. Pure-library use without those binaries is fine — only the build/projection orchestrators shell out.
23
+
24
+ ## Quickstart — library
25
+
26
+ ```python
27
+ from pathlib import Path
28
+ from admixture_cache import build_panel_cache, project_target
29
+
30
+ # One-time, slow (~hours per restart per cache)
31
+ manifest = build_panel_cache(
32
+ panel_bed=Path("panel.bed"),
33
+ panel_pop_file=Path("panel.pop"),
34
+ clusters_yaml=Path("clusters.yaml"),
35
+ k=21,
36
+ cache_dir=Path("data/regional_k21_cache/"),
37
+ admixture_runner=my_tool_runner, # see ToolRunner Protocol below
38
+ track="regional",
39
+ panel_id="aadr_v66_ho",
40
+ panel_version="v66.0",
41
+ admixture_version="1.4.0",
42
+ seeds=[1, 2, 3, 4, 5],
43
+ sd_threshold=0.02,
44
+ )
45
+
46
+ # Per-target, fast (~2 seconds end-to-end)
47
+ result = project_target(
48
+ target_bed=Path("target.bed"),
49
+ cache_dir=Path("data/regional_k21_cache/"),
50
+ plink2_runner=my_plink2_runner,
51
+ work_dir=Path("scratch/projection/"),
52
+ )
53
+ print(result.target_q) # K-vector
54
+ print(result.cluster_order) # K names
55
+ print(result.panel_stability_max_sd) # cached panel restart_sd
56
+ ```
57
+
58
+ ## Quickstart — CLI
59
+
60
+ Installing the package registers the `admixture-cache` console script with four subcommands:
61
+
62
+ ```bash
63
+ # 1. Build a panel cache (slow, one-time).
64
+ admixture-cache build \
65
+ --panel-bed panel.bed \
66
+ --panel-pop panel.pop \
67
+ --clusters-yaml clusters.yaml \
68
+ --k 21 \
69
+ --cache-dir data/regional_k21_cache/ \
70
+ --track regional \
71
+ --panel-id aadr_v66_ho \
72
+ --panel-version v66.0 \
73
+ --seeds 1,2,3,4,5
74
+
75
+ # 2. Project a target against an existing cache (fast).
76
+ admixture-cache project \
77
+ --target-bed target.bed \
78
+ --cache-dir data/regional_k21_cache/ \
79
+ --work-dir scratch/projection/
80
+
81
+ # 3. Check whether a cache matches the current panel/YAML/K config.
82
+ admixture-cache verify \
83
+ --panel-bed panel.bed \
84
+ --clusters-yaml clusters.yaml \
85
+ --k 21 \
86
+ --cache-dir data/regional_k21_cache/
87
+
88
+ # 4. (Future) pull a canonical published cache. Placeholder until v1.0
89
+ # canonical-release artifacts ship.
90
+ admixture-cache download regional-k21-aadr-v66-ho
91
+ ```
92
+
93
+ The default `SubprocessToolRunner` runs the local `admixture` / `plink2` binaries on `PATH`; override with `--admixture-binary` / `--plink2-binary` to point at a specific build.
94
+
95
+ `build`, `project`, and `verify` all surface a non-zero exit code on failure with a descriptive `error: …` line on stderr. `project --json` emits machine-readable JSON instead of human-readable text.
96
+
97
+ ## ToolRunner Protocol
98
+
99
+ When calling the library from Python (rather than via the CLI), pass any object satisfying the `ToolRunner` Protocol:
100
+
101
+ ```python
102
+ from collections.abc import Callable
103
+ from pathlib import Path
104
+
105
+ class MyToolRunner:
106
+ def run(
107
+ self,
108
+ *,
109
+ args: list[str],
110
+ cwd: Path,
111
+ log_dir: Path,
112
+ timeout_seconds: int = 600,
113
+ # The two kwargs below are optional for serial builds but REQUIRED for
114
+ # parallel `build_panel_cache` (max_parallel_restarts > 1):
115
+ log_name: str | None = None,
116
+ pid_callback: Callable[[int], None] | None = None,
117
+ ) -> object:
118
+ ...
119
+ ```
120
+
121
+ - `log_name` — admixture-cache passes the per-restart canonical log filename (e.g. `restart_3.out`). Honor it when set; fall back to your own naming scheme when `None`. Required for parallel mode (concurrent restarts share `log_dir` and need disambiguated filenames).
122
+ - `pid_callback` — call with the subprocess PID immediately after spawning. admixture-cache uses this to SIGTERM in-flight restarts on first-failure cancellation. Required for parallel mode.
123
+ - Spawn subprocesses with `start_new_session=True` so each child gets its own process group. The cancellation path signals the pgid (via `os.killpg`) rather than the bare PID — avoids the classic UNIX PID-recycle race when a subprocess exits between PID capture and the cancellation pass.
124
+
125
+ Adapters that forward via `**kwargs` (e.g. `def run(self, **kwargs): return self._inner.run(**kwargs)`) are recognized as supporting both extensions — but the inner runner MUST actually honor them. A `**kwargs` forwarder that silently strips unknown kwargs will pass the parallel-mode guard but produce incoherent logs and broken cancellation.
126
+
127
+ For non-parallel use (`max_parallel_restarts=1`), both extensions are optional — only the four baseline kwargs are required.
128
+
129
+ ## Cache directory layout
130
+
131
+ After `build_panel_cache` succeeds, `cache_dir` contains:
132
+
133
+ ```
134
+ cache_dir/
135
+ ├── panel.K.P # Best-LL restart's allele freqs (M × K)
136
+ ├── panel.K.Q # Best-LL restart's non-target Q (N × K)
137
+ ├── panel.bim # Variant set + REF/ALT axes (alignment ref)
138
+ ├── restart_sd.json # Per-cluster SD across restarts
139
+ ├── cluster_order.json # K column → cluster name mapping
140
+ ├── manifest.json # Panel SHA + YAML SHA + K + version pins
141
+ └── build_logs/ # ADMIXTURE stdout/stderr per restart
142
+ ```
143
+
144
+ Cache validity is determined by `manifest.json` SHAs matching the current config (panel.bim, clusters_yaml, K, optional geo-filter YAMLs). Any mismatch → consumer code can fall back to a full ADMIXTURE training pass or rebuild the cache.
145
+
146
+ ## When to use this
147
+
148
+ - **Multi-user services**: cache once, project for every user (~5,000× per-target speedup at scale)
149
+ - **Reproducibility**: published canonical caches (forthcoming via GitHub Releases) give byte-identical P across consumers
150
+ - **CI/CD**: faster integration tests once you have a cache
151
+
152
+ ## When NOT to use this
153
+
154
+ - **One-time analyses** with a custom panel that won't be reused — full ADMIXTURE is simpler
155
+ - **Novel methodologies** requiring per-target P refinement — the projection assumes P is fully determined by the panel
156
+
157
+ ## Status
158
+
159
+ - **v1.0.0** — first PyPI release. Library + CLI surface frozen at this point; cache directory layout is stable at schema v1. Tracks numerical parity against stock ADMIXTURE; canonical published-cache artifacts to follow as separate GitHub releases.
160
+
161
+ See [CHANGELOG.md](CHANGELOG.md) for the per-release detail.
162
+
163
+ ## Contributing
164
+
165
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for dev setup, the three local validation gates (pytest / ruff / mypy), commit conventions, and the tag → OIDC PyPI release procedure.
166
+
167
+ ## Acknowledgments
168
+
169
+ This library was extracted from [ancestry-pipeline](https://github.com/carstenerickson/ancestry-pipeline)'s in-pipeline supervised-ADMIXTURE projection module (`pop_automation/admixture_projection.py`, ~744 LOC, validated against real-world workloads). The split lets sibling projects depend on the cache layer without pulling in the larger orchestrator.
170
+
171
+ ## License
172
+
173
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,111 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "admixture-cache"
7
+ version = "1.0.0"
8
+ description = "Precomputed-P supervised-ADMIXTURE projection cache: build slow once, project fast per target."
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ requires-python = ">=3.11,<3.15"
12
+ authors = [
13
+ { name = "Carsten Erickson", email = "carstene@mailbox.org" },
14
+ ]
15
+ keywords = [
16
+ "bioinformatics",
17
+ "genetics",
18
+ "admixture",
19
+ "ancestry",
20
+ "population-genetics",
21
+ "supervised-admixture",
22
+ "projection-cache",
23
+ ]
24
+ classifiers = [
25
+ "Development Status :: 4 - Beta",
26
+ "Environment :: Console",
27
+ "Intended Audience :: Science/Research",
28
+ "License :: OSI Approved :: MIT License",
29
+ "Operating System :: POSIX :: Linux",
30
+ "Operating System :: MacOS :: MacOS X",
31
+ "Programming Language :: Python :: 3",
32
+ "Programming Language :: Python :: 3 :: Only",
33
+ "Programming Language :: Python :: 3.11",
34
+ "Programming Language :: Python :: 3.12",
35
+ "Programming Language :: Python :: 3.13",
36
+ "Programming Language :: Python :: 3.14",
37
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
38
+ "Typing :: Typed",
39
+ ]
40
+ dependencies = [
41
+ "numpy>=1.26,<3",
42
+ "scipy>=1.11,<2",
43
+ "pydantic>=2.5,<3",
44
+ # pandas is used by `extract_target_dosage_via_plink2` to parse
45
+ # plink2 `--recode A` text output. Imported inline (not at module
46
+ # load) so the import cost only hits the projection hot path,
47
+ # but it IS required for the default project_target flow.
48
+ "pandas>=2.0,<3",
49
+ ]
50
+
51
+ [project.optional-dependencies]
52
+ dev = [
53
+ "pytest>=8.0",
54
+ "pytest-cov>=5.0",
55
+ "ruff>=0.6",
56
+ "mypy>=1.11",
57
+ # pandas-stubs gives mypy proper types for the pandas calls in
58
+ # alignment.py — without it strict mode reports `no-any-return`
59
+ # on the DataFrame-to-ndarray conversion.
60
+ "pandas-stubs>=2.0",
61
+ "build>=1.2",
62
+ "twine>=5.0",
63
+ ]
64
+
65
+ [project.scripts]
66
+ admixture-cache = "admixture_cache.cli:cli"
67
+
68
+ [project.urls]
69
+ Homepage = "https://github.com/carstenerickson/admixture-cache"
70
+ Issues = "https://github.com/carstenerickson/admixture-cache/issues"
71
+ Source = "https://github.com/carstenerickson/admixture-cache"
72
+ Changelog = "https://github.com/carstenerickson/admixture-cache/blob/main/CHANGELOG.md"
73
+
74
+ [tool.setuptools.packages.find]
75
+ where = ["src"]
76
+ include = ["admixture_cache*"]
77
+
78
+ [tool.setuptools.package-data]
79
+ admixture_cache = ["py.typed"]
80
+
81
+ [tool.pytest.ini_options]
82
+ testpaths = ["tests"]
83
+ python_files = ["test_*.py"]
84
+ addopts = "-ra --strict-markers"
85
+
86
+ [tool.ruff]
87
+ target-version = "py311"
88
+ line-length = 100
89
+
90
+ [tool.ruff.lint]
91
+ select = ["E", "F", "W", "I", "B", "UP", "SIM", "RUF"]
92
+ ignore = [
93
+ "E501", # line length handled by ruff format
94
+ "RUF001", # × and – in strings: intentional in scientific docstrings
95
+ "RUF002", # × and – in docstrings: intentional
96
+ "RUF003", # × and – in comments: intentional
97
+ "RUF022", # __all__ semantically grouped, not alphabetical
98
+ ]
99
+
100
+ [tool.mypy]
101
+ python_version = "3.11"
102
+ strict = true
103
+ files = ["src/admixture_cache"]
104
+
105
+ [[tool.mypy.overrides]]
106
+ module = "scipy.optimize"
107
+ ignore_missing_imports = true
108
+
109
+ [[tool.mypy.overrides]]
110
+ module = "pandas"
111
+ ignore_missing_imports = true
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,77 @@
1
+ """admixture-cache — precomputed-P supervised-ADMIXTURE projection.
2
+
3
+ Split the slow supervised-ADMIXTURE training pass (panel-only,
4
+ ~hours, one-time per panel × K × clusters_yaml combo) out of the
5
+ per-target hot path. After building, project a new target's K-vector
6
+ in <2 seconds via scipy SLSQP against the cached P matrix.
7
+
8
+ Two phases, two APIs:
9
+
10
+ 1. **Panel cache build** (operator-facing, slow):
11
+ - :func:`build_panel_cache` runs stock ADMIXTURE × N restarts via
12
+ an injected ToolRunner, validates multimodality, writes the
13
+ canonical cached P + manifest.
14
+
15
+ 2. **Per-target projection** (consumer-facing, fast):
16
+ - :func:`project_target` aligns target.bed to cached panel.bim
17
+ + axes (via plink2), reads the target as a dosage vector,
18
+ solves for Q via scipy SLSQP under the binomial admixture
19
+ likelihood.
20
+
21
+ The math is validated to <1e-5 absolute Q-vector match against stock
22
+ ADMIXTURE on representative panels (15K samples × 850K SNPs at K=4).
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from admixture_cache.alignment import (
28
+ align_target_to_panel_bim,
29
+ extract_target_dosage_via_plink2,
30
+ )
31
+ from admixture_cache.builder import build_panel_cache, ld_prune_panel
32
+ from admixture_cache.errors import PanelCacheError, PopAutomationConfigError
33
+ from admixture_cache.io import (
34
+ load_cache_manifest,
35
+ load_cached_p,
36
+ sha256_file,
37
+ verify_cache_matches_current_config,
38
+ )
39
+ from admixture_cache.manifest import PanelCacheManifest
40
+ from admixture_cache.orchestration import project_target
41
+ from admixture_cache.projection import (
42
+ ProjectionResult,
43
+ numpy_supervised_projection,
44
+ )
45
+ from admixture_cache.runner import ToolRunner
46
+
47
+ __version__ = "1.0.0"
48
+
49
+ __all__ = [
50
+ # Public API — cache build (slow, one-time)
51
+ "build_panel_cache",
52
+ "ld_prune_panel", # optional pre-step before build_panel_cache
53
+ # Public API — per-target projection (fast)
54
+ "project_target",
55
+ "numpy_supervised_projection",
56
+ # Public API — alignment + dosage I/O
57
+ "align_target_to_panel_bim",
58
+ "extract_target_dosage_via_plink2",
59
+ # Public API — cache I/O + validation
60
+ "load_cached_p",
61
+ "load_cache_manifest",
62
+ "verify_cache_matches_current_config",
63
+ "sha256_file",
64
+ # Schemas
65
+ "PanelCacheManifest",
66
+ "ProjectionResult",
67
+ # Error type
68
+ "PanelCacheError",
69
+ # Back-compat alias for the upstream source-of-extraction; kept
70
+ # importable for callers mid-migration. Identical to
71
+ # PanelCacheError; safe to delete once no consumer relies on it.
72
+ "PopAutomationConfigError",
73
+ # Runner Protocol (for consumers' type hints)
74
+ "ToolRunner",
75
+ # Version
76
+ "__version__",
77
+ ]