aadr-subset 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aadr_subset-0.1.0/LICENSE +21 -0
- aadr_subset-0.1.0/PKG-INFO +291 -0
- aadr_subset-0.1.0/README.md +249 -0
- aadr_subset-0.1.0/pyproject.toml +109 -0
- aadr_subset-0.1.0/setup.cfg +4 -0
- aadr_subset-0.1.0/src/aadr_subset/__init__.py +5 -0
- aadr_subset-0.1.0/src/aadr_subset/__main__.py +6 -0
- aadr_subset-0.1.0/src/aadr_subset/cli.py +359 -0
- aadr_subset-0.1.0/src/aadr_subset/commands/__init__.py +1 -0
- aadr_subset-0.1.0/src/aadr_subset/commands/inspect_cmd.py +129 -0
- aadr_subset-0.1.0/src/aadr_subset/commands/report_cmd.py +168 -0
- aadr_subset-0.1.0/src/aadr_subset/commands/select_cmd.py +366 -0
- aadr_subset-0.1.0/src/aadr_subset/commands/template_cmd.py +58 -0
- aadr_subset-0.1.0/src/aadr_subset/commands/validate_cmd.py +40 -0
- aadr_subset-0.1.0/src/aadr_subset/engine.py +568 -0
- aadr_subset-0.1.0/src/aadr_subset/errors.py +99 -0
- aadr_subset-0.1.0/src/aadr_subset/formats.py +301 -0
- aadr_subset-0.1.0/src/aadr_subset/py.typed +0 -0
- aadr_subset-0.1.0/src/aadr_subset/reporting.py +423 -0
- aadr_subset-0.1.0/src/aadr_subset/schemas/selector.schema.json +168 -0
- aadr_subset-0.1.0/src/aadr_subset/selector.py +819 -0
- aadr_subset-0.1.0/src/aadr_subset/templates/bronze_age_europe.yaml +52 -0
- aadr_subset-0.1.0/src/aadr_subset/templates/iron_age_britain.yaml +33 -0
- aadr_subset-0.1.0/src/aadr_subset/templates/modern_european.yaml +55 -0
- aadr_subset-0.1.0/src/aadr_subset/templates/neolithic_anatolia.yaml +44 -0
- aadr_subset-0.1.0/src/aadr_subset/templates/viking_period_scandinavian.yaml +54 -0
- aadr_subset-0.1.0/src/aadr_subset/templates/wsh_steppe_pool.yaml +57 -0
- aadr_subset-0.1.0/src/aadr_subset/templates.py +76 -0
- aadr_subset-0.1.0/src/aadr_subset/types.py +166 -0
- aadr_subset-0.1.0/src/aadr_subset.egg-info/PKG-INFO +291 -0
- aadr_subset-0.1.0/src/aadr_subset.egg-info/SOURCES.txt +33 -0
- aadr_subset-0.1.0/src/aadr_subset.egg-info/dependency_links.txt +1 -0
- aadr_subset-0.1.0/src/aadr_subset.egg-info/entry_points.txt +2 -0
- aadr_subset-0.1.0/src/aadr_subset.egg-info/requires.txt +18 -0
- aadr_subset-0.1.0/src/aadr_subset.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Carsten Erickson
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aadr-subset
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Declarative AADR panel subsetting from YAML selectors; the missing first-class tool for cohort definitions in ancient-DNA / population-genetics workflows.
|
|
5
|
+
Author-email: Carsten Erickson <carstene@mailbox.org>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/carstenerickson/aadr-subset
|
|
8
|
+
Project-URL: Issues, https://github.com/carstenerickson/aadr-subset/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/carstenerickson/aadr-subset/blob/main/CHANGELOG.md
|
|
10
|
+
Keywords: bioinformatics,genetics,aadr,ancient-dna,admixtools,population-genetics
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
15
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
|
+
Classifier: Typing :: Typed
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: aadr-resolve<0.3,>=0.2.0
|
|
25
|
+
Requires-Dist: pandas<3,>=2.2
|
|
26
|
+
Requires-Dist: pyyaml>=6.0
|
|
27
|
+
Requires-Dist: ruamel.yaml>=0.18
|
|
28
|
+
Requires-Dist: jsonschema>=4.20
|
|
29
|
+
Requires-Dist: click<9,>=8.1
|
|
30
|
+
Requires-Dist: rfc8785>=0.1
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff>=0.6; extra == "dev"
|
|
35
|
+
Requires-Dist: mypy>=1.11; extra == "dev"
|
|
36
|
+
Requires-Dist: types-jsonschema>=4.0; extra == "dev"
|
|
37
|
+
Requires-Dist: types-PyYAML>=6.0; extra == "dev"
|
|
38
|
+
Requires-Dist: pandas-stubs>=2.2; extra == "dev"
|
|
39
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
40
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
41
|
+
Dynamic: license-file
|
|
42
|
+
|
|
43
|
+
# aadr-subset
|
|
44
|
+
|
|
45
|
+
Declarative AADR panel subsetting from YAML selectors. Replaces ad-hoc
|
|
46
|
+
`awk` pipelines and one-off scripts with version-stable,
|
|
47
|
+
PR-reviewable cohort definitions. Built on top of
|
|
48
|
+
[aadr-resolve](https://github.com/carstenerickson/aadr-resolve) for
|
|
49
|
+
cross-AADR-version sample-ID mapping.
|
|
50
|
+
|
|
51
|
+
```yaml
|
|
52
|
+
# britain_iron_age.yaml
|
|
53
|
+
populations: [England_IA, England_IA.AG, England_IA.SG]
|
|
54
|
+
date: {min_calbp: 1900, max_calbp: 2400}
|
|
55
|
+
min_coverage: 0.3
|
|
56
|
+
exclude:
|
|
57
|
+
individual_ids: [I12345] # known contaminated sample
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
$ aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno -o cohort.ids
|
|
62
|
+
Selector: britain_iron_age.yaml (sha256:1a2b3c4...d5e6f70)
|
|
63
|
+
.anno: v66.HO.aadr.PUB.anno (v66.0, class E)
|
|
64
|
+
|
|
65
|
+
Matched 45 samples across 1 populations.
|
|
66
|
+
|
|
67
|
+
Per-population: England_IA=45
|
|
68
|
+
|
|
69
|
+
Wrote cohort.ids (45 lines)
|
|
70
|
+
Done in 0.18s (parse 0.16s, eval 0.02s, write 0.00s).
|
|
71
|
+
|
|
72
|
+
$ plink2 --pfile aadr_v66 --keep cohort.ids --make-pgen --out britain_iron_age
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Why it exists
|
|
76
|
+
|
|
77
|
+
Ancient-DNA workflows live and die on cohort definitions — *which samples
|
|
78
|
+
go into this analysis*. Today that's typically a hand-curated set of
|
|
79
|
+
Group_ID literals in someone's shell script, prone to: silent breakage
|
|
80
|
+
when AADR releases a new version with renamed labels; no version pinning
|
|
81
|
+
in commit history; no way to share the exact cohort between collaborators
|
|
82
|
+
short of swapping `.ind` files.
|
|
83
|
+
|
|
84
|
+
`aadr-subset` makes the cohort itself a first-class artifact:
|
|
85
|
+
|
|
86
|
+
- **Selector YAMLs are version-stable.** They cite AADR releases via
|
|
87
|
+
`tested_against:` metadata; the `selector_signature` (RFC 8785 JCS
|
|
88
|
+
SHA-256 over the canonical form) gives you a hash that survives
|
|
89
|
+
YAML formatting churn.
|
|
90
|
+
- **Reviewable in PRs.** The grammar is flat (top-level AND with
|
|
91
|
+
one-level `any:` OR and one-level `exclude:` NOT). What you see is
|
|
92
|
+
what runs.
|
|
93
|
+
- **Cross-version via `aadr-resolve`.** `resolve_to_version:` lifts
|
|
94
|
+
Individual_IDs from an older release to the newer one through the
|
|
95
|
+
GID-stable bridge + MID-rename map.
|
|
96
|
+
- **Five subcommands** cover the full lifecycle: `validate`, `select`,
|
|
97
|
+
`inspect`, `report`, `template`.
|
|
98
|
+
|
|
99
|
+
## Install
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
pip install aadr-subset # once published to PyPI; until then:
|
|
103
|
+
pip install git+https://github.com/carstenerickson/aadr-subset.git
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Python 3.11+. The only dependency not yet on PyPI is `aadr-resolve` (also
|
|
107
|
+
installed via git URL until both ship to PyPI).
|
|
108
|
+
|
|
109
|
+
For development:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
git clone https://github.com/carstenerickson/aadr-subset.git
|
|
113
|
+
cd aadr-subset
|
|
114
|
+
pip install -e ".[dev]"
|
|
115
|
+
pytest
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## The five subcommands
|
|
119
|
+
|
|
120
|
+
### `validate SELECTOR.yaml`
|
|
121
|
+
|
|
122
|
+
JSON-schema + semantic-constraint check on a selector. No `.anno`
|
|
123
|
+
required. Useful as a CI gate.
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
$ aadr-subset validate britain_iron_age.yaml
|
|
127
|
+
# exit 0 on valid; exit 4 on schema or semantic violation
|
|
128
|
+
# Errors carry precise file:line:col + JSON pointer:
|
|
129
|
+
$ aadr-subset validate broken.yaml
|
|
130
|
+
broken.yaml:7:5: at /populations/2: 42 is not of type 'string'
|
|
131
|
+
broken.yaml:12:3: at /any/0/min_coverage: -0.5 is less than the minimum of 0
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### `select SELECTOR.yaml ANNO.anno [-o PATH] [--format ids|tsv|json]`
|
|
135
|
+
|
|
136
|
+
The main case: materialize a selector against a target `.anno` and
|
|
137
|
+
write matched sample IDs / TSV / JSON.
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno -o cohort.ids
|
|
141
|
+
aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno --format tsv -o cohort.tsv
|
|
142
|
+
aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno --format json -o cohort.json
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Cross-version flow (selector defined against an older release than the
|
|
146
|
+
materialized one):
|
|
147
|
+
|
|
148
|
+
```yaml
|
|
149
|
+
# britain_v62_lift.yaml
|
|
150
|
+
individual_ids: [I12345, I12346]
|
|
151
|
+
source_version: v62.0
|
|
152
|
+
resolve_to_version: v66.0
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
aadr-subset select britain_v62_lift.yaml v66.HO.aadr.PUB.anno \
|
|
157
|
+
--source-anno v62.0_HO_public.anno \
|
|
158
|
+
-o lifted.ids
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
v62.0 inputs (class D — no native coverage column) need a derived proxy
|
|
162
|
+
for `min_coverage:` filters:
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
aadr-subset select britain_iron_age.yaml v62.0_HO_public.anno \
|
|
166
|
+
--coverage-derive snps_hit_1240k -o cohort.ids
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### `inspect SELECTOR.yaml ANNO.anno`
|
|
170
|
+
|
|
171
|
+
Dry-run: shows what a selector matches without writing any file.
|
|
172
|
+
Always exits 0 — meant for debugging selector logic.
|
|
173
|
+
|
|
174
|
+
```
|
|
175
|
+
$ aadr-subset inspect britain_iron_age.yaml v66.HO.aadr.PUB.anno
|
|
176
|
+
Selector: britain_iron_age.yaml
|
|
177
|
+
.anno: v66.HO.aadr.PUB.anno (v66.0, class E, 27,755 samples)
|
|
178
|
+
|
|
179
|
+
Matched: 45 samples across 1 population
|
|
180
|
+
|
|
181
|
+
Per-population breakdown:
|
|
182
|
+
England_IA 45
|
|
183
|
+
|
|
184
|
+
Branch contributions:
|
|
185
|
+
top_level 45
|
|
186
|
+
|
|
187
|
+
Date range of matched: 1934 - 2398 calBP (median 2103)
|
|
188
|
+
Coverage range: 0.34 - 4.81x (median 1.28)
|
|
189
|
+
|
|
190
|
+
Selector signature: sha256:1a2b3c4d5e6f70819203a4b5c6d7e8f90a1b2c3d4e5f60718293a4b5cd5e6f70
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### `report SELECTOR.yaml ANNO.anno [-o PATH] [--format tsv|json]`
|
|
194
|
+
|
|
195
|
+
Per-population aggregates: how many samples each Group_ID contributed,
|
|
196
|
+
with date range and coverage stats.
|
|
197
|
+
|
|
198
|
+
```
|
|
199
|
+
$ aadr-subset report britain_iron_age.yaml v66.HO.aadr.PUB.anno
|
|
200
|
+
group_id n_matched n_in_anno pct_matched date_min_calbp date_max_calbp coverage_median
|
|
201
|
+
England_IA 45 51 88.2 1934 2398 1.28
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
`--include-empty-groups` adds rows for `.anno` groups that matched
|
|
205
|
+
zero samples (useful for population-survey workflows).
|
|
206
|
+
|
|
207
|
+
### `template [NAME] [-o PATH]`
|
|
208
|
+
|
|
209
|
+
Ships starter selectors for common cohorts. No-arg form lists
|
|
210
|
+
shipped templates; arg form emits the verbatim YAML (comments + metadata
|
|
211
|
+
block preserved) to stdout or `--out PATH`.
|
|
212
|
+
|
|
213
|
+
```
|
|
214
|
+
$ aadr-subset template
|
|
215
|
+
bronze_age_europe
|
|
216
|
+
iron_age_britain
|
|
217
|
+
modern_european
|
|
218
|
+
neolithic_anatolia
|
|
219
|
+
viking_period_scandinavian
|
|
220
|
+
wsh_steppe_pool
|
|
221
|
+
|
|
222
|
+
$ aadr-subset template iron_age_britain -o britain.yaml
|
|
223
|
+
# britain.yaml now contains a working starting point — edit + extend.
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
All shipped templates are verified against AADR **v62.0** and **v66.0** —
|
|
227
|
+
each template's `tested_against:` metadata lists the releases against
|
|
228
|
+
which it produces non-zero matches.
|
|
229
|
+
|
|
230
|
+
## Exit codes
|
|
231
|
+
|
|
232
|
+
| Code | Meaning |
|
|
233
|
+
|---|---|
|
|
234
|
+
| 0 | Success |
|
|
235
|
+
| 1 | Soft validation failure (e.g. zero-match without `--allow-empty`, `--strict-resolve` missing IIDs) |
|
|
236
|
+
| 2 | I/O failure (file not found, `.anno` schema unrecognized, etc.) |
|
|
237
|
+
| 4 | Usage error (schema violation, flag misuse, unknown template) |
|
|
238
|
+
| 70 | Internal error (please file an issue) |
|
|
239
|
+
|
|
240
|
+
## Selector grammar (overview)
|
|
241
|
+
|
|
242
|
+
Flat — one level of nesting maximum. Top-level keys AND-combine.
|
|
243
|
+
|
|
244
|
+
```yaml
|
|
245
|
+
# Top-level AND
|
|
246
|
+
populations: [Western_HG, Eastern_HG] # match against group_id
|
|
247
|
+
individual_ids: [Loschbour, KO1] # match against individual_id
|
|
248
|
+
individual_ids_source: ids.txt # newline-delimited file
|
|
249
|
+
modern_only: true # shorthand: date_calbp <= 70
|
|
250
|
+
min_coverage: 0.3
|
|
251
|
+
coverage_column: snps_hit_1240k # override; selector-side wins over --coverage-derive
|
|
252
|
+
date:
|
|
253
|
+
min_calbp: 1900
|
|
254
|
+
max_calbp: 2400
|
|
255
|
+
source_version: v62.0 # cross-version lift
|
|
256
|
+
resolve_to_version: v66.0
|
|
257
|
+
|
|
258
|
+
# One-level OR (matches any branch)
|
|
259
|
+
any:
|
|
260
|
+
- populations: [Western_HG]
|
|
261
|
+
min_coverage: 1.0
|
|
262
|
+
- populations: [Eastern_HG]
|
|
263
|
+
min_coverage: 0.5
|
|
264
|
+
|
|
265
|
+
# One-level NOT-of-OR (drops matches)
|
|
266
|
+
exclude:
|
|
267
|
+
group_ids: [English.SG]
|
|
268
|
+
individual_ids: [I12345]
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
Full spec: [aadr-subset HLD](https://github.com/carstenerickson/aadr-subset/blob/main/docs/hld.md).
|
|
272
|
+
|
|
273
|
+
## Composing with `plink2`
|
|
274
|
+
|
|
275
|
+
```bash
|
|
276
|
+
# Materialize a cohort
|
|
277
|
+
aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno -o cohort.ids
|
|
278
|
+
|
|
279
|
+
# Use it as a plink2 keep set
|
|
280
|
+
plink2 --pfile aadr_v66 \
|
|
281
|
+
--keep cohort.ids \
|
|
282
|
+
--make-pgen --out britain_iron_age_subset
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
`select --format json` produces a structured artifact suitable for
|
|
286
|
+
pipeline metadata logging (records the selector signature, AADR version,
|
|
287
|
+
schema class, and effective coverage column).
|
|
288
|
+
|
|
289
|
+
## License
|
|
290
|
+
|
|
291
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
# aadr-subset
|
|
2
|
+
|
|
3
|
+
Declarative AADR panel subsetting from YAML selectors. Replaces ad-hoc
|
|
4
|
+
`awk` pipelines and one-off scripts with version-stable,
|
|
5
|
+
PR-reviewable cohort definitions. Built on top of
|
|
6
|
+
[aadr-resolve](https://github.com/carstenerickson/aadr-resolve) for
|
|
7
|
+
cross-AADR-version sample-ID mapping.
|
|
8
|
+
|
|
9
|
+
```yaml
|
|
10
|
+
# britain_iron_age.yaml
|
|
11
|
+
populations: [England_IA, England_IA.AG, England_IA.SG]
|
|
12
|
+
date: {min_calbp: 1900, max_calbp: 2400}
|
|
13
|
+
min_coverage: 0.3
|
|
14
|
+
exclude:
|
|
15
|
+
individual_ids: [I12345] # known contaminated sample
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
$ aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno -o cohort.ids
|
|
20
|
+
Selector: britain_iron_age.yaml (sha256:1a2b3c4...d5e6f70)
|
|
21
|
+
.anno: v66.HO.aadr.PUB.anno (v66.0, class E)
|
|
22
|
+
|
|
23
|
+
Matched 45 samples across 1 populations.
|
|
24
|
+
|
|
25
|
+
Per-population: England_IA=45
|
|
26
|
+
|
|
27
|
+
Wrote cohort.ids (45 lines)
|
|
28
|
+
Done in 0.18s (parse 0.16s, eval 0.02s, write 0.00s).
|
|
29
|
+
|
|
30
|
+
$ plink2 --pfile aadr_v66 --keep cohort.ids --make-pgen --out britain_iron_age
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Why it exists
|
|
34
|
+
|
|
35
|
+
Ancient-DNA workflows live and die on cohort definitions — *which samples
|
|
36
|
+
go into this analysis*. Today that's typically a hand-curated set of
|
|
37
|
+
Group_ID literals in someone's shell script, prone to: silent breakage
|
|
38
|
+
when AADR releases a new version with renamed labels; no version pinning
|
|
39
|
+
in commit history; no way to share the exact cohort between collaborators
|
|
40
|
+
short of swapping `.ind` files.
|
|
41
|
+
|
|
42
|
+
`aadr-subset` makes the cohort itself a first-class artifact:
|
|
43
|
+
|
|
44
|
+
- **Selector YAMLs are version-stable.** They cite AADR releases via
|
|
45
|
+
`tested_against:` metadata; the `selector_signature` (RFC 8785 JCS
|
|
46
|
+
SHA-256 over the canonical form) gives you a hash that survives
|
|
47
|
+
YAML formatting churn.
|
|
48
|
+
- **Reviewable in PRs.** The grammar is flat (top-level AND with
|
|
49
|
+
one-level `any:` OR and one-level `exclude:` NOT). What you see is
|
|
50
|
+
what runs.
|
|
51
|
+
- **Cross-version via `aadr-resolve`.** `resolve_to_version:` lifts
|
|
52
|
+
Individual_IDs from an older release to the newer one through the
|
|
53
|
+
GID-stable bridge + MID-rename map.
|
|
54
|
+
- **Five subcommands** cover the full lifecycle: `validate`, `select`,
|
|
55
|
+
`inspect`, `report`, `template`.
|
|
56
|
+
|
|
57
|
+
## Install
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install aadr-subset # once published to PyPI; until then:
|
|
61
|
+
pip install git+https://github.com/carstenerickson/aadr-subset.git
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Python 3.11+. The only dependency not yet on PyPI is `aadr-resolve` (also
|
|
65
|
+
installed via git URL until both ship to PyPI).
|
|
66
|
+
|
|
67
|
+
For development:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
git clone https://github.com/carstenerickson/aadr-subset.git
|
|
71
|
+
cd aadr-subset
|
|
72
|
+
pip install -e ".[dev]"
|
|
73
|
+
pytest
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## The five subcommands
|
|
77
|
+
|
|
78
|
+
### `validate SELECTOR.yaml`
|
|
79
|
+
|
|
80
|
+
JSON-schema + semantic-constraint check on a selector. No `.anno`
|
|
81
|
+
required. Useful as a CI gate.
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
$ aadr-subset validate britain_iron_age.yaml
|
|
85
|
+
# exit 0 on valid; exit 4 on schema or semantic violation
|
|
86
|
+
# Errors carry precise file:line:col + JSON pointer:
|
|
87
|
+
$ aadr-subset validate broken.yaml
|
|
88
|
+
broken.yaml:7:5: at /populations/2: 42 is not of type 'string'
|
|
89
|
+
broken.yaml:12:3: at /any/0/min_coverage: -0.5 is less than the minimum of 0
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### `select SELECTOR.yaml ANNO.anno [-o PATH] [--format ids|tsv|json]`
|
|
93
|
+
|
|
94
|
+
The main case: materialize a selector against a target `.anno` and
|
|
95
|
+
write matched sample IDs / TSV / JSON.
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno -o cohort.ids
|
|
99
|
+
aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno --format tsv -o cohort.tsv
|
|
100
|
+
aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno --format json -o cohort.json
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Cross-version flow (selector defined against an older release than the
|
|
104
|
+
materialized one):
|
|
105
|
+
|
|
106
|
+
```yaml
|
|
107
|
+
# britain_v62_lift.yaml
|
|
108
|
+
individual_ids: [I12345, I12346]
|
|
109
|
+
source_version: v62.0
|
|
110
|
+
resolve_to_version: v66.0
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
aadr-subset select britain_v62_lift.yaml v66.HO.aadr.PUB.anno \
|
|
115
|
+
--source-anno v62.0_HO_public.anno \
|
|
116
|
+
-o lifted.ids
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
v62.0 inputs (class D — no native coverage column) need a derived proxy
|
|
120
|
+
for `min_coverage:` filters:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
aadr-subset select britain_iron_age.yaml v62.0_HO_public.anno \
|
|
124
|
+
--coverage-derive snps_hit_1240k -o cohort.ids
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### `inspect SELECTOR.yaml ANNO.anno`
|
|
128
|
+
|
|
129
|
+
Dry-run: shows what a selector matches without writing any file.
|
|
130
|
+
Always exits 0 — meant for debugging selector logic.
|
|
131
|
+
|
|
132
|
+
```
|
|
133
|
+
$ aadr-subset inspect britain_iron_age.yaml v66.HO.aadr.PUB.anno
|
|
134
|
+
Selector: britain_iron_age.yaml
|
|
135
|
+
.anno: v66.HO.aadr.PUB.anno (v66.0, class E, 27,755 samples)
|
|
136
|
+
|
|
137
|
+
Matched: 45 samples across 1 population
|
|
138
|
+
|
|
139
|
+
Per-population breakdown:
|
|
140
|
+
England_IA 45
|
|
141
|
+
|
|
142
|
+
Branch contributions:
|
|
143
|
+
top_level 45
|
|
144
|
+
|
|
145
|
+
Date range of matched: 1934 - 2398 calBP (median 2103)
|
|
146
|
+
Coverage range: 0.34 - 4.81x (median 1.28)
|
|
147
|
+
|
|
148
|
+
Selector signature: sha256:1a2b3c4d5e6f70819203a4b5c6d7e8f90a1b2c3d4e5f60718293a4b5cd5e6f70
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### `report SELECTOR.yaml ANNO.anno [-o PATH] [--format tsv|json]`
|
|
152
|
+
|
|
153
|
+
Per-population aggregates: how many samples each Group_ID contributed,
|
|
154
|
+
with date range and coverage stats.
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
$ aadr-subset report britain_iron_age.yaml v66.HO.aadr.PUB.anno
|
|
158
|
+
group_id n_matched n_in_anno pct_matched date_min_calbp date_max_calbp coverage_median
|
|
159
|
+
England_IA 45 51 88.2 1934 2398 1.28
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
`--include-empty-groups` adds rows for `.anno` groups that matched
|
|
163
|
+
zero samples (useful for population-survey workflows).
|
|
164
|
+
|
|
165
|
+
### `template [NAME] [-o PATH]`
|
|
166
|
+
|
|
167
|
+
Ships starter selectors for common cohorts. No-arg form lists
|
|
168
|
+
shipped templates; arg form emits the verbatim YAML (comments + metadata
|
|
169
|
+
block preserved) to stdout or `--out PATH`.
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
$ aadr-subset template
|
|
173
|
+
bronze_age_europe
|
|
174
|
+
iron_age_britain
|
|
175
|
+
modern_european
|
|
176
|
+
neolithic_anatolia
|
|
177
|
+
viking_period_scandinavian
|
|
178
|
+
wsh_steppe_pool
|
|
179
|
+
|
|
180
|
+
$ aadr-subset template iron_age_britain -o britain.yaml
|
|
181
|
+
# britain.yaml now contains a working starting point — edit + extend.
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
All shipped templates are verified against AADR **v62.0** and **v66.0** —
|
|
185
|
+
each template's `tested_against:` metadata lists the releases against
|
|
186
|
+
which it produces non-zero matches.
|
|
187
|
+
|
|
188
|
+
## Exit codes
|
|
189
|
+
|
|
190
|
+
| Code | Meaning |
|
|
191
|
+
|---|---|
|
|
192
|
+
| 0 | Success |
|
|
193
|
+
| 1 | Soft validation failure (e.g. zero-match without `--allow-empty`, `--strict-resolve` missing IIDs) |
|
|
194
|
+
| 2 | I/O failure (file not found, `.anno` schema unrecognized, etc.) |
|
|
195
|
+
| 4 | Usage error (schema violation, flag misuse, unknown template) |
|
|
196
|
+
| 70 | Internal error (please file an issue) |
|
|
197
|
+
|
|
198
|
+
## Selector grammar (overview)
|
|
199
|
+
|
|
200
|
+
Flat — one level of nesting maximum. Top-level keys AND-combine.
|
|
201
|
+
|
|
202
|
+
```yaml
|
|
203
|
+
# Top-level AND
|
|
204
|
+
populations: [Western_HG, Eastern_HG] # match against group_id
|
|
205
|
+
individual_ids: [Loschbour, KO1] # match against individual_id
|
|
206
|
+
individual_ids_source: ids.txt # newline-delimited file
|
|
207
|
+
modern_only: true # shorthand: date_calbp <= 70
|
|
208
|
+
min_coverage: 0.3
|
|
209
|
+
coverage_column: snps_hit_1240k # override; selector-side wins over --coverage-derive
|
|
210
|
+
date:
|
|
211
|
+
min_calbp: 1900
|
|
212
|
+
max_calbp: 2400
|
|
213
|
+
source_version: v62.0 # cross-version lift
|
|
214
|
+
resolve_to_version: v66.0
|
|
215
|
+
|
|
216
|
+
# One-level OR (matches any branch)
|
|
217
|
+
any:
|
|
218
|
+
- populations: [Western_HG]
|
|
219
|
+
min_coverage: 1.0
|
|
220
|
+
- populations: [Eastern_HG]
|
|
221
|
+
min_coverage: 0.5
|
|
222
|
+
|
|
223
|
+
# One-level NOT-of-OR (drops matches)
|
|
224
|
+
exclude:
|
|
225
|
+
group_ids: [English.SG]
|
|
226
|
+
individual_ids: [I12345]
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
Full spec: [aadr-subset HLD](https://github.com/carstenerickson/aadr-subset/blob/main/docs/hld.md).
|
|
230
|
+
|
|
231
|
+
## Composing with `plink2`
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
# Materialize a cohort
|
|
235
|
+
aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno -o cohort.ids
|
|
236
|
+
|
|
237
|
+
# Use it as a plink2 keep set
|
|
238
|
+
plink2 --pfile aadr_v66 \
|
|
239
|
+
--keep cohort.ids \
|
|
240
|
+
--make-pgen --out britain_iron_age_subset
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
`select --format json` produces a structured artifact suitable for
|
|
244
|
+
pipeline metadata logging (records the selector signature, AADR version,
|
|
245
|
+
schema class, and effective coverage column).
|
|
246
|
+
|
|
247
|
+
## License
|
|
248
|
+
|
|
249
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "aadr-subset"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Declarative AADR panel subsetting from YAML selectors; the missing first-class tool for cohort definitions in ancient-DNA / population-genetics workflows."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Carsten Erickson", email = "carstene@mailbox.org" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["bioinformatics", "genetics", "aadr", "ancient-dna", "admixtools", "population-genetics"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Operating System :: POSIX :: Linux",
|
|
21
|
+
"Operating System :: MacOS :: MacOS X",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Programming Language :: Python :: 3.13",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
26
|
+
"Typing :: Typed",
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
# aadr-resolve is the hard library dependency (see HLD section "aadr-resolve
|
|
30
|
+
# as a library dependency"). Tracking the 0.2.x line: 0.2.0 ships the
|
|
31
|
+
# `MissingNativeFieldError`, `AnnoFrame.path`, and `resolve_master_ids`
|
|
32
|
+
# APIs that aadr-subset relies on.
|
|
33
|
+
"aadr-resolve>=0.2.0,<0.3",
|
|
34
|
+
"pandas>=2.2,<3",
|
|
35
|
+
"pyyaml>=6.0",
|
|
36
|
+
"ruamel.yaml>=0.18",
|
|
37
|
+
"jsonschema>=4.20",
|
|
38
|
+
"click>=8.1,<9",
|
|
39
|
+
"rfc8785>=0.1",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.optional-dependencies]
|
|
43
|
+
dev = [
|
|
44
|
+
"pytest>=8.0",
|
|
45
|
+
"pytest-cov>=5.0",
|
|
46
|
+
"ruff>=0.6",
|
|
47
|
+
"mypy>=1.11",
|
|
48
|
+
"types-jsonschema>=4.0",
|
|
49
|
+
"types-PyYAML>=6.0",
|
|
50
|
+
"pandas-stubs>=2.2",
|
|
51
|
+
"build>=1.2",
|
|
52
|
+
"twine>=5.0",
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
[project.scripts]
|
|
56
|
+
aadr-subset = "aadr_subset.cli:main"
|
|
57
|
+
|
|
58
|
+
[project.urls]
|
|
59
|
+
Homepage = "https://github.com/carstenerickson/aadr-subset"
|
|
60
|
+
Issues = "https://github.com/carstenerickson/aadr-subset/issues"
|
|
61
|
+
Changelog = "https://github.com/carstenerickson/aadr-subset/blob/main/CHANGELOG.md"
|
|
62
|
+
|
|
63
|
+
[tool.setuptools.packages.find]
|
|
64
|
+
where = ["src"]
|
|
65
|
+
|
|
66
|
+
[tool.setuptools.package-data]
|
|
67
|
+
aadr_subset = ["py.typed", "schemas/*.json", "templates/*.yaml"]
|
|
68
|
+
|
|
69
|
+
[tool.ruff]
|
|
70
|
+
line-length = 100
|
|
71
|
+
target-version = "py311"
|
|
72
|
+
|
|
73
|
+
[tool.ruff.lint]
|
|
74
|
+
select = ["E", "F", "W", "I", "UP", "B", "RUF"]
|
|
75
|
+
# RUF003: ambiguous-unicode-in-comment. We intentionally use ∪ / × etc. for
|
|
76
|
+
# set-theory and matrix notation in test/code comments — these are clearer
|
|
77
|
+
# than the ASCII equivalents and don't affect runtime behavior.
|
|
78
|
+
ignore = ["RUF003"]
|
|
79
|
+
|
|
80
|
+
[tool.mypy]
|
|
81
|
+
python_version = "3.11"
|
|
82
|
+
strict = true
|
|
83
|
+
|
|
84
|
+
[tool.pytest.ini_options]
|
|
85
|
+
markers = [
|
|
86
|
+
"slow: tests that take more than a few seconds",
|
|
87
|
+
"integration: end-to-end tests requiring committed fixtures",
|
|
88
|
+
"external_tool: tests requiring plink2 or pgen-samplebind on PATH",
|
|
89
|
+
]
|
|
90
|
+
|
|
91
|
+
[tool.coverage.run]
|
|
92
|
+
source = ["src/aadr_subset"]
|
|
93
|
+
omit = [
|
|
94
|
+
# Orchestration shells (per LLD §1.5). Exercised by integration tests
|
|
95
|
+
# via subprocess, where coverage in this process isn't measured.
|
|
96
|
+
"src/aadr_subset/__main__.py",
|
|
97
|
+
"src/aadr_subset/cli.py",
|
|
98
|
+
"src/aadr_subset/commands/*",
|
|
99
|
+
]
|
|
100
|
+
|
|
101
|
+
[tool.coverage.report]
|
|
102
|
+
fail_under = 90
|
|
103
|
+
exclude_also = [
|
|
104
|
+
# Defensive branches; not exercised by happy paths.
|
|
105
|
+
"raise InvariantViolation",
|
|
106
|
+
"except Exception",
|
|
107
|
+
# Forward-compat fallbacks in _locate_node etc.
|
|
108
|
+
"except.*: # ?defensive",
|
|
109
|
+
]
|