aadr-subset 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. aadr_subset-0.1.0/LICENSE +21 -0
  2. aadr_subset-0.1.0/PKG-INFO +291 -0
  3. aadr_subset-0.1.0/README.md +249 -0
  4. aadr_subset-0.1.0/pyproject.toml +109 -0
  5. aadr_subset-0.1.0/setup.cfg +4 -0
  6. aadr_subset-0.1.0/src/aadr_subset/__init__.py +5 -0
  7. aadr_subset-0.1.0/src/aadr_subset/__main__.py +6 -0
  8. aadr_subset-0.1.0/src/aadr_subset/cli.py +359 -0
  9. aadr_subset-0.1.0/src/aadr_subset/commands/__init__.py +1 -0
  10. aadr_subset-0.1.0/src/aadr_subset/commands/inspect_cmd.py +129 -0
  11. aadr_subset-0.1.0/src/aadr_subset/commands/report_cmd.py +168 -0
  12. aadr_subset-0.1.0/src/aadr_subset/commands/select_cmd.py +366 -0
  13. aadr_subset-0.1.0/src/aadr_subset/commands/template_cmd.py +58 -0
  14. aadr_subset-0.1.0/src/aadr_subset/commands/validate_cmd.py +40 -0
  15. aadr_subset-0.1.0/src/aadr_subset/engine.py +568 -0
  16. aadr_subset-0.1.0/src/aadr_subset/errors.py +99 -0
  17. aadr_subset-0.1.0/src/aadr_subset/formats.py +301 -0
  18. aadr_subset-0.1.0/src/aadr_subset/py.typed +0 -0
  19. aadr_subset-0.1.0/src/aadr_subset/reporting.py +423 -0
  20. aadr_subset-0.1.0/src/aadr_subset/schemas/selector.schema.json +168 -0
  21. aadr_subset-0.1.0/src/aadr_subset/selector.py +819 -0
  22. aadr_subset-0.1.0/src/aadr_subset/templates/bronze_age_europe.yaml +52 -0
  23. aadr_subset-0.1.0/src/aadr_subset/templates/iron_age_britain.yaml +33 -0
  24. aadr_subset-0.1.0/src/aadr_subset/templates/modern_european.yaml +55 -0
  25. aadr_subset-0.1.0/src/aadr_subset/templates/neolithic_anatolia.yaml +44 -0
  26. aadr_subset-0.1.0/src/aadr_subset/templates/viking_period_scandinavian.yaml +54 -0
  27. aadr_subset-0.1.0/src/aadr_subset/templates/wsh_steppe_pool.yaml +57 -0
  28. aadr_subset-0.1.0/src/aadr_subset/templates.py +76 -0
  29. aadr_subset-0.1.0/src/aadr_subset/types.py +166 -0
  30. aadr_subset-0.1.0/src/aadr_subset.egg-info/PKG-INFO +291 -0
  31. aadr_subset-0.1.0/src/aadr_subset.egg-info/SOURCES.txt +33 -0
  32. aadr_subset-0.1.0/src/aadr_subset.egg-info/dependency_links.txt +1 -0
  33. aadr_subset-0.1.0/src/aadr_subset.egg-info/entry_points.txt +2 -0
  34. aadr_subset-0.1.0/src/aadr_subset.egg-info/requires.txt +18 -0
  35. aadr_subset-0.1.0/src/aadr_subset.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Carsten Erickson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,291 @@
1
+ Metadata-Version: 2.4
2
+ Name: aadr-subset
3
+ Version: 0.1.0
4
+ Summary: Declarative AADR panel subsetting from YAML selectors; the missing first-class tool for cohort definitions in ancient-DNA / population-genetics workflows.
5
+ Author-email: Carsten Erickson <carstene@mailbox.org>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/carstenerickson/aadr-subset
8
+ Project-URL: Issues, https://github.com/carstenerickson/aadr-subset/issues
9
+ Project-URL: Changelog, https://github.com/carstenerickson/aadr-subset/blob/main/CHANGELOG.md
10
+ Keywords: bioinformatics,genetics,aadr,ancient-dna,admixtools,population-genetics
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: POSIX :: Linux
15
+ Classifier: Operating System :: MacOS :: MacOS X
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
20
+ Classifier: Typing :: Typed
21
+ Requires-Python: >=3.11
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: aadr-resolve<0.3,>=0.2.0
25
+ Requires-Dist: pandas<3,>=2.2
26
+ Requires-Dist: pyyaml>=6.0
27
+ Requires-Dist: ruamel.yaml>=0.18
28
+ Requires-Dist: jsonschema>=4.20
29
+ Requires-Dist: click<9,>=8.1
30
+ Requires-Dist: rfc8785>=0.1
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=8.0; extra == "dev"
33
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
34
+ Requires-Dist: ruff>=0.6; extra == "dev"
35
+ Requires-Dist: mypy>=1.11; extra == "dev"
36
+ Requires-Dist: types-jsonschema>=4.0; extra == "dev"
37
+ Requires-Dist: types-PyYAML>=6.0; extra == "dev"
38
+ Requires-Dist: pandas-stubs>=2.2; extra == "dev"
39
+ Requires-Dist: build>=1.2; extra == "dev"
40
+ Requires-Dist: twine>=5.0; extra == "dev"
41
+ Dynamic: license-file
42
+
43
+ # aadr-subset
44
+
45
+ Declarative AADR panel subsetting from YAML selectors. Replaces ad-hoc
46
+ `awk` pipelines and one-off scripts with version-stable,
47
+ PR-reviewable cohort definitions. Built on top of
48
+ [aadr-resolve](https://github.com/carstenerickson/aadr-resolve) for
49
+ cross-AADR-version sample-ID mapping.
50
+
51
+ ```yaml
52
+ # britain_iron_age.yaml
53
+ populations: [England_IA, England_IA.AG, England_IA.SG]
54
+ date: {min_calbp: 1900, max_calbp: 2400}
55
+ min_coverage: 0.3
56
+ exclude:
57
+ individual_ids: [I12345] # known contaminated sample
58
+ ```
59
+
60
+ ```bash
61
+ $ aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno -o cohort.ids
62
+ Selector: britain_iron_age.yaml (sha256:1a2b3c4...d5e6f7g)
63
+ .anno: v66.HO.aadr.PUB.anno (v66.0, class E)
64
+
65
+ Matched 45 samples across 1 populations.
66
+
67
+ Per-population: England_IA=45
68
+
69
+ Wrote cohort.ids (45 lines)
70
+ Done in 0.18s (parse 0.16s, eval 0.02s, write 0.00s).
71
+
72
+ $ plink2 --pfile aadr_v66 --keep cohort.ids --make-pgen --out britain_iron_age
73
+ ```
74
+
75
+ ## Why it exists
76
+
77
+ Ancient-DNA workflows live and die on cohort definitions — *which samples
78
+ go into this analysis*. Today that's typically a hand-curated set of
79
+ Group_ID literals in someone's shell script, with familiar problems: silent breakage
80
+ when AADR releases a new version with renamed labels; no version pinning
81
+ in commit history; no way to share the exact cohort between collaborators
82
+ short of swapping `.ind` files.
83
+
84
+ `aadr-subset` makes the cohort itself a first-class artifact:
85
+
86
+ - **Selector YAMLs are version-stable.** They cite AADR releases via
87
+ `tested_against:` metadata; the `selector_signature` (RFC 8785 JCS
88
+ SHA-256 over the canonical form) gives you a hash that survives
89
+ YAML formatting churn.
90
+ - **Reviewable in PRs.** The grammar is flat (top-level AND with
91
+ one-level `any:` OR and one-level `exclude:` NOT). What you see is
92
+ what runs.
93
+ - **Cross-version via `aadr-resolve`.** `resolve_to_version:` lifts
94
+ Individual_IDs from an older release to the newer one through the
95
+ GID-stable bridge + MID-rename map.
96
+ - **Five subcommands** cover the full lifecycle: `validate`, `select`,
97
+ `inspect`, `report`, `template`.
98
+
99
+ ## Install
100
+
101
+ ```bash
102
+ pip install aadr-subset # once PyPI'd; currently:
103
+ pip install git+https://github.com/carstenerickson/aadr-subset.git
104
+ ```
105
+
106
+ Python 3.11+. The only dependency not yet on PyPI is `aadr-resolve` (also
107
+ installed via git URL until both ship to PyPI); the rest are regular PyPI packages.
108
+
109
+ For development:
110
+
111
+ ```bash
112
+ git clone https://github.com/carstenerickson/aadr-subset.git
113
+ cd aadr-subset
114
+ pip install -e ".[dev]"
115
+ pytest
116
+ ```
117
+
118
+ ## The five subcommands
119
+
120
+ ### `validate SELECTOR.yaml`
121
+
122
+ JSON-schema + semantic-constraint check on a selector. No `.anno`
123
+ required. Useful as a CI gate.
124
+
125
+ ```bash
126
+ $ aadr-subset validate britain_iron_age.yaml
127
+ # exit 0 on valid; exit 4 on schema or semantic violation
128
+ # Errors carry precise file:line:col + JSON pointer:
129
+ $ aadr-subset validate broken.yaml
130
+ broken.yaml:7:5: at /populations/2: 42 is not of type 'string'
131
+ broken.yaml:12:3: at /any/0/min_coverage: -0.5 is less than the minimum of 0
132
+ ```
133
+
134
+ ### `select SELECTOR.yaml ANNO.anno [-o PATH] [--format ids|tsv|json]`
135
+
136
+ The main case: materialize a selector against a target `.anno` and
137
+ write matched sample IDs / TSV / JSON.
138
+
139
+ ```bash
140
+ aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno -o cohort.ids
141
+ aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno --format tsv -o cohort.tsv
142
+ aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno --format json -o cohort.json
143
+ ```
144
+
145
+ Cross-version flow (selector defined against an older release than the
146
+ materialized one):
147
+
148
+ ```yaml
149
+ # britain_v62_lift.yaml
150
+ individual_ids: [I12345, I12346]
151
+ source_version: v62.0
152
+ resolve_to_version: v66.0
153
+ ```
154
+
155
+ ```bash
156
+ aadr-subset select britain_v62_lift.yaml v66.HO.aadr.PUB.anno \
157
+ --source-anno v62.0_HO_public.anno \
158
+ -o lifted.ids
159
+ ```
160
+
161
+ v62.0 inputs (class D — no native coverage column) need a derived proxy
162
+ for `min_coverage:` filters:
163
+
164
+ ```bash
165
+ aadr-subset select britain_iron_age.yaml v62.0_HO_public.anno \
166
+ --coverage-derive snps_hit_1240k -o cohort.ids
167
+ ```
168
+
169
+ ### `inspect SELECTOR.yaml ANNO.anno`
170
+
171
+ Dry-run: shows what a selector matches without writing any file.
172
+ Always exits 0 — meant for debugging selector logic.
173
+
174
+ ```
175
+ $ aadr-subset inspect britain_iron_age.yaml v66.HO.aadr.PUB.anno
176
+ Selector: britain_iron_age.yaml
177
+ .anno: v66.HO.aadr.PUB.anno (v66.0, class E, 27,755 samples)
178
+
179
+ Matched: 45 samples across 1 population
180
+
181
+ Per-population breakdown:
182
+ England_IA 45
183
+
184
+ Branch contributions:
185
+ top_level 45
186
+
187
+ Date range of matched: 1934 - 2398 calBP (median 2103)
188
+ Coverage range: 0.34 - 4.81x (median 1.28)
189
+
190
+ Selector signature: sha256:1a2b3c4d5e6f7g8h9i0j1k2l3m4n5o6p7q8r9s0t1u2v3w4x5y6z7a8b9c0d1e
191
+ ```
192
+
193
+ ### `report SELECTOR.yaml ANNO.anno [-o PATH] [--format tsv|json]`
194
+
195
+ Per-population aggregates: how many samples each Group_ID contributed,
196
+ with date range and coverage stats.
197
+
198
+ ```
199
+ $ aadr-subset report britain_iron_age.yaml v66.HO.aadr.PUB.anno
200
+ group_id n_matched n_in_anno pct_matched date_min_calbp date_max_calbp coverage_median
201
+ England_IA 45 51 88.2 1934 2398 1.28
202
+ ```
203
+
204
+ `--include-empty-groups` adds rows for `.anno` groups that matched
205
+ zero samples (useful for population-survey workflows).
206
+
207
+ ### `template [NAME] [-o PATH]`
208
+
209
+ Ships starter selectors for common cohorts. No-arg form lists
210
+ shipped templates; arg form emits the verbatim YAML (comments + metadata
211
+ block preserved) to stdout or `--out PATH`.
212
+
213
+ ```
214
+ $ aadr-subset template
215
+ bronze_age_europe
216
+ iron_age_britain
217
+ modern_european
218
+ neolithic_anatolia
219
+ viking_period_scandinavian
220
+ wsh_steppe_pool
221
+
222
+ $ aadr-subset template iron_age_britain -o britain.yaml
223
+ # britain.yaml now contains a working starting point — edit + extend.
224
+ ```
225
+
226
+ All shipped templates are verified against AADR **v62.0** and **v66.0** —
227
+ each template's `tested_against:` metadata lists the releases against
228
+ which it yields a non-zero number of matches.
229
+
230
+ ## Exit codes
231
+
232
+ | Code | Meaning |
233
+ |---|---|
234
+ | 0 | Success |
235
+ | 1 | Soft validation failure (e.g. zero-match without `--allow-empty`, `--strict-resolve` missing IIDs) |
236
+ | 2 | I/O failure (file not found, `.anno` schema unrecognized, etc.) |
237
+ | 4 | Usage error (schema violation, flag misuse, unknown template) |
238
+ | 70 | Internal error (please file an issue) |
239
+
240
+ ## Selector grammar (overview)
241
+
242
+ Flat — one level of nesting maximum. Top-level keys AND-combine.
243
+
244
+ ```yaml
245
+ # Top-level AND
246
+ populations: [Western_HG, Eastern_HG] # match against group_id
247
+ individual_ids: [Loschbour, KO1] # match against individual_id
248
+ individual_ids_source: ids.txt # newline-delimited file
249
+ modern_only: true # shorthand: date_calbp <= 70
250
+ min_coverage: 0.3
251
+ coverage_column: snps_hit_1240k # override; selector-side wins over --coverage-derive
252
+ date:
253
+ min_calbp: 1900
254
+ max_calbp: 2400
255
+ source_version: v62.0 # cross-version lift
256
+ resolve_to_version: v66.0
257
+
258
+ # One-level OR (matches any branch)
259
+ any:
260
+ - populations: [Western_HG]
261
+ min_coverage: 1.0
262
+ - populations: [Eastern_HG]
263
+ min_coverage: 0.5
264
+
265
+ # One-level NOT-of-OR (drops matches)
266
+ exclude:
267
+ group_ids: [English.SG]
268
+ individual_ids: [I12345]
269
+ ```
270
+
271
+ Full spec: [aadr-subset HLD](https://github.com/carstenerickson/aadr-subset/blob/main/docs/hld.md).
272
+
273
+ ## Composing with `plink2`
274
+
275
+ ```bash
276
+ # Materialize a cohort
277
+ aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno -o cohort.ids
278
+
279
+ # Use it as a plink2 keep set
280
+ plink2 --pfile aadr_v66 \
281
+ --keep cohort.ids \
282
+ --make-pgen --out britain_iron_age_subset
283
+ ```
284
+
285
+ `select --format json` produces a structured artifact suitable for
286
+ pipeline metadata logging (records the selector signature, AADR version,
287
+ schema class, and effective coverage column).
288
+
289
+ ## License
290
+
291
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,249 @@
1
+ # aadr-subset
2
+
3
+ Declarative AADR panel subsetting from YAML selectors. Replaces ad-hoc
4
+ `awk` pipelines and one-off scripts with version-stable,
5
+ PR-reviewable cohort definitions. Built on top of
6
+ [aadr-resolve](https://github.com/carstenerickson/aadr-resolve) for
7
+ cross-AADR-version sample-ID mapping.
8
+
9
+ ```yaml
10
+ # britain_iron_age.yaml
11
+ populations: [England_IA, England_IA.AG, England_IA.SG]
12
+ date: {min_calbp: 1900, max_calbp: 2400}
13
+ min_coverage: 0.3
14
+ exclude:
15
+ individual_ids: [I12345] # known contaminated sample
16
+ ```
17
+
18
+ ```bash
19
+ $ aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno -o cohort.ids
20
+ Selector: britain_iron_age.yaml (sha256:1a2b3c4...d5e6f7g)
21
+ .anno: v66.HO.aadr.PUB.anno (v66.0, class E)
22
+
23
+ Matched 45 samples across 1 populations.
24
+
25
+ Per-population: England_IA=45
26
+
27
+ Wrote cohort.ids (45 lines)
28
+ Done in 0.18s (parse 0.16s, eval 0.02s, write 0.00s).
29
+
30
+ $ plink2 --pfile aadr_v66 --keep cohort.ids --make-pgen --out britain_iron_age
31
+ ```
32
+
33
+ ## Why it exists
34
+
35
+ Ancient-DNA workflows live and die on cohort definitions — *which samples
36
+ go into this analysis*. Today that's typically a hand-curated set of
37
+ Group_ID literals in someone's shell script, with familiar problems: silent breakage
38
+ when AADR releases a new version with renamed labels; no version pinning
39
+ in commit history; no way to share the exact cohort between collaborators
40
+ short of swapping `.ind` files.
41
+
42
+ `aadr-subset` makes the cohort itself a first-class artifact:
43
+
44
+ - **Selector YAMLs are version-stable.** They cite AADR releases via
45
+ `tested_against:` metadata; the `selector_signature` (RFC 8785 JCS
46
+ SHA-256 over the canonical form) gives you a hash that survives
47
+ YAML formatting churn.
48
+ - **Reviewable in PRs.** The grammar is flat (top-level AND with
49
+ one-level `any:` OR and one-level `exclude:` NOT). What you see is
50
+ what runs.
51
+ - **Cross-version via `aadr-resolve`.** `resolve_to_version:` lifts
52
+ Individual_IDs from an older release to the newer one through the
53
+ GID-stable bridge + MID-rename map.
54
+ - **Five subcommands** cover the full lifecycle: `validate`, `select`,
55
+ `inspect`, `report`, `template`.
56
+
57
+ ## Install
58
+
59
+ ```bash
60
+ pip install aadr-subset # once PyPI'd; currently:
61
+ pip install git+https://github.com/carstenerickson/aadr-subset.git
62
+ ```
63
+
64
+ Python 3.11+. The only dependency not yet on PyPI is `aadr-resolve` (also
65
+ installed via git URL until both ship to PyPI); the rest are regular PyPI packages.
66
+
67
+ For development:
68
+
69
+ ```bash
70
+ git clone https://github.com/carstenerickson/aadr-subset.git
71
+ cd aadr-subset
72
+ pip install -e ".[dev]"
73
+ pytest
74
+ ```
75
+
76
+ ## The five subcommands
77
+
78
+ ### `validate SELECTOR.yaml`
79
+
80
+ JSON-schema + semantic-constraint check on a selector. No `.anno`
81
+ required. Useful as a CI gate.
82
+
83
+ ```bash
84
+ $ aadr-subset validate britain_iron_age.yaml
85
+ # exit 0 on valid; exit 4 on schema or semantic violation
86
+ # Errors carry precise file:line:col + JSON pointer:
87
+ $ aadr-subset validate broken.yaml
88
+ broken.yaml:7:5: at /populations/2: 42 is not of type 'string'
89
+ broken.yaml:12:3: at /any/0/min_coverage: -0.5 is less than the minimum of 0
90
+ ```
91
+
92
+ ### `select SELECTOR.yaml ANNO.anno [-o PATH] [--format ids|tsv|json]`
93
+
94
+ The main case: materialize a selector against a target `.anno` and
95
+ write matched sample IDs / TSV / JSON.
96
+
97
+ ```bash
98
+ aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno -o cohort.ids
99
+ aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno --format tsv -o cohort.tsv
100
+ aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno --format json -o cohort.json
101
+ ```
102
+
103
+ Cross-version flow (selector defined against an older release than the
104
+ materialized one):
105
+
106
+ ```yaml
107
+ # britain_v62_lift.yaml
108
+ individual_ids: [I12345, I12346]
109
+ source_version: v62.0
110
+ resolve_to_version: v66.0
111
+ ```
112
+
113
+ ```bash
114
+ aadr-subset select britain_v62_lift.yaml v66.HO.aadr.PUB.anno \
115
+ --source-anno v62.0_HO_public.anno \
116
+ -o lifted.ids
117
+ ```
118
+
119
+ v62.0 inputs (class D — no native coverage column) need a derived proxy
120
+ for `min_coverage:` filters:
121
+
122
+ ```bash
123
+ aadr-subset select britain_iron_age.yaml v62.0_HO_public.anno \
124
+ --coverage-derive snps_hit_1240k -o cohort.ids
125
+ ```
126
+
127
+ ### `inspect SELECTOR.yaml ANNO.anno`
128
+
129
+ Dry-run: shows what a selector matches without writing any file.
130
+ Always exits 0 — meant for debugging selector logic.
131
+
132
+ ```
133
+ $ aadr-subset inspect britain_iron_age.yaml v66.HO.aadr.PUB.anno
134
+ Selector: britain_iron_age.yaml
135
+ .anno: v66.HO.aadr.PUB.anno (v66.0, class E, 27,755 samples)
136
+
137
+ Matched: 45 samples across 1 population
138
+
139
+ Per-population breakdown:
140
+ England_IA 45
141
+
142
+ Branch contributions:
143
+ top_level 45
144
+
145
+ Date range of matched: 1934 - 2398 calBP (median 2103)
146
+ Coverage range: 0.34 - 4.81x (median 1.28)
147
+
148
+ Selector signature: sha256:1a2b3c4d5e6f7g8h9i0j1k2l3m4n5o6p7q8r9s0t1u2v3w4x5y6z7a8b9c0d1e
149
+ ```
150
+
151
+ ### `report SELECTOR.yaml ANNO.anno [-o PATH] [--format tsv|json]`
152
+
153
+ Per-population aggregates: how many samples each Group_ID contributed,
154
+ with date range and coverage stats.
155
+
156
+ ```
157
+ $ aadr-subset report britain_iron_age.yaml v66.HO.aadr.PUB.anno
158
+ group_id n_matched n_in_anno pct_matched date_min_calbp date_max_calbp coverage_median
159
+ England_IA 45 51 88.2 1934 2398 1.28
160
+ ```
161
+
162
+ `--include-empty-groups` adds rows for `.anno` groups that matched
163
+ zero samples (useful for population-survey workflows).
164
+
165
+ ### `template [NAME] [-o PATH]`
166
+
167
+ Ships starter selectors for common cohorts. No-arg form lists
168
+ shipped templates; arg form emits the verbatim YAML (comments + metadata
169
+ block preserved) to stdout or `--out PATH`.
170
+
171
+ ```
172
+ $ aadr-subset template
173
+ bronze_age_europe
174
+ iron_age_britain
175
+ modern_european
176
+ neolithic_anatolia
177
+ viking_period_scandinavian
178
+ wsh_steppe_pool
179
+
180
+ $ aadr-subset template iron_age_britain -o britain.yaml
181
+ # britain.yaml now contains a working starting point — edit + extend.
182
+ ```
183
+
184
+ All shipped templates are verified against AADR **v62.0** and **v66.0** —
185
+ each template's `tested_against:` metadata lists the releases against
186
+ which it yields a non-zero number of matches.
187
+
188
+ ## Exit codes
189
+
190
+ | Code | Meaning |
191
+ |---|---|
192
+ | 0 | Success |
193
+ | 1 | Soft validation failure (e.g. zero-match without `--allow-empty`, `--strict-resolve` missing IIDs) |
194
+ | 2 | I/O failure (file not found, `.anno` schema unrecognized, etc.) |
195
+ | 4 | Usage error (schema violation, flag misuse, unknown template) |
196
+ | 70 | Internal error (please file an issue) |
197
+
198
+ ## Selector grammar (overview)
199
+
200
+ Flat — one level of nesting maximum. Top-level keys AND-combine.
201
+
202
+ ```yaml
203
+ # Top-level AND
204
+ populations: [Western_HG, Eastern_HG] # match against group_id
205
+ individual_ids: [Loschbour, KO1] # match against individual_id
206
+ individual_ids_source: ids.txt # newline-delimited file
207
+ modern_only: true # shorthand: date_calbp <= 70
208
+ min_coverage: 0.3
209
+ coverage_column: snps_hit_1240k # override; selector-side wins over --coverage-derive
210
+ date:
211
+ min_calbp: 1900
212
+ max_calbp: 2400
213
+ source_version: v62.0 # cross-version lift
214
+ resolve_to_version: v66.0
215
+
216
+ # One-level OR (matches any branch)
217
+ any:
218
+ - populations: [Western_HG]
219
+ min_coverage: 1.0
220
+ - populations: [Eastern_HG]
221
+ min_coverage: 0.5
222
+
223
+ # One-level NOT-of-OR (drops matches)
224
+ exclude:
225
+ group_ids: [English.SG]
226
+ individual_ids: [I12345]
227
+ ```
228
+
229
+ Full spec: [aadr-subset HLD](https://github.com/carstenerickson/aadr-subset/blob/main/docs/hld.md).
230
+
231
+ ## Composing with `plink2`
232
+
233
+ ```bash
234
+ # Materialize a cohort
235
+ aadr-subset select britain_iron_age.yaml v66.HO.aadr.PUB.anno -o cohort.ids
236
+
237
+ # Use it as a plink2 keep set
238
+ plink2 --pfile aadr_v66 \
239
+ --keep cohort.ids \
240
+ --make-pgen --out britain_iron_age_subset
241
+ ```
242
+
243
+ `select --format json` produces a structured artifact suitable for
244
+ pipeline metadata logging (records the selector signature, AADR version,
245
+ schema class, and effective coverage column).
246
+
247
+ ## License
248
+
249
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,109 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "aadr-subset"
7
+ version = "0.1.0"
8
+ description = "Declarative AADR panel subsetting from YAML selectors; the missing first-class tool for cohort definitions in ancient-DNA / population-genetics workflows."
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ requires-python = ">=3.11"
12
+ authors = [
13
+ { name = "Carsten Erickson", email = "carstene@mailbox.org" },
14
+ ]
15
+ keywords = ["bioinformatics", "genetics", "aadr", "ancient-dna", "admixtools", "population-genetics"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Science/Research",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Operating System :: POSIX :: Linux",
21
+ "Operating System :: MacOS :: MacOS X",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
26
+ "Typing :: Typed",
27
+ ]
28
+ dependencies = [
29
+ # aadr-resolve is the hard library dependency (see the HLD section
30
+ # "aadr-resolve as a library dependency"). Tracking the 0.2.x line: 0.2.0
31
+ # ships the `MissingNativeFieldError`, `AnnoFrame.path`, and
32
+ # `resolve_master_ids` APIs that aadr-subset relies on.
33
+ "aadr-resolve>=0.2.0,<0.3",
34
+ "pandas>=2.2,<3",
35
+ "pyyaml>=6.0",
36
+ "ruamel.yaml>=0.18",
37
+ "jsonschema>=4.20",
38
+ "click>=8.1,<9",
39
+ "rfc8785>=0.1",
40
+ ]
41
+
42
+ [project.optional-dependencies]
43
+ dev = [
44
+ "pytest>=8.0",
45
+ "pytest-cov>=5.0",
46
+ "ruff>=0.6",
47
+ "mypy>=1.11",
48
+ "types-jsonschema>=4.0",
49
+ "types-PyYAML>=6.0",
50
+ "pandas-stubs>=2.2",
51
+ "build>=1.2",
52
+ "twine>=5.0",
53
+ ]
54
+
55
+ [project.scripts]
56
+ aadr-subset = "aadr_subset.cli:main"
57
+
58
+ [project.urls]
59
+ Homepage = "https://github.com/carstenerickson/aadr-subset"
60
+ Issues = "https://github.com/carstenerickson/aadr-subset/issues"
61
+ Changelog = "https://github.com/carstenerickson/aadr-subset/blob/main/CHANGELOG.md"
62
+
63
+ [tool.setuptools.packages.find]
64
+ where = ["src"]
65
+
66
+ [tool.setuptools.package-data]
67
+ aadr_subset = ["py.typed", "schemas/*.json", "templates/*.yaml"]
68
+
69
+ [tool.ruff]
70
+ line-length = 100
71
+ target-version = "py311"
72
+
73
+ [tool.ruff.lint]
74
+ select = ["E", "F", "W", "I", "UP", "B", "RUF"]
75
+ # RUF003: ambiguous-unicode-in-comment. We intentionally use ∪ / × etc. for
76
+ # set-theory and matrix notation in test/code comments — these are clearer
77
+ # than the ASCII equivalents and don't affect runtime behavior.
78
+ ignore = ["RUF003"]
79
+
80
+ [tool.mypy]
81
+ python_version = "3.11"
82
+ strict = true
83
+
84
+ [tool.pytest.ini_options]
85
+ markers = [
86
+ "slow: tests that take more than a few seconds",
87
+ "integration: end-to-end tests requiring committed fixtures",
88
+ "external_tool: tests requiring plink2 or pgen-samplebind on PATH",
89
+ ]
90
+
91
+ [tool.coverage.run]
92
+ source = ["src/aadr_subset"]
93
+ omit = [
94
+ # Orchestration shells (per LLD §1.5). Exercised by integration tests
95
+ # via subprocess, where coverage in this process isn't measured.
96
+ "src/aadr_subset/__main__.py",
97
+ "src/aadr_subset/cli.py",
98
+ "src/aadr_subset/commands/*",
99
+ ]
100
+
101
+ [tool.coverage.report]
102
+ fail_under = 90
103
+ exclude_also = [
104
+ # Defensive branches; not exercised by happy paths.
105
+ "raise InvariantViolation",
106
+ "except Exception",
107
+ # Forward-compat fallbacks in _locate_node etc.
108
+ "except.*: # ?defensive",
109
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,5 @@
1
+ """aadr-subset: declarative AADR panel subsetting from YAML selectors."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ __all__ = ["__version__"]
@@ -0,0 +1,6 @@
1
+ """Enable `python -m aadr_subset` invocation."""
2
+
3
+ from aadr_subset.cli import main
4
+
5
+ if __name__ == "__main__":
6
+ main()