aadr-resolve 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. aadr_resolve-0.1.0/LICENSE +21 -0
  2. aadr_resolve-0.1.0/PKG-INFO +364 -0
  3. aadr_resolve-0.1.0/README.md +326 -0
  4. aadr_resolve-0.1.0/pyproject.toml +93 -0
  5. aadr_resolve-0.1.0/setup.cfg +4 -0
  6. aadr_resolve-0.1.0/src/aadr_resolve/__init__.py +177 -0
  7. aadr_resolve-0.1.0/src/aadr_resolve/__main__.py +8 -0
  8. aadr_resolve-0.1.0/src/aadr_resolve/annoframe.py +189 -0
  9. aadr_resolve-0.1.0/src/aadr_resolve/bridge.py +334 -0
  10. aadr_resolve-0.1.0/src/aadr_resolve/cli.py +99 -0
  11. aadr_resolve-0.1.0/src/aadr_resolve/cohort.py +457 -0
  12. aadr_resolve-0.1.0/src/aadr_resolve/commands/__init__.py +1 -0
  13. aadr_resolve-0.1.0/src/aadr_resolve/commands/cohort_cmd.py +203 -0
  14. aadr_resolve-0.1.0/src/aadr_resolve/commands/diff_cmd.py +179 -0
  15. aadr_resolve-0.1.0/src/aadr_resolve/commands/join_cmd.py +96 -0
  16. aadr_resolve-0.1.0/src/aadr_resolve/commands/lookup_cmd.py +99 -0
  17. aadr_resolve-0.1.0/src/aadr_resolve/commands/schema_cmd.py +66 -0
  18. aadr_resolve-0.1.0/src/aadr_resolve/coverage_norm.py +106 -0
  19. aadr_resolve-0.1.0/src/aadr_resolve/date_norm.py +36 -0
  20. aadr_resolve-0.1.0/src/aadr_resolve/diff.py +194 -0
  21. aadr_resolve-0.1.0/src/aadr_resolve/errors.py +85 -0
  22. aadr_resolve-0.1.0/src/aadr_resolve/gates.py +224 -0
  23. aadr_resolve-0.1.0/src/aadr_resolve/group_classifier.py +109 -0
  24. aadr_resolve-0.1.0/src/aadr_resolve/join.py +80 -0
  25. aadr_resolve-0.1.0/src/aadr_resolve/library_token.py +340 -0
  26. aadr_resolve-0.1.0/src/aadr_resolve/loader.py +169 -0
  27. aadr_resolve-0.1.0/src/aadr_resolve/lookup.py +195 -0
  28. aadr_resolve-0.1.0/src/aadr_resolve/py.typed +0 -0
  29. aadr_resolve-0.1.0/src/aadr_resolve/reporting.py +101 -0
  30. aadr_resolve-0.1.0/src/aadr_resolve/schema.py +109 -0
  31. aadr_resolve-0.1.0/src/aadr_resolve/schemas/class_A.yaml +134 -0
  32. aadr_resolve-0.1.0/src/aadr_resolve/schemas/class_B.yaml +138 -0
  33. aadr_resolve-0.1.0/src/aadr_resolve/schemas/class_C.yaml +138 -0
  34. aadr_resolve-0.1.0/src/aadr_resolve/schemas/class_D.yaml +133 -0
  35. aadr_resolve-0.1.0/src/aadr_resolve/schemas/class_E.yaml +140 -0
  36. aadr_resolve-0.1.0/src/aadr_resolve/types.py +501 -0
  37. aadr_resolve-0.1.0/src/aadr_resolve/version_inference.py +55 -0
  38. aadr_resolve-0.1.0/src/aadr_resolve.egg-info/PKG-INFO +364 -0
  39. aadr_resolve-0.1.0/src/aadr_resolve.egg-info/SOURCES.txt +41 -0
  40. aadr_resolve-0.1.0/src/aadr_resolve.egg-info/dependency_links.txt +1 -0
  41. aadr_resolve-0.1.0/src/aadr_resolve.egg-info/entry_points.txt +2 -0
  42. aadr_resolve-0.1.0/src/aadr_resolve.egg-info/requires.txt +12 -0
  43. aadr_resolve-0.1.0/src/aadr_resolve.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Carsten Erickson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,364 @@
1
+ Metadata-Version: 2.4
2
+ Name: aadr-resolve
3
+ Version: 0.1.0
4
+ Summary: AADR cross-version GeneticID / MasterID join utility for ancient-DNA / population-genetics workflows.
5
+ Author-email: Carsten Erickson <carstene@mailbox.org>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/carstenerickson/aadr-resolve
8
+ Project-URL: Repository, https://github.com/carstenerickson/aadr-resolve
9
+ Project-URL: Issues, https://github.com/carstenerickson/aadr-resolve/issues
10
+ Project-URL: Changelog, https://github.com/carstenerickson/aadr-resolve/blob/main/CHANGELOG.md
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: MacOS
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Classifier: Programming Language :: Python :: 3 :: Only
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: <3.14,>=3.11
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: pandas<3,>=2.0
27
+ Requires-Dist: click<9,>=8.1
28
+ Requires-Dist: pyyaml<7,>=6.0
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=8; extra == "dev"
31
+ Requires-Dist: pytest-cov; extra == "dev"
32
+ Requires-Dist: mypy>=1.10; extra == "dev"
33
+ Requires-Dist: ruff>=0.5; extra == "dev"
34
+ Requires-Dist: build; extra == "dev"
35
+ Requires-Dist: twine; extra == "dev"
36
+ Requires-Dist: types-PyYAML; extra == "dev"
37
+ Dynamic: license-file
38
+
39
+ # aadr-resolve
40
+
41
+ AADR cross-version GeneticID / MasterID join utility for ancient-DNA and
42
+ population-genetics workflows.
43
+
44
+ `aadr-resolve` reads AADR (Allen Ancient DNA Resource) `.anno` files
45
+ across one or more releases and resolves the cross-version sample-ID
46
+ join through the Master ID column — the part every ancient-DNA pipeline
47
+ currently re-implements with custom awk. It handles AADR's progressive
48
+ de-anonymization (`I0001` in v44.3 → `Loschbour.AG` in v66) and the
49
+ periodic Master-ID renames (9-18 per consecutive version pair; ~62
50
+ cumulative v44.3 → v66.0) automatically.
51
+
52
+ The high-level design (HLD) pins behavior and the low-level design (LLD) pins implementation; both live in the
53
+ companion wiki:
54
+
55
+ - HLD: `cs-wiki/projects/aadr-resolve.md`
56
+ - LLD: `cs-wiki/projects/aadr-resolve-lld.md`
57
+ - Bench-verify report: `cs-wiki/projects/aadr-resolve-bench-verify.md`
58
+
59
+ ## Install
60
+
61
+ ```bash
62
+ pip install aadr-resolve
63
+ ```
64
+
65
+ Requires Python 3.11–3.13 (`Requires-Python: >=3.11,<3.14`). Dependencies: pandas 2.x, click 8.x, PyYAML 6.x.
66
+
67
+ ## Quickstart
68
+
69
+ **Resolve a single sample across two AADR releases.**
70
+
71
+ ```bash
72
+ aadr-resolve lookup I0001 \
73
+ --anno-files v44.3_1240K_public.anno \
74
+ --anno-files v66.0_1240K_public.anno
75
+ ```
76
+
77
+ Output (stdout):
78
+
79
+ ```
80
+ query: I0001
81
+ canonical individual_id: Loschbour (matched via individual_id)
82
+ v44.3 rows: 1
83
+ I0001 Luxembourg_Loschbour 537,182 SNPs
84
+ v66.0 rows: 2
85
+ Loschbour.AG Luxembourg_Mesolithic.AG 155,036 SNPs pgid=33
86
+ Loschbour.DG Luxembourg_Mesolithic.DG 620,881 SNPs pgid=39136
87
+ master_id_bridge: v44.3 I0001 → v66.0 Loschbour (via shared GID Loschbour.DG)
88
+ status: present_in_2_of_2_versions; multi_row; individual_id_renamed
89
+ ```
90
+
91
+ **Recreate a cohort against a newer release.**
92
+
93
+ ```bash
94
+ aadr-resolve cohort patterson_2022_whga.txt \
95
+ --anno-files v44.3_1240K_public.anno \
96
+ --anno-files v66.0_1240K_public.anno \
97
+ --cohort-version v44.3 \
98
+ -o whga_v66_manifest.tsv
99
+ ```
100
+
101
+ The manifest is a TSV with one row per (individual × library), with
102
+ per-version `genetic_id` / `group_id` / `snps_hit_1240k` columns,
103
+ ready to feed into downstream relabeling tools like `pgen-samplebind`.
104
+
105
+ **Structured diff between two releases.**
106
+
107
+ ```bash
108
+ aadr-resolve diff v62.0.anno v66.0.anno --tsv > v62_to_v66_changes.tsv
109
+ ```
110
+
111
+ Emits one row per change event: added, removed, genetic_id_renamed,
112
+ master_id_renamed, group_changed (with a per-class label —
113
+ `convention_restructure_suffix` etc.).
114
+
115
+ ## Subcommands
116
+
117
+ | Command | Purpose |
118
+ |----------|-------------------------------------------------------------|
119
+ | `lookup` | Resolve a single sample across N versions |
120
+ | `cohort` | Emit a cross-version manifest for a user-supplied cohort |
121
+ | `diff` | Structured diff between two versions |
122
+ | `join` | Wide-format pairwise table over the full v_old ∪ v_new set |
123
+ | `schema` | Diagnostic: report the detected schema class |
124
+
125
+ ### `aadr-resolve lookup`
126
+
127
+ ```
128
+ aadr-resolve lookup INDIVIDUAL_OR_GENETIC_ID \
129
+ --anno-files PATH [--anno-files PATH ...]
130
+ [--json]
131
+ ```
132
+
133
+ The query is treated as an `individual_id` (IID) by default; it falls back to `genetic_id` if no
133
+ IID matches. The Master-ID (MID) rename bridge is built automatically from the
135
+ supplied versions and reported under `master_id_bridge` in the output.
136
+
137
+ ### `aadr-resolve cohort`
138
+
139
+ ```
140
+ aadr-resolve cohort COHORT_FILE \
141
+ --anno-files PATH [--anno-files PATH ...]
142
+ [--cohort-version LABEL]
143
+ -o OUT.tsv [--json]
144
+ [--no-propagate]
145
+ [--collapse-to-individual]
146
+ [--gid-preference AG,DG,SG,HO,TW,BY,AA,EC,WGC,bare]
147
+ [--turnover-warn 0.05] [--turnover-fail 0.30]
148
+ [--cohort-coverage-warn 0.50] [--cohort-coverage-fail 0.25]
149
+ ```
150
+
151
+ `COHORT_FILE` is a TSV: one column for `individual_id`, optional second
152
+ column for `cohort_label`. `--cohort-version` is auto-detected from the
153
+ supplied annos when omitted. Default output is row-per-(individual ×
154
+ library); `--collapse-to-individual` reduces to one row per individual
155
+ via the `--gid-preference` suffix priority.
156
+
157
+ ### `aadr-resolve diff`
158
+
159
+ ```
160
+ aadr-resolve diff V_OLD.anno V_NEW.anno
161
+ [--json | --tsv]
162
+ [-o OUT]
163
+ [--include-class CLASS [--include-class CLASS ...]]
164
+ [--all-events]
165
+ [--turnover-warn 0.05] [--turnover-fail 0.30]
166
+ [--substantive-regroup-fail INT]
167
+ ```
168
+
169
+ JSON output is summary-first: per-class counts always included;
170
+ per-event arrays only for `substantive_regroup` (always) and any class
171
+ named via `--include-class`, or all classes when `--all-events` is set.
172
+ `--tsv` switches to streamed one-row-per-event format.
173
+
174
+ ### `aadr-resolve join`
175
+
176
+ ```
177
+ aadr-resolve join V_OLD.anno V_NEW.anno
178
+ -o OUT.tsv [--json]
179
+ [--collapse-to-individual]
180
+ [--gid-preference AG,DG,SG,HO,TW,BY,AA,EC,WGC,bare]
181
+ ```
182
+
183
+ Wide-format pairwise table over the full v_old ∪ v_new canonical
184
+ individual_id set. Same output schema as `cohort`; useful when you
185
+ don't have a pre-existing cohort list.
186
+
187
+ ### `aadr-resolve schema`
188
+
189
+ ```
190
+ aadr-resolve schema PATH [--json]
191
+ ```
192
+
193
+ Diagnostic: detects which schema class (A–E) the `.anno` belongs to,
194
+ reports the column layout. Useful for debugging "why does this `.anno`
195
+ not load."
196
+
197
+ ## Shared options
198
+
199
+ These apply to all subcommands:
200
+
201
+ | Option | Default | Notes |
202
+ |------------------------------|---------|----------------------------------------------------------------|
203
+ | `--schema-override CLASS` | auto | Force schema class A/B/C/D/E (e.g., renamed `.anno`) |
204
+ | `--version-label LABEL` | auto | Force version label (when filename pattern doesn't match) |
205
+ | `--mid-bridge FILE` | none | Manual master_id-rename TSV layered on auto-detected bridge |
206
+ | `--on-mid-collision {error,warn}` | error | Cross-lab MID collision policy |
207
+ | `--quiet` | false | Suppress the "Wrote N rows" progress line |
208
+
209
+ ## Library API
210
+
211
+ The same functionality is available in-process:
212
+
213
+ ```python
214
+ from aadr_resolve import (
215
+ AnnoFrame,
216
+ resolve_master_ids,
217
+ resolve_genetic_ids,
218
+ )
219
+
220
+ # Resolve v44.3 Master IDs to v66.0 GeneticIDs
221
+ result = resolve_master_ids(
222
+ ["I0001", "Bichon", "Mota"],
223
+ src_version="v44.3",
224
+ dst_version="v66.0",
225
+ anno_paths={
226
+ "v44.3": "v44.3_1240K_public.anno",
227
+ "v66.0": "v66.0_1240K_public.anno",
228
+ },
229
+ )
230
+ # result = {"I0001": "Loschbour.AG", "Bichon": "Bichon.SG", "Mota": None}
231
+ ```
232
+
233
+ `resolve_genetic_ids` is the GID → GID counterpart; it returns every matching destination GeneticID as a list:
234
+
235
+ ```python
236
+ result = resolve_genetic_ids(
237
+ ["I0001"],
238
+ src_version="v44.3",
239
+ dst_version="v66.0",
240
+ anno_paths={...},
241
+ )
242
+ # result = {"I0001": ["Loschbour.AG", "Loschbour.DG"]} # multi-row IID
243
+ ```
244
+
245
+ Direct `AnnoFrame` access for lower-level work:
246
+
247
+ ```python
248
+ from aadr_resolve import AnnoFrame
249
+
250
+ af = AnnoFrame.from_path("v66.0_1240K_public.anno", version_label="v66.0")
251
+ af.schema_class # SchemaClass.E
252
+ af.individual_id # pd.Series of canonical IIDs
253
+ af.genetic_id # pd.Series
254
+ af.persistent_genetic_id # pd.Series of Int64 nullable (E only; all-NA elsewhere)
255
+ af.date_calbp # pd.Series of Int64 nullable
256
+ af.coverage # pd.Series of Float64 nullable
257
+ af.path # original Path, useful for re-creating anno_paths dicts
258
+ ```
259
+
260
+ ### Exception hierarchy
261
+
262
+ All errors derive from `aadr_resolve.AadrResolveError`. Sibling tools
263
+ catching aadr-resolve errors can `except aadr_resolve.<Class>`:
264
+
265
+ | Class | Maps to exit | Trigger |
266
+ |-----------------------------|--------------|------------------------------------------------------|
267
+ | `ValidationError` | 1 | Turnover gate, coverage gate, substantive-regroup gate |
268
+ | `IOFailure` | 2 | File not found, lock held, malformed TSV |
269
+ | `InvariantViolation` | 3 | Schema YAML malformed (rare) |
270
+ | `SchemaDetectionError` | 3 | Header signature unknown |
271
+ | `MissingNativeFieldError` | 3 | Canonical field requested for a class that lacks it |
272
+ | `CollisionDetected` | 3 | Cross-lab MID collision under `error` policy |
273
+ | `UsageError` | 4 | Bad CLI args; cohort file has no matching version |
274
+
275
+ ## Exit codes
276
+
277
+ Exit codes are stable across versions, so CI workflows can branch on them:
278
+
279
+ - `0` — success
280
+ - `1` — soft-validation failure (any of the gates)
281
+ - `2` — I/O failure
282
+ - `3` — invariant violation (schema, MID collision)
283
+ - `4` — usage error (bad CLI args)
284
+
285
+ ## Troubleshooting
286
+
287
+ **"unknown .anno schema signature"** — your `.anno` header doesn't
288
+ match any of the 5 known classes. Either the file is from a newer AADR
289
+ release (file an issue with the bench-verify diff), or the file has
290
+ been edited. Workarounds:
291
+
292
+ - `--schema-override A|B|C|D|E` forces a class without signature check.
293
+ - `--version-label vN.N` forces a version label when the filename
294
+ doesn't match a known pattern.
295
+
296
+ **"cross-lab MID collision"** — the GID-stability check found a Master
297
+ ID that maps to two different individuals in different versions.
298
+ This indicates either a real data error in AADR or a cross-lab naming
299
+ collision (rare). Workarounds:
300
+
301
+ - `--on-mid-collision warn` continues with a stderr warning and marks
302
+ affected rows with `library_chain_ambiguous` status.
303
+ - `--mid-bridge FILE` lets you specify the correct mapping manually.
304
+
305
+ **"sample turnover gate (fail)"** — removal rate exceeded the
306
+ `--turnover-fail` threshold (default 30%). Indicates either a major
307
+ AADR cleanup (the v62→v66 bump removed ~17%) or that the wrong files
308
+ are being compared. Override with `--turnover-fail 1.0` to disable.
309
+
310
+ **"cohort coverage gate (fail)"** — fewer than 25% of cohort entries
311
+ resolved in the supplied versions. Usually means the cohort file uses
312
+ IDs from a version not in the supplied set. Check `--cohort-version`.
313
+
314
+ **Pandas ParserError on a v52 / v54 `.anno`** — these versions contain
315
+ embedded quote characters in some `full_date` cells. aadr-resolve reads
316
+ with `csv.QUOTE_NONE` to side-step pandas's default quote-handling;
317
+ upgrade aadr-resolve if you're on an older release of the tool.
318
+
319
+ ## Composition with the broader ecosystem
320
+
321
+ ```bash
322
+ aadr-resolve cohort patterson_2022.txt \
323
+ --anno-files v44.3.anno --anno-files v66.0.anno \
324
+ -o cohort_manifest.tsv
325
+ pgen-samplebind merge \
326
+ --relabel-from cohort_manifest.tsv \
327
+ --output merged_v66.pgen \
328
+ v44.3.pgen v66.0.pgen
329
+ ```
330
+
331
+ The manifest's column layout is documented in HLD §Output: cohort.
332
+
333
+ ## Development
334
+
335
+ ```bash
336
+ git clone https://github.com/carstenerickson/aadr-resolve
337
+ cd aadr-resolve
338
+ python -m venv .venv && source .venv/bin/activate
339
+ pip install -e ".[dev]"
340
+
341
+ # Default suite (fast; ~10s)
342
+ pytest -ra
343
+
344
+ # Slow tests (synth perf benchmark)
345
+ pytest -m slow -ra
346
+
347
+ # External tests (real AADR files; requires AADR_CACHE env var)
348
+ AADR_CACHE=/path/to/cache pytest -m external -ra
349
+
350
+ # Standalone perf benchmark with per-phase timings
351
+ AADR_CACHE=/path/to/cache python -m benchmarks.perf_bench
352
+
353
+ # Lint + format + types
354
+ ruff check src/ tests/
355
+ ruff format --check src/ tests/
356
+ mypy src/
357
+ ```
358
+
359
+ CI runs the default suite across Python 3.11/3.12/3.13 × Ubuntu+macOS;
360
+ see `.github/workflows/ci.yml`.
361
+
362
+ ## License
363
+
364
+ MIT.