aadr-resolve 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aadr_resolve-0.1.0/LICENSE +21 -0
- aadr_resolve-0.1.0/PKG-INFO +364 -0
- aadr_resolve-0.1.0/README.md +326 -0
- aadr_resolve-0.1.0/pyproject.toml +93 -0
- aadr_resolve-0.1.0/setup.cfg +4 -0
- aadr_resolve-0.1.0/src/aadr_resolve/__init__.py +177 -0
- aadr_resolve-0.1.0/src/aadr_resolve/__main__.py +8 -0
- aadr_resolve-0.1.0/src/aadr_resolve/annoframe.py +189 -0
- aadr_resolve-0.1.0/src/aadr_resolve/bridge.py +334 -0
- aadr_resolve-0.1.0/src/aadr_resolve/cli.py +99 -0
- aadr_resolve-0.1.0/src/aadr_resolve/cohort.py +457 -0
- aadr_resolve-0.1.0/src/aadr_resolve/commands/__init__.py +1 -0
- aadr_resolve-0.1.0/src/aadr_resolve/commands/cohort_cmd.py +203 -0
- aadr_resolve-0.1.0/src/aadr_resolve/commands/diff_cmd.py +179 -0
- aadr_resolve-0.1.0/src/aadr_resolve/commands/join_cmd.py +96 -0
- aadr_resolve-0.1.0/src/aadr_resolve/commands/lookup_cmd.py +99 -0
- aadr_resolve-0.1.0/src/aadr_resolve/commands/schema_cmd.py +66 -0
- aadr_resolve-0.1.0/src/aadr_resolve/coverage_norm.py +106 -0
- aadr_resolve-0.1.0/src/aadr_resolve/date_norm.py +36 -0
- aadr_resolve-0.1.0/src/aadr_resolve/diff.py +194 -0
- aadr_resolve-0.1.0/src/aadr_resolve/errors.py +85 -0
- aadr_resolve-0.1.0/src/aadr_resolve/gates.py +224 -0
- aadr_resolve-0.1.0/src/aadr_resolve/group_classifier.py +109 -0
- aadr_resolve-0.1.0/src/aadr_resolve/join.py +80 -0
- aadr_resolve-0.1.0/src/aadr_resolve/library_token.py +340 -0
- aadr_resolve-0.1.0/src/aadr_resolve/loader.py +169 -0
- aadr_resolve-0.1.0/src/aadr_resolve/lookup.py +195 -0
- aadr_resolve-0.1.0/src/aadr_resolve/py.typed +0 -0
- aadr_resolve-0.1.0/src/aadr_resolve/reporting.py +101 -0
- aadr_resolve-0.1.0/src/aadr_resolve/schema.py +109 -0
- aadr_resolve-0.1.0/src/aadr_resolve/schemas/class_A.yaml +134 -0
- aadr_resolve-0.1.0/src/aadr_resolve/schemas/class_B.yaml +138 -0
- aadr_resolve-0.1.0/src/aadr_resolve/schemas/class_C.yaml +138 -0
- aadr_resolve-0.1.0/src/aadr_resolve/schemas/class_D.yaml +133 -0
- aadr_resolve-0.1.0/src/aadr_resolve/schemas/class_E.yaml +140 -0
- aadr_resolve-0.1.0/src/aadr_resolve/types.py +501 -0
- aadr_resolve-0.1.0/src/aadr_resolve/version_inference.py +55 -0
- aadr_resolve-0.1.0/src/aadr_resolve.egg-info/PKG-INFO +364 -0
- aadr_resolve-0.1.0/src/aadr_resolve.egg-info/SOURCES.txt +41 -0
- aadr_resolve-0.1.0/src/aadr_resolve.egg-info/dependency_links.txt +1 -0
- aadr_resolve-0.1.0/src/aadr_resolve.egg-info/entry_points.txt +2 -0
- aadr_resolve-0.1.0/src/aadr_resolve.egg-info/requires.txt +12 -0
- aadr_resolve-0.1.0/src/aadr_resolve.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Carsten Erickson
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aadr-resolve
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: AADR cross-version GeneticID / MasterID join utility for ancient-DNA / population-genetics workflows.
|
|
5
|
+
Author-email: Carsten Erickson <carstene@mailbox.org>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/carstenerickson/aadr-resolve
|
|
8
|
+
Project-URL: Repository, https://github.com/carstenerickson/aadr-resolve
|
|
9
|
+
Project-URL: Issues, https://github.com/carstenerickson/aadr-resolve/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/carstenerickson/aadr-resolve/blob/main/CHANGELOG.md
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: MacOS
|
|
16
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
17
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: <3.14,>=3.11
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: pandas<3,>=2.0
|
|
27
|
+
Requires-Dist: click<9,>=8.1
|
|
28
|
+
Requires-Dist: pyyaml<7,>=6.0
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
32
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
33
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
34
|
+
Requires-Dist: build; extra == "dev"
|
|
35
|
+
Requires-Dist: twine; extra == "dev"
|
|
36
|
+
Requires-Dist: types-PyYAML; extra == "dev"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# aadr-resolve
|
|
40
|
+
|
|
41
|
+
AADR cross-version GeneticID / MasterID join utility for ancient-DNA and
|
|
42
|
+
population-genetics workflows.
|
|
43
|
+
|
|
44
|
+
`aadr-resolve` reads AADR (Allen Ancient DNA Resource) `.anno` files
|
|
45
|
+
across one or more releases and resolves the cross-version sample-ID
|
|
46
|
+
join through the Master ID column — the part every ancient-DNA pipeline
|
|
47
|
+
currently re-implements with custom awk. It handles AADR's progressive
|
|
48
|
+
de-anonymization (`I0001` in v44.3 → `Loschbour.AG` in v66) and the
|
|
49
|
+
periodic Master-ID renames (9-18 per consecutive version pair; ~62
|
|
50
|
+
cumulative v44.3 → v66.0) automatically.
|
|
51
|
+
|
|
52
|
+
The high-level design (HLD) pins behavior and the low-level design (LLD) pins implementation; both live in the
|
|
53
|
+
companion wiki:
|
|
54
|
+
|
|
55
|
+
- HLD: `cs-wiki/projects/aadr-resolve.md`
|
|
56
|
+
- LLD: `cs-wiki/projects/aadr-resolve-lld.md`
|
|
57
|
+
- Bench-verify report: `cs-wiki/projects/aadr-resolve-bench-verify.md`
|
|
58
|
+
|
|
59
|
+
## Install
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install aadr-resolve
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Requires Python 3.11–3.13 (`>=3.11,<3.14`). Dependencies: pandas 2.x, click 8.x, PyYAML 6.x.
|
|
66
|
+
|
|
67
|
+
## Quickstart
|
|
68
|
+
|
|
69
|
+
**Resolve a single sample across two AADR releases.**
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
aadr-resolve lookup I0001 \
|
|
73
|
+
--anno-files v44.3_1240K_public.anno \
|
|
74
|
+
--anno-files v66.0_1240K_public.anno
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Output (stdout):
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
query: I0001
|
|
81
|
+
canonical individual_id: Loschbour (matched via individual_id)
|
|
82
|
+
v44.3 rows: 1
|
|
83
|
+
I0001 Luxembourg_Loschbour 537,182 SNPs
|
|
84
|
+
v66.0 rows: 2
|
|
85
|
+
Loschbour.AG Luxembourg_Mesolithic.AG 155,036 SNPs pgid=33
|
|
86
|
+
Loschbour.DG Luxembourg_Mesolithic.DG 620,881 SNPs pgid=39136
|
|
87
|
+
master_id_bridge: v44.3 I0001 → v66.0 Loschbour (via shared GID Loschbour.DG)
|
|
88
|
+
status: present_in_2_of_2_versions; multi_row; individual_id_renamed
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
**Recreate a cohort against a newer release.**
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
aadr-resolve cohort patterson_2022_whga.txt \
|
|
95
|
+
--anno-files v44.3_1240K_public.anno \
|
|
96
|
+
--anno-files v66.0_1240K_public.anno \
|
|
97
|
+
--cohort-version v44.3 \
|
|
98
|
+
-o whga_v66_manifest.tsv
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
The manifest is a TSV with one row per (individual × library), with
|
|
102
|
+
per-version `genetic_id` / `group_id` / `snps_hit_1240k` columns,
|
|
103
|
+
ready to feed into downstream relabeling tools like `pgen-samplebind`.
|
|
104
|
+
|
|
105
|
+
**Structured diff between two releases.**
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
aadr-resolve diff v62.0.anno v66.0.anno --tsv > v62_to_v66_changes.tsv
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Emits one row per change event: added, removed, genetic_id_renamed,
|
|
112
|
+
master_id_renamed, group_changed (with a per-class label —
|
|
113
|
+
`convention_restructure_suffix` etc.).
|
|
114
|
+
|
|
115
|
+
## Subcommands
|
|
116
|
+
|
|
117
|
+
| Command | Purpose |
|
|
118
|
+
|----------|-------------------------------------------------------------|
|
|
119
|
+
| `lookup` | Resolve a single sample across N versions |
|
|
120
|
+
| `cohort` | Emit a cross-version manifest for a user-supplied cohort |
|
|
121
|
+
| `diff` | Structured diff between two versions |
|
|
122
|
+
| `join` | Wide-format pairwise table over the full intersection |
|
|
123
|
+
| `schema` | Diagnostic: report the detected schema class |
|
|
124
|
+
|
|
125
|
+
### `aadr-resolve lookup`
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
aadr-resolve lookup INDIVIDUAL_OR_GENETIC_ID \
|
|
129
|
+
--anno-files PATH [--anno-files PATH ...]
|
|
130
|
+
[--json]
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
The argument is treated as an `individual_id` by default; it falls back to `genetic_id` if no
|
|
134
|
+
`individual_id` (IID) matches. The Master-ID (MID) rename bridge is built automatically from the
|
|
135
|
+
supplied versions and reported under `master_id_bridge` in the output.
|
|
136
|
+
|
|
137
|
+
### `aadr-resolve cohort`
|
|
138
|
+
|
|
139
|
+
```
|
|
140
|
+
aadr-resolve cohort COHORT_FILE \
|
|
141
|
+
--anno-files PATH [--anno-files PATH ...]
|
|
142
|
+
[--cohort-version LABEL]
|
|
143
|
+
-o OUT.tsv [--json]
|
|
144
|
+
[--no-propagate]
|
|
145
|
+
[--collapse-to-individual]
|
|
146
|
+
[--gid-preference AG,DG,SG,HO,TW,BY,AA,EC,WGC,bare]
|
|
147
|
+
[--turnover-warn 0.05] [--turnover-fail 0.30]
|
|
148
|
+
[--cohort-coverage-warn 0.50] [--cohort-coverage-fail 0.25]
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
`COHORT_FILE` is a TSV: one column for `individual_id`, optional second
|
|
152
|
+
column for `cohort_label`. `--cohort-version` is auto-detected from the
|
|
153
|
+
supplied annos when omitted. Default output is row-per-(individual ×
|
|
154
|
+
library); `--collapse-to-individual` reduces to one row per individual
|
|
155
|
+
via the `--gid-preference` suffix priority.
|
|
156
|
+
|
|
157
|
+
### `aadr-resolve diff`
|
|
158
|
+
|
|
159
|
+
```
|
|
160
|
+
aadr-resolve diff V_OLD.anno V_NEW.anno
|
|
161
|
+
[--json | --tsv]
|
|
162
|
+
[-o OUT]
|
|
163
|
+
[--include-class CLASS [--include-class CLASS ...]]
|
|
164
|
+
[--all-events]
|
|
165
|
+
[--turnover-warn 0.05] [--turnover-fail 0.30]
|
|
166
|
+
[--substantive-regroup-fail INT]
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
JSON output is summary-first: per-class counts always included;
|
|
170
|
+
per-event arrays only for `substantive_regroup` (always) and any class
|
|
171
|
+
named via `--include-class`, or all classes when `--all-events` is set.
|
|
172
|
+
`--tsv` switches to streamed one-row-per-event format.
|
|
173
|
+
|
|
174
|
+
### `aadr-resolve join`
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
aadr-resolve join V_OLD.anno V_NEW.anno
|
|
178
|
+
-o OUT.tsv [--json]
|
|
179
|
+
[--collapse-to-individual]
|
|
180
|
+
[--gid-preference AG,DG,SG,HO,TW,BY,AA,EC,WGC,bare]
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Wide-format pairwise table over the full v_old ∪ v_new canonical
|
|
184
|
+
individual_id set. Same output schema as `cohort`; useful when you
|
|
185
|
+
don't have a pre-existing cohort list.
|
|
186
|
+
|
|
187
|
+
### `aadr-resolve schema`
|
|
188
|
+
|
|
189
|
+
```
|
|
190
|
+
aadr-resolve schema PATH [--json]
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
Diagnostic: detects which schema class (A–E) the `.anno` belongs to,
|
|
194
|
+
and reports the column layout. Useful for debugging "why does this `.anno`
|
|
195
|
+
not load?"
|
|
196
|
+
|
|
197
|
+
## Shared options
|
|
198
|
+
|
|
199
|
+
These apply to all subcommands:
|
|
200
|
+
|
|
201
|
+
| Option | Default | Notes |
|
|
202
|
+
|------------------------------|---------|----------------------------------------------------------------|
|
|
203
|
+
| `--schema-override CLASS` | auto | Force schema class A/B/C/D/E (e.g., renamed `.anno`) |
|
|
204
|
+
| `--version-label LABEL` | auto | Force version label (when filename pattern doesn't match) |
|
|
205
|
+
| `--mid-bridge FILE` | none | Manual master_id-rename TSV layered on auto-detected bridge |
|
|
206
|
+
| `--on-mid-collision {error,warn}` | error | Cross-lab MID collision policy |
|
|
207
|
+
| `--quiet` | false | Suppress the "Wrote N rows" progress line |
|
|
208
|
+
|
|
209
|
+
## Library API
|
|
210
|
+
|
|
211
|
+
The same functionality is available in-process:
|
|
212
|
+
|
|
213
|
+
```python
|
|
214
|
+
from aadr_resolve import (
|
|
215
|
+
AnnoFrame,
|
|
216
|
+
resolve_master_ids,
|
|
217
|
+
resolve_genetic_ids,
|
|
218
|
+
)
|
|
219
|
+
|
|
220
|
+
# Resolve v44.3 Master IDs to v66.0 GeneticIDs
|
|
221
|
+
result = resolve_master_ids(
|
|
222
|
+
["I0001", "Bichon", "Mota"],
|
|
223
|
+
src_version="v44.3",
|
|
224
|
+
dst_version="v66.0",
|
|
225
|
+
anno_paths={
|
|
226
|
+
"v44.3": "v44.3_1240K_public.anno",
|
|
227
|
+
"v66.0": "v66.0_1240K_public.anno",
|
|
228
|
+
},
|
|
229
|
+
)
|
|
230
|
+
# result = {"I0001": "Loschbour.AG", "Bichon": "Bichon.SG", "Mota": None}
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
`resolve_genetic_ids` is the GID → GID counterpart:
|
|
234
|
+
|
|
235
|
+
```python
|
|
236
|
+
result = resolve_genetic_ids(
|
|
237
|
+
["I0001"],
|
|
238
|
+
src_version="v44.3",
|
|
239
|
+
dst_version="v66.0",
|
|
240
|
+
anno_paths={...},
|
|
241
|
+
)
|
|
242
|
+
# result = {"I0001": ["Loschbour.AG", "Loschbour.DG"]} # multi-row IID
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
Direct `AnnoFrame` access for lower-level work:
|
|
246
|
+
|
|
247
|
+
```python
|
|
248
|
+
from aadr_resolve import AnnoFrame
|
|
249
|
+
|
|
250
|
+
af = AnnoFrame.from_path("v66.0_1240K_public.anno", version_label="v66.0")
|
|
251
|
+
af.schema_class # SchemaClass.E
|
|
252
|
+
af.individual_id # pd.Series of canonical IIDs
|
|
253
|
+
af.genetic_id # pd.Series
|
|
254
|
+
af.persistent_genetic_id # pd.Series of Int64 nullable (E only; all-NaN elsewhere)
|
|
255
|
+
af.date_calbp # pd.Series of Int64 nullable
|
|
256
|
+
af.coverage # pd.Series of Float64 nullable
|
|
257
|
+
af.path # original Path, useful for re-creating anno_paths dicts
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
### Exception hierarchy
|
|
261
|
+
|
|
262
|
+
All errors derive from `aadr_resolve.AadrResolveError`. Sibling tools
|
|
263
|
+
catching aadr-resolve errors can `except aadr_resolve.<Class>`:
|
|
264
|
+
|
|
265
|
+
| Class | Maps to exit | Trigger |
|
|
266
|
+
|-----------------------------|--------------|------------------------------------------------------|
|
|
267
|
+
| `ValidationError` | 1 | Turnover gate, coverage gate, substantive-regroup gate |
|
|
268
|
+
| `IOFailure` | 2 | File not found, lock held, malformed TSV |
|
|
269
|
+
| `InvariantViolation` | 3 | Schema YAML malformed (rare) |
|
|
270
|
+
| `SchemaDetectionError` | 3 | Header signature unknown |
|
|
271
|
+
| `MissingNativeFieldError` | 3 | Canonical field requested for a class that lacks it |
|
|
272
|
+
| `CollisionDetected` | 3 | Cross-lab MID collision under `error` policy |
|
|
273
|
+
| `UsageError` | 4 | Bad CLI args; cohort file has no matching version |
|
|
274
|
+
|
|
275
|
+
## Exit codes
|
|
276
|
+
|
|
277
|
+
Exit codes are stable across versions, so CI workflows can branch on them:
|
|
278
|
+
|
|
279
|
+
- `0` — success
|
|
280
|
+
- `1` — soft-validation failure (any of the gates)
|
|
281
|
+
- `2` — I/O failure
|
|
282
|
+
- `3` — invariant violation (schema, MID collision)
|
|
283
|
+
- `4` — usage error (bad CLI args)
|
|
284
|
+
|
|
285
|
+
## Troubleshooting
|
|
286
|
+
|
|
287
|
+
**"unknown .anno schema signature"** — your `.anno` header doesn't
|
|
288
|
+
match any of the 5 known classes. Either the file is from a newer AADR
|
|
289
|
+
release (file an issue with the bench-verify diff), or the file has
|
|
290
|
+
been edited. Workarounds:
|
|
291
|
+
|
|
292
|
+
- `--schema-override A|B|C|D|E` forces a class without signature check.
|
|
293
|
+
- `--version-label vN.N` forces a version label when the filename
|
|
294
|
+
doesn't match a known pattern.
|
|
295
|
+
|
|
296
|
+
**"cross-lab MID collision"** — the GID-stability check found a Master
|
|
297
|
+
ID that maps to two different individuals in different versions.
|
|
298
|
+
This indicates either a real data error in AADR or a cross-lab naming
|
|
299
|
+
collision (rare). Workarounds:
|
|
300
|
+
|
|
301
|
+
- `--on-mid-collision warn` continues with a stderr warning and marks
|
|
302
|
+
affected rows with `library_chain_ambiguous` status.
|
|
303
|
+
- `--mid-bridge FILE` lets you specify the correct mapping manually.
|
|
304
|
+
|
|
305
|
+
**"sample turnover gate (fail)"** — removal rate exceeded the
|
|
306
|
+
`--turnover-fail` threshold (default 30%). Indicates either a major
|
|
307
|
+
AADR cleanup (the v62→v66 bump removed ~17%) or that the wrong files
|
|
308
|
+
are being compared. Override with `--turnover-fail 1.0` to disable.
|
|
309
|
+
|
|
310
|
+
**"cohort coverage gate (fail)"** — fewer than 25% of cohort entries
|
|
311
|
+
resolved in the supplied versions. Usually means the cohort file uses
|
|
312
|
+
IDs from a version not in the supplied set. Check `--cohort-version`.
|
|
313
|
+
|
|
314
|
+
**Pandas ParserError on a v52 / v54 `.anno`** — these versions contain
|
|
315
|
+
embedded quote characters in some `full_date` cells. aadr-resolve reads
|
|
316
|
+
with `csv.QUOTE_NONE` to side-step pandas's default quote-handling;
|
|
317
|
+
upgrade aadr-resolve if you hit this error on an older release.
|
|
318
|
+
|
|
319
|
+
## Composition with the broader ecosystem
|
|
320
|
+
|
|
321
|
+
```bash
|
|
322
|
+
aadr-resolve cohort patterson_2022.txt \
|
|
323
|
+
--anno-files v44.3.anno --anno-files v66.0.anno \
|
|
324
|
+
-o cohort_manifest.tsv
|
|
325
|
+
pgen-samplebind merge \
|
|
326
|
+
--relabel-from cohort_manifest.tsv \
|
|
327
|
+
--output merged_v66.pgen \
|
|
328
|
+
v44.3.pgen v66.0.pgen
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
The manifest's column layout is documented in HLD §Output: cohort.
|
|
332
|
+
|
|
333
|
+
## Development
|
|
334
|
+
|
|
335
|
+
```bash
|
|
336
|
+
git clone https://github.com/carstenerickson/aadr-resolve
|
|
337
|
+
cd aadr-resolve
|
|
338
|
+
python -m venv .venv && source .venv/bin/activate
|
|
339
|
+
pip install -e ".[dev]"
|
|
340
|
+
|
|
341
|
+
# Default suite (fast; ~10s)
|
|
342
|
+
pytest -ra
|
|
343
|
+
|
|
344
|
+
# Slow tests (synth perf benchmark)
|
|
345
|
+
pytest -m slow -ra
|
|
346
|
+
|
|
347
|
+
# External tests (real AADR files; requires AADR_CACHE env var)
|
|
348
|
+
AADR_CACHE=/path/to/cache pytest -m external -ra
|
|
349
|
+
|
|
350
|
+
# Standalone perf benchmark with per-phase timings
|
|
351
|
+
AADR_CACHE=/path/to/cache python -m benchmarks.perf_bench
|
|
352
|
+
|
|
353
|
+
# Lint + format + types
|
|
354
|
+
ruff check src/ tests/
|
|
355
|
+
ruff format --check src/ tests/
|
|
356
|
+
mypy src/
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
CI runs the default suite across Python 3.11/3.12/3.13 × Ubuntu+macOS;
|
|
360
|
+
see `.github/workflows/ci.yml`.
|
|
361
|
+
|
|
362
|
+
## License
|
|
363
|
+
|
|
364
|
+
MIT.
|