aadr-resolve 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aadr_resolve/__init__.py +177 -0
- aadr_resolve/__main__.py +8 -0
- aadr_resolve/annoframe.py +189 -0
- aadr_resolve/bridge.py +334 -0
- aadr_resolve/cli.py +99 -0
- aadr_resolve/cohort.py +457 -0
- aadr_resolve/commands/__init__.py +1 -0
- aadr_resolve/commands/cohort_cmd.py +203 -0
- aadr_resolve/commands/diff_cmd.py +179 -0
- aadr_resolve/commands/join_cmd.py +96 -0
- aadr_resolve/commands/lookup_cmd.py +99 -0
- aadr_resolve/commands/schema_cmd.py +66 -0
- aadr_resolve/coverage_norm.py +106 -0
- aadr_resolve/date_norm.py +36 -0
- aadr_resolve/diff.py +194 -0
- aadr_resolve/errors.py +85 -0
- aadr_resolve/gates.py +224 -0
- aadr_resolve/group_classifier.py +109 -0
- aadr_resolve/join.py +80 -0
- aadr_resolve/library_token.py +340 -0
- aadr_resolve/loader.py +169 -0
- aadr_resolve/lookup.py +195 -0
- aadr_resolve/py.typed +0 -0
- aadr_resolve/reporting.py +101 -0
- aadr_resolve/schema.py +109 -0
- aadr_resolve/schemas/class_A.yaml +134 -0
- aadr_resolve/schemas/class_B.yaml +138 -0
- aadr_resolve/schemas/class_C.yaml +138 -0
- aadr_resolve/schemas/class_D.yaml +133 -0
- aadr_resolve/schemas/class_E.yaml +140 -0
- aadr_resolve/types.py +501 -0
- aadr_resolve/version_inference.py +55 -0
- aadr_resolve-0.1.0.dist-info/METADATA +364 -0
- aadr_resolve-0.1.0.dist-info/RECORD +38 -0
- aadr_resolve-0.1.0.dist-info/WHEEL +5 -0
- aadr_resolve-0.1.0.dist-info/entry_points.txt +2 -0
- aadr_resolve-0.1.0.dist-info/licenses/LICENSE +21 -0
- aadr_resolve-0.1.0.dist-info/top_level.txt +1 -0
aadr_resolve/__init__.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""aadr-resolve: AADR cross-version GeneticID / MasterID join utility."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from .types import MIDBridge
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
|
12
|
+
|
|
13
|
+
from .annoframe import AnnoFrame
|
|
14
|
+
from .bridge import detect_bridge
|
|
15
|
+
from .errors import (
|
|
16
|
+
AadrResolveError,
|
|
17
|
+
CollisionDetected,
|
|
18
|
+
InvariantViolation,
|
|
19
|
+
IOFailure,
|
|
20
|
+
MissingNativeFieldError,
|
|
21
|
+
SchemaDetectionError,
|
|
22
|
+
UsageError,
|
|
23
|
+
ValidationError,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
__all__ = [ # noqa: RUF022 — grouped intentionally (data type, API, errors, version)
|
|
27
|
+
# Core data type
|
|
28
|
+
"AnnoFrame",
|
|
29
|
+
# Library API top-level functions
|
|
30
|
+
"resolve_genetic_ids",
|
|
31
|
+
"resolve_master_ids",
|
|
32
|
+
# Exception hierarchy (aadr-subset etc. catch + re-raise these)
|
|
33
|
+
"AadrResolveError",
|
|
34
|
+
"CollisionDetected",
|
|
35
|
+
"InvariantViolation",
|
|
36
|
+
"IOFailure",
|
|
37
|
+
"MissingNativeFieldError",
|
|
38
|
+
"SchemaDetectionError",
|
|
39
|
+
"UsageError",
|
|
40
|
+
"ValidationError",
|
|
41
|
+
# Version
|
|
42
|
+
"__version__",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def resolve_master_ids(
    ids: list[str],
    src_version: str,
    dst_version: str,
    *,
    anno_paths: dict[str, Path | str] | None = None,
    anno_frames: dict[str, AnnoFrame] | None = None,
    mid_bridge: Path | str | None = None,
) -> dict[str, str | None]:
    """Resolve a list of individual_ids from src_version to dst_version
    GeneticIDs through the MID-rename bridge.

    Per HLD §Library API surface. For each input id, returns its
    representative GeneticID in dst_version (the alphabetically-first
    among the matching rows for that individual). None if the individual
    is absent from dst_version.

    Either `anno_paths` (dict version_label -> Path) OR `anno_frames`
    (dict version_label -> AnnoFrame) must be supplied.

    `mid_bridge` (Path or str to a TSV file) supplies manual MID-rename
    entries that layer on top of the GID-stable auto-detection — the
    same semantics as the `--mid-bridge` CLI flag. Format: header
    `v_old_label\\tmid_old\\tv_new_label\\tmid_new`.

    Rows of dst_version whose individual_id or GeneticID is missing or
    empty are ignored, matching `resolve_genetic_ids`; a missing
    GeneticID is therefore never returned as a stringified placeholder.

    Returns a dict keyed by the input ids, in first-seen order; duplicate
    inputs collapse into a single key."""
    afs = _resolve_anno_frames(anno_paths, anno_frames, src_version, dst_version)
    bridge = _build_bridge(list(afs.values()), mid_bridge)
    af_dst = afs[dst_version]
    iids_dst = af_dst.individual_id.tolist()
    gids_dst = af_dst.genetic_id.tolist()

    # Index dst_version once (canonical individual -> alphabetically-first
    # GeneticID) so each query is a single lookup instead of a full rescan
    # of the destination rows.
    first_gid_by_canonical: dict[str, str] = {}
    for iid, gid in zip(iids_dst, gids_dst, strict=True):
        if not isinstance(iid, str) or not iid:
            continue
        if not isinstance(gid, str) or not gid:
            # Missing GeneticID cell: nothing meaningful to return for it.
            continue
        canonical = bridge.canonical_id(dst_version, iid)
        current = first_gid_by_canonical.get(canonical)
        if current is None or gid < current:
            first_gid_by_canonical[canonical] = gid

    out: dict[str, str | None] = {}
    for query in ids:
        canonical = bridge.canonical_id(src_version, query)
        out[query] = first_gid_by_canonical.get(canonical)
    return out
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def resolve_genetic_ids(
    ids: list[str],
    src_version: str,
    dst_version: str,
    *,
    anno_paths: dict[str, Path | str] | None = None,
    anno_frames: dict[str, AnnoFrame] | None = None,
    mid_bridge: Path | str | None = None,
) -> dict[str, list[str]]:
    """Map each src_version GeneticID to every dst_version GeneticID that
    belongs to the same individual.

    The multi-row-per-individual shape is kept: an input GeneticID maps to
    a sorted list, which holds several entries when the individual has
    more than one row in dst_version, and is empty when the input
    GeneticID is not found in src_version or no dst_version row shares
    the individual.

    Either `anno_paths` or `anno_frames` must be supplied (see
    `_resolve_anno_frames`).

    `mid_bridge` (Path or str): optional manual MID-rename override TSV;
    same semantics as the `--mid-bridge` CLI flag."""
    frames = _resolve_anno_frames(anno_paths, anno_frames, src_version, dst_version)
    bridge = _build_bridge(list(frames.values()), mid_bridge)
    src_frame = frames[src_version]
    dst_frame = frames[dst_version]

    def _usable(value: object) -> bool:
        # A cell counts only when it is a non-empty str (filters NA/None/"").
        return isinstance(value, str) and bool(value)

    # Source side: GeneticID -> individual_id (a later row wins on a
    # repeated GeneticID).
    src_rows = zip(
        src_frame.individual_id.tolist(),
        src_frame.genetic_id.tolist(),
        strict=True,
    )
    iid_by_src_gid: dict[str, str] = {
        gid: iid for iid, gid in src_rows if _usable(gid) and _usable(iid)
    }

    # Destination side: canonical individual -> all of its GeneticIDs.
    dst_rows = zip(
        dst_frame.individual_id.tolist(),
        dst_frame.genetic_id.tolist(),
        strict=True,
    )
    gids_by_canonical: dict[str, list[str]] = {}
    for iid, gid in dst_rows:
        if _usable(iid) and _usable(gid):
            key = bridge.canonical_id(dst_version, iid)
            if key not in gids_by_canonical:
                gids_by_canonical[key] = []
            gids_by_canonical[key].append(str(gid))

    resolved: dict[str, list[str]] = {}
    for query in ids:
        if query in iid_by_src_gid:
            key = bridge.canonical_id(src_version, iid_by_src_gid[query])
            resolved[query] = sorted(gids_by_canonical.get(key, []))
        else:
            resolved[query] = []
    return resolved
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _build_bridge(
    anno_frames: list[AnnoFrame],
    mid_bridge: Path | str | None,
) -> MIDBridge:
    """Return the MIDBridge used by the resolve_* functions.

    The bridge is auto-detected from `anno_frames`. When `mid_bridge` is
    given, the manual entries loaded from that file are merged on top of
    the detected bridge; any warnings reported by the merge are dropped
    here."""
    detected = detect_bridge(anno_frames)
    if mid_bridge is None:
        return detected

    from .bridge import load_manual_bridge, merge_with_overrides

    manual_entries = load_manual_bridge(Path(mid_bridge))
    merged, _ignored_warnings = merge_with_overrides(detected, manual_entries)
    return merged
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _resolve_anno_frames(
    anno_paths: dict[str, Path | str] | None,
    anno_frames: dict[str, AnnoFrame] | None,
    src_version: str,
    dst_version: str,
) -> dict[str, AnnoFrame]:
    """Produce the version_label -> AnnoFrame mapping for a resolve call.

    `anno_frames` takes precedence when both arguments are given and is
    returned as-is (same dict object). Otherwise every entry of
    `anno_paths` is loaded with `AnnoFrame.from_path`, using its key as
    the version label.

    Raises:
        KeyError: the mapping in use lacks `src_version` or `dst_version`.
        ValueError: neither `anno_paths` nor `anno_frames` was supplied.
    """
    if anno_frames is None and anno_paths is None:
        raise ValueError("either anno_paths or anno_frames must be supplied")

    if anno_frames is not None:
        if not (src_version in anno_frames and dst_version in anno_frames):
            raise KeyError(f"anno_frames must contain both {src_version!r} and {dst_version!r}")
        return anno_frames

    if not (src_version in anno_paths and dst_version in anno_paths):
        raise KeyError(f"anno_paths must contain both {src_version!r} and {dst_version!r}")

    loaded: dict[str, AnnoFrame] = {}
    for label, path in anno_paths.items():
        loaded[label] = AnnoFrame.from_path(Path(path), version_label=label)
    return loaded
|
aadr_resolve/__main__.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""AnnoFrame: the central library data type. Per LLD §2.4 + §3.6.
|
|
2
|
+
|
|
3
|
+
Day-1 scaffold: typed accessors for string columns (genetic_id, individual_id,
|
|
4
|
+
group_id, persistent_genetic_id). The Float64/Int64 accessors (date_calbp,
|
|
5
|
+
date_sd_bp, coverage, coverage_via) raise NotImplementedError — they land in
|
|
6
|
+
Day 2 alongside date_norm.py and coverage_norm.py.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Self
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
from .types import SchemaClass, SchemaClassDef
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
|
|
21
|
+
class AnnoFrame:
|
|
22
|
+
"""Schema-resolved .anno reader. Library-level public API.
|
|
23
|
+
|
|
24
|
+
See HLD §Library API surface and LLD §2.4 for the contract."""
|
|
25
|
+
|
|
26
|
+
version: str # parsed version label, e.g., "v66.0"
|
|
27
|
+
schema_class: SchemaClass # detected or override
|
|
28
|
+
schema_def: SchemaClassDef # loaded YAML for the class
|
|
29
|
+
df: pd.DataFrame # raw string-dtype cells, indexed 0..n_rows-1
|
|
30
|
+
# Original .anno path the loader read from. None when constructed
|
|
31
|
+
# directly (tests, in-memory cases). Day-6 addition so sibling tools
|
|
32
|
+
# (e.g., aadr-subset) can call resolve_master_ids without re-tracking
|
|
33
|
+
# paths separately.
|
|
34
|
+
path: Path | None = field(default=None, compare=False)
|
|
35
|
+
# Day-2 caches; declared here so the dataclass shape is stable.
|
|
36
|
+
_date_calbp_cache: pd.Series | None = field(default=None, repr=False, compare=False)
|
|
37
|
+
_coverage_cache: dict[str, pd.Series] = field(default_factory=dict, repr=False, compare=False)
|
|
38
|
+
|
|
39
|
+
@classmethod
|
|
40
|
+
def from_path(
|
|
41
|
+
cls,
|
|
42
|
+
path: Path | str,
|
|
43
|
+
*,
|
|
44
|
+
version_label: str | None = None,
|
|
45
|
+
schema_override: SchemaClass | None = None,
|
|
46
|
+
) -> Self:
|
|
47
|
+
"""Load an .anno end-to-end. Delegates to loader.read_anno()."""
|
|
48
|
+
# Local import to break the loader <-> annoframe cycle.
|
|
49
|
+
from .loader import read_anno
|
|
50
|
+
|
|
51
|
+
return read_anno( # type: ignore[return-value]
|
|
52
|
+
Path(path),
|
|
53
|
+
version_label=version_label,
|
|
54
|
+
schema_override=schema_override,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
# === Cardinality ===
|
|
58
|
+
|
|
59
|
+
@property
|
|
60
|
+
def n_rows(self) -> int:
|
|
61
|
+
return len(self.df)
|
|
62
|
+
|
|
63
|
+
@property
|
|
64
|
+
def n_columns(self) -> int:
|
|
65
|
+
return int(self.df.shape[1])
|
|
66
|
+
|
|
67
|
+
# === Identity columns (always string dtype) ===
|
|
68
|
+
|
|
69
|
+
@property
|
|
70
|
+
def genetic_id(self) -> pd.Series:
|
|
71
|
+
return self._raw_column("genetic_id").astype("string").copy()
|
|
72
|
+
|
|
73
|
+
@property
|
|
74
|
+
def individual_id(self) -> pd.Series:
|
|
75
|
+
return self._raw_column("individual_id").astype("string").copy()
|
|
76
|
+
|
|
77
|
+
@property
|
|
78
|
+
def group_id(self) -> pd.Series:
|
|
79
|
+
return self._raw_column("group_id").astype("string").copy()
|
|
80
|
+
|
|
81
|
+
@property
|
|
82
|
+
def persistent_genetic_id(self) -> pd.Series | None:
|
|
83
|
+
"""v66+ only (class E). Returns None for classes A-D.
|
|
84
|
+
|
|
85
|
+
Int64 nullable Series of the numeric Persistent Genetic ID column."""
|
|
86
|
+
if self.schema_class != SchemaClass.E:
|
|
87
|
+
return None
|
|
88
|
+
from .date_norm import to_int64_nullable
|
|
89
|
+
|
|
90
|
+
raw = self._raw_column("persistent_genetic_id")
|
|
91
|
+
return to_int64_nullable(raw).copy()
|
|
92
|
+
|
|
93
|
+
# === Int64-nullable date accessors ===
|
|
94
|
+
|
|
95
|
+
@property
|
|
96
|
+
def date_calbp(self) -> pd.Series:
|
|
97
|
+
"""Canonical calBP (integer years before 1950 CE) as nullable Int64.
|
|
98
|
+
|
|
99
|
+
Cached on first access; cleared by reset_caches(). Per HLD §Date
|
|
100
|
+
normalization, bench-verified clean across all 6 versions (0 nulls;
|
|
101
|
+
range 0-185000)."""
|
|
102
|
+
if self._date_calbp_cache is None:
|
|
103
|
+
from .date_norm import to_int64_nullable
|
|
104
|
+
|
|
105
|
+
raw = self._raw_column("date_mean_bp")
|
|
106
|
+
self._date_calbp_cache = to_int64_nullable(raw)
|
|
107
|
+
return self._date_calbp_cache.copy()
|
|
108
|
+
|
|
109
|
+
@property
|
|
110
|
+
def date_sd_bp(self) -> pd.Series:
|
|
111
|
+
"""Date SD (BP) as nullable Int64. Used by CI-aware time-series cohorts."""
|
|
112
|
+
from .date_norm import to_int64_nullable
|
|
113
|
+
|
|
114
|
+
raw = self._raw_column("date_sd_bp")
|
|
115
|
+
return to_int64_nullable(raw).copy()
|
|
116
|
+
|
|
117
|
+
# === Float64 coverage accessors ===
|
|
118
|
+
|
|
119
|
+
@property
|
|
120
|
+
def coverage(self) -> pd.Series:
|
|
121
|
+
"""1240k-target coverage Float64. NaN for missing.
|
|
122
|
+
|
|
123
|
+
For class D (v62, no native column) returns an all-NaN Series of
|
|
124
|
+
length n_rows. Use coverage_via('snps_hit_1240k') for the derived
|
|
125
|
+
proxy (with Poisson-divergence stderr warning)."""
|
|
126
|
+
return self.coverage_via("coverage_1240k")
|
|
127
|
+
|
|
128
|
+
def coverage_via(self, canonical_field: str) -> pd.Series:
|
|
129
|
+
"""Float64 coverage from a specified canonical field.
|
|
130
|
+
|
|
131
|
+
See coverage_norm.resolve_coverage for the per-class routing logic
|
|
132
|
+
+ the Poisson-divergence caveat when 'snps_hit_1240k' is requested.
|
|
133
|
+
|
|
134
|
+
Results are cached on the AnnoFrame instance (one cache per
|
|
135
|
+
canonical_field); copies are returned to library consumers so
|
|
136
|
+
mutation by the caller doesn't corrupt the cache."""
|
|
137
|
+
if canonical_field in self._coverage_cache:
|
|
138
|
+
return self._coverage_cache[canonical_field].copy()
|
|
139
|
+
|
|
140
|
+
from .coverage_norm import resolve_coverage
|
|
141
|
+
|
|
142
|
+
series = resolve_coverage(self, canonical_field)
|
|
143
|
+
self._coverage_cache[canonical_field] = series
|
|
144
|
+
return series.copy()
|
|
145
|
+
|
|
146
|
+
def reset_caches(self) -> None:
|
|
147
|
+
"""Clear the typed-accessor caches.
|
|
148
|
+
|
|
149
|
+
Useful in tests when the same AnnoFrame instance is reused across
|
|
150
|
+
scenarios that monkeypatch coverage_norm.PANEL_CARDINALITY_1240K
|
|
151
|
+
(the cached values would otherwise reflect the pre-patch state)."""
|
|
152
|
+
self._date_calbp_cache = None
|
|
153
|
+
self._coverage_cache.clear()
|
|
154
|
+
|
|
155
|
+
# === Internal helpers ===
|
|
156
|
+
|
|
157
|
+
def _raw_column(self, canonical_field: str) -> pd.Series:
|
|
158
|
+
"""Raw string Series for a canonical field. Raises if absent in class."""
|
|
159
|
+
col_idx = self.schema_def.column_for(canonical_field)
|
|
160
|
+
return self.df.iloc[:, col_idx - 1]
|
|
161
|
+
|
|
162
|
+
# === Diagnostic ===
|
|
163
|
+
|
|
164
|
+
def to_dict(self) -> dict[str, Any]:
|
|
165
|
+
"""Serializable summary for the `schema` subcommand's JSON output."""
|
|
166
|
+
mapped_fields: dict[str, dict[str, Any]] = {}
|
|
167
|
+
for canonical, mapping in self.schema_def.fields.items():
|
|
168
|
+
mapped_fields[canonical] = {
|
|
169
|
+
"column": mapping.column,
|
|
170
|
+
"normalized_header": mapping.normalized_header,
|
|
171
|
+
"display_header": mapping.display_header,
|
|
172
|
+
}
|
|
173
|
+
return {
|
|
174
|
+
"version": self.version,
|
|
175
|
+
"schema_class": self.schema_class.value,
|
|
176
|
+
"n_rows": self.n_rows,
|
|
177
|
+
"n_columns": self.n_columns,
|
|
178
|
+
"detection_signature": list(self.schema_def.detection_signature),
|
|
179
|
+
"fields": mapped_fields,
|
|
180
|
+
"not_present": list(self.schema_def.not_present),
|
|
181
|
+
"notes": list(self.schema_def.notes),
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
def __repr__(self) -> str:
|
|
185
|
+
return (
|
|
186
|
+
f"AnnoFrame(version={self.version!r}, "
|
|
187
|
+
f"schema_class={self.schema_class.value!r}, "
|
|
188
|
+
f"n_rows={self.n_rows}, n_columns={self.n_columns})"
|
|
189
|
+
)
|