aadr-resolve 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,177 @@
1
+ """aadr-resolve: AADR cross-version GeneticID / MasterID join utility."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING
7
+
8
+ if TYPE_CHECKING:
9
+ from .types import MIDBridge
10
+
11
+ __version__ = "0.1.0"
12
+
13
+ from .annoframe import AnnoFrame
14
+ from .bridge import detect_bridge
15
+ from .errors import (
16
+ AadrResolveError,
17
+ CollisionDetected,
18
+ InvariantViolation,
19
+ IOFailure,
20
+ MissingNativeFieldError,
21
+ SchemaDetectionError,
22
+ UsageError,
23
+ ValidationError,
24
+ )
25
+
26
+ __all__ = [ # noqa: RUF022 — grouped intentionally (data type, API, errors, version)
27
+ # Core data type
28
+ "AnnoFrame",
29
+ # Library API top-level functions
30
+ "resolve_genetic_ids",
31
+ "resolve_master_ids",
32
+ # Exception hierarchy (aadr-subset etc. catch + re-raise these)
33
+ "AadrResolveError",
34
+ "CollisionDetected",
35
+ "InvariantViolation",
36
+ "IOFailure",
37
+ "MissingNativeFieldError",
38
+ "SchemaDetectionError",
39
+ "UsageError",
40
+ "ValidationError",
41
+ # Version
42
+ "__version__",
43
+ ]
44
+
45
+
46
def resolve_master_ids(
    ids: list[str],
    src_version: str,
    dst_version: str,
    *,
    anno_paths: dict[str, Path | str] | None = None,
    anno_frames: dict[str, AnnoFrame] | None = None,
    mid_bridge: Path | str | None = None,
) -> dict[str, str | None]:
    """Resolve a list of individual_ids from src_version to dst_version
    GeneticIDs through the MID-rename bridge.

    Per HLD §Library API surface. For each input id, returns its
    representative GeneticID in dst_version (the alphabetically-first
    among the matching rows for that individual). None if the individual
    is absent from dst_version.

    Either `anno_paths` (dict version_label -> Path) OR `anno_frames`
    (dict version_label -> AnnoFrame) must be supplied.

    `mid_bridge` (Path or str to a TSV file) supplies manual MID-rename
    entries that layer on top of the GID-stable auto-detection — the
    same semantics as the `--mid-bridge` CLI flag. Format: header
    `v_old_label\\tmid_old\\tv_new_label\\tmid_new`.

    Rows of dst_version whose individual_id or GeneticID is missing or
    empty are ignored, matching `resolve_genetic_ids`; a missing cell is
    never returned as a GeneticID.

    Returns a dict keyed by each entry of `ids` (first-occurrence order).
    """
    afs = _resolve_anno_frames(anno_paths, anno_frames, src_version, dst_version)
    bridge = _build_bridge(list(afs.values()), mid_bridge)
    af_dst = afs[dst_version]

    # Index dst_version once (canonical individual -> smallest GeneticID)
    # instead of rescanning every dst row for every query.
    first_gid_by_canonical: dict[str, str] = {}
    for iid, gid in zip(
        af_dst.individual_id.tolist(),
        af_dst.genetic_id.tolist(),
        strict=True,
    ):
        if not isinstance(iid, str) or not iid:
            continue
        # A missing cell is not a str; str() on it would leak a
        # placeholder such as "<NA>" into the result.
        if not isinstance(gid, str) or not gid:
            continue
        canonical = bridge.canonical_id(dst_version, iid)
        current = first_gid_by_canonical.get(canonical)
        if current is None or gid < current:
            first_gid_by_canonical[canonical] = gid

    return {
        query: first_gid_by_canonical.get(bridge.canonical_id(src_version, query))
        for query in ids
    }
88
+
89
+
90
def resolve_genetic_ids(
    ids: list[str],
    src_version: str,
    dst_version: str,
    *,
    anno_paths: dict[str, Path | str] | None = None,
    anno_frames: dict[str, AnnoFrame] | None = None,
    mid_bridge: Path | str | None = None,
) -> dict[str, list[str]]:
    """Map each src_version GeneticID to every dst_version GeneticID that
    belongs to the same individual.

    The result keeps multi-row-per-IID semantics: an input GeneticID maps
    to a sorted list, which holds several entries when the individual has
    more than one row in dst_version and is empty when dst_version has no
    row for that individual (or the GeneticID is unknown in src_version).

    `mid_bridge` (Path or str): optional manual MID-rename override TSV;
    same semantics as the `--mid-bridge` CLI flag.

    Either `anno_paths` or `anno_frames` must be supplied and must cover
    both version labels.
    """
    frames = _resolve_anno_frames(anno_paths, anno_frames, src_version, dst_version)
    bridge = _build_bridge(list(frames.values()), mid_bridge)
    src_frame = frames[src_version]
    dst_frame = frames[dst_version]

    def _usable(cell: object) -> bool:
        # Only non-empty strings count as ids; missing cells are skipped.
        return isinstance(cell, str) and bool(cell)

    # Step 1: src GeneticID -> src individual_id (later rows win on duplicates).
    iid_by_src_gid: dict[str, str] = {
        gid: iid
        for iid, gid in zip(
            src_frame.individual_id.tolist(),
            src_frame.genetic_id.tolist(),
            strict=True,
        )
        if _usable(gid) and _usable(iid)
    }

    # Step 2: canonical individual -> all dst GeneticIDs, in row order.
    gids_by_canonical: dict[str, list[str]] = {}
    for iid, gid in zip(
        dst_frame.individual_id.tolist(),
        dst_frame.genetic_id.tolist(),
        strict=True,
    ):
        if _usable(iid) and _usable(gid):
            key = bridge.canonical_id(dst_version, iid)
            gids_by_canonical.setdefault(key, []).append(str(gid))

    # Step 3: answer each query through the two indexes.
    resolved: dict[str, list[str]] = {}
    for query in ids:
        src_iid = iid_by_src_gid.get(query)
        if src_iid is None:
            resolved[query] = []
        else:
            key = bridge.canonical_id(src_version, src_iid)
            resolved[query] = sorted(gids_by_canonical.get(key, []))
    return resolved
142
+
143
+
144
def _build_bridge(
    anno_frames: list[AnnoFrame],
    mid_bridge: Path | str | None,
) -> MIDBridge:
    """Return the MIDBridge used by the resolve_* functions.

    The bridge is auto-detected from `anno_frames`. When `mid_bridge` is
    given, the manual entries loaded from that file are merged on top of
    the detected bridge; any warnings reported by the merge are discarded.
    """
    detected = detect_bridge(anno_frames)
    if mid_bridge is None:
        return detected

    # Imported on demand, only when a manual override file is in play.
    from .bridge import load_manual_bridge, merge_with_overrides

    merged, _ = merge_with_overrides(detected, load_manual_bridge(Path(mid_bridge)))
    return merged
158
+
159
+
160
+ def _resolve_anno_frames(
161
+ anno_paths: dict[str, Path | str] | None,
162
+ anno_frames: dict[str, AnnoFrame] | None,
163
+ src_version: str,
164
+ dst_version: str,
165
+ ) -> dict[str, AnnoFrame]:
166
+ if anno_frames is not None:
167
+ if src_version not in anno_frames or dst_version not in anno_frames:
168
+ raise KeyError(f"anno_frames must contain both {src_version!r} and {dst_version!r}")
169
+ return anno_frames
170
+ if anno_paths is not None:
171
+ if src_version not in anno_paths or dst_version not in anno_paths:
172
+ raise KeyError(f"anno_paths must contain both {src_version!r} and {dst_version!r}")
173
+ return {
174
+ label: AnnoFrame.from_path(Path(path), version_label=label)
175
+ for label, path in anno_paths.items()
176
+ }
177
+ raise ValueError("either anno_paths or anno_frames must be supplied")
@@ -0,0 +1,8 @@
1
+ """Enables `python -m aadr_resolve`."""
2
+
3
+ import sys
4
+
5
+ from .cli import main
6
+
7
# Run the CLI only when this module is executed directly (e.g. via
# `python -m aadr_resolve`); main()'s return value is handed to sys.exit.
if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,189 @@
1
+ """AnnoFrame: the central library data type. Per LLD §2.4 + §3.6.
2
+
3
+ Typed accessors: string dtype for the identity columns (genetic_id,
4
+ individual_id, group_id); nullable Int64 for persistent_genetic_id, date_calbp
5
+ and date_sd_bp (via date_norm.py); Float64 for coverage and coverage_via
6
+ (via coverage_norm.py).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass, field
12
+ from pathlib import Path
13
+ from typing import Any, Self
14
+
15
+ import pandas as pd
16
+
17
+ from .types import SchemaClass, SchemaClassDef
18
+
19
+
20
+ @dataclass
21
+ class AnnoFrame:
22
+ """Schema-resolved .anno reader. Library-level public API.
23
+
24
+ See HLD §Library API surface and LLD §2.4 for the contract."""
25
+
26
+ version: str # parsed version label, e.g., "v66.0"
27
+ schema_class: SchemaClass # detected or override
28
+ schema_def: SchemaClassDef # loaded YAML for the class
29
+ df: pd.DataFrame # raw string-dtype cells, indexed 0..n_rows-1
30
+ # Original .anno path the loader read from. None when constructed
31
+ # directly (tests, in-memory cases). Day-6 addition so sibling tools
32
+ # (e.g., aadr-subset) can call resolve_master_ids without re-tracking
33
+ # paths separately.
34
+ path: Path | None = field(default=None, compare=False)
35
+ # Day-2 caches; declared here so the dataclass shape is stable.
36
+ _date_calbp_cache: pd.Series | None = field(default=None, repr=False, compare=False)
37
+ _coverage_cache: dict[str, pd.Series] = field(default_factory=dict, repr=False, compare=False)
38
+
39
+ @classmethod
40
+ def from_path(
41
+ cls,
42
+ path: Path | str,
43
+ *,
44
+ version_label: str | None = None,
45
+ schema_override: SchemaClass | None = None,
46
+ ) -> Self:
47
+ """Load an .anno end-to-end. Delegates to loader.read_anno()."""
48
+ # Local import to break the loader <-> annoframe cycle.
49
+ from .loader import read_anno
50
+
51
+ return read_anno( # type: ignore[return-value]
52
+ Path(path),
53
+ version_label=version_label,
54
+ schema_override=schema_override,
55
+ )
56
+
57
+ # === Cardinality ===
58
+
59
+ @property
60
+ def n_rows(self) -> int:
61
+ return len(self.df)
62
+
63
+ @property
64
+ def n_columns(self) -> int:
65
+ return int(self.df.shape[1])
66
+
67
+ # === Identity columns (always string dtype) ===
68
+
69
+ @property
70
+ def genetic_id(self) -> pd.Series:
71
+ return self._raw_column("genetic_id").astype("string").copy()
72
+
73
+ @property
74
+ def individual_id(self) -> pd.Series:
75
+ return self._raw_column("individual_id").astype("string").copy()
76
+
77
+ @property
78
+ def group_id(self) -> pd.Series:
79
+ return self._raw_column("group_id").astype("string").copy()
80
+
81
+ @property
82
+ def persistent_genetic_id(self) -> pd.Series | None:
83
+ """v66+ only (class E). Returns None for classes A-D.
84
+
85
+ Int64 nullable Series of the numeric Persistent Genetic ID column."""
86
+ if self.schema_class != SchemaClass.E:
87
+ return None
88
+ from .date_norm import to_int64_nullable
89
+
90
+ raw = self._raw_column("persistent_genetic_id")
91
+ return to_int64_nullable(raw).copy()
92
+
93
+ # === Int64-nullable date accessors ===
94
+
95
+ @property
96
+ def date_calbp(self) -> pd.Series:
97
+ """Canonical calBP (integer years before 1950 CE) as nullable Int64.
98
+
99
+ Cached on first access; cleared by reset_caches(). Per HLD §Date
100
+ normalization, bench-verified clean across all 6 versions (0 nulls;
101
+ range 0-185000)."""
102
+ if self._date_calbp_cache is None:
103
+ from .date_norm import to_int64_nullable
104
+
105
+ raw = self._raw_column("date_mean_bp")
106
+ self._date_calbp_cache = to_int64_nullable(raw)
107
+ return self._date_calbp_cache.copy()
108
+
109
+ @property
110
+ def date_sd_bp(self) -> pd.Series:
111
+ """Date SD (BP) as nullable Int64. Used by CI-aware time-series cohorts."""
112
+ from .date_norm import to_int64_nullable
113
+
114
+ raw = self._raw_column("date_sd_bp")
115
+ return to_int64_nullable(raw).copy()
116
+
117
+ # === Float64 coverage accessors ===
118
+
119
+ @property
120
+ def coverage(self) -> pd.Series:
121
+ """1240k-target coverage Float64. NaN for missing.
122
+
123
+ For class D (v62, no native column) returns an all-NaN Series of
124
+ length n_rows. Use coverage_via('snps_hit_1240k') for the derived
125
+ proxy (with Poisson-divergence stderr warning)."""
126
+ return self.coverage_via("coverage_1240k")
127
+
128
+ def coverage_via(self, canonical_field: str) -> pd.Series:
129
+ """Float64 coverage from a specified canonical field.
130
+
131
+ See coverage_norm.resolve_coverage for the per-class routing logic
132
+ + the Poisson-divergence caveat when 'snps_hit_1240k' is requested.
133
+
134
+ Results are cached on the AnnoFrame instance (one cache per
135
+ canonical_field); copies are returned to library consumers so
136
+ mutation by the caller doesn't corrupt the cache."""
137
+ if canonical_field in self._coverage_cache:
138
+ return self._coverage_cache[canonical_field].copy()
139
+
140
+ from .coverage_norm import resolve_coverage
141
+
142
+ series = resolve_coverage(self, canonical_field)
143
+ self._coverage_cache[canonical_field] = series
144
+ return series.copy()
145
+
146
+ def reset_caches(self) -> None:
147
+ """Clear the typed-accessor caches.
148
+
149
+ Useful in tests when the same AnnoFrame instance is reused across
150
+ scenarios that monkeypatch coverage_norm.PANEL_CARDINALITY_1240K
151
+ (the cached values would otherwise reflect the pre-patch state)."""
152
+ self._date_calbp_cache = None
153
+ self._coverage_cache.clear()
154
+
155
+ # === Internal helpers ===
156
+
157
+ def _raw_column(self, canonical_field: str) -> pd.Series:
158
+ """Raw string Series for a canonical field. Raises if absent in class."""
159
+ col_idx = self.schema_def.column_for(canonical_field)
160
+ return self.df.iloc[:, col_idx - 1]
161
+
162
+ # === Diagnostic ===
163
+
164
+ def to_dict(self) -> dict[str, Any]:
165
+ """Serializable summary for the `schema` subcommand's JSON output."""
166
+ mapped_fields: dict[str, dict[str, Any]] = {}
167
+ for canonical, mapping in self.schema_def.fields.items():
168
+ mapped_fields[canonical] = {
169
+ "column": mapping.column,
170
+ "normalized_header": mapping.normalized_header,
171
+ "display_header": mapping.display_header,
172
+ }
173
+ return {
174
+ "version": self.version,
175
+ "schema_class": self.schema_class.value,
176
+ "n_rows": self.n_rows,
177
+ "n_columns": self.n_columns,
178
+ "detection_signature": list(self.schema_def.detection_signature),
179
+ "fields": mapped_fields,
180
+ "not_present": list(self.schema_def.not_present),
181
+ "notes": list(self.schema_def.notes),
182
+ }
183
+
184
+ def __repr__(self) -> str:
185
+ return (
186
+ f"AnnoFrame(version={self.version!r}, "
187
+ f"schema_class={self.schema_class.value!r}, "
188
+ f"n_rows={self.n_rows}, n_columns={self.n_columns})"
189
+ )