geneharmony 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ """
2
+ geneharmony — async toolkit to normalize gene identifiers and annotate gene sets.
3
+ Resolves symbols/IDs against the Alliance of Genome Resources (AGR) and annotates with data from AGR or user-ingested datasets.
4
+ """
5
+ __version__ = "0.3.0"
6
+
7
+ from .annotator import Annotator
8
+ from .datasets import AGRDataset
9
+ from .taxa import TaxonField, taxon_mapper, resolve_taxon
10
+
11
+ __all__ = [
12
+ "Annotator",
13
+ "AGRDataset",
14
+ "TaxonField",
15
+ "taxon_mapper",
16
+ "resolve_taxon",
17
+ ]
@@ -0,0 +1,356 @@
1
+ """User-facing entry point: bulk download, per-gene API, and annotate.
2
+
3
+ `Annotator` ties the lower-level pieces (`AGRClient`, `Downloader`, the gene
4
+ index, the dataset registry) into one object over a resolved cache directory. The
5
+ intended use is an iterative *filter-then-requery* traversal — one primary AGR
6
+ dataset per `annotate` call — so cardinality stays under the caller's control:
7
+
8
+ ann = Annotator()
9
+ orth = await ann.annotate(["TP53", "BRCA1"], AGRDataset.ORTHOLOGY, taxon="human")
10
+ mouse = orth.loc[orth.Gene2SpeciesTaxonID == "NCBITaxon:10090", "Gene2ID"].unique()
11
+ pheno = await ann.annotate(list(mouse), AGRDataset.PHENOTYPES, taxon="mouse")
12
+
13
+ `annotate` wide-left-joins each source onto the normalized base frame in order:
14
+ an AGR dataset contributes its native columns; an ingested external annotation
15
+ (referenced by name) contributes columns prefixed with `name.`.
16
+ """
17
+
18
+ import asyncio
19
+ import math
20
+ import os
21
+ from contextlib import AsyncExitStack
22
+ from pathlib import Path
23
+ from typing import Final
24
+ import pandas as pd
25
+
26
+ from .client import AGRClient
27
+ from .datasets import DATASETS, AGRDataset, ApiSpec, BulkSpec
28
+ from .downloader import Downloader
29
+ from .ingest import load_tsv_gz
30
+ from .models import DownloadFile
31
+ from .normalizer import GeneIndex, build_gene_index
32
+ from .taxa import resolve_taxon
33
+ from .store import read_parquet, write_parquet
34
+
35
+ type Genes = str | list[str] | pd.DataFrame
36
+
37
+ _APP_DIR: Final = "geneharmony"
38
+ _GENE_ID: Final = "GeneId"
39
+ _PAGE_SIZE: Final = 500
40
+
41
+
42
def default_cache_dir() -> Path:
    """Return the default cache directory for this package.

    ``$XDG_CACHE_HOME`` is used as the root when it is set to a non-empty
    value; otherwise the root is ``~/.cache``. The application sub-directory
    is appended in both cases. Nothing is created on disk here.
    """
    xdg_home = os.environ.get("XDG_CACHE_HOME")
    if xdg_home:
        return Path(xdg_home) / _APP_DIR
    return Path.home() / ".cache" / _APP_DIR
46
+
47
+
48
+ def resolve_cache_dir(cache_dir: Path | None) -> Path:
49
+ cache = cache_dir or default_cache_dir()
50
+ cache.mkdir(parents=True, exist_ok=True)
51
+ return cache
52
+
53
class Annotator:
    """User-facing facade over the AGR client, downloader, gene index and caches.

    Everything is cached under one directory: ``bulk/<dataset>.parquet`` for
    converted bulk files, ``api/<dataset>/<gene>.parquet`` for per-gene API
    results and ``external/<name>.parquet`` for ingested user annotations.
    """

    def __init__(
        self,
        cache_dir: Path | None = None,
        *,
        client: AGRClient | None = None,
        downloader: Downloader | None = None,
    ) -> None:
        """Bind the annotator to a cache directory and optional shared clients.

        Args:
            cache_dir: Cache root; ``None`` selects the default location.
            client: Reusable `AGRClient`. When omitted, a short-lived client is
                opened (and closed) around each operation that needs one.
            downloader: Reusable `Downloader`, with the same lifetime rule.
        """
        self._cache = resolve_cache_dir(cache_dir)
        self._client = client
        self._downloader = downloader
        self._index: GeneIndex | None = None

    async def normalize(
        self,
        genes: str | list[str],
        *,
        taxon: str | None = None,
        limit: int | None = 1,
        case_insensitive: bool = False,
    ) -> pd.DataFrame:
        """Resolve symbols/IDs through the gene index (built on first use).

        All keyword arguments are forwarded unchanged to `GeneIndex.lookup`,
        whose result frame is returned as-is.
        """
        index = await self._gene_index()
        return index.lookup(
            genes,
            taxon=taxon,
            limit=limit,
            case_insensitive=case_insensitive,
        )

    async def download(self, dataset: AGRDataset, *, refresh: bool = False) -> Path:
        """Download a dataset's bulk file, convert TSV -> Parquet, drop the .tsv.gz.

        Args:
            dataset: Dataset to fetch; it must declare a bulk backend.
            refresh: Re-download even when the Parquet file is already cached.

        Returns:
            Path of the cached Parquet file.

        Raises:
            ValueError: The dataset has no bulk file.
            LookupError: No entry of the downloads listing matches the spec.
        """
        bulk = DATASETS[dataset].bulk
        if bulk is None:
            raise ValueError(f"{dataset!r} has no bulk file; query it via annotate()")
        dest = self._cache / "bulk" / f"{dataset}.parquet"
        if dest.exists() and not refresh:
            return dest
        async with AsyncExitStack() as stack:
            client = self._client or await stack.enter_async_context(AGRClient())
            downloader = self._downloader or await stack.enter_async_context(Downloader())
            file = _select_download(await client.list_downloads(), bulk)
            tmp = dest.with_name(f"{dest.stem}.tsv.gz")
            # A leftover archive from an interrupted run would make the
            # downloader return early (it skips existing destinations when no
            # expected size is given), silently reusing stale data.
            tmp.unlink(missing_ok=True)
            try:
                await downloader.download(file.s3Url, tmp)
                write_parquet(load_tsv_gz(tmp, dtype=str), dest)
            finally:
                # Remove the archive on success *and* on failure.
                tmp.unlink(missing_ok=True)
        return dest

    async def ingest_annotation(
        self,
        source: str | Path | pd.DataFrame,
        name: str,
        *,
        gene_id_column: str | list[str],
        normalize: bool = True,
        taxon: str | None = None,
        case_insensitive: bool = False,
        override: bool = False,
    ) -> tuple[dict, pd.DataFrame | None]:
        """Store an external annotation table, keyed by canonical `GeneId`.

        `gene_id_column` may name several columns; they are tried left-to-right
        per row, the first identifier that resolves wins (a fallback for tables
        whose primary ID column has gaps). The id columns are kept as-is and a
        separate `GeneId` column is added for the resolved canonical id, so the
        input must not already contain a `GeneId` column. The returned unmapped
        frame holds the rows (with their original columns) where no candidate
        resolved.

        Args:
            source: A DataFrame (copied, never mutated) or a path to a table.
            name: Annotation name; also the stem of the stored Parquet file.
            gene_id_column: One column name or an ordered list of candidates.
            normalize: Resolve identifiers through the gene index. When false,
                the first non-null candidate value is stored verbatim.
            taxon: Optional taxon restriction passed to the lookup.
            case_insensitive: Passed to the lookup.
            override: Replace an already-ingested annotation of the same name.

        Returns:
            ``(summary, unmapped)``. When the annotation already exists and
            `override` is false nothing is read or written: the summary only
            carries the name and `unmapped` is ``None``. `unmapped` is also
            ``None`` when `normalize` is false.

        Raises:
            ValueError: No id column given, or the input already has `GeneId`.
            KeyError: A named id column is absent from the table.
        """
        dest = self._cache / "external" / f"{name}.parquet"
        if dest.exists() and not override:
            return _ingest_summary(name, None, None, None, None), None

        df = source.copy() if isinstance(source, pd.DataFrame) else _read_table(source)
        columns = [gene_id_column] if isinstance(gene_id_column, str) else list(gene_id_column)
        if not columns:
            raise ValueError("gene_id_column must name at least one column")
        missing = [c for c in columns if c not in df.columns]
        if missing:
            raise KeyError(
                f"gene_id_column(s) {missing!r} not found; columns are {list(df.columns)}"
            )
        if _GENE_ID in df.columns:
            raise ValueError(
                f"input already has a {_GENE_ID!r} column; rename it — a separate "
                f"{_GENE_ID!r} column is added for the resolved canonical id."
            )

        if normalize:
            raw = df[columns]
            # Stringify real values only. Missing cells stay missing instead of
            # turning into the literal queries "nan" / "None" / "<NA>".
            str_cols = raw.astype(str).mask(raw.isna().to_numpy())
            values = [
                v for v in pd.unique(str_cols.to_numpy().ravel()) if isinstance(v, str)
            ]
            mapping = await self._id_map(values, taxon, case_insensitive)
            # Map each candidate column, then take the first resolved id per row.
            resolved = str_cols.apply(lambda col: col.map(mapping)).bfill(axis=1).iloc[:, 0]
        else:
            resolved = df[columns].bfill(axis=1).iloc[:, 0]

        df[_GENE_ID] = resolved

        rows_in = len(df)
        rows_dropped = 0
        unmapped_df: pd.DataFrame | None = None
        if normalize:
            unmapped_mask = df[_GENE_ID].isna()
            unmapped_df = df[unmapped_mask].copy()
            rows_dropped = int(unmapped_mask.sum())
            df = df[~unmapped_mask].reset_index(drop=True)

        write_parquet(df, dest)
        return _ingest_summary(name, rows_in, len(df), rows_dropped, normalize), unmapped_df

    async def annotate(
        self,
        genes: Genes,
        *sources: AGRDataset | str,
        taxon: str | None = None,
        limit: int | None = 1,
        case_insensitive: bool = False,
    ) -> pd.DataFrame:
        """Left-join each source, in order, onto the normalized base frame.

        Args:
            genes: Identifiers to normalize first, or an existing frame that
                already carries a `GeneId` column (copied, not mutated).
            *sources: `AGRDataset` members and/or names of ingested annotations.
            taxon, limit, case_insensitive: Normalization options; ignored when
                `genes` is already a DataFrame.

        Returns:
            The base frame widened by every source, plus one boolean
            ``has.<source>`` column per source marking rows that found a match.

        Raises:
            KeyError: The base frame lacks `GeneId`, or a named source is
                unknown.
        """
        if isinstance(genes, pd.DataFrame):
            base = genes.copy()
        else:
            base = await self.normalize(
                genes, taxon=taxon, limit=limit, case_insensitive=case_insensitive
            )
        if _GENE_ID not in base.columns:
            raise KeyError(f"base frame has no {_GENE_ID!r} column to join on")

        gene_ids = base[_GENE_ID].dropna().unique().tolist()
        out = base
        for source in sources:
            if isinstance(source, AGRDataset):
                frame, key = await self._load_agr_source(source, gene_ids)
            else:
                frame, key = self._load_external(source)
            flag = f"has.{source}"
            frame[flag] = True
            out = out.merge(frame, how="left", left_on=_GENE_ID, right_on=key)
            # After the left join the flag is True or missing; notna() yields a
            # real bool column without the deprecated object-dtype fillna.
            out[flag] = out[flag].notna()
        return out.reset_index(drop=True)

    async def get_orthologs(
        self,
        genes: Genes,
        taxon: str | None = None,
        target_taxon: str | None = None,
        limit: int | None = 1,
        case_insensitive: bool = False
    ) -> pd.DataFrame:
        """Convenience method to get orthologs for a set of genes.

        Annotates with `AGRDataset.ORTHOLOGY`, optionally keeps only orthologs
        in `target_taxon`, and returns the query/match columns alongside the
        ortholog id, symbol and taxon columns.
        """
        df = await self.annotate(
            genes,
            AGRDataset.ORTHOLOGY,
            taxon=taxon,
            limit=limit,
            case_insensitive=case_insensitive,
        )
        if target_taxon:
            df = df[df["Gene2SpeciesTaxonID"] == resolve_taxon(target_taxon).id]

        return df[["query", "match_kind", "Gene2ID", "Gene2Symbol", "Gene2SpeciesTaxonID"]]

    async def _gene_index(self) -> GeneIndex:
        """Lazily build the gene index, caching it for the Annotator's lifetime.

        The `GENE` bulk file is downloaded and converted to `bulk/gene.parquet`
        like any other dataset; the index is then built from it in memory.
        """
        if self._index is None:
            path = await self.download(AGRDataset.GENE)
            self._index = build_gene_index(read_parquet(path))
        return self._index

    async def _id_map(
        self, queries: list[str], taxon: str | None, case_insensitive: bool = False
    ) -> dict[str, str]:
        """Map each resolvable query to its canonical `GeneId` (best match only)."""
        index = await self._gene_index()
        unique = list(dict.fromkeys(queries))  # de-duplicate, keep first-seen order
        df = index.lookup(unique, taxon=taxon, limit=1, case_insensitive=case_insensitive)
        df = df[df["match_kind"].notna()]  # unmatched queries have a null match_kind
        return dict(zip(df["query"], df[_GENE_ID]))

    async def _load_agr_source(
        self, dataset: AGRDataset, gene_ids: list[str]
    ) -> tuple[pd.DataFrame, str]:
        """Return the rows of `dataset` for `gene_ids` and the column to join on.

        The bulk backend is preferred when the dataset declares one (the file
        is downloaded on first use); otherwise the per-gene API is queried.
        """
        spec = DATASETS[dataset]
        if spec.bulk is not None:
            path = self._cache / "bulk" / f"{dataset}.parquet"
            if not path.exists():
                await self.download(dataset)
            frame = read_parquet(path)
            key = spec.bulk.join_key
            return frame[frame[key].isin(set(gene_ids))].reset_index(drop=True), key
        assert spec.api is not None
        return await self._fetch_api(dataset, spec.api, gene_ids), spec.api.join_key

    async def _fetch_api(
        self, dataset: AGRDataset, api: ApiSpec, gene_ids: list[str]
    ) -> pd.DataFrame:
        """Collect per-gene API rows, reading cached genes and fetching the rest.

        Cached frames come first (in `gene_ids` order), followed by freshly
        fetched ones. With no genes at all, an empty frame holding only the
        join key column is returned.
        """
        api_dir = self._cache / "api" / dataset
        # Build the membership set once rather than once per gene.
        cached_ids = {g for g in gene_ids if (api_dir / f"{_safe(g)}.parquet").exists()}
        cached = [g for g in gene_ids if g in cached_ids]
        missing = [g for g in gene_ids if g not in cached_ids]

        frames = [read_parquet(api_dir / f"{_safe(g)}.parquet") for g in cached]
        if missing:
            async with AsyncExitStack() as stack:
                client = self._client or await stack.enter_async_context(AGRClient())
                frames.extend(
                    await asyncio.gather(
                        *[self._fetch_one(client, api, g, api_dir) for g in missing]
                    )
                )
        if not frames:
            return pd.DataFrame(columns=[api.join_key])
        return pd.concat(frames, ignore_index=True)

    async def _fetch_one(
        self, client: AGRClient, api: ApiSpec, gene_id: str, api_dir: Path
    ) -> pd.DataFrame:
        """Fetch every page for one gene, project the rows and cache the frame.

        A gene with no results is cached too, as an empty frame with just the
        join key column.
        """
        results = await _fetch_all_pages(client, api.endpoint.format(gene_id=gene_id))
        rows = [api.project(gene_id, r) for r in results]
        frame = pd.DataFrame(rows) if rows else pd.DataFrame(columns=[api.join_key])
        write_parquet(frame, api_dir / f"{_safe(gene_id)}.parquet")
        return frame

    def _load_external(self, name: str) -> tuple[pd.DataFrame, str]:
        """Load an ingested annotation, prefixing its columns with ``name.``.

        The `GeneId` column keeps its name because it is the join key.

        Raises:
            KeyError: No annotation called `name` has been ingested.
        """
        path = self._cache / "external" / f"{name}.parquet"
        if not path.exists():
            raise KeyError(
                f"unknown source {name!r}: not an AGRDataset or an ingested annotation"
            )
        frame = read_parquet(path)
        frame = frame.rename(
            columns={c: f"{name}.{c}" for c in frame.columns if c != _GENE_ID}
        )
        return frame, _GENE_ID
287
+
288
+
289
+ def _select_download(files: list[DownloadFile], spec: BulkSpec) -> DownloadFile:
290
+ match = next(
291
+ (
292
+ f
293
+ for f in files
294
+ if f.dataType == spec.data_type
295
+ and f.fileType == spec.file_type
296
+ and f.dataSubType == spec.data_sub_type
297
+ ),
298
+ None,
299
+ )
300
+ if match is None:
301
+ raise LookupError(
302
+ f"no download matching {spec.data_type}/{spec.file_type}/{spec.data_sub_type}"
303
+ )
304
+ return match
305
+
306
+
307
async def _fetch_all_pages(
    client: AGRClient, endpoint: str, page_size: int = _PAGE_SIZE
) -> list[dict]:
    """Fetch every page of a paginated endpoint and return the merged results.

    The first page is requested alone; its ``total`` field (defaulting to the
    number of results it carried) determines how many further pages exist.
    Those are then requested concurrently and appended in page order.
    """
    head = await client.get_json(endpoint, params={"limit": page_size, "page": 1})
    collected = list(head.get("results", []))
    total = head.get("total", len(collected))
    page_count = math.ceil(total / page_size)
    if page_count <= 1:
        return collected

    pending = [
        client.get_json(endpoint, params={"limit": page_size, "page": number})
        for number in range(2, page_count + 1)
    ]
    for payload in await asyncio.gather(*pending):
        collected.extend(payload.get("results", []))
    return collected
323
+
324
+
325
+ def _safe(gene_id: str) -> str:
326
+ return gene_id.replace(":", "_").replace("/", "_")
327
+
328
+
329
+ def _read_table(path: str | Path) -> pd.DataFrame:
330
+ suffix = Path(path).suffix.lower()
331
+ if suffix == ".parquet":
332
+ return pd.read_parquet(path)
333
+ if suffix in (".tsv", ".tab", ".txt"):
334
+ return pd.read_csv(path, sep="\t")
335
+ if suffix == ".csv":
336
+ return pd.read_csv(path)
337
+ raise ValueError(
338
+ f"unsupported annotation file type {suffix!r} for {path!r} "
339
+ "(expected .csv, .tsv/.tab/.txt, or .parquet)"
340
+ )
341
+
342
+
343
+ def _ingest_summary(
344
+ name: str,
345
+ rows_in: int | None,
346
+ rows_stored: int | None,
347
+ rows_dropped: int | None,
348
+ normalized: bool | None,
349
+ ) -> dict:
350
+ return {
351
+ "annotation_name": name,
352
+ "rows_in": rows_in,
353
+ "rows_stored": rows_stored,
354
+ "rows_dropped_unmapped": rows_dropped,
355
+ "normalized": normalized,
356
+ }
geneharmony/client.py ADDED
@@ -0,0 +1,100 @@
1
+ """Async HTTP client for the Alliance of Genome Resources REST API.
2
+
3
+ A single `AGRClient` owns one pooled `httpx.AsyncClient` and bounds in-flight
4
+ requests with a semaphore. GETs retry transient failures (429/5xx, timeouts,
5
+ transport errors) with exponential backoff and jitter, honoring `Retry-After`.
6
+ """
7
+
8
+ import asyncio
9
+ import random
10
+ from collections.abc import Mapping
11
+ from datetime import datetime, timezone
12
+ from email.utils import parsedate_to_datetime
13
+ from typing import Any, Final, Self
14
+
15
+ import httpx
16
+ from pydantic import TypeAdapter
17
+
18
+ from .models import DownloadFile
19
+
20
+ AGR_BASE_URL: Final = "https://www.alliancegenome.org/api"
21
+
22
+ type Params = Mapping[str, str | int | bool]
23
+
24
+ _RETRYABLE_STATUS: Final[frozenset[int]] = frozenset({429, 502, 503, 504})
25
+ _DOWNLOADS_ADAPTER: Final = TypeAdapter(list[DownloadFile])
26
+
27
+
28
+ def _parse_retry_after(value: str | None) -> float | None:
29
+ if value is None:
30
+ return None
31
+ if value.isdigit():
32
+ return float(value)
33
+ try:
34
+ retry_at = parsedate_to_datetime(value)
35
+ except (TypeError, ValueError):
36
+ return None
37
+ return max(0.0, (retry_at - datetime.now(timezone.utc)).total_seconds())
38
+
39
+
40
class AGRClient:
    """Async AGR REST client with bounded concurrency and retrying GETs.

    One pooled `httpx.AsyncClient` is owned per instance. A semaphore limits
    in-flight requests; retryable statuses and transport errors are retried
    with jittered exponential backoff, or after the server's ``Retry-After``
    delay when one is supplied. Usable as an async context manager.
    """

    def __init__(
        self,
        base_url: str = AGR_BASE_URL,
        *,
        max_concurrent: int = 5,
        timeout: httpx.Timeout = httpx.Timeout(10.0, read=120.0),
        max_retries: int = 4,
        backoff_base: float = 0.5,
        backoff_cap: float = 30.0,
    ) -> None:
        """Create the pooled HTTP client and store the retry policy.

        Args:
            base_url: Prefix for every request path.
            max_concurrent: Connection-pool size and in-flight request bound.
            timeout: httpx timeout configuration.
            max_retries: Retries after the first attempt.
            backoff_base: Base, in seconds, of the exponential backoff.
            backoff_cap: Upper bound, in seconds, of the backoff window.
        """
        self._max_retries = max_retries
        self._backoff_base = backoff_base
        self._backoff_cap = backoff_cap
        self._sem = asyncio.Semaphore(max_concurrent)
        self._client = httpx.AsyncClient(
            base_url=base_url,
            timeout=timeout,
            limits=httpx.Limits(max_connections=max_concurrent),
        )

    async def get_json(self, path: str, params: Params | None = None) -> Any:
        """GET `path` and return the decoded JSON body."""
        response = await self._get(path, params)
        return response.json()

    async def get_text(self, path: str, params: Params | None = None) -> str:
        """GET `path` and return the body as text."""
        response = await self._get(path, params)
        return response.text

    async def list_downloads(self) -> list[DownloadFile]:
        """Fetch ``/downloads`` and validate it into `DownloadFile` models."""
        payload = await self.get_json("/downloads")
        return _DOWNLOADS_ADAPTER.validate_python(payload)

    async def _get(self, path: str, params: Params | None) -> httpx.Response:
        """Issue a GET, retrying transient failures.

        Raises:
            httpx.HTTPStatusError: Non-retryable error status, or a retryable
                one that persists through the final attempt.
            httpx.TransportError: Transport failure on the final attempt.
        """
        final_attempt = self._max_retries
        for attempt in range(final_attempt + 1):
            try:
                # The semaphore is held only for the request itself, never
                # while sleeping between attempts.
                async with self._sem:
                    response = await self._client.get(path, params=params)
            except (httpx.TransportError, httpx.TimeoutException):
                if attempt == final_attempt:
                    raise
                pause = self._backoff(attempt)
            else:
                retryable = response.status_code in _RETRYABLE_STATUS
                if attempt == final_attempt or not retryable:
                    response.raise_for_status()
                    return response
                hinted = _parse_retry_after(response.headers.get("Retry-After"))
                pause = self._backoff(attempt) if hinted is None else hinted
            await asyncio.sleep(pause)
        raise AssertionError("retry loop exited without returning")

    def _backoff(self, attempt: int) -> float:
        """Return a random delay in ``[0, min(cap, base * 2**attempt)]`` seconds."""
        ceiling = min(self._backoff_cap, self._backoff_base * 2**attempt)
        return random.uniform(0.0, ceiling)

    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def __aenter__(self) -> Self:
        return self

    async def __aexit__(self, *_exc_info: object) -> None:
        await self.aclose()
@@ -0,0 +1,107 @@
1
+ """Registry of AGR datasets the annotator can pull, with their backends.
2
+
3
+ Each `AGRDataset` maps to a `DatasetSpec` describing how to obtain it:
4
+ - `bulk` — a selector into the `/downloads` listing (matched at runtime, never a
5
+ hardcoded `s3Url`) plus the column its rows join on.
6
+ - `api` — a per-gene endpoint template, the column its projected rows join on,
7
+ and a `project` callable flattening one API result into a single flat row.
8
+
9
+ Orthology is served from its bulk TSV (complete, richly columned); phenotypes and
10
+ alleles from the per-gene API (their bulk files are nested per-MOD JSON, deferred).
11
+ The API orthology projection mirrors the bulk column names so either backend
12
+ yields the same `Gene1ID`/`Gene2ID`/`Gene2SpeciesTaxonID` shape. `GENE` is the
13
+ bulk file backing the in-memory gene index — downloaded through the same path,
14
+ but built into a `GeneIndex` rather than joined onto a base frame.
15
+ """
16
+
17
+ import enum
18
+ from collections.abc import Callable
19
+ from typing import Any, Final, NamedTuple
20
+
21
+ type Json = dict[str, Any]
22
+ type Projector = Callable[[str, Json], dict[str, Any]]
23
+
24
+
25
+ class AGRDataset(enum.StrEnum):
26
+ GENE = "gene"
27
+ ORTHOLOGY = "orthology"
28
+ PHENOTYPES = "phenotypes"
29
+ ALLELES = "alleles"
30
+
31
+
32
class BulkSpec(NamedTuple):
    """Selector for one bulk file in the downloads listing, plus its join column."""

    # The first three fields are compared against a listed file's
    # dataType / fileType / dataSubType.
    data_type: str
    file_type: str
    data_sub_type: str
    # Column of the bulk table that rows are joined on.
    join_key: str
37
+
38
+
39
class ApiSpec(NamedTuple):
    """Per-gene API backend: endpoint template, join column and row projector."""

    # Endpoint template containing a ``{gene_id}`` placeholder.
    endpoint: str
    # Column of the projected rows that is joined on.
    join_key: str
    # Callable turning ``(gene_id, api_result)`` into one flat row dict.
    project: Projector
43
+
44
+
45
class DatasetSpec(NamedTuple):
    """How a dataset can be obtained: via a bulk file, the per-gene API, or both."""

    # Bulk-file backend, or None when the dataset has none.
    bulk: BulkSpec | None
    # Per-gene API backend, or None when the dataset has none.
    api: ApiSpec | None
48
+
49
+
50
+ def _project_orthologs(gene_id: str, result: Json) -> dict[str, Any]:
51
+ g = result.get("geneToGeneOrthologyGenerated", {})
52
+ subject = g.get("subjectGene", {})
53
+ obj = g.get("objectGene", {})
54
+ return {
55
+ "Gene1ID": subject.get("primaryExternalId", gene_id),
56
+ "Gene1Symbol": subject.get("geneSymbol", {}).get("displayText"),
57
+ "Gene1SpeciesTaxonID": subject.get("taxon", {}).get("curie"),
58
+ "Gene2ID": obj.get("primaryExternalId"),
59
+ "Gene2Symbol": obj.get("geneSymbol", {}).get("displayText"),
60
+ "Gene2SpeciesTaxonID": obj.get("taxon", {}).get("curie"),
61
+ "Gene2SpeciesName": obj.get("taxon", {}).get("name"),
62
+ "Confidence": g.get("confidence", {}).get("name"),
63
+ "IsBestScore": g.get("isBestScore", {}).get("name"),
64
+ "IsBestRevScore": g.get("isBestScoreReverse", {}).get("name"),
65
+ }
66
+
67
+
68
+ def _project_phenotypes(gene_id: str, result: Json) -> dict[str, Any]:
69
+ return {
70
+ "gene_id": gene_id,
71
+ "phenotypeStatement": result.get("phenotypeStatement"),
72
+ "references": "|".join(result.get("pubmedPubModIDs") or []),
73
+ }
74
+
75
+
76
+ def _project_alleles(gene_id: str, result: Json) -> dict[str, Any]:
77
+ variants = result.get("variantList") or []
78
+ variant = variants[0] if variants else {}
79
+ return {
80
+ "gene_id": gene_id,
81
+ "allele_id": (result.get("allele") or {}).get("curie"),
82
+ "symbol": result.get("symbol"),
83
+ "alterationType": result.get("alterationType"),
84
+ "hasPhenotype": result.get("hasPhenotype", False),
85
+ "hasDisease": result.get("hasDisease", False),
86
+ "variantType": (variant.get("variantType") or {}).get("name"),
87
+ }
88
+
89
+
90
# Registry of every dataset and the backend(s) it can be obtained from.
DATASETS: Final[dict[AGRDataset, DatasetSpec]] = {
    # Bulk only.
    AGRDataset.GENE: DatasetSpec(
        bulk=BulkSpec(
            data_type="GENE",
            file_type="TSV",
            data_sub_type="COMBINED",
            join_key="GeneId",
        ),
        api=None,
    ),
    # Both backends; they share the ``Gene1ID`` join column.
    AGRDataset.ORTHOLOGY: DatasetSpec(
        bulk=BulkSpec(
            data_type="ORTHOLOGY-ALLIANCE",
            file_type="TSV",
            data_sub_type="COMBINED",
            join_key="Gene1ID",
        ),
        api=ApiSpec(
            endpoint="/gene/{gene_id}/orthologs",
            join_key="Gene1ID",
            project=_project_orthologs,
        ),
    ),
    # API only.
    AGRDataset.PHENOTYPES: DatasetSpec(
        bulk=None,
        api=ApiSpec(
            endpoint="/gene/{gene_id}/phenotypes",
            join_key="gene_id",
            project=_project_phenotypes,
        ),
    ),
    # API only.
    AGRDataset.ALLELES: DatasetSpec(
        bulk=None,
        api=ApiSpec(
            endpoint="/gene/{gene_id}/alleles",
            join_key="gene_id",
            project=_project_alleles,
        ),
    ),
}
@@ -0,0 +1,96 @@
1
+ """Streaming file downloader for arbitrary HTTP(S) URLs.
2
+
3
+ `Downloader` fetches a file from any absolute URL, streaming it to disk via an
4
+ atomic temp-then-rename write; an existing file whose byte size already matches
5
+ the expected size is left untouched. Bytes are written verbatim — compressed
6
+ files stay compressed on disk and are inflated downstream at ingest. It is not
7
+ tied to any particular host or data provider.
8
+ """
9
+
10
+ import asyncio
11
+ import os
12
+ import random
13
+ from pathlib import Path
14
+ from typing import Final, Self
15
+ import httpx
16
+
17
+ _CHUNK_SIZE: Final = 1 << 20
18
+ _RETRYABLE_STATUS: Final[frozenset[int]] = frozenset({429, 500, 502, 503, 504})
19
+
20
+
21
class SizeMismatchError(RuntimeError):
    """Raised when a downloaded file's byte count differs from the expected size.

    The URL and both sizes are kept as attributes for programmatic handling.
    """

    def __init__(self, url: str, expected: int, actual: int) -> None:
        self.url, self.expected, self.actual = url, expected, actual
        message = f"{url}: expected {expected} bytes, got {actual}"
        super().__init__(message)
27
+
28
+
29
class Downloader:
    """Stream files from absolute HTTP(S) URLs to disk, with retries.

    Data is written to a sibling ``<name>.part`` file and moved into place
    with `os.replace` only after a complete, size-checked download. A
    semaphore bounds concurrent transfers. Usable as an async context manager.
    """

    def __init__(
        self,
        *,
        max_concurrent: int = 3,
        timeout: httpx.Timeout = httpx.Timeout(10.0, read=None),
        max_retries: int = 3,
        backoff_base: float = 0.5,
        backoff_cap: float = 30.0,
    ) -> None:
        """Create the pooled, redirect-following HTTP client and retry policy.

        Args:
            max_concurrent: Connection-pool size and concurrent transfer bound.
            timeout: httpx timeout configuration.
            max_retries: Retries after the first attempt.
            backoff_base: Base, in seconds, of the exponential backoff.
            backoff_cap: Upper bound, in seconds, of the backoff window.
        """
        self._client = httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
            limits=httpx.Limits(max_connections=max_concurrent),
        )
        self._sem = asyncio.Semaphore(max_concurrent)
        self._max_retries = max_retries
        self._backoff_base = backoff_base
        self._backoff_cap = backoff_cap

    async def download(
        self, url: str, dest: Path, *, expected_size: int | None = None
    ) -> Path:
        """Download `url` to `dest` and return `dest`.

        An existing `dest` is left untouched when no `expected_size` is given
        or when its size already matches. Transport errors and retryable HTTP
        statuses are retried with jittered exponential backoff. The ``.part``
        file is removed whenever the download ends in an exception.

        Raises:
            SizeMismatchError: The byte count differs from `expected_size`.
            httpx.HTTPStatusError: Non-retryable status, or retries exhausted.
            httpx.TransportError: Transport failure on the final attempt.
        """
        if dest.exists() and (expected_size is None or dest.stat().st_size == expected_size):
            return dest
        dest.parent.mkdir(parents=True, exist_ok=True)
        tmp = dest.with_name(dest.name + ".part")
        try:
            for attempt in range(self._max_retries + 1):
                last = attempt == self._max_retries
                try:
                    size = await self._stream_to(url, tmp)
                except (httpx.TransportError, httpx.TimeoutException):
                    if last:
                        raise
                except httpx.HTTPStatusError as exc:
                    if last or exc.response.status_code not in _RETRYABLE_STATUS:
                        raise
                else:
                    if expected_size is not None and size != expected_size:
                        raise SizeMismatchError(url, expected_size, size)
                    os.replace(tmp, dest)
                    return dest
                await asyncio.sleep(self._backoff(attempt))
        except BaseException:
            # Covers terminal errors, size mismatches and cancellation alike:
            # never leave a partial file behind.
            tmp.unlink(missing_ok=True)
            raise
        raise AssertionError("retry loop exited without returning")

    async def _stream_to(self, url: str, tmp: Path) -> int:
        """Stream the response body of `url` into `tmp`; return bytes written.

        Raises:
            httpx.HTTPStatusError: The response status is an error; the body is
                read first so it is available on the exception's response.
        """
        async with self._sem, self._client.stream("GET", url) as response:
            if response.is_error:
                await response.aread()
                response.raise_for_status()
            written = 0
            with tmp.open("wb") as f:
                async for chunk in response.aiter_bytes(_CHUNK_SIZE):
                    written += f.write(chunk)
            return written

    def _backoff(self, attempt: int) -> float:
        """Return a random delay in ``[0, min(cap, base * 2**attempt)]`` seconds."""
        return random.uniform(0.0, min(self._backoff_cap, self._backoff_base * 2**attempt))

    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def __aenter__(self) -> Self:
        return self

    async def __aexit__(self, *_exc_info: object) -> None:
        await self.aclose()
geneharmony/ingest.py ADDED
@@ -0,0 +1,22 @@
1
+ """Decompress and parse cached AGR bulk download files.
2
+
3
+ Bulk files are stored gzipped (`.json.gz` / `.tsv.gz`). These helpers read
4
+ straight from the compressed file into memory — JSON into the parsed object,
5
+ TSV into a DataFrame — so no decompressed copy is written to disk. AGR TSV files
6
+ carry a leading block of `#` comment lines before the header, which is skipped.
7
+ """
8
+
9
+ import gzip
10
+ import json
11
+ from pathlib import Path
12
+ from typing import Any
13
+ import pandas as pd
14
+
15
+
16
def load_json_gz(path: Path) -> Any:
    """Parse a gzip-compressed UTF-8 JSON file and return the decoded object."""
    with gzip.open(path, mode="rt", encoding="utf-8") as handle:
        text = handle.read()
    return json.loads(text)
19
+
20
+
21
+ def load_tsv_gz(path: Path, dtype: type[str] | None = None) -> pd.DataFrame:
22
+ return pd.read_csv(path, sep="\t", comment="#", compression="gzip", dtype=dtype)
geneharmony/models.py ADDED
@@ -0,0 +1,15 @@
1
+ from datetime import datetime
2
+ from pydantic import BaseModel, PositiveInt
3
+
4
class DownloadFile(BaseModel):
    """One entry of the AGR ``/downloads`` listing.

    Field names use the listing's camelCase spelling rather than snake_case so
    the payload validates without aliases.
    """

    # Location and identity of the file.
    filename: str
    s3Path: str
    s3Url: str
    stableURL: str

    # Release metadata.
    releaseVersion: str
    size: PositiveInt
    lastModified: datetime

    # Classification fields; the first three are what a bulk selector matches.
    dataType: str
    fileType: str
    dataSubType: str
    fileExtension: str
@@ -0,0 +1,190 @@
1
+ """In-memory gene normalizer built from the GENE-TSV-COMBINED bulk file.
2
+
3
+ `load_gene_index` reads the file and precomputes O(1) lookups from every
4
+ identifier form — primary ID, deprecated (secondary) ID, official symbol,
5
+ synonym, systematic name and external cross-reference (e.g. `NCBI_Gene:`,
6
+ `ENSEMBL:`, `UniProtKB:`) — to row positions in the loaded table.
7
+
8
+ `GeneIndex.lookup` takes one query or a list and returns a DataFrame with one
9
+ row per match: the original `query`, the `match_kind`, and every column of the
10
+ matched gene record. Matches are ranked by precedence (primary ID > secondary ID
11
+ > official symbol > synonym > cross-reference); `limit` caps matches per query and `taxon` narrows
12
+ symbols that recur across species. Unmatched queries are still returned, with a
13
+ null `match_kind`. Matching is case-sensitive unless `case_insensitive=True`,
14
+ since case can be meaningful across species (human TP53 vs mouse Trp53).
15
+ """
16
+
17
+ import enum
18
+ from dataclasses import dataclass
19
+ from pathlib import Path
20
+ from typing import Final, NamedTuple
21
+ import pandas as pd
22
+
23
+ from .ingest import load_tsv_gz
24
+ from .taxa import resolve_taxon
25
+
26
+ type _Tables = dict["MatchKind", dict[str, list[int]]]
27
+
28
+ # Cross-reference databases whose IDs denote protein families / enzyme classes
29
+ # rather than genes; one such token fans out to hundreds of genes, so they are
30
+ # excluded from the index. Keys are the token prefix before the first ':'.
31
+ _XREF_EXCLUDED_PREFIXES: Final[frozenset[str]] = frozenset(
32
+ {"PANTHER", "TreeFam", "ExPASy", "TCDB"}
33
+ )
34
+
35
+
36
class MatchKind(enum.IntEnum):
    """How a query matched a gene record.

    Members are declared in precedence order, lowest value first, so iterating
    the enum visits the strongest kind of match first.
    """

    PRIMARY_ID = 0
    SECONDARY_ID = 1
    OFFICIAL_SYMBOL = 2
    SYNONYM = 3
    CROSS_REFERENCE = 4
42
+
43
+
44
class GeneMatch(NamedTuple):
    """A single lookup hit: which record matched, and by what kind of identifier."""

    # Positional index of the matched row in the records table.
    row: int
    # The kind of identifier that produced the match.
    kind: MatchKind
47
+
48
+
49
@dataclass(slots=True)
class GeneIndex:
    """In-memory lookup from identifier strings to rows of the gene table.

    `_exact` maps each `MatchKind` to a ``{identifier: [row, ...]}`` table;
    `_folded` is the case-folded equivalent, derived on the first
    case-insensitive lookup. `_taxon_ids` holds the taxon of every row, by
    position.
    """

    records: pd.DataFrame
    _taxon_ids: tuple[str, ...]
    _exact: _Tables
    _folded: _Tables | None = None

    def lookup(
        self,
        queries: str | list[str],
        *,
        taxon: str | None = None,
        limit: int | None = 1,
        case_insensitive: bool = False,
    ) -> pd.DataFrame:
        """Resolve one query or a list of queries against the index.

        Args:
            queries: A single identifier or a list of identifiers.
            taxon: Restrict matches to this taxon (resolved by `resolve_taxon`).
            limit: Maximum matches kept per query; ``None`` keeps them all.
            case_insensitive: Compare case-folded identifiers.

        Returns:
            One row per match in query order: ``query``, ``match_kind`` (the
            kind's name), then every column of the matched record. A query
            without matches yields a single row with a null ``match_kind``.
        """
        batch = [queries] if isinstance(queries, str) else queries
        taxon_id = None if taxon is None else resolve_taxon(taxon).id

        # hits: (query position, query, kind name, record row)
        hits: list[tuple[int, str, str, int]] = []
        # misses: (query position, query)
        misses: list[tuple[int, str]] = []
        for position, query in enumerate(batch):
            found = self._resolve(query, taxon_id, case_insensitive)
            if limit is not None:
                found = found[:limit]
            if not found:
                misses.append((position, query))
                continue
            hits.extend((position, query, hit.kind.name, hit.row) for hit in found)

        matched = self.records.iloc[[hit[3] for hit in hits]].reset_index(drop=True)
        matched.insert(0, "match_kind", [hit[2] for hit in hits])
        matched.insert(0, "query", [hit[1] for hit in hits])
        matched.insert(0, "_order", [hit[0] for hit in hits])

        if misses:
            missed = pd.DataFrame(
                {
                    "_order": [miss[0] for miss in misses],
                    "query": [miss[1] for miss in misses],
                    "match_kind": None,
                }
            )
            matched = pd.concat([matched, missed], ignore_index=True)

        # A stable sort on the query position interleaves misses with hits
        # while keeping each query's matches in precedence order.
        ordered = matched.sort_values("_order", kind="stable")
        return ordered.drop(columns="_order").reset_index(drop=True)

    def _resolve(self, query: str, taxon_id: str | None, case_insensitive: bool) -> list[GeneMatch]:
        """Return the matches for `query`, strongest kind first, one per record.

        When `taxon_id` is given, records of other taxa are ignored. A record
        reachable through several kinds is reported once, under the first
        (strongest) kind that reached it.
        """
        if case_insensitive:
            tables, key = self._folded_tables(), query.casefold()
        else:
            tables, key = self._exact, query

        seen: set[int] = set()
        unique: list[GeneMatch] = []
        for kind in MatchKind:
            for row in tables[kind].get(key, ()):
                if taxon_id is not None and self._taxon_ids[row] != taxon_id:
                    continue
                if row in seen:
                    continue
                seen.add(row)
                unique.append(GeneMatch(row, kind))
        return unique

    def _folded_tables(self) -> _Tables:
        """Return the case-folded tables, building and caching them on first use."""
        if self._folded is None:
            self._folded = _fold(self._exact)
        return self._folded
131
+
132
+
133
+ def _fold(tables: _Tables) -> _Tables:
134
+ folded: _Tables = {}
135
+ for kind, table in tables.items():
136
+ merged: dict[str, list[int]] = {}
137
+ for key, table_rows in table.items():
138
+ merged.setdefault(key.casefold(), []).extend(table_rows)
139
+ folded[kind] = merged
140
+ return folded
141
+
142
+
143
def build_gene_index(records: pd.DataFrame) -> GeneIndex:
    """Build a `GeneIndex` from the gene table.

    Reads the columns ``GeneId``, ``GeneSymbol``, ``GeneSynonyms``,
    ``GeneSystematicName``, ``GeneSecondaryIds``, ``GeneCrossReferences`` and
    ``Taxon``. Multi-valued columns are split on ``|``; non-string cells and
    empty tokens are ignored. Cross-references whose prefix (the text before
    the first ``:``) is in the excluded set are not indexed.
    """

    def index_tokens(
        column: str,
        table: dict[str, list[int]],
        skip_prefixes: frozenset[str] = frozenset(),
    ) -> None:
        # Add every '|'-separated token of `column` to `table`, row by row.
        for position, cell in enumerate(records[column].tolist()):
            if not isinstance(cell, str):
                continue
            for token in cell.split("|"):
                if not token or token.split(":", 1)[0] in skip_prefixes:
                    continue
                table.setdefault(token, []).append(position)

    primary: dict[str, list[int]] = {}
    for position, gene_id in enumerate(records["GeneId"].tolist()):
        primary.setdefault(gene_id, []).append(position)

    official: dict[str, list[int]] = {}
    for position, symbol in enumerate(records["GeneSymbol"].tolist()):
        if isinstance(symbol, str):
            official.setdefault(symbol, []).append(position)

    # Synonyms and systematic names share one table; all synonym rows are
    # added before any systematic-name rows.
    synonym: dict[str, list[int]] = {}
    index_tokens("GeneSynonyms", synonym)
    index_tokens("GeneSystematicName", synonym)

    secondary: dict[str, list[int]] = {}
    index_tokens("GeneSecondaryIds", secondary)

    cross_reference: dict[str, list[int]] = {}
    index_tokens("GeneCrossReferences", cross_reference, _XREF_EXCLUDED_PREFIXES)

    exact: _Tables = {
        MatchKind.PRIMARY_ID: primary,
        MatchKind.SECONDARY_ID: secondary,
        MatchKind.OFFICIAL_SYMBOL: official,
        MatchKind.SYNONYM: synonym,
        MatchKind.CROSS_REFERENCE: cross_reference,
    }
    taxon_ids = tuple(records["Taxon"].tolist())
    return GeneIndex(records, taxon_ids, exact)
187
+
188
+
189
def load_gene_index(path: Path) -> GeneIndex:
    """Read the gzipped gene TSV at `path` (all columns as text) and index it."""
    gene_table = load_tsv_gz(path, dtype=str)
    return build_gene_index(gene_table)
geneharmony/store.py ADDED
@@ -0,0 +1,61 @@
1
+ """Atomic Parquet persistence for cached frames.
2
+
3
+ Backs the bulk, per-gene API, and external-annotation caches. Writes go through a
4
+ same-directory temp file then `os.replace`, so a reader never sees a half-written
5
+ file. Object columns holding dicts/lists are JSON-encoded to strings before
6
+ writing (Parquet can't represent them natively); `read_parquet(decode_json=...)`
7
+ reverses that for the columns a caller knows were nested.
8
+ """
9
+
10
+ import json
11
+ import os
12
+ import tempfile
13
+ from collections.abc import Callable, Iterable
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ import pandas as pd
18
+
19
+
20
+ def _is_missing(value: Any) -> bool:
21
+ return value is None or (isinstance(value, float) and pd.isna(value))
22
+
23
+
24
def _encode_nested(df: pd.DataFrame) -> pd.DataFrame:
    """Return `df` with dict/list columns serialised to JSON strings.

    A column counts as nested when its first non-missing value is a `dict` or
    a `list`; only that first value is inspected. Missing cells in a nested
    column become `None`. When no column qualifies the input frame itself is
    returned (no copy); otherwise a copy is modified and returned, leaving
    `df` untouched.
    """
    nested_columns = []
    for name in df.columns:
        first_present = None
        for cell in df[name]:
            if not _is_missing(cell):
                first_present = cell
                break
        if isinstance(first_present, (dict, list)):
            nested_columns.append(name)

    if nested_columns:
        encoded = df.copy()
        for name in nested_columns:
            encoded[name] = [
                None if _is_missing(cell) else json.dumps(cell) for cell in encoded[name]
            ]
        return encoded
    return df
36
+
37
+
38
+ def _atomic_write(path: Path, write_fn: Callable[[Path], None]) -> None:
39
+ fd, tmp_name = tempfile.mkstemp(dir=path.parent, prefix=f".{path.name}.", suffix=".tmp")
40
+ os.close(fd)
41
+ tmp = Path(tmp_name)
42
+ try:
43
+ write_fn(tmp)
44
+ os.replace(tmp, path)
45
+ except BaseException:
46
+ tmp.unlink(missing_ok=True)
47
+ raise
48
+
49
+
50
def write_parquet(df: pd.DataFrame, path: Path) -> None:
    """Persist `df` to `path` as zstd-compressed Parquet without its index.

    Parent directories are created as needed, dict/list columns are
    JSON-encoded via `_encode_nested`, and the file is written through
    `_atomic_write`.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = _encode_nested(df)

    def dump(target: Path) -> None:
        payload.to_parquet(target, compression="zstd", index=False)

    _atomic_write(path, dump)
54
+
55
+
56
def read_parquet(path: Path, *, decode_json: Iterable[str] = ()) -> pd.DataFrame:
    """Read the Parquet file at `path`, JSON-decoding the named columns.

    Each column listed in `decode_json` that exists in the frame has its
    non-missing cells passed through `json.loads`; missing cells become
    `None`. Names that are not columns of the frame are ignored.
    """
    frame = pd.read_parquet(path)
    for name in decode_json:
        if name not in frame.columns:
            continue
        decoded = []
        for cell in frame[name]:
            decoded.append(None if _is_missing(cell) else json.loads(cell))
        frame[name] = decoded
    return frame
geneharmony/taxa.json ADDED
@@ -0,0 +1,11 @@
1
+ [
2
+ {"id": "NCBITaxon:9606", "species": "Homo sapiens", "common": ["human"]},
3
+ {"id": "NCBITaxon:10090", "species": "Mus musculus", "common": ["mouse"]},
4
+ {"id": "NCBITaxon:10116", "species": "Rattus norvegicus", "common": ["rat"]},
5
+ {"id": "NCBITaxon:7955", "species": "Danio rerio", "common": ["zebrafish"]},
6
+ {"id": "NCBITaxon:7227", "species": "Drosophila melanogaster", "common": ["fly", "fruit fly"]},
7
+ {"id": "NCBITaxon:6239", "species": "Caenorhabditis elegans", "common": ["worm", "roundworm"]},
8
+ {"id": "NCBITaxon:559292", "species": "Saccharomyces cerevisiae S288C", "common": ["yeast", "budding yeast", "saccharomyces cerevisiae"]},
9
+ {"id": "NCBITaxon:8355", "species": "Xenopus laevis", "common": ["african clawed frog"]},
10
+ {"id": "NCBITaxon:8364", "species": "Xenopus tropicalis", "common": ["western clawed frog", "tropical clawed frog"]}
11
+ ]
geneharmony/taxa.py ADDED
@@ -0,0 +1,86 @@
1
+ """Species taxon resolution, built from `taxa.json`.
2
+
3
+ One entry per AGR species. `resolve_taxon` maps any alias — canonical
4
+ `NCBITaxon:` ID, bare number, species name or common name — to its `Taxon`
5
+ record; `taxon_mapper` builds a `value -> field` callable for annotating a taxon
6
+ column of a DataFrame.
7
+ """
8
+
9
+ import enum
10
+ import json
11
+ from collections.abc import Callable
12
+ from pathlib import Path
13
+ from typing import Final, NamedTuple
14
+ from importlib.resources import files
15
+
16
+ _TAXA_PATH = files("geneharmony").joinpath("taxa.json")
17
+
18
+
19
+ class Taxon(NamedTuple):
20
+ """A resolved species: its canonical NCBITaxon ID, species name and common names."""
21
+
22
+ id: str
23
+ species: str
24
+ common: tuple[str, ...]
25
+
26
+ @property
27
+ def number(self) -> str:
28
+ """The bare NCBI taxon number, without the `NCBITaxon:` prefix."""
29
+ return self.id.split(":", 1)[1]
30
+
31
+ @property
32
+ def common_name(self) -> str | None:
33
+ """The primary common name, or None if the species has none."""
34
+ return self.common[0] if self.common else None
35
+
36
+
37
class TaxonField(enum.StrEnum):
    """Selects which part of a `Taxon` a `taxon_mapper` callable returns.

    Each member's value is the name of a `Taxon` field or property; the mapper
    reads it with `getattr(taxon, field)`.
    """

    ID = "id"  # Taxon.id
    NUMBER = "number"  # Taxon.number
    SPECIES = "species"  # Taxon.species
    COMMON_NAME = "common_name"  # Taxon.common_name
42
+
43
+
44
def _load_taxa() -> tuple[Taxon, ...]:
    """Parse the packaged `taxa.json` into a tuple of `Taxon` records.

    Each JSON entry must provide `id`, `species` and `common`; the `common`
    list is converted to a tuple. Entries keep their file order.
    """
    # Decode explicitly as UTF-8: with no encoding given, `read_text` falls
    # back to the platform locale, which is not guaranteed to be UTF-8.
    raw = _TAXA_PATH.read_text(encoding="utf-8")
    return tuple(
        Taxon(entry["id"], entry["species"], tuple(entry["common"]))
        for entry in json.loads(raw)
    )
49
+
50
+
51
# All species records from the packaged `taxa.json`, loaded once at import time.
_TAXA: Final[tuple[Taxon, ...]] = _load_taxa()


# Case-folded alias -> Taxon lookup. Every taxon is reachable by its full ID,
# its bare number, its species name and each of its common names. If two taxa
# shared an alias, the one later in `_TAXA` would win.
_TAXON_BY_ALIAS: Final[dict[str, Taxon]] = {
    alias.casefold(): taxon
    for taxon in _TAXA
    for alias in (taxon.id, taxon.number, taxon.species, *taxon.common)
}
59
+
60
+
61
def resolve_taxon(value: str) -> Taxon:
    """Look up the `Taxon` record for a taxon ID, number, species name or common name.

    Matching ignores surrounding whitespace and letter case. Read the piece
    you need from the result via `.id`, `.species`, `.common_name` or
    `.number`.

    Raises:
        ValueError: if `value` is not a known alias.
    """
    key = value.strip().casefold()
    if (match := _TAXON_BY_ALIAS.get(key)) is not None:
        return match
    raise ValueError(f"unknown taxon: {value!r}")
70
+
71
+
72
def taxon_mapper(field: TaxonField) -> Callable[[object], str | None]:
    """Return a callable that maps a taxon alias to the chosen `field`.

    The callable accepts any value. A string that (after stripping whitespace
    and case-folding) is a known alias — ID, number, species or common name —
    yields that taxon's `field`; anything else, including non-strings, yields
    `None`. Typical use: `df[col].map(taxon_mapper(TaxonField.COMMON_NAME))`.
    """

    def mapper(value: object) -> str | None:
        if isinstance(value, str):
            hit = _TAXON_BY_ALIAS.get(value.strip().casefold())
            if hit is not None:
                return getattr(hit, field)
        return None

    return mapper
@@ -0,0 +1,225 @@
1
+ Metadata-Version: 2.4
2
+ Name: geneharmony
3
+ Version: 0.3.0
4
+ Summary: Async toolkit to normalize gene identifiers and annotate gene sets with data from the Alliance of Genome Resources (AGR) or user-ingested datasets.
5
+ Project-URL: Homepage, https://github.com/limenode/geneharmony
6
+ Project-URL: Repository, https://github.com/limenode/geneharmony
7
+ Project-URL: Issues, https://github.com/limenode/geneharmony/issues
8
+ Author-email: Lionel Sequeira <lionelsequeira@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: agr,alliance of genome resources,annotation,bioinformatics,gene,genomics
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: Python :: 3.14
19
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
20
+ Classifier: Typing :: Typed
21
+ Requires-Python: >=3.12
22
+ Requires-Dist: httpx>=0.28
23
+ Requires-Dist: pandas>=3
24
+ Requires-Dist: pyarrow>=14
25
+ Requires-Dist: pydantic>=2
26
+ Description-Content-Type: text/markdown
27
+
28
+ # geneharmony
29
+
30
+ An async Python toolkit that normalizes gene identifiers and annotates gene sets using the [Alliance of Genome Resources](https://www.alliancegenome.org) (AGR) REST API and bulk-download files, with functionality to append local annotations.
31
+
32
+ It resolves gene symbols and identifiers to canonical genes using an in-memory index built from AGR's bulk gene file, fetches per-gene API data concurrently, and downloads and parses AGR bulk datasets.
33
+
34
+ ## Highlights
35
+
36
+ - **Gene normalization**: Resolve symbols, primary/secondary IDs, synonyms, systematic names, and external cross-references (NCBI, Ensembl, UniProtKB, RefSeq, …) to the appropriate records in the AGR's `GENE-TSV-COMBINED` file.
37
+ - **Nine model organisms**: Human, mouse, rat, zebrafish, fly, worm, yeast, african clawed frog, and western clawed frog.
38
+ - **Concurrent and resilient**: Pooled, rate-limited HTTP with automatic retry/backoff for transient failures.
39
+ - **Transparent caching**: Bulk files, per-gene API results, and ingested annotations are cached as Parquet to expedite repeat runs.
40
+ - **Bring your own data**: Ingest external annotation tables keyed on whatever gene identifier you have; they normalize to canonical AGR genes and join cleanly.
41
+
42
+ ## Install
43
+
44
+ Requires **Python 3.12+**. Install from PyPI with pip or uv — all dependencies (`httpx`, `pydantic` v2, `pandas` 3.x, `pyarrow`) are resolved automatically:
45
+
46
+ ```bash
47
+ pip install geneharmony
48
+ # or
49
+ uv add geneharmony
50
+ ```
51
+
52
+ ## Development
53
+
54
+ Contributors use **pixi** (conda-forge) for a reproducible environment from the lockfile. End users do not need pixi.
55
+
56
+ ```bash
57
+ # 1. Install the environment from the lockfile
58
+ pixi install
59
+
60
+ # 2. Run Python inside the environment
61
+ pixi run python <script>
62
+
63
+ # 3. Or open the interactive driver notebook
64
+ pixi run jupyter lab src/notebook.ipynb
65
+ ```
66
+
67
+ Notebook outputs are stripped from version control via a git clean filter. The filter config is repo-local, so enable it once per clone:
68
+
69
+ ```bash
70
+ git config filter.nbstrip.clean "pixi run jupyter nbconvert --clear-output --to notebook --stdin --stdout --log-level=ERROR"
71
+ git config filter.nbstrip.smudge cat
72
+ ```
73
+
74
+ ## Usage
75
+
76
+ The `Annotator` is the single entry point. It is async, so call its methods with `await` (inside a notebook cell, an `async def`, or `asyncio.run(...)`).
77
+
78
+ ### Quick start
79
+
80
+ ```python
81
+ from geneharmony import Annotator, AGRDataset
82
+
83
+ ann = Annotator()
84
+
85
+ # Resolve gene symbols to canonical AGR records
86
+ genes = await ann.normalize(["TP53", "BRCA1"], taxon="human")
87
+
88
+ # Annotate genes with phenotypic information
89
+ annotated_genes = await ann.annotate(["Atp7b", "Ttn"], AGRDataset.PHENOTYPES, taxon="mouse")
90
+ ```
91
+
92
+ ### Resolving genes (`normalize`)
93
+
94
+ `normalize` accepts an identifier or a list of identifiers and returns one row per match. Unmatched queries are **retained** with a null `match_kind` so misses stay visible.
95
+
96
+ ```python
97
+ df = await ann.normalize(
98
+ ["TP53", "ENSG00000141510", "not_a_gene"],
99
+ taxon="human", # any alias: "human", "9606", "Homo sapiens", "NCBITaxon:9606"
100
+ limit=1, # max matches per query; use None for all
101
+ case_insensitive=False, # case can be meaningful (human TP53 vs mouse Trp53)
102
+ )
103
+
104
+ resolved = df[df.match_kind.notna()] # drop the misses
105
+ ```
106
+
107
+ Matches are ranked by identifier precedence:
108
+
109
+ ```
110
+ PRIMARY_ID > SECONDARY_ID > OFFICIAL_SYMBOL > SYNONYM > CROSS_REFERENCE
111
+ ```
112
+
113
+ ### Annotating genes (`annotate`)
114
+
115
+ `annotate` builds a normalized base frame, then **left joins** one or more sources onto the canonical `GeneId`:
116
+
117
+ ```python
118
+ from geneharmony import AGRDataset
119
+
120
+ orth = await ann.annotate(
121
+ ["TP53", "BRCA1"],
122
+ AGRDataset.ORTHOLOGY,
123
+ taxon="human",
124
+ )
125
+ ```
126
+
127
+ For chaining annotate calls, the recommended pattern is an **iterative filter-then-requery traversal** — one AGR dataset per call — so result cardinality stays under your control:
128
+
129
+ ```python
130
+ # 1. Find orthologs of human genes
131
+ orth = await ann.annotate(["TP53", "BRCA1"], AGRDataset.ORTHOLOGY, taxon="human")
132
+
133
+ # 2. Keep the mouse orthologs
134
+ mouse = orth.loc[orth.Gene2SpeciesTaxonID == "NCBITaxon:10090", "Gene2ID"].unique()
135
+
136
+ # 3. Fetch their phenotypes
137
+ pheno = await ann.annotate(list(mouse), AGRDataset.PHENOTYPES, taxon="mouse")
138
+ ```
139
+
140
+ #### Available AGR datasets
141
+
142
+ | Dataset | Backend | Key columns contributed |
143
+ | ----------------------- | ------------ | ----------------------------------------------------------- |
144
+ | `AGRDataset.ORTHOLOGY` | Bulk TSV | `Gene2ID`, `Gene2Symbol`, `Gene2SpeciesTaxonID`, … |
145
+ | `AGRDataset.PHENOTYPES` | Per-gene API | `phenotypeStatement`, `references` |
146
+ | `AGRDataset.ALLELES` | Per-gene API | `allele_id`, `symbol`, `alterationType`, `variantType`, … |
147
+
148
+ ### Orthologs convenience helper
149
+
150
+ For the common ortholog case there is a shortcut that returns a tidy subset:
151
+
152
+ ```python
153
+ orthologs = await ann.get_orthologs(
154
+ ["TP53", "BRCA1"],
155
+ taxon="human",
156
+ target_taxon="mouse", # optional: filter to one target species
157
+ )
158
+ # -> columns: query, match_kind, Gene2ID, Gene2Symbol, Gene2SpeciesTaxonID
159
+ ```
160
+
161
+ ### Downloading bulk datasets (`download`)
162
+
163
+ Bulk datasets are downloaded and converted to Parquet on first use (and cached thereafter). You can pre-fetch one explicitly:
164
+
165
+ ```python
166
+ path = await ann.download(AGRDataset.ORTHOLOGY)
167
+ # Force a refresh across AGR releases:
168
+ path = await ann.download(AGRDataset.ORTHOLOGY, refresh=True)
169
+ ```
170
+
171
+ ### Ingesting your own annotations (`ingest_annotation`)
172
+
173
+ Bring an external table (CSV, TSV, or Parquet file, or a `DataFrame`), normalize its gene identifiers to canonical AGR genes, and store it for joining by name:
174
+
175
+ ```python
176
+ summary, unmapped = await ann.ingest_annotation(
177
+ "my_expression_table.csv",
178
+ name="expression",
179
+ gene_id_column="symbol", # or a list of columns, tried left-to-right per row
180
+ taxon="human",
181
+ )
182
+
183
+ # Join it alongside an AGR dataset; its columns are prefixed `expression.`
184
+ df = await ann.annotate(["TP53", "BRCA1"], AGRDataset.ORTHOLOGY, "expression", taxon="human")
185
+ ```
186
+
187
+ `summary` reports rows in / stored / dropped; `unmapped` holds the rows whose identifiers could not be resolved (with their original columns) so nothing is silently lost.
188
+
189
+ ### Supported species
190
+
191
+ | Common name | Species | Taxon ID |
192
+ | --------------------- | -------------------------------- | ------------------ |
193
+ | human | *Homo sapiens* | `NCBITaxon:9606` |
194
+ | mouse | *Mus musculus* | `NCBITaxon:10090` |
195
+ | rat | *Rattus norvegicus* | `NCBITaxon:10116` |
196
+ | zebrafish | *Danio rerio* | `NCBITaxon:7955` |
197
+ | fly / fruit fly | *Drosophila melanogaster* | `NCBITaxon:7227` |
198
+ | worm / roundworm | *Caenorhabditis elegans* | `NCBITaxon:6239` |
199
+ | yeast / budding yeast | *Saccharomyces cerevisiae S288C* | `NCBITaxon:559292` |
200
+ | african clawed frog | *Xenopus laevis* | `NCBITaxon:8355` |
201
+ | western clawed frog | *Xenopus tropicalis* | `NCBITaxon:8364` |
202
+
203
+ Any of the aliases above — common name, full species name, bare number, or `NCBITaxon:` ID — can be passed as `taxon`.
204
+
205
+ ## Caching
206
+
207
+ Results are cached so repeat work is fast and largely offline. The cache defaults to `$XDG_CACHE_HOME/geneharmony` (falling back to `~/.cache/geneharmony`); pass a `cache_dir` to `Annotator(...)` to share or relocate it.
208
+
209
+ ```
210
+ bulk/<dataset>.parquet # downloaded + converted bulk datasets (incl. the gene index source)
211
+ api/<dataset>/<gene_id>.parquet # per-gene API results
212
+ external/<name>.parquet # ingested annotations
213
+ ```
214
+
215
+ The gene index is built from `bulk/gene.parquet`, which becomes stale across AGR releases. Refresh it with `await ann.download(AGRDataset.GENE, refresh=True)` (or by deleting the file).
216
+
217
+ ## Acknowledgements & Citation
218
+
219
+ This project is a client for data and services provided by the **Alliance of Genome Resources (AGR)**. It is not affiliated with or endorsed by the Alliance. All gene, ortholog, phenotype, and allele data are sourced from AGR and its member model-organism databases, and remain subject to the Alliance's terms of use.
220
+
221
+ If you use data obtained through this wrapper, please cite the Alliance of Genome Resources:
222
+
223
+ > [Updates to the Alliance of Genome Resources central infrastructure.](https://pubmed.ncbi.nlm.nih.gov/38552170/) 2024. Alliance of Genome Resources Consortium. Genetics. 2024 May 7;227(1):iyae049. doi: 10.1093/genetics/iyae049. PMID: 38552170.
224
+
225
+ Please also consult the [Alliance citation and data-usage guidelines](https://www.alliancegenome.org/cite-us) and acknowledge the underlying model-organism databases (e.g. SGD, WormBase, FlyBase, ZFIN, MGI, RGD, Xenbase) as appropriate.
@@ -0,0 +1,15 @@
1
+ geneharmony/__init__.py,sha256=0-sHvuoBiKxIeivfD7iKyrgfeQmBclTQHnnKjQSlXtk,475
2
+ geneharmony/annotator.py,sha256=hXTYESOfC1ADKBsMo5kh-7NLhqPUjrdSfgN5jEpmojw,13459
3
+ geneharmony/client.py,sha256=dG-lm9t5Krv540DEbmpOwJYR6zZbGGyk3aArb_9EDAQ,3523
4
+ geneharmony/datasets.py,sha256=GHWXgNxN3cngUfzOaYxgu4vDzraCtx10gNheDTa3_U4,3839
5
+ geneharmony/downloader.py,sha256=X0OSczbigq6dtEj0LtiLNn3xqHVx5QUh4qvAbOjVg7U,3538
6
+ geneharmony/ingest.py,sha256=-BViPSxGMKCymlLwkCegIVnHckLLxpvbfZ6GrD2oZ7o,757
7
+ geneharmony/models.py,sha256=rGiMCo07EvxV_0GXjwnNE0XUdglsH3yRzPscNaz_b54,326
8
+ geneharmony/normalizer.py,sha256=8fm1gs6GAimBtTum0tFIfCClw0mLpJYeGlfhX6demVo,6600
9
+ geneharmony/store.py,sha256=Ukx0OwPwY0kKkcpUHKBNBdhmhav1gINqTQDYK50Vwy0,1938
10
+ geneharmony/taxa.json,sha256=TiseisPSZw3oj1I2v3y98x2rCtnKhMefBt1PRMuIFkY,864
11
+ geneharmony/taxa.py,sha256=zqUw8Ma62W5yrg6fqAxZNVjxMH9Ca0l_YXXDCmZR89I,2599
12
+ geneharmony-0.3.0.dist-info/METADATA,sha256=5YkxIUwkPmr_tss9XdFR9_OZGXcDWrwuP_eTq8TykgM,10066
13
+ geneharmony-0.3.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
14
+ geneharmony-0.3.0.dist-info/licenses/LICENSE,sha256=r0h8rf3XM4VXvFcvxGHMrqJos3l3v654b1Z2Tb5Tk6w,1072
15
+ geneharmony-0.3.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Lionel Sequeira
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.