geneharmony 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geneharmony/__init__.py +17 -0
- geneharmony/annotator.py +356 -0
- geneharmony/client.py +100 -0
- geneharmony/datasets.py +107 -0
- geneharmony/downloader.py +96 -0
- geneharmony/ingest.py +22 -0
- geneharmony/models.py +15 -0
- geneharmony/normalizer.py +190 -0
- geneharmony/store.py +61 -0
- geneharmony/taxa.json +11 -0
- geneharmony/taxa.py +86 -0
- geneharmony-0.3.0.dist-info/METADATA +225 -0
- geneharmony-0.3.0.dist-info/RECORD +15 -0
- geneharmony-0.3.0.dist-info/WHEEL +4 -0
- geneharmony-0.3.0.dist-info/licenses/LICENSE +21 -0
geneharmony/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""
|
|
2
|
+
geneharmony — async toolkit to normalize gene identifiers and annotate gene sets.
|
|
3
|
+
Resolves symbols/IDs against the Alliance of Genome Resources (AGR) and annotates with data from AGR or user-ingested datasets.
|
|
4
|
+
"""
|
|
5
|
+
__version__ = "0.3.0"
|
|
6
|
+
|
|
7
|
+
from .annotator import Annotator
|
|
8
|
+
from .datasets import AGRDataset
|
|
9
|
+
from .taxa import TaxonField, taxon_mapper, resolve_taxon
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"Annotator",
|
|
13
|
+
"AGRDataset",
|
|
14
|
+
"TaxonField",
|
|
15
|
+
"taxon_mapper",
|
|
16
|
+
"resolve_taxon",
|
|
17
|
+
]
|
geneharmony/annotator.py
ADDED
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
"""User-facing entry point: bulk download, per-gene API, and annotate.
|
|
2
|
+
|
|
3
|
+
`Annotator` ties the lower-level pieces (`AGRClient`, `Downloader`, the gene
|
|
4
|
+
index, the dataset registry) into one object over a resolved cache directory. The
|
|
5
|
+
intended use is an iterative *filter-then-requery* traversal — one primary AGR
|
|
6
|
+
dataset per `annotate` call — so cardinality stays under the caller's control:
|
|
7
|
+
|
|
8
|
+
ann = Annotator()
|
|
9
|
+
orth = await ann.annotate(["TP53", "BRCA1"], AGRDataset.ORTHOLOGY, taxon="human")
|
|
10
|
+
mouse = orth.loc[orth.Gene2SpeciesTaxonID == "NCBITaxon:10090", "Gene2ID"].unique()
|
|
11
|
+
pheno = await ann.annotate(list(mouse), AGRDataset.PHENOTYPES, taxon="mouse")
|
|
12
|
+
|
|
13
|
+
`annotate` wide-left-joins each source onto the normalized base frame in order:
|
|
14
|
+
an AGR dataset contributes its native columns; an ingested external annotation
|
|
15
|
+
(referenced by name) contributes columns prefixed with `name.`.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import asyncio
|
|
19
|
+
import math
|
|
20
|
+
import os
|
|
21
|
+
from contextlib import AsyncExitStack
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Final
|
|
24
|
+
import pandas as pd
|
|
25
|
+
|
|
26
|
+
from .client import AGRClient
|
|
27
|
+
from .datasets import DATASETS, AGRDataset, ApiSpec, BulkSpec
|
|
28
|
+
from .downloader import Downloader
|
|
29
|
+
from .ingest import load_tsv_gz
|
|
30
|
+
from .models import DownloadFile
|
|
31
|
+
from .normalizer import GeneIndex, build_gene_index
|
|
32
|
+
from .taxa import resolve_taxon
|
|
33
|
+
from .store import read_parquet, write_parquet
|
|
34
|
+
|
|
35
|
+
type Genes = str | list[str] | pd.DataFrame
|
|
36
|
+
|
|
37
|
+
_APP_DIR: Final = "geneharmony"
|
|
38
|
+
_GENE_ID: Final = "GeneId"
|
|
39
|
+
_PAGE_SIZE: Final = 500
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def default_cache_dir() -> Path:
    """Return the per-user cache directory for this package.

    ``$XDG_CACHE_HOME`` is used when it is set to a non-empty value;
    otherwise the base is ``~/.cache``. The application folder name is
    appended in both cases. The directory itself is not created here.
    """
    xdg_home = os.environ.get("XDG_CACHE_HOME")
    if xdg_home:
        return Path(xdg_home) / _APP_DIR
    return Path.home() / ".cache" / _APP_DIR
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def resolve_cache_dir(cache_dir: Path | None) -> Path:
|
|
49
|
+
cache = cache_dir or default_cache_dir()
|
|
50
|
+
cache.mkdir(parents=True, exist_ok=True)
|
|
51
|
+
return cache
|
|
52
|
+
|
|
53
|
+
class Annotator:
|
|
54
|
+
def __init__(
    self,
    cache_dir: Path | None = None,
    *,
    client: AGRClient | None = None,
    downloader: Downloader | None = None,
) -> None:
    """Bind the annotator to a cache directory and optional shared clients.

    Args:
        cache_dir: Cache root; resolved (and created) by ``resolve_cache_dir``.
        client: Optional ``AGRClient`` to reuse. When omitted, methods that
            need one open a temporary client per call.
        downloader: Optional ``Downloader`` to reuse, with the same fallback.
    """
    self._cache = resolve_cache_dir(cache_dir)
    # Injected collaborators are stored untouched; None means "create on demand".
    self._client, self._downloader = client, downloader
    # The gene index is built lazily on first use.
    self._index: GeneIndex | None = None
|
|
65
|
+
|
|
66
|
+
async def normalize(
    self,
    genes: str | list[str],
    *,
    taxon: str | None = None,
    limit: int | None = 1,
    case_insensitive: bool = False,
) -> pd.DataFrame:
    """Look up one or more gene identifiers in the lazily built gene index.

    All keyword options are forwarded unchanged to the index's ``lookup``,
    and its result is returned as-is.
    """
    gene_index = await self._gene_index()
    lookup_options = {
        "taxon": taxon,
        "limit": limit,
        "case_insensitive": case_insensitive,
    }
    return gene_index.lookup(genes, **lookup_options)
|
|
81
|
+
|
|
82
|
+
async def download(self, dataset: AGRDataset, *, refresh: bool = False) -> Path:
    """Download a dataset's bulk file, convert TSV -> Parquet, drop the .tsv.gz.

    Args:
        dataset: Dataset whose bulk file should be cached.
        refresh: Re-download and re-convert even when the Parquet file
            already exists.

    Returns:
        Path of the cached ``bulk/<dataset>.parquet`` file.

    Raises:
        ValueError: If the dataset has no bulk backend.
        LookupError: If no entry in the downloads listing matches the spec
            (raised by ``_select_download``).
    """
    bulk = DATASETS[dataset].bulk
    if bulk is None:
        raise ValueError(f"{dataset!r} has no bulk file; query it via annotate()")
    dest = self._cache / "bulk" / f"{dataset}.parquet"
    if dest.exists() and not refresh:
        return dest
    tmp = dest.with_name(f"{dest.stem}.tsv.gz")
    if refresh:
        # Without an expected size the default Downloader keeps an existing
        # file, so a leftover archive would be converted again instead of
        # being fetched fresh.
        tmp.unlink(missing_ok=True)
    async with AsyncExitStack() as stack:
        client = self._client or await stack.enter_async_context(AGRClient())
        downloader = self._downloader or await stack.enter_async_context(Downloader())
        file = _select_download(await client.list_downloads(), bulk)
        try:
            await downloader.download(file.s3Url, tmp)
            write_parquet(load_tsv_gz(tmp, dtype=str), dest)
        finally:
            # Remove the archive on failure too, so an unreadable download
            # is not reused on the next call.
            tmp.unlink(missing_ok=True)
    return dest
|
|
99
|
+
|
|
100
|
+
async def ingest_annotation(
    self,
    source: str | Path | pd.DataFrame,
    name: str,
    *,
    gene_id_column: str | list[str],
    normalize: bool = True,
    taxon: str | None = None,
    case_insensitive: bool = False,
    override: bool = False,
) -> tuple[dict, pd.DataFrame | None]:
    """Store an external annotation table, keyed by canonical `GeneId`.

    `gene_id_column` may name several columns; they are tried left-to-right
    per row, the first identifier that resolves wins (a fallback for tables
    whose primary ID column has gaps). The id columns are kept as-is and a
    separate `GeneId` column is added for the resolved canonical id, so the
    input must not already contain a `GeneId` column. The returned unmapped
    frame holds the rows (with their original columns) where no candidate
    resolved.

    Args:
        source: A DataFrame (copied, never mutated) or a path readable by
            ``_read_table``.
        name: Annotation name; the table is stored as ``external/<name>.parquet``.
        gene_id_column: One column name or an ordered list of candidates.
        normalize: Resolve identifiers through the gene index. When False the
            first non-null candidate value is stored verbatim and no rows
            are dropped.
        taxon: Forwarded to the identifier lookup.
        case_insensitive: Forwarded to the identifier lookup.
        override: Replace an annotation already stored under ``name``.

    Returns:
        ``(summary, unmapped)``. When the annotation already exists and
        ``override`` is False nothing is read or written: the summary holds
        only the name and ``unmapped`` is None. ``unmapped`` is also None
        when ``normalize`` is False.

    Raises:
        ValueError: If no id column is named, or the input already has a
            ``GeneId`` column.
        KeyError: If a named id column is missing from the table.
    """
    dest = self._cache / "external" / f"{name}.parquet"
    if dest.exists() and not override:
        return _ingest_summary(name, None, None, None, None), None

    df = source.copy() if isinstance(source, pd.DataFrame) else _read_table(source)
    columns = [gene_id_column] if isinstance(gene_id_column, str) else list(gene_id_column)
    if not columns:
        raise ValueError("gene_id_column must name at least one column")
    missing = [c for c in columns if c not in df.columns]
    if missing:
        raise KeyError(
            f"gene_id_column(s) {missing!r} not found; columns are {list(df.columns)}"
        )
    if _GENE_ID in df.columns:
        raise ValueError(
            f"input already has a {_GENE_ID!r} column; rename it — a separate "
            f"{_GENE_ID!r} column is added for the resolved canonical id."
        )

    if normalize:
        id_cells = df[columns]
        # Mask nulls explicitly rather than comparing against the text "nan":
        # an identifier that is literally spelled "nan" stays resolvable, and
        # None / pd.NA cells are never sent to the resolver as "None"/"<NA>".
        str_cols = id_cells.astype(str).where(id_cells.notna())
        values = [v for v in pd.unique(str_cols.to_numpy().ravel()) if isinstance(v, str)]
        mapping = await self._id_map(values, taxon, case_insensitive)
        # Map every candidate column, then take the first resolved id per row.
        resolved = str_cols.apply(lambda col: col.map(mapping)).bfill(axis=1).iloc[:, 0]
    else:
        resolved = df[columns].bfill(axis=1).iloc[:, 0]

    df[_GENE_ID] = resolved

    rows_in = len(df)
    rows_dropped = 0
    unmapped_df: pd.DataFrame | None = None
    if normalize:
        unmapped_mask = df[_GENE_ID].isna()
        unmapped_df = df[unmapped_mask].copy()
        rows_dropped = int(unmapped_mask.sum())
        df = df[~unmapped_mask].reset_index(drop=True)

    write_parquet(df, dest)
    return _ingest_summary(name, rows_in, len(df), rows_dropped, normalize), unmapped_df
|
|
161
|
+
|
|
162
|
+
async def annotate(
    self,
    genes: Genes,
    *sources: AGRDataset | str,
    taxon: str | None = None,
    limit: int | None = 1,
    case_insensitive: bool = False,
) -> pd.DataFrame:
    """Left-join each source onto a base frame keyed by ``GeneId``.

    Args:
        genes: Identifiers to normalize first, or an existing DataFrame that
            already carries a ``GeneId`` column (used as a copy; the lookup
            options are then ignored).
        *sources: ``AGRDataset`` members and/or names of ingested external
            annotations, joined in the order given.
        taxon: Forwarded to ``normalize``.
        limit: Forwarded to ``normalize``.
        case_insensitive: Forwarded to ``normalize``.

    Returns:
        The base frame with each source's columns merged in, plus one boolean
        ``has.<source>`` column per source marking rows that found a match.

    Raises:
        KeyError: If the base frame has no ``GeneId`` column, or a string
            source is not an ingested annotation.
    """
    if isinstance(genes, pd.DataFrame):
        base = genes.copy()
    else:
        base = await self.normalize(
            genes, taxon=taxon, limit=limit, case_insensitive=case_insensitive
        )
    if _GENE_ID not in base.columns:
        raise KeyError(f"base frame has no {_GENE_ID!r} column to join on")

    gene_ids = base[_GENE_ID].dropna().unique().tolist()
    out = base
    for source in sources:
        if isinstance(source, AGRDataset):
            frame, key = await self._load_agr_source(source, gene_ids)
        else:
            frame, key = self._load_external(source)
        flag = f"has.{source}"
        # Every right-hand row is marked True; unmatched base rows get NaN
        # from the left join.
        frame = frame.assign(**{flag: True})
        out = out.merge(frame, how="left", left_on=_GENE_ID, right_on=key)
        # notna() converts True/NaN straight into a bool column, avoiding
        # the object-dtype fillna downcast that pandas has deprecated.
        out[flag] = out[flag].notna()
    return out.reset_index(drop=True)
|
|
190
|
+
|
|
191
|
+
async def get_orthologs(
    self,
    genes: Genes,
    taxon: str | None = None,
    target_taxon: str | None = None,
    limit: int | None = 1,
    case_insensitive: bool = False
) -> pd.DataFrame:
    """Convenience method to get orthologs for a set of genes.

    Runs ``annotate`` with the orthology dataset, optionally keeps only the
    rows whose ortholog belongs to ``target_taxon``, and returns a fixed
    subset of columns.
    """
    wanted = ["query", "match_kind", "Gene2ID", "Gene2Symbol", "Gene2SpeciesTaxonID"]
    orthologs = await self.annotate(
        genes,
        AGRDataset.ORTHOLOGY,
        taxon=taxon,
        limit=limit,
        case_insensitive=case_insensitive,
    )
    if not target_taxon:
        return orthologs[wanted]
    target_id = resolve_taxon(target_taxon).id
    in_target = orthologs["Gene2SpeciesTaxonID"] == target_id
    return orthologs[in_target][wanted]
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
async def _gene_index(self) -> GeneIndex:
    """Return the gene index, building and memoizing it on first use.

    The `GENE` bulk file goes through the regular `download` path (ending up
    as a Parquet file in the cache) and the index is built from that table
    in memory. Later calls return the stored index.
    """
    if self._index is not None:
        return self._index
    gene_table = read_parquet(await self.download(AGRDataset.GENE))
    self._index = build_gene_index(gene_table)
    return self._index
|
|
223
|
+
|
|
224
|
+
async def _id_map(
    self, queries: list[str], taxon: str | None, case_insensitive: bool = False
) -> dict[str, str]:
    """Map each resolvable query string to its canonical gene id.

    Queries are de-duplicated (first occurrence kept) and looked up with
    ``limit=1``. Queries whose ``match_kind`` is null are left out of the
    returned mapping.
    """
    gene_index = await self._gene_index()
    deduped = list(dict.fromkeys(queries))
    hits = gene_index.lookup(
        deduped, taxon=taxon, limit=1, case_insensitive=case_insensitive
    )
    matched = hits[hits["match_kind"].notna()]
    return dict(zip(matched["query"], matched[_GENE_ID]))
|
|
232
|
+
|
|
233
|
+
async def _load_agr_source(
    self, dataset: AGRDataset, gene_ids: list[str]
) -> tuple[pd.DataFrame, str]:
    """Load the rows of an AGR dataset that concern ``gene_ids``.

    The bulk backend is preferred when the dataset has one: the cached
    Parquet file is downloaded if absent and filtered on the bulk join key.
    Otherwise the per-gene API backend is used.

    Returns:
        ``(frame, join_key)``, where ``join_key`` is the column of ``frame``
        to join on.
    """
    spec = DATASETS[dataset]
    bulk = spec.bulk
    if bulk is None:
        assert spec.api is not None
        api_frame = await self._fetch_api(dataset, spec.api, gene_ids)
        return api_frame, spec.api.join_key

    path = self._cache / "bulk" / f"{dataset}.parquet"
    if not path.exists():
        await self.download(dataset)
    table = read_parquet(path)
    join_key = bulk.join_key
    wanted = table[join_key].isin(set(gene_ids))
    return table[wanted].reset_index(drop=True), join_key
|
|
246
|
+
|
|
247
|
+
async def _fetch_api(
    self, dataset: AGRDataset, api: ApiSpec, gene_ids: list[str]
) -> pd.DataFrame:
    """Collect per-gene API rows, reading cached genes from disk.

    Genes with a Parquet file under ``api/<dataset>/`` are read from that
    file; the others are fetched concurrently through ``_fetch_one``, which
    also writes their cache files.

    Returns:
        All per-gene frames concatenated (cached genes first), or an empty
        frame holding only the join-key column when there is nothing to
        return.
    """
    api_dir = self._cache / "api" / dataset
    # Split in a single pass: each cache path is computed once and no
    # membership set is rebuilt per gene.
    cached_paths: list[Path] = []
    missing: list[str] = []
    for gene_id in gene_ids:
        cache_file = api_dir / f"{_safe(gene_id)}.parquet"
        if cache_file.exists():
            cached_paths.append(cache_file)
        else:
            missing.append(gene_id)

    frames = [read_parquet(cache_file) for cache_file in cached_paths]
    if missing:
        async with AsyncExitStack() as stack:
            client = self._client or await stack.enter_async_context(AGRClient())
            frames.extend(
                await asyncio.gather(
                    *(self._fetch_one(client, api, g, api_dir) for g in missing)
                )
            )
    if not frames:
        return pd.DataFrame(columns=[api.join_key])
    return pd.concat(frames, ignore_index=True)
|
|
266
|
+
|
|
267
|
+
async def _fetch_one(
    self, client: AGRClient, api: ApiSpec, gene_id: str, api_dir: Path
) -> pd.DataFrame:
    """Fetch every result page for one gene, project it, and cache the frame.

    A gene with no results still produces (and caches) an empty frame that
    carries the join-key column.
    """
    endpoint = api.endpoint.format(gene_id=gene_id)
    raw_results = await _fetch_all_pages(client, endpoint)
    projected = [api.project(gene_id, item) for item in raw_results]
    if projected:
        frame = pd.DataFrame(projected)
    else:
        frame = pd.DataFrame(columns=[api.join_key])
    write_parquet(frame, api_dir / f"{_safe(gene_id)}.parquet")
    return frame
|
|
275
|
+
|
|
276
|
+
def _load_external(self, name: str) -> tuple[pd.DataFrame, str]:
    """Load an ingested annotation with its columns namespaced by ``name``.

    Every column except the ``GeneId`` join key is renamed to
    ``<name>.<column>``.

    Raises:
        KeyError: If no annotation was stored under ``name``.
    """
    path = self._cache / "external" / f"{name}.parquet"
    if not path.exists():
        raise KeyError(
            f"unknown source {name!r}: not an AGRDataset or an ingested annotation"
        )
    stored = read_parquet(path)
    prefixed = {col: f"{name}.{col}" for col in stored.columns if col != _GENE_ID}
    return stored.rename(columns=prefixed), _GENE_ID
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _select_download(files: list[DownloadFile], spec: BulkSpec) -> DownloadFile:
    """Return the first listed file matching the spec's type selectors.

    A file matches when its ``dataType``, ``fileType`` and ``dataSubType``
    all equal the corresponding fields of ``spec``.

    Raises:
        LookupError: If no file in ``files`` matches.
    """
    wanted = (spec.data_type, spec.file_type, spec.data_sub_type)
    for candidate in files:
        if (candidate.dataType, candidate.fileType, candidate.dataSubType) == wanted:
            return candidate
    raise LookupError(
        f"no download matching {spec.data_type}/{spec.file_type}/{spec.data_sub_type}"
    )
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
async def _fetch_all_pages(
    client: AGRClient, endpoint: str, page_size: int = _PAGE_SIZE
) -> list[dict]:
    """Return the concatenated ``results`` of every page of an endpoint.

    Page 1 is requested first; its ``total`` (defaulting to the number of
    results on that page) determines how many further pages are fetched
    concurrently. Results are appended in page order.
    """

    def page_params(number: int) -> dict[str, int]:
        return {"limit": page_size, "page": number}

    first = await client.get_json(endpoint, params=page_params(1))
    collected = list(first.get("results", []))
    page_count = math.ceil(first.get("total", len(collected)) / page_size)
    if page_count <= 1:
        return collected

    later_pages = await asyncio.gather(
        *(
            client.get_json(endpoint, params=page_params(number))
            for number in range(2, page_count + 1)
        )
    )
    for payload in later_pages:
        collected.extend(payload.get("results", []))
    return collected
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def _safe(gene_id: str) -> str:
|
|
326
|
+
return gene_id.replace(":", "_").replace("/", "_")
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def _read_table(path: str | Path) -> pd.DataFrame:
|
|
330
|
+
suffix = Path(path).suffix.lower()
|
|
331
|
+
if suffix == ".parquet":
|
|
332
|
+
return pd.read_parquet(path)
|
|
333
|
+
if suffix in (".tsv", ".tab", ".txt"):
|
|
334
|
+
return pd.read_csv(path, sep="\t")
|
|
335
|
+
if suffix == ".csv":
|
|
336
|
+
return pd.read_csv(path)
|
|
337
|
+
raise ValueError(
|
|
338
|
+
f"unsupported annotation file type {suffix!r} for {path!r} "
|
|
339
|
+
"(expected .csv, .tsv/.tab/.txt, or .parquet)"
|
|
340
|
+
)
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def _ingest_summary(
|
|
344
|
+
name: str,
|
|
345
|
+
rows_in: int | None,
|
|
346
|
+
rows_stored: int | None,
|
|
347
|
+
rows_dropped: int | None,
|
|
348
|
+
normalized: bool | None,
|
|
349
|
+
) -> dict:
|
|
350
|
+
return {
|
|
351
|
+
"annotation_name": name,
|
|
352
|
+
"rows_in": rows_in,
|
|
353
|
+
"rows_stored": rows_stored,
|
|
354
|
+
"rows_dropped_unmapped": rows_dropped,
|
|
355
|
+
"normalized": normalized,
|
|
356
|
+
}
|
geneharmony/client.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Async HTTP client for the Alliance of Genome Resources REST API.
|
|
2
|
+
|
|
3
|
+
A single `AGRClient` owns one pooled `httpx.AsyncClient` and bounds in-flight
|
|
4
|
+
requests with a semaphore. GETs retry transient failures (429/502/503/504, timeouts,
|
|
5
|
+
transport errors) with exponential backoff and jitter, honoring `Retry-After`.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import random
|
|
10
|
+
from collections.abc import Mapping
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from email.utils import parsedate_to_datetime
|
|
13
|
+
from typing import Any, Final, Self
|
|
14
|
+
|
|
15
|
+
import httpx
|
|
16
|
+
from pydantic import TypeAdapter
|
|
17
|
+
|
|
18
|
+
from .models import DownloadFile
|
|
19
|
+
|
|
20
|
+
AGR_BASE_URL: Final = "https://www.alliancegenome.org/api"
|
|
21
|
+
|
|
22
|
+
type Params = Mapping[str, str | int | bool]
|
|
23
|
+
|
|
24
|
+
_RETRYABLE_STATUS: Final[frozenset[int]] = frozenset({429, 502, 503, 504})
|
|
25
|
+
_DOWNLOADS_ADAPTER: Final = TypeAdapter(list[DownloadFile])
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _parse_retry_after(value: str | None) -> float | None:
|
|
29
|
+
if value is None:
|
|
30
|
+
return None
|
|
31
|
+
if value.isdigit():
|
|
32
|
+
return float(value)
|
|
33
|
+
try:
|
|
34
|
+
retry_at = parsedate_to_datetime(value)
|
|
35
|
+
except (TypeError, ValueError):
|
|
36
|
+
return None
|
|
37
|
+
return max(0.0, (retry_at - datetime.now(timezone.utc)).total_seconds())
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class AGRClient:
    """Async REST client wrapping one pooled ``httpx.AsyncClient``.

    In-flight requests are bounded by a semaphore. GET requests are retried
    on transport errors, timeouts and the status codes in
    ``_RETRYABLE_STATUS``.
    """

    def __init__(
        self,
        base_url: str = AGR_BASE_URL,
        *,
        max_concurrent: int = 5,
        timeout: httpx.Timeout = httpx.Timeout(10.0, read=120.0),
        max_retries: int = 4,
        backoff_base: float = 0.5,
        backoff_cap: float = 30.0,
    ) -> None:
        """Create the HTTP client and store the retry settings.

        Args:
            base_url: Base URL that request paths are resolved against.
            max_concurrent: Connection-pool size and semaphore capacity.
            timeout: httpx timeout configuration.
            max_retries: Extra attempts after the first one.
            backoff_base: Base of the exponential backoff, in seconds.
            backoff_cap: Upper bound on the backoff window, in seconds.
        """
        self._max_retries = max_retries
        self._backoff_base = backoff_base
        self._backoff_cap = backoff_cap
        self._sem = asyncio.Semaphore(max_concurrent)
        self._client = httpx.AsyncClient(
            base_url=base_url,
            timeout=timeout,
            limits=httpx.Limits(max_connections=max_concurrent),
        )

    async def get_json(self, path: str, params: Params | None = None) -> Any:
        """GET ``path`` and return the decoded JSON body."""
        response = await self._get(path, params)
        return response.json()

    async def get_text(self, path: str, params: Params | None = None) -> str:
        """GET ``path`` and return the body as text."""
        response = await self._get(path, params)
        return response.text

    async def list_downloads(self) -> list[DownloadFile]:
        """Fetch ``/downloads`` and validate it into ``DownloadFile`` models."""
        listing = await self.get_json("/downloads")
        return _DOWNLOADS_ADAPTER.validate_python(listing)

    async def _get(self, path: str, params: Params | None) -> httpx.Response:
        """Issue a GET with retries; raise on a final or non-retryable error."""
        final_attempt = self._max_retries
        for attempt in range(final_attempt + 1):
            is_last = attempt == final_attempt
            try:
                # The semaphore is held only for the request itself, not
                # while sleeping between attempts.
                async with self._sem:
                    response = await self._client.get(path, params=params)
            except (httpx.TransportError, httpx.TimeoutException):
                if is_last:
                    raise
                delay = self._backoff(attempt)
            else:
                retryable = response.status_code in _RETRYABLE_STATUS
                if is_last or not retryable:
                    response.raise_for_status()
                    return response
                # A server-provided Retry-After wins over our own backoff.
                server_delay = _parse_retry_after(response.headers.get("Retry-After"))
                delay = self._backoff(attempt) if server_delay is None else server_delay
            await asyncio.sleep(delay)
        raise AssertionError("retry loop exited without returning")

    def _backoff(self, attempt: int) -> float:
        """Return a random delay in [0, min(cap, base * 2**attempt)]."""
        window = min(self._backoff_cap, self._backoff_base * 2**attempt)
        return random.uniform(0.0, window)

    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def __aenter__(self) -> Self:
        return self

    async def __aexit__(self, *_exc_info: object) -> None:
        await self.aclose()
|
geneharmony/datasets.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Registry of AGR datasets the annotator can pull, with their backends.
|
|
2
|
+
|
|
3
|
+
Each `AGRDataset` maps to a `DatasetSpec` describing how to obtain it:
|
|
4
|
+
- `bulk` — a selector into the `/downloads` listing (matched at runtime, never a
|
|
5
|
+
hardcoded `s3Url`) plus the column its rows join on.
|
|
6
|
+
- `api` — a per-gene endpoint template, the column its projected rows join on,
|
|
7
|
+
and a `project` callable flattening one API result into a single flat row.
|
|
8
|
+
|
|
9
|
+
Orthology is served from its bulk TSV (complete, richly columned); phenotypes and
|
|
10
|
+
alleles from the per-gene API (their bulk files are nested per-MOD JSON, deferred).
|
|
11
|
+
The API orthology projection mirrors the bulk column names so either backend
|
|
12
|
+
yields the same `Gene1ID`/`Gene2ID`/`Gene2SpeciesTaxonID` shape. `GENE` is the
|
|
13
|
+
bulk file backing the in-memory gene index — downloaded through the same path,
|
|
14
|
+
but built into a `GeneIndex` rather than joined onto a base frame.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import enum
|
|
18
|
+
from collections.abc import Callable
|
|
19
|
+
from typing import Any, Final, NamedTuple
|
|
20
|
+
|
|
21
|
+
type Json = dict[str, Any]
|
|
22
|
+
type Projector = Callable[[str, Json], dict[str, Any]]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class AGRDataset(enum.StrEnum):
|
|
26
|
+
GENE = "gene"
|
|
27
|
+
ORTHOLOGY = "orthology"
|
|
28
|
+
PHENOTYPES = "phenotypes"
|
|
29
|
+
ALLELES = "alleles"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class BulkSpec(NamedTuple):
    """Selector for a bulk file in the downloads listing, plus its join column."""

    # The three selectors are compared against a listing entry's
    # dataType / fileType / dataSubType.
    data_type: str
    file_type: str
    data_sub_type: str
    # Column of the bulk table that is joined against the gene id.
    join_key: str
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ApiSpec(NamedTuple):
    """Description of a per-gene API backend."""

    # Endpoint template; formatted with a ``gene_id`` keyword.
    endpoint: str
    # Column of the projected rows that is joined against the gene id.
    join_key: str
    # Callable taking (gene_id, result) and returning one flat row dict.
    project: Projector
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class DatasetSpec(NamedTuple):
    """Backends available for one dataset; either entry may be None."""

    # Bulk-download backend, if the dataset has one.
    bulk: BulkSpec | None
    # Per-gene API backend, if the dataset has one.
    api: ApiSpec | None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _project_orthologs(gene_id: str, result: Json) -> dict[str, Any]:
    """Flatten one orthology API result into a row using the bulk column names.

    Nested objects that are absent or explicitly null are treated as empty,
    so a sparse record produces None cells instead of raising
    ``AttributeError``. This follows the ``or {}`` handling used by
    ``_project_alleles``.

    Args:
        gene_id: The queried gene id; used for ``Gene1ID`` when the record
            carries no subject id.
        result: One element of the endpoint's ``results`` list.

    Returns:
        A flat dict with the ``Gene1*`` / ``Gene2*`` / confidence columns.
    """

    def nested(record: dict[str, Any], key: str) -> dict[str, Any]:
        # A missing key and a JSON null both become an empty object.
        return record.get(key) or {}

    g = nested(result, "geneToGeneOrthologyGenerated")
    subject = nested(g, "subjectGene")
    obj = nested(g, "objectGene")
    subject_taxon = nested(subject, "taxon")
    object_taxon = nested(obj, "taxon")
    return {
        # Fall back to the queried id so the join key is never null.
        "Gene1ID": subject.get("primaryExternalId") or gene_id,
        "Gene1Symbol": nested(subject, "geneSymbol").get("displayText"),
        "Gene1SpeciesTaxonID": subject_taxon.get("curie"),
        "Gene2ID": obj.get("primaryExternalId"),
        "Gene2Symbol": nested(obj, "geneSymbol").get("displayText"),
        "Gene2SpeciesTaxonID": object_taxon.get("curie"),
        "Gene2SpeciesName": object_taxon.get("name"),
        "Confidence": nested(g, "confidence").get("name"),
        "IsBestScore": nested(g, "isBestScore").get("name"),
        "IsBestRevScore": nested(g, "isBestScoreReverse").get("name"),
    }
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _project_phenotypes(gene_id: str, result: Json) -> dict[str, Any]:
    """Flatten one phenotype API result into a row keyed by ``gene_id``.

    The reference ids are joined with ``|``; a missing or empty list gives
    an empty string.
    """
    reference_ids = result.get("pubmedPubModIDs") or []
    row: dict[str, Any] = {"gene_id": gene_id}
    row["phenotypeStatement"] = result.get("phenotypeStatement")
    row["references"] = "|".join(reference_ids)
    return row
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _project_alleles(gene_id: str, result: Json) -> dict[str, Any]:
    """Flatten one allele API result into a row keyed by ``gene_id``.

    Only the first entry of ``variantList`` (if any) contributes the variant
    type. Absent or null nested objects yield None cells.
    """
    variant_list = result.get("variantList") or []
    first_variant = variant_list[0] if variant_list else {}
    allele = result.get("allele") or {}
    variant_type = first_variant.get("variantType") or {}
    return {
        "gene_id": gene_id,
        "allele_id": allele.get("curie"),
        "symbol": result.get("symbol"),
        "alterationType": result.get("alterationType"),
        "hasPhenotype": result.get("hasPhenotype", False),
        "hasDisease": result.get("hasDisease", False),
        "variantType": variant_type.get("name"),
    }
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# Registry mapping each dataset to the backend(s) that can serve it.
DATASETS: Final[dict[AGRDataset, DatasetSpec]] = {
    AGRDataset.GENE: DatasetSpec(
        bulk=BulkSpec(
            data_type="GENE",
            file_type="TSV",
            data_sub_type="COMBINED",
            join_key="GeneId",
        ),
        api=None,
    ),
    AGRDataset.ORTHOLOGY: DatasetSpec(
        bulk=BulkSpec(
            data_type="ORTHOLOGY-ALLIANCE",
            file_type="TSV",
            data_sub_type="COMBINED",
            join_key="Gene1ID",
        ),
        api=ApiSpec(
            endpoint="/gene/{gene_id}/orthologs",
            join_key="Gene1ID",
            project=_project_orthologs,
        ),
    ),
    AGRDataset.PHENOTYPES: DatasetSpec(
        bulk=None,
        api=ApiSpec(
            endpoint="/gene/{gene_id}/phenotypes",
            join_key="gene_id",
            project=_project_phenotypes,
        ),
    ),
    AGRDataset.ALLELES: DatasetSpec(
        bulk=None,
        api=ApiSpec(
            endpoint="/gene/{gene_id}/alleles",
            join_key="gene_id",
            project=_project_alleles,
        ),
    ),
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Streaming file downloader for arbitrary HTTP(S) URLs.
|
|
2
|
+
|
|
3
|
+
`Downloader` fetches a file from any absolute URL, streaming it to disk via an
|
|
4
|
+
atomic temp-then-rename write; an existing file whose byte size already matches
|
|
5
|
+
the expected size is left untouched. Bytes are written verbatim — compressed
|
|
6
|
+
files stay compressed on disk and are inflated downstream at ingest. It is not
|
|
7
|
+
tied to any particular host or data provider.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
import os
|
|
12
|
+
import random
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Final, Self
|
|
15
|
+
import httpx
|
|
16
|
+
|
|
17
|
+
_CHUNK_SIZE: Final = 1 << 20
|
|
18
|
+
_RETRYABLE_STATUS: Final[frozenset[int]] = frozenset({429, 500, 502, 503, 504})
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SizeMismatchError(RuntimeError):
    """A download finished with a byte count different from the expected one.

    The URL and both sizes are kept as attributes for programmatic handling.
    """

    def __init__(self, url: str, expected: int, actual: int) -> None:
        self.url, self.expected, self.actual = url, expected, actual
        super().__init__(f"{url}: expected {expected} bytes, got {actual}")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class Downloader:
    """Stream files from absolute URLs to disk with retries.

    Data is written to a ``.part`` sibling and moved over the destination
    only after a complete transfer.
    """

    def __init__(
        self,
        *,
        max_concurrent: int = 3,
        timeout: httpx.Timeout = httpx.Timeout(10.0, read=None),
        max_retries: int = 3,
        backoff_base: float = 0.5,
        backoff_cap: float = 30.0,
    ) -> None:
        """Create the redirect-following HTTP client and store retry settings.

        Args:
            max_concurrent: Connection-pool size and semaphore capacity.
            timeout: httpx timeout configuration.
            max_retries: Extra attempts after the first one.
            backoff_base: Base of the exponential backoff, in seconds.
            backoff_cap: Upper bound on the backoff window, in seconds.
        """
        self._max_retries = max_retries
        self._backoff_base = backoff_base
        self._backoff_cap = backoff_cap
        self._sem = asyncio.Semaphore(max_concurrent)
        self._client = httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
            limits=httpx.Limits(max_connections=max_concurrent),
        )

    async def download(
        self, url: str, dest: Path, *, expected_size: int | None = None
    ) -> Path:
        """Fetch ``url`` into ``dest`` and return ``dest``.

        An existing ``dest`` is kept when no expected size is given or when
        its size matches. Transport errors, timeouts and retryable HTTP
        statuses are retried with backoff.

        Raises:
            SizeMismatchError: If the byte count differs from
                ``expected_size``; the partial file is removed first.
        """
        if dest.exists():
            if expected_size is None or dest.stat().st_size == expected_size:
                return dest
        dest.parent.mkdir(parents=True, exist_ok=True)
        partial = dest.with_name(dest.name + ".part")
        final_attempt = self._max_retries
        for attempt in range(final_attempt + 1):
            is_last = attempt == final_attempt
            try:
                size = await self._stream_to(url, partial)
            except (httpx.TransportError, httpx.TimeoutException):
                if is_last:
                    raise
            except httpx.HTTPStatusError as exc:
                if is_last or exc.response.status_code not in _RETRYABLE_STATUS:
                    raise
            else:
                if expected_size is not None and size != expected_size:
                    partial.unlink(missing_ok=True)
                    raise SizeMismatchError(url, expected_size, size)
                # Move the finished file into place in one step.
                os.replace(partial, dest)
                return dest
            await asyncio.sleep(self._backoff(attempt))
        raise AssertionError("retry loop exited without returning")

    async def _stream_to(self, url: str, tmp: Path) -> int:
        """Stream the response body into ``tmp``; return the bytes written."""
        async with self._sem, self._client.stream("GET", url) as response:
            if response.is_error:
                # Read the body before raising so it is available on the error.
                await response.aread()
                response.raise_for_status()
            total = 0
            with tmp.open("wb") as sink:
                async for chunk in response.aiter_bytes(_CHUNK_SIZE):
                    total += sink.write(chunk)
            return total

    def _backoff(self, attempt: int) -> float:
        """Return a random delay in [0, min(cap, base * 2**attempt)]."""
        window = min(self._backoff_cap, self._backoff_base * 2**attempt)
        return random.uniform(0.0, window)

    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def __aenter__(self) -> Self:
        return self

    async def __aexit__(self, *_exc_info: object) -> None:
        await self.aclose()
|
geneharmony/ingest.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Decompress and parse cached AGR bulk download files.
|
|
2
|
+
|
|
3
|
+
Bulk files are stored gzipped (`.json.gz` / `.tsv.gz`). These helpers read
|
|
4
|
+
straight from the compressed file into memory — JSON into the parsed object,
|
|
5
|
+
TSV into a DataFrame — so no decompressed copy is written to disk. AGR TSV files
|
|
6
|
+
carry a leading block of `#` comment lines before the header, which is skipped.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import gzip
|
|
10
|
+
import json
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def load_json_gz(path: Path) -> Any:
    """Read a gzip-compressed UTF-8 JSON file and return the parsed object."""
    with gzip.open(path, "rt", encoding="utf-8") as handle:
        text = handle.read()
    return json.loads(text)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_tsv_gz(path: Path, dtype: type[str] | None = None) -> pd.DataFrame:
|
|
22
|
+
return pd.read_csv(path, sep="\t", comment="#", compression="gzip", dtype=dtype)
|
geneharmony/models.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from pydantic import BaseModel, PositiveInt
|
|
3
|
+
|
|
4
|
+
class DownloadFile(BaseModel):
    """One entry of the downloads listing, validated by pydantic.

    Field names keep the camelCase spelling used as payload keys.
    """

    # Where the file lives.
    filename: str
    s3Path: str
    s3Url: str
    stableURL: str

    # Release metadata.
    releaseVersion: str
    size: PositiveInt
    lastModified: datetime

    # Selectors used to pick a file for a dataset.
    dataType: str
    fileType: str
    dataSubType: str
    fileExtension: str
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""In-memory gene normalizer built from the GENE-TSV-COMBINED bulk file.
|
|
2
|
+
|
|
3
|
+
`load_gene_index` reads the file and precomputes O(1) lookups from every
|
|
4
|
+
identifier form — primary ID, deprecated (secondary) ID, official symbol,
|
|
5
|
+
synonym, systematic name and external cross-reference (e.g. `NCBI_Gene:`,
|
|
6
|
+
`ENSEMBL:`, `UniProtKB:`) — to row positions in the loaded table.
|
|
7
|
+
|
|
8
|
+
`GeneIndex.lookup` takes one query or a list and returns a DataFrame with one
|
|
9
|
+
row per match: the original `query`, the `match_kind`, and every column of the
|
|
10
|
+
matched gene record. Matches are ranked by precedence (primary ID > secondary ID
|
|
11
|
+
> official symbol > synonym > cross-reference); `limit` caps matches per query and `taxon` narrows
|
|
12
|
+
symbols that recur across species. Unmatched queries are still returned, with a
|
|
13
|
+
null `match_kind`. Matching is case-sensitive unless `case_insensitive=True`,
|
|
14
|
+
since case can be meaningful across species (human TP53 vs mouse Trp53).
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import enum
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Final, NamedTuple
|
|
21
|
+
import pandas as pd
|
|
22
|
+
|
|
23
|
+
from .ingest import load_tsv_gz
|
|
24
|
+
from .taxa import resolve_taxon
|
|
25
|
+
|
|
26
|
+
# One lookup table per match kind: identifier string -> 0-based positions of
# the matching rows in `GeneIndex.records` (used with `.iloc`).
type _Tables = dict["MatchKind", dict[str, list[int]]]

# Cross-reference databases whose IDs denote protein families / enzyme classes
# rather than genes; one such token fans out to hundreds of genes, so they are
# excluded from the index. Keys are the token prefix before the first ':'.
_XREF_EXCLUDED_PREFIXES: Final[frozenset[str]] = frozenset(
    {"PANTHER", "TreeFam", "ExPASy", "TCDB"}
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class MatchKind(enum.IntEnum):
    """Which identifier form of a gene record a query matched.

    Members are declared from strongest to weakest. `GeneIndex._resolve`
    iterates the enum in declaration order, so matches of an earlier kind are
    ranked ahead of matches of a later kind.
    """

    PRIMARY_ID = 0  # value of the `GeneId` column
    SECONDARY_ID = 1  # '|'-separated token of `GeneSecondaryIds`
    OFFICIAL_SYMBOL = 2  # value of the `GeneSymbol` column
    SYNONYM = 3  # '|'-separated token of `GeneSynonyms` or `GeneSystematicName`
    CROSS_REFERENCE = 4  # '|'-separated token of `GeneCrossReferences`
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class GeneMatch(NamedTuple):
    """One hit produced by `GeneIndex._resolve`."""

    row: int  # 0-based position of the matched gene in `GeneIndex.records`
    kind: MatchKind  # identifier form through which the query matched
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass(slots=True)
class GeneIndex:
    """Gene table plus precomputed identifier -> row lookup tables.

    Build instances with `build_gene_index` rather than by hand, so that the
    tables and `_taxon_ids` stay aligned with the row positions of `records`.
    """

    # The loaded gene table; lookup results are row slices of this frame.
    records: pd.DataFrame
    # Taxon ID of every row, positionally aligned with `records`.
    _taxon_ids: tuple[str, ...]
    # Case-sensitive lookup tables, one per `MatchKind`.
    _exact: _Tables
    # Case-folded copy of `_exact`, built on first case-insensitive lookup.
    _folded: _Tables | None = None

    def lookup(
        self,
        queries: str | list[str],
        *,
        taxon: str | None = None,
        limit: int | None = 1,
        case_insensitive: bool = False,
    ) -> pd.DataFrame:
        """Resolve one or more identifiers to gene records.

        Args:
            queries: A single identifier or a list of identifiers.
            taxon: Optional taxon alias (resolved with `resolve_taxon`); when
                given, only genes of that taxon are returned.
            limit: Maximum number of matches kept per query, best-ranked first;
                `None` keeps all of them.
            case_insensitive: Compare case-folded identifiers instead of exact
                strings.

        Returns:
            A DataFrame in query order with the columns `query` and
            `match_kind` followed by every column of `records`. A query with
            no match contributes a single row whose `match_kind` is null.

        Raises:
            ValueError: If `taxon` is not a known alias, or `limit` is negative.
        """
        if isinstance(queries, str):
            queries = [queries]
        # A negative value would be taken as a slice bound and silently drop
        # the lowest-ranked matches instead of capping the result.
        if limit is not None and limit < 0:
            raise ValueError(f"limit must be non-negative or None, got {limit!r}")
        taxon_id = resolve_taxon(taxon).id if taxon is not None else None

        order: list[int] = []
        query_col: list[str] = []
        kind_col: list[str] = []
        rows: list[int] = []
        miss_order: list[int] = []
        miss_query: list[str] = []

        for i, query in enumerate(queries):
            matches = self._resolve(query, taxon_id, case_insensitive)
            if limit is not None:
                matches = matches[:limit]
            if not matches:
                miss_order.append(i)
                miss_query.append(query)
                continue
            for match in matches:
                order.append(i)
                query_col.append(query)
                kind_col.append(match.kind.name)
                rows.append(match.row)

        matched = self.records.iloc[rows].reset_index(drop=True)
        matched.insert(0, "match_kind", kind_col)
        matched.insert(0, "query", query_col)
        # Temporary sort key: position of the originating query in the input.
        matched.insert(0, "_order", order)

        if miss_query:
            missed = pd.DataFrame({"_order": miss_order, "query": miss_query, "match_kind": None})
            matched = pd.concat([matched, missed], ignore_index=True)

        # The stable sort weaves the misses back into input order while keeping
        # the precedence order of several matches of one query.
        return (
            matched.sort_values("_order", kind="stable")
            .drop(columns="_order")
            .reset_index(drop=True)
        )

    def _resolve(self, query: str, taxon_id: str | None, case_insensitive: bool) -> list[GeneMatch]:
        """Return the matches for one query, best-ranked first, one per gene row.

        A non-string query (for example NaN from a DataFrame column) never
        matches and yields an empty list.
        """
        if not isinstance(query, str):
            return []

        tables = self._exact
        key = query
        if case_insensitive:
            tables = self._folded_tables()
            key = query.casefold()

        # Iterating MatchKind in declaration order yields the precedence ranking.
        matches = [
            GeneMatch(row, kind)
            for kind in MatchKind
            for row in tables[kind].get(key, ())
        ]
        if taxon_id is not None:
            matches = [m for m in matches if self._taxon_ids[m.row] == taxon_id]

        # A gene reachable through several kinds is reported once, under the
        # strongest kind (its first occurrence in the ranked list).
        seen: set[int] = set()
        unique: list[GeneMatch] = []
        for match in matches:
            if match.row not in seen:
                seen.add(match.row)
                unique.append(match)
        return unique

    def _folded_tables(self) -> _Tables:
        """Return the case-folded tables, building and caching them on first use."""
        if self._folded is None:
            self._folded = _fold(self._exact)
        return self._folded
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _fold(tables: _Tables) -> _Tables:
    """Return a copy of `tables` whose keys are case-folded.

    Keys that differ only by case collapse into one entry; their row lists are
    concatenated in the order the original keys were encountered. The input
    tables and their lists are left untouched.
    """
    result: _Tables = {}
    for kind in tables:
        bucket: dict[str, list[int]] = {}
        for raw_key, positions in tables[kind].items():
            folded_key = raw_key.casefold()
            if folded_key in bucket:
                bucket[folded_key].extend(positions)
            else:
                # Copy so the folded table never aliases a list of the input.
                bucket[folded_key] = list(positions)
        result[kind] = bucket
    return result
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _index_tokens(
    values: list[object],
    table: dict[str, list[int]],
    excluded_prefixes: frozenset[str] = frozenset(),
) -> None:
    """Add each non-empty '|'-separated token of every string cell to `table`.

    Tokens map to the 0-based position of the cell they came from. Non-string
    cells (missing values) are skipped, as are tokens whose prefix before the
    first ':' is listed in `excluded_prefixes`.
    """
    for row, value in enumerate(values):
        if not isinstance(value, str):
            continue
        for token in value.split("|"):
            if not token:
                continue
            if excluded_prefixes and token.split(":", 1)[0] in excluded_prefixes:
                continue
            table.setdefault(token, []).append(row)


def build_gene_index(records: pd.DataFrame) -> GeneIndex:
    """Build a `GeneIndex` over `records`.

    Args:
        records: Gene table with the columns `GeneId`, `GeneSymbol`,
            `GeneSynonyms`, `GeneSystematicName`, `GeneSecondaryIds`,
            `GeneCrossReferences` and `Taxon`. The multi-valued columns are
            split on '|'.

    Returns:
        An index whose tables map every identifier to the positions of the
        rows carrying it; `records` itself is stored, not copied.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    primary: dict[str, list[int]] = {}
    for row, gene_id in enumerate(records["GeneId"].tolist()):
        # Skip missing IDs like every other column does: a NaN key would make
        # `_fold` fail, since it calls `casefold()` on every key.
        if isinstance(gene_id, str):
            primary.setdefault(gene_id, []).append(row)

    official: dict[str, list[int]] = {}
    for row, symbol in enumerate(records["GeneSymbol"].tolist()):
        if isinstance(symbol, str):
            official.setdefault(symbol, []).append(row)

    # Systematic names share the synonym table and therefore the SYNONYM rank.
    synonym: dict[str, list[int]] = {}
    for column in ("GeneSynonyms", "GeneSystematicName"):
        _index_tokens(records[column].tolist(), synonym)

    secondary: dict[str, list[int]] = {}
    _index_tokens(records["GeneSecondaryIds"].tolist(), secondary)

    cross_reference: dict[str, list[int]] = {}
    _index_tokens(
        records["GeneCrossReferences"].tolist(),
        cross_reference,
        _XREF_EXCLUDED_PREFIXES,
    )

    exact: _Tables = {
        MatchKind.PRIMARY_ID: primary,
        MatchKind.SECONDARY_ID: secondary,
        MatchKind.OFFICIAL_SYMBOL: official,
        MatchKind.SYNONYM: synonym,
        MatchKind.CROSS_REFERENCE: cross_reference,
    }
    return GeneIndex(records, tuple(records["Taxon"].tolist()), exact)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def load_gene_index(path: Path) -> GeneIndex:
    """Read the gzip-compressed gene TSV at `path` with every column as text and index it."""
    gene_table = load_tsv_gz(path, dtype=str)
    return build_gene_index(gene_table)
|
geneharmony/store.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Atomic Parquet persistence for cached frames.
|
|
2
|
+
|
|
3
|
+
Backs the bulk, per-gene API, and external-annotation caches. Writes go through a
|
|
4
|
+
same-directory temp file then `os.replace`, so a reader never sees a half-written
|
|
5
|
+
file. Object columns holding dicts/lists are JSON-encoded to strings before
|
|
6
|
+
writing (Parquet can't represent them natively); `read_parquet(decode_json=...)`
|
|
7
|
+
reverses that for the columns a caller knows were nested.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import tempfile
|
|
13
|
+
from collections.abc import Callable, Iterable
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
import pandas as pd
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _is_missing(value: Any) -> bool:
|
|
21
|
+
return value is None or (isinstance(value, float) and pd.isna(value))
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _encode_nested(df: pd.DataFrame) -> pd.DataFrame:
    """JSON-encode the columns that hold dicts or lists.

    A column counts as nested when its first non-missing value is a dict or a
    list. Such columns are replaced, in a copy of the frame, by their
    `json.dumps` strings, with missing cells becoming `None`. When no column
    qualifies, the input frame itself is returned unchanged.
    """

    def first_present(column: Any) -> Any:
        # First value of the column that is not missing, or None if there is none.
        for item in df[column]:
            if not _is_missing(item):
                return item
        return None

    targets = [name for name in df.columns if isinstance(first_present(name), (dict, list))]
    if targets:
        encoded = df.copy()
        for name in targets:
            encoded[name] = [
                json.dumps(item) if not _is_missing(item) else None for item in encoded[name]
            ]
        return encoded
    return df
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _atomic_write(path: Path, write_fn: Callable[[Path], None]) -> None:
|
|
39
|
+
fd, tmp_name = tempfile.mkstemp(dir=path.parent, prefix=f".{path.name}.", suffix=".tmp")
|
|
40
|
+
os.close(fd)
|
|
41
|
+
tmp = Path(tmp_name)
|
|
42
|
+
try:
|
|
43
|
+
write_fn(tmp)
|
|
44
|
+
os.replace(tmp, path)
|
|
45
|
+
except BaseException:
|
|
46
|
+
tmp.unlink(missing_ok=True)
|
|
47
|
+
raise
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def write_parquet(df: pd.DataFrame, path: Path) -> None:
    """Store `df` at `path` as zstd-compressed Parquet without the index.

    Missing parent directories are created. Dict/list columns are JSON-encoded
    first, and the file is put in place through `_atomic_write`.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = _encode_nested(df)

    def dump(target: Path) -> None:
        payload.to_parquet(target, compression="zstd", index=False)

    _atomic_write(path, dump)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def read_parquet(path: Path, *, decode_json: Iterable[str] = ()) -> pd.DataFrame:
    """Load a Parquet file, optionally decoding JSON-encoded columns.

    Each column named in `decode_json` that exists in the file has its
    non-missing cells parsed with `json.loads`; missing cells become `None`.
    Names that are not columns of the file are ignored.
    """
    frame = pd.read_parquet(path)
    for name in decode_json:
        if name not in frame.columns:
            continue
        frame[name] = [
            json.loads(cell) if not _is_missing(cell) else None for cell in frame[name]
        ]
    return frame
|
geneharmony/taxa.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
[
|
|
2
|
+
{"id": "NCBITaxon:9606", "species": "Homo sapiens", "common": ["human"]},
|
|
3
|
+
{"id": "NCBITaxon:10090", "species": "Mus musculus", "common": ["mouse"]},
|
|
4
|
+
{"id": "NCBITaxon:10116", "species": "Rattus norvegicus", "common": ["rat"]},
|
|
5
|
+
{"id": "NCBITaxon:7955", "species": "Danio rerio", "common": ["zebrafish"]},
|
|
6
|
+
{"id": "NCBITaxon:7227", "species": "Drosophila melanogaster", "common": ["fly", "fruit fly"]},
|
|
7
|
+
{"id": "NCBITaxon:6239", "species": "Caenorhabditis elegans", "common": ["worm", "roundworm"]},
|
|
8
|
+
{"id": "NCBITaxon:559292", "species": "Saccharomyces cerevisiae S288C", "common": ["yeast", "budding yeast", "saccharomyces cerevisiae"]},
|
|
9
|
+
{"id": "NCBITaxon:8355", "species": "Xenopus laevis", "common": ["african clawed frog"]},
|
|
10
|
+
{"id": "NCBITaxon:8364", "species": "Xenopus tropicalis", "common": ["western clawed frog", "tropical clawed frog"]}
|
|
11
|
+
]
|
geneharmony/taxa.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Species taxon resolution, built from `taxa.json`.
|
|
2
|
+
|
|
3
|
+
One entry per AGR species. `resolve_taxon` maps any alias — canonical
|
|
4
|
+
`NCBITaxon:` ID, bare number, species name or common name — to its `Taxon`
|
|
5
|
+
record; `taxon_mapper` builds a `value -> field` callable for annotating a taxon
|
|
6
|
+
column of a DataFrame.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import enum
|
|
10
|
+
import json
|
|
11
|
+
from collections.abc import Callable
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Final, NamedTuple
|
|
14
|
+
from importlib.resources import files
|
|
15
|
+
|
|
16
|
+
# Packaged data file listing the supported species; located through
# importlib.resources rather than a filesystem path built by hand.
_TAXA_PATH = files("geneharmony").joinpath("taxa.json")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Taxon(NamedTuple):
|
|
20
|
+
"""A resolved species: its canonical NCBITaxon ID, species name and common names."""
|
|
21
|
+
|
|
22
|
+
id: str
|
|
23
|
+
species: str
|
|
24
|
+
common: tuple[str, ...]
|
|
25
|
+
|
|
26
|
+
@property
|
|
27
|
+
def number(self) -> str:
|
|
28
|
+
"""The bare NCBI taxon number, without the `NCBITaxon:` prefix."""
|
|
29
|
+
return self.id.split(":", 1)[1]
|
|
30
|
+
|
|
31
|
+
@property
|
|
32
|
+
def common_name(self) -> str | None:
|
|
33
|
+
"""The primary common name, or None if the species has none."""
|
|
34
|
+
return self.common[0] if self.common else None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class TaxonField(enum.StrEnum):
    """Selects which part of a `Taxon` the function built by `taxon_mapper` returns.

    Each value is the name of a `Taxon` field or property; the mapper reads it
    with `getattr`.
    """

    ID = "id"
    NUMBER = "number"
    SPECIES = "species"
    COMMON_NAME = "common_name"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _load_taxa() -> tuple[Taxon, ...]:
    """Read the bundled `taxa.json` and return one `Taxon` per entry, in file order.

    Each entry must provide the keys `id`, `species` and `common`; the
    `common` list is stored as a tuple.
    """
    # JSON files are UTF-8; name the encoding so decoding does not depend on
    # the platform's default text encoding.
    entries = json.loads(_TAXA_PATH.read_text(encoding="utf-8"))
    return tuple(
        Taxon(entry["id"], entry["species"], tuple(entry["common"]))
        for entry in entries
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Every bundled species, loaded once when the module is imported.
_TAXA: Final[tuple[Taxon, ...]] = _load_taxa()


# Case-folded alias -> Taxon. Each species is reachable through its full ID,
# bare number, species name and every common name. If two species shared an
# alias, the one listed later would silently replace the earlier one.
_TAXON_BY_ALIAS: Final[dict[str, Taxon]] = {
    alias.casefold(): taxon
    for taxon in _TAXA
    for alias in (taxon.id, taxon.number, taxon.species, *taxon.common)
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def resolve_taxon(value: str) -> Taxon:
    """Look up the `Taxon` for an ID, bare number, species name or common name.

    Surrounding whitespace and letter case are ignored. Read `.id`,
    `.species`, `.common_name` or `.number` from the result as needed.

    Raises:
        ValueError: If `value` is not a known alias.
    """
    alias = value.strip().casefold()
    if alias not in _TAXON_BY_ALIAS:
        raise ValueError(f"unknown taxon: {value!r}")
    return _TAXON_BY_ALIAS[alias]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def taxon_mapper(field: TaxonField) -> Callable[[object], str | None]:
    """Create a function that turns a taxon alias into one `Taxon` attribute.

    The function accepts any alias (ID, number, species or common name,
    ignoring case and surrounding whitespace) and returns the attribute
    selected by `field`. Values that are not strings, and aliases that are
    not known, give `None`.

    Typical use: `df[col].map(taxon_mapper(TaxonField.COMMON_NAME))`.
    """

    def mapper(value: object) -> str | None:
        if isinstance(value, str):
            hit = _TAXON_BY_ALIAS.get(value.strip().casefold())
            if hit is not None:
                return getattr(hit, field)
        return None

    return mapper
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: geneharmony
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Async toolkit to normalize gene identifiers and annotate gene sets with data from the Alliance of Genome Resources (AGR) or user-ingested datasets.
|
|
5
|
+
Project-URL: Homepage, https://github.com/limenode/geneharmony
|
|
6
|
+
Project-URL: Repository, https://github.com/limenode/geneharmony
|
|
7
|
+
Project-URL: Issues, https://github.com/limenode/geneharmony/issues
|
|
8
|
+
Author-email: Lionel Sequeira <lionelsequeira@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: agr,alliance of genome resources,annotation,bioinformatics,gene,genomics
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
|
+
Classifier: Typing :: Typed
|
|
21
|
+
Requires-Python: >=3.12
|
|
22
|
+
Requires-Dist: httpx>=0.28
|
|
23
|
+
Requires-Dist: pandas>=3
|
|
24
|
+
Requires-Dist: pyarrow>=14
|
|
25
|
+
Requires-Dist: pydantic>=2
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# geneharmony
|
|
29
|
+
|
|
30
|
+
An async Python toolkit that normalizes gene identifiers and annotates gene sets using the [Alliance of Genome Resources](https://www.alliancegenome.org) (AGR) REST API and bulk-download files, with functionality to append local annotations.
|
|
31
|
+
|
|
32
|
+
It resolves gene symbols and identifiers to canonical genes using an in-memory index built from AGR's bulk gene file, fetches per-gene API data concurrently, and downloads and parses AGR bulk datasets.
|
|
33
|
+
|
|
34
|
+
## Highlights
|
|
35
|
+
|
|
36
|
+
- **Gene normalization**: Resolve symbols, primary/secondary IDs, synonyms, systematic names, and external cross-references (NCBI, Ensembl, UniProtKB, RefSeq, …) to the matching records in AGR's `GENE-TSV-COMBINED` file.
|
|
37
|
+
- **Nine model organisms**: Human, mouse, rat, zebrafish, fly, worm, yeast, African clawed frog, and western clawed frog.
|
|
38
|
+
- **Concurrent and resilient**: Pooled, rate-limited HTTP with automatic retry/backoff for transient failures.
|
|
39
|
+
- **Transparent caching**: Bulk files, per-gene API results, and ingested annotations are cached as Parquet to expedite repeat runs.
|
|
40
|
+
- **Bring your own data**: Ingest external annotation tables keyed on whatever gene identifier you have; they normalize to canonical AGR genes and join cleanly.
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
Requires **Python 3.12+**. Install from PyPI with pip or uv — all dependencies (`httpx`, `pydantic` v2, `pandas` 3.x, `pyarrow`) are resolved automatically:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install geneharmony
|
|
48
|
+
# or
|
|
49
|
+
uv add geneharmony
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Development
|
|
53
|
+
|
|
54
|
+
Contributors use **pixi** (conda-forge) for a reproducible environment from the lockfile. End users do not need pixi.
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# 1. Install the environment from the lockfile
|
|
58
|
+
pixi install
|
|
59
|
+
|
|
60
|
+
# 2. Run Python inside the environment
|
|
61
|
+
pixi run python <script>
|
|
62
|
+
|
|
63
|
+
# 3. Or open the interactive driver notebook
|
|
64
|
+
pixi run jupyter lab src/notebook.ipynb
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Notebook outputs are stripped from version control via a git clean filter. The filter config is repo-local, so enable it once per clone:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
git config filter.nbstrip.clean "pixi run jupyter nbconvert --clear-output --to notebook --stdin --stdout --log-level=ERROR"
|
|
71
|
+
git config filter.nbstrip.smudge cat
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Usage
|
|
75
|
+
|
|
76
|
+
The `Annotator` is the single entry point. It is async, so call its methods with `await` (inside a notebook cell, an `async def`, or `asyncio.run(...)`).
|
|
77
|
+
|
|
78
|
+
### Quick start
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from geneharmony import Annotator, AGRDataset
|
|
82
|
+
|
|
83
|
+
ann = Annotator()
|
|
84
|
+
|
|
85
|
+
# Resolve gene symbols to canonical AGR records
|
|
86
|
+
genes = await ann.normalize(["TP53", "BRCA1"], taxon="human")
|
|
87
|
+
|
|
88
|
+
# Annotate genes with phenotypic information
|
|
89
|
+
annotated_genes = await ann.annotate(["Atp7b", "Ttn"], AGRDataset.PHENOTYPES, taxon="mouse")
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Resolving genes (`normalize`)
|
|
93
|
+
|
|
94
|
+
`normalize` accepts an identifier or a list of identifiers and returns one row per match. Unmatched queries are **retained** with a null `match_kind` so misses stay visible.
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
df = await ann.normalize(
|
|
98
|
+
["TP53", "ENSG00000141510", "not_a_gene"],
|
|
99
|
+
taxon="human", # any alias: "human", "9606", "Homo sapiens", "NCBITaxon:9606"
|
|
100
|
+
limit=1, # max matches per query; use None for all
|
|
101
|
+
case_insensitive=False, # case can be meaningful (human TP53 vs mouse Trp53)
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
resolved = df[df.match_kind.notna()] # drop the misses
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Matches are ranked by identifier precedence:
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
PRIMARY_ID > SECONDARY_ID > OFFICIAL_SYMBOL > SYNONYM > CROSS_REFERENCE
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Annotating genes (`annotate`)
|
|
114
|
+
|
|
115
|
+
`annotate` builds a normalized base frame, then **left joins** one or more sources onto the canonical `GeneId`:
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from geneharmony import AGRDataset
|
|
119
|
+
|
|
120
|
+
orth = await ann.annotate(
|
|
121
|
+
["TP53", "BRCA1"],
|
|
122
|
+
AGRDataset.ORTHOLOGY,
|
|
123
|
+
taxon="human",
|
|
124
|
+
)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
For chaining annotate calls, the recommended pattern is an **iterative filter-then-requery traversal** — one AGR dataset per call — so result cardinality stays under your control:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
# 1. Find orthologs of human genes
|
|
131
|
+
orth = await ann.annotate(["TP53", "BRCA1"], AGRDataset.ORTHOLOGY, taxon="human")
|
|
132
|
+
|
|
133
|
+
# 2. Keep the mouse orthologs
|
|
134
|
+
mouse = orth.loc[orth.Gene2SpeciesTaxonID == "NCBITaxon:10090", "Gene2ID"].unique()
|
|
135
|
+
|
|
136
|
+
# 3. Fetch their phenotypes
|
|
137
|
+
pheno = await ann.annotate(list(mouse), AGRDataset.PHENOTYPES, taxon="mouse")
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
#### Available AGR datasets
|
|
141
|
+
|
|
142
|
+
| Dataset | Backend | Key columns contributed |
|
|
143
|
+
| ----------------------- | ------------ | ----------------------------------------------------------- |
|
|
144
|
+
| `AGRDataset.ORTHOLOGY` | Bulk TSV | `Gene2ID`, `Gene2Symbol`, `Gene2SpeciesTaxonID`, … |
|
|
145
|
+
| `AGRDataset.PHENOTYPES` | Per-gene API | `phenotypeStatement`, `references` |
|
|
146
|
+
| `AGRDataset.ALLELES` | Per-gene API | `allele_id`, `symbol`, `alterationType`, `variantType`, … |
|
|
147
|
+
|
|
148
|
+
### Orthologs convenience helper
|
|
149
|
+
|
|
150
|
+
For the common ortholog case there is a shortcut that returns a tidy subset:
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
orthologs = await ann.get_orthologs(
|
|
154
|
+
["TP53", "BRCA1"],
|
|
155
|
+
taxon="human",
|
|
156
|
+
target_taxon="mouse", # optional: filter to one target species
|
|
157
|
+
)
|
|
158
|
+
# -> columns: query, match_kind, Gene2ID, Gene2Symbol, Gene2SpeciesTaxonID
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Downloading bulk datasets (`download`)
|
|
162
|
+
|
|
163
|
+
Bulk datasets are downloaded and converted to Parquet on first use (and cached thereafter). You can pre-fetch one explicitly:
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
path = await ann.download(AGRDataset.ORTHOLOGY)
|
|
167
|
+
# Force a refresh across AGR releases:
|
|
168
|
+
path = await ann.download(AGRDataset.ORTHOLOGY, refresh=True)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### Ingesting your own annotations (`ingest_annotation`)
|
|
172
|
+
|
|
173
|
+
Bring an external table (CSV, TSV, or Parquet file, or a `DataFrame`), normalize its gene identifiers to canonical AGR genes, and store it for joining by name:
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
summary, unmapped = await ann.ingest_annotation(
|
|
177
|
+
"my_expression_table.csv",
|
|
178
|
+
name="expression",
|
|
179
|
+
gene_id_column="symbol", # or a list of columns, tried left-to-right per row
|
|
180
|
+
taxon="human",
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
# Join it alongside an AGR dataset; its columns are prefixed `expression.`
|
|
184
|
+
df = await ann.annotate(["TP53", "BRCA1"], AGRDataset.ORTHOLOGY, "expression", taxon="human")
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
`summary` reports rows in / stored / dropped; `unmapped` holds the rows whose identifiers could not be resolved (with their original columns) so nothing is silently lost.
|
|
188
|
+
|
|
189
|
+
### Supported species
|
|
190
|
+
|
|
191
|
+
| Common name | Species | Taxon ID |
|
|
192
|
+
| --------------------- | -------------------------------- | ------------------ |
|
|
193
|
+
| human | *Homo sapiens* | `NCBITaxon:9606` |
|
|
194
|
+
| mouse | *Mus musculus* | `NCBITaxon:10090` |
|
|
195
|
+
| rat | *Rattus norvegicus* | `NCBITaxon:10116` |
|
|
196
|
+
| zebrafish | *Danio rerio* | `NCBITaxon:7955` |
|
|
197
|
+
| fly / fruit fly | *Drosophila melanogaster* | `NCBITaxon:7227` |
|
|
198
|
+
| worm / roundworm | *Caenorhabditis elegans* | `NCBITaxon:6239` |
|
|
199
|
+
| yeast / budding yeast | *Saccharomyces cerevisiae S288C* | `NCBITaxon:559292` |
|
|
200
|
+
| african clawed frog | *Xenopus laevis* | `NCBITaxon:8355` |
|
|
201
|
+
| western clawed frog | *Xenopus tropicalis* | `NCBITaxon:8364` |
|
|
202
|
+
|
|
203
|
+
Any of the aliases above — common name, full species name, bare number, or `NCBITaxon:` ID — can be passed as `taxon`.
|
|
204
|
+
|
|
205
|
+
## Caching
|
|
206
|
+
|
|
207
|
+
Results are cached so repeat work is fast and largely offline. The cache defaults to `$XDG_CACHE_HOME/geneharmony` (falling back to `~/.cache/geneharmony`); pass a `cache_dir` to `Annotator(...)` to share or relocate it.
|
|
208
|
+
|
|
209
|
+
```
|
|
210
|
+
bulk/<dataset>.parquet # downloaded + converted bulk datasets (incl. the gene index source)
|
|
211
|
+
api/<dataset>/<gene_id>.parquet # per-gene API results
|
|
212
|
+
external/<name>.parquet # ingested annotations
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
The gene index is built from `bulk/gene.parquet`, which becomes stale across AGR releases. Refresh it with `await ann.download(AGRDataset.GENE, refresh=True)` (or by deleting the file).
|
|
216
|
+
|
|
217
|
+
## Acknowledgements & Citation
|
|
218
|
+
|
|
219
|
+
This project is a client for data and services provided by the **Alliance of Genome Resources (AGR)**. It is not affiliated with or endorsed by the Alliance. All gene, ortholog, phenotype, and allele data are sourced from AGR and its member model-organism databases, and remain subject to the Alliance's terms of use.
|
|
220
|
+
|
|
221
|
+
If you use data obtained through this wrapper, please cite the Alliance of Genome Resources:
|
|
222
|
+
|
|
223
|
+
> [Updates to the Alliance of Genome Resources central infrastructure.](https://pubmed.ncbi.nlm.nih.gov/38552170/) 2024. Alliance of Genome Resources Consortium. Genetics. 2024 May 7;227(1):iyae049. doi: 10.1093/genetics/iyae049. PMID: 38552170.
|
|
224
|
+
|
|
225
|
+
Please also consult the [Alliance citation and data-usage guidelines](https://www.alliancegenome.org/cite-us) and acknowledge the underlying model-organism databases (e.g. SGD, WormBase, FlyBase, ZFIN, MGI, RGD, Xenbase) as appropriate.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
geneharmony/__init__.py,sha256=0-sHvuoBiKxIeivfD7iKyrgfeQmBclTQHnnKjQSlXtk,475
|
|
2
|
+
geneharmony/annotator.py,sha256=hXTYESOfC1ADKBsMo5kh-7NLhqPUjrdSfgN5jEpmojw,13459
|
|
3
|
+
geneharmony/client.py,sha256=dG-lm9t5Krv540DEbmpOwJYR6zZbGGyk3aArb_9EDAQ,3523
|
|
4
|
+
geneharmony/datasets.py,sha256=GHWXgNxN3cngUfzOaYxgu4vDzraCtx10gNheDTa3_U4,3839
|
|
5
|
+
geneharmony/downloader.py,sha256=X0OSczbigq6dtEj0LtiLNn3xqHVx5QUh4qvAbOjVg7U,3538
|
|
6
|
+
geneharmony/ingest.py,sha256=-BViPSxGMKCymlLwkCegIVnHckLLxpvbfZ6GrD2oZ7o,757
|
|
7
|
+
geneharmony/models.py,sha256=rGiMCo07EvxV_0GXjwnNE0XUdglsH3yRzPscNaz_b54,326
|
|
8
|
+
geneharmony/normalizer.py,sha256=8fm1gs6GAimBtTum0tFIfCClw0mLpJYeGlfhX6demVo,6600
|
|
9
|
+
geneharmony/store.py,sha256=Ukx0OwPwY0kKkcpUHKBNBdhmhav1gINqTQDYK50Vwy0,1938
|
|
10
|
+
geneharmony/taxa.json,sha256=TiseisPSZw3oj1I2v3y98x2rCtnKhMefBt1PRMuIFkY,864
|
|
11
|
+
geneharmony/taxa.py,sha256=zqUw8Ma62W5yrg6fqAxZNVjxMH9Ca0l_YXXDCmZR89I,2599
|
|
12
|
+
geneharmony-0.3.0.dist-info/METADATA,sha256=5YkxIUwkPmr_tss9XdFR9_OZGXcDWrwuP_eTq8TykgM,10066
|
|
13
|
+
geneharmony-0.3.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
14
|
+
geneharmony-0.3.0.dist-info/licenses/LICENSE,sha256=r0h8rf3XM4VXvFcvxGHMrqJos3l3v654b1Z2Tb5Tk6w,1072
|
|
15
|
+
geneharmony-0.3.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Lionel Sequeira
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|