pyobo 0.10.12__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/__init__.py +0 -2
- pyobo/__main__.py +0 -2
- pyobo/api/__init__.py +0 -2
- pyobo/api/alts.py +6 -7
- pyobo/api/hierarchy.py +14 -15
- pyobo/api/metadata.py +3 -4
- pyobo/api/names.py +31 -32
- pyobo/api/properties.py +6 -7
- pyobo/api/relations.py +12 -11
- pyobo/api/species.py +5 -6
- pyobo/api/typedefs.py +1 -3
- pyobo/api/utils.py +61 -5
- pyobo/api/xrefs.py +4 -5
- pyobo/aws.py +3 -5
- pyobo/cli/__init__.py +0 -2
- pyobo/cli/aws.py +0 -2
- pyobo/cli/cli.py +0 -4
- pyobo/cli/database.py +1 -3
- pyobo/cli/lookup.py +0 -2
- pyobo/cli/utils.py +0 -2
- pyobo/constants.py +1 -33
- pyobo/getters.py +19 -26
- pyobo/gilda_utils.py +19 -17
- pyobo/identifier_utils.py +10 -10
- pyobo/mocks.py +5 -6
- pyobo/normalizer.py +24 -24
- pyobo/obographs.py +8 -5
- pyobo/plugins.py +3 -4
- pyobo/py.typed +0 -0
- pyobo/reader.py +19 -21
- pyobo/registries/__init__.py +0 -2
- pyobo/registries/metaregistry.py +6 -8
- pyobo/resource_utils.py +1 -3
- pyobo/resources/__init__.py +0 -2
- pyobo/resources/ncbitaxon.py +2 -3
- pyobo/resources/ro.py +2 -4
- pyobo/resources/so.py +55 -0
- pyobo/resources/so.tsv +2604 -0
- pyobo/sources/README.md +15 -0
- pyobo/sources/__init__.py +0 -2
- pyobo/sources/agrovoc.py +3 -3
- pyobo/sources/antibodyregistry.py +2 -3
- pyobo/sources/biogrid.py +4 -4
- pyobo/sources/ccle.py +3 -4
- pyobo/sources/cgnc.py +1 -3
- pyobo/sources/chebi.py +2 -4
- pyobo/sources/chembl.py +1 -3
- pyobo/sources/civic_gene.py +2 -3
- pyobo/sources/complexportal.py +57 -20
- pyobo/sources/conso.py +2 -4
- pyobo/sources/cpt.py +1 -3
- pyobo/sources/credit.py +1 -1
- pyobo/sources/cvx.py +1 -3
- pyobo/sources/depmap.py +3 -4
- pyobo/sources/dictybase_gene.py +15 -12
- pyobo/sources/drugbank.py +6 -7
- pyobo/sources/drugbank_salt.py +3 -4
- pyobo/sources/drugcentral.py +9 -8
- pyobo/sources/expasy.py +33 -16
- pyobo/sources/famplex.py +3 -5
- pyobo/sources/flybase.py +5 -6
- pyobo/sources/geonames.py +1 -1
- pyobo/sources/gmt_utils.py +5 -6
- pyobo/sources/go.py +4 -6
- pyobo/sources/gwascentral_phenotype.py +1 -3
- pyobo/sources/gwascentral_study.py +2 -3
- pyobo/sources/hgnc.py +30 -26
- pyobo/sources/hgncgenefamily.py +9 -11
- pyobo/sources/icd10.py +3 -4
- pyobo/sources/icd11.py +3 -4
- pyobo/sources/icd_utils.py +6 -7
- pyobo/sources/interpro.py +3 -5
- pyobo/sources/itis.py +1 -3
- pyobo/sources/kegg/__init__.py +0 -2
- pyobo/sources/kegg/api.py +3 -4
- pyobo/sources/kegg/genes.py +3 -4
- pyobo/sources/kegg/genome.py +19 -9
- pyobo/sources/kegg/pathway.py +5 -6
- pyobo/sources/mesh.py +19 -21
- pyobo/sources/mgi.py +1 -3
- pyobo/sources/mirbase.py +13 -9
- pyobo/sources/mirbase_constants.py +0 -2
- pyobo/sources/mirbase_family.py +1 -3
- pyobo/sources/mirbase_mature.py +1 -3
- pyobo/sources/msigdb.py +4 -5
- pyobo/sources/ncbigene.py +3 -5
- pyobo/sources/npass.py +2 -4
- pyobo/sources/omim_ps.py +1 -3
- pyobo/sources/pathbank.py +35 -28
- pyobo/sources/pfam.py +1 -3
- pyobo/sources/pfam_clan.py +1 -3
- pyobo/sources/pid.py +3 -5
- pyobo/sources/pombase.py +7 -6
- pyobo/sources/pubchem.py +2 -3
- pyobo/sources/reactome.py +30 -11
- pyobo/sources/rgd.py +3 -4
- pyobo/sources/rhea.py +7 -8
- pyobo/sources/ror.py +3 -2
- pyobo/sources/selventa/__init__.py +0 -2
- pyobo/sources/selventa/schem.py +1 -3
- pyobo/sources/selventa/scomp.py +1 -3
- pyobo/sources/selventa/sdis.py +1 -3
- pyobo/sources/selventa/sfam.py +1 -3
- pyobo/sources/sgd.py +1 -3
- pyobo/sources/slm.py +29 -17
- pyobo/sources/umls/__init__.py +0 -2
- pyobo/sources/umls/__main__.py +0 -2
- pyobo/sources/umls/get_synonym_types.py +1 -1
- pyobo/sources/umls/umls.py +2 -4
- pyobo/sources/uniprot/__init__.py +0 -2
- pyobo/sources/uniprot/uniprot.py +11 -10
- pyobo/sources/uniprot/uniprot_ptm.py +6 -5
- pyobo/sources/utils.py +3 -5
- pyobo/sources/wikipathways.py +1 -3
- pyobo/sources/zfin.py +20 -9
- pyobo/ssg/__init__.py +3 -2
- pyobo/struct/__init__.py +0 -2
- pyobo/struct/reference.py +22 -23
- pyobo/struct/struct.py +132 -116
- pyobo/struct/typedef.py +14 -10
- pyobo/struct/utils.py +0 -2
- pyobo/utils/__init__.py +0 -2
- pyobo/utils/cache.py +14 -6
- pyobo/utils/io.py +9 -10
- pyobo/utils/iter.py +5 -6
- pyobo/utils/misc.py +1 -3
- pyobo/utils/ndex_utils.py +6 -7
- pyobo/utils/path.py +4 -5
- pyobo/version.py +3 -5
- pyobo/xrefdb/__init__.py +0 -2
- pyobo/xrefdb/canonicalizer.py +27 -18
- pyobo/xrefdb/priority.py +0 -2
- pyobo/xrefdb/sources/__init__.py +3 -4
- pyobo/xrefdb/sources/biomappings.py +0 -2
- pyobo/xrefdb/sources/cbms2019.py +0 -2
- pyobo/xrefdb/sources/chembl.py +0 -2
- pyobo/xrefdb/sources/compath.py +1 -3
- pyobo/xrefdb/sources/famplex.py +3 -5
- pyobo/xrefdb/sources/gilda.py +0 -2
- pyobo/xrefdb/sources/intact.py +5 -5
- pyobo/xrefdb/sources/ncit.py +1 -3
- pyobo/xrefdb/sources/pubchem.py +2 -5
- pyobo/xrefdb/sources/wikidata.py +2 -4
- pyobo/xrefdb/xrefs_pipeline.py +15 -16
- {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/LICENSE +1 -1
- pyobo-0.11.1.dist-info/METADATA +711 -0
- pyobo-0.11.1.dist-info/RECORD +173 -0
- {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/WHEEL +1 -1
- pyobo-0.11.1.dist-info/entry_points.txt +2 -0
- pyobo-0.10.12.dist-info/METADATA +0 -499
- pyobo-0.10.12.dist-info/RECORD +0 -169
- pyobo-0.10.12.dist-info/entry_points.txt +0 -15
- {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/top_level.txt +0 -0
pyobo/sources/famplex.py
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for FamPlex."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
4
|
from collections import defaultdict
|
|
7
|
-
from
|
|
5
|
+
from collections.abc import Iterable, Mapping
|
|
8
6
|
|
|
9
7
|
import bioregistry
|
|
10
8
|
from pystow.utils import get_commit
|
|
@@ -62,7 +60,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
62
60
|
dtype=str,
|
|
63
61
|
force=force,
|
|
64
62
|
)
|
|
65
|
-
id_to_definition: Mapping[str,
|
|
63
|
+
id_to_definition: Mapping[str, tuple[str, str]] = {
|
|
66
64
|
identifier: (definition, provenance)
|
|
67
65
|
for identifier, provenance, definition in definitions_df.values
|
|
68
66
|
}
|
|
@@ -140,7 +138,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
140
138
|
yield term
|
|
141
139
|
|
|
142
140
|
|
|
143
|
-
def _get_xref_df(version: str) -> Mapping[str,
|
|
141
|
+
def _get_xref_df(version: str) -> Mapping[str, list[Reference]]:
|
|
144
142
|
base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"
|
|
145
143
|
xrefs_url = f"{base_url}/equivalences.csv"
|
|
146
144
|
xrefs_df = ensure_df(PREFIX, url=xrefs_url, version=version, header=None, sep=",", dtype=str)
|
pyobo/sources/flybase.py
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for FlyBase Genes."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
|
-
from
|
|
4
|
+
from collections.abc import Iterable, Mapping
|
|
7
5
|
|
|
8
6
|
import pandas as pd
|
|
9
7
|
from tqdm.auto import tqdm
|
|
10
8
|
|
|
11
9
|
from pyobo import Reference
|
|
10
|
+
from pyobo.resources.so import get_so_name
|
|
12
11
|
from pyobo.struct import Obo, Term, from_species, orthologous
|
|
13
12
|
from pyobo.utils.io import multisetdict
|
|
14
13
|
from pyobo.utils.path import ensure_df
|
|
@@ -68,7 +67,7 @@ def _get_definitions(version: str, force: bool = False) -> Mapping[str, str]:
|
|
|
68
67
|
return dict(df.values)
|
|
69
68
|
|
|
70
69
|
|
|
71
|
-
def _get_human_orthologs(version: str, force: bool = False) -> Mapping[str,
|
|
70
|
+
def _get_human_orthologs(version: str, force: bool = False) -> Mapping[str, set[str]]:
|
|
72
71
|
url = (
|
|
73
72
|
f"http://ftp.flybase.net/releases/FB{version}/precomputed_files/"
|
|
74
73
|
f"orthologs/dmel_human_orthologs_disease_fb_{version}.tsv.gz"
|
|
@@ -135,7 +134,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
135
134
|
"FlyBase gene type is missing mapping to Sequence Ontology (SO): %s", gtype
|
|
136
135
|
)
|
|
137
136
|
else:
|
|
138
|
-
so[gtype] = Reference
|
|
137
|
+
so[gtype] = Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id))
|
|
139
138
|
|
|
140
139
|
for _, reference in sorted(so.items()):
|
|
141
140
|
yield Term(reference=reference)
|
|
@@ -155,7 +154,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
155
154
|
for hgnc_curie in human_orthologs.get(identifier, []):
|
|
156
155
|
if not hgnc_curie or pd.isna(hgnc_curie):
|
|
157
156
|
continue
|
|
158
|
-
hgnc_ortholog = Reference.from_curie(hgnc_curie
|
|
157
|
+
hgnc_ortholog = Reference.from_curie(hgnc_curie)
|
|
159
158
|
if hgnc_ortholog is None:
|
|
160
159
|
tqdm.write(f"[{PREFIX}] {identifier} had invalid ortholog: {hgnc_curie}")
|
|
161
160
|
else:
|
pyobo/sources/geonames.py
CHANGED
pyobo/sources/gmt_utils.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""GMT utilities."""
|
|
4
2
|
|
|
3
|
+
from collections.abc import Iterable
|
|
5
4
|
from pathlib import Path
|
|
6
|
-
from typing import
|
|
5
|
+
from typing import Union
|
|
7
6
|
|
|
8
|
-
GMTSummary =
|
|
9
|
-
WikiPathwaysGMTSummary =
|
|
7
|
+
GMTSummary = tuple[str, str, set[str]]
|
|
8
|
+
WikiPathwaysGMTSummary = tuple[str, str, str, str, str, set[str]]
|
|
10
9
|
|
|
11
10
|
|
|
12
11
|
def parse_gmt_file(path: Union[str, Path]) -> Iterable[GMTSummary]:
|
|
@@ -20,7 +19,7 @@ def parse_gmt_file(path: Union[str, Path]) -> Iterable[GMTSummary]:
|
|
|
20
19
|
yield _process_line(line)
|
|
21
20
|
|
|
22
21
|
|
|
23
|
-
def _process_line(line: str) ->
|
|
22
|
+
def _process_line(line: str) -> tuple[str, str, set[str]]:
|
|
24
23
|
"""Return the pathway name, url, and gene sets associated.
|
|
25
24
|
|
|
26
25
|
:param line: gmt file line
|
pyobo/sources/go.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Gene Ontology."""
|
|
4
2
|
|
|
5
3
|
from pyobo import get_descendants
|
|
@@ -14,13 +12,13 @@ __all__ = [
|
|
|
14
12
|
def is_biological_process(identifier: str) -> bool:
|
|
15
13
|
"""Return if the given GO identifier is a biological process.
|
|
16
14
|
|
|
17
|
-
>>> is_biological_process(
|
|
15
|
+
>>> is_biological_process("0006915")
|
|
18
16
|
True
|
|
19
|
-
>>> is_biological_process(
|
|
17
|
+
>>> is_biological_process("GO:0006915")
|
|
20
18
|
True
|
|
21
|
-
>>> is_molecular_function(
|
|
19
|
+
>>> is_molecular_function("0006915")
|
|
22
20
|
False
|
|
23
|
-
>>> is_cellular_component(
|
|
21
|
+
>>> is_cellular_component("0006915")
|
|
24
22
|
False
|
|
25
23
|
"""
|
|
26
24
|
return _is_descendant(identifier, "0008150")
|
|
@@ -1,10 +1,9 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for GWAS Central."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
4
|
import tarfile
|
|
7
|
-
from
|
|
5
|
+
from collections.abc import Iterable
|
|
6
|
+
from typing import Optional
|
|
8
7
|
from xml.etree import ElementTree
|
|
9
8
|
|
|
10
9
|
from pyobo.struct import Obo, Reference, Term, has_part
|
pyobo/sources/hgnc.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for HGNC."""
|
|
4
2
|
|
|
5
3
|
import itertools as itt
|
|
@@ -7,13 +5,15 @@ import json
|
|
|
7
5
|
import logging
|
|
8
6
|
import typing
|
|
9
7
|
from collections import Counter, defaultdict
|
|
8
|
+
from collections.abc import Iterable
|
|
10
9
|
from operator import attrgetter
|
|
11
|
-
from typing import
|
|
10
|
+
from typing import Optional
|
|
12
11
|
|
|
13
12
|
from tabulate import tabulate
|
|
14
13
|
from tqdm.auto import tqdm
|
|
15
14
|
|
|
16
15
|
from pyobo.api.utils import get_version
|
|
16
|
+
from pyobo.resources.so import get_so_name
|
|
17
17
|
from pyobo.struct import (
|
|
18
18
|
Obo,
|
|
19
19
|
Reference,
|
|
@@ -38,8 +38,8 @@ logger = logging.getLogger(__name__)
|
|
|
38
38
|
|
|
39
39
|
PREFIX = "hgnc"
|
|
40
40
|
DEFINITIONS_URL_FMT = (
|
|
41
|
-
"
|
|
42
|
-
"
|
|
41
|
+
"https://storage.googleapis.com/public-download-files/hgnc/archive/archive/monthly/json/"
|
|
42
|
+
"hgnc_complete_set_{version}.json"
|
|
43
43
|
)
|
|
44
44
|
|
|
45
45
|
previous_symbol_type = SynonymTypeDef.from_text("previous_symbol")
|
|
@@ -223,7 +223,7 @@ class HGNCGetter(Obo):
|
|
|
223
223
|
alias_symbol_type,
|
|
224
224
|
]
|
|
225
225
|
root_terms = [
|
|
226
|
-
Reference(prefix="
|
|
226
|
+
Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id))
|
|
227
227
|
for so_id in sorted(set(LOCUS_TYPE_TO_SO.values()))
|
|
228
228
|
if so_id
|
|
229
229
|
]
|
|
@@ -238,12 +238,12 @@ def get_obo(*, force: bool = False) -> Obo:
|
|
|
238
238
|
return HGNCGetter(force=force)
|
|
239
239
|
|
|
240
240
|
|
|
241
|
-
def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:
|
|
241
|
+
def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:
|
|
242
242
|
"""Get HGNC terms."""
|
|
243
243
|
if version is None:
|
|
244
244
|
version = get_version("hgnc")
|
|
245
245
|
unhandled_entry_keys: typing.Counter[str] = Counter()
|
|
246
|
-
unhandle_locus_types:
|
|
246
|
+
unhandle_locus_types: defaultdict[str, dict[str, Term]] = defaultdict(dict)
|
|
247
247
|
path = ensure_path(
|
|
248
248
|
PREFIX,
|
|
249
249
|
url=DEFINITIONS_URL_FMT.format(version=version),
|
|
@@ -257,7 +257,7 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
|
|
|
257
257
|
yield Term.from_triple("NCBITaxon", "9606", "Homo sapiens")
|
|
258
258
|
yield from sorted(
|
|
259
259
|
{
|
|
260
|
-
Term(reference=Reference
|
|
260
|
+
Term(reference=Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id)))
|
|
261
261
|
for so_id in sorted(LOCUS_TYPE_TO_SO.values())
|
|
262
262
|
if so_id
|
|
263
263
|
},
|
|
@@ -364,23 +364,25 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
|
|
|
364
364
|
xref_identifiers = entry.pop(key, None)
|
|
365
365
|
if xref_identifiers is None:
|
|
366
366
|
continue
|
|
367
|
-
|
|
368
367
|
if isinstance(xref_identifiers, (str, int)):
|
|
368
|
+
xref_identifiers = [str(xref_identifiers)]
|
|
369
|
+
|
|
370
|
+
if xref_prefix == "merops.entry":
|
|
371
|
+
continue
|
|
372
|
+
# e.g., XM02-001 should be rewritten as XM02.001
|
|
373
|
+
xref_identifiers = [i.replace("-", ".") for i in xref_identifiers]
|
|
374
|
+
|
|
375
|
+
if xref_prefix == "refseq":
|
|
376
|
+
# e.g., strip off dots without substantiated record versions like in NM_021728.
|
|
377
|
+
xref_identifiers = [i.strip(".") for i in xref_identifiers]
|
|
378
|
+
|
|
379
|
+
if len(xref_identifiers) == 1:
|
|
369
380
|
term.append_exact_match(
|
|
370
|
-
Reference(prefix=xref_prefix, identifier=str(xref_identifiers))
|
|
381
|
+
Reference(prefix=xref_prefix, identifier=str(xref_identifiers[0]))
|
|
371
382
|
)
|
|
372
|
-
elif isinstance(xref_identifiers, list):
|
|
373
|
-
if len(xref_identifiers) == 1:
|
|
374
|
-
term.append_exact_match(
|
|
375
|
-
Reference(prefix=xref_prefix, identifier=str(xref_identifiers[0]))
|
|
376
|
-
)
|
|
377
|
-
else:
|
|
378
|
-
for xref_identifier in xref_identifiers:
|
|
379
|
-
term.append_xref(
|
|
380
|
-
Reference(prefix=xref_prefix, identifier=str(xref_identifier))
|
|
381
|
-
)
|
|
382
383
|
else:
|
|
383
|
-
|
|
384
|
+
for xref_identifier in xref_identifiers:
|
|
385
|
+
term.append_xref(Reference(prefix=xref_prefix, identifier=str(xref_identifier)))
|
|
384
386
|
|
|
385
387
|
for pubmed_id in entry.pop("pubmed_id", []):
|
|
386
388
|
term.append_provenance(Reference(prefix="pubmed", identifier=str(pubmed_id)))
|
|
@@ -417,9 +419,11 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
|
|
|
417
419
|
locus_group = entry.pop("locus_group")
|
|
418
420
|
so_id = LOCUS_TYPE_TO_SO.get(locus_type)
|
|
419
421
|
if so_id:
|
|
420
|
-
term.append_parent(Reference
|
|
422
|
+
term.append_parent(Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id)))
|
|
421
423
|
else:
|
|
422
|
-
term.append_parent(
|
|
424
|
+
term.append_parent(
|
|
425
|
+
Reference(prefix="SO", identifier="0000704", name=get_so_name("0000704"))
|
|
426
|
+
) # gene
|
|
423
427
|
unhandle_locus_types[locus_type][identifier] = term
|
|
424
428
|
term.append_property("locus_type", locus_type)
|
|
425
429
|
term.append_property("locus_group", locus_group)
|
|
@@ -459,8 +463,8 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
|
|
|
459
463
|
headers=["hgnc_id", "name", "obsolete", "link", "provenance"],
|
|
460
464
|
tablefmt="github",
|
|
461
465
|
)
|
|
462
|
-
print(f"## {k} ({len(v)})", file=file)
|
|
463
|
-
print(t, "\n", file=file)
|
|
466
|
+
print(f"## {k} ({len(v)})", file=file)
|
|
467
|
+
print(t, "\n", file=file)
|
|
464
468
|
|
|
465
469
|
unhandle_locus_type_counter = Counter(
|
|
466
470
|
{locus_type: len(d) for locus_type, d in unhandle_locus_types.items()}
|
pyobo/sources/hgncgenefamily.py
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for HGNC Gene Families."""
|
|
4
2
|
|
|
5
3
|
from collections import defaultdict
|
|
6
|
-
from
|
|
4
|
+
from collections.abc import Iterable, Mapping
|
|
7
5
|
|
|
8
6
|
import pandas as pd
|
|
9
7
|
|
|
@@ -23,13 +21,13 @@ __all__ = [
|
|
|
23
21
|
]
|
|
24
22
|
|
|
25
23
|
PREFIX = "hgnc.genegroup"
|
|
26
|
-
FAMILIES_URL = "
|
|
24
|
+
FAMILIES_URL = "https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/family.csv"
|
|
27
25
|
# TODO use family_alias.csv
|
|
28
|
-
HIERARCHY_URL =
|
|
29
|
-
"ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/csv/genefamily_db_tables/hierarchy.csv"
|
|
30
|
-
)
|
|
26
|
+
HIERARCHY_URL = "https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/hierarchy.csv"
|
|
31
27
|
|
|
32
|
-
symbol_type = SynonymTypeDef
|
|
28
|
+
symbol_type = SynonymTypeDef(
|
|
29
|
+
reference=Reference(prefix="OMO", identifier="0004000", name="has symbol")
|
|
30
|
+
)
|
|
33
31
|
|
|
34
32
|
|
|
35
33
|
class HGNCGroupGetter(Obo):
|
|
@@ -50,7 +48,7 @@ def get_obo(force: bool = False) -> Obo:
|
|
|
50
48
|
return HGNCGroupGetter(force=force)
|
|
51
49
|
|
|
52
50
|
|
|
53
|
-
def get_hierarchy(force: bool = False) -> Mapping[str,
|
|
51
|
+
def get_hierarchy(force: bool = False) -> Mapping[str, list[str]]:
|
|
54
52
|
"""Get the HGNC Gene Families hierarchy as a dictionary."""
|
|
55
53
|
path = ensure_path(PREFIX, url=HIERARCHY_URL, force=force)
|
|
56
54
|
df = pd.read_csv(path, dtype={"parent_fam_id": str, "child_fam_id": str})
|
|
@@ -80,7 +78,7 @@ def get_terms(force: bool = False) -> Iterable[Term]:
|
|
|
80
78
|
name=parent.name,
|
|
81
79
|
)
|
|
82
80
|
)
|
|
83
|
-
gene_group = Reference
|
|
81
|
+
gene_group = Reference(prefix="SO", identifier="0005855", name="gene group")
|
|
84
82
|
yield Term(reference=gene_group)
|
|
85
83
|
for term in terms:
|
|
86
84
|
if not term.parents:
|
|
@@ -100,7 +98,7 @@ def _get_terms_helper(force: bool = False) -> Iterable[Term]:
|
|
|
100
98
|
definition=definition,
|
|
101
99
|
)
|
|
102
100
|
if pubmed_ids and pd.notna(pubmed_ids):
|
|
103
|
-
for s in pubmed_ids.split(","):
|
|
101
|
+
for s in pubmed_ids.replace(" ", ",").split(","):
|
|
104
102
|
term.append_provenance(Reference(prefix="pubmed", identifier=s.strip()))
|
|
105
103
|
if desc_go and pd.notna(desc_go):
|
|
106
104
|
go_id = desc_go[len("http://purl.uniprot.org/go/") :]
|
pyobo/sources/icd10.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Convert ICD-10 to OBO.
|
|
4
2
|
|
|
5
3
|
Run with python -m pyobo.sources.icd10 -v
|
|
6
4
|
"""
|
|
7
5
|
|
|
8
6
|
import logging
|
|
9
|
-
from
|
|
7
|
+
from collections.abc import Iterable, Mapping
|
|
8
|
+
from typing import Any
|
|
10
9
|
|
|
11
10
|
import click
|
|
12
11
|
from more_click import verbose_option
|
|
@@ -57,7 +56,7 @@ def iter_terms() -> Iterable[Term]:
|
|
|
57
56
|
chapter_urls = res_json["child"]
|
|
58
57
|
tqdm.write(f"there are {len(chapter_urls)} chapters")
|
|
59
58
|
|
|
60
|
-
visited_identifiers:
|
|
59
|
+
visited_identifiers: set[str] = set()
|
|
61
60
|
for identifier in get_child_identifiers(ICD10_TOP_LEVEL_URL, res_json):
|
|
62
61
|
yield from visiter(
|
|
63
62
|
identifier,
|
pyobo/sources/icd11.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Convert ICD11 to OBO.
|
|
4
2
|
|
|
5
3
|
Run with python -m pyobo.sources.icd11 -v
|
|
@@ -8,7 +6,8 @@ Run with python -m pyobo.sources.icd11 -v
|
|
|
8
6
|
import json
|
|
9
7
|
import logging
|
|
10
8
|
import os
|
|
11
|
-
from
|
|
9
|
+
from collections.abc import Iterable, Mapping
|
|
10
|
+
from typing import Any
|
|
12
11
|
|
|
13
12
|
import click
|
|
14
13
|
from more_click import verbose_option
|
|
@@ -67,7 +66,7 @@ def iterate_icd11() -> Iterable[Term]:
|
|
|
67
66
|
|
|
68
67
|
tqdm.write(f'There are {len(res_json["child"])} top level entities')
|
|
69
68
|
|
|
70
|
-
visited_identifiers:
|
|
69
|
+
visited_identifiers: set[str] = set()
|
|
71
70
|
for identifier in get_child_identifiers(ICD11_TOP_LEVEL_URL, res_json):
|
|
72
71
|
yield from visiter(
|
|
73
72
|
identifier,
|
pyobo/sources/icd_utils.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Utilities or interacting with the ICD API.
|
|
4
2
|
|
|
5
3
|
Want to get your own API client ID and client secret?
|
|
@@ -11,8 +9,9 @@ Want to get your own API client ID and client secret?
|
|
|
11
9
|
import datetime
|
|
12
10
|
import json
|
|
13
11
|
import os
|
|
12
|
+
from collections.abc import Iterable, Mapping
|
|
14
13
|
from pathlib import Path
|
|
15
|
-
from typing import Any, Callable,
|
|
14
|
+
from typing import Any, Callable, Union
|
|
16
15
|
|
|
17
16
|
import pystow
|
|
18
17
|
import requests
|
|
@@ -20,7 +19,7 @@ from cachier import cachier
|
|
|
20
19
|
from pystow.config_api import ConfigError
|
|
21
20
|
from tqdm.auto import tqdm
|
|
22
21
|
|
|
23
|
-
from ..getters import
|
|
22
|
+
from ..getters import NoBuildError
|
|
24
23
|
from ..struct import Term
|
|
25
24
|
|
|
26
25
|
TOKEN_URL = "https://icdaccessmanagement.who.int/connect/token" # noqa:S105
|
|
@@ -43,7 +42,7 @@ def _get_entity(endpoint: str, identifier: str):
|
|
|
43
42
|
return res.json()
|
|
44
43
|
|
|
45
44
|
|
|
46
|
-
def get_child_identifiers(endpoint: str, res_json: Mapping[str, Any]) ->
|
|
45
|
+
def get_child_identifiers(endpoint: str, res_json: Mapping[str, Any]) -> list[str]:
|
|
47
46
|
"""Ge the child identifiers."""
|
|
48
47
|
return [url[len(endpoint) :].lstrip("/") for url in res_json.get("child", [])]
|
|
49
48
|
|
|
@@ -55,7 +54,7 @@ def get_icd_api_headers() -> Mapping[str, str]:
|
|
|
55
54
|
icd_client_id = pystow.get_config("pyobo", "icd_client_id", raise_on_missing=True)
|
|
56
55
|
icd_client_secret = pystow.get_config("pyobo", "icd_client_secret", raise_on_missing=True)
|
|
57
56
|
except ConfigError as e:
|
|
58
|
-
raise
|
|
57
|
+
raise NoBuildError from e
|
|
59
58
|
|
|
60
59
|
grant_type = "client_credentials"
|
|
61
60
|
body_params = {"grant_type": grant_type}
|
|
@@ -73,7 +72,7 @@ def get_icd_api_headers() -> Mapping[str, str]:
|
|
|
73
72
|
|
|
74
73
|
def visiter(
|
|
75
74
|
identifier: str,
|
|
76
|
-
visited_identifiers:
|
|
75
|
+
visited_identifiers: set[str],
|
|
77
76
|
directory: Union[str, Path],
|
|
78
77
|
*,
|
|
79
78
|
endpoint: str,
|
pyobo/sources/interpro.py
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for InterPro."""
|
|
4
2
|
|
|
5
3
|
from collections import defaultdict
|
|
6
|
-
from
|
|
4
|
+
from collections.abc import Iterable, Mapping
|
|
7
5
|
|
|
8
6
|
from .utils import get_go_mapping
|
|
9
7
|
from ..struct import Obo, Reference, Term
|
|
@@ -82,7 +80,7 @@ def iter_terms(*, version: str, proteins: bool = False, force: bool = False) ->
|
|
|
82
80
|
yield term
|
|
83
81
|
|
|
84
82
|
|
|
85
|
-
def get_interpro_go_df(version: str, force: bool = False) -> Mapping[str,
|
|
83
|
+
def get_interpro_go_df(version: str, force: bool = False) -> Mapping[str, set[tuple[str, str]]]:
|
|
86
84
|
"""Get InterPro to Gene Ontology molecular function mapping."""
|
|
87
85
|
url = f"https://ftp.ebi.ac.uk/pub/databases/interpro/releases/{version}/interpro2go"
|
|
88
86
|
path = ensure_path(PREFIX, url=url, name="interpro2go.tsv", version=version, force=force)
|
|
@@ -98,7 +96,7 @@ def get_interpro_tree(version: str, force: bool = False):
|
|
|
98
96
|
|
|
99
97
|
|
|
100
98
|
def _parse_tree_helper(lines: Iterable[str]):
|
|
101
|
-
rv1:
|
|
99
|
+
rv1: defaultdict[str, list[str]] = defaultdict(list)
|
|
102
100
|
previous_depth, previous_id = 0, ""
|
|
103
101
|
stack = [previous_id]
|
|
104
102
|
|
pyobo/sources/itis.py
CHANGED
|
@@ -1,13 +1,11 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for the Integrated Taxonomic Information System (ITIS)."""
|
|
4
2
|
|
|
5
3
|
import os
|
|
6
4
|
import shutil
|
|
7
5
|
import sqlite3
|
|
8
6
|
import zipfile
|
|
7
|
+
from collections.abc import Iterable
|
|
9
8
|
from contextlib import closing
|
|
10
|
-
from typing import Iterable
|
|
11
9
|
|
|
12
10
|
from pyobo.struct import Obo, Reference, Term
|
|
13
11
|
from pyobo.utils.io import multidict
|
pyobo/sources/kegg/__init__.py
CHANGED
pyobo/sources/kegg/api.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""API utilities for KEGG."""
|
|
4
2
|
|
|
5
3
|
import urllib.error
|
|
4
|
+
from collections.abc import Mapping
|
|
6
5
|
from dataclasses import dataclass
|
|
7
|
-
from typing import
|
|
6
|
+
from typing import Optional
|
|
8
7
|
|
|
9
8
|
from pyobo import Reference, Term, ensure_path
|
|
10
9
|
from pyobo.struct import from_species
|
|
@@ -132,7 +131,7 @@ def _ensure_conv_genome_helper(
|
|
|
132
131
|
version=version,
|
|
133
132
|
)
|
|
134
133
|
with path_rv.open("w") as file:
|
|
135
|
-
print(file=file)
|
|
134
|
+
print(file=file)
|
|
136
135
|
return path_rv.as_posix()
|
|
137
136
|
except FileNotFoundError:
|
|
138
137
|
return None
|
pyobo/sources/kegg/genes.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Convert KEGG Genes to OBO.
|
|
4
2
|
|
|
5
3
|
Run with ``python -m pyobo.sources.kegg.genes``
|
|
6
4
|
"""
|
|
7
5
|
|
|
8
6
|
import logging
|
|
9
|
-
from
|
|
7
|
+
from collections.abc import Iterable
|
|
8
|
+
from typing import Optional
|
|
10
9
|
|
|
11
10
|
import click
|
|
12
11
|
from more_click import verbose_option
|
|
@@ -90,7 +89,7 @@ def _make_terms(
|
|
|
90
89
|
)
|
|
91
90
|
continue
|
|
92
91
|
if ";" in line:
|
|
93
|
-
*_extras, name =
|
|
92
|
+
*_extras, name = (part.strip() for part in extras.split(";"))
|
|
94
93
|
else:
|
|
95
94
|
name = extras
|
|
96
95
|
|
pyobo/sources/kegg/genome.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Convert KEGG Genome to OBO.
|
|
4
2
|
|
|
5
3
|
Run with ``python -m pyobo.sources.kegg.genome``
|
|
6
4
|
"""
|
|
7
5
|
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
8
|
import logging
|
|
9
|
-
from
|
|
9
|
+
from collections.abc import Iterable
|
|
10
10
|
|
|
11
11
|
from tqdm.auto import tqdm
|
|
12
12
|
|
|
@@ -48,8 +48,11 @@ def get_obo() -> Obo:
|
|
|
48
48
|
return KEGGGenomeGetter()
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
def parse_genome_line(line: str) -> KEGGGenome:
|
|
51
|
+
def parse_genome_line(line: str) -> KEGGGenome | None:
|
|
52
52
|
"""Parse a line from the KEGG Genome database."""
|
|
53
|
+
if not line.startswith("T"):
|
|
54
|
+
# This is for an NCBI Taxonomy
|
|
55
|
+
return None
|
|
53
56
|
line = line.strip()
|
|
54
57
|
identifier, rest = _s(line, "\t")
|
|
55
58
|
identifier = identifier[len("gn:") :]
|
|
@@ -96,6 +99,8 @@ def iter_kegg_genomes(version: str, desc: str) -> Iterable[KEGGGenome]:
|
|
|
96
99
|
it = tqdm(lines, desc=desc, unit_scale=True, unit="genome")
|
|
97
100
|
for line in it:
|
|
98
101
|
yv = parse_genome_line(line)
|
|
102
|
+
if yv is None:
|
|
103
|
+
continue
|
|
99
104
|
it.set_postfix({"id": yv.identifier, "name": yv.name})
|
|
100
105
|
yield yv
|
|
101
106
|
|
|
@@ -107,11 +112,16 @@ def iter_terms(version: str) -> Iterable[Term]:
|
|
|
107
112
|
for kegg_genome in iter_kegg_genomes(version=version, desc="KEGG Genomes"):
|
|
108
113
|
if kegg_genome.identifier in SKIP:
|
|
109
114
|
continue
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
+
|
|
116
|
+
try:
|
|
117
|
+
reference = Reference(
|
|
118
|
+
prefix=KEGG_GENOME_PREFIX, identifier=kegg_genome.identifier, name=kegg_genome.name
|
|
119
|
+
)
|
|
120
|
+
except ValueError:
|
|
121
|
+
tqdm.write(f"[{KEGG_GENOME_PREFIX}] invalid identifier: {kegg_genome}")
|
|
122
|
+
continue
|
|
123
|
+
|
|
124
|
+
term = Term(reference=reference)
|
|
115
125
|
if kegg_genome.taxonomy_id is not None:
|
|
116
126
|
taxonomy_name = get_ncbitaxon_name(kegg_genome.taxonomy_id)
|
|
117
127
|
if taxonomy_name is None:
|
pyobo/sources/kegg/pathway.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Convert KEGG Pathways to OBO.
|
|
4
2
|
|
|
5
3
|
Run with ``python -m pyobo.sources.kegg.pathway``
|
|
@@ -8,8 +6,9 @@ Run with ``python -m pyobo.sources.kegg.pathway``
|
|
|
8
6
|
import logging
|
|
9
7
|
import urllib.error
|
|
10
8
|
from collections import defaultdict
|
|
9
|
+
from collections.abc import Iterable, Mapping
|
|
11
10
|
from functools import partial
|
|
12
|
-
from typing import
|
|
11
|
+
from typing import Union
|
|
13
12
|
|
|
14
13
|
from tqdm.auto import tqdm
|
|
15
14
|
from tqdm.contrib.concurrent import thread_map
|
|
@@ -76,7 +75,7 @@ def iter_terms(version: str, skip_missing: bool = True) -> Iterable[Term]:
|
|
|
76
75
|
)
|
|
77
76
|
|
|
78
77
|
|
|
79
|
-
def _get_link_pathway_map(path: str) -> Mapping[str,
|
|
78
|
+
def _get_link_pathway_map(path: str) -> Mapping[str, list[str]]:
|
|
80
79
|
rv = defaultdict(list)
|
|
81
80
|
with open(path) as file:
|
|
82
81
|
for line in file:
|
|
@@ -110,7 +109,7 @@ def _iter_genome_terms(
|
|
|
110
109
|
list_pathway_lines = [line.strip() for line in file]
|
|
111
110
|
for line in list_pathway_lines:
|
|
112
111
|
line = line.strip()
|
|
113
|
-
pathway_id, name =
|
|
112
|
+
pathway_id, name = (part.strip() for part in line.split("\t"))
|
|
114
113
|
pathway_id = pathway_id[len("path:") :]
|
|
115
114
|
|
|
116
115
|
terms[pathway_id] = term = Term.from_triple(
|
|
@@ -149,7 +148,7 @@ def _iter_genome_terms(
|
|
|
149
148
|
|
|
150
149
|
def iter_kegg_pathway_paths(
|
|
151
150
|
version: str, skip_missing: bool = True
|
|
152
|
-
) -> Iterable[Union[
|
|
151
|
+
) -> Iterable[Union[tuple[KEGGGenome, str, str], tuple[None, None, None]]]:
|
|
153
152
|
"""Get paths for the KEGG Pathway files."""
|
|
154
153
|
genomes = list(iter_kegg_genomes(version=version, desc="KEGG Pathways"))
|
|
155
154
|
func = partial(_process_genome, version=version, skip_missing=skip_missing)
|