pyobo 0.11.2__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- pyobo/.DS_Store +0 -0
- pyobo/__init__.py +95 -20
- pyobo/__main__.py +0 -0
- pyobo/api/__init__.py +81 -10
- pyobo/api/alts.py +52 -42
- pyobo/api/combine.py +39 -0
- pyobo/api/edges.py +68 -0
- pyobo/api/hierarchy.py +231 -203
- pyobo/api/metadata.py +14 -19
- pyobo/api/names.py +207 -127
- pyobo/api/properties.py +117 -117
- pyobo/api/relations.py +68 -94
- pyobo/api/species.py +24 -21
- pyobo/api/typedefs.py +11 -11
- pyobo/api/utils.py +66 -13
- pyobo/api/xrefs.py +107 -114
- pyobo/cli/__init__.py +0 -0
- pyobo/cli/cli.py +35 -50
- pyobo/cli/database.py +210 -160
- pyobo/cli/database_utils.py +155 -0
- pyobo/cli/lookup.py +163 -195
- pyobo/cli/utils.py +19 -6
- pyobo/constants.py +102 -3
- pyobo/getters.py +209 -191
- pyobo/gilda_utils.py +52 -250
- pyobo/identifier_utils/__init__.py +33 -0
- pyobo/identifier_utils/api.py +305 -0
- pyobo/identifier_utils/preprocessing.json +873 -0
- pyobo/identifier_utils/preprocessing.py +27 -0
- pyobo/identifier_utils/relations/__init__.py +8 -0
- pyobo/identifier_utils/relations/api.py +162 -0
- pyobo/identifier_utils/relations/data.json +5824 -0
- pyobo/identifier_utils/relations/data_owl.json +57 -0
- pyobo/identifier_utils/relations/data_rdf.json +1 -0
- pyobo/identifier_utils/relations/data_rdfs.json +7 -0
- pyobo/mocks.py +9 -6
- pyobo/ner/__init__.py +9 -0
- pyobo/ner/api.py +72 -0
- pyobo/ner/normalizer.py +33 -0
- pyobo/obographs.py +48 -40
- pyobo/plugins.py +5 -4
- pyobo/py.typed +0 -0
- pyobo/reader.py +1354 -395
- pyobo/reader_utils.py +155 -0
- pyobo/resource_utils.py +42 -22
- pyobo/resources/__init__.py +0 -0
- pyobo/resources/goc.py +75 -0
- pyobo/resources/goc.tsv +188 -0
- pyobo/resources/ncbitaxon.py +4 -5
- pyobo/resources/ncbitaxon.tsv.gz +0 -0
- pyobo/resources/ro.py +3 -2
- pyobo/resources/ro.tsv +0 -0
- pyobo/resources/so.py +0 -0
- pyobo/resources/so.tsv +0 -0
- pyobo/sources/README.md +12 -8
- pyobo/sources/__init__.py +52 -29
- pyobo/sources/agrovoc.py +0 -0
- pyobo/sources/antibodyregistry.py +11 -12
- pyobo/sources/bigg/__init__.py +13 -0
- pyobo/sources/bigg/bigg_compartment.py +81 -0
- pyobo/sources/bigg/bigg_metabolite.py +229 -0
- pyobo/sources/bigg/bigg_model.py +46 -0
- pyobo/sources/bigg/bigg_reaction.py +77 -0
- pyobo/sources/biogrid.py +1 -2
- pyobo/sources/ccle.py +7 -12
- pyobo/sources/cgnc.py +9 -6
- pyobo/sources/chebi.py +1 -1
- pyobo/sources/chembl/__init__.py +9 -0
- pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
- pyobo/sources/chembl/chembl_target.py +160 -0
- pyobo/sources/civic_gene.py +55 -15
- pyobo/sources/clinicaltrials.py +160 -0
- pyobo/sources/complexportal.py +24 -24
- pyobo/sources/conso.py +14 -22
- pyobo/sources/cpt.py +0 -0
- pyobo/sources/credit.py +1 -9
- pyobo/sources/cvx.py +27 -5
- pyobo/sources/depmap.py +9 -12
- pyobo/sources/dictybase_gene.py +2 -7
- pyobo/sources/drugbank/__init__.py +9 -0
- pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
- pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
- pyobo/sources/drugcentral.py +17 -13
- pyobo/sources/expasy.py +31 -34
- pyobo/sources/famplex.py +13 -18
- pyobo/sources/flybase.py +8 -13
- pyobo/sources/gard.py +62 -0
- pyobo/sources/geonames/__init__.py +9 -0
- pyobo/sources/geonames/features.py +28 -0
- pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
- pyobo/sources/geonames/utils.py +115 -0
- pyobo/sources/gmt_utils.py +6 -7
- pyobo/sources/go.py +20 -13
- pyobo/sources/gtdb.py +154 -0
- pyobo/sources/gwascentral/__init__.py +9 -0
- pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
- pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
- pyobo/sources/hgnc/__init__.py +9 -0
- pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
- pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
- pyobo/sources/icd/__init__.py +9 -0
- pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
- pyobo/sources/icd/icd11.py +148 -0
- pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
- pyobo/sources/interpro.py +4 -9
- pyobo/sources/itis.py +0 -5
- pyobo/sources/kegg/__init__.py +0 -0
- pyobo/sources/kegg/api.py +16 -38
- pyobo/sources/kegg/genes.py +9 -20
- pyobo/sources/kegg/genome.py +1 -7
- pyobo/sources/kegg/pathway.py +9 -21
- pyobo/sources/mesh.py +58 -24
- pyobo/sources/mgi.py +3 -10
- pyobo/sources/mirbase/__init__.py +11 -0
- pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
- pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
- pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
- pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
- pyobo/sources/msigdb.py +74 -39
- pyobo/sources/ncbi/__init__.py +9 -0
- pyobo/sources/ncbi/ncbi_gc.py +162 -0
- pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
- pyobo/sources/nih_reporter.py +60 -0
- pyobo/sources/nlm/__init__.py +9 -0
- pyobo/sources/nlm/nlm_catalog.py +48 -0
- pyobo/sources/nlm/nlm_publisher.py +36 -0
- pyobo/sources/nlm/utils.py +116 -0
- pyobo/sources/npass.py +6 -8
- pyobo/sources/omim_ps.py +11 -4
- pyobo/sources/pathbank.py +4 -8
- pyobo/sources/pfam/__init__.py +9 -0
- pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
- pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
- pyobo/sources/pharmgkb/__init__.py +15 -0
- pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
- pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
- pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
- pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
- pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
- pyobo/sources/pharmgkb/utils.py +86 -0
- pyobo/sources/pid.py +1 -6
- pyobo/sources/pombase.py +6 -10
- pyobo/sources/pubchem.py +4 -9
- pyobo/sources/reactome.py +5 -11
- pyobo/sources/rgd.py +11 -16
- pyobo/sources/rhea.py +37 -36
- pyobo/sources/ror.py +69 -42
- pyobo/sources/selventa/__init__.py +0 -0
- pyobo/sources/selventa/schem.py +4 -7
- pyobo/sources/selventa/scomp.py +1 -6
- pyobo/sources/selventa/sdis.py +4 -7
- pyobo/sources/selventa/sfam.py +1 -6
- pyobo/sources/sgd.py +6 -11
- pyobo/sources/signor/__init__.py +7 -0
- pyobo/sources/signor/download.py +41 -0
- pyobo/sources/signor/signor_complexes.py +105 -0
- pyobo/sources/slm.py +12 -15
- pyobo/sources/umls/__init__.py +7 -1
- pyobo/sources/umls/__main__.py +0 -0
- pyobo/sources/umls/get_synonym_types.py +20 -4
- pyobo/sources/umls/sty.py +57 -0
- pyobo/sources/umls/synonym_types.tsv +1 -1
- pyobo/sources/umls/umls.py +18 -22
- pyobo/sources/unimod.py +46 -0
- pyobo/sources/uniprot/__init__.py +1 -1
- pyobo/sources/uniprot/uniprot.py +40 -32
- pyobo/sources/uniprot/uniprot_ptm.py +4 -34
- pyobo/sources/utils.py +3 -2
- pyobo/sources/wikipathways.py +7 -10
- pyobo/sources/zfin.py +5 -10
- pyobo/ssg/__init__.py +12 -16
- pyobo/ssg/base.html +0 -0
- pyobo/ssg/index.html +26 -13
- pyobo/ssg/term.html +12 -2
- pyobo/ssg/typedef.html +0 -0
- pyobo/struct/__init__.py +54 -8
- pyobo/struct/functional/__init__.py +1 -0
- pyobo/struct/functional/dsl.py +2572 -0
- pyobo/struct/functional/macros.py +423 -0
- pyobo/struct/functional/obo_to_functional.py +385 -0
- pyobo/struct/functional/ontology.py +272 -0
- pyobo/struct/functional/utils.py +112 -0
- pyobo/struct/reference.py +331 -136
- pyobo/struct/struct.py +1484 -657
- pyobo/struct/struct_utils.py +1078 -0
- pyobo/struct/typedef.py +162 -210
- pyobo/struct/utils.py +12 -5
- pyobo/struct/vocabulary.py +138 -0
- pyobo/utils/__init__.py +0 -0
- pyobo/utils/cache.py +16 -15
- pyobo/utils/io.py +51 -41
- pyobo/utils/iter.py +5 -5
- pyobo/utils/misc.py +41 -53
- pyobo/utils/ndex_utils.py +0 -0
- pyobo/utils/path.py +73 -70
- pyobo/version.py +3 -3
- pyobo-0.12.1.dist-info/METADATA +671 -0
- pyobo-0.12.1.dist-info/RECORD +201 -0
- pyobo-0.12.1.dist-info/WHEEL +4 -0
- {pyobo-0.11.2.dist-info → pyobo-0.12.1.dist-info}/entry_points.txt +1 -0
- pyobo-0.12.1.dist-info/licenses/LICENSE +21 -0
- pyobo/aws.py +0 -162
- pyobo/cli/aws.py +0 -47
- pyobo/identifier_utils.py +0 -142
- pyobo/normalizer.py +0 -232
- pyobo/registries/__init__.py +0 -16
- pyobo/registries/metaregistry.json +0 -507
- pyobo/registries/metaregistry.py +0 -135
- pyobo/sources/icd11.py +0 -105
- pyobo/xrefdb/__init__.py +0 -1
- pyobo/xrefdb/canonicalizer.py +0 -214
- pyobo/xrefdb/priority.py +0 -59
- pyobo/xrefdb/sources/__init__.py +0 -60
- pyobo/xrefdb/sources/biomappings.py +0 -36
- pyobo/xrefdb/sources/cbms2019.py +0 -91
- pyobo/xrefdb/sources/chembl.py +0 -83
- pyobo/xrefdb/sources/compath.py +0 -82
- pyobo/xrefdb/sources/famplex.py +0 -64
- pyobo/xrefdb/sources/gilda.py +0 -50
- pyobo/xrefdb/sources/intact.py +0 -113
- pyobo/xrefdb/sources/ncit.py +0 -133
- pyobo/xrefdb/sources/pubchem.py +0 -27
- pyobo/xrefdb/sources/wikidata.py +0 -116
- pyobo/xrefdb/xrefs_pipeline.py +0 -180
- pyobo-0.11.2.dist-info/METADATA +0 -711
- pyobo-0.11.2.dist-info/RECORD +0 -157
- pyobo-0.11.2.dist-info/WHEEL +0 -5
- pyobo-0.11.2.dist-info/top_level.txt +0 -1
pyobo/sources/geonames/utils.py
ADDED

@@ -0,0 +1,115 @@
+"""Shared code for geonames sources."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import pandas as pd
+from tqdm import tqdm
+
+from pyobo import Reference, Term, TypeDef, default_reference
+from pyobo.struct.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED
+from pyobo.utils.path import ensure_df
+
+PREFIX = "geonames"
+PREFIX_FEATURE = "geonames.feature"
+
+FEATURES_URL = "https://download.geonames.org/export/dump/featureCodes_en.txt"
+COUNTRIES_URL = "https://download.geonames.org/export/dump/countryInfo.txt"
+ADMIN1_URL = "https://download.geonames.org/export/dump/admin1CodesASCII.txt"
+ADMIN2_URL = "https://download.geonames.org/export/dump/admin2Codes.txt"
+CITIES_URL = "https://download.geonames.org/export/dump/cities15000.zip"
+SYNONYMS_URL = "https://download.geonames.org/export/dump/alternateNamesV2.zip"
+
+# External parent classes
+CITY = Reference(prefix="ENVO", identifier="00000856", name="city")
+NATION = Reference(prefix="ENVO", identifier="00000009", name="national geopolitical entity")
+ADMIN_1 = Reference(prefix="ENVO", identifier="00000005", name="first-order administrative region")
+ADMIN_2 = Reference(prefix="ENVO", identifier="00000006", name="second-order administrative region")
+
+# Builtin classes
+FEATURE = default_reference(PREFIX_FEATURE, "feature", "GeoNames feature")
+FEATURE_TERM = Term(reference=FEATURE)
+
+# Type definitions
+CODE_TYPEDEF = TypeDef(
+    reference=default_reference(PREFIX, "code", name="GeoNames code"), is_metadata_tag=True
+)
+
+SYNONYMS_DF_COLUMNS = [
+    "id",
+    "geonames_id",
+    "iso_lang",
+    "synonym",
+    "is_preferred",
+    "is_short",
+    "is_colloquial",
+    "is_historic",
+    "start_time",
+    "end_time",
+]
+
+P_CATEGORY = default_reference(PREFIX_FEATURE, "P", "city feature")
+
+FEATURE_CATEGORIES = {
+    "A": default_reference(PREFIX_FEATURE, "A", "geopolitical feature"),
+    "H": default_reference(PREFIX_FEATURE, "H", "aquatic feature"),
+    "V": default_reference(PREFIX_FEATURE, "V", "floral feature feature"),
+    "S": default_reference(PREFIX_FEATURE, "S", "building feature"),
+    "U": default_reference(PREFIX_FEATURE, "U", "undersea feature"),
+    "T": default_reference(PREFIX_FEATURE, "T", "geographic feature"),
+    "L": default_reference(PREFIX_FEATURE, "L", "parks feature"),
+    "P": P_CATEGORY,
+    "R": default_reference(PREFIX_FEATURE, "R", "road or rail feature"),
+}
+
+
+def get_features(*, force: bool = False) -> dict[str, Term]:
+    """Get all features."""
+    df = ensure_df(
+        PREFIX,
+        url=FEATURES_URL,
+        force=force,
+        keep_default_na=False,  # NA is a country code
+        dtype=str,
+    )
+    rv = {}
+    for identifier, name, description in df.values:
+        if pd.isna(identifier) or identifier == "null":
+            continue
+
+        term = Term(
+            reference=Reference(
+                prefix=PREFIX_FEATURE, identifier=identifier, name=name if pd.notna(name) else None
+            ),
+            definition=description if pd.notna(description) else None,
+        )
+        parent_letter, _, rest = identifier.partition(".")
+        if not rest:
+            tqdm.write(f"[{PREFIX_FEATURE}] unhandled identifier: {identifier}")
+        elif parent_letter not in FEATURE_CATEGORIES:
+            tqdm.write(f"[{PREFIX_FEATURE}] unhandled category: {parent_letter}")
+        else:
+            term.append_parent(FEATURE_CATEGORIES[parent_letter])
+
+        rv[identifier] = term
+    return rv
+
+
+def get_feature_terms(
+    force: bool = False, features: dict[str, Term] | None = None
+) -> Iterable[Term]:
+    """Get terms for GeoNames features."""
+    yield FEATURE_TERM
+    yield HUMAN_TERM
+    yield CHARLIE_TERM
+    for cat in FEATURE_CATEGORIES.values():
+        yield (
+            Term(reference=cat)
+            .append_parent(FEATURE_TERM)
+            .append_contributor(CHARLIE_TERM)
+            .append_comment(PYOBO_INJECTED)
+        )
+    if features is None:
+        features = get_features(force=force)
+    yield from features.values()
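For orientation, the new shared module above can be exercised roughly as follows. This is a minimal sketch rather than part of the diff; the import path pyobo.sources.geonames.utils is taken from the file listing, and the first call downloads featureCodes_en.txt.

# Sketch only (not from the diff): load the GeoNames feature terms defined above.
from pyobo.sources.geonames.utils import FEATURE_CATEGORIES, get_features

features = get_features()  # downloads and parses featureCodes_en.txt on first use
print(len(features), "GeoNames feature terms")
print(sorted(FEATURE_CATEGORIES))  # top-level category letters, e.g. ['A', 'H', 'L', ...]
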
pyobo/sources/gmt_utils.py
CHANGED
@@ -2,17 +2,17 @@
 
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Union
 
 GMTSummary = tuple[str, str, set[str]]
 WikiPathwaysGMTSummary = tuple[str, str, str, str, str, set[str]]
 
 
-def parse_gmt_file(path: Union[str, Path]) -> Iterable[GMTSummary]:
+def parse_gmt_file(path: str | Path) -> Iterable[GMTSummary]:
     """Return file as list of pathway - gene sets (ENTREZ-identifiers).
 
     :param path: path to GMT file
-
+
+    :yields: processed lines
     """
     with open(path) as file:
         for line in file:

@@ -23,15 +23,14 @@ def _process_line(line: str) -> tuple[str, str, set[str]]:
     """Return the pathway name, url, and gene sets associated.
 
     :param line: gmt file line
-
-    :
-    :return: genes set associated
+
+    :returns: pathway name, pathway info url, and genes set associated
     """
     name, info, *entries = (p.strip() for p in line.split("\t"))
     return name, info, set(entries)
 
 
-def parse_wikipathways_gmt(path: Union[str, Path]) -> Iterable[WikiPathwaysGMTSummary]:
+def parse_wikipathways_gmt(path: str | Path) -> Iterable[WikiPathwaysGMTSummary]:
     """Parse WikiPathways GMT."""
     for info, _uri, entries in parse_gmt_file(path):
         info, version, identifier, species = info.split("%")
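As shown above, parse_gmt_file reads a tab-separated GMT file and yields (name, info URL, gene set) triples via _process_line. A minimal usage sketch with a throwaway example file (the file contents below are illustrative, not from the diff):

# Sketch only: parse a tiny GMT file with the helper changed above.
from pathlib import Path

from pyobo.sources.gmt_utils import parse_gmt_file

path = Path("example.gmt")
path.write_text("apoptosis\thttps://example.org/pw1\t581\t596\t598\n")
for name, info, genes in parse_gmt_file(path):
    print(name, info, sorted(genes))  # apoptosis https://example.org/pw1 ['581', '596', '598']
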
pyobo/sources/go.py
CHANGED
@@ -4,33 +4,46 @@ from pyobo import get_descendants
 
 __all__ = [
     "is_biological_process",
-    "is_molecular_function",
     "is_cellular_component",
+    "is_molecular_function",
 ]
 
 
 def is_biological_process(identifier: str) -> bool:
     """Return if the given GO identifier is a biological process.
 
+    :param identifier: A local unique identifier from GO
+    :return: If the identifier is a biological process
+
     >>> is_biological_process("0006915")
     True
     >>> is_biological_process("GO:0006915")
     True
-    >>> is_molecular_function("0006915")
-    False
-    >>> is_cellular_component("0006915")
-    False
     """
     return _is_descendant(identifier, "0008150")
 
 
 def is_molecular_function(identifier: str) -> bool:
-    """Return if the given GO identifier is a molecular function."""
+    """Return if the given GO identifier is a molecular function.
+
+    :param identifier: A local unique identifier from GO
+    :return: If the identifier is a molecular function
+
+    >>> is_molecular_function("0006915")
+    False
+    """
     return _is_descendant(identifier, "0003674")
 
 
 def is_cellular_component(identifier: str) -> bool:
-    """Return if the given GO identifier is a cellular component."""
+    """Return if the given GO identifier is a cellular component.
+
+    :param identifier: A local unique identifier from GO
+    :return: If the identifier is a cellular component
+
+    >>> is_cellular_component("0006915")
+    False
+    """
     return _is_descendant(identifier, "0005575")
 
 

@@ -40,9 +53,3 @@ def _is_descendant(identifier: str, ancestor: str) -> bool:
     identifier = f"go:{identifier}"
     descendants = get_descendants("go", ancestor)
     return descendants is not None and identifier in descendants
-
-
-if __name__ == "__main__":
-    import doctest
-
-    doctest.testmod()
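The three predicates above are thin wrappers around pyobo.get_descendants with the GO root terms 0008150, 0003674, and 0005575. A minimal usage sketch, assuming the GO hierarchy can be downloaded or is already cached locally (not part of the diff):

# Sketch only: the doctest examples above, run as a script.
from pyobo.sources.go import is_biological_process, is_cellular_component

print(is_biological_process("GO:0006915"))  # True  (apoptotic process)
print(is_cellular_component("0006915"))     # False
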
pyobo/sources/gtdb.py
ADDED
@@ -0,0 +1,154 @@
+"""Convert GTDB taxonomy to OBO format."""
+
+import logging
+from collections.abc import Iterable
+
+import pandas as pd
+from tqdm.auto import tqdm
+
+from pyobo.struct import Obo, Reference, Term
+from pyobo.struct.typedef import has_taxonomy_rank
+from pyobo.utils.path import ensure_path
+
+__all__ = [
+    "GTDBGetter",
+]
+
+PREFIX = "gtdb"
+
+#: A mapping from GTDB prefixes to TAXRANK ranks
+LEVEL_TO_TAXRANK = {
+    "d": Reference(prefix="TAXRANK", identifier="0000037", name="domain"),
+    "p": Reference(prefix="TAXRANK", identifier="0000001", name="phylum"),
+    "c": Reference(prefix="TAXRANK", identifier="0000002", name="class"),
+    "o": Reference(prefix="TAXRANK", identifier="0000003", name="order"),
+    "f": Reference(prefix="TAXRANK", identifier="0000004", name="family"),
+    "g": Reference(prefix="TAXRANK", identifier="0000005", name="genus"),
+    "s": Reference(prefix="TAXRANK", identifier="0000006", name="species"),
+}
+
+#: AR stands for archea
+GTDB_AR_URL = "https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz"
+#: BAC stands for bacteria
+GTDB_BAC_URL = "https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz"
+
+logger = logging.getLogger(__name__)
+
+
+class GTDBGetter(Obo):
+    """An ontology representation of the GTDB taxonomy."""
+
+    ontology = bioversions_key = PREFIX
+    typedefs = [has_taxonomy_rank]
+    root_terms = [
+        Reference(prefix=PREFIX, identifier="d__Archea", name="Archea"),
+        Reference(prefix=PREFIX, identifier="d__Bacteria", name="Bacteria"),
+    ]
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology."""
+        return iter_terms(version=self._version_or_raise, force=force)
+
+
+def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
+    """Iterate over GTDB terms."""
+    # Add the taxrank terms so we get nice display in Protege
+    for reference in LEVEL_TO_TAXRANK.values():
+        yield Term(reference=reference)
+
+    ar_path = ensure_path(PREFIX, url=GTDB_AR_URL, version=version, force=force)
+    bac_path = ensure_path(PREFIX, url=GTDB_BAC_URL, version=version, force=force)
+    columns = ["gtdb_taxonomy", "ncbi_species_taxid"]
+    for path_name, path in [
+        ("ar", ar_path),
+        ("bac", bac_path),
+    ]:
+        df = pd.read_csv(path, sep="\t", dtype=str)
+        for tax_string, ncbitaxon_id in tqdm(
+            df[columns].values, desc=f"[{PREFIX}] processing {path_name}", unit_scale=True
+        ):
+            yield from _process_row(tax_string, ncbitaxon_id)
+
+
+def _process_row(tax_string, ncbitaxon_id) -> Iterable[Term]:
+    if not isinstance(tax_string, str):
+        logger.warning(f"Invalid taxonomy string: {tax_string}")
+        return None
+
+    taxa = _parse_tax_string(tax_string)
+    if not taxa:
+        logger.warning(f"No valid taxa found in: {tax_string}")
+        return None
+
+    parent_reference = None
+    for level, name in taxa:
+        identifier = f"{level}__{name.replace(' ', '_')}"
+        term = Term(
+            reference=Reference(prefix=PREFIX, identifier=identifier, name=name),
+        )
+        term.annotate_object(has_taxonomy_rank, LEVEL_TO_TAXRANK[level])
+
+        if parent_reference:
+            term.append_parent(parent_reference)
+        if ncbitaxon_id and level == "s":
+            # if the level is "s", it's a species. There might be multiple
+            # mappings to NCBITaxon, so we only use "see also" as the predicate
+            term.append_xref(
+                Reference(prefix="ncbitaxon", identifier=ncbitaxon_id),
+                # TODO @jose use confidence=... keyword here
+            )
+
+        yield term
+        parent_reference = term.reference
+
+
+def _parse_tax_string(tax_string: str) -> list[tuple[str, str]]:
+    """Parse GTDB taxonomy string into (level, name) tuples."""
+    return [
+        level_name for part in _split_tax_string(tax_string) if (level_name := _parse_name(part))
+    ]
+
+
+def _split_tax_string(tax_string: str) -> list[str]:
+    return [p.strip() for p in tax_string.split(";") if p.strip()]
+
+
+def _parse_name(part: str) -> tuple[str, str] | None:
+    """Parse a GTDB taxonomy identifier.
+
+    :param part: The string
+    :returns: A tuple with the level and name, if parsable
+
+    >>> _parse_name("f__Sulfolobaceae")
+    ('f', 'Sulfoobaceae')
+
+    The following is malformed because it is missing a double underscore
+
+    >>> _parse_name("f_Sulfolobaceae")
+
+    The following is malformed because it has an invalid taxonomic level
+
+    >>> _parse_name("x__Sulfolobaceae")
+
+    The following is malformed because it's missing a name
+
+    >>> _parse_name("f__")
+    """
+    if len(part) < 4 or "__" not in part:
+        logger.warning(f"Malformed taxon string: {part}")
+        return None
+    level, delimiter, name = part.partition("__")
+    if not delimiter:
+        logger.warning(f"Missing double underscore delimiter: {part}")
+        return None
+    if level not in LEVEL_TO_TAXRANK or not name:
+        logger.warning(f"Invalid taxonomic level `{level}` in {part}")
+        return None
+    if not name:
+        logger.warning(f"Missing name: {part}")
+        return None
+    return level, name
+
+
+if __name__ == "__main__":
+    GTDBGetter().cli()
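For context, _parse_tax_string above consumes GTDB lineage strings in which each ";"-separated part has the form "<rank letter>__<name>", with the rank letters from LEVEL_TO_TAXRANK. A standalone sketch of that format (the example lineage is illustrative, not taken from the diff):

# Sketch only: the shape of the gtdb_taxonomy strings parsed above.
tax_string = (
    "d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;"
    "o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli"
)
parts = [p.strip() for p in tax_string.split(";") if p.strip()]
taxa = [p.partition("__")[::2] for p in parts]  # [(rank letter, name), ...]
print(taxa[0], taxa[-1])  # ('d', 'Bacteria') ('s', 'Escherichia coli')
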
pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py}
RENAMED

@@ -5,10 +5,11 @@ from collections.abc import Iterable
 
 from tqdm.auto import tqdm, trange
 
-from pyobo.sources.gwascentral_study import VERSION
 from pyobo.struct import Obo, Reference, Term
 from pyobo.utils.path import ensure_path
 
+from .gwascentral_study import VERSION
+
 __all__ = [
     "GWASCentralPhenotypeGetter",
 ]

@@ -27,11 +28,6 @@ class GWASCentralPhenotypeGetter(Obo):
         return iter_terms(force=force, version=self._version_or_raise)
 
 
-def get_obo(force: bool = False) -> Obo:
-    """Get GWAS Central Studies as OBO."""
-    return GWASCentralPhenotypeGetter(force=force)
-
-
 def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     """Iterate over terms from GWAS Central Phenotype."""
     for n in trange(1, 11000, desc=f"{PREFIX} download"):

@@ -43,11 +39,13 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
                 url=f"https://www.gwascentral.org/phenotype/HGVPM{n}?format=json",
                 name=f"HGVPM{n}.json",
                 force=force,
+                backend="requests",
+                timeout=1,
             )
         except OSError as e:
            tqdm.write(f"{n}: {e}")
            continue
-        with open(path) as file:
+        with path.open() as file:
            j = json.load(file)
 
        description = j.get("description")
pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py}
RENAMED

@@ -3,7 +3,6 @@
 import logging
 import tarfile
 from collections.abc import Iterable
-from typing import Optional
 from xml.etree import ElementTree
 
 from pyobo.struct import Obo, Reference, Term, has_part

@@ -31,12 +30,7 @@ class GWASCentralStudyGetter(Obo):
         return iterate_terms(force=force, version=self._version_or_raise)
 
 
-def get_obo(force: bool = False) -> Obo:
-    """Get GWAS Central Studies as OBO."""
-    return GWASCentralStudyGetter(force=force)
-
-
-def _find_text(element, name: str) -> Optional[str]:
+def _find_text(element, name: str) -> str | None:
     x = element.find(name)
     if x is not None:
         return x.text