pyobo 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/.DS_Store +0 -0
- pyobo/__init__.py +95 -20
- pyobo/__main__.py +0 -0
- pyobo/api/__init__.py +81 -10
- pyobo/api/alts.py +52 -42
- pyobo/api/combine.py +39 -0
- pyobo/api/edges.py +68 -0
- pyobo/api/hierarchy.py +231 -203
- pyobo/api/metadata.py +14 -19
- pyobo/api/names.py +207 -127
- pyobo/api/properties.py +117 -113
- pyobo/api/relations.py +68 -94
- pyobo/api/species.py +24 -21
- pyobo/api/typedefs.py +11 -11
- pyobo/api/utils.py +66 -13
- pyobo/api/xrefs.py +108 -114
- pyobo/cli/__init__.py +0 -0
- pyobo/cli/cli.py +35 -50
- pyobo/cli/database.py +183 -161
- pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
- pyobo/cli/lookup.py +163 -195
- pyobo/cli/utils.py +19 -6
- pyobo/constants.py +102 -3
- pyobo/getters.py +196 -118
- pyobo/gilda_utils.py +79 -200
- pyobo/identifier_utils/__init__.py +41 -0
- pyobo/identifier_utils/api.py +296 -0
- pyobo/identifier_utils/model.py +130 -0
- pyobo/identifier_utils/preprocessing.json +812 -0
- pyobo/identifier_utils/preprocessing.py +61 -0
- pyobo/identifier_utils/relations/__init__.py +8 -0
- pyobo/identifier_utils/relations/api.py +162 -0
- pyobo/identifier_utils/relations/data.json +5824 -0
- pyobo/identifier_utils/relations/data_owl.json +57 -0
- pyobo/identifier_utils/relations/data_rdf.json +1 -0
- pyobo/identifier_utils/relations/data_rdfs.json +7 -0
- pyobo/mocks.py +9 -6
- pyobo/ner/__init__.py +9 -0
- pyobo/ner/api.py +72 -0
- pyobo/ner/normalizer.py +33 -0
- pyobo/obographs.py +43 -39
- pyobo/plugins.py +5 -4
- pyobo/py.typed +0 -0
- pyobo/reader.py +1358 -395
- pyobo/reader_utils.py +155 -0
- pyobo/resource_utils.py +42 -22
- pyobo/resources/__init__.py +0 -0
- pyobo/resources/goc.py +75 -0
- pyobo/resources/goc.tsv +188 -0
- pyobo/resources/ncbitaxon.py +4 -5
- pyobo/resources/ncbitaxon.tsv.gz +0 -0
- pyobo/resources/ro.py +3 -2
- pyobo/resources/ro.tsv +0 -0
- pyobo/resources/so.py +0 -0
- pyobo/resources/so.tsv +0 -0
- pyobo/sources/README.md +12 -8
- pyobo/sources/__init__.py +52 -29
- pyobo/sources/agrovoc.py +0 -0
- pyobo/sources/antibodyregistry.py +11 -12
- pyobo/sources/bigg/__init__.py +13 -0
- pyobo/sources/bigg/bigg_compartment.py +81 -0
- pyobo/sources/bigg/bigg_metabolite.py +229 -0
- pyobo/sources/bigg/bigg_model.py +46 -0
- pyobo/sources/bigg/bigg_reaction.py +77 -0
- pyobo/sources/biogrid.py +1 -2
- pyobo/sources/ccle.py +7 -12
- pyobo/sources/cgnc.py +0 -5
- pyobo/sources/chebi.py +1 -1
- pyobo/sources/chembl/__init__.py +9 -0
- pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
- pyobo/sources/chembl/chembl_target.py +160 -0
- pyobo/sources/civic_gene.py +55 -15
- pyobo/sources/clinicaltrials.py +160 -0
- pyobo/sources/complexportal.py +24 -24
- pyobo/sources/conso.py +14 -22
- pyobo/sources/cpt.py +0 -0
- pyobo/sources/credit.py +1 -9
- pyobo/sources/cvx.py +27 -5
- pyobo/sources/depmap.py +9 -12
- pyobo/sources/dictybase_gene.py +2 -7
- pyobo/sources/drugbank/__init__.py +9 -0
- pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
- pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
- pyobo/sources/drugcentral.py +17 -13
- pyobo/sources/expasy.py +31 -34
- pyobo/sources/famplex.py +13 -18
- pyobo/sources/flybase.py +3 -8
- pyobo/sources/gard.py +62 -0
- pyobo/sources/geonames/__init__.py +9 -0
- pyobo/sources/geonames/features.py +28 -0
- pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
- pyobo/sources/geonames/utils.py +115 -0
- pyobo/sources/gmt_utils.py +6 -7
- pyobo/sources/go.py +20 -13
- pyobo/sources/gtdb.py +154 -0
- pyobo/sources/gwascentral/__init__.py +9 -0
- pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
- pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
- pyobo/sources/hgnc/__init__.py +9 -0
- pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
- pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
- pyobo/sources/icd/__init__.py +9 -0
- pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
- pyobo/sources/icd/icd11.py +148 -0
- pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
- pyobo/sources/interpro.py +4 -9
- pyobo/sources/itis.py +0 -5
- pyobo/sources/kegg/__init__.py +0 -0
- pyobo/sources/kegg/api.py +16 -38
- pyobo/sources/kegg/genes.py +9 -20
- pyobo/sources/kegg/genome.py +1 -7
- pyobo/sources/kegg/pathway.py +9 -21
- pyobo/sources/mesh.py +58 -24
- pyobo/sources/mgi.py +3 -10
- pyobo/sources/mirbase/__init__.py +11 -0
- pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
- pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
- pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
- pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
- pyobo/sources/msigdb.py +74 -39
- pyobo/sources/ncbi/__init__.py +9 -0
- pyobo/sources/ncbi/ncbi_gc.py +162 -0
- pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
- pyobo/sources/nih_reporter.py +60 -0
- pyobo/sources/nlm/__init__.py +9 -0
- pyobo/sources/nlm/nlm_catalog.py +48 -0
- pyobo/sources/nlm/nlm_publisher.py +36 -0
- pyobo/sources/nlm/utils.py +116 -0
- pyobo/sources/npass.py +6 -8
- pyobo/sources/omim_ps.py +10 -3
- pyobo/sources/pathbank.py +4 -8
- pyobo/sources/pfam/__init__.py +9 -0
- pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
- pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
- pyobo/sources/pharmgkb/__init__.py +15 -0
- pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
- pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
- pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
- pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
- pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
- pyobo/sources/pharmgkb/utils.py +86 -0
- pyobo/sources/pid.py +1 -6
- pyobo/sources/pombase.py +6 -10
- pyobo/sources/pubchem.py +4 -9
- pyobo/sources/reactome.py +5 -11
- pyobo/sources/rgd.py +11 -16
- pyobo/sources/rhea.py +37 -36
- pyobo/sources/ror.py +69 -42
- pyobo/sources/selventa/__init__.py +0 -0
- pyobo/sources/selventa/schem.py +4 -7
- pyobo/sources/selventa/scomp.py +1 -6
- pyobo/sources/selventa/sdis.py +4 -7
- pyobo/sources/selventa/sfam.py +1 -6
- pyobo/sources/sgd.py +6 -11
- pyobo/sources/signor/__init__.py +7 -0
- pyobo/sources/signor/download.py +41 -0
- pyobo/sources/signor/signor_complexes.py +105 -0
- pyobo/sources/slm.py +12 -15
- pyobo/sources/umls/__init__.py +7 -1
- pyobo/sources/umls/__main__.py +0 -0
- pyobo/sources/umls/get_synonym_types.py +20 -4
- pyobo/sources/umls/sty.py +57 -0
- pyobo/sources/umls/synonym_types.tsv +1 -1
- pyobo/sources/umls/umls.py +18 -22
- pyobo/sources/unimod.py +46 -0
- pyobo/sources/uniprot/__init__.py +1 -1
- pyobo/sources/uniprot/uniprot.py +40 -32
- pyobo/sources/uniprot/uniprot_ptm.py +4 -34
- pyobo/sources/utils.py +3 -2
- pyobo/sources/wikipathways.py +7 -10
- pyobo/sources/zfin.py +5 -10
- pyobo/ssg/__init__.py +12 -16
- pyobo/ssg/base.html +0 -0
- pyobo/ssg/index.html +26 -13
- pyobo/ssg/term.html +12 -2
- pyobo/ssg/typedef.html +0 -0
- pyobo/struct/__init__.py +54 -8
- pyobo/struct/functional/__init__.py +1 -0
- pyobo/struct/functional/dsl.py +2572 -0
- pyobo/struct/functional/macros.py +423 -0
- pyobo/struct/functional/obo_to_functional.py +385 -0
- pyobo/struct/functional/ontology.py +270 -0
- pyobo/struct/functional/utils.py +112 -0
- pyobo/struct/reference.py +331 -136
- pyobo/struct/struct.py +1413 -643
- pyobo/struct/struct_utils.py +1078 -0
- pyobo/struct/typedef.py +162 -210
- pyobo/struct/utils.py +12 -5
- pyobo/struct/vocabulary.py +138 -0
- pyobo/utils/__init__.py +0 -0
- pyobo/utils/cache.py +13 -11
- pyobo/utils/io.py +17 -31
- pyobo/utils/iter.py +5 -5
- pyobo/utils/misc.py +41 -53
- pyobo/utils/ndex_utils.py +0 -0
- pyobo/utils/path.py +76 -70
- pyobo/version.py +3 -3
- {pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/METADATA +228 -229
- pyobo-0.12.0.dist-info/RECORD +202 -0
- pyobo-0.12.0.dist-info/WHEEL +4 -0
- {pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
- pyobo-0.12.0.dist-info/licenses/LICENSE +21 -0
- pyobo/aws.py +0 -162
- pyobo/cli/aws.py +0 -47
- pyobo/identifier_utils.py +0 -142
- pyobo/normalizer.py +0 -232
- pyobo/registries/__init__.py +0 -16
- pyobo/registries/metaregistry.json +0 -507
- pyobo/registries/metaregistry.py +0 -135
- pyobo/sources/icd11.py +0 -105
- pyobo/xrefdb/__init__.py +0 -1
- pyobo/xrefdb/canonicalizer.py +0 -214
- pyobo/xrefdb/priority.py +0 -59
- pyobo/xrefdb/sources/__init__.py +0 -60
- pyobo/xrefdb/sources/biomappings.py +0 -36
- pyobo/xrefdb/sources/cbms2019.py +0 -91
- pyobo/xrefdb/sources/chembl.py +0 -83
- pyobo/xrefdb/sources/compath.py +0 -82
- pyobo/xrefdb/sources/famplex.py +0 -64
- pyobo/xrefdb/sources/gilda.py +0 -50
- pyobo/xrefdb/sources/intact.py +0 -113
- pyobo/xrefdb/sources/ncit.py +0 -133
- pyobo/xrefdb/sources/pubchem.py +0 -27
- pyobo/xrefdb/sources/wikidata.py +0 -116
- pyobo-0.11.2.dist-info/RECORD +0 -157
- pyobo-0.11.2.dist-info/WHEEL +0 -5
- pyobo-0.11.2.dist-info/top_level.txt +0 -1
pyobo/sources/expasy.py
CHANGED
|
@@ -4,18 +4,18 @@ import logging
|
|
|
4
4
|
import re
|
|
5
5
|
from collections import defaultdict
|
|
6
6
|
from collections.abc import Iterable, Mapping
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any
|
|
8
8
|
|
|
9
9
|
from .utils import get_go_mapping
|
|
10
|
-
from ..struct import Obo, Reference, Synonym, Term
|
|
11
|
-
from ..struct.typedef import enables, has_member, term_replaced_by
|
|
10
|
+
from ..struct import Annotation, Obo, OBOLiteral, Reference, Synonym, Term
|
|
11
|
+
from ..struct.typedef import enables, has_member, has_source, term_replaced_by
|
|
12
12
|
from ..utils.path import ensure_path
|
|
13
13
|
|
|
14
14
|
__all__ = [
|
|
15
15
|
"ExpasyGetter",
|
|
16
16
|
]
|
|
17
17
|
|
|
18
|
-
PREFIX = "
|
|
18
|
+
PREFIX = "ec"
|
|
19
19
|
EXPASY_DATABASE_URL = "ftp://ftp.expasy.org/databases/enzyme/enzyme.dat"
|
|
20
20
|
EXPASY_TREE_URL = "ftp://ftp.expasy.org/databases/enzyme/enzclass.txt"
|
|
21
21
|
|
|
@@ -43,33 +43,23 @@ class ExpasyGetter(Obo):
|
|
|
43
43
|
"""A getter for ExPASy Enzyme Classes."""
|
|
44
44
|
|
|
45
45
|
bioversions_key = ontology = PREFIX
|
|
46
|
-
typedefs = [has_member, enables, term_replaced_by]
|
|
46
|
+
typedefs = [has_member, enables, term_replaced_by, has_source]
|
|
47
47
|
root_terms = [
|
|
48
|
-
Reference(prefix=
|
|
49
|
-
Reference(prefix=
|
|
50
|
-
Reference(prefix=
|
|
51
|
-
Reference(prefix=
|
|
52
|
-
Reference(prefix=
|
|
53
|
-
Reference(prefix=
|
|
54
|
-
Reference(prefix=
|
|
48
|
+
Reference(prefix=PREFIX, identifier="1"),
|
|
49
|
+
Reference(prefix=PREFIX, identifier="2"),
|
|
50
|
+
Reference(prefix=PREFIX, identifier="3"),
|
|
51
|
+
Reference(prefix=PREFIX, identifier="4"),
|
|
52
|
+
Reference(prefix=PREFIX, identifier="5"),
|
|
53
|
+
Reference(prefix=PREFIX, identifier="6"),
|
|
54
|
+
Reference(prefix=PREFIX, identifier="7"),
|
|
55
55
|
]
|
|
56
|
-
|
|
57
|
-
"uniprot": "https://bioregistry.io/uniprot:",
|
|
58
|
-
"eccode": "https://bioregistry.io/eccode:",
|
|
59
|
-
"GO": "http://purl.obolibrary.org/obo/GO_",
|
|
60
|
-
"RO": "http://purl.obolibrary.org/obo/RO_",
|
|
61
|
-
}
|
|
56
|
+
property_values = [Annotation(has_source.reference, OBOLiteral.uri(EXPASY_DATABASE_URL))]
|
|
62
57
|
|
|
63
58
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
64
59
|
"""Iterate over terms in the ontology."""
|
|
65
60
|
return get_terms(version=self._version_or_raise, force=force)
|
|
66
61
|
|
|
67
62
|
|
|
68
|
-
def get_obo(force: bool = False) -> Obo:
|
|
69
|
-
"""Get ExPASy as OBO."""
|
|
70
|
-
return ExpasyGetter(force=force)
|
|
71
|
-
|
|
72
|
-
|
|
73
63
|
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
74
64
|
"""Get the ExPASy terms."""
|
|
75
65
|
tree_path = ensure_path(PREFIX, url=EXPASY_TREE_URL, version=version, force=force)
|
|
@@ -111,9 +101,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
111
101
|
reference=Reference(prefix=PREFIX, identifier=ec_code), is_obsolete=True
|
|
112
102
|
)
|
|
113
103
|
for transfer_id in transfer_ids:
|
|
114
|
-
term.
|
|
115
|
-
term_replaced_by, Reference(prefix=PREFIX, identifier=transfer_id)
|
|
116
|
-
)
|
|
104
|
+
term.append_replaced_by(Reference(prefix=PREFIX, identifier=transfer_id))
|
|
117
105
|
continue
|
|
118
106
|
|
|
119
107
|
parent_ec_code = data["parent"]["identifier"]
|
|
@@ -142,16 +130,17 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
142
130
|
reference=Reference(prefix=PREFIX, identifier=ec_code, name=name),
|
|
143
131
|
parents=[parent_term.reference],
|
|
144
132
|
synonyms=synonyms,
|
|
133
|
+
definition=data.get("reaction"),
|
|
145
134
|
)
|
|
146
135
|
for domain in data.get("domains", []):
|
|
147
|
-
term.
|
|
136
|
+
term.annotate_object(
|
|
148
137
|
has_member,
|
|
149
138
|
Reference.model_validate(
|
|
150
139
|
{"prefix": domain["namespace"], "identifier": domain["identifier"]},
|
|
151
140
|
),
|
|
152
141
|
)
|
|
153
142
|
for protein in data.get("proteins", []):
|
|
154
|
-
term.
|
|
143
|
+
term.annotate_object(
|
|
155
144
|
has_member,
|
|
156
145
|
Reference(
|
|
157
146
|
prefix=protein["namespace"],
|
|
@@ -167,18 +156,16 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
167
156
|
return terms.values()
|
|
168
157
|
|
|
169
158
|
|
|
170
|
-
"""TREE"""
|
|
171
|
-
|
|
172
|
-
|
|
173
159
|
def normalize_expasy_id(expasy_id: str) -> str:
|
|
174
160
|
"""Return a standardized ExPASy identifier string.
|
|
175
161
|
|
|
176
162
|
:param expasy_id: A possibly non-normalized ExPASy identifier
|
|
163
|
+
:return: A normalized string.
|
|
177
164
|
"""
|
|
178
165
|
return expasy_id.replace(" ", "")
|
|
179
166
|
|
|
180
167
|
|
|
181
|
-
def give_edge(unnormalized_ec_code: str) -> tuple[int,
|
|
168
|
+
def give_edge(unnormalized_ec_code: str) -> tuple[int, str | None, str]:
|
|
182
169
|
"""Return a (parent, child) tuple for given id."""
|
|
183
170
|
levels = [x for x in unnormalized_ec_code.replace(" ", "").replace("-", "").split(".") if x]
|
|
184
171
|
level = len(levels)
|
|
@@ -220,10 +207,11 @@ def get_tree(lines: Iterable[str]):
|
|
|
220
207
|
return rv
|
|
221
208
|
|
|
222
209
|
|
|
223
|
-
def get_database(lines: Iterable[str]) -> Mapping:
|
|
210
|
+
def get_database(lines: Iterable[str]) -> Mapping[str, dict[str, Any]]:
|
|
224
211
|
"""Parse the ExPASy database file and returns a list of enzyme entry dictionaries.
|
|
225
212
|
|
|
226
213
|
:param lines: An iterator over the ExPASy database file or file-like
|
|
214
|
+
:returns: A mapping from EC code to data
|
|
227
215
|
"""
|
|
228
216
|
rv = {}
|
|
229
217
|
for groups in _group_by_id(lines):
|
|
@@ -256,7 +244,13 @@ def get_database(lines: Iterable[str]) -> Mapping:
|
|
|
256
244
|
value = value.strip().removesuffix("and").rstrip(",").strip()
|
|
257
245
|
ec_data_entry["transfer_id"] = _parse_transfer(value)
|
|
258
246
|
elif descriptor == DE:
|
|
259
|
-
|
|
247
|
+
if "name" not in ec_data_entry["concept"]:
|
|
248
|
+
ec_data_entry["concept"]["name"] = ""
|
|
249
|
+
ec_data_entry["concept"]["name"] += value.rstrip(".") # type:ignore
|
|
250
|
+
elif descriptor == CA:
|
|
251
|
+
if "reaction" not in ec_data_entry:
|
|
252
|
+
ec_data_entry["reaction"] = ""
|
|
253
|
+
ec_data_entry["reaction"] += value.rstrip(".") # type:ignore
|
|
260
254
|
elif descriptor == AN:
|
|
261
255
|
ec_data_entry["synonyms"].append(value.rstrip(".")) # type:ignore
|
|
262
256
|
elif descriptor == PR:
|
|
@@ -290,6 +284,9 @@ TRANSFER_SPLIT_RE = re.compile(r",\s*|\s+and\s+")
|
|
|
290
284
|
def _parse_transfer(value: str) -> list[str]:
|
|
291
285
|
"""Parse transferred entry string.
|
|
292
286
|
|
|
287
|
+
:param value: A string for a transferred entry
|
|
288
|
+
:returns: A list of EC codes that it got transferred to
|
|
289
|
+
|
|
293
290
|
>>> _parse_transfer("Transferred entry: 1.1.1.198, 1.1.1.227 and 1.1.1.228.")
|
|
294
291
|
['1.1.1.198', '1.1.1.227', '1.1.1.228']
|
|
295
292
|
"""
|
pyobo/sources/famplex.py
CHANGED
|
@@ -8,8 +8,8 @@ import bioregistry
|
|
|
8
8
|
from pystow.utils import get_commit
|
|
9
9
|
|
|
10
10
|
from pyobo import get_name_id_mapping
|
|
11
|
-
from pyobo.struct import Obo, Reference, Term
|
|
12
|
-
from pyobo.struct.typedef import has_member, has_part, is_a, part_of
|
|
11
|
+
from pyobo.struct import Obo, Reference, Term, _parse_str_or_curie_or_uri
|
|
12
|
+
from pyobo.struct.typedef import has_citation, has_member, has_part, is_a, part_of
|
|
13
13
|
from pyobo.utils.io import multidict
|
|
14
14
|
from pyobo.utils.path import ensure_df
|
|
15
15
|
|
|
@@ -23,7 +23,7 @@ class FamPlexGetter(Obo):
|
|
|
23
23
|
|
|
24
24
|
ontology = PREFIX
|
|
25
25
|
dynamic_version = True
|
|
26
|
-
typedefs = [has_member, has_part, is_a, part_of]
|
|
26
|
+
typedefs = [has_member, has_part, is_a, part_of, has_citation]
|
|
27
27
|
|
|
28
28
|
def _get_version(self) -> str:
|
|
29
29
|
return get_commit("sorgerlab", "famplex")
|
|
@@ -33,11 +33,6 @@ class FamPlexGetter(Obo):
|
|
|
33
33
|
return get_terms(force=force, version=self._version_or_raise)
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
def get_obo(force: bool = False) -> Obo:
|
|
37
|
-
"""Get FamPlex as OBO."""
|
|
38
|
-
return FamPlexGetter(force=force)
|
|
39
|
-
|
|
40
|
-
|
|
41
36
|
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
42
37
|
"""Get the FamPlex terms."""
|
|
43
38
|
base_url = f"https://raw.githubusercontent.com/sorgerlab/famplex/{version}"
|
|
@@ -106,33 +101,33 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
106
101
|
for (entity,) in entities_df.values:
|
|
107
102
|
reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
|
|
108
103
|
definition, provenance = id_to_definition.get(entity, (None, None))
|
|
109
|
-
provenance_reference = (
|
|
110
|
-
Reference.from_curie(provenance) if isinstance(provenance, str) else None
|
|
111
|
-
)
|
|
112
104
|
term = Term(
|
|
113
105
|
reference=reference,
|
|
114
106
|
definition=definition,
|
|
115
|
-
provenance=[] if provenance_reference is None else [provenance_reference],
|
|
116
107
|
)
|
|
117
108
|
|
|
109
|
+
provenance_reference = (
|
|
110
|
+
_parse_str_or_curie_or_uri(provenance) if isinstance(provenance, str) else None
|
|
111
|
+
)
|
|
112
|
+
if provenance_reference:
|
|
113
|
+
term.append_provenance(provenance_reference)
|
|
114
|
+
|
|
118
115
|
for xref_reference in id_xrefs.get(entity, []):
|
|
119
116
|
term.append_xref(xref_reference)
|
|
120
117
|
|
|
121
118
|
for r, t in out_edges.get(reference, []):
|
|
122
|
-
if r == "isa"
|
|
119
|
+
if r == "isa":
|
|
123
120
|
term.append_parent(t)
|
|
124
|
-
elif r == "isa":
|
|
125
|
-
term.append_relationship(is_a, t)
|
|
126
121
|
elif r == "partof":
|
|
127
|
-
term.
|
|
122
|
+
term.annotate_object(part_of, t)
|
|
128
123
|
else:
|
|
129
124
|
logging.warning("unhandled relation %s", r)
|
|
130
125
|
|
|
131
126
|
for r, h in in_edges.get(reference, []):
|
|
132
127
|
if r == "isa":
|
|
133
|
-
term.
|
|
128
|
+
term.annotate_object(has_member, h)
|
|
134
129
|
elif r == "partof":
|
|
135
|
-
term.
|
|
130
|
+
term.annotate_object(has_part, h)
|
|
136
131
|
else:
|
|
137
132
|
logging.warning("unhandled relation %s", r)
|
|
138
133
|
yield term
|
pyobo/sources/flybase.py
CHANGED
|
@@ -8,7 +8,7 @@ from tqdm.auto import tqdm
|
|
|
8
8
|
|
|
9
9
|
from pyobo import Reference
|
|
10
10
|
from pyobo.resources.so import get_so_name
|
|
11
|
-
from pyobo.struct import Obo, Term, from_species, orthologous
|
|
11
|
+
from pyobo.struct import Obo, Term, _parse_str_or_curie_or_uri, from_species, orthologous
|
|
12
12
|
from pyobo.utils.io import multisetdict
|
|
13
13
|
from pyobo.utils.path import ensure_df
|
|
14
14
|
|
|
@@ -91,11 +91,6 @@ def _get_synonyms(version, force):
|
|
|
91
91
|
return df # TODO use this
|
|
92
92
|
|
|
93
93
|
|
|
94
|
-
def get_obo(force: bool = False) -> Obo:
|
|
95
|
-
"""Get OBO."""
|
|
96
|
-
return FlyBaseGetter(force=force)
|
|
97
|
-
|
|
98
|
-
|
|
99
94
|
GTYPE_TO_SO = {
|
|
100
95
|
"SRP_RNA_gene": "0001269",
|
|
101
96
|
"protein_coding_gene": "0001217",
|
|
@@ -154,11 +149,11 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
154
149
|
for hgnc_curie in human_orthologs.get(identifier, []):
|
|
155
150
|
if not hgnc_curie or pd.isna(hgnc_curie):
|
|
156
151
|
continue
|
|
157
|
-
hgnc_ortholog =
|
|
152
|
+
hgnc_ortholog = _parse_str_or_curie_or_uri(hgnc_curie)
|
|
158
153
|
if hgnc_ortholog is None:
|
|
159
154
|
tqdm.write(f"[{PREFIX}] {identifier} had invalid ortholog: {hgnc_curie}")
|
|
160
155
|
else:
|
|
161
|
-
term.
|
|
156
|
+
term.annotate_object(orthologous, hgnc_ortholog)
|
|
162
157
|
taxonomy_id = abbr_to_taxonomy.get(organism)
|
|
163
158
|
if taxonomy_id is not None:
|
|
164
159
|
term.set_species(taxonomy_id)
|
pyobo/sources/gard.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Converter for GARD."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
from pyobo.struct import Obo, Term, default_reference
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"GARDGetter",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
PREFIX = "gard"
|
|
14
|
+
PP = "gard.category"
|
|
15
|
+
URL = "https://rarediseases.info.nih.gov/assets/diseases.trimmed.json"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GARDGetter(Obo):
|
|
19
|
+
"""An ontology representation of GARD."""
|
|
20
|
+
|
|
21
|
+
bioversions_key = ontology = PREFIX
|
|
22
|
+
dynamic_version = True
|
|
23
|
+
|
|
24
|
+
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
25
|
+
"""Iterate over gene terms for GARD."""
|
|
26
|
+
yield from get_terms()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_terms() -> Iterable[Term]:
|
|
30
|
+
"""Get GARD terms."""
|
|
31
|
+
rows = requests.get(URL, timeout=5).json()
|
|
32
|
+
categories = {
|
|
33
|
+
category: default_reference(
|
|
34
|
+
prefix=PREFIX, identifier=category.lower().replace(" ", "_"), name=category
|
|
35
|
+
)
|
|
36
|
+
for row in rows
|
|
37
|
+
for category in row.get("diseaseCategories", [])
|
|
38
|
+
}
|
|
39
|
+
categories["uncategorized"] = default_reference(
|
|
40
|
+
prefix=PREFIX, identifier="uncategorized", name="Uncategorized Disease"
|
|
41
|
+
)
|
|
42
|
+
for category_reference in categories.values():
|
|
43
|
+
yield Term(reference=category_reference)
|
|
44
|
+
|
|
45
|
+
for row in rows:
|
|
46
|
+
term = Term.from_triple(PREFIX, identifier=str(row.pop("id")), name=row.pop("name"))
|
|
47
|
+
_name = row.pop("encodedName", None)
|
|
48
|
+
for synonym in row.pop("synonyms", []):
|
|
49
|
+
synonym = synonym.strip()
|
|
50
|
+
if synonym:
|
|
51
|
+
term.append_synonym(synonym)
|
|
52
|
+
for category in row.pop("diseaseCategories", ["uncategorized"]):
|
|
53
|
+
term.append_parent(categories[category])
|
|
54
|
+
|
|
55
|
+
_spanish_id = row.pop("spanishId", None)
|
|
56
|
+
_spanish_name = row.pop("spanishName", None)
|
|
57
|
+
|
|
58
|
+
yield term
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
if __name__ == "__main__":
|
|
62
|
+
GARDGetter().cli()
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Get terms from GeoNames Features."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from collections.abc import Iterable
|
|
7
|
+
|
|
8
|
+
from pyobo import Obo, Term
|
|
9
|
+
from pyobo.sources.geonames.utils import PREFIX_FEATURE, get_feature_terms
|
|
10
|
+
|
|
11
|
+
__all__ = ["GeonamesFeatureGetter"]
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class GeonamesFeatureGetter(Obo):
|
|
17
|
+
"""An ontology representation of GeoNames features."""
|
|
18
|
+
|
|
19
|
+
ontology = PREFIX_FEATURE
|
|
20
|
+
dynamic_version = True
|
|
21
|
+
|
|
22
|
+
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
23
|
+
"""Iterate over terms in the ontology."""
|
|
24
|
+
yield from get_feature_terms(force=force)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
if __name__ == "__main__":
|
|
28
|
+
GeonamesFeatureGetter.cli()
|
|
@@ -3,53 +3,81 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from collections.abc import
|
|
6
|
+
from collections.abc import Iterable, Mapping
|
|
7
7
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
from pystow.utils import read_zipfile_csv
|
|
10
10
|
from tqdm import tqdm
|
|
11
11
|
|
|
12
12
|
from pyobo import Obo, Term
|
|
13
|
-
from pyobo.
|
|
13
|
+
from pyobo.sources.geonames.utils import (
|
|
14
|
+
ADMIN1_URL,
|
|
15
|
+
ADMIN2_URL,
|
|
16
|
+
ADMIN_1,
|
|
17
|
+
ADMIN_2,
|
|
18
|
+
CITIES_URL,
|
|
19
|
+
CITY,
|
|
20
|
+
CODE_TYPEDEF,
|
|
21
|
+
COUNTRIES_URL,
|
|
22
|
+
FEATURE_TERM,
|
|
23
|
+
NATION,
|
|
24
|
+
P_CATEGORY,
|
|
25
|
+
PREFIX,
|
|
26
|
+
PREFIX_FEATURE,
|
|
27
|
+
SYNONYMS_DF_COLUMNS,
|
|
28
|
+
SYNONYMS_URL,
|
|
29
|
+
get_feature_terms,
|
|
30
|
+
)
|
|
31
|
+
from pyobo.struct import Reference, has_part, part_of
|
|
14
32
|
from pyobo.utils.path import ensure_df, ensure_path
|
|
15
33
|
|
|
16
34
|
__all__ = ["GeonamesGetter"]
|
|
17
35
|
|
|
18
36
|
logger = logging.getLogger(__name__)
|
|
19
37
|
|
|
20
|
-
PREFIX = "geonames"
|
|
21
|
-
COUNTRIES_URL = "https://download.geonames.org/export/dump/countryInfo.txt"
|
|
22
|
-
ADMIN1_URL = "https://download.geonames.org/export/dump/admin1CodesASCII.txt"
|
|
23
|
-
ADMIN2_URL = "https://download.geonames.org/export/dump/admin2Codes.txt"
|
|
24
|
-
CITIES_URL = "https://download.geonames.org/export/dump/cities15000.zip"
|
|
25
|
-
|
|
26
38
|
|
|
27
39
|
class GeonamesGetter(Obo):
|
|
28
40
|
"""An ontology representation of GeoNames."""
|
|
29
41
|
|
|
30
42
|
ontology = PREFIX
|
|
31
43
|
dynamic_version = True
|
|
32
|
-
typedefs = [part_of]
|
|
44
|
+
typedefs = [part_of, CODE_TYPEDEF, has_part]
|
|
33
45
|
|
|
34
46
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
35
47
|
"""Iterate over terms in the ontology."""
|
|
36
48
|
return get_terms(force=force)
|
|
37
49
|
|
|
38
50
|
|
|
39
|
-
def get_terms(*, force: bool = False) ->
|
|
51
|
+
def get_terms(*, force: bool = False) -> Iterable[Term]:
|
|
40
52
|
"""Get terms."""
|
|
53
|
+
yield Term(reference=NATION)
|
|
54
|
+
yield Term(reference=ADMIN_1).append_relationship(part_of, NATION)
|
|
55
|
+
yield Term(reference=ADMIN_2).append_relationship(part_of, ADMIN_1)
|
|
56
|
+
yield Term(reference=CITY)
|
|
57
|
+
|
|
58
|
+
# since the output here is only cities, we can slice this down
|
|
59
|
+
for term in get_feature_terms(force=force):
|
|
60
|
+
if term.identifier.startswith("P.") or term.pair == P_CATEGORY.pair or term == FEATURE_TERM:
|
|
61
|
+
yield term
|
|
62
|
+
|
|
41
63
|
code_to_country = get_code_to_country(force=force)
|
|
64
|
+
yield from code_to_country.values()
|
|
65
|
+
|
|
42
66
|
code_to_admin1 = get_code_to_admin1(code_to_country, force=force)
|
|
67
|
+
yield from code_to_admin1.values()
|
|
68
|
+
|
|
43
69
|
code_to_admin2 = get_code_to_admin2(
|
|
44
70
|
code_to_country=code_to_country, code_to_admin1=code_to_admin1, force=force
|
|
45
71
|
)
|
|
72
|
+
yield from code_to_admin2.values()
|
|
73
|
+
|
|
46
74
|
id_to_term = get_cities(
|
|
47
75
|
code_to_country=code_to_country,
|
|
48
76
|
code_to_admin1=code_to_admin1,
|
|
49
77
|
code_to_admin2=code_to_admin2,
|
|
50
78
|
force=force,
|
|
51
79
|
)
|
|
52
|
-
|
|
80
|
+
yield from list(id_to_term.values())
|
|
53
81
|
|
|
54
82
|
|
|
55
83
|
def get_code_to_country(*, force: bool = False) -> Mapping[str, Term]:
|
|
@@ -70,9 +98,13 @@ def get_code_to_country(*, force: bool = False) -> Mapping[str, Term]:
|
|
|
70
98
|
for identifier, name, code, fips, iso3 in countries_df[cols].values:
|
|
71
99
|
if pd.isna(code):
|
|
72
100
|
continue
|
|
73
|
-
term = Term
|
|
74
|
-
|
|
101
|
+
term = Term(
|
|
102
|
+
reference=Reference(
|
|
103
|
+
prefix=PREFIX, identifier=identifier, name=name if pd.notna(name) else None
|
|
104
|
+
),
|
|
105
|
+
type="Instance",
|
|
75
106
|
)
|
|
107
|
+
term.append_parent(NATION)
|
|
76
108
|
term.append_synonym(code)
|
|
77
109
|
if name.startswith("The "):
|
|
78
110
|
term.append_synonym(name.removeprefix("The "))
|
|
@@ -80,7 +112,7 @@ def get_code_to_country(*, force: bool = False) -> Mapping[str, Term]:
|
|
|
80
112
|
term.append_synonym(fips)
|
|
81
113
|
if pd.notna(iso3):
|
|
82
114
|
term.append_synonym(iso3)
|
|
83
|
-
term.
|
|
115
|
+
term.annotate_string(CODE_TYPEDEF, code)
|
|
84
116
|
code_to_country[code] = term
|
|
85
117
|
logger.info(f"got {len(code_to_country):,} country records")
|
|
86
118
|
return code_to_country
|
|
@@ -104,10 +136,14 @@ def get_code_to_admin1(
|
|
|
104
136
|
tqdm.write(f"Missing info for {name} / {asciiname} / {code=} / {identifier=}")
|
|
105
137
|
continue
|
|
106
138
|
|
|
107
|
-
term = Term
|
|
108
|
-
|
|
139
|
+
term = Term(
|
|
140
|
+
reference=Reference(
|
|
141
|
+
prefix=PREFIX, identifier=identifier, name=name if pd.notna(name) else None
|
|
142
|
+
),
|
|
143
|
+
type="Instance",
|
|
109
144
|
)
|
|
110
|
-
term.
|
|
145
|
+
term.append_parent(ADMIN_1)
|
|
146
|
+
term.annotate_string(CODE_TYPEDEF, code)
|
|
111
147
|
code_to_admin1[code] = term
|
|
112
148
|
|
|
113
149
|
country_code = code.split(".")[0]
|
|
@@ -132,10 +168,14 @@ def get_code_to_admin2(
|
|
|
132
168
|
for identifier, name, code in admin2_df[["geonames_id", "name", "code"]].values:
|
|
133
169
|
if pd.isna(identifier) or pd.isna(code):
|
|
134
170
|
continue
|
|
135
|
-
term = Term
|
|
136
|
-
|
|
171
|
+
term = Term(
|
|
172
|
+
reference=Reference(
|
|
173
|
+
prefix=PREFIX, identifier=identifier, name=name if pd.notna(name) else None
|
|
174
|
+
),
|
|
175
|
+
type="Instance",
|
|
137
176
|
)
|
|
138
|
-
term.
|
|
177
|
+
term.append_parent(ADMIN_2)
|
|
178
|
+
term.annotate_string(CODE_TYPEDEF, code)
|
|
139
179
|
code_to_admin2[code] = term
|
|
140
180
|
admin1_code = code.rsplit(".", 1)[0]
|
|
141
181
|
admin1_term = code_to_admin1.get(admin1_code)
|
|
@@ -181,6 +221,19 @@ def _get_cities_df(force: bool = False) -> pd.DataFrame:
|
|
|
181
221
|
return cities_df
|
|
182
222
|
|
|
183
223
|
|
|
224
|
+
def _get_synonyms_df(force: bool = False) -> pd.DataFrame:
|
|
225
|
+
"""Get the synonyms dataframe."""
|
|
226
|
+
path = ensure_path(PREFIX, url=SYNONYMS_URL, force=force)
|
|
227
|
+
synonyms_df = read_zipfile_csv(
|
|
228
|
+
path=path,
|
|
229
|
+
inner_path="alternateNamesV2.txt",
|
|
230
|
+
header=None,
|
|
231
|
+
names=SYNONYMS_DF_COLUMNS,
|
|
232
|
+
dtype=str,
|
|
233
|
+
)
|
|
234
|
+
return synonyms_df
|
|
235
|
+
|
|
236
|
+
|
|
184
237
|
def get_cities(
|
|
185
238
|
code_to_country,
|
|
186
239
|
code_to_admin1,
|
|
@@ -188,7 +241,8 @@ def get_cities(
|
|
|
188
241
|
*,
|
|
189
242
|
minimum_population: int = 100_000,
|
|
190
243
|
force: bool = False,
|
|
191
|
-
|
|
244
|
+
include_synonyms: bool = False,
|
|
245
|
+
) -> dict[str, Term]:
|
|
192
246
|
"""Get a mapping from city code to term."""
|
|
193
247
|
cities_df = _get_cities_df(force=force)
|
|
194
248
|
cities_df = cities_df[cities_df.population.astype(int) > minimum_population]
|
|
@@ -200,11 +254,18 @@ def get_cities(
|
|
|
200
254
|
|
|
201
255
|
cols = ["geonames_id", "name", "synonyms", "country_code", "admin1", "admin2", "feature_code"]
|
|
202
256
|
for identifier, name, synonyms, country, admin1, admin2, feature_code in cities_df[cols].values:
|
|
203
|
-
terms[identifier] = term = Term
|
|
204
|
-
|
|
257
|
+
terms[identifier] = term = Term(
|
|
258
|
+
reference=Reference(
|
|
259
|
+
prefix=PREFIX, identifier=identifier, name=name if pd.notna(name) else None
|
|
260
|
+
),
|
|
261
|
+
type="Instance",
|
|
205
262
|
)
|
|
206
|
-
|
|
207
|
-
|
|
263
|
+
# All cities are under the P branch, but the prefix is omitted for brevity in the TSV
|
|
264
|
+
term.append_parent(Reference(prefix=PREFIX_FEATURE, identifier=f"P.{feature_code}"))
|
|
265
|
+
term.append_parent(CITY)
|
|
266
|
+
|
|
267
|
+
if include_synonyms and synonyms and not isinstance(synonyms, float):
|
|
268
|
+
# TODO include language codes
|
|
208
269
|
for synonym in synonyms:
|
|
209
270
|
if pd.notna(synonym):
|
|
210
271
|
term.append_synonym(synonym)
|
|
@@ -254,4 +315,4 @@ def get_city_to_country() -> dict[str, str]:
|
|
|
254
315
|
|
|
255
316
|
|
|
256
317
|
if __name__ == "__main__":
|
|
257
|
-
GeonamesGetter
|
|
318
|
+
GeonamesGetter.cli()
|