pyobo 0.10.5__py3-none-any.whl → 0.10.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/__init__.py +1 -0
- pyobo/api/__init__.py +1 -0
- pyobo/api/names.py +21 -0
- pyobo/gilda_utils.py +54 -47
- pyobo/identifier_utils.py +1 -1
- pyobo/obographs.py +12 -4
- pyobo/reader.py +17 -14
- pyobo/sources/__init__.py +4 -0
- pyobo/sources/cgnc.py +2 -1
- pyobo/sources/chembl.py +2 -1
- pyobo/sources/complexportal.py +11 -0
- pyobo/sources/depmap.py +2 -0
- pyobo/sources/drugcentral.py +2 -1
- pyobo/sources/geonames.py +239 -0
- pyobo/sources/hgnc.py +32 -1
- pyobo/sources/mgi.py +3 -1
- pyobo/sources/mirbase.py +2 -0
- pyobo/sources/mirbase_family.py +5 -2
- pyobo/sources/npass.py +4 -2
- pyobo/sources/pombase.py +1 -1
- pyobo/sources/ror.py +163 -0
- pyobo/sources/sgd.py +2 -5
- pyobo/sources/slm.py +6 -6
- pyobo/sources/umls/get_synonym_types.py +36 -0
- pyobo/sources/umls/synonym_types.tsv +243 -242
- pyobo/sources/umls/umls.py +3 -7
- pyobo/sources/uniprot/uniprot.py +5 -5
- pyobo/sources/zfin.py +2 -1
- pyobo/struct/reference.py +17 -2
- pyobo/struct/struct.py +73 -19
- pyobo/struct/typedef.py +30 -7
- pyobo/version.py +1 -1
- {pyobo-0.10.5.dist-info → pyobo-0.10.7.dist-info}/METADATA +2 -2
- {pyobo-0.10.5.dist-info → pyobo-0.10.7.dist-info}/RECORD +38 -35
- {pyobo-0.10.5.dist-info → pyobo-0.10.7.dist-info}/WHEEL +1 -1
- {pyobo-0.10.5.dist-info → pyobo-0.10.7.dist-info}/LICENSE +0 -0
- {pyobo-0.10.5.dist-info → pyobo-0.10.7.dist-info}/entry_points.txt +0 -0
- {pyobo-0.10.5.dist-info → pyobo-0.10.7.dist-info}/top_level.txt +0 -0
pyobo/__init__.py
CHANGED
pyobo/api/__init__.py
CHANGED
pyobo/api/names.py
CHANGED
|
@@ -24,6 +24,7 @@ __all__ = [
|
|
|
24
24
|
"get_id_definition_mapping",
|
|
25
25
|
"get_synonyms",
|
|
26
26
|
"get_id_synonyms_mapping",
|
|
27
|
+
"get_obsolete",
|
|
27
28
|
]
|
|
28
29
|
|
|
29
30
|
logger = logging.getLogger(__name__)
|
|
@@ -184,6 +185,26 @@ def get_id_definition_mapping(
|
|
|
184
185
|
return _get_mapping()
|
|
185
186
|
|
|
186
187
|
|
|
188
|
+
def get_obsolete(
|
|
189
|
+
prefix: str,
|
|
190
|
+
*,
|
|
191
|
+
force: bool = False,
|
|
192
|
+
strict: bool = False,
|
|
193
|
+
version: Optional[str] = None,
|
|
194
|
+
) -> Set[str]:
|
|
195
|
+
"""Get the set of obsolete local unique identifiers."""
|
|
196
|
+
if version is None:
|
|
197
|
+
version = get_version(prefix)
|
|
198
|
+
path = prefix_cache_join(prefix, name="obsolete.tsv", version=version)
|
|
199
|
+
|
|
200
|
+
@cached_collection(path=path, force=force)
|
|
201
|
+
def _get_obsolete() -> Set[str]:
|
|
202
|
+
ontology = get_ontology(prefix, force=force, strict=strict, version=version)
|
|
203
|
+
return ontology.get_obsolete()
|
|
204
|
+
|
|
205
|
+
return set(_get_obsolete())
|
|
206
|
+
|
|
207
|
+
|
|
187
208
|
@wrap_norm_prefix
|
|
188
209
|
def get_synonyms(prefix: str, identifier: str) -> Optional[List[str]]:
|
|
189
210
|
"""Get the synonyms for an entity."""
|
pyobo/gilda_utils.py
CHANGED
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
"""PyOBO's Gilda utilities."""
|
|
4
4
|
|
|
5
|
-
import itertools as itt
|
|
6
5
|
import logging
|
|
7
6
|
from typing import Iterable, List, Optional, Tuple, Type, Union
|
|
8
7
|
|
|
@@ -11,6 +10,7 @@ import gilda.api
|
|
|
11
10
|
import gilda.term
|
|
12
11
|
from gilda.grounder import Grounder
|
|
13
12
|
from gilda.process import normalize
|
|
13
|
+
from gilda.term import filter_out_duplicates
|
|
14
14
|
from tqdm.auto import tqdm
|
|
15
15
|
|
|
16
16
|
from pyobo import (
|
|
@@ -18,6 +18,7 @@ from pyobo import (
|
|
|
18
18
|
get_id_species_mapping,
|
|
19
19
|
get_id_synonyms_mapping,
|
|
20
20
|
get_ids,
|
|
21
|
+
get_obsolete,
|
|
21
22
|
)
|
|
22
23
|
from pyobo.getters import NoBuild
|
|
23
24
|
from pyobo.utils.io import multidict
|
|
@@ -31,32 +32,6 @@ __all__ = [
|
|
|
31
32
|
logger = logging.getLogger(__name__)
|
|
32
33
|
|
|
33
34
|
|
|
34
|
-
_STATUSES = {"curated": 1, "name": 2, "synonym": 3, "former_name": 4}
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def filter_out_duplicates(terms: List[gilda.term.Term]) -> List[gilda.term.Term]:
|
|
38
|
-
"""Filter out duplicates."""
|
|
39
|
-
# TODO import from gilda.term import filter_out_duplicates when it gets moved,
|
|
40
|
-
# see https://github.com/indralab/gilda/pull/103
|
|
41
|
-
logger.debug("filtering %d terms for uniqueness", len(terms))
|
|
42
|
-
new_terms: List[gilda.term.Term] = [
|
|
43
|
-
min(terms_group, key=_status_key)
|
|
44
|
-
for _, terms_group in itt.groupby(sorted(terms, key=_term_key), key=_term_key)
|
|
45
|
-
]
|
|
46
|
-
# Re-sort the terms
|
|
47
|
-
new_terms = sorted(new_terms, key=lambda x: (x.text, x.db, x.id))
|
|
48
|
-
logger.debug("got %d unique terms.", len(new_terms))
|
|
49
|
-
return new_terms
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def _status_key(term: gilda.term.Term) -> int:
|
|
53
|
-
return _STATUSES[term.status]
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def _term_key(term: gilda.term.Term) -> Tuple[str, str, str]:
|
|
57
|
-
return term.db, term.id, term.text
|
|
58
|
-
|
|
59
|
-
|
|
60
35
|
def iter_gilda_prediction_tuples(
|
|
61
36
|
prefix: str,
|
|
62
37
|
relation: str = "skos:exactMatch",
|
|
@@ -115,10 +90,12 @@ def normalize_identifier(prefix: str, identifier: str) -> str:
|
|
|
115
90
|
|
|
116
91
|
def get_grounder(
|
|
117
92
|
prefixes: Union[str, Iterable[str]],
|
|
93
|
+
*,
|
|
118
94
|
unnamed: Optional[Iterable[str]] = None,
|
|
119
95
|
grounder_cls: Optional[Type[Grounder]] = None,
|
|
120
96
|
versions: Union[None, str, Iterable[Union[str, None]]] = None,
|
|
121
97
|
strict: bool = True,
|
|
98
|
+
skip_obsolete: bool = False,
|
|
122
99
|
) -> Grounder:
|
|
123
100
|
"""Get a Gilda grounder for the given prefix(es)."""
|
|
124
101
|
unnamed = set() if unnamed is None else set(unnamed)
|
|
@@ -140,7 +117,11 @@ def get_grounder(
|
|
|
140
117
|
try:
|
|
141
118
|
p_terms = list(
|
|
142
119
|
get_gilda_terms(
|
|
143
|
-
prefix,
|
|
120
|
+
prefix,
|
|
121
|
+
identifiers_are_names=prefix in unnamed,
|
|
122
|
+
version=version,
|
|
123
|
+
strict=strict,
|
|
124
|
+
skip_obsolete=skip_obsolete,
|
|
144
125
|
)
|
|
145
126
|
)
|
|
146
127
|
except NoBuild:
|
|
@@ -155,26 +136,50 @@ def get_grounder(
|
|
|
155
136
|
return grounder_cls(terms_dict)
|
|
156
137
|
|
|
157
138
|
|
|
139
|
+
def _fast_term(
|
|
140
|
+
*,
|
|
141
|
+
text: str,
|
|
142
|
+
prefix: str,
|
|
143
|
+
identifier: str,
|
|
144
|
+
name: str,
|
|
145
|
+
status: str,
|
|
146
|
+
organism: Optional[str] = None,
|
|
147
|
+
) -> gilda.term.Term:
|
|
148
|
+
return gilda.term.Term(
|
|
149
|
+
norm_text=normalize(text),
|
|
150
|
+
text=text,
|
|
151
|
+
db=prefix,
|
|
152
|
+
id=identifier,
|
|
153
|
+
entry_name=name,
|
|
154
|
+
status=status,
|
|
155
|
+
source=prefix,
|
|
156
|
+
organism=organism,
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
|
|
158
160
|
def get_gilda_terms(
|
|
159
161
|
prefix: str,
|
|
162
|
+
*,
|
|
160
163
|
identifiers_are_names: bool = False,
|
|
161
164
|
version: Optional[str] = None,
|
|
162
165
|
strict: bool = True,
|
|
166
|
+
skip_obsolete: bool = False,
|
|
163
167
|
) -> Iterable[gilda.term.Term]:
|
|
164
168
|
"""Get gilda terms for the given namespace."""
|
|
165
169
|
id_to_name = get_id_name_mapping(prefix, version=version, strict=strict)
|
|
166
170
|
id_to_species = get_id_species_mapping(prefix, version=version, strict=strict)
|
|
171
|
+
obsoletes = get_obsolete(prefix, version=version, strict=strict) if skip_obsolete else set()
|
|
167
172
|
|
|
168
173
|
it = tqdm(id_to_name.items(), desc=f"[{prefix}] mapping", unit_scale=True, unit="name")
|
|
169
174
|
for identifier, name in it:
|
|
170
|
-
|
|
171
|
-
|
|
175
|
+
if identifier in obsoletes:
|
|
176
|
+
continue
|
|
177
|
+
yield _fast_term(
|
|
172
178
|
text=name,
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
179
|
+
prefix=prefix,
|
|
180
|
+
identifier=identifier,
|
|
181
|
+
name=name,
|
|
176
182
|
status="name",
|
|
177
|
-
source=prefix,
|
|
178
183
|
organism=id_to_species.get(identifier),
|
|
179
184
|
)
|
|
180
185
|
|
|
@@ -184,29 +189,31 @@ def get_gilda_terms(
|
|
|
184
189
|
id_to_synonyms.items(), desc=f"[{prefix}] mapping", unit_scale=True, unit="synonym"
|
|
185
190
|
)
|
|
186
191
|
for identifier, synonyms in it:
|
|
192
|
+
if identifier in obsoletes:
|
|
193
|
+
continue
|
|
187
194
|
name = id_to_name[identifier]
|
|
188
195
|
for synonym in synonyms:
|
|
189
|
-
|
|
190
|
-
|
|
196
|
+
if not synonym:
|
|
197
|
+
continue
|
|
198
|
+
yield _fast_term(
|
|
191
199
|
text=synonym,
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
200
|
+
prefix=prefix,
|
|
201
|
+
identifier=identifier,
|
|
202
|
+
name=name,
|
|
195
203
|
status="synonym",
|
|
196
|
-
source=prefix,
|
|
197
204
|
organism=id_to_species.get(identifier),
|
|
198
205
|
)
|
|
199
206
|
|
|
200
207
|
if identifiers_are_names:
|
|
201
208
|
it = tqdm(get_ids(prefix), desc=f"[{prefix}] mapping", unit_scale=True, unit="id")
|
|
202
209
|
for identifier in it:
|
|
203
|
-
|
|
204
|
-
|
|
210
|
+
if identifier in obsoletes:
|
|
211
|
+
continue
|
|
212
|
+
yield _fast_term(
|
|
205
213
|
text=identifier,
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
status="
|
|
210
|
-
source=prefix,
|
|
214
|
+
prefix=prefix,
|
|
215
|
+
identifier=identifier,
|
|
216
|
+
name=identifier,
|
|
217
|
+
status="name",
|
|
211
218
|
organism=id_to_species.get(identifier),
|
|
212
219
|
)
|
pyobo/identifier_utils.py
CHANGED
|
@@ -119,7 +119,7 @@ def wrap_norm_prefix(f):
|
|
|
119
119
|
|
|
120
120
|
def standardize_ec(ec: str) -> str:
|
|
121
121
|
"""Standardize an EC code identifier by removing all trailing dashes and dots."""
|
|
122
|
-
ec = ec.strip()
|
|
122
|
+
ec = ec.strip().replace(" ", "")
|
|
123
123
|
for _ in range(4):
|
|
124
124
|
ec = ec.rstrip("-").rstrip(".")
|
|
125
125
|
return ec
|
pyobo/obographs.py
CHANGED
|
@@ -19,7 +19,7 @@ from bioontologies.obograph import (
|
|
|
19
19
|
from bioontologies.robot import ParseResults
|
|
20
20
|
|
|
21
21
|
from pyobo.struct import Obo, Reference, Term
|
|
22
|
-
from pyobo.struct.typedef import is_a
|
|
22
|
+
from pyobo.struct.typedef import definition_source, is_a
|
|
23
23
|
|
|
24
24
|
__all__ = [
|
|
25
25
|
"graph_from_obo",
|
|
@@ -61,12 +61,12 @@ def _rewire(r: Reference) -> curies.Reference:
|
|
|
61
61
|
|
|
62
62
|
|
|
63
63
|
def _get_class_node(term: Term) -> Node:
|
|
64
|
-
if
|
|
65
|
-
definition = None
|
|
66
|
-
else:
|
|
64
|
+
if term.definition or term.provenance:
|
|
67
65
|
definition = Definition.from_parsed(
|
|
68
66
|
value=term.definition, references=[_rewire(p) for p in term.provenance]
|
|
69
67
|
)
|
|
68
|
+
else:
|
|
69
|
+
definition = None
|
|
70
70
|
|
|
71
71
|
if term.xrefs:
|
|
72
72
|
if not term.xref_types:
|
|
@@ -126,3 +126,11 @@ def _iter_edges(term: Term) -> Iterable[Edge]:
|
|
|
126
126
|
_rewire(typedef.reference),
|
|
127
127
|
_rewire(target),
|
|
128
128
|
)
|
|
129
|
+
|
|
130
|
+
for provenance_reference in term.provenance:
|
|
131
|
+
yield Edge.from_parsed(
|
|
132
|
+
_rewire(term.reference),
|
|
133
|
+
_rewire(definition_source.reference),
|
|
134
|
+
_rewire(provenance_reference),
|
|
135
|
+
)
|
|
136
|
+
# TODO also look through xrefs and seealso to get provenance xrefs?
|
pyobo/reader.py
CHANGED
|
@@ -444,20 +444,23 @@ def _extract_synonym(
|
|
|
444
444
|
break
|
|
445
445
|
|
|
446
446
|
stype: Optional[SynonymTypeDef] = None
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
447
|
+
for _stype in synonym_typedefs.values():
|
|
448
|
+
# Since there aren't a lot of carefully defined synonym definitions, it
|
|
449
|
+
# can appear as a string or curie. Therefore, we might see temporary prefixes
|
|
450
|
+
# get added, so we should check against full curies as well as local unique
|
|
451
|
+
# identifiers
|
|
452
|
+
if rest.startswith(_stype.curie):
|
|
453
|
+
rest = rest[len(_stype.curie) :].strip()
|
|
454
|
+
stype = _stype
|
|
455
|
+
break
|
|
456
|
+
elif rest.startswith(_stype.preferred_curie):
|
|
457
|
+
rest = rest[len(_stype.preferred_curie) :].strip()
|
|
458
|
+
stype = _stype
|
|
459
|
+
break
|
|
460
|
+
elif rest.startswith(_stype.identifier):
|
|
461
|
+
rest = rest[len(_stype.identifier) :].strip()
|
|
462
|
+
stype = _stype
|
|
463
|
+
break
|
|
461
464
|
|
|
462
465
|
if not rest.startswith("[") or not rest.endswith("]"):
|
|
463
466
|
logger.warning("[%s:%s] problem with synonym: %s", prefix, identifier, s)
|
pyobo/sources/__init__.py
CHANGED
|
@@ -20,6 +20,7 @@ from .drugcentral import DrugCentralGetter
|
|
|
20
20
|
from .expasy import ExpasyGetter
|
|
21
21
|
from .famplex import FamPlexGetter
|
|
22
22
|
from .flybase import FlyBaseGetter
|
|
23
|
+
from .geonames import GeonamesGetter
|
|
23
24
|
from .gwascentral_phenotype import GWASCentralPhenotypeGetter
|
|
24
25
|
from .gwascentral_study import GWASCentralStudyGetter
|
|
25
26
|
from .hgnc import HGNCGetter
|
|
@@ -46,6 +47,7 @@ from .pubchem import PubChemCompoundGetter
|
|
|
46
47
|
from .reactome import ReactomeGetter
|
|
47
48
|
from .rgd import RGDGetter
|
|
48
49
|
from .rhea import RheaGetter
|
|
50
|
+
from .ror import RORGetter
|
|
49
51
|
from .selventa import SCHEMGetter, SCOMPGetter, SDISGetter, SFAMGetter
|
|
50
52
|
from .sgd import SGDGetter
|
|
51
53
|
from .slm import SLMGetter
|
|
@@ -74,6 +76,7 @@ __all__ = [
|
|
|
74
76
|
"FlyBaseGetter",
|
|
75
77
|
"GWASCentralPhenotypeGetter",
|
|
76
78
|
"GWASCentralStudyGetter",
|
|
79
|
+
"GeonamesGetter",
|
|
77
80
|
"HGNCGetter",
|
|
78
81
|
"HGNCGroupGetter",
|
|
79
82
|
"ICD10Getter",
|
|
@@ -98,6 +101,7 @@ __all__ = [
|
|
|
98
101
|
"PomBaseGetter",
|
|
99
102
|
"PubChemCompoundGetter",
|
|
100
103
|
"RGDGetter",
|
|
104
|
+
"RORGetter",
|
|
101
105
|
"ReactomeGetter",
|
|
102
106
|
"RheaGetter",
|
|
103
107
|
"SCHEMGetter",
|
pyobo/sources/cgnc.py
CHANGED
|
@@ -8,6 +8,7 @@ from typing import Iterable
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
|
|
10
10
|
from pyobo.struct import Obo, Reference, Term, from_species
|
|
11
|
+
from pyobo.struct.typedef import exact_match
|
|
11
12
|
from pyobo.utils.path import ensure_df
|
|
12
13
|
|
|
13
14
|
__all__ = [
|
|
@@ -25,7 +26,7 @@ class CGNCGetter(Obo):
|
|
|
25
26
|
|
|
26
27
|
ontology = PREFIX
|
|
27
28
|
dynamic_version = True
|
|
28
|
-
typedefs = [from_species]
|
|
29
|
+
typedefs = [from_species, exact_match]
|
|
29
30
|
|
|
30
31
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
31
32
|
"""Iterate over terms in the ontology."""
|
pyobo/sources/chembl.py
CHANGED
|
@@ -12,7 +12,7 @@ from typing import Iterable
|
|
|
12
12
|
import chembl_downloader
|
|
13
13
|
|
|
14
14
|
from pyobo.struct import Obo, Reference, Term
|
|
15
|
-
from pyobo.struct.typedef import has_inchi, has_smiles
|
|
15
|
+
from pyobo.struct.typedef import exact_match, has_inchi, has_smiles
|
|
16
16
|
|
|
17
17
|
__all__ = [
|
|
18
18
|
"ChEMBLCompoundGetter",
|
|
@@ -45,6 +45,7 @@ class ChEMBLCompoundGetter(Obo):
|
|
|
45
45
|
|
|
46
46
|
ontology = "chembl.compound"
|
|
47
47
|
bioversions_key = "chembl"
|
|
48
|
+
typedefs = [exact_match]
|
|
48
49
|
|
|
49
50
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
50
51
|
"""Iterate over terms in the ontology."""
|
pyobo/sources/complexportal.py
CHANGED
|
@@ -82,6 +82,17 @@ def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
|
|
|
82
82
|
logger.warning("xref missing (: %s", xref)
|
|
83
83
|
continue
|
|
84
84
|
note = note.rstrip(")")
|
|
85
|
+
note.replace("rhea:rhea ", "rhea:")
|
|
86
|
+
note.replace("rhea:Rhea ", "rhea:")
|
|
87
|
+
note.replace("eccode::", "eccode:")
|
|
88
|
+
note.replace("eccode:EC:", "eccode:")
|
|
89
|
+
note.replace("eccode:RHEA:", "rhea:")
|
|
90
|
+
if note.lower().startswith("rhea "):
|
|
91
|
+
note = note[len("Rhea ") :]
|
|
92
|
+
if note.lower().startswith("rhea:rhea "):
|
|
93
|
+
note = note[len("rhea:rhea ") :]
|
|
94
|
+
if note.lower().startswith("EC:"):
|
|
95
|
+
note = note[len("EC:") :]
|
|
85
96
|
try:
|
|
86
97
|
reference = Reference.from_curie(xref_curie)
|
|
87
98
|
except ValueError:
|
pyobo/sources/depmap.py
CHANGED
|
@@ -8,6 +8,7 @@ import pandas as pd
|
|
|
8
8
|
import pystow
|
|
9
9
|
|
|
10
10
|
from pyobo import Obo, Reference, Term
|
|
11
|
+
from pyobo.struct.typedef import exact_match
|
|
11
12
|
|
|
12
13
|
__all__ = [
|
|
13
14
|
"get_obo",
|
|
@@ -23,6 +24,7 @@ class DepMapGetter(Obo):
|
|
|
23
24
|
|
|
24
25
|
ontology = bioversions_key = PREFIX
|
|
25
26
|
data_version = VERSION
|
|
27
|
+
typedefs = [exact_match]
|
|
26
28
|
|
|
27
29
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
28
30
|
"""Iterate over terms in the ontology."""
|
pyobo/sources/drugcentral.py
CHANGED
|
@@ -12,7 +12,7 @@ import psycopg2
|
|
|
12
12
|
from tqdm.auto import tqdm
|
|
13
13
|
|
|
14
14
|
from pyobo.struct import Obo, Reference, Synonym, Term
|
|
15
|
-
from pyobo.struct.typedef import has_inchi, has_smiles
|
|
15
|
+
from pyobo.struct.typedef import exact_match, has_inchi, has_smiles
|
|
16
16
|
|
|
17
17
|
__all__ = [
|
|
18
18
|
"DrugCentralGetter",
|
|
@@ -34,6 +34,7 @@ class DrugCentralGetter(Obo):
|
|
|
34
34
|
"""An ontology representation of the DrugCentral database."""
|
|
35
35
|
|
|
36
36
|
ontology = bioversions_key = PREFIX
|
|
37
|
+
typedefs = [exact_match]
|
|
37
38
|
|
|
38
39
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
39
40
|
"""Iterate over terms in the ontology."""
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""Get terms from geonames."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Collection, Iterable, Mapping
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from pystow.utils import read_zipfile_csv
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
from pyobo import Obo, Term
|
|
11
|
+
from pyobo.struct import Reference, part_of
|
|
12
|
+
from pyobo.utils.path import ensure_df, ensure_path
|
|
13
|
+
|
|
14
|
+
__all__ = ["GeonamesGetter"]
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
PREFIX = "geonames"
|
|
19
|
+
COUNTRIES_URL = "https://download.geonames.org/export/dump/countryInfo.txt"
|
|
20
|
+
ADMIN1_URL = "https://download.geonames.org/export/dump/admin1CodesASCII.txt"
|
|
21
|
+
ADMIN2_URL = "https://download.geonames.org/export/dump/admin2Codes.txt"
|
|
22
|
+
CITIES_URL = "https://download.geonames.org/export/dump/cities15000.zip"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class GeonamesGetter(Obo):
|
|
26
|
+
"""An ontology representation of GeoNames."""
|
|
27
|
+
|
|
28
|
+
ontology = PREFIX
|
|
29
|
+
dynamic_version = True
|
|
30
|
+
typedefs = [part_of]
|
|
31
|
+
|
|
32
|
+
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
33
|
+
"""Iterate over terms in the ontology."""
|
|
34
|
+
return get_terms(force=force)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_terms(*, force: bool = False) -> Collection[Term]:
|
|
38
|
+
"""Get terms."""
|
|
39
|
+
code_to_country = get_code_to_country(force=force)
|
|
40
|
+
code_to_admin1 = get_code_to_admin1(code_to_country, force=force)
|
|
41
|
+
code_to_admin2 = get_code_to_admin2(
|
|
42
|
+
code_to_country=code_to_country, code_to_admin1=code_to_admin1, force=force
|
|
43
|
+
)
|
|
44
|
+
id_to_term = get_cities(
|
|
45
|
+
code_to_country=code_to_country,
|
|
46
|
+
code_to_admin1=code_to_admin1,
|
|
47
|
+
code_to_admin2=code_to_admin2,
|
|
48
|
+
force=force,
|
|
49
|
+
)
|
|
50
|
+
return id_to_term.values()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_code_to_country(*, force: bool = False) -> Mapping[str, Term]:
|
|
54
|
+
"""Get a mapping from country code to country term."""
|
|
55
|
+
countries_df = ensure_df(
|
|
56
|
+
PREFIX,
|
|
57
|
+
url=COUNTRIES_URL,
|
|
58
|
+
force=force,
|
|
59
|
+
skiprows=49,
|
|
60
|
+
keep_default_na=False, # NA is a country code
|
|
61
|
+
dtype=str,
|
|
62
|
+
)
|
|
63
|
+
logger.info(f"got {len(countries_df.index):,} countries")
|
|
64
|
+
reorder = ["geonameid", *(c for c in countries_df.columns if c != "geonameid")]
|
|
65
|
+
countries_df = countries_df[reorder]
|
|
66
|
+
code_to_country = {}
|
|
67
|
+
cols = ["geonameid", "Country", "#ISO", "fips", "ISO3"]
|
|
68
|
+
for identifier, name, code, fips, iso3 in countries_df[cols].values:
|
|
69
|
+
if pd.isna(code):
|
|
70
|
+
continue
|
|
71
|
+
term = Term.from_triple(
|
|
72
|
+
"geonames", identifier, name if pd.notna(name) else None, type="Instance"
|
|
73
|
+
)
|
|
74
|
+
term.append_synonym(code)
|
|
75
|
+
if name.startswith("The "):
|
|
76
|
+
term.append_synonym(name.removeprefix("The "))
|
|
77
|
+
if pd.notna(fips):
|
|
78
|
+
term.append_synonym(fips)
|
|
79
|
+
if pd.notna(iso3):
|
|
80
|
+
term.append_synonym(iso3)
|
|
81
|
+
term.append_property("code", code)
|
|
82
|
+
code_to_country[code] = term
|
|
83
|
+
logger.info(f"got {len(code_to_country):,} country records")
|
|
84
|
+
return code_to_country
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_code_to_admin1(
|
|
88
|
+
code_to_country: Mapping[str, Term], *, force: bool = False
|
|
89
|
+
) -> Mapping[str, Term]:
|
|
90
|
+
"""Get a mapping from admin1 code to term."""
|
|
91
|
+
admin1_df = ensure_df(
|
|
92
|
+
PREFIX,
|
|
93
|
+
url=ADMIN1_URL,
|
|
94
|
+
header=None,
|
|
95
|
+
names=["code", "name", "asciiname", "geonames_id"],
|
|
96
|
+
dtype=str,
|
|
97
|
+
force=force,
|
|
98
|
+
)
|
|
99
|
+
code_to_admin1 = {}
|
|
100
|
+
for code, name, asciiname, identifier in admin1_df.values:
|
|
101
|
+
if pd.isna(identifier) or pd.isna(code):
|
|
102
|
+
tqdm.write(f"Missing info for {name} / {asciiname} / {code=} / {identifier=}")
|
|
103
|
+
continue
|
|
104
|
+
|
|
105
|
+
term = Term.from_triple(
|
|
106
|
+
"geonames", identifier, name if pd.notna(name) else None, type="Instance"
|
|
107
|
+
)
|
|
108
|
+
term.append_property("code", code)
|
|
109
|
+
code_to_admin1[code] = term
|
|
110
|
+
|
|
111
|
+
country_code = code.split(".")[0]
|
|
112
|
+
country_term = code_to_country[country_code]
|
|
113
|
+
term.append_relationship(part_of, country_term)
|
|
114
|
+
return code_to_admin1
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def get_code_to_admin2(
|
|
118
|
+
*, code_to_country: Mapping[str, Term], code_to_admin1: Mapping[str, Term], force: bool = False
|
|
119
|
+
) -> Mapping[str, Term]:
|
|
120
|
+
"""Get a mapping from admin2 code to term."""
|
|
121
|
+
admin2_df = ensure_df(
|
|
122
|
+
PREFIX,
|
|
123
|
+
url=ADMIN2_URL,
|
|
124
|
+
header=None,
|
|
125
|
+
names=["code", "name", "asciiname", "geonames_id"],
|
|
126
|
+
dtype=str,
|
|
127
|
+
force=force,
|
|
128
|
+
)
|
|
129
|
+
code_to_admin2 = {}
|
|
130
|
+
for identifier, name, code in admin2_df[["geonames_id", "name", "code"]].values:
|
|
131
|
+
if pd.isna(identifier) or pd.isna(code):
|
|
132
|
+
continue
|
|
133
|
+
term = Term.from_triple(
|
|
134
|
+
"geonames", identifier, name if pd.notna(name) else None, type="Instance"
|
|
135
|
+
)
|
|
136
|
+
term.append_property("code", code)
|
|
137
|
+
code_to_admin2[code] = term
|
|
138
|
+
admin1_code = code.rsplit(".", 1)[0]
|
|
139
|
+
admin1_term = code_to_admin1.get(admin1_code)
|
|
140
|
+
if admin1_term:
|
|
141
|
+
term.append_relationship(part_of, admin1_term)
|
|
142
|
+
else:
|
|
143
|
+
country_code = admin1_code.split(".", 1)[0]
|
|
144
|
+
country_term = code_to_country[country_code]
|
|
145
|
+
term.append_relationship(part_of, country_term)
|
|
146
|
+
return code_to_admin2
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def get_cities(
|
|
150
|
+
code_to_country,
|
|
151
|
+
code_to_admin1,
|
|
152
|
+
code_to_admin2,
|
|
153
|
+
*,
|
|
154
|
+
minimum_population: int = 100_000,
|
|
155
|
+
force: bool = False,
|
|
156
|
+
) -> Mapping[str, Term]:
|
|
157
|
+
"""Get a mapping from city code to term."""
|
|
158
|
+
columns = [
|
|
159
|
+
"geonames_id",
|
|
160
|
+
"name",
|
|
161
|
+
"asciiname",
|
|
162
|
+
"synonyms",
|
|
163
|
+
"latitude",
|
|
164
|
+
"longitude",
|
|
165
|
+
"feature_class",
|
|
166
|
+
"feature_code",
|
|
167
|
+
"country_code",
|
|
168
|
+
"cc2",
|
|
169
|
+
"admin1",
|
|
170
|
+
"admin2",
|
|
171
|
+
"admin3",
|
|
172
|
+
"admin4",
|
|
173
|
+
"population",
|
|
174
|
+
"elevation",
|
|
175
|
+
"dem",
|
|
176
|
+
"timezone",
|
|
177
|
+
"date_modified",
|
|
178
|
+
]
|
|
179
|
+
path = ensure_path(PREFIX, url=CITIES_URL, force=force)
|
|
180
|
+
cities_df = read_zipfile_csv(
|
|
181
|
+
path=path,
|
|
182
|
+
inner_path="cities15000.txt",
|
|
183
|
+
header=None,
|
|
184
|
+
names=columns,
|
|
185
|
+
dtype=str,
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
cities_df = cities_df[cities_df.population.astype(int) > minimum_population]
|
|
189
|
+
cities_df.synonyms = cities_df.synonyms.str.split(",")
|
|
190
|
+
|
|
191
|
+
terms = {}
|
|
192
|
+
for term in code_to_country.values():
|
|
193
|
+
terms[term.identifier] = term
|
|
194
|
+
|
|
195
|
+
cols = ["geonames_id", "name", "synonyms", "country_code", "admin1", "admin2", "feature_code"]
|
|
196
|
+
for identifier, name, synonyms, country, admin1, admin2, feature_code in cities_df[cols].values:
|
|
197
|
+
terms[identifier] = term = Term.from_triple(
|
|
198
|
+
"geonames", identifier, name if pd.notna(name) else None, type="Instance"
|
|
199
|
+
)
|
|
200
|
+
term.append_parent(Reference(prefix="geonames.feature", identifier=feature_code))
|
|
201
|
+
if synonyms and not isinstance(synonyms, float):
|
|
202
|
+
for synonym in synonyms:
|
|
203
|
+
if pd.notna(synonym):
|
|
204
|
+
term.append_synonym(synonym)
|
|
205
|
+
|
|
206
|
+
if pd.isna(admin1):
|
|
207
|
+
# TODO try to annotate these directly onto countries
|
|
208
|
+
tqdm.write(
|
|
209
|
+
f"[geonames:{identifier}] {name}, a city in {country}, is missing admin 1 code"
|
|
210
|
+
)
|
|
211
|
+
continue
|
|
212
|
+
|
|
213
|
+
admin1_full = f"{country}.{admin1}"
|
|
214
|
+
admin1_term = code_to_admin1.get(admin1_full)
|
|
215
|
+
if admin1_term is None:
|
|
216
|
+
logger.info(f"could not find admin1 {admin1_full}")
|
|
217
|
+
continue
|
|
218
|
+
|
|
219
|
+
terms[admin1_term.identifier] = admin1_term
|
|
220
|
+
|
|
221
|
+
if pd.notna(admin2):
|
|
222
|
+
admin2_full = f"{country}.{admin1}.{admin2}"
|
|
223
|
+
admin2_term = code_to_admin2.get(admin2_full)
|
|
224
|
+
if admin2_term is None or admin1_term is None:
|
|
225
|
+
pass
|
|
226
|
+
# print("could not find admin2", admin2_full)
|
|
227
|
+
else:
|
|
228
|
+
term.append_relationship(part_of, admin2_term)
|
|
229
|
+
terms[admin2_term.identifier] = admin2_term
|
|
230
|
+
|
|
231
|
+
else: # pd.notna(admin1):
|
|
232
|
+
# If there's no admin 2, just annotate directly onto admin 1
|
|
233
|
+
term.append_relationship(part_of, admin1_term)
|
|
234
|
+
|
|
235
|
+
return terms
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
if __name__ == "__main__":
|
|
239
|
+
GeonamesGetter().write_default(write_obo=True, force=True)
|