pyobo 0.10.4__py3-none-any.whl → 0.10.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/__init__.py +1 -0
- pyobo/api/__init__.py +1 -0
- pyobo/api/names.py +21 -0
- pyobo/api/xrefs.py +8 -5
- pyobo/gilda_utils.py +54 -47
- pyobo/sources/__init__.py +10 -3
- pyobo/sources/cgnc.py +4 -3
- pyobo/sources/chembl.py +5 -3
- pyobo/sources/depmap.py +4 -2
- pyobo/sources/drugbank.py +4 -4
- pyobo/sources/drugcentral.py +9 -5
- pyobo/sources/geonames.py +229 -0
- pyobo/sources/hgnc.py +32 -1
- pyobo/sources/hgncgenefamily.py +1 -1
- pyobo/sources/mgi.py +6 -2
- pyobo/sources/mirbase.py +2 -0
- pyobo/sources/mirbase_family.py +5 -2
- pyobo/sources/mirbase_mature.py +5 -4
- pyobo/sources/npass.py +1 -1
- pyobo/sources/pombase.py +1 -1
- pyobo/sources/ror.py +163 -0
- pyobo/sources/sgd.py +2 -5
- pyobo/sources/slm.py +14 -12
- pyobo/sources/umls/get_synonym_types.py +36 -0
- pyobo/sources/umls/synonym_types.tsv +243 -242
- pyobo/sources/umls/umls.py +3 -7
- pyobo/sources/zfin.py +3 -2
- pyobo/struct/reference.py +13 -2
- pyobo/struct/struct.py +72 -18
- pyobo/struct/typedef.py +32 -6
- pyobo/version.py +1 -1
- {pyobo-0.10.4.dist-info → pyobo-0.10.6.dist-info}/METADATA +9 -9
- {pyobo-0.10.4.dist-info → pyobo-0.10.6.dist-info}/RECORD +37 -34
- {pyobo-0.10.4.dist-info → pyobo-0.10.6.dist-info}/WHEEL +1 -1
- {pyobo-0.10.4.dist-info → pyobo-0.10.6.dist-info}/LICENSE +0 -0
- {pyobo-0.10.4.dist-info → pyobo-0.10.6.dist-info}/entry_points.txt +0 -0
- {pyobo-0.10.4.dist-info → pyobo-0.10.6.dist-info}/top_level.txt +0 -0
pyobo/sources/geonames.py
ADDED
@@ -0,0 +1,229 @@
+"""Get terms from geonames."""
+
+import logging
+from typing import Collection, Iterable, Mapping
+
+import pandas as pd
+from pystow.utils import read_zipfile_csv
+from tqdm import tqdm
+
+from pyobo import Obo, Term
+from pyobo.struct import Reference, part_of
+from pyobo.utils.path import ensure_df, ensure_path
+
+__all__ = ["GeonamesGetter"]
+
+logger = logging.getLogger(__name__)
+
+PREFIX = "geonames"
+COUNTRIES_URL = "https://download.geonames.org/export/dump/countryInfo.txt"
+ADMIN1_URL = "https://download.geonames.org/export/dump/admin1CodesASCII.txt"
+ADMIN2_URL = "https://download.geonames.org/export/dump/admin2Codes.txt"
+CITIES_URL = "https://download.geonames.org/export/dump/cities15000.zip"
+
+
+class GeonamesGetter(Obo):
+    """An ontology representation of GeoNames."""
+
+    ontology = PREFIX
+    dynamic_version = True
+    typedefs = [part_of]
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology."""
+        return get_terms(force=force)
+
+
+def get_terms(*, force: bool = False) -> Collection[Term]:
+    """Get terms."""
+    code_to_country = get_code_to_country(force=force)
+    code_to_admin1 = get_code_to_admin1(code_to_country, force=force)
+    code_to_admin2 = get_code_to_admin2(code_to_admin1, force=force)
+    id_to_term = get_cities(
+        code_to_country=code_to_country,
+        code_to_admin1=code_to_admin1,
+        code_to_admin2=code_to_admin2,
+        force=force,
+    )
+    return id_to_term.values()
+
+
+def get_code_to_country(*, force: bool = False) -> Mapping[str, Term]:
+    """Get a mapping from country code to country term."""
+    countries_df = ensure_df(
+        PREFIX,
+        url=COUNTRIES_URL,
+        force=force,
+        skiprows=49,
+        keep_default_na=False,  # NA is a country code
+        dtype=str,
+    )
+    logger.info(f"got {len(countries_df.index):,} countries")
+    reorder = ["geonameid", *(c for c in countries_df.columns if c != "geonameid")]
+    countries_df = countries_df[reorder]
+    code_to_country = {}
+    cols = ["geonameid", "Country", "#ISO", "fips", "ISO3"]
+    for identifier, name, code, fips, iso3 in countries_df[cols].values:
+        if pd.isna(code):
+            continue
+        term = Term.from_triple(
+            "geonames", identifier, name if pd.notna(name) else None, type="Instance"
+        )
+        term.append_synonym(code)
+        if name.startswith("The "):
+            term.append_synonym(name.removeprefix("The "))
+        if pd.notna(fips):
+            term.append_synonym(fips)
+        if pd.notna(iso3):
+            term.append_synonym(iso3)
+        term.append_property("code", code)
+        code_to_country[code] = term
+    logger.info(f"got {len(code_to_country):,} country records")
+    return code_to_country
+
+
+def get_code_to_admin1(
+    code_to_country: Mapping[str, Term], *, force: bool = False
+) -> Mapping[str, Term]:
+    """Get a mapping from admin1 code to term."""
+    admin1_df = ensure_df(
+        PREFIX,
+        url=ADMIN1_URL,
+        header=None,
+        names=["code", "name", "asciiname", "geonames_id"],
+        dtype=str,
+        force=force,
+    )
+    code_to_admin1 = {}
+    for code, name, asciiname, identifier in admin1_df.values:
+        if pd.isna(identifier) or pd.isna(code):
+            tqdm.write(f"Missing info for {name} / {asciiname} / {code=} / {identifier=}")
+            continue
+
+        term = Term.from_triple(
+            "geonames", identifier, name if pd.notna(name) else None, type="Instance"
+        )
+        term.append_property("code", code)
+        code_to_admin1[code] = term
+
+        country_code = code.split(".")[0]
+        country_term = code_to_country[country_code]
+        term.append_relationship(part_of, country_term)
+    return code_to_admin1
+
+
+def get_code_to_admin2(
+    code_to_admin1: Mapping[str, Term], *, force: bool = False
+) -> Mapping[str, Term]:
+    """Get a mapping from admin2 code to term."""
+    admin2_df = ensure_df(
+        PREFIX,
+        url=ADMIN2_URL,
+        header=None,
+        names=["code", "name", "asciiname", "geonames_id"],
+        dtype=str,
+        force=force,
+    )
+    code_to_admin2 = {}
+    for identifier, name, code in admin2_df[["geonames_id", "name", "code"]].values:
+        if pd.isna(identifier) or pd.isna(code):
+            continue
+        term = Term.from_triple(
+            "geonames", identifier, name if pd.notna(name) else None, type="Instance"
+        )
+        term.append_property("code", code)
+        code_to_admin2[code] = term
+        admin1_code = code.rsplit(".", 1)[0]
+        admin1_term = code_to_admin1[admin1_code]
+        term.append_relationship(part_of, admin1_term)
+    return code_to_admin2
+
+
+def get_cities(
+    code_to_country,
+    code_to_admin1,
+    code_to_admin2,
+    *,
+    minimum_population: int = 100_000,
+    force: bool = False,
+) -> Mapping[str, Term]:
+    """Get a mapping from city code to term."""
+    columns = [
+        "geonames_id",
+        "name",
+        "asciiname",
+        "synonyms",
+        "latitude",
+        "longitude",
+        "feature_class",
+        "feature_code",
+        "country_code",
+        "cc2",
+        "admin1",
+        "admin2",
+        "admin3",
+        "admin4",
+        "population",
+        "elevation",
+        "dem",
+        "timezone",
+        "date_modified",
+    ]
+    path = ensure_path(PREFIX, url=CITIES_URL, force=force)
+    cities_df = read_zipfile_csv(
+        path=path,
+        inner_path="cities15000.txt",
+        header=None,
+        names=columns,
+        dtype=str,
+    )
+
+    cities_df = cities_df[cities_df.population.astype(int) > minimum_population]
+    cities_df.synonyms = cities_df.synonyms.str.split(",")
+
+    terms = {}
+    for term in code_to_country.values():
+        terms[term.identifier] = term
+
+    cols = ["geonames_id", "name", "synonyms", "country_code", "admin1", "admin2", "feature_code"]
+    for identifier, name, synonyms, country, admin1, admin2, feature_code in cities_df[cols].values:
+        terms[identifier] = term = Term.from_triple(
+            "geonames", identifier, name if pd.notna(name) else None, type="Instance"
+        )
+        term.append_parent(Reference(prefix="geonames.feature", identifier=feature_code))
+        if synonyms and not isinstance(synonyms, float):
+            for synonym in synonyms:
+                if pd.notna(synonym):
+                    term.append_synonym(synonym)
+
+        if pd.isna(admin1):
+            tqdm.write(f"[geonames:{identifier}] missing admin 1 code for {name} ({country})")
+            continue
+
+        admin1_full = f"{country}.{admin1}"
+        admin1_term = code_to_admin1.get(admin1_full)
+        if admin1_term is None:
+            logger.info(f"could not find admin1 {admin1_full}")
+            continue
+
+        terms[admin1_term.identifier] = admin1_term
+
+        if pd.notna(admin2):
+            admin2_full = f"{country}.{admin1}.{admin2}"
+            admin2_term = code_to_admin2.get(admin2_full)
+            if admin2_term is None or admin1_term is None:
+                pass
+                # print("could not find admin2", admin2_full)
+            else:
+                term.append_relationship(part_of, admin2_term)
+                terms[admin2_term.identifier] = admin2_term
+
+        else:  # pd.notna(admin1):
+            # If there's no admin 2, just annotate directly onto admin 1
+            term.append_relationship(part_of, admin1_term)
+
+    return terms
+
+
+if __name__ == "__main__":
+    GeonamesGetter().write_default(write_obo=True, force=True)

pyobo/sources/hgnc.py
CHANGED
@@ -27,6 +27,7 @@ from pyobo.struct import (
     orthologous,
     transcribes_to,
 )
+from pyobo.struct.typedef import exact_match
 from pyobo.utils.path import ensure_path, prefix_directory_join
 
 __all__ = [
@@ -108,6 +109,28 @@ ENCODINGS = {
     "unknown": "GRP",
 }
 
+SKIP_KEYS = {
+    "date_approved_reserved",
+    "_version_",
+    "uuid",
+    "date_modified",
+    "date_name_changed",
+    "date_symbol_changed",
+    "symbol_report_tag",
+    "location_sortable",
+    "curator_notes",
+    "agr",  # repeat of HGNC ID
+    "gencc",  # repeat of HGNC ID
+    "bioparadigms_slc",  # repeat of symbol
+    "lncrnadb",  # repeat of symbol
+    "gtrnadb",  # repeat of symbol
+    "horde_id",  # repeat of symbol
+    "imgt",  # repeat of symbol
+    "cd",  # symbol
+    "homeodb",  # TODO add to bioregistry, though this is defunct
+    "mamit-trnadb",  # TODO add to bioregistry, though this is defunct
+}
+
 #: A mapping from HGNC's locus_type annotations to sequence ontology identifiers
 LOCUS_TYPE_TO_SO = {
     # protein-coding gene
@@ -190,6 +213,7 @@ class HGNCGetter(Obo):
         transcribes_to,
         orthologous,
         member_of,
+        exact_match,
     ]
     idspaces = IDSPACES
     synonym_typedefs = [
@@ -330,6 +354,12 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:
         else:
             tqdm.write(f"unhandled IUPHAR: {iuphar}")
 
+        for lrg_info in entry.pop("lsdb", []):
+            if lrg_info.startswith("LRG_"):
+                lrg_curie = lrg_info.split("|")[0]
+                _, lrg_id = lrg_curie.split("_")
+                term.append_xref(Reference(prefix="lrg", identifier=lrg_id))
+
         for xref_prefix, key in gene_xrefs:
             xref_identifiers = entry.pop(key, None)
             if xref_identifiers is None:
@@ -397,7 +427,8 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]:
         term.set_species(identifier="9606", name="Homo sapiens")
 
         for key in entry:
-            unhandled_entry_keys[key] += 1
+            if key not in SKIP_KEYS:
+                unhandled_entry_keys[key] += 1
         yield term
 
     with open(prefix_directory_join(PREFIX, name="unhandled.json"), "w") as file:

pyobo/sources/hgncgenefamily.py
CHANGED
pyobo/sources/mgi.py
CHANGED
@@ -9,6 +9,8 @@ from typing import Iterable
 import pandas as pd
 from tqdm.auto import tqdm
 
+from pyobo.struct.typedef import exact_match
+
 from ..struct import (
     Obo,
     Reference,
@@ -37,7 +39,7 @@ class MGIGetter(Obo):
 
     ontology = PREFIX
     dynamic_version = True
-    typedefs = [from_species, has_gene_product, transcribes_to]
+    typedefs = [from_species, has_gene_product, transcribes_to, exact_match]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
@@ -161,7 +163,9 @@ def get_terms(force: bool = False) -> Iterable[Term]:
         for synonym in mgi_to_synonyms[identifier]:
             term.append_synonym(Synonym(name=synonym))
         if identifier in mgi_to_entrez_id:
-            term.append_xref(Reference(prefix="ncbigene", identifier=mgi_to_entrez_id[identifier]))
+            term.append_exact_match(
+                Reference(prefix="ncbigene", identifier=mgi_to_entrez_id[identifier])
+            )
         for ensembl_id in mgi_to_ensemble_accession_ids[identifier]:
             term.append_xref(Reference(prefix="ensembl", identifier=ensembl_id))
         for ensembl_id in mgi_to_ensemble_transcript_ids[identifier]:

pyobo/sources/mirbase.py
CHANGED
@@ -136,6 +136,8 @@ def _process_definitions_lines(
         xref_prefix, xref_identifier, xref_label = map(str.strip, line.split(";"))
         xref_prefix = xref_prefix.lower()
         xref_prefix = xref_mapping.get(xref_prefix, xref_prefix)
+        if xref_prefix == "pictar":
+            continue
         xrefs.append(
             Reference(prefix=xref_prefix, identifier=xref_identifier, name=xref_label or None)
         )

pyobo/sources/mirbase_family.py
CHANGED
@@ -26,6 +26,7 @@ class MiRBaseFamilyGetter(Obo):
 
     ontology = PREFIX
     bioversions_key = "mirbase"
+    typedefs = [has_member]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
@@ -40,7 +41,9 @@ def get_obo(force: bool = False) -> Obo:
 def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     """Get miRBase family terms."""
     df = get_df(version, force=force)
-    for family_id, name, mirna_id, mirna_name in tqdm(df.values):
+    for family_id, name, mirna_id, mirna_name in tqdm(
+        df.values, total=len(df.index), unit_scale=True, desc="miRBase Family"
+    ):
         term = Term(
             reference=Reference(prefix=PREFIX, identifier=family_id, name=name),
         )
@@ -65,4 +68,4 @@ def get_df(version: str, force: bool = False) -> pd.DataFrame:
 
 
 if __name__ == "__main__":
-    get_obo().write_default(use_tqdm=True)
+    get_obo().write_default(use_tqdm=True, write_obo=True, force=True)

pyobo/sources/mirbase_mature.py
CHANGED
@@ -39,15 +39,16 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     for _, name, previous_name, mirbase_mature_id in tqdm(
         df.values, total=len(df.index), unit_scale=True
     ):
+        synonyms = []
+        if pd.notna(previous_name):
+            synonyms.append(Synonym(name=previous_name))
         yield Term(
             reference=Reference(
                 prefix=PREFIX, identifier=mirbase_mature_id, name=name if pd.notna(name) else None
             ),
-            synonyms=[
-                Synonym(name=previous_name),
-            ],
+            synonyms=synonyms,
         )
 
 
 if __name__ == "__main__":
-    get_obo().write_default(use_tqdm=True)
+    get_obo().write_default(write_obo=True, write_obograph=True, use_tqdm=True)

pyobo/sources/npass.py
CHANGED
@@ -77,7 +77,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
             logger.debug("multiple cids for %s: %s", identifier, pubchem_compound_ids)
         for pubchem_compound_id in pubchem_compound_ids:
             term.append_xref(
-                Reference(prefix="pubchem.compound", identifier=pubchem_compound_id)
+                Reference(prefix="pubchem.compound", identifier=pubchem_compound_id.strip())
             )
 
     for synonym in [iupac]:

pyobo/sources/pombase.py
CHANGED
@@ -29,7 +29,7 @@ class PomBaseGetter(Obo):
     """An ontology representation of PomBase's fission yeast gene nomenclature."""
 
     ontology = bioversions_key = PREFIX
-    typedefs = [from_species, has_gene_product]
+    typedefs = [from_species, has_gene_product, orthologous]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""

pyobo/sources/ror.py
ADDED
@@ -0,0 +1,163 @@
+"""Convert the Research Organization Registry (ROR) into an ontology."""
+
+import json
+import zipfile
+from typing import Iterable
+
+import bioregistry
+import zenodo_client
+from tqdm.auto import tqdm
+
+from pyobo.struct import Obo, Reference, Term, TypeDef
+from pyobo.struct.struct import acronym
+
+PREFIX = "ror"
+ROR_ZENODO_RECORD_ID = "10086202"
+
+# Constants
+ORG_CLASS = Reference(prefix="OBI", identifier="0000245")
+LOCATED_IN = Reference(prefix="RO", identifier="0001025")
+PART_OF = Reference(prefix="BFO", identifier="0000050")
+HAS_PART = Reference(prefix="BFO", identifier="0000051")
+SUCCESSOR = Reference(prefix="BFO", identifier="0000063")
+PREDECESSOR = Reference(prefix="BFO", identifier="0000062")
+
+RMAP = {
+    "Related": TypeDef.from_triple("rdfs", "seeAlso"),
+    "Child": TypeDef(HAS_PART),
+    "Parent": TypeDef(PART_OF),
+    "Predecessor": TypeDef(PREDECESSOR),
+    "Successor": TypeDef(SUCCESSOR),
+    "Located in": TypeDef(LOCATED_IN),
+}
+NAME_REMAPPING = {
+    "'s-Hertogenbosch": "Den Bosch",  # SMH Netherlands, why u gotta be like this
+    "'s Heeren Loo": "s Heeren Loo",
+    "'s-Heerenberg": "s-Heerenberg",
+    "Institut Virion\\Serion": "Institut Virion/Serion",
+    "Hematology\\Oncology Clinic": "Hematology/Oncology Clinic",
+}
+
+
+class RORGetter(Obo):
+    """An ontology representation of the ROR."""
+
+    ontology = bioregistry_key = PREFIX
+    typedefs = list(RMAP.values())
+    synonym_typedefs = [acronym]
+    idspaces = {
+        "ror": "https://ror.org/",
+        "geonames": "https://www.geonames.org/",
+        "envo": "http://purl.obolibrary.org/obo/ENVO_",
+        "bfo": "http://purl.obolibrary.org/obo/BFO_",
+        "ro": "http://purl.obolibrary.org/obo/RO_",
+        "obi": "http://purl.obolibrary.org/obo/OBI_",
+        "omo": "http://purl.obolibrary.org/obo/OMO_",
+        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+    }
+
+    def __post_init__(self):  # noqa: D105
+        self.data_version, _url, _path = _get_info()
+        super().__post_init__()
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology."""
+        return iterate_ror_terms(force=force)
+
+
+def iterate_ror_terms(*, force: bool = False) -> Iterable[Term]:
+    """Iterate over terms in ROR."""
+    version, source_uri, records = get_latest(force=force)
+    unhandled_xref_prefixes = set()
+    for record in tqdm(records, unit_scale=True, unit="record", desc=PREFIX):
+        identifier = record["id"].removeprefix("https://ror.org/")
+        name = record["name"]
+        name = NAME_REMAPPING.get(name, name)
+
+        term = Term(
+            reference=Reference(prefix=PREFIX, identifier=identifier, name=name), type="Instance"
+        )
+        term.append_parent(ORG_CLASS)
+
+        if name.startswith("The "):
+            term.append_synonym(name.removeprefix("The "))
+
+        for relationship in record.get("relationships", []):
+            target_id = relationship["id"].removeprefix("https://ror.org/")
+            term.append_relationship(
+                RMAP[relationship["type"]], Reference(prefix=PREFIX, identifier=target_id)
+            )
+
+        term.is_obsolete = record.get("status") != "active"
+
+        for address in record.get("addresses", []):
+            city = address.get("geonames_city")
+            if not city:
+                continue
+            term.append_relationship(
+                RMAP["Located in"], Reference(prefix="geonames", identifier=str(city["id"]))
+            )
+
+        for label in record.get("labels", []):
+            label = label["label"]  # there's a language available in this dict too
+            term.append_synonym(label)
+            if label.startswith("The "):
+                term.append_synonym(label.removeprefix("The "))
+
+        for synonym in record.get("aliases", []):
+            term.append_synonym(synonym)
+            if synonym.startswith("The "):
+                term.append_synonym(synonym.removeprefix("The "))
+
+        for acronym_synonym in record.get("acronyms", []):
+            term.append_synonym(acronym_synonym, type=acronym)
+
+        for prefix, xref_data in record.get("external_ids", {}).items():
+            if prefix == "OrgRef":
+                # OrgRef refers to wikipedia page id, see
+                # https://stackoverflow.com/questions/6168020/what-is-wikipedia-pageid-how-to-change-it-into-real-page-url
+                continue
+            norm_prefix = bioregistry.normalize_prefix(prefix)
+            if norm_prefix is None:
+                if prefix not in unhandled_xref_prefixes:
+                    tqdm.write(f"Unhandled prefix: {prefix} in {name} ({term.curie}). Values:")
+                    for xref_id in xref_data["all"]:
+                        tqdm.write(f"- {xref_id}")
+                    unhandled_xref_prefixes.add(prefix)
+                continue
+
+            identifiers = xref_data["all"]
+            if isinstance(identifiers, str):
+                identifiers = [identifiers]
+            for xref_id in identifiers:
+                term.append_xref(Reference(prefix=norm_prefix, identifier=xref_id.replace(" ", "")))
+
+        yield term
+
+
+def _get_info(*, force: bool = False):
+    client = zenodo_client.Zenodo()
+    latest_record_id = client.get_latest_record(ROR_ZENODO_RECORD_ID)
+    response = client.get_record(latest_record_id)
+    response_json = response.json()
+    version = response_json["metadata"]["version"].lstrip("v")
+    file_record = response_json["files"][0]
+    name = file_record["key"]
+    url = file_record["links"]["self"]
+    path = client.download(latest_record_id, name=name, force=force)
+    return version, url, path
+
+
+def get_latest(*, force: bool = False):
+    """Get the latest ROR metadata and records."""
+    version, url, path = _get_info(force=force)
+    with zipfile.ZipFile(path) as zf:
+        for zip_info in zf.filelist:
+            if zip_info.filename.endswith(".json"):
+                with zf.open(zip_info) as file:
+                    return version, url, json.load(file)
+    raise FileNotFoundError
+
+
+if __name__ == "__main__":
+    RORGetter().write_default(write_obo=True, force=True)

pyobo/sources/sgd.py
CHANGED
@@ -5,7 +5,7 @@
 from typing import Iterable
 from urllib.parse import unquote_plus
 
-from ..struct import Obo, Reference, Synonym, SynonymTypeDef, Term, from_species
+from ..struct import Obo, Reference, Synonym, Term, from_species
 from ..utils.path import ensure_tar_df
 
 __all__ = [
@@ -21,15 +21,12 @@ URL = (
 )
 INNER_PATH = "S288C_reference_genome_R64-2-1_20150113/saccharomyces_cerevisiae_R64-2-1_20150113.gff"
 
-alias_type = SynonymTypeDef.from_text("alias")
-
 
 class SGDGetter(Obo):
     """An ontology representation of SGD's yeast gene nomenclature."""
 
     bioversions_key = ontology = PREFIX
     typedefs = [from_species]
-    synonym_typedefs = [alias_type]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms for SGD."""
@@ -68,7 +65,7 @@ def get_terms(ontology: Obo, force: bool = False) -> Iterable[Term]:
         aliases = d.get("Alias")
         if aliases:
             for alias in aliases.split(","):
-                synonyms.append(Synonym(name=unquote_plus(alias), type=alias_type))
+                synonyms.append(Synonym(name=unquote_plus(alias)))
 
         term = Term(
             reference=Reference(prefix=PREFIX, identifier=identifier, name=name),

pyobo/sources/slm.py
CHANGED
@@ -7,7 +7,9 @@ from typing import Iterable
 import pandas as pd
 from tqdm.auto import tqdm
 
-from pyobo import Obo, SynonymTypeDef, Term
+from pyobo import Obo, Reference, Term
+from pyobo.struct.struct import abbreviation as abbreviation_typedef
+from pyobo.struct.typedef import exact_match, has_inchi, has_smiles
 from pyobo.utils.path import ensure_df
 
 __all__ = [
@@ -37,14 +39,13 @@ COLUMNS = [
     "PMID",
 ]
 
-abreviation_type = SynonymTypeDef.from_text("abbreviation")
-
 
 class SLMGetter(Obo):
     """An ontology representation of SwissLipid's lipid nomenclature."""
 
     ontology = bioversions_key = PREFIX
-    synonym_typedefs = [abreviation_type]
+    typedefs = [exact_match]
+    synonym_typedefs = [abbreviation_typedef]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
@@ -90,28 +91,29 @@ def iter_terms(version: str, force: bool = False):
         else:
             raise ValueError(identifier)
         term = Term.from_triple(PREFIX, identifier, name)
-
+        if pd.notna(level):
+            term.append_property("level", level)
         if pd.notna(abbreviation):
-            term.append_synonym(abbreviation, type=abreviation_type)
+            term.append_synonym(abbreviation, type=abbreviation_typedef)
         if pd.notna(synonyms):
             for synonym in synonyms.split("|"):
                 term.append_synonym(synonym.strip())
         if pd.notna(smiles):
-            term.append_property("smiles", smiles)
+            term.append_property(has_smiles, smiles)
         if pd.notna(inchi) and inchi != "InChI=none":
             if inchi.startswith("InChI="):
                 inchi = inchi[len("InChI=") :]
-            term.append_property("inchi", inchi)
+            term.append_property(has_inchi, inchi)
         if pd.notna(inchikey):
             if inchikey.startswith("InChIKey="):
                 inchikey = inchikey[len("InChIKey=") :]
-            term.append_xref(Reference(prefix="inchikey", identifier=inchikey))
+            term.append_exact_match(Reference(prefix="inchikey", identifier=inchikey))
         if pd.notna(chebi_id):
-            term.append_xref(("chebi", chebi_id))
+            term.append_exact_match(("chebi", chebi_id))
        if pd.notna(lipidmaps_id):
-            term.append_xref(("lipidmaps", lipidmaps_id))
+            term.append_exact_match(("lipidmaps", lipidmaps_id))
         if pd.notna(hmdb_id):
-            term.append_xref(("hmdb", hmdb_id))
+            term.append_exact_match(("hmdb", hmdb_id))
         if pd.notna(pmids):
             for pmid in pmids.split("|"):
                 term.append_provenance(("pubmed", pmid))
