pyobo 0.11.0__py3-none-any.whl → 0.11.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/constants.py +1 -0
- pyobo/gilda_utils.py +14 -11
- pyobo/obographs.py +5 -2
- pyobo/resources/so.py +55 -0
- pyobo/resources/so.tsv +2604 -0
- pyobo/sources/complexportal.py +54 -15
- pyobo/sources/dictybase_gene.py +14 -9
- pyobo/sources/drugcentral.py +4 -1
- pyobo/sources/expasy.py +22 -4
- pyobo/sources/flybase.py +3 -2
- pyobo/sources/hgnc.py +24 -19
- pyobo/sources/hgncgenefamily.py +7 -7
- pyobo/sources/kegg/genome.py +18 -6
- pyobo/sources/mirbase.py +9 -3
- pyobo/sources/npass.py +1 -1
- pyobo/sources/pathbank.py +32 -23
- pyobo/sources/pombase.py +6 -3
- pyobo/sources/reactome.py +28 -7
- pyobo/sources/rgd.py +1 -1
- pyobo/sources/slm.py +28 -14
- pyobo/sources/uniprot/uniprot.py +7 -6
- pyobo/sources/zfin.py +18 -6
- pyobo/struct/reference.py +9 -8
- pyobo/struct/struct.py +30 -20
- pyobo/struct/typedef.py +5 -0
- pyobo/version.py +1 -1
- {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/METADATA +50 -62
- {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/RECORD +31 -45
- {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/WHEEL +1 -1
- pyobo/apps/__init__.py +0 -3
- pyobo/apps/cli.py +0 -24
- pyobo/apps/gilda/__init__.py +0 -3
- pyobo/apps/gilda/__main__.py +0 -8
- pyobo/apps/gilda/app.py +0 -48
- pyobo/apps/gilda/cli.py +0 -36
- pyobo/apps/gilda/templates/base.html +0 -33
- pyobo/apps/gilda/templates/home.html +0 -11
- pyobo/apps/gilda/templates/matches.html +0 -32
- pyobo/apps/mapper/__init__.py +0 -3
- pyobo/apps/mapper/__main__.py +0 -11
- pyobo/apps/mapper/cli.py +0 -37
- pyobo/apps/mapper/mapper.py +0 -187
- pyobo/apps/mapper/templates/base.html +0 -35
- pyobo/apps/mapper/templates/mapper_home.html +0 -64
- pyobo-0.11.0.dist-info/LICENSE +0 -21
- {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/entry_points.txt +0 -0
- {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/top_level.txt +0 -0
pyobo/sources/complexportal.py
CHANGED
```diff
@@ -58,11 +58,31 @@ def _parse_members(s) -> list[tuple[Reference, str]]:
     for member in s.split("|"):
         entity_id, count = member.split("(")
         count = count.rstrip(")")
-        if "
+        if entity_id.startswith("URS"):
+            prefix, identifier = "rnacentral", entity_id
+        elif entity_id.startswith("CPX"):
+            # TODO why self xref?
+            prefix, identifier = "complexportal", entity_id
+        elif entity_id.startswith("["):
+            continue  # this is a list of uniprot IDs, not sure what to do with this
+        elif entity_id.startswith("EBI-"):
+            continue
+        elif ":" not in entity_id:
+            if "PRO_" in entity_id:
+                prefix = "uniprot.chain"
+                identifier = entity_id.split("-")[1]
+            elif "-" in entity_id:
+                prefix, identifier = "uniprot.isoform", entity_id
+            else:
+                prefix, identifier = "uniprot", entity_id
+        else:
             prefix, identifier = entity_id.split(":", 1)
+        try:
+            reference = Reference(prefix=prefix, identifier=identifier)
+        except ValueError:
+            tqdm.write(f"failed to validate reference: {entity_id}")
         else:
-
-            rv.append((Reference(prefix=prefix, identifier=identifier), count))
+            rv.append((reference, count))
     return rv
 
 
@@ -74,27 +94,40 @@ def _parse_xrefs(s) -> list[tuple[Reference, str]]:
     for xref in s.split("|"):
         xref = xref.replace("protein ontology:PR:", "PR:")
         xref = xref.replace("protein ontology:PR_", "PR:")
+        xref = xref.replace("rhea:rhea ", "rhea:")
+        xref = xref.replace("rhea:Rhea ", "rhea:")
+        xref = xref.replace("rhea:RHEA:rhea", "rhea:")
+        xref = xref.replace("rhea:RHEA: ", "rhea:")
+        xref = xref.replace("rhea:RHEA:rhea ", "rhea:")
+        xref = xref.replace("intenz:RHEA:", "rhea:")
+        xref = xref.replace("eccode::", "eccode:")
+        xref = xref.replace("eccode:EC:", "eccode:")
+        xref = xref.replace("intenz:EC:", "eccode:")
+        xref = xref.replace("eccode:RHEA:", "rhea:")
+        xref = xref.replace("efo:MONDO:", "MONDO:")
+        xref = xref.replace("omim:MIM:", "omim:")
+        xref = xref.replace("efo:HP:", "HP:")
+        xref = xref.replace("efo:Orphanet:", "Orphanet:")
+        xref = xref.replace("orphanet:ORDO:", "Orphanet:")
+        xref = xref.replace("biorxiv:doi.org/", "doi:")
+        xref = xref.replace("emdb:EMDB-", "emdb:EMD-")
+        xref = xref.replace("wwpdb:EMD-", "emdb:EMD-")
+        xref = xref.replace("signor:CPX-", "complexportal:CPX-")
+
         try:
             xref_curie, note = xref.split("(")
         except ValueError:
             logger.warning("xref missing (: %s", xref)
             continue
         note = note.rstrip(")")
-
-
-
-
-        note.replace("eccode:RHEA:", "rhea:")
-        if note.lower().startswith("rhea "):
-            note = note[len("Rhea ") :]
-        if note.lower().startswith("rhea:rhea "):
-            note = note[len("rhea:rhea ") :]
-        if note.lower().startswith("EC:"):
-            note = note[len("EC:") :]
+
+        if xref_curie.startswith("intenz:"):
+            xref_curie = _clean_intenz(xref_curie)
+
         try:
             reference = Reference.from_curie(xref_curie)
         except ValueError:
-            logger.warning("can not parse CURIE: %s",
+            logger.warning("can not parse CURIE: %s", xref_curie)
             continue
         if reference is None:
             logger.warning("reference is None after parsing: %s", xref)
@@ -103,6 +136,12 @@ def _parse_xrefs(s) -> list[tuple[Reference, str]]:
     return rv
 
 
+def _clean_intenz(s: str) -> str:
+    for _ in range(3):
+        s = s.rstrip("-").rstrip(".")
+    return s
+
+
 class ComplexPortalGetter(Obo):
     """An ontology representation of the Complex Portal."""
 
```
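The interesting part of this change is the new prefix routing in `_parse_members`, which replaces the old bare CURIE split. A minimal standalone sketch of that routing, using a plain `(prefix, identifier)` tuple in place of pyobo's `Reference` model and illustrative accession strings:

```python
from __future__ import annotations

# Sketch of the new member routing in _parse_members; the example accessions
# below are illustrative, and a tuple stands in for pyobo's Reference model.
def route_member(entity_id: str) -> tuple[str, str] | None:
    if entity_id.startswith("URS"):  # RNAcentral sequence
        return "rnacentral", entity_id
    if entity_id.startswith("CPX"):  # a complex referring to another complex
        return "complexportal", entity_id
    if entity_id.startswith("[") or entity_id.startswith("EBI-"):
        return None  # skipped in the diff (uniprot ID lists, EBI- ids)
    if ":" not in entity_id:
        if "PRO_" in entity_id:  # UniProt chain, e.g. <accession>-PRO_<number>
            return "uniprot.chain", entity_id.split("-")[1]
        if "-" in entity_id:  # UniProt isoform, e.g. <accession>-2
            return "uniprot.isoform", entity_id
        return "uniprot", entity_id
    prefix, identifier = entity_id.split(":", 1)  # already a CURIE
    return prefix, identifier

assert route_member("URS00000AAAAA") == ("rnacentral", "URS00000AAAAA")
assert route_member("P12345-2") == ("uniprot.isoform", "P12345-2")
assert route_member("EBI-1234") is None
```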
pyobo/sources/dictybase_gene.py
CHANGED
```diff
@@ -9,8 +9,7 @@ from collections.abc import Iterable
 import pandas as pd
 from tqdm.auto import tqdm
 
-from pyobo.struct import Obo,
-from pyobo.utils.io import multisetdict
+from pyobo.struct import Obo, Synonym, Term, from_species, has_gene_product
 from pyobo.utils.path import ensure_df
 
 __all__ = [
@@ -49,10 +48,11 @@ def get_obo(force: bool = False) -> Obo:
 
 def get_terms(force: bool = False) -> Iterable[Term]:
     """Get terms."""
+    # TODO the mappings file has actually no uniprot at all, and requires text mining
     # DDB ID  DDB_G ID  Name  UniProt ID
-    uniprot_mappings = multisetdict(
-
-    )
+    # uniprot_mappings = multisetdict(
+    #     ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
+    # )
 
     terms = ensure_df(PREFIX, url=URL, force=force, name="gene_info.tsv")
     # GENE ID (DDB_G ID)  Gene Name  Synonyms  Gene products
@@ -68,10 +68,15 @@ def get_terms(force: bool = False) -> Iterable[Term]:
         if synonyms and pd.notna(synonyms):
             for synonym in synonyms.split(","):
                 term.append_synonym(Synonym(synonym.strip()))
-        for uniprot_id in uniprot_mappings.get(identifier, []):
-
-
-
+        # for uniprot_id in uniprot_mappings.get(identifier, []):
+        #     if not uniprot_id or pd.isna(uniprot_id) or uniprot_id in {"unknown", "pseudogene"}:
+        #         continue
+        #     try:
+        #         uniprot_ref = Reference(prefix="uniprot", identifier=uniprot_id)
+        #     except ValueError:
+        #         tqdm.write(f"[dictybase.gene] invalid uniprot ref: {uniprot_id}")
+        #     else:
+        #         term.append_relationship(has_gene_product, uniprot_ref)
 
         term.set_species(identifier="44689", name="Dictyostelium discoideum")
         yield term
```
pyobo/sources/drugcentral.py
CHANGED
```diff
@@ -68,6 +68,9 @@ def iter_terms() -> Iterable[Term]:
         if xref_prefix_norm is None:
             tqdm.write(f"did not normalize {prefix}:{identifier}")
             continue
+        if xref_prefix_norm == "pdb.ligand":
+            # there is a weird invalid escaped \W appearing in pdb ligand ids
+            identifier = identifier.strip()
         identifier = bioregistry.standardize_identifier(xref_prefix_norm, identifier)
         xrefs[str(drugcentral_id)].append(
             Reference(prefix=xref_prefix_norm, identifier=identifier)
@@ -98,4 +101,4 @@ def iter_terms() -> Iterable[Term]:
 
 
 if __name__ == "__main__":
-
+    DrugCentralGetter.cli()
```
pyobo/sources/expasy.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 """Convert ExPASy to OBO."""
 
 import logging
+import re
 from collections import defaultdict
 from collections.abc import Iterable, Mapping
 from typing import Any, Optional
@@ -42,7 +43,7 @@ class ExpasyGetter(Obo):
     """A getter for ExPASy Enzyme Classes."""
 
     bioversions_key = ontology = PREFIX
-    typedefs = [has_member, enables]
+    typedefs = [has_member, enables, term_replaced_by]
     root_terms = [
         Reference(prefix="eccode", identifier="1"),
         Reference(prefix="eccode", identifier="2"),
@@ -145,7 +146,9 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
         for domain in data.get("domains", []):
             term.append_relationship(
                 has_member,
-                Reference(
+                Reference.model_validate(
+                    {"prefix": domain["namespace"], "identifier": domain["identifier"]},
+                ),
             )
         for protein in data.get("proteins", []):
             term.append_relationship(
@@ -248,8 +251,10 @@ def get_database(lines: Iterable[str]) -> Mapping:
         elif descriptor == DE and value == "Deleted entry.":
             ec_data_entry["deleted"] = True
         elif descriptor == DE and value.startswith("Transferred entry: "):
-
-
+            # TODO There's a situation where there are enough transfers that it goes on to a second line
+            # the following line just gives up on this one. or maybe I don't understand
+            value = value.strip().removesuffix("and").rstrip(",").strip()
+            ec_data_entry["transfer_id"] = _parse_transfer(value)
         elif descriptor == DE:
             ec_data_entry["concept"]["name"] = value.rstrip(".")  # type:ignore
         elif descriptor == AN:
@@ -279,6 +284,19 @@ def get_database(lines: Iterable[str]) -> Mapping:
     return rv
 
 
+TRANSFER_SPLIT_RE = re.compile(r",\s*|\s+and\s+")
+
+
+def _parse_transfer(value: str) -> list[str]:
+    """Parse transferred entry string.
+
+    >>> _parse_transfer("Transferred entry: 1.1.1.198, 1.1.1.227 and 1.1.1.228.")
+    ['1.1.1.198', '1.1.1.227', '1.1.1.228']
+    """
+    value = value[len("Transferred entry: ") :].rstrip().rstrip(".")
+    return sorted(x.strip().removeprefix("and").strip() for x in TRANSFER_SPLIT_RE.split(value))
+
+
 def _group_by_id(lines):
     """Group lines by identifier."""
     groups = []
```
pyobo/sources/flybase.py
CHANGED
```diff
@@ -7,6 +7,7 @@ import pandas as pd
 from tqdm.auto import tqdm
 
 from pyobo import Reference
+from pyobo.resources.so import get_so_name
 from pyobo.struct import Obo, Term, from_species, orthologous
 from pyobo.utils.io import multisetdict
 from pyobo.utils.path import ensure_df
@@ -133,7 +134,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
                 "FlyBase gene type is missing mapping to Sequence Ontology (SO): %s", gtype
             )
         else:
-            so[gtype] = Reference
+            so[gtype] = Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id))
 
     for _, reference in sorted(so.items()):
         yield Term(reference=reference)
@@ -153,7 +154,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
         for hgnc_curie in human_orthologs.get(identifier, []):
             if not hgnc_curie or pd.isna(hgnc_curie):
                 continue
-            hgnc_ortholog = Reference.from_curie(hgnc_curie
+            hgnc_ortholog = Reference.from_curie(hgnc_curie)
             if hgnc_ortholog is None:
                 tqdm.write(f"[{PREFIX}] {identifier} had invalid ortholog: {hgnc_curie}")
             else:
```
pyobo/sources/hgnc.py
CHANGED
```diff
@@ -13,6 +13,7 @@ from tabulate import tabulate
 from tqdm.auto import tqdm
 
 from pyobo.api.utils import get_version
+from pyobo.resources.so import get_so_name
 from pyobo.struct import (
     Obo,
     Reference,
@@ -37,8 +38,8 @@ logger = logging.getLogger(__name__)
 
 PREFIX = "hgnc"
 DEFINITIONS_URL_FMT = (
-    "
-    "
+    "https://storage.googleapis.com/public-download-files/hgnc/archive/archive/monthly/json/"
+    "hgnc_complete_set_{version}.json"
 )
 
 previous_symbol_type = SynonymTypeDef.from_text("previous_symbol")
@@ -222,7 +223,7 @@ class HGNCGetter(Obo):
         alias_symbol_type,
     ]
     root_terms = [
-        Reference(prefix="
+        Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id))
         for so_id in sorted(set(LOCUS_TYPE_TO_SO.values()))
         if so_id
     ]
@@ -256,7 +257,7 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
     yield Term.from_triple("NCBITaxon", "9606", "Homo sapiens")
     yield from sorted(
         {
-            Term(reference=Reference
+            Term(reference=Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id)))
             for so_id in sorted(LOCUS_TYPE_TO_SO.values())
             if so_id
         },
@@ -363,23 +364,25 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
             xref_identifiers = entry.pop(key, None)
             if xref_identifiers is None:
                 continue
-
             if isinstance(xref_identifiers, (str, int)):
+                xref_identifiers = [str(xref_identifiers)]
+
+            if xref_prefix == "merops.entry":
+                continue
+                # e.g., XM02-001 should be rewritten as XM02.001
+                xref_identifiers = [i.replace("-", ".") for i in xref_identifiers]
+
+            if xref_prefix == "refseq":
+                # e.g., strip off dots without substantiated record versions like in NM_021728.
+                xref_identifiers = [i.strip(".") for i in xref_identifiers]
+
+            if len(xref_identifiers) == 1:
                 term.append_exact_match(
-                    Reference(prefix=xref_prefix, identifier=str(xref_identifiers))
+                    Reference(prefix=xref_prefix, identifier=str(xref_identifiers[0]))
                 )
-            elif isinstance(xref_identifiers, list):
-                if len(xref_identifiers) == 1:
-                    term.append_exact_match(
-                        Reference(prefix=xref_prefix, identifier=str(xref_identifiers[0]))
-                    )
-                else:
-                    for xref_identifier in xref_identifiers:
-                        term.append_xref(
-                            Reference(prefix=xref_prefix, identifier=str(xref_identifier))
-                        )
             else:
-
+                for xref_identifier in xref_identifiers:
+                    term.append_xref(Reference(prefix=xref_prefix, identifier=str(xref_identifier)))
 
             for pubmed_id in entry.pop("pubmed_id", []):
                 term.append_provenance(Reference(prefix="pubmed", identifier=str(pubmed_id)))
@@ -416,9 +419,11 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
         locus_group = entry.pop("locus_group")
         so_id = LOCUS_TYPE_TO_SO.get(locus_type)
         if so_id:
-            term.append_parent(Reference
+            term.append_parent(Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id)))
         else:
-            term.append_parent(
+            term.append_parent(
+                Reference(prefix="SO", identifier="0000704", name=get_so_name("0000704"))
+            )  # gene
             unhandle_locus_types[locus_type][identifier] = term
         term.append_property("locus_type", locus_type)
         term.append_property("locus_group", locus_group)
```
pyobo/sources/hgncgenefamily.py
CHANGED
```diff
@@ -21,13 +21,13 @@ __all__ = [
 ]
 
 PREFIX = "hgnc.genegroup"
-FAMILIES_URL = "
+FAMILIES_URL = "https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/family.csv"
 # TODO use family_alias.csv
-HIERARCHY_URL =
-    "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/csv/genefamily_db_tables/hierarchy.csv"
-)
+HIERARCHY_URL = "https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/hierarchy.csv"
 
-symbol_type = SynonymTypeDef
+symbol_type = SynonymTypeDef(
+    reference=Reference(prefix="OMO", identifier="0004000", name="has symbol")
+)
 
 
 class HGNCGroupGetter(Obo):
@@ -78,7 +78,7 @@ def get_terms(force: bool = False) -> Iterable[Term]:
                 name=parent.name,
             )
         )
-    gene_group = Reference
+    gene_group = Reference(prefix="SO", identifier="0005855", name="gene group")
     yield Term(reference=gene_group)
     for term in terms:
         if not term.parents:
@@ -98,7 +98,7 @@ def _get_terms_helper(force: bool = False) -> Iterable[Term]:
             definition=definition,
         )
         if pubmed_ids and pd.notna(pubmed_ids):
-            for s in pubmed_ids.split(","):
+            for s in pubmed_ids.replace(" ", ",").split(","):
                 term.append_provenance(Reference(prefix="pubmed", identifier=s.strip()))
         if desc_go and pd.notna(desc_go):
             go_id = desc_go[len("http://purl.uniprot.org/go/") :]
```
pyobo/sources/kegg/genome.py
CHANGED
```diff
@@ -3,6 +3,8 @@
 Run with ``python -m pyobo.sources.kegg.genome``
 """
 
+from __future__ import annotations
+
 import logging
 from collections.abc import Iterable
 
@@ -46,8 +48,11 @@ def get_obo() -> Obo:
     return KEGGGenomeGetter()
 
 
-def parse_genome_line(line: str) -> KEGGGenome:
+def parse_genome_line(line: str) -> KEGGGenome | None:
     """Parse a line from the KEGG Genome database."""
+    if not line.startswith("T"):
+        # This is for an NCBI Taxonomy
+        return None
     line = line.strip()
     identifier, rest = _s(line, "\t")
     identifier = identifier[len("gn:") :]
@@ -94,6 +99,8 @@ def iter_kegg_genomes(version: str, desc: str) -> Iterable[KEGGGenome]:
     it = tqdm(lines, desc=desc, unit_scale=True, unit="genome")
     for line in it:
         yv = parse_genome_line(line)
+        if yv is None:
+            continue
         it.set_postfix({"id": yv.identifier, "name": yv.name})
         yield yv
 
@@ -105,11 +112,16 @@ def iter_terms(version: str) -> Iterable[Term]:
     for kegg_genome in iter_kegg_genomes(version=version, desc="KEGG Genomes"):
         if kegg_genome.identifier in SKIP:
             continue
-
-
-
-
-
+
+        try:
+            reference = Reference(
+                prefix=KEGG_GENOME_PREFIX, identifier=kegg_genome.identifier, name=kegg_genome.name
+            )
+        except ValueError:
+            tqdm.write(f"[{KEGG_GENOME_PREFIX}] invalid identifier: {kegg_genome}")
+            continue
+
+        term = Term(reference=reference)
         if kegg_genome.taxonomy_id is not None:
             taxonomy_name = get_ncbitaxon_name(kegg_genome.taxonomy_id)
             if taxonomy_name is None:
```
pyobo/sources/mirbase.py
CHANGED
```diff
@@ -136,9 +136,15 @@ def _process_definitions_lines(
         xref_prefix = xref_mapping.get(xref_prefix, xref_prefix)
         if xref_prefix == "pictar":
             continue
-
-
-
+
+        try:
+            xref = Reference(
+                prefix=xref_prefix, identifier=xref_identifier, name=xref_label or None
+            )
+        except ValueError:
+            tqdm.write(f"invalid xref: {xref_prefix}:{xref_identifier}")
+        else:
+            xrefs.append(xref)
 
         # TODO add pubmed references
 
```
pyobo/sources/npass.py
CHANGED
```diff
@@ -39,7 +39,7 @@ def get_obo(force: bool = False) -> Obo:
 
 def get_df(version: str, force: bool = False) -> pd.DataFrame:
     """Get the NPASS chemical nomenclature."""
-    base_url = f"
+    base_url = f"https://bidd.group/NPASS/downloadFiles/NPASSv{version}_download"
     url = f"{base_url}_naturalProducts_generalInfo.txt"
     return ensure_df(
         PREFIX,
```
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
"""Converter for PathBank."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import logging
|
|
4
6
|
from collections import defaultdict
|
|
5
7
|
from collections.abc import Iterable, Mapping
|
|
@@ -8,7 +10,7 @@ import pandas as pd
|
|
|
8
10
|
from tqdm.auto import tqdm
|
|
9
11
|
|
|
10
12
|
from ..struct import Obo, Reference, Term
|
|
11
|
-
from ..struct.typedef import has_participant
|
|
13
|
+
from ..struct.typedef import has_category, has_participant
|
|
12
14
|
from ..utils.path import ensure_df
|
|
13
15
|
|
|
14
16
|
__all__ = [
|
|
@@ -68,7 +70,7 @@ class PathBankGetter(Obo):
|
|
|
68
70
|
"""An ontology representation of PathBank's pathway nomenclature."""
|
|
69
71
|
|
|
70
72
|
ontology = bioversions_key = PREFIX
|
|
71
|
-
typedefs = [has_participant]
|
|
73
|
+
typedefs = [has_participant, has_category]
|
|
72
74
|
|
|
73
75
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
74
76
|
"""Iterate over terms in the ontology."""
|
|
@@ -103,21 +105,30 @@ def get_protein_mapping(version: str, force: bool = False) -> Mapping[str, set[R
|
|
|
103
105
|
for pathway_id, protein_id in tqdm(
|
|
104
106
|
proteins_df.values, desc=f"[{PREFIX}] mapping proteins", unit_scale=True
|
|
105
107
|
):
|
|
106
|
-
|
|
107
|
-
|
|
108
|
+
try:
|
|
109
|
+
if "-" in protein_id:
|
|
110
|
+
reference = Reference(prefix="uniprot.isoform", identifier=protein_id)
|
|
111
|
+
else:
|
|
112
|
+
reference = Reference(prefix="uniprot", identifier=protein_id)
|
|
113
|
+
except ValueError:
|
|
114
|
+
tqdm.write(f"[pathbank] invalid uniprot identifier: {protein_id}")
|
|
115
|
+
else:
|
|
116
|
+
smpdb_id_to_proteins[pathway_id].add(reference)
|
|
108
117
|
return smpdb_id_to_proteins
|
|
109
118
|
|
|
110
119
|
|
|
111
120
|
def get_metabolite_df(version: str, force: bool = False) -> pd.DataFrame:
|
|
112
121
|
"""Get the metabolites dataframe."""
|
|
113
|
-
|
|
122
|
+
df = ensure_df(
|
|
114
123
|
PREFIX,
|
|
115
124
|
url=METABOLITE_URL,
|
|
116
125
|
sep=",",
|
|
117
|
-
usecols=["PathBank ID", "
|
|
126
|
+
usecols=["PathBank ID", "ChEBI ID"],
|
|
118
127
|
force=force,
|
|
119
128
|
version=version,
|
|
120
129
|
)
|
|
130
|
+
df = df[df["ChEBI ID"].notna()]
|
|
131
|
+
return df
|
|
121
132
|
|
|
122
133
|
|
|
123
134
|
def get_metabolite_mapping(version: str, force: bool = False) -> Mapping[str, set[Reference]]:
|
|
@@ -125,17 +136,20 @@ def get_metabolite_mapping(version: str, force: bool = False) -> Mapping[str, se
|
|
|
125
136
|
metabolites_df = get_metabolite_df(version=version, force=force)
|
|
126
137
|
smpdb_id_to_metabolites = defaultdict(set)
|
|
127
138
|
it = tqdm(metabolites_df.values, desc=f"[{PREFIX}] mapping metabolites", unit_scale=True)
|
|
128
|
-
for pathway_id, metabolite_id
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
prefix=PREFIX,
|
|
132
|
-
identifier=metabolite_id,
|
|
133
|
-
name=metabolite_name,
|
|
134
|
-
)
|
|
135
|
-
)
|
|
139
|
+
for pathway_id, metabolite_id in it:
|
|
140
|
+
reference = Reference(prefix="chebi", identifier=metabolite_id.strip())
|
|
141
|
+
smpdb_id_to_metabolites[pathway_id].add(reference)
|
|
136
142
|
return smpdb_id_to_metabolites
|
|
137
143
|
|
|
138
144
|
|
|
145
|
+
def _clean_description(description: str) -> str | None:
|
|
146
|
+
"""Clean the description."""
|
|
147
|
+
if pd.isna(description) or not description:
|
|
148
|
+
return None
|
|
149
|
+
parts = [part.strip() for part in description.strip().splitlines()]
|
|
150
|
+
return " ".join(parts)
|
|
151
|
+
|
|
152
|
+
|
|
139
153
|
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
140
154
|
"""Get PathBank's terms."""
|
|
141
155
|
smpdb_id_to_proteins = get_protein_mapping(version=version, force=force)
|
|
@@ -147,16 +161,11 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
147
161
|
reference = Reference(prefix=PREFIX, identifier=pathbank_id, name=name)
|
|
148
162
|
term = Term(
|
|
149
163
|
reference=reference,
|
|
150
|
-
#
|
|
151
|
-
|
|
152
|
-
)
|
|
153
|
-
term.append_parent(
|
|
154
|
-
Reference(
|
|
155
|
-
prefix=PREFIX,
|
|
156
|
-
identifier=subject.lower().replace(" ", "_"),
|
|
157
|
-
name=subject,
|
|
158
|
-
)
|
|
164
|
+
# TODO use _clean_description(description) to add a description,
|
|
165
|
+
# but there are weird parser errors
|
|
159
166
|
)
|
|
167
|
+
term.append_exact_match(Reference(prefix="smpdb", identifier=smpdb_id))
|
|
168
|
+
term.append_property(has_category, subject.lower().replace(" ", "_"))
|
|
160
169
|
term.extend_relationship(has_participant, smpdb_id_to_proteins[smpdb_id])
|
|
161
170
|
term.extend_relationship(has_participant, smpdb_id_to_metabolites[smpdb_id])
|
|
162
171
|
yield term
|
pyobo/sources/pombase.py
CHANGED
```diff
@@ -9,6 +9,7 @@ from tqdm.auto import tqdm
 
 import pyobo
 from pyobo import Reference
+from pyobo.resources.so import get_so_name
 from pyobo.struct import Obo, Term, from_species, has_gene_product, orthologous
 from pyobo.utils.path import ensure_df
 
@@ -19,7 +20,7 @@ __all__ = [
 logger = logging.getLogger(__name__)
 
 PREFIX = "pombase"
-
+GENE_NAMES_URL = "https://www.pombase.org/data/names_and_identifiers/gene_IDs_names_products.tsv"
 ORTHOLOGS_URL = "https://www.pombase.org/data/orthologs/human-orthologs.txt.gz"
 
 
@@ -68,9 +69,11 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
         if hgnc_id is not None:
             identifier_to_hgnc_ids[identifier].add(hgnc_id)
 
-    df = ensure_df(PREFIX, url=
+    df = ensure_df(PREFIX, url=GENE_NAMES_URL, force=force, version=version)
     so = {
-        gtype: Reference
+        gtype: Reference(
+            prefix="SO", identifier=POMBASE_TO_SO[gtype], name=get_so_name(POMBASE_TO_SO[gtype])
+        )
        for gtype in sorted(df[df.columns[6]].unique())
     }
     for _, reference in sorted(so.items()):
```
CHANGED
|
@@ -70,7 +70,9 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
70
70
|
df["taxonomy_id"] = df["species"].map(get_ncbitaxon_id)
|
|
71
71
|
|
|
72
72
|
terms = {}
|
|
73
|
-
it = tqdm(
|
|
73
|
+
it = tqdm(
|
|
74
|
+
df.values, total=len(df.index), desc=f"mapping {PREFIX}", unit_scale=True, unit="pathway"
|
|
75
|
+
)
|
|
74
76
|
for reactome_id, name, species_name, taxonomy_id in it:
|
|
75
77
|
terms[reactome_id] = term = Term(
|
|
76
78
|
reference=Reference(prefix=PREFIX, identifier=reactome_id, name=name),
|
|
@@ -92,10 +94,21 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
92
94
|
terms[child_id].append_parent(terms[parent_id])
|
|
93
95
|
|
|
94
96
|
uniprot_pathway_df = ensure_participant_df(version=version, force=force)
|
|
95
|
-
for uniprot_id, reactome_id in tqdm(
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
97
|
+
for uniprot_id, reactome_id in tqdm(
|
|
98
|
+
uniprot_pathway_df.values,
|
|
99
|
+
total=len(uniprot_pathway_df),
|
|
100
|
+
unit_scale=True,
|
|
101
|
+
unit="pathway-protein",
|
|
102
|
+
):
|
|
103
|
+
if reactome_id not in terms:
|
|
104
|
+
tqdm.write(f"{reactome_id} appears in uniprot participants file but not pathways file")
|
|
105
|
+
continue
|
|
106
|
+
|
|
107
|
+
if "-" in uniprot_id:
|
|
108
|
+
reference = Reference(prefix="uniprot.isoform", identifier=uniprot_id)
|
|
109
|
+
else:
|
|
110
|
+
reference = Reference(prefix="uniprot", identifier=uniprot_id)
|
|
111
|
+
terms[reactome_id].append_relationship(has_participant, reference)
|
|
99
112
|
|
|
100
113
|
chebi_pathway_url = f"https://reactome.org/download/{version}/ChEBI2Reactome_All_Levels.txt"
|
|
101
114
|
chebi_pathway_df = ensure_df(
|
|
@@ -106,7 +119,15 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
106
119
|
version=version,
|
|
107
120
|
force=force,
|
|
108
121
|
)
|
|
109
|
-
for chebi_id, reactome_id in tqdm(
|
|
122
|
+
for chebi_id, reactome_id in tqdm(
|
|
123
|
+
chebi_pathway_df.values,
|
|
124
|
+
total=len(chebi_pathway_df),
|
|
125
|
+
unit_scale=True,
|
|
126
|
+
unit="pathway-chemical",
|
|
127
|
+
):
|
|
128
|
+
if reactome_id not in terms:
|
|
129
|
+
tqdm.write(f"{reactome_id} appears in chebi participants file but not pathways file")
|
|
130
|
+
continue
|
|
110
131
|
terms[reactome_id].append_relationship(
|
|
111
132
|
has_participant, Reference(prefix="chebi", identifier=chebi_id)
|
|
112
133
|
)
|
|
@@ -133,4 +154,4 @@ def get_protein_to_pathways() -> Mapping[str, set[str]]:
|
|
|
133
154
|
|
|
134
155
|
|
|
135
156
|
if __name__ == "__main__":
|
|
136
|
-
|
|
157
|
+
ReactomeGetter.cli()
|