pyobo 0.10.7__py3-none-any.whl → 0.10.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/api/hierarchy.py +21 -11
- pyobo/api/properties.py +24 -9
- pyobo/api/xrefs.py +3 -1
- pyobo/getters.py +2 -2
- pyobo/gilda_utils.py +49 -19
- pyobo/sources/__init__.py +4 -0
- pyobo/sources/cgnc.py +1 -1
- pyobo/sources/chebi.py +4 -2
- pyobo/sources/civic_gene.py +55 -0
- pyobo/sources/cvx.py +18 -2
- pyobo/sources/famplex.py +5 -3
- pyobo/sources/mesh.py +29 -1
- pyobo/sources/ncbigene.py +5 -3
- pyobo/sources/omim_ps.py +39 -0
- pyobo/sources/rhea.py +141 -36
- pyobo/sources/umls/umls.py +17 -2
- pyobo/sources/uniprot/uniprot.py +123 -16
- pyobo/struct/__init__.py +1 -0
- pyobo/struct/struct.py +12 -6
- pyobo/struct/typedef.py +35 -5
- pyobo/utils/misc.py +22 -16
- pyobo/version.py +1 -1
- pyobo/xrefdb/sources/wikidata.py +5 -3
- {pyobo-0.10.7.dist-info → pyobo-0.10.9.dist-info}/METADATA +2 -2
- {pyobo-0.10.7.dist-info → pyobo-0.10.9.dist-info}/RECORD +29 -27
- {pyobo-0.10.7.dist-info → pyobo-0.10.9.dist-info}/WHEEL +1 -1
- {pyobo-0.10.7.dist-info → pyobo-0.10.9.dist-info}/LICENSE +0 -0
- {pyobo-0.10.7.dist-info → pyobo-0.10.9.dist-info}/entry_points.txt +0 -0
- {pyobo-0.10.7.dist-info → pyobo-0.10.9.dist-info}/top_level.txt +0 -0
pyobo/sources/omim_ps.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
"""Converter for OMIM Phenotypic Series."""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Iterable
|
|
7
|
+
|
|
8
|
+
from bioversions.utils import get_soup
|
|
9
|
+
|
|
10
|
+
from pyobo.struct import Obo, Term
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"OMIMPSGetter",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
PREFIX = "omim.ps"
|
|
19
|
+
URL = "https://omim.org/phenotypicSeriesTitles/all"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class OMIMPSGetter(Obo):
    """Expose the OMIM Phenotypic Series title listing as an ontology."""

    ontology = bioversions_key = PREFIX

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Yield one term per row of the phenotypic series table found at ``URL``.

        :param force: Part of the ``iter_terms`` signature; it is not referenced
            in this method body.
        :returns: Terms built from each row's first-cell link, where the link
            text becomes the name and the tail of its ``href`` the identifier.
        """
        # The identifier is whatever follows this many leading characters of the href.
        href_prefix_length = len("/phenotypicSeries/")

        soup = get_soup(URL, user_agent="Mozilla/5.0")
        content = soup.find(id="mimContent")
        table_body = content.find("table").find("tbody")
        for table_row in table_body.find_all("tr"):
            link = table_row.find("td").find("a")
            label = link.text.strip()
            series_id = link.attrs["href"][href_prefix_length:]
            yield Term.from_triple(PREFIX, series_id, label)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
if __name__ == "__main__":
|
|
39
|
+
OMIMPSGetter.cli()
|
pyobo/sources/rhea.py
CHANGED
|
@@ -3,31 +3,51 @@
|
|
|
3
3
|
"""Converter for Rhea."""
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Iterable
|
|
6
|
+
from typing import TYPE_CHECKING, Dict, Iterable, Optional
|
|
7
7
|
|
|
8
|
+
import bioversions
|
|
8
9
|
import pystow
|
|
9
10
|
|
|
10
11
|
from pyobo.struct import Obo, Reference, Term
|
|
11
12
|
from pyobo.struct.typedef import (
|
|
13
|
+
TypeDef,
|
|
14
|
+
enabled_by,
|
|
12
15
|
has_bidirectional_reaction,
|
|
16
|
+
has_input,
|
|
13
17
|
has_left_to_right_reaction,
|
|
18
|
+
has_output,
|
|
19
|
+
has_participant,
|
|
14
20
|
has_right_to_left_reaction,
|
|
21
|
+
reaction_enabled_by_molecular_function,
|
|
15
22
|
)
|
|
16
23
|
from pyobo.utils.path import ensure_df
|
|
17
24
|
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
import rdflib
|
|
27
|
+
|
|
18
28
|
__all__ = [
|
|
19
29
|
"RheaGetter",
|
|
20
30
|
]
|
|
21
31
|
|
|
22
32
|
logger = logging.getLogger(__name__)
|
|
23
33
|
PREFIX = "rhea"
|
|
34
|
+
RHEA_RDF_GZ_URL = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz"
|
|
24
35
|
|
|
25
36
|
|
|
26
37
|
class RheaGetter(Obo):
|
|
27
38
|
"""An ontology representation of Rhea's chemical reaction database."""
|
|
28
39
|
|
|
29
40
|
ontology = bioversions_key = PREFIX
|
|
30
|
-
typedefs = [
|
|
41
|
+
typedefs = [
|
|
42
|
+
has_left_to_right_reaction,
|
|
43
|
+
has_bidirectional_reaction,
|
|
44
|
+
has_right_to_left_reaction,
|
|
45
|
+
enabled_by,
|
|
46
|
+
has_input,
|
|
47
|
+
has_output,
|
|
48
|
+
has_participant,
|
|
49
|
+
reaction_enabled_by_molecular_function,
|
|
50
|
+
]
|
|
31
51
|
|
|
32
52
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
33
53
|
"""Iterate over terms in the ontology."""
|
|
@@ -39,25 +59,54 @@ def get_obo(force: bool = False) -> Obo:
|
|
|
39
59
|
return RheaGetter(force=force)
|
|
40
60
|
|
|
41
61
|
|
|
62
|
+
def ensure_rhea_rdf(version: Optional[str] = None, force: bool = False) -> "rdflib.Graph":
    """Load the Rhea RDF dump through :func:`pystow.ensure_rdf`.

    :param version: Rhea release used as the last path component. When
        ``None``, it is looked up with ``bioversions.get_version``.
    :param force: Forwarded unchanged to :func:`pystow.ensure_rdf`.
    :returns: The result of :func:`pystow.ensure_rdf`, parsed with ``format="xml"``.
    """
    # Format documentation: https://ftp.expasy.org/databases/rhea/rdf/rhea_rdf_documentation.pdf
    resolved_version = bioversions.get_version(PREFIX) if version is None else version
    path_parts = ("pyobo", "raw", PREFIX, resolved_version)
    return pystow.ensure_rdf(
        *path_parts,
        url=RHEA_RDF_GZ_URL,
        force=force,
        parse_kwargs={"format": "xml"},
    )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _get_lr_name(name: str) -> str:
|
|
79
|
+
return name.replace(" = ", " => ")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _get_rl_name(name: str) -> str:
|
|
83
|
+
left, right = name.split(" = ", 1)
|
|
84
|
+
return f"{right} => {left}"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _get_bi_name(name: str) -> str:
|
|
88
|
+
return name.replace(" = ", " <=> ")
|
|
89
|
+
|
|
90
|
+
|
|
42
91
|
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
43
92
|
"""Iterate over terms in Rhea."""
|
|
44
|
-
|
|
45
|
-
graph = pystow.ensure_rdf(
|
|
46
|
-
"pyobo", "raw", PREFIX, version, url=url, force=force, parse_kwargs=dict(format="xml")
|
|
47
|
-
)
|
|
93
|
+
graph = ensure_rhea_rdf(version=version, force=force)
|
|
48
94
|
result = graph.query(
|
|
49
|
-
"""
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
95
|
+
"""\
|
|
96
|
+
PREFIX rh:<http://rdf.rhea-db.org/>
|
|
97
|
+
SELECT ?reaction ?reactionId ?reactionLabel WHERE {
|
|
98
|
+
?reaction rdfs:subClassOf rh:Reaction ;
|
|
99
|
+
rh:id ?reactionId ;
|
|
100
|
+
rdfs:label ?reactionLabel .
|
|
101
|
+
}
|
|
56
102
|
"""
|
|
57
103
|
)
|
|
58
|
-
names = {str(identifier): name for _, identifier, name in result}
|
|
104
|
+
names = {str(identifier): str(name) for _, identifier, name in result}
|
|
59
105
|
|
|
60
|
-
terms = {}
|
|
106
|
+
terms: Dict[str, Term] = {}
|
|
107
|
+
master_to_left: Dict[str, str] = {}
|
|
108
|
+
master_to_right: Dict[str, str] = {}
|
|
109
|
+
master_to_bi: Dict[str, str] = {}
|
|
61
110
|
|
|
62
111
|
directions = ensure_df(
|
|
63
112
|
PREFIX,
|
|
@@ -66,12 +115,16 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
66
115
|
force=force,
|
|
67
116
|
)
|
|
68
117
|
for master, lr, rl, bi in directions.values:
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
118
|
+
master_to_left[master] = lr
|
|
119
|
+
master_to_right[master] = rl
|
|
120
|
+
master_to_bi[master] = bi
|
|
121
|
+
|
|
122
|
+
name = names[master]
|
|
123
|
+
|
|
124
|
+
terms[master] = Term(reference=Reference(prefix=PREFIX, identifier=master, name=name))
|
|
125
|
+
terms[lr] = Term(reference=Reference(prefix=PREFIX, identifier=lr, name=_get_lr_name(name)))
|
|
126
|
+
terms[rl] = Term(reference=Reference(prefix=PREFIX, identifier=rl, name=_get_rl_name(name)))
|
|
127
|
+
terms[bi] = Term(reference=Reference(prefix=PREFIX, identifier=bi, name=_get_bi_name(name)))
|
|
75
128
|
|
|
76
129
|
terms[master].append_relationship(has_left_to_right_reaction, terms[lr])
|
|
77
130
|
terms[master].append_relationship(has_right_to_left_reaction, terms[rl])
|
|
@@ -80,6 +133,38 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
80
133
|
terms[rl].append_parent(terms[master])
|
|
81
134
|
terms[bi].append_parent(terms[master])
|
|
82
135
|
|
|
136
|
+
# inspired by https://github.com/geneontology/go-ontology/blob/master/src/sparql/construct-rhea-reactions.sparql
|
|
137
|
+
sparql = """\
|
|
138
|
+
PREFIX rh:<http://rdf.rhea-db.org/>
|
|
139
|
+
SELECT ?reactionId ?side ?chebi WHERE {
|
|
140
|
+
?reaction rdfs:subClassOf rh:Reaction ;
|
|
141
|
+
rh:id ?reactionId .
|
|
142
|
+
|
|
143
|
+
?reaction rh:side ?side .
|
|
144
|
+
?side rh:contains ?participant .
|
|
145
|
+
?participant rh:compound ?compound .
|
|
146
|
+
?compound rh:chebi|rh:underlyingChebi|(rh:reactivePart/rh:chebi) ?chebi .
|
|
147
|
+
}
|
|
148
|
+
"""
|
|
149
|
+
for master_rhea_id, side_uri, chebi_uri in graph.query(sparql):
|
|
150
|
+
master_rhea_id = str(master_rhea_id)
|
|
151
|
+
chebi_reference = Reference(
|
|
152
|
+
prefix="chebi", identifier=chebi_uri[len("http://purl.obolibrary.org/obo/CHEBI_") :]
|
|
153
|
+
)
|
|
154
|
+
side = side_uri.split("_")[-1] # L or R
|
|
155
|
+
if side == "L":
|
|
156
|
+
left_rhea_id = master_to_left[master_rhea_id]
|
|
157
|
+
right_rhea_id = master_to_right[master_rhea_id]
|
|
158
|
+
elif side == "R":
|
|
159
|
+
left_rhea_id = master_to_right[master_rhea_id]
|
|
160
|
+
right_rhea_id = master_to_left[master_rhea_id]
|
|
161
|
+
else:
|
|
162
|
+
raise ValueError(f"Invalid side: {side_uri}")
|
|
163
|
+
terms[master_rhea_id].append_relationship(has_participant, chebi_reference)
|
|
164
|
+
terms[master_to_bi[master_rhea_id]].append_relationship(has_participant, chebi_reference)
|
|
165
|
+
terms[left_rhea_id].append_relationship(has_input, chebi_reference)
|
|
166
|
+
terms[right_rhea_id].append_relationship(has_output, chebi_reference)
|
|
167
|
+
|
|
83
168
|
hierarchy = ensure_df(
|
|
84
169
|
PREFIX,
|
|
85
170
|
url="ftp://ftp.expasy.org/databases/rhea/tsv/rhea-relationships.tsv",
|
|
@@ -91,12 +176,14 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
91
176
|
raise ValueError(f"RHEA unrecognized relation: {relation}")
|
|
92
177
|
terms[source].append_parent(terms[target])
|
|
93
178
|
|
|
94
|
-
for xref_prefix, url in [
|
|
95
|
-
("ecocyc", "rhea2ecocyc"),
|
|
96
|
-
("kegg.reaction", "rhea2kegg_reaction"),
|
|
97
|
-
("reactome", "rhea2reactome"),
|
|
98
|
-
("macie", "rhea2macie"),
|
|
99
|
-
("metacyc", "rhea2metacyc"),
|
|
179
|
+
for xref_prefix, url, relation in [
|
|
180
|
+
("ecocyc", "rhea2ecocyc", None),
|
|
181
|
+
("kegg.reaction", "rhea2kegg_reaction", None),
|
|
182
|
+
("reactome", "rhea2reactome", None),
|
|
183
|
+
("macie", "rhea2macie", None),
|
|
184
|
+
("metacyc", "rhea2metacyc", None),
|
|
185
|
+
("go", "rhea2go", reaction_enabled_by_molecular_function),
|
|
186
|
+
("uniprot", "rhea2uniprot", enabled_by),
|
|
100
187
|
]:
|
|
101
188
|
xref_df = ensure_df(
|
|
102
189
|
PREFIX,
|
|
@@ -104,26 +191,44 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
104
191
|
version=version,
|
|
105
192
|
force=force,
|
|
106
193
|
)
|
|
107
|
-
for
|
|
108
|
-
if
|
|
194
|
+
for directional_rhea_id, _direction, _master_rhea_id, xref_id in xref_df.values:
|
|
195
|
+
if directional_rhea_id not in terms:
|
|
109
196
|
logger.debug(
|
|
110
197
|
"[%s] could not find %s:%s for xref %s:%s",
|
|
111
198
|
PREFIX,
|
|
112
199
|
PREFIX,
|
|
113
|
-
|
|
200
|
+
directional_rhea_id,
|
|
114
201
|
xref_prefix,
|
|
115
202
|
xref_id,
|
|
116
203
|
)
|
|
117
204
|
continue
|
|
118
|
-
|
|
205
|
+
target_reference = Reference(prefix=xref_prefix, identifier=xref_id)
|
|
206
|
+
if isinstance(relation, TypeDef):
|
|
207
|
+
terms[directional_rhea_id].append_relationship(relation, target_reference)
|
|
208
|
+
else:
|
|
209
|
+
terms[directional_rhea_id].append_xref(target_reference)
|
|
119
210
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
211
|
+
ec_df = ensure_df(
|
|
212
|
+
PREFIX,
|
|
213
|
+
url="ftp://ftp.expasy.org/databases/rhea/tsv/rhea-ec-iubmb.tsv",
|
|
214
|
+
version=version,
|
|
215
|
+
force=force,
|
|
216
|
+
)
|
|
217
|
+
for (
|
|
218
|
+
directional_rhea_id,
|
|
219
|
+
_status,
|
|
220
|
+
_direction,
|
|
221
|
+
_master_id,
|
|
222
|
+
ec,
|
|
223
|
+
_enzyme_status,
|
|
224
|
+
_iubmb,
|
|
225
|
+
) in ec_df.values:
|
|
226
|
+
terms[directional_rhea_id].append_relationship(
|
|
227
|
+
enabled_by, Reference(prefix="eccode", identifier=ec)
|
|
228
|
+
)
|
|
124
229
|
|
|
125
230
|
yield from terms.values()
|
|
126
231
|
|
|
127
232
|
|
|
128
233
|
if __name__ == "__main__":
|
|
129
|
-
RheaGetter.
|
|
234
|
+
RheaGetter().write_default(write_obo=True, force=True)
|
pyobo/sources/umls/umls.py
CHANGED
|
@@ -7,12 +7,13 @@ Run with ``python -m pyobo.sources.umls``
|
|
|
7
7
|
|
|
8
8
|
import itertools as itt
|
|
9
9
|
import operator
|
|
10
|
-
from
|
|
10
|
+
from collections import defaultdict
|
|
11
|
+
from typing import Iterable, Mapping, Set
|
|
11
12
|
|
|
12
13
|
import bioregistry
|
|
13
14
|
import pandas as pd
|
|
14
15
|
from tqdm.auto import tqdm
|
|
15
|
-
from umls_downloader import open_umls
|
|
16
|
+
from umls_downloader import open_umls, open_umls_semantic_types
|
|
16
17
|
|
|
17
18
|
from pyobo import Obo, Reference, Synonym, SynonymTypeDef, Term
|
|
18
19
|
|
|
@@ -66,8 +67,20 @@ def get_obo() -> Obo:
|
|
|
66
67
|
return UMLSGetter()
|
|
67
68
|
|
|
68
69
|
|
|
70
|
+
def get_semantic_types() -> Mapping[str, Set[str]]:
    """Group the semantic type column by concept identifier.

    Each line of the file opened by ``open_umls_semantic_types`` is decoded as
    UTF-8 and split on ``|``. The first field is used as the key and the second
    field is collected into that key's set; everything after is ignored.

    :returns: A plain ``dict`` from the first field to the set of second fields.
    :raises ValueError: If a line has fewer than three ``|``-separated fields.
    """
    types_by_cui = defaultdict(set)
    with open_umls_semantic_types() as file:
        for raw_line in tqdm(file, unit_scale=True):
            text = raw_line.decode("utf8")
            cui, sty, _rest = text.split("|", 2)
            types_by_cui[cui].add(sty)
    # Hand back a regular dict so missing keys are not silently created by callers.
    return dict(types_by_cui)
|
|
78
|
+
|
|
79
|
+
|
|
69
80
|
def iter_terms(version: str) -> Iterable[Term]:
|
|
70
81
|
"""Iterate over UMLS terms."""
|
|
82
|
+
semantic_types = get_semantic_types()
|
|
83
|
+
|
|
71
84
|
with open_umls(version=version) as file:
|
|
72
85
|
it = tqdm(file, unit_scale=True, desc="[umls] parsing")
|
|
73
86
|
lines = (line.decode("utf-8").strip().split("|") for line in it)
|
|
@@ -118,6 +131,8 @@ def iter_terms(version: str) -> Iterable[Term]:
|
|
|
118
131
|
synonyms=synonyms,
|
|
119
132
|
xrefs=xrefs,
|
|
120
133
|
)
|
|
134
|
+
for sty_id in semantic_types.get(cui, set()):
|
|
135
|
+
term.append_parent(Reference(prefix="sty", identifier=sty_id))
|
|
121
136
|
yield term
|
|
122
137
|
|
|
123
138
|
|
pyobo/sources/uniprot/uniprot.py
CHANGED
|
@@ -2,8 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
"""Converter for UniProt."""
|
|
4
4
|
|
|
5
|
+
from operator import attrgetter
|
|
5
6
|
from pathlib import Path
|
|
6
|
-
from typing import Iterable, Optional
|
|
7
|
+
from typing import Iterable, List, Optional, cast
|
|
7
8
|
|
|
8
9
|
import bioversions
|
|
9
10
|
from tqdm.auto import tqdm
|
|
@@ -11,22 +12,52 @@ from tqdm.auto import tqdm
|
|
|
11
12
|
from pyobo import Obo, Reference
|
|
12
13
|
from pyobo.constants import RAW_MODULE
|
|
13
14
|
from pyobo.identifier_utils import standardize_ec
|
|
14
|
-
from pyobo.struct import Term, enables, from_species
|
|
15
|
+
from pyobo.struct import Term, derives_from, enables, from_species, participates_in
|
|
16
|
+
from pyobo.struct.typedef import gene_product_of, located_in, molecularly_interacts_with
|
|
15
17
|
from pyobo.utils.io import open_reader
|
|
16
18
|
|
|
17
19
|
PREFIX = "uniprot"
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
20
|
+
BASE_URL = "https://rest.uniprot.org/uniprotkb/stream"
|
|
21
|
+
SEARCH_URL = "https://rest.uniprot.org/uniprotkb/search"
|
|
22
|
+
QUERY = "(*) AND (reviewed:true)"
|
|
23
|
+
FIELDS = [
|
|
24
|
+
"accession",
|
|
25
|
+
"id",
|
|
26
|
+
"organism_id",
|
|
27
|
+
"protein_name",
|
|
28
|
+
"ec",
|
|
29
|
+
"lit_pubmed_id",
|
|
30
|
+
"xref_pdb",
|
|
31
|
+
"xref_proteomes",
|
|
32
|
+
"xref_geneid",
|
|
33
|
+
"rhea",
|
|
34
|
+
"go_c",
|
|
35
|
+
"go_f",
|
|
36
|
+
"go_p",
|
|
37
|
+
"ft_binding",
|
|
38
|
+
"cc_function",
|
|
39
|
+
]
|
|
40
|
+
PARAMS = {
|
|
41
|
+
"compressed": "true",
|
|
42
|
+
"format": "tsv",
|
|
43
|
+
# "size": 10, # only used with search
|
|
44
|
+
"query": QUERY,
|
|
45
|
+
"fields": FIELDS,
|
|
46
|
+
}
|
|
23
47
|
|
|
24
48
|
|
|
25
49
|
class UniProtGetter(Obo):
|
|
26
50
|
"""An ontology representation of the UniProt database."""
|
|
27
51
|
|
|
28
52
|
bioversions_key = ontology = PREFIX
|
|
29
|
-
typedefs = [
|
|
53
|
+
typedefs = [
|
|
54
|
+
from_species,
|
|
55
|
+
enables,
|
|
56
|
+
participates_in,
|
|
57
|
+
gene_product_of,
|
|
58
|
+
molecularly_interacts_with,
|
|
59
|
+
derives_from,
|
|
60
|
+
]
|
|
30
61
|
|
|
31
62
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
32
63
|
"""Iterate over terms in the ontology."""
|
|
@@ -42,13 +73,73 @@ def iter_terms(version: Optional[str] = None) -> Iterable[Term]:
|
|
|
42
73
|
"""Iterate over UniProt Terms."""
|
|
43
74
|
with open_reader(ensure(version=version)) as reader:
|
|
44
75
|
_ = next(reader) # header
|
|
45
|
-
for
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
#
|
|
50
|
-
|
|
76
|
+
for (
|
|
77
|
+
uniprot_id,
|
|
78
|
+
accession,
|
|
79
|
+
taxonomy_id,
|
|
80
|
+
_name, # this field should have the name, but it's a mismatch of random name annotations
|
|
81
|
+
ecs,
|
|
82
|
+
pubmeds,
|
|
83
|
+
pdbs,
|
|
84
|
+
proteome,
|
|
85
|
+
gene_id,
|
|
86
|
+
rhea_curies,
|
|
87
|
+
go_components,
|
|
88
|
+
go_functions,
|
|
89
|
+
go_processes,
|
|
90
|
+
bindings,
|
|
91
|
+
description,
|
|
92
|
+
) in tqdm(reader, desc="Mapping UniProt", unit_scale=True):
|
|
93
|
+
if description:
|
|
94
|
+
description = description.removeprefix("FUNCTION: ")
|
|
95
|
+
term = Term(
|
|
96
|
+
reference=Reference(prefix=PREFIX, identifier=uniprot_id, name=accession),
|
|
97
|
+
definition=description or None,
|
|
98
|
+
)
|
|
51
99
|
term.set_species(taxonomy_id)
|
|
100
|
+
if gene_id:
|
|
101
|
+
term.append_relationship(
|
|
102
|
+
gene_product_of, Reference(prefix="ncbigene", identifier=gene_id)
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# TODO add type=Reference(prefix="xsd", identifier="boolean")
|
|
106
|
+
term.append_property("reviewed", "true")
|
|
107
|
+
|
|
108
|
+
for go_process_ref in _parse_go(go_processes):
|
|
109
|
+
term.append_relationship(participates_in, go_process_ref)
|
|
110
|
+
for go_function_ref in _parse_go(go_functions):
|
|
111
|
+
term.append_relationship(enables, go_function_ref)
|
|
112
|
+
for go_component_ref in _parse_go(go_components):
|
|
113
|
+
term.append_relationship(located_in, go_component_ref)
|
|
114
|
+
|
|
115
|
+
if proteome:
|
|
116
|
+
uniprot_proteome_id = proteome.split(":")[0]
|
|
117
|
+
term.append_relationship(
|
|
118
|
+
derives_from,
|
|
119
|
+
Reference(prefix="uniprot.proteome", identifier=uniprot_proteome_id),
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
if rhea_curies:
|
|
123
|
+
for rhea_curie in rhea_curies.split(" "):
|
|
124
|
+
term.append_relationship(
|
|
125
|
+
# FIXME this needs a different relation than enables
|
|
126
|
+
# see https://github.com/biopragmatics/pyobo/pull/168#issuecomment-1918680152
|
|
127
|
+
enables,
|
|
128
|
+
cast(Reference, Reference.from_curie(rhea_curie, strict=True)),
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
if bindings:
|
|
132
|
+
binding_references = set()
|
|
133
|
+
for part in bindings.split(";"):
|
|
134
|
+
part = part.strip()
|
|
135
|
+
if part.startswith("/ligand_id"):
|
|
136
|
+
curie = part.removeprefix('/ligand_id="').rstrip('"')
|
|
137
|
+
binding_references.add(
|
|
138
|
+
cast(Reference, Reference.from_curie(curie, strict=True))
|
|
139
|
+
)
|
|
140
|
+
for binding_reference in sorted(binding_references, key=attrgetter("curie")):
|
|
141
|
+
term.append_relationship(molecularly_interacts_with, binding_reference)
|
|
142
|
+
|
|
52
143
|
if ecs:
|
|
53
144
|
for ec in ecs.split(";"):
|
|
54
145
|
term.append_relationship(
|
|
@@ -63,11 +154,27 @@ def iter_terms(version: Optional[str] = None) -> Iterable[Term]:
|
|
|
63
154
|
yield term
|
|
64
155
|
|
|
65
156
|
|
|
66
|
-
def
|
|
157
|
+
def _parse_go(go_terms) -> List[Reference]:
    """Extract Gene Ontology references from a ``;``-delimited annotation string.

    Each entry is expected to end in a bracketed identifier, i.e. to look like
    ``<label> [GO:<identifier>]``.

    :param go_terms: The delimited string; a falsy value (``None`` or ``""``)
        gives an empty list.
    :returns: One ``go`` reference per non-blank entry, in input order.
    :raises IndexError: If a non-blank entry does not contain ``[GO:``.
    """
    rv: List[Reference] = []
    if not go_terms:
        return rv
    for go_term in go_terms.split(";"):
        # Entries are separated by "; ", so drop the padding before parsing;
        # this also keeps trailing whitespace out of the identifier.
        go_term = go_term.strip()
        if not go_term:
            # A trailing or doubled delimiter leaves a blank entry with nothing to parse.
            continue
        # Split on the LAST marker only, so a label that itself contains "[GO:"
        # cannot shift which fragment is read as the identifier.
        go_id = go_term.rsplit("[GO:", 1)[1].rstrip("]")
        rv.append(Reference(prefix="go", identifier=go_id))
    return rv
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def ensure(version: Optional[str] = None, force: bool = False) -> Path:
    """Ensure the reviewed UniProt table is available and return its path.

    :param version: UniProt release used as a path component. When ``None``,
        it is looked up with ``bioversions.get_version("uniprot")``.
    :param force: Forwarded unchanged to ``RAW_MODULE.ensure``.
    :returns: The result of ``RAW_MODULE.ensure`` for ``reviewed.tsv.gz``.
    """
    resolved_version = bioversions.get_version("uniprot") if version is None else version
    # The query, output format and column list travel as request parameters.
    download_kwargs = {"backend": "requests", "params": PARAMS}
    return RAW_MODULE.ensure(
        PREFIX,
        resolved_version,
        force=force,
        name="reviewed.tsv.gz",
        url=BASE_URL,  # switch to SEARCH_URL for debugging
        download_kwargs=download_kwargs,
    )
|
|
71
178
|
|
|
72
179
|
|
|
73
180
|
if __name__ == "__main__":
|
pyobo/struct/__init__.py
CHANGED
pyobo/struct/struct.py
CHANGED
|
@@ -53,6 +53,7 @@ from .typedef import (
|
|
|
53
53
|
orthologous,
|
|
54
54
|
part_of,
|
|
55
55
|
see_also,
|
|
56
|
+
term_replaced_by,
|
|
56
57
|
)
|
|
57
58
|
from .utils import comma_separate, obo_escape_slim
|
|
58
59
|
from ..constants import (
|
|
@@ -299,6 +300,11 @@ class Term(Referenced):
|
|
|
299
300
|
self.append_property(comment.curie, value)
|
|
300
301
|
return self
|
|
301
302
|
|
|
303
|
+
def append_replaced_by(self, reference: ReferenceHint) -> "Term":
    """Record ``reference`` as the replacement for this term.

    The link is stored via ``append_relationship`` with the ``term_replaced_by``
    typedef.

    :param reference: The replacing entity.
    :returns: This term, so calls can be chained.
    """
    replacement = reference
    self.append_relationship(term_replaced_by, replacement)
    return self
|
|
307
|
+
|
|
302
308
|
def append_parent(self, reference: ReferenceHint) -> "Term":
|
|
303
309
|
"""Add a parent to this entity."""
|
|
304
310
|
reference = _ensure_ref(reference)
|
|
@@ -395,14 +401,14 @@ class Term(Referenced):
|
|
|
395
401
|
|
|
396
402
|
def iterate_relations(self) -> Iterable[Tuple[TypeDef, Reference]]:
    """Yield ``(typedef, target)`` pairs in a deterministic order.

    Typedefs are ordered with ``_sort_relations``; within one typedef the
    targets are ordered by their ``preferred_curie``.
    """

    def _target_key(reference):
        return reference.preferred_curie

    ordered_relations = sorted(self.relationships.items(), key=_sort_relations)
    for typedef, targets in ordered_relations:
        ordered_targets = sorted(targets, key=_target_key)
        for target in ordered_targets:
            yield typedef, target
|
|
401
407
|
|
|
402
408
|
def iterate_properties(self) -> Iterable[Tuple[str, str]]:
    """Yield ``(property, value)`` pairs ordered by property, then by value."""
    # Keys of a mapping are unique, so ordering the keys alone gives the same
    # sequence as ordering the (key, values) items.
    for prop in sorted(self.properties):
        ordered_values = sorted(self.properties[prop])
        for value in ordered_values:
            yield prop, value
|
|
407
413
|
|
|
408
414
|
def iterate_obo_lines(self, *, ontology, typedefs) -> Iterable[str]:
|
|
@@ -466,7 +472,7 @@ _TYPEDEF_WARNINGS: Set[Tuple[str, str]] = set()
|
|
|
466
472
|
|
|
467
473
|
def _sort_relations(r):
|
|
468
474
|
typedef, _references = r
|
|
469
|
-
return typedef.
|
|
475
|
+
return typedef.preferred_curie
|
|
470
476
|
|
|
471
477
|
|
|
472
478
|
def _sort_properties(r):
|
|
@@ -1017,7 +1023,7 @@ class Obo:
|
|
|
1017
1023
|
def iterate_id_name(self, *, use_tqdm: bool = False) -> Iterable[Tuple[str, str]]:
|
|
1018
1024
|
"""Iterate identifier name pairs."""
|
|
1019
1025
|
for term in self._iter_terms(use_tqdm=use_tqdm, desc=f"[{self.ontology}] getting names"):
|
|
1020
|
-
if term.name:
|
|
1026
|
+
if term.prefix == self.ontology and term.name:
|
|
1021
1027
|
yield term.identifier, term.name
|
|
1022
1028
|
|
|
1023
1029
|
def get_id_name_mapping(self, *, use_tqdm: bool = False) -> Mapping[str, str]:
|
pyobo/struct/typedef.py
CHANGED
|
@@ -48,6 +48,10 @@ __all__ = [
|
|
|
48
48
|
]
|
|
49
49
|
|
|
50
50
|
|
|
51
|
+
def _bool_to_obo(v: bool) -> str:
|
|
52
|
+
return "true" if v else "false"
|
|
53
|
+
|
|
54
|
+
|
|
51
55
|
@dataclass
|
|
52
56
|
class TypeDef(Referenced):
|
|
53
57
|
"""A type definition in OBO.
|
|
@@ -88,7 +92,7 @@ class TypeDef(Referenced):
|
|
|
88
92
|
yield f'def: "{self.definition}"'
|
|
89
93
|
|
|
90
94
|
if self.is_metadata_tag is not None:
|
|
91
|
-
yield f
|
|
95
|
+
yield f"is_metadata_tag: {_bool_to_obo(self.is_metadata_tag)}"
|
|
92
96
|
|
|
93
97
|
if self.namespace:
|
|
94
98
|
yield f"namespace: {self.namespace}"
|
|
@@ -113,6 +117,10 @@ class TypeDef(Referenced):
|
|
|
113
117
|
yield f"holds_over_chain: {_chain} ! {_names}"
|
|
114
118
|
if self.inverse:
|
|
115
119
|
yield f"inverse_of: {self.inverse}"
|
|
120
|
+
if self.domain:
|
|
121
|
+
yield f"domain: {self.domain}"
|
|
122
|
+
if self.range:
|
|
123
|
+
yield f"range: {self.range}"
|
|
116
124
|
|
|
117
125
|
@classmethod
|
|
118
126
|
def from_triple(cls, prefix: str, identifier: str, name: Optional[str] = None) -> "TypeDef":
|
|
@@ -161,13 +169,19 @@ species_specific = TypeDef(
|
|
|
161
169
|
"species with RO:0002162 (in taxon)",
|
|
162
170
|
)
|
|
163
171
|
has_left_to_right_reaction = TypeDef(
|
|
164
|
-
Reference(prefix="debio", identifier="0000007", name="has left-to-right reaction")
|
|
172
|
+
Reference(prefix="debio", identifier="0000007", name="has left-to-right reaction"),
|
|
173
|
+
is_metadata_tag=True,
|
|
165
174
|
)
|
|
166
175
|
has_right_to_left_reaction = TypeDef(
|
|
167
|
-
Reference(prefix="debio", identifier="0000008", name="has right-to-left reaction")
|
|
176
|
+
Reference(prefix="debio", identifier="0000008", name="has right-to-left reaction"),
|
|
177
|
+
is_metadata_tag=True,
|
|
168
178
|
)
|
|
169
179
|
has_bidirectional_reaction = TypeDef(
|
|
170
|
-
Reference(prefix="debio", identifier="0000009", name="has bi-directional reaction")
|
|
180
|
+
Reference(prefix="debio", identifier="0000009", name="has bi-directional reaction"),
|
|
181
|
+
is_metadata_tag=True,
|
|
182
|
+
)
|
|
183
|
+
reaction_enabled_by_molecular_function = TypeDef(
|
|
184
|
+
Reference(prefix="debio", identifier="0000047", name="reaction enabled by molecular function")
|
|
171
185
|
)
|
|
172
186
|
|
|
173
187
|
|
|
@@ -191,6 +205,15 @@ has_participant = TypeDef(
|
|
|
191
205
|
comment="Inverse of has participant",
|
|
192
206
|
inverse=Reference(prefix=RO_PREFIX, identifier="0000056", name="participates in"),
|
|
193
207
|
)
|
|
208
|
+
derives_from = TypeDef(
|
|
209
|
+
reference=Reference(prefix=RO_PREFIX, identifier="0001000", name="derives from"),
|
|
210
|
+
)
|
|
211
|
+
molecularly_interacts_with = TypeDef(
|
|
212
|
+
reference=Reference(prefix=RO_PREFIX, identifier="0002436", name="molecularly interacts with"),
|
|
213
|
+
)
|
|
214
|
+
located_in = TypeDef(
|
|
215
|
+
reference=Reference(prefix=RO_PREFIX, identifier="0001025", name="located in"),
|
|
216
|
+
)
|
|
194
217
|
exact_match = TypeDef(
|
|
195
218
|
reference=Reference(prefix="skos", identifier="exactMatch", name="exact match"),
|
|
196
219
|
)
|
|
@@ -291,7 +314,14 @@ editor_note = TypeDef.from_triple(prefix=IAO_PREFIX, identifier="0000116", name=
|
|
|
291
314
|
is_immediately_transformed_from = TypeDef.from_triple(
|
|
292
315
|
prefix=SIO_PREFIX, identifier="000658", name="is immediately transformed from"
|
|
293
316
|
)
|
|
294
|
-
|
|
317
|
+
|
|
318
|
+
_enables_reference = Reference(prefix=RO_PREFIX, identifier="0002327", name="enables")
|
|
319
|
+
_enabled_by_reference = Reference(prefix=RO_PREFIX, identifier="0002333", name="enabled by")
|
|
320
|
+
enables = TypeDef(reference=_enables_reference, inverse=_enabled_by_reference)
|
|
321
|
+
enabled_by = TypeDef(reference=_enabled_by_reference, inverse=_enables_reference)
|
|
322
|
+
|
|
323
|
+
has_input = TypeDef.from_triple(prefix=RO_PREFIX, identifier="0002233", name="has input")
|
|
324
|
+
has_output = TypeDef.from_triple(prefix=RO_PREFIX, identifier="0002234", name="has output")
|
|
295
325
|
|
|
296
326
|
"""ChEBI"""
|
|
297
327
|
|