pyobo 0.10.12__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/__init__.py +0 -2
- pyobo/__main__.py +0 -2
- pyobo/api/__init__.py +0 -2
- pyobo/api/alts.py +6 -7
- pyobo/api/hierarchy.py +14 -15
- pyobo/api/metadata.py +3 -4
- pyobo/api/names.py +31 -32
- pyobo/api/properties.py +6 -7
- pyobo/api/relations.py +12 -11
- pyobo/api/species.py +5 -6
- pyobo/api/typedefs.py +1 -3
- pyobo/api/utils.py +61 -5
- pyobo/api/xrefs.py +4 -5
- pyobo/aws.py +3 -5
- pyobo/cli/__init__.py +0 -2
- pyobo/cli/aws.py +0 -2
- pyobo/cli/cli.py +0 -4
- pyobo/cli/database.py +1 -3
- pyobo/cli/lookup.py +0 -2
- pyobo/cli/utils.py +0 -2
- pyobo/constants.py +1 -33
- pyobo/getters.py +19 -26
- pyobo/gilda_utils.py +19 -17
- pyobo/identifier_utils.py +10 -10
- pyobo/mocks.py +5 -6
- pyobo/normalizer.py +24 -24
- pyobo/obographs.py +8 -5
- pyobo/plugins.py +3 -4
- pyobo/py.typed +0 -0
- pyobo/reader.py +19 -21
- pyobo/registries/__init__.py +0 -2
- pyobo/registries/metaregistry.py +6 -8
- pyobo/resource_utils.py +1 -3
- pyobo/resources/__init__.py +0 -2
- pyobo/resources/ncbitaxon.py +2 -3
- pyobo/resources/ro.py +2 -4
- pyobo/resources/so.py +55 -0
- pyobo/resources/so.tsv +2604 -0
- pyobo/sources/README.md +15 -0
- pyobo/sources/__init__.py +0 -2
- pyobo/sources/agrovoc.py +3 -3
- pyobo/sources/antibodyregistry.py +2 -3
- pyobo/sources/biogrid.py +4 -4
- pyobo/sources/ccle.py +3 -4
- pyobo/sources/cgnc.py +1 -3
- pyobo/sources/chebi.py +2 -4
- pyobo/sources/chembl.py +1 -3
- pyobo/sources/civic_gene.py +2 -3
- pyobo/sources/complexportal.py +57 -20
- pyobo/sources/conso.py +2 -4
- pyobo/sources/cpt.py +1 -3
- pyobo/sources/credit.py +1 -1
- pyobo/sources/cvx.py +1 -3
- pyobo/sources/depmap.py +3 -4
- pyobo/sources/dictybase_gene.py +15 -12
- pyobo/sources/drugbank.py +6 -7
- pyobo/sources/drugbank_salt.py +3 -4
- pyobo/sources/drugcentral.py +9 -8
- pyobo/sources/expasy.py +33 -16
- pyobo/sources/famplex.py +3 -5
- pyobo/sources/flybase.py +5 -6
- pyobo/sources/geonames.py +1 -1
- pyobo/sources/gmt_utils.py +5 -6
- pyobo/sources/go.py +4 -6
- pyobo/sources/gwascentral_phenotype.py +1 -3
- pyobo/sources/gwascentral_study.py +2 -3
- pyobo/sources/hgnc.py +30 -26
- pyobo/sources/hgncgenefamily.py +9 -11
- pyobo/sources/icd10.py +3 -4
- pyobo/sources/icd11.py +3 -4
- pyobo/sources/icd_utils.py +6 -7
- pyobo/sources/interpro.py +3 -5
- pyobo/sources/itis.py +1 -3
- pyobo/sources/kegg/__init__.py +0 -2
- pyobo/sources/kegg/api.py +3 -4
- pyobo/sources/kegg/genes.py +3 -4
- pyobo/sources/kegg/genome.py +19 -9
- pyobo/sources/kegg/pathway.py +5 -6
- pyobo/sources/mesh.py +19 -21
- pyobo/sources/mgi.py +1 -3
- pyobo/sources/mirbase.py +13 -9
- pyobo/sources/mirbase_constants.py +0 -2
- pyobo/sources/mirbase_family.py +1 -3
- pyobo/sources/mirbase_mature.py +1 -3
- pyobo/sources/msigdb.py +4 -5
- pyobo/sources/ncbigene.py +3 -5
- pyobo/sources/npass.py +2 -4
- pyobo/sources/omim_ps.py +1 -3
- pyobo/sources/pathbank.py +35 -28
- pyobo/sources/pfam.py +1 -3
- pyobo/sources/pfam_clan.py +1 -3
- pyobo/sources/pid.py +3 -5
- pyobo/sources/pombase.py +7 -6
- pyobo/sources/pubchem.py +2 -3
- pyobo/sources/reactome.py +30 -11
- pyobo/sources/rgd.py +3 -4
- pyobo/sources/rhea.py +7 -8
- pyobo/sources/ror.py +3 -2
- pyobo/sources/selventa/__init__.py +0 -2
- pyobo/sources/selventa/schem.py +1 -3
- pyobo/sources/selventa/scomp.py +1 -3
- pyobo/sources/selventa/sdis.py +1 -3
- pyobo/sources/selventa/sfam.py +1 -3
- pyobo/sources/sgd.py +1 -3
- pyobo/sources/slm.py +29 -17
- pyobo/sources/umls/__init__.py +0 -2
- pyobo/sources/umls/__main__.py +0 -2
- pyobo/sources/umls/get_synonym_types.py +1 -1
- pyobo/sources/umls/umls.py +2 -4
- pyobo/sources/uniprot/__init__.py +0 -2
- pyobo/sources/uniprot/uniprot.py +11 -10
- pyobo/sources/uniprot/uniprot_ptm.py +6 -5
- pyobo/sources/utils.py +3 -5
- pyobo/sources/wikipathways.py +1 -3
- pyobo/sources/zfin.py +20 -9
- pyobo/ssg/__init__.py +3 -2
- pyobo/struct/__init__.py +0 -2
- pyobo/struct/reference.py +22 -23
- pyobo/struct/struct.py +132 -116
- pyobo/struct/typedef.py +14 -10
- pyobo/struct/utils.py +0 -2
- pyobo/utils/__init__.py +0 -2
- pyobo/utils/cache.py +14 -6
- pyobo/utils/io.py +9 -10
- pyobo/utils/iter.py +5 -6
- pyobo/utils/misc.py +1 -3
- pyobo/utils/ndex_utils.py +6 -7
- pyobo/utils/path.py +4 -5
- pyobo/version.py +3 -5
- pyobo/xrefdb/__init__.py +0 -2
- pyobo/xrefdb/canonicalizer.py +27 -18
- pyobo/xrefdb/priority.py +0 -2
- pyobo/xrefdb/sources/__init__.py +3 -4
- pyobo/xrefdb/sources/biomappings.py +0 -2
- pyobo/xrefdb/sources/cbms2019.py +0 -2
- pyobo/xrefdb/sources/chembl.py +0 -2
- pyobo/xrefdb/sources/compath.py +1 -3
- pyobo/xrefdb/sources/famplex.py +3 -5
- pyobo/xrefdb/sources/gilda.py +0 -2
- pyobo/xrefdb/sources/intact.py +5 -5
- pyobo/xrefdb/sources/ncit.py +1 -3
- pyobo/xrefdb/sources/pubchem.py +2 -5
- pyobo/xrefdb/sources/wikidata.py +2 -4
- pyobo/xrefdb/xrefs_pipeline.py +15 -16
- {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/LICENSE +1 -1
- pyobo-0.11.1.dist-info/METADATA +711 -0
- pyobo-0.11.1.dist-info/RECORD +173 -0
- {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/WHEEL +1 -1
- pyobo-0.11.1.dist-info/entry_points.txt +2 -0
- pyobo-0.10.12.dist-info/METADATA +0 -499
- pyobo-0.10.12.dist-info/RECORD +0 -169
- pyobo-0.10.12.dist-info/entry_points.txt +0 -15
- {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/top_level.txt +0 -0
pyobo/sources/README.md
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Sources
|
|
2
|
+
|
|
3
|
+
1. Create a new module in `pyobo.sources` named with the prefix for the resource you're ontologizing
|
|
4
|
+
2. Make sure your resource has a corresponding prefix in [the Bioregistry](https://github.com/biopragmatics/bioregistry)
|
|
5
|
+
3. Subclass the `pyobo.Obo` class to represent your resource
|
|
6
|
+
4. Add your resource to the list in `pyobo.sources.__init__`
|
|
7
|
+
|
|
8
|
+
## What is in scope?
|
|
9
|
+
|
|
10
|
+
1. Biomedical, semantic web, bibliographic, life sciences, and related natural sciences resources are welcome
|
|
11
|
+
2. The source you want to ontologize should be an identifier resource, i.e., it mints its own identifiers. If you want
|
|
12
|
+
to ontologize some database that reuses some other identifier resource's identifiers, then this isn't the right
|
|
13
|
+
place.
|
|
14
|
+
3. Resources that are not possible to download automatically are not in scope for PyOBO. Reproducibility and reusability
|
|
15
|
+
are core values of this software.
|
pyobo/sources/__init__.py
CHANGED
pyobo/sources/agrovoc.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for AGROVOC."""
|
|
4
2
|
|
|
5
3
|
import pystow
|
|
@@ -11,6 +9,8 @@ __all__ = [
|
|
|
11
9
|
"ensure_agrovoc_graph",
|
|
12
10
|
]
|
|
13
11
|
|
|
12
|
+
PREFIX = "agrovoc"
|
|
13
|
+
|
|
14
14
|
|
|
15
15
|
def ensure_agrovoc_graph(version: str) -> Graph:
|
|
16
16
|
"""Download and parse the given version of AGROVOC."""
|
|
@@ -20,5 +20,5 @@ def ensure_agrovoc_graph(version: str) -> Graph:
|
|
|
20
20
|
graph.bind("skosxl", "http://www.w3.org/2008/05/skos-xl#")
|
|
21
21
|
graph.bind("skos", SKOS)
|
|
22
22
|
graph.bind("dcterms", DCTERMS)
|
|
23
|
-
graph.bind(
|
|
23
|
+
graph.bind(PREFIX, "http://aims.fao.org/aos/agrontology#")
|
|
24
24
|
return graph
|
|
@@ -1,9 +1,8 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for the Antibody Registry."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
|
-
from
|
|
4
|
+
from collections.abc import Iterable, Mapping
|
|
5
|
+
from typing import Optional
|
|
7
6
|
|
|
8
7
|
import pandas as pd
|
|
9
8
|
from bioregistry.utils import removeprefix
|
pyobo/sources/biogrid.py
CHANGED
|
@@ -1,9 +1,8 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Extract and convert BioGRID identifiers."""
|
|
4
2
|
|
|
3
|
+
from collections.abc import Mapping
|
|
5
4
|
from functools import partial
|
|
6
|
-
from typing import
|
|
5
|
+
from typing import Optional
|
|
7
6
|
|
|
8
7
|
import pandas as pd
|
|
9
8
|
|
|
@@ -77,7 +76,8 @@ def get_ncbigene_mapping() -> Mapping[str, str]:
|
|
|
77
76
|
.. code-block:: python
|
|
78
77
|
|
|
79
78
|
from pyobo import get_filtered_xrefs
|
|
80
|
-
|
|
79
|
+
|
|
80
|
+
biogrid_ncbigene_mapping = get_filtered_xrefs("biogrid", "ncbigene")
|
|
81
81
|
"""
|
|
82
82
|
df = get_df()
|
|
83
83
|
df = df.loc[df["IDENTIFIER_TYPE"] == "ENTREZ_GENE", ["BIOGRID_ID", "IDENTIFIER_VALUE"]]
|
pyobo/sources/ccle.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Get the CCLE Cells, provided by cBioPortal."""
|
|
4
2
|
|
|
5
3
|
import tarfile
|
|
4
|
+
from collections.abc import Iterable
|
|
6
5
|
from pathlib import Path
|
|
7
|
-
from typing import
|
|
6
|
+
from typing import Optional
|
|
8
7
|
|
|
9
8
|
import pandas as pd
|
|
10
9
|
import pystow
|
|
@@ -25,7 +24,7 @@ class CCLEGetter(Obo):
|
|
|
25
24
|
|
|
26
25
|
ontology = bioregistry_key = PREFIX
|
|
27
26
|
|
|
28
|
-
def __post_init__(self):
|
|
27
|
+
def __post_init__(self):
|
|
29
28
|
self.data_version = VERSION
|
|
30
29
|
|
|
31
30
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
pyobo/sources/cgnc.py
CHANGED
pyobo/sources/chebi.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for ChEBI."""
|
|
4
2
|
|
|
5
|
-
from
|
|
3
|
+
from collections.abc import Mapping
|
|
6
4
|
|
|
7
5
|
from ..api import get_filtered_properties_mapping, get_filtered_relations_df
|
|
8
6
|
from ..struct import Reference, TypeDef
|
|
@@ -33,7 +31,7 @@ def get_chebi_smiles_id_mapping() -> Mapping[str, str]:
|
|
|
33
31
|
has_role = TypeDef(reference=Reference(prefix="chebi", identifier="has_role"))
|
|
34
32
|
|
|
35
33
|
|
|
36
|
-
def get_chebi_role_to_children() -> Mapping[str,
|
|
34
|
+
def get_chebi_role_to_children() -> Mapping[str, set[tuple[str, str]]]:
|
|
37
35
|
"""Get the ChEBI role to children mapping."""
|
|
38
36
|
df = get_filtered_relations_df("chebi", relation=has_role)
|
|
39
37
|
return multisetdict((role_id, ("chebi", chemical_id)) for chemical_id, _, role_id in df.values)
|
pyobo/sources/chembl.py
CHANGED
|
@@ -1,13 +1,11 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for ChEMBL.
|
|
4
2
|
|
|
5
3
|
Run with ``python -m pyobo.sources.chembl -vv``.
|
|
6
4
|
"""
|
|
7
5
|
|
|
8
6
|
import logging
|
|
7
|
+
from collections.abc import Iterable
|
|
9
8
|
from contextlib import closing
|
|
10
|
-
from typing import Iterable
|
|
11
9
|
|
|
12
10
|
import chembl_downloader
|
|
13
11
|
|
pyobo/sources/civic_gene.py
CHANGED
pyobo/sources/complexportal.py
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for ComplexPortal."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
|
-
from
|
|
4
|
+
from collections.abc import Iterable
|
|
7
5
|
|
|
8
6
|
import pandas as pd
|
|
9
7
|
from tqdm.auto import tqdm
|
|
@@ -52,7 +50,7 @@ DTYPE = {
|
|
|
52
50
|
}
|
|
53
51
|
|
|
54
52
|
|
|
55
|
-
def _parse_members(s) ->
|
|
53
|
+
def _parse_members(s) -> list[tuple[Reference, str]]:
|
|
56
54
|
if pd.isna(s):
|
|
57
55
|
return []
|
|
58
56
|
|
|
@@ -60,15 +58,35 @@ def _parse_members(s) -> List[Tuple[Reference, str]]:
|
|
|
60
58
|
for member in s.split("|"):
|
|
61
59
|
entity_id, count = member.split("(")
|
|
62
60
|
count = count.rstrip(")")
|
|
63
|
-
if "
|
|
61
|
+
if entity_id.startswith("URS"):
|
|
62
|
+
prefix, identifier = "rnacentral", entity_id
|
|
63
|
+
elif entity_id.startswith("CPX"):
|
|
64
|
+
# TODO why self xref?
|
|
65
|
+
prefix, identifier = "complexportal", entity_id
|
|
66
|
+
elif entity_id.startswith("["):
|
|
67
|
+
continue # this is a list of uniprot IDs, not sure what to do with this
|
|
68
|
+
elif entity_id.startswith("EBI-"):
|
|
69
|
+
continue
|
|
70
|
+
elif ":" not in entity_id:
|
|
71
|
+
if "PRO_" in entity_id:
|
|
72
|
+
prefix = "uniprot.chain"
|
|
73
|
+
identifier = entity_id.split("-")[1]
|
|
74
|
+
elif "-" in entity_id:
|
|
75
|
+
prefix, identifier = "uniprot.isoform", entity_id
|
|
76
|
+
else:
|
|
77
|
+
prefix, identifier = "uniprot", entity_id
|
|
78
|
+
else:
|
|
64
79
|
prefix, identifier = entity_id.split(":", 1)
|
|
80
|
+
try:
|
|
81
|
+
reference = Reference(prefix=prefix, identifier=identifier)
|
|
82
|
+
except ValueError:
|
|
83
|
+
tqdm.write(f"failed to validate reference: {entity_id}")
|
|
65
84
|
else:
|
|
66
|
-
|
|
67
|
-
rv.append((Reference(prefix=prefix, identifier=identifier), count))
|
|
85
|
+
rv.append((reference, count))
|
|
68
86
|
return rv
|
|
69
87
|
|
|
70
88
|
|
|
71
|
-
def _parse_xrefs(s) ->
|
|
89
|
+
def _parse_xrefs(s) -> list[tuple[Reference, str]]:
|
|
72
90
|
if pd.isna(s):
|
|
73
91
|
return []
|
|
74
92
|
|
|
@@ -76,27 +94,40 @@ def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
|
|
|
76
94
|
for xref in s.split("|"):
|
|
77
95
|
xref = xref.replace("protein ontology:PR:", "PR:")
|
|
78
96
|
xref = xref.replace("protein ontology:PR_", "PR:")
|
|
97
|
+
xref = xref.replace("rhea:rhea ", "rhea:")
|
|
98
|
+
xref = xref.replace("rhea:Rhea ", "rhea:")
|
|
99
|
+
xref = xref.replace("rhea:RHEA:rhea", "rhea:")
|
|
100
|
+
xref = xref.replace("rhea:RHEA: ", "rhea:")
|
|
101
|
+
xref = xref.replace("rhea:RHEA:rhea ", "rhea:")
|
|
102
|
+
xref = xref.replace("intenz:RHEA:", "rhea:")
|
|
103
|
+
xref = xref.replace("eccode::", "eccode:")
|
|
104
|
+
xref = xref.replace("eccode:EC:", "eccode:")
|
|
105
|
+
xref = xref.replace("intenz:EC:", "eccode:")
|
|
106
|
+
xref = xref.replace("eccode:RHEA:", "rhea:")
|
|
107
|
+
xref = xref.replace("efo:MONDO:", "MONDO:")
|
|
108
|
+
xref = xref.replace("omim:MIM:", "omim:")
|
|
109
|
+
xref = xref.replace("efo:HP:", "HP:")
|
|
110
|
+
xref = xref.replace("efo:Orphanet:", "Orphanet:")
|
|
111
|
+
xref = xref.replace("orphanet:ORDO:", "Orphanet:")
|
|
112
|
+
xref = xref.replace("biorxiv:doi.org/", "doi:")
|
|
113
|
+
xref = xref.replace("emdb:EMDB-", "emdb:EMD-")
|
|
114
|
+
xref = xref.replace("wwpdb:EMD-", "emdb:EMD-")
|
|
115
|
+
xref = xref.replace("signor:CPX-", "complexportal:CPX-")
|
|
116
|
+
|
|
79
117
|
try:
|
|
80
118
|
xref_curie, note = xref.split("(")
|
|
81
119
|
except ValueError:
|
|
82
120
|
logger.warning("xref missing (: %s", xref)
|
|
83
121
|
continue
|
|
84
122
|
note = note.rstrip(")")
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
note.replace("eccode:RHEA:", "rhea:")
|
|
90
|
-
if note.lower().startswith("rhea "):
|
|
91
|
-
note = note[len("Rhea ") :]
|
|
92
|
-
if note.lower().startswith("rhea:rhea "):
|
|
93
|
-
note = note[len("rhea:rhea ") :]
|
|
94
|
-
if note.lower().startswith("EC:"):
|
|
95
|
-
note = note[len("EC:") :]
|
|
123
|
+
|
|
124
|
+
if xref_curie.startswith("intenz:"):
|
|
125
|
+
xref_curie = _clean_intenz(xref_curie)
|
|
126
|
+
|
|
96
127
|
try:
|
|
97
128
|
reference = Reference.from_curie(xref_curie)
|
|
98
129
|
except ValueError:
|
|
99
|
-
logger.warning("can not parse CURIE: %s",
|
|
130
|
+
logger.warning("can not parse CURIE: %s", xref_curie)
|
|
100
131
|
continue
|
|
101
132
|
if reference is None:
|
|
102
133
|
logger.warning("reference is None after parsing: %s", xref)
|
|
@@ -105,6 +136,12 @@ def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
|
|
|
105
136
|
return rv
|
|
106
137
|
|
|
107
138
|
|
|
139
|
+
def _clean_intenz(s: str) -> str:
|
|
140
|
+
for _ in range(3):
|
|
141
|
+
s = s.rstrip("-").rstrip(".")
|
|
142
|
+
return s
|
|
143
|
+
|
|
144
|
+
|
|
108
145
|
class ComplexPortalGetter(Obo):
|
|
109
146
|
"""An ontology representation of the Complex Portal."""
|
|
110
147
|
|
pyobo/sources/conso.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for CONSO."""
|
|
4
2
|
|
|
5
|
-
from
|
|
3
|
+
from collections.abc import Iterable
|
|
6
4
|
|
|
7
5
|
import pandas as pd
|
|
8
6
|
|
|
@@ -68,7 +66,7 @@ def iter_terms() -> Iterable[Term]:
|
|
|
68
66
|
for _, row in terms_df.iterrows():
|
|
69
67
|
if row["Name"] == "WITHDRAWN":
|
|
70
68
|
continue
|
|
71
|
-
provenance:
|
|
69
|
+
provenance: list[Reference] = []
|
|
72
70
|
for curie in row["References"].split(","):
|
|
73
71
|
curie = curie.strip()
|
|
74
72
|
if not curie:
|
pyobo/sources/cpt.py
CHANGED
pyobo/sources/credit.py
CHANGED
pyobo/sources/cvx.py
CHANGED
pyobo/sources/depmap.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""DepMap cell lines."""
|
|
4
2
|
|
|
5
|
-
from
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from typing import Optional
|
|
6
5
|
|
|
7
6
|
import pandas as pd
|
|
8
7
|
import pystow
|
|
@@ -113,7 +112,7 @@ def ensure(version: str, force: bool = False) -> pd.DataFrame:
|
|
|
113
112
|
url=get_url(version=version),
|
|
114
113
|
name="sample_info.tsv",
|
|
115
114
|
force=force,
|
|
116
|
-
read_csv_kwargs=
|
|
115
|
+
read_csv_kwargs={"sep": ",", "dtype": str},
|
|
117
116
|
)
|
|
118
117
|
|
|
119
118
|
|
pyobo/sources/dictybase_gene.py
CHANGED
|
@@ -1,18 +1,15 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for dictyBase gene.
|
|
4
2
|
|
|
5
3
|
Note that normal dictybase identifiers are for sequences
|
|
6
4
|
"""
|
|
7
5
|
|
|
8
6
|
import logging
|
|
9
|
-
from
|
|
7
|
+
from collections.abc import Iterable
|
|
10
8
|
|
|
11
9
|
import pandas as pd
|
|
12
10
|
from tqdm.auto import tqdm
|
|
13
11
|
|
|
14
|
-
from pyobo.struct import Obo,
|
|
15
|
-
from pyobo.utils.io import multisetdict
|
|
12
|
+
from pyobo.struct import Obo, Synonym, Term, from_species, has_gene_product
|
|
16
13
|
from pyobo.utils.path import ensure_df
|
|
17
14
|
|
|
18
15
|
__all__ = [
|
|
@@ -51,10 +48,11 @@ def get_obo(force: bool = False) -> Obo:
|
|
|
51
48
|
|
|
52
49
|
def get_terms(force: bool = False) -> Iterable[Term]:
|
|
53
50
|
"""Get terms."""
|
|
51
|
+
# TODO the mappings file has actually no uniprot at all, and requires text mining
|
|
54
52
|
# DDB ID DDB_G ID Name UniProt ID
|
|
55
|
-
uniprot_mappings = multisetdict(
|
|
56
|
-
|
|
57
|
-
)
|
|
53
|
+
# uniprot_mappings = multisetdict(
|
|
54
|
+
# ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
|
|
55
|
+
# )
|
|
58
56
|
|
|
59
57
|
terms = ensure_df(PREFIX, url=URL, force=force, name="gene_info.tsv")
|
|
60
58
|
# GENE ID (DDB_G ID) Gene Name Synonyms Gene products
|
|
@@ -70,10 +68,15 @@ def get_terms(force: bool = False) -> Iterable[Term]:
|
|
|
70
68
|
if synonyms and pd.notna(synonyms):
|
|
71
69
|
for synonym in synonyms.split(","):
|
|
72
70
|
term.append_synonym(Synonym(synonym.strip()))
|
|
73
|
-
for uniprot_id in uniprot_mappings.get(identifier, []):
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
71
|
+
# for uniprot_id in uniprot_mappings.get(identifier, []):
|
|
72
|
+
# if not uniprot_id or pd.isna(uniprot_id) or uniprot_id in {"unknown", "pseudogene"}:
|
|
73
|
+
# continue
|
|
74
|
+
# try:
|
|
75
|
+
# uniprot_ref = Reference(prefix="uniprot", identifier=uniprot_id)
|
|
76
|
+
# except ValueError:
|
|
77
|
+
# tqdm.write(f"[dictybase.gene] invalid uniprot ref: {uniprot_id}")
|
|
78
|
+
# else:
|
|
79
|
+
# term.append_relationship(has_gene_product, uniprot_ref)
|
|
77
80
|
|
|
78
81
|
term.set_species(identifier="44689", name="Dictyostelium discoideum")
|
|
79
82
|
yield term
|
pyobo/sources/drugbank.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Convert DrugBank to OBO.
|
|
4
2
|
|
|
5
3
|
Run with ``python -m pyobo.sources.drugbank``
|
|
@@ -8,14 +6,15 @@ Run with ``python -m pyobo.sources.drugbank``
|
|
|
8
6
|
import datetime
|
|
9
7
|
import itertools as itt
|
|
10
8
|
import logging
|
|
9
|
+
from collections.abc import Iterable, Mapping
|
|
11
10
|
from functools import lru_cache
|
|
12
|
-
from typing import Any,
|
|
11
|
+
from typing import Any, Optional
|
|
13
12
|
from xml.etree import ElementTree
|
|
14
13
|
|
|
15
14
|
import pystow
|
|
16
15
|
from tqdm.auto import tqdm
|
|
17
16
|
|
|
18
|
-
from ..getters import
|
|
17
|
+
from ..getters import NoBuildError
|
|
19
18
|
from ..struct import Obo, Reference, Term
|
|
20
19
|
from ..struct.typedef import has_inchi, has_salt, has_smiles
|
|
21
20
|
from ..utils.cache import cached_pickle
|
|
@@ -139,7 +138,7 @@ def _make_term(drug_info: Mapping[str, Any]) -> Term:
|
|
|
139
138
|
return term
|
|
140
139
|
|
|
141
140
|
|
|
142
|
-
@lru_cache
|
|
141
|
+
@lru_cache
|
|
143
142
|
def get_xml_root(version: Optional[str] = None) -> ElementTree.Element:
|
|
144
143
|
"""Get the DrugBank XML parser root.
|
|
145
144
|
|
|
@@ -152,7 +151,7 @@ def get_xml_root(version: Optional[str] = None) -> ElementTree.Element:
|
|
|
152
151
|
username = pystow.get_config("pyobo", "drugbank_username", raise_on_missing=True)
|
|
153
152
|
password = pystow.get_config("pyobo", "drugbank_password", raise_on_missing=True)
|
|
154
153
|
except ConfigError as e:
|
|
155
|
-
raise
|
|
154
|
+
raise NoBuildError from e
|
|
156
155
|
|
|
157
156
|
element = parse_drugbank(version=version, username=username, password=password)
|
|
158
157
|
return element.getroot()
|
|
@@ -167,7 +166,7 @@ smiles_template = f"{ns}calculated-properties/{ns}property[{ns}kind='SMILES']/{n
|
|
|
167
166
|
def _extract_drug_info(drug_xml: ElementTree.Element) -> Mapping[str, Any]:
|
|
168
167
|
"""Extract information from an XML element representing a drug."""
|
|
169
168
|
# assert drug_xml.tag == f'{ns}drug'
|
|
170
|
-
row:
|
|
169
|
+
row: dict[str, Any] = {
|
|
171
170
|
"type": drug_xml.get("type"),
|
|
172
171
|
"drugbank_id": drug_xml.findtext(f"{ns}drugbank-id[@primary='true']"),
|
|
173
172
|
"cas": drug_xml.findtext(f"{ns}cas-number"),
|
pyobo/sources/drugbank_salt.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Convert DrugBank Salts to OBO.
|
|
4
2
|
|
|
5
3
|
Run with ``python -m pyobo.sources.drugbank_salt``
|
|
@@ -10,11 +8,12 @@ Get relations between drugbank salts and drugbank parents with
|
|
|
10
8
|
.. code-block:: python
|
|
11
9
|
|
|
12
10
|
import pyobo
|
|
13
|
-
|
|
11
|
+
|
|
12
|
+
df = pyobo.get_filtered_relations_df("drugbank", "obo:has_salt")
|
|
14
13
|
"""
|
|
15
14
|
|
|
16
15
|
import logging
|
|
17
|
-
from
|
|
16
|
+
from collections.abc import Iterable
|
|
18
17
|
|
|
19
18
|
from .drugbank import iterate_drug_info
|
|
20
19
|
from ..struct import Obo, Reference, Term
|
pyobo/sources/drugcentral.py
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Get DrugCentral as OBO."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
4
|
from collections import defaultdict
|
|
5
|
+
from collections.abc import Iterable
|
|
7
6
|
from contextlib import closing
|
|
8
|
-
from typing import DefaultDict, Iterable, List
|
|
9
7
|
|
|
10
8
|
import bioregistry
|
|
11
9
|
import psycopg2
|
|
@@ -25,9 +23,9 @@ PREFIX = "drugcentral"
|
|
|
25
23
|
HOST = "unmtid-dbs.net"
|
|
26
24
|
PORT = 5433
|
|
27
25
|
USER = "drugman"
|
|
28
|
-
PASSWORD = "dosage"
|
|
26
|
+
PASSWORD = "dosage" # noqa:S105
|
|
29
27
|
DBNAME = "drugcentral"
|
|
30
|
-
PARAMS =
|
|
28
|
+
PARAMS = {"dbname": DBNAME, "user": USER, "password": PASSWORD, "host": HOST, "port": PORT}
|
|
31
29
|
|
|
32
30
|
|
|
33
31
|
class DrugCentralGetter(Obo):
|
|
@@ -58,7 +56,7 @@ def iter_terms() -> Iterable[Term]:
|
|
|
58
56
|
with closing(conn.cursor()) as cur:
|
|
59
57
|
cur.execute("SELECT struct_id, id_type, identifier FROM public.identifier")
|
|
60
58
|
rows = cur.fetchall()
|
|
61
|
-
xrefs:
|
|
59
|
+
xrefs: defaultdict[str, list[Reference]] = defaultdict(list)
|
|
62
60
|
for drugcentral_id, prefix, identifier in tqdm(
|
|
63
61
|
rows, unit_scale=True, desc="loading xrefs"
|
|
64
62
|
):
|
|
@@ -70,13 +68,16 @@ def iter_terms() -> Iterable[Term]:
|
|
|
70
68
|
if xref_prefix_norm is None:
|
|
71
69
|
tqdm.write(f"did not normalize {prefix}:{identifier}")
|
|
72
70
|
continue
|
|
71
|
+
if xref_prefix_norm == "pdb.ligand":
|
|
72
|
+
# there is a weird invalid escaped \W appearing in pdb ligand ids
|
|
73
|
+
identifier = identifier.strip()
|
|
73
74
|
identifier = bioregistry.standardize_identifier(xref_prefix_norm, identifier)
|
|
74
75
|
xrefs[str(drugcentral_id)].append(
|
|
75
76
|
Reference(prefix=xref_prefix_norm, identifier=identifier)
|
|
76
77
|
)
|
|
77
78
|
with closing(conn.cursor()) as cur:
|
|
78
79
|
cur.execute("SELECT id, name FROM public.synonyms")
|
|
79
|
-
synonyms:
|
|
80
|
+
synonyms: defaultdict[str, list[Synonym]] = defaultdict(list)
|
|
80
81
|
for drugcentral_id, synonym in cur.fetchall():
|
|
81
82
|
synonyms[str(drugcentral_id)].append(Synonym(name=synonym))
|
|
82
83
|
|
|
@@ -100,4 +101,4 @@ def iter_terms() -> Iterable[Term]:
|
|
|
100
101
|
|
|
101
102
|
|
|
102
103
|
if __name__ == "__main__":
|
|
103
|
-
|
|
104
|
+
DrugCentralGetter.cli()
|
pyobo/sources/expasy.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Convert ExPASy to OBO."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
4
|
+
import re
|
|
6
5
|
from collections import defaultdict
|
|
7
|
-
from
|
|
6
|
+
from collections.abc import Iterable, Mapping
|
|
7
|
+
from typing import Any, Optional
|
|
8
8
|
|
|
9
9
|
from .utils import get_go_mapping
|
|
10
10
|
from ..struct import Obo, Reference, Synonym, Term
|
|
@@ -43,7 +43,7 @@ class ExpasyGetter(Obo):
|
|
|
43
43
|
"""A getter for ExPASy Enzyme Classes."""
|
|
44
44
|
|
|
45
45
|
bioversions_key = ontology = PREFIX
|
|
46
|
-
typedefs = [has_member, enables]
|
|
46
|
+
typedefs = [has_member, enables, term_replaced_by]
|
|
47
47
|
root_terms = [
|
|
48
48
|
Reference(prefix="eccode", identifier="1"),
|
|
49
49
|
Reference(prefix="eccode", identifier="2"),
|
|
@@ -76,7 +76,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
76
76
|
with open(tree_path) as file:
|
|
77
77
|
tree = get_tree(file)
|
|
78
78
|
|
|
79
|
-
terms:
|
|
79
|
+
terms: dict[str, Term] = {}
|
|
80
80
|
child_to_parents = defaultdict(list)
|
|
81
81
|
for ec_code, data in tree.items():
|
|
82
82
|
terms[ec_code] = Term(
|
|
@@ -146,7 +146,9 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
146
146
|
for domain in data.get("domains", []):
|
|
147
147
|
term.append_relationship(
|
|
148
148
|
has_member,
|
|
149
|
-
Reference(
|
|
149
|
+
Reference.model_validate(
|
|
150
|
+
{"prefix": domain["namespace"], "identifier": domain["identifier"]},
|
|
151
|
+
),
|
|
150
152
|
)
|
|
151
153
|
for protein in data.get("proteins", []):
|
|
152
154
|
term.append_relationship(
|
|
@@ -176,7 +178,7 @@ def normalize_expasy_id(expasy_id: str) -> str:
|
|
|
176
178
|
return expasy_id.replace(" ", "")
|
|
177
179
|
|
|
178
180
|
|
|
179
|
-
def give_edge(unnormalized_ec_code: str) ->
|
|
181
|
+
def give_edge(unnormalized_ec_code: str) -> tuple[int, Optional[str], str]:
|
|
180
182
|
"""Return a (parent, child) tuple for given id."""
|
|
181
183
|
levels = [x for x in unnormalized_ec_code.replace(" ", "").replace("-", "").split(".") if x]
|
|
182
184
|
level = len(levels)
|
|
@@ -227,7 +229,7 @@ def get_database(lines: Iterable[str]) -> Mapping:
|
|
|
227
229
|
for groups in _group_by_id(lines):
|
|
228
230
|
_, expasy_id = groups[0]
|
|
229
231
|
|
|
230
|
-
ec_data_entry:
|
|
232
|
+
ec_data_entry: dict[str, Any] = {
|
|
231
233
|
"concept": {
|
|
232
234
|
"namespace": PREFIX,
|
|
233
235
|
"identifier": expasy_id,
|
|
@@ -249,8 +251,10 @@ def get_database(lines: Iterable[str]) -> Mapping:
|
|
|
249
251
|
elif descriptor == DE and value == "Deleted entry.":
|
|
250
252
|
ec_data_entry["deleted"] = True
|
|
251
253
|
elif descriptor == DE and value.startswith("Transferred entry: "):
|
|
252
|
-
|
|
253
|
-
|
|
254
|
+
# TODO There's a situation where there are enough transfers that it goes on to a second line
|
|
255
|
+
# the following line just gives up on this one. or maybe I don't understand
|
|
256
|
+
value = value.strip().removesuffix("and").rstrip(",").strip()
|
|
257
|
+
ec_data_entry["transfer_id"] = _parse_transfer(value)
|
|
254
258
|
elif descriptor == DE:
|
|
255
259
|
ec_data_entry["concept"]["name"] = value.rstrip(".") # type:ignore
|
|
256
260
|
elif descriptor == AN:
|
|
@@ -269,17 +273,30 @@ def get_database(lines: Iterable[str]) -> Mapping:
|
|
|
269
273
|
continue
|
|
270
274
|
uniprot_id, uniprot_accession = uniprot_entry.split(",")
|
|
271
275
|
ec_data_entry["proteins"].append( # type:ignore
|
|
272
|
-
|
|
273
|
-
namespace
|
|
274
|
-
name
|
|
275
|
-
identifier
|
|
276
|
-
|
|
276
|
+
{
|
|
277
|
+
"namespace": "uniprot",
|
|
278
|
+
"name": uniprot_accession,
|
|
279
|
+
"identifier": uniprot_id,
|
|
280
|
+
}
|
|
277
281
|
)
|
|
278
282
|
|
|
279
283
|
rv[expasy_id] = ec_data_entry
|
|
280
284
|
return rv
|
|
281
285
|
|
|
282
286
|
|
|
287
|
+
TRANSFER_SPLIT_RE = re.compile(r",\s*|\s+and\s+")
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def _parse_transfer(value: str) -> list[str]:
|
|
291
|
+
"""Parse transferred entry string.
|
|
292
|
+
|
|
293
|
+
>>> _parse_transfer("Transferred entry: 1.1.1.198, 1.1.1.227 and 1.1.1.228.")
|
|
294
|
+
['1.1.1.198', '1.1.1.227', '1.1.1.228']
|
|
295
|
+
"""
|
|
296
|
+
value = value[len("Transferred entry: ") :].rstrip().rstrip(".")
|
|
297
|
+
return sorted(x.strip().removeprefix("and").strip() for x in TRANSFER_SPLIT_RE.split(value))
|
|
298
|
+
|
|
299
|
+
|
|
283
300
|
def _group_by_id(lines):
|
|
284
301
|
"""Group lines by identifier."""
|
|
285
302
|
groups = []
|
|
@@ -300,7 +317,7 @@ def _group_by_id(lines):
|
|
|
300
317
|
return groups
|
|
301
318
|
|
|
302
319
|
|
|
303
|
-
def get_ec2go(version: str) -> Mapping[str,
|
|
320
|
+
def get_ec2go(version: str) -> Mapping[str, set[tuple[str, str]]]:
|
|
304
321
|
"""Get the EC mapping to GO activities."""
|
|
305
322
|
url = "http://current.geneontology.org/ontology/external2go/ec2go"
|
|
306
323
|
path = ensure_path(PREFIX, url=url, name="ec2go.tsv", version=version)
|