pyobo 0.10.12__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/__init__.py +0 -2
- pyobo/__main__.py +0 -2
- pyobo/api/__init__.py +0 -2
- pyobo/api/alts.py +6 -7
- pyobo/api/hierarchy.py +14 -15
- pyobo/api/metadata.py +3 -4
- pyobo/api/names.py +31 -32
- pyobo/api/properties.py +6 -7
- pyobo/api/relations.py +12 -11
- pyobo/api/species.py +5 -6
- pyobo/api/typedefs.py +1 -3
- pyobo/api/utils.py +61 -5
- pyobo/api/xrefs.py +4 -5
- pyobo/aws.py +3 -5
- pyobo/cli/__init__.py +0 -2
- pyobo/cli/aws.py +0 -2
- pyobo/cli/cli.py +0 -4
- pyobo/cli/database.py +1 -3
- pyobo/cli/lookup.py +0 -2
- pyobo/cli/utils.py +0 -2
- pyobo/constants.py +1 -33
- pyobo/getters.py +19 -26
- pyobo/gilda_utils.py +19 -17
- pyobo/identifier_utils.py +10 -10
- pyobo/mocks.py +5 -6
- pyobo/normalizer.py +24 -24
- pyobo/obographs.py +8 -5
- pyobo/plugins.py +3 -4
- pyobo/py.typed +0 -0
- pyobo/reader.py +19 -21
- pyobo/registries/__init__.py +0 -2
- pyobo/registries/metaregistry.py +6 -8
- pyobo/resource_utils.py +1 -3
- pyobo/resources/__init__.py +0 -2
- pyobo/resources/ncbitaxon.py +2 -3
- pyobo/resources/ro.py +2 -4
- pyobo/resources/so.py +55 -0
- pyobo/resources/so.tsv +2604 -0
- pyobo/sources/README.md +15 -0
- pyobo/sources/__init__.py +0 -2
- pyobo/sources/agrovoc.py +3 -3
- pyobo/sources/antibodyregistry.py +2 -3
- pyobo/sources/biogrid.py +4 -4
- pyobo/sources/ccle.py +3 -4
- pyobo/sources/cgnc.py +1 -3
- pyobo/sources/chebi.py +2 -4
- pyobo/sources/chembl.py +1 -3
- pyobo/sources/civic_gene.py +2 -3
- pyobo/sources/complexportal.py +57 -20
- pyobo/sources/conso.py +2 -4
- pyobo/sources/cpt.py +1 -3
- pyobo/sources/credit.py +1 -1
- pyobo/sources/cvx.py +1 -3
- pyobo/sources/depmap.py +3 -4
- pyobo/sources/dictybase_gene.py +15 -12
- pyobo/sources/drugbank.py +6 -7
- pyobo/sources/drugbank_salt.py +3 -4
- pyobo/sources/drugcentral.py +9 -8
- pyobo/sources/expasy.py +33 -16
- pyobo/sources/famplex.py +3 -5
- pyobo/sources/flybase.py +5 -6
- pyobo/sources/geonames.py +1 -1
- pyobo/sources/gmt_utils.py +5 -6
- pyobo/sources/go.py +4 -6
- pyobo/sources/gwascentral_phenotype.py +1 -3
- pyobo/sources/gwascentral_study.py +2 -3
- pyobo/sources/hgnc.py +30 -26
- pyobo/sources/hgncgenefamily.py +9 -11
- pyobo/sources/icd10.py +3 -4
- pyobo/sources/icd11.py +3 -4
- pyobo/sources/icd_utils.py +6 -7
- pyobo/sources/interpro.py +3 -5
- pyobo/sources/itis.py +1 -3
- pyobo/sources/kegg/__init__.py +0 -2
- pyobo/sources/kegg/api.py +3 -4
- pyobo/sources/kegg/genes.py +3 -4
- pyobo/sources/kegg/genome.py +19 -9
- pyobo/sources/kegg/pathway.py +5 -6
- pyobo/sources/mesh.py +19 -21
- pyobo/sources/mgi.py +1 -3
- pyobo/sources/mirbase.py +13 -9
- pyobo/sources/mirbase_constants.py +0 -2
- pyobo/sources/mirbase_family.py +1 -3
- pyobo/sources/mirbase_mature.py +1 -3
- pyobo/sources/msigdb.py +4 -5
- pyobo/sources/ncbigene.py +3 -5
- pyobo/sources/npass.py +2 -4
- pyobo/sources/omim_ps.py +1 -3
- pyobo/sources/pathbank.py +35 -28
- pyobo/sources/pfam.py +1 -3
- pyobo/sources/pfam_clan.py +1 -3
- pyobo/sources/pid.py +3 -5
- pyobo/sources/pombase.py +7 -6
- pyobo/sources/pubchem.py +2 -3
- pyobo/sources/reactome.py +30 -11
- pyobo/sources/rgd.py +3 -4
- pyobo/sources/rhea.py +7 -8
- pyobo/sources/ror.py +3 -2
- pyobo/sources/selventa/__init__.py +0 -2
- pyobo/sources/selventa/schem.py +1 -3
- pyobo/sources/selventa/scomp.py +1 -3
- pyobo/sources/selventa/sdis.py +1 -3
- pyobo/sources/selventa/sfam.py +1 -3
- pyobo/sources/sgd.py +1 -3
- pyobo/sources/slm.py +29 -17
- pyobo/sources/umls/__init__.py +0 -2
- pyobo/sources/umls/__main__.py +0 -2
- pyobo/sources/umls/get_synonym_types.py +1 -1
- pyobo/sources/umls/umls.py +2 -4
- pyobo/sources/uniprot/__init__.py +0 -2
- pyobo/sources/uniprot/uniprot.py +11 -10
- pyobo/sources/uniprot/uniprot_ptm.py +6 -5
- pyobo/sources/utils.py +3 -5
- pyobo/sources/wikipathways.py +1 -3
- pyobo/sources/zfin.py +20 -9
- pyobo/ssg/__init__.py +3 -2
- pyobo/struct/__init__.py +0 -2
- pyobo/struct/reference.py +22 -23
- pyobo/struct/struct.py +132 -116
- pyobo/struct/typedef.py +14 -10
- pyobo/struct/utils.py +0 -2
- pyobo/utils/__init__.py +0 -2
- pyobo/utils/cache.py +14 -6
- pyobo/utils/io.py +9 -10
- pyobo/utils/iter.py +5 -6
- pyobo/utils/misc.py +1 -3
- pyobo/utils/ndex_utils.py +6 -7
- pyobo/utils/path.py +4 -5
- pyobo/version.py +3 -5
- pyobo/xrefdb/__init__.py +0 -2
- pyobo/xrefdb/canonicalizer.py +27 -18
- pyobo/xrefdb/priority.py +0 -2
- pyobo/xrefdb/sources/__init__.py +3 -4
- pyobo/xrefdb/sources/biomappings.py +0 -2
- pyobo/xrefdb/sources/cbms2019.py +0 -2
- pyobo/xrefdb/sources/chembl.py +0 -2
- pyobo/xrefdb/sources/compath.py +1 -3
- pyobo/xrefdb/sources/famplex.py +3 -5
- pyobo/xrefdb/sources/gilda.py +0 -2
- pyobo/xrefdb/sources/intact.py +5 -5
- pyobo/xrefdb/sources/ncit.py +1 -3
- pyobo/xrefdb/sources/pubchem.py +2 -5
- pyobo/xrefdb/sources/wikidata.py +2 -4
- pyobo/xrefdb/xrefs_pipeline.py +15 -16
- {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/LICENSE +1 -1
- pyobo-0.11.1.dist-info/METADATA +711 -0
- pyobo-0.11.1.dist-info/RECORD +173 -0
- {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/WHEEL +1 -1
- pyobo-0.11.1.dist-info/entry_points.txt +2 -0
- pyobo-0.10.12.dist-info/METADATA +0 -499
- pyobo-0.10.12.dist-info/RECORD +0 -169
- pyobo-0.10.12.dist-info/entry_points.txt +0 -15
- {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/top_level.txt +0 -0
pyobo/sources/mesh.py
CHANGED
|
@@ -1,17 +1,16 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Parser for the MeSH descriptors."""
|
|
4
2
|
|
|
5
3
|
import datetime
|
|
6
4
|
import itertools as itt
|
|
7
5
|
import logging
|
|
8
6
|
import re
|
|
9
|
-
from
|
|
7
|
+
from collections.abc import Collection, Iterable, Mapping
|
|
8
|
+
from typing import Any, Optional
|
|
10
9
|
from xml.etree.ElementTree import Element
|
|
11
10
|
|
|
12
11
|
from tqdm.auto import tqdm
|
|
13
12
|
|
|
14
|
-
from pyobo.api.utils import
|
|
13
|
+
from pyobo.api.utils import safe_get_version
|
|
15
14
|
from pyobo.identifier_utils import standardize_ec
|
|
16
15
|
from pyobo.struct import Obo, Reference, Synonym, Term
|
|
17
16
|
from pyobo.utils.cache import cached_json, cached_mapping
|
|
@@ -70,7 +69,7 @@ def get_tree_to_mesh_id(version: str) -> Mapping[str, str]:
|
|
|
70
69
|
|
|
71
70
|
def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
72
71
|
"""Get MeSH OBO terms."""
|
|
73
|
-
mesh_id_to_term:
|
|
72
|
+
mesh_id_to_term: dict[str, Term] = {}
|
|
74
73
|
|
|
75
74
|
descriptors = ensure_mesh_descriptors(version=version, force=force)
|
|
76
75
|
supplemental_records = ensure_mesh_supplemental_records(version=version, force=force)
|
|
@@ -80,8 +79,8 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
80
79
|
name = entry["name"]
|
|
81
80
|
definition = entry.get("scope_note")
|
|
82
81
|
|
|
83
|
-
xrefs:
|
|
84
|
-
synonyms:
|
|
82
|
+
xrefs: list[Reference] = []
|
|
83
|
+
synonyms: set[str] = set()
|
|
85
84
|
for concept in entry["concepts"]:
|
|
86
85
|
synonyms.add(concept["name"])
|
|
87
86
|
for term in concept["terms"]:
|
|
@@ -107,7 +106,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
107
106
|
|
|
108
107
|
def ensure_mesh_descriptors(
|
|
109
108
|
version: str, force: bool = False, force_process: bool = False
|
|
110
|
-
) ->
|
|
109
|
+
) -> list[Mapping[str, Any]]:
|
|
111
110
|
"""Get the parsed MeSH dictionary, and cache it if it wasn't already."""
|
|
112
111
|
|
|
113
112
|
@cached_json(path=prefix_directory_join(PREFIX, name="desc.json", version=version), force=force)
|
|
@@ -133,7 +132,7 @@ def get_supplemental_url(version: str) -> str:
|
|
|
133
132
|
return f"https://nlmpubs.nlm.nih.gov/projects/mesh/{version}/xmlmesh/supp{version}.gz"
|
|
134
133
|
|
|
135
134
|
|
|
136
|
-
def ensure_mesh_supplemental_records(version: str, force: bool = False) ->
|
|
135
|
+
def ensure_mesh_supplemental_records(version: str, force: bool = False) -> list[Mapping[str, Any]]:
|
|
137
136
|
"""Get the parsed MeSH dictionary, and cache it if it wasn't already."""
|
|
138
137
|
|
|
139
138
|
@cached_json(path=prefix_directory_join(PREFIX, name="supp.json", version=version), force=force)
|
|
@@ -147,11 +146,11 @@ def ensure_mesh_supplemental_records(version: str, force: bool = False) -> List[
|
|
|
147
146
|
return _inner()
|
|
148
147
|
|
|
149
148
|
|
|
150
|
-
def get_descriptor_records(element: Element, id_key: str, name_key) ->
|
|
149
|
+
def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict[str, Any]]:
|
|
151
150
|
"""Get MeSH descriptor records."""
|
|
152
151
|
logger.info("extract MeSH descriptors, concepts, and terms")
|
|
153
152
|
|
|
154
|
-
rv:
|
|
153
|
+
rv: list[dict[str, Any]] = [
|
|
155
154
|
get_descriptor_record(descriptor, id_key=id_key, name_key=name_key)
|
|
156
155
|
for descriptor in tqdm(element, desc="Getting MeSH Descriptors", unit_scale=True)
|
|
157
156
|
]
|
|
@@ -204,7 +203,7 @@ def get_descriptor_record(
|
|
|
204
203
|
element: Element,
|
|
205
204
|
id_key: str,
|
|
206
205
|
name_key: str,
|
|
207
|
-
) ->
|
|
206
|
+
) -> dict[str, Any]:
|
|
208
207
|
"""Get descriptor records from the main element.
|
|
209
208
|
|
|
210
209
|
:param element: An XML element
|
|
@@ -228,13 +227,13 @@ def get_descriptor_record(
|
|
|
228
227
|
return rv
|
|
229
228
|
|
|
230
229
|
|
|
231
|
-
def get_concept_records(element: Element) ->
|
|
230
|
+
def get_concept_records(element: Element) -> list[Mapping[str, Any]]:
|
|
232
231
|
"""Get concepts from a record."""
|
|
233
232
|
return [get_concept_record(e) for e in element.findall("ConceptList/Concept")]
|
|
234
233
|
|
|
235
234
|
|
|
236
|
-
def _get_xrefs(element: Element) ->
|
|
237
|
-
raw_registry_numbers:
|
|
235
|
+
def _get_xrefs(element: Element) -> list[tuple[str, str]]:
|
|
236
|
+
raw_registry_numbers: list[str] = sorted(
|
|
238
237
|
{e.text for e in element.findall("RelatedRegistryNumberList/RegistryNumber") if e.text}
|
|
239
238
|
)
|
|
240
239
|
registry_number = element.findtext("RegistryNumber")
|
|
@@ -267,7 +266,7 @@ def get_concept_record(element: Element) -> Mapping[str, Any]:
|
|
|
267
266
|
if scope_note is not None:
|
|
268
267
|
scope_note = scope_note.replace("\\n", "\n").strip()
|
|
269
268
|
|
|
270
|
-
rv:
|
|
269
|
+
rv: dict[str, Any] = {
|
|
271
270
|
"concept_ui": element.findtext("ConceptUI"),
|
|
272
271
|
"name": element.findtext("ConceptName/String"),
|
|
273
272
|
"terms": get_term_records(element),
|
|
@@ -286,7 +285,7 @@ def get_concept_record(element: Element) -> Mapping[str, Any]:
|
|
|
286
285
|
return rv
|
|
287
286
|
|
|
288
287
|
|
|
289
|
-
def get_term_records(element: Element) ->
|
|
288
|
+
def get_term_records(element: Element) -> list[Mapping[str, Any]]:
|
|
290
289
|
"""Get all of the terms for a concept."""
|
|
291
290
|
return [get_term_record(term) for term in element.findall("TermList/Term")]
|
|
292
291
|
|
|
@@ -307,7 +306,7 @@ def _text_or_bust(element: Element, name: str) -> str:
|
|
|
307
306
|
return n
|
|
308
307
|
|
|
309
308
|
|
|
310
|
-
def _get_descriptor_qualifiers(descriptor: Element) ->
|
|
309
|
+
def _get_descriptor_qualifiers(descriptor: Element) -> list[Mapping[str, str]]:
|
|
311
310
|
return [
|
|
312
311
|
{
|
|
313
312
|
"qualifier_ui": _text_or_bust(qualifier, "QualifierUI"),
|
|
@@ -321,7 +320,7 @@ def _get_descriptor_qualifiers(descriptor: Element) -> List[Mapping[str, str]]:
|
|
|
321
320
|
|
|
322
321
|
def get_mesh_category_curies(
|
|
323
322
|
letter: str, *, skip: Optional[Collection[str]] = None, version: Optional[str] = None
|
|
324
|
-
) ->
|
|
323
|
+
) -> list[str]:
|
|
325
324
|
"""Get the MeSH LUIDs for a category, by letter (e.g., "A").
|
|
326
325
|
|
|
327
326
|
:param letter: The MeSH tree, A for anatomy, C for disease, etc.
|
|
@@ -332,8 +331,7 @@ def get_mesh_category_curies(
|
|
|
332
331
|
.. seealso:: https://meshb.nlm.nih.gov/treeView
|
|
333
332
|
"""
|
|
334
333
|
if version is None:
|
|
335
|
-
version =
|
|
336
|
-
assert version is not None
|
|
334
|
+
version = safe_get_version("mesh")
|
|
337
335
|
tree_to_mesh = get_tree_to_mesh_id(version=version)
|
|
338
336
|
rv = []
|
|
339
337
|
for i in range(1, 100):
|
pyobo/sources/mgi.py
CHANGED
pyobo/sources/mirbase.py
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for miRBase."""
|
|
4
2
|
|
|
5
3
|
import gzip
|
|
6
4
|
import logging
|
|
7
|
-
from
|
|
5
|
+
from collections.abc import Iterable, Mapping
|
|
8
6
|
|
|
9
7
|
from tqdm.auto import tqdm
|
|
10
8
|
|
|
@@ -48,7 +46,7 @@ def get_obo(force: bool = False) -> Obo:
|
|
|
48
46
|
return MiRBaseGetter(force=force)
|
|
49
47
|
|
|
50
48
|
|
|
51
|
-
def get_terms(version: str, force: bool = False) ->
|
|
49
|
+
def get_terms(version: str, force: bool = False) -> list[Term]:
|
|
52
50
|
"""Parse miRNA data from filepath and convert it to dictionary."""
|
|
53
51
|
_assert_frozen_version(version)
|
|
54
52
|
url = f"{BASE_URL}/miRNA.dat.gz"
|
|
@@ -77,7 +75,7 @@ def _prepare_organisms(version: str, force: bool = False):
|
|
|
77
75
|
return {division: (taxonomy_id, name) for _, division, name, _tree, taxonomy_id in df.values}
|
|
78
76
|
|
|
79
77
|
|
|
80
|
-
def _prepare_aliases(version: str, force: bool = False) -> Mapping[str,
|
|
78
|
+
def _prepare_aliases(version: str, force: bool = False) -> Mapping[str, list[str]]:
|
|
81
79
|
_assert_frozen_version(version)
|
|
82
80
|
url = f"{BASE_URL}/aliases.txt.gz"
|
|
83
81
|
df = ensure_df(PREFIX, url=url, sep="\t", version=version, force=force)
|
|
@@ -94,7 +92,7 @@ def _process_definitions_lines(
|
|
|
94
92
|
organisms = _prepare_organisms(version, force=force)
|
|
95
93
|
aliases = _prepare_aliases(version, force=force)
|
|
96
94
|
|
|
97
|
-
groups:
|
|
95
|
+
groups: list[list[str]] = []
|
|
98
96
|
|
|
99
97
|
for line in lines: # TODO replace with itertools.groupby
|
|
100
98
|
if line.startswith("ID"):
|
|
@@ -138,9 +136,15 @@ def _process_definitions_lines(
|
|
|
138
136
|
xref_prefix = xref_mapping.get(xref_prefix, xref_prefix)
|
|
139
137
|
if xref_prefix == "pictar":
|
|
140
138
|
continue
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
139
|
+
|
|
140
|
+
try:
|
|
141
|
+
xref = Reference(
|
|
142
|
+
prefix=xref_prefix, identifier=xref_identifier, name=xref_label or None
|
|
143
|
+
)
|
|
144
|
+
except ValueError:
|
|
145
|
+
tqdm.write(f"invalid xref: {xref_prefix}:{xref_identifier}")
|
|
146
|
+
else:
|
|
147
|
+
xrefs.append(xref)
|
|
144
148
|
|
|
145
149
|
# TODO add pubmed references
|
|
146
150
|
|
pyobo/sources/mirbase_family.py
CHANGED
pyobo/sources/mirbase_mature.py
CHANGED
pyobo/sources/msigdb.py
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Parsers for MSig."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
|
-
from
|
|
7
|
-
from
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from typing import Optional
|
|
8
6
|
|
|
7
|
+
from lxml.etree import ElementTree
|
|
9
8
|
from tqdm.auto import tqdm
|
|
10
9
|
|
|
11
10
|
from ..struct import Obo, Reference, Term, has_participant
|
|
@@ -137,7 +136,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
137
136
|
def _get_definition(attrib) -> Optional[str]:
|
|
138
137
|
rv = attrib["DESCRIPTION_FULL"].strip() or attrib["DESCRIPTION_BRIEF"].strip() or None
|
|
139
138
|
if rv is not None:
|
|
140
|
-
return rv.replace(r"\d", "").replace(r"\s", "")
|
|
139
|
+
return rv.replace(r"\d", "").replace(r"\s", "")
|
|
141
140
|
return None
|
|
142
141
|
|
|
143
142
|
|
pyobo/sources/ncbigene.py
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for Entrez."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
|
-
from
|
|
4
|
+
from collections.abc import Iterable, Mapping
|
|
7
5
|
|
|
8
6
|
import bioregistry
|
|
9
7
|
import pandas as pd
|
|
@@ -47,7 +45,7 @@ GENE_INFO_COLUMNS = [
|
|
|
47
45
|
]
|
|
48
46
|
|
|
49
47
|
|
|
50
|
-
def get_ncbigene_ids() ->
|
|
48
|
+
def get_ncbigene_ids() -> set[str]:
|
|
51
49
|
"""Get the Entrez name mapping."""
|
|
52
50
|
df = _get_ncbigene_subset(["GeneID"])
|
|
53
51
|
return set(df["GeneID"])
|
|
@@ -68,7 +66,7 @@ def _get_ncbigene_info_subset(usecols) -> Mapping[str, str]:
|
|
|
68
66
|
return dict(df.values)
|
|
69
67
|
|
|
70
68
|
|
|
71
|
-
def _get_ncbigene_subset(usecols:
|
|
69
|
+
def _get_ncbigene_subset(usecols: list[str]) -> pd.DataFrame:
|
|
72
70
|
df = ensure_df(
|
|
73
71
|
PREFIX,
|
|
74
72
|
url=GENE_INFO_URL,
|
pyobo/sources/npass.py
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for NPASS."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
|
-
from
|
|
4
|
+
from collections.abc import Iterable
|
|
7
5
|
|
|
8
6
|
import pandas as pd
|
|
9
7
|
from tqdm.auto import tqdm
|
|
@@ -41,7 +39,7 @@ def get_obo(force: bool = False) -> Obo:
|
|
|
41
39
|
|
|
42
40
|
def get_df(version: str, force: bool = False) -> pd.DataFrame:
|
|
43
41
|
"""Get the NPASS chemical nomenclature."""
|
|
44
|
-
base_url = f"
|
|
42
|
+
base_url = f"https://bidd.group/NPASS/downloadFiles/NPASSv{version}_download"
|
|
45
43
|
url = f"{base_url}_naturalProducts_generalInfo.txt"
|
|
46
44
|
return ensure_df(
|
|
47
45
|
PREFIX,
|
pyobo/sources/omim_ps.py
CHANGED
pyobo/sources/pathbank.py
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for PathBank."""
|
|
4
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
5
|
import logging
|
|
6
6
|
from collections import defaultdict
|
|
7
|
-
from
|
|
7
|
+
from collections.abc import Iterable, Mapping
|
|
8
8
|
|
|
9
9
|
import pandas as pd
|
|
10
10
|
from tqdm.auto import tqdm
|
|
11
11
|
|
|
12
12
|
from ..struct import Obo, Reference, Term
|
|
13
|
-
from ..struct.typedef import has_participant
|
|
13
|
+
from ..struct.typedef import has_category, has_participant
|
|
14
14
|
from ..utils.path import ensure_df
|
|
15
15
|
|
|
16
16
|
__all__ = [
|
|
@@ -70,7 +70,7 @@ class PathBankGetter(Obo):
|
|
|
70
70
|
"""An ontology representation of PathBank's pathway nomenclature."""
|
|
71
71
|
|
|
72
72
|
ontology = bioversions_key = PREFIX
|
|
73
|
-
typedefs = [has_participant]
|
|
73
|
+
typedefs = [has_participant, has_category]
|
|
74
74
|
|
|
75
75
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
76
76
|
"""Iterate over terms in the ontology."""
|
|
@@ -98,46 +98,58 @@ def get_proteins_df(version: str, force: bool = False) -> pd.DataFrame:
|
|
|
98
98
|
return proteins_df
|
|
99
99
|
|
|
100
100
|
|
|
101
|
-
def get_protein_mapping(version: str, force: bool = False) -> Mapping[str,
|
|
101
|
+
def get_protein_mapping(version: str, force: bool = False) -> Mapping[str, set[Reference]]:
|
|
102
102
|
"""Make the protein mapping."""
|
|
103
103
|
proteins_df = get_proteins_df(version=version, force=force)
|
|
104
104
|
smpdb_id_to_proteins = defaultdict(set)
|
|
105
105
|
for pathway_id, protein_id in tqdm(
|
|
106
106
|
proteins_df.values, desc=f"[{PREFIX}] mapping proteins", unit_scale=True
|
|
107
107
|
):
|
|
108
|
-
|
|
109
|
-
|
|
108
|
+
try:
|
|
109
|
+
if "-" in protein_id:
|
|
110
|
+
reference = Reference(prefix="uniprot.isoform", identifier=protein_id)
|
|
111
|
+
else:
|
|
112
|
+
reference = Reference(prefix="uniprot", identifier=protein_id)
|
|
113
|
+
except ValueError:
|
|
114
|
+
tqdm.write(f"[pathbank] invalid uniprot identifier: {protein_id}")
|
|
115
|
+
else:
|
|
116
|
+
smpdb_id_to_proteins[pathway_id].add(reference)
|
|
110
117
|
return smpdb_id_to_proteins
|
|
111
118
|
|
|
112
119
|
|
|
113
120
|
def get_metabolite_df(version: str, force: bool = False) -> pd.DataFrame:
|
|
114
121
|
"""Get the metabolites dataframe."""
|
|
115
|
-
|
|
122
|
+
df = ensure_df(
|
|
116
123
|
PREFIX,
|
|
117
124
|
url=METABOLITE_URL,
|
|
118
125
|
sep=",",
|
|
119
|
-
usecols=["PathBank ID", "
|
|
126
|
+
usecols=["PathBank ID", "ChEBI ID"],
|
|
120
127
|
force=force,
|
|
121
128
|
version=version,
|
|
122
129
|
)
|
|
130
|
+
df = df[df["ChEBI ID"].notna()]
|
|
131
|
+
return df
|
|
123
132
|
|
|
124
133
|
|
|
125
|
-
def get_metabolite_mapping(version: str, force: bool = False) -> Mapping[str,
|
|
134
|
+
def get_metabolite_mapping(version: str, force: bool = False) -> Mapping[str, set[Reference]]:
|
|
126
135
|
"""Make the metabolite mapping."""
|
|
127
136
|
metabolites_df = get_metabolite_df(version=version, force=force)
|
|
128
137
|
smpdb_id_to_metabolites = defaultdict(set)
|
|
129
138
|
it = tqdm(metabolites_df.values, desc=f"[{PREFIX}] mapping metabolites", unit_scale=True)
|
|
130
|
-
for pathway_id, metabolite_id
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
prefix=PREFIX,
|
|
134
|
-
identifier=metabolite_id,
|
|
135
|
-
name=metabolite_name,
|
|
136
|
-
)
|
|
137
|
-
)
|
|
139
|
+
for pathway_id, metabolite_id in it:
|
|
140
|
+
reference = Reference(prefix="chebi", identifier=metabolite_id.strip())
|
|
141
|
+
smpdb_id_to_metabolites[pathway_id].add(reference)
|
|
138
142
|
return smpdb_id_to_metabolites
|
|
139
143
|
|
|
140
144
|
|
|
145
|
+
def _clean_description(description: str) -> str | None:
|
|
146
|
+
"""Clean the description."""
|
|
147
|
+
if pd.isna(description) or not description:
|
|
148
|
+
return None
|
|
149
|
+
parts = [part.strip() for part in description.strip().splitlines()]
|
|
150
|
+
return " ".join(parts)
|
|
151
|
+
|
|
152
|
+
|
|
141
153
|
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
142
154
|
"""Get PathBank's terms."""
|
|
143
155
|
smpdb_id_to_proteins = get_protein_mapping(version=version, force=force)
|
|
@@ -149,16 +161,11 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
149
161
|
reference = Reference(prefix=PREFIX, identifier=pathbank_id, name=name)
|
|
150
162
|
term = Term(
|
|
151
163
|
reference=reference,
|
|
152
|
-
#
|
|
153
|
-
|
|
154
|
-
)
|
|
155
|
-
term.append_parent(
|
|
156
|
-
Reference(
|
|
157
|
-
prefix=PREFIX,
|
|
158
|
-
identifier=subject.lower().replace(" ", "_"),
|
|
159
|
-
name=subject,
|
|
160
|
-
)
|
|
164
|
+
# TODO use _clean_description(description) to add a description,
|
|
165
|
+
# but there are weird parser errors
|
|
161
166
|
)
|
|
167
|
+
term.append_exact_match(Reference(prefix="smpdb", identifier=smpdb_id))
|
|
168
|
+
term.append_property(has_category, subject.lower().replace(" ", "_"))
|
|
162
169
|
term.extend_relationship(has_participant, smpdb_id_to_proteins[smpdb_id])
|
|
163
170
|
term.extend_relationship(has_participant, smpdb_id_to_metabolites[smpdb_id])
|
|
164
171
|
yield term
|
pyobo/sources/pfam.py
CHANGED
pyobo/sources/pfam_clan.py
CHANGED
pyobo/sources/pid.py
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for NCI PID."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
4
|
from collections import defaultdict
|
|
7
|
-
from
|
|
5
|
+
from collections.abc import Iterable, Mapping
|
|
8
6
|
|
|
9
7
|
import pandas as pd
|
|
10
8
|
|
|
@@ -45,7 +43,7 @@ def get_obo() -> Obo:
|
|
|
45
43
|
return PIDGetter()
|
|
46
44
|
|
|
47
45
|
|
|
48
|
-
def iter_networks(use_tqdm: bool = False, force: bool = False) -> Iterable[
|
|
46
|
+
def iter_networks(use_tqdm: bool = False, force: bool = False) -> Iterable[tuple[str, CX]]:
|
|
49
47
|
"""Iterate over NCI PID networks."""
|
|
50
48
|
yield from ensure_ndex_network_set(
|
|
51
49
|
PREFIX, NDEX_NETWORK_SET_UUID, use_tqdm=use_tqdm, force=force
|
|
@@ -117,7 +115,7 @@ def get_curation_df() -> pd.DataFrame:
|
|
|
117
115
|
return df[["Text from NDEx", "Type", "Namespace", "Identifier"]]
|
|
118
116
|
|
|
119
117
|
|
|
120
|
-
def get_remapping() -> Mapping[str,
|
|
118
|
+
def get_remapping() -> Mapping[str, list[tuple[str, str]]]:
|
|
121
119
|
"""Get a mapping from text to list of HGNC id/symbols."""
|
|
122
120
|
curation_df = get_curation_df()
|
|
123
121
|
rv = defaultdict(list)
|
pyobo/sources/pombase.py
CHANGED
|
@@ -1,16 +1,15 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for PomBase."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
4
|
from collections import defaultdict
|
|
7
|
-
from
|
|
5
|
+
from collections.abc import Iterable
|
|
8
6
|
|
|
9
7
|
import pandas as pd
|
|
10
8
|
from tqdm.auto import tqdm
|
|
11
9
|
|
|
12
10
|
import pyobo
|
|
13
11
|
from pyobo import Reference
|
|
12
|
+
from pyobo.resources.so import get_so_name
|
|
14
13
|
from pyobo.struct import Obo, Term, from_species, has_gene_product, orthologous
|
|
15
14
|
from pyobo.utils.path import ensure_df
|
|
16
15
|
|
|
@@ -21,7 +20,7 @@ __all__ = [
|
|
|
21
20
|
logger = logging.getLogger(__name__)
|
|
22
21
|
|
|
23
22
|
PREFIX = "pombase"
|
|
24
|
-
|
|
23
|
+
GENE_NAMES_URL = "https://www.pombase.org/data/names_and_identifiers/gene_IDs_names_products.tsv"
|
|
25
24
|
ORTHOLOGS_URL = "https://www.pombase.org/data/orthologs/human-orthologs.txt.gz"
|
|
26
25
|
|
|
27
26
|
|
|
@@ -70,9 +69,11 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
70
69
|
if hgnc_id is not None:
|
|
71
70
|
identifier_to_hgnc_ids[identifier].add(hgnc_id)
|
|
72
71
|
|
|
73
|
-
df = ensure_df(PREFIX, url=
|
|
72
|
+
df = ensure_df(PREFIX, url=GENE_NAMES_URL, force=force, version=version)
|
|
74
73
|
so = {
|
|
75
|
-
gtype: Reference
|
|
74
|
+
gtype: Reference(
|
|
75
|
+
prefix="SO", identifier=POMBASE_TO_SO[gtype], name=get_so_name(POMBASE_TO_SO[gtype])
|
|
76
|
+
)
|
|
76
77
|
for gtype in sorted(df[df.columns[6]].unique())
|
|
77
78
|
}
|
|
78
79
|
for _, reference in sorted(so.items()):
|
pyobo/sources/pubchem.py
CHANGED
|
@@ -1,9 +1,8 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for PubChem Compound."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
|
-
from
|
|
4
|
+
from collections.abc import Iterable, Mapping
|
|
5
|
+
from typing import Optional
|
|
7
6
|
|
|
8
7
|
import pandas as pd
|
|
9
8
|
from bioregistry.utils import removeprefix
|
pyobo/sources/reactome.py
CHANGED
|
@@ -1,11 +1,9 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for Reactome."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
4
|
from collections import defaultdict
|
|
5
|
+
from collections.abc import Iterable, Mapping
|
|
7
6
|
from functools import lru_cache
|
|
8
|
-
from typing import Iterable, Mapping, Set
|
|
9
7
|
|
|
10
8
|
import pandas as pd
|
|
11
9
|
from tqdm.auto import tqdm
|
|
@@ -72,7 +70,9 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
72
70
|
df["taxonomy_id"] = df["species"].map(get_ncbitaxon_id)
|
|
73
71
|
|
|
74
72
|
terms = {}
|
|
75
|
-
it = tqdm(
|
|
73
|
+
it = tqdm(
|
|
74
|
+
df.values, total=len(df.index), desc=f"mapping {PREFIX}", unit_scale=True, unit="pathway"
|
|
75
|
+
)
|
|
76
76
|
for reactome_id, name, species_name, taxonomy_id in it:
|
|
77
77
|
terms[reactome_id] = term = Term(
|
|
78
78
|
reference=Reference(prefix=PREFIX, identifier=reactome_id, name=name),
|
|
@@ -94,10 +94,21 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
94
94
|
terms[child_id].append_parent(terms[parent_id])
|
|
95
95
|
|
|
96
96
|
uniprot_pathway_df = ensure_participant_df(version=version, force=force)
|
|
97
|
-
for uniprot_id, reactome_id in tqdm(
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
97
|
+
for uniprot_id, reactome_id in tqdm(
|
|
98
|
+
uniprot_pathway_df.values,
|
|
99
|
+
total=len(uniprot_pathway_df),
|
|
100
|
+
unit_scale=True,
|
|
101
|
+
unit="pathway-protein",
|
|
102
|
+
):
|
|
103
|
+
if reactome_id not in terms:
|
|
104
|
+
tqdm.write(f"{reactome_id} appears in uniprot participants file but not pathways file")
|
|
105
|
+
continue
|
|
106
|
+
|
|
107
|
+
if "-" in uniprot_id:
|
|
108
|
+
reference = Reference(prefix="uniprot.isoform", identifier=uniprot_id)
|
|
109
|
+
else:
|
|
110
|
+
reference = Reference(prefix="uniprot", identifier=uniprot_id)
|
|
111
|
+
terms[reactome_id].append_relationship(has_participant, reference)
|
|
101
112
|
|
|
102
113
|
chebi_pathway_url = f"https://reactome.org/download/{version}/ChEBI2Reactome_All_Levels.txt"
|
|
103
114
|
chebi_pathway_df = ensure_df(
|
|
@@ -108,7 +119,15 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
108
119
|
version=version,
|
|
109
120
|
force=force,
|
|
110
121
|
)
|
|
111
|
-
for chebi_id, reactome_id in tqdm(
|
|
122
|
+
for chebi_id, reactome_id in tqdm(
|
|
123
|
+
chebi_pathway_df.values,
|
|
124
|
+
total=len(chebi_pathway_df),
|
|
125
|
+
unit_scale=True,
|
|
126
|
+
unit="pathway-chemical",
|
|
127
|
+
):
|
|
128
|
+
if reactome_id not in terms:
|
|
129
|
+
tqdm.write(f"{reactome_id} appears in chebi participants file but not pathways file")
|
|
130
|
+
continue
|
|
112
131
|
terms[reactome_id].append_relationship(
|
|
113
132
|
has_participant, Reference(prefix="chebi", identifier=chebi_id)
|
|
114
133
|
)
|
|
@@ -122,7 +141,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
122
141
|
|
|
123
142
|
|
|
124
143
|
@lru_cache(maxsize=1)
|
|
125
|
-
def get_protein_to_pathways() -> Mapping[str,
|
|
144
|
+
def get_protein_to_pathways() -> Mapping[str, set[str]]:
|
|
126
145
|
"""Get a mapping from proteins to the pathways they're in."""
|
|
127
146
|
protein_to_pathways = defaultdict(set)
|
|
128
147
|
x = get_id_multirelations_mapping("reactome", has_participant)
|
|
@@ -135,4 +154,4 @@ def get_protein_to_pathways() -> Mapping[str, Set[str]]:
|
|
|
135
154
|
|
|
136
155
|
|
|
137
156
|
if __name__ == "__main__":
|
|
138
|
-
|
|
157
|
+
ReactomeGetter.cli()
|
pyobo/sources/rgd.py
CHANGED
|
@@ -1,9 +1,8 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
1
|
"""Converter for RGD."""
|
|
4
2
|
|
|
5
3
|
import logging
|
|
6
|
-
from
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from typing import Optional
|
|
7
6
|
|
|
8
7
|
import pandas as pd
|
|
9
8
|
from tqdm.auto import tqdm
|
|
@@ -138,7 +137,7 @@ def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Te
|
|
|
138
137
|
continue
|
|
139
138
|
if prefix == "uniprot":
|
|
140
139
|
term.append_relationship(
|
|
141
|
-
has_gene_product, Reference
|
|
140
|
+
has_gene_product, Reference(prefix=prefix, identifier=xref_id)
|
|
142
141
|
)
|
|
143
142
|
elif prefix == "ensembl":
|
|
144
143
|
if xref_id.startswith("ENSMUSG") or xref_id.startswith("ENSRNOG"):
|