pyobo 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/.DS_Store +0 -0
- pyobo/__init__.py +95 -20
- pyobo/__main__.py +0 -0
- pyobo/api/__init__.py +81 -10
- pyobo/api/alts.py +52 -42
- pyobo/api/combine.py +39 -0
- pyobo/api/edges.py +68 -0
- pyobo/api/hierarchy.py +231 -203
- pyobo/api/metadata.py +14 -19
- pyobo/api/names.py +207 -127
- pyobo/api/properties.py +117 -113
- pyobo/api/relations.py +68 -94
- pyobo/api/species.py +24 -21
- pyobo/api/typedefs.py +11 -11
- pyobo/api/utils.py +66 -13
- pyobo/api/xrefs.py +108 -114
- pyobo/cli/__init__.py +0 -0
- pyobo/cli/cli.py +35 -50
- pyobo/cli/database.py +183 -161
- pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
- pyobo/cli/lookup.py +163 -195
- pyobo/cli/utils.py +19 -6
- pyobo/constants.py +102 -3
- pyobo/getters.py +196 -118
- pyobo/gilda_utils.py +79 -200
- pyobo/identifier_utils/__init__.py +41 -0
- pyobo/identifier_utils/api.py +296 -0
- pyobo/identifier_utils/model.py +130 -0
- pyobo/identifier_utils/preprocessing.json +812 -0
- pyobo/identifier_utils/preprocessing.py +61 -0
- pyobo/identifier_utils/relations/__init__.py +8 -0
- pyobo/identifier_utils/relations/api.py +162 -0
- pyobo/identifier_utils/relations/data.json +5824 -0
- pyobo/identifier_utils/relations/data_owl.json +57 -0
- pyobo/identifier_utils/relations/data_rdf.json +1 -0
- pyobo/identifier_utils/relations/data_rdfs.json +7 -0
- pyobo/mocks.py +9 -6
- pyobo/ner/__init__.py +9 -0
- pyobo/ner/api.py +72 -0
- pyobo/ner/normalizer.py +33 -0
- pyobo/obographs.py +43 -39
- pyobo/plugins.py +5 -4
- pyobo/py.typed +0 -0
- pyobo/reader.py +1358 -395
- pyobo/reader_utils.py +155 -0
- pyobo/resource_utils.py +42 -22
- pyobo/resources/__init__.py +0 -0
- pyobo/resources/goc.py +75 -0
- pyobo/resources/goc.tsv +188 -0
- pyobo/resources/ncbitaxon.py +4 -5
- pyobo/resources/ncbitaxon.tsv.gz +0 -0
- pyobo/resources/ro.py +3 -2
- pyobo/resources/ro.tsv +0 -0
- pyobo/resources/so.py +0 -0
- pyobo/resources/so.tsv +0 -0
- pyobo/sources/README.md +12 -8
- pyobo/sources/__init__.py +52 -29
- pyobo/sources/agrovoc.py +0 -0
- pyobo/sources/antibodyregistry.py +11 -12
- pyobo/sources/bigg/__init__.py +13 -0
- pyobo/sources/bigg/bigg_compartment.py +81 -0
- pyobo/sources/bigg/bigg_metabolite.py +229 -0
- pyobo/sources/bigg/bigg_model.py +46 -0
- pyobo/sources/bigg/bigg_reaction.py +77 -0
- pyobo/sources/biogrid.py +1 -2
- pyobo/sources/ccle.py +7 -12
- pyobo/sources/cgnc.py +0 -5
- pyobo/sources/chebi.py +1 -1
- pyobo/sources/chembl/__init__.py +9 -0
- pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
- pyobo/sources/chembl/chembl_target.py +160 -0
- pyobo/sources/civic_gene.py +55 -15
- pyobo/sources/clinicaltrials.py +160 -0
- pyobo/sources/complexportal.py +24 -24
- pyobo/sources/conso.py +14 -22
- pyobo/sources/cpt.py +0 -0
- pyobo/sources/credit.py +1 -9
- pyobo/sources/cvx.py +27 -5
- pyobo/sources/depmap.py +9 -12
- pyobo/sources/dictybase_gene.py +2 -7
- pyobo/sources/drugbank/__init__.py +9 -0
- pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
- pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
- pyobo/sources/drugcentral.py +17 -13
- pyobo/sources/expasy.py +31 -34
- pyobo/sources/famplex.py +13 -18
- pyobo/sources/flybase.py +3 -8
- pyobo/sources/gard.py +62 -0
- pyobo/sources/geonames/__init__.py +9 -0
- pyobo/sources/geonames/features.py +28 -0
- pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
- pyobo/sources/geonames/utils.py +115 -0
- pyobo/sources/gmt_utils.py +6 -7
- pyobo/sources/go.py +20 -13
- pyobo/sources/gtdb.py +154 -0
- pyobo/sources/gwascentral/__init__.py +9 -0
- pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
- pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
- pyobo/sources/hgnc/__init__.py +9 -0
- pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
- pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
- pyobo/sources/icd/__init__.py +9 -0
- pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
- pyobo/sources/icd/icd11.py +148 -0
- pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
- pyobo/sources/interpro.py +4 -9
- pyobo/sources/itis.py +0 -5
- pyobo/sources/kegg/__init__.py +0 -0
- pyobo/sources/kegg/api.py +16 -38
- pyobo/sources/kegg/genes.py +9 -20
- pyobo/sources/kegg/genome.py +1 -7
- pyobo/sources/kegg/pathway.py +9 -21
- pyobo/sources/mesh.py +58 -24
- pyobo/sources/mgi.py +3 -10
- pyobo/sources/mirbase/__init__.py +11 -0
- pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
- pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
- pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
- pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
- pyobo/sources/msigdb.py +74 -39
- pyobo/sources/ncbi/__init__.py +9 -0
- pyobo/sources/ncbi/ncbi_gc.py +162 -0
- pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
- pyobo/sources/nih_reporter.py +60 -0
- pyobo/sources/nlm/__init__.py +9 -0
- pyobo/sources/nlm/nlm_catalog.py +48 -0
- pyobo/sources/nlm/nlm_publisher.py +36 -0
- pyobo/sources/nlm/utils.py +116 -0
- pyobo/sources/npass.py +6 -8
- pyobo/sources/omim_ps.py +10 -3
- pyobo/sources/pathbank.py +4 -8
- pyobo/sources/pfam/__init__.py +9 -0
- pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
- pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
- pyobo/sources/pharmgkb/__init__.py +15 -0
- pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
- pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
- pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
- pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
- pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
- pyobo/sources/pharmgkb/utils.py +86 -0
- pyobo/sources/pid.py +1 -6
- pyobo/sources/pombase.py +6 -10
- pyobo/sources/pubchem.py +4 -9
- pyobo/sources/reactome.py +5 -11
- pyobo/sources/rgd.py +11 -16
- pyobo/sources/rhea.py +37 -36
- pyobo/sources/ror.py +69 -42
- pyobo/sources/selventa/__init__.py +0 -0
- pyobo/sources/selventa/schem.py +4 -7
- pyobo/sources/selventa/scomp.py +1 -6
- pyobo/sources/selventa/sdis.py +4 -7
- pyobo/sources/selventa/sfam.py +1 -6
- pyobo/sources/sgd.py +6 -11
- pyobo/sources/signor/__init__.py +7 -0
- pyobo/sources/signor/download.py +41 -0
- pyobo/sources/signor/signor_complexes.py +105 -0
- pyobo/sources/slm.py +12 -15
- pyobo/sources/umls/__init__.py +7 -1
- pyobo/sources/umls/__main__.py +0 -0
- pyobo/sources/umls/get_synonym_types.py +20 -4
- pyobo/sources/umls/sty.py +57 -0
- pyobo/sources/umls/synonym_types.tsv +1 -1
- pyobo/sources/umls/umls.py +18 -22
- pyobo/sources/unimod.py +46 -0
- pyobo/sources/uniprot/__init__.py +1 -1
- pyobo/sources/uniprot/uniprot.py +40 -32
- pyobo/sources/uniprot/uniprot_ptm.py +4 -34
- pyobo/sources/utils.py +3 -2
- pyobo/sources/wikipathways.py +7 -10
- pyobo/sources/zfin.py +5 -10
- pyobo/ssg/__init__.py +12 -16
- pyobo/ssg/base.html +0 -0
- pyobo/ssg/index.html +26 -13
- pyobo/ssg/term.html +12 -2
- pyobo/ssg/typedef.html +0 -0
- pyobo/struct/__init__.py +54 -8
- pyobo/struct/functional/__init__.py +1 -0
- pyobo/struct/functional/dsl.py +2572 -0
- pyobo/struct/functional/macros.py +423 -0
- pyobo/struct/functional/obo_to_functional.py +385 -0
- pyobo/struct/functional/ontology.py +270 -0
- pyobo/struct/functional/utils.py +112 -0
- pyobo/struct/reference.py +331 -136
- pyobo/struct/struct.py +1413 -643
- pyobo/struct/struct_utils.py +1078 -0
- pyobo/struct/typedef.py +162 -210
- pyobo/struct/utils.py +12 -5
- pyobo/struct/vocabulary.py +138 -0
- pyobo/utils/__init__.py +0 -0
- pyobo/utils/cache.py +13 -11
- pyobo/utils/io.py +17 -31
- pyobo/utils/iter.py +5 -5
- pyobo/utils/misc.py +41 -53
- pyobo/utils/ndex_utils.py +0 -0
- pyobo/utils/path.py +76 -70
- pyobo/version.py +3 -3
- {pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/METADATA +228 -229
- pyobo-0.12.0.dist-info/RECORD +202 -0
- pyobo-0.12.0.dist-info/WHEEL +4 -0
- {pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
- pyobo-0.12.0.dist-info/licenses/LICENSE +21 -0
- pyobo/aws.py +0 -162
- pyobo/cli/aws.py +0 -47
- pyobo/identifier_utils.py +0 -142
- pyobo/normalizer.py +0 -232
- pyobo/registries/__init__.py +0 -16
- pyobo/registries/metaregistry.json +0 -507
- pyobo/registries/metaregistry.py +0 -135
- pyobo/sources/icd11.py +0 -105
- pyobo/xrefdb/__init__.py +0 -1
- pyobo/xrefdb/canonicalizer.py +0 -214
- pyobo/xrefdb/priority.py +0 -59
- pyobo/xrefdb/sources/__init__.py +0 -60
- pyobo/xrefdb/sources/biomappings.py +0 -36
- pyobo/xrefdb/sources/cbms2019.py +0 -91
- pyobo/xrefdb/sources/chembl.py +0 -83
- pyobo/xrefdb/sources/compath.py +0 -82
- pyobo/xrefdb/sources/famplex.py +0 -64
- pyobo/xrefdb/sources/gilda.py +0 -50
- pyobo/xrefdb/sources/intact.py +0 -113
- pyobo/xrefdb/sources/ncit.py +0 -133
- pyobo/xrefdb/sources/pubchem.py +0 -27
- pyobo/xrefdb/sources/wikidata.py +0 -116
- pyobo-0.11.2.dist-info/RECORD +0 -157
- pyobo-0.11.2.dist-info/WHEEL +0 -5
- pyobo-0.11.2.dist-info/top_level.txt +0 -1
pyobo/sources/mesh.py
CHANGED
|
@@ -4,22 +4,25 @@ import datetime
|
|
|
4
4
|
import itertools as itt
|
|
5
5
|
import logging
|
|
6
6
|
import re
|
|
7
|
+
import time
|
|
7
8
|
from collections.abc import Collection, Iterable, Mapping
|
|
8
|
-
from
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
9
11
|
from xml.etree.ElementTree import Element
|
|
10
12
|
|
|
13
|
+
from lxml import etree
|
|
11
14
|
from tqdm.auto import tqdm
|
|
12
15
|
|
|
13
16
|
from pyobo.api.utils import safe_get_version
|
|
14
17
|
from pyobo.identifier_utils import standardize_ec
|
|
15
18
|
from pyobo.struct import Obo, Reference, Synonym, Term
|
|
16
19
|
from pyobo.utils.cache import cached_json, cached_mapping
|
|
17
|
-
from pyobo.utils.io import parse_xml_gz
|
|
18
20
|
from pyobo.utils.path import ensure_path, prefix_directory_join
|
|
19
21
|
|
|
20
22
|
__all__ = [
|
|
21
23
|
"MeSHGetter",
|
|
22
24
|
"get_mesh_category_curies",
|
|
25
|
+
"get_mesh_category_references",
|
|
23
26
|
]
|
|
24
27
|
|
|
25
28
|
logger = logging.getLogger(__name__)
|
|
@@ -30,12 +33,21 @@ CAS_RE = re.compile(r"^\d{1,7}\-\d{2}\-\d$")
|
|
|
30
33
|
UNII_RE = re.compile(r"[0-9A-Za-z]{10}$")
|
|
31
34
|
|
|
32
35
|
|
|
36
|
+
def _get_xml_root(path: Path) -> Element:
|
|
37
|
+
"""Parse an XML file from a path to a GZIP file."""
|
|
38
|
+
t = time.time()
|
|
39
|
+
logger.info("parsing xml from %s", path)
|
|
40
|
+
tree = etree.parse(path.as_posix()) # type:ignore
|
|
41
|
+
logger.info("parsed xml in %.2f seconds", time.time() - t)
|
|
42
|
+
return tree.getroot()
|
|
43
|
+
|
|
44
|
+
|
|
33
45
|
class MeSHGetter(Obo):
|
|
34
46
|
"""An ontology representation of the Medical Subject Headings."""
|
|
35
47
|
|
|
36
48
|
ontology = bioversions_key = PREFIX
|
|
37
49
|
|
|
38
|
-
def _get_version(self) ->
|
|
50
|
+
def _get_version(self) -> str | None:
|
|
39
51
|
return NOW_YEAR
|
|
40
52
|
|
|
41
53
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
@@ -43,11 +55,6 @@ class MeSHGetter(Obo):
|
|
|
43
55
|
return get_terms(version=self._version_or_raise, force=force)
|
|
44
56
|
|
|
45
57
|
|
|
46
|
-
def get_obo(force: bool = False) -> Obo:
|
|
47
|
-
"""Get MeSH as OBO."""
|
|
48
|
-
return MeSHGetter(force=force)
|
|
49
|
-
|
|
50
|
-
|
|
51
58
|
def get_tree_to_mesh_id(version: str) -> Mapping[str, str]:
|
|
52
59
|
"""Get a mapping from MeSH tree numbers to their MeSH identifiers."""
|
|
53
60
|
|
|
@@ -110,12 +117,12 @@ def ensure_mesh_descriptors(
|
|
|
110
117
|
"""Get the parsed MeSH dictionary, and cache it if it wasn't already."""
|
|
111
118
|
|
|
112
119
|
@cached_json(path=prefix_directory_join(PREFIX, name="desc.json", version=version), force=force)
|
|
113
|
-
def _inner():
|
|
120
|
+
def _inner() -> list[dict[str, Any]]:
|
|
114
121
|
path = ensure_path(PREFIX, url=get_descriptors_url(version), version=version)
|
|
115
|
-
root =
|
|
122
|
+
root = _get_xml_root(path)
|
|
116
123
|
return get_descriptor_records(root, id_key="DescriptorUI", name_key="DescriptorName/String")
|
|
117
124
|
|
|
118
|
-
return _inner()
|
|
125
|
+
return _inner() # type:ignore
|
|
119
126
|
|
|
120
127
|
|
|
121
128
|
def get_descriptors_url(version: str) -> str:
|
|
@@ -136,14 +143,14 @@ def ensure_mesh_supplemental_records(version: str, force: bool = False) -> list[
|
|
|
136
143
|
"""Get the parsed MeSH dictionary, and cache it if it wasn't already."""
|
|
137
144
|
|
|
138
145
|
@cached_json(path=prefix_directory_join(PREFIX, name="supp.json", version=version), force=force)
|
|
139
|
-
def _inner():
|
|
146
|
+
def _inner() -> list[dict[str, Any]]:
|
|
140
147
|
path = ensure_path(PREFIX, url=get_supplemental_url(version), version=version)
|
|
141
|
-
root =
|
|
148
|
+
root = _get_xml_root(path)
|
|
142
149
|
return get_descriptor_records(
|
|
143
150
|
root, id_key="SupplementalRecordUI", name_key="SupplementalRecordName/String"
|
|
144
151
|
)
|
|
145
152
|
|
|
146
|
-
return _inner()
|
|
153
|
+
return _inner() # type:ignore
|
|
147
154
|
|
|
148
155
|
|
|
149
156
|
def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict[str, Any]]:
|
|
@@ -169,7 +176,7 @@ def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict
|
|
|
169
176
|
parents_descriptor_uis = set()
|
|
170
177
|
for tree_number in descriptor["tree_numbers"]:
|
|
171
178
|
try:
|
|
172
|
-
parent_tn,
|
|
179
|
+
parent_tn, _self_tn = tree_number.rsplit(".", 1)
|
|
173
180
|
except ValueError:
|
|
174
181
|
logger.debug("No dot for %s", tree_number)
|
|
175
182
|
continue
|
|
@@ -185,7 +192,7 @@ def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict
|
|
|
185
192
|
return rv
|
|
186
193
|
|
|
187
194
|
|
|
188
|
-
def get_scope_note(descriptor_record) ->
|
|
195
|
+
def get_scope_note(descriptor_record) -> str | None:
|
|
189
196
|
"""Get the scope note from the preferred concept in a term's record."""
|
|
190
197
|
if isinstance(descriptor_record, dict):
|
|
191
198
|
# necessary for pre-2023 data
|
|
@@ -207,9 +214,10 @@ def get_descriptor_record(
|
|
|
207
214
|
"""Get descriptor records from the main element.
|
|
208
215
|
|
|
209
216
|
:param element: An XML element
|
|
210
|
-
:param id_key: For descriptors, set to 'DescriptorUI'. For supplement, set to
|
|
211
|
-
|
|
212
|
-
|
|
217
|
+
:param id_key: For descriptors, set to 'DescriptorUI'. For supplement, set to
|
|
218
|
+
'SupplementalRecordUI'
|
|
219
|
+
:param name_key: For descriptors, set to 'DescriptorName/String'. For supplement,
|
|
220
|
+
set to 'SupplementalRecordName/String'
|
|
213
221
|
"""
|
|
214
222
|
concepts = get_concept_records(element)
|
|
215
223
|
scope_note = get_scope_note(concepts)
|
|
@@ -248,7 +256,7 @@ def _get_xrefs(element: Element) -> list[tuple[str, str]]:
|
|
|
248
256
|
elif registry_number.startswith("txid"):
|
|
249
257
|
rv.append(("NCBITaxon", registry_number[4:]))
|
|
250
258
|
elif registry_number.startswith("EC "):
|
|
251
|
-
rv.append(("
|
|
259
|
+
rv.append(("ec", standardize_ec(registry_number[3:])))
|
|
252
260
|
elif CAS_RE.fullmatch(registry_number):
|
|
253
261
|
rv.append(("cas", registry_number))
|
|
254
262
|
elif UNII_RE.fullmatch(registry_number):
|
|
@@ -319,16 +327,40 @@ def _get_descriptor_qualifiers(descriptor: Element) -> list[Mapping[str, str]]:
|
|
|
319
327
|
|
|
320
328
|
|
|
321
329
|
def get_mesh_category_curies(
|
|
322
|
-
letter: str, *, skip:
|
|
330
|
+
letter: str, *, skip: Collection[str] | None = None, version: str | None = None
|
|
323
331
|
) -> list[str]:
|
|
324
332
|
"""Get the MeSH LUIDs for a category, by letter (e.g., "A").
|
|
325
333
|
|
|
326
334
|
:param letter: The MeSH tree, A for anatomy, C for disease, etc.
|
|
327
335
|
:param skip: An optional collection of MeSH tree codes to skip, such as "A03"
|
|
328
336
|
:param version: The MeSH version to use. Defaults to latest
|
|
337
|
+
|
|
329
338
|
:returns: A list of MeSH CURIE strings for the top level of each MeSH tree.
|
|
330
339
|
|
|
331
|
-
.. seealso::
|
|
340
|
+
.. seealso::
|
|
341
|
+
|
|
342
|
+
https://meshb.nlm.nih.gov/treeView
|
|
343
|
+
"""
|
|
344
|
+
return [
|
|
345
|
+
reference.curie
|
|
346
|
+
for reference in get_mesh_category_references(letter=letter, skip=skip, version=version)
|
|
347
|
+
]
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def get_mesh_category_references(
|
|
351
|
+
letter: str, *, skip: Collection[str] | None = None, version: str | None = None
|
|
352
|
+
) -> list[Reference]:
|
|
353
|
+
"""Get the MeSH references for a category, by letter (e.g., "A").
|
|
354
|
+
|
|
355
|
+
:param letter: The MeSH tree, A for anatomy, C for disease, etc.
|
|
356
|
+
:param skip: An optional collection of MeSH tree codes to skip, such as "A03"
|
|
357
|
+
:param version: The MeSH version to use. Defaults to latest
|
|
358
|
+
|
|
359
|
+
:returns: A list of MeSH references for the top level of each MeSH tree.
|
|
360
|
+
|
|
361
|
+
.. seealso::
|
|
362
|
+
|
|
363
|
+
https://meshb.nlm.nih.gov/treeView
|
|
332
364
|
"""
|
|
333
365
|
if version is None:
|
|
334
366
|
version = safe_get_version("mesh")
|
|
@@ -340,10 +372,12 @@ def get_mesh_category_curies(
|
|
|
340
372
|
continue
|
|
341
373
|
mesh_id = tree_to_mesh.get(key)
|
|
342
374
|
if mesh_id is None:
|
|
375
|
+
# as soon as we get to a missing ID, we don't
|
|
376
|
+
# have to go any further
|
|
343
377
|
break
|
|
344
|
-
rv.append(
|
|
378
|
+
rv.append(Reference(prefix="mesh", identifier=mesh_id))
|
|
345
379
|
return rv
|
|
346
380
|
|
|
347
381
|
|
|
348
382
|
if __name__ == "__main__":
|
|
349
|
-
|
|
383
|
+
MeSHGetter.cli()
|
pyobo/sources/mgi.py
CHANGED
|
@@ -12,7 +12,6 @@ from pyobo.struct.typedef import exact_match
|
|
|
12
12
|
from ..struct import (
|
|
13
13
|
Obo,
|
|
14
14
|
Reference,
|
|
15
|
-
Synonym,
|
|
16
15
|
Term,
|
|
17
16
|
from_species,
|
|
18
17
|
has_gene_product,
|
|
@@ -35,8 +34,7 @@ ENSEMBL_XREFS_URL = "http://www.informatics.jax.org/downloads/reports/MRK_ENSEMB
|
|
|
35
34
|
class MGIGetter(Obo):
|
|
36
35
|
"""An ontology representation of MGI's mouse gene nomenclature."""
|
|
37
36
|
|
|
38
|
-
ontology = PREFIX
|
|
39
|
-
dynamic_version = True
|
|
37
|
+
ontology = bioversions_key = PREFIX
|
|
40
38
|
typedefs = [from_species, has_gene_product, transcribes_to, exact_match]
|
|
41
39
|
|
|
42
40
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
@@ -44,11 +42,6 @@ class MGIGetter(Obo):
|
|
|
44
42
|
return get_terms(force=force)
|
|
45
43
|
|
|
46
44
|
|
|
47
|
-
def get_obo(force: bool = False) -> Obo:
|
|
48
|
-
"""Get MGI as OBO."""
|
|
49
|
-
return MGIGetter(force=force)
|
|
50
|
-
|
|
51
|
-
|
|
52
45
|
COLUMNS = ["MGI Accession ID", "Marker Symbol", "Marker Name"]
|
|
53
46
|
|
|
54
47
|
|
|
@@ -159,7 +152,7 @@ def get_terms(force: bool = False) -> Iterable[Term]:
|
|
|
159
152
|
)
|
|
160
153
|
if identifier in mgi_to_synonyms:
|
|
161
154
|
for synonym in mgi_to_synonyms[identifier]:
|
|
162
|
-
term.append_synonym(
|
|
155
|
+
term.append_synonym(synonym)
|
|
163
156
|
if identifier in mgi_to_entrez_id:
|
|
164
157
|
term.append_exact_match(
|
|
165
158
|
Reference(prefix="ncbigene", identifier=mgi_to_entrez_id[identifier])
|
|
@@ -179,4 +172,4 @@ def get_terms(force: bool = False) -> Iterable[Term]:
|
|
|
179
172
|
|
|
180
173
|
|
|
181
174
|
if __name__ == "__main__":
|
|
182
|
-
|
|
175
|
+
MGIGetter.cli()
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Resources from miRBase."""
|
|
2
|
+
|
|
3
|
+
from .mirbase import MiRBaseGetter
|
|
4
|
+
from .mirbase_family import MiRBaseFamilyGetter
|
|
5
|
+
from .mirbase_mature import MiRBaseMatureGetter
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"MiRBaseFamilyGetter",
|
|
9
|
+
"MiRBaseGetter",
|
|
10
|
+
"MiRBaseMatureGetter",
|
|
11
|
+
]
|
|
@@ -6,12 +6,13 @@ from collections.abc import Iterable, Mapping
|
|
|
6
6
|
|
|
7
7
|
from tqdm.auto import tqdm
|
|
8
8
|
|
|
9
|
-
from pyobo.sources.mirbase_constants import BASE_URL, _assert_frozen_version
|
|
10
9
|
from pyobo.struct import Obo, Reference, Synonym, Term, from_species
|
|
11
10
|
from pyobo.struct.typedef import has_mature
|
|
12
11
|
from pyobo.utils.cache import cached_mapping
|
|
13
12
|
from pyobo.utils.path import ensure_df, ensure_path, prefix_directory_join
|
|
14
13
|
|
|
14
|
+
from .mirbase_constants import BASE_URL, _assert_frozen_version
|
|
15
|
+
|
|
15
16
|
__all__ = [
|
|
16
17
|
"MiRBaseGetter",
|
|
17
18
|
]
|
|
@@ -41,11 +42,6 @@ class MiRBaseGetter(Obo):
|
|
|
41
42
|
return get_terms(version=self._version_or_raise, force=force)
|
|
42
43
|
|
|
43
44
|
|
|
44
|
-
def get_obo(force: bool = False) -> Obo:
|
|
45
|
-
"""Get miRBase as OBO."""
|
|
46
|
-
return MiRBaseGetter(force=force)
|
|
47
|
-
|
|
48
|
-
|
|
49
45
|
def get_terms(version: str, force: bool = False) -> list[Term]:
|
|
50
46
|
"""Parse miRNA data from filepath and convert it to dictionary."""
|
|
51
47
|
_assert_frozen_version(version)
|
|
@@ -54,7 +50,7 @@ def get_terms(version: str, force: bool = False) -> list[Term]:
|
|
|
54
50
|
|
|
55
51
|
file_handle = (
|
|
56
52
|
gzip.open(definitions_path, "rt")
|
|
57
|
-
if definitions_path.endswith(".gz")
|
|
53
|
+
if definitions_path.suffix.endswith(".gz")
|
|
58
54
|
else open(definitions_path)
|
|
59
55
|
)
|
|
60
56
|
with file_handle as file:
|
|
@@ -101,7 +97,7 @@ def _process_definitions_lines(
|
|
|
101
97
|
|
|
102
98
|
for group in tqdm(groups, desc=f"mapping {PREFIX}"):
|
|
103
99
|
name = group[0][5:23].strip()
|
|
104
|
-
|
|
100
|
+
_qualifier, _dtype, species_code, _length = map(
|
|
105
101
|
str.strip, group[0][23:].strip().rstrip(".").split(";")
|
|
106
102
|
)
|
|
107
103
|
identifier = group[2][3:-2].strip()
|
|
@@ -134,7 +130,7 @@ def _process_definitions_lines(
|
|
|
134
130
|
xref_prefix, xref_identifier, xref_label = map(str.strip, line.split(";"))
|
|
135
131
|
xref_prefix = xref_prefix.lower()
|
|
136
132
|
xref_prefix = xref_mapping.get(xref_prefix, xref_prefix)
|
|
137
|
-
if xref_prefix
|
|
133
|
+
if xref_prefix in {"pictar", "mir", "mirte"}:
|
|
138
134
|
continue
|
|
139
135
|
|
|
140
136
|
try:
|
|
@@ -157,7 +153,8 @@ def _process_definitions_lines(
|
|
|
157
153
|
|
|
158
154
|
species_identifier, species_name = organisms[species_code]
|
|
159
155
|
term.set_species(species_identifier, species_name)
|
|
160
|
-
|
|
156
|
+
for mature in matures:
|
|
157
|
+
term.append_relationship(has_mature, mature)
|
|
161
158
|
|
|
162
159
|
yield term
|
|
163
160
|
|
|
@@ -199,4 +196,4 @@ def get_mature_id_to_name(version: str) -> Mapping[str, str]:
|
|
|
199
196
|
|
|
200
197
|
|
|
201
198
|
if __name__ == "__main__":
|
|
202
|
-
|
|
199
|
+
MiRBaseGetter.cli()
|
|
File without changes
|
|
@@ -5,12 +5,13 @@ from collections.abc import Iterable
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from tqdm.auto import tqdm
|
|
7
7
|
|
|
8
|
-
from pyobo.
|
|
8
|
+
from pyobo.struct import Obo, Reference, Term, has_member
|
|
9
|
+
|
|
10
|
+
from .mirbase_constants import (
|
|
9
11
|
get_premature_df,
|
|
10
12
|
get_premature_family_df,
|
|
11
13
|
get_premature_to_prefamily_df,
|
|
12
14
|
)
|
|
13
|
-
from pyobo.struct import Obo, Reference, Term, has_member
|
|
14
15
|
|
|
15
16
|
__all__ = [
|
|
16
17
|
"MiRBaseFamilyGetter",
|
|
@@ -31,11 +32,6 @@ class MiRBaseFamilyGetter(Obo):
|
|
|
31
32
|
return iter_terms(version=self._version_or_raise, force=force)
|
|
32
33
|
|
|
33
34
|
|
|
34
|
-
def get_obo(force: bool = False) -> Obo:
|
|
35
|
-
"""Get miRBase family as OBO."""
|
|
36
|
-
return MiRBaseFamilyGetter(force=force)
|
|
37
|
-
|
|
38
|
-
|
|
39
35
|
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
40
36
|
"""Get miRBase family terms."""
|
|
41
37
|
df = get_df(version, force=force)
|
|
@@ -66,4 +62,4 @@ def get_df(version: str, force: bool = False) -> pd.DataFrame:
|
|
|
66
62
|
|
|
67
63
|
|
|
68
64
|
if __name__ == "__main__":
|
|
69
|
-
|
|
65
|
+
MiRBaseFamilyGetter.cli()
|
|
@@ -5,9 +5,10 @@ from collections.abc import Iterable
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from tqdm.auto import tqdm
|
|
7
7
|
|
|
8
|
-
from pyobo.sources.mirbase_constants import get_mature_df
|
|
9
8
|
from pyobo.struct import Obo, Reference, Synonym, Term
|
|
10
9
|
|
|
10
|
+
from .mirbase_constants import get_mature_df
|
|
11
|
+
|
|
11
12
|
__all__ = [
|
|
12
13
|
"MiRBaseMatureGetter",
|
|
13
14
|
]
|
|
@@ -26,11 +27,6 @@ class MiRBaseMatureGetter(Obo):
|
|
|
26
27
|
return iter_terms(version=self._version_or_raise, force=force)
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
def get_obo(force: bool = False) -> Obo:
|
|
30
|
-
"""Get miRBase mature as OBO."""
|
|
31
|
-
return MiRBaseMatureGetter(force=force)
|
|
32
|
-
|
|
33
|
-
|
|
34
30
|
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
35
31
|
"""Get miRBase mature terms."""
|
|
36
32
|
df = get_mature_df(version, force=force)
|
|
@@ -49,4 +45,4 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
49
45
|
|
|
50
46
|
|
|
51
47
|
if __name__ == "__main__":
|
|
52
|
-
|
|
48
|
+
MiRBaseMatureGetter.cli()
|
pyobo/sources/msigdb.py
CHANGED
|
@@ -1,41 +1,55 @@
|
|
|
1
1
|
"""Parsers for MSig."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
+
import zipfile
|
|
4
5
|
from collections.abc import Iterable
|
|
5
|
-
from typing import Optional
|
|
6
6
|
|
|
7
|
-
from lxml
|
|
7
|
+
from lxml import etree
|
|
8
|
+
from pydantic import ValidationError
|
|
8
9
|
from tqdm.auto import tqdm
|
|
9
10
|
|
|
10
|
-
from
|
|
11
|
-
from
|
|
12
|
-
|
|
13
|
-
logger = logging.getLogger(__name__)
|
|
11
|
+
from pyobo.struct import Obo, Reference, Term, TypeDef, has_citation, has_participant
|
|
12
|
+
from pyobo.utils.path import ensure_path
|
|
14
13
|
|
|
15
14
|
__all__ = [
|
|
16
15
|
"MSigDBGetter",
|
|
17
16
|
]
|
|
18
17
|
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
19
20
|
PREFIX = "msigdb"
|
|
20
21
|
BASE_URL = "https://data.broadinstitute.org/gsea-msigdb/msigdb/release"
|
|
21
22
|
|
|
23
|
+
CATEGORY_CODE = TypeDef.default(PREFIX, "category_code", name="category code", is_metadata_tag=True)
|
|
24
|
+
SUB_CATEGORY_CODE = TypeDef.default(
|
|
25
|
+
PREFIX, "sub_category_code", name="sub-category code", is_metadata_tag=True
|
|
26
|
+
)
|
|
27
|
+
CONTRIBUTOR = TypeDef.default(PREFIX, "contributor", name="contributor", is_metadata_tag=True)
|
|
28
|
+
EXACT_SOURCE = TypeDef.default(PREFIX, "exact_source", name="exact source", is_metadata_tag=True)
|
|
29
|
+
EXTERNAL_DETAILS_URL = TypeDef.default(
|
|
30
|
+
PREFIX, "external_details_url", name="external details URL", is_metadata_tag=True
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
PROPERTIES = [
|
|
34
|
+
("CATEGORY_CODE", CATEGORY_CODE),
|
|
35
|
+
("SUB_CATEGORY_CODE", SUB_CATEGORY_CODE),
|
|
36
|
+
("CONTRIBUTOR", CONTRIBUTOR),
|
|
37
|
+
("EXACT_SOURCE", EXACT_SOURCE),
|
|
38
|
+
("EXTERNAL_DETAILS_URL", EXTERNAL_DETAILS_URL),
|
|
39
|
+
]
|
|
40
|
+
|
|
22
41
|
|
|
23
42
|
class MSigDBGetter(Obo):
|
|
24
43
|
"""An ontology representation of MMSigDB's gene set nomenclature."""
|
|
25
44
|
|
|
26
45
|
ontology = bioversions_key = PREFIX
|
|
27
|
-
typedefs = [has_participant]
|
|
46
|
+
typedefs = [has_participant, has_citation, *(p for _, p in PROPERTIES)]
|
|
28
47
|
|
|
29
48
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
30
49
|
"""Iterate over terms in the ontology."""
|
|
31
50
|
return iter_terms(version=self._version_or_raise, force=force)
|
|
32
51
|
|
|
33
52
|
|
|
34
|
-
def get_obo(force: bool = False) -> Obo:
|
|
35
|
-
"""Get MSIG as Obo."""
|
|
36
|
-
return MSigDBGetter(force=force)
|
|
37
|
-
|
|
38
|
-
|
|
39
53
|
_SPECIES = {
|
|
40
54
|
"Homo sapiens": "9606",
|
|
41
55
|
"Mus musculus": "10090",
|
|
@@ -49,24 +63,36 @@ GO_URL_PREFIX = "http://amigo.geneontology.org/amigo/term/GO:"
|
|
|
49
63
|
KEGG_URL_PREFIX = "http://www.genome.jp/kegg/pathway/hsa/"
|
|
50
64
|
|
|
51
65
|
|
|
52
|
-
def
|
|
53
|
-
|
|
54
|
-
xml_url = f"{BASE_URL}/{version}.Hs/msigdb_v{version}.Hs.xml"
|
|
66
|
+
def _iter_entries(version: str, force: bool = False):
|
|
67
|
+
xml_url = f"{BASE_URL}/{version}.Hs/msigdb_v{version}.Hs.xml.zip"
|
|
55
68
|
path = ensure_path(prefix=PREFIX, url=xml_url, version=version, force=force)
|
|
56
|
-
|
|
69
|
+
with zipfile.ZipFile(path, "r") as zf:
|
|
70
|
+
with zf.open(f"msigdb_v{version}.Hs.xml") as file:
|
|
71
|
+
for _ in range(3):
|
|
72
|
+
next(file)
|
|
73
|
+
# from here on out, every row except the last is a GENESET
|
|
74
|
+
for i, line_bytes in enumerate(file, start=4):
|
|
75
|
+
line = line_bytes.decode("utf8").strip()
|
|
76
|
+
if not line.startswith("<GENESET"):
|
|
77
|
+
continue
|
|
78
|
+
try:
|
|
79
|
+
tree = etree.fromstring(line)
|
|
80
|
+
except etree.XMLSyntaxError as e:
|
|
81
|
+
# this is the result of faulty encoding in XML - maybe they
|
|
82
|
+
# wrote XML with their own string formatting instead of using a
|
|
83
|
+
# library.
|
|
84
|
+
logger.debug("[%s] failed on line %s: %s", PREFIX, i, e)
|
|
85
|
+
else:
|
|
86
|
+
yield tree
|
|
57
87
|
|
|
58
|
-
|
|
88
|
+
|
|
89
|
+
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
90
|
+
"""Get MSigDb terms."""
|
|
91
|
+
entries = _iter_entries(version=version, force=force)
|
|
92
|
+
for entry in tqdm(entries, desc=f"{PREFIX} v{version}", unit_scale=True):
|
|
59
93
|
attrib = dict(entry.attrib)
|
|
60
94
|
tax_id = _SPECIES[attrib["ORGANISM"]]
|
|
61
95
|
|
|
62
|
-
reference_id = attrib["PMID"].strip()
|
|
63
|
-
if not reference_id:
|
|
64
|
-
reference = None
|
|
65
|
-
elif reference_id.startswith("GSE"):
|
|
66
|
-
reference = Reference(prefix="gse", identifier=reference_id)
|
|
67
|
-
else:
|
|
68
|
-
reference = Reference(prefix="pubmed", identifier=reference_id)
|
|
69
|
-
|
|
70
96
|
# NONE have the entry "HISTORICAL_NAME"
|
|
71
97
|
# historical_name = thing.attrib['HISTORICAL_NAME']
|
|
72
98
|
|
|
@@ -77,19 +103,20 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
77
103
|
term = Term(
|
|
78
104
|
reference=Reference(prefix=PREFIX, identifier=identifier, name=name),
|
|
79
105
|
definition=_get_definition(attrib),
|
|
80
|
-
provenance=[] if reference is None else [reference],
|
|
81
106
|
is_obsolete=is_obsolete,
|
|
82
107
|
)
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
"
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
108
|
+
|
|
109
|
+
reference_id = attrib["PMID"].strip()
|
|
110
|
+
if not reference_id:
|
|
111
|
+
pass
|
|
112
|
+
elif reference_id.startswith("GSE"):
|
|
113
|
+
term.append_see_also(Reference(prefix="gse", identifier=reference_id))
|
|
114
|
+
else:
|
|
115
|
+
term.append_provenance(Reference(prefix="pubmed", identifier=reference_id))
|
|
116
|
+
|
|
117
|
+
for key, typedef in PROPERTIES:
|
|
118
|
+
if value := attrib[key].strip():
|
|
119
|
+
term.annotate_string(typedef, value)
|
|
93
120
|
|
|
94
121
|
term.set_species(tax_id)
|
|
95
122
|
|
|
@@ -123,17 +150,25 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
123
150
|
logger.warning(
|
|
124
151
|
"missing %s source: msigdb:%s (%s)", contributor, identifier, external_details
|
|
125
152
|
)
|
|
126
|
-
|
|
153
|
+
|
|
154
|
+
try:
|
|
155
|
+
kegg_reference = Reference(prefix="kegg.pathway", identifier=external_id)
|
|
156
|
+
except ValidationError:
|
|
157
|
+
# TODO handle kegg.network which starts with N, like N01146
|
|
158
|
+
if not external_id.startswith("N"):
|
|
159
|
+
tqdm.write(f"could not validate kegg.pathway:{external_id}")
|
|
160
|
+
else:
|
|
161
|
+
term.append_xref(kegg_reference)
|
|
127
162
|
|
|
128
163
|
for ncbigene_id in attrib["MEMBERS_EZID"].strip().split(","):
|
|
129
164
|
if ncbigene_id:
|
|
130
|
-
term.
|
|
165
|
+
term.annotate_object(
|
|
131
166
|
has_participant, Reference(prefix="ncbigene", identifier=ncbigene_id)
|
|
132
167
|
)
|
|
133
168
|
yield term
|
|
134
169
|
|
|
135
170
|
|
|
136
|
-
def _get_definition(attrib) ->
|
|
171
|
+
def _get_definition(attrib) -> str | None:
|
|
137
172
|
rv = attrib["DESCRIPTION_FULL"].strip() or attrib["DESCRIPTION_BRIEF"].strip() or None
|
|
138
173
|
if rv is not None:
|
|
139
174
|
return rv.replace(r"\d", "").replace(r"\s", "")
|