pyobo 0.11.2__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/.DS_Store +0 -0
- pyobo/__init__.py +95 -20
- pyobo/__main__.py +0 -0
- pyobo/api/__init__.py +81 -10
- pyobo/api/alts.py +52 -42
- pyobo/api/combine.py +39 -0
- pyobo/api/edges.py +68 -0
- pyobo/api/hierarchy.py +231 -203
- pyobo/api/metadata.py +14 -19
- pyobo/api/names.py +207 -127
- pyobo/api/properties.py +117 -117
- pyobo/api/relations.py +68 -94
- pyobo/api/species.py +24 -21
- pyobo/api/typedefs.py +11 -11
- pyobo/api/utils.py +66 -13
- pyobo/api/xrefs.py +107 -114
- pyobo/cli/__init__.py +0 -0
- pyobo/cli/cli.py +35 -50
- pyobo/cli/database.py +210 -160
- pyobo/cli/database_utils.py +155 -0
- pyobo/cli/lookup.py +163 -195
- pyobo/cli/utils.py +19 -6
- pyobo/constants.py +102 -3
- pyobo/getters.py +209 -191
- pyobo/gilda_utils.py +52 -250
- pyobo/identifier_utils/__init__.py +33 -0
- pyobo/identifier_utils/api.py +305 -0
- pyobo/identifier_utils/preprocessing.json +873 -0
- pyobo/identifier_utils/preprocessing.py +27 -0
- pyobo/identifier_utils/relations/__init__.py +8 -0
- pyobo/identifier_utils/relations/api.py +162 -0
- pyobo/identifier_utils/relations/data.json +5824 -0
- pyobo/identifier_utils/relations/data_owl.json +57 -0
- pyobo/identifier_utils/relations/data_rdf.json +1 -0
- pyobo/identifier_utils/relations/data_rdfs.json +7 -0
- pyobo/mocks.py +9 -6
- pyobo/ner/__init__.py +9 -0
- pyobo/ner/api.py +72 -0
- pyobo/ner/normalizer.py +33 -0
- pyobo/obographs.py +48 -40
- pyobo/plugins.py +5 -4
- pyobo/py.typed +0 -0
- pyobo/reader.py +1354 -395
- pyobo/reader_utils.py +155 -0
- pyobo/resource_utils.py +42 -22
- pyobo/resources/__init__.py +0 -0
- pyobo/resources/goc.py +75 -0
- pyobo/resources/goc.tsv +188 -0
- pyobo/resources/ncbitaxon.py +4 -5
- pyobo/resources/ncbitaxon.tsv.gz +0 -0
- pyobo/resources/ro.py +3 -2
- pyobo/resources/ro.tsv +0 -0
- pyobo/resources/so.py +0 -0
- pyobo/resources/so.tsv +0 -0
- pyobo/sources/README.md +12 -8
- pyobo/sources/__init__.py +52 -29
- pyobo/sources/agrovoc.py +0 -0
- pyobo/sources/antibodyregistry.py +11 -12
- pyobo/sources/bigg/__init__.py +13 -0
- pyobo/sources/bigg/bigg_compartment.py +81 -0
- pyobo/sources/bigg/bigg_metabolite.py +229 -0
- pyobo/sources/bigg/bigg_model.py +46 -0
- pyobo/sources/bigg/bigg_reaction.py +77 -0
- pyobo/sources/biogrid.py +1 -2
- pyobo/sources/ccle.py +7 -12
- pyobo/sources/cgnc.py +9 -6
- pyobo/sources/chebi.py +1 -1
- pyobo/sources/chembl/__init__.py +9 -0
- pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
- pyobo/sources/chembl/chembl_target.py +160 -0
- pyobo/sources/civic_gene.py +55 -15
- pyobo/sources/clinicaltrials.py +160 -0
- pyobo/sources/complexportal.py +24 -24
- pyobo/sources/conso.py +14 -22
- pyobo/sources/cpt.py +0 -0
- pyobo/sources/credit.py +1 -9
- pyobo/sources/cvx.py +27 -5
- pyobo/sources/depmap.py +9 -12
- pyobo/sources/dictybase_gene.py +2 -7
- pyobo/sources/drugbank/__init__.py +9 -0
- pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
- pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
- pyobo/sources/drugcentral.py +17 -13
- pyobo/sources/expasy.py +31 -34
- pyobo/sources/famplex.py +13 -18
- pyobo/sources/flybase.py +8 -13
- pyobo/sources/gard.py +62 -0
- pyobo/sources/geonames/__init__.py +9 -0
- pyobo/sources/geonames/features.py +28 -0
- pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
- pyobo/sources/geonames/utils.py +115 -0
- pyobo/sources/gmt_utils.py +6 -7
- pyobo/sources/go.py +20 -13
- pyobo/sources/gtdb.py +154 -0
- pyobo/sources/gwascentral/__init__.py +9 -0
- pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
- pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
- pyobo/sources/hgnc/__init__.py +9 -0
- pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
- pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
- pyobo/sources/icd/__init__.py +9 -0
- pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
- pyobo/sources/icd/icd11.py +148 -0
- pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
- pyobo/sources/interpro.py +4 -9
- pyobo/sources/itis.py +0 -5
- pyobo/sources/kegg/__init__.py +0 -0
- pyobo/sources/kegg/api.py +16 -38
- pyobo/sources/kegg/genes.py +9 -20
- pyobo/sources/kegg/genome.py +1 -7
- pyobo/sources/kegg/pathway.py +9 -21
- pyobo/sources/mesh.py +58 -24
- pyobo/sources/mgi.py +3 -10
- pyobo/sources/mirbase/__init__.py +11 -0
- pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
- pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
- pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
- pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
- pyobo/sources/msigdb.py +74 -39
- pyobo/sources/ncbi/__init__.py +9 -0
- pyobo/sources/ncbi/ncbi_gc.py +162 -0
- pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
- pyobo/sources/nih_reporter.py +60 -0
- pyobo/sources/nlm/__init__.py +9 -0
- pyobo/sources/nlm/nlm_catalog.py +48 -0
- pyobo/sources/nlm/nlm_publisher.py +36 -0
- pyobo/sources/nlm/utils.py +116 -0
- pyobo/sources/npass.py +6 -8
- pyobo/sources/omim_ps.py +11 -4
- pyobo/sources/pathbank.py +4 -8
- pyobo/sources/pfam/__init__.py +9 -0
- pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
- pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
- pyobo/sources/pharmgkb/__init__.py +15 -0
- pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
- pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
- pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
- pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
- pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
- pyobo/sources/pharmgkb/utils.py +86 -0
- pyobo/sources/pid.py +1 -6
- pyobo/sources/pombase.py +6 -10
- pyobo/sources/pubchem.py +4 -9
- pyobo/sources/reactome.py +5 -11
- pyobo/sources/rgd.py +11 -16
- pyobo/sources/rhea.py +37 -36
- pyobo/sources/ror.py +69 -42
- pyobo/sources/selventa/__init__.py +0 -0
- pyobo/sources/selventa/schem.py +4 -7
- pyobo/sources/selventa/scomp.py +1 -6
- pyobo/sources/selventa/sdis.py +4 -7
- pyobo/sources/selventa/sfam.py +1 -6
- pyobo/sources/sgd.py +6 -11
- pyobo/sources/signor/__init__.py +7 -0
- pyobo/sources/signor/download.py +41 -0
- pyobo/sources/signor/signor_complexes.py +105 -0
- pyobo/sources/slm.py +12 -15
- pyobo/sources/umls/__init__.py +7 -1
- pyobo/sources/umls/__main__.py +0 -0
- pyobo/sources/umls/get_synonym_types.py +20 -4
- pyobo/sources/umls/sty.py +57 -0
- pyobo/sources/umls/synonym_types.tsv +1 -1
- pyobo/sources/umls/umls.py +18 -22
- pyobo/sources/unimod.py +46 -0
- pyobo/sources/uniprot/__init__.py +1 -1
- pyobo/sources/uniprot/uniprot.py +40 -32
- pyobo/sources/uniprot/uniprot_ptm.py +4 -34
- pyobo/sources/utils.py +3 -2
- pyobo/sources/wikipathways.py +7 -10
- pyobo/sources/zfin.py +5 -10
- pyobo/ssg/__init__.py +12 -16
- pyobo/ssg/base.html +0 -0
- pyobo/ssg/index.html +26 -13
- pyobo/ssg/term.html +12 -2
- pyobo/ssg/typedef.html +0 -0
- pyobo/struct/__init__.py +54 -8
- pyobo/struct/functional/__init__.py +1 -0
- pyobo/struct/functional/dsl.py +2572 -0
- pyobo/struct/functional/macros.py +423 -0
- pyobo/struct/functional/obo_to_functional.py +385 -0
- pyobo/struct/functional/ontology.py +272 -0
- pyobo/struct/functional/utils.py +112 -0
- pyobo/struct/reference.py +331 -136
- pyobo/struct/struct.py +1484 -657
- pyobo/struct/struct_utils.py +1078 -0
- pyobo/struct/typedef.py +162 -210
- pyobo/struct/utils.py +12 -5
- pyobo/struct/vocabulary.py +138 -0
- pyobo/utils/__init__.py +0 -0
- pyobo/utils/cache.py +16 -15
- pyobo/utils/io.py +51 -41
- pyobo/utils/iter.py +5 -5
- pyobo/utils/misc.py +41 -53
- pyobo/utils/ndex_utils.py +0 -0
- pyobo/utils/path.py +73 -70
- pyobo/version.py +3 -3
- pyobo-0.12.1.dist-info/METADATA +671 -0
- pyobo-0.12.1.dist-info/RECORD +201 -0
- pyobo-0.12.1.dist-info/WHEEL +4 -0
- {pyobo-0.11.2.dist-info → pyobo-0.12.1.dist-info}/entry_points.txt +1 -0
- pyobo-0.12.1.dist-info/licenses/LICENSE +21 -0
- pyobo/aws.py +0 -162
- pyobo/cli/aws.py +0 -47
- pyobo/identifier_utils.py +0 -142
- pyobo/normalizer.py +0 -232
- pyobo/registries/__init__.py +0 -16
- pyobo/registries/metaregistry.json +0 -507
- pyobo/registries/metaregistry.py +0 -135
- pyobo/sources/icd11.py +0 -105
- pyobo/xrefdb/__init__.py +0 -1
- pyobo/xrefdb/canonicalizer.py +0 -214
- pyobo/xrefdb/priority.py +0 -59
- pyobo/xrefdb/sources/__init__.py +0 -60
- pyobo/xrefdb/sources/biomappings.py +0 -36
- pyobo/xrefdb/sources/cbms2019.py +0 -91
- pyobo/xrefdb/sources/chembl.py +0 -83
- pyobo/xrefdb/sources/compath.py +0 -82
- pyobo/xrefdb/sources/famplex.py +0 -64
- pyobo/xrefdb/sources/gilda.py +0 -50
- pyobo/xrefdb/sources/intact.py +0 -113
- pyobo/xrefdb/sources/ncit.py +0 -133
- pyobo/xrefdb/sources/pubchem.py +0 -27
- pyobo/xrefdb/sources/wikidata.py +0 -116
- pyobo/xrefdb/xrefs_pipeline.py +0 -180
- pyobo-0.11.2.dist-info/METADATA +0 -711
- pyobo-0.11.2.dist-info/RECORD +0 -157
- pyobo-0.11.2.dist-info/WHEEL +0 -5
- pyobo-0.11.2.dist-info/top_level.txt +0 -1
pyobo/getters.py
CHANGED
|
@@ -1,38 +1,48 @@
|
|
|
1
1
|
"""Utilities for OBO files."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import datetime
|
|
4
|
-
import gzip
|
|
5
6
|
import json
|
|
6
7
|
import logging
|
|
7
8
|
import pathlib
|
|
8
9
|
import subprocess
|
|
10
|
+
import time
|
|
9
11
|
import typing
|
|
10
12
|
import urllib.error
|
|
13
|
+
import zipfile
|
|
11
14
|
from collections import Counter
|
|
12
|
-
from collections.abc import Iterable, Mapping, Sequence
|
|
15
|
+
from collections.abc import Callable, Iterable, Mapping, Sequence
|
|
13
16
|
from pathlib import Path
|
|
14
|
-
from
|
|
15
|
-
|
|
16
|
-
Optional,
|
|
17
|
-
TypeVar,
|
|
18
|
-
Union,
|
|
19
|
-
)
|
|
17
|
+
from textwrap import indent
|
|
18
|
+
from typing import Any, TypeVar
|
|
20
19
|
|
|
21
20
|
import bioregistry
|
|
21
|
+
import click
|
|
22
|
+
import pystow.utils
|
|
22
23
|
from bioontologies import robot
|
|
24
|
+
from tabulate import tabulate
|
|
23
25
|
from tqdm.auto import tqdm
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
from .
|
|
26
|
+
from typing_extensions import Unpack
|
|
27
|
+
|
|
28
|
+
from .constants import (
|
|
29
|
+
BUILD_SUBDIRECTORY_NAME,
|
|
30
|
+
DATABASE_DIRECTORY,
|
|
31
|
+
GetOntologyKwargs,
|
|
32
|
+
IterHelperHelperDict,
|
|
33
|
+
SlimGetOntologyKwargs,
|
|
34
|
+
)
|
|
35
|
+
from .identifier_utils import ParseError, wrap_norm_prefix
|
|
27
36
|
from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
|
|
37
|
+
from .reader import from_obo_path, from_obonet
|
|
28
38
|
from .struct import Obo
|
|
29
|
-
from .utils.io import
|
|
39
|
+
from .utils.io import safe_open_writer
|
|
30
40
|
from .utils.path import ensure_path, prefix_directory_join
|
|
31
41
|
from .version import get_git_hash, get_version
|
|
32
42
|
|
|
33
43
|
__all__ = [
|
|
34
|
-
"get_ontology",
|
|
35
44
|
"NoBuildError",
|
|
45
|
+
"get_ontology",
|
|
36
46
|
]
|
|
37
47
|
|
|
38
48
|
logger = logging.getLogger(__name__)
|
|
@@ -48,7 +58,14 @@ class UnhandledFormatError(NoBuildError):
|
|
|
48
58
|
|
|
49
59
|
#: The following prefixes can not be loaded through ROBOT without
|
|
50
60
|
#: turning off integrity checks
|
|
51
|
-
REQUIRES_NO_ROBOT_CHECK = {
|
|
61
|
+
REQUIRES_NO_ROBOT_CHECK = {
|
|
62
|
+
"clo",
|
|
63
|
+
"vo",
|
|
64
|
+
"orphanet.ordo",
|
|
65
|
+
"orphanet",
|
|
66
|
+
"foodon",
|
|
67
|
+
"caloha",
|
|
68
|
+
}
|
|
52
69
|
|
|
53
70
|
|
|
54
71
|
@wrap_norm_prefix
|
|
@@ -56,58 +73,87 @@ def get_ontology(
|
|
|
56
73
|
prefix: str,
|
|
57
74
|
*,
|
|
58
75
|
force: bool = False,
|
|
59
|
-
|
|
60
|
-
strict: bool =
|
|
61
|
-
version:
|
|
76
|
+
force_process: bool = False,
|
|
77
|
+
strict: bool = False,
|
|
78
|
+
version: str | None = None,
|
|
62
79
|
robot_check: bool = True,
|
|
80
|
+
upgrade: bool = True,
|
|
81
|
+
cache: bool = True,
|
|
82
|
+
use_tqdm: bool = True,
|
|
63
83
|
) -> Obo:
|
|
64
84
|
"""Get the OBO for a given graph.
|
|
65
85
|
|
|
66
86
|
:param prefix: The prefix of the ontology to look up
|
|
67
87
|
:param version: The pre-looked-up version of the ontology
|
|
68
88
|
:param force: Download the data again
|
|
69
|
-
:param
|
|
70
|
-
|
|
71
|
-
:param
|
|
72
|
-
|
|
73
|
-
|
|
89
|
+
:param force_process: Should the OBO cache be rewritten? Automatically set to true
|
|
90
|
+
if ``force`` is true
|
|
91
|
+
:param strict: Should CURIEs be treated strictly? If true, raises exceptions on
|
|
92
|
+
invalid/malformed
|
|
93
|
+
:param robot_check: If set to false, will send the ``--check=false`` command to
|
|
94
|
+
ROBOT to disregard malformed ontology components. Necessary to load some
|
|
95
|
+
ontologies like VO.
|
|
96
|
+
:param upgrade: If set to true, will automatically upgrade relationships, such as
|
|
97
|
+
``obo:chebi#part_of`` to ``BFO:0000051``
|
|
98
|
+
:param cache: Should cached objects be written? defaults to True
|
|
99
|
+
|
|
74
100
|
:returns: An OBO object
|
|
75
101
|
|
|
76
102
|
:raises OnlyOWLError: If the OBO foundry only has an OWL document for this resource.
|
|
77
103
|
|
|
78
|
-
Alternate usage if you have a custom url
|
|
104
|
+
Alternate usage if you have a custom url
|
|
105
|
+
|
|
106
|
+
.. code-block:: python
|
|
79
107
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
108
|
+
from pystow.utils import download
|
|
109
|
+
from pyobo import Obo, from_obo_path
|
|
110
|
+
|
|
111
|
+
url = ...
|
|
112
|
+
path = ...
|
|
113
|
+
download(url=url, path=path)
|
|
114
|
+
obo = from_obo_path(path)
|
|
86
115
|
"""
|
|
87
116
|
if force:
|
|
88
|
-
|
|
117
|
+
force_process = True
|
|
89
118
|
if prefix == "uberon":
|
|
90
119
|
logger.info("UBERON has so much garbage in it that defaulting to non-strict parsing")
|
|
91
120
|
strict = False
|
|
92
121
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
122
|
+
if force_process:
|
|
123
|
+
obonet_json_gz_path = None
|
|
124
|
+
elif not cache:
|
|
125
|
+
logger.debug("[%s] caching was turned off, so dont look for an obonet file", prefix)
|
|
126
|
+
obonet_json_gz_path = None
|
|
127
|
+
else:
|
|
128
|
+
obonet_json_gz_path = prefix_directory_join(
|
|
129
|
+
prefix, BUILD_SUBDIRECTORY_NAME, name=f"{prefix}.obonet.json.gz", version=version
|
|
130
|
+
)
|
|
131
|
+
logger.debug(
|
|
132
|
+
"[%s] caching is turned on, so look for an obonet file at %s",
|
|
133
|
+
prefix,
|
|
134
|
+
obonet_json_gz_path,
|
|
135
|
+
)
|
|
136
|
+
if obonet_json_gz_path.is_file() and not force:
|
|
137
|
+
from .utils.cache import get_gzipped_graph
|
|
138
|
+
|
|
139
|
+
logger.debug("[%s] using obonet cache at %s", prefix, obonet_json_gz_path)
|
|
140
|
+
return from_obonet(
|
|
141
|
+
get_gzipped_graph(obonet_json_gz_path),
|
|
142
|
+
strict=strict,
|
|
143
|
+
version=version,
|
|
144
|
+
upgrade=upgrade,
|
|
145
|
+
use_tqdm=use_tqdm,
|
|
146
|
+
)
|
|
147
|
+
else:
|
|
148
|
+
logger.debug("[%s] no obonet cache found at %s", prefix, obonet_json_gz_path)
|
|
102
149
|
|
|
103
150
|
if has_nomenclature_plugin(prefix):
|
|
104
151
|
obo = run_nomenclature_plugin(prefix, version=version)
|
|
105
|
-
|
|
106
|
-
|
|
152
|
+
if cache:
|
|
153
|
+
logger.debug("[%s] caching nomenclature plugin", prefix)
|
|
154
|
+
obo.write_default(force=force_process)
|
|
107
155
|
return obo
|
|
108
156
|
|
|
109
|
-
logger.debug("[%s] no obonet cache found at %s", prefix, obonet_json_gz_path)
|
|
110
|
-
|
|
111
157
|
ontology_format, path = _ensure_ontology_path(prefix, force=force, version=version)
|
|
112
158
|
if path is None:
|
|
113
159
|
raise NoBuildError(prefix)
|
|
@@ -122,25 +168,23 @@ def get_ontology(
|
|
|
122
168
|
else:
|
|
123
169
|
raise UnhandledFormatError(f"[{prefix}] unhandled ontology file format: {path.suffix}")
|
|
124
170
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
obo.data_version = version
|
|
137
|
-
obo.write_default(force=rewrite)
|
|
171
|
+
obo = from_obo_path(
|
|
172
|
+
path,
|
|
173
|
+
prefix=prefix,
|
|
174
|
+
strict=strict,
|
|
175
|
+
version=version,
|
|
176
|
+
upgrade=upgrade,
|
|
177
|
+
use_tqdm=use_tqdm,
|
|
178
|
+
_cache_path=obonet_json_gz_path,
|
|
179
|
+
)
|
|
180
|
+
if cache:
|
|
181
|
+
obo.write_default(force=force_process)
|
|
138
182
|
return obo
|
|
139
183
|
|
|
140
184
|
|
|
141
185
|
def _ensure_ontology_path(
|
|
142
|
-
prefix: str, force, version
|
|
143
|
-
) ->
|
|
186
|
+
prefix: str, force: bool, version: str | None
|
|
187
|
+
) -> tuple[str, Path] | tuple[None, None]:
|
|
144
188
|
for ontology_format, url in [
|
|
145
189
|
("obo", bioregistry.get_obo_download(prefix)),
|
|
146
190
|
("owl", bioregistry.get_owl_download(prefix)),
|
|
@@ -148,100 +192,52 @@ def _ensure_ontology_path(
|
|
|
148
192
|
]:
|
|
149
193
|
if url is not None:
|
|
150
194
|
try:
|
|
151
|
-
path =
|
|
152
|
-
except urllib.error.HTTPError:
|
|
195
|
+
path = ensure_path(prefix, url=url, force=force, version=version)
|
|
196
|
+
except (urllib.error.HTTPError, pystow.utils.DownloadError):
|
|
153
197
|
continue
|
|
154
198
|
else:
|
|
155
199
|
return ontology_format, path
|
|
156
200
|
return None, None
|
|
157
201
|
|
|
158
202
|
|
|
159
|
-
#: Obonet/Pronto can't parse these (consider converting to OBO with ROBOT?)
|
|
160
|
-
CANT_PARSE = {
|
|
161
|
-
"agro",
|
|
162
|
-
"aro",
|
|
163
|
-
"bco",
|
|
164
|
-
"caro",
|
|
165
|
-
"cco",
|
|
166
|
-
"chmo",
|
|
167
|
-
"cido",
|
|
168
|
-
"covoc",
|
|
169
|
-
"cto",
|
|
170
|
-
"cvdo",
|
|
171
|
-
"dicom",
|
|
172
|
-
"dinto",
|
|
173
|
-
"emap",
|
|
174
|
-
"epso",
|
|
175
|
-
"eupath",
|
|
176
|
-
"fbbi",
|
|
177
|
-
"fma",
|
|
178
|
-
"fobi",
|
|
179
|
-
"foodon",
|
|
180
|
-
"genepio",
|
|
181
|
-
"hancestro",
|
|
182
|
-
"hom",
|
|
183
|
-
"hso",
|
|
184
|
-
"htn", # Unknown string format: creation: 16MAY2017
|
|
185
|
-
"ico",
|
|
186
|
-
"idocovid19",
|
|
187
|
-
"labo",
|
|
188
|
-
"mamo",
|
|
189
|
-
"mfmo",
|
|
190
|
-
"mfo",
|
|
191
|
-
"mfomd",
|
|
192
|
-
"miapa",
|
|
193
|
-
"mo",
|
|
194
|
-
"oae",
|
|
195
|
-
"ogms", # Unknown string format: creation: 16MAY2017
|
|
196
|
-
"ohd",
|
|
197
|
-
"ons",
|
|
198
|
-
"oostt",
|
|
199
|
-
"opmi",
|
|
200
|
-
"ornaseq",
|
|
201
|
-
"orth",
|
|
202
|
-
"pdro",
|
|
203
|
-
"probonto",
|
|
204
|
-
"psdo",
|
|
205
|
-
"reo",
|
|
206
|
-
"rex",
|
|
207
|
-
"rnao",
|
|
208
|
-
"sepio",
|
|
209
|
-
"sio",
|
|
210
|
-
"spd",
|
|
211
|
-
"sweetrealm",
|
|
212
|
-
"txpo",
|
|
213
|
-
"vido",
|
|
214
|
-
"vt",
|
|
215
|
-
"xl",
|
|
216
|
-
}
|
|
217
203
|
SKIP = {
|
|
218
|
-
"ncbigene"
|
|
219
|
-
"pubchem.compound"
|
|
220
|
-
"gaz"
|
|
221
|
-
"ma"
|
|
222
|
-
"bila"
|
|
223
|
-
#
|
|
224
|
-
"
|
|
225
|
-
"
|
|
226
|
-
"
|
|
227
|
-
"
|
|
228
|
-
|
|
229
|
-
"
|
|
230
|
-
|
|
231
|
-
"
|
|
204
|
+
"ncbigene": "too big, refs acquired from other dbs",
|
|
205
|
+
"pubchem.compound": "top big, can't deal with this now",
|
|
206
|
+
"gaz": "Gazetteer is irrelevant for biology",
|
|
207
|
+
"ma": "yanked",
|
|
208
|
+
"bila": "yanked",
|
|
209
|
+
# Can't download
|
|
210
|
+
"afpo": "unable to download",
|
|
211
|
+
"atol": "unable to download",
|
|
212
|
+
"eol": "unable to download, same source as atol",
|
|
213
|
+
"hog": "unable to download",
|
|
214
|
+
"vhog": "unable to download",
|
|
215
|
+
"gorel": "unable to download",
|
|
216
|
+
"dinto": "unable to download",
|
|
217
|
+
"gainesville.core": "unable to download",
|
|
218
|
+
"ato": "can't process",
|
|
219
|
+
"emapa": "recently changed with EMAP... not sure what the difference is anymore",
|
|
220
|
+
"kegg.genes": "needs fix", # FIXME
|
|
221
|
+
"kegg.genome": "needs fix", # FIXME
|
|
222
|
+
"kegg.pathway": "needs fix", # FIXME
|
|
223
|
+
"ensemblglossary": "URI is self-referential to data in OLS, extract from there",
|
|
224
|
+
"epio": "content from fraunhofer is unreliable",
|
|
225
|
+
"epso": "content from fraunhofer is unreliable",
|
|
226
|
+
"gwascentral.phenotype": "website is down? or API changed?", # FIXME
|
|
227
|
+
"gwascentral.study": "website is down? or API changed?", # FIXME
|
|
228
|
+
"snomedct": "dead source",
|
|
232
229
|
}
|
|
233
230
|
|
|
234
231
|
X = TypeVar("X")
|
|
235
232
|
|
|
236
233
|
|
|
237
234
|
def iter_helper(
|
|
238
|
-
f: Callable[[str], Mapping[str, X]],
|
|
235
|
+
f: Callable[[str, Unpack[GetOntologyKwargs]], Mapping[str, X]],
|
|
239
236
|
leave: bool = False,
|
|
240
|
-
|
|
241
|
-
**kwargs,
|
|
237
|
+
**kwargs: Unpack[IterHelperHelperDict],
|
|
242
238
|
) -> Iterable[tuple[str, str, X]]:
|
|
243
239
|
"""Yield all mappings extracted from each database given."""
|
|
244
|
-
for prefix, mapping in iter_helper_helper(f,
|
|
240
|
+
for prefix, mapping in iter_helper_helper(f, **kwargs):
|
|
245
241
|
it = tqdm(
|
|
246
242
|
mapping.items(),
|
|
247
243
|
desc=f"iterating {prefix}",
|
|
@@ -250,22 +246,24 @@ def iter_helper(
|
|
|
250
246
|
disable=None,
|
|
251
247
|
)
|
|
252
248
|
for key, value in it:
|
|
253
|
-
|
|
249
|
+
if isinstance(value, str):
|
|
250
|
+
value = value.strip('"').replace("\n", " ").replace("\t", " ").replace(" ", " ")
|
|
251
|
+
# TODO deal with when this is not a string?
|
|
254
252
|
if value:
|
|
255
253
|
yield prefix, key, value
|
|
256
254
|
|
|
257
255
|
|
|
258
256
|
def _prefixes(
|
|
259
|
-
skip_below:
|
|
257
|
+
skip_below: str | None = None,
|
|
260
258
|
skip_below_inclusive: bool = True,
|
|
261
259
|
skip_pyobo: bool = False,
|
|
262
|
-
skip_set:
|
|
260
|
+
skip_set: set[str] | None = None,
|
|
263
261
|
) -> Iterable[str]:
|
|
264
262
|
for prefix, resource in sorted(bioregistry.read_registry().items()):
|
|
265
263
|
if resource.no_own_terms:
|
|
266
264
|
continue
|
|
267
265
|
if prefix in SKIP:
|
|
268
|
-
tqdm.write(f"skipping {prefix} because
|
|
266
|
+
tqdm.write(f"skipping {prefix} because {SKIP[prefix]}")
|
|
269
267
|
continue
|
|
270
268
|
if skip_set and prefix in skip_set:
|
|
271
269
|
tqdm.write(f"skipping {prefix} because in skip set")
|
|
@@ -287,37 +285,39 @@ def _prefixes(
|
|
|
287
285
|
|
|
288
286
|
|
|
289
287
|
def iter_helper_helper(
|
|
290
|
-
f: Callable[[str], X],
|
|
288
|
+
f: Callable[[str, Unpack[GetOntologyKwargs]], X],
|
|
291
289
|
use_tqdm: bool = True,
|
|
292
|
-
skip_below:
|
|
293
|
-
skip_below_inclusive: bool = True,
|
|
290
|
+
skip_below: str | None = None,
|
|
294
291
|
skip_pyobo: bool = False,
|
|
295
|
-
skip_set:
|
|
296
|
-
|
|
297
|
-
**kwargs,
|
|
292
|
+
skip_set: set[str] | None = None,
|
|
293
|
+
**kwargs: Unpack[SlimGetOntologyKwargs],
|
|
298
294
|
) -> Iterable[tuple[str, X]]:
|
|
299
295
|
"""Yield all mappings extracted from each database given.
|
|
300
296
|
|
|
301
|
-
:param f: A function that takes a prefix and gives back something that will be used
|
|
297
|
+
:param f: A function that takes a prefix and gives back something that will be used
|
|
298
|
+
by an outer function.
|
|
302
299
|
:param use_tqdm: If true, use the tqdm progress bar
|
|
303
|
-
:param skip_below: If true, skip sources whose names are less than this (used for
|
|
300
|
+
:param skip_below: If true, skip sources whose names are less than this (used for
|
|
301
|
+
iterative curation)
|
|
304
302
|
:param skip_pyobo: If true, skip sources implemented in PyOBO
|
|
305
303
|
:param skip_set: A pre-defined blacklist to skip
|
|
306
|
-
:param strict: If true, will raise exceptions and crash the program instead of
|
|
304
|
+
:param strict: If true, will raise exceptions and crash the program instead of
|
|
305
|
+
logging them.
|
|
307
306
|
:param kwargs: Keyword arguments passed to ``f``.
|
|
308
|
-
:yields: A prefix and the result of the callable ``f``
|
|
309
307
|
|
|
310
308
|
:raises TypeError: If a type error is raised, it gets re-raised
|
|
311
309
|
:raises urllib.error.HTTPError: If the resource could not be downloaded
|
|
312
310
|
:raises urllib.error.URLError: If another problem was encountered during download
|
|
313
311
|
:raises ValueError: If the data was not in the format that was expected (e.g., OWL)
|
|
312
|
+
|
|
313
|
+
:yields: A prefix and the result of the callable ``f``
|
|
314
314
|
"""
|
|
315
|
+
strict = kwargs.get("strict", True)
|
|
315
316
|
prefixes = list(
|
|
316
317
|
_prefixes(
|
|
317
318
|
skip_set=skip_set,
|
|
318
319
|
skip_below=skip_below,
|
|
319
320
|
skip_pyobo=skip_pyobo,
|
|
320
|
-
skip_below_inclusive=skip_below_inclusive,
|
|
321
321
|
)
|
|
322
322
|
)
|
|
323
323
|
prefix_it = tqdm(
|
|
@@ -325,28 +325,39 @@ def iter_helper_helper(
|
|
|
325
325
|
)
|
|
326
326
|
for prefix in prefix_it:
|
|
327
327
|
prefix_it.set_postfix(prefix=prefix)
|
|
328
|
+
tqdm.write(
|
|
329
|
+
click.style(f"\n{prefix} - {bioregistry.get_name(prefix)}", fg="green", bold=True)
|
|
330
|
+
)
|
|
328
331
|
try:
|
|
329
332
|
yv = f(prefix, **kwargs) # type:ignore
|
|
333
|
+
except (UnhandledFormatError, NoBuildError) as e:
|
|
334
|
+
# make sure this comes before the other runtimeerror catch
|
|
335
|
+
logger.warning("[%s] %s", prefix, e)
|
|
330
336
|
except urllib.error.HTTPError as e:
|
|
331
337
|
logger.warning("[%s] HTTP %s: unable to download %s", prefix, e.getcode(), e.geturl())
|
|
332
338
|
if strict and not bioregistry.is_deprecated(prefix):
|
|
333
339
|
raise
|
|
334
|
-
except urllib.error.URLError:
|
|
335
|
-
logger.warning("[%s] unable to download", prefix)
|
|
340
|
+
except urllib.error.URLError as e:
|
|
341
|
+
logger.warning("[%s] unable to download - %s", prefix, e.reason)
|
|
336
342
|
if strict and not bioregistry.is_deprecated(prefix):
|
|
337
343
|
raise
|
|
338
|
-
except
|
|
339
|
-
|
|
344
|
+
except ParseError as e:
|
|
345
|
+
if not e.node:
|
|
346
|
+
logger.warning("[%s] %s", prefix, e)
|
|
347
|
+
else:
|
|
348
|
+
logger.warning(str(e))
|
|
340
349
|
if strict and not bioregistry.is_deprecated(prefix):
|
|
341
350
|
raise e
|
|
351
|
+
except RuntimeError as e:
|
|
352
|
+
if "DrugBank" not in str(e):
|
|
353
|
+
raise
|
|
354
|
+
logger.warning("[drugbank] invalid credentials")
|
|
342
355
|
except subprocess.CalledProcessError:
|
|
343
356
|
logger.warning("[%s] ROBOT was unable to convert OWL to OBO", prefix)
|
|
344
|
-
except UnhandledFormatError as e:
|
|
345
|
-
logger.warning("[%s] %s", prefix, e)
|
|
346
357
|
except ValueError as e:
|
|
347
358
|
if _is_xml(e):
|
|
348
359
|
# this means that it tried doing parsing on an xml page
|
|
349
|
-
logger.
|
|
360
|
+
logger.warning(
|
|
350
361
|
"no resource available for %s. See http://www.obofoundry.org/ontology/%s",
|
|
351
362
|
prefix,
|
|
352
363
|
prefix,
|
|
@@ -355,6 +366,9 @@ def iter_helper_helper(
|
|
|
355
366
|
logger.exception(
|
|
356
367
|
"[%s] got exception %s while parsing", prefix, e.__class__.__name__
|
|
357
368
|
)
|
|
369
|
+
except zipfile.BadZipFile as e:
|
|
370
|
+
# This can happen if there's an error on UMLS
|
|
371
|
+
logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
|
|
358
372
|
except TypeError as e:
|
|
359
373
|
logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
|
|
360
374
|
if strict:
|
|
@@ -369,7 +383,7 @@ def _is_xml(e) -> bool:
|
|
|
369
383
|
)
|
|
370
384
|
|
|
371
385
|
|
|
372
|
-
def _prep_dir(directory:
|
|
386
|
+
def _prep_dir(directory: None | str | pathlib.Path) -> pathlib.Path:
|
|
373
387
|
if directory is None:
|
|
374
388
|
rv = DATABASE_DIRECTORY
|
|
375
389
|
elif isinstance(directory, str):
|
|
@@ -383,26 +397,28 @@ def _prep_dir(directory: Union[None, str, pathlib.Path]) -> pathlib.Path:
|
|
|
383
397
|
|
|
384
398
|
|
|
385
399
|
def db_output_helper(
|
|
386
|
-
|
|
400
|
+
it: Iterable[tuple[Any, ...]],
|
|
387
401
|
db_name: str,
|
|
388
402
|
columns: Sequence[str],
|
|
389
403
|
*,
|
|
390
|
-
directory:
|
|
391
|
-
strict: bool =
|
|
404
|
+
directory: None | str | pathlib.Path = None,
|
|
405
|
+
strict: bool = False,
|
|
392
406
|
use_gzip: bool = True,
|
|
393
|
-
summary_detailed:
|
|
394
|
-
**kwargs,
|
|
407
|
+
summary_detailed: Sequence[int] | None = None,
|
|
395
408
|
) -> list[pathlib.Path]:
|
|
396
409
|
"""Help output database builds.
|
|
397
410
|
|
|
398
|
-
:param f: A function that takes a prefix and gives back something that will be used
|
|
411
|
+
:param it: An iterable of row tuples to write; the first element of each row is
|
|
412
|
+
tallied in the summary file.
|
|
399
413
|
:param db_name: name of the output resource (e.g., "alts", "names")
|
|
400
414
|
:param columns: The names of the columns
|
|
401
|
-
:param directory: The directory to output everything, or defaults to
|
|
415
|
+
:param directory: The directory to output everything, or defaults to
|
|
416
|
+
:data:`pyobo.constants.DATABASE_DIRECTORY`.
|
|
402
417
|
:param strict: Passed to ``f`` by keyword
|
|
403
|
-
|
|
418
|
+
|
|
404
419
|
:returns: A sequence of paths that got created.
|
|
405
420
|
"""
|
|
421
|
+
start = time.time()
|
|
406
422
|
directory = _prep_dir(directory)
|
|
407
423
|
|
|
408
424
|
c: typing.Counter[str] = Counter()
|
|
@@ -415,27 +431,32 @@ def db_output_helper(
|
|
|
415
431
|
db_sample_path = directory.joinpath(f"{db_name}_sample.tsv")
|
|
416
432
|
db_summary_path = directory.joinpath(f"{db_name}_summary.tsv")
|
|
417
433
|
db_summary_detailed_path = directory.joinpath(f"{db_name}_summary_detailed.tsv")
|
|
434
|
+
db_metadata_path = directory.joinpath(f"{db_name}_metadata.json")
|
|
435
|
+
rv: list[tuple[str, pathlib.Path]] = [
|
|
436
|
+
("Metadata", db_metadata_path),
|
|
437
|
+
("Data", db_path),
|
|
438
|
+
("Sample", db_sample_path),
|
|
439
|
+
("Summary", db_summary_path),
|
|
440
|
+
]
|
|
418
441
|
|
|
419
442
|
logger.info("writing %s to %s", db_name, db_path)
|
|
420
443
|
logger.info("writing %s sample to %s", db_name, db_sample_path)
|
|
421
|
-
|
|
422
|
-
with gzip.open(db_path, mode="wt") if use_gzip else open(db_path, "w") as gzipped_file:
|
|
423
|
-
writer = get_writer(gzipped_file)
|
|
444
|
+
sample_rows = []
|
|
424
445
|
|
|
446
|
+
with safe_open_writer(db_path) as writer:
|
|
425
447
|
# for the first 10 rows, put it in a sample file too
|
|
426
|
-
with
|
|
427
|
-
sample_writer = get_writer(sample_file)
|
|
428
|
-
|
|
448
|
+
with safe_open_writer(db_sample_path) as sample_writer:
|
|
429
449
|
# write header
|
|
430
450
|
writer.writerow(columns)
|
|
431
451
|
sample_writer.writerow(columns)
|
|
432
452
|
|
|
433
|
-
for row, _ in zip(it, range(10)):
|
|
453
|
+
for row, _ in zip(it, range(10), strict=False):
|
|
434
454
|
c[row[0]] += 1
|
|
435
455
|
if summary_detailed is not None:
|
|
436
456
|
c_detailed[tuple(row[i] for i in summary_detailed)] += 1
|
|
437
457
|
writer.writerow(row)
|
|
438
458
|
sample_writer.writerow(row)
|
|
459
|
+
sample_rows.append(row)
|
|
439
460
|
|
|
440
461
|
# continue just in the gzipped one
|
|
441
462
|
for row in it:
|
|
@@ -444,18 +465,15 @@ def db_output_helper(
|
|
|
444
465
|
c_detailed[tuple(row[i] for i in summary_detailed)] += 1
|
|
445
466
|
writer.writerow(row)
|
|
446
467
|
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
writer = get_writer(file)
|
|
450
|
-
writer.writerows(c.most_common())
|
|
468
|
+
with safe_open_writer(db_summary_path) as summary_writer:
|
|
469
|
+
summary_writer.writerows(c.most_common())
|
|
451
470
|
|
|
452
471
|
if summary_detailed is not None:
|
|
453
472
|
logger.info(f"writing {db_name} detailed summary to {db_summary_detailed_path}")
|
|
454
|
-
with
|
|
455
|
-
|
|
456
|
-
|
|
473
|
+
with safe_open_writer(db_summary_detailed_path) as detailed_summary_writer:
|
|
474
|
+
detailed_summary_writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
|
|
475
|
+
rv.append(("Summary (Detailed)", db_summary_detailed_path))
|
|
457
476
|
|
|
458
|
-
db_metadata_path = directory.joinpath(f"{db_name}_metadata.json")
|
|
459
477
|
with open(db_metadata_path, "w") as file:
|
|
460
478
|
json.dump(
|
|
461
479
|
{
|
|
@@ -468,12 +486,12 @@ def db_output_helper(
|
|
|
468
486
|
indent=2,
|
|
469
487
|
)
|
|
470
488
|
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
return rv
|
|
489
|
+
elapsed = time.time() - start
|
|
490
|
+
click.secho(f"\nWrote the following files in {elapsed:.1f} seconds\n", fg="green")
|
|
491
|
+
click.secho(indent(tabulate(rv), " "), fg="green")
|
|
492
|
+
|
|
493
|
+
click.secho("\nSample rows:\n", fg="green")
|
|
494
|
+
click.secho(indent(tabulate(sample_rows, headers=columns), " "), fg="green")
|
|
495
|
+
click.echo()
|
|
496
|
+
|
|
497
|
+
return [path for _, path in rv]
|