pyobo 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/.DS_Store +0 -0
- pyobo/__init__.py +95 -20
- pyobo/__main__.py +0 -0
- pyobo/api/__init__.py +81 -10
- pyobo/api/alts.py +52 -42
- pyobo/api/combine.py +39 -0
- pyobo/api/edges.py +68 -0
- pyobo/api/hierarchy.py +231 -203
- pyobo/api/metadata.py +14 -19
- pyobo/api/names.py +207 -127
- pyobo/api/properties.py +117 -113
- pyobo/api/relations.py +68 -94
- pyobo/api/species.py +24 -21
- pyobo/api/typedefs.py +11 -11
- pyobo/api/utils.py +66 -13
- pyobo/api/xrefs.py +108 -114
- pyobo/cli/__init__.py +0 -0
- pyobo/cli/cli.py +35 -50
- pyobo/cli/database.py +183 -161
- pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
- pyobo/cli/lookup.py +163 -195
- pyobo/cli/utils.py +19 -6
- pyobo/constants.py +102 -3
- pyobo/getters.py +196 -118
- pyobo/gilda_utils.py +79 -200
- pyobo/identifier_utils/__init__.py +41 -0
- pyobo/identifier_utils/api.py +296 -0
- pyobo/identifier_utils/model.py +130 -0
- pyobo/identifier_utils/preprocessing.json +812 -0
- pyobo/identifier_utils/preprocessing.py +61 -0
- pyobo/identifier_utils/relations/__init__.py +8 -0
- pyobo/identifier_utils/relations/api.py +162 -0
- pyobo/identifier_utils/relations/data.json +5824 -0
- pyobo/identifier_utils/relations/data_owl.json +57 -0
- pyobo/identifier_utils/relations/data_rdf.json +1 -0
- pyobo/identifier_utils/relations/data_rdfs.json +7 -0
- pyobo/mocks.py +9 -6
- pyobo/ner/__init__.py +9 -0
- pyobo/ner/api.py +72 -0
- pyobo/ner/normalizer.py +33 -0
- pyobo/obographs.py +43 -39
- pyobo/plugins.py +5 -4
- pyobo/py.typed +0 -0
- pyobo/reader.py +1358 -395
- pyobo/reader_utils.py +155 -0
- pyobo/resource_utils.py +42 -22
- pyobo/resources/__init__.py +0 -0
- pyobo/resources/goc.py +75 -0
- pyobo/resources/goc.tsv +188 -0
- pyobo/resources/ncbitaxon.py +4 -5
- pyobo/resources/ncbitaxon.tsv.gz +0 -0
- pyobo/resources/ro.py +3 -2
- pyobo/resources/ro.tsv +0 -0
- pyobo/resources/so.py +0 -0
- pyobo/resources/so.tsv +0 -0
- pyobo/sources/README.md +12 -8
- pyobo/sources/__init__.py +52 -29
- pyobo/sources/agrovoc.py +0 -0
- pyobo/sources/antibodyregistry.py +11 -12
- pyobo/sources/bigg/__init__.py +13 -0
- pyobo/sources/bigg/bigg_compartment.py +81 -0
- pyobo/sources/bigg/bigg_metabolite.py +229 -0
- pyobo/sources/bigg/bigg_model.py +46 -0
- pyobo/sources/bigg/bigg_reaction.py +77 -0
- pyobo/sources/biogrid.py +1 -2
- pyobo/sources/ccle.py +7 -12
- pyobo/sources/cgnc.py +0 -5
- pyobo/sources/chebi.py +1 -1
- pyobo/sources/chembl/__init__.py +9 -0
- pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
- pyobo/sources/chembl/chembl_target.py +160 -0
- pyobo/sources/civic_gene.py +55 -15
- pyobo/sources/clinicaltrials.py +160 -0
- pyobo/sources/complexportal.py +24 -24
- pyobo/sources/conso.py +14 -22
- pyobo/sources/cpt.py +0 -0
- pyobo/sources/credit.py +1 -9
- pyobo/sources/cvx.py +27 -5
- pyobo/sources/depmap.py +9 -12
- pyobo/sources/dictybase_gene.py +2 -7
- pyobo/sources/drugbank/__init__.py +9 -0
- pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
- pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
- pyobo/sources/drugcentral.py +17 -13
- pyobo/sources/expasy.py +31 -34
- pyobo/sources/famplex.py +13 -18
- pyobo/sources/flybase.py +3 -8
- pyobo/sources/gard.py +62 -0
- pyobo/sources/geonames/__init__.py +9 -0
- pyobo/sources/geonames/features.py +28 -0
- pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
- pyobo/sources/geonames/utils.py +115 -0
- pyobo/sources/gmt_utils.py +6 -7
- pyobo/sources/go.py +20 -13
- pyobo/sources/gtdb.py +154 -0
- pyobo/sources/gwascentral/__init__.py +9 -0
- pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
- pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
- pyobo/sources/hgnc/__init__.py +9 -0
- pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
- pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
- pyobo/sources/icd/__init__.py +9 -0
- pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
- pyobo/sources/icd/icd11.py +148 -0
- pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
- pyobo/sources/interpro.py +4 -9
- pyobo/sources/itis.py +0 -5
- pyobo/sources/kegg/__init__.py +0 -0
- pyobo/sources/kegg/api.py +16 -38
- pyobo/sources/kegg/genes.py +9 -20
- pyobo/sources/kegg/genome.py +1 -7
- pyobo/sources/kegg/pathway.py +9 -21
- pyobo/sources/mesh.py +58 -24
- pyobo/sources/mgi.py +3 -10
- pyobo/sources/mirbase/__init__.py +11 -0
- pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
- pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
- pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
- pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
- pyobo/sources/msigdb.py +74 -39
- pyobo/sources/ncbi/__init__.py +9 -0
- pyobo/sources/ncbi/ncbi_gc.py +162 -0
- pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
- pyobo/sources/nih_reporter.py +60 -0
- pyobo/sources/nlm/__init__.py +9 -0
- pyobo/sources/nlm/nlm_catalog.py +48 -0
- pyobo/sources/nlm/nlm_publisher.py +36 -0
- pyobo/sources/nlm/utils.py +116 -0
- pyobo/sources/npass.py +6 -8
- pyobo/sources/omim_ps.py +10 -3
- pyobo/sources/pathbank.py +4 -8
- pyobo/sources/pfam/__init__.py +9 -0
- pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
- pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
- pyobo/sources/pharmgkb/__init__.py +15 -0
- pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
- pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
- pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
- pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
- pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
- pyobo/sources/pharmgkb/utils.py +86 -0
- pyobo/sources/pid.py +1 -6
- pyobo/sources/pombase.py +6 -10
- pyobo/sources/pubchem.py +4 -9
- pyobo/sources/reactome.py +5 -11
- pyobo/sources/rgd.py +11 -16
- pyobo/sources/rhea.py +37 -36
- pyobo/sources/ror.py +69 -42
- pyobo/sources/selventa/__init__.py +0 -0
- pyobo/sources/selventa/schem.py +4 -7
- pyobo/sources/selventa/scomp.py +1 -6
- pyobo/sources/selventa/sdis.py +4 -7
- pyobo/sources/selventa/sfam.py +1 -6
- pyobo/sources/sgd.py +6 -11
- pyobo/sources/signor/__init__.py +7 -0
- pyobo/sources/signor/download.py +41 -0
- pyobo/sources/signor/signor_complexes.py +105 -0
- pyobo/sources/slm.py +12 -15
- pyobo/sources/umls/__init__.py +7 -1
- pyobo/sources/umls/__main__.py +0 -0
- pyobo/sources/umls/get_synonym_types.py +20 -4
- pyobo/sources/umls/sty.py +57 -0
- pyobo/sources/umls/synonym_types.tsv +1 -1
- pyobo/sources/umls/umls.py +18 -22
- pyobo/sources/unimod.py +46 -0
- pyobo/sources/uniprot/__init__.py +1 -1
- pyobo/sources/uniprot/uniprot.py +40 -32
- pyobo/sources/uniprot/uniprot_ptm.py +4 -34
- pyobo/sources/utils.py +3 -2
- pyobo/sources/wikipathways.py +7 -10
- pyobo/sources/zfin.py +5 -10
- pyobo/ssg/__init__.py +12 -16
- pyobo/ssg/base.html +0 -0
- pyobo/ssg/index.html +26 -13
- pyobo/ssg/term.html +12 -2
- pyobo/ssg/typedef.html +0 -0
- pyobo/struct/__init__.py +54 -8
- pyobo/struct/functional/__init__.py +1 -0
- pyobo/struct/functional/dsl.py +2572 -0
- pyobo/struct/functional/macros.py +423 -0
- pyobo/struct/functional/obo_to_functional.py +385 -0
- pyobo/struct/functional/ontology.py +270 -0
- pyobo/struct/functional/utils.py +112 -0
- pyobo/struct/reference.py +331 -136
- pyobo/struct/struct.py +1413 -643
- pyobo/struct/struct_utils.py +1078 -0
- pyobo/struct/typedef.py +162 -210
- pyobo/struct/utils.py +12 -5
- pyobo/struct/vocabulary.py +138 -0
- pyobo/utils/__init__.py +0 -0
- pyobo/utils/cache.py +13 -11
- pyobo/utils/io.py +17 -31
- pyobo/utils/iter.py +5 -5
- pyobo/utils/misc.py +41 -53
- pyobo/utils/ndex_utils.py +0 -0
- pyobo/utils/path.py +76 -70
- pyobo/version.py +3 -3
- {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/METADATA +224 -225
- pyobo-0.12.0.dist-info/RECORD +202 -0
- pyobo-0.12.0.dist-info/WHEEL +4 -0
- {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
- {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info/licenses}/LICENSE +0 -0
- pyobo/apps/__init__.py +0 -3
- pyobo/apps/cli.py +0 -24
- pyobo/apps/gilda/__init__.py +0 -3
- pyobo/apps/gilda/__main__.py +0 -8
- pyobo/apps/gilda/app.py +0 -48
- pyobo/apps/gilda/cli.py +0 -36
- pyobo/apps/gilda/templates/base.html +0 -33
- pyobo/apps/gilda/templates/home.html +0 -11
- pyobo/apps/gilda/templates/matches.html +0 -32
- pyobo/apps/mapper/__init__.py +0 -3
- pyobo/apps/mapper/__main__.py +0 -11
- pyobo/apps/mapper/cli.py +0 -37
- pyobo/apps/mapper/mapper.py +0 -187
- pyobo/apps/mapper/templates/base.html +0 -35
- pyobo/apps/mapper/templates/mapper_home.html +0 -64
- pyobo/aws.py +0 -162
- pyobo/cli/aws.py +0 -47
- pyobo/identifier_utils.py +0 -142
- pyobo/normalizer.py +0 -232
- pyobo/registries/__init__.py +0 -16
- pyobo/registries/metaregistry.json +0 -507
- pyobo/registries/metaregistry.py +0 -135
- pyobo/sources/icd11.py +0 -105
- pyobo/xrefdb/__init__.py +0 -1
- pyobo/xrefdb/canonicalizer.py +0 -214
- pyobo/xrefdb/priority.py +0 -59
- pyobo/xrefdb/sources/__init__.py +0 -60
- pyobo/xrefdb/sources/biomappings.py +0 -36
- pyobo/xrefdb/sources/cbms2019.py +0 -91
- pyobo/xrefdb/sources/chembl.py +0 -83
- pyobo/xrefdb/sources/compath.py +0 -82
- pyobo/xrefdb/sources/famplex.py +0 -64
- pyobo/xrefdb/sources/gilda.py +0 -50
- pyobo/xrefdb/sources/intact.py +0 -113
- pyobo/xrefdb/sources/ncit.py +0 -133
- pyobo/xrefdb/sources/pubchem.py +0 -27
- pyobo/xrefdb/sources/wikidata.py +0 -116
- pyobo-0.11.1.dist-info/RECORD +0 -173
- pyobo-0.11.1.dist-info/WHEEL +0 -5
- pyobo-0.11.1.dist-info/top_level.txt +0 -1
pyobo/getters.py
CHANGED
|
@@ -1,38 +1,48 @@
|
|
|
1
1
|
"""Utilities for OBO files."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import datetime
|
|
4
6
|
import gzip
|
|
5
7
|
import json
|
|
6
8
|
import logging
|
|
7
9
|
import pathlib
|
|
8
10
|
import subprocess
|
|
11
|
+
import time
|
|
9
12
|
import typing
|
|
10
13
|
import urllib.error
|
|
14
|
+
import zipfile
|
|
11
15
|
from collections import Counter
|
|
12
|
-
from collections.abc import Iterable, Mapping, Sequence
|
|
16
|
+
from collections.abc import Callable, Iterable, Mapping, Sequence
|
|
13
17
|
from pathlib import Path
|
|
14
|
-
from
|
|
15
|
-
|
|
16
|
-
Optional,
|
|
17
|
-
TypeVar,
|
|
18
|
-
Union,
|
|
19
|
-
)
|
|
18
|
+
from textwrap import indent
|
|
19
|
+
from typing import TypeVar
|
|
20
20
|
|
|
21
21
|
import bioregistry
|
|
22
|
+
import click
|
|
23
|
+
import pystow.utils
|
|
22
24
|
from bioontologies import robot
|
|
25
|
+
from tabulate import tabulate
|
|
23
26
|
from tqdm.auto import tqdm
|
|
27
|
+
from typing_extensions import Unpack
|
|
24
28
|
|
|
25
|
-
from .constants import
|
|
26
|
-
|
|
29
|
+
from .constants import (
|
|
30
|
+
DATABASE_DIRECTORY,
|
|
31
|
+
GetOntologyKwargs,
|
|
32
|
+
IterHelperHelperDict,
|
|
33
|
+
SlimGetOntologyKwargs,
|
|
34
|
+
)
|
|
35
|
+
from .identifier_utils import ParseError, wrap_norm_prefix
|
|
27
36
|
from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
|
|
37
|
+
from .reader import from_obo_path, from_obonet
|
|
28
38
|
from .struct import Obo
|
|
29
39
|
from .utils.io import get_writer
|
|
30
40
|
from .utils.path import ensure_path, prefix_directory_join
|
|
31
41
|
from .version import get_git_hash, get_version
|
|
32
42
|
|
|
33
43
|
__all__ = [
|
|
34
|
-
"get_ontology",
|
|
35
44
|
"NoBuildError",
|
|
45
|
+
"get_ontology",
|
|
36
46
|
]
|
|
37
47
|
|
|
38
48
|
logger = logging.getLogger(__name__)
|
|
@@ -48,7 +58,14 @@ class UnhandledFormatError(NoBuildError):
|
|
|
48
58
|
|
|
49
59
|
#: The following prefixes can not be loaded through ROBOT without
|
|
50
60
|
#: turning off integrity checks
|
|
51
|
-
REQUIRES_NO_ROBOT_CHECK = {
|
|
61
|
+
REQUIRES_NO_ROBOT_CHECK = {
|
|
62
|
+
"clo",
|
|
63
|
+
"vo",
|
|
64
|
+
"orphanet.ordo",
|
|
65
|
+
"orphanet",
|
|
66
|
+
"foodon",
|
|
67
|
+
"caloha",
|
|
68
|
+
}
|
|
52
69
|
|
|
53
70
|
|
|
54
71
|
@wrap_norm_prefix
|
|
@@ -56,58 +73,85 @@ def get_ontology(
|
|
|
56
73
|
prefix: str,
|
|
57
74
|
*,
|
|
58
75
|
force: bool = False,
|
|
59
|
-
|
|
60
|
-
strict: bool =
|
|
61
|
-
version:
|
|
76
|
+
force_process: bool = False,
|
|
77
|
+
strict: bool = False,
|
|
78
|
+
version: str | None = None,
|
|
62
79
|
robot_check: bool = True,
|
|
80
|
+
upgrade: bool = True,
|
|
81
|
+
cache: bool = True,
|
|
82
|
+
use_tqdm: bool = True,
|
|
63
83
|
) -> Obo:
|
|
64
84
|
"""Get the OBO for a given graph.
|
|
65
85
|
|
|
66
86
|
:param prefix: The prefix of the ontology to look up
|
|
67
87
|
:param version: The pre-looked-up version of the ontology
|
|
68
88
|
:param force: Download the data again
|
|
69
|
-
:param
|
|
70
|
-
|
|
71
|
-
:param
|
|
72
|
-
|
|
73
|
-
|
|
89
|
+
:param force_process: Should the OBO cache be rewritten? Automatically set to true
|
|
90
|
+
if ``force`` is true
|
|
91
|
+
:param strict: Should CURIEs be treated strictly? If true, raises exceptions on
|
|
92
|
+
invalid/malformed CURIEs
|
|
93
|
+
:param robot_check: If set to false, will send the ``--check=false`` command to
|
|
94
|
+
ROBOT to disregard malformed ontology components. Necessary to load some
|
|
95
|
+
ontologies like VO.
|
|
96
|
+
:param upgrade: If set to true, will automatically upgrade relationships, such as
|
|
97
|
+
``obo:chebi#part_of`` to ``BFO:0000051``
|
|
98
|
+
:param cache: Should cached objects be written? defaults to True
|
|
99
|
+
|
|
74
100
|
:returns: An OBO object
|
|
75
101
|
|
|
76
102
|
:raises OnlyOWLError: If the OBO foundry only has an OWL document for this resource.
|
|
77
103
|
|
|
78
|
-
Alternate usage if you have a custom url
|
|
104
|
+
Alternate usage if you have a custom url
|
|
105
|
+
|
|
106
|
+
.. code-block:: python
|
|
79
107
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
108
|
+
from pystow.utils import download
|
|
109
|
+
from pyobo import Obo, from_obo_path
|
|
110
|
+
|
|
111
|
+
url = ...
|
|
112
|
+
path = ...
|
|
113
|
+
download(url=url, path=path)
|
|
114
|
+
obo = from_obo_path(path)
|
|
86
115
|
"""
|
|
87
116
|
if force:
|
|
88
|
-
|
|
117
|
+
force_process = True
|
|
89
118
|
if prefix == "uberon":
|
|
90
119
|
logger.info("UBERON has so much garbage in it that defaulting to non-strict parsing")
|
|
91
120
|
strict = False
|
|
92
121
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
logger.debug(
|
|
101
|
-
|
|
122
|
+
if not cache:
|
|
123
|
+
logger.debug("[%s] caching was turned off, so dont look for an obonet file", prefix)
|
|
124
|
+
obonet_json_gz_path = None
|
|
125
|
+
else:
|
|
126
|
+
obonet_json_gz_path = prefix_directory_join(
|
|
127
|
+
prefix, name=f"{prefix}.obonet.json.gz", ensure_exists=False, version=version
|
|
128
|
+
)
|
|
129
|
+
logger.debug(
|
|
130
|
+
"[%s] caching is turned on, so look for an obonet file at %s",
|
|
131
|
+
prefix,
|
|
132
|
+
obonet_json_gz_path,
|
|
133
|
+
)
|
|
134
|
+
if obonet_json_gz_path.exists() and not force:
|
|
135
|
+
from .utils.cache import get_gzipped_graph
|
|
136
|
+
|
|
137
|
+
logger.debug("[%s] using obonet cache at %s", prefix, obonet_json_gz_path)
|
|
138
|
+
return from_obonet(
|
|
139
|
+
get_gzipped_graph(obonet_json_gz_path),
|
|
140
|
+
strict=strict,
|
|
141
|
+
version=version,
|
|
142
|
+
upgrade=upgrade,
|
|
143
|
+
use_tqdm=use_tqdm,
|
|
144
|
+
)
|
|
145
|
+
else:
|
|
146
|
+
logger.debug("[%s] no obonet cache found at %s", prefix, obonet_json_gz_path)
|
|
102
147
|
|
|
103
148
|
if has_nomenclature_plugin(prefix):
|
|
104
149
|
obo = run_nomenclature_plugin(prefix, version=version)
|
|
105
|
-
|
|
106
|
-
|
|
150
|
+
if cache:
|
|
151
|
+
logger.debug("[%s] caching nomenclature plugin", prefix)
|
|
152
|
+
obo.write_default(force=force_process)
|
|
107
153
|
return obo
|
|
108
154
|
|
|
109
|
-
logger.debug("[%s] no obonet cache found at %s", prefix, obonet_json_gz_path)
|
|
110
|
-
|
|
111
155
|
ontology_format, path = _ensure_ontology_path(prefix, force=force, version=version)
|
|
112
156
|
if path is None:
|
|
113
157
|
raise NoBuildError(prefix)
|
|
@@ -122,25 +166,23 @@ def get_ontology(
|
|
|
122
166
|
else:
|
|
123
167
|
raise UnhandledFormatError(f"[{prefix}] unhandled ontology file format: {path.suffix}")
|
|
124
168
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
obo.data_version = version
|
|
137
|
-
obo.write_default(force=rewrite)
|
|
169
|
+
obo = from_obo_path(
|
|
170
|
+
path,
|
|
171
|
+
prefix=prefix,
|
|
172
|
+
strict=strict,
|
|
173
|
+
version=version,
|
|
174
|
+
upgrade=upgrade,
|
|
175
|
+
use_tqdm=use_tqdm,
|
|
176
|
+
_cache_path=obonet_json_gz_path,
|
|
177
|
+
)
|
|
178
|
+
if cache:
|
|
179
|
+
obo.write_default(force=force_process)
|
|
138
180
|
return obo
|
|
139
181
|
|
|
140
182
|
|
|
141
183
|
def _ensure_ontology_path(
|
|
142
|
-
prefix: str, force, version
|
|
143
|
-
) ->
|
|
184
|
+
prefix: str, force: bool, version: str | None
|
|
185
|
+
) -> tuple[str, Path] | tuple[None, None]:
|
|
144
186
|
for ontology_format, url in [
|
|
145
187
|
("obo", bioregistry.get_obo_download(prefix)),
|
|
146
188
|
("owl", bioregistry.get_owl_download(prefix)),
|
|
@@ -148,8 +190,8 @@ def _ensure_ontology_path(
|
|
|
148
190
|
]:
|
|
149
191
|
if url is not None:
|
|
150
192
|
try:
|
|
151
|
-
path =
|
|
152
|
-
except urllib.error.HTTPError:
|
|
193
|
+
path = ensure_path(prefix, url=url, force=force, version=version)
|
|
194
|
+
except (urllib.error.HTTPError, pystow.utils.DownloadError):
|
|
153
195
|
continue
|
|
154
196
|
else:
|
|
155
197
|
return ontology_format, path
|
|
@@ -215,33 +257,42 @@ CANT_PARSE = {
|
|
|
215
257
|
"xl",
|
|
216
258
|
}
|
|
217
259
|
SKIP = {
|
|
218
|
-
"ncbigene"
|
|
219
|
-
"pubchem.compound"
|
|
220
|
-
"gaz"
|
|
221
|
-
"ma"
|
|
222
|
-
"bila"
|
|
223
|
-
#
|
|
224
|
-
"
|
|
225
|
-
"
|
|
226
|
-
"
|
|
227
|
-
"
|
|
228
|
-
|
|
229
|
-
"
|
|
230
|
-
|
|
231
|
-
"
|
|
260
|
+
"ncbigene": "too big, refs acquired from other dbs",
|
|
261
|
+
"pubchem.compound": "top big, can't deal with this now",
|
|
262
|
+
"gaz": "Gazetteer is irrelevant for biology",
|
|
263
|
+
"ma": "yanked",
|
|
264
|
+
"bila": "yanked",
|
|
265
|
+
# Can't download
|
|
266
|
+
"afpo": "unable to download",
|
|
267
|
+
"atol": "unable to download",
|
|
268
|
+
"eol": "unable to download, same source as atol",
|
|
269
|
+
"hog": "unable to download",
|
|
270
|
+
"vhog": "unable to download",
|
|
271
|
+
"gorel": "unable to download",
|
|
272
|
+
"dinto": "unable to download",
|
|
273
|
+
"gainesville.core": "unable to download",
|
|
274
|
+
"ato": "can't process",
|
|
275
|
+
"emapa": "recently changed with EMAP... not sure what the difference is anymore",
|
|
276
|
+
"kegg.genes": "needs fix", # FIXME
|
|
277
|
+
"kegg.genome": "needs fix", # FIXME
|
|
278
|
+
"kegg.pathway": "needs fix", # FIXME
|
|
279
|
+
"ensemblglossary": "uri is wrong",
|
|
280
|
+
"epio": "content from fraunhofer is unreliable",
|
|
281
|
+
"epso": "content from fraunhofer is unreliable",
|
|
282
|
+
"gwascentral.phenotype": "website is down? or API changed?", # FIXME
|
|
283
|
+
"gwascentral.study": "website is down? or API changed?", # FIXME
|
|
232
284
|
}
|
|
233
285
|
|
|
234
286
|
X = TypeVar("X")
|
|
235
287
|
|
|
236
288
|
|
|
237
289
|
def iter_helper(
|
|
238
|
-
f: Callable[[str], Mapping[str, X]],
|
|
290
|
+
f: Callable[[str, Unpack[GetOntologyKwargs]], Mapping[str, X]],
|
|
239
291
|
leave: bool = False,
|
|
240
|
-
|
|
241
|
-
**kwargs,
|
|
292
|
+
**kwargs: Unpack[IterHelperHelperDict],
|
|
242
293
|
) -> Iterable[tuple[str, str, X]]:
|
|
243
294
|
"""Yield all mappings extracted from each database given."""
|
|
244
|
-
for prefix, mapping in iter_helper_helper(f,
|
|
295
|
+
for prefix, mapping in iter_helper_helper(f, **kwargs):
|
|
245
296
|
it = tqdm(
|
|
246
297
|
mapping.items(),
|
|
247
298
|
desc=f"iterating {prefix}",
|
|
@@ -250,22 +301,24 @@ def iter_helper(
|
|
|
250
301
|
disable=None,
|
|
251
302
|
)
|
|
252
303
|
for key, value in it:
|
|
253
|
-
|
|
304
|
+
if isinstance(value, str):
|
|
305
|
+
value = value.strip('"').replace("\n", " ").replace("\t", " ").replace(" ", " ")
|
|
306
|
+
# TODO deal with when this is not a string?
|
|
254
307
|
if value:
|
|
255
308
|
yield prefix, key, value
|
|
256
309
|
|
|
257
310
|
|
|
258
311
|
def _prefixes(
|
|
259
|
-
skip_below:
|
|
312
|
+
skip_below: str | None = None,
|
|
260
313
|
skip_below_inclusive: bool = True,
|
|
261
314
|
skip_pyobo: bool = False,
|
|
262
|
-
skip_set:
|
|
315
|
+
skip_set: set[str] | None = None,
|
|
263
316
|
) -> Iterable[str]:
|
|
264
317
|
for prefix, resource in sorted(bioregistry.read_registry().items()):
|
|
265
318
|
if resource.no_own_terms:
|
|
266
319
|
continue
|
|
267
320
|
if prefix in SKIP:
|
|
268
|
-
tqdm.write(f"skipping {prefix} because
|
|
321
|
+
tqdm.write(f"skipping {prefix} because {SKIP[prefix]}")
|
|
269
322
|
continue
|
|
270
323
|
if skip_set and prefix in skip_set:
|
|
271
324
|
tqdm.write(f"skipping {prefix} because in skip set")
|
|
@@ -287,37 +340,39 @@ def _prefixes(
|
|
|
287
340
|
|
|
288
341
|
|
|
289
342
|
def iter_helper_helper(
|
|
290
|
-
f: Callable[[str], X],
|
|
343
|
+
f: Callable[[str, Unpack[GetOntologyKwargs]], X],
|
|
291
344
|
use_tqdm: bool = True,
|
|
292
|
-
skip_below:
|
|
293
|
-
skip_below_inclusive: bool = True,
|
|
345
|
+
skip_below: str | None = None,
|
|
294
346
|
skip_pyobo: bool = False,
|
|
295
|
-
skip_set:
|
|
296
|
-
|
|
297
|
-
**kwargs,
|
|
347
|
+
skip_set: set[str] | None = None,
|
|
348
|
+
**kwargs: Unpack[SlimGetOntologyKwargs],
|
|
298
349
|
) -> Iterable[tuple[str, X]]:
|
|
299
350
|
"""Yield all mappings extracted from each database given.
|
|
300
351
|
|
|
301
|
-
:param f: A function that takes a prefix and gives back something that will be used
|
|
352
|
+
:param f: A function that takes a prefix and gives back something that will be used
|
|
353
|
+
by an outer function.
|
|
302
354
|
:param use_tqdm: If true, use the tqdm progress bar
|
|
303
|
-
:param skip_below: If true, skip sources whose names are less than this (used for
|
|
355
|
+
:param skip_below: If true, skip sources whose names are less than this (used for
|
|
356
|
+
iterative curation)
|
|
304
357
|
:param skip_pyobo: If true, skip sources implemented in PyOBO
|
|
305
358
|
:param skip_set: A pre-defined blacklist to skip
|
|
306
|
-
:param strict: If true, will raise exceptions and crash the program instead of
|
|
359
|
+
:param strict: If true, will raise exceptions and crash the program instead of
|
|
360
|
+
logging them.
|
|
307
361
|
:param kwargs: Keyword arguments passed to ``f``.
|
|
308
|
-
:yields: A prefix and the result of the callable ``f``
|
|
309
362
|
|
|
310
363
|
:raises TypeError: If a type error is raised, it gets re-raised
|
|
311
364
|
:raises urllib.error.HTTPError: If the resource could not be downloaded
|
|
312
365
|
:raises urllib.error.URLError: If another problem was encountered during download
|
|
313
366
|
:raises ValueError: If the data was not in the format that was expected (e.g., OWL)
|
|
367
|
+
|
|
368
|
+
:yields: A prefix and the result of the callable ``f``
|
|
314
369
|
"""
|
|
370
|
+
strict = kwargs.get("strict", True)
|
|
315
371
|
prefixes = list(
|
|
316
372
|
_prefixes(
|
|
317
373
|
skip_set=skip_set,
|
|
318
374
|
skip_below=skip_below,
|
|
319
375
|
skip_pyobo=skip_pyobo,
|
|
320
|
-
skip_below_inclusive=skip_below_inclusive,
|
|
321
376
|
)
|
|
322
377
|
)
|
|
323
378
|
prefix_it = tqdm(
|
|
@@ -325,24 +380,35 @@ def iter_helper_helper(
|
|
|
325
380
|
)
|
|
326
381
|
for prefix in prefix_it:
|
|
327
382
|
prefix_it.set_postfix(prefix=prefix)
|
|
383
|
+
tqdm.write(
|
|
384
|
+
click.style(f"\n{prefix} - {bioregistry.get_name(prefix)}", fg="green", bold=True)
|
|
385
|
+
)
|
|
328
386
|
try:
|
|
329
387
|
yv = f(prefix, **kwargs) # type:ignore
|
|
388
|
+
except (UnhandledFormatError, NoBuildError) as e:
|
|
389
|
+
# make sure this comes before the other runtimeerror catch
|
|
390
|
+
logger.warning("[%s] %s", prefix, e)
|
|
330
391
|
except urllib.error.HTTPError as e:
|
|
331
392
|
logger.warning("[%s] HTTP %s: unable to download %s", prefix, e.getcode(), e.geturl())
|
|
332
393
|
if strict and not bioregistry.is_deprecated(prefix):
|
|
333
394
|
raise
|
|
334
|
-
except urllib.error.URLError:
|
|
335
|
-
logger.warning("[%s] unable to download", prefix)
|
|
395
|
+
except urllib.error.URLError as e:
|
|
396
|
+
logger.warning("[%s] unable to download - %s", prefix, e.reason)
|
|
336
397
|
if strict and not bioregistry.is_deprecated(prefix):
|
|
337
398
|
raise
|
|
338
|
-
except
|
|
339
|
-
|
|
399
|
+
except ParseError as e:
|
|
400
|
+
if not e.node:
|
|
401
|
+
logger.warning("[%s] %s", prefix, e)
|
|
402
|
+
else:
|
|
403
|
+
logger.warning(str(e))
|
|
340
404
|
if strict and not bioregistry.is_deprecated(prefix):
|
|
341
405
|
raise e
|
|
406
|
+
except RuntimeError as e:
|
|
407
|
+
if "DrugBank" not in str(e):
|
|
408
|
+
raise
|
|
409
|
+
logger.warning("[drugbank] invalid credentials")
|
|
342
410
|
except subprocess.CalledProcessError:
|
|
343
411
|
logger.warning("[%s] ROBOT was unable to convert OWL to OBO", prefix)
|
|
344
|
-
except UnhandledFormatError as e:
|
|
345
|
-
logger.warning("[%s] %s", prefix, e)
|
|
346
412
|
except ValueError as e:
|
|
347
413
|
if _is_xml(e):
|
|
348
414
|
# this means that it tried doing parsing on an xml page
|
|
@@ -355,6 +421,9 @@ def iter_helper_helper(
|
|
|
355
421
|
logger.exception(
|
|
356
422
|
"[%s] got exception %s while parsing", prefix, e.__class__.__name__
|
|
357
423
|
)
|
|
424
|
+
except zipfile.BadZipFile as e:
|
|
425
|
+
# This can happen if there's an error on UMLS
|
|
426
|
+
logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
|
|
358
427
|
except TypeError as e:
|
|
359
428
|
logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
|
|
360
429
|
if strict:
|
|
@@ -369,7 +438,7 @@ def _is_xml(e) -> bool:
|
|
|
369
438
|
)
|
|
370
439
|
|
|
371
440
|
|
|
372
|
-
def _prep_dir(directory:
|
|
441
|
+
def _prep_dir(directory: None | str | pathlib.Path) -> pathlib.Path:
|
|
373
442
|
if directory is None:
|
|
374
443
|
rv = DATABASE_DIRECTORY
|
|
375
444
|
elif isinstance(directory, str):
|
|
@@ -383,26 +452,28 @@ def _prep_dir(directory: Union[None, str, pathlib.Path]) -> pathlib.Path:
|
|
|
383
452
|
|
|
384
453
|
|
|
385
454
|
def db_output_helper(
|
|
386
|
-
|
|
455
|
+
it: Iterable[tuple[str, ...]],
|
|
387
456
|
db_name: str,
|
|
388
457
|
columns: Sequence[str],
|
|
389
458
|
*,
|
|
390
|
-
directory:
|
|
391
|
-
strict: bool =
|
|
459
|
+
directory: None | str | pathlib.Path = None,
|
|
460
|
+
strict: bool = False,
|
|
392
461
|
use_gzip: bool = True,
|
|
393
|
-
summary_detailed:
|
|
394
|
-
**kwargs,
|
|
462
|
+
summary_detailed: Sequence[int] | None = None,
|
|
395
463
|
) -> list[pathlib.Path]:
|
|
396
464
|
"""Help output database builds.
|
|
397
465
|
|
|
398
|
-
:param f: A function that takes a prefix and gives back something that will be used
|
|
466
|
+
:param it: An iterable of rows (tuples of strings) to write; the first element of
|
|
467
|
+
each row is counted for the summary file.
|
|
399
468
|
:param db_name: name of the output resource (e.g., "alts", "names")
|
|
400
469
|
:param columns: The names of the columns
|
|
401
|
-
:param directory: The directory to output everything, or defaults to
|
|
470
|
+
:param directory: The directory to output everything, or defaults to
|
|
471
|
+
:data:`pyobo.constants.DATABASE_DIRECTORY`.
|
|
402
472
|
:param strict: Passed to ``f`` by keyword
|
|
403
|
-
|
|
473
|
+
|
|
404
474
|
:returns: A sequence of paths that got created.
|
|
405
475
|
"""
|
|
476
|
+
start = time.time()
|
|
406
477
|
directory = _prep_dir(directory)
|
|
407
478
|
|
|
408
479
|
c: typing.Counter[str] = Counter()
|
|
@@ -415,10 +486,17 @@ def db_output_helper(
|
|
|
415
486
|
db_sample_path = directory.joinpath(f"{db_name}_sample.tsv")
|
|
416
487
|
db_summary_path = directory.joinpath(f"{db_name}_summary.tsv")
|
|
417
488
|
db_summary_detailed_path = directory.joinpath(f"{db_name}_summary_detailed.tsv")
|
|
489
|
+
db_metadata_path = directory.joinpath(f"{db_name}_metadata.json")
|
|
490
|
+
rv: list[tuple[str, pathlib.Path]] = [
|
|
491
|
+
("Metadata", db_metadata_path),
|
|
492
|
+
("Data", db_path),
|
|
493
|
+
("Sample", db_sample_path),
|
|
494
|
+
("Summary", db_summary_path),
|
|
495
|
+
]
|
|
418
496
|
|
|
419
497
|
logger.info("writing %s to %s", db_name, db_path)
|
|
420
498
|
logger.info("writing %s sample to %s", db_name, db_sample_path)
|
|
421
|
-
|
|
499
|
+
sample_rows = []
|
|
422
500
|
with gzip.open(db_path, mode="wt") if use_gzip else open(db_path, "w") as gzipped_file:
|
|
423
501
|
writer = get_writer(gzipped_file)
|
|
424
502
|
|
|
@@ -430,12 +508,13 @@ def db_output_helper(
|
|
|
430
508
|
writer.writerow(columns)
|
|
431
509
|
sample_writer.writerow(columns)
|
|
432
510
|
|
|
433
|
-
for row, _ in zip(it, range(10)):
|
|
511
|
+
for row, _ in zip(it, range(10), strict=False):
|
|
434
512
|
c[row[0]] += 1
|
|
435
513
|
if summary_detailed is not None:
|
|
436
514
|
c_detailed[tuple(row[i] for i in summary_detailed)] += 1
|
|
437
515
|
writer.writerow(row)
|
|
438
516
|
sample_writer.writerow(row)
|
|
517
|
+
sample_rows.append(row)
|
|
439
518
|
|
|
440
519
|
# continue just in the gzipped one
|
|
441
520
|
for row in it:
|
|
@@ -444,7 +523,6 @@ def db_output_helper(
|
|
|
444
523
|
c_detailed[tuple(row[i] for i in summary_detailed)] += 1
|
|
445
524
|
writer.writerow(row)
|
|
446
525
|
|
|
447
|
-
logger.info(f"writing {db_name} summary to {db_summary_path}")
|
|
448
526
|
with open(db_summary_path, "w") as file:
|
|
449
527
|
writer = get_writer(file)
|
|
450
528
|
writer.writerows(c.most_common())
|
|
@@ -454,8 +532,8 @@ def db_output_helper(
|
|
|
454
532
|
with open(db_summary_detailed_path, "w") as file:
|
|
455
533
|
writer = get_writer(file)
|
|
456
534
|
writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
|
|
535
|
+
rv.append(("Summary (Detailed)", db_summary_detailed_path))
|
|
457
536
|
|
|
458
|
-
db_metadata_path = directory.joinpath(f"{db_name}_metadata.json")
|
|
459
537
|
with open(db_metadata_path, "w") as file:
|
|
460
538
|
json.dump(
|
|
461
539
|
{
|
|
@@ -468,12 +546,12 @@ def db_output_helper(
|
|
|
468
546
|
indent=2,
|
|
469
547
|
)
|
|
470
548
|
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
return rv
|
|
549
|
+
elapsed = time.time() - start
|
|
550
|
+
click.secho(f"\nWrote the following files in {elapsed:.1f} seconds\n", fg="green")
|
|
551
|
+
click.secho(indent(tabulate(rv), " "), fg="green")
|
|
552
|
+
|
|
553
|
+
click.secho("\nSample rows:\n", fg="green")
|
|
554
|
+
click.secho(indent(tabulate(sample_rows, headers=columns), " "), fg="green")
|
|
555
|
+
click.echo()
|
|
556
|
+
|
|
557
|
+
return [path for _, path in rv]
|