pyobo 0.11.2__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. pyobo/.DS_Store +0 -0
  2. pyobo/__init__.py +95 -20
  3. pyobo/__main__.py +0 -0
  4. pyobo/api/__init__.py +81 -10
  5. pyobo/api/alts.py +52 -42
  6. pyobo/api/combine.py +39 -0
  7. pyobo/api/edges.py +68 -0
  8. pyobo/api/hierarchy.py +231 -203
  9. pyobo/api/metadata.py +14 -19
  10. pyobo/api/names.py +207 -127
  11. pyobo/api/properties.py +117 -117
  12. pyobo/api/relations.py +68 -94
  13. pyobo/api/species.py +24 -21
  14. pyobo/api/typedefs.py +11 -11
  15. pyobo/api/utils.py +66 -13
  16. pyobo/api/xrefs.py +107 -114
  17. pyobo/cli/__init__.py +0 -0
  18. pyobo/cli/cli.py +35 -50
  19. pyobo/cli/database.py +210 -160
  20. pyobo/cli/database_utils.py +155 -0
  21. pyobo/cli/lookup.py +163 -195
  22. pyobo/cli/utils.py +19 -6
  23. pyobo/constants.py +102 -3
  24. pyobo/getters.py +209 -191
  25. pyobo/gilda_utils.py +52 -250
  26. pyobo/identifier_utils/__init__.py +33 -0
  27. pyobo/identifier_utils/api.py +305 -0
  28. pyobo/identifier_utils/preprocessing.json +873 -0
  29. pyobo/identifier_utils/preprocessing.py +27 -0
  30. pyobo/identifier_utils/relations/__init__.py +8 -0
  31. pyobo/identifier_utils/relations/api.py +162 -0
  32. pyobo/identifier_utils/relations/data.json +5824 -0
  33. pyobo/identifier_utils/relations/data_owl.json +57 -0
  34. pyobo/identifier_utils/relations/data_rdf.json +1 -0
  35. pyobo/identifier_utils/relations/data_rdfs.json +7 -0
  36. pyobo/mocks.py +9 -6
  37. pyobo/ner/__init__.py +9 -0
  38. pyobo/ner/api.py +72 -0
  39. pyobo/ner/normalizer.py +33 -0
  40. pyobo/obographs.py +48 -40
  41. pyobo/plugins.py +5 -4
  42. pyobo/py.typed +0 -0
  43. pyobo/reader.py +1354 -395
  44. pyobo/reader_utils.py +155 -0
  45. pyobo/resource_utils.py +42 -22
  46. pyobo/resources/__init__.py +0 -0
  47. pyobo/resources/goc.py +75 -0
  48. pyobo/resources/goc.tsv +188 -0
  49. pyobo/resources/ncbitaxon.py +4 -5
  50. pyobo/resources/ncbitaxon.tsv.gz +0 -0
  51. pyobo/resources/ro.py +3 -2
  52. pyobo/resources/ro.tsv +0 -0
  53. pyobo/resources/so.py +0 -0
  54. pyobo/resources/so.tsv +0 -0
  55. pyobo/sources/README.md +12 -8
  56. pyobo/sources/__init__.py +52 -29
  57. pyobo/sources/agrovoc.py +0 -0
  58. pyobo/sources/antibodyregistry.py +11 -12
  59. pyobo/sources/bigg/__init__.py +13 -0
  60. pyobo/sources/bigg/bigg_compartment.py +81 -0
  61. pyobo/sources/bigg/bigg_metabolite.py +229 -0
  62. pyobo/sources/bigg/bigg_model.py +46 -0
  63. pyobo/sources/bigg/bigg_reaction.py +77 -0
  64. pyobo/sources/biogrid.py +1 -2
  65. pyobo/sources/ccle.py +7 -12
  66. pyobo/sources/cgnc.py +9 -6
  67. pyobo/sources/chebi.py +1 -1
  68. pyobo/sources/chembl/__init__.py +9 -0
  69. pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
  70. pyobo/sources/chembl/chembl_target.py +160 -0
  71. pyobo/sources/civic_gene.py +55 -15
  72. pyobo/sources/clinicaltrials.py +160 -0
  73. pyobo/sources/complexportal.py +24 -24
  74. pyobo/sources/conso.py +14 -22
  75. pyobo/sources/cpt.py +0 -0
  76. pyobo/sources/credit.py +1 -9
  77. pyobo/sources/cvx.py +27 -5
  78. pyobo/sources/depmap.py +9 -12
  79. pyobo/sources/dictybase_gene.py +2 -7
  80. pyobo/sources/drugbank/__init__.py +9 -0
  81. pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
  82. pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
  83. pyobo/sources/drugcentral.py +17 -13
  84. pyobo/sources/expasy.py +31 -34
  85. pyobo/sources/famplex.py +13 -18
  86. pyobo/sources/flybase.py +8 -13
  87. pyobo/sources/gard.py +62 -0
  88. pyobo/sources/geonames/__init__.py +9 -0
  89. pyobo/sources/geonames/features.py +28 -0
  90. pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
  91. pyobo/sources/geonames/utils.py +115 -0
  92. pyobo/sources/gmt_utils.py +6 -7
  93. pyobo/sources/go.py +20 -13
  94. pyobo/sources/gtdb.py +154 -0
  95. pyobo/sources/gwascentral/__init__.py +9 -0
  96. pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
  97. pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
  98. pyobo/sources/hgnc/__init__.py +9 -0
  99. pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
  100. pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
  101. pyobo/sources/icd/__init__.py +9 -0
  102. pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
  103. pyobo/sources/icd/icd11.py +148 -0
  104. pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
  105. pyobo/sources/interpro.py +4 -9
  106. pyobo/sources/itis.py +0 -5
  107. pyobo/sources/kegg/__init__.py +0 -0
  108. pyobo/sources/kegg/api.py +16 -38
  109. pyobo/sources/kegg/genes.py +9 -20
  110. pyobo/sources/kegg/genome.py +1 -7
  111. pyobo/sources/kegg/pathway.py +9 -21
  112. pyobo/sources/mesh.py +58 -24
  113. pyobo/sources/mgi.py +3 -10
  114. pyobo/sources/mirbase/__init__.py +11 -0
  115. pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
  116. pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
  117. pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
  118. pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
  119. pyobo/sources/msigdb.py +74 -39
  120. pyobo/sources/ncbi/__init__.py +9 -0
  121. pyobo/sources/ncbi/ncbi_gc.py +162 -0
  122. pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
  123. pyobo/sources/nih_reporter.py +60 -0
  124. pyobo/sources/nlm/__init__.py +9 -0
  125. pyobo/sources/nlm/nlm_catalog.py +48 -0
  126. pyobo/sources/nlm/nlm_publisher.py +36 -0
  127. pyobo/sources/nlm/utils.py +116 -0
  128. pyobo/sources/npass.py +6 -8
  129. pyobo/sources/omim_ps.py +11 -4
  130. pyobo/sources/pathbank.py +4 -8
  131. pyobo/sources/pfam/__init__.py +9 -0
  132. pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
  133. pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
  134. pyobo/sources/pharmgkb/__init__.py +15 -0
  135. pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
  136. pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
  137. pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
  138. pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
  139. pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
  140. pyobo/sources/pharmgkb/utils.py +86 -0
  141. pyobo/sources/pid.py +1 -6
  142. pyobo/sources/pombase.py +6 -10
  143. pyobo/sources/pubchem.py +4 -9
  144. pyobo/sources/reactome.py +5 -11
  145. pyobo/sources/rgd.py +11 -16
  146. pyobo/sources/rhea.py +37 -36
  147. pyobo/sources/ror.py +69 -42
  148. pyobo/sources/selventa/__init__.py +0 -0
  149. pyobo/sources/selventa/schem.py +4 -7
  150. pyobo/sources/selventa/scomp.py +1 -6
  151. pyobo/sources/selventa/sdis.py +4 -7
  152. pyobo/sources/selventa/sfam.py +1 -6
  153. pyobo/sources/sgd.py +6 -11
  154. pyobo/sources/signor/__init__.py +7 -0
  155. pyobo/sources/signor/download.py +41 -0
  156. pyobo/sources/signor/signor_complexes.py +105 -0
  157. pyobo/sources/slm.py +12 -15
  158. pyobo/sources/umls/__init__.py +7 -1
  159. pyobo/sources/umls/__main__.py +0 -0
  160. pyobo/sources/umls/get_synonym_types.py +20 -4
  161. pyobo/sources/umls/sty.py +57 -0
  162. pyobo/sources/umls/synonym_types.tsv +1 -1
  163. pyobo/sources/umls/umls.py +18 -22
  164. pyobo/sources/unimod.py +46 -0
  165. pyobo/sources/uniprot/__init__.py +1 -1
  166. pyobo/sources/uniprot/uniprot.py +40 -32
  167. pyobo/sources/uniprot/uniprot_ptm.py +4 -34
  168. pyobo/sources/utils.py +3 -2
  169. pyobo/sources/wikipathways.py +7 -10
  170. pyobo/sources/zfin.py +5 -10
  171. pyobo/ssg/__init__.py +12 -16
  172. pyobo/ssg/base.html +0 -0
  173. pyobo/ssg/index.html +26 -13
  174. pyobo/ssg/term.html +12 -2
  175. pyobo/ssg/typedef.html +0 -0
  176. pyobo/struct/__init__.py +54 -8
  177. pyobo/struct/functional/__init__.py +1 -0
  178. pyobo/struct/functional/dsl.py +2572 -0
  179. pyobo/struct/functional/macros.py +423 -0
  180. pyobo/struct/functional/obo_to_functional.py +385 -0
  181. pyobo/struct/functional/ontology.py +272 -0
  182. pyobo/struct/functional/utils.py +112 -0
  183. pyobo/struct/reference.py +331 -136
  184. pyobo/struct/struct.py +1484 -657
  185. pyobo/struct/struct_utils.py +1078 -0
  186. pyobo/struct/typedef.py +162 -210
  187. pyobo/struct/utils.py +12 -5
  188. pyobo/struct/vocabulary.py +138 -0
  189. pyobo/utils/__init__.py +0 -0
  190. pyobo/utils/cache.py +16 -15
  191. pyobo/utils/io.py +51 -41
  192. pyobo/utils/iter.py +5 -5
  193. pyobo/utils/misc.py +41 -53
  194. pyobo/utils/ndex_utils.py +0 -0
  195. pyobo/utils/path.py +73 -70
  196. pyobo/version.py +3 -3
  197. pyobo-0.12.1.dist-info/METADATA +671 -0
  198. pyobo-0.12.1.dist-info/RECORD +201 -0
  199. pyobo-0.12.1.dist-info/WHEEL +4 -0
  200. {pyobo-0.11.2.dist-info → pyobo-0.12.1.dist-info}/entry_points.txt +1 -0
  201. pyobo-0.12.1.dist-info/licenses/LICENSE +21 -0
  202. pyobo/aws.py +0 -162
  203. pyobo/cli/aws.py +0 -47
  204. pyobo/identifier_utils.py +0 -142
  205. pyobo/normalizer.py +0 -232
  206. pyobo/registries/__init__.py +0 -16
  207. pyobo/registries/metaregistry.json +0 -507
  208. pyobo/registries/metaregistry.py +0 -135
  209. pyobo/sources/icd11.py +0 -105
  210. pyobo/xrefdb/__init__.py +0 -1
  211. pyobo/xrefdb/canonicalizer.py +0 -214
  212. pyobo/xrefdb/priority.py +0 -59
  213. pyobo/xrefdb/sources/__init__.py +0 -60
  214. pyobo/xrefdb/sources/biomappings.py +0 -36
  215. pyobo/xrefdb/sources/cbms2019.py +0 -91
  216. pyobo/xrefdb/sources/chembl.py +0 -83
  217. pyobo/xrefdb/sources/compath.py +0 -82
  218. pyobo/xrefdb/sources/famplex.py +0 -64
  219. pyobo/xrefdb/sources/gilda.py +0 -50
  220. pyobo/xrefdb/sources/intact.py +0 -113
  221. pyobo/xrefdb/sources/ncit.py +0 -133
  222. pyobo/xrefdb/sources/pubchem.py +0 -27
  223. pyobo/xrefdb/sources/wikidata.py +0 -116
  224. pyobo/xrefdb/xrefs_pipeline.py +0 -180
  225. pyobo-0.11.2.dist-info/METADATA +0 -711
  226. pyobo-0.11.2.dist-info/RECORD +0 -157
  227. pyobo-0.11.2.dist-info/WHEEL +0 -5
  228. pyobo-0.11.2.dist-info/top_level.txt +0 -1
pyobo/getters.py CHANGED
@@ -1,38 +1,48 @@
1
1
  """Utilities for OBO files."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import datetime
4
- import gzip
5
6
  import json
6
7
  import logging
7
8
  import pathlib
8
9
  import subprocess
10
+ import time
9
11
  import typing
10
12
  import urllib.error
13
+ import zipfile
11
14
  from collections import Counter
12
- from collections.abc import Iterable, Mapping, Sequence
15
+ from collections.abc import Callable, Iterable, Mapping, Sequence
13
16
  from pathlib import Path
14
- from typing import (
15
- Callable,
16
- Optional,
17
- TypeVar,
18
- Union,
19
- )
17
+ from textwrap import indent
18
+ from typing import Any, TypeVar
20
19
 
21
20
  import bioregistry
21
+ import click
22
+ import pystow.utils
22
23
  from bioontologies import robot
24
+ from tabulate import tabulate
23
25
  from tqdm.auto import tqdm
24
-
25
- from .constants import DATABASE_DIRECTORY
26
- from .identifier_utils import MissingPrefixError, wrap_norm_prefix
26
+ from typing_extensions import Unpack
27
+
28
+ from .constants import (
29
+ BUILD_SUBDIRECTORY_NAME,
30
+ DATABASE_DIRECTORY,
31
+ GetOntologyKwargs,
32
+ IterHelperHelperDict,
33
+ SlimGetOntologyKwargs,
34
+ )
35
+ from .identifier_utils import ParseError, wrap_norm_prefix
27
36
  from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
37
+ from .reader import from_obo_path, from_obonet
28
38
  from .struct import Obo
29
- from .utils.io import get_writer
39
+ from .utils.io import safe_open_writer
30
40
  from .utils.path import ensure_path, prefix_directory_join
31
41
  from .version import get_git_hash, get_version
32
42
 
33
43
  __all__ = [
34
- "get_ontology",
35
44
  "NoBuildError",
45
+ "get_ontology",
36
46
  ]
37
47
 
38
48
  logger = logging.getLogger(__name__)
@@ -48,7 +58,14 @@ class UnhandledFormatError(NoBuildError):
48
58
 
49
59
  #: The following prefixes can not be loaded through ROBOT without
50
60
  #: turning off integrity checks
51
- REQUIRES_NO_ROBOT_CHECK = {"clo", "vo", "orphanet.ordo", "orphanet"}
61
+ REQUIRES_NO_ROBOT_CHECK = {
62
+ "clo",
63
+ "vo",
64
+ "orphanet.ordo",
65
+ "orphanet",
66
+ "foodon",
67
+ "caloha",
68
+ }
52
69
 
53
70
 
54
71
  @wrap_norm_prefix
@@ -56,58 +73,87 @@ def get_ontology(
56
73
  prefix: str,
57
74
  *,
58
75
  force: bool = False,
59
- rewrite: bool = False,
60
- strict: bool = True,
61
- version: Optional[str] = None,
76
+ force_process: bool = False,
77
+ strict: bool = False,
78
+ version: str | None = None,
62
79
  robot_check: bool = True,
80
+ upgrade: bool = True,
81
+ cache: bool = True,
82
+ use_tqdm: bool = True,
63
83
  ) -> Obo:
64
84
  """Get the OBO for a given graph.
65
85
 
66
86
  :param prefix: The prefix of the ontology to look up
67
87
  :param version: The pre-looked-up version of the ontology
68
88
  :param force: Download the data again
69
- :param rewrite: Should the OBO cache be rewritten? Automatically set to true if ``force`` is true
70
- :param strict: Should CURIEs be treated strictly? If true, raises exceptions on invalid/malformed
71
- :param robot_check:
72
- If set to false, will send the ``--check=false`` command to ROBOT to disregard
73
- malformed ontology components. Necessary to load some ontologies like VO.
89
+ :param force_process: Should the OBO cache be rewritten? Automatically set to true
90
+ if ``force`` is true
91
+ :param strict: Should CURIEs be treated strictly? If true, raises exceptions on
92
+ invalid/malformed
93
+ :param robot_check: If set to false, will send the ``--check=false`` command to
94
+ ROBOT to disregard malformed ontology components. Necessary to load some
95
+ ontologies like VO.
96
+ :param upgrade: If set to true, will automatically upgrade relationships, such as
97
+ ``obo:chebi#part_of`` to ``BFO:0000051``
98
+ :param cache: Should cached objects be written? defaults to True
99
+
74
100
  :returns: An OBO object
75
101
 
76
102
  :raises OnlyOWLError: If the OBO foundry only has an OWL document for this resource.
77
103
 
78
- Alternate usage if you have a custom url::
104
+ Alternate usage if you have a custom url
105
+
106
+ .. code-block:: python
79
107
 
80
- >>> from pystow.utils import download
81
- >>> from pyobo import Obo, from_obo_path
82
- >>> url = ...
83
- >>> obo_path = ...
84
- >>> download(url=url, path=path)
85
- >>> obo = from_obo_path(path)
108
+ from pystow.utils import download
109
+ from pyobo import Obo, from_obo_path
110
+
111
+ url = ...
112
+ path = ...
113
+ download(url=url, path=path)
114
+ obo = from_obo_path(path)
86
115
  """
87
116
  if force:
88
- rewrite = True
117
+ force_process = True
89
118
  if prefix == "uberon":
90
119
  logger.info("UBERON has so much garbage in it that defaulting to non-strict parsing")
91
120
  strict = False
92
121
 
93
- obonet_json_gz_path = prefix_directory_join(
94
- prefix, name=f"{prefix}.obonet.json.gz", ensure_exists=False, version=version
95
- )
96
- if obonet_json_gz_path.exists() and not force:
97
- from .reader import from_obonet
98
- from .utils.cache import get_gzipped_graph
99
-
100
- logger.debug("[%s] using obonet cache at %s", prefix, obonet_json_gz_path)
101
- return from_obonet(get_gzipped_graph(obonet_json_gz_path))
122
+ if force_process:
123
+ obonet_json_gz_path = None
124
+ elif not cache:
125
+ logger.debug("[%s] caching was turned off, so dont look for an obonet file", prefix)
126
+ obonet_json_gz_path = None
127
+ else:
128
+ obonet_json_gz_path = prefix_directory_join(
129
+ prefix, BUILD_SUBDIRECTORY_NAME, name=f"{prefix}.obonet.json.gz", version=version
130
+ )
131
+ logger.debug(
132
+ "[%s] caching is turned on, so look for an obonet file at %s",
133
+ prefix,
134
+ obonet_json_gz_path,
135
+ )
136
+ if obonet_json_gz_path.is_file() and not force:
137
+ from .utils.cache import get_gzipped_graph
138
+
139
+ logger.debug("[%s] using obonet cache at %s", prefix, obonet_json_gz_path)
140
+ return from_obonet(
141
+ get_gzipped_graph(obonet_json_gz_path),
142
+ strict=strict,
143
+ version=version,
144
+ upgrade=upgrade,
145
+ use_tqdm=use_tqdm,
146
+ )
147
+ else:
148
+ logger.debug("[%s] no obonet cache found at %s", prefix, obonet_json_gz_path)
102
149
 
103
150
  if has_nomenclature_plugin(prefix):
104
151
  obo = run_nomenclature_plugin(prefix, version=version)
105
- logger.debug("[%s] caching nomenclature plugin", prefix)
106
- obo.write_default(force=rewrite)
152
+ if cache:
153
+ logger.debug("[%s] caching nomenclature plugin", prefix)
154
+ obo.write_default(force=force_process)
107
155
  return obo
108
156
 
109
- logger.debug("[%s] no obonet cache found at %s", prefix, obonet_json_gz_path)
110
-
111
157
  ontology_format, path = _ensure_ontology_path(prefix, force=force, version=version)
112
158
  if path is None:
113
159
  raise NoBuildError(prefix)
@@ -122,25 +168,23 @@ def get_ontology(
122
168
  else:
123
169
  raise UnhandledFormatError(f"[{prefix}] unhandled ontology file format: {path.suffix}")
124
170
 
125
- from .reader import from_obo_path
126
-
127
- obo = from_obo_path(path, prefix=prefix, strict=strict)
128
- if version is not None:
129
- if obo.data_version is None:
130
- logger.warning("[%s] did not have a version, overriding with %s", obo.ontology, version)
131
- obo.data_version = version
132
- elif obo.data_version != version:
133
- logger.warning(
134
- "[%s] had version %s, overriding with %s", obo.ontology, obo.data_version, version
135
- )
136
- obo.data_version = version
137
- obo.write_default(force=rewrite)
171
+ obo = from_obo_path(
172
+ path,
173
+ prefix=prefix,
174
+ strict=strict,
175
+ version=version,
176
+ upgrade=upgrade,
177
+ use_tqdm=use_tqdm,
178
+ _cache_path=obonet_json_gz_path,
179
+ )
180
+ if cache:
181
+ obo.write_default(force=force_process)
138
182
  return obo
139
183
 
140
184
 
141
185
  def _ensure_ontology_path(
142
- prefix: str, force, version
143
- ) -> Union[tuple[str, Path], tuple[None, None]]:
186
+ prefix: str, force: bool, version: str | None
187
+ ) -> tuple[str, Path] | tuple[None, None]:
144
188
  for ontology_format, url in [
145
189
  ("obo", bioregistry.get_obo_download(prefix)),
146
190
  ("owl", bioregistry.get_owl_download(prefix)),
@@ -148,100 +192,52 @@ def _ensure_ontology_path(
148
192
  ]:
149
193
  if url is not None:
150
194
  try:
151
- path = Path(ensure_path(prefix, url=url, force=force, version=version))
152
- except urllib.error.HTTPError:
195
+ path = ensure_path(prefix, url=url, force=force, version=version)
196
+ except (urllib.error.HTTPError, pystow.utils.DownloadError):
153
197
  continue
154
198
  else:
155
199
  return ontology_format, path
156
200
  return None, None
157
201
 
158
202
 
159
- #: Obonet/Pronto can't parse these (consider converting to OBO with ROBOT?)
160
- CANT_PARSE = {
161
- "agro",
162
- "aro",
163
- "bco",
164
- "caro",
165
- "cco",
166
- "chmo",
167
- "cido",
168
- "covoc",
169
- "cto",
170
- "cvdo",
171
- "dicom",
172
- "dinto",
173
- "emap",
174
- "epso",
175
- "eupath",
176
- "fbbi",
177
- "fma",
178
- "fobi",
179
- "foodon",
180
- "genepio",
181
- "hancestro",
182
- "hom",
183
- "hso",
184
- "htn", # Unknown string format: creation: 16MAY2017
185
- "ico",
186
- "idocovid19",
187
- "labo",
188
- "mamo",
189
- "mfmo",
190
- "mfo",
191
- "mfomd",
192
- "miapa",
193
- "mo",
194
- "oae",
195
- "ogms", # Unknown string format: creation: 16MAY2017
196
- "ohd",
197
- "ons",
198
- "oostt",
199
- "opmi",
200
- "ornaseq",
201
- "orth",
202
- "pdro",
203
- "probonto",
204
- "psdo",
205
- "reo",
206
- "rex",
207
- "rnao",
208
- "sepio",
209
- "sio",
210
- "spd",
211
- "sweetrealm",
212
- "txpo",
213
- "vido",
214
- "vt",
215
- "xl",
216
- }
217
203
  SKIP = {
218
- "ncbigene", # too big, refs acquired from other dbs
219
- "pubchem.compound", # to big, can't deal with this now
220
- "gaz", # Gazetteer is irrelevant for biology
221
- "ma", # yanked
222
- "bila", # yanked
223
- # FIXME below
224
- "emapa", # recently changed with EMAP... not sure what the difference is anymore
225
- "kegg.genes",
226
- "kegg.genome",
227
- "kegg.pathway",
228
- # URL is wrong
229
- "ensemblglossary",
230
- # Too much junk
231
- "biolink",
204
+ "ncbigene": "too big, refs acquired from other dbs",
205
+ "pubchem.compound": "top big, can't deal with this now",
206
+ "gaz": "Gazetteer is irrelevant for biology",
207
+ "ma": "yanked",
208
+ "bila": "yanked",
209
+ # Can't download
210
+ "afpo": "unable to download",
211
+ "atol": "unable to download",
212
+ "eol": "unable to download, same source as atol",
213
+ "hog": "unable to download",
214
+ "vhog": "unable to download",
215
+ "gorel": "unable to download",
216
+ "dinto": "unable to download",
217
+ "gainesville.core": "unable to download",
218
+ "ato": "can't process",
219
+ "emapa": "recently changed with EMAP... not sure what the difference is anymore",
220
+ "kegg.genes": "needs fix", # FIXME
221
+ "kegg.genome": "needs fix", # FIXME
222
+ "kegg.pathway": "needs fix", # FIXME
223
+ "ensemblglossary": "URI is self-referential to data in OLS, extract from there",
224
+ "epio": "content from fraunhofer is unreliable",
225
+ "epso": "content from fraunhofer is unreliable",
226
+ "gwascentral.phenotype": "website is down? or API changed?", # FIXME
227
+ "gwascentral.study": "website is down? or API changed?", # FIXME
228
+ "snomedct": "dead source",
232
229
  }
233
230
 
234
231
  X = TypeVar("X")
235
232
 
236
233
 
237
234
  def iter_helper(
238
- f: Callable[[str], Mapping[str, X]],
235
+ f: Callable[[str, Unpack[GetOntologyKwargs]], Mapping[str, X]],
239
236
  leave: bool = False,
240
- strict: bool = True,
241
- **kwargs,
237
+ **kwargs: Unpack[IterHelperHelperDict],
242
238
  ) -> Iterable[tuple[str, str, X]]:
243
239
  """Yield all mappings extracted from each database given."""
244
- for prefix, mapping in iter_helper_helper(f, strict=strict, **kwargs):
240
+ for prefix, mapping in iter_helper_helper(f, **kwargs):
245
241
  it = tqdm(
246
242
  mapping.items(),
247
243
  desc=f"iterating {prefix}",
@@ -250,22 +246,24 @@ def iter_helper(
250
246
  disable=None,
251
247
  )
252
248
  for key, value in it:
253
- value = value.strip('"').replace("\n", " ").replace("\t", " ").replace(" ", " ")
249
+ if isinstance(value, str):
250
+ value = value.strip('"').replace("\n", " ").replace("\t", " ").replace(" ", " ")
251
+ # TODO deal with when this is not a string?
254
252
  if value:
255
253
  yield prefix, key, value
256
254
 
257
255
 
258
256
  def _prefixes(
259
- skip_below: Optional[str] = None,
257
+ skip_below: str | None = None,
260
258
  skip_below_inclusive: bool = True,
261
259
  skip_pyobo: bool = False,
262
- skip_set: Optional[set[str]] = None,
260
+ skip_set: set[str] | None = None,
263
261
  ) -> Iterable[str]:
264
262
  for prefix, resource in sorted(bioregistry.read_registry().items()):
265
263
  if resource.no_own_terms:
266
264
  continue
267
265
  if prefix in SKIP:
268
- tqdm.write(f"skipping {prefix} because in default skip set")
266
+ tqdm.write(f"skipping {prefix} because {SKIP[prefix]}")
269
267
  continue
270
268
  if skip_set and prefix in skip_set:
271
269
  tqdm.write(f"skipping {prefix} because in skip set")
@@ -287,37 +285,39 @@ def _prefixes(
287
285
 
288
286
 
289
287
  def iter_helper_helper(
290
- f: Callable[[str], X],
288
+ f: Callable[[str, Unpack[GetOntologyKwargs]], X],
291
289
  use_tqdm: bool = True,
292
- skip_below: Optional[str] = None,
293
- skip_below_inclusive: bool = True,
290
+ skip_below: str | None = None,
294
291
  skip_pyobo: bool = False,
295
- skip_set: Optional[set[str]] = None,
296
- strict: bool = True,
297
- **kwargs,
292
+ skip_set: set[str] | None = None,
293
+ **kwargs: Unpack[SlimGetOntologyKwargs],
298
294
  ) -> Iterable[tuple[str, X]]:
299
295
  """Yield all mappings extracted from each database given.
300
296
 
301
- :param f: A function that takes a prefix and gives back something that will be used by an outer function.
297
+ :param f: A function that takes a prefix and gives back something that will be used
298
+ by an outer function.
302
299
  :param use_tqdm: If true, use the tqdm progress bar
303
- :param skip_below: If true, skip sources whose names are less than this (used for iterative curation
300
+ :param skip_below: If true, skip sources whose names are less than this (used for
301
+ iterative curation)
304
302
  :param skip_pyobo: If true, skip sources implemented in PyOBO
305
303
  :param skip_set: A pre-defined blacklist to skip
306
- :param strict: If true, will raise exceptions and crash the program instead of logging them.
304
+ :param strict: If true, will raise exceptions and crash the program instead of
305
+ logging them.
307
306
  :param kwargs: Keyword arguments passed to ``f``.
308
- :yields: A prefix and the result of the callable ``f``
309
307
 
310
308
  :raises TypeError: If a type error is raised, it gets re-raised
311
309
  :raises urllib.error.HTTPError: If the resource could not be downloaded
312
310
  :raises urllib.error.URLError: If another problem was encountered during download
313
311
  :raises ValueError: If the data was not in the format that was expected (e.g., OWL)
312
+
313
+ :yields: A prefix and the result of the callable ``f``
314
314
  """
315
+ strict = kwargs.get("strict", True)
315
316
  prefixes = list(
316
317
  _prefixes(
317
318
  skip_set=skip_set,
318
319
  skip_below=skip_below,
319
320
  skip_pyobo=skip_pyobo,
320
- skip_below_inclusive=skip_below_inclusive,
321
321
  )
322
322
  )
323
323
  prefix_it = tqdm(
@@ -325,28 +325,39 @@ def iter_helper_helper(
325
325
  )
326
326
  for prefix in prefix_it:
327
327
  prefix_it.set_postfix(prefix=prefix)
328
+ tqdm.write(
329
+ click.style(f"\n{prefix} - {bioregistry.get_name(prefix)}", fg="green", bold=True)
330
+ )
328
331
  try:
329
332
  yv = f(prefix, **kwargs) # type:ignore
333
+ except (UnhandledFormatError, NoBuildError) as e:
334
+ # make sure this comes before the other runtimeerror catch
335
+ logger.warning("[%s] %s", prefix, e)
330
336
  except urllib.error.HTTPError as e:
331
337
  logger.warning("[%s] HTTP %s: unable to download %s", prefix, e.getcode(), e.geturl())
332
338
  if strict and not bioregistry.is_deprecated(prefix):
333
339
  raise
334
- except urllib.error.URLError:
335
- logger.warning("[%s] unable to download", prefix)
340
+ except urllib.error.URLError as e:
341
+ logger.warning("[%s] unable to download - %s", prefix, e.reason)
336
342
  if strict and not bioregistry.is_deprecated(prefix):
337
343
  raise
338
- except MissingPrefixError as e:
339
- logger.warning("[%s] missing prefix: %s", prefix, e)
344
+ except ParseError as e:
345
+ if not e.node:
346
+ logger.warning("[%s] %s", prefix, e)
347
+ else:
348
+ logger.warning(str(e))
340
349
  if strict and not bioregistry.is_deprecated(prefix):
341
350
  raise e
351
+ except RuntimeError as e:
352
+ if "DrugBank" not in str(e):
353
+ raise
354
+ logger.warning("[drugbank] invalid credentials")
342
355
  except subprocess.CalledProcessError:
343
356
  logger.warning("[%s] ROBOT was unable to convert OWL to OBO", prefix)
344
- except UnhandledFormatError as e:
345
- logger.warning("[%s] %s", prefix, e)
346
357
  except ValueError as e:
347
358
  if _is_xml(e):
348
359
  # this means that it tried doing parsing on an xml page
349
- logger.info(
360
+ logger.warning(
350
361
  "no resource available for %s. See http://www.obofoundry.org/ontology/%s",
351
362
  prefix,
352
363
  prefix,
@@ -355,6 +366,9 @@ def iter_helper_helper(
355
366
  logger.exception(
356
367
  "[%s] got exception %s while parsing", prefix, e.__class__.__name__
357
368
  )
369
+ except zipfile.BadZipFile as e:
370
+ # This can happen if there's an error on UMLS
371
+ logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
358
372
  except TypeError as e:
359
373
  logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
360
374
  if strict:
@@ -369,7 +383,7 @@ def _is_xml(e) -> bool:
369
383
  )
370
384
 
371
385
 
372
- def _prep_dir(directory: Union[None, str, pathlib.Path]) -> pathlib.Path:
386
+ def _prep_dir(directory: None | str | pathlib.Path) -> pathlib.Path:
373
387
  if directory is None:
374
388
  rv = DATABASE_DIRECTORY
375
389
  elif isinstance(directory, str):
@@ -383,26 +397,28 @@ def _prep_dir(directory: Union[None, str, pathlib.Path]) -> pathlib.Path:
383
397
 
384
398
 
385
399
  def db_output_helper(
386
- f: Callable[..., Iterable[tuple[str, ...]]],
400
+ it: Iterable[tuple[Any, ...]],
387
401
  db_name: str,
388
402
  columns: Sequence[str],
389
403
  *,
390
- directory: Union[None, str, pathlib.Path] = None,
391
- strict: bool = True,
404
+ directory: None | str | pathlib.Path = None,
405
+ strict: bool = False,
392
406
  use_gzip: bool = True,
393
- summary_detailed: Optional[Sequence[int]] = None,
394
- **kwargs,
407
+ summary_detailed: Sequence[int] | None = None,
395
408
  ) -> list[pathlib.Path]:
396
409
  """Help output database builds.
397
410
 
398
- :param f: A function that takes a prefix and gives back something that will be used by an outer function.
411
+ :param it: An iterable of row tuples to write to the output file; the first
412
+ element of each row is used as the key for the summary counts.
399
413
  :param db_name: name of the output resource (e.g., "alts", "names")
400
414
  :param columns: The names of the columns
401
- :param directory: The directory to output everything, or defaults to :data:`pyobo.constants.DATABASE_DIRECTORY`.
415
+ :param directory: The directory to output everything, or defaults to
416
+ :data:`pyobo.constants.DATABASE_DIRECTORY`.
402
417
  :param strict: Passed to ``f`` by keyword
403
- :param kwargs: Passed to ``f`` by splat
418
+
404
419
  :returns: A sequence of paths that got created.
405
420
  """
421
+ start = time.time()
406
422
  directory = _prep_dir(directory)
407
423
 
408
424
  c: typing.Counter[str] = Counter()
@@ -415,27 +431,32 @@ def db_output_helper(
415
431
  db_sample_path = directory.joinpath(f"{db_name}_sample.tsv")
416
432
  db_summary_path = directory.joinpath(f"{db_name}_summary.tsv")
417
433
  db_summary_detailed_path = directory.joinpath(f"{db_name}_summary_detailed.tsv")
434
+ db_metadata_path = directory.joinpath(f"{db_name}_metadata.json")
435
+ rv: list[tuple[str, pathlib.Path]] = [
436
+ ("Metadata", db_metadata_path),
437
+ ("Data", db_path),
438
+ ("Sample", db_sample_path),
439
+ ("Summary", db_summary_path),
440
+ ]
418
441
 
419
442
  logger.info("writing %s to %s", db_name, db_path)
420
443
  logger.info("writing %s sample to %s", db_name, db_sample_path)
421
- it = f(strict=strict, **kwargs)
422
- with gzip.open(db_path, mode="wt") if use_gzip else open(db_path, "w") as gzipped_file:
423
- writer = get_writer(gzipped_file)
444
+ sample_rows = []
424
445
 
446
+ with safe_open_writer(db_path) as writer:
425
447
  # for the first 10 rows, put it in a sample file too
426
- with open(db_sample_path, "w") as sample_file:
427
- sample_writer = get_writer(sample_file)
428
-
448
+ with safe_open_writer(db_sample_path) as sample_writer:
429
449
  # write header
430
450
  writer.writerow(columns)
431
451
  sample_writer.writerow(columns)
432
452
 
433
- for row, _ in zip(it, range(10)):
453
+ for row, _ in zip(it, range(10), strict=False):
434
454
  c[row[0]] += 1
435
455
  if summary_detailed is not None:
436
456
  c_detailed[tuple(row[i] for i in summary_detailed)] += 1
437
457
  writer.writerow(row)
438
458
  sample_writer.writerow(row)
459
+ sample_rows.append(row)
439
460
 
440
461
  # continue just in the gzipped one
441
462
  for row in it:
@@ -444,18 +465,15 @@ def db_output_helper(
444
465
  c_detailed[tuple(row[i] for i in summary_detailed)] += 1
445
466
  writer.writerow(row)
446
467
 
447
- logger.info(f"writing {db_name} summary to {db_summary_path}")
448
- with open(db_summary_path, "w") as file:
449
- writer = get_writer(file)
450
- writer.writerows(c.most_common())
468
+ with safe_open_writer(db_summary_path) as summary_writer:
469
+ summary_writer.writerows(c.most_common())
451
470
 
452
471
  if summary_detailed is not None:
453
472
  logger.info(f"writing {db_name} detailed summary to {db_summary_detailed_path}")
454
- with open(db_summary_detailed_path, "w") as file:
455
- writer = get_writer(file)
456
- writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
473
+ with safe_open_writer(db_summary_detailed_path) as detailed_summary_writer:
474
+ detailed_summary_writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
475
+ rv.append(("Summary (Detailed)", db_summary_detailed_path))
457
476
 
458
- db_metadata_path = directory.joinpath(f"{db_name}_metadata.json")
459
477
  with open(db_metadata_path, "w") as file:
460
478
  json.dump(
461
479
  {
@@ -468,12 +486,12 @@ def db_output_helper(
468
486
  indent=2,
469
487
  )
470
488
 
471
- rv: list[pathlib.Path] = [
472
- db_metadata_path,
473
- db_path,
474
- db_sample_path,
475
- db_summary_path,
476
- ]
477
- if summary_detailed:
478
- rv.append(db_summary_detailed_path)
479
- return rv
489
+ elapsed = time.time() - start
490
+ click.secho(f"\nWrote the following files in {elapsed:.1f} seconds\n", fg="green")
491
+ click.secho(indent(tabulate(rv), " "), fg="green")
492
+
493
+ click.secho("\nSample rows:\n", fg="green")
494
+ click.secho(indent(tabulate(sample_rows, headers=columns), " "), fg="green")
495
+ click.echo()
496
+
497
+ return [path for _, path in rv]