pyobo 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. pyobo/.DS_Store +0 -0
  2. pyobo/__init__.py +95 -20
  3. pyobo/__main__.py +0 -0
  4. pyobo/api/__init__.py +81 -10
  5. pyobo/api/alts.py +52 -42
  6. pyobo/api/combine.py +39 -0
  7. pyobo/api/edges.py +68 -0
  8. pyobo/api/hierarchy.py +231 -203
  9. pyobo/api/metadata.py +14 -19
  10. pyobo/api/names.py +207 -127
  11. pyobo/api/properties.py +117 -113
  12. pyobo/api/relations.py +68 -94
  13. pyobo/api/species.py +24 -21
  14. pyobo/api/typedefs.py +11 -11
  15. pyobo/api/utils.py +66 -13
  16. pyobo/api/xrefs.py +108 -114
  17. pyobo/cli/__init__.py +0 -0
  18. pyobo/cli/cli.py +35 -50
  19. pyobo/cli/database.py +183 -161
  20. pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
  21. pyobo/cli/lookup.py +163 -195
  22. pyobo/cli/utils.py +19 -6
  23. pyobo/constants.py +102 -3
  24. pyobo/getters.py +196 -118
  25. pyobo/gilda_utils.py +79 -200
  26. pyobo/identifier_utils/__init__.py +41 -0
  27. pyobo/identifier_utils/api.py +296 -0
  28. pyobo/identifier_utils/model.py +130 -0
  29. pyobo/identifier_utils/preprocessing.json +812 -0
  30. pyobo/identifier_utils/preprocessing.py +61 -0
  31. pyobo/identifier_utils/relations/__init__.py +8 -0
  32. pyobo/identifier_utils/relations/api.py +162 -0
  33. pyobo/identifier_utils/relations/data.json +5824 -0
  34. pyobo/identifier_utils/relations/data_owl.json +57 -0
  35. pyobo/identifier_utils/relations/data_rdf.json +1 -0
  36. pyobo/identifier_utils/relations/data_rdfs.json +7 -0
  37. pyobo/mocks.py +9 -6
  38. pyobo/ner/__init__.py +9 -0
  39. pyobo/ner/api.py +72 -0
  40. pyobo/ner/normalizer.py +33 -0
  41. pyobo/obographs.py +43 -39
  42. pyobo/plugins.py +5 -4
  43. pyobo/py.typed +0 -0
  44. pyobo/reader.py +1358 -395
  45. pyobo/reader_utils.py +155 -0
  46. pyobo/resource_utils.py +42 -22
  47. pyobo/resources/__init__.py +0 -0
  48. pyobo/resources/goc.py +75 -0
  49. pyobo/resources/goc.tsv +188 -0
  50. pyobo/resources/ncbitaxon.py +4 -5
  51. pyobo/resources/ncbitaxon.tsv.gz +0 -0
  52. pyobo/resources/ro.py +3 -2
  53. pyobo/resources/ro.tsv +0 -0
  54. pyobo/resources/so.py +0 -0
  55. pyobo/resources/so.tsv +0 -0
  56. pyobo/sources/README.md +12 -8
  57. pyobo/sources/__init__.py +52 -29
  58. pyobo/sources/agrovoc.py +0 -0
  59. pyobo/sources/antibodyregistry.py +11 -12
  60. pyobo/sources/bigg/__init__.py +13 -0
  61. pyobo/sources/bigg/bigg_compartment.py +81 -0
  62. pyobo/sources/bigg/bigg_metabolite.py +229 -0
  63. pyobo/sources/bigg/bigg_model.py +46 -0
  64. pyobo/sources/bigg/bigg_reaction.py +77 -0
  65. pyobo/sources/biogrid.py +1 -2
  66. pyobo/sources/ccle.py +7 -12
  67. pyobo/sources/cgnc.py +0 -5
  68. pyobo/sources/chebi.py +1 -1
  69. pyobo/sources/chembl/__init__.py +9 -0
  70. pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
  71. pyobo/sources/chembl/chembl_target.py +160 -0
  72. pyobo/sources/civic_gene.py +55 -15
  73. pyobo/sources/clinicaltrials.py +160 -0
  74. pyobo/sources/complexportal.py +24 -24
  75. pyobo/sources/conso.py +14 -22
  76. pyobo/sources/cpt.py +0 -0
  77. pyobo/sources/credit.py +1 -9
  78. pyobo/sources/cvx.py +27 -5
  79. pyobo/sources/depmap.py +9 -12
  80. pyobo/sources/dictybase_gene.py +2 -7
  81. pyobo/sources/drugbank/__init__.py +9 -0
  82. pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
  83. pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
  84. pyobo/sources/drugcentral.py +17 -13
  85. pyobo/sources/expasy.py +31 -34
  86. pyobo/sources/famplex.py +13 -18
  87. pyobo/sources/flybase.py +3 -8
  88. pyobo/sources/gard.py +62 -0
  89. pyobo/sources/geonames/__init__.py +9 -0
  90. pyobo/sources/geonames/features.py +28 -0
  91. pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
  92. pyobo/sources/geonames/utils.py +115 -0
  93. pyobo/sources/gmt_utils.py +6 -7
  94. pyobo/sources/go.py +20 -13
  95. pyobo/sources/gtdb.py +154 -0
  96. pyobo/sources/gwascentral/__init__.py +9 -0
  97. pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
  98. pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
  99. pyobo/sources/hgnc/__init__.py +9 -0
  100. pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
  101. pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
  102. pyobo/sources/icd/__init__.py +9 -0
  103. pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
  104. pyobo/sources/icd/icd11.py +148 -0
  105. pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
  106. pyobo/sources/interpro.py +4 -9
  107. pyobo/sources/itis.py +0 -5
  108. pyobo/sources/kegg/__init__.py +0 -0
  109. pyobo/sources/kegg/api.py +16 -38
  110. pyobo/sources/kegg/genes.py +9 -20
  111. pyobo/sources/kegg/genome.py +1 -7
  112. pyobo/sources/kegg/pathway.py +9 -21
  113. pyobo/sources/mesh.py +58 -24
  114. pyobo/sources/mgi.py +3 -10
  115. pyobo/sources/mirbase/__init__.py +11 -0
  116. pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
  117. pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
  118. pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
  119. pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
  120. pyobo/sources/msigdb.py +74 -39
  121. pyobo/sources/ncbi/__init__.py +9 -0
  122. pyobo/sources/ncbi/ncbi_gc.py +162 -0
  123. pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
  124. pyobo/sources/nih_reporter.py +60 -0
  125. pyobo/sources/nlm/__init__.py +9 -0
  126. pyobo/sources/nlm/nlm_catalog.py +48 -0
  127. pyobo/sources/nlm/nlm_publisher.py +36 -0
  128. pyobo/sources/nlm/utils.py +116 -0
  129. pyobo/sources/npass.py +6 -8
  130. pyobo/sources/omim_ps.py +10 -3
  131. pyobo/sources/pathbank.py +4 -8
  132. pyobo/sources/pfam/__init__.py +9 -0
  133. pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
  134. pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
  135. pyobo/sources/pharmgkb/__init__.py +15 -0
  136. pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
  137. pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
  138. pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
  139. pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
  140. pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
  141. pyobo/sources/pharmgkb/utils.py +86 -0
  142. pyobo/sources/pid.py +1 -6
  143. pyobo/sources/pombase.py +6 -10
  144. pyobo/sources/pubchem.py +4 -9
  145. pyobo/sources/reactome.py +5 -11
  146. pyobo/sources/rgd.py +11 -16
  147. pyobo/sources/rhea.py +37 -36
  148. pyobo/sources/ror.py +69 -42
  149. pyobo/sources/selventa/__init__.py +0 -0
  150. pyobo/sources/selventa/schem.py +4 -7
  151. pyobo/sources/selventa/scomp.py +1 -6
  152. pyobo/sources/selventa/sdis.py +4 -7
  153. pyobo/sources/selventa/sfam.py +1 -6
  154. pyobo/sources/sgd.py +6 -11
  155. pyobo/sources/signor/__init__.py +7 -0
  156. pyobo/sources/signor/download.py +41 -0
  157. pyobo/sources/signor/signor_complexes.py +105 -0
  158. pyobo/sources/slm.py +12 -15
  159. pyobo/sources/umls/__init__.py +7 -1
  160. pyobo/sources/umls/__main__.py +0 -0
  161. pyobo/sources/umls/get_synonym_types.py +20 -4
  162. pyobo/sources/umls/sty.py +57 -0
  163. pyobo/sources/umls/synonym_types.tsv +1 -1
  164. pyobo/sources/umls/umls.py +18 -22
  165. pyobo/sources/unimod.py +46 -0
  166. pyobo/sources/uniprot/__init__.py +1 -1
  167. pyobo/sources/uniprot/uniprot.py +40 -32
  168. pyobo/sources/uniprot/uniprot_ptm.py +4 -34
  169. pyobo/sources/utils.py +3 -2
  170. pyobo/sources/wikipathways.py +7 -10
  171. pyobo/sources/zfin.py +5 -10
  172. pyobo/ssg/__init__.py +12 -16
  173. pyobo/ssg/base.html +0 -0
  174. pyobo/ssg/index.html +26 -13
  175. pyobo/ssg/term.html +12 -2
  176. pyobo/ssg/typedef.html +0 -0
  177. pyobo/struct/__init__.py +54 -8
  178. pyobo/struct/functional/__init__.py +1 -0
  179. pyobo/struct/functional/dsl.py +2572 -0
  180. pyobo/struct/functional/macros.py +423 -0
  181. pyobo/struct/functional/obo_to_functional.py +385 -0
  182. pyobo/struct/functional/ontology.py +270 -0
  183. pyobo/struct/functional/utils.py +112 -0
  184. pyobo/struct/reference.py +331 -136
  185. pyobo/struct/struct.py +1413 -643
  186. pyobo/struct/struct_utils.py +1078 -0
  187. pyobo/struct/typedef.py +162 -210
  188. pyobo/struct/utils.py +12 -5
  189. pyobo/struct/vocabulary.py +138 -0
  190. pyobo/utils/__init__.py +0 -0
  191. pyobo/utils/cache.py +13 -11
  192. pyobo/utils/io.py +17 -31
  193. pyobo/utils/iter.py +5 -5
  194. pyobo/utils/misc.py +41 -53
  195. pyobo/utils/ndex_utils.py +0 -0
  196. pyobo/utils/path.py +76 -70
  197. pyobo/version.py +3 -3
  198. {pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/METADATA +228 -229
  199. pyobo-0.12.0.dist-info/RECORD +202 -0
  200. pyobo-0.12.0.dist-info/WHEEL +4 -0
  201. {pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
  202. pyobo-0.12.0.dist-info/licenses/LICENSE +21 -0
  203. pyobo/aws.py +0 -162
  204. pyobo/cli/aws.py +0 -47
  205. pyobo/identifier_utils.py +0 -142
  206. pyobo/normalizer.py +0 -232
  207. pyobo/registries/__init__.py +0 -16
  208. pyobo/registries/metaregistry.json +0 -507
  209. pyobo/registries/metaregistry.py +0 -135
  210. pyobo/sources/icd11.py +0 -105
  211. pyobo/xrefdb/__init__.py +0 -1
  212. pyobo/xrefdb/canonicalizer.py +0 -214
  213. pyobo/xrefdb/priority.py +0 -59
  214. pyobo/xrefdb/sources/__init__.py +0 -60
  215. pyobo/xrefdb/sources/biomappings.py +0 -36
  216. pyobo/xrefdb/sources/cbms2019.py +0 -91
  217. pyobo/xrefdb/sources/chembl.py +0 -83
  218. pyobo/xrefdb/sources/compath.py +0 -82
  219. pyobo/xrefdb/sources/famplex.py +0 -64
  220. pyobo/xrefdb/sources/gilda.py +0 -50
  221. pyobo/xrefdb/sources/intact.py +0 -113
  222. pyobo/xrefdb/sources/ncit.py +0 -133
  223. pyobo/xrefdb/sources/pubchem.py +0 -27
  224. pyobo/xrefdb/sources/wikidata.py +0 -116
  225. pyobo-0.11.2.dist-info/RECORD +0 -157
  226. pyobo-0.11.2.dist-info/WHEEL +0 -5
  227. pyobo-0.11.2.dist-info/top_level.txt +0 -1
pyobo/getters.py CHANGED
@@ -1,38 +1,48 @@
1
1
  """Utilities for OBO files."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import datetime
4
6
  import gzip
5
7
  import json
6
8
  import logging
7
9
  import pathlib
8
10
  import subprocess
11
+ import time
9
12
  import typing
10
13
  import urllib.error
14
+ import zipfile
11
15
  from collections import Counter
12
- from collections.abc import Iterable, Mapping, Sequence
16
+ from collections.abc import Callable, Iterable, Mapping, Sequence
13
17
  from pathlib import Path
14
- from typing import (
15
- Callable,
16
- Optional,
17
- TypeVar,
18
- Union,
19
- )
18
+ from textwrap import indent
19
+ from typing import TypeVar
20
20
 
21
21
  import bioregistry
22
+ import click
23
+ import pystow.utils
22
24
  from bioontologies import robot
25
+ from tabulate import tabulate
23
26
  from tqdm.auto import tqdm
27
+ from typing_extensions import Unpack
24
28
 
25
- from .constants import DATABASE_DIRECTORY
26
- from .identifier_utils import MissingPrefixError, wrap_norm_prefix
29
+ from .constants import (
30
+ DATABASE_DIRECTORY,
31
+ GetOntologyKwargs,
32
+ IterHelperHelperDict,
33
+ SlimGetOntologyKwargs,
34
+ )
35
+ from .identifier_utils import ParseError, wrap_norm_prefix
27
36
  from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
37
+ from .reader import from_obo_path, from_obonet
28
38
  from .struct import Obo
29
39
  from .utils.io import get_writer
30
40
  from .utils.path import ensure_path, prefix_directory_join
31
41
  from .version import get_git_hash, get_version
32
42
 
33
43
  __all__ = [
34
- "get_ontology",
35
44
  "NoBuildError",
45
+ "get_ontology",
36
46
  ]
37
47
 
38
48
  logger = logging.getLogger(__name__)
@@ -48,7 +58,14 @@ class UnhandledFormatError(NoBuildError):
48
58
 
49
59
  #: The following prefixes can not be loaded through ROBOT without
50
60
  #: turning off integrity checks
51
- REQUIRES_NO_ROBOT_CHECK = {"clo", "vo", "orphanet.ordo", "orphanet"}
61
+ REQUIRES_NO_ROBOT_CHECK = {
62
+ "clo",
63
+ "vo",
64
+ "orphanet.ordo",
65
+ "orphanet",
66
+ "foodon",
67
+ "caloha",
68
+ }
52
69
 
53
70
 
54
71
  @wrap_norm_prefix
@@ -56,58 +73,85 @@ def get_ontology(
56
73
  prefix: str,
57
74
  *,
58
75
  force: bool = False,
59
- rewrite: bool = False,
60
- strict: bool = True,
61
- version: Optional[str] = None,
76
+ force_process: bool = False,
77
+ strict: bool = False,
78
+ version: str | None = None,
62
79
  robot_check: bool = True,
80
+ upgrade: bool = True,
81
+ cache: bool = True,
82
+ use_tqdm: bool = True,
63
83
  ) -> Obo:
64
84
  """Get the OBO for a given graph.
65
85
 
66
86
  :param prefix: The prefix of the ontology to look up
67
87
  :param version: The pre-looked-up version of the ontology
68
88
  :param force: Download the data again
69
- :param rewrite: Should the OBO cache be rewritten? Automatically set to true if ``force`` is true
70
- :param strict: Should CURIEs be treated strictly? If true, raises exceptions on invalid/malformed
71
- :param robot_check:
72
- If set to false, will send the ``--check=false`` command to ROBOT to disregard
73
- malformed ontology components. Necessary to load some ontologies like VO.
89
+ :param force_process: Should the OBO cache be rewritten? Automatically set to true
90
+ if ``force`` is true
91
+ :param strict: Should CURIEs be treated strictly? If true, raises exceptions on
92
+ invalid/malformed
93
+ :param robot_check: If set to false, will send the ``--check=false`` command to
94
+ ROBOT to disregard malformed ontology components. Necessary to load some
95
+ ontologies like VO.
96
+ :param upgrade: If set to true, will automatically upgrade relationships, such as
97
+ ``obo:chebi#part_of`` to ``BFO:0000051``
98
+ :param cache: Should cached objects be written? defaults to True
99
+
74
100
  :returns: An OBO object
75
101
 
76
102
  :raises OnlyOWLError: If the OBO foundry only has an OWL document for this resource.
77
103
 
78
- Alternate usage if you have a custom url::
104
+ Alternate usage if you have a custom url
105
+
106
+ .. code-block:: python
79
107
 
80
- >>> from pystow.utils import download
81
- >>> from pyobo import Obo, from_obo_path
82
- >>> url = ...
83
- >>> obo_path = ...
84
- >>> download(url=url, path=path)
85
- >>> obo = from_obo_path(path)
108
+ from pystow.utils import download
109
+ from pyobo import Obo, from_obo_path
110
+
111
+ url = ...
112
+ obo_path = ...
113
+ download(url=url, path=path)
114
+ obo = from_obo_path(path)
86
115
  """
87
116
  if force:
88
- rewrite = True
117
+ force_process = True
89
118
  if prefix == "uberon":
90
119
  logger.info("UBERON has so much garbage in it that defaulting to non-strict parsing")
91
120
  strict = False
92
121
 
93
- obonet_json_gz_path = prefix_directory_join(
94
- prefix, name=f"{prefix}.obonet.json.gz", ensure_exists=False, version=version
95
- )
96
- if obonet_json_gz_path.exists() and not force:
97
- from .reader import from_obonet
98
- from .utils.cache import get_gzipped_graph
99
-
100
- logger.debug("[%s] using obonet cache at %s", prefix, obonet_json_gz_path)
101
- return from_obonet(get_gzipped_graph(obonet_json_gz_path))
122
+ if not cache:
123
+ logger.debug("[%s] caching was turned off, so dont look for an obonet file", prefix)
124
+ obonet_json_gz_path = None
125
+ else:
126
+ obonet_json_gz_path = prefix_directory_join(
127
+ prefix, name=f"{prefix}.obonet.json.gz", ensure_exists=False, version=version
128
+ )
129
+ logger.debug(
130
+ "[%s] caching is turned on, so look for an obonet file at %s",
131
+ prefix,
132
+ obonet_json_gz_path,
133
+ )
134
+ if obonet_json_gz_path.exists() and not force:
135
+ from .utils.cache import get_gzipped_graph
136
+
137
+ logger.debug("[%s] using obonet cache at %s", prefix, obonet_json_gz_path)
138
+ return from_obonet(
139
+ get_gzipped_graph(obonet_json_gz_path),
140
+ strict=strict,
141
+ version=version,
142
+ upgrade=upgrade,
143
+ use_tqdm=use_tqdm,
144
+ )
145
+ else:
146
+ logger.debug("[%s] no obonet cache found at %s", prefix, obonet_json_gz_path)
102
147
 
103
148
  if has_nomenclature_plugin(prefix):
104
149
  obo = run_nomenclature_plugin(prefix, version=version)
105
- logger.debug("[%s] caching nomenclature plugin", prefix)
106
- obo.write_default(force=rewrite)
150
+ if cache:
151
+ logger.debug("[%s] caching nomenclature plugin", prefix)
152
+ obo.write_default(force=force_process)
107
153
  return obo
108
154
 
109
- logger.debug("[%s] no obonet cache found at %s", prefix, obonet_json_gz_path)
110
-
111
155
  ontology_format, path = _ensure_ontology_path(prefix, force=force, version=version)
112
156
  if path is None:
113
157
  raise NoBuildError(prefix)
@@ -122,25 +166,23 @@ def get_ontology(
122
166
  else:
123
167
  raise UnhandledFormatError(f"[{prefix}] unhandled ontology file format: {path.suffix}")
124
168
 
125
- from .reader import from_obo_path
126
-
127
- obo = from_obo_path(path, prefix=prefix, strict=strict)
128
- if version is not None:
129
- if obo.data_version is None:
130
- logger.warning("[%s] did not have a version, overriding with %s", obo.ontology, version)
131
- obo.data_version = version
132
- elif obo.data_version != version:
133
- logger.warning(
134
- "[%s] had version %s, overriding with %s", obo.ontology, obo.data_version, version
135
- )
136
- obo.data_version = version
137
- obo.write_default(force=rewrite)
169
+ obo = from_obo_path(
170
+ path,
171
+ prefix=prefix,
172
+ strict=strict,
173
+ version=version,
174
+ upgrade=upgrade,
175
+ use_tqdm=use_tqdm,
176
+ _cache_path=obonet_json_gz_path,
177
+ )
178
+ if cache:
179
+ obo.write_default(force=force_process)
138
180
  return obo
139
181
 
140
182
 
141
183
  def _ensure_ontology_path(
142
- prefix: str, force, version
143
- ) -> Union[tuple[str, Path], tuple[None, None]]:
184
+ prefix: str, force: bool, version: str | None
185
+ ) -> tuple[str, Path] | tuple[None, None]:
144
186
  for ontology_format, url in [
145
187
  ("obo", bioregistry.get_obo_download(prefix)),
146
188
  ("owl", bioregistry.get_owl_download(prefix)),
@@ -148,8 +190,8 @@ def _ensure_ontology_path(
148
190
  ]:
149
191
  if url is not None:
150
192
  try:
151
- path = Path(ensure_path(prefix, url=url, force=force, version=version))
152
- except urllib.error.HTTPError:
193
+ path = ensure_path(prefix, url=url, force=force, version=version)
194
+ except (urllib.error.HTTPError, pystow.utils.DownloadError):
153
195
  continue
154
196
  else:
155
197
  return ontology_format, path
@@ -215,33 +257,42 @@ CANT_PARSE = {
215
257
  "xl",
216
258
  }
217
259
  SKIP = {
218
- "ncbigene", # too big, refs acquired from other dbs
219
- "pubchem.compound", # to big, can't deal with this now
220
- "gaz", # Gazetteer is irrelevant for biology
221
- "ma", # yanked
222
- "bila", # yanked
223
- # FIXME below
224
- "emapa", # recently changed with EMAP... not sure what the difference is anymore
225
- "kegg.genes",
226
- "kegg.genome",
227
- "kegg.pathway",
228
- # URL is wrong
229
- "ensemblglossary",
230
- # Too much junk
231
- "biolink",
260
+ "ncbigene": "too big, refs acquired from other dbs",
261
+ "pubchem.compound": "top big, can't deal with this now",
262
+ "gaz": "Gazetteer is irrelevant for biology",
263
+ "ma": "yanked",
264
+ "bila": "yanked",
265
+ # Can't download
266
+ "afpo": "unable to download",
267
+ "atol": "unable to download",
268
+ "eol": "unable to download, same source as atol",
269
+ "hog": "unable to download",
270
+ "vhog": "unable to download",
271
+ "gorel": "unable to download",
272
+ "dinto": "unable to download",
273
+ "gainesville.core": "unable to download",
274
+ "ato": "can't process",
275
+ "emapa": "recently changed with EMAP... not sure what the difference is anymore",
276
+ "kegg.genes": "needs fix", # FIXME
277
+ "kegg.genome": "needs fix", # FIXME
278
+ "kegg.pathway": "needs fix", # FIXME
279
+ "ensemblglossary": "uri is wrong",
280
+ "epio": "content from fraunhofer is unreliable",
281
+ "epso": "content from fraunhofer is unreliable",
282
+ "gwascentral.phenotype": "website is down? or API changed?", # FIXME
283
+ "gwascentral.study": "website is down? or API changed?", # FIXME
232
284
  }
233
285
 
234
286
  X = TypeVar("X")
235
287
 
236
288
 
237
289
  def iter_helper(
238
- f: Callable[[str], Mapping[str, X]],
290
+ f: Callable[[str, Unpack[GetOntologyKwargs]], Mapping[str, X]],
239
291
  leave: bool = False,
240
- strict: bool = True,
241
- **kwargs,
292
+ **kwargs: Unpack[IterHelperHelperDict],
242
293
  ) -> Iterable[tuple[str, str, X]]:
243
294
  """Yield all mappings extracted from each database given."""
244
- for prefix, mapping in iter_helper_helper(f, strict=strict, **kwargs):
295
+ for prefix, mapping in iter_helper_helper(f, **kwargs):
245
296
  it = tqdm(
246
297
  mapping.items(),
247
298
  desc=f"iterating {prefix}",
@@ -250,22 +301,24 @@ def iter_helper(
250
301
  disable=None,
251
302
  )
252
303
  for key, value in it:
253
- value = value.strip('"').replace("\n", " ").replace("\t", " ").replace(" ", " ")
304
+ if isinstance(value, str):
305
+ value = value.strip('"').replace("\n", " ").replace("\t", " ").replace(" ", " ")
306
+ # TODO deal with when this is not a string?
254
307
  if value:
255
308
  yield prefix, key, value
256
309
 
257
310
 
258
311
  def _prefixes(
259
- skip_below: Optional[str] = None,
312
+ skip_below: str | None = None,
260
313
  skip_below_inclusive: bool = True,
261
314
  skip_pyobo: bool = False,
262
- skip_set: Optional[set[str]] = None,
315
+ skip_set: set[str] | None = None,
263
316
  ) -> Iterable[str]:
264
317
  for prefix, resource in sorted(bioregistry.read_registry().items()):
265
318
  if resource.no_own_terms:
266
319
  continue
267
320
  if prefix in SKIP:
268
- tqdm.write(f"skipping {prefix} because in default skip set")
321
+ tqdm.write(f"skipping {prefix} because {SKIP[prefix]}")
269
322
  continue
270
323
  if skip_set and prefix in skip_set:
271
324
  tqdm.write(f"skipping {prefix} because in skip set")
@@ -287,37 +340,39 @@ def _prefixes(
287
340
 
288
341
 
289
342
  def iter_helper_helper(
290
- f: Callable[[str], X],
343
+ f: Callable[[str, Unpack[GetOntologyKwargs]], X],
291
344
  use_tqdm: bool = True,
292
- skip_below: Optional[str] = None,
293
- skip_below_inclusive: bool = True,
345
+ skip_below: str | None = None,
294
346
  skip_pyobo: bool = False,
295
- skip_set: Optional[set[str]] = None,
296
- strict: bool = True,
297
- **kwargs,
347
+ skip_set: set[str] | None = None,
348
+ **kwargs: Unpack[SlimGetOntologyKwargs],
298
349
  ) -> Iterable[tuple[str, X]]:
299
350
  """Yield all mappings extracted from each database given.
300
351
 
301
- :param f: A function that takes a prefix and gives back something that will be used by an outer function.
352
+ :param f: A function that takes a prefix and gives back something that will be used
353
+ by an outer function.
302
354
  :param use_tqdm: If true, use the tqdm progress bar
303
- :param skip_below: If true, skip sources whose names are less than this (used for iterative curation
355
+ :param skip_below: If given, skip sources whose names are less than this (used for
356
+ iterative curation)
304
357
  :param skip_pyobo: If true, skip sources implemented in PyOBO
305
358
  :param skip_set: A pre-defined blacklist to skip
306
- :param strict: If true, will raise exceptions and crash the program instead of logging them.
359
+ :param strict: If true, will raise exceptions and crash the program instead of
360
+ logging them.
307
361
  :param kwargs: Keyword arguments passed to ``f``.
308
- :yields: A prefix and the result of the callable ``f``
309
362
 
310
363
  :raises TypeError: If a type error is raised, it gets re-raised
311
364
  :raises urllib.error.HTTPError: If the resource could not be downloaded
312
365
  :raises urllib.error.URLError: If another problem was encountered during download
313
366
  :raises ValueError: If the data was not in the format that was expected (e.g., OWL)
367
+
368
+ :yields: A prefix and the result of the callable ``f``
314
369
  """
370
+ strict = kwargs.get("strict", True)
315
371
  prefixes = list(
316
372
  _prefixes(
317
373
  skip_set=skip_set,
318
374
  skip_below=skip_below,
319
375
  skip_pyobo=skip_pyobo,
320
- skip_below_inclusive=skip_below_inclusive,
321
376
  )
322
377
  )
323
378
  prefix_it = tqdm(
@@ -325,24 +380,35 @@ def iter_helper_helper(
325
380
  )
326
381
  for prefix in prefix_it:
327
382
  prefix_it.set_postfix(prefix=prefix)
383
+ tqdm.write(
384
+ click.style(f"\n{prefix} - {bioregistry.get_name(prefix)}", fg="green", bold=True)
385
+ )
328
386
  try:
329
387
  yv = f(prefix, **kwargs) # type:ignore
388
+ except (UnhandledFormatError, NoBuildError) as e:
389
+ # make sure this comes before the other runtimeerror catch
390
+ logger.warning("[%s] %s", prefix, e)
330
391
  except urllib.error.HTTPError as e:
331
392
  logger.warning("[%s] HTTP %s: unable to download %s", prefix, e.getcode(), e.geturl())
332
393
  if strict and not bioregistry.is_deprecated(prefix):
333
394
  raise
334
- except urllib.error.URLError:
335
- logger.warning("[%s] unable to download", prefix)
395
+ except urllib.error.URLError as e:
396
+ logger.warning("[%s] unable to download - %s", prefix, e.reason)
336
397
  if strict and not bioregistry.is_deprecated(prefix):
337
398
  raise
338
- except MissingPrefixError as e:
339
- logger.warning("[%s] missing prefix: %s", prefix, e)
399
+ except ParseError as e:
400
+ if not e.node:
401
+ logger.warning("[%s] %s", prefix, e)
402
+ else:
403
+ logger.warning(str(e))
340
404
  if strict and not bioregistry.is_deprecated(prefix):
341
405
  raise e
406
+ except RuntimeError as e:
407
+ if "DrugBank" not in str(e):
408
+ raise
409
+ logger.warning("[drugbank] invalid credentials")
342
410
  except subprocess.CalledProcessError:
343
411
  logger.warning("[%s] ROBOT was unable to convert OWL to OBO", prefix)
344
- except UnhandledFormatError as e:
345
- logger.warning("[%s] %s", prefix, e)
346
412
  except ValueError as e:
347
413
  if _is_xml(e):
348
414
  # this means that it tried doing parsing on an xml page
@@ -355,6 +421,9 @@ def iter_helper_helper(
355
421
  logger.exception(
356
422
  "[%s] got exception %s while parsing", prefix, e.__class__.__name__
357
423
  )
424
+ except zipfile.BadZipFile as e:
425
+ # This can happen if there's an error on UMLS
426
+ logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
358
427
  except TypeError as e:
359
428
  logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
360
429
  if strict:
@@ -369,7 +438,7 @@ def _is_xml(e) -> bool:
369
438
  )
370
439
 
371
440
 
372
- def _prep_dir(directory: Union[None, str, pathlib.Path]) -> pathlib.Path:
441
+ def _prep_dir(directory: None | str | pathlib.Path) -> pathlib.Path:
373
442
  if directory is None:
374
443
  rv = DATABASE_DIRECTORY
375
444
  elif isinstance(directory, str):
@@ -383,26 +452,28 @@ def _prep_dir(directory: Union[None, str, pathlib.Path]) -> pathlib.Path:
383
452
 
384
453
 
385
454
  def db_output_helper(
386
- f: Callable[..., Iterable[tuple[str, ...]]],
455
+ it: Iterable[tuple[str, ...]],
387
456
  db_name: str,
388
457
  columns: Sequence[str],
389
458
  *,
390
- directory: Union[None, str, pathlib.Path] = None,
391
- strict: bool = True,
459
+ directory: None | str | pathlib.Path = None,
460
+ strict: bool = False,
392
461
  use_gzip: bool = True,
393
- summary_detailed: Optional[Sequence[int]] = None,
394
- **kwargs,
462
+ summary_detailed: Sequence[int] | None = None,
395
463
  ) -> list[pathlib.Path]:
396
464
  """Help output database builds.
397
465
 
398
- :param f: A function that takes a prefix and gives back something that will be used by an outer function.
466
+ :param it: An iterable of rows (each a tuple of strings) to write to the output
467
+ files.
399
468
  :param db_name: name of the output resource (e.g., "alts", "names")
400
469
  :param columns: The names of the columns
401
- :param directory: The directory to output everything, or defaults to :data:`pyobo.constants.DATABASE_DIRECTORY`.
470
+ :param directory: The directory to output everything, or defaults to
471
+ :data:`pyobo.constants.DATABASE_DIRECTORY`.
402
472
  :param strict: Passed to ``f`` by keyword
403
- :param kwargs: Passed to ``f`` by splat
473
+
404
474
  :returns: A sequence of paths that got created.
405
475
  """
476
+ start = time.time()
406
477
  directory = _prep_dir(directory)
407
478
 
408
479
  c: typing.Counter[str] = Counter()
@@ -415,10 +486,17 @@ def db_output_helper(
415
486
  db_sample_path = directory.joinpath(f"{db_name}_sample.tsv")
416
487
  db_summary_path = directory.joinpath(f"{db_name}_summary.tsv")
417
488
  db_summary_detailed_path = directory.joinpath(f"{db_name}_summary_detailed.tsv")
489
+ db_metadata_path = directory.joinpath(f"{db_name}_metadata.json")
490
+ rv: list[tuple[str, pathlib.Path]] = [
491
+ ("Metadata", db_metadata_path),
492
+ ("Data", db_path),
493
+ ("Sample", db_sample_path),
494
+ ("Summary", db_summary_path),
495
+ ]
418
496
 
419
497
  logger.info("writing %s to %s", db_name, db_path)
420
498
  logger.info("writing %s sample to %s", db_name, db_sample_path)
421
- it = f(strict=strict, **kwargs)
499
+ sample_rows = []
422
500
  with gzip.open(db_path, mode="wt") if use_gzip else open(db_path, "w") as gzipped_file:
423
501
  writer = get_writer(gzipped_file)
424
502
 
@@ -430,12 +508,13 @@ def db_output_helper(
430
508
  writer.writerow(columns)
431
509
  sample_writer.writerow(columns)
432
510
 
433
- for row, _ in zip(it, range(10)):
511
+ for row, _ in zip(it, range(10), strict=False):
434
512
  c[row[0]] += 1
435
513
  if summary_detailed is not None:
436
514
  c_detailed[tuple(row[i] for i in summary_detailed)] += 1
437
515
  writer.writerow(row)
438
516
  sample_writer.writerow(row)
517
+ sample_rows.append(row)
439
518
 
440
519
  # continue just in the gzipped one
441
520
  for row in it:
@@ -444,7 +523,6 @@ def db_output_helper(
444
523
  c_detailed[tuple(row[i] for i in summary_detailed)] += 1
445
524
  writer.writerow(row)
446
525
 
447
- logger.info(f"writing {db_name} summary to {db_summary_path}")
448
526
  with open(db_summary_path, "w") as file:
449
527
  writer = get_writer(file)
450
528
  writer.writerows(c.most_common())
@@ -454,8 +532,8 @@ def db_output_helper(
454
532
  with open(db_summary_detailed_path, "w") as file:
455
533
  writer = get_writer(file)
456
534
  writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
535
+ rv.append(("Summary (Detailed)", db_summary_detailed_path))
457
536
 
458
- db_metadata_path = directory.joinpath(f"{db_name}_metadata.json")
459
537
  with open(db_metadata_path, "w") as file:
460
538
  json.dump(
461
539
  {
@@ -468,12 +546,12 @@ def db_output_helper(
468
546
  indent=2,
469
547
  )
470
548
 
471
- rv: list[pathlib.Path] = [
472
- db_metadata_path,
473
- db_path,
474
- db_sample_path,
475
- db_summary_path,
476
- ]
477
- if summary_detailed:
478
- rv.append(db_summary_detailed_path)
479
- return rv
549
+ elapsed = time.time() - start
550
+ click.secho(f"\nWrote the following files in {elapsed:.1f} seconds\n", fg="green")
551
+ click.secho(indent(tabulate(rv), " "), fg="green")
552
+
553
+ click.secho("\nSample rows:\n", fg="green")
554
+ click.secho(indent(tabulate(sample_rows, headers=columns), " "), fg="green")
555
+ click.echo()
556
+
557
+ return [path for _, path in rv]