pyobo 0.11.2__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. pyobo/.DS_Store +0 -0
  2. pyobo/__init__.py +95 -20
  3. pyobo/__main__.py +0 -0
  4. pyobo/api/__init__.py +81 -10
  5. pyobo/api/alts.py +52 -42
  6. pyobo/api/combine.py +39 -0
  7. pyobo/api/edges.py +68 -0
  8. pyobo/api/hierarchy.py +231 -203
  9. pyobo/api/metadata.py +14 -19
  10. pyobo/api/names.py +207 -127
  11. pyobo/api/properties.py +117 -117
  12. pyobo/api/relations.py +68 -94
  13. pyobo/api/species.py +24 -21
  14. pyobo/api/typedefs.py +11 -11
  15. pyobo/api/utils.py +66 -13
  16. pyobo/api/xrefs.py +107 -114
  17. pyobo/cli/__init__.py +0 -0
  18. pyobo/cli/cli.py +35 -50
  19. pyobo/cli/database.py +210 -160
  20. pyobo/cli/database_utils.py +155 -0
  21. pyobo/cli/lookup.py +163 -195
  22. pyobo/cli/utils.py +19 -6
  23. pyobo/constants.py +102 -3
  24. pyobo/getters.py +209 -191
  25. pyobo/gilda_utils.py +52 -250
  26. pyobo/identifier_utils/__init__.py +33 -0
  27. pyobo/identifier_utils/api.py +305 -0
  28. pyobo/identifier_utils/preprocessing.json +873 -0
  29. pyobo/identifier_utils/preprocessing.py +27 -0
  30. pyobo/identifier_utils/relations/__init__.py +8 -0
  31. pyobo/identifier_utils/relations/api.py +162 -0
  32. pyobo/identifier_utils/relations/data.json +5824 -0
  33. pyobo/identifier_utils/relations/data_owl.json +57 -0
  34. pyobo/identifier_utils/relations/data_rdf.json +1 -0
  35. pyobo/identifier_utils/relations/data_rdfs.json +7 -0
  36. pyobo/mocks.py +9 -6
  37. pyobo/ner/__init__.py +9 -0
  38. pyobo/ner/api.py +72 -0
  39. pyobo/ner/normalizer.py +33 -0
  40. pyobo/obographs.py +48 -40
  41. pyobo/plugins.py +5 -4
  42. pyobo/py.typed +0 -0
  43. pyobo/reader.py +1354 -395
  44. pyobo/reader_utils.py +155 -0
  45. pyobo/resource_utils.py +42 -22
  46. pyobo/resources/__init__.py +0 -0
  47. pyobo/resources/goc.py +75 -0
  48. pyobo/resources/goc.tsv +188 -0
  49. pyobo/resources/ncbitaxon.py +4 -5
  50. pyobo/resources/ncbitaxon.tsv.gz +0 -0
  51. pyobo/resources/ro.py +3 -2
  52. pyobo/resources/ro.tsv +0 -0
  53. pyobo/resources/so.py +0 -0
  54. pyobo/resources/so.tsv +0 -0
  55. pyobo/sources/README.md +12 -8
  56. pyobo/sources/__init__.py +52 -29
  57. pyobo/sources/agrovoc.py +0 -0
  58. pyobo/sources/antibodyregistry.py +11 -12
  59. pyobo/sources/bigg/__init__.py +13 -0
  60. pyobo/sources/bigg/bigg_compartment.py +81 -0
  61. pyobo/sources/bigg/bigg_metabolite.py +229 -0
  62. pyobo/sources/bigg/bigg_model.py +46 -0
  63. pyobo/sources/bigg/bigg_reaction.py +77 -0
  64. pyobo/sources/biogrid.py +1 -2
  65. pyobo/sources/ccle.py +7 -12
  66. pyobo/sources/cgnc.py +9 -6
  67. pyobo/sources/chebi.py +1 -1
  68. pyobo/sources/chembl/__init__.py +9 -0
  69. pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
  70. pyobo/sources/chembl/chembl_target.py +160 -0
  71. pyobo/sources/civic_gene.py +55 -15
  72. pyobo/sources/clinicaltrials.py +160 -0
  73. pyobo/sources/complexportal.py +24 -24
  74. pyobo/sources/conso.py +14 -22
  75. pyobo/sources/cpt.py +0 -0
  76. pyobo/sources/credit.py +1 -9
  77. pyobo/sources/cvx.py +27 -5
  78. pyobo/sources/depmap.py +9 -12
  79. pyobo/sources/dictybase_gene.py +2 -7
  80. pyobo/sources/drugbank/__init__.py +9 -0
  81. pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
  82. pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
  83. pyobo/sources/drugcentral.py +17 -13
  84. pyobo/sources/expasy.py +31 -34
  85. pyobo/sources/famplex.py +13 -18
  86. pyobo/sources/flybase.py +8 -13
  87. pyobo/sources/gard.py +62 -0
  88. pyobo/sources/geonames/__init__.py +9 -0
  89. pyobo/sources/geonames/features.py +28 -0
  90. pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
  91. pyobo/sources/geonames/utils.py +115 -0
  92. pyobo/sources/gmt_utils.py +6 -7
  93. pyobo/sources/go.py +20 -13
  94. pyobo/sources/gtdb.py +154 -0
  95. pyobo/sources/gwascentral/__init__.py +9 -0
  96. pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
  97. pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
  98. pyobo/sources/hgnc/__init__.py +9 -0
  99. pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
  100. pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
  101. pyobo/sources/icd/__init__.py +9 -0
  102. pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
  103. pyobo/sources/icd/icd11.py +148 -0
  104. pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
  105. pyobo/sources/interpro.py +4 -9
  106. pyobo/sources/itis.py +0 -5
  107. pyobo/sources/kegg/__init__.py +0 -0
  108. pyobo/sources/kegg/api.py +16 -38
  109. pyobo/sources/kegg/genes.py +9 -20
  110. pyobo/sources/kegg/genome.py +1 -7
  111. pyobo/sources/kegg/pathway.py +9 -21
  112. pyobo/sources/mesh.py +58 -24
  113. pyobo/sources/mgi.py +3 -10
  114. pyobo/sources/mirbase/__init__.py +11 -0
  115. pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
  116. pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
  117. pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
  118. pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
  119. pyobo/sources/msigdb.py +74 -39
  120. pyobo/sources/ncbi/__init__.py +9 -0
  121. pyobo/sources/ncbi/ncbi_gc.py +162 -0
  122. pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
  123. pyobo/sources/nih_reporter.py +60 -0
  124. pyobo/sources/nlm/__init__.py +9 -0
  125. pyobo/sources/nlm/nlm_catalog.py +48 -0
  126. pyobo/sources/nlm/nlm_publisher.py +36 -0
  127. pyobo/sources/nlm/utils.py +116 -0
  128. pyobo/sources/npass.py +6 -8
  129. pyobo/sources/omim_ps.py +11 -4
  130. pyobo/sources/pathbank.py +4 -8
  131. pyobo/sources/pfam/__init__.py +9 -0
  132. pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
  133. pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
  134. pyobo/sources/pharmgkb/__init__.py +15 -0
  135. pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
  136. pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
  137. pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
  138. pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
  139. pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
  140. pyobo/sources/pharmgkb/utils.py +86 -0
  141. pyobo/sources/pid.py +1 -6
  142. pyobo/sources/pombase.py +6 -10
  143. pyobo/sources/pubchem.py +4 -9
  144. pyobo/sources/reactome.py +5 -11
  145. pyobo/sources/rgd.py +11 -16
  146. pyobo/sources/rhea.py +37 -36
  147. pyobo/sources/ror.py +69 -42
  148. pyobo/sources/selventa/__init__.py +0 -0
  149. pyobo/sources/selventa/schem.py +4 -7
  150. pyobo/sources/selventa/scomp.py +1 -6
  151. pyobo/sources/selventa/sdis.py +4 -7
  152. pyobo/sources/selventa/sfam.py +1 -6
  153. pyobo/sources/sgd.py +6 -11
  154. pyobo/sources/signor/__init__.py +7 -0
  155. pyobo/sources/signor/download.py +41 -0
  156. pyobo/sources/signor/signor_complexes.py +105 -0
  157. pyobo/sources/slm.py +12 -15
  158. pyobo/sources/umls/__init__.py +7 -1
  159. pyobo/sources/umls/__main__.py +0 -0
  160. pyobo/sources/umls/get_synonym_types.py +20 -4
  161. pyobo/sources/umls/sty.py +57 -0
  162. pyobo/sources/umls/synonym_types.tsv +1 -1
  163. pyobo/sources/umls/umls.py +18 -22
  164. pyobo/sources/unimod.py +46 -0
  165. pyobo/sources/uniprot/__init__.py +1 -1
  166. pyobo/sources/uniprot/uniprot.py +40 -32
  167. pyobo/sources/uniprot/uniprot_ptm.py +4 -34
  168. pyobo/sources/utils.py +3 -2
  169. pyobo/sources/wikipathways.py +7 -10
  170. pyobo/sources/zfin.py +5 -10
  171. pyobo/ssg/__init__.py +12 -16
  172. pyobo/ssg/base.html +0 -0
  173. pyobo/ssg/index.html +26 -13
  174. pyobo/ssg/term.html +12 -2
  175. pyobo/ssg/typedef.html +0 -0
  176. pyobo/struct/__init__.py +54 -8
  177. pyobo/struct/functional/__init__.py +1 -0
  178. pyobo/struct/functional/dsl.py +2572 -0
  179. pyobo/struct/functional/macros.py +423 -0
  180. pyobo/struct/functional/obo_to_functional.py +385 -0
  181. pyobo/struct/functional/ontology.py +272 -0
  182. pyobo/struct/functional/utils.py +112 -0
  183. pyobo/struct/reference.py +331 -136
  184. pyobo/struct/struct.py +1484 -657
  185. pyobo/struct/struct_utils.py +1078 -0
  186. pyobo/struct/typedef.py +162 -210
  187. pyobo/struct/utils.py +12 -5
  188. pyobo/struct/vocabulary.py +138 -0
  189. pyobo/utils/__init__.py +0 -0
  190. pyobo/utils/cache.py +16 -15
  191. pyobo/utils/io.py +51 -41
  192. pyobo/utils/iter.py +5 -5
  193. pyobo/utils/misc.py +41 -53
  194. pyobo/utils/ndex_utils.py +0 -0
  195. pyobo/utils/path.py +73 -70
  196. pyobo/version.py +3 -3
  197. pyobo-0.12.1.dist-info/METADATA +671 -0
  198. pyobo-0.12.1.dist-info/RECORD +201 -0
  199. pyobo-0.12.1.dist-info/WHEEL +4 -0
  200. {pyobo-0.11.2.dist-info → pyobo-0.12.1.dist-info}/entry_points.txt +1 -0
  201. pyobo-0.12.1.dist-info/licenses/LICENSE +21 -0
  202. pyobo/aws.py +0 -162
  203. pyobo/cli/aws.py +0 -47
  204. pyobo/identifier_utils.py +0 -142
  205. pyobo/normalizer.py +0 -232
  206. pyobo/registries/__init__.py +0 -16
  207. pyobo/registries/metaregistry.json +0 -507
  208. pyobo/registries/metaregistry.py +0 -135
  209. pyobo/sources/icd11.py +0 -105
  210. pyobo/xrefdb/__init__.py +0 -1
  211. pyobo/xrefdb/canonicalizer.py +0 -214
  212. pyobo/xrefdb/priority.py +0 -59
  213. pyobo/xrefdb/sources/__init__.py +0 -60
  214. pyobo/xrefdb/sources/biomappings.py +0 -36
  215. pyobo/xrefdb/sources/cbms2019.py +0 -91
  216. pyobo/xrefdb/sources/chembl.py +0 -83
  217. pyobo/xrefdb/sources/compath.py +0 -82
  218. pyobo/xrefdb/sources/famplex.py +0 -64
  219. pyobo/xrefdb/sources/gilda.py +0 -50
  220. pyobo/xrefdb/sources/intact.py +0 -113
  221. pyobo/xrefdb/sources/ncit.py +0 -133
  222. pyobo/xrefdb/sources/pubchem.py +0 -27
  223. pyobo/xrefdb/sources/wikidata.py +0 -116
  224. pyobo/xrefdb/xrefs_pipeline.py +0 -180
  225. pyobo-0.11.2.dist-info/METADATA +0 -711
  226. pyobo-0.11.2.dist-info/RECORD +0 -157
  227. pyobo-0.11.2.dist-info/WHEEL +0 -5
  228. pyobo-0.11.2.dist-info/top_level.txt +0 -1
pyobo/cli/database.py CHANGED
@@ -1,14 +1,40 @@
1
1
  """CLI for PyOBO Database Generation."""
2
2
 
3
3
  import logging
4
- from typing import Optional
4
+ import warnings
5
+ from collections.abc import Iterable
6
+ from pathlib import Path
5
7
 
8
+ import bioregistry
6
9
  import click
7
10
  from more_click import verbose_option
8
11
  from tqdm.contrib.logging import logging_redirect_tqdm
12
+ from typing_extensions import Unpack
9
13
  from zenodo_client import update_zenodo
10
14
 
11
- from .utils import directory_option, force_option, no_strict_option, zenodo_option
15
+ from .database_utils import (
16
+ IterHelperHelperDict,
17
+ _iter_alts,
18
+ _iter_definitions,
19
+ _iter_edges,
20
+ _iter_mappings,
21
+ _iter_names,
22
+ _iter_properties,
23
+ _iter_relations,
24
+ _iter_species,
25
+ _iter_synonyms,
26
+ _iter_typedefs,
27
+ _iter_xrefs,
28
+ iter_helper_helper,
29
+ )
30
+ from .utils import (
31
+ Clickable,
32
+ directory_option,
33
+ force_option,
34
+ force_process_option,
35
+ strict_option,
36
+ zenodo_option,
37
+ )
12
38
  from ..constants import (
13
39
  ALTS_DATA_RECORD,
14
40
  DEFINITIONS_RECORD,
@@ -19,39 +45,60 @@ from ..constants import (
19
45
  SPECIES_RECORD,
20
46
  SYNONYMS_RECORD,
21
47
  TYPEDEFS_RECORD,
48
+ DatabaseKwargs,
22
49
  )
23
- from ..getters import db_output_helper
24
- from ..xrefdb.xrefs_pipeline import (
25
- _iter_alts,
26
- _iter_definitions,
27
- _iter_metadata,
28
- _iter_names,
29
- _iter_properties,
30
- _iter_relations,
31
- _iter_species,
32
- _iter_synonyms,
33
- _iter_typedefs,
34
- _iter_xrefs,
35
- )
50
+ from ..getters import db_output_helper, get_ontology
36
51
 
37
52
  __all__ = [
38
53
  "main",
39
54
  ]
40
55
 
56
+ logger = logging.getLogger(__name__)
57
+
41
58
 
42
59
  @click.group(name="database")
43
60
  def main():
44
61
  """Build the PyOBO Database."""
45
62
 
46
63
 
47
- @main.command()
48
- @verbose_option
49
- @directory_option
50
- @zenodo_option
51
- @force_option
52
- @no_strict_option
64
+ skip_pyobo_option = click.option(
65
+ "--skip-pyobo",
66
+ is_flag=True,
67
+ help="Skip prefixes whose ontologies are implemented as PyOBO sources",
68
+ )
69
+ skip_below_option = click.option(
70
+ "--skip-below", help="Skip prefixes lexically sorted below the given one"
71
+ )
72
+
73
+
74
+ def database_annotate(f: Clickable) -> Clickable:
75
+ """Add appropriate decorators to database CLI functions."""
76
+ decorators = [
77
+ main.command(),
78
+ zenodo_option,
79
+ verbose_option,
80
+ directory_option,
81
+ force_option,
82
+ force_process_option,
83
+ strict_option,
84
+ skip_pyobo_option,
85
+ skip_below_option,
86
+ ]
87
+ for decorator in decorators:
88
+ f = decorator(f)
89
+ return f
90
+
91
+
92
+ def _update_database_kwargs(kwargs: DatabaseKwargs) -> DatabaseKwargs:
93
+ updated_kwargs = dict(kwargs)
94
+ updated_kwargs.update(force=False, force_process=False)
95
+ # FIXME get typing right on next line
96
+ return updated_kwargs # type:ignore
97
+
98
+
99
+ @database_annotate
53
100
  @click.pass_context
54
- def build(ctx: click.Context, directory: str, zenodo: bool, no_strict: bool, force: bool):
101
+ def build(ctx: click.Context, **kwargs: Unpack[DatabaseKwargs]) -> None:
55
102
  """Build all databases."""
56
103
  # if no_strict and zenodo:
57
104
  # click.secho("Must be strict before uploading", fg="red")
@@ -59,103 +106,97 @@ def build(ctx: click.Context, directory: str, zenodo: bool, no_strict: bool, for
59
106
  with logging_redirect_tqdm():
60
107
  click.secho("Collecting metadata and building", fg="cyan", bold=True)
61
108
  # note that this is the only one that needs a force=force
62
- ctx.invoke(metadata, directory=directory, no_strict=no_strict, force=force)
109
+ ctx.invoke(metadata, **kwargs)
110
+
111
+ # After running once, we don't want to force or re-process.
112
+ # All the other arguments come along for the ride!
113
+ updated_kwargs = _update_database_kwargs(kwargs)
114
+
63
115
  click.secho("Alternate Identifiers", fg="cyan", bold=True)
64
- ctx.invoke(alts, directory=directory, zenodo=zenodo, no_strict=no_strict)
116
+ ctx.invoke(alts, **updated_kwargs)
65
117
  click.secho("Synonyms", fg="cyan", bold=True)
66
- ctx.invoke(synonyms, directory=directory, zenodo=zenodo, no_strict=no_strict)
67
- click.secho("Xrefs", fg="cyan", bold=True)
68
- ctx.invoke(xrefs, directory=directory, zenodo=zenodo, no_strict=no_strict)
118
+ ctx.invoke(synonyms, **updated_kwargs)
119
+ click.secho("Mappings", fg="cyan", bold=True)
120
+ ctx.invoke(mappings, **updated_kwargs)
69
121
  click.secho("Names", fg="cyan", bold=True)
70
- ctx.invoke(names, directory=directory, zenodo=zenodo, no_strict=no_strict)
122
+ ctx.invoke(names, **updated_kwargs)
71
123
  click.secho("Definitions", fg="cyan", bold=True)
72
- ctx.invoke(definitions, directory=directory, zenodo=zenodo, no_strict=no_strict)
124
+ ctx.invoke(definitions, **updated_kwargs)
73
125
  click.secho("Properties", fg="cyan", bold=True)
74
- ctx.invoke(properties, directory=directory, zenodo=zenodo, no_strict=no_strict)
126
+ ctx.invoke(properties, **updated_kwargs)
75
127
  click.secho("Relations", fg="cyan", bold=True)
76
- ctx.invoke(relations, directory=directory, zenodo=zenodo, no_strict=no_strict)
128
+ ctx.invoke(relations, **updated_kwargs)
129
+ click.secho("Edges", fg="cyan", bold=True)
130
+ ctx.invoke(edges, **updated_kwargs)
77
131
  click.secho("Typedefs", fg="cyan", bold=True)
78
- ctx.invoke(typedefs, directory=directory, zenodo=zenodo, no_strict=no_strict)
132
+ ctx.invoke(typedefs, **updated_kwargs)
79
133
  click.secho("Species", fg="cyan", bold=True)
80
- ctx.invoke(species, directory=directory, zenodo=zenodo, no_strict=no_strict)
134
+ ctx.invoke(species, **updated_kwargs)
135
+
81
136
 
137
+ @database_annotate
138
+ def cache(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
139
+ """Cache all things."""
140
+ if zenodo:
141
+ click.echo("no zenodo for caching")
82
142
 
83
- skip_below_option = click.option("--skip-below")
84
- skip_below_exclusive_option = click.option("--skip-below-exclusive", is_flag=True)
143
+ kwargs["force_process"] = True
144
+ with logging_redirect_tqdm():
145
+ for _ in iter_helper_helper(get_ontology, **kwargs):
146
+ # this pass intentional to consume the iterable
147
+ pass
85
148
 
86
149
 
87
- @main.command()
88
- @verbose_option
89
- @directory_option
90
- @force_option
91
- @no_strict_option
92
- @skip_below_option
93
- @click.option("--skip-pyobo")
94
- def metadata(
95
- directory: str, no_strict: bool, force: bool, skip_below: Optional[str], skip_pyobo: bool
96
- ):
150
+ @database_annotate
151
+ def metadata(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
97
152
  """Make the prefix-metadata dump."""
153
+ from ..api import get_metadata
154
+
155
+ def _iter_metadata(
156
+ **kwargs: Unpack[IterHelperHelperDict],
157
+ ) -> Iterable[tuple[str, str, str, bool]]:
158
+ for prefix, data in iter_helper_helper(get_metadata, **kwargs):
159
+ version = data["version"]
160
+ logger.debug(f"[{prefix}] using version {version}")
161
+ yield prefix, version, data["date"], bioregistry.is_deprecated(prefix)
162
+
163
+ it = _iter_metadata(**kwargs)
98
164
  db_output_helper(
99
- _iter_metadata,
165
+ it,
100
166
  "metadata",
101
167
  ("prefix", "version", "date", "deprecated"),
102
- strict=not no_strict,
103
- force=force,
104
- directory=directory,
105
168
  use_gzip=False,
106
- skip_below=skip_below,
107
- skip_pyobo=skip_pyobo,
169
+ directory=directory,
108
170
  )
171
+ if zenodo:
172
+ click.secho("No Zenodo record for metadata", fg="red")
109
173
 
110
174
 
111
- @main.command()
112
- @verbose_option
113
- @directory_option
114
- @zenodo_option
115
- @force_option
116
- @no_strict_option
117
- @skip_below_option
118
- @skip_below_exclusive_option
119
- def names(
120
- directory: str,
121
- zenodo: bool,
122
- no_strict: bool,
123
- force: bool,
124
- skip_below: Optional[str],
125
- skip_below_exclusive: bool,
126
- ):
175
+ @database_annotate
176
+ def names(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
127
177
  """Make the prefix-identifier-name dump."""
178
+ it = _iter_names(**kwargs)
128
179
  with logging_redirect_tqdm():
129
180
  paths = db_output_helper(
130
- _iter_names,
181
+ it,
131
182
  "names",
132
183
  ("prefix", "identifier", "name"),
133
- strict=not no_strict,
134
- force=force,
135
184
  directory=directory,
136
- skip_below=skip_below,
137
- skip_below_inclusive=not skip_below_exclusive,
138
185
  )
139
186
  if zenodo:
140
187
  # see https://zenodo.org/record/4020486
141
188
  update_zenodo(OOH_NA_NA_RECORD, paths)
142
189
 
143
190
 
144
- @main.command()
145
- @verbose_option
146
- @directory_option
147
- @zenodo_option
148
- @force_option
149
- @no_strict_option
150
- def species(directory: str, zenodo: bool, no_strict: bool, force: bool):
191
+ @database_annotate
192
+ def species(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
151
193
  """Make the prefix-identifier-species dump."""
152
194
  with logging_redirect_tqdm():
195
+ it = _iter_species(**kwargs)
153
196
  paths = db_output_helper(
154
- _iter_species,
197
+ it,
155
198
  "species",
156
199
  ("prefix", "identifier", "species"),
157
- strict=not no_strict,
158
- force=force,
159
200
  directory=directory,
160
201
  )
161
202
  if zenodo:
@@ -163,110 +204,90 @@ def species(directory: str, zenodo: bool, no_strict: bool, force: bool):
163
204
  update_zenodo(SPECIES_RECORD, paths)
164
205
 
165
206
 
166
- @main.command()
167
- @verbose_option
168
- @directory_option
169
- @zenodo_option
170
- @force_option
171
- @no_strict_option
172
- def definitions(directory: str, zenodo: bool, no_strict: bool, force: bool):
207
+ def _extend_skip_set(kwargs: DatabaseKwargs, skip_set: set[str]) -> None:
208
+ ss = kwargs.get("skip_set")
209
+ if ss is None:
210
+ kwargs["skip_set"] = skip_set
211
+ else:
212
+ ss.update(skip_set)
213
+
214
+
215
+ @database_annotate
216
+ def definitions(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
173
217
  """Make the prefix-identifier-definition dump."""
174
218
  with logging_redirect_tqdm():
219
+ _extend_skip_set(kwargs, {"kegg.pathway", "kegg.genes", "kegg.genome", "umls"})
220
+ it = _iter_definitions(**kwargs)
175
221
  paths = db_output_helper(
176
- _iter_definitions,
222
+ it,
177
223
  "definitions",
178
224
  ("prefix", "identifier", "definition"),
179
- strict=not no_strict,
180
- force=force,
181
225
  directory=directory,
182
- skip_set={"kegg.pathway", "kegg.genes", "kegg.genome", "umls"},
183
226
  )
184
227
  if zenodo:
185
228
  # see https://zenodo.org/record/4637061
186
229
  update_zenodo(DEFINITIONS_RECORD, paths)
187
230
 
188
231
 
189
- @main.command()
190
- @verbose_option
191
- @directory_option
192
- @zenodo_option
193
- @force_option
194
- @no_strict_option
195
- def typedefs(directory: str, zenodo: bool, no_strict: bool, force: bool):
232
+ @database_annotate
233
+ def typedefs(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
196
234
  """Make the typedef prefix-identifier-name dump."""
197
235
  with logging_redirect_tqdm():
236
+ _extend_skip_set(kwargs, {"ncbigene", "kegg.pathway", "kegg.genes", "kegg.genome"})
237
+ it = _iter_typedefs(**kwargs)
198
238
  paths = db_output_helper(
199
- _iter_typedefs,
239
+ it,
200
240
  "typedefs",
201
241
  ("prefix", "typedef_prefix", "identifier", "name"),
202
- strict=not no_strict,
203
- force=force,
204
- directory=directory,
205
242
  use_gzip=False,
206
- skip_set={"ncbigene", "kegg.pathway", "kegg.genes", "kegg.genome"},
243
+ directory=directory,
207
244
  )
208
245
  if zenodo:
209
246
  # see https://zenodo.org/record/4644013
210
247
  update_zenodo(TYPEDEFS_RECORD, paths)
211
248
 
212
249
 
213
- @main.command()
214
- @verbose_option
215
- @directory_option
216
- @zenodo_option
217
- @force_option
218
- @no_strict_option
219
- def alts(directory: str, zenodo: bool, force: bool, no_strict: bool):
250
+ @database_annotate
251
+ def alts(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
220
252
  """Make the prefix-alt-id dump."""
221
253
  with logging_redirect_tqdm():
254
+ _extend_skip_set(kwargs, {"kegg.pathway", "kegg.genes", "kegg.genome", "umls"})
255
+ it = _iter_alts(**kwargs)
222
256
  paths = db_output_helper(
223
- _iter_alts,
257
+ it,
224
258
  "alts",
225
259
  ("prefix", "identifier", "alt"),
226
260
  directory=directory,
227
- force=force,
228
- strict=not no_strict,
229
- skip_set={"kegg.pathway", "kegg.genes", "kegg.genome", "umls"},
230
261
  )
231
262
  if zenodo:
232
263
  # see https://zenodo.org/record/4021476
233
264
  update_zenodo(ALTS_DATA_RECORD, paths)
234
265
 
235
266
 
236
- @main.command()
237
- @verbose_option
238
- @directory_option
239
- @zenodo_option
240
- @force_option
241
- @no_strict_option
242
- def synonyms(directory: str, zenodo: bool, force: bool, no_strict: bool):
267
+ @database_annotate
268
+ def synonyms(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
243
269
  """Make the prefix-identifier-synonym dump."""
244
270
  with logging_redirect_tqdm():
271
+ _extend_skip_set(kwargs, {"kegg.pathway", "kegg.genes", "kegg.genome"})
272
+ it = _iter_synonyms(**kwargs)
245
273
  paths = db_output_helper(
246
- _iter_synonyms,
274
+ it,
247
275
  "synonyms",
248
276
  ("prefix", "identifier", "synonym"),
249
277
  directory=directory,
250
- force=force,
251
- strict=not no_strict,
252
- skip_set={"kegg.pathway", "kegg.genes", "kegg.genome"},
253
278
  )
254
279
  if zenodo:
255
280
  # see https://zenodo.org/record/4021482
256
281
  update_zenodo(SYNONYMS_RECORD, paths)
257
282
 
258
283
 
259
- @main.command()
260
- @verbose_option
261
- @directory_option
262
- @zenodo_option
263
- @force_option
264
- @no_strict_option
265
- def relations(directory: str, zenodo: bool, force: bool, no_strict: bool):
284
+ @database_annotate
285
+ def relations(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
266
286
  """Make the relation dump."""
267
287
  with logging_redirect_tqdm():
288
+ it = _iter_relations(**kwargs)
268
289
  paths = db_output_helper(
269
- _iter_relations,
290
+ it,
270
291
  "relations",
271
292
  (
272
293
  "source_prefix",
@@ -276,62 +297,91 @@ def relations(directory: str, zenodo: bool, force: bool, no_strict: bool):
276
297
  "target_prefix",
277
298
  "target_identifier",
278
299
  ),
279
- directory=directory,
280
- force=force,
281
- strict=not no_strict,
282
300
  summary_detailed=(0, 2, 3), # second column corresponds to relation type
301
+ directory=directory,
283
302
  )
284
303
  if zenodo:
285
304
  # see https://zenodo.org/record/4625167
286
305
  update_zenodo(RELATIONS_RECORD, paths)
287
306
 
288
307
 
289
- @main.command()
290
- @verbose_option
291
- @directory_option
292
- @zenodo_option
293
- @force_option
294
- @no_strict_option
295
- def properties(directory: str, zenodo: bool, force: bool, no_strict: bool):
308
+ @database_annotate
309
+ def edges(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
310
+ """Make the edges dump."""
311
+ with logging_redirect_tqdm():
312
+ it = _iter_edges(**kwargs)
313
+ db_output_helper(
314
+ it,
315
+ "edges",
316
+ (
317
+ ":START_ID",
318
+ ":TYPE",
319
+ ":END_ID",
320
+ "provenance",
321
+ ),
322
+ directory=directory,
323
+ )
324
+ if zenodo:
325
+ raise NotImplementedError
326
+
327
+
328
+ @database_annotate
329
+ def properties(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
296
330
  """Make the properties dump."""
297
331
  with logging_redirect_tqdm():
332
+ it = _iter_properties(**kwargs)
298
333
  paths = db_output_helper(
299
- _iter_properties,
334
+ it,
300
335
  "properties",
301
336
  ("prefix", "identifier", "property", "value"),
302
- directory=directory,
303
- force=force,
304
- strict=not no_strict,
305
337
  summary_detailed=(0, 2), # second column corresponds to property type
338
+ directory=directory,
306
339
  )
307
340
  if zenodo:
308
341
  # see https://zenodo.org/record/4625172
309
342
  update_zenodo(PROPERTIES_RECORD, paths)
310
343
 
311
344
 
312
- @main.command()
313
- @verbose_option
314
- @directory_option
315
- @zenodo_option
316
- @force_option
317
- @no_strict_option
318
- def xrefs(directory: str, zenodo: bool, force: bool, no_strict: bool):
345
+ @database_annotate
346
+ def xrefs(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
319
347
  """Make the prefix-identifier-xref dump."""
348
+ warnings.warn("Use pyobo.database.mappings instead", DeprecationWarning, stacklevel=2)
320
349
  with logging_redirect_tqdm():
350
+ it = _iter_xrefs(**kwargs)
321
351
  paths = db_output_helper(
322
- _iter_xrefs,
352
+ it,
323
353
  "xrefs",
324
354
  ("prefix", "identifier", "xref_prefix", "xref_identifier", "provenance"),
325
- directory=directory,
326
- force=force,
327
- strict=not no_strict,
328
355
  summary_detailed=(0, 2), # second column corresponds to xref prefix
356
+ directory=directory,
329
357
  )
330
358
  if zenodo:
331
359
  # see https://zenodo.org/record/4021477
332
360
  update_zenodo(JAVERT_RECORD, paths)
333
361
 
334
362
 
363
+ @database_annotate
364
+ def mappings(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
365
+ """Make the SSSOM dump."""
366
+ columns = [
367
+ "subject_id",
368
+ "object_id",
369
+ "predicate_id",
370
+ "mapping_justification",
371
+ "mapping_source",
372
+ ]
373
+ with logging_redirect_tqdm():
374
+ it = _iter_mappings(**kwargs)
375
+ db_output_helper(
376
+ it,
377
+ "mappings",
378
+ columns,
379
+ directory=directory,
380
+ )
381
+ if zenodo:
382
+ raise NotImplementedError("need to do initial manual upload of SSSOM build")
383
+
384
+
335
385
  if __name__ == "__main__":
336
386
  logging.captureWarnings(True)
337
387
  with logging_redirect_tqdm():
@@ -0,0 +1,155 @@
1
+ """Pipeline for extracting all xrefs from OBO documents available."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import gzip
6
+ import logging
7
+ import warnings
8
+ from collections.abc import Iterable
9
+ from functools import partial
10
+ from typing import cast
11
+
12
+ from tqdm.auto import tqdm
13
+ from typing_extensions import Unpack
14
+
15
+ from ..api import (
16
+ get_edges_df,
17
+ get_id_definition_mapping,
18
+ get_id_name_mapping,
19
+ get_id_species_mapping,
20
+ get_id_synonyms_mapping,
21
+ get_id_to_alts,
22
+ get_mappings_df,
23
+ get_properties_df,
24
+ get_relations_df,
25
+ get_typedef_df,
26
+ get_xrefs_df,
27
+ )
28
+ from ..getters import IterHelperHelperDict, iter_helper, iter_helper_helper
29
+ from ..sources import pubchem
30
+ from ..sources.ncbi import ncbigene
31
+ from ..utils.path import ensure_path
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ def _iter_ncbigene(left: int, right: int) -> Iterable[tuple[str, str, str]]:
37
+ ncbi_path = ensure_path(ncbigene.PREFIX, url=ncbigene.GENE_INFO_URL)
38
+ with gzip.open(ncbi_path, "rt") as file:
39
+ next(file) # throw away the header
40
+ for line in tqdm(
41
+ file, desc=f"[{ncbigene.PREFIX}] extracting names", unit_scale=True, total=56_700_000
42
+ ):
43
+ parts = line.strip().split("\t")
44
+ yield ncbigene.PREFIX, parts[left], parts[right]
45
+
46
+
47
+ def _iter_names(leave: bool = False, **kwargs) -> Iterable[tuple[str, str, str]]:
48
+ """Iterate over all prefix-identifier-name triples we can get.
49
+
50
+ :param leave: should the tqdm be left behind?
51
+ """
52
+ yield from iter_helper(get_id_name_mapping, leave=leave, **kwargs)
53
+ yield from _iter_ncbigene(1, 2)
54
+ yield from _iter_pubchem_compound()
55
+
56
+
57
+ def _iter_pubchem_compound():
58
+ pcc_path = pubchem._ensure_cid_name_path()
59
+ with gzip.open(pcc_path, mode="rt", encoding="ISO-8859-1") as file:
60
+ for line in tqdm(
61
+ file, desc=f"[{pubchem.PREFIX}] extracting names", unit_scale=True, total=119_000_000
62
+ ):
63
+ identifier, name = line.strip().split("\t", 1)
64
+ yield pubchem.PREFIX, identifier, name
65
+
66
+
67
+ def _iter_species(
68
+ leave: bool = False, **kwargs: Unpack[IterHelperHelperDict]
69
+ ) -> Iterable[tuple[str, str, str]]:
70
+ """Iterate over all prefix-identifier-species triples we can get."""
71
+ yield from iter_helper(get_id_species_mapping, leave=leave, **kwargs)
72
+ # TODO ncbigene
73
+
74
+
75
+ def _iter_definitions(
76
+ leave: bool = False, **kwargs: Unpack[IterHelperHelperDict]
77
+ ) -> Iterable[tuple[str, str, str]]:
78
+ """Iterate over all prefix-identifier-descriptions triples we can get."""
79
+ yield from iter_helper(get_id_definition_mapping, leave=leave, **kwargs)
80
+ yield from _iter_ncbigene(1, 8)
81
+
82
+
83
+ def _iter_alts(
84
+ leave: bool = False, **kwargs: Unpack[IterHelperHelperDict]
85
+ ) -> Iterable[tuple[str, str, str]]:
86
+ for prefix, identifier, alts in iter_helper(get_id_to_alts, leave=leave, **kwargs):
87
+ for alt in alts:
88
+ yield prefix, identifier, alt
89
+
90
+
91
+ def _iter_synonyms(
92
+ leave: bool = False, **kwargs: Unpack[IterHelperHelperDict]
93
+ ) -> Iterable[tuple[str, str, str]]:
94
+ """Iterate over all prefix-identifier-synonym triples we can get.
95
+
96
+ :param leave: should the tqdm be left behind?
97
+ """
98
+ for prefix, identifier, synonyms in iter_helper(get_id_synonyms_mapping, leave=leave, **kwargs):
99
+ for synonym in synonyms:
100
+ yield prefix, identifier, synonym
101
+
102
+
103
+ def _iter_typedefs(**kwargs: Unpack[IterHelperHelperDict]) -> Iterable[tuple[str, str, str, str]]:
104
+ """Iterate over all prefix-identifier-name triples we can get."""
105
+ for prefix, df in iter_helper_helper(get_typedef_df, **kwargs):
106
+ for t in df.values:
107
+ if all(t):
108
+ yield cast(tuple[str, str, str, str], (prefix, *t))
109
+
110
+
111
+ def _iter_relations(
112
+ **kwargs: Unpack[IterHelperHelperDict],
113
+ ) -> Iterable[tuple[str, str, str, str, str, str]]:
114
+ for prefix, df in iter_helper_helper(get_relations_df, **kwargs):
115
+ for t in df.values:
116
+ if all(t):
117
+ yield cast(tuple[str, str, str, str, str, str], (prefix, *t))
118
+
119
+
120
+ def _iter_edges(**kwargs: Unpack[IterHelperHelperDict]) -> Iterable[tuple[str, str, str, str]]:
121
+ for prefix, df in iter_helper_helper(get_edges_df, **kwargs):
122
+ for row in df.values:
123
+ yield cast(tuple[str, str, str, str], (*row, prefix))
124
+
125
+
126
+ def _iter_properties(**kwargs: Unpack[IterHelperHelperDict]) -> Iterable[tuple[str, str, str, str]]:
127
+ for prefix, df in iter_helper_helper(get_properties_df, **kwargs):
128
+ for t in df.values:
129
+ if all(t):
130
+ yield cast(tuple[str, str, str, str], (prefix, *t))
131
+
132
+
133
+ def _iter_xrefs(
134
+ **kwargs: Unpack[IterHelperHelperDict],
135
+ ) -> Iterable[tuple[str, str, str, str, str]]:
136
+ warnings.warn(f"use {_iter_mappings.__name__} instead", DeprecationWarning, stacklevel=2)
137
+ it = iter_helper_helper(get_xrefs_df, **kwargs)
138
+ for prefix, df in it:
139
+ df.dropna(inplace=True)
140
+ for row in df.values:
141
+ if any(not element for element in row):
142
+ continue
143
+ yield cast(tuple[str, str, str, str, str], (prefix, *row, prefix))
144
+
145
+
146
+ def _iter_mappings(
147
+ **kwargs: Unpack[IterHelperHelperDict],
148
+ ) -> Iterable[tuple[str, str, str, str, str]]:
149
+ f = partial(get_mappings_df, names=False, include_mapping_source_column=True)
150
+ # hack in a name to the partial function object since
151
+ # it's used for the tqdm description in iter_helper_helper
152
+ f.__name__ = "get_mappings_df" # type:ignore
153
+ it = iter_helper_helper(f, **kwargs)
154
+ for _prefix, df in it:
155
+ yield from df.values