pyobo 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. pyobo/.DS_Store +0 -0
  2. pyobo/__init__.py +95 -20
  3. pyobo/__main__.py +0 -0
  4. pyobo/api/__init__.py +81 -10
  5. pyobo/api/alts.py +52 -42
  6. pyobo/api/combine.py +39 -0
  7. pyobo/api/edges.py +68 -0
  8. pyobo/api/hierarchy.py +231 -203
  9. pyobo/api/metadata.py +14 -19
  10. pyobo/api/names.py +207 -127
  11. pyobo/api/properties.py +117 -113
  12. pyobo/api/relations.py +68 -94
  13. pyobo/api/species.py +24 -21
  14. pyobo/api/typedefs.py +11 -11
  15. pyobo/api/utils.py +66 -13
  16. pyobo/api/xrefs.py +108 -114
  17. pyobo/cli/__init__.py +0 -0
  18. pyobo/cli/cli.py +35 -50
  19. pyobo/cli/database.py +183 -161
  20. pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
  21. pyobo/cli/lookup.py +163 -195
  22. pyobo/cli/utils.py +19 -6
  23. pyobo/constants.py +102 -3
  24. pyobo/getters.py +196 -118
  25. pyobo/gilda_utils.py +79 -200
  26. pyobo/identifier_utils/__init__.py +41 -0
  27. pyobo/identifier_utils/api.py +296 -0
  28. pyobo/identifier_utils/model.py +130 -0
  29. pyobo/identifier_utils/preprocessing.json +812 -0
  30. pyobo/identifier_utils/preprocessing.py +61 -0
  31. pyobo/identifier_utils/relations/__init__.py +8 -0
  32. pyobo/identifier_utils/relations/api.py +162 -0
  33. pyobo/identifier_utils/relations/data.json +5824 -0
  34. pyobo/identifier_utils/relations/data_owl.json +57 -0
  35. pyobo/identifier_utils/relations/data_rdf.json +1 -0
  36. pyobo/identifier_utils/relations/data_rdfs.json +7 -0
  37. pyobo/mocks.py +9 -6
  38. pyobo/ner/__init__.py +9 -0
  39. pyobo/ner/api.py +72 -0
  40. pyobo/ner/normalizer.py +33 -0
  41. pyobo/obographs.py +43 -39
  42. pyobo/plugins.py +5 -4
  43. pyobo/py.typed +0 -0
  44. pyobo/reader.py +1358 -395
  45. pyobo/reader_utils.py +155 -0
  46. pyobo/resource_utils.py +42 -22
  47. pyobo/resources/__init__.py +0 -0
  48. pyobo/resources/goc.py +75 -0
  49. pyobo/resources/goc.tsv +188 -0
  50. pyobo/resources/ncbitaxon.py +4 -5
  51. pyobo/resources/ncbitaxon.tsv.gz +0 -0
  52. pyobo/resources/ro.py +3 -2
  53. pyobo/resources/ro.tsv +0 -0
  54. pyobo/resources/so.py +0 -0
  55. pyobo/resources/so.tsv +0 -0
  56. pyobo/sources/README.md +12 -8
  57. pyobo/sources/__init__.py +52 -29
  58. pyobo/sources/agrovoc.py +0 -0
  59. pyobo/sources/antibodyregistry.py +11 -12
  60. pyobo/sources/bigg/__init__.py +13 -0
  61. pyobo/sources/bigg/bigg_compartment.py +81 -0
  62. pyobo/sources/bigg/bigg_metabolite.py +229 -0
  63. pyobo/sources/bigg/bigg_model.py +46 -0
  64. pyobo/sources/bigg/bigg_reaction.py +77 -0
  65. pyobo/sources/biogrid.py +1 -2
  66. pyobo/sources/ccle.py +7 -12
  67. pyobo/sources/cgnc.py +0 -5
  68. pyobo/sources/chebi.py +1 -1
  69. pyobo/sources/chembl/__init__.py +9 -0
  70. pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
  71. pyobo/sources/chembl/chembl_target.py +160 -0
  72. pyobo/sources/civic_gene.py +55 -15
  73. pyobo/sources/clinicaltrials.py +160 -0
  74. pyobo/sources/complexportal.py +24 -24
  75. pyobo/sources/conso.py +14 -22
  76. pyobo/sources/cpt.py +0 -0
  77. pyobo/sources/credit.py +1 -9
  78. pyobo/sources/cvx.py +27 -5
  79. pyobo/sources/depmap.py +9 -12
  80. pyobo/sources/dictybase_gene.py +2 -7
  81. pyobo/sources/drugbank/__init__.py +9 -0
  82. pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
  83. pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
  84. pyobo/sources/drugcentral.py +17 -13
  85. pyobo/sources/expasy.py +31 -34
  86. pyobo/sources/famplex.py +13 -18
  87. pyobo/sources/flybase.py +3 -8
  88. pyobo/sources/gard.py +62 -0
  89. pyobo/sources/geonames/__init__.py +9 -0
  90. pyobo/sources/geonames/features.py +28 -0
  91. pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
  92. pyobo/sources/geonames/utils.py +115 -0
  93. pyobo/sources/gmt_utils.py +6 -7
  94. pyobo/sources/go.py +20 -13
  95. pyobo/sources/gtdb.py +154 -0
  96. pyobo/sources/gwascentral/__init__.py +9 -0
  97. pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
  98. pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
  99. pyobo/sources/hgnc/__init__.py +9 -0
  100. pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
  101. pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
  102. pyobo/sources/icd/__init__.py +9 -0
  103. pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
  104. pyobo/sources/icd/icd11.py +148 -0
  105. pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
  106. pyobo/sources/interpro.py +4 -9
  107. pyobo/sources/itis.py +0 -5
  108. pyobo/sources/kegg/__init__.py +0 -0
  109. pyobo/sources/kegg/api.py +16 -38
  110. pyobo/sources/kegg/genes.py +9 -20
  111. pyobo/sources/kegg/genome.py +1 -7
  112. pyobo/sources/kegg/pathway.py +9 -21
  113. pyobo/sources/mesh.py +58 -24
  114. pyobo/sources/mgi.py +3 -10
  115. pyobo/sources/mirbase/__init__.py +11 -0
  116. pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
  117. pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
  118. pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
  119. pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
  120. pyobo/sources/msigdb.py +74 -39
  121. pyobo/sources/ncbi/__init__.py +9 -0
  122. pyobo/sources/ncbi/ncbi_gc.py +162 -0
  123. pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
  124. pyobo/sources/nih_reporter.py +60 -0
  125. pyobo/sources/nlm/__init__.py +9 -0
  126. pyobo/sources/nlm/nlm_catalog.py +48 -0
  127. pyobo/sources/nlm/nlm_publisher.py +36 -0
  128. pyobo/sources/nlm/utils.py +116 -0
  129. pyobo/sources/npass.py +6 -8
  130. pyobo/sources/omim_ps.py +10 -3
  131. pyobo/sources/pathbank.py +4 -8
  132. pyobo/sources/pfam/__init__.py +9 -0
  133. pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
  134. pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
  135. pyobo/sources/pharmgkb/__init__.py +15 -0
  136. pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
  137. pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
  138. pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
  139. pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
  140. pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
  141. pyobo/sources/pharmgkb/utils.py +86 -0
  142. pyobo/sources/pid.py +1 -6
  143. pyobo/sources/pombase.py +6 -10
  144. pyobo/sources/pubchem.py +4 -9
  145. pyobo/sources/reactome.py +5 -11
  146. pyobo/sources/rgd.py +11 -16
  147. pyobo/sources/rhea.py +37 -36
  148. pyobo/sources/ror.py +69 -42
  149. pyobo/sources/selventa/__init__.py +0 -0
  150. pyobo/sources/selventa/schem.py +4 -7
  151. pyobo/sources/selventa/scomp.py +1 -6
  152. pyobo/sources/selventa/sdis.py +4 -7
  153. pyobo/sources/selventa/sfam.py +1 -6
  154. pyobo/sources/sgd.py +6 -11
  155. pyobo/sources/signor/__init__.py +7 -0
  156. pyobo/sources/signor/download.py +41 -0
  157. pyobo/sources/signor/signor_complexes.py +105 -0
  158. pyobo/sources/slm.py +12 -15
  159. pyobo/sources/umls/__init__.py +7 -1
  160. pyobo/sources/umls/__main__.py +0 -0
  161. pyobo/sources/umls/get_synonym_types.py +20 -4
  162. pyobo/sources/umls/sty.py +57 -0
  163. pyobo/sources/umls/synonym_types.tsv +1 -1
  164. pyobo/sources/umls/umls.py +18 -22
  165. pyobo/sources/unimod.py +46 -0
  166. pyobo/sources/uniprot/__init__.py +1 -1
  167. pyobo/sources/uniprot/uniprot.py +40 -32
  168. pyobo/sources/uniprot/uniprot_ptm.py +4 -34
  169. pyobo/sources/utils.py +3 -2
  170. pyobo/sources/wikipathways.py +7 -10
  171. pyobo/sources/zfin.py +5 -10
  172. pyobo/ssg/__init__.py +12 -16
  173. pyobo/ssg/base.html +0 -0
  174. pyobo/ssg/index.html +26 -13
  175. pyobo/ssg/term.html +12 -2
  176. pyobo/ssg/typedef.html +0 -0
  177. pyobo/struct/__init__.py +54 -8
  178. pyobo/struct/functional/__init__.py +1 -0
  179. pyobo/struct/functional/dsl.py +2572 -0
  180. pyobo/struct/functional/macros.py +423 -0
  181. pyobo/struct/functional/obo_to_functional.py +385 -0
  182. pyobo/struct/functional/ontology.py +270 -0
  183. pyobo/struct/functional/utils.py +112 -0
  184. pyobo/struct/reference.py +331 -136
  185. pyobo/struct/struct.py +1413 -643
  186. pyobo/struct/struct_utils.py +1078 -0
  187. pyobo/struct/typedef.py +162 -210
  188. pyobo/struct/utils.py +12 -5
  189. pyobo/struct/vocabulary.py +138 -0
  190. pyobo/utils/__init__.py +0 -0
  191. pyobo/utils/cache.py +13 -11
  192. pyobo/utils/io.py +17 -31
  193. pyobo/utils/iter.py +5 -5
  194. pyobo/utils/misc.py +41 -53
  195. pyobo/utils/ndex_utils.py +0 -0
  196. pyobo/utils/path.py +76 -70
  197. pyobo/version.py +3 -3
  198. {pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/METADATA +228 -229
  199. pyobo-0.12.0.dist-info/RECORD +202 -0
  200. pyobo-0.12.0.dist-info/WHEEL +4 -0
  201. {pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
  202. pyobo-0.12.0.dist-info/licenses/LICENSE +21 -0
  203. pyobo/aws.py +0 -162
  204. pyobo/cli/aws.py +0 -47
  205. pyobo/identifier_utils.py +0 -142
  206. pyobo/normalizer.py +0 -232
  207. pyobo/registries/__init__.py +0 -16
  208. pyobo/registries/metaregistry.json +0 -507
  209. pyobo/registries/metaregistry.py +0 -135
  210. pyobo/sources/icd11.py +0 -105
  211. pyobo/xrefdb/__init__.py +0 -1
  212. pyobo/xrefdb/canonicalizer.py +0 -214
  213. pyobo/xrefdb/priority.py +0 -59
  214. pyobo/xrefdb/sources/__init__.py +0 -60
  215. pyobo/xrefdb/sources/biomappings.py +0 -36
  216. pyobo/xrefdb/sources/cbms2019.py +0 -91
  217. pyobo/xrefdb/sources/chembl.py +0 -83
  218. pyobo/xrefdb/sources/compath.py +0 -82
  219. pyobo/xrefdb/sources/famplex.py +0 -64
  220. pyobo/xrefdb/sources/gilda.py +0 -50
  221. pyobo/xrefdb/sources/intact.py +0 -113
  222. pyobo/xrefdb/sources/ncit.py +0 -133
  223. pyobo/xrefdb/sources/pubchem.py +0 -27
  224. pyobo/xrefdb/sources/wikidata.py +0 -116
  225. pyobo-0.11.2.dist-info/RECORD +0 -157
  226. pyobo-0.11.2.dist-info/WHEEL +0 -5
  227. pyobo-0.11.2.dist-info/top_level.txt +0 -1
pyobo/sources/mesh.py CHANGED
@@ -4,22 +4,25 @@ import datetime
4
4
  import itertools as itt
5
5
  import logging
6
6
  import re
7
+ import time
7
8
  from collections.abc import Collection, Iterable, Mapping
8
- from typing import Any, Optional
9
+ from pathlib import Path
10
+ from typing import Any
9
11
  from xml.etree.ElementTree import Element
10
12
 
13
+ from lxml import etree
11
14
  from tqdm.auto import tqdm
12
15
 
13
16
  from pyobo.api.utils import safe_get_version
14
17
  from pyobo.identifier_utils import standardize_ec
15
18
  from pyobo.struct import Obo, Reference, Synonym, Term
16
19
  from pyobo.utils.cache import cached_json, cached_mapping
17
- from pyobo.utils.io import parse_xml_gz
18
20
  from pyobo.utils.path import ensure_path, prefix_directory_join
19
21
 
20
22
  __all__ = [
21
23
  "MeSHGetter",
22
24
  "get_mesh_category_curies",
25
+ "get_mesh_category_references",
23
26
  ]
24
27
 
25
28
  logger = logging.getLogger(__name__)
@@ -30,12 +33,21 @@ CAS_RE = re.compile(r"^\d{1,7}\-\d{2}\-\d$")
30
33
  UNII_RE = re.compile(r"[0-9A-Za-z]{10}$")
31
34
 
32
35
 
36
+ def _get_xml_root(path: Path) -> Element:
37
+ """Parse an XML file from a path to a GZIP file."""
38
+ t = time.time()
39
+ logger.info("parsing xml from %s", path)
40
+ tree = etree.parse(path.as_posix()) # type:ignore
41
+ logger.info("parsed xml in %.2f seconds", time.time() - t)
42
+ return tree.getroot()
43
+
44
+
33
45
  class MeSHGetter(Obo):
34
46
  """An ontology representation of the Medical Subject Headings."""
35
47
 
36
48
  ontology = bioversions_key = PREFIX
37
49
 
38
- def _get_version(self) -> Optional[str]:
50
+ def _get_version(self) -> str | None:
39
51
  return NOW_YEAR
40
52
 
41
53
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
@@ -43,11 +55,6 @@ class MeSHGetter(Obo):
43
55
  return get_terms(version=self._version_or_raise, force=force)
44
56
 
45
57
 
46
- def get_obo(force: bool = False) -> Obo:
47
- """Get MeSH as OBO."""
48
- return MeSHGetter(force=force)
49
-
50
-
51
58
  def get_tree_to_mesh_id(version: str) -> Mapping[str, str]:
52
59
  """Get a mapping from MeSH tree numbers to their MeSH identifiers."""
53
60
 
@@ -110,12 +117,12 @@ def ensure_mesh_descriptors(
110
117
  """Get the parsed MeSH dictionary, and cache it if it wasn't already."""
111
118
 
112
119
  @cached_json(path=prefix_directory_join(PREFIX, name="desc.json", version=version), force=force)
113
- def _inner():
120
+ def _inner() -> list[dict[str, Any]]:
114
121
  path = ensure_path(PREFIX, url=get_descriptors_url(version), version=version)
115
- root = parse_xml_gz(path)
122
+ root = _get_xml_root(path)
116
123
  return get_descriptor_records(root, id_key="DescriptorUI", name_key="DescriptorName/String")
117
124
 
118
- return _inner()
125
+ return _inner() # type:ignore
119
126
 
120
127
 
121
128
  def get_descriptors_url(version: str) -> str:
@@ -136,14 +143,14 @@ def ensure_mesh_supplemental_records(version: str, force: bool = False) -> list[
136
143
  """Get the parsed MeSH dictionary, and cache it if it wasn't already."""
137
144
 
138
145
  @cached_json(path=prefix_directory_join(PREFIX, name="supp.json", version=version), force=force)
139
- def _inner():
146
+ def _inner() -> list[dict[str, Any]]:
140
147
  path = ensure_path(PREFIX, url=get_supplemental_url(version), version=version)
141
- root = parse_xml_gz(path)
148
+ root = _get_xml_root(path)
142
149
  return get_descriptor_records(
143
150
  root, id_key="SupplementalRecordUI", name_key="SupplementalRecordName/String"
144
151
  )
145
152
 
146
- return _inner()
153
+ return _inner() # type:ignore
147
154
 
148
155
 
149
156
  def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict[str, Any]]:
@@ -169,7 +176,7 @@ def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict
169
176
  parents_descriptor_uis = set()
170
177
  for tree_number in descriptor["tree_numbers"]:
171
178
  try:
172
- parent_tn, self_tn = tree_number.rsplit(".", 1)
179
+ parent_tn, _self_tn = tree_number.rsplit(".", 1)
173
180
  except ValueError:
174
181
  logger.debug("No dot for %s", tree_number)
175
182
  continue
@@ -185,7 +192,7 @@ def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict
185
192
  return rv
186
193
 
187
194
 
188
- def get_scope_note(descriptor_record) -> Optional[str]:
195
+ def get_scope_note(descriptor_record) -> str | None:
189
196
  """Get the scope note from the preferred concept in a term's record."""
190
197
  if isinstance(descriptor_record, dict):
191
198
  # necessary for pre-2023 data
@@ -207,9 +214,10 @@ def get_descriptor_record(
207
214
  """Get descriptor records from the main element.
208
215
 
209
216
  :param element: An XML element
210
- :param id_key: For descriptors, set to 'DescriptorUI'. For supplement, set to 'SupplementalRecordUI'
211
- :param name_key: For descriptors, set to 'DescriptorName/String'.
212
- For supplement, set to 'SupplementalRecordName/String'
217
+ :param id_key: For descriptors, set to 'DescriptorUI'. For supplement, set to
218
+ 'SupplementalRecordUI'
219
+ :param name_key: For descriptors, set to 'DescriptorName/String'. For supplement,
220
+ set to 'SupplementalRecordName/String'
213
221
  """
214
222
  concepts = get_concept_records(element)
215
223
  scope_note = get_scope_note(concepts)
@@ -248,7 +256,7 @@ def _get_xrefs(element: Element) -> list[tuple[str, str]]:
248
256
  elif registry_number.startswith("txid"):
249
257
  rv.append(("NCBITaxon", registry_number[4:]))
250
258
  elif registry_number.startswith("EC "):
251
- rv.append(("eccode", standardize_ec(registry_number[3:])))
259
+ rv.append(("ec", standardize_ec(registry_number[3:])))
252
260
  elif CAS_RE.fullmatch(registry_number):
253
261
  rv.append(("cas", registry_number))
254
262
  elif UNII_RE.fullmatch(registry_number):
@@ -319,16 +327,40 @@ def _get_descriptor_qualifiers(descriptor: Element) -> list[Mapping[str, str]]:
319
327
 
320
328
 
321
329
  def get_mesh_category_curies(
322
- letter: str, *, skip: Optional[Collection[str]] = None, version: Optional[str] = None
330
+ letter: str, *, skip: Collection[str] | None = None, version: str | None = None
323
331
  ) -> list[str]:
324
332
  """Get the MeSH LUIDs for a category, by letter (e.g., "A").
325
333
 
326
334
  :param letter: The MeSH tree, A for anatomy, C for disease, etc.
327
335
  :param skip: An optional collection of MeSH tree codes to skip, such as "A03"
328
336
  :param version: The MeSH version to use. Defaults to latest
337
+
329
338
  :returns: A list of MeSH CURIE strings for the top level of each MeSH tree.
330
339
 
331
- .. seealso:: https://meshb.nlm.nih.gov/treeView
340
+ .. seealso::
341
+
342
+ https://meshb.nlm.nih.gov/treeView
343
+ """
344
+ return [
345
+ reference.curie
346
+ for reference in get_mesh_category_references(letter=letter, skip=skip, version=version)
347
+ ]
348
+
349
+
350
+ def get_mesh_category_references(
351
+ letter: str, *, skip: Collection[str] | None = None, version: str | None = None
352
+ ) -> list[Reference]:
353
+ """Get the MeSH references for a category, by letter (e.g., "A").
354
+
355
+ :param letter: The MeSH tree, A for anatomy, C for disease, etc.
356
+ :param skip: An optional collection of MeSH tree codes to skip, such as "A03"
357
+ :param version: The MeSH version to use. Defaults to latest
358
+
359
+ :returns: A list of MeSH references for the top level of each MeSH tree.
360
+
361
+ .. seealso::
362
+
363
+ https://meshb.nlm.nih.gov/treeView
332
364
  """
333
365
  if version is None:
334
366
  version = safe_get_version("mesh")
@@ -340,10 +372,12 @@ def get_mesh_category_curies(
340
372
  continue
341
373
  mesh_id = tree_to_mesh.get(key)
342
374
  if mesh_id is None:
375
+ # as soon as we get to a missing ID, we don't
376
+ # have to go any further
343
377
  break
344
- rv.append(f"mesh:{mesh_id}")
378
+ rv.append(Reference(prefix="mesh", identifier=mesh_id))
345
379
  return rv
346
380
 
347
381
 
348
382
  if __name__ == "__main__":
349
- get_obo(force=True).write_default(force=True, write_obo=True)
383
+ MeSHGetter.cli()
pyobo/sources/mgi.py CHANGED
@@ -12,7 +12,6 @@ from pyobo.struct.typedef import exact_match
12
12
  from ..struct import (
13
13
  Obo,
14
14
  Reference,
15
- Synonym,
16
15
  Term,
17
16
  from_species,
18
17
  has_gene_product,
@@ -35,8 +34,7 @@ ENSEMBL_XREFS_URL = "http://www.informatics.jax.org/downloads/reports/MRK_ENSEMB
35
34
  class MGIGetter(Obo):
36
35
  """An ontology representation of MGI's mouse gene nomenclature."""
37
36
 
38
- ontology = PREFIX
39
- dynamic_version = True
37
+ ontology = bioversions_key = PREFIX
40
38
  typedefs = [from_species, has_gene_product, transcribes_to, exact_match]
41
39
 
42
40
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
@@ -44,11 +42,6 @@ class MGIGetter(Obo):
44
42
  return get_terms(force=force)
45
43
 
46
44
 
47
- def get_obo(force: bool = False) -> Obo:
48
- """Get MGI as OBO."""
49
- return MGIGetter(force=force)
50
-
51
-
52
45
  COLUMNS = ["MGI Accession ID", "Marker Symbol", "Marker Name"]
53
46
 
54
47
 
@@ -159,7 +152,7 @@ def get_terms(force: bool = False) -> Iterable[Term]:
159
152
  )
160
153
  if identifier in mgi_to_synonyms:
161
154
  for synonym in mgi_to_synonyms[identifier]:
162
- term.append_synonym(Synonym(name=synonym))
155
+ term.append_synonym(synonym)
163
156
  if identifier in mgi_to_entrez_id:
164
157
  term.append_exact_match(
165
158
  Reference(prefix="ncbigene", identifier=mgi_to_entrez_id[identifier])
@@ -179,4 +172,4 @@ def get_terms(force: bool = False) -> Iterable[Term]:
179
172
 
180
173
 
181
174
  if __name__ == "__main__":
182
- get_obo(force=True).write_default(write_obo=True, write_obograph=True, use_tqdm=True)
175
+ MGIGetter.cli()
@@ -0,0 +1,11 @@
1
+ """Resources from miRBase."""
2
+
3
+ from .mirbase import MiRBaseGetter
4
+ from .mirbase_family import MiRBaseFamilyGetter
5
+ from .mirbase_mature import MiRBaseMatureGetter
6
+
7
+ __all__ = [
8
+ "MiRBaseFamilyGetter",
9
+ "MiRBaseGetter",
10
+ "MiRBaseMatureGetter",
11
+ ]
@@ -6,12 +6,13 @@ from collections.abc import Iterable, Mapping
6
6
 
7
7
  from tqdm.auto import tqdm
8
8
 
9
- from pyobo.sources.mirbase_constants import BASE_URL, _assert_frozen_version
10
9
  from pyobo.struct import Obo, Reference, Synonym, Term, from_species
11
10
  from pyobo.struct.typedef import has_mature
12
11
  from pyobo.utils.cache import cached_mapping
13
12
  from pyobo.utils.path import ensure_df, ensure_path, prefix_directory_join
14
13
 
14
+ from .mirbase_constants import BASE_URL, _assert_frozen_version
15
+
15
16
  __all__ = [
16
17
  "MiRBaseGetter",
17
18
  ]
@@ -41,11 +42,6 @@ class MiRBaseGetter(Obo):
41
42
  return get_terms(version=self._version_or_raise, force=force)
42
43
 
43
44
 
44
- def get_obo(force: bool = False) -> Obo:
45
- """Get miRBase as OBO."""
46
- return MiRBaseGetter(force=force)
47
-
48
-
49
45
  def get_terms(version: str, force: bool = False) -> list[Term]:
50
46
  """Parse miRNA data from filepath and convert it to dictionary."""
51
47
  _assert_frozen_version(version)
@@ -54,7 +50,7 @@ def get_terms(version: str, force: bool = False) -> list[Term]:
54
50
 
55
51
  file_handle = (
56
52
  gzip.open(definitions_path, "rt")
57
- if definitions_path.endswith(".gz")
53
+ if definitions_path.suffix.endswith(".gz")
58
54
  else open(definitions_path)
59
55
  )
60
56
  with file_handle as file:
@@ -101,7 +97,7 @@ def _process_definitions_lines(
101
97
 
102
98
  for group in tqdm(groups, desc=f"mapping {PREFIX}"):
103
99
  name = group[0][5:23].strip()
104
- qualifier, dtype, species_code, length = map(
100
+ _qualifier, _dtype, species_code, _length = map(
105
101
  str.strip, group[0][23:].strip().rstrip(".").split(";")
106
102
  )
107
103
  identifier = group[2][3:-2].strip()
@@ -134,7 +130,7 @@ def _process_definitions_lines(
134
130
  xref_prefix, xref_identifier, xref_label = map(str.strip, line.split(";"))
135
131
  xref_prefix = xref_prefix.lower()
136
132
  xref_prefix = xref_mapping.get(xref_prefix, xref_prefix)
137
- if xref_prefix == "pictar":
133
+ if xref_prefix in {"pictar", "mir", "mirte"}:
138
134
  continue
139
135
 
140
136
  try:
@@ -157,7 +153,8 @@ def _process_definitions_lines(
157
153
 
158
154
  species_identifier, species_name = organisms[species_code]
159
155
  term.set_species(species_identifier, species_name)
160
- term.extend_relationship(has_mature, matures)
156
+ for mature in matures:
157
+ term.append_relationship(has_mature, mature)
161
158
 
162
159
  yield term
163
160
 
@@ -199,4 +196,4 @@ def get_mature_id_to_name(version: str) -> Mapping[str, str]:
199
196
 
200
197
 
201
198
  if __name__ == "__main__":
202
- get_obo(force=True).write_default(force=True, write_obograph=True, write_obo=True)
199
+ MiRBaseGetter.cli()
@@ -5,12 +5,13 @@ from collections.abc import Iterable
5
5
  import pandas as pd
6
6
  from tqdm.auto import tqdm
7
7
 
8
- from pyobo.sources.mirbase_constants import (
8
+ from pyobo.struct import Obo, Reference, Term, has_member
9
+
10
+ from .mirbase_constants import (
9
11
  get_premature_df,
10
12
  get_premature_family_df,
11
13
  get_premature_to_prefamily_df,
12
14
  )
13
- from pyobo.struct import Obo, Reference, Term, has_member
14
15
 
15
16
  __all__ = [
16
17
  "MiRBaseFamilyGetter",
@@ -31,11 +32,6 @@ class MiRBaseFamilyGetter(Obo):
31
32
  return iter_terms(version=self._version_or_raise, force=force)
32
33
 
33
34
 
34
- def get_obo(force: bool = False) -> Obo:
35
- """Get miRBase family as OBO."""
36
- return MiRBaseFamilyGetter(force=force)
37
-
38
-
39
35
  def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
40
36
  """Get miRBase family terms."""
41
37
  df = get_df(version, force=force)
@@ -66,4 +62,4 @@ def get_df(version: str, force: bool = False) -> pd.DataFrame:
66
62
 
67
63
 
68
64
  if __name__ == "__main__":
69
- get_obo().write_default(use_tqdm=True, write_obo=True, force=True)
65
+ MiRBaseFamilyGetter.cli()
@@ -5,9 +5,10 @@ from collections.abc import Iterable
5
5
  import pandas as pd
6
6
  from tqdm.auto import tqdm
7
7
 
8
- from pyobo.sources.mirbase_constants import get_mature_df
9
8
  from pyobo.struct import Obo, Reference, Synonym, Term
10
9
 
10
+ from .mirbase_constants import get_mature_df
11
+
11
12
  __all__ = [
12
13
  "MiRBaseMatureGetter",
13
14
  ]
@@ -26,11 +27,6 @@ class MiRBaseMatureGetter(Obo):
26
27
  return iter_terms(version=self._version_or_raise, force=force)
27
28
 
28
29
 
29
- def get_obo(force: bool = False) -> Obo:
30
- """Get miRBase mature as OBO."""
31
- return MiRBaseMatureGetter(force=force)
32
-
33
-
34
30
  def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
35
31
  """Get miRBase mature terms."""
36
32
  df = get_mature_df(version, force=force)
@@ -49,4 +45,4 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
49
45
 
50
46
 
51
47
  if __name__ == "__main__":
52
- get_obo().write_default(write_obo=True, write_obograph=True, use_tqdm=True)
48
+ MiRBaseMatureGetter.cli()
pyobo/sources/msigdb.py CHANGED
@@ -1,41 +1,55 @@
1
1
  """Parsers for MSig."""
2
2
 
3
3
  import logging
4
+ import zipfile
4
5
  from collections.abc import Iterable
5
- from typing import Optional
6
6
 
7
- from lxml.etree import ElementTree
7
+ from lxml import etree
8
+ from pydantic import ValidationError
8
9
  from tqdm.auto import tqdm
9
10
 
10
- from ..struct import Obo, Reference, Term, has_participant
11
- from ..utils.path import ensure_path
12
-
13
- logger = logging.getLogger(__name__)
11
+ from pyobo.struct import Obo, Reference, Term, TypeDef, has_citation, has_participant
12
+ from pyobo.utils.path import ensure_path
14
13
 
15
14
  __all__ = [
16
15
  "MSigDBGetter",
17
16
  ]
18
17
 
18
+ logger = logging.getLogger(__name__)
19
+
19
20
  PREFIX = "msigdb"
20
21
  BASE_URL = "https://data.broadinstitute.org/gsea-msigdb/msigdb/release"
21
22
 
23
+ CATEGORY_CODE = TypeDef.default(PREFIX, "category_code", name="category code", is_metadata_tag=True)
24
+ SUB_CATEGORY_CODE = TypeDef.default(
25
+ PREFIX, "sub_category_code", name="sub-category code", is_metadata_tag=True
26
+ )
27
+ CONTRIBUTOR = TypeDef.default(PREFIX, "contributor", name="contributor", is_metadata_tag=True)
28
+ EXACT_SOURCE = TypeDef.default(PREFIX, "exact_source", name="exact source", is_metadata_tag=True)
29
+ EXTERNAL_DETAILS_URL = TypeDef.default(
30
+ PREFIX, "external_details_url", name="external details URL", is_metadata_tag=True
31
+ )
32
+
33
+ PROPERTIES = [
34
+ ("CATEGORY_CODE", CATEGORY_CODE),
35
+ ("SUB_CATEGORY_CODE", SUB_CATEGORY_CODE),
36
+ ("CONTRIBUTOR", CONTRIBUTOR),
37
+ ("EXACT_SOURCE", EXACT_SOURCE),
38
+ ("EXTERNAL_DETAILS_URL", EXTERNAL_DETAILS_URL),
39
+ ]
40
+
22
41
 
23
42
  class MSigDBGetter(Obo):
24
43
  """An ontology representation of MMSigDB's gene set nomenclature."""
25
44
 
26
45
  ontology = bioversions_key = PREFIX
27
- typedefs = [has_participant]
46
+ typedefs = [has_participant, has_citation, *(p for _, p in PROPERTIES)]
28
47
 
29
48
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
30
49
  """Iterate over terms in the ontology."""
31
50
  return iter_terms(version=self._version_or_raise, force=force)
32
51
 
33
52
 
34
- def get_obo(force: bool = False) -> Obo:
35
- """Get MSIG as Obo."""
36
- return MSigDBGetter(force=force)
37
-
38
-
39
53
  _SPECIES = {
40
54
  "Homo sapiens": "9606",
41
55
  "Mus musculus": "10090",
@@ -49,24 +63,36 @@ GO_URL_PREFIX = "http://amigo.geneontology.org/amigo/term/GO:"
49
63
  KEGG_URL_PREFIX = "http://www.genome.jp/kegg/pathway/hsa/"
50
64
 
51
65
 
52
- def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
53
- """Get MSigDb terms."""
54
- xml_url = f"{BASE_URL}/{version}.Hs/msigdb_v{version}.Hs.xml"
66
+ def _iter_entries(version: str, force: bool = False):
67
+ xml_url = f"{BASE_URL}/{version}.Hs/msigdb_v{version}.Hs.xml.zip"
55
68
  path = ensure_path(prefix=PREFIX, url=xml_url, version=version, force=force)
56
- tree = ElementTree.parse(path)
69
+ with zipfile.ZipFile(path, "r") as zf:
70
+ with zf.open(f"msigdb_v{version}.Hs.xml") as file:
71
+ for _ in range(3):
72
+ next(file)
73
+ # from here on out, every row except the last is a GENESET
74
+ for i, line_bytes in enumerate(file, start=4):
75
+ line = line_bytes.decode("utf8").strip()
76
+ if not line.startswith("<GENESET"):
77
+ continue
78
+ try:
79
+ tree = etree.fromstring(line)
80
+ except etree.XMLSyntaxError as e:
81
+ # this is the result of faulty encoding in XML - maybe they
82
+ # wrote XML with their own string formatting instead of using a
83
+ # library.
84
+ logger.debug("[%s] failed on line %s: %s", PREFIX, i, e)
85
+ else:
86
+ yield tree
57
87
 
58
- for entry in tqdm(tree.getroot(), desc=f"{PREFIX} v{version}", unit_scale=True):
88
+
89
+ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
90
+ """Get MSigDb terms."""
91
+ entries = _iter_entries(version=version, force=force)
92
+ for entry in tqdm(entries, desc=f"{PREFIX} v{version}", unit_scale=True):
59
93
  attrib = dict(entry.attrib)
60
94
  tax_id = _SPECIES[attrib["ORGANISM"]]
61
95
 
62
- reference_id = attrib["PMID"].strip()
63
- if not reference_id:
64
- reference = None
65
- elif reference_id.startswith("GSE"):
66
- reference = Reference(prefix="gse", identifier=reference_id)
67
- else:
68
- reference = Reference(prefix="pubmed", identifier=reference_id)
69
-
70
96
  # NONE have the entry "HISTORICAL_NAME"
71
97
  # historical_name = thing.attrib['HISTORICAL_NAME']
72
98
 
@@ -77,19 +103,20 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
77
103
  term = Term(
78
104
  reference=Reference(prefix=PREFIX, identifier=identifier, name=name),
79
105
  definition=_get_definition(attrib),
80
- provenance=[] if reference is None else [reference],
81
106
  is_obsolete=is_obsolete,
82
107
  )
83
- for key in [
84
- "CATEGORY_CODE",
85
- "SUB_CATEGORY_CODE",
86
- "CONTRIBUTOR",
87
- "EXACT_SOURCE",
88
- "EXTERNAL_DETAILS_URL",
89
- ]:
90
- value = attrib[key].strip()
91
- if value:
92
- term.append_property(key.lower(), value)
108
+
109
+ reference_id = attrib["PMID"].strip()
110
+ if not reference_id:
111
+ pass
112
+ elif reference_id.startswith("GSE"):
113
+ term.append_see_also(Reference(prefix="gse", identifier=reference_id))
114
+ else:
115
+ term.append_provenance(Reference(prefix="pubmed", identifier=reference_id))
116
+
117
+ for key, typedef in PROPERTIES:
118
+ if value := attrib[key].strip():
119
+ term.annotate_string(typedef, value)
93
120
 
94
121
  term.set_species(tax_id)
95
122
 
@@ -123,17 +150,25 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
123
150
  logger.warning(
124
151
  "missing %s source: msigdb:%s (%s)", contributor, identifier, external_details
125
152
  )
126
- term.append_xref(Reference(prefix="kegg.pathway", identifier=external_id))
153
+
154
+ try:
155
+ kegg_reference = Reference(prefix="kegg.pathway", identifier=external_id)
156
+ except ValidationError:
157
+ # TODO handle kegg.network which starts with N, like N01146
158
+ if not external_id.startswith("N"):
159
+ tqdm.write(f"could not validate kegg.pathway:{external_id}")
160
+ else:
161
+ term.append_xref(kegg_reference)
127
162
 
128
163
  for ncbigene_id in attrib["MEMBERS_EZID"].strip().split(","):
129
164
  if ncbigene_id:
130
- term.append_relationship(
165
+ term.annotate_object(
131
166
  has_participant, Reference(prefix="ncbigene", identifier=ncbigene_id)
132
167
  )
133
168
  yield term
134
169
 
135
170
 
136
- def _get_definition(attrib) -> Optional[str]:
171
+ def _get_definition(attrib) -> str | None:
137
172
  rv = attrib["DESCRIPTION_FULL"].strip() or attrib["DESCRIPTION_BRIEF"].strip() or None
138
173
  if rv is not None:
139
174
  return rv.replace(r"\d", "").replace(r"\s", "")
@@ -0,0 +1,9 @@
1
+ """Resources from NCBI."""
2
+
3
+ from .ncbi_gc import NCBIGCGetter
4
+ from .ncbigene import NCBIGeneGetter
5
+
6
+ __all__ = [
7
+ "NCBIGCGetter",
8
+ "NCBIGeneGetter",
9
+ ]