pyobo 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. pyobo/.DS_Store +0 -0
  2. pyobo/__init__.py +95 -20
  3. pyobo/__main__.py +0 -0
  4. pyobo/api/__init__.py +81 -10
  5. pyobo/api/alts.py +52 -42
  6. pyobo/api/combine.py +39 -0
  7. pyobo/api/edges.py +68 -0
  8. pyobo/api/hierarchy.py +231 -203
  9. pyobo/api/metadata.py +14 -19
  10. pyobo/api/names.py +207 -127
  11. pyobo/api/properties.py +117 -113
  12. pyobo/api/relations.py +68 -94
  13. pyobo/api/species.py +24 -21
  14. pyobo/api/typedefs.py +11 -11
  15. pyobo/api/utils.py +66 -13
  16. pyobo/api/xrefs.py +108 -114
  17. pyobo/cli/__init__.py +0 -0
  18. pyobo/cli/cli.py +35 -50
  19. pyobo/cli/database.py +183 -161
  20. pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
  21. pyobo/cli/lookup.py +163 -195
  22. pyobo/cli/utils.py +19 -6
  23. pyobo/constants.py +102 -3
  24. pyobo/getters.py +196 -118
  25. pyobo/gilda_utils.py +79 -200
  26. pyobo/identifier_utils/__init__.py +41 -0
  27. pyobo/identifier_utils/api.py +296 -0
  28. pyobo/identifier_utils/model.py +130 -0
  29. pyobo/identifier_utils/preprocessing.json +812 -0
  30. pyobo/identifier_utils/preprocessing.py +61 -0
  31. pyobo/identifier_utils/relations/__init__.py +8 -0
  32. pyobo/identifier_utils/relations/api.py +162 -0
  33. pyobo/identifier_utils/relations/data.json +5824 -0
  34. pyobo/identifier_utils/relations/data_owl.json +57 -0
  35. pyobo/identifier_utils/relations/data_rdf.json +1 -0
  36. pyobo/identifier_utils/relations/data_rdfs.json +7 -0
  37. pyobo/mocks.py +9 -6
  38. pyobo/ner/__init__.py +9 -0
  39. pyobo/ner/api.py +72 -0
  40. pyobo/ner/normalizer.py +33 -0
  41. pyobo/obographs.py +43 -39
  42. pyobo/plugins.py +5 -4
  43. pyobo/py.typed +0 -0
  44. pyobo/reader.py +1358 -395
  45. pyobo/reader_utils.py +155 -0
  46. pyobo/resource_utils.py +42 -22
  47. pyobo/resources/__init__.py +0 -0
  48. pyobo/resources/goc.py +75 -0
  49. pyobo/resources/goc.tsv +188 -0
  50. pyobo/resources/ncbitaxon.py +4 -5
  51. pyobo/resources/ncbitaxon.tsv.gz +0 -0
  52. pyobo/resources/ro.py +3 -2
  53. pyobo/resources/ro.tsv +0 -0
  54. pyobo/resources/so.py +0 -0
  55. pyobo/resources/so.tsv +0 -0
  56. pyobo/sources/README.md +12 -8
  57. pyobo/sources/__init__.py +52 -29
  58. pyobo/sources/agrovoc.py +0 -0
  59. pyobo/sources/antibodyregistry.py +11 -12
  60. pyobo/sources/bigg/__init__.py +13 -0
  61. pyobo/sources/bigg/bigg_compartment.py +81 -0
  62. pyobo/sources/bigg/bigg_metabolite.py +229 -0
  63. pyobo/sources/bigg/bigg_model.py +46 -0
  64. pyobo/sources/bigg/bigg_reaction.py +77 -0
  65. pyobo/sources/biogrid.py +1 -2
  66. pyobo/sources/ccle.py +7 -12
  67. pyobo/sources/cgnc.py +0 -5
  68. pyobo/sources/chebi.py +1 -1
  69. pyobo/sources/chembl/__init__.py +9 -0
  70. pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
  71. pyobo/sources/chembl/chembl_target.py +160 -0
  72. pyobo/sources/civic_gene.py +55 -15
  73. pyobo/sources/clinicaltrials.py +160 -0
  74. pyobo/sources/complexportal.py +24 -24
  75. pyobo/sources/conso.py +14 -22
  76. pyobo/sources/cpt.py +0 -0
  77. pyobo/sources/credit.py +1 -9
  78. pyobo/sources/cvx.py +27 -5
  79. pyobo/sources/depmap.py +9 -12
  80. pyobo/sources/dictybase_gene.py +2 -7
  81. pyobo/sources/drugbank/__init__.py +9 -0
  82. pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
  83. pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
  84. pyobo/sources/drugcentral.py +17 -13
  85. pyobo/sources/expasy.py +31 -34
  86. pyobo/sources/famplex.py +13 -18
  87. pyobo/sources/flybase.py +3 -8
  88. pyobo/sources/gard.py +62 -0
  89. pyobo/sources/geonames/__init__.py +9 -0
  90. pyobo/sources/geonames/features.py +28 -0
  91. pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
  92. pyobo/sources/geonames/utils.py +115 -0
  93. pyobo/sources/gmt_utils.py +6 -7
  94. pyobo/sources/go.py +20 -13
  95. pyobo/sources/gtdb.py +154 -0
  96. pyobo/sources/gwascentral/__init__.py +9 -0
  97. pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
  98. pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
  99. pyobo/sources/hgnc/__init__.py +9 -0
  100. pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
  101. pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
  102. pyobo/sources/icd/__init__.py +9 -0
  103. pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
  104. pyobo/sources/icd/icd11.py +148 -0
  105. pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
  106. pyobo/sources/interpro.py +4 -9
  107. pyobo/sources/itis.py +0 -5
  108. pyobo/sources/kegg/__init__.py +0 -0
  109. pyobo/sources/kegg/api.py +16 -38
  110. pyobo/sources/kegg/genes.py +9 -20
  111. pyobo/sources/kegg/genome.py +1 -7
  112. pyobo/sources/kegg/pathway.py +9 -21
  113. pyobo/sources/mesh.py +58 -24
  114. pyobo/sources/mgi.py +3 -10
  115. pyobo/sources/mirbase/__init__.py +11 -0
  116. pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
  117. pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
  118. pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
  119. pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
  120. pyobo/sources/msigdb.py +74 -39
  121. pyobo/sources/ncbi/__init__.py +9 -0
  122. pyobo/sources/ncbi/ncbi_gc.py +162 -0
  123. pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
  124. pyobo/sources/nih_reporter.py +60 -0
  125. pyobo/sources/nlm/__init__.py +9 -0
  126. pyobo/sources/nlm/nlm_catalog.py +48 -0
  127. pyobo/sources/nlm/nlm_publisher.py +36 -0
  128. pyobo/sources/nlm/utils.py +116 -0
  129. pyobo/sources/npass.py +6 -8
  130. pyobo/sources/omim_ps.py +10 -3
  131. pyobo/sources/pathbank.py +4 -8
  132. pyobo/sources/pfam/__init__.py +9 -0
  133. pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
  134. pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
  135. pyobo/sources/pharmgkb/__init__.py +15 -0
  136. pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
  137. pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
  138. pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
  139. pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
  140. pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
  141. pyobo/sources/pharmgkb/utils.py +86 -0
  142. pyobo/sources/pid.py +1 -6
  143. pyobo/sources/pombase.py +6 -10
  144. pyobo/sources/pubchem.py +4 -9
  145. pyobo/sources/reactome.py +5 -11
  146. pyobo/sources/rgd.py +11 -16
  147. pyobo/sources/rhea.py +37 -36
  148. pyobo/sources/ror.py +69 -42
  149. pyobo/sources/selventa/__init__.py +0 -0
  150. pyobo/sources/selventa/schem.py +4 -7
  151. pyobo/sources/selventa/scomp.py +1 -6
  152. pyobo/sources/selventa/sdis.py +4 -7
  153. pyobo/sources/selventa/sfam.py +1 -6
  154. pyobo/sources/sgd.py +6 -11
  155. pyobo/sources/signor/__init__.py +7 -0
  156. pyobo/sources/signor/download.py +41 -0
  157. pyobo/sources/signor/signor_complexes.py +105 -0
  158. pyobo/sources/slm.py +12 -15
  159. pyobo/sources/umls/__init__.py +7 -1
  160. pyobo/sources/umls/__main__.py +0 -0
  161. pyobo/sources/umls/get_synonym_types.py +20 -4
  162. pyobo/sources/umls/sty.py +57 -0
  163. pyobo/sources/umls/synonym_types.tsv +1 -1
  164. pyobo/sources/umls/umls.py +18 -22
  165. pyobo/sources/unimod.py +46 -0
  166. pyobo/sources/uniprot/__init__.py +1 -1
  167. pyobo/sources/uniprot/uniprot.py +40 -32
  168. pyobo/sources/uniprot/uniprot_ptm.py +4 -34
  169. pyobo/sources/utils.py +3 -2
  170. pyobo/sources/wikipathways.py +7 -10
  171. pyobo/sources/zfin.py +5 -10
  172. pyobo/ssg/__init__.py +12 -16
  173. pyobo/ssg/base.html +0 -0
  174. pyobo/ssg/index.html +26 -13
  175. pyobo/ssg/term.html +12 -2
  176. pyobo/ssg/typedef.html +0 -0
  177. pyobo/struct/__init__.py +54 -8
  178. pyobo/struct/functional/__init__.py +1 -0
  179. pyobo/struct/functional/dsl.py +2572 -0
  180. pyobo/struct/functional/macros.py +423 -0
  181. pyobo/struct/functional/obo_to_functional.py +385 -0
  182. pyobo/struct/functional/ontology.py +270 -0
  183. pyobo/struct/functional/utils.py +112 -0
  184. pyobo/struct/reference.py +331 -136
  185. pyobo/struct/struct.py +1413 -643
  186. pyobo/struct/struct_utils.py +1078 -0
  187. pyobo/struct/typedef.py +162 -210
  188. pyobo/struct/utils.py +12 -5
  189. pyobo/struct/vocabulary.py +138 -0
  190. pyobo/utils/__init__.py +0 -0
  191. pyobo/utils/cache.py +13 -11
  192. pyobo/utils/io.py +17 -31
  193. pyobo/utils/iter.py +5 -5
  194. pyobo/utils/misc.py +41 -53
  195. pyobo/utils/ndex_utils.py +0 -0
  196. pyobo/utils/path.py +76 -70
  197. pyobo/version.py +3 -3
  198. {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/METADATA +224 -225
  199. pyobo-0.12.0.dist-info/RECORD +202 -0
  200. pyobo-0.12.0.dist-info/WHEEL +4 -0
  201. {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
  202. {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info/licenses}/LICENSE +0 -0
  203. pyobo/apps/__init__.py +0 -3
  204. pyobo/apps/cli.py +0 -24
  205. pyobo/apps/gilda/__init__.py +0 -3
  206. pyobo/apps/gilda/__main__.py +0 -8
  207. pyobo/apps/gilda/app.py +0 -48
  208. pyobo/apps/gilda/cli.py +0 -36
  209. pyobo/apps/gilda/templates/base.html +0 -33
  210. pyobo/apps/gilda/templates/home.html +0 -11
  211. pyobo/apps/gilda/templates/matches.html +0 -32
  212. pyobo/apps/mapper/__init__.py +0 -3
  213. pyobo/apps/mapper/__main__.py +0 -11
  214. pyobo/apps/mapper/cli.py +0 -37
  215. pyobo/apps/mapper/mapper.py +0 -187
  216. pyobo/apps/mapper/templates/base.html +0 -35
  217. pyobo/apps/mapper/templates/mapper_home.html +0 -64
  218. pyobo/aws.py +0 -162
  219. pyobo/cli/aws.py +0 -47
  220. pyobo/identifier_utils.py +0 -142
  221. pyobo/normalizer.py +0 -232
  222. pyobo/registries/__init__.py +0 -16
  223. pyobo/registries/metaregistry.json +0 -507
  224. pyobo/registries/metaregistry.py +0 -135
  225. pyobo/sources/icd11.py +0 -105
  226. pyobo/xrefdb/__init__.py +0 -1
  227. pyobo/xrefdb/canonicalizer.py +0 -214
  228. pyobo/xrefdb/priority.py +0 -59
  229. pyobo/xrefdb/sources/__init__.py +0 -60
  230. pyobo/xrefdb/sources/biomappings.py +0 -36
  231. pyobo/xrefdb/sources/cbms2019.py +0 -91
  232. pyobo/xrefdb/sources/chembl.py +0 -83
  233. pyobo/xrefdb/sources/compath.py +0 -82
  234. pyobo/xrefdb/sources/famplex.py +0 -64
  235. pyobo/xrefdb/sources/gilda.py +0 -50
  236. pyobo/xrefdb/sources/intact.py +0 -113
  237. pyobo/xrefdb/sources/ncit.py +0 -133
  238. pyobo/xrefdb/sources/pubchem.py +0 -27
  239. pyobo/xrefdb/sources/wikidata.py +0 -116
  240. pyobo-0.11.1.dist-info/RECORD +0 -173
  241. pyobo-0.11.1.dist-info/WHEEL +0 -5
  242. pyobo-0.11.1.dist-info/top_level.txt +0 -1
pyobo/sources/icd11.py DELETED
@@ -1,105 +0,0 @@
- """Convert ICD11 to OBO.
-
- Run with python -m pyobo.sources.icd11 -v
- """
-
- import json
- import logging
- import os
- from collections.abc import Iterable, Mapping
- from typing import Any
-
- import click
- from more_click import verbose_option
- from tqdm.auto import tqdm
-
- from ..sources.icd_utils import (
-     ICD11_TOP_LEVEL_URL,
-     get_child_identifiers,
-     get_icd,
-     visiter,
- )
- from ..struct import Obo, Reference, Synonym, Term
- from ..utils.path import prefix_directory_join
-
- __all__ = [
-     "ICD11Getter",
- ]
-
- logger = logging.getLogger(__name__)
-
- PREFIX = "icd11"
-
-
- class ICD11Getter(Obo):
-     """An ontology representation of ICD-11."""
-
-     ontology = PREFIX
-     dynamic_version = True
-
-     def iter_terms(self, force: bool = False) -> Iterable[Term]:
-         """Iterate over terms in the ontology."""
-         return iterate_icd11()
-
-
- def get_obo() -> Obo:
-     """Get ICD11 as OBO."""
-     return ICD11Getter()
-
-
- def iterate_icd11() -> Iterable[Term]:
-     """Iterate over the terms in ICD11.
-
-     The API doesn't seem to have a rate limit, but returns pretty slow.
-     This means that it only gets results at at about 5 calls/second.
-     Get ready to be patient - the API token expires every hour so there's
-     a caching mechanism with :mod:`cachier` that gets a new one every hour.
-     """
-     res = get_icd(ICD11_TOP_LEVEL_URL)
-     res_json = res.json()
-
-     version = res_json["releaseId"]
-     directory = prefix_directory_join(PREFIX, version=version)
-
-     with open(os.path.join(directory, "top.json"), "w") as file:
-         json.dump(res_json, file, indent=2)
-
-     tqdm.write(f'There are {len(res_json["child"])} top level entities')
-
-     visited_identifiers: set[str] = set()
-     for identifier in get_child_identifiers(ICD11_TOP_LEVEL_URL, res_json):
-         yield from visiter(
-             identifier,
-             visited_identifiers,
-             directory,
-             endpoint=ICD11_TOP_LEVEL_URL,
-             converter=_extract_icd11,
-         )
-
-
- def _extract_icd11(res_json: Mapping[str, Any]) -> Term:
-     identifier = res_json["@id"][len(ICD11_TOP_LEVEL_URL) :].lstrip("/")
-     definition = res_json["definition"]["@value"] if "definition" in res_json else None
-     name = res_json["title"]["@value"]
-     synonyms = [Synonym(synonym["label"]["@value"]) for synonym in res_json.get("synonym", [])]
-     parents = [
-         Reference(prefix=PREFIX, identifier=url[len("http://id.who.int/icd/entity/") :])
-         for url in res_json["parent"]
-         if url[len("http://id.who.int/icd/entity/") :]
-     ]
-     return Term(
-         reference=Reference(prefix=PREFIX, identifier=identifier, name=name),
-         definition=definition,
-         synonyms=synonyms,
-         parents=parents,
-     )
-
-
- @click.command()
- @verbose_option
- def _main():
-     get_obo().write_default(use_tqdm=True)
-
-
- if __name__ == "__main__":
-     _main()
pyobo/xrefdb/__init__.py DELETED
@@ -1 +0,0 @@
- """Extraction of mappings from OBO documents."""
pyobo/xrefdb/canonicalizer.py DELETED
@@ -1,214 +0,0 @@
- """Tools for canonicalizing a CURIE based on a priority list."""
-
- from collections.abc import Iterable, Mapping
- from dataclasses import dataclass, field
- from functools import lru_cache
- from typing import Optional
-
- import networkx as nx
- import pandas as pd
- from more_itertools import pairwise
- from tqdm.auto import tqdm
-
- from .priority import DEFAULT_PRIORITY_LIST
- from .xrefs_pipeline import get_graph_from_xref_df
- from .. import resource_utils
- from ..utils.io import get_reader, get_writer
-
- __all__ = [
-     "Canonicalizer",
-     "all_shortest_paths",
-     "single_source_shortest_path",
-     "get_equivalent",
-     "get_priority_curie",
-     "remap_file_stream",
- ]
-
-
- @dataclass
- class Canonicalizer:
-     """Wraps a graph and priority list to allow getting the best identifier."""
-
-     #: A graph from :func:`get_graph_from_xref_df`
-     graph: nx.Graph
-
-     #: A list of prefixes. The ones with the lower index are higher priority
-     priority: Optional[list[str]] = None
-
-     #: Longest length paths allowed
-     cutoff: int = 5
-
-     _priority: Mapping[str, int] = field(init=False)
-
-     def __post_init__(self):
-         """Initialize the priority map based on the priority list."""
-         if self.priority is None:
-             self.priority = DEFAULT_PRIORITY_LIST
-         self._priority = {entry: len(self.priority) - i for i, entry in enumerate(self.priority)}
-
-     def _key(self, curie: str) -> Optional[int]:
-         prefix = self.graph.nodes[curie]["prefix"]
-         return self._priority.get(prefix)
-
-     def _get_priority_dict(self, curie: str) -> Mapping[str, int]:
-         return dict(self._iterate_priority_targets(curie))
-
-     def _iterate_priority_targets(self, curie: str) -> Iterable[tuple[str, int]]:
-         for target in nx.single_source_shortest_path(self.graph, curie, cutoff=self.cutoff):
-             priority = self._key(target)
-             if priority is not None:
-                 yield target, priority
-             elif target == curie:
-                 yield target, 0
-             else:
-                 yield target, -1
-
-     def canonicalize(self, curie: str) -> str:
-         """Get the best CURIE from the given CURIE."""
-         if curie not in self.graph:
-             return curie
-         priority_dict = self._get_priority_dict(curie)
-         return max(priority_dict, key=priority_dict.get) # type:ignore
-
-     @classmethod
-     def get_default(cls, priority: Optional[Iterable[str]] = None) -> "Canonicalizer":
-         """Get the default canonicalizer."""
-         if priority is not None:
-             priority = tuple(priority)
-         return cls._get_default_helper(priority=priority)
-
-     @classmethod
-     @lru_cache
-     def _get_default_helper(cls, priority: Optional[tuple[str, ...]] = None) -> "Canonicalizer":
-         """Help get the default canonicalizer."""
-         graph = cls._get_default_graph()
-         return cls(graph=graph, priority=list(priority) if priority else None)
-
-     @staticmethod
-     @lru_cache
-     def _get_default_graph() -> nx.Graph:
-         df = resource_utils.ensure_inspector_javert_df()
-         graph = get_graph_from_xref_df(df)
-         return graph
-
-     def iterate_flat_mapping(self, use_tqdm: bool = True) -> Iterable[tuple[str, str]]:
-         """Iterate over the canonical mapping from all nodes to their canonical CURIEs."""
-         nodes = self.graph.nodes()
-         if use_tqdm:
-             nodes = tqdm(
-                 nodes,
-                 total=self.graph.number_of_nodes(),
-                 desc="building flat mapping",
-                 unit_scale=True,
-                 unit="CURIE",
-             )
-         for node in nodes:
-             yield node, self.canonicalize(node)
-
-     def get_flat_mapping(self, use_tqdm: bool = True) -> Mapping[str, str]:
-         """Get a canonical mapping from all nodes to their canonical CURIEs."""
-         return dict(self.iterate_flat_mapping(use_tqdm=use_tqdm))
-
-     def single_source_shortest_path(
-         self,
-         curie: str,
-         cutoff: Optional[int] = None,
-     ) -> Optional[Mapping[str, list[Mapping[str, str]]]]:
-         """Get all shortest paths between given entity and its equivalent entities."""
-         return single_source_shortest_path(graph=self.graph, curie=curie, cutoff=cutoff)
-
-     def all_shortest_paths(
-         self, source_curie: str, target_curie: str
-     ) -> list[list[Mapping[str, str]]]:
-         """Get all shortest paths between the two entities."""
-         return all_shortest_paths(
-             graph=self.graph, source_curie=source_curie, target_curie=target_curie
-         )
-
-     @classmethod
-     def from_df(cls, df: pd.DataFrame) -> "Canonicalizer":
-         """Instantiate from a dataframe."""
-         return cls(graph=get_graph_from_xref_df(df))
-
-
- def all_shortest_paths(
-     graph: nx.Graph, source_curie: str, target_curie: str
- ) -> list[list[Mapping[str, str]]]:
-     """Get all shortest paths between the two CURIEs."""
-     _paths = nx.all_shortest_paths(graph, source=source_curie, target=target_curie)
-     return [
-         [
-             {"source": s, "target": t, "provenance": graph[s][t]["source"]}
-             for s, t in pairwise(_path)
-         ]
-         for _path in _paths
-     ]
-
-
- def single_source_shortest_path(
-     graph: nx.Graph,
-     curie: str,
-     cutoff: Optional[int] = None,
- ) -> Optional[Mapping[str, list[Mapping[str, str]]]]:
-     """Get the shortest path from the CURIE to all elements of its equivalence class.
-
-     Things that didn't work:
-
-     Unresponsive
-     ------------
-     .. code-block:: python
-
-         for curies in tqdm(
-             nx.connected_components(graph), desc="filling connected components", unit_scale=True
-         ):
-             for c1, c2 in itt.combinations(curies, r=2):
-                 if not graph.has_edge(c1, c2):
-                     graph.add_edge(c1, c2, inferred=True)
-
-     Way too slow
-     ------------
-     .. code-block:: python
-
-         for curie in tqdm(
-             graph, total=graph.number_of_nodes(), desc="mapping connected components", unit_scale=True
-         ):
-             for incident_curie in nx.node_connected_component(graph, curie):
-                 if not graph.has_edge(curie, incident_curie):
-                     graph.add_edge(curie, incident_curie, inferred=True)
-
-     Also consider the condensation of the graph:
-     https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.components.condensation.html#networkx.algorithms.components.condensation
-     """
-     if curie not in graph:
-         return None
-     rv = nx.single_source_shortest_path(graph, curie, cutoff=cutoff)
-     return {
-         k: [
-             {"source": s, "target": t, "provenance": graph[s][t]["provenance"]}
-             for s, t in pairwise(v)
-         ]
-         for k, v in rv.items()
-         if k != curie # don't map to self
-     }
-
-
- def get_equivalent(curie: str, cutoff: Optional[int] = None) -> set[str]:
-     """Get equivalent CURIEs."""
-     canonicalizer = Canonicalizer.get_default()
-     r = canonicalizer.single_source_shortest_path(curie=curie, cutoff=cutoff)
-     return set(r or [])
-
-
- def get_priority_curie(curie: str) -> str:
-     """Get the priority CURIE mapped to the best namespace."""
-     canonicalizer = Canonicalizer.get_default()
-     return canonicalizer.canonicalize(curie)
-
-
- def remap_file_stream(file_in, file_out, column: int, sep="\t") -> None:
-     """Remap a file."""
-     reader = get_reader(file_in, sep=sep)
-     writer = get_writer(file_out, sep=sep)
-     for row in reader:
-         row[column] = get_priority_curie(row[column])
-         writer.writerow(row)
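For orientation, a minimal sketch of how the removed Canonicalizer API above was invoked, using only the functions defined in this hunk; the example CURIE and the custom priority list are illustrative:

    from pyobo.xrefdb.canonicalizer import Canonicalizer, get_equivalent, get_priority_curie

    # Remap a CURIE onto its highest-priority equivalent prefix using the
    # default priority list (the CURIE here is illustrative).
    best = get_priority_curie("mesh:C063233")

    # Collect the equivalence class reachable within a path-length cutoff.
    others = get_equivalent("mesh:C063233", cutoff=3)

    # The same lookup through an explicit Canonicalizer with a custom priority list.
    canonicalizer = Canonicalizer.get_default(priority=["chebi", "pubchem.compound", "mesh"])
    print(canonicalizer.canonicalize("mesh:C063233"))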
pyobo/xrefdb/priority.py DELETED
@@ -1,59 +0,0 @@
- """Configuration for the default priority list."""
-
- import bioregistry
-
- __all__ = [
-     "DEFAULT_PRIORITY_LIST",
- ]
-
- _DEFAULT_PRIORITY_LIST = [
-     # Genes
-     "ncbigene",
-     "hgnc",
-     "rgd",
-     "mgi",
-     "ensembl",
-     "uniprot",
-     # Chemicals
-     # 'inchikey',
-     # 'inchi',
-     # 'smiles',
-     "pubchem.compound",
-     "chebi",
-     "drugbank",
-     "chembl.compound",
-     "zinc",
-     # protein families and complexes (and famplexes :))
-     "complexportal",
-     "fplx",
-     "ec-code",
-     "interpro",
-     "pfam",
-     "signor",
-     # Pathologies/phenotypes
-     "mondo",
-     "efo",
-     "doid",
-     "hp",
-     # Taxa
-     "ncbitaxon",
-     # If you can get away from MeSH, do it
-     "mesh",
-     "icd",
- ]
-
-
- def _get_default_priority_list():
-     rv = []
-     for _entry in _DEFAULT_PRIORITY_LIST:
-         _prefix = bioregistry.normalize_prefix(_entry)
-         if not _prefix:
-             raise RuntimeError(f"unresolved prefix: {_entry}")
-         if _prefix in rv:
-             raise RuntimeError(f"duplicate found in priority list: {_entry}/{_prefix}")
-         rv.append(_prefix)
-     return rv
-
-
- DEFAULT_PRIORITY_LIST = _get_default_priority_list()
- del _get_default_priority_list
pyobo/xrefdb/sources/__init__.py DELETED
@@ -1,60 +0,0 @@
- """Sources of xrefs not from OBO."""
-
- import logging
- from collections.abc import Iterable, Mapping
- from functools import lru_cache
- from typing import Callable, Optional
-
- import pandas as pd
- from class_resolver import FunctionResolver
- from tqdm.auto import tqdm
-
- __all__ = [
-     "iter_xref_plugins",
-     "has_xref_plugin",
-     "run_xref_plugin",
-     "iter_xref_plugins",
- ]
-
- logger = logging.getLogger(__name__)
-
- XrefGetter = Callable[[], pd.DataFrame]
-
-
- @lru_cache
- def _get_xref_plugins() -> Mapping[str, XrefGetter]:
-     resolver: FunctionResolver[XrefGetter] = FunctionResolver.from_entrypoint("pyobo.xrefs")
-     return resolver.lookup_dict
-
-
- def has_xref_plugin(prefix: str) -> bool:
-     """Check if there's a plugin for converting the prefix."""
-     return prefix in _get_xref_plugins()
-
-
- def run_xref_plugin(prefix: str) -> pd.DataFrame:
-     """Get a converted PyOBO source."""
-     rv = _get_xref_plugins()[prefix]()
-
-     if isinstance(rv, pd.DataFrame):
-         return rv
-
-     logger.warning("can not load %s since it yields many dataframes", prefix)
-
-
- def iter_xref_plugins(
-     use_tqdm: bool = True, skip_below: Optional[str] = None
- ) -> Iterable[pd.DataFrame]:
-     """Get all modules in the PyOBO sources."""
-     it = tqdm(sorted(_get_xref_plugins().items()), desc="Mapping Plugins", disable=not use_tqdm)
-     for prefix, get_df in it:
-         if skip_below and prefix < skip_below:
-             continue
-         it.set_postfix({"prefix": prefix})
-         rv = get_df()
-         if isinstance(rv, pd.DataFrame):
-             yield rv
-         elif isinstance(rv, Iterable):
-             yield from rv
-         else:
-             raise TypeError
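For orientation, a minimal sketch of how the removed plugin helpers above fit together, using only the functions defined in this hunk; the prefix is illustrative and assumes a plugin is registered under the "pyobo.xrefs" entrypoint group:

    import pandas as pd

    from pyobo.xrefdb.sources import has_xref_plugin, iter_xref_plugins, run_xref_plugin

    # Run a single registered xref plugin by prefix (prefix is illustrative).
    if has_xref_plugin("famplex"):
        famplex_xrefs: pd.DataFrame = run_xref_plugin("famplex")

    # Or stack the xref dataframes from every registered plugin.
    all_xrefs = pd.concat(iter_xref_plugins(use_tqdm=False))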
pyobo/xrefdb/sources/biomappings.py DELETED
@@ -1,36 +0,0 @@
- """Get the Biomappings manually curated equivalences."""
-
- import pandas as pd
- from pystow.utils import get_commit
-
- from pyobo.constants import (
-     PROVENANCE,
-     SOURCE_ID,
-     SOURCE_PREFIX,
-     TARGET_ID,
-     TARGET_PREFIX,
-     XREF_COLUMNS,
- )
-
- __all__ = [
-     "get_biomappings_df",
- ]
-
-
- def get_biomappings_df() -> pd.DataFrame:
-     """Get biomappings equivalences."""
-     sha = get_commit("biopragmatics", "biomappings")
-     url = f"https://raw.githubusercontent.com/biopragmatics/biomappings/{sha}/src/biomappings/resources/mappings.tsv"
-     df = pd.read_csv(url, sep="\t")
-     df[PROVENANCE] = url
-     df.rename(
-         columns={
-             "source prefix": SOURCE_PREFIX,
-             "source identifier": SOURCE_ID,
-             "target prefix": TARGET_PREFIX,
-             "target identifier": TARGET_ID,
-         },
-         inplace=True,
-     )
-     df = df[XREF_COLUMNS]
-     return df
pyobo/xrefdb/sources/cbms2019.py DELETED
@@ -1,91 +0,0 @@
- """Cross references from cbms2019.
-
- .. seealso:: https://github.com/pantapps/cbms2019
- """
-
- import pandas as pd
-
- from pyobo.constants import (
-     PROVENANCE,
-     SOURCE_ID,
-     SOURCE_PREFIX,
-     TARGET_ID,
-     TARGET_PREFIX,
-     XREF_COLUMNS,
- )
-
- __all__ = [
-     "get_cbms2019_xrefs_df",
- ]
-
- #: Columns: DOID, DO name, xref xb, xref ix
- base_url = "https://raw.githubusercontent.com/pantapps/cbms2019/master"
- doid_to_all = f"{base_url}/mesh_icd10cm_via_do_not_mapped_umls.tsv"
- #: Columns: SNOMEDCT_ID, SNOMEDCIT_NAME, ICD10CM_ID, ICD10CM_NAME, MESH_ID
- all_to_all = f"{base_url}/mesh_icd10cm_via_snomedct_not_mapped_umls.tsv"
- #: Columns: DOID, DO name, xref xb, xref ix
- doid_to_all_2 = f"{base_url}/mesh_snomedct_via_do_not_mapped_umls.tsv"
- #: Columns: SNOMEDCT_ID, SNOMEDCIT_NAME, ICD10CM_ID, ICD10CM_NAME, MESH_ID
- all_to_all_2 = f"{base_url}/mesh_snomedct_via_icd10cm_not_mapped_umls.tsv"
-
- NSM = {
-     "MESH": "mesh",
-     "ICD10CM": "icd",
-     "SNOMEDCT_US_2016_03_01": "snomedct",
- }
-
-
- def _get_doid(url: str) -> pd.DataFrame:
-     df = pd.read_csv(url, sep="\t", usecols=["DO_ID", "resource", "resource_ID"])
-     df.columns = [SOURCE_ID, TARGET_PREFIX, TARGET_ID]
-
-     df[SOURCE_PREFIX] = "doid"
-     df[SOURCE_ID] = df[SOURCE_ID].map(lambda s: s[len("DOID:") :])
-     df[PROVENANCE] = url
-     df[TARGET_PREFIX] = df[TARGET_PREFIX].map(NSM.get)
-     df = df[XREF_COLUMNS]
-     return df
-
-
- def _get_mesh_to_icd_via_doid() -> pd.DataFrame:
-     return _get_doid(doid_to_all)
-
-
- def _get_mesh_to_icd_via_snomedct() -> pd.DataFrame:
-     df = pd.read_csv(all_to_all, sep="\t", usecols=["SNOMEDCT_ID", "ICD10CM_ID", "MESH_ID"])
-     rows = []
-     for snomedct_id, icd_id, mesh_id in df.values:
-         rows.append(("mesh", mesh_id, "snomedct", snomedct_id, all_to_all))
-         rows.append(("snomedct", snomedct_id, "icd", icd_id, all_to_all))
-     return pd.DataFrame(rows, columns=XREF_COLUMNS)
-
-
- def _get_mesh_to_snomedct_via_doid() -> pd.DataFrame:
-     return _get_doid(doid_to_all_2)
-
-
- def _get_mesh_to_snomedct_via_icd() -> pd.DataFrame:
-     df = pd.read_csv(
-         all_to_all_2,
-         sep="\t",
-         usecols=["SNOMEDCT_ID", "ICD10CM_ID", "MESH_ID"],
-         dtype={"SNOMEDCT_ID": float},
-     )
-     rows = []
-     for snomedct_id, icd_id, mesh_id in df.values:
-         snomedct_id = str(int(snomedct_id))
-         rows.append(("mesh", mesh_id, "icd", icd_id, all_to_all))
-         rows.append(("icd", icd_id, "snomedct", snomedct_id, all_to_all))
-     return pd.DataFrame(rows, columns=XREF_COLUMNS)
-
-
- def get_cbms2019_xrefs_df() -> pd.DataFrame:
-     """Get all CBMS2019 xrefs."""
-     return pd.concat(
-         [
-             _get_mesh_to_icd_via_doid(),
-             _get_mesh_to_icd_via_snomedct(),
-             _get_mesh_to_snomedct_via_doid(),
-             _get_mesh_to_snomedct_via_icd(),
-         ]
-     ).drop_duplicates()
pyobo/xrefdb/sources/chembl.py DELETED
@@ -1,83 +0,0 @@
- """Get ChEMBL xrefs."""
-
- from typing import Optional
-
- import pandas as pd
-
- from pyobo.api.utils import get_version
- from pyobo.constants import (
-     PROVENANCE,
-     SOURCE_ID,
-     SOURCE_PREFIX,
-     TARGET_ID,
-     TARGET_PREFIX,
-     XREF_COLUMNS,
- )
- from pyobo.utils.path import ensure_df
-
- CHEMBL_COMPOUND_PREFIX = "chembl.compound"
- CHEMBL_TARGET_PREFIX = "chembl.target"
-
-
- def get_chembl_compound_equivalences_raw(
-     usecols=None, version: Optional[str] = None
- ) -> pd.DataFrame:
-     """Get the chemical representations raw dataframe."""
-     if version is None:
-         version = get_version("chembl")
-
-     base_url = f"ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_{version}"
-     url = f"{base_url}/chembl_{version}_chemreps.txt.gz"
-     return ensure_df(CHEMBL_COMPOUND_PREFIX, url=url, sep="\t", usecols=usecols)
-
-
- def get_chembl_compound_equivalences(version: Optional[str] = None) -> pd.DataFrame:
-     """Get ChEMBL chemical equivalences."""
-     if version is None:
-         version = get_version("chembl")
-
-     df = get_chembl_compound_equivalences_raw(version=version)
-     rows = []
-     for chembl, _smiles, _inchi, inchi_key in df.values:
-         rows.extend(
-             [
-                 # No smiles/inchi since they can have variable length
-                 # ("chembl.compound", chembl, "smiles", smiles, f"chembl{version}"),
-                 # ("chembl.compound", chembl, "inchi", inchi, f"chembl{version}"),
-                 ("chembl.compound", chembl, "inchikey", inchi_key, f"chembl{version}"),
-             ]
-         )
-     return pd.DataFrame(rows, columns=XREF_COLUMNS)
-
-
- def get_chembl_protein_equivalences(version: Optional[str] = None) -> pd.DataFrame:
-     """Get ChEMBL protein equivalences."""
-     if version is None:
-         version = get_version("chembl")
-
-     url = f"ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_{version}/chembl_uniprot_mapping.txt"
-     df = ensure_df(
-         CHEMBL_TARGET_PREFIX,
-         url=url,
-         sep="\t",
-         usecols=[0, 1],
-         names=[TARGET_ID, SOURCE_ID], # switch around
-     )
-     df.loc[:, SOURCE_PREFIX] = "chembl.target"
-     df.loc[:, TARGET_PREFIX] = "uniprot"
-     df.loc[:, PROVENANCE] = f"chembl{version}"
-     df = df[XREF_COLUMNS]
-     return df
-
-
- def get_chembl_xrefs_df(version: Optional[str] = None) -> pd.DataFrame:
-     """Get all ChEBML equivalences."""
-     if version is None:
-         version = get_version("chembl")
-
-     return pd.concat(
-         [
-             get_chembl_compound_equivalences(version=version),
-             get_chembl_protein_equivalences(version=version),
-         ]
-     )
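The removed source modules in this group each return a dataframe with the shared XREF_COLUMNS layout, so their outputs could be stacked directly; a minimal sketch using only the getters shown above (the ChEMBL version string is illustrative):

    import pandas as pd

    from pyobo.xrefdb.sources.biomappings import get_biomappings_df
    from pyobo.xrefdb.sources.cbms2019 import get_cbms2019_xrefs_df
    from pyobo.xrefdb.sources.chembl import get_chembl_xrefs_df

    # Stack the xrefs from the three sources above into one table and
    # drop duplicate rows.
    combined = pd.concat(
        [
            get_biomappings_df(),
            get_cbms2019_xrefs_df(),
            get_chembl_xrefs_df(version="33"),  # version is illustrative
        ]
    ).drop_duplicates()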