pyobo 0.11.2__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. pyobo/.DS_Store +0 -0
  2. pyobo/__init__.py +95 -20
  3. pyobo/__main__.py +0 -0
  4. pyobo/api/__init__.py +81 -10
  5. pyobo/api/alts.py +52 -42
  6. pyobo/api/combine.py +39 -0
  7. pyobo/api/edges.py +68 -0
  8. pyobo/api/hierarchy.py +231 -203
  9. pyobo/api/metadata.py +14 -19
  10. pyobo/api/names.py +207 -127
  11. pyobo/api/properties.py +117 -117
  12. pyobo/api/relations.py +68 -94
  13. pyobo/api/species.py +24 -21
  14. pyobo/api/typedefs.py +11 -11
  15. pyobo/api/utils.py +66 -13
  16. pyobo/api/xrefs.py +107 -114
  17. pyobo/cli/__init__.py +0 -0
  18. pyobo/cli/cli.py +35 -50
  19. pyobo/cli/database.py +210 -160
  20. pyobo/cli/database_utils.py +155 -0
  21. pyobo/cli/lookup.py +163 -195
  22. pyobo/cli/utils.py +19 -6
  23. pyobo/constants.py +102 -3
  24. pyobo/getters.py +209 -191
  25. pyobo/gilda_utils.py +52 -250
  26. pyobo/identifier_utils/__init__.py +33 -0
  27. pyobo/identifier_utils/api.py +305 -0
  28. pyobo/identifier_utils/preprocessing.json +873 -0
  29. pyobo/identifier_utils/preprocessing.py +27 -0
  30. pyobo/identifier_utils/relations/__init__.py +8 -0
  31. pyobo/identifier_utils/relations/api.py +162 -0
  32. pyobo/identifier_utils/relations/data.json +5824 -0
  33. pyobo/identifier_utils/relations/data_owl.json +57 -0
  34. pyobo/identifier_utils/relations/data_rdf.json +1 -0
  35. pyobo/identifier_utils/relations/data_rdfs.json +7 -0
  36. pyobo/mocks.py +9 -6
  37. pyobo/ner/__init__.py +9 -0
  38. pyobo/ner/api.py +72 -0
  39. pyobo/ner/normalizer.py +33 -0
  40. pyobo/obographs.py +48 -40
  41. pyobo/plugins.py +5 -4
  42. pyobo/py.typed +0 -0
  43. pyobo/reader.py +1354 -395
  44. pyobo/reader_utils.py +155 -0
  45. pyobo/resource_utils.py +42 -22
  46. pyobo/resources/__init__.py +0 -0
  47. pyobo/resources/goc.py +75 -0
  48. pyobo/resources/goc.tsv +188 -0
  49. pyobo/resources/ncbitaxon.py +4 -5
  50. pyobo/resources/ncbitaxon.tsv.gz +0 -0
  51. pyobo/resources/ro.py +3 -2
  52. pyobo/resources/ro.tsv +0 -0
  53. pyobo/resources/so.py +0 -0
  54. pyobo/resources/so.tsv +0 -0
  55. pyobo/sources/README.md +12 -8
  56. pyobo/sources/__init__.py +52 -29
  57. pyobo/sources/agrovoc.py +0 -0
  58. pyobo/sources/antibodyregistry.py +11 -12
  59. pyobo/sources/bigg/__init__.py +13 -0
  60. pyobo/sources/bigg/bigg_compartment.py +81 -0
  61. pyobo/sources/bigg/bigg_metabolite.py +229 -0
  62. pyobo/sources/bigg/bigg_model.py +46 -0
  63. pyobo/sources/bigg/bigg_reaction.py +77 -0
  64. pyobo/sources/biogrid.py +1 -2
  65. pyobo/sources/ccle.py +7 -12
  66. pyobo/sources/cgnc.py +9 -6
  67. pyobo/sources/chebi.py +1 -1
  68. pyobo/sources/chembl/__init__.py +9 -0
  69. pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
  70. pyobo/sources/chembl/chembl_target.py +160 -0
  71. pyobo/sources/civic_gene.py +55 -15
  72. pyobo/sources/clinicaltrials.py +160 -0
  73. pyobo/sources/complexportal.py +24 -24
  74. pyobo/sources/conso.py +14 -22
  75. pyobo/sources/cpt.py +0 -0
  76. pyobo/sources/credit.py +1 -9
  77. pyobo/sources/cvx.py +27 -5
  78. pyobo/sources/depmap.py +9 -12
  79. pyobo/sources/dictybase_gene.py +2 -7
  80. pyobo/sources/drugbank/__init__.py +9 -0
  81. pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
  82. pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
  83. pyobo/sources/drugcentral.py +17 -13
  84. pyobo/sources/expasy.py +31 -34
  85. pyobo/sources/famplex.py +13 -18
  86. pyobo/sources/flybase.py +8 -13
  87. pyobo/sources/gard.py +62 -0
  88. pyobo/sources/geonames/__init__.py +9 -0
  89. pyobo/sources/geonames/features.py +28 -0
  90. pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
  91. pyobo/sources/geonames/utils.py +115 -0
  92. pyobo/sources/gmt_utils.py +6 -7
  93. pyobo/sources/go.py +20 -13
  94. pyobo/sources/gtdb.py +154 -0
  95. pyobo/sources/gwascentral/__init__.py +9 -0
  96. pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
  97. pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
  98. pyobo/sources/hgnc/__init__.py +9 -0
  99. pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
  100. pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
  101. pyobo/sources/icd/__init__.py +9 -0
  102. pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
  103. pyobo/sources/icd/icd11.py +148 -0
  104. pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
  105. pyobo/sources/interpro.py +4 -9
  106. pyobo/sources/itis.py +0 -5
  107. pyobo/sources/kegg/__init__.py +0 -0
  108. pyobo/sources/kegg/api.py +16 -38
  109. pyobo/sources/kegg/genes.py +9 -20
  110. pyobo/sources/kegg/genome.py +1 -7
  111. pyobo/sources/kegg/pathway.py +9 -21
  112. pyobo/sources/mesh.py +58 -24
  113. pyobo/sources/mgi.py +3 -10
  114. pyobo/sources/mirbase/__init__.py +11 -0
  115. pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
  116. pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
  117. pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
  118. pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
  119. pyobo/sources/msigdb.py +74 -39
  120. pyobo/sources/ncbi/__init__.py +9 -0
  121. pyobo/sources/ncbi/ncbi_gc.py +162 -0
  122. pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
  123. pyobo/sources/nih_reporter.py +60 -0
  124. pyobo/sources/nlm/__init__.py +9 -0
  125. pyobo/sources/nlm/nlm_catalog.py +48 -0
  126. pyobo/sources/nlm/nlm_publisher.py +36 -0
  127. pyobo/sources/nlm/utils.py +116 -0
  128. pyobo/sources/npass.py +6 -8
  129. pyobo/sources/omim_ps.py +11 -4
  130. pyobo/sources/pathbank.py +4 -8
  131. pyobo/sources/pfam/__init__.py +9 -0
  132. pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
  133. pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
  134. pyobo/sources/pharmgkb/__init__.py +15 -0
  135. pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
  136. pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
  137. pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
  138. pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
  139. pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
  140. pyobo/sources/pharmgkb/utils.py +86 -0
  141. pyobo/sources/pid.py +1 -6
  142. pyobo/sources/pombase.py +6 -10
  143. pyobo/sources/pubchem.py +4 -9
  144. pyobo/sources/reactome.py +5 -11
  145. pyobo/sources/rgd.py +11 -16
  146. pyobo/sources/rhea.py +37 -36
  147. pyobo/sources/ror.py +69 -42
  148. pyobo/sources/selventa/__init__.py +0 -0
  149. pyobo/sources/selventa/schem.py +4 -7
  150. pyobo/sources/selventa/scomp.py +1 -6
  151. pyobo/sources/selventa/sdis.py +4 -7
  152. pyobo/sources/selventa/sfam.py +1 -6
  153. pyobo/sources/sgd.py +6 -11
  154. pyobo/sources/signor/__init__.py +7 -0
  155. pyobo/sources/signor/download.py +41 -0
  156. pyobo/sources/signor/signor_complexes.py +105 -0
  157. pyobo/sources/slm.py +12 -15
  158. pyobo/sources/umls/__init__.py +7 -1
  159. pyobo/sources/umls/__main__.py +0 -0
  160. pyobo/sources/umls/get_synonym_types.py +20 -4
  161. pyobo/sources/umls/sty.py +57 -0
  162. pyobo/sources/umls/synonym_types.tsv +1 -1
  163. pyobo/sources/umls/umls.py +18 -22
  164. pyobo/sources/unimod.py +46 -0
  165. pyobo/sources/uniprot/__init__.py +1 -1
  166. pyobo/sources/uniprot/uniprot.py +40 -32
  167. pyobo/sources/uniprot/uniprot_ptm.py +4 -34
  168. pyobo/sources/utils.py +3 -2
  169. pyobo/sources/wikipathways.py +7 -10
  170. pyobo/sources/zfin.py +5 -10
  171. pyobo/ssg/__init__.py +12 -16
  172. pyobo/ssg/base.html +0 -0
  173. pyobo/ssg/index.html +26 -13
  174. pyobo/ssg/term.html +12 -2
  175. pyobo/ssg/typedef.html +0 -0
  176. pyobo/struct/__init__.py +54 -8
  177. pyobo/struct/functional/__init__.py +1 -0
  178. pyobo/struct/functional/dsl.py +2572 -0
  179. pyobo/struct/functional/macros.py +423 -0
  180. pyobo/struct/functional/obo_to_functional.py +385 -0
  181. pyobo/struct/functional/ontology.py +272 -0
  182. pyobo/struct/functional/utils.py +112 -0
  183. pyobo/struct/reference.py +331 -136
  184. pyobo/struct/struct.py +1484 -657
  185. pyobo/struct/struct_utils.py +1078 -0
  186. pyobo/struct/typedef.py +162 -210
  187. pyobo/struct/utils.py +12 -5
  188. pyobo/struct/vocabulary.py +138 -0
  189. pyobo/utils/__init__.py +0 -0
  190. pyobo/utils/cache.py +16 -15
  191. pyobo/utils/io.py +51 -41
  192. pyobo/utils/iter.py +5 -5
  193. pyobo/utils/misc.py +41 -53
  194. pyobo/utils/ndex_utils.py +0 -0
  195. pyobo/utils/path.py +73 -70
  196. pyobo/version.py +3 -3
  197. pyobo-0.12.1.dist-info/METADATA +671 -0
  198. pyobo-0.12.1.dist-info/RECORD +201 -0
  199. pyobo-0.12.1.dist-info/WHEEL +4 -0
  200. {pyobo-0.11.2.dist-info → pyobo-0.12.1.dist-info}/entry_points.txt +1 -0
  201. pyobo-0.12.1.dist-info/licenses/LICENSE +21 -0
  202. pyobo/aws.py +0 -162
  203. pyobo/cli/aws.py +0 -47
  204. pyobo/identifier_utils.py +0 -142
  205. pyobo/normalizer.py +0 -232
  206. pyobo/registries/__init__.py +0 -16
  207. pyobo/registries/metaregistry.json +0 -507
  208. pyobo/registries/metaregistry.py +0 -135
  209. pyobo/sources/icd11.py +0 -105
  210. pyobo/xrefdb/__init__.py +0 -1
  211. pyobo/xrefdb/canonicalizer.py +0 -214
  212. pyobo/xrefdb/priority.py +0 -59
  213. pyobo/xrefdb/sources/__init__.py +0 -60
  214. pyobo/xrefdb/sources/biomappings.py +0 -36
  215. pyobo/xrefdb/sources/cbms2019.py +0 -91
  216. pyobo/xrefdb/sources/chembl.py +0 -83
  217. pyobo/xrefdb/sources/compath.py +0 -82
  218. pyobo/xrefdb/sources/famplex.py +0 -64
  219. pyobo/xrefdb/sources/gilda.py +0 -50
  220. pyobo/xrefdb/sources/intact.py +0 -113
  221. pyobo/xrefdb/sources/ncit.py +0 -133
  222. pyobo/xrefdb/sources/pubchem.py +0 -27
  223. pyobo/xrefdb/sources/wikidata.py +0 -116
  224. pyobo/xrefdb/xrefs_pipeline.py +0 -180
  225. pyobo-0.11.2.dist-info/METADATA +0 -711
  226. pyobo-0.11.2.dist-info/RECORD +0 -157
  227. pyobo-0.11.2.dist-info/WHEEL +0 -5
  228. pyobo-0.11.2.dist-info/top_level.txt +0 -1
@@ -0,0 +1,15 @@
1
+ """Sources for PharmGKB."""
2
+
3
+ from .pharmgkb_chemical import PharmGKBChemicalGetter
4
+ from .pharmgkb_disease import PharmGKBDiseaseGetter
5
+ from .pharmgkb_gene import PharmGKBGeneGetter
6
+ from .pharmgkb_pathway import PharmGKBPathwayGetter
7
+ from .pharmgkb_variant import PharmGKBVariantGetter
8
+
9
+ __all__ = [
10
+ "PharmGKBChemicalGetter",
11
+ "PharmGKBDiseaseGetter",
12
+ "PharmGKBGeneGetter",
13
+ "PharmGKBPathwayGetter",
14
+ "PharmGKBVariantGetter",
15
+ ]
@@ -0,0 +1,89 @@
1
+ """An ontology representation of PharmGKB chemicals."""
2
+
3
+ from collections.abc import Iterable
4
+
5
+ import pandas as pd
6
+ from tqdm import tqdm
7
+
8
+ from pyobo import Obo, Reference, Term, default_reference
9
+ from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, split
10
+ from pyobo.struct.typedef import has_inchi, has_smiles
11
+
12
+ __all__ = [
13
+ "PharmGKBChemicalGetter",
14
+ ]
15
+
16
+ PREFIX = "pharmgkb.drug"
17
+ URL = "https://api.pharmgkb.org/v1/download/file/data/chemicals.zip"
18
+
19
+
20
class PharmGKBChemicalGetter(Obo):
    """Getter that exposes PharmGKB chemicals as an ontology."""

    # Both the ontology prefix and the bioversions lookup key are the module prefix
    ontology = PREFIX
    bioversions_key = PREFIX
    dynamic_version = True
    typedefs = [has_inchi, has_smiles]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Return the terms produced by the module-level :func:`iter_terms`.

        :param force: Forwarded unchanged to the module-level :func:`iter_terms`

        :returns: An iterable of terms
        """
        terms = iter_terms(force=force)
        return terms
30
+
31
+
32
+ SKIP_PREFIXES = {"smiles", "inchi", "atc", "rxnorm", "pubchem.compound"}
33
+
34
+
35
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Generate terms for PharmGKB chemical types and chemicals.

    :param force: Passed through to :func:`download_pharmgkb_tsv`

    :yields: First one term per distinct value of the ``Type`` column, then one term
        per row of ``chemicals.tsv``
    """
    chemicals = download_pharmgkb_tsv(PREFIX, url=URL, inner="chemicals.tsv", force=force)

    # Mint a default reference per chemical type so each row can use it as a parent
    type_to_ref = {}
    for typ in chemicals["Type"].unique():
        local_id = typ.lower().replace(" ", "-").replace(",", "")
        type_to_ref[typ] = default_reference(PREFIX, local_id, name=typ)
    for type_reference in type_to_ref.values():
        yield Term(reference=type_reference)

    for _, row in chemicals.iterrows():
        term = Term.from_triple(PREFIX, identifier=row["PharmGKB Accession Id"], name=row["Name"])
        term.append_parent(type_to_ref[row["Type"]])

        # Structure strings are stored as string annotations when present
        for typedef, column in ((has_smiles, "SMILES"), (has_inchi, "InChI")):
            if pd.notna(row[column]):
                term.annotate_string(typedef, row[column])

        for atc_id in split(row, "ATC Identifiers"):
            term.append_exact_match(Reference(prefix="atc", identifier=atc_id))

        for rxnorm_id in split(row, "RxNorm Identifiers"):
            if len(rxnorm_id) <= 7:
                term.append_exact_match(Reference(prefix="rxnorm", identifier=rxnorm_id))
            else:
                tqdm.write(f"invalid rxnorm luid (too long) - {rxnorm_id}")

        for pubchem_id in split(row, "PubChem Compound Identifiers"):
            term.append_exact_match(Reference(prefix="pubchem.compound", identifier=pubchem_id))

        # Both free-form columns hold CURIEs; prefixes already handled by the
        # dedicated columns above are skipped, and unparsable entries are ignored
        for column in ("External Vocabulary", "Cross-references"):
            for xref_curie in split(row, column):
                try:
                    reference = Reference.from_curie(xref_curie)
                except ValueError:
                    continue
                if reference.prefix not in SKIP_PREFIXES:
                    term.append_exact_match(reference)

        for trade_name in split(row, "Trade names"):
            # TODO use OMO term for trade name
            term.append_synonym(trade_name)

        # TODO add more

        yield term
86
+
87
+
88
+ if __name__ == "__main__":
89
+ PharmGKBChemicalGetter.cli()
@@ -0,0 +1,77 @@
1
+ """An ontology representation of PharmGKB phenotypes."""
2
+
3
+ from collections.abc import Iterable
4
+ from typing import cast
5
+
6
+ import pandas as pd
7
+
8
+ from pyobo import Obo, Reference, Term
9
+ from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, parse_xrefs, split
10
+
11
+ __all__ = [
12
+ "PharmGKBDiseaseGetter",
13
+ ]
14
+
15
+ PREFIX = "pharmgkb.disease"
16
+ URL = "https://api.pharmgkb.org/v1/download/file/data/phenotypes.zip"
17
+
18
+
19
class PharmGKBDiseaseGetter(Obo):
    """Getter that exposes PharmGKB phenotypes as an ontology."""

    # Both the ontology prefix and the bioversions lookup key are the module prefix
    ontology = PREFIX
    bioversions_key = PREFIX
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Return the terms produced by the module-level :func:`iter_terms`.

        :param force: Forwarded unchanged to the module-level :func:`iter_terms`

        :returns: An iterable of terms
        """
        terms = iter_terms(force=force)
        return terms
28
+
29
+
30
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over terms.

    :param force: Should the data be re-downloaded

    :yields: Terms

    1. PharmGKB Accession Id = Identifier assigned to this phenotype by PharmGKB
    2. Name = Name PharmGKB uses for this phenotype
    3. Alternate Names = Other known names for this phenotype, comma-separated
    4. Cross-references = References to other resources in the form "resource:id",
       comma-separated
    5. External Vocabulary = Term for this phenotype in another vocabulary in the form
       "vocabulary:id", comma-separated
    """
    df = download_pharmgkb_tsv(PREFIX, url=URL, inner="phenotypes.tsv", force=force)
    for _, row in df.iterrows():
        identifier = row["PharmGKB Accession Id"]
        if pd.isna(identifier):
            continue
        name = row["Name"]
        term = Term.from_triple(PREFIX, identifier=str(identifier), name=name)

        # Case-insensitive key used to drop alternate names that merely repeat the name
        name_key = name.casefold() if isinstance(name, str) else None

        synonyms = set()
        for synonym in split(row, "Alternate Names"):
            # Remove the surrounding quotes *before* comparing with the name, so
            # that a quoted copy of the name is also recognized as a duplicate
            synonym = synonym.strip().strip('"').strip()
            if not synonym:
                continue  # fragments that are empty after cleanup carry no information
            if synonym.casefold() == name_key:
                continue
            synonyms.add(synonym)
        # Sorted so the synonym order is deterministic across runs
        for synonym in sorted(synonyms):
            term.append_synonym(synonym)
        for xref in parse_xrefs(term, row):
            term.append_xref(xref)

        for xref_line in split(row, "External Vocabulary"):
            # Entries look like ``vocabulary:id(label)``; keep only the CURIE part
            xref_curie, _, _ = xref_line.strip('"').partition("(")
            try:
                xref = cast(Reference, Reference.from_curie(xref_curie))
            except Exception:  # noqa:S110
                pass  # this happens when there's a comma in the name, but not a problem
            else:
                term.append_xref(xref)

        yield term
74
+
75
+
76
+ if __name__ == "__main__":
77
+ PharmGKBDiseaseGetter.cli()
@@ -0,0 +1,108 @@
1
+ """An ontology representation of PharmGKB genes."""
2
+
3
+ from collections.abc import Iterable
4
+
5
+ import pandas as pd
6
+
7
+ from pyobo import Obo, Reference, Term
8
+ from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, parse_xrefs, split
9
+
10
+ __all__ = [
11
+ "PharmGKBGeneGetter",
12
+ ]
13
+
14
+ PREFIX = "pharmgkb.gene"
15
+ URL = "https://api.pharmgkb.org/v1/download/file/data/genes.zip"
16
+
17
+
18
class PharmGKBGeneGetter(Obo):
    """Getter that exposes PharmGKB genes as an ontology."""

    # Both the ontology prefix and the bioversions lookup key are the module prefix
    ontology = PREFIX
    bioversions_key = PREFIX
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Return the terms produced by the module-level :func:`iter_terms`.

        :param force: Forwarded unchanged to the module-level :func:`iter_terms`

        :returns: An iterable of terms
        """
        terms = iter_terms(force=force)
        return terms
27
+
28
+
29
def _append_identifiers(term: Term, prefix: str, identifiers: list[str]) -> None:
    """Attach identifiers as an exact match when there is exactly one, else as xrefs."""
    if len(identifiers) == 1:
        term.append_exact_match(Reference(prefix=prefix, identifier=identifiers[0]))
        return
    for identifier in identifiers:
        term.append_xref(Reference(prefix=prefix, identifier=identifier))


def iter_terms(force: bool = False) -> Iterable[Term]:
    """Generate one term per row of the PharmGKB ``genes.tsv`` file.

    :param force: Should the data be re-downloaded

    :yields: Terms

    The columns of the file are:

    1. PharmGKB Accession Id = Identifier assigned to this gene by PharmGKB
    2. NCBI Gene ID = Identifier assigned to this gene by NCBI
    3. HGNC ID = Identifier assigned to this gene by HGNC
    4. Ensembl Id = Identifier assigned to this gene by Ensembl
    5. Name = Canonical name for this gene (by HGNC)
    6. Symbol = Canonical name for this gene (by HGNC)
    7. Alternate Names = Other known names for this gene, comma-separated
    8. Alternate Symbols = Other known symbols for this gene, comma-separated
    9. Is VIP = "Yes" if PharmGKB has written a VIP annotation for this gene, "No"
       otherwise
    10. Has Variant Annotation = "Yes" if PharmGKB has written at least one variant
        annotation for this gene, "No" otherwise
    11. Cross-references = References to other resources in the form "resource:id",
        comma-separated
    12. Has CPIC Dosing Guideline = "Yes" if PharmGKB has annotated a CPIC guideline for
        this gene, "No" otherwise
    13. Chromosome = The chromosome this gene is on, in the form "chr##"
    14. Chromosomal Start - GRCh37 = Where this gene starts on the chromosomal sequence
        for NCBI GRCh37
    15. Chromosomal Stop - GRCh37 = Where this gene stops on the chromosomal sequence
        for NCBI GRCh37
    16. Chromosomal Start - GRCh38 = Where this gene starts on the chromosomal sequence
        for NCBI GRCh38
    17. Chromosomal Stop - GRCh38 = Where this gene stops on the chromosomal sequence
        for NCBI GRCh38
    """
    genes = download_pharmgkb_tsv(PREFIX, url=URL, inner="genes.tsv", force=force)

    # Prefixes filtered out of the generic cross-references column
    skip_xrefs = {"ncbigene", "hgnc", "ensembl", "GeneCard"}
    for _, row in genes.iterrows():
        identifier = row["PharmGKB Accession Id"]
        if pd.isna(identifier):
            continue

        term = Term.from_triple(PREFIX, identifier=str(identifier), name=row["Name"])

        _append_identifiers(term, "ncbigene", list(split(row, "NCBI Gene ID")))
        _append_identifiers(term, "hgnc", list(split(row, "HGNC ID")))

        for ensembl_id in split(row, "Ensembl Id"):
            term.append_xref(Reference(prefix="ensembl", identifier=ensembl_id))

        for alternate_name in split(row, "Alternate Names"):
            term.append_synonym(alternate_name.strip('"'))

        # TODO symbol synonym type
        symbol = row["Symbol"]
        if pd.notna(symbol):
            term.append_synonym(symbol)
        for alternate_symbol in split(row, "Alternate Symbols"):
            term.append_synonym(alternate_symbol)

        for xref in parse_xrefs(term, row):
            if xref.prefix not in skip_xrefs:
                term.append_xref(xref)

        yield term
105
+
106
+
107
+ if __name__ == "__main__":
108
+ PharmGKBGeneGetter.cli()
@@ -0,0 +1,63 @@
1
+ """An ontology representation of PharmGKB pathways."""
2
+
3
+ import zipfile
4
+ from collections.abc import Iterable
5
+
6
+ from pyobo import Obo, Term
7
+ from pyobo.sources.pharmgkb.utils import download_pharmgkb
8
+
9
+ __all__ = [
10
+ "PharmGKBPathwayGetter",
11
+ ]
12
+
13
+ PREFIX = "pharmgkb.pathways"
14
+ BIOPAX_URL = "https://api.pharmgkb.org/v1/download/file/data/pathways-biopax.zip"
15
+ EXTENSION = ".owl"
16
+
17
+
18
class PharmGKBPathwayGetter(Obo):
    """Getter that exposes PharmGKB pathways as an ontology."""

    # Both the ontology prefix and the bioversions lookup key are the module prefix
    ontology = PREFIX
    bioversions_key = PREFIX
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Return the terms produced by the module-level :func:`iter_terms`.

        :param force: Forwarded unchanged to the module-level :func:`iter_terms`

        :returns: An iterable of terms
        """
        terms = iter_terms(force=force)
        return terms
27
+
28
+
29
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Iterate over terms, one per BioPAX file in the PharmGKB pathways archive.

    :param force: Should the data be re-downloaded

    :yields: Terms, each built by :func:`_process_biopax` from an archive member
        whose filename ends with ``EXTENSION``; other members are skipped
    """
    path = download_pharmgkb(PREFIX, url=BIOPAX_URL, force=force)
    # The archive stays open while the generator is suspended between yields
    with zipfile.ZipFile(path) as zf:
        # infolist() is the documented accessor for the archive's members
        for zip_info in zf.infolist():
            if not zip_info.filename.endswith(EXTENSION):
                continue
            with zf.open(zip_info) as file:
                yield _process_biopax(zip_info, file)
51
+
52
+
53
def _process_biopax(path: zipfile.ZipInfo, file) -> Term:
    """Build a pathway term from the filename of an archive member.

    :param path: Archive entry; its filename, minus ``EXTENSION``, is split at the
        first ``-`` into an identifier and a name
    :param file: Open handle on the entry, currently unused (see the TODO below)

    :returns: A term in the ``PREFIX`` namespace whose name has underscores replaced
        by spaces
    """
    stem = path.filename.removesuffix(EXTENSION).strip().replace("\r\n", " ")
    identifier, _, underscored_name = stem.partition("-")
    # TODO parse file with pybiopax to include members and provenance
    return Term.from_triple(PREFIX, identifier, underscored_name.replace("_", " "))
60
+
61
+
62
+ if __name__ == "__main__":
63
+ PharmGKBPathwayGetter.cli()
@@ -0,0 +1,84 @@
1
+ """An ontology representation of PharmGKB variants."""
2
+
3
+ from collections.abc import Iterable
4
+
5
+ import pandas as pd
6
+
7
+ from pyobo import Obo, Reference, Term, TypeDef
8
+ from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, split
9
+
10
+ __all__ = [
11
+ "PharmGKBVariantGetter",
12
+ ]
13
+
14
+ PREFIX = "pharmgkb.variant"
15
+ URL = "https://api.pharmgkb.org/v1/download/file/data/variants.zip"
16
+
17
+
18
+ HAS_GENE_ASSOCIATION = TypeDef.default(
19
+ PREFIX, "hasGeneAssociation", name="has gene association", is_metadata_tag=True
20
+ )
21
+
22
+
23
class PharmGKBVariantGetter(Obo):
    """Getter that exposes PharmGKB variants as an ontology."""

    # Both the ontology prefix and the bioversions lookup key are the module prefix
    ontology = PREFIX
    bioversions_key = PREFIX
    typedefs = [HAS_GENE_ASSOCIATION]
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Return the terms produced by the module-level :func:`iter_terms`.

        :param force: Forwarded unchanged to the module-level :func:`iter_terms`

        :returns: An iterable of terms
        """
        terms = iter_terms(force=force)
        return terms
33
+
34
+
35
def iter_terms(force: bool = False) -> Iterable[Term]:
    """Generate one term per row of the PharmGKB ``variants.tsv`` file.

    :param force: Should the data be re-downloaded

    :yields: Terms

    The columns of the file are:

    1. Variant ID = The PharmGKB identifier for this variant
    2. Variant Name = The PharmGKB name for this variant
    3. Gene IDs = The PharmGKB identifiers for genes associated with this variant
    4. Gene Symbols = The HGNC symbols for genes associated with this variant
    5. Location = The location of this variation on a reference sequence (either RefSeq
       or GenBank), if available. HGVS format when applicable
    6. Variant Annotation count = The count of Variant Annotations done on this variant
    7. Clinical Annotation count = The count of all Clinical Annotations done on this
       variant
    8. Level 1/2 Clinical Annotation count = The count of Level 1 or Level 2 ("top")
       Clinical Annotations done on this variant
    9. Guideline Annotation count = The count of Dosing Guideline Annotations of which
       this variant is a part
    10. Label Annotation count = The count of Drug Label Annotations in which this
        variant is mentioned
    11. Synonym
    """
    variants = download_pharmgkb_tsv(PREFIX, url=URL, inner="variants.tsv", force=force)

    for _, row in variants.iterrows():
        identifier = row["Variant ID"]
        if pd.isna(identifier):
            continue

        term = Term.from_triple(PREFIX, identifier=str(identifier))

        # The variant name is recorded as a dbSNP exact match when present
        dbsnp_id = row["Variant Name"]
        if pd.notna(dbsnp_id):
            term.append_exact_match(Reference(prefix="dbsnp", identifier=dbsnp_id))

        # Gene identifiers and symbols are paired positionally; extras on either
        # side are dropped because the zip is not strict
        gene_ids = split(row, "Gene IDs")
        gene_symbols = split(row, "Gene Symbols")
        for gene_id, gene_name in zip(gene_ids, gene_symbols, strict=False):
            term.annotate_object(
                HAS_GENE_ASSOCIATION,
                Reference(prefix="pharmgkb.gene", identifier=gene_id, name=gene_name),
            )

        # TODO location, like NC_000003.12:183917980

        yield term
81
+
82
+
83
+ if __name__ == "__main__":
84
+ PharmGKBVariantGetter.cli()
@@ -0,0 +1,86 @@
1
+ """Utilities for PharmGKB."""
2
+
3
+ from collections.abc import Iterable
4
+ from pathlib import Path
5
+ from typing import cast
6
+
7
+ import pandas as pd
8
+ from pystow.utils import read_zipfile_csv
9
+ from tqdm import tqdm
10
+
11
+ from pyobo import Reference
12
+ from pyobo.utils.path import ensure_path
13
+
14
+ __all__ = [
15
+ "download_pharmgkb_tsv",
16
+ ]
17
+
18
+ AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
19
+
20
+
21
def download_pharmgkb(prefix: str, url: str, *, force: bool) -> Path:
    """Ensure a PharmGKB file is available locally, sending a browser-like user agent.

    :param prefix: First positional argument given to :func:`ensure_path`
    :param url: Location of the file to download
    :param force: Passed through to :func:`ensure_path`

    :returns: The path returned by :func:`ensure_path`
    """
    # This is required otherwise we get booted
    headers = {"User-Agent": AGENT}
    download_kwargs = {"headers": headers}
    return ensure_path(
        prefix,
        url=url,
        backend="requests",
        download_kwargs=download_kwargs,
        force=force,
    )
35
+
36
+
37
def download_pharmgkb_tsv(prefix: str, url: str, inner: str, *, force: bool) -> pd.DataFrame:
    """Download a PharmGKB zip archive and load one of its members as a dataframe.

    :param prefix: Passed through to :func:`download_pharmgkb`
    :param url: Location of the zip archive
    :param inner: Path of the member inside the archive to read
    :param force: Passed through to :func:`download_pharmgkb`

    :returns: The member's contents, with every column read as ``str``
    """
    archive_path = download_pharmgkb(prefix, url=url, force=force)
    return read_zipfile_csv(archive_path, inner_path=inner, dtype=str)
42
+
43
+
44
def split(row, key: str) -> Iterable[str]:
    """Yield the comma-separated entries stored under a key of a row.

    :param row: Object with a ``.get()`` method, such as a pandas row or a dictionary
    :param key: Name of the column to look up

    :yields: Each entry with surrounding whitespace removed. Nothing is yielded when
        the value is missing, NA, empty, or has no ``split`` method
    """
    raw = row.get(key)
    if pd.notna(raw) and raw:
        try:
            for piece in raw.split(","):
                yield piece.strip()
        except AttributeError:
            # Values that are not string-like are treated as having no entries
            pass
54
+
55
+
56
+ _MISSING_PREFIXES: set[str] = set()
57
+ REPLACES = {
58
+ "URL:http://www.ncbi.nlm.nih.gov/omim/": "omim:",
59
+ "Comparative Toxicogenomics Database:": "mesh:",
60
+ "ModBase:": "uniprot:",
61
+ "RefSeq DNA:": "refseq:",
62
+ "RefSeq RNA:": "refseq:",
63
+ "RefSeq Protein:": "refseq:",
64
+ "UCSC Genome Browser:": "refseq:",
65
+ }
66
+
67
+
68
def parse_xrefs(term, row, key="Cross-references") -> Iterable[Reference]:
    """Parse the cross-references.

    :param term: Term the row belongs to; only its ``curie`` is used, in the warning
        written when a cross-reference cannot be parsed
    :param row: Row to read from, handed to :func:`split`
    :param key: Column that holds the comma-separated cross-references

    :yields: A reference for each cross-reference that could be parsed. Entries ending
        in ``@`` are skipped, and unparsable entries are reported once per prefix
    """
    for xref_curie in split(row, key):
        # HOXD@ is a valid genatlas identifier, see http://genatlas.medecine.univ-paris5.fr/fiche.php?symbol=HOXD@
        # but this is broken, so skip them for now
        if xref_curie.endswith("@"):
            continue
        for legacy_prefix, curie_prefix in REPLACES.items():
            if xref_curie.startswith(legacy_prefix):
                # Swap only the leading prefix; an uncounted ``str.replace`` would
                # also rewrite any later occurrence inside the identifier
                xref_curie = curie_prefix + xref_curie[len(legacy_prefix) :]
                break
        try:
            xref = cast(Reference, Reference.from_curie(xref_curie))
        except ValueError:
            p, _, _ = xref_curie.partition(":")
            # Warn only the first time a given unparsable prefix is seen
            if p not in _MISSING_PREFIXES:
                tqdm.write(f"[{term.curie}] could not parse xref: {xref_curie}")
                _MISSING_PREFIXES.add(p)
        else:
            yield xref
pyobo/sources/pid.py CHANGED
@@ -38,11 +38,6 @@ class PIDGetter(Obo):
38
38
  return iter_terms()
39
39
 
40
40
 
41
- def get_obo() -> Obo:
42
- """Get NCI PID as OBO."""
43
- return PIDGetter()
44
-
45
-
46
41
  def iter_networks(use_tqdm: bool = False, force: bool = False) -> Iterable[tuple[str, CX]]:
47
42
  """Iterate over NCI PID networks."""
48
43
  yield from ensure_ndex_network_set(
@@ -93,7 +88,7 @@ def iter_terms(force: bool = False) -> Iterable[Term]:
93
88
  logger.debug(f"unmapped: {name}, {reference}")
94
89
 
95
90
  for hgnc_id, hgnc_symbol in genes:
96
- term.append_relationship(
91
+ term.annotate_object(
97
92
  has_participant, Reference(prefix="hgnc", identifier=hgnc_id, name=hgnc_symbol)
98
93
  )
99
94
 
pyobo/sources/pombase.py CHANGED
@@ -8,7 +8,7 @@ import pandas as pd
8
8
  from tqdm.auto import tqdm
9
9
 
10
10
  import pyobo
11
- from pyobo import Reference
11
+ from pyobo import Reference, TypeDef
12
12
  from pyobo.resources.so import get_so_name
13
13
  from pyobo.struct import Obo, Term, from_species, has_gene_product, orthologous
14
14
  from pyobo.utils.path import ensure_df
@@ -22,24 +22,20 @@ logger = logging.getLogger(__name__)
22
22
  PREFIX = "pombase"
23
23
  GENE_NAMES_URL = "https://www.pombase.org/data/names_and_identifiers/gene_IDs_names_products.tsv"
24
24
  ORTHOLOGS_URL = "https://www.pombase.org/data/orthologs/human-orthologs.txt.gz"
25
+ CHROMOSOME = TypeDef.default(PREFIX, "chromosome", is_metadata_tag=True)
25
26
 
26
27
 
27
28
  class PomBaseGetter(Obo):
28
29
  """An ontology representation of PomBase's fission yeast gene nomenclature."""
29
30
 
30
31
  ontology = bioversions_key = PREFIX
31
- typedefs = [from_species, has_gene_product, orthologous]
32
+ typedefs = [from_species, has_gene_product, orthologous, CHROMOSOME]
32
33
 
33
34
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
34
35
  """Iterate over terms in the ontology."""
35
36
  return get_terms(force=force, version=self._version_or_raise)
36
37
 
37
38
 
38
- def get_obo(force: bool = False) -> Obo:
39
- """Get OBO."""
40
- return PomBaseGetter(force=force)
41
-
42
-
43
39
  #: A mapping from PomBase gene type to sequence ontology terms
44
40
  POMBASE_TO_SO = {
45
41
  # None: "0000704", # gene,
@@ -89,13 +85,13 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
89
85
  name=symbol if pd.notna(symbol) else None,
90
86
  definition=name if pd.notna(name) else None,
91
87
  )
92
- term.append_property("chromosome", chromosome[len("chromosome_") :])
88
+ term.annotate_string(CHROMOSOME, chromosome[len("chromosome_") :])
93
89
  term.append_parent(so[gtype])
94
90
  term.set_species(identifier="4896", name="Schizosaccharomyces pombe")
95
91
  for hgnc_id in identifier_to_hgnc_ids.get(identifier, []):
96
- term.append_relationship(orthologous, Reference(prefix="hgnc", identifier=hgnc_id))
92
+ term.annotate_object(orthologous, Reference(prefix="hgnc", identifier=hgnc_id))
97
93
  if uniprot_id and pd.notna(uniprot_id):
98
- term.append_relationship(
94
+ term.annotate_object(
99
95
  has_gene_product, Reference(prefix="uniprot", identifier=uniprot_id)
100
96
  )
101
97
  if synonyms and pd.notna(synonyms):
pyobo/sources/pubchem.py CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  import logging
4
4
  from collections.abc import Iterable, Mapping
5
- from typing import Optional
5
+ from pathlib import Path
6
6
 
7
7
  import pandas as pd
8
8
  from bioregistry.utils import removeprefix
@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
23
23
  PREFIX = "pubchem.compound"
24
24
 
25
25
 
26
- def _get_pubchem_extras_url(version: Optional[str], end: str) -> str:
26
+ def _get_pubchem_extras_url(version: str | None, end: str) -> str:
27
27
  if version is None:
28
28
  version = get_version("pubchem")
29
29
  return f"ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Monthly/{version}/Extras/{end}"
@@ -40,11 +40,6 @@ class PubChemCompoundGetter(Obo):
40
40
  return get_terms(version=self._version_or_raise, force=force)
41
41
 
42
42
 
43
- def get_obo(force: bool = False) -> Obo:
44
- """Get PubChem Compound OBO."""
45
- return PubChemCompoundGetter(force=force)
46
-
47
-
48
43
  def _get_cid_smiles_df(version: str) -> pd.DataFrame:
49
44
  url = _get_pubchem_extras_url(version, "CID-SMILES.gz")
50
45
  return ensure_df(PREFIX, url=url, version=version, dtype=str)
@@ -97,7 +92,7 @@ def get_pubchem_id_to_mesh_id(version: str) -> Mapping[str, str]:
97
92
  return dict(df.values)
98
93
 
99
94
 
100
- def _ensure_cid_name_path(*, version: Optional[str] = None, force: bool = False) -> str:
95
+ def _ensure_cid_name_path(*, version: str | None = None, force: bool = False) -> Path:
101
96
  if version is None:
102
97
  version = get_version("pubchem")
103
98
  # 2 tab-separated columns: compound_id, name
@@ -145,4 +140,4 @@ def get_terms(*, version: str, use_tqdm: bool = True, force: bool = False) -> It
145
140
 
146
141
 
147
142
  if __name__ == "__main__":
148
- get_obo().write_default()
143
+ PubChemCompoundGetter.cli()