pyobo 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. pyobo/.DS_Store +0 -0
  2. pyobo/__init__.py +95 -20
  3. pyobo/__main__.py +0 -0
  4. pyobo/api/__init__.py +81 -10
  5. pyobo/api/alts.py +52 -42
  6. pyobo/api/combine.py +39 -0
  7. pyobo/api/edges.py +68 -0
  8. pyobo/api/hierarchy.py +231 -203
  9. pyobo/api/metadata.py +14 -19
  10. pyobo/api/names.py +207 -127
  11. pyobo/api/properties.py +117 -113
  12. pyobo/api/relations.py +68 -94
  13. pyobo/api/species.py +24 -21
  14. pyobo/api/typedefs.py +11 -11
  15. pyobo/api/utils.py +66 -13
  16. pyobo/api/xrefs.py +108 -114
  17. pyobo/cli/__init__.py +0 -0
  18. pyobo/cli/cli.py +35 -50
  19. pyobo/cli/database.py +183 -161
  20. pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
  21. pyobo/cli/lookup.py +163 -195
  22. pyobo/cli/utils.py +19 -6
  23. pyobo/constants.py +102 -3
  24. pyobo/getters.py +196 -118
  25. pyobo/gilda_utils.py +79 -200
  26. pyobo/identifier_utils/__init__.py +41 -0
  27. pyobo/identifier_utils/api.py +296 -0
  28. pyobo/identifier_utils/model.py +130 -0
  29. pyobo/identifier_utils/preprocessing.json +812 -0
  30. pyobo/identifier_utils/preprocessing.py +61 -0
  31. pyobo/identifier_utils/relations/__init__.py +8 -0
  32. pyobo/identifier_utils/relations/api.py +162 -0
  33. pyobo/identifier_utils/relations/data.json +5824 -0
  34. pyobo/identifier_utils/relations/data_owl.json +57 -0
  35. pyobo/identifier_utils/relations/data_rdf.json +1 -0
  36. pyobo/identifier_utils/relations/data_rdfs.json +7 -0
  37. pyobo/mocks.py +9 -6
  38. pyobo/ner/__init__.py +9 -0
  39. pyobo/ner/api.py +72 -0
  40. pyobo/ner/normalizer.py +33 -0
  41. pyobo/obographs.py +43 -39
  42. pyobo/plugins.py +5 -4
  43. pyobo/py.typed +0 -0
  44. pyobo/reader.py +1358 -395
  45. pyobo/reader_utils.py +155 -0
  46. pyobo/resource_utils.py +42 -22
  47. pyobo/resources/__init__.py +0 -0
  48. pyobo/resources/goc.py +75 -0
  49. pyobo/resources/goc.tsv +188 -0
  50. pyobo/resources/ncbitaxon.py +4 -5
  51. pyobo/resources/ncbitaxon.tsv.gz +0 -0
  52. pyobo/resources/ro.py +3 -2
  53. pyobo/resources/ro.tsv +0 -0
  54. pyobo/resources/so.py +0 -0
  55. pyobo/resources/so.tsv +0 -0
  56. pyobo/sources/README.md +12 -8
  57. pyobo/sources/__init__.py +52 -29
  58. pyobo/sources/agrovoc.py +0 -0
  59. pyobo/sources/antibodyregistry.py +11 -12
  60. pyobo/sources/bigg/__init__.py +13 -0
  61. pyobo/sources/bigg/bigg_compartment.py +81 -0
  62. pyobo/sources/bigg/bigg_metabolite.py +229 -0
  63. pyobo/sources/bigg/bigg_model.py +46 -0
  64. pyobo/sources/bigg/bigg_reaction.py +77 -0
  65. pyobo/sources/biogrid.py +1 -2
  66. pyobo/sources/ccle.py +7 -12
  67. pyobo/sources/cgnc.py +0 -5
  68. pyobo/sources/chebi.py +1 -1
  69. pyobo/sources/chembl/__init__.py +9 -0
  70. pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
  71. pyobo/sources/chembl/chembl_target.py +160 -0
  72. pyobo/sources/civic_gene.py +55 -15
  73. pyobo/sources/clinicaltrials.py +160 -0
  74. pyobo/sources/complexportal.py +24 -24
  75. pyobo/sources/conso.py +14 -22
  76. pyobo/sources/cpt.py +0 -0
  77. pyobo/sources/credit.py +1 -9
  78. pyobo/sources/cvx.py +27 -5
  79. pyobo/sources/depmap.py +9 -12
  80. pyobo/sources/dictybase_gene.py +2 -7
  81. pyobo/sources/drugbank/__init__.py +9 -0
  82. pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
  83. pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
  84. pyobo/sources/drugcentral.py +17 -13
  85. pyobo/sources/expasy.py +31 -34
  86. pyobo/sources/famplex.py +13 -18
  87. pyobo/sources/flybase.py +3 -8
  88. pyobo/sources/gard.py +62 -0
  89. pyobo/sources/geonames/__init__.py +9 -0
  90. pyobo/sources/geonames/features.py +28 -0
  91. pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
  92. pyobo/sources/geonames/utils.py +115 -0
  93. pyobo/sources/gmt_utils.py +6 -7
  94. pyobo/sources/go.py +20 -13
  95. pyobo/sources/gtdb.py +154 -0
  96. pyobo/sources/gwascentral/__init__.py +9 -0
  97. pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
  98. pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
  99. pyobo/sources/hgnc/__init__.py +9 -0
  100. pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
  101. pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
  102. pyobo/sources/icd/__init__.py +9 -0
  103. pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
  104. pyobo/sources/icd/icd11.py +148 -0
  105. pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
  106. pyobo/sources/interpro.py +4 -9
  107. pyobo/sources/itis.py +0 -5
  108. pyobo/sources/kegg/__init__.py +0 -0
  109. pyobo/sources/kegg/api.py +16 -38
  110. pyobo/sources/kegg/genes.py +9 -20
  111. pyobo/sources/kegg/genome.py +1 -7
  112. pyobo/sources/kegg/pathway.py +9 -21
  113. pyobo/sources/mesh.py +58 -24
  114. pyobo/sources/mgi.py +3 -10
  115. pyobo/sources/mirbase/__init__.py +11 -0
  116. pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
  117. pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
  118. pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
  119. pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
  120. pyobo/sources/msigdb.py +74 -39
  121. pyobo/sources/ncbi/__init__.py +9 -0
  122. pyobo/sources/ncbi/ncbi_gc.py +162 -0
  123. pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
  124. pyobo/sources/nih_reporter.py +60 -0
  125. pyobo/sources/nlm/__init__.py +9 -0
  126. pyobo/sources/nlm/nlm_catalog.py +48 -0
  127. pyobo/sources/nlm/nlm_publisher.py +36 -0
  128. pyobo/sources/nlm/utils.py +116 -0
  129. pyobo/sources/npass.py +6 -8
  130. pyobo/sources/omim_ps.py +10 -3
  131. pyobo/sources/pathbank.py +4 -8
  132. pyobo/sources/pfam/__init__.py +9 -0
  133. pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
  134. pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
  135. pyobo/sources/pharmgkb/__init__.py +15 -0
  136. pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
  137. pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
  138. pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
  139. pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
  140. pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
  141. pyobo/sources/pharmgkb/utils.py +86 -0
  142. pyobo/sources/pid.py +1 -6
  143. pyobo/sources/pombase.py +6 -10
  144. pyobo/sources/pubchem.py +4 -9
  145. pyobo/sources/reactome.py +5 -11
  146. pyobo/sources/rgd.py +11 -16
  147. pyobo/sources/rhea.py +37 -36
  148. pyobo/sources/ror.py +69 -42
  149. pyobo/sources/selventa/__init__.py +0 -0
  150. pyobo/sources/selventa/schem.py +4 -7
  151. pyobo/sources/selventa/scomp.py +1 -6
  152. pyobo/sources/selventa/sdis.py +4 -7
  153. pyobo/sources/selventa/sfam.py +1 -6
  154. pyobo/sources/sgd.py +6 -11
  155. pyobo/sources/signor/__init__.py +7 -0
  156. pyobo/sources/signor/download.py +41 -0
  157. pyobo/sources/signor/signor_complexes.py +105 -0
  158. pyobo/sources/slm.py +12 -15
  159. pyobo/sources/umls/__init__.py +7 -1
  160. pyobo/sources/umls/__main__.py +0 -0
  161. pyobo/sources/umls/get_synonym_types.py +20 -4
  162. pyobo/sources/umls/sty.py +57 -0
  163. pyobo/sources/umls/synonym_types.tsv +1 -1
  164. pyobo/sources/umls/umls.py +18 -22
  165. pyobo/sources/unimod.py +46 -0
  166. pyobo/sources/uniprot/__init__.py +1 -1
  167. pyobo/sources/uniprot/uniprot.py +40 -32
  168. pyobo/sources/uniprot/uniprot_ptm.py +4 -34
  169. pyobo/sources/utils.py +3 -2
  170. pyobo/sources/wikipathways.py +7 -10
  171. pyobo/sources/zfin.py +5 -10
  172. pyobo/ssg/__init__.py +12 -16
  173. pyobo/ssg/base.html +0 -0
  174. pyobo/ssg/index.html +26 -13
  175. pyobo/ssg/term.html +12 -2
  176. pyobo/ssg/typedef.html +0 -0
  177. pyobo/struct/__init__.py +54 -8
  178. pyobo/struct/functional/__init__.py +1 -0
  179. pyobo/struct/functional/dsl.py +2572 -0
  180. pyobo/struct/functional/macros.py +423 -0
  181. pyobo/struct/functional/obo_to_functional.py +385 -0
  182. pyobo/struct/functional/ontology.py +270 -0
  183. pyobo/struct/functional/utils.py +112 -0
  184. pyobo/struct/reference.py +331 -136
  185. pyobo/struct/struct.py +1413 -643
  186. pyobo/struct/struct_utils.py +1078 -0
  187. pyobo/struct/typedef.py +162 -210
  188. pyobo/struct/utils.py +12 -5
  189. pyobo/struct/vocabulary.py +138 -0
  190. pyobo/utils/__init__.py +0 -0
  191. pyobo/utils/cache.py +13 -11
  192. pyobo/utils/io.py +17 -31
  193. pyobo/utils/iter.py +5 -5
  194. pyobo/utils/misc.py +41 -53
  195. pyobo/utils/ndex_utils.py +0 -0
  196. pyobo/utils/path.py +76 -70
  197. pyobo/version.py +3 -3
  198. {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/METADATA +224 -225
  199. pyobo-0.12.0.dist-info/RECORD +202 -0
  200. pyobo-0.12.0.dist-info/WHEEL +4 -0
  201. {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
  202. {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info/licenses}/LICENSE +0 -0
  203. pyobo/apps/__init__.py +0 -3
  204. pyobo/apps/cli.py +0 -24
  205. pyobo/apps/gilda/__init__.py +0 -3
  206. pyobo/apps/gilda/__main__.py +0 -8
  207. pyobo/apps/gilda/app.py +0 -48
  208. pyobo/apps/gilda/cli.py +0 -36
  209. pyobo/apps/gilda/templates/base.html +0 -33
  210. pyobo/apps/gilda/templates/home.html +0 -11
  211. pyobo/apps/gilda/templates/matches.html +0 -32
  212. pyobo/apps/mapper/__init__.py +0 -3
  213. pyobo/apps/mapper/__main__.py +0 -11
  214. pyobo/apps/mapper/cli.py +0 -37
  215. pyobo/apps/mapper/mapper.py +0 -187
  216. pyobo/apps/mapper/templates/base.html +0 -35
  217. pyobo/apps/mapper/templates/mapper_home.html +0 -64
  218. pyobo/aws.py +0 -162
  219. pyobo/cli/aws.py +0 -47
  220. pyobo/identifier_utils.py +0 -142
  221. pyobo/normalizer.py +0 -232
  222. pyobo/registries/__init__.py +0 -16
  223. pyobo/registries/metaregistry.json +0 -507
  224. pyobo/registries/metaregistry.py +0 -135
  225. pyobo/sources/icd11.py +0 -105
  226. pyobo/xrefdb/__init__.py +0 -1
  227. pyobo/xrefdb/canonicalizer.py +0 -214
  228. pyobo/xrefdb/priority.py +0 -59
  229. pyobo/xrefdb/sources/__init__.py +0 -60
  230. pyobo/xrefdb/sources/biomappings.py +0 -36
  231. pyobo/xrefdb/sources/cbms2019.py +0 -91
  232. pyobo/xrefdb/sources/chembl.py +0 -83
  233. pyobo/xrefdb/sources/compath.py +0 -82
  234. pyobo/xrefdb/sources/famplex.py +0 -64
  235. pyobo/xrefdb/sources/gilda.py +0 -50
  236. pyobo/xrefdb/sources/intact.py +0 -113
  237. pyobo/xrefdb/sources/ncit.py +0 -133
  238. pyobo/xrefdb/sources/pubchem.py +0 -27
  239. pyobo/xrefdb/sources/wikidata.py +0 -116
  240. pyobo-0.11.1.dist-info/RECORD +0 -173
  241. pyobo-0.11.1.dist-info/WHEEL +0 -5
  242. pyobo-0.11.1.dist-info/top_level.txt +0 -1
"""Download utilities for SIGNOR."""

import enum

import pandas as pd
import requests

from pyobo.utils.path import prefix_directory_join

__all__ = [
    "DownloadKey",
    "download_signor",
    "get_signor_df",
]

#: Seconds to wait for the SIGNOR server before giving up
TIMEOUT = 300


class DownloadKey(enum.Enum):
    """A key for one of the downloadable SIGNOR entity files.

    Each member's value is the literal form-field text that the SIGNOR
    download endpoint expects in its ``submit`` field.
    """

    complex = "Download complex data"
    family = "Download protein family data"
    phenotype = "Download phenotype data"
    stimulus = "Download stimulus data"


def download_signor(key: DownloadKey) -> requests.Response:
    """Download an entity file from SIGNOR.

    :param key: Which SIGNOR file to download.
    :returns: The raw HTTP response (the body is a semicolon-separated CSV).
    :raises requests.HTTPError: If the server responds with an error status,
        so a failure page is never mistaken for data by the caller.
    """
    res = requests.post(
        "https://signor.uniroma2.it/download_complexes.php",
        files={"submit": (None, key.value)},
        timeout=TIMEOUT,
    )
    # Fail loudly here rather than letting an HTML error page be cached as CSV
    res.raise_for_status()
    return res


def get_signor_df(prefix: str, *, version: str, key: DownloadKey, force: bool = False) -> pd.DataFrame:
    """Get the appropriate SIGNOR dataframe, downloading and caching it if necessary.

    :param prefix: The resource prefix, used to choose the cache directory.
    :param version: The data version, used to choose the cache directory.
    :param key: Which SIGNOR file to load.
    :param force: If true, re-download even when a cached copy exists.
    :returns: The parsed semicolon-separated dataframe.
    """
    path = prefix_directory_join(prefix, version=version, name=f"{key.name}.csv")
    if force or not path.is_file():
        res = download_signor(key)
        path.write_text(res.text)
    return pd.read_csv(path, sep=";")
"""A source for SIGNOR complexes."""

from collections.abc import Iterable

import pandas as pd

from pyobo import Obo, Reference, Term, default_reference
from pyobo.sources.signor.download import DownloadKey, get_signor_df
from pyobo.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED
from pyobo.struct.typedef import exact_match, has_component, has_member

__all__ = [
    "SignorGetter",
]

PREFIX = "signor"

# Ad-hoc root terms injected into the ontology so every SIGNOR record gets a
# parent; each is attributed to its contributor and flagged as PyOBO-injected.
PROTEIN_FAMILY = (
    Term(reference=default_reference(PREFIX, "protein-family"))
    .append_contributor(CHARLIE_TERM)
    .append_comment(PYOBO_INJECTED)
)
PROTEIN_COMPLEX = (
    Term(reference=default_reference(PREFIX, "protein-complex"))
    .append_contributor(CHARLIE_TERM)
    .append_comment(PYOBO_INJECTED)
)
PHENOTYPE = (
    Term(reference=default_reference(PREFIX, "phenotype"))
    .append_contributor(CHARLIE_TERM)
    .append_comment(PYOBO_INJECTED)
)
STIMULUS = (
    Term(reference=default_reference(PREFIX, "stimulus"))
    .append_contributor(CHARLIE_TERM)
    .append_comment(PYOBO_INJECTED)
)
ROOT_TERMS = (PROTEIN_FAMILY, PROTEIN_COMPLEX, PHENOTYPE, STIMULUS)


class SignorGetter(Obo):
    """An ontology representation of SIGNOR complexes."""

    ontology = bioversions_key = PREFIX
    typedefs = [exact_match, has_component, has_member]
    root_terms = [r.reference for r in ROOT_TERMS]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology."""
        return iter_terms(version=self._version_or_raise, force=force)


def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over terms.

    :param version: The SIGNOR data version used for download caching.
    :param force: If true, re-download the underlying files.
    :yields: The injected metadata terms and the four root terms first, then
        one term per SIGNOR complex, family, stimulus, and phenotype record.
    """
    yield CHARLIE_TERM
    yield HUMAN_TERM
    yield from ROOT_TERMS

    complexes_df = get_signor_df(PREFIX, version=version, force=force, key=DownloadKey.complex)
    for identifier, name, proteins in complexes_df.values:
        term = Term.from_triple(PREFIX, identifier, name)
        term.append_parent(PROTEIN_COMPLEX)
        # The components column is a comma-separated mix of SIGNOR and
        # CURIE-prefixed ChEBI identifiers.
        for part_id in proteins.split(","):
            part_id = part_id.strip()
            if part_id.startswith("SIGNOR-"):
                part = Reference(prefix="signor", identifier=part_id)
            elif part_id.startswith("CHEBI:"):
                part = Reference(prefix="chebi", identifier=part_id.removeprefix("CHEBI:"))
            else:
                # assumes any other identifier is a UniProt accession — TODO confirm
                part = Reference(prefix="uniprot", identifier=part_id)
            term.annotate_object(has_component, part)
        yield term

    family_df = get_signor_df(PREFIX, version=version, force=force, key=DownloadKey.family)
    for identifier, name, proteins in family_df.values:
        term = Term.from_triple(PREFIX, identifier, name)
        term.append_parent(PROTEIN_FAMILY)
        # Family members are a comma-separated list of UniProt accessions
        for uniprot_id in proteins.split(","):
            uniprot_id = uniprot_id.strip()
            term.annotate_object(has_member, Reference(prefix="uniprot", identifier=uniprot_id))
        yield term

    stimulus_df = get_signor_df(PREFIX, version=version, force=force, key=DownloadKey.stimulus)
    # for some reason, there are many duplicates in this file
    stimulus_df = stimulus_df.drop_duplicates()
    for identifier, name, description in stimulus_df.values:
        term = Term.from_triple(PREFIX, identifier, name, definition=_clean_descr(description))
        term.append_parent(STIMULUS)
        yield term

    phenotypes_df = get_signor_df(PREFIX, version=version, force=force, key=DownloadKey.phenotype)
    for identifier, name, description in phenotypes_df.values:
        term = Term.from_triple(PREFIX, identifier, name, definition=_clean_descr(description))
        term.append_parent(PHENOTYPE)
        yield term


def _clean_descr(d) -> str | None:
    """Normalize a free-text description, returning None for missing (NaN) values."""
    if pd.isna(d):
        return None
    return d.replace("\n", " ")


if __name__ == "__main__":
    SignorGetter.cli()
pyobo/sources/slm.py CHANGED
@@ -5,9 +5,9 @@ from collections.abc import Iterable
5
5
  import pandas as pd
6
6
  from tqdm.auto import tqdm
7
7
 
8
- from pyobo import Obo, Reference, Term
8
+ from pyobo import Obo, Reference, Term, TypeDef
9
9
  from pyobo.struct.struct import abbreviation as abbreviation_typedef
10
- from pyobo.struct.typedef import exact_match, has_inchi, has_smiles
10
+ from pyobo.struct.typedef import exact_match, has_citation, has_inchi, has_smiles
11
11
  from pyobo.utils.path import ensure_df
12
12
 
13
13
  __all__ = [
@@ -36,13 +36,14 @@ COLUMNS = [
36
36
  "HMDB",
37
37
  "PMID",
38
38
  ]
39
+ LEVEL = TypeDef.default(PREFIX, "level", is_metadata_tag=True)
39
40
 
40
41
 
41
42
  class SLMGetter(Obo):
42
43
  """An ontology representation of SwissLipid's lipid nomenclature."""
43
44
 
44
45
  ontology = bioversions_key = PREFIX
45
- typedefs = [exact_match]
46
+ typedefs = [exact_match, LEVEL, has_inchi, has_smiles, has_citation]
46
47
  synonym_typedefs = [abbreviation_typedef]
47
48
 
48
49
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
@@ -50,9 +51,7 @@ class SLMGetter(Obo):
50
51
  return iter_terms(force=force, version=self._version_or_raise)
51
52
 
52
53
 
53
- def get_obo(force: bool = False) -> Obo:
54
- """Get SwissLipids as OBO."""
55
- return SLMGetter(force=force)
54
+ INVALID_INCHI = {"-", "none"}
56
55
 
57
56
 
58
57
  def iter_terms(version: str, force: bool = False):
@@ -90,27 +89,25 @@ def iter_terms(version: str, force: bool = False):
90
89
  raise ValueError(identifier)
91
90
  term = Term.from_triple(PREFIX, identifier, name)
92
91
  if pd.notna(level):
93
- term.append_property("level", level)
92
+ term.annotate_string(LEVEL, level)
94
93
  if pd.notna(abbreviation):
95
94
  term.append_synonym(abbreviation, type=abbreviation_typedef)
96
95
  if pd.notna(synonyms):
97
96
  for synonym in synonyms.split("|"):
98
97
  term.append_synonym(synonym.strip())
99
98
  if pd.notna(smiles):
100
- term.append_property(has_smiles, smiles)
99
+ term.annotate_string(has_smiles, smiles)
101
100
  if pd.notna(inchi) and inchi != "InChI=none":
102
101
  if inchi.startswith("InChI="):
103
102
  inchi = inchi[len("InChI=") :]
104
- term.append_property(has_inchi, inchi)
103
+ term.annotate_string(has_inchi, inchi)
105
104
  if pd.notna(inchikey):
106
105
  inchikey = inchikey.removeprefix("InChIKey=").strip()
107
- if inchikey and inchikey != "none":
106
+ if inchikey and inchikey not in INVALID_INCHI:
108
107
  try:
109
108
  inchi_ref = Reference(prefix="inchikey", identifier=inchikey)
110
109
  except ValueError:
111
- tqdm.write(
112
- f"[slm:{identifier}] had invalid inchikey reference: ({type(inchikey)}) {inchikey}"
113
- )
110
+ tqdm.write(f"[slm:{identifier}] had invalid inchikey reference: `{inchikey}`")
114
111
  else:
115
112
  term.append_exact_match(inchi_ref)
116
113
  for chebi_id in _split(chebi_ids):
@@ -120,7 +117,7 @@ def iter_terms(version: str, force: bool = False):
120
117
  for hmdb_id in _split(hmdb_ids):
121
118
  term.append_exact_match(("hmdb", hmdb_id))
122
119
  for pubmed_id in _split(pubmed_ids):
123
- term.append_provenance(("pubmed", pubmed_id))
120
+ term.append_provenance(Reference(prefix="pubmed", identifier=pubmed_id))
124
121
  # TODO how to handle class, parents, and components?
125
122
  yield term
126
123
 
@@ -134,4 +131,4 @@ def _split(s: str) -> Iterable[str]:
134
131
 
135
132
 
136
133
  if __name__ == "__main__":
137
- get_obo().write_default(write_obo=True, use_tqdm=True)
134
+ SLMGetter.cli()
@@ -1,3 +1,9 @@
1
1
  """Converter for UMLS."""
2
2
 
3
- from .umls import UMLSGetter, get_obo # noqa: F401
3
+ from .sty import UMLSSTyGetter
4
+ from .umls import UMLSGetter
5
+
6
+ __all__ = [
7
+ "UMLSGetter",
8
+ "UMLSSTyGetter",
9
+ ]
File without changes
@@ -1,19 +1,22 @@
1
1
  """Utilities for UMLS synonyms."""
2
2
 
3
+ import re
3
4
  from collections.abc import Mapping
4
5
  from pathlib import Path
5
6
 
6
7
  import requests
7
8
  from bs4 import BeautifulSoup
8
9
 
10
+ from pyobo.struct import SynonymTypeDef, default_reference
9
11
  from pyobo.utils.io import open_map_tsv, write_map_tsv
10
12
 
11
- __all__ = ["get_umls_synonyms"]
13
+ __all__ = ["get_umls_synonyms", "get_umls_typedefs"]
12
14
 
13
15
  HERE = Path(__file__).parent.resolve()
14
16
  SYNONYM_TYPE_PATH = HERE.joinpath("synonym_types.tsv")
15
17
 
16
18
  ABBREVIATIONS_URL = "https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html"
19
+ SPACES = re.compile(r"\s+")
17
20
 
18
21
 
19
22
  def get_umls_synonyms(*, refresh: bool = False) -> Mapping[str, str]:
@@ -23,14 +26,27 @@ def get_umls_synonyms(*, refresh: bool = False) -> Mapping[str, str]:
23
26
  res = requests.get(ABBREVIATIONS_URL, timeout=5)
24
27
  soup = BeautifulSoup(res.text, features="html.parser")
25
28
  table = soup.find(id="mrdoc_TTY")
26
- body = table.find("tbody")
29
+ if table is None:
30
+ raise ValueError
31
+ body = table.find("tbody") # type:ignore[attr-defined]
32
+ if body is None:
33
+ raise ValueError
27
34
  rv = {}
28
35
  for row in body.find_all("tr"):
29
36
  left, right = row.find_all("td")
30
- rv[left.text.strip()] = right.text.strip()
37
+ rv[left.text.strip()] = SPACES.sub(" ", right.text.strip())
31
38
  write_map_tsv(path=SYNONYM_TYPE_PATH, rv=rv, header=["key", "name"])
32
39
  return rv
33
40
 
34
41
 
42
def get_umls_typedefs(*, refresh: bool = False) -> dict[str, SynonymTypeDef]:
    """Get all synonym type definitions, keyed by UMLS term-type abbreviation."""
    rv: dict[str, SynonymTypeDef] = {}
    for identifier, name in get_umls_synonyms(refresh=refresh).items():
        reference = default_reference("umls", identifier, name=name)
        rv[identifier] = SynonymTypeDef(reference=reference)
    return rv
49
+
50
+
35
51
  if __name__ == "__main__":
36
- get_umls_synonyms(refresh=True)
52
+ get_umls_typedefs(refresh=True)
"""Converter for UMLS Semantic Types."""

from collections.abc import Iterable

from pyobo import Obo, Reference, Term, default_reference
from pyobo.struct.typedef import has_category
from pyobo.utils.path import ensure_df

__all__ = [
    "UMLSSTyGetter",
]

PREFIX = "sty"

URL = "https://www.nlm.nih.gov/research/umls/knowledge_sources/semantic_network/SemGroups.txt"


class UMLSSTyGetter(Obo):
    """An ontology representation of UMLS Semantic Types."""

    ontology = PREFIX
    bioversions_key = "umls"
    typedefs = [has_category]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology."""
        return iter_terms(version=self._version_or_raise)


#: Column labels assigned to the pipe-separated, header-less SemGroups.txt file
COLUMNS = [
    "group",
    "group_label",
    "sty_id",
    "sty_name",
]


def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over UMLS semantic type terms.

    Yields one term per distinct semantic group first, then one term per
    semantic type, each parented to its group's term.
    """
    df = ensure_df(PREFIX, url=URL, version=version, sep="|", header=None, names=COLUMNS)

    # Build a parent term for each distinct semantic group
    group_terms: dict[str, Term] = {}
    for group_id, group_label in df[["group", "group_label"]].drop_duplicates().values:
        group_terms[group_id] = Term(
            reference=default_reference(PREFIX, group_id, name=group_label),
        )
    yield from group_terms.values()

    # Build a term for each semantic type, nested under its group's term
    for group_id, _, sty_id, sty_name in df.values:
        sty_term = Term(reference=Reference(prefix=PREFIX, identifier=sty_id, name=sty_name))
        sty_term.append_parent(group_terms[group_id])
        yield sty_term


if __name__ == "__main__":
    UMLSSTyGetter.cli()
@@ -146,6 +146,7 @@ OAM Obsolete Modifier Abbreviation
146
146
  OAP Obsolete active preferred term
147
147
  OAS Obsolete active synonym
148
148
  OC Nursing outcomes
149
+ ODN Obsolete Display Name
149
150
  OET Obsolete entry term
150
151
  OF Obsolete fully specified name
151
152
  OL Non-current Lower Level Term
@@ -188,7 +189,6 @@ PX Expanded preferred terms (pair with PS)
188
189
  PXQ Preferred qualifier term
189
190
  QAB Qualifier abbreviation
190
191
  QEV Qualifier entry version
191
- QSV Qualifier sort version
192
192
  RAB Root abbreviation
193
193
  RHT Root hierarchical term
194
194
  RPT Root preferred term
@@ -15,7 +15,7 @@ from umls_downloader import open_umls, open_umls_semantic_types
15
15
 
16
16
  from pyobo import Obo, Reference, Synonym, SynonymTypeDef, Term
17
17
 
18
- from .get_synonym_types import get_umls_synonyms
18
+ from .get_synonym_types import get_umls_typedefs
19
19
 
20
20
  __all__ = [
21
21
  "UMLSGetter",
@@ -46,30 +46,26 @@ RRF_COLUMNS = [
46
46
 
47
47
  PREFIX = "umls"
48
48
  SOURCE_VOCAB_URL = "https://www.nlm.nih.gov/research/umls/sourcereleasedocs/index.html"
49
- SYNONYM_ABB = get_umls_synonyms()
49
+ UMLS_TYPEDEFS: dict[str, SynonymTypeDef] = get_umls_typedefs()
50
50
 
51
51
 
52
52
  class UMLSGetter(Obo):
53
53
  """An ontology representation of UMLS."""
54
54
 
55
55
  ontology = bioversions_key = PREFIX
56
- synonym_typedefs = [SynonymTypeDef.from_text(v) for v in SYNONYM_ABB.values()]
56
+ synonym_typedefs = list(UMLS_TYPEDEFS.values())
57
57
 
58
58
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
59
59
  """Iterate over terms in the ontology."""
60
60
  return iter_terms(version=self._version_or_raise)
61
61
 
62
62
 
63
- def get_obo() -> Obo:
64
- """Get UMLS as OBO."""
65
- return UMLSGetter()
66
-
67
-
68
63
  def get_semantic_types() -> Mapping[str, set[str]]:
69
64
  """Get UMLS semantic types for each term."""
70
65
  dd = defaultdict(set)
71
66
  with open_umls_semantic_types() as file:
72
- for line in tqdm(file, unit_scale=True):
67
+ # this is very fast and doesn't need a progress bar
68
+ for line in file:
73
69
  cui, sty, _ = line.decode("utf8").split("|", 2)
74
70
  dd[cui].add(sty)
75
71
  return dict(dd)
@@ -80,7 +76,7 @@ def iter_terms(version: str) -> Iterable[Term]:
80
76
  semantic_types = get_semantic_types()
81
77
 
82
78
  with open_umls(version=version) as file:
83
- it = tqdm(file, unit_scale=True, desc="[umls] parsing")
79
+ it = tqdm(file, unit_scale=True, desc="[umls] parsing", total=16_700_000)
84
80
  lines = (line.decode("utf-8").strip().split("|") for line in it)
85
81
  for cui, cui_lines in itt.groupby(lines, key=operator.itemgetter(0)):
86
82
  df = pd.DataFrame(list(cui_lines), columns=RRF_COLUMNS)
@@ -96,38 +92,38 @@ def iter_terms(version: str) -> Iterable[Term]:
96
92
  continue
97
93
 
98
94
  df["TTY - Term Type in Source"] = df["TTY - Term Type in Source"].map(
99
- SYNONYM_ABB.__getitem__
95
+ UMLS_TYPEDEFS.__getitem__
100
96
  )
101
97
 
102
98
  _r = pref_rows_df.iloc[0]
103
99
  sdf = df[["SAB - source name", "CODE", "TTY - Term Type in Source", "STR"]]
104
100
 
105
101
  synonyms = []
106
- xrefs = []
102
+ xrefs = set()
107
103
  for source, identifier, synonym_type, synonym in sdf.values:
108
104
  norm_source = bioregistry.normalize_prefix(source)
109
- if norm_source is None or not identifier:
105
+ if not norm_source or not identifier or "," in identifier:
110
106
  provenance = []
111
107
  else:
112
- ref = Reference(prefix=norm_source, identifier=identifier)
113
- provenance = [ref]
114
- xrefs.append(ref)
108
+ try:
109
+ ref = Reference(prefix=norm_source, identifier=identifier)
110
+ except ValueError:
111
+ continue
112
+ else:
113
+ provenance = [ref]
114
+ xrefs.add(ref)
115
115
  synonyms.append(
116
116
  Synonym(
117
117
  name=synonym,
118
118
  provenance=provenance,
119
- type=SynonymTypeDef.from_text(synonym_type),
119
+ type=synonym_type.reference,
120
120
  )
121
121
  )
122
122
 
123
- xrefs = sorted(
124
- set(xrefs), key=lambda reference: (reference.prefix, reference.identifier)
125
- )
126
-
127
123
  term = Term(
128
124
  reference=Reference(prefix=PREFIX, identifier=cui, name=_r["STR"]),
129
125
  synonyms=synonyms,
130
- xrefs=xrefs,
126
+ xrefs=sorted(xrefs),
131
127
  )
132
128
  for sty_id in semantic_types.get(cui, set()):
133
129
  term.append_parent(Reference(prefix="sty", identifier=sty_id))
"""Unimod provides an OBO file, but it's got lots of errors in its encoding."""

from collections.abc import Iterable

from lxml import etree

from pyobo.struct import Obo, Reference, Term
from pyobo.utils.path import ensure_path

#: The URL of the Unimod XML dump
URL = "https://www.unimod.org/xml/unimod.xml"
#: Namespace map used for lookups in the Unimod XML
PREFIX_MAP = {"umod": "http://www.unimod.org/xmlns/schema/unimod_2"}
PREFIX = "unimod"


class UnimodGetter(Obo):
    """An ontology representation of the unimod modifications."""

    ontology = bioversions_key = PREFIX
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology."""
        return get_terms()


def get_terms() -> Iterable[Term]:
    """Get Unimod modification terms parsed from the XML dump."""
    # Use the module-level PREFIX constant instead of repeating the literal
    path = ensure_path(PREFIX, url=URL)
    root = etree.parse(path).getroot()
    mods = root.findall("umod:modifications/umod:mod", namespaces=PREFIX_MAP)
    return map(_mod_to_term, mods)


def _mod_to_term(mod: etree.Element) -> Term:
    """Convert a single ``umod:mod`` XML element into a term.

    The element's ``title`` attribute becomes the term's name and its
    ``full_name`` becomes the definition, unless it just repeats the title.
    """
    title = mod.attrib["title"]
    name = mod.attrib["full_name"]
    identifier = mod.attrib["record_id"]
    return Term(
        reference=Reference(prefix=PREFIX, identifier=identifier, name=title),
        definition=name if name != title else None,
    )


if __name__ == "__main__":
    UnimodGetter.cli()
@@ -4,7 +4,7 @@ from .uniprot import PREFIX, UniProtGetter
4
4
  from .uniprot_ptm import UniProtPtmGetter
5
5
 
6
6
  __all__ = [
7
+ "PREFIX",
7
8
  "UniProtGetter",
8
9
  "UniProtPtmGetter",
9
- "PREFIX",
10
10
  ]