pyobo 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. pyobo/.DS_Store +0 -0
  2. pyobo/__init__.py +95 -20
  3. pyobo/__main__.py +0 -0
  4. pyobo/api/__init__.py +81 -10
  5. pyobo/api/alts.py +52 -42
  6. pyobo/api/combine.py +39 -0
  7. pyobo/api/edges.py +68 -0
  8. pyobo/api/hierarchy.py +231 -203
  9. pyobo/api/metadata.py +14 -19
  10. pyobo/api/names.py +207 -127
  11. pyobo/api/properties.py +117 -113
  12. pyobo/api/relations.py +68 -94
  13. pyobo/api/species.py +24 -21
  14. pyobo/api/typedefs.py +11 -11
  15. pyobo/api/utils.py +66 -13
  16. pyobo/api/xrefs.py +108 -114
  17. pyobo/cli/__init__.py +0 -0
  18. pyobo/cli/cli.py +35 -50
  19. pyobo/cli/database.py +183 -161
  20. pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
  21. pyobo/cli/lookup.py +163 -195
  22. pyobo/cli/utils.py +19 -6
  23. pyobo/constants.py +102 -3
  24. pyobo/getters.py +196 -118
  25. pyobo/gilda_utils.py +79 -200
  26. pyobo/identifier_utils/__init__.py +41 -0
  27. pyobo/identifier_utils/api.py +296 -0
  28. pyobo/identifier_utils/model.py +130 -0
  29. pyobo/identifier_utils/preprocessing.json +812 -0
  30. pyobo/identifier_utils/preprocessing.py +61 -0
  31. pyobo/identifier_utils/relations/__init__.py +8 -0
  32. pyobo/identifier_utils/relations/api.py +162 -0
  33. pyobo/identifier_utils/relations/data.json +5824 -0
  34. pyobo/identifier_utils/relations/data_owl.json +57 -0
  35. pyobo/identifier_utils/relations/data_rdf.json +1 -0
  36. pyobo/identifier_utils/relations/data_rdfs.json +7 -0
  37. pyobo/mocks.py +9 -6
  38. pyobo/ner/__init__.py +9 -0
  39. pyobo/ner/api.py +72 -0
  40. pyobo/ner/normalizer.py +33 -0
  41. pyobo/obographs.py +43 -39
  42. pyobo/plugins.py +5 -4
  43. pyobo/py.typed +0 -0
  44. pyobo/reader.py +1358 -395
  45. pyobo/reader_utils.py +155 -0
  46. pyobo/resource_utils.py +42 -22
  47. pyobo/resources/__init__.py +0 -0
  48. pyobo/resources/goc.py +75 -0
  49. pyobo/resources/goc.tsv +188 -0
  50. pyobo/resources/ncbitaxon.py +4 -5
  51. pyobo/resources/ncbitaxon.tsv.gz +0 -0
  52. pyobo/resources/ro.py +3 -2
  53. pyobo/resources/ro.tsv +0 -0
  54. pyobo/resources/so.py +0 -0
  55. pyobo/resources/so.tsv +0 -0
  56. pyobo/sources/README.md +12 -8
  57. pyobo/sources/__init__.py +52 -29
  58. pyobo/sources/agrovoc.py +0 -0
  59. pyobo/sources/antibodyregistry.py +11 -12
  60. pyobo/sources/bigg/__init__.py +13 -0
  61. pyobo/sources/bigg/bigg_compartment.py +81 -0
  62. pyobo/sources/bigg/bigg_metabolite.py +229 -0
  63. pyobo/sources/bigg/bigg_model.py +46 -0
  64. pyobo/sources/bigg/bigg_reaction.py +77 -0
  65. pyobo/sources/biogrid.py +1 -2
  66. pyobo/sources/ccle.py +7 -12
  67. pyobo/sources/cgnc.py +0 -5
  68. pyobo/sources/chebi.py +1 -1
  69. pyobo/sources/chembl/__init__.py +9 -0
  70. pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
  71. pyobo/sources/chembl/chembl_target.py +160 -0
  72. pyobo/sources/civic_gene.py +55 -15
  73. pyobo/sources/clinicaltrials.py +160 -0
  74. pyobo/sources/complexportal.py +24 -24
  75. pyobo/sources/conso.py +14 -22
  76. pyobo/sources/cpt.py +0 -0
  77. pyobo/sources/credit.py +1 -9
  78. pyobo/sources/cvx.py +27 -5
  79. pyobo/sources/depmap.py +9 -12
  80. pyobo/sources/dictybase_gene.py +2 -7
  81. pyobo/sources/drugbank/__init__.py +9 -0
  82. pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
  83. pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
  84. pyobo/sources/drugcentral.py +17 -13
  85. pyobo/sources/expasy.py +31 -34
  86. pyobo/sources/famplex.py +13 -18
  87. pyobo/sources/flybase.py +3 -8
  88. pyobo/sources/gard.py +62 -0
  89. pyobo/sources/geonames/__init__.py +9 -0
  90. pyobo/sources/geonames/features.py +28 -0
  91. pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
  92. pyobo/sources/geonames/utils.py +115 -0
  93. pyobo/sources/gmt_utils.py +6 -7
  94. pyobo/sources/go.py +20 -13
  95. pyobo/sources/gtdb.py +154 -0
  96. pyobo/sources/gwascentral/__init__.py +9 -0
  97. pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
  98. pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
  99. pyobo/sources/hgnc/__init__.py +9 -0
  100. pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
  101. pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
  102. pyobo/sources/icd/__init__.py +9 -0
  103. pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
  104. pyobo/sources/icd/icd11.py +148 -0
  105. pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
  106. pyobo/sources/interpro.py +4 -9
  107. pyobo/sources/itis.py +0 -5
  108. pyobo/sources/kegg/__init__.py +0 -0
  109. pyobo/sources/kegg/api.py +16 -38
  110. pyobo/sources/kegg/genes.py +9 -20
  111. pyobo/sources/kegg/genome.py +1 -7
  112. pyobo/sources/kegg/pathway.py +9 -21
  113. pyobo/sources/mesh.py +58 -24
  114. pyobo/sources/mgi.py +3 -10
  115. pyobo/sources/mirbase/__init__.py +11 -0
  116. pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
  117. pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
  118. pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
  119. pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
  120. pyobo/sources/msigdb.py +74 -39
  121. pyobo/sources/ncbi/__init__.py +9 -0
  122. pyobo/sources/ncbi/ncbi_gc.py +162 -0
  123. pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
  124. pyobo/sources/nih_reporter.py +60 -0
  125. pyobo/sources/nlm/__init__.py +9 -0
  126. pyobo/sources/nlm/nlm_catalog.py +48 -0
  127. pyobo/sources/nlm/nlm_publisher.py +36 -0
  128. pyobo/sources/nlm/utils.py +116 -0
  129. pyobo/sources/npass.py +6 -8
  130. pyobo/sources/omim_ps.py +10 -3
  131. pyobo/sources/pathbank.py +4 -8
  132. pyobo/sources/pfam/__init__.py +9 -0
  133. pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
  134. pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
  135. pyobo/sources/pharmgkb/__init__.py +15 -0
  136. pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
  137. pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
  138. pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
  139. pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
  140. pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
  141. pyobo/sources/pharmgkb/utils.py +86 -0
  142. pyobo/sources/pid.py +1 -6
  143. pyobo/sources/pombase.py +6 -10
  144. pyobo/sources/pubchem.py +4 -9
  145. pyobo/sources/reactome.py +5 -11
  146. pyobo/sources/rgd.py +11 -16
  147. pyobo/sources/rhea.py +37 -36
  148. pyobo/sources/ror.py +69 -42
  149. pyobo/sources/selventa/__init__.py +0 -0
  150. pyobo/sources/selventa/schem.py +4 -7
  151. pyobo/sources/selventa/scomp.py +1 -6
  152. pyobo/sources/selventa/sdis.py +4 -7
  153. pyobo/sources/selventa/sfam.py +1 -6
  154. pyobo/sources/sgd.py +6 -11
  155. pyobo/sources/signor/__init__.py +7 -0
  156. pyobo/sources/signor/download.py +41 -0
  157. pyobo/sources/signor/signor_complexes.py +105 -0
  158. pyobo/sources/slm.py +12 -15
  159. pyobo/sources/umls/__init__.py +7 -1
  160. pyobo/sources/umls/__main__.py +0 -0
  161. pyobo/sources/umls/get_synonym_types.py +20 -4
  162. pyobo/sources/umls/sty.py +57 -0
  163. pyobo/sources/umls/synonym_types.tsv +1 -1
  164. pyobo/sources/umls/umls.py +18 -22
  165. pyobo/sources/unimod.py +46 -0
  166. pyobo/sources/uniprot/__init__.py +1 -1
  167. pyobo/sources/uniprot/uniprot.py +40 -32
  168. pyobo/sources/uniprot/uniprot_ptm.py +4 -34
  169. pyobo/sources/utils.py +3 -2
  170. pyobo/sources/wikipathways.py +7 -10
  171. pyobo/sources/zfin.py +5 -10
  172. pyobo/ssg/__init__.py +12 -16
  173. pyobo/ssg/base.html +0 -0
  174. pyobo/ssg/index.html +26 -13
  175. pyobo/ssg/term.html +12 -2
  176. pyobo/ssg/typedef.html +0 -0
  177. pyobo/struct/__init__.py +54 -8
  178. pyobo/struct/functional/__init__.py +1 -0
  179. pyobo/struct/functional/dsl.py +2572 -0
  180. pyobo/struct/functional/macros.py +423 -0
  181. pyobo/struct/functional/obo_to_functional.py +385 -0
  182. pyobo/struct/functional/ontology.py +270 -0
  183. pyobo/struct/functional/utils.py +112 -0
  184. pyobo/struct/reference.py +331 -136
  185. pyobo/struct/struct.py +1413 -643
  186. pyobo/struct/struct_utils.py +1078 -0
  187. pyobo/struct/typedef.py +162 -210
  188. pyobo/struct/utils.py +12 -5
  189. pyobo/struct/vocabulary.py +138 -0
  190. pyobo/utils/__init__.py +0 -0
  191. pyobo/utils/cache.py +13 -11
  192. pyobo/utils/io.py +17 -31
  193. pyobo/utils/iter.py +5 -5
  194. pyobo/utils/misc.py +41 -53
  195. pyobo/utils/ndex_utils.py +0 -0
  196. pyobo/utils/path.py +76 -70
  197. pyobo/version.py +3 -3
  198. {pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/METADATA +228 -229
  199. pyobo-0.12.0.dist-info/RECORD +202 -0
  200. pyobo-0.12.0.dist-info/WHEEL +4 -0
  201. {pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
  202. pyobo-0.12.0.dist-info/licenses/LICENSE +21 -0
  203. pyobo/aws.py +0 -162
  204. pyobo/cli/aws.py +0 -47
  205. pyobo/identifier_utils.py +0 -142
  206. pyobo/normalizer.py +0 -232
  207. pyobo/registries/__init__.py +0 -16
  208. pyobo/registries/metaregistry.json +0 -507
  209. pyobo/registries/metaregistry.py +0 -135
  210. pyobo/sources/icd11.py +0 -105
  211. pyobo/xrefdb/__init__.py +0 -1
  212. pyobo/xrefdb/canonicalizer.py +0 -214
  213. pyobo/xrefdb/priority.py +0 -59
  214. pyobo/xrefdb/sources/__init__.py +0 -60
  215. pyobo/xrefdb/sources/biomappings.py +0 -36
  216. pyobo/xrefdb/sources/cbms2019.py +0 -91
  217. pyobo/xrefdb/sources/chembl.py +0 -83
  218. pyobo/xrefdb/sources/compath.py +0 -82
  219. pyobo/xrefdb/sources/famplex.py +0 -64
  220. pyobo/xrefdb/sources/gilda.py +0 -50
  221. pyobo/xrefdb/sources/intact.py +0 -113
  222. pyobo/xrefdb/sources/ncit.py +0 -133
  223. pyobo/xrefdb/sources/pubchem.py +0 -27
  224. pyobo/xrefdb/sources/wikidata.py +0 -116
  225. pyobo-0.11.2.dist-info/RECORD +0 -157
  226. pyobo-0.11.2.dist-info/WHEEL +0 -5
  227. pyobo-0.11.2.dist-info/top_level.txt +0 -1
@@ -0,0 +1,162 @@
1
+ """Convert NCBI Genetic Codes to an ontology.
2
+
3
+ .. seealso::
4
+
5
+ https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes
6
+ """
7
+
8
+ from collections.abc import Iterable
9
+
10
+ from pyobo import default_reference
11
+ from pyobo.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED, Obo, Reference, Term, TypeDef
12
+ from pyobo.struct.typedef import comment, has_contributor, see_also, term_replaced_by
13
+ from pyobo.utils.path import ensure_path
14
+
15
+ PREFIX = "ncbi.gc"
16
+ URI_PREFIX = (
17
+ "https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes#SG"
18
+ )
19
+ URL = "ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt"
20
+ VERSION = "4.6"
21
+
22
+ GC_ROOT = default_reference(prefix=PREFIX, identifier="root", name="genetic code translation table")
23
+ NCBITAXON_ROOT = Reference(prefix="NCBITaxon", identifier="1", name="root")
24
+
25
+ has_gc_code = TypeDef(
26
+ reference=default_reference(
27
+ prefix=PREFIX,
28
+ identifier="hasGeneticCodeTranslationTable",
29
+ name="has genetic code translation table",
30
+ ),
31
+ definition="Connects a taxonomy term to a genetic code translation table",
32
+ domain=NCBITAXON_ROOT,
33
+ range=GC_ROOT,
34
+ ).append_contributor(CHARLIE_TERM)
35
+
36
+ NUCLEAR_GENETIC_CODE = default_reference(
37
+ prefix=PREFIX, identifier="nuclear-genetic-code", name="nuclear genetic code translation table"
38
+ )
39
+ MITOCHONDRIAL_GENETIC_CODE = default_reference(
40
+ prefix=PREFIX,
41
+ identifier="mitochondrial-genetic-code",
42
+ name="mitochondrial genetic code translation table",
43
+ )
44
+ PLASTID_GENETIC_CODE = default_reference(
45
+ prefix=PREFIX, identifier="plastid-genetic-code", name="plastid genetic code translation table"
46
+ )
47
+ NUCLEUS = Reference(prefix="GO", identifier="0005634", name="nucleus")
48
+ MITOCHONDIA = Reference(prefix="GO", identifier="0005739", name="mitochondrion")
49
+ PLASTID = Reference(prefix="GO", identifier="0009536", name="plastid")
50
+
51
+ CATEGORY_TO_CELLULAR_COMPONENT = {
52
+ NUCLEAR_GENETIC_CODE: NUCLEUS,
53
+ MITOCHONDRIAL_GENETIC_CODE: MITOCHONDIA,
54
+ PLASTID_GENETIC_CODE: PLASTID,
55
+ }
56
+ CATEGORY_TO_TABLES = {
57
+ NUCLEAR_GENETIC_CODE: [12, 31, 6, 28, 10, 27, 29, 26, 30, 15],
58
+ MITOCHONDRIAL_GENETIC_CODE: [14, 13, 16, 9, 5, 4, 22, 23, 21, 2, 3, 24],
59
+ PLASTID_GENETIC_CODE: [11, 32],
60
+ }
61
+ TABLE_TO_CATEGORY = {
62
+ str(value): key for key, values in CATEGORY_TO_TABLES.items() for value in values
63
+ }
64
+
65
+
66
+ class NCBIGCGetter(Obo):
67
+ """Get terms in GC."""
68
+
69
+ ontology = PREFIX
70
+ static_version = VERSION
71
+ root_terms = [GC_ROOT]
72
+ typedefs = [has_gc_code, has_contributor, see_also, comment, term_replaced_by]
73
+
74
+ def iter_terms(self, force: bool = False) -> Iterable[Term]:
75
+ """Iterate over terms in the ontology."""
76
+ return get_terms()
77
+
78
+
79
+ def get_terms() -> Iterable[Term]:
80
+ """Get terms for GC."""
81
+ yield CHARLIE_TERM
82
+ yield Term(reference=NCBITAXON_ROOT)
83
+ yield HUMAN_TERM
84
+
85
+ path = ensure_path(PREFIX, url=URL)
86
+ # first, remove comment lines
87
+ lines = [
88
+ line.strip()
89
+ for line in path.read_text().splitlines()
90
+ if not line.startswith("--") and line.strip()
91
+ ]
92
+
93
+ lines = lines[1:-2]
94
+ entries: list[dict[str, str]] = []
95
+ entry: dict[str, str] = {}
96
+ for line in lines:
97
+ # start a new entry
98
+ if line == "{":
99
+ if entry:
100
+ entries.append(entry)
101
+ entry = {}
102
+ elif line == "},":
103
+ pass
104
+ else:
105
+ key, data = line.split(" ", 1)
106
+ if key == "name":
107
+ data = data.lstrip('"')
108
+ if data.startswith("SGC"):
109
+ key = "symbol"
110
+ entry[key] = data.rstrip(",").rstrip().rstrip('"')
111
+ elif key == "id":
112
+ entry["identifier"] = data.rstrip(",").rstrip()
113
+
114
+ yield (
115
+ Term(
116
+ reference=GC_ROOT,
117
+ definition="A table for translating codons into amino acids. This can change for "
118
+ "different taxa, or be different in different organelles that include genetic information.",
119
+ )
120
+ .append_contributor(CHARLIE_TERM)
121
+ .append_comment(PYOBO_INJECTED)
122
+ )
123
+
124
+ for reference in CATEGORY_TO_TABLES:
125
+ term = Term(reference=reference)
126
+ term.append_parent(GC_ROOT)
127
+ term.append_contributor(CHARLIE_TERM)
128
+ term.append_comment(PYOBO_INJECTED)
129
+ if substructure := CATEGORY_TO_CELLULAR_COMPONENT.get(reference):
130
+ term.append_see_also(substructure)
131
+ yield term
132
+
133
+ for entry in entries:
134
+ identifier = entry["identifier"]
135
+ term = Term.from_triple(PREFIX, identifier, entry["name"])
136
+ term.append_parent(TABLE_TO_CATEGORY.get(identifier, GC_ROOT))
137
+ # TODO if symbol is available, what does it mean?
138
+ yield term
139
+
140
+ yield (
141
+ Term(
142
+ reference=Reference(prefix=PREFIX, identifier="7"),
143
+ is_obsolete=True,
144
+ )
145
+ .append_replaced_by(Reference(prefix=PREFIX, identifier="4"))
146
+ .append_comment("Kinetoplast code now merged in code id 4, as of 1995.")
147
+ )
148
+ yield (
149
+ Term(
150
+ reference=Reference(prefix=PREFIX, identifier="8"),
151
+ is_obsolete=True,
152
+ )
153
+ .append_replaced_by(Reference(prefix=PREFIX, identifier="1"))
154
+ .append_comment("all plant chloroplast differences due to RNA edit, as of 1995.")
155
+ )
156
+
157
+ for cellular_component in CATEGORY_TO_CELLULAR_COMPONENT.values():
158
+ yield Term(reference=cellular_component)
159
+
160
+
161
+ if __name__ == "__main__":
162
+ NCBIGCGetter.cli()
@@ -7,8 +7,8 @@ import bioregistry
7
7
  import pandas as pd
8
8
  from tqdm.auto import tqdm
9
9
 
10
- from ..struct import Obo, Reference, Term, from_species
11
- from ..utils.path import ensure_df
10
+ from ...struct import Obo, Reference, Term, from_species
11
+ from ...utils.path import ensure_df
12
12
 
13
13
  __all__ = [
14
14
  "NCBIGeneGetter",
@@ -34,7 +34,7 @@ CONSORTIUM_SPECIES_MAPPING = {
34
34
  }
35
35
 
36
36
  GENE_INFO_URL = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz"
37
- #: Columns fro gene_info.gz that are used
37
+ #: Columns for gene_info.gz that are used
38
38
  GENE_INFO_COLUMNS = [
39
39
  "#tax_id",
40
40
  "GeneID",
@@ -93,11 +93,6 @@ class NCBIGeneGetter(Obo):
93
93
  return get_terms(force=force)
94
94
 
95
95
 
96
- def get_obo(force: bool = False) -> Obo:
97
- """Get Entrez as OBO."""
98
- return NCBIGeneGetter(force=force)
99
-
100
-
101
96
  def get_gene_info_df(force: bool = False) -> pd.DataFrame:
102
97
  """Get the gene info dataframe."""
103
98
  return ensure_df(
@@ -111,17 +106,16 @@ def get_gene_info_df(force: bool = False) -> pd.DataFrame:
111
106
  )
112
107
 
113
108
 
114
- """xref_mapping was obtained from:
115
-
116
- namespaces = set()
117
- for xrefs in df[df['dbXrefs'].notna()]['dbXrefs']:
118
- for xref in xrefs.split('|'):
119
- namespaces.add(xref.split(':')[0])
109
+ def _get_xref_mapping() -> list[str]:
110
+ namespaces: set[str] = set()
111
+ df = get_gene_info_df()
112
+ for xrefs in df[df["dbXrefs"].notna()]["dbXrefs"]:
113
+ for xref in xrefs.split("|"):
114
+ namespaces.add(xref.split(":")[0])
115
+ return sorted(namespaces, key=str.casefold)
120
116
 
121
- print('namespaces:')
122
- print(*sorted(namespaces), sep='\n')
123
- """
124
117
 
118
+ # this was retrieved from :func:`_get_xref_mapping`
125
119
  xref_mapping = {
126
120
  "APHIDBASE",
127
121
  "ASAP",
@@ -157,7 +151,12 @@ xref_mapping = {x.lower() for x in xref_mapping}
157
151
 
158
152
 
159
153
  def get_terms(force: bool = False) -> Iterable[Term]:
160
- """Get Entrez terms."""
154
+ """Get Entrez terms.
155
+
156
+ :param force: should re-download be forced?
157
+
158
+ :yields: terms for each line
159
+ """
161
160
  df = get_gene_info_df(force=force)
162
161
 
163
162
  it = tqdm(
@@ -192,4 +191,4 @@ def get_terms(force: bool = False) -> Iterable[Term]:
192
191
 
193
192
 
194
193
  if __name__ == "__main__":
195
- get_obo().write_default()
194
+ NCBIGeneGetter.cli()
@@ -0,0 +1,60 @@
1
+ """A source for NIH RePORTER projects."""
2
+
3
+ from collections.abc import Iterable
4
+
5
+ import pandas as pd
6
+ from nih_reporter_downloader import get_projects_df
7
+
8
+ from pyobo import Reference
9
+ from pyobo.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED, Obo, Term, default_reference
10
+
11
+ __all__ = [
12
+ "NIHReporterGetter",
13
+ ]
14
+
15
+ PREFIX = "nihreporter.project"
16
+ PROJECTS_SUBSET = [
17
+ "APPLICATION_ID",
18
+ "PROJECT_TITLE",
19
+ ]
20
+
21
+ PROJECT_TERM = (
22
+ Term(reference=default_reference(PREFIX, "project", name="project"))
23
+ .append_contributor(CHARLIE_TERM)
24
+ .append_comment(PYOBO_INJECTED)
25
+ )
26
+
27
+
28
+ class NIHReporterGetter(Obo):
29
+ """An ontology representation of NIH RePORTER."""
30
+
31
+ ontology = PREFIX
32
+ dynamic_version = True
33
+
34
+ def iter_terms(self, force: bool = False) -> Iterable[Term]:
35
+ """Iterate over terms in the ontology."""
36
+ yield CHARLIE_TERM
37
+ yield HUMAN_TERM
38
+ yield PROJECT_TERM
39
+ yield from iterate_nih_reporter_projects()
40
+
41
+
42
+ def iterate_nih_reporter_projects() -> Iterable[Term]:
43
+ """Iterate over NIH RePORTER projects."""
44
+ projects_df = get_projects_df()
45
+ for identifier, name in projects_df[PROJECTS_SUBSET].values:
46
+ term = Term(
47
+ reference=Reference(
48
+ prefix=PREFIX,
49
+ identifier=str(identifier),
50
+ name=name.replace("\n", " ") if pd.notna(name) else None,
51
+ ),
52
+ type="Instance",
53
+ )
54
+ term.append_parent(PROJECT_TERM)
55
+ # TODO there is a lot more information that can be added here
56
+ yield term
57
+
58
+
59
+ if __name__ == "__main__":
60
+ NIHReporterGetter.cli()
@@ -0,0 +1,9 @@
1
+ """Sources from NLM."""
2
+
3
+ from .nlm_catalog import NLMCatalogGetter
4
+ from .nlm_publisher import NLMPublisherGetter
5
+
6
+ __all__ = [
7
+ "NLMCatalogGetter",
8
+ "NLMPublisherGetter",
9
+ ]
@@ -0,0 +1,48 @@
1
+ """Converter for NLM Providers."""
2
+
3
+ from collections.abc import Iterable
4
+
5
+ from pyobo.sources.nlm.utils import (
6
+ JOURNAL_TERM,
7
+ PREFIX_CATALOG,
8
+ PUBLISHED_IN,
9
+ PUBLISHER_TERM,
10
+ get_journals,
11
+ get_publishers,
12
+ )
13
+ from pyobo.struct import CHARLIE_TERM, HUMAN_TERM, Obo, Term
14
+ from pyobo.struct.typedef import exact_match, has_end_date, has_start_date
15
+
16
+ __all__ = [
17
+ "NLMCatalogGetter",
18
+ ]
19
+
20
+
21
+ class NLMCatalogGetter(Obo):
22
+ """An ontology representation of NLM Providers."""
23
+
24
+ bioversions_key = ontology = PREFIX_CATALOG
25
+ dynamic_version = True
26
+ typedefs = [PUBLISHED_IN, has_end_date, has_start_date, exact_match]
27
+ root_terms = [JOURNAL_TERM.reference, PUBLISHER_TERM.reference]
28
+
29
+ def iter_terms(self, force: bool = False) -> Iterable[Term]:
30
+ """Iterate over journal terms for NLM Catalog."""
31
+ yield from get_terms(force=force)
32
+
33
+
34
+ def get_terms(*, force: bool = False) -> Iterable[Term]:
35
+ """Get NLM catalog terms."""
36
+ yield JOURNAL_TERM
37
+ yield PUBLISHER_TERM
38
+ yield CHARLIE_TERM
39
+ yield HUMAN_TERM
40
+
41
+ journal_id_to_publisher_key = get_publishers(force=force)
42
+ yield from sorted(set(journal_id_to_publisher_key.values()))
43
+
44
+ yield from get_journals(force=force, journal_id_to_publisher_key=journal_id_to_publisher_key)
45
+
46
+
47
+ if __name__ == "__main__":
48
+ NLMCatalogGetter.cli()
@@ -0,0 +1,36 @@
1
+ """Converter for NLM Providers."""
2
+
3
+ from collections.abc import Iterable
4
+
5
+ from pyobo.sources.nlm.utils import PREFIX_PUBLISHER, PUBLISHER_TERM, get_publishers
6
+ from pyobo.struct import CHARLIE_TERM, HUMAN_TERM, Obo, Term
7
+
8
+ __all__ = [
9
+ "NLMPublisherGetter",
10
+ ]
11
+
12
+
13
+ class NLMPublisherGetter(Obo):
14
+ """An ontology representation of NLM Publishers."""
15
+
16
+ bioversions_key = ontology = PREFIX_PUBLISHER
17
+ dynamic_version = True
18
+ root_terms = [PUBLISHER_TERM.reference]
19
+
20
+ def iter_terms(self, force: bool = False) -> Iterable[Term]:
21
+ """Iterate over gene terms for NLM Catalog."""
22
+ yield from get_terms(force=force)
23
+
24
+
25
+ def get_terms(*, force: bool = False) -> Iterable[Term]:
26
+ """Get NLM publisher terms."""
27
+ yield PUBLISHER_TERM
28
+ yield CHARLIE_TERM
29
+ yield HUMAN_TERM
30
+
31
+ journal_id_to_publisher_key = get_publishers(force=force)
32
+ yield from sorted(set(journal_id_to_publisher_key.values()))
33
+
34
+
35
+ if __name__ == "__main__":
36
+ NLMPublisherGetter.cli()
@@ -0,0 +1,116 @@
1
+ """Utilities for NLM."""
2
+
3
+ from collections.abc import Iterable
4
+ from xml.etree import ElementTree
5
+
6
+ from tqdm import tqdm
7
+
8
+ from pyobo import Reference, Term, TypeDef, default_reference, ensure_path
9
+ from pyobo.struct.struct import CHARLIE_TERM, PYOBO_INJECTED
10
+ from pyobo.struct.typedef import has_end_date, has_start_date
11
+ from pyobo.utils.path import ensure_df
12
+
13
+ PREFIX_CATALOG = "nlm"
14
+ PREFIX_PUBLISHER = "nlm.publisher"
15
+
16
+ CATALOG_TO_PUBLISHER = "https://ftp.ncbi.nlm.nih.gov/pubmed/xmlprovidernames.txt"
17
+ JOURNAL_INFO_PATH = "https://ftp.ncbi.nlm.nih.gov/pubmed/jourcache.xml"
18
+ PUBLISHED_IN = TypeDef(
19
+ reference=default_reference(PREFIX_CATALOG, "published_in", name="published in"),
20
+ xrefs=[
21
+ Reference(prefix="biolink", identifier="published_in"),
22
+ Reference(prefix="uniprot.core", identifier="publishedIn"),
23
+ ],
24
+ )
25
+ JOURNAL_TERM = (
26
+ Term(reference=default_reference(PREFIX_CATALOG, "journal", name="journal"))
27
+ .append_exact_match(Reference(prefix="SIO", identifier="000160"))
28
+ .append_exact_match(Reference(prefix="FBCV", identifier="0000787"))
29
+ .append_exact_match(Reference(prefix="MI", identifier="0885"))
30
+ .append_exact_match(Reference(prefix="bibo", identifier="Journal"))
31
+ .append_exact_match(Reference(prefix="uniprot.core", identifier="Journal"))
32
+ .append_contributor(CHARLIE_TERM)
33
+ .append_comment(PYOBO_INJECTED)
34
+ )
35
+ PUBLISHER_TERM = (
36
+ Term(reference=default_reference(PREFIX_CATALOG, "publisher", name="publisher"))
37
+ .append_exact_match(Reference(prefix="biolink", identifier="publisher"))
38
+ .append_exact_match(Reference(prefix="schema", identifier="publisher"))
39
+ .append_exact_match(Reference(prefix="uniprot.core", identifier="publisher"))
40
+ .append_contributor(CHARLIE_TERM)
41
+ .append_comment(PYOBO_INJECTED)
42
+ )
43
+
44
+
45
+ def get_publishers(*, force: bool = False) -> dict[str, Term]:
46
+ """Get NLM publishers."""
47
+ journal_to_publisher_df = ensure_df(
48
+ PREFIX_CATALOG, url=CATALOG_TO_PUBLISHER, sep="|", force=force, dtype=str
49
+ )
50
+ journal_id_to_publisher_key: dict[str, Term] = {
51
+ journal_id: Term(
52
+ reference=Reference(prefix=PREFIX_PUBLISHER, identifier=identifier, name=name),
53
+ type="Instance",
54
+ ).append_parent(PUBLISHER_TERM)
55
+ for journal_id, identifier, name in journal_to_publisher_df.values
56
+ }
57
+ return journal_id_to_publisher_key
58
+
59
+
60
+ def get_journals(
61
+ *, force: bool = False, journal_id_to_publisher_key: dict[str, Term] | None = None
62
+ ) -> Iterable[Term]:
63
+ """Get NLM Catalog terms."""
64
+ path = ensure_path(PREFIX_CATALOG, url=JOURNAL_INFO_PATH, force=force)
65
+ root = ElementTree.parse(path).getroot()
66
+
67
+ if journal_id_to_publisher_key is None:
68
+ journal_id_to_publisher_key = get_publishers(force=force)
69
+ elements = root.findall("Journal")
70
+ for element in elements:
71
+ if term := _process_journal(element, journal_id_to_publisher_key):
72
+ yield term
73
+
74
+
75
+ def _process_journal(element, journal_id_to_publisher_key: dict[str, Term]) -> Term | None:
76
+ # TODO enrich with context from https://ftp.ncbi.nlm.nih.gov/pubmed/J_Entrez.txt and https://ftp.ncbi.nlm.nih.gov/pubmed/J_Medline.txt
77
+
78
+ nlm_id = element.findtext("NlmUniqueID")
79
+ name = element.findtext("Name")
80
+
81
+ if not nlm_id.isnumeric():
82
+ # TODO investigate these records, which all appear to have IDs that
83
+ # end in R like 17410670R (Proceedings of the staff meetings. Honolulu. Clinic)
84
+ # which corresponds to https://www.ncbi.nlm.nih.gov/nlmcatalog/287649
85
+ return None
86
+
87
+ issns = [(issn.text, issn.attrib["type"]) for issn in element.findall("Issn")]
88
+ # ActivityFlag is either "0" or "1"
89
+ term = Term(
90
+ reference=Reference(prefix=PREFIX_CATALOG, identifier=nlm_id, name=name),
91
+ type="Instance",
92
+ )
93
+ term.append_parent(JOURNAL_TERM)
94
+ for synonym in element.findall("Alias"):
95
+ term.append_synonym(synonym.text)
96
+ for issn, _issn_type in issns:
97
+ if issn.isnumeric():
98
+ issn = issn[:4] + "-" + issn[4:]
99
+
100
+ # TODO include ISSN type, this is important
101
+ # to determine a "canonical" one
102
+ term.append_xref(Reference(prefix="issn", identifier=issn))
103
+ if start_year := element.findtext("StartYear"):
104
+ if len(start_year) != 4:
105
+ tqdm.write(f"[{term.curie}] invalid start year: {start_year}")
106
+ else:
107
+ term.annotate_year(has_start_date, start_year)
108
+ if end_year := element.findtext("EndYear"):
109
+ if len(end_year) != 4:
110
+ tqdm.write(f"[{term.curie}] invalid end year: {end_year}")
111
+ else:
112
+ term.annotate_year(has_end_date, end_year)
113
+ # FIXME this whole thing needs reinvestigating
114
+ if publisher_reference := journal_id_to_publisher_key.get(term.identifier):
115
+ term.annotate_object(PUBLISHED_IN, publisher_reference.reference)
116
+ return term
pyobo/sources/npass.py CHANGED
@@ -6,7 +6,7 @@ from collections.abc import Iterable
6
6
  import pandas as pd
7
7
  from tqdm.auto import tqdm
8
8
 
9
- from ..struct import Obo, Reference, Synonym, Term
9
+ from ..struct import Obo, Reference, Term
10
10
  from ..utils.path import ensure_df
11
11
 
12
12
  __all__ = [
@@ -32,11 +32,6 @@ class NPASSGetter(Obo):
32
32
  return iter_terms(force=force, version=self._version_or_raise)
33
33
 
34
34
 
35
- def get_obo(force: bool = False) -> Obo:
36
- """Get NPASS as OBO."""
37
- return NPASSGetter()
38
-
39
-
40
35
  def get_df(version: str, force: bool = False) -> pd.DataFrame:
41
36
  """Get the NPASS chemical nomenclature."""
42
37
  base_url = f"https://bidd.group/NPASS/downloadFiles/NPASSv{version}_download"
@@ -71,7 +66,10 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
71
66
  # TODO check that the first is always the parent compound?
72
67
  if pd.notna(pubchem_compound_ids):
73
68
  pubchem_compound_ids = [
74
- yy.strip() for xx in pubchem_compound_ids.split(";") for yy in xx.strip().split(",")
69
+ zz
70
+ for xx in pubchem_compound_ids.split(";")
71
+ for yy in xx.strip().split(",")
72
+ if (zz := yy.strip())
75
73
  ]
76
74
  if len(pubchem_compound_ids) > 1:
77
75
  logger.debug("multiple cids for %s: %s", identifier, pubchem_compound_ids)
@@ -82,7 +80,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
82
80
 
83
81
  for synonym in [iupac]:
84
82
  if pd.notna(synonym):
85
- term.append_synonym(Synonym(name=synonym))
83
+ term.append_synonym(synonym)
86
84
 
87
85
  yield term
88
86
 
pyobo/sources/omim_ps.py CHANGED
@@ -11,7 +11,6 @@ __all__ = [
11
11
  "OMIMPSGetter",
12
12
  ]
13
13
 
14
-
15
14
  logger = logging.getLogger(__name__)
16
15
  PREFIX = "omim.ps"
17
16
  URL = "https://omim.org/phenotypicSeriesTitles/all"
@@ -25,8 +24,16 @@ class OMIMPSGetter(Obo):
25
24
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
26
25
  """Iterate over terms in the ontology."""
27
26
  soup = get_soup(URL, user_agent="Mozilla/5.0")
28
- rows = soup.find(id="mimContent").find("table").find("tbody").find_all("tr")
29
- for row in rows:
27
+ content = soup.find(id="mimContent")
28
+ if content is None:
29
+ raise ValueError
30
+ table = content.find("table") # type:ignore[attr-defined]
31
+ if table is None:
32
+ raise ValueError
33
+ tbody = table.find("tbody")
34
+ if tbody is None:
35
+ raise ValueError
36
+ for row in tbody.find_all("tr"):
30
37
  anchor = row.find("td").find("a")
31
38
  name = anchor.text.strip()
32
39
  identifier = anchor.attrs["href"][len("/phenotypicSeries/") :]
pyobo/sources/pathbank.py CHANGED
@@ -5,6 +5,7 @@ from __future__ import annotations
5
5
  import logging
6
6
  from collections import defaultdict
7
7
  from collections.abc import Iterable, Mapping
8
+ from itertools import chain
8
9
 
9
10
  import pandas as pd
10
11
  from tqdm.auto import tqdm
@@ -77,11 +78,6 @@ class PathBankGetter(Obo):
77
78
  return iter_terms(force=force, version=self._version_or_raise)
78
79
 
79
80
 
80
- def get_obo(force: bool = False) -> Obo:
81
- """Get PathBank as OBO."""
82
- return PathBankGetter(force=force)
83
-
84
-
85
81
  def get_proteins_df(version: str, force: bool = False) -> pd.DataFrame:
86
82
  """Get the proteins dataframe."""
87
83
  proteins_df = ensure_df(
@@ -165,9 +161,9 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
165
161
  # but there are weird parser errors
166
162
  )
167
163
  term.append_exact_match(Reference(prefix="smpdb", identifier=smpdb_id))
168
- term.append_property(has_category, subject.lower().replace(" ", "_"))
169
- term.extend_relationship(has_participant, smpdb_id_to_proteins[smpdb_id])
170
- term.extend_relationship(has_participant, smpdb_id_to_metabolites[smpdb_id])
164
+ term.annotate_string(has_category, subject.lower().replace(" ", "_"))
165
+ for participant in chain(smpdb_id_to_proteins[smpdb_id], smpdb_id_to_metabolites[smpdb_id]):
166
+ term.append_relationship(has_participant, participant)
171
167
  yield term
172
168
 
173
169
 
@@ -0,0 +1,9 @@
1
+ """Resources from PFAM."""
2
+
3
+ from .pfam import PfamGetter
4
+ from .pfam_clan import PfamClanGetter
5
+
6
+ __all__ = [
7
+ "PfamClanGetter",
8
+ "PfamGetter",
9
+ ]
@@ -4,8 +4,8 @@ from collections.abc import Iterable
4
4
 
5
5
  import pandas as pd
6
6
 
7
- from ..struct import Obo, Reference, Term
8
- from ..utils.path import ensure_df
7
+ from ...struct import Obo, Reference, Term
8
+ from ...utils.path import ensure_df
9
9
 
10
10
  __all__ = [
11
11
  "PfamGetter",
@@ -47,11 +47,6 @@ class PfamGetter(Obo):
47
47
  return iter_terms(self._version_or_raise, force=force)
48
48
 
49
49
 
50
- def get_obo(force: bool = False) -> Obo:
51
- """Get PFAM as OBO."""
52
- return PfamGetter(force=force)
53
-
54
-
55
50
  def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
56
51
  """Iterate PFAM terms."""
57
52
  df = get_pfam_clan_df(version=version, force=force)
@@ -67,4 +62,4 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
67
62
 
68
63
 
69
64
  if __name__ == "__main__":
70
- get_obo().write_default()
65
+ PfamGetter.cli()
@@ -5,7 +5,7 @@ from collections.abc import Iterable
5
5
  from tqdm.auto import tqdm
6
6
 
7
7
  from .pfam import get_pfam_clan_df
8
- from ..struct import Obo, Reference, Term
8
+ from ...struct import Obo, Reference, Term
9
9
 
10
10
  __all__ = [
11
11
  "PfamClanGetter",
@@ -25,11 +25,6 @@ class PfamClanGetter(Obo):
25
25
  return iter_terms(version=self._version_or_raise, force=force)
26
26
 
27
27
 
28
- def get_obo(force: bool = False) -> Obo:
29
- """Get PFAM Clans as OBO."""
30
- return PfamClanGetter(force=force)
31
-
32
-
33
28
  # TODO could get definitions from ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam33.0/Pfam-C.gz
34
29
 
35
30
 
@@ -46,4 +41,4 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
46
41
 
47
42
 
48
43
  if __name__ == "__main__":
49
- get_obo().write_default()
44
+ PfamClanGetter.cli()