pyobo 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. pyobo/.DS_Store +0 -0
  2. pyobo/__init__.py +95 -20
  3. pyobo/__main__.py +0 -0
  4. pyobo/api/__init__.py +81 -10
  5. pyobo/api/alts.py +52 -42
  6. pyobo/api/combine.py +39 -0
  7. pyobo/api/edges.py +68 -0
  8. pyobo/api/hierarchy.py +231 -203
  9. pyobo/api/metadata.py +14 -19
  10. pyobo/api/names.py +207 -127
  11. pyobo/api/properties.py +117 -113
  12. pyobo/api/relations.py +68 -94
  13. pyobo/api/species.py +24 -21
  14. pyobo/api/typedefs.py +11 -11
  15. pyobo/api/utils.py +66 -13
  16. pyobo/api/xrefs.py +108 -114
  17. pyobo/cli/__init__.py +0 -0
  18. pyobo/cli/cli.py +35 -50
  19. pyobo/cli/database.py +183 -161
  20. pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
  21. pyobo/cli/lookup.py +163 -195
  22. pyobo/cli/utils.py +19 -6
  23. pyobo/constants.py +102 -3
  24. pyobo/getters.py +196 -118
  25. pyobo/gilda_utils.py +79 -200
  26. pyobo/identifier_utils/__init__.py +41 -0
  27. pyobo/identifier_utils/api.py +296 -0
  28. pyobo/identifier_utils/model.py +130 -0
  29. pyobo/identifier_utils/preprocessing.json +812 -0
  30. pyobo/identifier_utils/preprocessing.py +61 -0
  31. pyobo/identifier_utils/relations/__init__.py +8 -0
  32. pyobo/identifier_utils/relations/api.py +162 -0
  33. pyobo/identifier_utils/relations/data.json +5824 -0
  34. pyobo/identifier_utils/relations/data_owl.json +57 -0
  35. pyobo/identifier_utils/relations/data_rdf.json +1 -0
  36. pyobo/identifier_utils/relations/data_rdfs.json +7 -0
  37. pyobo/mocks.py +9 -6
  38. pyobo/ner/__init__.py +9 -0
  39. pyobo/ner/api.py +72 -0
  40. pyobo/ner/normalizer.py +33 -0
  41. pyobo/obographs.py +43 -39
  42. pyobo/plugins.py +5 -4
  43. pyobo/py.typed +0 -0
  44. pyobo/reader.py +1358 -395
  45. pyobo/reader_utils.py +155 -0
  46. pyobo/resource_utils.py +42 -22
  47. pyobo/resources/__init__.py +0 -0
  48. pyobo/resources/goc.py +75 -0
  49. pyobo/resources/goc.tsv +188 -0
  50. pyobo/resources/ncbitaxon.py +4 -5
  51. pyobo/resources/ncbitaxon.tsv.gz +0 -0
  52. pyobo/resources/ro.py +3 -2
  53. pyobo/resources/ro.tsv +0 -0
  54. pyobo/resources/so.py +0 -0
  55. pyobo/resources/so.tsv +0 -0
  56. pyobo/sources/README.md +12 -8
  57. pyobo/sources/__init__.py +52 -29
  58. pyobo/sources/agrovoc.py +0 -0
  59. pyobo/sources/antibodyregistry.py +11 -12
  60. pyobo/sources/bigg/__init__.py +13 -0
  61. pyobo/sources/bigg/bigg_compartment.py +81 -0
  62. pyobo/sources/bigg/bigg_metabolite.py +229 -0
  63. pyobo/sources/bigg/bigg_model.py +46 -0
  64. pyobo/sources/bigg/bigg_reaction.py +77 -0
  65. pyobo/sources/biogrid.py +1 -2
  66. pyobo/sources/ccle.py +7 -12
  67. pyobo/sources/cgnc.py +0 -5
  68. pyobo/sources/chebi.py +1 -1
  69. pyobo/sources/chembl/__init__.py +9 -0
  70. pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
  71. pyobo/sources/chembl/chembl_target.py +160 -0
  72. pyobo/sources/civic_gene.py +55 -15
  73. pyobo/sources/clinicaltrials.py +160 -0
  74. pyobo/sources/complexportal.py +24 -24
  75. pyobo/sources/conso.py +14 -22
  76. pyobo/sources/cpt.py +0 -0
  77. pyobo/sources/credit.py +1 -9
  78. pyobo/sources/cvx.py +27 -5
  79. pyobo/sources/depmap.py +9 -12
  80. pyobo/sources/dictybase_gene.py +2 -7
  81. pyobo/sources/drugbank/__init__.py +9 -0
  82. pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
  83. pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
  84. pyobo/sources/drugcentral.py +17 -13
  85. pyobo/sources/expasy.py +31 -34
  86. pyobo/sources/famplex.py +13 -18
  87. pyobo/sources/flybase.py +3 -8
  88. pyobo/sources/gard.py +62 -0
  89. pyobo/sources/geonames/__init__.py +9 -0
  90. pyobo/sources/geonames/features.py +28 -0
  91. pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
  92. pyobo/sources/geonames/utils.py +115 -0
  93. pyobo/sources/gmt_utils.py +6 -7
  94. pyobo/sources/go.py +20 -13
  95. pyobo/sources/gtdb.py +154 -0
  96. pyobo/sources/gwascentral/__init__.py +9 -0
  97. pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
  98. pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
  99. pyobo/sources/hgnc/__init__.py +9 -0
  100. pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
  101. pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
  102. pyobo/sources/icd/__init__.py +9 -0
  103. pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
  104. pyobo/sources/icd/icd11.py +148 -0
  105. pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
  106. pyobo/sources/interpro.py +4 -9
  107. pyobo/sources/itis.py +0 -5
  108. pyobo/sources/kegg/__init__.py +0 -0
  109. pyobo/sources/kegg/api.py +16 -38
  110. pyobo/sources/kegg/genes.py +9 -20
  111. pyobo/sources/kegg/genome.py +1 -7
  112. pyobo/sources/kegg/pathway.py +9 -21
  113. pyobo/sources/mesh.py +58 -24
  114. pyobo/sources/mgi.py +3 -10
  115. pyobo/sources/mirbase/__init__.py +11 -0
  116. pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
  117. pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
  118. pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
  119. pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
  120. pyobo/sources/msigdb.py +74 -39
  121. pyobo/sources/ncbi/__init__.py +9 -0
  122. pyobo/sources/ncbi/ncbi_gc.py +162 -0
  123. pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
  124. pyobo/sources/nih_reporter.py +60 -0
  125. pyobo/sources/nlm/__init__.py +9 -0
  126. pyobo/sources/nlm/nlm_catalog.py +48 -0
  127. pyobo/sources/nlm/nlm_publisher.py +36 -0
  128. pyobo/sources/nlm/utils.py +116 -0
  129. pyobo/sources/npass.py +6 -8
  130. pyobo/sources/omim_ps.py +10 -3
  131. pyobo/sources/pathbank.py +4 -8
  132. pyobo/sources/pfam/__init__.py +9 -0
  133. pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
  134. pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
  135. pyobo/sources/pharmgkb/__init__.py +15 -0
  136. pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
  137. pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
  138. pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
  139. pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
  140. pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
  141. pyobo/sources/pharmgkb/utils.py +86 -0
  142. pyobo/sources/pid.py +1 -6
  143. pyobo/sources/pombase.py +6 -10
  144. pyobo/sources/pubchem.py +4 -9
  145. pyobo/sources/reactome.py +5 -11
  146. pyobo/sources/rgd.py +11 -16
  147. pyobo/sources/rhea.py +37 -36
  148. pyobo/sources/ror.py +69 -42
  149. pyobo/sources/selventa/__init__.py +0 -0
  150. pyobo/sources/selventa/schem.py +4 -7
  151. pyobo/sources/selventa/scomp.py +1 -6
  152. pyobo/sources/selventa/sdis.py +4 -7
  153. pyobo/sources/selventa/sfam.py +1 -6
  154. pyobo/sources/sgd.py +6 -11
  155. pyobo/sources/signor/__init__.py +7 -0
  156. pyobo/sources/signor/download.py +41 -0
  157. pyobo/sources/signor/signor_complexes.py +105 -0
  158. pyobo/sources/slm.py +12 -15
  159. pyobo/sources/umls/__init__.py +7 -1
  160. pyobo/sources/umls/__main__.py +0 -0
  161. pyobo/sources/umls/get_synonym_types.py +20 -4
  162. pyobo/sources/umls/sty.py +57 -0
  163. pyobo/sources/umls/synonym_types.tsv +1 -1
  164. pyobo/sources/umls/umls.py +18 -22
  165. pyobo/sources/unimod.py +46 -0
  166. pyobo/sources/uniprot/__init__.py +1 -1
  167. pyobo/sources/uniprot/uniprot.py +40 -32
  168. pyobo/sources/uniprot/uniprot_ptm.py +4 -34
  169. pyobo/sources/utils.py +3 -2
  170. pyobo/sources/wikipathways.py +7 -10
  171. pyobo/sources/zfin.py +5 -10
  172. pyobo/ssg/__init__.py +12 -16
  173. pyobo/ssg/base.html +0 -0
  174. pyobo/ssg/index.html +26 -13
  175. pyobo/ssg/term.html +12 -2
  176. pyobo/ssg/typedef.html +0 -0
  177. pyobo/struct/__init__.py +54 -8
  178. pyobo/struct/functional/__init__.py +1 -0
  179. pyobo/struct/functional/dsl.py +2572 -0
  180. pyobo/struct/functional/macros.py +423 -0
  181. pyobo/struct/functional/obo_to_functional.py +385 -0
  182. pyobo/struct/functional/ontology.py +270 -0
  183. pyobo/struct/functional/utils.py +112 -0
  184. pyobo/struct/reference.py +331 -136
  185. pyobo/struct/struct.py +1413 -643
  186. pyobo/struct/struct_utils.py +1078 -0
  187. pyobo/struct/typedef.py +162 -210
  188. pyobo/struct/utils.py +12 -5
  189. pyobo/struct/vocabulary.py +138 -0
  190. pyobo/utils/__init__.py +0 -0
  191. pyobo/utils/cache.py +13 -11
  192. pyobo/utils/io.py +17 -31
  193. pyobo/utils/iter.py +5 -5
  194. pyobo/utils/misc.py +41 -53
  195. pyobo/utils/ndex_utils.py +0 -0
  196. pyobo/utils/path.py +76 -70
  197. pyobo/version.py +3 -3
  198. {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/METADATA +224 -225
  199. pyobo-0.12.0.dist-info/RECORD +202 -0
  200. pyobo-0.12.0.dist-info/WHEEL +4 -0
  201. {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
  202. {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info/licenses}/LICENSE +0 -0
  203. pyobo/apps/__init__.py +0 -3
  204. pyobo/apps/cli.py +0 -24
  205. pyobo/apps/gilda/__init__.py +0 -3
  206. pyobo/apps/gilda/__main__.py +0 -8
  207. pyobo/apps/gilda/app.py +0 -48
  208. pyobo/apps/gilda/cli.py +0 -36
  209. pyobo/apps/gilda/templates/base.html +0 -33
  210. pyobo/apps/gilda/templates/home.html +0 -11
  211. pyobo/apps/gilda/templates/matches.html +0 -32
  212. pyobo/apps/mapper/__init__.py +0 -3
  213. pyobo/apps/mapper/__main__.py +0 -11
  214. pyobo/apps/mapper/cli.py +0 -37
  215. pyobo/apps/mapper/mapper.py +0 -187
  216. pyobo/apps/mapper/templates/base.html +0 -35
  217. pyobo/apps/mapper/templates/mapper_home.html +0 -64
  218. pyobo/aws.py +0 -162
  219. pyobo/cli/aws.py +0 -47
  220. pyobo/identifier_utils.py +0 -142
  221. pyobo/normalizer.py +0 -232
  222. pyobo/registries/__init__.py +0 -16
  223. pyobo/registries/metaregistry.json +0 -507
  224. pyobo/registries/metaregistry.py +0 -135
  225. pyobo/sources/icd11.py +0 -105
  226. pyobo/xrefdb/__init__.py +0 -1
  227. pyobo/xrefdb/canonicalizer.py +0 -214
  228. pyobo/xrefdb/priority.py +0 -59
  229. pyobo/xrefdb/sources/__init__.py +0 -60
  230. pyobo/xrefdb/sources/biomappings.py +0 -36
  231. pyobo/xrefdb/sources/cbms2019.py +0 -91
  232. pyobo/xrefdb/sources/chembl.py +0 -83
  233. pyobo/xrefdb/sources/compath.py +0 -82
  234. pyobo/xrefdb/sources/famplex.py +0 -64
  235. pyobo/xrefdb/sources/gilda.py +0 -50
  236. pyobo/xrefdb/sources/intact.py +0 -113
  237. pyobo/xrefdb/sources/ncit.py +0 -133
  238. pyobo/xrefdb/sources/pubchem.py +0 -27
  239. pyobo/xrefdb/sources/wikidata.py +0 -116
  240. pyobo-0.11.1.dist-info/RECORD +0 -173
  241. pyobo-0.11.1.dist-info/WHEEL +0 -5
  242. pyobo-0.11.1.dist-info/top_level.txt +0 -1
@@ -0,0 +1,115 @@
1
+ """Shared code for geonames sources."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable
6
+
7
+ import pandas as pd
8
+ from tqdm import tqdm
9
+
10
+ from pyobo import Reference, Term, TypeDef, default_reference
11
+ from pyobo.struct.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED
12
+ from pyobo.utils.path import ensure_df
13
+
14
+ PREFIX = "geonames"
15
+ PREFIX_FEATURE = "geonames.feature"
16
+
17
+ FEATURES_URL = "https://download.geonames.org/export/dump/featureCodes_en.txt"
18
+ COUNTRIES_URL = "https://download.geonames.org/export/dump/countryInfo.txt"
19
+ ADMIN1_URL = "https://download.geonames.org/export/dump/admin1CodesASCII.txt"
20
+ ADMIN2_URL = "https://download.geonames.org/export/dump/admin2Codes.txt"
21
+ CITIES_URL = "https://download.geonames.org/export/dump/cities15000.zip"
22
+ SYNONYMS_URL = "https://download.geonames.org/export/dump/alternateNamesV2.zip"
23
+
24
+ # External parent classes
25
+ CITY = Reference(prefix="ENVO", identifier="00000856", name="city")
26
+ NATION = Reference(prefix="ENVO", identifier="00000009", name="national geopolitical entity")
27
+ ADMIN_1 = Reference(prefix="ENVO", identifier="00000005", name="first-order administrative region")
28
+ ADMIN_2 = Reference(prefix="ENVO", identifier="00000006", name="second-order administrative region")
29
+
30
+ # Builtin classes
31
+ FEATURE = default_reference(PREFIX_FEATURE, "feature", "GeoNames feature")
32
+ FEATURE_TERM = Term(reference=FEATURE)
33
+
34
+ # Type definitions
35
+ CODE_TYPEDEF = TypeDef(
36
+ reference=default_reference(PREFIX, "code", name="GeoNames code"), is_metadata_tag=True
37
+ )
38
+
39
+ SYNONYMS_DF_COLUMNS = [
40
+ "id",
41
+ "geonames_id",
42
+ "iso_lang",
43
+ "synonym",
44
+ "is_preferred",
45
+ "is_short",
46
+ "is_colloquial",
47
+ "is_historic",
48
+ "start_time",
49
+ "end_time",
50
+ ]
51
+
52
+ P_CATEGORY = default_reference(PREFIX_FEATURE, "P", "city feature")
53
+
54
# Top-level GeoNames feature classes, keyed by the single letter that precedes
# the "." in a feature code (``get_features`` looks up this letter to assign a
# parent to each feature term).
FEATURE_CATEGORIES = {
    "A": default_reference(PREFIX_FEATURE, "A", "geopolitical feature"),
    "H": default_reference(PREFIX_FEATURE, "H", "aquatic feature"),
    # label previously read "floral feature feature" (duplicated word)
    "V": default_reference(PREFIX_FEATURE, "V", "floral feature"),
    "S": default_reference(PREFIX_FEATURE, "S", "building feature"),
    "U": default_reference(PREFIX_FEATURE, "U", "undersea feature"),
    "T": default_reference(PREFIX_FEATURE, "T", "geographic feature"),
    "L": default_reference(PREFIX_FEATURE, "L", "parks feature"),
    "P": P_CATEGORY,
    "R": default_reference(PREFIX_FEATURE, "R", "road or rail feature"),
}
65
+
66
+
67
def get_features(*, force: bool = False) -> dict[str, Term]:
    """Load the GeoNames feature codes and build one term per code.

    :param force: Passed through to :func:`ensure_df` when fetching the feature code table
    :returns: A mapping from feature identifier to its term. Rows whose identifier is
        missing or the literal string ``null`` are skipped.
    """
    # NOTE(review): no ``header=`` option is passed here - confirm how ensure_df
    # treats the first row of this file.
    table = ensure_df(
        PREFIX,
        url=FEATURES_URL,
        force=force,
        keep_default_na=False,  # NA is a country code
        dtype=str,
    )
    features: dict[str, Term] = {}
    for feature_id, label, text in table.values:
        if pd.isna(feature_id) or feature_id == "null":
            continue

        reference = Reference(
            prefix=PREFIX_FEATURE,
            identifier=feature_id,
            name=None if pd.isna(label) else label,
        )
        feature = Term(
            reference=reference,
            definition=None if pd.isna(text) else text,
        )

        # Feature codes look like "<category letter>.<code>"; the letter picks the parent.
        category, _, suffix = feature_id.partition(".")
        if suffix and category in FEATURE_CATEGORIES:
            feature.append_parent(FEATURE_CATEGORIES[category])
        elif suffix:
            tqdm.write(f"[{PREFIX_FEATURE}] unhandled category: {category}")
        else:
            tqdm.write(f"[{PREFIX_FEATURE}] unhandled identifier: {feature_id}")

        # The term is kept even when no parent could be assigned.
        features[feature_id] = feature
    return features
97
+
98
+
99
def get_feature_terms(
    force: bool = False, features: dict[str, Term] | None = None
) -> Iterable[Term]:
    """Yield all terms that make up the GeoNames feature vocabulary.

    :param force: Passed to :func:`get_features` when ``features`` is not supplied
    :param features: Already-loaded feature terms keyed by identifier. If ``None``,
        they are loaded with :func:`get_features`.

    :yields: The root feature term, ``HUMAN_TERM`` and ``CHARLIE_TERM``, one term per
        entry in ``FEATURE_CATEGORIES``, and finally every feature term.
    """
    yield from (FEATURE_TERM, HUMAN_TERM, CHARLIE_TERM)

    for category in FEATURE_CATEGORIES.values():
        # Each category is placed under the root feature term and annotated
        # with CHARLIE_TERM as contributor and PYOBO_INJECTED as comment.
        category_term = Term(reference=category).append_parent(FEATURE_TERM)
        category_term = category_term.append_contributor(CHARLIE_TERM)
        category_term = category_term.append_comment(PYOBO_INJECTED)
        yield category_term

    # Loading is deferred until the category terms have been yielded.
    loaded = get_features(force=force) if features is None else features
    yield from loaded.values()
@@ -2,17 +2,17 @@
2
2
 
3
3
  from collections.abc import Iterable
4
4
  from pathlib import Path
5
- from typing import Union
6
5
 
7
6
  GMTSummary = tuple[str, str, set[str]]
8
7
  WikiPathwaysGMTSummary = tuple[str, str, str, str, str, set[str]]
9
8
 
10
9
 
11
- def parse_gmt_file(path: Union[str, Path]) -> Iterable[GMTSummary]:
10
+ def parse_gmt_file(path: str | Path) -> Iterable[GMTSummary]:
12
11
  """Return file as list of pathway - gene sets (ENTREZ-identifiers).
13
12
 
14
13
  :param path: path to GMT file
15
- :return: line-based processed file
14
+
15
+ :yields: processed lines
16
16
  """
17
17
  with open(path) as file:
18
18
  for line in file:
@@ -23,15 +23,14 @@ def _process_line(line: str) -> tuple[str, str, set[str]]:
23
23
  """Return the pathway name, url, and gene sets associated.
24
24
 
25
25
  :param line: gmt file line
26
- :return: pathway name
27
- :return: pathway info url
28
- :return: genes set associated
26
+
27
+ :returns: pathway name, pathway info url, and genes set associated
29
28
  """
30
29
  name, info, *entries = (p.strip() for p in line.split("\t"))
31
30
  return name, info, set(entries)
32
31
 
33
32
 
34
- def parse_wikipathways_gmt(path: Union[str, Path]) -> Iterable[WikiPathwaysGMTSummary]:
33
+ def parse_wikipathways_gmt(path: str | Path) -> Iterable[WikiPathwaysGMTSummary]:
35
34
  """Parse WikiPathways GMT."""
36
35
  for info, _uri, entries in parse_gmt_file(path):
37
36
  info, version, identifier, species = info.split("%")
pyobo/sources/go.py CHANGED
@@ -4,33 +4,46 @@ from pyobo import get_descendants
4
4
 
5
5
  __all__ = [
6
6
  "is_biological_process",
7
- "is_molecular_function",
8
7
  "is_cellular_component",
8
+ "is_molecular_function",
9
9
  ]
10
10
 
11
11
 
12
12
  def is_biological_process(identifier: str) -> bool:
13
13
  """Return if the given GO identifier is a biological process.
14
14
 
15
+ :param identifier: A local unique identifier from GO
16
+ :return: If the identifier is a biological process
17
+
15
18
  >>> is_biological_process("0006915")
16
19
  True
17
20
  >>> is_biological_process("GO:0006915")
18
21
  True
19
- >>> is_molecular_function("0006915")
20
- False
21
- >>> is_cellular_component("0006915")
22
- False
23
22
  """
24
23
  return _is_descendant(identifier, "0008150")
25
24
 
26
25
 
27
26
  def is_molecular_function(identifier: str) -> bool:
28
- """Return if the given GO identifier is a molecular function."""
27
+ """Return if the given GO identifier is a molecular function.
28
+
29
+ :param identifier: A local unique identifier from GO
30
+ :return: If the identifier is a molecular function
31
+
32
+ >>> is_molecular_function("0006915")
33
+ False
34
+ """
29
35
  return _is_descendant(identifier, "0003674")
30
36
 
31
37
 
32
38
  def is_cellular_component(identifier: str) -> bool:
33
- """Return if the given GO identifier is a cellular component."""
39
+ """Return if the given GO identifier is a cellular component.
40
+
41
+ :param identifier: A local unique identifier from GO
42
+ :return: If the identifier is a cellular component
43
+
44
+ >>> is_cellular_component("0006915")
45
+ False
46
+ """
34
47
  return _is_descendant(identifier, "0005575")
35
48
 
36
49
 
@@ -40,9 +53,3 @@ def _is_descendant(identifier: str, ancestor: str) -> bool:
40
53
  identifier = f"go:{identifier}"
41
54
  descendants = get_descendants("go", ancestor)
42
55
  return descendants is not None and identifier in descendants
43
-
44
-
45
- if __name__ == "__main__":
46
- import doctest
47
-
48
- doctest.testmod()
pyobo/sources/gtdb.py ADDED
@@ -0,0 +1,154 @@
1
+ """Convert GTDB taxonomy to OBO format."""
2
+
3
+ import logging
4
+ from collections.abc import Iterable
5
+
6
+ import pandas as pd
7
+ from tqdm.auto import tqdm
8
+
9
+ from pyobo.struct import Obo, Reference, Term
10
+ from pyobo.struct.typedef import has_taxonomy_rank
11
+ from pyobo.utils.path import ensure_path
12
+
13
+ __all__ = [
14
+ "GTDBGetter",
15
+ ]
16
+
17
+ PREFIX = "gtdb"
18
+
19
+ #: A mapping from GTDB prefixes to TAXRANK ranks
20
+ LEVEL_TO_TAXRANK = {
21
+ "d": Reference(prefix="TAXRANK", identifier="0000037", name="domain"),
22
+ "p": Reference(prefix="TAXRANK", identifier="0000001", name="phylum"),
23
+ "c": Reference(prefix="TAXRANK", identifier="0000002", name="class"),
24
+ "o": Reference(prefix="TAXRANK", identifier="0000003", name="order"),
25
+ "f": Reference(prefix="TAXRANK", identifier="0000004", name="family"),
26
+ "g": Reference(prefix="TAXRANK", identifier="0000005", name="genus"),
27
+ "s": Reference(prefix="TAXRANK", identifier="0000006", name="species"),
28
+ }
29
+
30
+ #: AR stands for archaea
31
+ GTDB_AR_URL = "https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz"
32
+ #: BAC stands for bacteria
33
+ GTDB_BAC_URL = "https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz"
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
class GTDBGetter(Obo):
    """An ontology representation of the GTDB taxonomy."""

    ontology = bioversions_key = PREFIX
    typedefs = [has_taxonomy_rank]
    # Root identifiers must match the ``{level}__{name}`` identifiers built in
    # ``_process_row``. GTDB spells the archaeal domain "Archaea"; the former
    # spelling "Archea" could never match a generated term.
    root_terms = [
        Reference(prefix=PREFIX, identifier="d__Archaea", name="Archaea"),
        Reference(prefix=PREFIX, identifier="d__Bacteria", name="Bacteria"),
    ]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: Passed through to the module-level :func:`iter_terms`
        :returns: An iterable of GTDB terms for this getter's version
        """
        return iter_terms(version=self._version_or_raise, force=force)
51
+
52
+
53
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Yield the TAXRANK rank terms followed by all GTDB taxon terms.

    :param version: The GTDB version passed to :func:`ensure_path` for both metadata files
    :param force: Passed to :func:`ensure_path` for both metadata files

    :yields: One term per TAXRANK rank, then the terms produced by
        :func:`_process_row` for every row of the archaeal and bacterial tables.
    """
    # Add the taxrank terms so we get nice display in Protege
    yield from (Term(reference=rank) for rank in LEVEL_TO_TAXRANK.values())

    # Both files are resolved up front, before either table is read.
    metadata_paths = {
        "ar": ensure_path(PREFIX, url=GTDB_AR_URL, version=version, force=force),
        "bac": ensure_path(PREFIX, url=GTDB_BAC_URL, version=version, force=force),
    }
    for label, metadata_path in metadata_paths.items():
        table = pd.read_csv(metadata_path, sep="\t", dtype=str)
        rows = table[["gtdb_taxonomy", "ncbi_species_taxid"]].values
        progress = tqdm(rows, desc=f"[{PREFIX}] processing {label}", unit_scale=True)
        for lineage, taxid in progress:
            yield from _process_row(lineage, taxid)
71
+
72
+
73
def _process_row(tax_string, ncbitaxon_id) -> Iterable[Term]:
    """Yield one term per rank found in a single GTDB lineage string.

    :param tax_string: The ``gtdb_taxonomy`` cell for a row. Anything that is not
        a string (e.g. a missing cell) is logged and skipped.
    :param ncbitaxon_id: The ``ncbi_species_taxid`` cell for the same row. It is
        only attached to the species-level term, and only if it is a non-empty string.

    :yields: A term for each parsable ``level__name`` part, in the order they
        appear in the lineage. Each term after the first has the previous one as parent.
    """
    if not isinstance(tax_string, str):
        logger.warning(f"Invalid taxonomy string: {tax_string}")
        return

    taxa = _parse_tax_string(tax_string)
    if not taxa:
        logger.warning(f"No valid taxa found in: {tax_string}")
        return

    # The table is read with pandas, so a missing cell arrives as float NaN,
    # which is truthy. Require a real, non-empty string before building an xref.
    has_ncbitaxon_id = isinstance(ncbitaxon_id, str) and bool(ncbitaxon_id)

    parent_reference = None
    for level, name in taxa:
        identifier = f"{level}__{name.replace(' ', '_')}"
        term = Term(
            reference=Reference(prefix=PREFIX, identifier=identifier, name=name),
        )
        term.annotate_object(has_taxonomy_rank, LEVEL_TO_TAXRANK[level])

        if parent_reference:
            term.append_parent(parent_reference)
        if has_ncbitaxon_id and level == "s":
            # if the level is "s", it's a species. There might be multiple
            # mappings to NCBITaxon, so it is only attached as a cross-reference
            term.append_xref(
                Reference(prefix="ncbitaxon", identifier=ncbitaxon_id),
                # TODO @jose use confidence=... keyword here
            )

        yield term
        parent_reference = term.reference
103
+
104
+
105
def _parse_tax_string(tax_string: str) -> list[tuple[str, str]]:
    """Turn a GTDB lineage string into a list of ``(level, name)`` pairs.

    Parts that :func:`_parse_name` cannot parse are dropped.
    """
    parsed: list[tuple[str, str]] = []
    for piece in _split_tax_string(tax_string):
        level_name = _parse_name(piece)
        if level_name:
            parsed.append(level_name)
    return parsed
110
+
111
+
112
+ def _split_tax_string(tax_string: str) -> list[str]:
113
+ return [p.strip() for p in tax_string.split(";") if p.strip()]
114
+
115
+
116
def _parse_name(part: str) -> tuple[str, str] | None:
    """Parse a GTDB taxonomy identifier.

    :param part: One piece of a GTDB lineage, e.g. ``f__Sulfolobaceae``
    :returns: A tuple with the level and name, if parsable. Otherwise a warning
        naming the specific problem is logged and ``None`` is returned.

    >>> _parse_name("f__Sulfolobaceae")
    ('f', 'Sulfolobaceae')

    The following is malformed because it is missing a double underscore

    >>> _parse_name("f_Sulfolobaceae")

    The following is malformed because it has an invalid taxonomic level

    >>> _parse_name("x__Sulfolobaceae")

    The following is malformed because it's missing a name

    >>> _parse_name("f__")
    """
    # Each failure mode is checked separately so the logged reason is accurate.
    level, delimiter, name = part.partition("__")
    if not delimiter:
        logger.warning(f"Missing double underscore delimiter: {part}")
        return None
    if level not in LEVEL_TO_TAXRANK:
        logger.warning(f"Invalid taxonomic level `{level}` in {part}")
        return None
    if not name:
        logger.warning(f"Missing name: {part}")
        return None
    return level, name
151
+
152
+
153
+ if __name__ == "__main__":
154
+ GTDBGetter().cli()
@@ -0,0 +1,9 @@
1
+ """Resources from GWAS Central."""
2
+
3
+ from .gwascentral_phenotype import GWASCentralPhenotypeGetter
4
+ from .gwascentral_study import GWASCentralStudyGetter
5
+
6
+ __all__ = [
7
+ "GWASCentralPhenotypeGetter",
8
+ "GWASCentralStudyGetter",
9
+ ]
@@ -5,10 +5,11 @@ from collections.abc import Iterable
5
5
 
6
6
  from tqdm.auto import tqdm, trange
7
7
 
8
- from pyobo.sources.gwascentral_study import VERSION
9
8
  from pyobo.struct import Obo, Reference, Term
10
9
  from pyobo.utils.path import ensure_path
11
10
 
11
+ from .gwascentral_study import VERSION
12
+
12
13
  __all__ = [
13
14
  "GWASCentralPhenotypeGetter",
14
15
  ]
@@ -27,11 +28,6 @@ class GWASCentralPhenotypeGetter(Obo):
27
28
  return iter_terms(force=force, version=self._version_or_raise)
28
29
 
29
30
 
30
- def get_obo(force: bool = False) -> Obo:
31
- """Get GWAS Central Studies as OBO."""
32
- return GWASCentralPhenotypeGetter(force=force)
33
-
34
-
35
31
  def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
36
32
  """Iterate over terms from GWAS Central Phenotype."""
37
33
  for n in trange(1, 11000, desc=f"{PREFIX} download"):
@@ -43,11 +39,13 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
43
39
  url=f"https://www.gwascentral.org/phenotype/HGVPM{n}?format=json",
44
40
  name=f"HGVPM{n}.json",
45
41
  force=force,
42
+ backend="requests",
43
+ timeout=1,
46
44
  )
47
45
  except OSError as e:
48
46
  tqdm.write(f"{n}: {e}")
49
47
  continue
50
- with open(path) as file:
48
+ with path.open() as file:
51
49
  j = json.load(file)
52
50
 
53
51
  description = j.get("description")
@@ -3,7 +3,6 @@
3
3
  import logging
4
4
  import tarfile
5
5
  from collections.abc import Iterable
6
- from typing import Optional
7
6
  from xml.etree import ElementTree
8
7
 
9
8
  from pyobo.struct import Obo, Reference, Term, has_part
@@ -31,12 +30,7 @@ class GWASCentralStudyGetter(Obo):
31
30
  return iterate_terms(force=force, version=self._version_or_raise)
32
31
 
33
32
 
34
- def get_obo(force: bool = False):
35
- """Get GWAS Central Studies as OBO."""
36
- return GWASCentralStudyGetter(force=force)
37
-
38
-
39
- def _find_text(element, name: str) -> Optional[str]:
33
+ def _find_text(element, name: str) -> str | None:
40
34
  x = element.find(name)
41
35
  if x is not None:
42
36
  return x.text
@@ -0,0 +1,9 @@
1
+ """Resources from HGNC."""
2
+
3
+ from .hgnc import HGNCGetter
4
+ from .hgncgenefamily import HGNCGroupGetter
5
+
6
+ __all__ = [
7
+ "HGNCGetter",
8
+ "HGNCGroupGetter",
9
+ ]