pyobo 0.10.12__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. pyobo/__init__.py +0 -2
  2. pyobo/__main__.py +0 -2
  3. pyobo/api/__init__.py +0 -2
  4. pyobo/api/alts.py +6 -7
  5. pyobo/api/hierarchy.py +14 -15
  6. pyobo/api/metadata.py +3 -4
  7. pyobo/api/names.py +31 -32
  8. pyobo/api/properties.py +6 -7
  9. pyobo/api/relations.py +12 -11
  10. pyobo/api/species.py +5 -6
  11. pyobo/api/typedefs.py +1 -3
  12. pyobo/api/utils.py +61 -5
  13. pyobo/api/xrefs.py +4 -5
  14. pyobo/aws.py +3 -5
  15. pyobo/cli/__init__.py +0 -2
  16. pyobo/cli/aws.py +0 -2
  17. pyobo/cli/cli.py +0 -4
  18. pyobo/cli/database.py +1 -3
  19. pyobo/cli/lookup.py +0 -2
  20. pyobo/cli/utils.py +0 -2
  21. pyobo/constants.py +1 -33
  22. pyobo/getters.py +19 -26
  23. pyobo/gilda_utils.py +19 -17
  24. pyobo/identifier_utils.py +10 -10
  25. pyobo/mocks.py +5 -6
  26. pyobo/normalizer.py +24 -24
  27. pyobo/obographs.py +8 -5
  28. pyobo/plugins.py +3 -4
  29. pyobo/py.typed +0 -0
  30. pyobo/reader.py +19 -21
  31. pyobo/registries/__init__.py +0 -2
  32. pyobo/registries/metaregistry.py +6 -8
  33. pyobo/resource_utils.py +1 -3
  34. pyobo/resources/__init__.py +0 -2
  35. pyobo/resources/ncbitaxon.py +2 -3
  36. pyobo/resources/ro.py +2 -4
  37. pyobo/resources/so.py +55 -0
  38. pyobo/resources/so.tsv +2604 -0
  39. pyobo/sources/README.md +15 -0
  40. pyobo/sources/__init__.py +0 -2
  41. pyobo/sources/agrovoc.py +3 -3
  42. pyobo/sources/antibodyregistry.py +2 -3
  43. pyobo/sources/biogrid.py +4 -4
  44. pyobo/sources/ccle.py +3 -4
  45. pyobo/sources/cgnc.py +1 -3
  46. pyobo/sources/chebi.py +2 -4
  47. pyobo/sources/chembl.py +1 -3
  48. pyobo/sources/civic_gene.py +2 -3
  49. pyobo/sources/complexportal.py +57 -20
  50. pyobo/sources/conso.py +2 -4
  51. pyobo/sources/cpt.py +1 -3
  52. pyobo/sources/credit.py +1 -1
  53. pyobo/sources/cvx.py +1 -3
  54. pyobo/sources/depmap.py +3 -4
  55. pyobo/sources/dictybase_gene.py +15 -12
  56. pyobo/sources/drugbank.py +6 -7
  57. pyobo/sources/drugbank_salt.py +3 -4
  58. pyobo/sources/drugcentral.py +9 -8
  59. pyobo/sources/expasy.py +33 -16
  60. pyobo/sources/famplex.py +3 -5
  61. pyobo/sources/flybase.py +5 -6
  62. pyobo/sources/geonames.py +1 -1
  63. pyobo/sources/gmt_utils.py +5 -6
  64. pyobo/sources/go.py +4 -6
  65. pyobo/sources/gwascentral_phenotype.py +1 -3
  66. pyobo/sources/gwascentral_study.py +2 -3
  67. pyobo/sources/hgnc.py +30 -26
  68. pyobo/sources/hgncgenefamily.py +9 -11
  69. pyobo/sources/icd10.py +3 -4
  70. pyobo/sources/icd11.py +3 -4
  71. pyobo/sources/icd_utils.py +6 -7
  72. pyobo/sources/interpro.py +3 -5
  73. pyobo/sources/itis.py +1 -3
  74. pyobo/sources/kegg/__init__.py +0 -2
  75. pyobo/sources/kegg/api.py +3 -4
  76. pyobo/sources/kegg/genes.py +3 -4
  77. pyobo/sources/kegg/genome.py +19 -9
  78. pyobo/sources/kegg/pathway.py +5 -6
  79. pyobo/sources/mesh.py +19 -21
  80. pyobo/sources/mgi.py +1 -3
  81. pyobo/sources/mirbase.py +13 -9
  82. pyobo/sources/mirbase_constants.py +0 -2
  83. pyobo/sources/mirbase_family.py +1 -3
  84. pyobo/sources/mirbase_mature.py +1 -3
  85. pyobo/sources/msigdb.py +4 -5
  86. pyobo/sources/ncbigene.py +3 -5
  87. pyobo/sources/npass.py +2 -4
  88. pyobo/sources/omim_ps.py +1 -3
  89. pyobo/sources/pathbank.py +35 -28
  90. pyobo/sources/pfam.py +1 -3
  91. pyobo/sources/pfam_clan.py +1 -3
  92. pyobo/sources/pid.py +3 -5
  93. pyobo/sources/pombase.py +7 -6
  94. pyobo/sources/pubchem.py +2 -3
  95. pyobo/sources/reactome.py +30 -11
  96. pyobo/sources/rgd.py +3 -4
  97. pyobo/sources/rhea.py +7 -8
  98. pyobo/sources/ror.py +3 -2
  99. pyobo/sources/selventa/__init__.py +0 -2
  100. pyobo/sources/selventa/schem.py +1 -3
  101. pyobo/sources/selventa/scomp.py +1 -3
  102. pyobo/sources/selventa/sdis.py +1 -3
  103. pyobo/sources/selventa/sfam.py +1 -3
  104. pyobo/sources/sgd.py +1 -3
  105. pyobo/sources/slm.py +29 -17
  106. pyobo/sources/umls/__init__.py +0 -2
  107. pyobo/sources/umls/__main__.py +0 -2
  108. pyobo/sources/umls/get_synonym_types.py +1 -1
  109. pyobo/sources/umls/umls.py +2 -4
  110. pyobo/sources/uniprot/__init__.py +0 -2
  111. pyobo/sources/uniprot/uniprot.py +11 -10
  112. pyobo/sources/uniprot/uniprot_ptm.py +6 -5
  113. pyobo/sources/utils.py +3 -5
  114. pyobo/sources/wikipathways.py +1 -3
  115. pyobo/sources/zfin.py +20 -9
  116. pyobo/ssg/__init__.py +3 -2
  117. pyobo/struct/__init__.py +0 -2
  118. pyobo/struct/reference.py +22 -23
  119. pyobo/struct/struct.py +132 -116
  120. pyobo/struct/typedef.py +14 -10
  121. pyobo/struct/utils.py +0 -2
  122. pyobo/utils/__init__.py +0 -2
  123. pyobo/utils/cache.py +14 -6
  124. pyobo/utils/io.py +9 -10
  125. pyobo/utils/iter.py +5 -6
  126. pyobo/utils/misc.py +1 -3
  127. pyobo/utils/ndex_utils.py +6 -7
  128. pyobo/utils/path.py +4 -5
  129. pyobo/version.py +3 -5
  130. pyobo/xrefdb/__init__.py +0 -2
  131. pyobo/xrefdb/canonicalizer.py +27 -18
  132. pyobo/xrefdb/priority.py +0 -2
  133. pyobo/xrefdb/sources/__init__.py +3 -4
  134. pyobo/xrefdb/sources/biomappings.py +0 -2
  135. pyobo/xrefdb/sources/cbms2019.py +0 -2
  136. pyobo/xrefdb/sources/chembl.py +0 -2
  137. pyobo/xrefdb/sources/compath.py +1 -3
  138. pyobo/xrefdb/sources/famplex.py +3 -5
  139. pyobo/xrefdb/sources/gilda.py +0 -2
  140. pyobo/xrefdb/sources/intact.py +5 -5
  141. pyobo/xrefdb/sources/ncit.py +1 -3
  142. pyobo/xrefdb/sources/pubchem.py +2 -5
  143. pyobo/xrefdb/sources/wikidata.py +2 -4
  144. pyobo/xrefdb/xrefs_pipeline.py +15 -16
  145. {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/LICENSE +1 -1
  146. pyobo-0.11.1.dist-info/METADATA +711 -0
  147. pyobo-0.11.1.dist-info/RECORD +173 -0
  148. {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/WHEEL +1 -1
  149. pyobo-0.11.1.dist-info/entry_points.txt +2 -0
  150. pyobo-0.10.12.dist-info/METADATA +0 -499
  151. pyobo-0.10.12.dist-info/RECORD +0 -169
  152. pyobo-0.10.12.dist-info/entry_points.txt +0 -15
  153. {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/top_level.txt +0 -0
pyobo/sources/mesh.py CHANGED
@@ -1,17 +1,16 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Parser for the MeSH descriptors."""
4
2
 
5
3
  import datetime
6
4
  import itertools as itt
7
5
  import logging
8
6
  import re
9
- from typing import Any, Collection, Dict, Iterable, List, Mapping, Optional, Set, Tuple
7
+ from collections.abc import Collection, Iterable, Mapping
8
+ from typing import Any, Optional
10
9
  from xml.etree.ElementTree import Element
11
10
 
12
11
  from tqdm.auto import tqdm
13
12
 
14
- from pyobo.api.utils import get_version
13
+ from pyobo.api.utils import safe_get_version
15
14
  from pyobo.identifier_utils import standardize_ec
16
15
  from pyobo.struct import Obo, Reference, Synonym, Term
17
16
  from pyobo.utils.cache import cached_json, cached_mapping
@@ -70,7 +69,7 @@ def get_tree_to_mesh_id(version: str) -> Mapping[str, str]:
70
69
 
71
70
  def get_terms(version: str, force: bool = False) -> Iterable[Term]:
72
71
  """Get MeSH OBO terms."""
73
- mesh_id_to_term: Dict[str, Term] = {}
72
+ mesh_id_to_term: dict[str, Term] = {}
74
73
 
75
74
  descriptors = ensure_mesh_descriptors(version=version, force=force)
76
75
  supplemental_records = ensure_mesh_supplemental_records(version=version, force=force)
@@ -80,8 +79,8 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
80
79
  name = entry["name"]
81
80
  definition = entry.get("scope_note")
82
81
 
83
- xrefs: List[Reference] = []
84
- synonyms: Set[str] = set()
82
+ xrefs: list[Reference] = []
83
+ synonyms: set[str] = set()
85
84
  for concept in entry["concepts"]:
86
85
  synonyms.add(concept["name"])
87
86
  for term in concept["terms"]:
@@ -107,7 +106,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
107
106
 
108
107
  def ensure_mesh_descriptors(
109
108
  version: str, force: bool = False, force_process: bool = False
110
- ) -> List[Mapping[str, Any]]:
109
+ ) -> list[Mapping[str, Any]]:
111
110
  """Get the parsed MeSH dictionary, and cache it if it wasn't already."""
112
111
 
113
112
  @cached_json(path=prefix_directory_join(PREFIX, name="desc.json", version=version), force=force)
@@ -133,7 +132,7 @@ def get_supplemental_url(version: str) -> str:
133
132
  return f"https://nlmpubs.nlm.nih.gov/projects/mesh/{version}/xmlmesh/supp{version}.gz"
134
133
 
135
134
 
136
- def ensure_mesh_supplemental_records(version: str, force: bool = False) -> List[Mapping[str, Any]]:
135
+ def ensure_mesh_supplemental_records(version: str, force: bool = False) -> list[Mapping[str, Any]]:
137
136
  """Get the parsed MeSH dictionary, and cache it if it wasn't already."""
138
137
 
139
138
  @cached_json(path=prefix_directory_join(PREFIX, name="supp.json", version=version), force=force)
@@ -147,11 +146,11 @@ def ensure_mesh_supplemental_records(version: str, force: bool = False) -> List[
147
146
  return _inner()
148
147
 
149
148
 
150
- def get_descriptor_records(element: Element, id_key: str, name_key) -> List[Dict[str, Any]]:
149
+ def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict[str, Any]]:
151
150
  """Get MeSH descriptor records."""
152
151
  logger.info("extract MeSH descriptors, concepts, and terms")
153
152
 
154
- rv: List[Dict[str, Any]] = [
153
+ rv: list[dict[str, Any]] = [
155
154
  get_descriptor_record(descriptor, id_key=id_key, name_key=name_key)
156
155
  for descriptor in tqdm(element, desc="Getting MeSH Descriptors", unit_scale=True)
157
156
  ]
@@ -204,7 +203,7 @@ def get_descriptor_record(
204
203
  element: Element,
205
204
  id_key: str,
206
205
  name_key: str,
207
- ) -> Dict[str, Any]:
206
+ ) -> dict[str, Any]:
208
207
  """Get descriptor records from the main element.
209
208
 
210
209
  :param element: An XML element
@@ -228,13 +227,13 @@ def get_descriptor_record(
228
227
  return rv
229
228
 
230
229
 
231
- def get_concept_records(element: Element) -> List[Mapping[str, Any]]:
230
+ def get_concept_records(element: Element) -> list[Mapping[str, Any]]:
232
231
  """Get concepts from a record."""
233
232
  return [get_concept_record(e) for e in element.findall("ConceptList/Concept")]
234
233
 
235
234
 
236
- def _get_xrefs(element: Element) -> List[Tuple[str, str]]:
237
- raw_registry_numbers: List[str] = sorted(
235
+ def _get_xrefs(element: Element) -> list[tuple[str, str]]:
236
+ raw_registry_numbers: list[str] = sorted(
238
237
  {e.text for e in element.findall("RelatedRegistryNumberList/RegistryNumber") if e.text}
239
238
  )
240
239
  registry_number = element.findtext("RegistryNumber")
@@ -267,7 +266,7 @@ def get_concept_record(element: Element) -> Mapping[str, Any]:
267
266
  if scope_note is not None:
268
267
  scope_note = scope_note.replace("\\n", "\n").strip()
269
268
 
270
- rv: Dict[str, Any] = {
269
+ rv: dict[str, Any] = {
271
270
  "concept_ui": element.findtext("ConceptUI"),
272
271
  "name": element.findtext("ConceptName/String"),
273
272
  "terms": get_term_records(element),
@@ -286,7 +285,7 @@ def get_concept_record(element: Element) -> Mapping[str, Any]:
286
285
  return rv
287
286
 
288
287
 
289
- def get_term_records(element: Element) -> List[Mapping[str, Any]]:
288
+ def get_term_records(element: Element) -> list[Mapping[str, Any]]:
290
289
  """Get all of the terms for a concept."""
291
290
  return [get_term_record(term) for term in element.findall("TermList/Term")]
292
291
 
@@ -307,7 +306,7 @@ def _text_or_bust(element: Element, name: str) -> str:
307
306
  return n
308
307
 
309
308
 
310
- def _get_descriptor_qualifiers(descriptor: Element) -> List[Mapping[str, str]]:
309
+ def _get_descriptor_qualifiers(descriptor: Element) -> list[Mapping[str, str]]:
311
310
  return [
312
311
  {
313
312
  "qualifier_ui": _text_or_bust(qualifier, "QualifierUI"),
@@ -321,7 +320,7 @@ def _get_descriptor_qualifiers(descriptor: Element) -> List[Mapping[str, str]]:
321
320
 
322
321
  def get_mesh_category_curies(
323
322
  letter: str, *, skip: Optional[Collection[str]] = None, version: Optional[str] = None
324
- ) -> List[str]:
323
+ ) -> list[str]:
325
324
  """Get the MeSH LUIDs for a category, by letter (e.g., "A").
326
325
 
327
326
  :param letter: The MeSH tree, A for anatomy, C for disease, etc.
@@ -332,8 +331,7 @@ def get_mesh_category_curies(
332
331
  .. seealso:: https://meshb.nlm.nih.gov/treeView
333
332
  """
334
333
  if version is None:
335
- version = get_version("mesh")
336
- assert version is not None
334
+ version = safe_get_version("mesh")
337
335
  tree_to_mesh = get_tree_to_mesh_id(version=version)
338
336
  rv = []
339
337
  for i in range(1, 100):
pyobo/sources/mgi.py CHANGED
@@ -1,10 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for MGI."""
4
2
 
5
3
  import logging
6
4
  from collections import defaultdict
7
- from typing import Iterable
5
+ from collections.abc import Iterable
8
6
 
9
7
  import pandas as pd
10
8
  from tqdm.auto import tqdm
pyobo/sources/mirbase.py CHANGED
@@ -1,10 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for miRBase."""
4
2
 
5
3
  import gzip
6
4
  import logging
7
- from typing import Iterable, List, Mapping
5
+ from collections.abc import Iterable, Mapping
8
6
 
9
7
  from tqdm.auto import tqdm
10
8
 
@@ -48,7 +46,7 @@ def get_obo(force: bool = False) -> Obo:
48
46
  return MiRBaseGetter(force=force)
49
47
 
50
48
 
51
- def get_terms(version: str, force: bool = False) -> List[Term]:
49
+ def get_terms(version: str, force: bool = False) -> list[Term]:
52
50
  """Parse miRNA data from filepath and convert it to dictionary."""
53
51
  _assert_frozen_version(version)
54
52
  url = f"{BASE_URL}/miRNA.dat.gz"
@@ -77,7 +75,7 @@ def _prepare_organisms(version: str, force: bool = False):
77
75
  return {division: (taxonomy_id, name) for _, division, name, _tree, taxonomy_id in df.values}
78
76
 
79
77
 
80
- def _prepare_aliases(version: str, force: bool = False) -> Mapping[str, List[str]]:
78
+ def _prepare_aliases(version: str, force: bool = False) -> Mapping[str, list[str]]:
81
79
  _assert_frozen_version(version)
82
80
  url = f"{BASE_URL}/aliases.txt.gz"
83
81
  df = ensure_df(PREFIX, url=url, sep="\t", version=version, force=force)
@@ -94,7 +92,7 @@ def _process_definitions_lines(
94
92
  organisms = _prepare_organisms(version, force=force)
95
93
  aliases = _prepare_aliases(version, force=force)
96
94
 
97
- groups: List[List[str]] = []
95
+ groups: list[list[str]] = []
98
96
 
99
97
  for line in lines: # TODO replace with itertools.groupby
100
98
  if line.startswith("ID"):
@@ -138,9 +136,15 @@ def _process_definitions_lines(
138
136
  xref_prefix = xref_mapping.get(xref_prefix, xref_prefix)
139
137
  if xref_prefix == "pictar":
140
138
  continue
141
- xrefs.append(
142
- Reference(prefix=xref_prefix, identifier=xref_identifier, name=xref_label or None)
143
- )
139
+
140
+ try:
141
+ xref = Reference(
142
+ prefix=xref_prefix, identifier=xref_identifier, name=xref_label or None
143
+ )
144
+ except ValueError:
145
+ tqdm.write(f"invalid xref: {xref_prefix}:{xref_identifier}")
146
+ else:
147
+ xrefs.append(xref)
144
148
 
145
149
  # TODO add pubmed references
146
150
 
pyobo/sources/mirbase_constants.py CHANGED
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Constants for miRBase."""
4
2
 
5
3
  import pandas as pd
pyobo/sources/mirbase_family.py CHANGED
@@ -1,8 +1,6 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for miRBase Families."""
4
2
 
5
- from typing import Iterable
3
+ from collections.abc import Iterable
6
4
 
7
5
  import pandas as pd
8
6
  from tqdm.auto import tqdm
pyobo/sources/mirbase_mature.py CHANGED
@@ -1,8 +1,6 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for miRBase Mature."""
4
2
 
5
- from typing import Iterable
3
+ from collections.abc import Iterable
6
4
 
7
5
  import pandas as pd
8
6
  from tqdm.auto import tqdm
pyobo/sources/msigdb.py CHANGED
@@ -1,11 +1,10 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Parsers for MSig."""
4
2
 
5
3
  import logging
6
- from typing import Iterable, Optional
7
- from xml.etree import ElementTree
4
+ from collections.abc import Iterable
5
+ from typing import Optional
8
6
 
7
+ from lxml.etree import ElementTree
9
8
  from tqdm.auto import tqdm
10
9
 
11
10
  from ..struct import Obo, Reference, Term, has_participant
@@ -137,7 +136,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
137
136
  def _get_definition(attrib) -> Optional[str]:
138
137
  rv = attrib["DESCRIPTION_FULL"].strip() or attrib["DESCRIPTION_BRIEF"].strip() or None
139
138
  if rv is not None:
140
- return rv.replace(r"\d", "").replace(r"\s", "") # noqa: W605
139
+ return rv.replace(r"\d", "").replace(r"\s", "")
141
140
  return None
142
141
 
143
142
 
pyobo/sources/ncbigene.py CHANGED
@@ -1,9 +1,7 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for Entrez."""
4
2
 
5
3
  import logging
6
- from typing import Iterable, List, Mapping, Set
4
+ from collections.abc import Iterable, Mapping
7
5
 
8
6
  import bioregistry
9
7
  import pandas as pd
@@ -47,7 +45,7 @@ GENE_INFO_COLUMNS = [
47
45
  ]
48
46
 
49
47
 
50
- def get_ncbigene_ids() -> Set[str]:
48
+ def get_ncbigene_ids() -> set[str]:
51
49
  """Get the Entrez name mapping."""
52
50
  df = _get_ncbigene_subset(["GeneID"])
53
51
  return set(df["GeneID"])
@@ -68,7 +66,7 @@ def _get_ncbigene_info_subset(usecols) -> Mapping[str, str]:
68
66
  return dict(df.values)
69
67
 
70
68
 
71
- def _get_ncbigene_subset(usecols: List[str]) -> pd.DataFrame:
69
+ def _get_ncbigene_subset(usecols: list[str]) -> pd.DataFrame:
72
70
  df = ensure_df(
73
71
  PREFIX,
74
72
  url=GENE_INFO_URL,
pyobo/sources/npass.py CHANGED
@@ -1,9 +1,7 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for NPASS."""
4
2
 
5
3
  import logging
6
- from typing import Iterable
4
+ from collections.abc import Iterable
7
5
 
8
6
  import pandas as pd
9
7
  from tqdm.auto import tqdm
@@ -41,7 +39,7 @@ def get_obo(force: bool = False) -> Obo:
41
39
 
42
40
  def get_df(version: str, force: bool = False) -> pd.DataFrame:
43
41
  """Get the NPASS chemical nomenclature."""
44
- base_url = f"http://bidd.group/NPASS/downloadFiles/NPASSv{version}_download"
42
+ base_url = f"https://bidd.group/NPASS/downloadFiles/NPASSv{version}_download"
45
43
  url = f"{base_url}_naturalProducts_generalInfo.txt"
46
44
  return ensure_df(
47
45
  PREFIX,
pyobo/sources/omim_ps.py CHANGED
@@ -1,9 +1,7 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for OMIM Phenotypic Series."""
4
2
 
5
3
  import logging
6
- from typing import Iterable
4
+ from collections.abc import Iterable
7
5
 
8
6
  from bioversions.utils import get_soup
9
7
 
pyobo/sources/pathbank.py CHANGED
@@ -1,16 +1,16 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for PathBank."""
4
2
 
3
+ from __future__ import annotations
4
+
5
5
  import logging
6
6
  from collections import defaultdict
7
- from typing import Iterable, Mapping, Set
7
+ from collections.abc import Iterable, Mapping
8
8
 
9
9
  import pandas as pd
10
10
  from tqdm.auto import tqdm
11
11
 
12
12
  from ..struct import Obo, Reference, Term
13
- from ..struct.typedef import has_participant
13
+ from ..struct.typedef import has_category, has_participant
14
14
  from ..utils.path import ensure_df
15
15
 
16
16
  __all__ = [
@@ -70,7 +70,7 @@ class PathBankGetter(Obo):
70
70
  """An ontology representation of PathBank's pathway nomenclature."""
71
71
 
72
72
  ontology = bioversions_key = PREFIX
73
- typedefs = [has_participant]
73
+ typedefs = [has_participant, has_category]
74
74
 
75
75
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
76
76
  """Iterate over terms in the ontology."""
@@ -98,46 +98,58 @@ def get_proteins_df(version: str, force: bool = False) -> pd.DataFrame:
98
98
  return proteins_df
99
99
 
100
100
 
101
- def get_protein_mapping(version: str, force: bool = False) -> Mapping[str, Set[Reference]]:
101
+ def get_protein_mapping(version: str, force: bool = False) -> Mapping[str, set[Reference]]:
102
102
  """Make the protein mapping."""
103
103
  proteins_df = get_proteins_df(version=version, force=force)
104
104
  smpdb_id_to_proteins = defaultdict(set)
105
105
  for pathway_id, protein_id in tqdm(
106
106
  proteins_df.values, desc=f"[{PREFIX}] mapping proteins", unit_scale=True
107
107
  ):
108
- # TODO get protein names
109
- smpdb_id_to_proteins[pathway_id].add(Reference(prefix="uniprot", identifier=protein_id))
108
+ try:
109
+ if "-" in protein_id:
110
+ reference = Reference(prefix="uniprot.isoform", identifier=protein_id)
111
+ else:
112
+ reference = Reference(prefix="uniprot", identifier=protein_id)
113
+ except ValueError:
114
+ tqdm.write(f"[pathbank] invalid uniprot identifier: {protein_id}")
115
+ else:
116
+ smpdb_id_to_proteins[pathway_id].add(reference)
110
117
  return smpdb_id_to_proteins
111
118
 
112
119
 
113
120
  def get_metabolite_df(version: str, force: bool = False) -> pd.DataFrame:
114
121
  """Get the metabolites dataframe."""
115
- return ensure_df(
122
+ df = ensure_df(
116
123
  PREFIX,
117
124
  url=METABOLITE_URL,
118
125
  sep=",",
119
- usecols=["PathBank ID", "Metabolite ID", "Metabolite Name"],
126
+ usecols=["PathBank ID", "ChEBI ID"],
120
127
  force=force,
121
128
  version=version,
122
129
  )
130
+ df = df[df["ChEBI ID"].notna()]
131
+ return df
123
132
 
124
133
 
125
- def get_metabolite_mapping(version: str, force: bool = False) -> Mapping[str, Set[Reference]]:
134
+ def get_metabolite_mapping(version: str, force: bool = False) -> Mapping[str, set[Reference]]:
126
135
  """Make the metabolite mapping."""
127
136
  metabolites_df = get_metabolite_df(version=version, force=force)
128
137
  smpdb_id_to_metabolites = defaultdict(set)
129
138
  it = tqdm(metabolites_df.values, desc=f"[{PREFIX}] mapping metabolites", unit_scale=True)
130
- for pathway_id, metabolite_id, metabolite_name in it:
131
- smpdb_id_to_metabolites[pathway_id].add(
132
- Reference(
133
- prefix=PREFIX,
134
- identifier=metabolite_id,
135
- name=metabolite_name,
136
- )
137
- )
139
+ for pathway_id, metabolite_id in it:
140
+ reference = Reference(prefix="chebi", identifier=metabolite_id.strip())
141
+ smpdb_id_to_metabolites[pathway_id].add(reference)
138
142
  return smpdb_id_to_metabolites
139
143
 
140
144
 
145
+ def _clean_description(description: str) -> str | None:
146
+ """Clean the description."""
147
+ if pd.isna(description) or not description:
148
+ return None
149
+ parts = [part.strip() for part in description.strip().splitlines()]
150
+ return " ".join(parts)
151
+
152
+
141
153
  def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
142
154
  """Get PathBank's terms."""
143
155
  smpdb_id_to_proteins = get_protein_mapping(version=version, force=force)
@@ -149,16 +161,11 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
149
161
  reference = Reference(prefix=PREFIX, identifier=pathbank_id, name=name)
150
162
  term = Term(
151
163
  reference=reference,
152
- # definition=description.replace('\n', ' '),
153
- xrefs=[Reference(prefix="smpdb", identifier=smpdb_id)],
154
- )
155
- term.append_parent(
156
- Reference(
157
- prefix=PREFIX,
158
- identifier=subject.lower().replace(" ", "_"),
159
- name=subject,
160
- )
164
+ # TODO use _clean_description(description) to add a description,
165
+ # but there are weird parser errors
161
166
  )
167
+ term.append_exact_match(Reference(prefix="smpdb", identifier=smpdb_id))
168
+ term.append_property(has_category, subject.lower().replace(" ", "_"))
162
169
  term.extend_relationship(has_participant, smpdb_id_to_proteins[smpdb_id])
163
170
  term.extend_relationship(has_participant, smpdb_id_to_metabolites[smpdb_id])
164
171
  yield term
pyobo/sources/pfam.py CHANGED
@@ -1,8 +1,6 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Convert PFAM to OBO."""
4
2
 
5
- from typing import Iterable
3
+ from collections.abc import Iterable
6
4
 
7
5
  import pandas as pd
8
6
 
pyobo/sources/pfam_clan.py CHANGED
@@ -1,8 +1,6 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Convert PFAM Clans to OBO."""
4
2
 
5
- from typing import Iterable
3
+ from collections.abc import Iterable
6
4
 
7
5
  from tqdm.auto import tqdm
8
6
 
pyobo/sources/pid.py CHANGED
@@ -1,10 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for NCI PID."""
4
2
 
5
3
  import logging
6
4
  from collections import defaultdict
7
- from typing import Iterable, List, Mapping, Tuple
5
+ from collections.abc import Iterable, Mapping
8
6
 
9
7
  import pandas as pd
10
8
 
@@ -45,7 +43,7 @@ def get_obo() -> Obo:
45
43
  return PIDGetter()
46
44
 
47
45
 
48
- def iter_networks(use_tqdm: bool = False, force: bool = False) -> Iterable[Tuple[str, CX]]:
46
+ def iter_networks(use_tqdm: bool = False, force: bool = False) -> Iterable[tuple[str, CX]]:
49
47
  """Iterate over NCI PID networks."""
50
48
  yield from ensure_ndex_network_set(
51
49
  PREFIX, NDEX_NETWORK_SET_UUID, use_tqdm=use_tqdm, force=force
@@ -117,7 +115,7 @@ def get_curation_df() -> pd.DataFrame:
117
115
  return df[["Text from NDEx", "Type", "Namespace", "Identifier"]]
118
116
 
119
117
 
120
- def get_remapping() -> Mapping[str, List[Tuple[str, str]]]:
118
+ def get_remapping() -> Mapping[str, list[tuple[str, str]]]:
121
119
  """Get a mapping from text to list of HGNC id/symbols."""
122
120
  curation_df = get_curation_df()
123
121
  rv = defaultdict(list)
pyobo/sources/pombase.py CHANGED
@@ -1,16 +1,15 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for PomBase."""
4
2
 
5
3
  import logging
6
4
  from collections import defaultdict
7
- from typing import Iterable
5
+ from collections.abc import Iterable
8
6
 
9
7
  import pandas as pd
10
8
  from tqdm.auto import tqdm
11
9
 
12
10
  import pyobo
13
11
  from pyobo import Reference
12
+ from pyobo.resources.so import get_so_name
14
13
  from pyobo.struct import Obo, Term, from_species, has_gene_product, orthologous
15
14
  from pyobo.utils.path import ensure_df
16
15
 
@@ -21,7 +20,7 @@ __all__ = [
21
20
  logger = logging.getLogger(__name__)
22
21
 
23
22
  PREFIX = "pombase"
24
- URL = "https://www.pombase.org/data/names_and_identifiers/gene_IDs_names_products.tsv"
23
+ GENE_NAMES_URL = "https://www.pombase.org/data/names_and_identifiers/gene_IDs_names_products.tsv"
25
24
  ORTHOLOGS_URL = "https://www.pombase.org/data/orthologs/human-orthologs.txt.gz"
26
25
 
27
26
 
@@ -70,9 +69,11 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
70
69
  if hgnc_id is not None:
71
70
  identifier_to_hgnc_ids[identifier].add(hgnc_id)
72
71
 
73
- df = ensure_df(PREFIX, url=URL, force=force, header=None, version=version)
72
+ df = ensure_df(PREFIX, url=GENE_NAMES_URL, force=force, version=version)
74
73
  so = {
75
- gtype: Reference.auto("SO", POMBASE_TO_SO[gtype])
74
+ gtype: Reference(
75
+ prefix="SO", identifier=POMBASE_TO_SO[gtype], name=get_so_name(POMBASE_TO_SO[gtype])
76
+ )
76
77
  for gtype in sorted(df[df.columns[6]].unique())
77
78
  }
78
79
  for _, reference in sorted(so.items()):
pyobo/sources/pubchem.py CHANGED
@@ -1,9 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for PubChem Compound."""
4
2
 
5
3
  import logging
6
- from typing import Iterable, Mapping, Optional
4
+ from collections.abc import Iterable, Mapping
5
+ from typing import Optional
7
6
 
8
7
  import pandas as pd
9
8
  from bioregistry.utils import removeprefix
pyobo/sources/reactome.py CHANGED
@@ -1,11 +1,9 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for Reactome."""
4
2
 
5
3
  import logging
6
4
  from collections import defaultdict
5
+ from collections.abc import Iterable, Mapping
7
6
  from functools import lru_cache
8
- from typing import Iterable, Mapping, Set
9
7
 
10
8
  import pandas as pd
11
9
  from tqdm.auto import tqdm
@@ -72,7 +70,9 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
72
70
  df["taxonomy_id"] = df["species"].map(get_ncbitaxon_id)
73
71
 
74
72
  terms = {}
75
- it = tqdm(df.values, total=len(df.index), desc=f"mapping {PREFIX}")
73
+ it = tqdm(
74
+ df.values, total=len(df.index), desc=f"mapping {PREFIX}", unit_scale=True, unit="pathway"
75
+ )
76
76
  for reactome_id, name, species_name, taxonomy_id in it:
77
77
  terms[reactome_id] = term = Term(
78
78
  reference=Reference(prefix=PREFIX, identifier=reactome_id, name=name),
@@ -94,10 +94,21 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
94
94
  terms[child_id].append_parent(terms[parent_id])
95
95
 
96
96
  uniprot_pathway_df = ensure_participant_df(version=version, force=force)
97
- for uniprot_id, reactome_id in tqdm(uniprot_pathway_df.values, total=len(uniprot_pathway_df)):
98
- terms[reactome_id].append_relationship(
99
- has_participant, Reference(prefix="uniprot", identifier=uniprot_id)
100
- )
97
+ for uniprot_id, reactome_id in tqdm(
98
+ uniprot_pathway_df.values,
99
+ total=len(uniprot_pathway_df),
100
+ unit_scale=True,
101
+ unit="pathway-protein",
102
+ ):
103
+ if reactome_id not in terms:
104
+ tqdm.write(f"{reactome_id} appears in uniprot participants file but not pathways file")
105
+ continue
106
+
107
+ if "-" in uniprot_id:
108
+ reference = Reference(prefix="uniprot.isoform", identifier=uniprot_id)
109
+ else:
110
+ reference = Reference(prefix="uniprot", identifier=uniprot_id)
111
+ terms[reactome_id].append_relationship(has_participant, reference)
101
112
 
102
113
  chebi_pathway_url = f"https://reactome.org/download/{version}/ChEBI2Reactome_All_Levels.txt"
103
114
  chebi_pathway_df = ensure_df(
@@ -108,7 +119,15 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
108
119
  version=version,
109
120
  force=force,
110
121
  )
111
- for chebi_id, reactome_id in tqdm(chebi_pathway_df.values, total=len(chebi_pathway_df)):
122
+ for chebi_id, reactome_id in tqdm(
123
+ chebi_pathway_df.values,
124
+ total=len(chebi_pathway_df),
125
+ unit_scale=True,
126
+ unit="pathway-chemical",
127
+ ):
128
+ if reactome_id not in terms:
129
+ tqdm.write(f"{reactome_id} appears in chebi participants file but not pathways file")
130
+ continue
112
131
  terms[reactome_id].append_relationship(
113
132
  has_participant, Reference(prefix="chebi", identifier=chebi_id)
114
133
  )
@@ -122,7 +141,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
122
141
 
123
142
 
124
143
  @lru_cache(maxsize=1)
125
- def get_protein_to_pathways() -> Mapping[str, Set[str]]:
144
+ def get_protein_to_pathways() -> Mapping[str, set[str]]:
126
145
  """Get a mapping from proteins to the pathways they're in."""
127
146
  protein_to_pathways = defaultdict(set)
128
147
  x = get_id_multirelations_mapping("reactome", has_participant)
@@ -135,4 +154,4 @@ def get_protein_to_pathways() -> Mapping[str, Set[str]]:
135
154
 
136
155
 
137
156
  if __name__ == "__main__":
138
- get_obo().write_default()
157
+ ReactomeGetter.cli()
pyobo/sources/rgd.py CHANGED
@@ -1,9 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for RGD."""
4
2
 
5
3
  import logging
6
- from typing import Iterable, Optional
4
+ from collections.abc import Iterable
5
+ from typing import Optional
7
6
 
8
7
  import pandas as pd
9
8
  from tqdm.auto import tqdm
@@ -138,7 +137,7 @@ def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Te
138
137
  continue
139
138
  if prefix == "uniprot":
140
139
  term.append_relationship(
141
- has_gene_product, Reference.auto(prefix=prefix, identifier=xref_id)
140
+ has_gene_product, Reference(prefix=prefix, identifier=xref_id)
142
141
  )
143
142
  elif prefix == "ensembl":
144
143
  if xref_id.startswith("ENSMUSG") or xref_id.startswith("ENSRNOG"):