pyobo 0.10.12__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. pyobo/__init__.py +0 -2
  2. pyobo/__main__.py +0 -2
  3. pyobo/api/__init__.py +0 -2
  4. pyobo/api/alts.py +6 -7
  5. pyobo/api/hierarchy.py +14 -15
  6. pyobo/api/metadata.py +3 -4
  7. pyobo/api/names.py +31 -32
  8. pyobo/api/properties.py +6 -7
  9. pyobo/api/relations.py +12 -11
  10. pyobo/api/species.py +5 -6
  11. pyobo/api/typedefs.py +1 -3
  12. pyobo/api/utils.py +61 -5
  13. pyobo/api/xrefs.py +4 -5
  14. pyobo/aws.py +3 -5
  15. pyobo/cli/__init__.py +0 -2
  16. pyobo/cli/aws.py +0 -2
  17. pyobo/cli/cli.py +0 -4
  18. pyobo/cli/database.py +1 -3
  19. pyobo/cli/lookup.py +0 -2
  20. pyobo/cli/utils.py +0 -2
  21. pyobo/constants.py +1 -33
  22. pyobo/getters.py +19 -26
  23. pyobo/gilda_utils.py +19 -17
  24. pyobo/identifier_utils.py +10 -10
  25. pyobo/mocks.py +5 -6
  26. pyobo/normalizer.py +24 -24
  27. pyobo/obographs.py +8 -5
  28. pyobo/plugins.py +3 -4
  29. pyobo/py.typed +0 -0
  30. pyobo/reader.py +19 -21
  31. pyobo/registries/__init__.py +0 -2
  32. pyobo/registries/metaregistry.py +6 -8
  33. pyobo/resource_utils.py +1 -3
  34. pyobo/resources/__init__.py +0 -2
  35. pyobo/resources/ncbitaxon.py +2 -3
  36. pyobo/resources/ro.py +2 -4
  37. pyobo/resources/so.py +55 -0
  38. pyobo/resources/so.tsv +2604 -0
  39. pyobo/sources/README.md +15 -0
  40. pyobo/sources/__init__.py +0 -2
  41. pyobo/sources/agrovoc.py +3 -3
  42. pyobo/sources/antibodyregistry.py +2 -3
  43. pyobo/sources/biogrid.py +4 -4
  44. pyobo/sources/ccle.py +3 -4
  45. pyobo/sources/cgnc.py +1 -3
  46. pyobo/sources/chebi.py +2 -4
  47. pyobo/sources/chembl.py +1 -3
  48. pyobo/sources/civic_gene.py +2 -3
  49. pyobo/sources/complexportal.py +57 -20
  50. pyobo/sources/conso.py +2 -4
  51. pyobo/sources/cpt.py +1 -3
  52. pyobo/sources/credit.py +1 -1
  53. pyobo/sources/cvx.py +1 -3
  54. pyobo/sources/depmap.py +3 -4
  55. pyobo/sources/dictybase_gene.py +15 -12
  56. pyobo/sources/drugbank.py +6 -7
  57. pyobo/sources/drugbank_salt.py +3 -4
  58. pyobo/sources/drugcentral.py +9 -8
  59. pyobo/sources/expasy.py +33 -16
  60. pyobo/sources/famplex.py +3 -5
  61. pyobo/sources/flybase.py +5 -6
  62. pyobo/sources/geonames.py +1 -1
  63. pyobo/sources/gmt_utils.py +5 -6
  64. pyobo/sources/go.py +4 -6
  65. pyobo/sources/gwascentral_phenotype.py +1 -3
  66. pyobo/sources/gwascentral_study.py +2 -3
  67. pyobo/sources/hgnc.py +30 -26
  68. pyobo/sources/hgncgenefamily.py +9 -11
  69. pyobo/sources/icd10.py +3 -4
  70. pyobo/sources/icd11.py +3 -4
  71. pyobo/sources/icd_utils.py +6 -7
  72. pyobo/sources/interpro.py +3 -5
  73. pyobo/sources/itis.py +1 -3
  74. pyobo/sources/kegg/__init__.py +0 -2
  75. pyobo/sources/kegg/api.py +3 -4
  76. pyobo/sources/kegg/genes.py +3 -4
  77. pyobo/sources/kegg/genome.py +19 -9
  78. pyobo/sources/kegg/pathway.py +5 -6
  79. pyobo/sources/mesh.py +19 -21
  80. pyobo/sources/mgi.py +1 -3
  81. pyobo/sources/mirbase.py +13 -9
  82. pyobo/sources/mirbase_constants.py +0 -2
  83. pyobo/sources/mirbase_family.py +1 -3
  84. pyobo/sources/mirbase_mature.py +1 -3
  85. pyobo/sources/msigdb.py +4 -5
  86. pyobo/sources/ncbigene.py +3 -5
  87. pyobo/sources/npass.py +2 -4
  88. pyobo/sources/omim_ps.py +1 -3
  89. pyobo/sources/pathbank.py +35 -28
  90. pyobo/sources/pfam.py +1 -3
  91. pyobo/sources/pfam_clan.py +1 -3
  92. pyobo/sources/pid.py +3 -5
  93. pyobo/sources/pombase.py +7 -6
  94. pyobo/sources/pubchem.py +2 -3
  95. pyobo/sources/reactome.py +30 -11
  96. pyobo/sources/rgd.py +3 -4
  97. pyobo/sources/rhea.py +7 -8
  98. pyobo/sources/ror.py +3 -2
  99. pyobo/sources/selventa/__init__.py +0 -2
  100. pyobo/sources/selventa/schem.py +1 -3
  101. pyobo/sources/selventa/scomp.py +1 -3
  102. pyobo/sources/selventa/sdis.py +1 -3
  103. pyobo/sources/selventa/sfam.py +1 -3
  104. pyobo/sources/sgd.py +1 -3
  105. pyobo/sources/slm.py +29 -17
  106. pyobo/sources/umls/__init__.py +0 -2
  107. pyobo/sources/umls/__main__.py +0 -2
  108. pyobo/sources/umls/get_synonym_types.py +1 -1
  109. pyobo/sources/umls/umls.py +2 -4
  110. pyobo/sources/uniprot/__init__.py +0 -2
  111. pyobo/sources/uniprot/uniprot.py +11 -10
  112. pyobo/sources/uniprot/uniprot_ptm.py +6 -5
  113. pyobo/sources/utils.py +3 -5
  114. pyobo/sources/wikipathways.py +1 -3
  115. pyobo/sources/zfin.py +20 -9
  116. pyobo/ssg/__init__.py +3 -2
  117. pyobo/struct/__init__.py +0 -2
  118. pyobo/struct/reference.py +22 -23
  119. pyobo/struct/struct.py +132 -116
  120. pyobo/struct/typedef.py +14 -10
  121. pyobo/struct/utils.py +0 -2
  122. pyobo/utils/__init__.py +0 -2
  123. pyobo/utils/cache.py +14 -6
  124. pyobo/utils/io.py +9 -10
  125. pyobo/utils/iter.py +5 -6
  126. pyobo/utils/misc.py +1 -3
  127. pyobo/utils/ndex_utils.py +6 -7
  128. pyobo/utils/path.py +4 -5
  129. pyobo/version.py +3 -5
  130. pyobo/xrefdb/__init__.py +0 -2
  131. pyobo/xrefdb/canonicalizer.py +27 -18
  132. pyobo/xrefdb/priority.py +0 -2
  133. pyobo/xrefdb/sources/__init__.py +3 -4
  134. pyobo/xrefdb/sources/biomappings.py +0 -2
  135. pyobo/xrefdb/sources/cbms2019.py +0 -2
  136. pyobo/xrefdb/sources/chembl.py +0 -2
  137. pyobo/xrefdb/sources/compath.py +1 -3
  138. pyobo/xrefdb/sources/famplex.py +3 -5
  139. pyobo/xrefdb/sources/gilda.py +0 -2
  140. pyobo/xrefdb/sources/intact.py +5 -5
  141. pyobo/xrefdb/sources/ncit.py +1 -3
  142. pyobo/xrefdb/sources/pubchem.py +2 -5
  143. pyobo/xrefdb/sources/wikidata.py +2 -4
  144. pyobo/xrefdb/xrefs_pipeline.py +15 -16
  145. {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/LICENSE +1 -1
  146. pyobo-0.11.1.dist-info/METADATA +711 -0
  147. pyobo-0.11.1.dist-info/RECORD +173 -0
  148. {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/WHEEL +1 -1
  149. pyobo-0.11.1.dist-info/entry_points.txt +2 -0
  150. pyobo-0.10.12.dist-info/METADATA +0 -499
  151. pyobo-0.10.12.dist-info/RECORD +0 -169
  152. pyobo-0.10.12.dist-info/entry_points.txt +0 -15
  153. {pyobo-0.10.12.dist-info → pyobo-0.11.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,15 @@
1
+ # Sources
2
+
3
+ 1. Create a new module in `pyobo.sources` named with the prefix for the resource you're ontologizing
4
+ 2. Make sure your resource has a corresponding prefix in [the Bioregistry](https://github.com/biopragmatics/bioregistry)
5
+ 3. Subclass the `pyobo.Obo` class to represent your resource
6
+ 4. Add your resource to the list in `pyobo.sources.__init__`
7
+
8
+ ## What is in scope?
9
+
10
+ 1. Biomedical, semantic web, bibliographic, life sciences, and related natural sciences resources are welcome
11
+ 2. The source you want to ontologize should be an identifier resource, i.e., it mints its own identifiers. If you want
12
+ to ontologize some database that reuses some other identifier resource's identifiers, then this isn't the right
13
+ place.
14
+ 3. Resources that are not possible to download automatically are not in scope for PyOBO. Reproducibility and reusability
15
+ are core values of this software.
pyobo/sources/__init__.py CHANGED
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Sources of OBO content."""
4
2
 
5
3
  from class_resolver import ClassResolver
pyobo/sources/agrovoc.py CHANGED
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for AGROVOC."""
4
2
 
5
3
  import pystow
@@ -11,6 +9,8 @@ __all__ = [
11
9
  "ensure_agrovoc_graph",
12
10
  ]
13
11
 
12
+ PREFIX = "agrovoc"
13
+
14
14
 
15
15
  def ensure_agrovoc_graph(version: str) -> Graph:
16
16
  """Download and parse the given version of AGROVOC."""
@@ -20,5 +20,5 @@ def ensure_agrovoc_graph(version: str) -> Graph:
20
20
  graph.bind("skosxl", "http://www.w3.org/2008/05/skos-xl#")
21
21
  graph.bind("skos", SKOS)
22
22
  graph.bind("dcterms", DCTERMS)
23
- graph.bind("agrovoc", "http://aims.fao.org/aos/agrontology#")
23
+ graph.bind(PREFIX, "http://aims.fao.org/aos/agrontology#")
24
24
  return graph
@@ -1,9 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for the Antibody Registry."""
4
2
 
5
3
  import logging
6
- from typing import Iterable, Mapping, Optional
4
+ from collections.abc import Iterable, Mapping
5
+ from typing import Optional
7
6
 
8
7
  import pandas as pd
9
8
  from bioregistry.utils import removeprefix
pyobo/sources/biogrid.py CHANGED
@@ -1,9 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Extract and convert BioGRID identifiers."""
4
2
 
3
+ from collections.abc import Mapping
5
4
  from functools import partial
6
- from typing import Mapping, Optional
5
+ from typing import Optional
7
6
 
8
7
  import pandas as pd
9
8
 
@@ -77,7 +76,8 @@ def get_ncbigene_mapping() -> Mapping[str, str]:
77
76
  .. code-block:: python
78
77
 
79
78
  from pyobo import get_filtered_xrefs
80
- biogrid_ncbigene_mapping = get_filtered_xrefs('biogrid', 'ncbigene')
79
+
80
+ biogrid_ncbigene_mapping = get_filtered_xrefs("biogrid", "ncbigene")
81
81
  """
82
82
  df = get_df()
83
83
  df = df.loc[df["IDENTIFIER_TYPE"] == "ENTREZ_GENE", ["BIOGRID_ID", "IDENTIFIER_VALUE"]]
pyobo/sources/ccle.py CHANGED
@@ -1,10 +1,9 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Get the CCLE Cells, provided by cBioPortal."""
4
2
 
5
3
  import tarfile
4
+ from collections.abc import Iterable
6
5
  from pathlib import Path
7
- from typing import Iterable, Optional
6
+ from typing import Optional
8
7
 
9
8
  import pandas as pd
10
9
  import pystow
@@ -25,7 +24,7 @@ class CCLEGetter(Obo):
25
24
 
26
25
  ontology = bioregistry_key = PREFIX
27
26
 
28
- def __post_init__(self): # noqa: D105
27
+ def __post_init__(self):
29
28
  self.data_version = VERSION
30
29
 
31
30
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
pyobo/sources/cgnc.py CHANGED
@@ -1,9 +1,7 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for CGNC."""
4
2
 
5
3
  import logging
6
- from typing import Iterable
4
+ from collections.abc import Iterable
7
5
 
8
6
  import pandas as pd
9
7
 
pyobo/sources/chebi.py CHANGED
@@ -1,8 +1,6 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for ChEBI."""
4
2
 
5
- from typing import Mapping, Set, Tuple
3
+ from collections.abc import Mapping
6
4
 
7
5
  from ..api import get_filtered_properties_mapping, get_filtered_relations_df
8
6
  from ..struct import Reference, TypeDef
@@ -33,7 +31,7 @@ def get_chebi_smiles_id_mapping() -> Mapping[str, str]:
33
31
  has_role = TypeDef(reference=Reference(prefix="chebi", identifier="has_role"))
34
32
 
35
33
 
36
- def get_chebi_role_to_children() -> Mapping[str, Set[Tuple[str, str]]]:
34
+ def get_chebi_role_to_children() -> Mapping[str, set[tuple[str, str]]]:
37
35
  """Get the ChEBI role to children mapping."""
38
36
  df = get_filtered_relations_df("chebi", relation=has_role)
39
37
  return multisetdict((role_id, ("chebi", chemical_id)) for chemical_id, _, role_id in df.values)
pyobo/sources/chembl.py CHANGED
@@ -1,13 +1,11 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for ChEMBL.
4
2
 
5
3
  Run with ``python -m pyobo.sources.chembl -vv``.
6
4
  """
7
5
 
8
6
  import logging
7
+ from collections.abc import Iterable
9
8
  from contextlib import closing
10
- from typing import Iterable
11
9
 
12
10
  import chembl_downloader
13
11
 
@@ -1,8 +1,7 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for CiVIC Genes."""
4
2
 
5
- from typing import Iterable, Optional
3
+ from collections.abc import Iterable
4
+ from typing import Optional
6
5
 
7
6
  import pandas as pd
8
7
 
@@ -1,9 +1,7 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for ComplexPortal."""
4
2
 
5
3
  import logging
6
- from typing import Iterable, List, Tuple
4
+ from collections.abc import Iterable
7
5
 
8
6
  import pandas as pd
9
7
  from tqdm.auto import tqdm
@@ -52,7 +50,7 @@ DTYPE = {
52
50
  }
53
51
 
54
52
 
55
- def _parse_members(s) -> List[Tuple[Reference, str]]:
53
+ def _parse_members(s) -> list[tuple[Reference, str]]:
56
54
  if pd.isna(s):
57
55
  return []
58
56
 
@@ -60,15 +58,35 @@ def _parse_members(s) -> List[Tuple[Reference, str]]:
60
58
  for member in s.split("|"):
61
59
  entity_id, count = member.split("(")
62
60
  count = count.rstrip(")")
63
- if ":" in entity_id:
61
+ if entity_id.startswith("URS"):
62
+ prefix, identifier = "rnacentral", entity_id
63
+ elif entity_id.startswith("CPX"):
64
+ # TODO why self xref?
65
+ prefix, identifier = "complexportal", entity_id
66
+ elif entity_id.startswith("["):
67
+ continue # this is a list of uniprot IDs, not sure what to do with this
68
+ elif entity_id.startswith("EBI-"):
69
+ continue
70
+ elif ":" not in entity_id:
71
+ if "PRO_" in entity_id:
72
+ prefix = "uniprot.chain"
73
+ identifier = entity_id.split("-")[1]
74
+ elif "-" in entity_id:
75
+ prefix, identifier = "uniprot.isoform", entity_id
76
+ else:
77
+ prefix, identifier = "uniprot", entity_id
78
+ else:
64
79
  prefix, identifier = entity_id.split(":", 1)
80
+ try:
81
+ reference = Reference(prefix=prefix, identifier=identifier)
82
+ except ValueError:
83
+ tqdm.write(f"failed to validate reference: {entity_id}")
65
84
  else:
66
- prefix, identifier = "uniprot", entity_id
67
- rv.append((Reference(prefix=prefix, identifier=identifier), count))
85
+ rv.append((reference, count))
68
86
  return rv
69
87
 
70
88
 
71
- def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
89
+ def _parse_xrefs(s) -> list[tuple[Reference, str]]:
72
90
  if pd.isna(s):
73
91
  return []
74
92
 
@@ -76,27 +94,40 @@ def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
76
94
  for xref in s.split("|"):
77
95
  xref = xref.replace("protein ontology:PR:", "PR:")
78
96
  xref = xref.replace("protein ontology:PR_", "PR:")
97
+ xref = xref.replace("rhea:rhea ", "rhea:")
98
+ xref = xref.replace("rhea:Rhea ", "rhea:")
99
+ xref = xref.replace("rhea:RHEA:rhea", "rhea:")
100
+ xref = xref.replace("rhea:RHEA: ", "rhea:")
101
+ xref = xref.replace("rhea:RHEA:rhea ", "rhea:")
102
+ xref = xref.replace("intenz:RHEA:", "rhea:")
103
+ xref = xref.replace("eccode::", "eccode:")
104
+ xref = xref.replace("eccode:EC:", "eccode:")
105
+ xref = xref.replace("intenz:EC:", "eccode:")
106
+ xref = xref.replace("eccode:RHEA:", "rhea:")
107
+ xref = xref.replace("efo:MONDO:", "MONDO:")
108
+ xref = xref.replace("omim:MIM:", "omim:")
109
+ xref = xref.replace("efo:HP:", "HP:")
110
+ xref = xref.replace("efo:Orphanet:", "Orphanet:")
111
+ xref = xref.replace("orphanet:ORDO:", "Orphanet:")
112
+ xref = xref.replace("biorxiv:doi.org/", "doi:")
113
+ xref = xref.replace("emdb:EMDB-", "emdb:EMD-")
114
+ xref = xref.replace("wwpdb:EMD-", "emdb:EMD-")
115
+ xref = xref.replace("signor:CPX-", "complexportal:CPX-")
116
+
79
117
  try:
80
118
  xref_curie, note = xref.split("(")
81
119
  except ValueError:
82
120
  logger.warning("xref missing (: %s", xref)
83
121
  continue
84
122
  note = note.rstrip(")")
85
- note.replace("rhea:rhea ", "rhea:")
86
- note.replace("rhea:Rhea ", "rhea:")
87
- note.replace("eccode::", "eccode:")
88
- note.replace("eccode:EC:", "eccode:")
89
- note.replace("eccode:RHEA:", "rhea:")
90
- if note.lower().startswith("rhea "):
91
- note = note[len("Rhea ") :]
92
- if note.lower().startswith("rhea:rhea "):
93
- note = note[len("rhea:rhea ") :]
94
- if note.lower().startswith("EC:"):
95
- note = note[len("EC:") :]
123
+
124
+ if xref_curie.startswith("intenz:"):
125
+ xref_curie = _clean_intenz(xref_curie)
126
+
96
127
  try:
97
128
  reference = Reference.from_curie(xref_curie)
98
129
  except ValueError:
99
- logger.warning("can not parse CURIE: %s", xref)
130
+ logger.warning("can not parse CURIE: %s", xref_curie)
100
131
  continue
101
132
  if reference is None:
102
133
  logger.warning("reference is None after parsing: %s", xref)
@@ -105,6 +136,12 @@ def _parse_xrefs(s) -> List[Tuple[Reference, str]]:
105
136
  return rv
106
137
 
107
138
 
139
+ def _clean_intenz(s: str) -> str:
140
+ for _ in range(3):
141
+ s = s.rstrip("-").rstrip(".")
142
+ return s
143
+
144
+
108
145
  class ComplexPortalGetter(Obo):
109
146
  """An ontology representation of the Complex Portal."""
110
147
 
pyobo/sources/conso.py CHANGED
@@ -1,8 +1,6 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for CONSO."""
4
2
 
5
- from typing import Iterable, List
3
+ from collections.abc import Iterable
6
4
 
7
5
  import pandas as pd
8
6
 
@@ -68,7 +66,7 @@ def iter_terms() -> Iterable[Term]:
68
66
  for _, row in terms_df.iterrows():
69
67
  if row["Name"] == "WITHDRAWN":
70
68
  continue
71
- provenance: List[Reference] = []
69
+ provenance: list[Reference] = []
72
70
  for curie in row["References"].split(","):
73
71
  curie = curie.strip()
74
72
  if not curie:
pyobo/sources/cpt.py CHANGED
@@ -1,8 +1,6 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for CPT."""
4
2
 
5
- from typing import Iterable
3
+ from collections.abc import Iterable
6
4
 
7
5
  import pandas as pd
8
6
 
pyobo/sources/credit.py CHANGED
@@ -3,7 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import json
6
- from typing import Iterable
6
+ from collections.abc import Iterable
7
7
 
8
8
  from more_itertools import chunked
9
9
 
pyobo/sources/cvx.py CHANGED
@@ -1,9 +1,7 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for CVX."""
4
2
 
5
3
  from collections import defaultdict
6
- from typing import Iterable
4
+ from collections.abc import Iterable
7
5
 
8
6
  import pandas as pd
9
7
 
pyobo/sources/depmap.py CHANGED
@@ -1,8 +1,7 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """DepMap cell lines."""
4
2
 
5
- from typing import Iterable, Optional
3
+ from collections.abc import Iterable
4
+ from typing import Optional
6
5
 
7
6
  import pandas as pd
8
7
  import pystow
@@ -113,7 +112,7 @@ def ensure(version: str, force: bool = False) -> pd.DataFrame:
113
112
  url=get_url(version=version),
114
113
  name="sample_info.tsv",
115
114
  force=force,
116
- read_csv_kwargs=dict(sep=",", dtype=str),
115
+ read_csv_kwargs={"sep": ",", "dtype": str},
117
116
  )
118
117
 
119
118
 
@@ -1,18 +1,15 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Converter for dictyBase gene.
4
2
 
5
3
  Note that normal dictybase identifiers are for sequences
6
4
  """
7
5
 
8
6
  import logging
9
- from typing import Iterable
7
+ from collections.abc import Iterable
10
8
 
11
9
  import pandas as pd
12
10
  from tqdm.auto import tqdm
13
11
 
14
- from pyobo.struct import Obo, Reference, Synonym, Term, from_species, has_gene_product
15
- from pyobo.utils.io import multisetdict
12
+ from pyobo.struct import Obo, Synonym, Term, from_species, has_gene_product
16
13
  from pyobo.utils.path import ensure_df
17
14
 
18
15
  __all__ = [
@@ -51,10 +48,11 @@ def get_obo(force: bool = False) -> Obo:
51
48
 
52
49
  def get_terms(force: bool = False) -> Iterable[Term]:
53
50
  """Get terms."""
51
+ # TODO the mappings file has actually no uniprot at all, and requires text mining
54
52
  # DDB ID DDB_G ID Name UniProt ID
55
- uniprot_mappings = multisetdict(
56
- ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
57
- )
53
+ # uniprot_mappings = multisetdict(
54
+ # ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
55
+ # )
58
56
 
59
57
  terms = ensure_df(PREFIX, url=URL, force=force, name="gene_info.tsv")
60
58
  # GENE ID (DDB_G ID) Gene Name Synonyms Gene products
@@ -70,10 +68,15 @@ def get_terms(force: bool = False) -> Iterable[Term]:
70
68
  if synonyms and pd.notna(synonyms):
71
69
  for synonym in synonyms.split(","):
72
70
  term.append_synonym(Synonym(synonym.strip()))
73
- for uniprot_id in uniprot_mappings.get(identifier, []):
74
- if not uniprot_id or pd.isna(uniprot_id) or uniprot_id not in {"unknown", "pseudogene"}:
75
- continue
76
- term.append_relationship(has_gene_product, Reference.auto("uniprot", uniprot_id))
71
+ # for uniprot_id in uniprot_mappings.get(identifier, []):
72
+ # if not uniprot_id or pd.isna(uniprot_id) or uniprot_id in {"unknown", "pseudogene"}:
73
+ # continue
74
+ # try:
75
+ # uniprot_ref = Reference(prefix="uniprot", identifier=uniprot_id)
76
+ # except ValueError:
77
+ # tqdm.write(f"[dictybase.gene] invalid uniprot ref: {uniprot_id}")
78
+ # else:
79
+ # term.append_relationship(has_gene_product, uniprot_ref)
77
80
 
78
81
  term.set_species(identifier="44689", name="Dictyostelium discoideum")
79
82
  yield term
pyobo/sources/drugbank.py CHANGED
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Convert DrugBank to OBO.
4
2
 
5
3
  Run with ``python -m pyobo.sources.drugbank``
@@ -8,14 +6,15 @@ Run with ``python -m pyobo.sources.drugbank``
8
6
  import datetime
9
7
  import itertools as itt
10
8
  import logging
9
+ from collections.abc import Iterable, Mapping
11
10
  from functools import lru_cache
12
- from typing import Any, Dict, Iterable, Mapping, Optional
11
+ from typing import Any, Optional
13
12
  from xml.etree import ElementTree
14
13
 
15
14
  import pystow
16
15
  from tqdm.auto import tqdm
17
16
 
18
- from ..getters import NoBuild
17
+ from ..getters import NoBuildError
19
18
  from ..struct import Obo, Reference, Term
20
19
  from ..struct.typedef import has_inchi, has_salt, has_smiles
21
20
  from ..utils.cache import cached_pickle
@@ -139,7 +138,7 @@ def _make_term(drug_info: Mapping[str, Any]) -> Term:
139
138
  return term
140
139
 
141
140
 
142
- @lru_cache()
141
+ @lru_cache
143
142
  def get_xml_root(version: Optional[str] = None) -> ElementTree.Element:
144
143
  """Get the DrugBank XML parser root.
145
144
 
@@ -152,7 +151,7 @@ def get_xml_root(version: Optional[str] = None) -> ElementTree.Element:
152
151
  username = pystow.get_config("pyobo", "drugbank_username", raise_on_missing=True)
153
152
  password = pystow.get_config("pyobo", "drugbank_password", raise_on_missing=True)
154
153
  except ConfigError as e:
155
- raise NoBuild from e
154
+ raise NoBuildError from e
156
155
 
157
156
  element = parse_drugbank(version=version, username=username, password=password)
158
157
  return element.getroot()
@@ -167,7 +166,7 @@ smiles_template = f"{ns}calculated-properties/{ns}property[{ns}kind='SMILES']/{n
167
166
  def _extract_drug_info(drug_xml: ElementTree.Element) -> Mapping[str, Any]:
168
167
  """Extract information from an XML element representing a drug."""
169
168
  # assert drug_xml.tag == f'{ns}drug'
170
- row: Dict[str, Any] = {
169
+ row: dict[str, Any] = {
171
170
  "type": drug_xml.get("type"),
172
171
  "drugbank_id": drug_xml.findtext(f"{ns}drugbank-id[@primary='true']"),
173
172
  "cas": drug_xml.findtext(f"{ns}cas-number"),
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Convert DrugBank Salts to OBO.
4
2
 
5
3
  Run with ``python -m pyobo.sources.drugbank_salt``
@@ -10,11 +8,12 @@ Get relations between drugbank salts and drugbank parents with
10
8
  .. code-block:: python
11
9
 
12
10
  import pyobo
13
- df = pyobo.get_filtered_relations_df('drugbank', 'obo:has_salt')
11
+
12
+ df = pyobo.get_filtered_relations_df("drugbank", "obo:has_salt")
14
13
  """
15
14
 
16
15
  import logging
17
- from typing import Iterable
16
+ from collections.abc import Iterable
18
17
 
19
18
  from .drugbank import iterate_drug_info
20
19
  from ..struct import Obo, Reference, Term
@@ -1,11 +1,9 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Get DrugCentral as OBO."""
4
2
 
5
3
  import logging
6
4
  from collections import defaultdict
5
+ from collections.abc import Iterable
7
6
  from contextlib import closing
8
- from typing import DefaultDict, Iterable, List
9
7
 
10
8
  import bioregistry
11
9
  import psycopg2
@@ -25,9 +23,9 @@ PREFIX = "drugcentral"
25
23
  HOST = "unmtid-dbs.net"
26
24
  PORT = 5433
27
25
  USER = "drugman"
28
- PASSWORD = "dosage"
26
+ PASSWORD = "dosage" # noqa:S105
29
27
  DBNAME = "drugcentral"
30
- PARAMS = dict(dbname=DBNAME, user=USER, password=PASSWORD, host=HOST, port=PORT)
28
+ PARAMS = {"dbname": DBNAME, "user": USER, "password": PASSWORD, "host": HOST, "port": PORT}
31
29
 
32
30
 
33
31
  class DrugCentralGetter(Obo):
@@ -58,7 +56,7 @@ def iter_terms() -> Iterable[Term]:
58
56
  with closing(conn.cursor()) as cur:
59
57
  cur.execute("SELECT struct_id, id_type, identifier FROM public.identifier")
60
58
  rows = cur.fetchall()
61
- xrefs: DefaultDict[str, List[Reference]] = defaultdict(list)
59
+ xrefs: defaultdict[str, list[Reference]] = defaultdict(list)
62
60
  for drugcentral_id, prefix, identifier in tqdm(
63
61
  rows, unit_scale=True, desc="loading xrefs"
64
62
  ):
@@ -70,13 +68,16 @@ def iter_terms() -> Iterable[Term]:
70
68
  if xref_prefix_norm is None:
71
69
  tqdm.write(f"did not normalize {prefix}:{identifier}")
72
70
  continue
71
+ if xref_prefix_norm == "pdb.ligand":
72
+ # there is a weird invalid escaped \W appearing in pdb ligand ids
73
+ identifier = identifier.strip()
73
74
  identifier = bioregistry.standardize_identifier(xref_prefix_norm, identifier)
74
75
  xrefs[str(drugcentral_id)].append(
75
76
  Reference(prefix=xref_prefix_norm, identifier=identifier)
76
77
  )
77
78
  with closing(conn.cursor()) as cur:
78
79
  cur.execute("SELECT id, name FROM public.synonyms")
79
- synonyms: DefaultDict[str, List[Synonym]] = defaultdict(list)
80
+ synonyms: defaultdict[str, list[Synonym]] = defaultdict(list)
80
81
  for drugcentral_id, synonym in cur.fetchall():
81
82
  synonyms[str(drugcentral_id)].append(Synonym(name=synonym))
82
83
 
@@ -100,4 +101,4 @@ def iter_terms() -> Iterable[Term]:
100
101
 
101
102
 
102
103
  if __name__ == "__main__":
103
- get_obo().write_default(write_obo=True)
104
+ DrugCentralGetter.cli()
pyobo/sources/expasy.py CHANGED
@@ -1,10 +1,10 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Convert ExPASy to OBO."""
4
2
 
5
3
  import logging
4
+ import re
6
5
  from collections import defaultdict
7
- from typing import Any, Dict, Iterable, Mapping, Optional, Set, Tuple
6
+ from collections.abc import Iterable, Mapping
7
+ from typing import Any, Optional
8
8
 
9
9
  from .utils import get_go_mapping
10
10
  from ..struct import Obo, Reference, Synonym, Term
@@ -43,7 +43,7 @@ class ExpasyGetter(Obo):
43
43
  """A getter for ExPASy Enzyme Classes."""
44
44
 
45
45
  bioversions_key = ontology = PREFIX
46
- typedefs = [has_member, enables]
46
+ typedefs = [has_member, enables, term_replaced_by]
47
47
  root_terms = [
48
48
  Reference(prefix="eccode", identifier="1"),
49
49
  Reference(prefix="eccode", identifier="2"),
@@ -76,7 +76,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
76
76
  with open(tree_path) as file:
77
77
  tree = get_tree(file)
78
78
 
79
- terms: Dict[str, Term] = {}
79
+ terms: dict[str, Term] = {}
80
80
  child_to_parents = defaultdict(list)
81
81
  for ec_code, data in tree.items():
82
82
  terms[ec_code] = Term(
@@ -146,7 +146,9 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
146
146
  for domain in data.get("domains", []):
147
147
  term.append_relationship(
148
148
  has_member,
149
- Reference(prefix=domain["namespace"], identifier=domain["identifier"]),
149
+ Reference.model_validate(
150
+ {"prefix": domain["namespace"], "identifier": domain["identifier"]},
151
+ ),
150
152
  )
151
153
  for protein in data.get("proteins", []):
152
154
  term.append_relationship(
@@ -176,7 +178,7 @@ def normalize_expasy_id(expasy_id: str) -> str:
176
178
  return expasy_id.replace(" ", "")
177
179
 
178
180
 
179
- def give_edge(unnormalized_ec_code: str) -> Tuple[int, Optional[str], str]:
181
+ def give_edge(unnormalized_ec_code: str) -> tuple[int, Optional[str], str]:
180
182
  """Return a (parent, child) tuple for given id."""
181
183
  levels = [x for x in unnormalized_ec_code.replace(" ", "").replace("-", "").split(".") if x]
182
184
  level = len(levels)
@@ -227,7 +229,7 @@ def get_database(lines: Iterable[str]) -> Mapping:
227
229
  for groups in _group_by_id(lines):
228
230
  _, expasy_id = groups[0]
229
231
 
230
- ec_data_entry: Dict[str, Any] = {
232
+ ec_data_entry: dict[str, Any] = {
231
233
  "concept": {
232
234
  "namespace": PREFIX,
233
235
  "identifier": expasy_id,
@@ -249,8 +251,10 @@ def get_database(lines: Iterable[str]) -> Mapping:
249
251
  elif descriptor == DE and value == "Deleted entry.":
250
252
  ec_data_entry["deleted"] = True
251
253
  elif descriptor == DE and value.startswith("Transferred entry: "):
252
- value = value[len("Transferred entry: ") :].rstrip().rstrip(".")
253
- ec_data_entry["transfer_id"] = value.split(" and ")
254
+ # TODO There's a situation where there are enough transfers that it goes on to a second line
255
+ # the following line just gives up on this one. or maybe I don't understand
256
+ value = value.strip().removesuffix("and").rstrip(",").strip()
257
+ ec_data_entry["transfer_id"] = _parse_transfer(value)
254
258
  elif descriptor == DE:
255
259
  ec_data_entry["concept"]["name"] = value.rstrip(".") # type:ignore
256
260
  elif descriptor == AN:
@@ -269,17 +273,30 @@ def get_database(lines: Iterable[str]) -> Mapping:
269
273
  continue
270
274
  uniprot_id, uniprot_accession = uniprot_entry.split(",")
271
275
  ec_data_entry["proteins"].append( # type:ignore
272
- dict(
273
- namespace="uniprot",
274
- name=uniprot_accession,
275
- identifier=uniprot_id,
276
- )
276
+ {
277
+ "namespace": "uniprot",
278
+ "name": uniprot_accession,
279
+ "identifier": uniprot_id,
280
+ }
277
281
  )
278
282
 
279
283
  rv[expasy_id] = ec_data_entry
280
284
  return rv
281
285
 
282
286
 
287
+ TRANSFER_SPLIT_RE = re.compile(r",\s*|\s+and\s+")
288
+
289
+
290
+ def _parse_transfer(value: str) -> list[str]:
291
+ """Parse transferred entry string.
292
+
293
+ >>> _parse_transfer("Transferred entry: 1.1.1.198, 1.1.1.227 and 1.1.1.228.")
294
+ ['1.1.1.198', '1.1.1.227', '1.1.1.228']
295
+ """
296
+ value = value[len("Transferred entry: ") :].rstrip().rstrip(".")
297
+ return sorted(x.strip().removeprefix("and").strip() for x in TRANSFER_SPLIT_RE.split(value))
298
+
299
+
283
300
  def _group_by_id(lines):
284
301
  """Group lines by identifier."""
285
302
  groups = []
@@ -300,7 +317,7 @@ def _group_by_id(lines):
300
317
  return groups
301
318
 
302
319
 
303
- def get_ec2go(version: str) -> Mapping[str, Set[Tuple[str, str]]]:
320
+ def get_ec2go(version: str) -> Mapping[str, set[tuple[str, str]]]:
304
321
  """Get the EC mapping to GO activities."""
305
322
  url = "http://current.geneontology.org/ontology/external2go/ec2go"
306
323
  path = ensure_path(PREFIX, url=url, name="ec2go.tsv", version=version)