pyobo 0.10.5__py3-none-any.whl → 0.10.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyobo/sources/npass.py CHANGED
@@ -77,7 +77,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
77
77
  logger.debug("multiple cids for %s: %s", identifier, pubchem_compound_ids)
78
78
  for pubchem_compound_id in pubchem_compound_ids:
79
79
  term.append_xref(
80
- Reference(prefix="pubchem.compound", identifier=pubchem_compound_id)
80
+ Reference(prefix="pubchem.compound", identifier=pubchem_compound_id.strip())
81
81
  )
82
82
 
83
83
  for synonym in [iupac]:
pyobo/sources/pombase.py CHANGED
@@ -29,7 +29,7 @@ class PomBaseGetter(Obo):
29
29
  """An ontology representation of PomBase's fission yeast gene nomenclature."""
30
30
 
31
31
  ontology = bioversions_key = PREFIX
32
- typedefs = [from_species, has_gene_product]
32
+ typedefs = [from_species, has_gene_product, orthologous]
33
33
 
34
34
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
35
35
  """Iterate over terms in the ontology."""
pyobo/sources/ror.py ADDED
@@ -0,0 +1,163 @@
1
+ """Convert the Research Organization Registry (ROR) into an ontology."""
2
+
3
+ import json
4
+ import zipfile
5
+ from typing import Iterable
6
+
7
+ import bioregistry
8
+ import zenodo_client
9
+ from tqdm.auto import tqdm
10
+
11
+ from pyobo.struct import Obo, Reference, Term, TypeDef
12
+ from pyobo.struct.struct import acronym
13
+
14
+ PREFIX = "ror"
15
+ ROR_ZENODO_RECORD_ID = "10086202"
16
+
17
+ # Constants
18
+ ORG_CLASS = Reference(prefix="OBI", identifier="0000245")
19
+ LOCATED_IN = Reference(prefix="RO", identifier="0001025")
20
+ PART_OF = Reference(prefix="BFO", identifier="0000050")
21
+ HAS_PART = Reference(prefix="BFO", identifier="0000051")
22
+ SUCCESSOR = Reference(prefix="BFO", identifier="0000063")
23
+ PREDECESSOR = Reference(prefix="BFO", identifier="0000062")
24
+
25
+ RMAP = {
26
+ "Related": TypeDef.from_triple("rdfs", "seeAlso"),
27
+ "Child": TypeDef(HAS_PART),
28
+ "Parent": TypeDef(PART_OF),
29
+ "Predecessor": TypeDef(PREDECESSOR),
30
+ "Successor": TypeDef(SUCCESSOR),
31
+ "Located in": TypeDef(LOCATED_IN),
32
+ }
33
+ NAME_REMAPPING = {
34
+ "'s-Hertogenbosch": "Den Bosch", # SMH Netherlands, why u gotta be like this
35
+ "'s Heeren Loo": "s Heeren Loo",
36
+ "'s-Heerenberg": "s-Heerenberg",
37
+ "Institut Virion\\Serion": "Institut Virion/Serion",
38
+ "Hematology\\Oncology Clinic": "Hematology/Oncology Clinic",
39
+ }
40
+
41
+
42
+ class RORGetter(Obo):
43
+ """An ontology representation of the ROR."""
44
+
45
+ ontology = bioregistry_key = PREFIX
46
+ typedefs = list(RMAP.values())
47
+ synonym_typedefs = [acronym]
48
+ idspaces = {
49
+ "ror": "https://ror.org/",
50
+ "geonames": "https://www.geonames.org/",
51
+ "envo": "http://purl.obolibrary.org/obo/ENVO_",
52
+ "bfo": "http://purl.obolibrary.org/obo/BFO_",
53
+ "ro": "http://purl.obolibrary.org/obo/RO_",
54
+ "obi": "http://purl.obolibrary.org/obo/OBI_",
55
+ "omo": "http://purl.obolibrary.org/obo/OMO_",
56
+ "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
57
+ }
58
+
59
+ def __post_init__(self): # noqa: D105
60
+ self.data_version, _url, _path = _get_info()
61
+ super().__post_init__()
62
+
63
+ def iter_terms(self, force: bool = False) -> Iterable[Term]:
64
+ """Iterate over terms in the ontology."""
65
+ return iterate_ror_terms(force=force)
66
+
67
+
68
+ def iterate_ror_terms(*, force: bool = False) -> Iterable[Term]:
69
+ """Iterate over terms in ROR."""
70
+ version, source_uri, records = get_latest(force=force)
71
+ unhandled_xref_prefixes = set()
72
+ for record in tqdm(records, unit_scale=True, unit="record", desc=PREFIX):
73
+ identifier = record["id"].removeprefix("https://ror.org/")
74
+ name = record["name"]
75
+ name = NAME_REMAPPING.get(name, name)
76
+
77
+ term = Term(
78
+ reference=Reference(prefix=PREFIX, identifier=identifier, name=name), type="Instance"
79
+ )
80
+ term.append_parent(ORG_CLASS)
81
+
82
+ if name.startswith("The "):
83
+ term.append_synonym(name.removeprefix("The "))
84
+
85
+ for relationship in record.get("relationships", []):
86
+ target_id = relationship["id"].removeprefix("https://ror.org/")
87
+ term.append_relationship(
88
+ RMAP[relationship["type"]], Reference(prefix=PREFIX, identifier=target_id)
89
+ )
90
+
91
+ term.is_obsolete = record.get("status") != "active"
92
+
93
+ for address in record.get("addresses", []):
94
+ city = address.get("geonames_city")
95
+ if not city:
96
+ continue
97
+ term.append_relationship(
98
+ RMAP["Located in"], Reference(prefix="geonames", identifier=str(city["id"]))
99
+ )
100
+
101
+ for label in record.get("labels", []):
102
+ label = label["label"] # there's a language available in this dict too
103
+ term.append_synonym(label)
104
+ if label.startswith("The "):
105
+ term.append_synonym(label.removeprefix("The "))
106
+
107
+ for synonym in record.get("aliases", []):
108
+ term.append_synonym(synonym)
109
+ if synonym.startswith("The "):
110
+ term.append_synonym(synonym.removeprefix("The "))
111
+
112
+ for acronym_synonym in record.get("acronyms", []):
113
+ term.append_synonym(acronym_synonym, type=acronym)
114
+
115
+ for prefix, xref_data in record.get("external_ids", {}).items():
116
+ if prefix == "OrgRef":
117
+ # OrgRef refers to wikipedia page id, see
118
+ # https://stackoverflow.com/questions/6168020/what-is-wikipedia-pageid-how-to-change-it-into-real-page-url
119
+ continue
120
+ norm_prefix = bioregistry.normalize_prefix(prefix)
121
+ if norm_prefix is None:
122
+ if prefix not in unhandled_xref_prefixes:
123
+ tqdm.write(f"Unhandled prefix: {prefix} in {name} ({term.curie}). Values:")
124
+ for xref_id in xref_data["all"]:
125
+ tqdm.write(f"- {xref_id}")
126
+ unhandled_xref_prefixes.add(prefix)
127
+ continue
128
+
129
+ identifiers = xref_data["all"]
130
+ if isinstance(identifiers, str):
131
+ identifiers = [identifiers]
132
+ for xref_id in identifiers:
133
+ term.append_xref(Reference(prefix=norm_prefix, identifier=xref_id.replace(" ", "")))
134
+
135
+ yield term
136
+
137
+
138
+ def _get_info(*, force: bool = False):
139
+ client = zenodo_client.Zenodo()
140
+ latest_record_id = client.get_latest_record(ROR_ZENODO_RECORD_ID)
141
+ response = client.get_record(latest_record_id)
142
+ response_json = response.json()
143
+ version = response_json["metadata"]["version"].lstrip("v")
144
+ file_record = response_json["files"][0]
145
+ name = file_record["key"]
146
+ url = file_record["links"]["self"]
147
+ path = client.download(latest_record_id, name=name, force=force)
148
+ return version, url, path
149
+
150
+
151
+ def get_latest(*, force: bool = False):
152
+ """Get the latest ROR metadata and records."""
153
+ version, url, path = _get_info(force=force)
154
+ with zipfile.ZipFile(path) as zf:
155
+ for zip_info in zf.filelist:
156
+ if zip_info.filename.endswith(".json"):
157
+ with zf.open(zip_info) as file:
158
+ return version, url, json.load(file)
159
+ raise FileNotFoundError
160
+
161
+
162
+ if __name__ == "__main__":
163
+ RORGetter().write_default(write_obo=True, force=True)
pyobo/sources/sgd.py CHANGED
@@ -5,7 +5,7 @@
5
5
  from typing import Iterable
6
6
  from urllib.parse import unquote_plus
7
7
 
8
- from ..struct import Obo, Reference, Synonym, SynonymTypeDef, Term, from_species
8
+ from ..struct import Obo, Reference, Synonym, Term, from_species
9
9
  from ..utils.path import ensure_tar_df
10
10
 
11
11
  __all__ = [
@@ -21,15 +21,12 @@ URL = (
21
21
  )
22
22
  INNER_PATH = "S288C_reference_genome_R64-2-1_20150113/saccharomyces_cerevisiae_R64-2-1_20150113.gff"
23
23
 
24
- alias_type = SynonymTypeDef.from_text("alias")
25
-
26
24
 
27
25
  class SGDGetter(Obo):
28
26
  """An ontology representation of SGD's yeast gene nomenclature."""
29
27
 
30
28
  bioversions_key = ontology = PREFIX
31
29
  typedefs = [from_species]
32
- synonym_typedefs = [alias_type]
33
30
 
34
31
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
35
32
  """Iterate over terms for SGD."""
@@ -68,7 +65,7 @@ def get_terms(ontology: Obo, force: bool = False) -> Iterable[Term]:
68
65
  aliases = d.get("Alias")
69
66
  if aliases:
70
67
  for alias in aliases.split(","):
71
- synonyms.append(Synonym(name=unquote_plus(alias), type=alias_type))
68
+ synonyms.append(Synonym(name=unquote_plus(alias)))
72
69
 
73
70
  term = Term(
74
71
  reference=Reference(prefix=PREFIX, identifier=identifier, name=name),
pyobo/sources/slm.py CHANGED
@@ -7,8 +7,9 @@ from typing import Iterable
7
7
  import pandas as pd
8
8
  from tqdm.auto import tqdm
9
9
 
10
- from pyobo import Obo, Reference, SynonymTypeDef, Term
11
- from pyobo.struct.typedef import has_inchi, has_smiles
10
+ from pyobo import Obo, Reference, Term
11
+ from pyobo.struct.struct import abbreviation as abbreviation_typedef
12
+ from pyobo.struct.typedef import exact_match, has_inchi, has_smiles
12
13
  from pyobo.utils.path import ensure_df
13
14
 
14
15
  __all__ = [
@@ -38,14 +39,13 @@ COLUMNS = [
38
39
  "PMID",
39
40
  ]
40
41
 
41
- abreviation_type = SynonymTypeDef.from_text("abbreviation")
42
-
43
42
 
44
43
  class SLMGetter(Obo):
45
44
  """An ontology representation of SwissLipid's lipid nomenclature."""
46
45
 
47
46
  ontology = bioversions_key = PREFIX
48
- synonym_typedefs = [abreviation_type]
47
+ typedefs = [exact_match]
48
+ synonym_typedefs = [abbreviation_typedef]
49
49
 
50
50
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
51
51
  """Iterate over terms in the ontology."""
@@ -94,7 +94,7 @@ def iter_terms(version: str, force: bool = False):
94
94
  if pd.notna(level):
95
95
  term.append_property("level", level)
96
96
  if pd.notna(abbreviation):
97
- term.append_synonym(abbreviation, type=abreviation_type)
97
+ term.append_synonym(abbreviation, type=abbreviation_typedef)
98
98
  if pd.notna(synonyms):
99
99
  for synonym in synonyms.split("|"):
100
100
  term.append_synonym(synonym.strip())
@@ -0,0 +1,36 @@
1
+ """Utilities for UMLS synonyms."""
2
+
3
+ from pathlib import Path
4
+ from typing import Mapping
5
+
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+
9
+ from pyobo.utils.io import open_map_tsv, write_map_tsv
10
+
11
+ __all__ = ["get_umls_synonyms"]
12
+
13
+ HERE = Path(__file__).parent.resolve()
14
+ SYNONYM_TYPE_PATH = HERE.joinpath("synonym_types.tsv")
15
+
16
+ ABBREVIATIONS_URL = "https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html"
17
+
18
+
19
+ def get_umls_synonyms(*, refresh: bool = False) -> Mapping[str, str]:
20
+ """Get all synonyms."""
21
+ if SYNONYM_TYPE_PATH.is_file() and not refresh:
22
+ return open_map_tsv(SYNONYM_TYPE_PATH)
23
+ res = requests.get(ABBREVIATIONS_URL, timeout=5)
24
+ soup = BeautifulSoup(res.text, features="html.parser")
25
+ table = soup.find(id="mrdoc_TTY")
26
+ body = table.find("tbody")
27
+ rv = {}
28
+ for row in body.find_all("tr"):
29
+ left, right = row.find_all("td")
30
+ rv[left.text.strip()] = right.text.strip()
31
+ write_map_tsv(path=SYNONYM_TYPE_PATH, rv=rv, header=["key", "name"])
32
+ return rv
33
+
34
+
35
+ if __name__ == "__main__":
36
+ get_umls_synonyms(refresh=True)