PyPI - pyobo - Versions diffs - 0.10.5__py3-none-any.whl → 0.10.7__py3-none-any.whl - Mend

pyobo 0.10.5__py3-none-any.whl → 0.10.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

pyobo/__init__.py +1 -0
pyobo/api/__init__.py +1 -0
pyobo/api/names.py +21 -0
pyobo/gilda_utils.py +54 -47
pyobo/identifier_utils.py +1 -1
pyobo/obographs.py +12 -4
pyobo/reader.py +17 -14
pyobo/sources/__init__.py +4 -0
pyobo/sources/cgnc.py +2 -1
pyobo/sources/chembl.py +2 -1
pyobo/sources/complexportal.py +11 -0
pyobo/sources/depmap.py +2 -0
pyobo/sources/drugcentral.py +2 -1
pyobo/sources/geonames.py +239 -0
pyobo/sources/hgnc.py +32 -1
pyobo/sources/mgi.py +3 -1
pyobo/sources/mirbase.py +2 -0
pyobo/sources/mirbase_family.py +5 -2
pyobo/sources/npass.py +4 -2
pyobo/sources/pombase.py +1 -1
pyobo/sources/ror.py +163 -0
pyobo/sources/sgd.py +2 -5
pyobo/sources/slm.py +6 -6
pyobo/sources/umls/get_synonym_types.py +36 -0
pyobo/sources/umls/synonym_types.tsv +243 -242
pyobo/sources/umls/umls.py +3 -7
pyobo/sources/uniprot/uniprot.py +5 -5
pyobo/sources/zfin.py +2 -1
pyobo/struct/reference.py +17 -2
pyobo/struct/struct.py +73 -19
pyobo/struct/typedef.py +30 -7
pyobo/version.py +1 -1
{pyobo-0.10.5.dist-info → pyobo-0.10.7.dist-info}/METADATA +2 -2
{pyobo-0.10.5.dist-info → pyobo-0.10.7.dist-info}/RECORD +38 -35
{pyobo-0.10.5.dist-info → pyobo-0.10.7.dist-info}/WHEEL +1 -1
{pyobo-0.10.5.dist-info → pyobo-0.10.7.dist-info}/LICENSE +0 -0
{pyobo-0.10.5.dist-info → pyobo-0.10.7.dist-info}/entry_points.txt +0 -0
{pyobo-0.10.5.dist-info → pyobo-0.10.7.dist-info}/top_level.txt +0 -0

pyobo/sources/hgnc.py CHANGED Viewed

@@ -27,6 +27,7 @@ from pyobo.struct import (
     orthologous,
     transcribes_to,
 )
+from pyobo.struct.typedef import exact_match
 from pyobo.utils.path import ensure_path, prefix_directory_join
 __all__ = [
@@ -108,6 +109,28 @@ ENCODINGS = {
     "unknown": "GRP",
 }
+#: Keys of an HGNC entry that are deliberately left unprocessed. Leftover keys that
+#: appear in this set are not counted in the ``unhandled_entry_keys`` tally kept
+#: by :func:`get_terms`.
+SKIP_KEYS = {
+    "date_approved_reserved",
+    "_version_",
+    "uuid",
+    "date_modified",
+    "date_name_changed",
+    "date_symbol_changed",
+    "symbol_report_tag",
+    "location_sortable",
+    "curator_notes",
+    "agr",  # repeat of HGNC ID
+    "gencc",  # repeat of HGNC ID
+    "bioparadigms_slc",  # repeat of symbol
+    "lncrnadb",  # repeat of symbol
+    "gtrnadb",  # repeat of symbol
+    "horde_id",  # repeat of symbol
+    "imgt",  # repeat of symbol
+    "cd",  # symbol
+    "homeodb",  # TODO add to bioregistry, though this is defunct
+    "mamit-trnadb",  # TODO add to bioregistry, though this is defunct
+}
 #: A mapping from HGNC's locus_type annotations to sequence ontology identifiers
 LOCUS_TYPE_TO_SO = {
     # protein-coding gene
@@ -190,6 +213,7 @@ class HGNCGetter(Obo):
         transcribes_to,
         orthologous,
         member_of,
+        exact_match,
     ]
     idspaces = IDSPACES
     synonym_typedefs = [
@@ -330,6 +354,12 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
             else:
                 tqdm.write(f"unhandled IUPHAR: {iuphar}")
+        for lrg_info in entry.pop("lsdb", []):
+            if lrg_info.startswith("LRG_"):
+                lrg_curie = lrg_info.split("|")[0]
+                _, lrg_id = lrg_curie.split("_")
+                term.append_xref(Reference(prefix="lrg", identifier=lrg_id))
         for xref_prefix, key in gene_xrefs:
             xref_identifiers = entry.pop(key, None)
             if xref_identifiers is None:
@@ -397,7 +427,8 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
         term.set_species(identifier="9606", name="Homo sapiens")
         for key in entry:
-            unhandled_entry_keys[key] += 1
+            if key not in SKIP_KEYS:
+                unhandled_entry_keys[key] += 1
         yield term
     with open(prefix_directory_join(PREFIX, name="unhandled.json"), "w") as file:

pyobo/sources/mgi.py CHANGED Viewed

@@ -9,6 +9,8 @@ from typing import Iterable
 import pandas as pd
 from tqdm.auto import tqdm
+from pyobo.struct.typedef import exact_match
 from ..struct import (
     Obo,
     Reference,
@@ -37,7 +39,7 @@ class MGIGetter(Obo):
     ontology = PREFIX
     dynamic_version = True
-    typedefs = [from_species, has_gene_product, transcribes_to]
+    typedefs = [from_species, has_gene_product, transcribes_to, exact_match]
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""

pyobo/sources/mirbase.py CHANGED Viewed

@@ -136,6 +136,8 @@ def _process_definitions_lines(
             xref_prefix, xref_identifier, xref_label = map(str.strip, line.split(";"))
             xref_prefix = xref_prefix.lower()
             xref_prefix = xref_mapping.get(xref_prefix, xref_prefix)
+            if xref_prefix == "pictar":
+                continue
             xrefs.append(
                 Reference(prefix=xref_prefix, identifier=xref_identifier, name=xref_label or None)
             )

pyobo/sources/mirbase_family.py CHANGED Viewed

@@ -26,6 +26,7 @@ class MiRBaseFamilyGetter(Obo):
     ontology = PREFIX
     bioversions_key = "mirbase"
+    typedefs = [has_member]
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
@@ -40,7 +41,9 @@ def get_obo(force: bool = False) -> Obo:
 def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     """Get miRBase family terms."""
     df = get_df(version, force=force)
-    for family_id, name, mirna_id, mirna_name in tqdm(df.values, total=len(df.index)):
+    for family_id, name, mirna_id, mirna_name in tqdm(
+        df.values, total=len(df.index), unit_scale=True, desc="miRBase Family"
+    ):
         term = Term(
             reference=Reference(prefix=PREFIX, identifier=family_id, name=name),
         )
@@ -65,4 +68,4 @@ def get_df(version: str, force: bool = False) -> pd.DataFrame:
 if __name__ == "__main__":
-    get_obo().write_default(use_tqdm=True)
+    get_obo().write_default(use_tqdm=True, write_obo=True, force=True)

pyobo/sources/npass.py CHANGED Viewed

@@ -72,12 +72,14 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
         # TODO check that the first is always the parent compound?
         if pd.notna(pubchem_compound_ids):
-            pubchem_compound_ids = pubchem_compound_ids.split(";")
+            pubchem_compound_ids = [
+                yy.strip() for xx in pubchem_compound_ids.split(";") for yy in xx.strip().split(",")
+            ]
             if len(pubchem_compound_ids) > 1:
                 logger.debug("multiple cids for %s: %s", identifier, pubchem_compound_ids)
             for pubchem_compound_id in pubchem_compound_ids:
                 term.append_xref(
-                    Reference(prefix="pubchem.compound", identifier=pubchem_compound_id)
+                    Reference(prefix="pubchem.compound", identifier=pubchem_compound_id.strip())
                 )
         for synonym in [iupac]:

pyobo/sources/pombase.py CHANGED Viewed

@@ -29,7 +29,7 @@ class PomBaseGetter(Obo):
     """An ontology representation of PomBase's fission yeast gene nomenclature."""
     ontology = bioversions_key = PREFIX
-    typedefs = [from_species, has_gene_product]
+    typedefs = [from_species, has_gene_product, orthologous]
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""

pyobo/sources/ror.py ADDED Viewed

@@ -0,0 +1,163 @@
+"""Convert the Research Organization Registry (ROR) into an ontology."""
+import json
+import zipfile
+from typing import Iterable
+import bioregistry
+import zenodo_client
+from tqdm.auto import tqdm
+from pyobo.struct import Obo, Reference, Term, TypeDef
+from pyobo.struct.struct import acronym
+#: Prefix used for every ROR term reference and as the name of the ontology
+PREFIX = "ror"
+#: Zenodo record identifier handed to ``get_latest_record()`` in :func:`_get_info`
+#: to find the newest record
+ROR_ZENODO_RECORD_ID = "10086202"
+# Constants
+#: Appended as a parent to every ROR term
+ORG_CLASS = Reference(prefix="OBI", identifier="0000245")
+# References wrapped into the relationship typedefs of RMAP below
+LOCATED_IN = Reference(prefix="RO", identifier="0001025")
+PART_OF = Reference(prefix="BFO", identifier="0000050")
+HAS_PART = Reference(prefix="BFO", identifier="0000051")
+SUCCESSOR = Reference(prefix="BFO", identifier="0000063")
+PREDECESSOR = Reference(prefix="BFO", identifier="0000062")
+#: Maps the ``type`` of a ROR relationship to the typedef used when the relationship
+#: is added to a term. "Located in" is not looked up from a relationship ``type``;
+#: it is used for the geonames city of each address.
+RMAP = {
+    "Related": TypeDef.from_triple("rdfs", "seeAlso"),
+    "Child": TypeDef(HAS_PART),
+    "Parent": TypeDef(PART_OF),
+    "Predecessor": TypeDef(PREDECESSOR),
+    "Successor": TypeDef(SUCCESSOR),
+    "Located in": TypeDef(LOCATED_IN),
+}
+#: Replacement names, looked up by the exact ``name`` of a record. The keys start
+#: with an apostrophe or contain a backslash.
+# NOTE(review): presumably these characters cause trouble downstream - confirm
+NAME_REMAPPING = {
+    "'s-Hertogenbosch": "Den Bosch",  # SMH Netherlands, why u gotta be like this
+    "'s Heeren Loo": "s Heeren Loo",
+    "'s-Heerenberg": "s-Heerenberg",
+    "Institut Virion\\Serion": "Institut Virion/Serion",
+    "Hematology\\Oncology Clinic": "Hematology/Oncology Clinic",
+}
+class RORGetter(Obo):
+    """Expose the Research Organization Registry (ROR) as an ontology."""
+    ontology = PREFIX
+    bioregistry_key = PREFIX
+    # one typedef per relationship kind that can be attached to a ROR term
+    typedefs = [*RMAP.values()]
+    synonym_typedefs = [acronym]
+    idspaces = dict(
+        ror="https://ror.org/",
+        geonames="https://www.geonames.org/",
+        envo="http://purl.obolibrary.org/obo/ENVO_",
+        bfo="http://purl.obolibrary.org/obo/BFO_",
+        ro="http://purl.obolibrary.org/obo/RO_",
+        obi="http://purl.obolibrary.org/obo/OBI_",
+        omo="http://purl.obolibrary.org/obo/OMO_",
+        rdfs="http://www.w3.org/2000/01/rdf-schema#",
+    )
+    def __post_init__(self):  # noqa: D105
+        # the data version comes from the record metadata returned by _get_info()
+        latest_version, _, _ = _get_info()
+        self.data_version = latest_version
+        super().__post_init__()
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Return an iterable of the ROR terms, delegating to :func:`iterate_ror_terms`."""
+        terms = iterate_ror_terms(force=force)
+        return terms
+def _get_all_identifiers(xref_data) -> list:
+    """Return the ``all`` entry of a ROR external identifier record as a list.
+
+    :param xref_data: The value stored under one key of a record's ``external_ids``
+    :returns: The identifiers under ``all``; a lone string is wrapped in a list
+    """
+    identifiers = xref_data["all"]
+    if isinstance(identifiers, str):
+        return [identifiers]
+    return identifiers
+def iterate_ror_terms(*, force: bool = False) -> Iterable[Term]:
+    """Iterate over terms in ROR.
+
+    :param force: Passed through to :func:`get_latest`
+    :yields: One term per ROR record, carrying its relationships, geonames
+        locations, synonyms, acronyms, and cross-references
+    """
+    _version, _source_uri, records = get_latest(force=force)
+    # prefixes that could not be normalized; each one is reported only once
+    unhandled_xref_prefixes = set()
+    for record in tqdm(records, unit_scale=True, unit="record", desc=PREFIX):
+        identifier = record["id"].removeprefix("https://ror.org/")
+        name = record["name"]
+        name = NAME_REMAPPING.get(name, name)
+        term = Term(
+            reference=Reference(prefix=PREFIX, identifier=identifier, name=name), type="Instance"
+        )
+        term.append_parent(ORG_CLASS)
+        # names, labels, and aliases starting with "The " also get a synonym without it
+        if name.startswith("The "):
+            term.append_synonym(name.removeprefix("The "))
+        for relationship in record.get("relationships", []):
+            target_id = relationship["id"].removeprefix("https://ror.org/")
+            term.append_relationship(
+                RMAP[relationship["type"]], Reference(prefix=PREFIX, identifier=target_id)
+            )
+        # any status other than "active" marks the term as obsolete
+        term.is_obsolete = record.get("status") != "active"
+        for address in record.get("addresses", []):
+            city = address.get("geonames_city")
+            if not city:
+                continue
+            term.append_relationship(
+                RMAP["Located in"], Reference(prefix="geonames", identifier=str(city["id"]))
+            )
+        for label in record.get("labels", []):
+            label = label["label"]  # there's a language available in this dict too
+            term.append_synonym(label)
+            if label.startswith("The "):
+                term.append_synonym(label.removeprefix("The "))
+        for synonym in record.get("aliases", []):
+            term.append_synonym(synonym)
+            if synonym.startswith("The "):
+                term.append_synonym(synonym.removeprefix("The "))
+        for acronym_synonym in record.get("acronyms", []):
+            term.append_synonym(acronym_synonym, type=acronym)
+        for prefix, xref_data in record.get("external_ids", {}).items():
+            if prefix == "OrgRef":
+                # OrgRef refers to wikipedia page id, see
+                # https://stackoverflow.com/questions/6168020/what-is-wikipedia-pageid-how-to-change-it-into-real-page-url
+                continue
+            norm_prefix = bioregistry.normalize_prefix(prefix)
+            if norm_prefix is None:
+                if prefix not in unhandled_xref_prefixes:
+                    tqdm.write(f"Unhandled prefix: {prefix} in {name} ({term.curie}). Values:")
+                    # go through the helper so a lone string identifier is reported
+                    # whole instead of one character per line
+                    for xref_id in _get_all_identifiers(xref_data):
+                        tqdm.write(f"- {xref_id}")
+                    unhandled_xref_prefixes.add(prefix)
+                continue
+            for xref_id in _get_all_identifiers(xref_data):
+                term.append_xref(Reference(prefix=norm_prefix, identifier=xref_id.replace(" ", "")))
+        yield term
+def _get_info(*, force: bool = False):
+    """Look up the latest ROR record on Zenodo and download its first file.
+
+    :param force: Passed through to the Zenodo client's ``download()``
+    :returns: A three-tuple of the version string (without a leading ``v``), the
+        URL under the file's ``links.self`` entry, and the value returned by
+        the client's ``download()`` call
+    """
+    client = zenodo_client.Zenodo()
+    latest_record_id = client.get_latest_record(ROR_ZENODO_RECORD_ID)
+    response = client.get_record(latest_record_id)
+    response_json = response.json()
+    # removeprefix() drops exactly one leading "v"; lstrip("v") would strip every
+    # leading "v" character
+    version = response_json["metadata"]["version"].removeprefix("v")
+    # NOTE(review): assumes the first listed file is the data archive - confirm
+    file_record = response_json["files"][0]
+    name = file_record["key"]
+    url = file_record["links"]["self"]
+    path = client.download(latest_record_id, name=name, force=force)
+    return version, url, path
+def get_latest(*, force: bool = False):
+    """Get the latest ROR metadata and records.
+
+    :param force: Passed through to :func:`_get_info`
+    :returns: A three-tuple of the version, the source URL, and the parsed content
+        of the first ``.json`` member found in the downloaded zip archive
+    :raises FileNotFoundError: If the archive has no member ending in ``.json``
+    """
+    version, url, path = _get_info(force=force)
+    with zipfile.ZipFile(path) as zf:
+        for zip_info in zf.infolist():
+            if zip_info.filename.endswith(".json"):
+                with zf.open(zip_info) as file:
+                    return version, url, json.load(file)
+    # name the archive so a changed upstream layout is easy to diagnose
+    raise FileNotFoundError(f"no .json file found inside ROR archive: {path}")
+# When run as a script, build the ROR getter and call write_default() with
+# write_obo=True and force=True
+if __name__ == "__main__":
+    RORGetter().write_default(write_obo=True, force=True)

pyobo/sources/sgd.py CHANGED Viewed

@@ -5,7 +5,7 @@
 from typing import Iterable
 from urllib.parse import unquote_plus
-from ..struct import Obo, Reference, Synonym, SynonymTypeDef, Term, from_species
+from ..struct import Obo, Reference, Synonym, Term, from_species
 from ..utils.path import ensure_tar_df
 __all__ = [
@@ -21,15 +21,12 @@ URL = (
 )
 INNER_PATH = "S288C_reference_genome_R64-2-1_20150113/saccharomyces_cerevisiae_R64-2-1_20150113.gff"
-alias_type = SynonymTypeDef.from_text("alias")
 class SGDGetter(Obo):
     """An ontology representation of SGD's yeast gene nomenclature."""
     bioversions_key = ontology = PREFIX
     typedefs = [from_species]
-    synonym_typedefs = [alias_type]
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms for SGD."""
@@ -68,7 +65,7 @@ def get_terms(ontology: Obo, force: bool = False) -> Iterable[Term]:
         aliases = d.get("Alias")
         if aliases:
             for alias in aliases.split(","):
-                synonyms.append(Synonym(name=unquote_plus(alias), type=alias_type))
+                synonyms.append(Synonym(name=unquote_plus(alias)))
         term = Term(
             reference=Reference(prefix=PREFIX, identifier=identifier, name=name),

pyobo/sources/slm.py CHANGED Viewed

@@ -7,8 +7,9 @@ from typing import Iterable
 import pandas as pd
 from tqdm.auto import tqdm
-from pyobo import Obo, Reference, SynonymTypeDef, Term
-from pyobo.struct.typedef import has_inchi, has_smiles
+from pyobo import Obo, Reference, Term
+from pyobo.struct.struct import abbreviation as abbreviation_typedef
+from pyobo.struct.typedef import exact_match, has_inchi, has_smiles
 from pyobo.utils.path import ensure_df
 __all__ = [
@@ -38,14 +39,13 @@ COLUMNS = [
     "PMID",
 ]
-abreviation_type = SynonymTypeDef.from_text("abbreviation")
 class SLMGetter(Obo):
     """An ontology representation of SwissLipid's lipid nomenclature."""
     ontology = bioversions_key = PREFIX
-    synonym_typedefs = [abreviation_type]
+    typedefs = [exact_match]
+    synonym_typedefs = [abbreviation_typedef]
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
@@ -94,7 +94,7 @@ def iter_terms(version: str, force: bool = False):
         if pd.notna(level):
             term.append_property("level", level)
         if pd.notna(abbreviation):
-            term.append_synonym(abbreviation, type=abreviation_type)
+            term.append_synonym(abbreviation, type=abbreviation_typedef)
         if pd.notna(synonyms):
             for synonym in synonyms.split("|"):
                 term.append_synonym(synonym.strip())

pyobo/sources/umls/get_synonym_types.py ADDED Viewed

@@ -0,0 +1,36 @@
+"""Utilities for UMLS synonyms."""
+from pathlib import Path
+from typing import Mapping
+import requests
+from bs4 import BeautifulSoup
+from pyobo.utils.io import open_map_tsv, write_map_tsv
+__all__ = ["get_umls_synonyms"]
+#: Directory containing this module
+HERE = Path(__file__).parent.resolve()
+#: TSV file next to this module that stores the synonym type mapping
+SYNONYM_TYPE_PATH = HERE.joinpath("synonym_types.tsv")
+#: Web page that :func:`get_umls_synonyms` scrapes for the ``mrdoc_TTY`` table
+ABBREVIATIONS_URL = "https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html"
+def get_umls_synonyms(*, refresh: bool = False) -> Mapping[str, str]:
+    """Get the UMLS synonym type mapping from keys to names.
+
+    :param refresh: If false and the TSV file next to this module exists, read the
+        mapping from it. Otherwise, scrape the abbreviations page and rewrite the file.
+    :returns: A mapping from the text of the first cell to the text of the second
+        cell of each row in the ``mrdoc_TTY`` table
+    :raises requests.HTTPError: If the abbreviations page responds with an error status
+    :raises ValueError: If the page has no ``mrdoc_TTY`` table with a ``tbody``
+    """
+    if SYNONYM_TYPE_PATH.is_file() and not refresh:
+        return open_map_tsv(SYNONYM_TYPE_PATH)
+    res = requests.get(ABBREVIATIONS_URL, timeout=5)
+    # fail on an HTTP error rather than trying to parse an error page
+    res.raise_for_status()
+    soup = BeautifulSoup(res.text, features="html.parser")
+    table = soup.find(id="mrdoc_TTY")
+    body = table.find("tbody") if table is not None else None
+    if body is None:
+        # say what is missing instead of raising AttributeError on None
+        raise ValueError(f"could not find the mrdoc_TTY table body in {ABBREVIATIONS_URL}")
+    rv = {}
+    for row in body.find_all("tr"):
+        left, right = row.find_all("td")
+        rv[left.text.strip()] = right.text.strip()
+    write_map_tsv(path=SYNONYM_TYPE_PATH, rv=rv, header=["key", "name"])
+    return rv
+# When run as a script, bypass the stored TSV file and rebuild it from the web page
+if __name__ == "__main__":
+    get_umls_synonyms(refresh=True)

pyobo 0.10.5__py3-none-any.whl → 0.10.7__py3-none-any.whl

pyobo 0.10.5__py3-none-any.whl → 0.10.7__py3-none-any.whl