pyobo 0.11.0__py3-none-any.whl → 0.11.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/constants.py +1 -0
- pyobo/gilda_utils.py +14 -11
- pyobo/obographs.py +5 -2
- pyobo/resources/so.py +55 -0
- pyobo/resources/so.tsv +2604 -0
- pyobo/sources/complexportal.py +54 -15
- pyobo/sources/dictybase_gene.py +14 -9
- pyobo/sources/drugcentral.py +4 -1
- pyobo/sources/expasy.py +22 -4
- pyobo/sources/flybase.py +3 -2
- pyobo/sources/hgnc.py +24 -19
- pyobo/sources/hgncgenefamily.py +7 -7
- pyobo/sources/kegg/genome.py +18 -6
- pyobo/sources/mirbase.py +9 -3
- pyobo/sources/npass.py +1 -1
- pyobo/sources/pathbank.py +32 -23
- pyobo/sources/pombase.py +6 -3
- pyobo/sources/reactome.py +28 -7
- pyobo/sources/rgd.py +1 -1
- pyobo/sources/slm.py +28 -14
- pyobo/sources/uniprot/uniprot.py +7 -6
- pyobo/sources/zfin.py +18 -6
- pyobo/struct/reference.py +9 -8
- pyobo/struct/struct.py +30 -20
- pyobo/struct/typedef.py +5 -0
- pyobo/version.py +1 -1
- {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/METADATA +50 -62
- {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/RECORD +31 -45
- {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/WHEEL +1 -1
- pyobo/apps/__init__.py +0 -3
- pyobo/apps/cli.py +0 -24
- pyobo/apps/gilda/__init__.py +0 -3
- pyobo/apps/gilda/__main__.py +0 -8
- pyobo/apps/gilda/app.py +0 -48
- pyobo/apps/gilda/cli.py +0 -36
- pyobo/apps/gilda/templates/base.html +0 -33
- pyobo/apps/gilda/templates/home.html +0 -11
- pyobo/apps/gilda/templates/matches.html +0 -32
- pyobo/apps/mapper/__init__.py +0 -3
- pyobo/apps/mapper/__main__.py +0 -11
- pyobo/apps/mapper/cli.py +0 -37
- pyobo/apps/mapper/mapper.py +0 -187
- pyobo/apps/mapper/templates/base.html +0 -35
- pyobo/apps/mapper/templates/mapper_home.html +0 -64
- pyobo-0.11.0.dist-info/LICENSE +0 -21
- {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/entry_points.txt +0 -0
- {pyobo-0.11.0.dist-info → pyobo-0.11.2.dist-info}/top_level.txt +0 -0
pyobo/sources/complexportal.py
CHANGED
```diff
@@ -58,11 +58,31 @@ def _parse_members(s) -> list[tuple[Reference, str]]:
     for member in s.split("|"):
         entity_id, count = member.split("(")
         count = count.rstrip(")")
-        if "
+        if entity_id.startswith("URS"):
+            prefix, identifier = "rnacentral", entity_id
+        elif entity_id.startswith("CPX"):
+            # TODO why self xref?
+            prefix, identifier = "complexportal", entity_id
+        elif entity_id.startswith("["):
+            continue  # this is a list of uniprot IDs, not sure what to do with this
+        elif entity_id.startswith("EBI-"):
+            continue
+        elif ":" not in entity_id:
+            if "PRO_" in entity_id:
+                prefix = "uniprot.chain"
+                identifier = entity_id.split("-")[1]
+            elif "-" in entity_id:
+                prefix, identifier = "uniprot.isoform", entity_id
+            else:
+                prefix, identifier = "uniprot", entity_id
+        else:
             prefix, identifier = entity_id.split(":", 1)
+        try:
+            reference = Reference(prefix=prefix, identifier=identifier)
+        except ValueError:
+            tqdm.write(f"failed to validate reference: {entity_id}")
         else:
-
-            rv.append((Reference(prefix=prefix, identifier=identifier), count))
+            rv.append((reference, count))
     return rv
 
 
@@ -74,27 +94,40 @@ def _parse_xrefs(s) -> list[tuple[Reference, str]]:
     for xref in s.split("|"):
         xref = xref.replace("protein ontology:PR:", "PR:")
         xref = xref.replace("protein ontology:PR_", "PR:")
+        xref = xref.replace("rhea:rhea ", "rhea:")
+        xref = xref.replace("rhea:Rhea ", "rhea:")
+        xref = xref.replace("rhea:RHEA:rhea", "rhea:")
+        xref = xref.replace("rhea:RHEA: ", "rhea:")
+        xref = xref.replace("rhea:RHEA:rhea ", "rhea:")
+        xref = xref.replace("intenz:RHEA:", "rhea:")
+        xref = xref.replace("eccode::", "eccode:")
+        xref = xref.replace("eccode:EC:", "eccode:")
+        xref = xref.replace("intenz:EC:", "eccode:")
+        xref = xref.replace("eccode:RHEA:", "rhea:")
+        xref = xref.replace("efo:MONDO:", "MONDO:")
+        xref = xref.replace("omim:MIM:", "omim:")
+        xref = xref.replace("efo:HP:", "HP:")
+        xref = xref.replace("efo:Orphanet:", "Orphanet:")
+        xref = xref.replace("orphanet:ORDO:", "Orphanet:")
+        xref = xref.replace("biorxiv:doi.org/", "doi:")
+        xref = xref.replace("emdb:EMDB-", "emdb:EMD-")
+        xref = xref.replace("wwpdb:EMD-", "emdb:EMD-")
+        xref = xref.replace("signor:CPX-", "complexportal:CPX-")
+
         try:
             xref_curie, note = xref.split("(")
         except ValueError:
             logger.warning("xref missing (: %s", xref)
             continue
         note = note.rstrip(")")
-
-
-
-
-        note.replace("eccode:RHEA:", "rhea:")
-        if note.lower().startswith("rhea "):
-            note = note[len("Rhea ") :]
-        if note.lower().startswith("rhea:rhea "):
-            note = note[len("rhea:rhea ") :]
-        if note.lower().startswith("EC:"):
-            note = note[len("EC:") :]
+
+        if xref_curie.startswith("intenz:"):
+            xref_curie = _clean_intenz(xref_curie)
+
         try:
             reference = Reference.from_curie(xref_curie)
         except ValueError:
-            logger.warning("can not parse CURIE: %s",
+            logger.warning("can not parse CURIE: %s", xref_curie)
             continue
         if reference is None:
             logger.warning("reference is None after parsing: %s", xref)
@@ -103,6 +136,12 @@ def _parse_xrefs(s) -> list[tuple[Reference, str]]:
     return rv
 
 
+def _clean_intenz(s: str) -> str:
+    for _ in range(3):
+        s = s.rstrip("-").rstrip(".")
+    return s
+
+
 class ComplexPortalGetter(Obo):
     """An ontology representation of the Complex Portal."""
 
```
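The interesting part of this change is the new prefix routing in `_parse_members`, which replaces the old bare CURIE split. A minimal standalone sketch of that routing, using a plain `(prefix, identifier)` tuple in place of pyobo's `Reference` model and illustrative accession strings:

```python
from __future__ import annotations

# Sketch of the new member routing in _parse_members; the example accessions
# below are illustrative, and a tuple stands in for pyobo's Reference model.
def route_member(entity_id: str) -> tuple[str, str] | None:
    if entity_id.startswith("URS"):  # RNAcentral sequence
        return "rnacentral", entity_id
    if entity_id.startswith("CPX"):  # a complex referring to another complex
        return "complexportal", entity_id
    if entity_id.startswith("[") or entity_id.startswith("EBI-"):
        return None  # skipped in the diff (uniprot ID lists, EBI- ids)
    if ":" not in entity_id:
        if "PRO_" in entity_id:  # UniProt chain, e.g. <accession>-PRO_<number>
            return "uniprot.chain", entity_id.split("-")[1]
        if "-" in entity_id:  # UniProt isoform, e.g. <accession>-2
            return "uniprot.isoform", entity_id
        return "uniprot", entity_id
    prefix, identifier = entity_id.split(":", 1)  # already a CURIE
    return prefix, identifier

assert route_member("URS00000AAAAA") == ("rnacentral", "URS00000AAAAA")
assert route_member("P12345-2") == ("uniprot.isoform", "P12345-2")
assert route_member("EBI-1234") is None
```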
pyobo/sources/dictybase_gene.py
CHANGED
```diff
@@ -9,8 +9,7 @@ from collections.abc import Iterable
 import pandas as pd
 from tqdm.auto import tqdm
 
-from pyobo.struct import Obo,
-from pyobo.utils.io import multisetdict
+from pyobo.struct import Obo, Synonym, Term, from_species, has_gene_product
 from pyobo.utils.path import ensure_df
 
 __all__ = [
@@ -49,10 +48,11 @@ def get_obo(force: bool = False) -> Obo:
 
 def get_terms(force: bool = False) -> Iterable[Term]:
     """Get terms."""
+    # TODO the mappings file has actually no uniprot at all, and requires text mining
     # DDB ID  DDB_G ID  Name  UniProt ID
-    uniprot_mappings = multisetdict(
-
-    )
+    # uniprot_mappings = multisetdict(
+    #     ensure_df(PREFIX, url=URL, force=force, name="uniprot_mappings.tsv", usecols=[1, 3]).values
+    # )
 
     terms = ensure_df(PREFIX, url=URL, force=force, name="gene_info.tsv")
     # GENE ID (DDB_G ID)  Gene Name  Synonyms  Gene products
@@ -68,10 +68,15 @@ def get_terms(force: bool = False) -> Iterable[Term]:
         if synonyms and pd.notna(synonyms):
             for synonym in synonyms.split(","):
                 term.append_synonym(Synonym(synonym.strip()))
-        for uniprot_id in uniprot_mappings.get(identifier, []):
-
-
-
+        # for uniprot_id in uniprot_mappings.get(identifier, []):
+        #     if not uniprot_id or pd.isna(uniprot_id) or uniprot_id in {"unknown", "pseudogene"}:
+        #         continue
+        #     try:
+        #         uniprot_ref = Reference(prefix="uniprot", identifier=uniprot_id)
+        #     except ValueError:
+        #         tqdm.write(f"[dictybase.gene] invalid uniprot ref: {uniprot_id}")
+        #     else:
+        #         term.append_relationship(has_gene_product, uniprot_ref)
 
         term.set_species(identifier="44689", name="Dictyostelium discoideum")
         yield term
```
pyobo/sources/drugcentral.py
CHANGED
```diff
@@ -68,6 +68,9 @@ def iter_terms() -> Iterable[Term]:
         if xref_prefix_norm is None:
             tqdm.write(f"did not normalize {prefix}:{identifier}")
             continue
+        if xref_prefix_norm == "pdb.ligand":
+            # there is a weird invalid escaped \W appearing in pdb ligand ids
+            identifier = identifier.strip()
         identifier = bioregistry.standardize_identifier(xref_prefix_norm, identifier)
         xrefs[str(drugcentral_id)].append(
             Reference(prefix=xref_prefix_norm, identifier=identifier)
@@ -98,4 +101,4 @@ def iter_terms() -> Iterable[Term]:
 
 
 if __name__ == "__main__":
-
+    DrugCentralGetter.cli()
```
pyobo/sources/expasy.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 """Convert ExPASy to OBO."""
 
 import logging
+import re
 from collections import defaultdict
 from collections.abc import Iterable, Mapping
 from typing import Any, Optional
@@ -42,7 +43,7 @@ class ExpasyGetter(Obo):
     """A getter for ExPASy Enzyme Classes."""
 
     bioversions_key = ontology = PREFIX
-    typedefs = [has_member, enables]
+    typedefs = [has_member, enables, term_replaced_by]
     root_terms = [
         Reference(prefix="eccode", identifier="1"),
         Reference(prefix="eccode", identifier="2"),
@@ -145,7 +146,9 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
         for domain in data.get("domains", []):
             term.append_relationship(
                 has_member,
-                Reference(
+                Reference.model_validate(
+                    {"prefix": domain["namespace"], "identifier": domain["identifier"]},
+                ),
             )
         for protein in data.get("proteins", []):
             term.append_relationship(
@@ -248,8 +251,10 @@ def get_database(lines: Iterable[str]) -> Mapping:
         elif descriptor == DE and value == "Deleted entry.":
             ec_data_entry["deleted"] = True
         elif descriptor == DE and value.startswith("Transferred entry: "):
-
-
+            # TODO There's a situation where there are enough transfers that it goes on to a second line
+            # the following line just gives up on this one. or maybe I don't understand
+            value = value.strip().removesuffix("and").rstrip(",").strip()
+            ec_data_entry["transfer_id"] = _parse_transfer(value)
         elif descriptor == DE:
             ec_data_entry["concept"]["name"] = value.rstrip(".")  # type:ignore
         elif descriptor == AN:
@@ -279,6 +284,19 @@ def get_database(lines: Iterable[str]) -> Mapping:
     return rv
 
 
+TRANSFER_SPLIT_RE = re.compile(r",\s*|\s+and\s+")
+
+
+def _parse_transfer(value: str) -> list[str]:
+    """Parse transferred entry string.
+
+    >>> _parse_transfer("Transferred entry: 1.1.1.198, 1.1.1.227 and 1.1.1.228.")
+    ['1.1.1.198', '1.1.1.227', '1.1.1.228']
+    """
+    value = value[len("Transferred entry: ") :].rstrip().rstrip(".")
+    return sorted(x.strip().removeprefix("and").strip() for x in TRANSFER_SPLIT_RE.split(value))
+
+
 def _group_by_id(lines):
     """Group lines by identifier."""
     groups = []
```
pyobo/sources/flybase.py
CHANGED
```diff
@@ -7,6 +7,7 @@ import pandas as pd
 from tqdm.auto import tqdm
 
 from pyobo import Reference
+from pyobo.resources.so import get_so_name
 from pyobo.struct import Obo, Term, from_species, orthologous
 from pyobo.utils.io import multisetdict
 from pyobo.utils.path import ensure_df
@@ -133,7 +134,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
                 "FlyBase gene type is missing mapping to Sequence Ontology (SO): %s", gtype
             )
         else:
-            so[gtype] = Reference
+            so[gtype] = Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id))
 
     for _, reference in sorted(so.items()):
         yield Term(reference=reference)
@@ -153,7 +154,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
         for hgnc_curie in human_orthologs.get(identifier, []):
             if not hgnc_curie or pd.isna(hgnc_curie):
                 continue
-            hgnc_ortholog = Reference.from_curie(hgnc_curie
+            hgnc_ortholog = Reference.from_curie(hgnc_curie)
             if hgnc_ortholog is None:
                 tqdm.write(f"[{PREFIX}] {identifier} had invalid ortholog: {hgnc_curie}")
             else:
```
pyobo/sources/hgnc.py
CHANGED
```diff
@@ -13,6 +13,7 @@ from tabulate import tabulate
 from tqdm.auto import tqdm
 
 from pyobo.api.utils import get_version
+from pyobo.resources.so import get_so_name
 from pyobo.struct import (
     Obo,
     Reference,
@@ -37,8 +38,8 @@ logger = logging.getLogger(__name__)
 
 PREFIX = "hgnc"
 DEFINITIONS_URL_FMT = (
-    "
-    "
+    "https://storage.googleapis.com/public-download-files/hgnc/archive/archive/monthly/json/"
+    "hgnc_complete_set_{version}.json"
 )
 
 previous_symbol_type = SynonymTypeDef.from_text("previous_symbol")
@@ -222,7 +223,7 @@ class HGNCGetter(Obo):
         alias_symbol_type,
     ]
     root_terms = [
-        Reference(prefix="
+        Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id))
         for so_id in sorted(set(LOCUS_TYPE_TO_SO.values()))
         if so_id
     ]
@@ -256,7 +257,7 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
     yield Term.from_triple("NCBITaxon", "9606", "Homo sapiens")
     yield from sorted(
         {
-            Term(reference=Reference
+            Term(reference=Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id)))
             for so_id in sorted(LOCUS_TYPE_TO_SO.values())
             if so_id
         },
@@ -363,23 +364,25 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
             xref_identifiers = entry.pop(key, None)
             if xref_identifiers is None:
                 continue
-
             if isinstance(xref_identifiers, (str, int)):
+                xref_identifiers = [str(xref_identifiers)]
+
+            if xref_prefix == "merops.entry":
+                continue
+                # e.g., XM02-001 should be rewritten as XM02.001
+                xref_identifiers = [i.replace("-", ".") for i in xref_identifiers]
+
+            if xref_prefix == "refseq":
+                # e.g., strip off dots without substantiated record versions like in NM_021728.
+                xref_identifiers = [i.strip(".") for i in xref_identifiers]
+
+            if len(xref_identifiers) == 1:
                 term.append_exact_match(
-                    Reference(prefix=xref_prefix, identifier=str(xref_identifiers))
+                    Reference(prefix=xref_prefix, identifier=str(xref_identifiers[0]))
                 )
-            elif isinstance(xref_identifiers, list):
-                if len(xref_identifiers) == 1:
-                    term.append_exact_match(
-                        Reference(prefix=xref_prefix, identifier=str(xref_identifiers[0]))
-                    )
-                else:
-                    for xref_identifier in xref_identifiers:
-                        term.append_xref(
-                            Reference(prefix=xref_prefix, identifier=str(xref_identifier))
-                        )
             else:
-
+                for xref_identifier in xref_identifiers:
+                    term.append_xref(Reference(prefix=xref_prefix, identifier=str(xref_identifier)))
 
             for pubmed_id in entry.pop("pubmed_id", []):
                 term.append_provenance(Reference(prefix="pubmed", identifier=str(pubmed_id)))
@@ -416,9 +419,11 @@ def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Te
         locus_group = entry.pop("locus_group")
         so_id = LOCUS_TYPE_TO_SO.get(locus_type)
         if so_id:
-            term.append_parent(Reference
+            term.append_parent(Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id)))
         else:
-            term.append_parent(
+            term.append_parent(
+                Reference(prefix="SO", identifier="0000704", name=get_so_name("0000704"))
+            )  # gene
             unhandle_locus_types[locus_type][identifier] = term
         term.append_property("locus_type", locus_type)
         term.append_property("locus_group", locus_group)
```
pyobo/sources/hgncgenefamily.py
CHANGED
```diff
@@ -21,13 +21,13 @@ __all__ = [
 ]
 
 PREFIX = "hgnc.genegroup"
-FAMILIES_URL = "
+FAMILIES_URL = "https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/family.csv"
 # TODO use family_alias.csv
-HIERARCHY_URL =
-    "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/csv/genefamily_db_tables/hierarchy.csv"
-)
+HIERARCHY_URL = "https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/hierarchy.csv"
 
-symbol_type = SynonymTypeDef
+symbol_type = SynonymTypeDef(
+    reference=Reference(prefix="OMO", identifier="0004000", name="has symbol")
+)
 
 
 class HGNCGroupGetter(Obo):
@@ -78,7 +78,7 @@ def get_terms(force: bool = False) -> Iterable[Term]:
                 name=parent.name,
             )
         )
-    gene_group = Reference
+    gene_group = Reference(prefix="SO", identifier="0005855", name="gene group")
     yield Term(reference=gene_group)
     for term in terms:
         if not term.parents:
@@ -98,7 +98,7 @@ def _get_terms_helper(force: bool = False) -> Iterable[Term]:
             definition=definition,
         )
         if pubmed_ids and pd.notna(pubmed_ids):
-            for s in pubmed_ids.split(","):
+            for s in pubmed_ids.replace(" ", ",").split(","):
                 term.append_provenance(Reference(prefix="pubmed", identifier=s.strip()))
         if desc_go and pd.notna(desc_go):
             go_id = desc_go[len("http://purl.uniprot.org/go/") :]
```
pyobo/sources/kegg/genome.py
CHANGED
```diff
@@ -3,6 +3,8 @@
 Run with ``python -m pyobo.sources.kegg.genome``
 """
 
+from __future__ import annotations
+
 import logging
 from collections.abc import Iterable
 
@@ -46,8 +48,11 @@ def get_obo() -> Obo:
     return KEGGGenomeGetter()
 
 
-def parse_genome_line(line: str) -> KEGGGenome:
+def parse_genome_line(line: str) -> KEGGGenome | None:
     """Parse a line from the KEGG Genome database."""
+    if not line.startswith("T"):
+        # This is for an NCBI Taxonomy
+        return None
     line = line.strip()
     identifier, rest = _s(line, "\t")
     identifier = identifier[len("gn:") :]
@@ -94,6 +99,8 @@ def iter_kegg_genomes(version: str, desc: str) -> Iterable[KEGGGenome]:
     it = tqdm(lines, desc=desc, unit_scale=True, unit="genome")
     for line in it:
         yv = parse_genome_line(line)
+        if yv is None:
+            continue
         it.set_postfix({"id": yv.identifier, "name": yv.name})
         yield yv
 
@@ -105,11 +112,16 @@ def iter_terms(version: str) -> Iterable[Term]:
     for kegg_genome in iter_kegg_genomes(version=version, desc="KEGG Genomes"):
         if kegg_genome.identifier in SKIP:
             continue
-
-
-
-
-
+
+        try:
+            reference = Reference(
+                prefix=KEGG_GENOME_PREFIX, identifier=kegg_genome.identifier, name=kegg_genome.name
+            )
+        except ValueError:
+            tqdm.write(f"[{KEGG_GENOME_PREFIX}] invalid identifier: {kegg_genome}")
+            continue
+
+        term = Term(reference=reference)
         if kegg_genome.taxonomy_id is not None:
             taxonomy_name = get_ncbitaxon_name(kegg_genome.taxonomy_id)
             if taxonomy_name is None:
```
pyobo/sources/mirbase.py
CHANGED
```diff
@@ -136,9 +136,15 @@ def _process_definitions_lines(
         xref_prefix = xref_mapping.get(xref_prefix, xref_prefix)
         if xref_prefix == "pictar":
             continue
-
-
-
+
+        try:
+            xref = Reference(
+                prefix=xref_prefix, identifier=xref_identifier, name=xref_label or None
+            )
+        except ValueError:
+            tqdm.write(f"invalid xref: {xref_prefix}:{xref_identifier}")
+        else:
+            xrefs.append(xref)
 
         # TODO add pubmed references
 
```
pyobo/sources/npass.py
CHANGED
```diff
@@ -39,7 +39,7 @@ def get_obo(force: bool = False) -> Obo:
 
 def get_df(version: str, force: bool = False) -> pd.DataFrame:
     """Get the NPASS chemical nomenclature."""
-    base_url = f"
+    base_url = f"https://bidd.group/NPASS/downloadFiles/NPASSv{version}_download"
     url = f"{base_url}_naturalProducts_generalInfo.txt"
     return ensure_df(
         PREFIX,
```
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
"""Converter for PathBank."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import logging
|
|
4
6
|
from collections import defaultdict
|
|
5
7
|
from collections.abc import Iterable, Mapping
|
|
@@ -8,7 +10,7 @@ import pandas as pd
|
|
|
8
10
|
from tqdm.auto import tqdm
|
|
9
11
|
|
|
10
12
|
from ..struct import Obo, Reference, Term
|
|
11
|
-
from ..struct.typedef import has_participant
|
|
13
|
+
from ..struct.typedef import has_category, has_participant
|
|
12
14
|
from ..utils.path import ensure_df
|
|
13
15
|
|
|
14
16
|
__all__ = [
|
|
@@ -68,7 +70,7 @@ class PathBankGetter(Obo):
|
|
|
68
70
|
"""An ontology representation of PathBank's pathway nomenclature."""
|
|
69
71
|
|
|
70
72
|
ontology = bioversions_key = PREFIX
|
|
71
|
-
typedefs = [has_participant]
|
|
73
|
+
typedefs = [has_participant, has_category]
|
|
72
74
|
|
|
73
75
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
74
76
|
"""Iterate over terms in the ontology."""
|
|
@@ -103,21 +105,30 @@ def get_protein_mapping(version: str, force: bool = False) -> Mapping[str, set[R
|
|
|
103
105
|
for pathway_id, protein_id in tqdm(
|
|
104
106
|
proteins_df.values, desc=f"[{PREFIX}] mapping proteins", unit_scale=True
|
|
105
107
|
):
|
|
106
|
-
|
|
107
|
-
|
|
108
|
+
try:
|
|
109
|
+
if "-" in protein_id:
|
|
110
|
+
reference = Reference(prefix="uniprot.isoform", identifier=protein_id)
|
|
111
|
+
else:
|
|
112
|
+
reference = Reference(prefix="uniprot", identifier=protein_id)
|
|
113
|
+
except ValueError:
|
|
114
|
+
tqdm.write(f"[pathbank] invalid uniprot identifier: {protein_id}")
|
|
115
|
+
else:
|
|
116
|
+
smpdb_id_to_proteins[pathway_id].add(reference)
|
|
108
117
|
return smpdb_id_to_proteins
|
|
109
118
|
|
|
110
119
|
|
|
111
120
|
def get_metabolite_df(version: str, force: bool = False) -> pd.DataFrame:
|
|
112
121
|
"""Get the metabolites dataframe."""
|
|
113
|
-
|
|
122
|
+
df = ensure_df(
|
|
114
123
|
PREFIX,
|
|
115
124
|
url=METABOLITE_URL,
|
|
116
125
|
sep=",",
|
|
117
|
-
usecols=["PathBank ID", "
|
|
126
|
+
usecols=["PathBank ID", "ChEBI ID"],
|
|
118
127
|
force=force,
|
|
119
128
|
version=version,
|
|
120
129
|
)
|
|
130
|
+
df = df[df["ChEBI ID"].notna()]
|
|
131
|
+
return df
|
|
121
132
|
|
|
122
133
|
|
|
123
134
|
def get_metabolite_mapping(version: str, force: bool = False) -> Mapping[str, set[Reference]]:
|
|
@@ -125,17 +136,20 @@ def get_metabolite_mapping(version: str, force: bool = False) -> Mapping[str, se
|
|
|
125
136
|
metabolites_df = get_metabolite_df(version=version, force=force)
|
|
126
137
|
smpdb_id_to_metabolites = defaultdict(set)
|
|
127
138
|
it = tqdm(metabolites_df.values, desc=f"[{PREFIX}] mapping metabolites", unit_scale=True)
|
|
128
|
-
for pathway_id, metabolite_id
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
prefix=PREFIX,
|
|
132
|
-
identifier=metabolite_id,
|
|
133
|
-
name=metabolite_name,
|
|
134
|
-
)
|
|
135
|
-
)
|
|
139
|
+
for pathway_id, metabolite_id in it:
|
|
140
|
+
reference = Reference(prefix="chebi", identifier=metabolite_id.strip())
|
|
141
|
+
smpdb_id_to_metabolites[pathway_id].add(reference)
|
|
136
142
|
return smpdb_id_to_metabolites
|
|
137
143
|
|
|
138
144
|
|
|
145
|
+
def _clean_description(description: str) -> str | None:
|
|
146
|
+
"""Clean the description."""
|
|
147
|
+
if pd.isna(description) or not description:
|
|
148
|
+
return None
|
|
149
|
+
parts = [part.strip() for part in description.strip().splitlines()]
|
|
150
|
+
return " ".join(parts)
|
|
151
|
+
|
|
152
|
+
|
|
139
153
|
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
140
154
|
"""Get PathBank's terms."""
|
|
141
155
|
smpdb_id_to_proteins = get_protein_mapping(version=version, force=force)
|
|
@@ -147,16 +161,11 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
147
161
|
reference = Reference(prefix=PREFIX, identifier=pathbank_id, name=name)
|
|
148
162
|
term = Term(
|
|
149
163
|
reference=reference,
|
|
150
|
-
#
|
|
151
|
-
|
|
152
|
-
)
|
|
153
|
-
term.append_parent(
|
|
154
|
-
Reference(
|
|
155
|
-
prefix=PREFIX,
|
|
156
|
-
identifier=subject.lower().replace(" ", "_"),
|
|
157
|
-
name=subject,
|
|
158
|
-
)
|
|
164
|
+
# TODO use _clean_description(description) to add a description,
|
|
165
|
+
# but there are weird parser errors
|
|
159
166
|
)
|
|
167
|
+
term.append_exact_match(Reference(prefix="smpdb", identifier=smpdb_id))
|
|
168
|
+
term.append_property(has_category, subject.lower().replace(" ", "_"))
|
|
160
169
|
term.extend_relationship(has_participant, smpdb_id_to_proteins[smpdb_id])
|
|
161
170
|
term.extend_relationship(has_participant, smpdb_id_to_metabolites[smpdb_id])
|
|
162
171
|
yield term
|
pyobo/sources/pombase.py
CHANGED
```diff
@@ -9,6 +9,7 @@ from tqdm.auto import tqdm
 
 import pyobo
 from pyobo import Reference
+from pyobo.resources.so import get_so_name
 from pyobo.struct import Obo, Term, from_species, has_gene_product, orthologous
 from pyobo.utils.path import ensure_df
 
@@ -19,7 +20,7 @@ __all__ = [
 logger = logging.getLogger(__name__)
 
 PREFIX = "pombase"
-
+GENE_NAMES_URL = "https://www.pombase.org/data/names_and_identifiers/gene_IDs_names_products.tsv"
 ORTHOLOGS_URL = "https://www.pombase.org/data/orthologs/human-orthologs.txt.gz"
 
 
@@ -68,9 +69,11 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
         if hgnc_id is not None:
             identifier_to_hgnc_ids[identifier].add(hgnc_id)
 
-    df = ensure_df(PREFIX, url=
+    df = ensure_df(PREFIX, url=GENE_NAMES_URL, force=force, version=version)
     so = {
-        gtype: Reference
+        gtype: Reference(
+            prefix="SO", identifier=POMBASE_TO_SO[gtype], name=get_so_name(POMBASE_TO_SO[gtype])
+        )
        for gtype in sorted(df[df.columns[6]].unique())
     }
     for _, reference in sorted(so.items()):
```
CHANGED
|
@@ -70,7 +70,9 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
70
70
|
df["taxonomy_id"] = df["species"].map(get_ncbitaxon_id)
|
|
71
71
|
|
|
72
72
|
terms = {}
|
|
73
|
-
it = tqdm(
|
|
73
|
+
it = tqdm(
|
|
74
|
+
df.values, total=len(df.index), desc=f"mapping {PREFIX}", unit_scale=True, unit="pathway"
|
|
75
|
+
)
|
|
74
76
|
for reactome_id, name, species_name, taxonomy_id in it:
|
|
75
77
|
terms[reactome_id] = term = Term(
|
|
76
78
|
reference=Reference(prefix=PREFIX, identifier=reactome_id, name=name),
|
|
@@ -92,10 +94,21 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
92
94
|
terms[child_id].append_parent(terms[parent_id])
|
|
93
95
|
|
|
94
96
|
uniprot_pathway_df = ensure_participant_df(version=version, force=force)
|
|
95
|
-
for uniprot_id, reactome_id in tqdm(
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
97
|
+
for uniprot_id, reactome_id in tqdm(
|
|
98
|
+
uniprot_pathway_df.values,
|
|
99
|
+
total=len(uniprot_pathway_df),
|
|
100
|
+
unit_scale=True,
|
|
101
|
+
unit="pathway-protein",
|
|
102
|
+
):
|
|
103
|
+
if reactome_id not in terms:
|
|
104
|
+
tqdm.write(f"{reactome_id} appears in uniprot participants file but not pathways file")
|
|
105
|
+
continue
|
|
106
|
+
|
|
107
|
+
if "-" in uniprot_id:
|
|
108
|
+
reference = Reference(prefix="uniprot.isoform", identifier=uniprot_id)
|
|
109
|
+
else:
|
|
110
|
+
reference = Reference(prefix="uniprot", identifier=uniprot_id)
|
|
111
|
+
terms[reactome_id].append_relationship(has_participant, reference)
|
|
99
112
|
|
|
100
113
|
chebi_pathway_url = f"https://reactome.org/download/{version}/ChEBI2Reactome_All_Levels.txt"
|
|
101
114
|
chebi_pathway_df = ensure_df(
|
|
@@ -106,7 +119,15 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
106
119
|
version=version,
|
|
107
120
|
force=force,
|
|
108
121
|
)
|
|
109
|
-
for chebi_id, reactome_id in tqdm(
|
|
122
|
+
for chebi_id, reactome_id in tqdm(
|
|
123
|
+
chebi_pathway_df.values,
|
|
124
|
+
total=len(chebi_pathway_df),
|
|
125
|
+
unit_scale=True,
|
|
126
|
+
unit="pathway-chemical",
|
|
127
|
+
):
|
|
128
|
+
if reactome_id not in terms:
|
|
129
|
+
tqdm.write(f"{reactome_id} appears in chebi participants file but not pathways file")
|
|
130
|
+
continue
|
|
110
131
|
terms[reactome_id].append_relationship(
|
|
111
132
|
has_participant, Reference(prefix="chebi", identifier=chebi_id)
|
|
112
133
|
)
|
|
@@ -133,4 +154,4 @@ def get_protein_to_pathways() -> Mapping[str, set[str]]:
|
|
|
133
154
|
|
|
134
155
|
|
|
135
156
|
if __name__ == "__main__":
|
|
136
|
-
|
|
157
|
+
ReactomeGetter.cli()
|