pyobo 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/.DS_Store +0 -0
- pyobo/__init__.py +95 -20
- pyobo/__main__.py +0 -0
- pyobo/api/__init__.py +81 -10
- pyobo/api/alts.py +52 -42
- pyobo/api/combine.py +39 -0
- pyobo/api/edges.py +68 -0
- pyobo/api/hierarchy.py +231 -203
- pyobo/api/metadata.py +14 -19
- pyobo/api/names.py +207 -127
- pyobo/api/properties.py +117 -113
- pyobo/api/relations.py +68 -94
- pyobo/api/species.py +24 -21
- pyobo/api/typedefs.py +11 -11
- pyobo/api/utils.py +66 -13
- pyobo/api/xrefs.py +108 -114
- pyobo/cli/__init__.py +0 -0
- pyobo/cli/cli.py +35 -50
- pyobo/cli/database.py +183 -161
- pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
- pyobo/cli/lookup.py +163 -195
- pyobo/cli/utils.py +19 -6
- pyobo/constants.py +102 -3
- pyobo/getters.py +196 -118
- pyobo/gilda_utils.py +79 -200
- pyobo/identifier_utils/__init__.py +41 -0
- pyobo/identifier_utils/api.py +296 -0
- pyobo/identifier_utils/model.py +130 -0
- pyobo/identifier_utils/preprocessing.json +812 -0
- pyobo/identifier_utils/preprocessing.py +61 -0
- pyobo/identifier_utils/relations/__init__.py +8 -0
- pyobo/identifier_utils/relations/api.py +162 -0
- pyobo/identifier_utils/relations/data.json +5824 -0
- pyobo/identifier_utils/relations/data_owl.json +57 -0
- pyobo/identifier_utils/relations/data_rdf.json +1 -0
- pyobo/identifier_utils/relations/data_rdfs.json +7 -0
- pyobo/mocks.py +9 -6
- pyobo/ner/__init__.py +9 -0
- pyobo/ner/api.py +72 -0
- pyobo/ner/normalizer.py +33 -0
- pyobo/obographs.py +43 -39
- pyobo/plugins.py +5 -4
- pyobo/py.typed +0 -0
- pyobo/reader.py +1358 -395
- pyobo/reader_utils.py +155 -0
- pyobo/resource_utils.py +42 -22
- pyobo/resources/__init__.py +0 -0
- pyobo/resources/goc.py +75 -0
- pyobo/resources/goc.tsv +188 -0
- pyobo/resources/ncbitaxon.py +4 -5
- pyobo/resources/ncbitaxon.tsv.gz +0 -0
- pyobo/resources/ro.py +3 -2
- pyobo/resources/ro.tsv +0 -0
- pyobo/resources/so.py +0 -0
- pyobo/resources/so.tsv +0 -0
- pyobo/sources/README.md +12 -8
- pyobo/sources/__init__.py +52 -29
- pyobo/sources/agrovoc.py +0 -0
- pyobo/sources/antibodyregistry.py +11 -12
- pyobo/sources/bigg/__init__.py +13 -0
- pyobo/sources/bigg/bigg_compartment.py +81 -0
- pyobo/sources/bigg/bigg_metabolite.py +229 -0
- pyobo/sources/bigg/bigg_model.py +46 -0
- pyobo/sources/bigg/bigg_reaction.py +77 -0
- pyobo/sources/biogrid.py +1 -2
- pyobo/sources/ccle.py +7 -12
- pyobo/sources/cgnc.py +0 -5
- pyobo/sources/chebi.py +1 -1
- pyobo/sources/chembl/__init__.py +9 -0
- pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
- pyobo/sources/chembl/chembl_target.py +160 -0
- pyobo/sources/civic_gene.py +55 -15
- pyobo/sources/clinicaltrials.py +160 -0
- pyobo/sources/complexportal.py +24 -24
- pyobo/sources/conso.py +14 -22
- pyobo/sources/cpt.py +0 -0
- pyobo/sources/credit.py +1 -9
- pyobo/sources/cvx.py +27 -5
- pyobo/sources/depmap.py +9 -12
- pyobo/sources/dictybase_gene.py +2 -7
- pyobo/sources/drugbank/__init__.py +9 -0
- pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
- pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
- pyobo/sources/drugcentral.py +17 -13
- pyobo/sources/expasy.py +31 -34
- pyobo/sources/famplex.py +13 -18
- pyobo/sources/flybase.py +3 -8
- pyobo/sources/gard.py +62 -0
- pyobo/sources/geonames/__init__.py +9 -0
- pyobo/sources/geonames/features.py +28 -0
- pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
- pyobo/sources/geonames/utils.py +115 -0
- pyobo/sources/gmt_utils.py +6 -7
- pyobo/sources/go.py +20 -13
- pyobo/sources/gtdb.py +154 -0
- pyobo/sources/gwascentral/__init__.py +9 -0
- pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
- pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
- pyobo/sources/hgnc/__init__.py +9 -0
- pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
- pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
- pyobo/sources/icd/__init__.py +9 -0
- pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
- pyobo/sources/icd/icd11.py +148 -0
- pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
- pyobo/sources/interpro.py +4 -9
- pyobo/sources/itis.py +0 -5
- pyobo/sources/kegg/__init__.py +0 -0
- pyobo/sources/kegg/api.py +16 -38
- pyobo/sources/kegg/genes.py +9 -20
- pyobo/sources/kegg/genome.py +1 -7
- pyobo/sources/kegg/pathway.py +9 -21
- pyobo/sources/mesh.py +58 -24
- pyobo/sources/mgi.py +3 -10
- pyobo/sources/mirbase/__init__.py +11 -0
- pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
- pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
- pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
- pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
- pyobo/sources/msigdb.py +74 -39
- pyobo/sources/ncbi/__init__.py +9 -0
- pyobo/sources/ncbi/ncbi_gc.py +162 -0
- pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
- pyobo/sources/nih_reporter.py +60 -0
- pyobo/sources/nlm/__init__.py +9 -0
- pyobo/sources/nlm/nlm_catalog.py +48 -0
- pyobo/sources/nlm/nlm_publisher.py +36 -0
- pyobo/sources/nlm/utils.py +116 -0
- pyobo/sources/npass.py +6 -8
- pyobo/sources/omim_ps.py +10 -3
- pyobo/sources/pathbank.py +4 -8
- pyobo/sources/pfam/__init__.py +9 -0
- pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
- pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
- pyobo/sources/pharmgkb/__init__.py +15 -0
- pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
- pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
- pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
- pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
- pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
- pyobo/sources/pharmgkb/utils.py +86 -0
- pyobo/sources/pid.py +1 -6
- pyobo/sources/pombase.py +6 -10
- pyobo/sources/pubchem.py +4 -9
- pyobo/sources/reactome.py +5 -11
- pyobo/sources/rgd.py +11 -16
- pyobo/sources/rhea.py +37 -36
- pyobo/sources/ror.py +69 -42
- pyobo/sources/selventa/__init__.py +0 -0
- pyobo/sources/selventa/schem.py +4 -7
- pyobo/sources/selventa/scomp.py +1 -6
- pyobo/sources/selventa/sdis.py +4 -7
- pyobo/sources/selventa/sfam.py +1 -6
- pyobo/sources/sgd.py +6 -11
- pyobo/sources/signor/__init__.py +7 -0
- pyobo/sources/signor/download.py +41 -0
- pyobo/sources/signor/signor_complexes.py +105 -0
- pyobo/sources/slm.py +12 -15
- pyobo/sources/umls/__init__.py +7 -1
- pyobo/sources/umls/__main__.py +0 -0
- pyobo/sources/umls/get_synonym_types.py +20 -4
- pyobo/sources/umls/sty.py +57 -0
- pyobo/sources/umls/synonym_types.tsv +1 -1
- pyobo/sources/umls/umls.py +18 -22
- pyobo/sources/unimod.py +46 -0
- pyobo/sources/uniprot/__init__.py +1 -1
- pyobo/sources/uniprot/uniprot.py +40 -32
- pyobo/sources/uniprot/uniprot_ptm.py +4 -34
- pyobo/sources/utils.py +3 -2
- pyobo/sources/wikipathways.py +7 -10
- pyobo/sources/zfin.py +5 -10
- pyobo/ssg/__init__.py +12 -16
- pyobo/ssg/base.html +0 -0
- pyobo/ssg/index.html +26 -13
- pyobo/ssg/term.html +12 -2
- pyobo/ssg/typedef.html +0 -0
- pyobo/struct/__init__.py +54 -8
- pyobo/struct/functional/__init__.py +1 -0
- pyobo/struct/functional/dsl.py +2572 -0
- pyobo/struct/functional/macros.py +423 -0
- pyobo/struct/functional/obo_to_functional.py +385 -0
- pyobo/struct/functional/ontology.py +270 -0
- pyobo/struct/functional/utils.py +112 -0
- pyobo/struct/reference.py +331 -136
- pyobo/struct/struct.py +1413 -643
- pyobo/struct/struct_utils.py +1078 -0
- pyobo/struct/typedef.py +162 -210
- pyobo/struct/utils.py +12 -5
- pyobo/struct/vocabulary.py +138 -0
- pyobo/utils/__init__.py +0 -0
- pyobo/utils/cache.py +13 -11
- pyobo/utils/io.py +17 -31
- pyobo/utils/iter.py +5 -5
- pyobo/utils/misc.py +41 -53
- pyobo/utils/ndex_utils.py +0 -0
- pyobo/utils/path.py +76 -70
- pyobo/version.py +3 -3
- {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/METADATA +224 -225
- pyobo-0.12.0.dist-info/RECORD +202 -0
- pyobo-0.12.0.dist-info/WHEEL +4 -0
- {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
- {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info/licenses}/LICENSE +0 -0
- pyobo/apps/__init__.py +0 -3
- pyobo/apps/cli.py +0 -24
- pyobo/apps/gilda/__init__.py +0 -3
- pyobo/apps/gilda/__main__.py +0 -8
- pyobo/apps/gilda/app.py +0 -48
- pyobo/apps/gilda/cli.py +0 -36
- pyobo/apps/gilda/templates/base.html +0 -33
- pyobo/apps/gilda/templates/home.html +0 -11
- pyobo/apps/gilda/templates/matches.html +0 -32
- pyobo/apps/mapper/__init__.py +0 -3
- pyobo/apps/mapper/__main__.py +0 -11
- pyobo/apps/mapper/cli.py +0 -37
- pyobo/apps/mapper/mapper.py +0 -187
- pyobo/apps/mapper/templates/base.html +0 -35
- pyobo/apps/mapper/templates/mapper_home.html +0 -64
- pyobo/aws.py +0 -162
- pyobo/cli/aws.py +0 -47
- pyobo/identifier_utils.py +0 -142
- pyobo/normalizer.py +0 -232
- pyobo/registries/__init__.py +0 -16
- pyobo/registries/metaregistry.json +0 -507
- pyobo/registries/metaregistry.py +0 -135
- pyobo/sources/icd11.py +0 -105
- pyobo/xrefdb/__init__.py +0 -1
- pyobo/xrefdb/canonicalizer.py +0 -214
- pyobo/xrefdb/priority.py +0 -59
- pyobo/xrefdb/sources/__init__.py +0 -60
- pyobo/xrefdb/sources/biomappings.py +0 -36
- pyobo/xrefdb/sources/cbms2019.py +0 -91
- pyobo/xrefdb/sources/chembl.py +0 -83
- pyobo/xrefdb/sources/compath.py +0 -82
- pyobo/xrefdb/sources/famplex.py +0 -64
- pyobo/xrefdb/sources/gilda.py +0 -50
- pyobo/xrefdb/sources/intact.py +0 -113
- pyobo/xrefdb/sources/ncit.py +0 -133
- pyobo/xrefdb/sources/pubchem.py +0 -27
- pyobo/xrefdb/sources/wikidata.py +0 -116
- pyobo-0.11.1.dist-info/RECORD +0 -173
- pyobo-0.11.1.dist-info/WHEEL +0 -5
- pyobo-0.11.1.dist-info/top_level.txt +0 -1
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Download utilities for SIGNOR."""
|
|
2
|
+
|
|
3
|
+
import enum
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import requests
|
|
7
|
+
|
|
8
|
+
from pyobo.utils.path import prefix_directory_join
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"DownloadKey",
|
|
12
|
+
"download_signor",
|
|
13
|
+
"get_signor_df",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DownloadKey(enum.Enum):
    """Download key.

    Each member's value is the exact form-field string that the SIGNOR
    download endpoint dispatches on (see :func:`download_signor`).
    """

    complex = "Download complex data"
    family = "Download protein family data"
    phenotype = "Download phenotype data"
    stimulus = "Download stimulus data"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def download_signor(key: DownloadKey) -> requests.Response:
    """Download a data dump from SIGNOR.

    :param key: Which dataset to request; the enum's value is sent as the
        ``submit`` form field that SIGNOR's endpoint dispatches on.
    :return: The raw HTTP response for the requested dataset.
    """
    return requests.post(
        "https://signor.uniroma2.it/download_complexes.php",
        files={"submit": (None, key.value)},
        # Without a timeout, requests will block indefinitely if the
        # SIGNOR server stops responding.
        timeout=300,
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_signor_df(prefix: str, *, version: str, key: DownloadKey, force: bool) -> pd.DataFrame:
    """Get the appropriate SIGNOR dataframe.

    :param prefix: The resource prefix under which the file is cached.
    :param version: The data version, used to build the cache path.
    :param key: Which SIGNOR dataset to retrieve.
    :param force: If true, re-download even when a cached copy exists.
    :return: The dataset parsed as a dataframe (SIGNOR uses ``;`` separators).
    """
    path = prefix_directory_join(prefix, version=version, name=f"{key.name}.csv")
    if not path.is_file() or force:
        res = download_signor(key)
        # Fail loudly instead of writing an HTTP error page to the cache -
        # a poisoned cache file would silently break all later runs.
        res.raise_for_status()
        path.write_text(res.text)
    df = pd.read_csv(path, sep=";")
    return df
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""A source for SIGNOR complexes."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from pyobo import Obo, Reference, Term, default_reference
|
|
8
|
+
from pyobo.sources.signor.download import DownloadKey, get_signor_df
|
|
9
|
+
from pyobo.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED
|
|
10
|
+
from pyobo.struct.typedef import exact_match, has_component, has_member
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"SignorGetter",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
PREFIX = "signor"
|
|
17
|
+
|
|
18
|
+
def _make_root_term(identifier: str) -> Term:
    """Construct one injected grouping term for the given local identifier."""
    term = Term(reference=default_reference(PREFIX, identifier))
    term.append_contributor(CHARLIE_TERM)
    term.append_comment(PYOBO_INJECTED)
    return term


PROTEIN_FAMILY = _make_root_term("protein-family")
PROTEIN_COMPLEX = _make_root_term("protein-complex")
PHENOTYPE = _make_root_term("phenotype")
STIMULUS = _make_root_term("stimulus")

# The injected terms that serve as the roots of the SIGNOR ontology
ROOT_TERMS = (PROTEIN_FAMILY, PROTEIN_COMPLEX, PHENOTYPE, STIMULUS)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class SignorGetter(Obo):
    """An ontology representation of SIGNOR complexes."""

    # The resource prefix doubles as the bioversions lookup key
    ontology = bioversions_key = PREFIX
    typedefs = [exact_match, has_component, has_member]
    # The four injected grouping terms act as the ontology's roots
    root_terms = [r.reference for r in ROOT_TERMS]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: If true, re-download the underlying SIGNOR data dumps.
        """
        return iter_terms(version=self._version_or_raise, force=force)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over terms.

    Yields the injected metadata and root terms first, then the complex,
    family, stimulus, and phenotype terms parsed from the SIGNOR dumps.

    :param version: The SIGNOR data version.
    :param force: If true, re-download the underlying data dumps.
    """
    yield CHARLIE_TERM
    yield HUMAN_TERM
    yield from ROOT_TERMS

    complexes_df = get_signor_df(PREFIX, version=version, force=force, key=DownloadKey.complex)
    for complex_id, complex_name, component_ids in complexes_df.values:
        complex_term = Term.from_triple(PREFIX, complex_id, complex_name)
        complex_term.append_parent(PROTEIN_COMPLEX)
        for raw_component in component_ids.split(","):
            component_id = raw_component.strip()
            if component_id.startswith("SIGNOR-"):
                component = Reference(prefix="signor", identifier=component_id)
            elif component_id.startswith("CHEBI:"):
                component = Reference(prefix="chebi", identifier=component_id.removeprefix("CHEBI:"))
            else:
                # anything that is neither a SIGNOR nor a CHEBI CURIE is
                # assumed to be a UniProt accession
                component = Reference(prefix="uniprot", identifier=component_id)
            complex_term.annotate_object(has_component, component)
        yield complex_term

    family_df = get_signor_df(PREFIX, version=version, force=force, key=DownloadKey.family)
    for family_id, family_name, member_ids in family_df.values:
        family_term = Term.from_triple(PREFIX, family_id, family_name)
        family_term.append_parent(PROTEIN_FAMILY)
        for raw_member in member_ids.split(","):
            family_term.annotate_object(
                has_member, Reference(prefix="uniprot", identifier=raw_member.strip())
            )
        yield family_term

    stimulus_df = get_signor_df(PREFIX, version=version, force=force, key=DownloadKey.stimulus)
    # for some reason, there are many duplicates in this file
    stimulus_df = stimulus_df.drop_duplicates()
    for stimulus_id, stimulus_name, stimulus_description in stimulus_df.values:
        stimulus_term = Term.from_triple(
            PREFIX, stimulus_id, stimulus_name, definition=_clean_descr(stimulus_description)
        )
        stimulus_term.append_parent(STIMULUS)
        yield stimulus_term

    phenotypes_df = get_signor_df(PREFIX, version=version, force=force, key=DownloadKey.phenotype)
    for phenotype_id, phenotype_name, phenotype_description in phenotypes_df.values:
        phenotype_term = Term.from_triple(
            PREFIX, phenotype_id, phenotype_name, definition=_clean_descr(phenotype_description)
        )
        phenotype_term.append_parent(PHENOTYPE)
        yield phenotype_term
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _clean_descr(d) -> str | None:
|
|
99
|
+
if pd.isna(d):
|
|
100
|
+
return None
|
|
101
|
+
return d.replace("\n", " ")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
if __name__ == "__main__":
    # Build the ontology via the shared Obo command line interface
    SignorGetter.cli()
|
pyobo/sources/slm.py
CHANGED
|
@@ -5,9 +5,9 @@ from collections.abc import Iterable
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from tqdm.auto import tqdm
|
|
7
7
|
|
|
8
|
-
from pyobo import Obo, Reference, Term
|
|
8
|
+
from pyobo import Obo, Reference, Term, TypeDef
|
|
9
9
|
from pyobo.struct.struct import abbreviation as abbreviation_typedef
|
|
10
|
-
from pyobo.struct.typedef import exact_match, has_inchi, has_smiles
|
|
10
|
+
from pyobo.struct.typedef import exact_match, has_citation, has_inchi, has_smiles
|
|
11
11
|
from pyobo.utils.path import ensure_df
|
|
12
12
|
|
|
13
13
|
__all__ = [
|
|
@@ -36,13 +36,14 @@ COLUMNS = [
|
|
|
36
36
|
"HMDB",
|
|
37
37
|
"PMID",
|
|
38
38
|
]
|
|
39
|
+
LEVEL = TypeDef.default(PREFIX, "level", is_metadata_tag=True)
|
|
39
40
|
|
|
40
41
|
|
|
41
42
|
class SLMGetter(Obo):
|
|
42
43
|
"""An ontology representation of SwissLipid's lipid nomenclature."""
|
|
43
44
|
|
|
44
45
|
ontology = bioversions_key = PREFIX
|
|
45
|
-
typedefs = [exact_match]
|
|
46
|
+
typedefs = [exact_match, LEVEL, has_inchi, has_smiles, has_citation]
|
|
46
47
|
synonym_typedefs = [abbreviation_typedef]
|
|
47
48
|
|
|
48
49
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
@@ -50,9 +51,7 @@ class SLMGetter(Obo):
|
|
|
50
51
|
return iter_terms(force=force, version=self._version_or_raise)
|
|
51
52
|
|
|
52
53
|
|
|
53
|
-
|
|
54
|
-
"""Get SwissLipids as OBO."""
|
|
55
|
-
return SLMGetter(force=force)
|
|
54
|
+
INVALID_INCHI = {"-", "none"}
|
|
56
55
|
|
|
57
56
|
|
|
58
57
|
def iter_terms(version: str, force: bool = False):
|
|
@@ -90,27 +89,25 @@ def iter_terms(version: str, force: bool = False):
|
|
|
90
89
|
raise ValueError(identifier)
|
|
91
90
|
term = Term.from_triple(PREFIX, identifier, name)
|
|
92
91
|
if pd.notna(level):
|
|
93
|
-
term.
|
|
92
|
+
term.annotate_string(LEVEL, level)
|
|
94
93
|
if pd.notna(abbreviation):
|
|
95
94
|
term.append_synonym(abbreviation, type=abbreviation_typedef)
|
|
96
95
|
if pd.notna(synonyms):
|
|
97
96
|
for synonym in synonyms.split("|"):
|
|
98
97
|
term.append_synonym(synonym.strip())
|
|
99
98
|
if pd.notna(smiles):
|
|
100
|
-
term.
|
|
99
|
+
term.annotate_string(has_smiles, smiles)
|
|
101
100
|
if pd.notna(inchi) and inchi != "InChI=none":
|
|
102
101
|
if inchi.startswith("InChI="):
|
|
103
102
|
inchi = inchi[len("InChI=") :]
|
|
104
|
-
term.
|
|
103
|
+
term.annotate_string(has_inchi, inchi)
|
|
105
104
|
if pd.notna(inchikey):
|
|
106
105
|
inchikey = inchikey.removeprefix("InChIKey=").strip()
|
|
107
|
-
if inchikey and inchikey
|
|
106
|
+
if inchikey and inchikey not in INVALID_INCHI:
|
|
108
107
|
try:
|
|
109
108
|
inchi_ref = Reference(prefix="inchikey", identifier=inchikey)
|
|
110
109
|
except ValueError:
|
|
111
|
-
tqdm.write(
|
|
112
|
-
f"[slm:{identifier}] had invalid inchikey reference: ({type(inchikey)}) {inchikey}"
|
|
113
|
-
)
|
|
110
|
+
tqdm.write(f"[slm:{identifier}] had invalid inchikey reference: `{inchikey}`")
|
|
114
111
|
else:
|
|
115
112
|
term.append_exact_match(inchi_ref)
|
|
116
113
|
for chebi_id in _split(chebi_ids):
|
|
@@ -120,7 +117,7 @@ def iter_terms(version: str, force: bool = False):
|
|
|
120
117
|
for hmdb_id in _split(hmdb_ids):
|
|
121
118
|
term.append_exact_match(("hmdb", hmdb_id))
|
|
122
119
|
for pubmed_id in _split(pubmed_ids):
|
|
123
|
-
term.append_provenance(("pubmed", pubmed_id))
|
|
120
|
+
term.append_provenance(Reference(prefix="pubmed", identifier=pubmed_id))
|
|
124
121
|
# TODO how to handle class, parents, and components?
|
|
125
122
|
yield term
|
|
126
123
|
|
|
@@ -134,4 +131,4 @@ def _split(s: str) -> Iterable[str]:
|
|
|
134
131
|
|
|
135
132
|
|
|
136
133
|
if __name__ == "__main__":
|
|
137
|
-
|
|
134
|
+
SLMGetter.cli()
|
pyobo/sources/umls/__init__.py
CHANGED
pyobo/sources/umls/__main__.py
CHANGED
|
File without changes
|
|
@@ -1,19 +1,22 @@
|
|
|
1
1
|
"""Utilities for UMLS synonyms."""
|
|
2
2
|
|
|
3
|
+
import re
|
|
3
4
|
from collections.abc import Mapping
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
import requests
|
|
7
8
|
from bs4 import BeautifulSoup
|
|
8
9
|
|
|
10
|
+
from pyobo.struct import SynonymTypeDef, default_reference
|
|
9
11
|
from pyobo.utils.io import open_map_tsv, write_map_tsv
|
|
10
12
|
|
|
11
|
-
__all__ = ["get_umls_synonyms"]
|
|
13
|
+
__all__ = ["get_umls_synonyms", "get_umls_typedefs"]
|
|
12
14
|
|
|
13
15
|
HERE = Path(__file__).parent.resolve()
|
|
14
16
|
SYNONYM_TYPE_PATH = HERE.joinpath("synonym_types.tsv")
|
|
15
17
|
|
|
16
18
|
ABBREVIATIONS_URL = "https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html"
|
|
19
|
+
SPACES = re.compile(r"\s+")
|
|
17
20
|
|
|
18
21
|
|
|
19
22
|
def get_umls_synonyms(*, refresh: bool = False) -> Mapping[str, str]:
|
|
@@ -23,14 +26,27 @@ def get_umls_synonyms(*, refresh: bool = False) -> Mapping[str, str]:
|
|
|
23
26
|
res = requests.get(ABBREVIATIONS_URL, timeout=5)
|
|
24
27
|
soup = BeautifulSoup(res.text, features="html.parser")
|
|
25
28
|
table = soup.find(id="mrdoc_TTY")
|
|
26
|
-
|
|
29
|
+
if table is None:
|
|
30
|
+
raise ValueError
|
|
31
|
+
body = table.find("tbody") # type:ignore[attr-defined]
|
|
32
|
+
if body is None:
|
|
33
|
+
raise ValueError
|
|
27
34
|
rv = {}
|
|
28
35
|
for row in body.find_all("tr"):
|
|
29
36
|
left, right = row.find_all("td")
|
|
30
|
-
rv[left.text.strip()] = right.text.strip()
|
|
37
|
+
rv[left.text.strip()] = SPACES.sub(" ", right.text.strip())
|
|
31
38
|
write_map_tsv(path=SYNONYM_TYPE_PATH, rv=rv, header=["key", "name"])
|
|
32
39
|
return rv
|
|
33
40
|
|
|
34
41
|
|
|
42
|
+
def get_umls_typedefs(*, refresh: bool = False) -> dict[str, SynonymTypeDef]:
    """Get all synonym type definitions."""
    rv = {}
    for identifier, name in get_umls_synonyms(refresh=refresh).items():
        rv[identifier] = SynonymTypeDef(
            reference=default_reference("umls", identifier, name=name)
        )
    return rv
|
|
49
|
+
|
|
50
|
+
|
|
35
51
|
if __name__ == "__main__":
|
|
36
|
-
|
|
52
|
+
get_umls_typedefs(refresh=True)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Converter for UMLS Semantic Types."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
|
|
5
|
+
from pyobo import Obo, Reference, Term, default_reference
|
|
6
|
+
from pyobo.struct.typedef import has_category
|
|
7
|
+
from pyobo.utils.path import ensure_df
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"UMLSSTyGetter",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
PREFIX = "sty"
|
|
14
|
+
|
|
15
|
+
URL = "https://www.nlm.nih.gov/research/umls/knowledge_sources/semantic_network/SemGroups.txt"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class UMLSSTyGetter(Obo):
    """An ontology representation of UMLS Semantic Types."""

    ontology = PREFIX
    # versions are tracked against the parent UMLS release
    bioversions_key = "umls"
    typedefs = [has_category]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: Accepted for interface compatibility but currently
            not forwarded to the module-level :func:`iter_terms`.
        """
        return iter_terms(version=self._version_or_raise)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Column names for the pipe-separated, header-less SemGroups.txt file
COLUMNS = [
    "group",
    "group_label",
    "sty_id",
    "sty_name",
]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over UMLS terms."""
    df = ensure_df(PREFIX, url=URL, version=version, sep="|", header=None, names=COLUMNS)

    # one grouping term per semantic group, keyed by its abbreviation
    group_terms: dict[str, Term] = {}
    for group_key, group_label in df[["group", "group_label"]].drop_duplicates().values:
        group_terms[group_key] = Term(
            reference=default_reference(PREFIX, group_key, name=group_label),
        )
    yield from group_terms.values()

    # each semantic type becomes a term whose parent is its group
    for group_key, _, sty_id, sty_name in df.values:
        sty_term = Term(reference=Reference(prefix="sty", identifier=sty_id, name=sty_name))
        sty_term.append_parent(group_terms[group_key])
        yield sty_term
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
if __name__ == "__main__":
    # Build the ontology via the shared Obo command line interface
    UMLSSTyGetter.cli()
|
|
@@ -146,6 +146,7 @@ OAM Obsolete Modifier Abbreviation
|
|
|
146
146
|
OAP Obsolete active preferred term
|
|
147
147
|
OAS Obsolete active synonym
|
|
148
148
|
OC Nursing outcomes
|
|
149
|
+
ODN Obsolete Display Name
|
|
149
150
|
OET Obsolete entry term
|
|
150
151
|
OF Obsolete fully specified name
|
|
151
152
|
OL Non-current Lower Level Term
|
|
@@ -188,7 +189,6 @@ PX Expanded preferred terms (pair with PS)
|
|
|
188
189
|
PXQ Preferred qualifier term
|
|
189
190
|
QAB Qualifier abbreviation
|
|
190
191
|
QEV Qualifier entry version
|
|
191
|
-
QSV Qualifier sort version
|
|
192
192
|
RAB Root abbreviation
|
|
193
193
|
RHT Root hierarchical term
|
|
194
194
|
RPT Root preferred term
|
pyobo/sources/umls/umls.py
CHANGED
|
@@ -15,7 +15,7 @@ from umls_downloader import open_umls, open_umls_semantic_types
|
|
|
15
15
|
|
|
16
16
|
from pyobo import Obo, Reference, Synonym, SynonymTypeDef, Term
|
|
17
17
|
|
|
18
|
-
from .get_synonym_types import
|
|
18
|
+
from .get_synonym_types import get_umls_typedefs
|
|
19
19
|
|
|
20
20
|
__all__ = [
|
|
21
21
|
"UMLSGetter",
|
|
@@ -46,30 +46,26 @@ RRF_COLUMNS = [
|
|
|
46
46
|
|
|
47
47
|
PREFIX = "umls"
|
|
48
48
|
SOURCE_VOCAB_URL = "https://www.nlm.nih.gov/research/umls/sourcereleasedocs/index.html"
|
|
49
|
-
|
|
49
|
+
UMLS_TYPEDEFS: dict[str, SynonymTypeDef] = get_umls_typedefs()
|
|
50
50
|
|
|
51
51
|
|
|
52
52
|
class UMLSGetter(Obo):
|
|
53
53
|
"""An ontology representation of UMLS."""
|
|
54
54
|
|
|
55
55
|
ontology = bioversions_key = PREFIX
|
|
56
|
-
synonym_typedefs =
|
|
56
|
+
synonym_typedefs = list(UMLS_TYPEDEFS.values())
|
|
57
57
|
|
|
58
58
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
59
59
|
"""Iterate over terms in the ontology."""
|
|
60
60
|
return iter_terms(version=self._version_or_raise)
|
|
61
61
|
|
|
62
62
|
|
|
63
|
-
def get_obo() -> Obo:
|
|
64
|
-
"""Get UMLS as OBO."""
|
|
65
|
-
return UMLSGetter()
|
|
66
|
-
|
|
67
|
-
|
|
68
63
|
def get_semantic_types() -> Mapping[str, set[str]]:
|
|
69
64
|
"""Get UMLS semantic types for each term."""
|
|
70
65
|
dd = defaultdict(set)
|
|
71
66
|
with open_umls_semantic_types() as file:
|
|
72
|
-
|
|
67
|
+
# this is very fast and doesn't need a progress bar
|
|
68
|
+
for line in file:
|
|
73
69
|
cui, sty, _ = line.decode("utf8").split("|", 2)
|
|
74
70
|
dd[cui].add(sty)
|
|
75
71
|
return dict(dd)
|
|
@@ -80,7 +76,7 @@ def iter_terms(version: str) -> Iterable[Term]:
|
|
|
80
76
|
semantic_types = get_semantic_types()
|
|
81
77
|
|
|
82
78
|
with open_umls(version=version) as file:
|
|
83
|
-
it = tqdm(file, unit_scale=True, desc="[umls] parsing")
|
|
79
|
+
it = tqdm(file, unit_scale=True, desc="[umls] parsing", total=16_700_000)
|
|
84
80
|
lines = (line.decode("utf-8").strip().split("|") for line in it)
|
|
85
81
|
for cui, cui_lines in itt.groupby(lines, key=operator.itemgetter(0)):
|
|
86
82
|
df = pd.DataFrame(list(cui_lines), columns=RRF_COLUMNS)
|
|
@@ -96,38 +92,38 @@ def iter_terms(version: str) -> Iterable[Term]:
|
|
|
96
92
|
continue
|
|
97
93
|
|
|
98
94
|
df["TTY - Term Type in Source"] = df["TTY - Term Type in Source"].map(
|
|
99
|
-
|
|
95
|
+
UMLS_TYPEDEFS.__getitem__
|
|
100
96
|
)
|
|
101
97
|
|
|
102
98
|
_r = pref_rows_df.iloc[0]
|
|
103
99
|
sdf = df[["SAB - source name", "CODE", "TTY - Term Type in Source", "STR"]]
|
|
104
100
|
|
|
105
101
|
synonyms = []
|
|
106
|
-
xrefs =
|
|
102
|
+
xrefs = set()
|
|
107
103
|
for source, identifier, synonym_type, synonym in sdf.values:
|
|
108
104
|
norm_source = bioregistry.normalize_prefix(source)
|
|
109
|
-
if norm_source
|
|
105
|
+
if not norm_source or not identifier or "," in identifier:
|
|
110
106
|
provenance = []
|
|
111
107
|
else:
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
108
|
+
try:
|
|
109
|
+
ref = Reference(prefix=norm_source, identifier=identifier)
|
|
110
|
+
except ValueError:
|
|
111
|
+
continue
|
|
112
|
+
else:
|
|
113
|
+
provenance = [ref]
|
|
114
|
+
xrefs.add(ref)
|
|
115
115
|
synonyms.append(
|
|
116
116
|
Synonym(
|
|
117
117
|
name=synonym,
|
|
118
118
|
provenance=provenance,
|
|
119
|
-
type=
|
|
119
|
+
type=synonym_type.reference,
|
|
120
120
|
)
|
|
121
121
|
)
|
|
122
122
|
|
|
123
|
-
xrefs = sorted(
|
|
124
|
-
set(xrefs), key=lambda reference: (reference.prefix, reference.identifier)
|
|
125
|
-
)
|
|
126
|
-
|
|
127
123
|
term = Term(
|
|
128
124
|
reference=Reference(prefix=PREFIX, identifier=cui, name=_r["STR"]),
|
|
129
125
|
synonyms=synonyms,
|
|
130
|
-
xrefs=xrefs,
|
|
126
|
+
xrefs=sorted(xrefs),
|
|
131
127
|
)
|
|
132
128
|
for sty_id in semantic_types.get(cui, set()):
|
|
133
129
|
term.append_parent(Reference(prefix="sty", identifier=sty_id))
|
pyobo/sources/unimod.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Unimod provides an OBO file, but it's got lots of errors in its encoding."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
|
|
5
|
+
from lxml import etree
|
|
6
|
+
|
|
7
|
+
from pyobo.struct import Obo, Reference, Term
|
|
8
|
+
from pyobo.utils.path import ensure_path
|
|
9
|
+
|
|
10
|
+
URL = "https://www.unimod.org/xml/unimod.xml"
|
|
11
|
+
PREFIX_MAP = {"umod": "http://www.unimod.org/xmlns/schema/unimod_2"}
|
|
12
|
+
PREFIX = "unimod"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class UnimodGetter(Obo):
    """An ontology representation of the unimod modifications."""

    ontology = bioversions_key = PREFIX
    # the source XML is unversioned, so the version is resolved at run time
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: Accepted for interface compatibility but currently
            unused by :func:`get_terms`.
        """
        return get_terms()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_terms() -> Iterable[Term]:
    """Get terms."""
    xml_path = ensure_path("unimod", url=URL)
    root = etree.parse(xml_path).getroot()
    # each <umod:mod> element corresponds to a single modification term
    return (
        _mod_to_term(mod)
        for mod in root.findall("umod:modifications/umod:mod", namespaces=PREFIX_MAP)
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _mod_to_term(mod: etree.Element) -> Term:
    """Convert a single ``<umod:mod>`` XML element into a term."""
    attributes = mod.attrib
    title = attributes["title"]
    full_name = attributes["full_name"]
    # only keep the long name as a definition when it adds information
    definition = None if full_name == title else full_name
    return Term(
        reference=Reference(prefix=PREFIX, identifier=attributes["record_id"], name=title),
        definition=definition,
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
if __name__ == "__main__":
    # Build the ontology via the shared Obo command line interface
    UnimodGetter.cli()
|