PyPI - pyobo - Versions diffs - 0.11.2__py3-none-any.whl → 0.12.1__py3-none-any.whl - Mend

pyobo 0.11.2-py3-none-any.whl → 0.12.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (228)

pyobo/.DS_Store +0 -0
pyobo/__init__.py +95 -20
pyobo/__main__.py +0 -0
pyobo/api/__init__.py +81 -10
pyobo/api/alts.py +52 -42
pyobo/api/combine.py +39 -0
pyobo/api/edges.py +68 -0
pyobo/api/hierarchy.py +231 -203
pyobo/api/metadata.py +14 -19
pyobo/api/names.py +207 -127
pyobo/api/properties.py +117 -117
pyobo/api/relations.py +68 -94
pyobo/api/species.py +24 -21
pyobo/api/typedefs.py +11 -11
pyobo/api/utils.py +66 -13
pyobo/api/xrefs.py +107 -114
pyobo/cli/__init__.py +0 -0
pyobo/cli/cli.py +35 -50
pyobo/cli/database.py +210 -160
pyobo/cli/database_utils.py +155 -0
pyobo/cli/lookup.py +163 -195
pyobo/cli/utils.py +19 -6
pyobo/constants.py +102 -3
pyobo/getters.py +209 -191
pyobo/gilda_utils.py +52 -250
pyobo/identifier_utils/__init__.py +33 -0
pyobo/identifier_utils/api.py +305 -0
pyobo/identifier_utils/preprocessing.json +873 -0
pyobo/identifier_utils/preprocessing.py +27 -0
pyobo/identifier_utils/relations/__init__.py +8 -0
pyobo/identifier_utils/relations/api.py +162 -0
pyobo/identifier_utils/relations/data.json +5824 -0
pyobo/identifier_utils/relations/data_owl.json +57 -0
pyobo/identifier_utils/relations/data_rdf.json +1 -0
pyobo/identifier_utils/relations/data_rdfs.json +7 -0
pyobo/mocks.py +9 -6
pyobo/ner/__init__.py +9 -0
pyobo/ner/api.py +72 -0
pyobo/ner/normalizer.py +33 -0
pyobo/obographs.py +48 -40
pyobo/plugins.py +5 -4
pyobo/py.typed +0 -0
pyobo/reader.py +1354 -395
pyobo/reader_utils.py +155 -0
pyobo/resource_utils.py +42 -22
pyobo/resources/__init__.py +0 -0
pyobo/resources/goc.py +75 -0
pyobo/resources/goc.tsv +188 -0
pyobo/resources/ncbitaxon.py +4 -5
pyobo/resources/ncbitaxon.tsv.gz +0 -0
pyobo/resources/ro.py +3 -2
pyobo/resources/ro.tsv +0 -0
pyobo/resources/so.py +0 -0
pyobo/resources/so.tsv +0 -0
pyobo/sources/README.md +12 -8
pyobo/sources/__init__.py +52 -29
pyobo/sources/agrovoc.py +0 -0
pyobo/sources/antibodyregistry.py +11 -12
pyobo/sources/bigg/__init__.py +13 -0
pyobo/sources/bigg/bigg_compartment.py +81 -0
pyobo/sources/bigg/bigg_metabolite.py +229 -0
pyobo/sources/bigg/bigg_model.py +46 -0
pyobo/sources/bigg/bigg_reaction.py +77 -0
pyobo/sources/biogrid.py +1 -2
pyobo/sources/ccle.py +7 -12
pyobo/sources/cgnc.py +9 -6
pyobo/sources/chebi.py +1 -1
pyobo/sources/chembl/__init__.py +9 -0
pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
pyobo/sources/chembl/chembl_target.py +160 -0
pyobo/sources/civic_gene.py +55 -15
pyobo/sources/clinicaltrials.py +160 -0
pyobo/sources/complexportal.py +24 -24
pyobo/sources/conso.py +14 -22
pyobo/sources/cpt.py +0 -0
pyobo/sources/credit.py +1 -9
pyobo/sources/cvx.py +27 -5
pyobo/sources/depmap.py +9 -12
pyobo/sources/dictybase_gene.py +2 -7
pyobo/sources/drugbank/__init__.py +9 -0
pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
pyobo/sources/drugcentral.py +17 -13
pyobo/sources/expasy.py +31 -34
pyobo/sources/famplex.py +13 -18
pyobo/sources/flybase.py +8 -13
pyobo/sources/gard.py +62 -0
pyobo/sources/geonames/__init__.py +9 -0
pyobo/sources/geonames/features.py +28 -0
pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
pyobo/sources/geonames/utils.py +115 -0
pyobo/sources/gmt_utils.py +6 -7
pyobo/sources/go.py +20 -13
pyobo/sources/gtdb.py +154 -0
pyobo/sources/gwascentral/__init__.py +9 -0
pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
pyobo/sources/hgnc/__init__.py +9 -0
pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
pyobo/sources/icd/__init__.py +9 -0
pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
pyobo/sources/icd/icd11.py +148 -0
pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
pyobo/sources/interpro.py +4 -9
pyobo/sources/itis.py +0 -5
pyobo/sources/kegg/__init__.py +0 -0
pyobo/sources/kegg/api.py +16 -38
pyobo/sources/kegg/genes.py +9 -20
pyobo/sources/kegg/genome.py +1 -7
pyobo/sources/kegg/pathway.py +9 -21
pyobo/sources/mesh.py +58 -24
pyobo/sources/mgi.py +3 -10
pyobo/sources/mirbase/__init__.py +11 -0
pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
pyobo/sources/msigdb.py +74 -39
pyobo/sources/ncbi/__init__.py +9 -0
pyobo/sources/ncbi/ncbi_gc.py +162 -0
pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
pyobo/sources/nih_reporter.py +60 -0
pyobo/sources/nlm/__init__.py +9 -0
pyobo/sources/nlm/nlm_catalog.py +48 -0
pyobo/sources/nlm/nlm_publisher.py +36 -0
pyobo/sources/nlm/utils.py +116 -0
pyobo/sources/npass.py +6 -8
pyobo/sources/omim_ps.py +11 -4
pyobo/sources/pathbank.py +4 -8
pyobo/sources/pfam/__init__.py +9 -0
pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
pyobo/sources/pharmgkb/__init__.py +15 -0
pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
pyobo/sources/pharmgkb/utils.py +86 -0
pyobo/sources/pid.py +1 -6
pyobo/sources/pombase.py +6 -10
pyobo/sources/pubchem.py +4 -9
pyobo/sources/reactome.py +5 -11
pyobo/sources/rgd.py +11 -16
pyobo/sources/rhea.py +37 -36
pyobo/sources/ror.py +69 -42
pyobo/sources/selventa/__init__.py +0 -0
pyobo/sources/selventa/schem.py +4 -7
pyobo/sources/selventa/scomp.py +1 -6
pyobo/sources/selventa/sdis.py +4 -7
pyobo/sources/selventa/sfam.py +1 -6
pyobo/sources/sgd.py +6 -11
pyobo/sources/signor/__init__.py +7 -0
pyobo/sources/signor/download.py +41 -0
pyobo/sources/signor/signor_complexes.py +105 -0
pyobo/sources/slm.py +12 -15
pyobo/sources/umls/__init__.py +7 -1
pyobo/sources/umls/__main__.py +0 -0
pyobo/sources/umls/get_synonym_types.py +20 -4
pyobo/sources/umls/sty.py +57 -0
pyobo/sources/umls/synonym_types.tsv +1 -1
pyobo/sources/umls/umls.py +18 -22
pyobo/sources/unimod.py +46 -0
pyobo/sources/uniprot/__init__.py +1 -1
pyobo/sources/uniprot/uniprot.py +40 -32
pyobo/sources/uniprot/uniprot_ptm.py +4 -34
pyobo/sources/utils.py +3 -2
pyobo/sources/wikipathways.py +7 -10
pyobo/sources/zfin.py +5 -10
pyobo/ssg/__init__.py +12 -16
pyobo/ssg/base.html +0 -0
pyobo/ssg/index.html +26 -13
pyobo/ssg/term.html +12 -2
pyobo/ssg/typedef.html +0 -0
pyobo/struct/__init__.py +54 -8
pyobo/struct/functional/__init__.py +1 -0
pyobo/struct/functional/dsl.py +2572 -0
pyobo/struct/functional/macros.py +423 -0
pyobo/struct/functional/obo_to_functional.py +385 -0
pyobo/struct/functional/ontology.py +272 -0
pyobo/struct/functional/utils.py +112 -0
pyobo/struct/reference.py +331 -136
pyobo/struct/struct.py +1484 -657
pyobo/struct/struct_utils.py +1078 -0
pyobo/struct/typedef.py +162 -210
pyobo/struct/utils.py +12 -5
pyobo/struct/vocabulary.py +138 -0
pyobo/utils/__init__.py +0 -0
pyobo/utils/cache.py +16 -15
pyobo/utils/io.py +51 -41
pyobo/utils/iter.py +5 -5
pyobo/utils/misc.py +41 -53
pyobo/utils/ndex_utils.py +0 -0
pyobo/utils/path.py +73 -70
pyobo/version.py +3 -3
pyobo-0.12.1.dist-info/METADATA +671 -0
pyobo-0.12.1.dist-info/RECORD +201 -0
pyobo-0.12.1.dist-info/WHEEL +4 -0
{pyobo-0.11.2.dist-info → pyobo-0.12.1.dist-info}/entry_points.txt +1 -0
pyobo-0.12.1.dist-info/licenses/LICENSE +21 -0
pyobo/aws.py +0 -162
pyobo/cli/aws.py +0 -47
pyobo/identifier_utils.py +0 -142
pyobo/normalizer.py +0 -232
pyobo/registries/__init__.py +0 -16
pyobo/registries/metaregistry.json +0 -507
pyobo/registries/metaregistry.py +0 -135
pyobo/sources/icd11.py +0 -105
pyobo/xrefdb/__init__.py +0 -1
pyobo/xrefdb/canonicalizer.py +0 -214
pyobo/xrefdb/priority.py +0 -59
pyobo/xrefdb/sources/__init__.py +0 -60
pyobo/xrefdb/sources/biomappings.py +0 -36
pyobo/xrefdb/sources/cbms2019.py +0 -91
pyobo/xrefdb/sources/chembl.py +0 -83
pyobo/xrefdb/sources/compath.py +0 -82
pyobo/xrefdb/sources/famplex.py +0 -64
pyobo/xrefdb/sources/gilda.py +0 -50
pyobo/xrefdb/sources/intact.py +0 -113
pyobo/xrefdb/sources/ncit.py +0 -133
pyobo/xrefdb/sources/pubchem.py +0 -27
pyobo/xrefdb/sources/wikidata.py +0 -116
pyobo/xrefdb/xrefs_pipeline.py +0 -180
pyobo-0.11.2.dist-info/METADATA +0 -711
pyobo-0.11.2.dist-info/RECORD +0 -157
pyobo-0.11.2.dist-info/WHEEL +0 -5
pyobo-0.11.2.dist-info/top_level.txt +0 -1

pyobo/sources/pharmgkb/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""Sources for PharmGKB."""
+from .pharmgkb_chemical import PharmGKBChemicalGetter
+from .pharmgkb_disease import PharmGKBDiseaseGetter
+from .pharmgkb_gene import PharmGKBGeneGetter
+from .pharmgkb_pathway import PharmGKBPathwayGetter
+from .pharmgkb_variant import PharmGKBVariantGetter
+__all__ = [
+    "PharmGKBChemicalGetter",
+    "PharmGKBDiseaseGetter",
+    "PharmGKBGeneGetter",
+    "PharmGKBPathwayGetter",
+    "PharmGKBVariantGetter",
+]

pyobo/sources/pharmgkb/pharmgkb_chemical.py ADDED Viewed

@@ -0,0 +1,89 @@
+"""An ontology representation of PharmGKB chemicals."""
+from collections.abc import Iterable
+import pandas as pd
+from tqdm import tqdm
+from pyobo import Obo, Reference, Term, default_reference
+from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, split
+from pyobo.struct.typedef import has_inchi, has_smiles
+__all__ = [
+    "PharmGKBChemicalGetter",
+]
+PREFIX = "pharmgkb.drug"
+URL = "https://api.pharmgkb.org/v1/download/file/data/chemicals.zip"
+class PharmGKBChemicalGetter(Obo):
+    """An ontology representation of PharmGKB chemicals."""
+    # PREFIX is assigned to both the ``ontology`` and ``bioversions_key`` attributes
+    ontology = bioversions_key = PREFIX
+    # NOTE(review): presumably signals to the Obo base class that no fixed version is pinned - confirm
+    dynamic_version = True
+    # The typedefs used by the module-level iter_terms() for its SMILES and InChI annotations
+    typedefs = [has_inchi, has_smiles]
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology.
+
+        :param force: Passed through unchanged to the module-level ``iter_terms``
+        :returns: The iterable produced by the module-level ``iter_terms``
+        """
+        return iter_terms(force=force)
+SKIP_PREFIXES = {"smiles", "inchi", "atc", "rxnorm", "pubchem.compound"}
+def iter_terms(force: bool = False) -> Iterable[Term]:
+    """Iterate over terms for PharmGKB chemicals.
+
+    A term is first yielded for each distinct, non-missing value of the ``Type``
+    column. Then one term is yielded per row of ``chemicals.tsv``, carrying its
+    type as a parent, SMILES/InChI annotations, exact matches, and trade names
+    as synonyms.
+
+    :param force: Should the data be re-downloaded
+    :yields: Terms
+    """
+    df = download_pharmgkb_tsv(PREFIX, url=URL, inner="chemicals.tsv", force=force)
+    # Missing types are dropped up front, since a NaN has no ``.lower()`` and would crash here
+    type_to_ref = {
+        typ: default_reference(PREFIX, typ.lower().replace(" ", "-").replace(",", ""), name=typ)
+        for typ in df["Type"].dropna().unique()
+    }
+    for type_ref in type_to_ref.values():
+        yield Term(reference=type_ref)
+    for _, row in df.iterrows():
+        identifier = row["PharmGKB Accession Id"]
+        # Rows without an accession are skipped, as in the disease, gene, and variant sources
+        if pd.isna(identifier):
+            continue
+        term = Term.from_triple(PREFIX, identifier=str(identifier), name=row["Name"])
+        # A row with a missing type simply gets no parent
+        parent = type_to_ref.get(row["Type"])
+        if parent is not None:
+            term.append_parent(parent)
+        if pd.notna(row["SMILES"]):
+            term.annotate_string(has_smiles, row["SMILES"])
+        if pd.notna(row["InChI"]):
+            term.annotate_string(has_inchi, row["InChI"])
+        for atc_id in split(row, "ATC Identifiers"):
+            term.append_exact_match(Reference(prefix="atc", identifier=atc_id))
+        for rxnorm_id in split(row, "RxNorm Identifiers"):
+            if len(rxnorm_id) > 7:
+                tqdm.write(f"invalid rxnorm luid (too long) - {rxnorm_id}")
+            else:
+                term.append_exact_match(Reference(prefix="rxnorm", identifier=rxnorm_id))
+        for pubchem_id in split(row, "PubChem Compound Identifiers"):
+            term.append_exact_match(Reference(prefix="pubchem.compound", identifier=pubchem_id))
+        # Both columns hold CURIEs and are processed the same way, in this order
+        for column in ("External Vocabulary", "Cross-references"):
+            for xref_curie in split(row, column):
+                try:
+                    reference = Reference.from_curie(xref_curie)
+                except ValueError:
+                    # CURIEs that cannot be parsed are skipped
+                    continue
+                # These prefixes are already covered by the dedicated columns above
+                if reference.prefix not in SKIP_PREFIXES:
+                    term.append_exact_match(reference)
+        for trade_name in split(row, "Trade names"):
+            # TODO use OMO term for trade name
+            term.append_synonym(trade_name)
+        # TODO add more
+        yield term
+if __name__ == "__main__":
+    PharmGKBChemicalGetter.cli()

pyobo/sources/pharmgkb/pharmgkb_disease.py ADDED Viewed

@@ -0,0 +1,77 @@
+"""An ontology representation of PharmGKB phenotypes."""
+from collections.abc import Iterable
+from typing import cast
+import pandas as pd
+from pyobo import Obo, Reference, Term
+from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, parse_xrefs, split
+__all__ = [
+    "PharmGKBDiseaseGetter",
+]
+PREFIX = "pharmgkb.disease"
+URL = "https://api.pharmgkb.org/v1/download/file/data/phenotypes.zip"
+class PharmGKBDiseaseGetter(Obo):
+    """An ontology representation of PharmGKB phenotypes."""
+    # PREFIX is assigned to both the ``ontology`` and ``bioversions_key`` attributes
+    ontology = bioversions_key = PREFIX
+    # NOTE(review): presumably signals to the Obo base class that no fixed version is pinned - confirm
+    dynamic_version = True
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology.
+
+        :param force: Passed through unchanged to the module-level ``iter_terms``
+        :returns: The iterable produced by the module-level ``iter_terms``
+        """
+        return iter_terms(force=force)
+def iter_terms(force: bool = False) -> Iterable[Term]:
+    """Generate one term per row of the PharmGKB phenotypes table.
+
+    :param force: Should the data be re-downloaded
+    :yields: Terms
+
+    The columns of ``phenotypes.tsv`` are:
+
+    1. PharmGKB Accession Id = Identifier assigned to this phenotype by PharmGKB
+    2. Name = Name PharmGKB uses for this phenotype
+    3. Alternate Names = Other known names for this phenotype, comma-separated
+    4. Cross-references = References to other resources in the form "resource:id",
+       comma-separated
+    5. External Vocabulary = Term for this phenotype in another vocabulary in the form
+       "vocabulary:id", comma-separated
+    """
+    df = download_pharmgkb_tsv(PREFIX, url=URL, inner="phenotypes.tsv", force=force)
+    for _, record in df.iterrows():
+        accession = record["PharmGKB Accession Id"]
+        if pd.isna(accession):
+            continue
+        name = record["Name"]
+        term = Term.from_triple(PREFIX, identifier=str(accession), name=name)
+        # Alternate names equal to the primary name (ignoring case) are dropped;
+        # surrounding double quotes are removed only after that comparison
+        alternate_names = {
+            stripped.strip('"')
+            for stripped in (raw.strip() for raw in split(record, "Alternate Names"))
+            if stripped.casefold() != name.casefold()
+        }
+        for alternate_name in sorted(alternate_names):
+            term.append_synonym(alternate_name)
+        for xref in parse_xrefs(term, record):
+            term.append_xref(xref)
+        for vocabulary_entry in split(record, "External Vocabulary"):
+            # Keep only the part before the first "(", which is tried as a CURIE
+            curie = vocabulary_entry.strip('"').partition("(")[0]
+            try:
+                vocabulary_ref = cast(Reference, Reference.from_curie(curie))
+            except Exception:  # noqa:S112
+                continue  # this happens when there's a comma in the name, but not a problem
+            term.append_xref(vocabulary_ref)
+        yield term
+if __name__ == "__main__":
+    PharmGKBDiseaseGetter.cli()

pyobo/sources/pharmgkb/pharmgkb_gene.py ADDED Viewed

@@ -0,0 +1,108 @@
+"""An ontology representation of PharmGKB genes."""
+from collections.abc import Iterable
+import pandas as pd
+from pyobo import Obo, Reference, Term
+from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, parse_xrefs, split
+__all__ = [
+    "PharmGKBGeneGetter",
+]
+PREFIX = "pharmgkb.gene"
+URL = "https://api.pharmgkb.org/v1/download/file/data/genes.zip"
+class PharmGKBGeneGetter(Obo):
+    """An ontology representation of PharmGKB genes."""
+    # PREFIX is assigned to both the ``ontology`` and ``bioversions_key`` attributes
+    ontology = bioversions_key = PREFIX
+    # NOTE(review): presumably signals to the Obo base class that no fixed version is pinned - confirm
+    dynamic_version = True
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology.
+
+        :param force: Passed through unchanged to the module-level ``iter_terms``
+        :returns: The iterable produced by the module-level ``iter_terms``
+        """
+        return iter_terms(force=force)
+def iter_terms(force: bool = False) -> Iterable[Term]:
+    """Generate one term per row of the PharmGKB genes table.
+
+    :param force: Should the data be re-downloaded
+    :yields: Terms
+
+    The columns of ``genes.tsv`` are:
+
+    1. PharmGKB Accession Id = Identifier assigned to this gene by PharmGKB
+    2. NCBI Gene ID = Identifier assigned to this gene by NCBI
+    3. HGNC ID = Identifier assigned to this gene by HGNC
+    4. Ensembl Id = Identifier assigned to this gene by Ensembl
+    5. Name = Canonical name for this gene (by HGNC)
+    6. Symbol = Canonical name for this gene (by HGNC)
+    7. Alternate Names = Other known names for this gene, comma-separated
+    8. Alternate Symbols = Other known symbols for this gene, comma-separated
+    9. Is VIP = "Yes" if PharmGKB has written a VIP annotation for this gene, "No"
+       otherwise
+    10. Has Variant Annotation = "Yes" if PharmGKB has written at least one variant
+        annotation for this gene, "No" otherwise
+    11. Cross-references = References to other resources in the form "resource:id",
+        comma-separated
+    12. Has CPIC Dosing Guideline = "Yes" if PharmGKB has annotated a CPIC guideline for
+        this gene, "No" otherwise
+    13. Chromosome = The chromosome this gene is on, in the form "chr##"
+    14. Chromosomal Start - GRCh37 = Where this gene starts on the chromosomal sequence
+        for NCBI GRCh37
+    15. Chromosomal Stop - GRCh37 = Where this gene stops on the chromosomal sequence
+        for NCBI GRCh37
+    16. Chromosomal Start - GRCh38 = Where this gene starts on the chromosomal sequence
+        for NCBI GRCh38
+    17. Chromosomal Stop - GRCh38 = Where this gene stops on the chromosomal sequence
+        for NCBI GRCh38
+    """
+    df = download_pharmgkb_tsv(PREFIX, url=URL, inner="genes.tsv", force=force)
+    # Prefixes dropped from the generic cross-references column
+    skip_xrefs = {"ncbigene", "hgnc", "ensembl", "GeneCard"}
+    for _, row in df.iterrows():
+        accession = row["PharmGKB Accession Id"]
+        if pd.isna(accession):
+            continue
+        term = Term.from_triple(PREFIX, identifier=str(accession), name=row["Name"])
+        # A single identifier is recorded as an exact match; zero or several
+        # identifiers are recorded as plain cross-references
+        for column, prefix in (("NCBI Gene ID", "ncbigene"), ("HGNC ID", "hgnc")):
+            identifiers = list(split(row, column))
+            if len(identifiers) == 1:
+                term.append_exact_match(Reference(prefix=prefix, identifier=identifiers[0]))
+                continue
+            for identifier in identifiers:
+                term.append_xref(Reference(prefix=prefix, identifier=identifier))
+        for ensembl_id in split(row, "Ensembl Id"):
+            term.append_xref(Reference(prefix="ensembl", identifier=ensembl_id))
+        for alternate_name in split(row, "Alternate Names"):
+            term.append_synonym(alternate_name.strip('"'))
+        # TODO symbol synonym type
+        symbol = row["Symbol"]
+        if pd.notna(symbol):
+            term.append_synonym(symbol)
+        for alternate_symbol in split(row, "Alternate Symbols"):
+            term.append_synonym(alternate_symbol)
+        for xref in parse_xrefs(term, row):
+            if xref.prefix not in skip_xrefs:
+                term.append_xref(xref)
+        yield term
+if __name__ == "__main__":
+    PharmGKBGeneGetter.cli()

pyobo/sources/pharmgkb/pharmgkb_pathway.py ADDED Viewed

@@ -0,0 +1,63 @@
+"""An ontology representation of PharmGKB pathways."""
+import zipfile
+from collections.abc import Iterable
+from pyobo import Obo, Term
+from pyobo.sources.pharmgkb.utils import download_pharmgkb
+__all__ = [
+    "PharmGKBPathwayGetter",
+]
+PREFIX = "pharmgkb.pathways"
+BIOPAX_URL = "https://api.pharmgkb.org/v1/download/file/data/pathways-biopax.zip"
+EXTENSION = ".owl"
+class PharmGKBPathwayGetter(Obo):
+    """An ontology representation of PharmGKB pathways."""
+    # PREFIX is assigned to both the ``ontology`` and ``bioversions_key`` attributes
+    ontology = bioversions_key = PREFIX
+    # NOTE(review): presumably signals to the Obo base class that no fixed version is pinned - confirm
+    dynamic_version = True
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology.
+
+        :param force: Passed through unchanged to the module-level ``iter_terms``
+        :returns: The iterable produced by the module-level ``iter_terms``
+        """
+        return iter_terms(force=force)
+def iter_terms(force: bool = False) -> Iterable[Term]:
+    """Iterate over terms, one per BioPAX file in the PharmGKB pathways archive.
+
+    :param force: Should the data be re-downloaded
+    :yields: Terms
+
+    The archive at ``BIOPAX_URL`` is downloaded and every member whose file name
+    ends with ``EXTENSION`` (``.owl``) is passed to ``_process_biopax``. Other
+    members are ignored.
+    """
+    path = download_pharmgkb(PREFIX, url=BIOPAX_URL, force=force)
+    with zipfile.ZipFile(path) as zf:
+        # infolist() is the documented accessor for the archive's members
+        for zip_info in zf.infolist():
+            if not zip_info.filename.endswith(EXTENSION):
+                continue
+            # The member is opened so its content is available to the processor
+            with zf.open(zip_info) as file:
+                yield _process_biopax(zip_info, file)
+def _process_biopax(path: zipfile.ZipInfo, file) -> Term:
+    """Build a pathway term from the file name of an archive member.
+
+    The text before the first ``-`` becomes the identifier and the remainder,
+    with underscores turned into spaces, becomes the name. The ``file`` handle
+    is not read yet.
+    """
+    stem = path.filename.removesuffix(EXTENSION)
+    stem = stem.strip().replace("\r\n", " ")
+    identifier, _, raw_name = stem.partition("-")
+    term = Term.from_triple(PREFIX, identifier, raw_name.replace("_", " "))
+    # TODO parse file with pybiopax to include members and provenance
+    return term
+if __name__ == "__main__":
+    PharmGKBPathwayGetter.cli()

pyobo/sources/pharmgkb/pharmgkb_variant.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""An ontology representation of PharmGKB variants."""
+from collections.abc import Iterable
+import pandas as pd
+from pyobo import Obo, Reference, Term, TypeDef
+from pyobo.sources.pharmgkb.utils import download_pharmgkb_tsv, split
+__all__ = [
+    "PharmGKBVariantGetter",
+]
+PREFIX = "pharmgkb.variant"
+URL = "https://api.pharmgkb.org/v1/download/file/data/variants.zip"
+HAS_GENE_ASSOCIATION = TypeDef.default(
+    PREFIX, "hasGeneAssociation", name="has gene association", is_metadata_tag=True
+)
+class PharmGKBVariantGetter(Obo):
+    """An ontology representation of PharmGKB variants."""
+    # PREFIX is assigned to both the ``ontology`` and ``bioversions_key`` attributes
+    ontology = bioversions_key = PREFIX
+    # The typedef used by the module-level iter_terms() to link variants to genes
+    typedefs = [HAS_GENE_ASSOCIATION]
+    # NOTE(review): presumably signals to the Obo base class that no fixed version is pinned - confirm
+    dynamic_version = True
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology.
+
+        :param force: Passed through unchanged to the module-level ``iter_terms``
+        :returns: The iterable produced by the module-level ``iter_terms``
+        """
+        return iter_terms(force=force)
+def iter_terms(force: bool = False) -> Iterable[Term]:
+    """Generate one term per row of the PharmGKB variants table.
+
+    :param force: Should the data be re-downloaded
+    :yields: Terms
+
+    The columns of ``variants.tsv`` are:
+
+    1. Variant ID = The PharmGKB identifier for this variant
+    2. Variant Name = The PharmGKB name for this variant
+    3. Gene IDs = The PharmGKB identifiers for genes associated with this variant
+    4. Gene Symbols = The HGNC symbols for genes associated with this variant
+    5. Location = The location of this variation on a reference sequence (either RefSeq
+       or GenBank), if available. HGVS format when applicable
+    6. Variant Annotation count = The count of Variant Annotations done on this variant
+    7. Clinical Annotation count = The count of all Clinical Annotations done on this
+       variant
+    8. Level 1/2 Clinical Annotation count = The count of Level 1 or Level 2 ("top")
+       Clinical Annotations done on this variant
+    9. Guideline Annotation count = The count of Dosing Guideline Annotations of which
+       this variant is a part
+    10. Label Annotation count = The count of Drug Label Annotations in which this
+        variant is mentioned
+    11. Synonym
+    """
+    df = download_pharmgkb_tsv(PREFIX, url=URL, inner="variants.tsv", force=force)
+    for _, row in df.iterrows():
+        variant_id = row["Variant ID"]
+        if pd.isna(variant_id):
+            continue
+        term = Term.from_triple(PREFIX, identifier=str(variant_id))
+        # The variant name is recorded as a dbSNP exact match
+        variant_name = row["Variant Name"]
+        if pd.notna(variant_name):
+            term.append_exact_match(Reference(prefix="dbsnp", identifier=variant_name))
+        # Gene identifiers and symbols are paired by position; pairing stops at the shorter column
+        gene_ids = split(row, "Gene IDs")
+        gene_symbols = split(row, "Gene Symbols")
+        for gene_id, gene_symbol in zip(gene_ids, gene_symbols, strict=False):
+            term.annotate_object(
+                HAS_GENE_ASSOCIATION,
+                Reference(prefix="pharmgkb.gene", identifier=gene_id, name=gene_symbol),
+            )
+        # TODO location, like NC_000003.12:183917980
+        yield term
+if __name__ == "__main__":
+    PharmGKBVariantGetter.cli()

pyobo/sources/pharmgkb/utils.py ADDED Viewed

@@ -0,0 +1,86 @@
+"""Utilities for PharmGKB."""
+from collections.abc import Iterable
+from pathlib import Path
+from typing import cast
+import pandas as pd
+from pystow.utils import read_zipfile_csv
+from tqdm import tqdm
+from pyobo import Reference
+from pyobo.utils.path import ensure_path
+__all__ = [
+    "download_pharmgkb_tsv",
+]
+AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
+def download_pharmgkb(prefix: str, url: str, *, force: bool) -> Path:
+    """Fetch a file from PharmGKB while presenting a browser-like user agent.
+
+    :param prefix: Passed as the first argument to ``ensure_path``
+    :param url: The URL to download
+    :param force: Passed through to ``ensure_path``
+    :returns: The result of ``ensure_path``
+    """
+    # This is required otherwise we get booted
+    headers = {"User-Agent": AGENT}
+    return ensure_path(
+        prefix,
+        url=url,
+        backend="requests",
+        download_kwargs={"headers": headers},
+        force=force,
+    )
+def download_pharmgkb_tsv(prefix: str, url: str, inner: str, *, force: bool) -> pd.DataFrame:
+    """Download a PharmGKB zip archive and load one of its tables.
+
+    :param prefix: Passed through to ``download_pharmgkb``
+    :param url: The URL of the zip archive
+    :param inner: The path of the table inside the archive
+    :param force: Passed through to ``download_pharmgkb``
+    :returns: The table, read with ``dtype=str``
+    """
+    archive = download_pharmgkb(prefix, url=url, force=force)
+    return read_zipfile_csv(archive, inner_path=inner, dtype=str)
+def split(row, key: str) -> Iterable[str]:
+    """Split a comma-separated cell into its stripped, non-empty parts.
+
+    :param row: An object with a ``get`` method, such as a dataframe row or a dict
+    :param key: The column to look up
+    :yields: Each comma-separated part with surrounding whitespace removed.
+        Nothing is yielded when the cell is absent, NaN, empty, or not a string.
+    """
+    values = row.get(key)
+    # Absent cells arrive as None/NaN; anything that is not a string cannot be split
+    if not isinstance(values, str):
+        return
+    for value in values.split(","):
+        value = value.strip()
+        # Doubled or trailing commas leave empty parts, which are not identifiers
+        if value:
+            yield value
+_MISSING_PREFIXES: set[str] = set()
+REPLACES = {
+    "URL:http://www.ncbi.nlm.nih.gov/omim/": "omim:",
+    "Comparative Toxicogenomics Database:": "mesh:",
+    "ModBase:": "uniprot:",
+    "RefSeq DNA:": "refseq:",
+    "RefSeq RNA:": "refseq:",
+    "RefSeq Protein:": "refseq:",
+    "UCSC Genome Browser:": "refseq:",
+}
+def parse_xrefs(term, row, key="Cross-references") -> Iterable[Reference]:
+    """Parse the cross-references of a row into references.
+
+    :param term: The term being built; only its ``curie`` is used, in the warning message
+    :param row: The row to read the cross-references from
+    :param key: The column holding the comma-separated cross-references
+    :yields: The references that could be parsed. Entries that cannot be parsed
+        are skipped, with a message written the first time each prefix fails.
+    """
+    for xref_curie in split(row, key):
+        # HOXD@ is a valid genatlas identifier, see http://genatlas.medecine.univ-paris5.fr/fiche.php?symbol=HOXD@
+        # but this is broken, so skip them for now
+        if xref_curie.endswith("@"):
+            continue
+        for k, v in REPLACES.items():
+            if xref_curie.startswith(k):
+                # Swap only the leading prefix; str.replace would also rewrite
+                # any later occurrence of the same text inside the identifier
+                xref_curie = v + xref_curie[len(k) :]
+                break
+        try:
+            xref = cast(Reference, Reference.from_curie(xref_curie))
+        except ValueError:
+            p, _, _ = xref_curie.partition(":")
+            # Report each unparsable prefix only once across all calls
+            if p not in _MISSING_PREFIXES:
+                tqdm.write(f"[{term.curie}] could not parse xref: {xref_curie}")
+            _MISSING_PREFIXES.add(p)
+        else:
+            yield xref

pyobo/sources/pid.py CHANGED Viewed

@@ -38,11 +38,6 @@ class PIDGetter(Obo):
         return iter_terms()
-def get_obo() -> Obo:
-    """Get NCI PID as OBO."""
-    return PIDGetter()
 def iter_networks(use_tqdm: bool = False, force: bool = False) -> Iterable[tuple[str, CX]]:
     """Iterate over NCI PID networks."""
     yield from ensure_ndex_network_set(
@@ -93,7 +88,7 @@ def iter_terms(force: bool = False) -> Iterable[Term]:
                 logger.debug(f"unmapped: {name}, {reference}")
         for hgnc_id, hgnc_symbol in genes:
-            term.append_relationship(
+            term.annotate_object(
                 has_participant, Reference(prefix="hgnc", identifier=hgnc_id, name=hgnc_symbol)
             )

pyobo/sources/pombase.py CHANGED Viewed

@@ -8,7 +8,7 @@ import pandas as pd
 from tqdm.auto import tqdm
 import pyobo
-from pyobo import Reference
+from pyobo import Reference, TypeDef
 from pyobo.resources.so import get_so_name
 from pyobo.struct import Obo, Term, from_species, has_gene_product, orthologous
 from pyobo.utils.path import ensure_df
@@ -22,24 +22,20 @@ logger = logging.getLogger(__name__)
 PREFIX = "pombase"
 GENE_NAMES_URL = "https://www.pombase.org/data/names_and_identifiers/gene_IDs_names_products.tsv"
 ORTHOLOGS_URL = "https://www.pombase.org/data/orthologs/human-orthologs.txt.gz"
+CHROMOSOME = TypeDef.default(PREFIX, "chromosome", is_metadata_tag=True)
 class PomBaseGetter(Obo):
     """An ontology representation of PomBase's fission yeast gene nomenclature."""
     ontology = bioversions_key = PREFIX
-    typedefs = [from_species, has_gene_product, orthologous]
+    typedefs = [from_species, has_gene_product, orthologous, CHROMOSOME]
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
         return get_terms(force=force, version=self._version_or_raise)
-def get_obo(force: bool = False) -> Obo:
-    """Get OBO."""
-    return PomBaseGetter(force=force)
 #: A mapping from PomBase gene type to sequence ontology terms
 POMBASE_TO_SO = {
     # None: "0000704",  # gene,
@@ -89,13 +85,13 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
             name=symbol if pd.notna(symbol) else None,
             definition=name if pd.notna(name) else None,
         )
-        term.append_property("chromosome", chromosome[len("chromosome_") :])
+        term.annotate_string(CHROMOSOME, chromosome[len("chromosome_") :])
         term.append_parent(so[gtype])
         term.set_species(identifier="4896", name="Schizosaccharomyces pombe")
         for hgnc_id in identifier_to_hgnc_ids.get(identifier, []):
-            term.append_relationship(orthologous, Reference(prefix="hgnc", identifier=hgnc_id))
+            term.annotate_object(orthologous, Reference(prefix="hgnc", identifier=hgnc_id))
         if uniprot_id and pd.notna(uniprot_id):
-            term.append_relationship(
+            term.annotate_object(
                 has_gene_product, Reference(prefix="uniprot", identifier=uniprot_id)
             )
         if synonyms and pd.notna(synonyms):

pyobo/sources/pubchem.py CHANGED Viewed

@@ -2,7 +2,7 @@
 import logging
 from collections.abc import Iterable, Mapping
-from typing import Optional
+from pathlib import Path
 import pandas as pd
 from bioregistry.utils import removeprefix
@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
 PREFIX = "pubchem.compound"
-def _get_pubchem_extras_url(version: Optional[str], end: str) -> str:
+def _get_pubchem_extras_url(version: str | None, end: str) -> str:
     if version is None:
         version = get_version("pubchem")
     return f"ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Monthly/{version}/Extras/{end}"
@@ -40,11 +40,6 @@ class PubChemCompoundGetter(Obo):
         return get_terms(version=self._version_or_raise, force=force)
-def get_obo(force: bool = False) -> Obo:
-    """Get PubChem Compound OBO."""
-    return PubChemCompoundGetter(force=force)
 def _get_cid_smiles_df(version: str) -> pd.DataFrame:
     url = _get_pubchem_extras_url(version, "CID-SMILES.gz")
     return ensure_df(PREFIX, url=url, version=version, dtype=str)
@@ -97,7 +92,7 @@ def get_pubchem_id_to_mesh_id(version: str) -> Mapping[str, str]:
     return dict(df.values)
-def _ensure_cid_name_path(*, version: Optional[str] = None, force: bool = False) -> str:
+def _ensure_cid_name_path(*, version: str | None = None, force: bool = False) -> Path:
     if version is None:
         version = get_version("pubchem")
     # 2 tab-separated columns: compound_id, name
@@ -145,4 +140,4 @@ def get_terms(*, version: str, use_tqdm: bool = True, force: bool = False) -> It
 if __name__ == "__main__":
-    get_obo().write_default()
+    PubChemCompoundGetter.cli()

pyobo 0.11.2__py3-none-any.whl → 0.12.1__py3-none-any.whl

pyobo 0.11.2-py3-none-any.whl → 0.12.1-py3-none-any.whl