PyPI - pyobo - Versions diffs - 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl - Mend

pyobo 0.11.2py3-none-any.whl → 0.12.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (227) hide show

pyobo/.DS_Store +0 -0
pyobo/__init__.py +95 -20
pyobo/__main__.py +0 -0
pyobo/api/__init__.py +81 -10
pyobo/api/alts.py +52 -42
pyobo/api/combine.py +39 -0
pyobo/api/edges.py +68 -0
pyobo/api/hierarchy.py +231 -203
pyobo/api/metadata.py +14 -19
pyobo/api/names.py +207 -127
pyobo/api/properties.py +117 -113
pyobo/api/relations.py +68 -94
pyobo/api/species.py +24 -21
pyobo/api/typedefs.py +11 -11
pyobo/api/utils.py +66 -13
pyobo/api/xrefs.py +108 -114
pyobo/cli/__init__.py +0 -0
pyobo/cli/cli.py +35 -50
pyobo/cli/database.py +183 -161
pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
pyobo/cli/lookup.py +163 -195
pyobo/cli/utils.py +19 -6
pyobo/constants.py +102 -3
pyobo/getters.py +196 -118
pyobo/gilda_utils.py +79 -200
pyobo/identifier_utils/__init__.py +41 -0
pyobo/identifier_utils/api.py +296 -0
pyobo/identifier_utils/model.py +130 -0
pyobo/identifier_utils/preprocessing.json +812 -0
pyobo/identifier_utils/preprocessing.py +61 -0
pyobo/identifier_utils/relations/__init__.py +8 -0
pyobo/identifier_utils/relations/api.py +162 -0
pyobo/identifier_utils/relations/data.json +5824 -0
pyobo/identifier_utils/relations/data_owl.json +57 -0
pyobo/identifier_utils/relations/data_rdf.json +1 -0
pyobo/identifier_utils/relations/data_rdfs.json +7 -0
pyobo/mocks.py +9 -6
pyobo/ner/__init__.py +9 -0
pyobo/ner/api.py +72 -0
pyobo/ner/normalizer.py +33 -0
pyobo/obographs.py +43 -39
pyobo/plugins.py +5 -4
pyobo/py.typed +0 -0
pyobo/reader.py +1358 -395
pyobo/reader_utils.py +155 -0
pyobo/resource_utils.py +42 -22
pyobo/resources/__init__.py +0 -0
pyobo/resources/goc.py +75 -0
pyobo/resources/goc.tsv +188 -0
pyobo/resources/ncbitaxon.py +4 -5
pyobo/resources/ncbitaxon.tsv.gz +0 -0
pyobo/resources/ro.py +3 -2
pyobo/resources/ro.tsv +0 -0
pyobo/resources/so.py +0 -0
pyobo/resources/so.tsv +0 -0
pyobo/sources/README.md +12 -8
pyobo/sources/__init__.py +52 -29
pyobo/sources/agrovoc.py +0 -0
pyobo/sources/antibodyregistry.py +11 -12
pyobo/sources/bigg/__init__.py +13 -0
pyobo/sources/bigg/bigg_compartment.py +81 -0
pyobo/sources/bigg/bigg_metabolite.py +229 -0
pyobo/sources/bigg/bigg_model.py +46 -0
pyobo/sources/bigg/bigg_reaction.py +77 -0
pyobo/sources/biogrid.py +1 -2
pyobo/sources/ccle.py +7 -12
pyobo/sources/cgnc.py +0 -5
pyobo/sources/chebi.py +1 -1
pyobo/sources/chembl/__init__.py +9 -0
pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
pyobo/sources/chembl/chembl_target.py +160 -0
pyobo/sources/civic_gene.py +55 -15
pyobo/sources/clinicaltrials.py +160 -0
pyobo/sources/complexportal.py +24 -24
pyobo/sources/conso.py +14 -22
pyobo/sources/cpt.py +0 -0
pyobo/sources/credit.py +1 -9
pyobo/sources/cvx.py +27 -5
pyobo/sources/depmap.py +9 -12
pyobo/sources/dictybase_gene.py +2 -7
pyobo/sources/drugbank/__init__.py +9 -0
pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
pyobo/sources/drugcentral.py +17 -13
pyobo/sources/expasy.py +31 -34
pyobo/sources/famplex.py +13 -18
pyobo/sources/flybase.py +3 -8
pyobo/sources/gard.py +62 -0
pyobo/sources/geonames/__init__.py +9 -0
pyobo/sources/geonames/features.py +28 -0
pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
pyobo/sources/geonames/utils.py +115 -0
pyobo/sources/gmt_utils.py +6 -7
pyobo/sources/go.py +20 -13
pyobo/sources/gtdb.py +154 -0
pyobo/sources/gwascentral/__init__.py +9 -0
pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
pyobo/sources/hgnc/__init__.py +9 -0
pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
pyobo/sources/icd/__init__.py +9 -0
pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
pyobo/sources/icd/icd11.py +148 -0
pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
pyobo/sources/interpro.py +4 -9
pyobo/sources/itis.py +0 -5
pyobo/sources/kegg/__init__.py +0 -0
pyobo/sources/kegg/api.py +16 -38
pyobo/sources/kegg/genes.py +9 -20
pyobo/sources/kegg/genome.py +1 -7
pyobo/sources/kegg/pathway.py +9 -21
pyobo/sources/mesh.py +58 -24
pyobo/sources/mgi.py +3 -10
pyobo/sources/mirbase/__init__.py +11 -0
pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
pyobo/sources/msigdb.py +74 -39
pyobo/sources/ncbi/__init__.py +9 -0
pyobo/sources/ncbi/ncbi_gc.py +162 -0
pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
pyobo/sources/nih_reporter.py +60 -0
pyobo/sources/nlm/__init__.py +9 -0
pyobo/sources/nlm/nlm_catalog.py +48 -0
pyobo/sources/nlm/nlm_publisher.py +36 -0
pyobo/sources/nlm/utils.py +116 -0
pyobo/sources/npass.py +6 -8
pyobo/sources/omim_ps.py +10 -3
pyobo/sources/pathbank.py +4 -8
pyobo/sources/pfam/__init__.py +9 -0
pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
pyobo/sources/pharmgkb/__init__.py +15 -0
pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
pyobo/sources/pharmgkb/utils.py +86 -0
pyobo/sources/pid.py +1 -6
pyobo/sources/pombase.py +6 -10
pyobo/sources/pubchem.py +4 -9
pyobo/sources/reactome.py +5 -11
pyobo/sources/rgd.py +11 -16
pyobo/sources/rhea.py +37 -36
pyobo/sources/ror.py +69 -42
pyobo/sources/selventa/__init__.py +0 -0
pyobo/sources/selventa/schem.py +4 -7
pyobo/sources/selventa/scomp.py +1 -6
pyobo/sources/selventa/sdis.py +4 -7
pyobo/sources/selventa/sfam.py +1 -6
pyobo/sources/sgd.py +6 -11
pyobo/sources/signor/__init__.py +7 -0
pyobo/sources/signor/download.py +41 -0
pyobo/sources/signor/signor_complexes.py +105 -0
pyobo/sources/slm.py +12 -15
pyobo/sources/umls/__init__.py +7 -1
pyobo/sources/umls/__main__.py +0 -0
pyobo/sources/umls/get_synonym_types.py +20 -4
pyobo/sources/umls/sty.py +57 -0
pyobo/sources/umls/synonym_types.tsv +1 -1
pyobo/sources/umls/umls.py +18 -22
pyobo/sources/unimod.py +46 -0
pyobo/sources/uniprot/__init__.py +1 -1
pyobo/sources/uniprot/uniprot.py +40 -32
pyobo/sources/uniprot/uniprot_ptm.py +4 -34
pyobo/sources/utils.py +3 -2
pyobo/sources/wikipathways.py +7 -10
pyobo/sources/zfin.py +5 -10
pyobo/ssg/__init__.py +12 -16
pyobo/ssg/base.html +0 -0
pyobo/ssg/index.html +26 -13
pyobo/ssg/term.html +12 -2
pyobo/ssg/typedef.html +0 -0
pyobo/struct/__init__.py +54 -8
pyobo/struct/functional/__init__.py +1 -0
pyobo/struct/functional/dsl.py +2572 -0
pyobo/struct/functional/macros.py +423 -0
pyobo/struct/functional/obo_to_functional.py +385 -0
pyobo/struct/functional/ontology.py +270 -0
pyobo/struct/functional/utils.py +112 -0
pyobo/struct/reference.py +331 -136
pyobo/struct/struct.py +1413 -643
pyobo/struct/struct_utils.py +1078 -0
pyobo/struct/typedef.py +162 -210
pyobo/struct/utils.py +12 -5
pyobo/struct/vocabulary.py +138 -0
pyobo/utils/__init__.py +0 -0
pyobo/utils/cache.py +13 -11
pyobo/utils/io.py +17 -31
pyobo/utils/iter.py +5 -5
pyobo/utils/misc.py +41 -53
pyobo/utils/ndex_utils.py +0 -0
pyobo/utils/path.py +76 -70
pyobo/version.py +3 -3
{pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/METADATA +228 -229
pyobo-0.12.0.dist-info/RECORD +202 -0
pyobo-0.12.0.dist-info/WHEEL +4 -0
{pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
pyobo-0.12.0.dist-info/licenses/LICENSE +21 -0
pyobo/aws.py +0 -162
pyobo/cli/aws.py +0 -47
pyobo/identifier_utils.py +0 -142
pyobo/normalizer.py +0 -232
pyobo/registries/__init__.py +0 -16
pyobo/registries/metaregistry.json +0 -507
pyobo/registries/metaregistry.py +0 -135
pyobo/sources/icd11.py +0 -105
pyobo/xrefdb/__init__.py +0 -1
pyobo/xrefdb/canonicalizer.py +0 -214
pyobo/xrefdb/priority.py +0 -59
pyobo/xrefdb/sources/__init__.py +0 -60
pyobo/xrefdb/sources/biomappings.py +0 -36
pyobo/xrefdb/sources/cbms2019.py +0 -91
pyobo/xrefdb/sources/chembl.py +0 -83
pyobo/xrefdb/sources/compath.py +0 -82
pyobo/xrefdb/sources/famplex.py +0 -64
pyobo/xrefdb/sources/gilda.py +0 -50
pyobo/xrefdb/sources/intact.py +0 -113
pyobo/xrefdb/sources/ncit.py +0 -133
pyobo/xrefdb/sources/pubchem.py +0 -27
pyobo/xrefdb/sources/wikidata.py +0 -116
pyobo-0.11.2.dist-info/RECORD +0 -157
pyobo-0.11.2.dist-info/WHEEL +0 -5
pyobo-0.11.2.dist-info/top_level.txt +0 -1

pyobo/sources/ncbi/ncbi_gc.py ADDED Viewed

@@ -0,0 +1,162 @@
+"""Convert NCBI Genetic Codes to an ontology.
+.. seealso::
+    https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes
+"""
+from collections.abc import Iterable
+from pyobo import default_reference
+from pyobo.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED, Obo, Reference, Term, TypeDef
+from pyobo.struct.typedef import comment, has_contributor, see_also, term_replaced_by
+from pyobo.utils.path import ensure_path
+# Prefix used for the references and terms created in this module
+PREFIX = "ncbi.gc"
+# NOTE(review): not referenced anywhere else in the visible part of this module
+URI_PREFIX = (
+    "https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes#SG"
+)
+# Location of the genetic code table file that get_terms() downloads and parses
+URL = "ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt"
+# Used as the static version of the ontology class below
+VERSION = "4.6"
+# Root of the hierarchy; every category term and uncategorized table is attached below it
+GC_ROOT = default_reference(prefix=PREFIX, identifier="root", name="genetic code translation table")
+NCBITAXON_ROOT = Reference(prefix="NCBITaxon", identifier="1", name="root")
+# Relation whose domain is the taxonomy root and whose range is the genetic code root
+has_gc_code = TypeDef(
+    reference=default_reference(
+        prefix=PREFIX,
+        identifier="hasGeneticCodeTranslationTable",
+        name="has genetic code translation table",
+    ),
+    definition="Connects a taxonomy term to a genetic code translation table",
+    domain=NCBITAXON_ROOT,
+    range=GC_ROOT,
+).append_contributor(CHARLIE_TERM)
+# Three category references used to group the translation tables
+NUCLEAR_GENETIC_CODE = default_reference(
+    prefix=PREFIX, identifier="nuclear-genetic-code", name="nuclear genetic code translation table"
+)
+MITOCHONDRIAL_GENETIC_CODE = default_reference(
+    prefix=PREFIX,
+    identifier="mitochondrial-genetic-code",
+    name="mitochondrial genetic code translation table",
+)
+PLASTID_GENETIC_CODE = default_reference(
+    prefix=PREFIX, identifier="plastid-genetic-code", name="plastid genetic code translation table"
+)
+# Gene Ontology references that the category terms point to via "see also"
+NUCLEUS = Reference(prefix="GO", identifier="0005634", name="nucleus")
+# NOTE(review): the constant name is misspelled (MITOCHONDIA vs. MITOCHONDRIA); renaming it
+#  would change a module-level name, so it is only flagged here
+MITOCHONDIA = Reference(prefix="GO", identifier="0005739", name="mitochondrion")
+PLASTID = Reference(prefix="GO", identifier="0009536", name="plastid")
+CATEGORY_TO_CELLULAR_COMPONENT = {
+    NUCLEAR_GENETIC_CODE: NUCLEUS,
+    MITOCHONDRIAL_GENETIC_CODE: MITOCHONDIA,
+    PLASTID_GENETIC_CODE: PLASTID,
+}
+# Manually assigned table identifiers per category; tables that are not listed
+# here are attached directly to GC_ROOT by get_terms()
+CATEGORY_TO_TABLES = {
+    NUCLEAR_GENETIC_CODE: [12, 31, 6, 28, 10, 27, 29, 26, 30, 15],
+    MITOCHONDRIAL_GENETIC_CODE: [14, 13, 16, 9, 5, 4, 22, 23, 21, 2, 3, 24],
+    PLASTID_GENETIC_CODE: [11, 32],
+}
+# Inverted lookup; keys are strings because the identifiers parsed from the file are strings
+TABLE_TO_CATEGORY = {
+    str(value): key for key, values in CATEGORY_TO_TABLES.items() for value in values
+}
+class NCBIGCGetter(Obo):
+    """Get the NCBI genetic code translation tables as an ontology."""
+    # Prefix of the ontology ("ncbi.gc")
+    ontology = PREFIX
+    # The version is pinned by the VERSION constant of this module
+    static_version = VERSION
+    root_terms = [GC_ROOT]
+    # Type definitions declared for this ontology
+    typedefs = [has_gc_code, has_contributor, see_also, comment, term_replaced_by]
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology.
+        :param force: not used here; :func:`get_terms` is called without arguments
+        :returns: the terms produced by :func:`get_terms`
+        """
+        return get_terms()
+def _parse_gc_entries(lines: Iterable[str]) -> list[dict[str, str]]:
+    """Group stripped lines of the genetic code table file into one dictionary per record.
+    :param lines: stripped, non-empty lines found between the outermost braces of the file
+    :returns: one dictionary per record with the keys ``name``, ``identifier``, and
+        (when a ``name`` value starts with ``SGC``) ``symbol``
+    """
+    entries: list[dict[str, str]] = []
+    entry: dict[str, str] = {}
+    for line in lines:
+        if line == "{":
+            # an opening brace starts a new record, so flush the previous one
+            if entry:
+                entries.append(entry)
+            entry = {}
+            continue
+        if line == "},":
+            continue
+        # partition never raises, unlike unpacking ``split(" ", 1)`` on a line without a space
+        key, _, data = line.partition(" ")
+        if key == "name":
+            data = data.lstrip('"')
+            # a ``name`` value starting with SGC is stored as the symbol instead of the name
+            if data.startswith("SGC"):
+                key = "symbol"
+            # NOTE(review): a quoted name that continues on the next line is cut off at the
+            #  end of its first line, since continuation lines do not start with ``name``
+            entry[key] = data.rstrip(",").rstrip().rstrip('"')
+        elif key == "id":
+            entry["identifier"] = data.rstrip(",").rstrip()
+    # the final record is not followed by another "{", so it must be flushed explicitly;
+    # without this the last table of the file is silently dropped
+    if entry:
+        entries.append(entry)
+    return entries
+def get_terms() -> Iterable[Term]:
+    """Get terms for GC.
+    :yields: the contributor, taxonomy root, and human terms; the root term; one term
+        per category; one term per table parsed from the downloaded file; two obsolete
+        table terms; and one term per referenced cellular component
+    """
+    yield CHARLIE_TERM
+    yield Term(reference=NCBITAXON_ROOT)
+    yield HUMAN_TERM
+    path = ensure_path(PREFIX, url=URL)
+    # first, remove comment lines
+    lines = [
+        line.strip()
+        for line in path.read_text().splitlines()
+        if not line.startswith("--") and line.strip()
+    ]
+    # drop the first line and the last two lines so only the records remain
+    entries = _parse_gc_entries(lines[1:-2])
+    yield (
+        Term(
+            reference=GC_ROOT,
+            definition="A table for translating codons into amino acids. This can change for "
+            "different taxa, or be different in different organelles that include genetic information.",
+        )
+        .append_contributor(CHARLIE_TERM)
+        .append_comment(PYOBO_INJECTED)
+    )
+    for reference in CATEGORY_TO_TABLES:
+        term = Term(reference=reference)
+        term.append_parent(GC_ROOT)
+        term.append_contributor(CHARLIE_TERM)
+        term.append_comment(PYOBO_INJECTED)
+        if substructure := CATEGORY_TO_CELLULAR_COMPONENT.get(reference):
+            term.append_see_also(substructure)
+        yield term
+    for entry in entries:
+        identifier = entry["identifier"]
+        term = Term.from_triple(PREFIX, identifier, entry["name"])
+        # tables without a manually assigned category hang directly below the root
+        term.append_parent(TABLE_TO_CATEGORY.get(identifier, GC_ROOT))
+        # TODO if symbol is available, what does it mean?
+        yield term
+    yield (
+        Term(
+            reference=Reference(prefix=PREFIX, identifier="7"),
+            is_obsolete=True,
+        )
+        .append_replaced_by(Reference(prefix=PREFIX, identifier="4"))
+        .append_comment("Kinetoplast code now merged in code id 4, as of 1995.")
+    )
+    yield (
+        Term(
+            reference=Reference(prefix=PREFIX, identifier="8"),
+            is_obsolete=True,
+        )
+        .append_replaced_by(Reference(prefix=PREFIX, identifier="1"))
+        .append_comment("all plant chloroplast differences due to RNA edit, as of 1995.")
+    )
+    for cellular_component in CATEGORY_TO_CELLULAR_COMPONENT.values():
+        yield Term(reference=cellular_component)
+if __name__ == "__main__":
+    NCBIGCGetter.cli()

pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} RENAMED Viewed

@@ -7,8 +7,8 @@ import bioregistry
 import pandas as pd
 from tqdm.auto import tqdm
-from ..struct import Obo, Reference, Term, from_species
-from ..utils.path import ensure_df
+from ...struct import Obo, Reference, Term, from_species
+from ...utils.path import ensure_df
 __all__ = [
     "NCBIGeneGetter",
@@ -34,7 +34,7 @@ CONSORTIUM_SPECIES_MAPPING = {
 }
 GENE_INFO_URL = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz"
-#: Columns fro gene_info.gz that are used
+#: Columns for gene_info.gz that are used
 GENE_INFO_COLUMNS = [
     "#tax_id",
     "GeneID",
@@ -93,11 +93,6 @@ class NCBIGeneGetter(Obo):
         return get_terms(force=force)
-def get_obo(force: bool = False) -> Obo:
-    """Get Entrez as OBO."""
-    return NCBIGeneGetter(force=force)
 def get_gene_info_df(force: bool = False) -> pd.DataFrame:
     """Get the gene info dataframe."""
     return ensure_df(
@@ -111,17 +106,16 @@ def get_gene_info_df(force: bool = False) -> pd.DataFrame:
     )
-"""xref_mapping was obtained from:
-namespaces = set()
-for xrefs in df[df['dbXrefs'].notna()]['dbXrefs']:
-    for xref in xrefs.split('|'):
-        namespaces.add(xref.split(':')[0])
+def _get_xref_mapping() -> list[str]:
+    """Collect the distinct namespaces used in the ``dbXrefs`` column of the gene info table.
+    :returns: the namespaces (the part of each xref before its first colon),
+        sorted case-insensitively
+    """
+    # dropping missing values up front means only real xref cells are split
+    xref_cells = get_gene_info_df()["dbXrefs"].dropna()
+    namespaces: set[str] = {
+        xref.split(":")[0]
+        for cell in xref_cells
+        for xref in cell.split("|")
+    }
+    return sorted(namespaces, key=str.casefold)
-print('namespaces:')
-print(*sorted(namespaces), sep='\n')
-"""
+# this was retrieved from :func:`_get_xref_mapping`
 xref_mapping = {
     "APHIDBASE",
     "ASAP",
@@ -157,7 +151,12 @@ xref_mapping = {x.lower() for x in xref_mapping}
 def get_terms(force: bool = False) -> Iterable[Term]:
-    """Get Entrez terms."""
+    """Get Entrez terms.
+    :param force: should re-download be forced?
+    :yields: terms for each line
+    """
     df = get_gene_info_df(force=force)
     it = tqdm(
@@ -192,4 +191,4 @@ def get_terms(force: bool = False) -> Iterable[Term]:
 if __name__ == "__main__":
-    get_obo().write_default()
+    NCBIGeneGetter.cli()

pyobo/sources/nih_reporter.py ADDED Viewed

@@ -0,0 +1,60 @@
+"""A source for NIH RePORTER projects."""
+from collections.abc import Iterable
+import pandas as pd
+from nih_reporter_downloader import get_projects_df
+from pyobo import Reference
+from pyobo.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED, Obo, Term, default_reference
+__all__ = [
+    "NIHReporterGetter",
+]
+# Prefix used for the project references created in this module
+PREFIX = "nihreporter.project"
+# Columns read from the projects table: the identifier and the label, in that order
+PROJECTS_SUBSET = [
+    "APPLICATION_ID",
+    "PROJECT_TITLE",
+]
+# Parent term that every project term is attached to
+PROJECT_TERM = (
+    Term(reference=default_reference(PREFIX, "project", name="project"))
+    .append_contributor(CHARLIE_TERM)
+    .append_comment(PYOBO_INJECTED)
+)
+class NIHReporterGetter(Obo):
+    """An ontology representation of NIH RePORTER."""
+    # Prefix of the ontology ("nihreporter.project")
+    ontology = PREFIX
+    dynamic_version = True
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology.
+        :param force: not used here; the projects iterator is called without arguments
+        :yields: the contributor, human, and project parent terms, followed by
+            one term per project
+        """
+        yield CHARLIE_TERM
+        yield HUMAN_TERM
+        yield PROJECT_TERM
+        yield from iterate_nih_reporter_projects()
+def iterate_nih_reporter_projects() -> Iterable[Term]:
+    """Yield one instance term per row of the NIH RePORTER projects table.
+    :yields: a term per project, attached to the shared project parent term
+    """
+    rows = get_projects_df()[PROJECTS_SUBSET].values
+    for application_id, title in rows:
+        # missing titles become None; embedded newlines are flattened to spaces
+        if pd.notna(title):
+            label = title.replace("\n", " ")
+        else:
+            label = None
+        reference = Reference(prefix=PREFIX, identifier=str(application_id), name=label)
+        project = Term(reference=reference, type="Instance")
+        project.append_parent(PROJECT_TERM)
+        # TODO there is a lot more information that can be added here
+        yield project
+if __name__ == "__main__":
+    NIHReporterGetter.cli()

pyobo/sources/nlm/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Sources from NLM."""
+from .nlm_catalog import NLMCatalogGetter
+from .nlm_publisher import NLMPublisherGetter
+__all__ = [
+    "NLMCatalogGetter",
+    "NLMPublisherGetter",
+]

pyobo/sources/nlm/nlm_catalog.py ADDED Viewed

@@ -0,0 +1,48 @@
+"""Converter for NLM Providers."""
+from collections.abc import Iterable
+from pyobo.sources.nlm.utils import (
+    JOURNAL_TERM,
+    PREFIX_CATALOG,
+    PUBLISHED_IN,
+    PUBLISHER_TERM,
+    get_journals,
+    get_publishers,
+)
+from pyobo.struct import CHARLIE_TERM, HUMAN_TERM, Obo, Term
+from pyobo.struct.typedef import exact_match, has_end_date, has_start_date
+__all__ = [
+    "NLMCatalogGetter",
+]
+class NLMCatalogGetter(Obo):
+    """An ontology representation of the NLM Catalog (journals and their publishers)."""
+    # The same prefix serves as the ontology prefix and as the bioversions key
+    bioversions_key = ontology = PREFIX_CATALOG
+    dynamic_version = True
+    typedefs = [PUBLISHED_IN, has_end_date, has_start_date, exact_match]
+    # Both the journal and the publisher parent terms are roots of this ontology
+    root_terms = [JOURNAL_TERM.reference, PUBLISHER_TERM.reference]
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over journal terms for NLM Catalog.
+        :param force: forwarded to :func:`get_terms`
+        :yields: the terms produced by :func:`get_terms`
+        """
+        yield from get_terms(force=force)
+def get_terms(*, force: bool = False) -> Iterable[Term]:
+    """Get NLM catalog terms.
+    :param force: forwarded to the publisher and journal loaders
+    :yields: the four injected terms, then the sorted distinct publisher terms,
+        then the journal terms
+    """
+    for injected in (JOURNAL_TERM, PUBLISHER_TERM, CHARLIE_TERM, HUMAN_TERM):
+        yield injected
+    publishers_by_journal = get_publishers(force=force)
+    # several journals can map to equal publisher terms, so deduplicate before sorting
+    unique_publishers = set(publishers_by_journal.values())
+    yield from sorted(unique_publishers)
+    yield from get_journals(force=force, journal_id_to_publisher_key=publishers_by_journal)
+if __name__ == "__main__":
+    NLMCatalogGetter.cli()

pyobo/sources/nlm/nlm_publisher.py ADDED Viewed

@@ -0,0 +1,36 @@
+"""Converter for NLM Providers."""
+from collections.abc import Iterable
+from pyobo.sources.nlm.utils import PREFIX_PUBLISHER, PUBLISHER_TERM, get_publishers
+from pyobo.struct import CHARLIE_TERM, HUMAN_TERM, Obo, Term
+__all__ = [
+    "NLMPublisherGetter",
+]
+class NLMPublisherGetter(Obo):
+    """An ontology representation of NLM Publishers."""
+    # The same prefix serves as the ontology prefix and as the bioversions key
+    bioversions_key = ontology = PREFIX_PUBLISHER
+    dynamic_version = True
+    root_terms = [PUBLISHER_TERM.reference]
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over publisher terms for NLM Publishers.
+        :param force: forwarded to :func:`get_terms`
+        :yields: the terms produced by :func:`get_terms`
+        """
+        yield from get_terms(force=force)
+def get_terms(*, force: bool = False) -> Iterable[Term]:
+    """Get NLM publisher terms.
+    :param force: forwarded to the publisher loader
+    :yields: the three injected terms, then the sorted distinct publisher terms
+    """
+    for injected in (PUBLISHER_TERM, CHARLIE_TERM, HUMAN_TERM):
+        yield injected
+    publishers_by_journal = get_publishers(force=force)
+    # several journals can map to equal publisher terms, so deduplicate before sorting
+    unique_publishers = set(publishers_by_journal.values())
+    yield from sorted(unique_publishers)
+if __name__ == "__main__":
+    NLMPublisherGetter.cli()

pyobo/sources/nlm/utils.py ADDED Viewed

@@ -0,0 +1,116 @@
+"""Utilities for NLM."""
+from collections.abc import Iterable
+from xml.etree import ElementTree
+from tqdm import tqdm
+from pyobo import Reference, Term, TypeDef, default_reference, ensure_path
+from pyobo.struct.struct import CHARLIE_TERM, PYOBO_INJECTED
+from pyobo.struct.typedef import has_end_date, has_start_date
+from pyobo.utils.path import ensure_df
+# Prefix for journal terms and for the injected journal/publisher parent terms
+PREFIX_CATALOG = "nlm"
+# Prefix for the individual publisher terms
+PREFIX_PUBLISHER = "nlm.publisher"
+# Pipe-delimited table loaded by get_publishers()
+CATALOG_TO_PUBLISHER = "https://ftp.ncbi.nlm.nih.gov/pubmed/xmlprovidernames.txt"
+# XML file parsed by get_journals(); despite the name this is a URL, not a local path
+JOURNAL_INFO_PATH = "https://ftp.ncbi.nlm.nih.gov/pubmed/jourcache.xml"
+# Relation used by _process_journal() to connect a journal term to a publisher term
+PUBLISHED_IN = TypeDef(
+    reference=default_reference(PREFIX_CATALOG, "published_in", name="published in"),
+    xrefs=[
+        Reference(prefix="biolink", identifier="published_in"),
+        Reference(prefix="uniprot.core", identifier="publishedIn"),
+    ],
+)
+# Parent term for all journal instances
+JOURNAL_TERM = (
+    Term(reference=default_reference(PREFIX_CATALOG, "journal", name="journal"))
+    .append_exact_match(Reference(prefix="SIO", identifier="000160"))
+    .append_exact_match(Reference(prefix="FBCV", identifier="0000787"))
+    .append_exact_match(Reference(prefix="MI", identifier="0885"))
+    .append_exact_match(Reference(prefix="bibo", identifier="Journal"))
+    .append_exact_match(Reference(prefix="uniprot.core", identifier="Journal"))
+    .append_contributor(CHARLIE_TERM)
+    .append_comment(PYOBO_INJECTED)
+)
+# Parent term for all publisher instances
+PUBLISHER_TERM = (
+    Term(reference=default_reference(PREFIX_CATALOG, "publisher", name="publisher"))
+    .append_exact_match(Reference(prefix="biolink", identifier="publisher"))
+    .append_exact_match(Reference(prefix="schema", identifier="publisher"))
+    .append_exact_match(Reference(prefix="uniprot.core", identifier="publisher"))
+    .append_contributor(CHARLIE_TERM)
+    .append_comment(PYOBO_INJECTED)
+)
+def get_publishers(*, force: bool = False) -> dict[str, Term]:
+    """Get NLM publishers.
+    :param force: forwarded to the table download
+    :returns: a dictionary keyed by the first column of the provider table, whose values
+        are publisher instance terms built from the second (identifier) and third (name)
+        columns; a later row with the same key replaces the earlier one
+    """
+    provider_df = ensure_df(
+        PREFIX_CATALOG, url=CATALOG_TO_PUBLISHER, sep="|", force=force, dtype=str
+    )
+    publishers: dict[str, Term] = {}
+    for journal_id, identifier, name in provider_df.values:
+        reference = Reference(prefix=PREFIX_PUBLISHER, identifier=identifier, name=name)
+        publisher = Term(reference=reference, type="Instance")
+        # keep the value returned by append_parent, exactly as the mapping stores it
+        publishers[journal_id] = publisher.append_parent(PUBLISHER_TERM)
+    return publishers
+def get_journals(
+    *, force: bool = False, journal_id_to_publisher_key: dict[str, Term] | None = None
+) -> Iterable[Term]:
+    """Get NLM Catalog terms.
+    :param force: forwarded to the XML download and, when needed, to the publisher loader
+    :param journal_id_to_publisher_key: publisher lookup; loaded via
+        :func:`get_publishers` when not given
+    :yields: one term per ``Journal`` element that :func:`_process_journal` accepts
+    """
+    xml_path = ensure_path(PREFIX_CATALOG, url=JOURNAL_INFO_PATH, force=force)
+    tree_root = ElementTree.parse(xml_path).getroot()
+    if journal_id_to_publisher_key is None:
+        journal_id_to_publisher_key = get_publishers(force=force)
+    for journal_element in tree_root.findall("Journal"):
+        journal = _process_journal(journal_element, journal_id_to_publisher_key)
+        # records that could not be processed come back falsy and are skipped
+        if journal:
+            yield journal
+def _process_journal(
+    element: ElementTree.Element, journal_id_to_publisher_key: dict[str, Term]
+) -> Term | None:
+    """Convert one ``Journal`` XML element into a journal instance term.
+    :param element: a ``Journal`` element; its ``NlmUniqueID``, ``Name``, ``Alias``,
+        ``Issn``, ``StartYear`` and ``EndYear`` children are read
+    :param journal_id_to_publisher_key: lookup from journal identifier to publisher term
+    :returns: the journal term, or None when the element has no purely numeric
+        ``NlmUniqueID``
+    """
+    # TODO enrich with context from https://ftp.ncbi.nlm.nih.gov/pubmed/J_Entrez.txt and https://ftp.ncbi.nlm.nih.gov/pubmed/J_Medline.txt
+    nlm_id = element.findtext("NlmUniqueID")
+    name = element.findtext("Name")
+    # findtext returns None when the child is absent, so guard before calling isnumeric
+    if nlm_id is None or not nlm_id.isnumeric():
+        # TODO investigate these records, which all appear to have IDs that
+        #  end in R like 17410670R (Proceedings of the staff meetings. Honolulu. Clinic)
+        #  which corresponds to https://www.ncbi.nlm.nih.gov/nlmcatalog/287649
+        return None
+    # ActivityFlag is either "0" or "1"
+    term = Term(
+        reference=Reference(prefix=PREFIX_CATALOG, identifier=nlm_id, name=name),
+        type="Instance",
+    )
+    term.append_parent(JOURNAL_TERM)
+    for synonym in element.findall("Alias"):
+        # an empty <Alias/> element has no text and carries no synonym
+        if synonym.text:
+            term.append_synonym(synonym.text)
+    for issn_element in element.findall("Issn"):
+        issn = issn_element.text
+        if not issn:
+            continue
+        # the last character of an ISSN is a check character that may be "X", so a purely
+        # numeric test misses valid unhyphenated values such as 0001253X
+        has_x_check_character = len(issn) == 8 and issn[:7].isnumeric() and issn[7] in "Xx"
+        if issn.isnumeric() or has_x_check_character:
+            issn = issn[:4] + "-" + issn[4:]
+        # TODO include ISSN type, this is important
+        #  to determine a "canonical" one
+        term.append_xref(Reference(prefix="issn", identifier=issn))
+    if start_year := element.findtext("StartYear"):
+        if len(start_year) != 4:
+            tqdm.write(f"[{term.curie}] invalid start year: {start_year}")
+        else:
+            term.annotate_year(has_start_date, start_year)
+    if end_year := element.findtext("EndYear"):
+        if len(end_year) != 4:
+            tqdm.write(f"[{term.curie}] invalid end year: {end_year}")
+        else:
+            term.annotate_year(has_end_date, end_year)
+    # FIXME this whole thing needs reinvestigating
+    if publisher_reference := journal_id_to_publisher_key.get(term.identifier):
+        term.annotate_object(PUBLISHED_IN, publisher_reference.reference)
+    return term

pyobo/sources/npass.py CHANGED Viewed

@@ -6,7 +6,7 @@ from collections.abc import Iterable
 import pandas as pd
 from tqdm.auto import tqdm
-from ..struct import Obo, Reference, Synonym, Term
+from ..struct import Obo, Reference, Term
 from ..utils.path import ensure_df
 __all__ = [
@@ -32,11 +32,6 @@ class NPASSGetter(Obo):
         return iter_terms(force=force, version=self._version_or_raise)
-def get_obo(force: bool = False) -> Obo:
-    """Get NPASS as OBO."""
-    return NPASSGetter()
 def get_df(version: str, force: bool = False) -> pd.DataFrame:
     """Get the NPASS chemical nomenclature."""
     base_url = f"https://bidd.group/NPASS/downloadFiles/NPASSv{version}_download"
@@ -71,7 +66,10 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
         # TODO check that the first is always the parent compound?
         if pd.notna(pubchem_compound_ids):
             pubchem_compound_ids = [
-                yy.strip() for xx in pubchem_compound_ids.split(";") for yy in xx.strip().split(",")
+                zz
+                for xx in pubchem_compound_ids.split(";")
+                for yy in xx.strip().split(",")
+                if (zz := yy.strip())
             ]
             if len(pubchem_compound_ids) > 1:
                 logger.debug("multiple cids for %s: %s", identifier, pubchem_compound_ids)
@@ -82,7 +80,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
         for synonym in [iupac]:
             if pd.notna(synonym):
-                term.append_synonym(Synonym(name=synonym))
+                term.append_synonym(synonym)
         yield term

pyobo/sources/omim_ps.py CHANGED Viewed

@@ -11,7 +11,6 @@ __all__ = [
     "OMIMPSGetter",
 ]
 logger = logging.getLogger(__name__)
 PREFIX = "omim.ps"
 URL = "https://omim.org/phenotypicSeriesTitles/all"
@@ -25,8 +24,16 @@ class OMIMPSGetter(Obo):
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
         soup = get_soup(URL, user_agent="Mozilla/5.0")
-        rows = soup.find(id="mimContent").find("table").find("tbody").find_all("tr")
-        for row in rows:
+        content = soup.find(id="mimContent")
+        if content is None:
+            raise ValueError
+        table = content.find("table")  # type:ignore[attr-defined]
+        if table is None:
+            raise ValueError
+        tbody = table.find("tbody")
+        if tbody is None:
+            raise ValueError
+        for row in tbody.find_all("tr"):
             anchor = row.find("td").find("a")
             name = anchor.text.strip()
             identifier = anchor.attrs["href"][len("/phenotypicSeries/") :]

pyobo/sources/pathbank.py CHANGED Viewed

@@ -5,6 +5,7 @@ from __future__ import annotations
 import logging
 from collections import defaultdict
 from collections.abc import Iterable, Mapping
+from itertools import chain
 import pandas as pd
 from tqdm.auto import tqdm
@@ -77,11 +78,6 @@ class PathBankGetter(Obo):
         return iter_terms(force=force, version=self._version_or_raise)
-def get_obo(force: bool = False) -> Obo:
-    """Get PathBank as OBO."""
-    return PathBankGetter(force=force)
 def get_proteins_df(version: str, force: bool = False) -> pd.DataFrame:
     """Get the proteins dataframe."""
     proteins_df = ensure_df(
@@ -165,9 +161,9 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
             #  but there are weird parser errors
         )
         term.append_exact_match(Reference(prefix="smpdb", identifier=smpdb_id))
-        term.append_property(has_category, subject.lower().replace(" ", "_"))
-        term.extend_relationship(has_participant, smpdb_id_to_proteins[smpdb_id])
-        term.extend_relationship(has_participant, smpdb_id_to_metabolites[smpdb_id])
+        term.annotate_string(has_category, subject.lower().replace(" ", "_"))
+        for participant in chain(smpdb_id_to_proteins[smpdb_id], smpdb_id_to_metabolites[smpdb_id]):
+            term.append_relationship(has_participant, participant)
         yield term

pyobo/sources/pfam/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Resources from PFAM."""
+from .pfam import PfamGetter
+from .pfam_clan import PfamClanGetter
+__all__ = [
+    "PfamClanGetter",
+    "PfamGetter",
+]

pyobo/sources/{pfam.py → pfam/pfam.py} RENAMED Viewed

@@ -4,8 +4,8 @@ from collections.abc import Iterable
 import pandas as pd
-from ..struct import Obo, Reference, Term
-from ..utils.path import ensure_df
+from ...struct import Obo, Reference, Term
+from ...utils.path import ensure_df
 __all__ = [
     "PfamGetter",
@@ -47,11 +47,6 @@ class PfamGetter(Obo):
         return iter_terms(self._version_or_raise, force=force)
-def get_obo(force: bool = False) -> Obo:
-    """Get PFAM as OBO."""
-    return PfamGetter(force=force)
 def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     """Iterate PFAM terms."""
     df = get_pfam_clan_df(version=version, force=force)
@@ -67,4 +62,4 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
 if __name__ == "__main__":
-    get_obo().write_default()
+    PfamGetter.cli()

pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} RENAMED Viewed

@@ -5,7 +5,7 @@ from collections.abc import Iterable
 from tqdm.auto import tqdm
 from .pfam import get_pfam_clan_df
-from ..struct import Obo, Reference, Term
+from ...struct import Obo, Reference, Term
 __all__ = [
     "PfamClanGetter",
@@ -25,11 +25,6 @@ class PfamClanGetter(Obo):
         return iter_terms(version=self._version_or_raise, force=force)
-def get_obo(force: bool = False) -> Obo:
-    """Get PFAM Clans as OBO."""
-    return PfamClanGetter(force=force)
 # TODO could get definitions from ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam33.0/Pfam-C.gz
@@ -46,4 +41,4 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
 if __name__ == "__main__":
-    get_obo().write_default()
+    PfamClanGetter.cli()

pyobo 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl

pyobo 0.11.2py3-none-any.whl → 0.12.0py3-none-any.whl