PyPI - pyobo - Versions diffs - 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl - Mend

pyobo 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (227) hide show

pyobo/.DS_Store +0 -0
pyobo/__init__.py +95 -20
pyobo/__main__.py +0 -0
pyobo/api/__init__.py +81 -10
pyobo/api/alts.py +52 -42
pyobo/api/combine.py +39 -0
pyobo/api/edges.py +68 -0
pyobo/api/hierarchy.py +231 -203
pyobo/api/metadata.py +14 -19
pyobo/api/names.py +207 -127
pyobo/api/properties.py +117 -113
pyobo/api/relations.py +68 -94
pyobo/api/species.py +24 -21
pyobo/api/typedefs.py +11 -11
pyobo/api/utils.py +66 -13
pyobo/api/xrefs.py +108 -114
pyobo/cli/__init__.py +0 -0
pyobo/cli/cli.py +35 -50
pyobo/cli/database.py +183 -161
pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
pyobo/cli/lookup.py +163 -195
pyobo/cli/utils.py +19 -6
pyobo/constants.py +102 -3
pyobo/getters.py +196 -118
pyobo/gilda_utils.py +79 -200
pyobo/identifier_utils/__init__.py +41 -0
pyobo/identifier_utils/api.py +296 -0
pyobo/identifier_utils/model.py +130 -0
pyobo/identifier_utils/preprocessing.json +812 -0
pyobo/identifier_utils/preprocessing.py +61 -0
pyobo/identifier_utils/relations/__init__.py +8 -0
pyobo/identifier_utils/relations/api.py +162 -0
pyobo/identifier_utils/relations/data.json +5824 -0
pyobo/identifier_utils/relations/data_owl.json +57 -0
pyobo/identifier_utils/relations/data_rdf.json +1 -0
pyobo/identifier_utils/relations/data_rdfs.json +7 -0
pyobo/mocks.py +9 -6
pyobo/ner/__init__.py +9 -0
pyobo/ner/api.py +72 -0
pyobo/ner/normalizer.py +33 -0
pyobo/obographs.py +43 -39
pyobo/plugins.py +5 -4
pyobo/py.typed +0 -0
pyobo/reader.py +1358 -395
pyobo/reader_utils.py +155 -0
pyobo/resource_utils.py +42 -22
pyobo/resources/__init__.py +0 -0
pyobo/resources/goc.py +75 -0
pyobo/resources/goc.tsv +188 -0
pyobo/resources/ncbitaxon.py +4 -5
pyobo/resources/ncbitaxon.tsv.gz +0 -0
pyobo/resources/ro.py +3 -2
pyobo/resources/ro.tsv +0 -0
pyobo/resources/so.py +0 -0
pyobo/resources/so.tsv +0 -0
pyobo/sources/README.md +12 -8
pyobo/sources/__init__.py +52 -29
pyobo/sources/agrovoc.py +0 -0
pyobo/sources/antibodyregistry.py +11 -12
pyobo/sources/bigg/__init__.py +13 -0
pyobo/sources/bigg/bigg_compartment.py +81 -0
pyobo/sources/bigg/bigg_metabolite.py +229 -0
pyobo/sources/bigg/bigg_model.py +46 -0
pyobo/sources/bigg/bigg_reaction.py +77 -0
pyobo/sources/biogrid.py +1 -2
pyobo/sources/ccle.py +7 -12
pyobo/sources/cgnc.py +0 -5
pyobo/sources/chebi.py +1 -1
pyobo/sources/chembl/__init__.py +9 -0
pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
pyobo/sources/chembl/chembl_target.py +160 -0
pyobo/sources/civic_gene.py +55 -15
pyobo/sources/clinicaltrials.py +160 -0
pyobo/sources/complexportal.py +24 -24
pyobo/sources/conso.py +14 -22
pyobo/sources/cpt.py +0 -0
pyobo/sources/credit.py +1 -9
pyobo/sources/cvx.py +27 -5
pyobo/sources/depmap.py +9 -12
pyobo/sources/dictybase_gene.py +2 -7
pyobo/sources/drugbank/__init__.py +9 -0
pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
pyobo/sources/drugcentral.py +17 -13
pyobo/sources/expasy.py +31 -34
pyobo/sources/famplex.py +13 -18
pyobo/sources/flybase.py +3 -8
pyobo/sources/gard.py +62 -0
pyobo/sources/geonames/__init__.py +9 -0
pyobo/sources/geonames/features.py +28 -0
pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
pyobo/sources/geonames/utils.py +115 -0
pyobo/sources/gmt_utils.py +6 -7
pyobo/sources/go.py +20 -13
pyobo/sources/gtdb.py +154 -0
pyobo/sources/gwascentral/__init__.py +9 -0
pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
pyobo/sources/hgnc/__init__.py +9 -0
pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
pyobo/sources/icd/__init__.py +9 -0
pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
pyobo/sources/icd/icd11.py +148 -0
pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
pyobo/sources/interpro.py +4 -9
pyobo/sources/itis.py +0 -5
pyobo/sources/kegg/__init__.py +0 -0
pyobo/sources/kegg/api.py +16 -38
pyobo/sources/kegg/genes.py +9 -20
pyobo/sources/kegg/genome.py +1 -7
pyobo/sources/kegg/pathway.py +9 -21
pyobo/sources/mesh.py +58 -24
pyobo/sources/mgi.py +3 -10
pyobo/sources/mirbase/__init__.py +11 -0
pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
pyobo/sources/msigdb.py +74 -39
pyobo/sources/ncbi/__init__.py +9 -0
pyobo/sources/ncbi/ncbi_gc.py +162 -0
pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
pyobo/sources/nih_reporter.py +60 -0
pyobo/sources/nlm/__init__.py +9 -0
pyobo/sources/nlm/nlm_catalog.py +48 -0
pyobo/sources/nlm/nlm_publisher.py +36 -0
pyobo/sources/nlm/utils.py +116 -0
pyobo/sources/npass.py +6 -8
pyobo/sources/omim_ps.py +10 -3
pyobo/sources/pathbank.py +4 -8
pyobo/sources/pfam/__init__.py +9 -0
pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
pyobo/sources/pharmgkb/__init__.py +15 -0
pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
pyobo/sources/pharmgkb/utils.py +86 -0
pyobo/sources/pid.py +1 -6
pyobo/sources/pombase.py +6 -10
pyobo/sources/pubchem.py +4 -9
pyobo/sources/reactome.py +5 -11
pyobo/sources/rgd.py +11 -16
pyobo/sources/rhea.py +37 -36
pyobo/sources/ror.py +69 -42
pyobo/sources/selventa/__init__.py +0 -0
pyobo/sources/selventa/schem.py +4 -7
pyobo/sources/selventa/scomp.py +1 -6
pyobo/sources/selventa/sdis.py +4 -7
pyobo/sources/selventa/sfam.py +1 -6
pyobo/sources/sgd.py +6 -11
pyobo/sources/signor/__init__.py +7 -0
pyobo/sources/signor/download.py +41 -0
pyobo/sources/signor/signor_complexes.py +105 -0
pyobo/sources/slm.py +12 -15
pyobo/sources/umls/__init__.py +7 -1
pyobo/sources/umls/__main__.py +0 -0
pyobo/sources/umls/get_synonym_types.py +20 -4
pyobo/sources/umls/sty.py +57 -0
pyobo/sources/umls/synonym_types.tsv +1 -1
pyobo/sources/umls/umls.py +18 -22
pyobo/sources/unimod.py +46 -0
pyobo/sources/uniprot/__init__.py +1 -1
pyobo/sources/uniprot/uniprot.py +40 -32
pyobo/sources/uniprot/uniprot_ptm.py +4 -34
pyobo/sources/utils.py +3 -2
pyobo/sources/wikipathways.py +7 -10
pyobo/sources/zfin.py +5 -10
pyobo/ssg/__init__.py +12 -16
pyobo/ssg/base.html +0 -0
pyobo/ssg/index.html +26 -13
pyobo/ssg/term.html +12 -2
pyobo/ssg/typedef.html +0 -0
pyobo/struct/__init__.py +54 -8
pyobo/struct/functional/__init__.py +1 -0
pyobo/struct/functional/dsl.py +2572 -0
pyobo/struct/functional/macros.py +423 -0
pyobo/struct/functional/obo_to_functional.py +385 -0
pyobo/struct/functional/ontology.py +270 -0
pyobo/struct/functional/utils.py +112 -0
pyobo/struct/reference.py +331 -136
pyobo/struct/struct.py +1413 -643
pyobo/struct/struct_utils.py +1078 -0
pyobo/struct/typedef.py +162 -210
pyobo/struct/utils.py +12 -5
pyobo/struct/vocabulary.py +138 -0
pyobo/utils/__init__.py +0 -0
pyobo/utils/cache.py +13 -11
pyobo/utils/io.py +17 -31
pyobo/utils/iter.py +5 -5
pyobo/utils/misc.py +41 -53
pyobo/utils/ndex_utils.py +0 -0
pyobo/utils/path.py +76 -70
pyobo/version.py +3 -3
{pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/METADATA +228 -229
pyobo-0.12.0.dist-info/RECORD +202 -0
pyobo-0.12.0.dist-info/WHEEL +4 -0
{pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
pyobo-0.12.0.dist-info/licenses/LICENSE +21 -0
pyobo/aws.py +0 -162
pyobo/cli/aws.py +0 -47
pyobo/identifier_utils.py +0 -142
pyobo/normalizer.py +0 -232
pyobo/registries/__init__.py +0 -16
pyobo/registries/metaregistry.json +0 -507
pyobo/registries/metaregistry.py +0 -135
pyobo/sources/icd11.py +0 -105
pyobo/xrefdb/__init__.py +0 -1
pyobo/xrefdb/canonicalizer.py +0 -214
pyobo/xrefdb/priority.py +0 -59
pyobo/xrefdb/sources/__init__.py +0 -60
pyobo/xrefdb/sources/biomappings.py +0 -36
pyobo/xrefdb/sources/cbms2019.py +0 -91
pyobo/xrefdb/sources/chembl.py +0 -83
pyobo/xrefdb/sources/compath.py +0 -82
pyobo/xrefdb/sources/famplex.py +0 -64
pyobo/xrefdb/sources/gilda.py +0 -50
pyobo/xrefdb/sources/intact.py +0 -113
pyobo/xrefdb/sources/ncit.py +0 -133
pyobo/xrefdb/sources/pubchem.py +0 -27
pyobo/xrefdb/sources/wikidata.py +0 -116
pyobo-0.11.2.dist-info/RECORD +0 -157
pyobo-0.11.2.dist-info/WHEEL +0 -5
pyobo-0.11.2.dist-info/top_level.txt +0 -1

pyobo/gilda_utils.py CHANGED Viewed

@@ -3,63 +3,67 @@
 from __future__ import annotations
 import logging
-from collections.abc import Iterable
-from subprocess import CalledProcessError
+import warnings
+from collections.abc import Iterable, Sequence
+from typing import TYPE_CHECKING, Any, cast
 import bioregistry
-import gilda.api
-import gilda.term
-from gilda.grounder import Grounder
-from gilda.process import normalize
-from gilda.term import filter_out_duplicates
+import ssslm
+from ssslm import GildaGrounder, literal_mappings_to_gilda
 from tqdm.auto import tqdm
+from typing_extensions import Unpack
-from pyobo import (
-    get_descendants,
+from pyobo.api import (
     get_id_name_mapping,
-    get_id_species_mapping,
-    get_id_synonyms_mapping,
     get_ids,
-    get_obsolete,
+    get_literal_mappings,
+    get_literal_mappings_subset,
 )
-from pyobo.getters import NoBuildError
-from pyobo.utils.io import multidict
+from pyobo.constants import GetOntologyKwargs
+from pyobo.struct.reference import Reference
+if TYPE_CHECKING:
+    import gilda
 __all__ = [
-    "iter_gilda_prediction_tuples",
     "get_grounder",
-    "get_gilda_terms",
+    "iter_gilda_prediction_tuples",
 ]
 logger = logging.getLogger(__name__)
+# TODO the only place this is used is in Biomappings -
+#  might be better to directly move it there
 def iter_gilda_prediction_tuples(
     prefix: str,
     relation: str = "skos:exactMatch",
     *,
-    grounder: Grounder | None = None,
+    grounder: gilda.Grounder | None = None,
     identifiers_are_names: bool = False,
     strict: bool = False,
 ) -> Iterable[tuple[str, str, str, str, str, str, str, str, float]]:
     """Iterate over prediction tuples for a given prefix."""
     if grounder is None:
+        import gilda.api
         grounder = gilda.api.grounder
+    grounder_ = GildaGrounder(grounder)
     id_name_mapping = get_id_name_mapping(prefix, strict=strict)
     it = tqdm(
         id_name_mapping.items(), desc=f"[{prefix}] gilda tuples", unit_scale=True, unit="name"
     )
     for identifier, name in it:
-        for scored_match in grounder.ground(name):
-            target_prefix = scored_match.term.db.lower()
+        norm_identifier = _normalize_identifier(prefix, identifier)
+        for scored_match in grounder_.get_matches(name):
             yield (
                 prefix,
-                normalize_identifier(prefix, identifier),
+                norm_identifier,
                 name,
                 relation,
-                target_prefix,
-                normalize_identifier(target_prefix, scored_match.term.id),
-                scored_match.term.entry_name,
+                scored_match.prefix,
+                _normalize_identifier(scored_match.prefix, scored_match.identifier),
+                name,
                 "semapv:LexicalMatching",
                 round(scored_match.score, 3),
             )
@@ -67,22 +71,22 @@ def iter_gilda_prediction_tuples(
     if identifiers_are_names:
         it = tqdm(get_ids(prefix), desc=f"[{prefix}] gilda tuples", unit_scale=True, unit="id")
         for identifier in it:
-            for scored_match in grounder.ground(identifier):
-                target_prefix = scored_match.term.db.lower()
+            norm_identifier = _normalize_identifier(prefix, identifier)
+            for scored_match in grounder_.get_matches(identifier):
                 yield (
                     prefix,
-                    normalize_identifier(prefix, identifier),
+                    norm_identifier,
                     identifier,
                     relation,
-                    target_prefix,
-                    normalize_identifier(target_prefix, scored_match.term.id),
-                    scored_match.term.entry_name,
+                    scored_match.prefix,
+                    _normalize_identifier(scored_match.prefix, scored_match.identifier),
+                    identifier,
                     "semapv:LexicalMatching",
                     scored_match.score,
                 )
-def normalize_identifier(prefix: str, identifier: str) -> str:
+def _normalize_identifier(prefix: str, identifier: str) -> str:
     """Normalize the identifier."""
     resource = bioregistry.get_resource(prefix)
     if resource is None:
@@ -90,183 +94,58 @@ def normalize_identifier(prefix: str, identifier: str) -> str:
     return resource.miriam_standardize_identifier(identifier) or identifier
-def get_grounder(
-    prefixes: str | Iterable[str],
-    *,
-    unnamed: Iterable[str] | None = None,
-    grounder_cls: type[Grounder] | None = None,
-    versions: None | str | Iterable[str | None] | dict[str, str] = None,
-    strict: bool = True,
-    skip_obsolete: bool = False,
-    progress: bool = True,
-) -> Grounder:
-    """Get a Gilda grounder for the given prefix(es)."""
-    unnamed = set() if unnamed is None else set(unnamed)
-    if isinstance(prefixes, str):
-        prefixes = [prefixes]
-    else:
-        prefixes = list(prefixes)
-    if versions is None:
-        versions = [None] * len(prefixes)
-    elif isinstance(versions, str):
-        versions = [versions]
-    elif isinstance(versions, dict):
-        versions = [versions.get(prefix) for prefix in prefixes]
-    else:
-        versions = list(versions)
-    if len(prefixes) != len(versions):
-        raise ValueError
-    terms: list[gilda.term.Term] = []
-    for prefix, version in zip(tqdm(prefixes, leave=False, disable=not progress), versions):
-        try:
-            p_terms = list(
-                get_gilda_terms(
-                    prefix,
-                    identifiers_are_names=prefix in unnamed,
-                    version=version,
-                    strict=strict,
-                    skip_obsolete=skip_obsolete,
-                    progress=progress,
-                )
-            )
-        except (NoBuildError, CalledProcessError):
-            continue
-        else:
-            terms.extend(p_terms)
-    terms = filter_out_duplicates(terms)
-    terms_dict = multidict((term.norm_text, term) for term in terms)
-    if grounder_cls is None:
-        return Grounder(terms_dict)
-    else:
-        return grounder_cls(terms_dict)
+def normalize_identifier(prefix: str, identifier: str) -> str:
+    """Normalize the identifier (deprecated public wrapper).
+    Emits a :class:`DeprecationWarning` on every call and then delegates to
+    :func:`_normalize_identifier` with the same arguments.
+    :param prefix: The prefix of the resource the identifier belongs to
+    :param identifier: The local identifier to normalize
+    :returns: Whatever :func:`_normalize_identifier` returns for this pair
+    """
+    warnings.warn(
+        "normalization to MIRIAM is deprecated, please update to using Bioregistry standard identifiers",
+        DeprecationWarning,
+        # stacklevel=2 attributes the warning to the caller rather than to this wrapper
+        stacklevel=2,
+    )
+    return _normalize_identifier(prefix, identifier)
-def _fast_term(
-    *,
-    text: str,
-    prefix: str,
-    identifier: str,
-    name: str,
-    status: str,
-    organism: str | None = None,
-) -> gilda.term.Term | None:
-    try:
-        term = gilda.term.Term(
-            norm_text=normalize(text),
-            text=text,
-            db=prefix,
-            id=identifier,
-            entry_name=name,
-            status=status,
-            source=prefix,
-            organism=organism,
-        )
-    except ValueError:
-        return None
-    return term
+def get_grounder(*args: Any, **kwargs: Any) -> gilda.Grounder:
+    """Get a grounder (deprecated shim).
+    Emits a :class:`DeprecationWarning`, forwards all positional and keyword
+    arguments to ``pyobo.get_grounder``, and returns the object stored in the
+    result's private ``_grounder`` attribute.
+    """
+    warnings.warn("use pyobo.ner.get_grounder", DeprecationWarning, stacklevel=2)
+    # Imported inside the function rather than at module import time
+    import pyobo.ner
+    # NOTE(review): the warning points at ``pyobo.ner.get_grounder`` while the call below
+    #  goes through ``pyobo.get_grounder`` - presumably the same function re-exported; confirm
+    # ``cast`` only informs the type checker; no runtime type check happens here
+    grounder = cast(ssslm.ner.GildaGrounder, pyobo.get_grounder(*args, **kwargs))
+    # Relies on a private attribute of the wrapper object
+    return grounder._grounder
-def get_gilda_terms(
-    prefix: str,
-    *,
-    identifiers_are_names: bool = False,
-    version: str | None = None,
-    strict: bool = True,
-    skip_obsolete: bool = False,
-    progress: bool = True,
-) -> Iterable[gilda.term.Term]:
-    """Get gilda terms for the given namespace."""
-    id_to_name = get_id_name_mapping(prefix, version=version, strict=strict)
-    id_to_species = get_id_species_mapping(prefix, version=version, strict=strict)
-    obsoletes = get_obsolete(prefix, version=version, strict=strict) if skip_obsolete else set()
-    it = tqdm(
-        id_to_name.items(),
-        desc=f"[{prefix}] mapping",
-        unit_scale=True,
-        unit="name",
-        disable=not progress,
+def get_gilda_terms(prefix: str, *, skip_obsolete: bool = False, **kwargs) -> Iterable[gilda.Term]:
+    """Get gilda terms (deprecated).
+    Yields the result of passing ``get_literal_mappings(prefix, ...)`` through
+    ``literal_mappings_to_gilda``.
+    :param prefix: The prefix passed to ``get_literal_mappings``
+    :param skip_obsolete: Forwarded to ``get_literal_mappings``
+    :param kwargs: Additional keyword arguments forwarded to ``get_literal_mappings``
+    """
+    # This function is a generator, so the warning below is only emitted once
+    # iteration starts, not when the function is called
+    warnings.warn(
+        "use pyobo.get_literal_mappings() directly and convert to gilda yourself",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    yield from literal_mappings_to_gilda(
+        get_literal_mappings(prefix, skip_obsolete=skip_obsolete, **kwargs)
     )
-    for identifier, name in it:
-        if identifier in obsoletes:
-            continue
-        term = _fast_term(
-            text=name,
-            prefix=prefix,
-            identifier=identifier,
-            name=name,
-            status="name",
-            organism=id_to_species.get(identifier),
-        )
-        if term is not None:
-            yield term
-    id_to_synonyms = get_id_synonyms_mapping(prefix, version=version)
-    if id_to_synonyms:
-        it = tqdm(
-            id_to_synonyms.items(),
-            desc=f"[{prefix}] mapping",
-            unit_scale=True,
-            unit="synonym",
-            disable=not progress,
-        )
-        for identifier, synonyms in it:
-            if identifier in obsoletes:
-                continue
-            name = id_to_name[identifier]
-            for synonym in synonyms:
-                if not synonym:
-                    continue
-                term = _fast_term(
-                    text=synonym,
-                    prefix=prefix,
-                    identifier=identifier,
-                    name=name,
-                    status="synonym",
-                    organism=id_to_species.get(identifier),
-                )
-                if term is not None:
-                    yield term
-    if identifiers_are_names:
-        it = tqdm(
-            get_ids(prefix),
-            desc=f"[{prefix}] mapping",
-            unit_scale=True,
-            unit="id",
-            disable=not progress,
-        )
-        for identifier in it:
-            if identifier in obsoletes:
-                continue
-            term = _fast_term(
-                text=identifier,
-                prefix=prefix,
-                identifier=identifier,
-                name=identifier,
-                status="name",
-                organism=id_to_species.get(identifier),
-            )
-            if term is not None:
-                yield term
 def get_gilda_term_subset(
-    source: str, ancestors: str | list[str], **kwargs
-) -> Iterable[gilda.term.Term]:
+    source: str,
+    ancestors: str | Sequence[str],
+    *,
+    skip_obsolete: bool = False,
+    **kwargs: Unpack[GetOntologyKwargs],
+) -> Iterable[gilda.Term]:
     """Get a subset of terms."""
-    subset = {
-        descendant
-        for parent_curie in _ensure_list(ancestors)
-        for descendant in get_descendants(*parent_curie.split(":")) or []
-    }
-    for term in get_gilda_terms(source, **kwargs):
-        if bioregistry.curie_to_str(term.db, term.id) in subset:
-            yield term
-def _ensure_list(s: str | list[str]) -> list[str]:
-    if isinstance(s, str):
-        return [s]
-    return s
+    # Deprecated: this function is a generator, so the warning below is only emitted
+    # once iteration starts, not when the function is called
+    warnings.warn(
+        "use pyobo.get_literal_mappings_subset() directly and convert to gilda yourself",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    # Accept a single CURIE string as shorthand for a one-element sequence
+    if isinstance(ancestors, str):
+        ancestors = [ancestors]
+    yield from literal_mappings_to_gilda(
+        get_literal_mappings_subset(
+            source,
+            # Each ancestor CURIE string is converted into a Reference before the lookup
+            ancestors=[Reference.from_curie(a) for a in ancestors],
+            skip_obsolete=skip_obsolete,
+            **kwargs,
+        )
+    )

pyobo/identifier_utils/__init__.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""Extract registry information."""
+from .api import (
+    BlacklistedError,
+    DefaultCoercionError,
+    EmptyStringError,
+    NotCURIEError,
+    ParseError,
+    ParseValidationError,
+    UnparsableIRIError,
+    UnregisteredPrefixError,
+    _is_valid_identifier,
+    _parse_str_or_curie_or_uri_helper,
+    standardize_ec,
+    wrap_norm_prefix,
+)
+from .preprocessing import (
+    remap_full,
+    remap_prefix,
+    str_is_blacklisted,
+)
+from .relations import ground_relation
+__all__ = [
+    "BlacklistedError",
+    "DefaultCoercionError",
+    "EmptyStringError",
+    "NotCURIEError",
+    "ParseError",
+    "ParseValidationError",
+    "UnparsableIRIError",
+    "UnregisteredPrefixError",
+    "_is_valid_identifier",
+    "_parse_str_or_curie_or_uri_helper",
+    "ground_relation",
+    "remap_full",
+    "remap_prefix",
+    "standardize_ec",
+    "str_is_blacklisted",
+    "wrap_norm_prefix",
+]

pyobo/identifier_utils/api.py ADDED Viewed

@@ -0,0 +1,296 @@
+"""Utilities for handling prefixes."""
+from __future__ import annotations
+import logging
+from functools import wraps
+from typing import Annotated, ClassVar
+import bioregistry
+import click
+from bioregistry import NormalizedNamableReference as Reference
+from bioregistry.constants import FailureReturnType
+from curies import ReferenceTuple
+from pydantic import ValidationError
+from typing_extensions import Doc
+from .preprocessing import remap_full, remap_prefix, str_is_blacklisted
+from .relations import ground_relation
+__all__ = [
+    "BlacklistedError",
+    "DefaultCoercionError",
+    "EmptyStringError",
+    "NotCURIEError",
+    "ParseError",
+    "ParseValidationError",
+    "UnparsableIRIError",
+    "UnregisteredPrefixError",
+    "_parse_str_or_curie_or_uri_helper",
+    "standardize_ec",
+    "wrap_norm_prefix",
+]
+logger = logging.getLogger(__name__)
+class BlacklistedError(ValueError):
+    """A sentinel for blacklisted strings.
+    It derives from :class:`ValueError` rather than :class:`ParseError` and carries
+    no context. :func:`_parse_str_or_curie_or_uri_helper` returns an instance of it
+    (instead of raising) when ``str_is_blacklisted`` matches the input.
+    """
+# Type alias used for the ``line`` argument of ParseError; the Doc metadata carries its description
+Line = Annotated[str | None, Doc("""The OBO line where the parsing happened""")]
+class ParseError(BaseException):
+    """Base class for problems found while parsing a string, CURIE, or URI.
+    Subclasses set :attr:`message`, which :meth:`__str__` places in front of the
+    offending string when rendering the error.
+    """
+    # NOTE(review): this derives from BaseException rather than Exception, so a plain
+    #  ``except Exception`` will not catch it if it is raised - confirm this is intended
+    # Short description of the problem, defined by each concrete subclass
+    message: ClassVar[str]
+    def __init__(
+        self,
+        curie: str,
+        *,
+        context: str | None,
+        ontology_prefix: str | None = None,
+        node: Reference | None = None,
+        predicate: Reference | None = None,
+        line: Line = None,
+    ) -> None:
+        """Initialize the error.
+        :param curie: The string that could not be parsed
+        :param context: Text appended to the message as "in <context>"; this keyword
+            is required but may be None
+        :param ontology_prefix: Used to label the message when no node is given
+        :param node: Used to label the message with the node's CURIE
+        :param predicate: Used together with ``node`` to label the message
+        :param line: The OBO line where the parsing happened
+        """
+        self.curie = curie
+        self.context = context
+        self.ontology_prefix = ontology_prefix
+        self.node = node
+        self.predicate = predicate
+        self.line = line
+    def __str__(self) -> str:
+        """Render the error as a single-line message styled with :func:`click.style`."""
+        s = ""
+        # Lead with the most specific label available: node (plus predicate), else ontology prefix
+        if self.node:
+            if self.predicate:
+                s += f"[{self.node.curie} - {self.predicate.curie}] "
+            else:
+                s += f"[{self.node.curie}] "
+        elif self.ontology_prefix:
+            s += f"[{self.ontology_prefix}] "
+        s += f"{self.message} {click.style(self.curie, fg='cyan')}"
+        if self.context:
+            s += f" in {self.context}"
+        # The line is omitted when it is identical to the offending string, to avoid repeating it
+        if self.line and self.line != self.curie:
+            s += f" in {click.style(self.line, fg='yellow')}"
+        return s
+class ParseValidationError(ParseError):
+    """Raised on a validation error.
+    In addition to the :class:`ParseError` fields, it keeps the underlying Pydantic
+    :class:`ValidationError` in :attr:`exc`.
+    """
+    message = "failed Pydantic validation"
+    def __init__(self, *args, exc: ValidationError, **kwargs) -> None:
+        """Initialize the error.
+        :param args: Positional arguments forwarded to :class:`ParseError`
+        :param exc: The validation error that was caught (required keyword)
+        :param kwargs: Keyword arguments forwarded to :class:`ParseError`
+        """
+        super().__init__(*args, **kwargs)
+        self.exc = exc
+class UnregisteredPrefixError(ParseError):
+    """Raised on a missing prefix.
+    :func:`_parse_str_or_curie_or_uri_helper` produces it when
+    ``bioregistry.normalize_prefix`` returns nothing for the text before the first colon.
+    """
+    message = "unregistered prefix in"
+class UnparsableIRIError(ParseError):
+    """Raised on an unparsable IRI.
+    :func:`_parse_str_or_curie_or_uri_helper` produces it when the input starts with
+    ``http:`` or ``https:`` but ``bioregistry.parse_iri`` returns None for it.
+    """
+    message = "couldn't parse IRI"
+class EmptyStringError(ParseError):
+    """Raised on an empty string.
+    :func:`_parse_str_or_curie_or_uri_helper` produces it when nothing is left of the
+    input after ``_preclean_uri`` has been applied.
+    """
+    message = "is empty"
+class NotCURIEError(ParseError):
+    """Raised on a text that can't be parsed as a CURIE.
+    :func:`_parse_str_or_curie_or_uri_helper` produces it when the input is not a
+    URI and contains no ``:`` delimiter.
+    """
+    message = "not a CURIE"
+class DefaultCoercionError(ParseError):
+    """Raised on a text that can't be coerced into a default reference."""
+    # Not produced by any code visible in this module; presumably used by callers elsewhere
+    message = "can't be coerced into a default reference"
+def _is_uri(s: str) -> bool:
+    """Check if the string starts with the ``http:`` or ``https:`` scheme (case-sensitive)."""
+    return s.startswith("http:") or s.startswith("https:")
+def _preclean_uri(s: str) -> str:
+    """Strip known leading markers, unescape the scheme colon, and drop trailing slashes.
+    :param s: The raw string
+    :returns: The cleaned string, which may be empty
+    """
+    # Leading "url"/"uri" markers with a backslash-escaped colon, lower case then upper case
+    s = s.strip().removeprefix(r"url\:").removeprefix(r"uri\:")
+    s = s.strip().removeprefix(r"URL\:").removeprefix(r"URI\:")
+    # The same markers with a plain colon
+    s = s.strip().removeprefix("url:").removeprefix("uri:")
+    s = s.removeprefix("URL:").removeprefix("URI:")
+    s = s.removeprefix("WWW:").removeprefix("www:").lstrip()
+    # Turn a backslash-escaped scheme colon into a plain one, anywhere in the string
+    s = s.replace("http\\:", "http:")
+    s = s.replace("https\\:", "https:")
+    # Removes every trailing slash, whether or not the string is a URI
+    s = s.rstrip("/")
+    return s
+def _parse_str_or_curie_or_uri_helper(
+    str_or_curie_or_uri: str,
+    *,
+    ontology_prefix: str | None = None,
+    node: Reference | None = None,
+    predicate: Reference | None = None,
+    upgrade: bool = True,
+    line: str | None = None,
+    name: str | None = None,
+    context: str | None = None,
+) -> Reference | ParseError | BlacklistedError:
+    """Parse a string, CURIE, or URI into a reference.
+    Problems are reported by *returning* an error instance instead of raising it, so
+    callers need to check the type of the result.
+    :param str_or_curie_or_uri: A string that might be a compact uniform resource
+        identifier (CURIE) or an HTTP(S) URI
+    :param ontology_prefix: The ontology in which the CURIE appears
+    :param node: Stored on a returned error and used to label its message
+    :param predicate: Stored on a returned error and used to label its message
+    :param upgrade: If true, try ``remap_full``, ``remap_prefix``, and
+        ``ground_relation`` before any other parsing
+    :param line: The OBO line where the parsing happened; stored on a returned error
+    :param name: Passed as ``name`` when a new reference is constructed
+    :param context: Stored on a returned error and appended to its message
+    :returns: A reference if parsing succeeded, a :class:`BlacklistedError` instance if
+        ``str_is_blacklisted`` matched, or otherwise an instance of a
+        :class:`ParseError` subclass describing why parsing failed
+    - Normalizes the namespace
+    - Checks against a blacklist for the entire curie, for the namespace, and for
+      suffixes.
+    """
+    str_or_curie_or_uri = _preclean_uri(str_or_curie_or_uri)
+    if not str_or_curie_or_uri:
+        return EmptyStringError(
+            str_or_curie_or_uri,
+            ontology_prefix=ontology_prefix,
+            node=node,
+            predicate=predicate,
+            line=line,
+            context=context,
+        )
+    if upgrade:
+        # Remap the curie with the full list
+        if r1 := remap_full(str_or_curie_or_uri, ontology_prefix=ontology_prefix):
+            return r1
+        # Remap node's prefix (if necessary)
+        str_or_curie_or_uri = remap_prefix(str_or_curie_or_uri, ontology_prefix=ontology_prefix)
+        if r2 := ground_relation(str_or_curie_or_uri):
+            return r2
+    # The blacklist is checked after the remapping above, on the possibly rewritten string
+    if str_is_blacklisted(str_or_curie_or_uri, ontology_prefix=ontology_prefix):
+        return BlacklistedError()
+    # URI branch: every path inside this block returns
+    if _is_uri(str_or_curie_or_uri):
+        rt = bioregistry.parse_iri(
+            str_or_curie_or_uri, on_failure_return_type=FailureReturnType.single
+        )
+        if rt is None:
+            return UnparsableIRIError(
+                str_or_curie_or_uri,
+                ontology_prefix=ontology_prefix,
+                node=node,
+                predicate=predicate,
+                line=line,
+                context=context,
+            )
+        try:
+            rv = Reference.model_validate(
+                {"prefix": rt.prefix, "identifier": rt.identifier, "name": name}
+            )
+        except ValidationError as exc:
+            return ParseValidationError(
+                str_or_curie_or_uri,
+                ontology_prefix=ontology_prefix,
+                node=node,
+                predicate=predicate,
+                line=line,
+                context=context,
+                exc=exc,
+            )
+        else:
+            return rv
+    # CURIE branch: split on the first colon only, so the identifier may itself contain colons
+    prefix, delimiter, identifier = str_or_curie_or_uri.partition(":")
+    if not delimiter:
+        return NotCURIEError(
+            str_or_curie_or_uri,
+            ontology_prefix=ontology_prefix,
+            node=node,
+            predicate=predicate,
+            line=line,
+            context=context,
+        )
+    norm_node_prefix = bioregistry.normalize_prefix(prefix)
+    if not norm_node_prefix:
+        return UnregisteredPrefixError(
+            str_or_curie_or_uri,
+            ontology_prefix=ontology_prefix,
+            node=node,
+            predicate=predicate,
+            line=line,
+            context=context,
+        )
+    identifier = bioregistry.standardize_identifier(norm_node_prefix, identifier)
+    try:
+        rv = Reference.model_validate(
+            {"prefix": norm_node_prefix, "identifier": identifier, "name": name}
+        )
+    except ValidationError as exc:
+        return ParseValidationError(
+            str_or_curie_or_uri,
+            ontology_prefix=ontology_prefix,
+            node=node,
+            predicate=predicate,
+            line=line,
+            exc=exc,
+            context=context,
+        )
+    else:
+        return rv
+def wrap_norm_prefix(f):
+    """Decorate a function whose first argument is a prefix or reference to auto-normalize the prefix.
+    The first positional argument may be a string prefix, a ``Reference``, or a
+    ``ReferenceTuple``. Its prefix is passed through ``bioregistry.normalize_prefix``
+    and ``f`` is then called with the normalized value in its place.
+    :param f: The function to wrap; all remaining positional and keyword arguments
+        are forwarded to it unchanged
+    :returns: The wrapped function
+    :raises ValueError: (from the wrapped function) if the prefix can't be normalized
+    :raises TypeError: (from the wrapped function) if the first argument is none of
+        the supported types
+    """
+    @wraps(f)
+    def _wrapped(prefix: str | Reference | ReferenceTuple, *args, **kwargs):
+        if isinstance(prefix, str):
+            norm_prefix = bioregistry.normalize_prefix(prefix)
+            if norm_prefix is None:
+                raise ValueError(f"Invalid prefix: {prefix}")
+            prefix = norm_prefix
+        elif isinstance(prefix, Reference):
+            norm_prefix = bioregistry.normalize_prefix(prefix.prefix)
+            if norm_prefix is None:
+                raise ValueError(f"Invalid prefix: {prefix.prefix}")
+            # Only the prefix and identifier are passed to the rebuilt reference
+            prefix = Reference(prefix=norm_prefix, identifier=prefix.identifier)
+        elif isinstance(prefix, ReferenceTuple):
+            norm_prefix = bioregistry.normalize_prefix(prefix.prefix)
+            if norm_prefix is None:
+                raise ValueError(f"Invalid prefix: {prefix.prefix}")
+            prefix = ReferenceTuple(norm_prefix, prefix.identifier)
+        else:
+            raise TypeError
+        return f(prefix, *args, **kwargs)
+    return _wrapped
+def standardize_ec(ec: str) -> str:
+    """Standardize an EC code identifier by removing all trailing dashes and dots.
+    Surrounding whitespace and all space characters inside the string are removed first.
+    :param ec: The EC code to standardize
+    :returns: The standardized EC code
+    """
+    ec = ec.strip().replace(" ", "")
+    # Each pass removes one trailing run of dashes and then one trailing run of dots;
+    # presumably four passes are enough because EC codes have at most four levels - confirm
+    for _ in range(4):
+        ec = ec.rstrip("-").rstrip(".")
+    return ec
+def _is_valid_identifier(curie_or_uri: str) -> bool:
+    """Check that the string is not blank and contains no space character."""
+    # TODO this needs more careful implementation
+    return bool(curie_or_uri.strip()) and " " not in curie_or_uri

pyobo 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl

pyobo 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl