pyobo 0.12.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/.DS_Store +0 -0
- pyobo/api/properties.py +8 -12
- pyobo/api/xrefs.py +1 -2
- pyobo/cli/database.py +30 -2
- pyobo/cli/database_utils.py +5 -11
- pyobo/getters.py +18 -78
- pyobo/gilda_utils.py +3 -80
- pyobo/identifier_utils/__init__.py +2 -10
- pyobo/identifier_utils/api.py +21 -12
- pyobo/identifier_utils/preprocessing.json +74 -13
- pyobo/identifier_utils/preprocessing.py +5 -39
- pyobo/obographs.py +5 -1
- pyobo/reader.py +13 -17
- pyobo/sources/cgnc.py +9 -1
- pyobo/sources/flybase.py +5 -5
- pyobo/sources/omim_ps.py +4 -4
- pyobo/sources/pharmgkb/pharmgkb_gene.py +1 -1
- pyobo/struct/functional/ontology.py +3 -1
- pyobo/struct/reference.py +4 -4
- pyobo/struct/struct.py +112 -55
- pyobo/utils/cache.py +3 -4
- pyobo/utils/io.py +38 -14
- pyobo/utils/path.py +16 -19
- pyobo/version.py +1 -1
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/METADATA +71 -110
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/RECORD +29 -30
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/WHEEL +1 -1
- pyobo/identifier_utils/model.py +0 -130
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/entry_points.txt +0 -0
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/licenses/LICENSE +0 -0
pyobo/.DS_Store
CHANGED

Binary file
pyobo/api/properties.py
CHANGED

@@ -113,18 +113,14 @@ def get_properties_df(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> pd.Da
     :param prefix: the resource to load
     :returns: A dataframe with the properties
     """
-
-
-
-
-
-    )
-
-
-        use_tqdm=check_should_use_tqdm(kwargs)
-    )
-
-    return _df_getter()
+    df1 = get_literal_properties_df(prefix, **kwargs)
+    df2 = get_object_properties_df(prefix, **kwargs)
+    df = pd.concat([df1[["source", "predicate", "target"]], df2])
+    ll = len(prefix) + 1
+    df[f"{prefix}_id"] = df["source"].map(lambda x: x[ll:])
+    df = df.rename(columns={"predicate": "property", "target": "value"})
+    del df["source"]
+    return df[[f"{prefix}_id", "property", "value"]]
 
 
 @wrap_norm_prefix
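Note: `get_properties_df` no longer maintains its own TSV cache; it now concatenates the literal- and object-property dataframes and reshapes them. A minimal sketch of the resulting frame, assuming the top-level re-export and using `chebi` only as an example prefix:

```python
import pyobo

# One three-column frame: <prefix>_id, property, value
df = pyobo.get_properties_df("chebi")
print(df.columns.tolist())  # ['chebi_id', 'property', 'value']
```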
pyobo/api/xrefs.py
CHANGED

@@ -81,8 +81,7 @@ get_xrefs = get_filtered_xrefs
 def get_xrefs_df(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> pd.DataFrame:
     """Get all xrefs."""
     warnings.warn(
-
-        f"Not using cache artifact path to {CacheArtifact.xrefs}",
+        "use pyobo.get_mappings_df instead of pyobo.get_xrefs_df.",
         DeprecationWarning,
         stacklevel=2,
     )
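Note: the deprecation message now points at a concrete replacement. Migrating is a one-line change; `doid` is an illustrative prefix:

```python
import pyobo

df = pyobo.get_mappings_df("doid")  # replaces pyobo.get_xrefs_df("doid")
```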
pyobo/cli/database.py
CHANGED

@@ -2,8 +2,10 @@
 
 import logging
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
 
+import bioregistry
 import click
 from more_click import verbose_option
 from tqdm.contrib.logging import logging_redirect_tqdm

@@ -11,11 +13,11 @@ from typing_extensions import Unpack
 from zenodo_client import update_zenodo
 
 from .database_utils import (
+    IterHelperHelperDict,
     _iter_alts,
     _iter_definitions,
     _iter_edges,
     _iter_mappings,
-    _iter_metadata,
     _iter_names,
     _iter_properties,
     _iter_relations,

@@ -23,6 +25,7 @@ from .database_utils import (
     _iter_synonyms,
     _iter_typedefs,
     _iter_xrefs,
+    iter_helper_helper,
 )
 from .utils import (
     Clickable,

@@ -44,12 +47,14 @@ from ..constants import (
     TYPEDEFS_RECORD,
     DatabaseKwargs,
 )
-from ..getters import db_output_helper
+from ..getters import db_output_helper, get_ontology
 
 __all__ = [
     "main",
 ]
 
+logger = logging.getLogger(__name__)
+
 
 @click.group(name="database")
 def main():

@@ -129,9 +134,32 @@ def build(ctx: click.Context, **kwargs: Unpack[DatabaseKwargs]) -> None:
     ctx.invoke(species, **updated_kwargs)
 
 
+@database_annotate
+def cache(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
+    """Cache all things."""
+    if zenodo:
+        click.echo("no zenodo for caching")
+
+    kwargs["force_process"] = True
+    with logging_redirect_tqdm():
+        for _ in iter_helper_helper(get_ontology, **kwargs):
+            # this pass is intentional, to consume the iterable
+            pass
+
+
 @database_annotate
 def metadata(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
     """Make the prefix-metadata dump."""
+    from ..api import get_metadata
+
+    def _iter_metadata(
+        **kwargs: Unpack[IterHelperHelperDict],
+    ) -> Iterable[tuple[str, str, str, bool]]:
+        for prefix, data in iter_helper_helper(get_metadata, **kwargs):
+            version = data["version"]
+            logger.debug(f"[{prefix}] using version {version}")
+            yield prefix, version, data["date"], bioregistry.is_deprecated(prefix)
+
     it = _iter_metadata(**kwargs)
     db_output_helper(
         it,
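Note: the new `cache` subcommand forces reprocessing by draining the ontology iterator. A sketch of the same effect from Python, assuming `force_process` is passed through the shared kwargs exactly as the command sets it:

```python
from pyobo.getters import get_ontology, iter_helper_helper

# Consuming the iterator triggers parsing and caching;
# the yielded ontologies themselves are discarded.
for _ in iter_helper_helper(get_ontology, force_process=True):
    pass
```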
pyobo/cli/database_utils.py
CHANGED

@@ -9,7 +9,6 @@ from collections.abc import Iterable
 from functools import partial
 from typing import cast
 
-import bioregistry
 from tqdm.auto import tqdm
 from typing_extensions import Unpack
 

@@ -21,7 +20,6 @@ from ..api import (
     get_id_synonyms_mapping,
     get_id_to_alts,
     get_mappings_df,
-    get_metadata,
     get_properties_df,
     get_relations_df,
     get_typedef_df,

@@ -40,19 +38,12 @@ def _iter_ncbigene(left: int, right: int) -> Iterable[tuple[str, str, str]]:
     with gzip.open(ncbi_path, "rt") as file:
         next(file)  # throw away the header
         for line in tqdm(
-            file, desc=f"
+            file, desc=f"[{ncbigene.PREFIX}] extracting names", unit_scale=True, total=56_700_000
         ):
             parts = line.strip().split("\t")
             yield ncbigene.PREFIX, parts[left], parts[right]
 
 
-def _iter_metadata(**kwargs: Unpack[IterHelperHelperDict]):
-    for prefix, data in iter_helper_helper(get_metadata, **kwargs):
-        version = data["version"]
-        logger.debug(f"[{prefix}] using version {version}")
-        yield prefix, version, data["date"], bioregistry.is_deprecated(prefix)
-
-
 def _iter_names(leave: bool = False, **kwargs) -> Iterable[tuple[str, str, str]]:
     """Iterate over all prefix-identifier-name triples we can get.
 

@@ -60,11 +51,14 @@ def _iter_names(leave: bool = False, **kwargs) -> Iterable[tuple[str, str, str]]
     """
     yield from iter_helper(get_id_name_mapping, leave=leave, **kwargs)
     yield from _iter_ncbigene(1, 2)
+    yield from _iter_pubchem_compound()
+
 
+def _iter_pubchem_compound():
     pcc_path = pubchem._ensure_cid_name_path()
     with gzip.open(pcc_path, mode="rt", encoding="ISO-8859-1") as file:
         for line in tqdm(
-            file, desc=f"
+            file, desc=f"[{pubchem.PREFIX}] extracting names", unit_scale=True, total=119_000_000
         ):
             identifier, name = line.strip().split("\t", 1)
             yield pubchem.PREFIX, identifier, name
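Note: `_iter_metadata` moved up into `pyobo/cli/database.py` (see above), and the inline PubChem loop was extracted into `_iter_pubchem_compound`. Both name iterators share the same streaming pattern, sketched here with a hypothetical path argument:

```python
import gzip

from tqdm.auto import tqdm


def iter_names(path: str, prefix: str, total: int | None = None):
    """Stream (prefix, identifier, name) rows from a gzipped TSV."""
    with gzip.open(path, mode="rt", encoding="ISO-8859-1") as file:
        for line in tqdm(file, desc=f"[{prefix}] extracting names", unit_scale=True, total=total):
            identifier, name = line.strip().split("\t", 1)
            yield prefix, identifier, name
```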
pyobo/getters.py
CHANGED

@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import datetime
-import gzip
 import json
 import logging
 import pathlib

@@ -16,7 +15,7 @@ from collections import Counter
 from collections.abc import Callable, Iterable, Mapping, Sequence
 from pathlib import Path
 from textwrap import indent
-from typing import TypeVar
+from typing import Any, TypeVar
 
 import bioregistry
 import click

@@ -27,6 +26,7 @@ from tqdm.auto import tqdm
 from typing_extensions import Unpack
 
 from .constants import (
+    BUILD_SUBDIRECTORY_NAME,
     DATABASE_DIRECTORY,
     GetOntologyKwargs,
     IterHelperHelperDict,

@@ -36,7 +36,7 @@ from .identifier_utils import ParseError, wrap_norm_prefix
 from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
 from .reader import from_obo_path, from_obonet
 from .struct import Obo
-from .utils.io import
+from .utils.io import safe_open_writer
 from .utils.path import ensure_path, prefix_directory_join
 from .version import get_git_hash, get_version
 

@@ -119,19 +119,21 @@ def get_ontology(
         logger.info("UBERON has so much garbage in it that defaulting to non-strict parsing")
         strict = False
 
-    if
+    if force_process:
+        obonet_json_gz_path = None
+    elif not cache:
         logger.debug("[%s] caching was turned off, so dont look for an obonet file", prefix)
         obonet_json_gz_path = None
     else:
         obonet_json_gz_path = prefix_directory_join(
-            prefix, name=f"{prefix}.obonet.json.gz",
+            prefix, BUILD_SUBDIRECTORY_NAME, name=f"{prefix}.obonet.json.gz", version=version
         )
         logger.debug(
             "[%s] caching is turned on, so look for an obonet file at %s",
             prefix,
             obonet_json_gz_path,
         )
-        if obonet_json_gz_path.
+        if obonet_json_gz_path.is_file() and not force:
            from .utils.cache import get_gzipped_graph
 
             logger.debug("[%s] using obonet cache at %s", prefix, obonet_json_gz_path)
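Note: `get_ontology` now bypasses the obonet cache when `force_process` is set, and the cached artifact moves into a versioned build subdirectory. A sketch of the new location, with an illustrative prefix and version:

```python
from pyobo.constants import BUILD_SUBDIRECTORY_NAME
from pyobo.utils.path import prefix_directory_join

path = prefix_directory_join(
    "go", BUILD_SUBDIRECTORY_NAME, name="go.obonet.json.gz", version="2024-01-01"
)
```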
@@ -198,64 +200,6 @@ def _ensure_ontology_path(
|
|
|
198
200
|
return None, None
|
|
199
201
|
|
|
200
202
|
|
|
201
|
-
#: Obonet/Pronto can't parse these (consider converting to OBO with ROBOT?)
|
|
202
|
-
CANT_PARSE = {
|
|
203
|
-
"agro",
|
|
204
|
-
"aro",
|
|
205
|
-
"bco",
|
|
206
|
-
"caro",
|
|
207
|
-
"cco",
|
|
208
|
-
"chmo",
|
|
209
|
-
"cido",
|
|
210
|
-
"covoc",
|
|
211
|
-
"cto",
|
|
212
|
-
"cvdo",
|
|
213
|
-
"dicom",
|
|
214
|
-
"dinto",
|
|
215
|
-
"emap",
|
|
216
|
-
"epso",
|
|
217
|
-
"eupath",
|
|
218
|
-
"fbbi",
|
|
219
|
-
"fma",
|
|
220
|
-
"fobi",
|
|
221
|
-
"foodon",
|
|
222
|
-
"genepio",
|
|
223
|
-
"hancestro",
|
|
224
|
-
"hom",
|
|
225
|
-
"hso",
|
|
226
|
-
"htn", # Unknown string format: creation: 16MAY2017
|
|
227
|
-
"ico",
|
|
228
|
-
"idocovid19",
|
|
229
|
-
"labo",
|
|
230
|
-
"mamo",
|
|
231
|
-
"mfmo",
|
|
232
|
-
"mfo",
|
|
233
|
-
"mfomd",
|
|
234
|
-
"miapa",
|
|
235
|
-
"mo",
|
|
236
|
-
"oae",
|
|
237
|
-
"ogms", # Unknown string format: creation: 16MAY2017
|
|
238
|
-
"ohd",
|
|
239
|
-
"ons",
|
|
240
|
-
"oostt",
|
|
241
|
-
"opmi",
|
|
242
|
-
"ornaseq",
|
|
243
|
-
"orth",
|
|
244
|
-
"pdro",
|
|
245
|
-
"probonto",
|
|
246
|
-
"psdo",
|
|
247
|
-
"reo",
|
|
248
|
-
"rex",
|
|
249
|
-
"rnao",
|
|
250
|
-
"sepio",
|
|
251
|
-
"sio",
|
|
252
|
-
"spd",
|
|
253
|
-
"sweetrealm",
|
|
254
|
-
"txpo",
|
|
255
|
-
"vido",
|
|
256
|
-
"vt",
|
|
257
|
-
"xl",
|
|
258
|
-
}
|
|
259
203
|
SKIP = {
|
|
260
204
|
"ncbigene": "too big, refs acquired from other dbs",
|
|
261
205
|
"pubchem.compound": "top big, can't deal with this now",
|
|
@@ -276,11 +220,12 @@ SKIP = {
|
|
|
276
220
|
"kegg.genes": "needs fix", # FIXME
|
|
277
221
|
"kegg.genome": "needs fix", # FIXME
|
|
278
222
|
"kegg.pathway": "needs fix", # FIXME
|
|
279
|
-
"ensemblglossary": "
|
|
223
|
+
"ensemblglossary": "URI is self-referential to data in OLS, extract from there",
|
|
280
224
|
"epio": "content from fraunhofer is unreliable",
|
|
281
225
|
"epso": "content from fraunhofer is unreliable",
|
|
282
226
|
"gwascentral.phenotype": "website is down? or API changed?", # FIXME
|
|
283
227
|
"gwascentral.study": "website is down? or API changed?", # FIXME
|
|
228
|
+
"snomedct": "dead source",
|
|
284
229
|
}
|
|
285
230
|
|
|
286
231
|
X = TypeVar("X")
|
|
@@ -412,7 +357,7 @@ def iter_helper_helper(
|
|
|
412
357
|
except ValueError as e:
|
|
413
358
|
if _is_xml(e):
|
|
414
359
|
# this means that it tried doing parsing on an xml page
|
|
415
|
-
logger.
|
|
360
|
+
logger.warning(
|
|
416
361
|
"no resource available for %s. See http://www.obofoundry.org/ontology/%s",
|
|
417
362
|
prefix,
|
|
418
363
|
prefix,
|
|
@@ -452,7 +397,7 @@ def _prep_dir(directory: None | str | pathlib.Path) -> pathlib.Path:
|
|
|
452
397
|
|
|
453
398
|
|
|
454
399
|
def db_output_helper(
|
|
455
|
-
it: Iterable[tuple[
|
|
400
|
+
it: Iterable[tuple[Any, ...]],
|
|
456
401
|
db_name: str,
|
|
457
402
|
columns: Sequence[str],
|
|
458
403
|
*,
|
|
@@ -497,13 +442,10 @@ def db_output_helper(
|
|
|
497
442
|
logger.info("writing %s to %s", db_name, db_path)
|
|
498
443
|
logger.info("writing %s sample to %s", db_name, db_sample_path)
|
|
499
444
|
sample_rows = []
|
|
500
|
-
with gzip.open(db_path, mode="wt") if use_gzip else open(db_path, "w") as gzipped_file:
|
|
501
|
-
writer = get_writer(gzipped_file)
|
|
502
445
|
|
|
446
|
+
with safe_open_writer(db_path) as writer:
|
|
503
447
|
# for the first 10 rows, put it in a sample file too
|
|
504
|
-
with
|
|
505
|
-
sample_writer = get_writer(sample_file)
|
|
506
|
-
|
|
448
|
+
with safe_open_writer(db_sample_path) as sample_writer:
|
|
507
449
|
# write header
|
|
508
450
|
writer.writerow(columns)
|
|
509
451
|
sample_writer.writerow(columns)
|
|
@@ -523,15 +465,13 @@ def db_output_helper(
|
|
|
523
465
|
c_detailed[tuple(row[i] for i in summary_detailed)] += 1
|
|
524
466
|
writer.writerow(row)
|
|
525
467
|
|
|
526
|
-
with
|
|
527
|
-
|
|
528
|
-
writer.writerows(c.most_common())
|
|
468
|
+
with safe_open_writer(db_summary_path) as summary_writer:
|
|
469
|
+
summary_writer.writerows(c.most_common())
|
|
529
470
|
|
|
530
471
|
if summary_detailed is not None:
|
|
531
472
|
logger.info(f"writing {db_name} detailed summary to {db_summary_detailed_path}")
|
|
532
|
-
with
|
|
533
|
-
|
|
534
|
-
writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
|
|
473
|
+
with safe_open_writer(db_summary_detailed_path) as detailed_summary_writer:
|
|
474
|
+
detailed_summary_writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
|
|
535
475
|
rv.append(("Summary (Detailed)", db_summary_detailed_path))
|
|
536
476
|
|
|
537
477
|
with open(db_metadata_path, "w") as file:
|
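Note: the hand-rolled gzip-plus-CSV-writer blocks in `db_output_helper` collapse onto `safe_open_writer` from `pyobo.utils.io`. A minimal sketch of the pattern; the filename is illustrative and gzip handling is assumed to be inferred from the suffix:

```python
from pyobo.utils.io import safe_open_writer

with safe_open_writer("names.tsv.gz") as writer:
    writer.writerow(("prefix", "identifier", "name"))
```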
pyobo/gilda_utils.py
CHANGED

@@ -2,20 +2,15 @@
 
 from __future__ import annotations
 
-import logging
 import warnings
 from collections.abc import Iterable, Sequence
 from typing import TYPE_CHECKING, Any, cast
 
-import bioregistry
 import ssslm
-from ssslm import
-from tqdm.auto import tqdm
+from ssslm import literal_mappings_to_gilda
 from typing_extensions import Unpack
 
 from pyobo.api import (
-    get_id_name_mapping,
-    get_ids,
     get_literal_mappings,
     get_literal_mappings_subset,
 )

@@ -26,83 +21,11 @@ if TYPE_CHECKING:
     import gilda
 
 __all__ = [
+    "get_gilda_term_subset",
+    "get_gilda_terms",
     "get_grounder",
-    "iter_gilda_prediction_tuples",
 ]
 
-logger = logging.getLogger(__name__)
-
-
-# TODO the only place this is used is in Biomappings -
-# might be better to directly move it there
-def iter_gilda_prediction_tuples(
-    prefix: str,
-    relation: str = "skos:exactMatch",
-    *,
-    grounder: gilda.Grounder | None = None,
-    identifiers_are_names: bool = False,
-    strict: bool = False,
-) -> Iterable[tuple[str, str, str, str, str, str, str, str, float]]:
-    """Iterate over prediction tuples for a given prefix."""
-    if grounder is None:
-        import gilda.api
-
-        grounder = gilda.api.grounder
-    grounder_ = GildaGrounder(grounder)
-    id_name_mapping = get_id_name_mapping(prefix, strict=strict)
-    it = tqdm(
-        id_name_mapping.items(), desc=f"[{prefix}] gilda tuples", unit_scale=True, unit="name"
-    )
-    for identifier, name in it:
-        norm_identifier = _normalize_identifier(prefix, identifier)
-        for scored_match in grounder_.get_matches(name):
-            yield (
-                prefix,
-                norm_identifier,
-                name,
-                relation,
-                scored_match.prefix,
-                _normalize_identifier(scored_match.prefix, scored_match.identifier),
-                name,
-                "semapv:LexicalMatching",
-                round(scored_match.score, 3),
-            )
-
-    if identifiers_are_names:
-        it = tqdm(get_ids(prefix), desc=f"[{prefix}] gilda tuples", unit_scale=True, unit="id")
-        for identifier in it:
-            norm_identifier = _normalize_identifier(prefix, identifier)
-            for scored_match in grounder_.get_matches(identifier):
-                yield (
-                    prefix,
-                    norm_identifier,
-                    identifier,
-                    relation,
-                    scored_match.prefix,
-                    _normalize_identifier(scored_match.prefix, scored_match.identifier),
-                    identifier,
-                    "semapv:LexicalMatching",
-                    scored_match.score,
-                )
-
-
-def _normalize_identifier(prefix: str, identifier: str) -> str:
-    """Normalize the identifier."""
-    resource = bioregistry.get_resource(prefix)
-    if resource is None:
-        raise KeyError
-    return resource.miriam_standardize_identifier(identifier) or identifier
-
-
-def normalize_identifier(prefix: str, identifier: str) -> str:
-    """Normalize the identifier."""
-    warnings.warn(
-        "normalization to MIRIAM is deprecated, please update to using Bioregistry standard identifiers",
-        DeprecationWarning,
-        stacklevel=2,
-    )
-    return _normalize_identifier(prefix, identifier)
-
 
 def get_grounder(*args: Any, **kwargs: Any) -> gilda.Grounder:
     """Get a grounder."""
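Note: `iter_gilda_prediction_tuples` and the MIRIAM-normalization helpers are gone; lexical matching now flows through literal mappings and `ssslm`. A sketch of the surviving entry point, assuming gilda's standard `ScoredMatch` fields and using illustrative inputs:

```python
from pyobo.gilda_utils import get_grounder

grounder = get_grounder("mesh")
for match in grounder.ground("diabetes"):
    print(match.term.db, match.term.id, match.score)
```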
pyobo/identifier_utils/__init__.py
CHANGED

@@ -1,7 +1,6 @@
 """Extract registry information."""
 
 from .api import (
-    BlacklistedError,
     DefaultCoercionError,
     EmptyStringError,
     NotCURIEError,

@@ -14,15 +13,10 @@ from .api import (
     standardize_ec,
     wrap_norm_prefix,
 )
-from .preprocessing import
-    remap_full,
-    remap_prefix,
-    str_is_blacklisted,
-)
+from .preprocessing import get_rules
 from .relations import ground_relation
 
 __all__ = [
-    "BlacklistedError",
     "DefaultCoercionError",
     "EmptyStringError",
     "NotCURIEError",

@@ -32,10 +26,8 @@ __all__ = [
     "UnregisteredPrefixError",
     "_is_valid_identifier",
     "_parse_str_or_curie_or_uri_helper",
+    "get_rules",
     "ground_relation",
-    "remap_full",
-    "remap_prefix",
     "standardize_ec",
-    "str_is_blacklisted",
     "wrap_norm_prefix",
 ]
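Note: the module-level `remap_full`, `remap_prefix`, and `str_is_blacklisted` helpers are replaced by a single rules object. A sketch of the new access pattern, using only the calls that appear in this diff (inputs are illustrative):

```python
from pyobo.identifier_utils import get_rules

rules = get_rules()
s = rules.remap_prefix("GO_0008150", context="go")
if rules.str_is_blocked(s, context="go"):
    print("blocked")
```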
pyobo/identifier_utils/api.py
CHANGED

@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import logging
-from functools import wraps
+from functools import lru_cache, wraps
 from typing import Annotated, ClassVar
 
 import bioregistry

@@ -11,14 +11,14 @@ import click
 from bioregistry import NormalizedNamableReference as Reference
 from bioregistry.constants import FailureReturnType
 from curies import ReferenceTuple
+from curies.preprocessing import BlocklistError, PreprocessingConverter
 from pydantic import ValidationError
 from typing_extensions import Doc
 
-from .preprocessing import
+from .preprocessing import get_rules
 from .relations import ground_relation
 
 __all__ = [
-    "BlacklistedError",
     "DefaultCoercionError",
     "EmptyStringError",
     "NotCURIEError",

@@ -34,10 +34,6 @@ __all__ = [
 logger = logging.getLogger(__name__)
 
 
-class BlacklistedError(ValueError):
-    """A sentinel for blacklisted strings."""
-
-
 Line = Annotated[str | None, Doc("""The OBO line where the parsing happened""")]
 
 

@@ -138,6 +134,15 @@ def _preclean_uri(s: str) -> str:
     return s
 
 
+@lru_cache(1)
+def _get_converter() -> PreprocessingConverter:
+    return PreprocessingConverter(
+        converter=bioregistry.manager.converter,
+        rules=get_rules(),
+        preclean=_preclean_uri,
+    )
+
+
 def _parse_str_or_curie_or_uri_helper(
     str_or_curie_or_uri: str,
     *,

@@ -148,7 +153,7 @@ def _parse_str_or_curie_or_uri_helper(
     line: str | None = None,
     name: str | None = None,
     context: str | None = None,
-) -> Reference | ParseError |
+) -> Reference | ParseError | BlocklistError:
     """Parse a string that looks like a CURIE.
 
     :param str_or_curie_or_uri: A compact uniform resource identifier (CURIE)

@@ -171,19 +176,23 @@ def _parse_str_or_curie_or_uri_helper(
         context=context,
     )
 
+    rules = get_rules()
+
     if upgrade:
         # Remap the curie with the full list
-        if r1 := remap_full(
+        if r1 := rules.remap_full(
+            str_or_curie_or_uri, reference_cls=Reference, context=ontology_prefix
+        ):
             return r1
 
         # Remap node's prefix (if necessary)
-        str_or_curie_or_uri = remap_prefix(str_or_curie_or_uri,
+        str_or_curie_or_uri = rules.remap_prefix(str_or_curie_or_uri, context=ontology_prefix)
 
     if r2 := ground_relation(str_or_curie_or_uri):
         return r2
 
-    if
-    return
+    if rules.str_is_blocked(str_or_curie_or_uri, context=ontology_prefix):
+        return BlocklistError()
 
     if _is_uri(str_or_curie_or_uri):
         rt = bioregistry.parse_iri(