pyobo 0.12.9__py3-none-any.whl → 0.12.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/__init__.py +6 -0
- pyobo/api/__init__.py +11 -1
- pyobo/api/alts.py +18 -4
- pyobo/api/embedding.py +108 -9
- pyobo/api/names.py +28 -6
- pyobo/api/xrefs.py +26 -1
- pyobo/constants.py +38 -2
- pyobo/getters.py +8 -3
- pyobo/ner/api.py +14 -10
- pyobo/ner/scispacy_utils.py +15 -21
- pyobo/sources/__init__.py +2 -0
- pyobo/sources/antibodyregistry.py +3 -3
- pyobo/sources/bigg/bigg_compartment.py +1 -1
- pyobo/sources/complexportal.py +3 -3
- pyobo/sources/conso.py +3 -3
- pyobo/sources/famplex.py +3 -3
- pyobo/sources/goldbook.py +86 -0
- pyobo/sources/hgnc/hgnc.py +157 -96
- pyobo/sources/hgnc/hgncgenefamily.py +14 -13
- pyobo/sources/msigdb.py +3 -3
- pyobo/sources/omim_ps.py +8 -2
- pyobo/sources/reactome.py +3 -3
- pyobo/sources/rgd.py +7 -11
- pyobo/sources/slm.py +3 -3
- pyobo/sources/uniprot/uniprot.py +3 -3
- pyobo/sources/wikipathways.py +7 -2
- pyobo/struct/__init__.py +2 -2
- pyobo/struct/functional/macros.py +1 -1
- pyobo/struct/functional/obo_to_functional.py +7 -3
- pyobo/struct/obo/reader.py +1 -1
- pyobo/struct/struct.py +88 -18
- pyobo/struct/struct_utils.py +19 -5
- pyobo/struct/typedef.py +16 -3
- pyobo/struct/vocabulary.py +4 -3
- pyobo/utils/path.py +5 -4
- pyobo/version.py +1 -1
- {pyobo-0.12.9.dist-info → pyobo-0.12.11.dist-info}/METADATA +8 -1
- {pyobo-0.12.9.dist-info → pyobo-0.12.11.dist-info}/RECORD +41 -40
- {pyobo-0.12.9.dist-info → pyobo-0.12.11.dist-info}/WHEEL +0 -0
- {pyobo-0.12.9.dist-info → pyobo-0.12.11.dist-info}/entry_points.txt +0 -0
- {pyobo-0.12.9.dist-info → pyobo-0.12.11.dist-info}/licenses/LICENSE +0 -0
pyobo/__init__.py
CHANGED
@@ -14,6 +14,7 @@ from .api import (
     get_filtered_relations_df,
     get_filtered_xrefs,
     get_graph,
+    get_graph_embeddings_df,
     get_hierarchy,
     get_id_definition_mapping,
     get_id_multirelations_mapping,
@@ -37,6 +38,7 @@ from .api import (
     get_obsolete,
     get_primary_curie,
     get_primary_identifier,
+    get_primary_reference,
     get_properties,
     get_properties_df,
     get_property,
@@ -44,6 +46,7 @@ from .api import (
     get_relation,
     get_relation_mapping,
     get_relations_df,
+    get_semantic_mappings,
     get_species,
     get_sssom_df,
     get_subhierarchy,
@@ -114,6 +117,7 @@ __all__ = [
     "get_filtered_relations_df",
     "get_filtered_xrefs",
     "get_graph",
+    "get_graph_embeddings_df",
     "get_grounder",
     "get_hierarchy",
     "get_id_definition_mapping",
@@ -139,6 +143,7 @@ __all__ = [
     "get_ontology",
     "get_primary_curie",
     "get_primary_identifier",
+    "get_primary_reference",
     "get_properties",
     "get_properties_df",
     "get_property",
@@ -149,6 +154,7 @@ __all__ = [
     "get_scispacy_entities",
     "get_scispacy_entity_linker",
     "get_scispacy_knowledgebase",
+    "get_semantic_mappings",
     "get_species",
     "get_sssom_df",
     "get_subhierarchy",
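The net effect of this file's changes is that three new functions are re-exported at the top level. A minimal usage sketch, assuming only the signatures visible in the per-module diffs below; the prefixes and identifiers are hypothetical example inputs:

import pyobo

# resolve a possibly-secondary identifier to its primary reference
ref = pyobo.get_primary_reference("hgnc", "5")

# train graph embeddings over an ontology's edge list (see pyobo/api/embedding.py below)
embeddings_df = pyobo.get_graph_embeddings_df("doid", method="pykeen")

# load SSSOM mappings as structured objects (see pyobo/api/xrefs.py below)
mappings = pyobo.get_semantic_mappings("doid")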
pyobo/api/__init__.py
CHANGED
@@ -5,10 +5,16 @@ from .alts import (
     get_id_to_alts,
     get_primary_curie,
     get_primary_identifier,
+    get_primary_reference,
 )
 from .combine import get_literal_mappings_subset
 from .edges import get_edges, get_edges_df, get_graph
-from .embedding import
+from .embedding import (
+    get_graph_embeddings_df,
+    get_text_embedding,
+    get_text_embedding_similarity,
+    get_text_embeddings_df,
+)
 from .hierarchy import (
     get_ancestors,
     get_children,
@@ -59,6 +65,7 @@ from .typedefs import get_typedef_df
 from .xrefs import (
     get_filtered_xrefs,
     get_mappings_df,
+    get_semantic_mappings,
     get_sssom_df,
     get_xref,
     get_xrefs,
@@ -80,6 +87,7 @@ __all__ = [
     "get_filtered_relations_df",
     "get_filtered_xrefs",
     "get_graph",
+    "get_graph_embeddings_df",
     "get_hierarchy",
     "get_id_definition_mapping",
     "get_id_multirelations_mapping",
@@ -105,6 +113,7 @@ __all__ = [
     "get_ontology",
     "get_primary_curie",
     "get_primary_identifier",
+    "get_primary_reference",
     "get_priority_curie",
     "get_properties",
     "get_properties_df",
@@ -113,6 +122,7 @@ __all__ = [
     "get_relation",
     "get_relation_mapping",
     "get_relations_df",
+    "get_semantic_mappings",
     "get_species",
     "get_sssom_df",
     "get_subhierarchy",
pyobo/api/alts.py
CHANGED
@@ -20,6 +20,7 @@ __all__ = [
     "get_id_to_alts",
     "get_primary_curie",
     "get_primary_identifier",
+    "get_primary_reference",
 ]

 logger = logging.getLogger(__name__)
@@ -61,13 +62,13 @@ def get_alts_to_id(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> Mapping[
     }


-def
+def get_primary_reference(
     prefix: str | curies.Reference | curies.ReferenceTuple,
     identifier: str | None = None,
     /,
     **kwargs: Unpack[GetOntologyKwargs],
-) ->
+) -> curies.ReferenceTuple | None:
-    """Get the primary
+    """Get the primary reference for an entity."""
     reference = _get_pi(prefix, identifier)
     try:
         primary_identifier = get_primary_identifier(reference, **kwargs)
@@ -76,7 +77,20 @@ def get_primary_curie(
             raise
         # this happens on invalid prefix. maybe revise?
         return None
-    return
+    return curies.ReferenceTuple(reference.prefix, primary_identifier)
+
+
+def get_primary_curie(
+    prefix: str | curies.Reference | curies.ReferenceTuple,
+    identifier: str | None = None,
+    /,
+    **kwargs: Unpack[GetOntologyKwargs],
+) -> str | None:
+    """Get the primary curie for an entity."""
+    reference = get_primary_reference(prefix, identifier, **kwargs)
+    if reference is None:
+        return None
+    return reference.curie


 def get_primary_identifier(
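get_primary_curie is now a thin wrapper around the new get_primary_reference, which returns a structured curies.ReferenceTuple instead of a CURIE string. A sketch of the relationship, with a hypothetical HGNC identifier:

from pyobo import get_primary_curie, get_primary_reference

ref = get_primary_reference("hgnc", "5")  # -> curies.ReferenceTuple or None
curie = get_primary_curie("hgnc", "5")  # -> a CURIE string or None
assert (curie is None) == (ref is None)
if ref is not None:
    assert curie == ref.curie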
pyobo/api/embedding.py
CHANGED
@@ -2,18 +2,29 @@

 from __future__ import annotations

-
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal

+import bioregistry
 import curies
 import numpy as np
 import pandas as pd
+from tqdm import tqdm
+from typing_extensions import Unpack

-from pyobo.api.
+from pyobo.api.edges import get_edges_df
+from pyobo.api.names import get_definition, get_id_name_mapping, get_name
+from pyobo.api.utils import get_version_from_kwargs
+from pyobo.constants import GetOntologyKwargs, check_should_force
+from pyobo.identifier_utils import wrap_norm_prefix
+from pyobo.utils.path import CacheArtifact, get_cache_path

 if TYPE_CHECKING:
     import sentence_transformers

 __all__ = [
+    "get_graph_embeddings_df",
     "get_text_embedding",
     "get_text_embedding_model",
     "get_text_embedding_similarity",
@@ -31,38 +42,126 @@ def get_text_embedding_model() -> sentence_transformers.SentenceTransformer:

 def _get_text(
     reference: str | curies.Reference | curies.ReferenceTuple,
+    /,
+    *,
+    name: str | None = None,
+    **kwargs: Unpack[GetOntologyKwargs],
 ) -> str | None:
-    name
+    if name is None:
+        name = get_name(reference, **kwargs)
     if name is None:
         return None
-    description = get_definition(reference)
+    description = get_definition(reference, **kwargs)
     if description:
         name += " " + description
     return name


+def get_graph_embeddings_df(
+    prefix: str,
+    *,
+    method: Literal["pykeen", "grape"] | None = None,
+    epochs: int = 30,
+    dimension: int = 32,
+    **kwargs: Unpack[GetOntologyKwargs],
+) -> pd.DataFrame:
+    """Get graph machine learning embeddings."""
+    if method == "pykeen" or method is None:
+        from pykeen.models import PairRE
+        from pykeen.training import SLCWATrainingLoop
+        from pykeen.triples import TriplesFactory
+        from torch.optim import Adam
+
+        triples_df = get_edges_df(prefix, **kwargs)
+        training = TriplesFactory.from_labeled_triples(triples_df.values)
+        model = PairRE(triples_factory=training, embedding_dim=dimension)
+        optimizer = Adam(params=model.get_grad_params())
+        training_loop = SLCWATrainingLoop(
+            model=model, triples_factory=training, optimizer=optimizer
+        )
+        # can also set batch size here
+        training_loop.train(triples_factory=training, num_epochs=epochs)
+        embeddings = model.entity_representations[0]()
+        df = pd.DataFrame(
+            embeddings.detach().numpy(),
+            index=[training.entity_id_to_label[i] for i in range(embeddings.shape[0])],
+        )
+
+    elif method == "grape":
+        from ensmallen import Graph
+
+        edges_df = get_edges_df(prefix, **kwargs)
+        with tempfile.TemporaryDirectory() as d:
+            path = Path(d).joinpath("test.tsv")
+            edges_df[[":START_ID", ":END_ID"]].to_csv(path, header=None, sep="\t", index=False)
+            graph = Graph.from_csv(
+                edge_path=str(path),
+                edge_list_separator="\t",
+                sources_column_number=0,
+                destinations_column_number=1,
+                edge_list_numeric_node_ids=False,
+                directed=True,
+                name=bioregistry.get_name(prefix, strict=True),
+                verbose=True,
+            )
+        graph = graph.remove_disconnected_nodes()
+
+        from embiggen.embedders.ensmallen_embedders.second_order_line import (
+            SecondOrderLINEEnsmallen,
+        )
+
+        embedding = SecondOrderLINEEnsmallen(embedding_size=dimension, epochs=epochs).fit_transform(
+            graph
+        )
+        df = embedding.get_all_node_embedding()[0].sort_index()
+        # df.columns = [str(c) for c in df.columns]
+    else:
+        raise ValueError(f"invalid graph machine learning method: {method}")
+
+    df.index.name = "curie"
+    return df
+
+
+@wrap_norm_prefix
 def get_text_embeddings_df(
     prefix: str,
     *,
     model: sentence_transformers.SentenceTransformer | None = None,
+    **kwargs: Unpack[GetOntologyKwargs],
 ) -> pd.DataFrame:
     """Get embeddings for all entities in the resource.

     :param prefix: A reference, either as a string or Reference object
     :param model: A sentence transformer model. Defaults to ``all-MiniLM-L6-v2`` if not
         given.
+    :param kwargs: The keyword arguments to forward to ontology getter functions for
+        names, definitions, and version
+
+    :returns: A pandas dataframe with an index representing local unique identifiers and
+        columns for the values of the model returned vectors
     """
+    path = get_cache_path(
+        prefix, CacheArtifact.embeddings, version=get_version_from_kwargs(prefix, kwargs)
+    )
+    if path.is_file() and not check_should_force(kwargs):
+        df = pd.read_csv(path, sep="\t").set_index(0)
+        return df
+
+    id_to_name = get_id_name_mapping(prefix, **kwargs)
+
     luids, texts = [], []
-    for
-        text = _get_text(
+    for identifier, name in tqdm(id_to_name.items(), desc=f"[{prefix}] constructing text"):
+        text = _get_text(curies.ReferenceTuple(prefix, identifier), name=name, **kwargs)
         if text is None:
             continue
-        luids.append(
+        luids.append(identifier)
         texts.append(text)
     if model is None:
         model = get_text_embedding_model()
-    res = model.encode(texts)
-
+    res = model.encode(texts, show_progress_bar=True)
+    df = pd.DataFrame(res, index=luids)
+    df.to_csv(path, sep="\t")  # index is important here!
+    return df


 def get_text_embedding(
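A usage sketch for the two functions above; "doid" is an arbitrary example prefix, and the pykeen branch assumes the optional pykeen and torch dependencies are installed:

from pyobo import get_graph_embeddings_df, get_text_embeddings_df

# 32-dimensional PairRE embeddings trained for 30 epochs over the edge list
graph_df = get_graph_embeddings_df("doid", method="pykeen", dimension=32, epochs=30)

# sentence-transformer embeddings of "name + definition" texts, now written to and
# reloaded from the version-scoped CacheArtifact.embeddings cache path
text_df = get_text_embeddings_df("doid")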
pyobo/api/names.py
CHANGED
@@ -6,7 +6,7 @@ import logging
 import subprocess
 from collections.abc import Callable, Mapping
 from functools import lru_cache
-from typing import
+from typing import TypeVar

 import curies
 import pandas as pd
@@ -49,9 +49,15 @@ __all__ = [
 logger = logging.getLogger(__name__)


-def get_name_by_curie(
+def get_name_by_curie(
+    curie: str,
+    /,
+    *,
+    upgrade_identifier: bool | None = None,
+    **kwargs: Unpack[GetOntologyKwargs],
+) -> str | None:
     """Get the name for a CURIE, if possible."""
-    return get_name(curie, **kwargs)
+    return get_name(curie, upgrade_identifier=upgrade_identifier, **kwargs)


 X = TypeVar("X")
@@ -63,6 +69,8 @@ NO_BUILD_LOGGED: set = set()
 def _help_get(
     f: Callable[[str, Unpack[GetOntologyKwargs]], Mapping[str, X]],
     reference: Reference,
+    *,
+    upgrade_identifier: bool | None = None,
     **kwargs: Unpack[GetOntologyKwargs],
 ) -> X | None:
     """Get the result for an entity based on a mapping maker function ``f``."""
@@ -87,19 +95,32 @@ def _help_get(
         NO_BUILD_PREFIXES.add(reference.prefix)
         return None

-
-
+    if upgrade_identifier is None:
+        if reference.identifier in mapping:
+            return mapping[reference.identifier]
+        else:
+            primary_id = get_primary_identifier(reference, **kwargs)
+            return mapping.get(primary_id)
+    elif upgrade_identifier is True:
+        primary_id = get_primary_identifier(reference, **kwargs)
+        return mapping.get(primary_id)
+    else:
+        return mapping.get(reference.identifier)


 def get_name(
     prefix: str | curies.Reference | curies.ReferenceTuple,
     identifier: str | None = None,
     /,
+    *,
+    upgrade_identifier: bool | None = None,
     **kwargs: Unpack[GetOntologyKwargs],
 ) -> str | None:
     """Get the name for an entity."""
     reference = _get_pi(prefix, identifier)
-    return _help_get(
+    return _help_get(
+        get_id_name_mapping, reference, upgrade_identifier=upgrade_identifier, **kwargs
+    )


 @lru_cache
@@ -325,6 +346,7 @@ def get_literal_mappings(
     return rv


+@wrap_norm_prefix
 def get_literal_mappings_df(
     prefix: str,
     **kwargs: Unpack[GetOntologyKwargs],
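The new upgrade_identifier flag gives name lookups three behaviors, as implemented in _help_get above. A sketch with a hypothetical identifier:

from pyobo import get_name

# None (default): try the identifier as given, then fall back to its primary form
name = get_name("hgnc", "1234")

# True: always upgrade to the primary identifier before looking up
name = get_name("hgnc", "1234", upgrade_identifier=True)

# False: look up the identifier exactly as given, never upgrading
name = get_name("hgnc", "1234", upgrade_identifier=False)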
pyobo/api/xrefs.py
CHANGED
@@ -5,8 +5,11 @@ import warnings
 from collections.abc import Mapping
 from functools import lru_cache

+import curies
 import pandas as pd
 from curies import ReferenceTuple
+from sssom_pydantic import SemanticMapping
+from sssom_pydantic.io import parse_record, parse_row
 from typing_extensions import Unpack

 from .utils import get_version_from_kwargs
@@ -19,7 +22,7 @@ from ..constants import (
     check_should_use_tqdm,
 )
 from ..getters import get_ontology
-from ..identifier_utils import wrap_norm_prefix
+from ..identifier_utils import get_converter, wrap_norm_prefix
 from ..struct import Obo
 from ..utils.cache import cached_df
 from ..utils.path import CacheArtifact, get_cache_path
@@ -27,6 +30,7 @@ from ..utils.path import CacheArtifact, get_cache_path
 __all__ = [
     "get_filtered_xrefs",
     "get_mappings_df",
+    "get_semantic_mappings",
     "get_sssom_df",
     "get_xref",
     "get_xrefs",
@@ -107,6 +111,27 @@ def get_sssom_df(
     return get_mappings_df(prefix=prefix, names=names, **kwargs)


+def get_semantic_mappings(
+    prefix: str,
+    converter: curies.Converter | None = None,
+    names: bool = True,
+    include_mapping_source_column: bool = False,
+    **kwargs: Unpack[GetOntologyKwargs],
+) -> list[SemanticMapping]:
+    """Get semantic mapping objects."""
+    df = get_mappings_df(
+        prefix, names=names, include_mapping_source_column=include_mapping_source_column, **kwargs
+    )
+    if converter is None:
+        converter = get_converter()
+    rv = []
+    for _, row in df.iterrows():
+        record = parse_row(row.to_dict())
+        mapping = parse_record(record, converter=converter)
+        rv.append(mapping)
+    return rv
+
+
 def get_mappings_df(
     prefix: str | Obo,
     *,
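Where get_mappings_df and get_sssom_df return a flat SSSOM dataframe, the new get_semantic_mappings parses each row into a structured object. A sketch, with "doid" as an arbitrary example prefix:

from pyobo import get_semantic_mappings

# each element is an sssom_pydantic.SemanticMapping (a pydantic model),
# parsed against the converter rather than left as raw CURIE strings
mappings = get_semantic_mappings("doid")
print(len(mappings), mappings[0])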
pyobo/constants.py
CHANGED
@@ -14,8 +14,21 @@ from typing_extensions import NotRequired, TypedDict

 __all__ = [
     "DATABASE_DIRECTORY",
+    "DEFAULT_PREFIX_MAP",
+    "ONTOLOGY_GETTERS",
+    "PROVENANCE_PREFIXES",
     "RAW_DIRECTORY",
     "SPECIES_REMAPPING",
+    "DatabaseKwargs",
+    "GetOntologyKwargs",
+    "IterHelperHelperDict",
+    "LookupKwargs",
+    "OntologyFormat",
+    "OntologyPathPack",
+    "SlimGetOntologyKwargs",
+    "check_should_cache",
+    "check_should_force",
+    "check_should_use_tqdm",
 ]

 logger = logging.getLogger(__name__)
@@ -96,6 +109,8 @@ SPECIES_FILE = "species.tsv.gz"

 NCBITAXON_PREFIX = "ncbitaxon"
 DATE_FORMAT = "%d:%m:%Y %H:%M"
+
+#: Prefixes for resources that are considered as provenance
 PROVENANCE_PREFIXES = {
     "pubmed",
     "pmc",
@@ -117,13 +132,21 @@ PROVENANCE_PREFIXES = {
 class DatabaseKwargs(TypedDict):
     """Keyword arguments for database CLI functions."""

+    #: Should strict identifier parsing be enabled?
     strict: bool
+    #: Should re-download and re-processing be forced?
     force: bool
+    #: Should re-processing be forced?
     force_process: bool
-
+
+    #: Should a progress bar be used?
+    use_tqdm: bool
+    #: Skip all prefixes lexicographically sorted below the given prefix
     skip_below: str | None
+    #: If true, skips prefixes that are ontologized as sources in PyOBO
+    skip_pyobo: bool
+    #: An enumerated set of prefixes to skip
     skip_set: set[str] | None
-    use_tqdm: bool


 class SlimGetOntologyKwargs(TypedDict):
@@ -134,8 +157,11 @@ class SlimGetOntologyKwargs(TypedDict):
     only a single ontology is requested.
     """

+    #: Should strict identifier parsing be enabled?
     strict: NotRequired[bool]
+    #: Should re-download and re-processing be forced?
     force: NotRequired[bool]
+    #: Should re-processing be forced?
     force_process: NotRequired[bool]


@@ -145,8 +171,11 @@ class GetOntologyKwargs(SlimGetOntologyKwargs):
     This dictionary doesn't contain ``prefix`` since this is always explicitly handled.
     """

+    #: The version of the ontology to get
     version: NotRequired[str | None]
+    #: Should the cache be used?
     cache: NotRequired[bool]
+    #: Should a progress bar be used?
     use_tqdm: NotRequired[bool]


@@ -186,12 +215,17 @@ class IterHelperHelperDict(SlimGetOntologyKwargs):
     :func:`pyobo.get_ontology` in each iteration.
     """

+    #: Should a progress bar be used?
     use_tqdm: bool
+    #: Skip all prefixes lexicographically sorted below the given prefix
     skip_below: str | None
+    #: If true, skips prefixes that are ontologized as sources in PyOBO
     skip_pyobo: bool
+    #: An enumerated set of prefixes to skip
     skip_set: set[str] | None


+#: The ontology format
 OntologyFormat: TypeAlias = Literal["obo", "owl", "json", "rdf"]

 #: from table 2 of the Functional OWL syntax definition
@@ -207,7 +241,9 @@ DEFAULT_PREFIX_MAP = {
 class OntologyPathPack(NamedTuple):
     """A format and path tuple."""

+    #: The ontology format
     format: OntologyFormat
+    #: The path to the ontology file
     path: Path

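These TypedDicts back the Unpack[...] annotations used across the API, so downstream code can forward keyword arguments with static type checking. A minimal sketch under that assumption; get_name_or_default is a hypothetical helper, not part of the package:

from typing_extensions import Unpack

from pyobo import get_name
from pyobo.constants import GetOntologyKwargs


def get_name_or_default(prefix: str, identifier: str, **kwargs: Unpack[GetOntologyKwargs]) -> str:
    # version=, cache=, force=, etc. are forwarded with static type checking
    return get_name(prefix, identifier, **kwargs) or "(unnamed)"


print(get_name_or_default("doid", "14330", version=None))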
pyobo/getters.py
CHANGED
@@ -45,8 +45,14 @@ from .utils.path import ensure_path, prefix_directory_join
 from .version import get_git_hash, get_version

 __all__ = [
+    "REQUIRES_NO_ROBOT_CHECK",
+    "SKIP",
     "NoBuildError",
+    "UnhandledFormatError",
+    "db_output_helper",
     "get_ontology",
+    "iter_helper",
+    "iter_helper_helper",
 ]

 logger = logging.getLogger(__name__)
@@ -112,8 +118,6 @@ def get_ontology(

     :returns: An OBO object

-    :raises OnlyOWLError: If the OBO foundry only has an OWL document for this resource.
-
     Alternate usage if you have a custom url

     .. code-block:: python
@@ -220,7 +224,8 @@ def _ensure_ontology_path(
         return None


-
+#: A dictioanry of prefixes to skip during full build with reasons as values
+SKIP: dict[str, str] = {
     "ncbigene": "too big, refs acquired from other dbs",
     "pubchem.compound": "top big, can't deal with this now",
     "gaz": "Gazetteer is irrelevant for biology",
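With SKIP now exported and annotated as dict[str, str] (prefix to reason), build scripts can consult it directly. A short sketch:

from pyobo.getters import SKIP

prefix = "ncbigene"
if reason := SKIP.get(prefix):
    # skipped during full database builds, e.g. "too big, refs acquired from other dbs"
    print(f"skipping {prefix}: {reason}")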
pyobo/ner/api.py
CHANGED
@@ -2,6 +2,7 @@

 from __future__ import annotations

+import logging
 from collections.abc import Iterable
 from subprocess import CalledProcessError
 from typing import TYPE_CHECKING
@@ -22,6 +23,8 @@ __all__ = [
     "get_grounder",
 ]

+logger = logging.getLogger(__name__)
+

 def get_grounder(
     prefixes: str | Iterable[str],
@@ -32,22 +35,23 @@ def get_grounder(
     **kwargs: Unpack[GetOntologyKwargs],
 ) -> ssslm.Grounder:
     """Get a grounder for the given prefix(es)."""
-
+    all_literal_mappings: list[LiteralMapping] = []
     it = _clean_prefix_versions(prefixes, versions=versions)
     disable = len(it) == 1 or not check_should_use_tqdm(kwargs)
     for prefix, kwargs["version"] in tqdm(it, leave=False, disable=disable):
         try:
-            literal_mappings
-
-
-                skip_obsolete=skip_obsolete,
-                **kwargs,
-            )
-        )
-        except (NoBuildError, CalledProcessError):
+            literal_mappings = get_literal_mappings(prefix, skip_obsolete=skip_obsolete, **kwargs)
+        except (NoBuildError, CalledProcessError) as e:
+            logger.warning("[%s] unable to get literal mappings: %s", prefix, e)
             continue
+        else:
+            if not literal_mappings:
+                logger.warning("[%s] no literal mappings loaded", prefix)
+            all_literal_mappings.extend(literal_mappings)

-    return ssslm.make_grounder(
+    return ssslm.make_grounder(
+        all_literal_mappings, implementation="gilda", grounder_cls=grounder_cls
+    )


 def _clean_prefix_versions(
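A usage sketch for the revised get_grounder; per-prefix build failures now emit a warning and are skipped, and empty literal mapping sets are reported rather than passing silently. The prefixes are arbitrary examples:

import pyobo

# build a grounder over several vocabularies; a prefix that fails to build
# no longer aborts the whole loop, it is logged and skipped instead
grounder = pyobo.get_grounder(["doid", "mondo"])

The returned ssslm.Grounder aggregates the literal mappings from all requested prefixes and can then be used for named entity recognition and grounding.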
|