pyobo 0.12.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff shows the changes between package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- pyobo/.DS_Store +0 -0
- pyobo/api/properties.py +8 -12
- pyobo/api/xrefs.py +1 -2
- pyobo/cli/database.py +30 -2
- pyobo/cli/database_utils.py +5 -11
- pyobo/getters.py +18 -78
- pyobo/gilda_utils.py +3 -80
- pyobo/identifier_utils/__init__.py +2 -10
- pyobo/identifier_utils/api.py +21 -12
- pyobo/identifier_utils/preprocessing.json +74 -13
- pyobo/identifier_utils/preprocessing.py +5 -39
- pyobo/obographs.py +5 -1
- pyobo/reader.py +13 -17
- pyobo/sources/cgnc.py +9 -1
- pyobo/sources/flybase.py +5 -5
- pyobo/sources/omim_ps.py +4 -4
- pyobo/sources/pharmgkb/pharmgkb_gene.py +1 -1
- pyobo/struct/functional/ontology.py +3 -1
- pyobo/struct/reference.py +4 -4
- pyobo/struct/struct.py +112 -55
- pyobo/utils/cache.py +3 -4
- pyobo/utils/io.py +38 -14
- pyobo/utils/path.py +16 -19
- pyobo/version.py +1 -1
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/METADATA +71 -110
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/RECORD +29 -30
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/WHEEL +1 -1
- pyobo/identifier_utils/model.py +0 -130
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/entry_points.txt +0 -0
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/licenses/LICENSE +0 -0
pyobo/struct/struct.py
CHANGED
@@ -70,7 +70,7 @@ from ..constants import (
     TARGET_PREFIX,
 )
 from ..utils.cache import write_gzipped_graph
-from ..utils.io import multidict, write_iterable_tsv
+from ..utils.io import multidict, safe_open, write_iterable_tsv
 from ..utils.path import (
     CacheArtifact,
     get_cache_path,
@@ -712,6 +712,13 @@ class Obo:
             raise ValueError(f"There is no version available for {self.ontology}")
         return self.data_version
 
+    @property
+    def _prefix_version(self) -> str:
+        """Get the prefix and version (for logging)."""
+        if self.data_version:
+            return f"{self.ontology} {self.data_version}"
+        return self.ontology
+
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in this ontology."""
         raise NotImplementedError
@@ -722,10 +729,11 @@ class Obo:
 
         return graph_from_obo(self)
 
-    def write_obograph(self, path: Path) -> None:
+    def write_obograph(self, path: str | Path) -> None:
         """Write OBO Graph json."""
         graph = self.get_graph()
-        path
+        with safe_open(path, read=False) as file:
+            file.write(graph.model_dump_json(indent=2, exclude_none=True, exclude_unset=True))
 
     @classmethod
     def cli(cls, *args, default_rewrite: bool = False) -> Any:
@@ -761,13 +769,12 @@ class Obo:
             click.secho(f"[{cls.ontology}] Got an exception during instantiation - {type(e)}")
             sys.exit(1)
         inst.write_default(
-            write_obograph=
-            write_obo=
+            write_obograph=False,
+            write_obo=False,
             write_owl=owl,
             write_ofn=ofn,
             write_ttl=ttl,
             write_nodes=True,
-            write_edges=True,
             force=force or rewrite,
             use_tqdm=True,
         )
@@ -969,9 +976,14 @@ class Obo:
             emit_annotation_properties=emit_annotation_properties,
         )
         if use_tqdm:
-            it = tqdm(
+            it = tqdm(
+                it,
+                desc=f"[{self._prefix_version}] writing OBO",
+                unit_scale=True,
+                unit="line",
+            )
         if isinstance(file, str | Path | os.PathLike):
-            with
+            with safe_open(file, read=False) as fh:
                 self._write_lines(it, fh)
         else:
             self._write_lines(it, file)
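Because write_obograph now routes through safe_open, whether the output is gzip-compressed follows from the path's suffix alone. A minimal sketch, assuming obo is an instantiated Obo subclass and the file names are hypothetical:

    from pathlib import Path

    # ".gz" suffix -> gzip-compressed JSON; any other suffix -> plain text JSON
    obo.write_obograph(Path("example.json.gz"))
    obo.write_obograph(Path("example.json"))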
@@ -1002,11 +1014,72 @@ class Obo:
 
     def write_nodes(self, path: str | Path) -> None:
         """Write a nodes TSV file."""
-
-
+        write_iterable_tsv(
+            path=path,
+            header=self.nodes_header,
+            it=self.iterate_edge_rows(),
+        )
+
+    @property
+    def nodes_header(self) -> Sequence[str]:
+        """Get the header for nodes."""
+        return [
+            "curie:ID",
+            "name:string",
+            "synonyms:string[]",
+            "synonym_predicates:string[]",
+            "synonym_types:string[]",
+            "definition:string",
+            "deprecated:boolean",
+            "type:string",
+            "provenance:string[]",
+            "alts:string[]",
+            "replaced_by:string[]",
+            "mapping_objects:string[]",
+            "mapping_predicates:string[]",
+            "version:string",
+        ]
+
+    def _get_node_row(self, node: Term, sep: str, version: str) -> Sequence[str]:
+        synonym_predicate_curies, synonym_type_curies, synonyms = [], [], []
+        for synonym in node.synonyms:
+            synonym_predicate_curies.append(synonym.predicate.curie)
+            synonym_type_curies.append(synonym.type.curie if synonym.type else "")
+            synonyms.append(synonym.name)
+        mapping_predicate_curies, mapping_target_curies = [], []
+        for predicate, obj in node.get_mappings(include_xrefs=True, add_context=False):
+            mapping_predicate_curies.append(predicate.curie)
+            mapping_target_curies.append(obj.curie)
+        return (
+            node.curie,
+            node.name or "",
+            sep.join(synonyms),
+            sep.join(synonym_predicate_curies),
+            sep.join(synonym_type_curies),
+            node.definition or "",
+            "true" if node.is_obsolete else "false",
+            node.type,
+            sep.join(
+                reference.curie for reference in node.provenance if isinstance(reference, Reference)
+            ),
+            sep.join(alt_reference.curie for alt_reference in node.alt_ids),
+            sep.join(ref.curie for ref in node.get_replaced_by()),
+            sep.join(mapping_target_curies),
+            sep.join(mapping_predicate_curies),
+            version,
+        )
+
+    def iterate_node_rows(self, sep: str = ";") -> Iterable[Sequence[str]]:
+        """Get a nodes iterator appropriate for serialization."""
+        version = self.data_version or ""
+        for node in self.iter_terms():
+            if node.prefix != self.ontology:
+                continue
+            yield self._get_node_row(node, sep=sep, version=version)
 
     def write_edges(self, path: str | Path) -> None:
         """Write a edges TSV file."""
+        # node, this is actually taken care of as part of the cache configuration
         write_iterable_tsv(
             path=path,
             header=self.edges_header,
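The new nodes_header follows the typed-column convention used by bulk graph loaders such as Neo4j (curie:ID, deprecated:boolean, list columns marked :string[]), and iterate_node_rows joins list values with ";" by default. A sketch of reading such a file back, assuming a hypothetical nodes.tsv.gz written by write_nodes:

    import pandas as pd

    # pandas infers gzip compression from the ".gz" suffix
    df = pd.read_csv("nodes.tsv.gz", sep="\t")

    # split a list-typed column back into Python lists
    synonyms = df["synonyms:string[]"].fillna("").str.split(";")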
@@ -1025,15 +1098,15 @@ class Obo:
 
     @property
     def _obo_path(self) -> Path:
-        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.obo")
+        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.obo.gz")
 
     @property
     def _obograph_path(self) -> Path:
-        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.json")
+        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.json.gz")
 
     @property
     def _owl_path(self) -> Path:
-        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.owl")
+        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.owl.gz")
 
     @property
     def _obonet_gz_path(self) -> Path:
@@ -1041,7 +1114,7 @@ class Obo:
 
     @property
     def _ofn_path(self) -> Path:
-        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.ofn")
+        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.ofn.gz")
 
     @property
     def _ttl_path(self) -> Path:
@@ -1060,22 +1133,10 @@ class Obo:
                 [f"{self.ontology}_id", "taxonomy_id"],
                 self.iterate_id_species,
             ),
-            (
-                # TODO deprecate this in favor of literal mappings output
-                CacheArtifact.synonyms,
-                [f"{self.ontology}_id", "synonym"],
-                self.iterate_synonym_rows,
-            ),
             (CacheArtifact.alts, [f"{self.ontology}_id", "alt_id"], self.iterate_alt_rows),
             (CacheArtifact.mappings, SSSOM_DF_COLUMNS, self.iterate_mapping_rows),
             (CacheArtifact.relations, self.relations_header, self.iter_relation_rows),
             (CacheArtifact.edges, self.edges_header, self.iterate_edge_rows),
-            (
-                # TODO deprecate this in favor of pair of literal and object properties
-                CacheArtifact.properties,
-                self.properties_header,
-                self._iter_property_rows,
-            ),
             (
                 CacheArtifact.object_properties,
                 self.object_properties_header,
@@ -1097,8 +1158,8 @@ class Obo:
         """Write the metadata JSON file."""
         metadata = self.get_metadata()
         for path in (self._root_metadata_path, self._get_cache_path(CacheArtifact.metadata)):
-            logger.debug("[%s
-            with path
+            logger.debug("[%s] caching metadata to %s", self._prefix_version, path)
+            with safe_open(path, read=False) as file:
                 json.dump(metadata, file, indent=2)
 
     def write_prefix_map(self) -> None:
@@ -1110,9 +1171,8 @@ class Obo:
         """Write cache parts."""
         typedefs_path = self._get_cache_path(CacheArtifact.typedefs)
         logger.debug(
-            "[%s
-            self.
-            self.data_version,
+            "[%s] caching typedefs to %s",
+            self._prefix_version,
             typedefs_path,
         )
         typedef_df: pd.DataFrame = self.get_typedef_df()
@@ -1121,10 +1181,10 @@ class Obo:
 
         for cache_artifact, header, fn in self._get_cache_config():
             path = self._get_cache_path(cache_artifact)
-            if path.
+            if path.is_file() and not force:
                 continue
             tqdm.write(
-                f"[{self.
+                f"[{self._prefix_version}] writing {cache_artifact.name} to {path}",
             )
             write_iterable_tsv(
                 path=path,
@@ -1139,12 +1199,11 @@ class Obo:
             relations_path = get_relation_cache_path(
                 self.ontology, reference=relation, version=self.data_version
             )
-            if relations_path.
+            if relations_path.is_file() and not force:
                 continue
             logger.debug(
-                "[%s
-                self.
-                self.data_version,
+                "[%s] caching relation %s ! %s",
+                self._prefix_version,
                 relation.curie,
                 relation.name,
             )
@@ -1164,8 +1223,7 @@ class Obo:
         write_owl: bool = False,
         write_ofn: bool = False,
         write_ttl: bool = False,
-        write_nodes: bool =
-        write_edges: bool = True,
+        write_nodes: bool = False,
         obograph_use_internal: bool = False,
         write_cache: bool = True,
     ) -> None:
@@ -1174,15 +1232,15 @@ class Obo:
         self.write_prefix_map()
         if write_cache:
             self.write_cache(force=force)
-        if write_obo and (not self._obo_path.
-            tqdm.write(f"[{self.
+        if write_obo and (not self._obo_path.is_file() or force):
+            tqdm.write(f"[{self._prefix_version}] writing OBO to {self._obo_path}")
             self.write_obo(self._obo_path, use_tqdm=use_tqdm)
-        if (write_ofn or write_owl or write_obograph) and (not self._ofn_path.
-            tqdm.write(f"[{self.
+        if (write_ofn or write_owl or write_obograph) and (not self._ofn_path.is_file() or force):
+            tqdm.write(f"[{self._prefix_version}] writing OFN to {self._ofn_path}")
             self.write_ofn(self._ofn_path)
-        if write_obograph and (not self._obograph_path.
+        if write_obograph and (not self._obograph_path.is_file() or force):
             if obograph_use_internal:
-                tqdm.write(f"[{self.
+                tqdm.write(f"[{self._prefix_version}] writing OBO Graph to {self._obograph_path}")
                 self.write_obograph(self._obograph_path)
             else:
                 import bioontologies.robot
@@ -1193,22 +1251,22 @@ class Obo:
             bioontologies.robot.convert(
                 self._ofn_path, self._obograph_path, debug=True, merge=False, reason=False
             )
-        if write_owl and (not self._owl_path.
-            tqdm.write(f"[{self.
+        if write_owl and (not self._owl_path.is_file() or force):
+            tqdm.write(f"[{self._prefix_version}] writing OWL to {self._owl_path}")
             import bioontologies.robot
 
             bioontologies.robot.convert(
                 self._ofn_path, self._owl_path, debug=True, merge=False, reason=False
             )
-        if write_ttl and (not self._ttl_path.
-            tqdm.write(f"[{self.
+        if write_ttl and (not self._ttl_path.is_file() or force):
+            tqdm.write(f"[{self._prefix_version}] writing Turtle to {self._ttl_path}")
             self.write_rdf(self._ttl_path)
-        if write_obonet and (not self._obonet_gz_path.
-            tqdm.write(f"[{self.
+        if write_obonet and (not self._obonet_gz_path.is_file() or force):
+            tqdm.write(f"[{self._prefix_version}] writing obonet to {self._obonet_gz_path}")
             self.write_obonet_gz(self._obonet_gz_path)
         if write_nodes:
             nodes_path = self._get_cache_path(CacheArtifact.nodes)
-            tqdm.write(f"[{self.
+            tqdm.write(f"[{self._prefix_version}] writing nodes TSV to {nodes_path}")
             self.write_nodes(nodes_path)
 
     @property
@@ -1335,9 +1393,8 @@ class Obo:
                 rv.add_edge(_source, _target, key=_key)
 
         logger.info(
-            "[%s
-            self.
-            self.data_version,
+            "[%s] exported graph with %d nodes",
+            self._prefix_version,
             rv.number_of_nodes(),
         )
         return rv
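Taken together, these hunks drop the separate write_edges flag (per the new comment, edges are covered by the cache configuration), make the nodes TSV opt-in for direct callers of write_default, and give every build artifact a .gz suffix. A sketch of a direct call under the new defaults, again assuming obo is an instantiated Obo subclass:

    # nodes are opt-in now; edges come out via the cache artifacts
    obo.write_default(write_nodes=True, force=True)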
pyobo/utils/cache.py
CHANGED
@@ -1,6 +1,5 @@
 """Utilities for caching files."""
 
-import gzip
 import json
 import logging
 from collections.abc import Iterable, Mapping
@@ -14,7 +13,7 @@ from pystow.cache import CachedDataFrame as cached_df  # noqa:N813
 from pystow.cache import CachedJSON as cached_json  # noqa:N813
 from pystow.cache import CachedPickle as cached_pickle  # noqa:N813
 
-from .io import open_map_tsv, open_multimap_tsv, write_map_tsv, write_multimap_tsv
+from .io import open_map_tsv, open_multimap_tsv, safe_open, write_map_tsv, write_multimap_tsv
 
 __all__ = [
     "cached_collection",
@@ -70,13 +69,13 @@ NODE_LINK_STYLE = "links"  # TODO update to "edges"
 
 def get_gzipped_graph(path: str | Path) -> nx.MultiDiGraph:
     """Read a graph that's gzipped nodelink."""
-    with
+    with safe_open(path, read=True) as file:
         return nx.node_link_graph(json.load(file), edges=NODE_LINK_STYLE)
 
 
 def write_gzipped_graph(graph: nx.MultiDiGraph, path: str | Path) -> None:
     """Write a graph as gzipped nodelink."""
-    with
+    with safe_open(path, read=False) as file:
         json.dump(nx.node_link_data(graph, edges=NODE_LINK_STYLE), file)
 
 
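With safe_open in place, both nodelink helpers share the same suffix-driven gzip logic. A round-trip sketch with a throwaway graph and a hypothetical path:

    import networkx as nx

    from pyobo.utils.cache import get_gzipped_graph, write_gzipped_graph

    graph = nx.MultiDiGraph()
    graph.add_edge("a", "b", key="part_of")

    # gzipped because of the ".gz" suffix
    write_gzipped_graph(graph, "graph.json.gz")
    assert get_gzipped_graph("graph.json.gz").number_of_edges() == 1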
pyobo/utils/io.py
CHANGED
@@ -1,26 +1,28 @@
 """I/O utilities."""
 
 import collections.abc
+import contextlib
 import csv
 import gzip
 import logging
 from collections import defaultdict
-from collections.abc import Iterable, Mapping
+from collections.abc import Generator, Iterable, Mapping
 from contextlib import contextmanager
 from pathlib import Path
-from typing import TypeVar
+from typing import Literal, TextIO, TypeVar
 
 import pandas as pd
 from tqdm.auto import tqdm
 
 __all__ = [
     "get_reader",
-    "get_writer",
     "multidict",
     "multisetdict",
     "open_map_tsv",
     "open_multimap_tsv",
     "open_reader",
+    "safe_open",
+    "safe_open_writer",
     "write_iterable_tsv",
     "write_map_tsv",
     "write_multimap_tsv",
@@ -36,7 +38,7 @@ Y = TypeVar("Y")
 def open_reader(path: str | Path, sep: str = "\t"):
     """Open a file and get a reader for it."""
     path = Path(path)
-    with
+    with safe_open(path, read=True) as file:
         yield get_reader(file, sep=sep)
 
 
@@ -45,16 +47,11 @@ def get_reader(x, sep: str = "\t"):
     return csv.reader(x, delimiter=sep, quoting=csv.QUOTE_MINIMAL)
 
 
-def get_writer(x, sep: str = "\t"):
-    """Get a :func:`csv.writer` with PyOBO default settings."""
-    return csv.writer(x, delimiter=sep, quoting=csv.QUOTE_MINIMAL)
-
-
 def open_map_tsv(
     path: str | Path, *, use_tqdm: bool = False, has_header: bool = True
 ) -> Mapping[str, str]:
     """Load a mapping TSV file into a dictionary."""
-    with
+    with safe_open(path, read=True) as file:
         if has_header:
             next(file)  # throw away header
         if use_tqdm:
@@ -84,9 +81,12 @@ def _help_multimap_tsv(
     use_tqdm: bool = False,
     has_header: bool = True,
 ) -> Iterable[tuple[str, str]]:
-    with
+    with safe_open(path, read=True) as file:
         if has_header:
-            next(file)  # throw away header
+            try:
+                next(file)  # throw away header
+            except gzip.BadGzipFile as e:
+                raise ValueError(f"could not open file {path}") from e
         if use_tqdm:
             file = tqdm(file, desc=f"loading TSV from {path}")
         yield from get_reader(file)
@@ -145,8 +145,32 @@ def write_iterable_tsv(
     """Write a mapping dictionary to a TSV file."""
     it = (row for row in it if all(cell is not None for cell in row))
     it = sorted(it)
-    with
-        writer = get_writer(file, sep=sep)
+    with safe_open_writer(path, delimiter=sep) as writer:
         if header is not None:
             writer.writerow(header)
         writer.writerows(it)
+
+
+@contextlib.contextmanager
+def safe_open(
+    path: str | Path, read: bool, encoding: str | None = None
+) -> Generator[TextIO, None, None]:
+    """Safely open a file for reading or writing text."""
+    path = Path(path).expanduser().resolve()
+    mode: Literal["rt", "wt"] = "rt" if read else "wt"
+    if path.suffix.endswith(".gz"):
+        with gzip.open(path, mode=mode, encoding=encoding) as file:
+            yield file
+    else:
+        with open(path, mode=mode) as file:
+            yield file
+
+
+@contextlib.contextmanager
+def safe_open_writer(f: str | Path | TextIO, *, delimiter: str = "\t"):  # type:ignore
+    """Open a CSV writer, wrapping :func:`csv.writer`."""
+    if isinstance(f, str | Path):
+        with safe_open(f, read=False) as file:
+            yield csv.writer(file, delimiter=delimiter)
+    else:
+        yield csv.writer(f, delimiter=delimiter)
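safe_open dispatches on the .gz suffix, and safe_open_writer accepts either a path (opened via safe_open) or an already-open text handle. A usage sketch with hypothetical file names:

    from pyobo.utils.io import safe_open, safe_open_writer

    # written through gzip because of the ".gz" suffix; delimiter defaults to tab
    with safe_open_writer("rows.tsv.gz") as writer:
        writer.writerow(("curie", "name"))
        writer.writerow(("hgnc:5", "A1BG"))

    # read back transparently
    with safe_open("rows.tsv.gz", read=True) as file:
        print(file.read())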
pyobo/utils/path.py
CHANGED
@@ -99,25 +99,22 @@ def ensure_df(
 class CacheArtifact(enum.Enum):
     """An enumeration for."""
 
-    names = "names.tsv"
-    definitions = "definitions.tsv"
-    species = "species.tsv"
-
-
-
-
-
-
-
-
-
-
-
-
-
-    nodes = "nodes.tsv"
-    edges = "edges.tsv"
+    names = "names.tsv.gz"
+    definitions = "definitions.tsv.gz"
+    species = "species.tsv.gz"
+    mappings = "mappings.tsv.gz"
+    relations = "relations.tsv.gz"
+    alts = "alt_ids.tsv.gz"
+    typedefs = "typedefs.tsv.gz"
+    literal_mappings = "literal_mappings.tsv.gz"
+    references = "references.tsv.gz"
+    obsoletes = "obsolete.tsv.gz"
+
+    literal_properties = "literal_properties.tsv.gz"
+    object_properties = "object_properties.tsv.gz"
+
+    nodes = "nodes.tsv.gz"
+    edges = "edges.tsv.gz"
 
     prefixes = "prefixes.json"
     metadata = "metadata.json"