PyPI - biocypher - Versions diffs - 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

biocypher 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biocypher might be problematic. Click here for more details.

Files changed (15) hide show

biocypher/_config/biocypher_config.yaml +21 -4
biocypher/_metadata.py +1 -1
biocypher/_ontology.py +144 -51
biocypher/_translate.py +84 -79
biocypher/output/write/_batch_writer.py +133 -52
biocypher/output/write/_get_writer.py +28 -11
biocypher/output/write/_writer.py +32 -14
biocypher/output/write/graph/_arangodb.py +44 -32
biocypher/output/write/graph/_neo4j.py +3 -4
biocypher/output/write/graph/_owl.py +569 -0
biocypher/output/write/graph/_rdf.py +234 -97
{biocypher-0.7.0.dist-info → biocypher-0.9.0.dist-info}/METADATA +1 -1
{biocypher-0.7.0.dist-info → biocypher-0.9.0.dist-info}/RECORD +15 -14
{biocypher-0.7.0.dist-info → biocypher-0.9.0.dist-info}/LICENSE +0 -0
{biocypher-0.7.0.dist-info → biocypher-0.9.0.dist-info}/WHEEL +0 -0

biocypher/output/write/_batch_writer.py CHANGED Viewed

@@ -1,3 +1,5 @@
+"""Abstract base class for all batch writers."""
 import glob
 import os
 import re
@@ -16,30 +18,37 @@ from biocypher.output.write._writer import _Writer
 class _BatchWriter(_Writer, ABC):
-    """Abstract batch writer class"""
+    """Abstract batch writer class."""
     @abstractmethod
     def _quote_string(self, value: str) -> str:
-        """Abstract method to quote a string. Escaping is handled by the database-specific writer."""
-        raise NotImplementedError(
-            "Database writer must override '_quote_string'",
-        )
+        """Quote a string.
+        Escaping is handled by the database-specific writer.
+        """
+        msg = "Database writer must override '_quote_string'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
     @abstractmethod
     def _get_default_import_call_bin_prefix(self):
-        """Abstract method to provide the default string for the import call bin prefix.
+        """Provide the default string for the import call bin prefix.
         Returns
         -------
             str: The database-specific string for the path to the import call bin prefix
         """
-        raise NotImplementedError("Database writer must override '_get_default_import_call_bin_prefix'")
+        msg = "Database writer must override '_get_default_import_call_bin_prefix'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
     @abstractmethod
     def _write_array_string(self, string_list):
-        """Abstract method to write the string representation of an array into a .csv file.
-        Different databases require different formats of array to optimize import speed.
+        """Write the string representation of an array into a .csv file.
+        Different databases require different formats of array to optimize
+        import speed.
         Args:
         ----
@@ -50,50 +59,65 @@ class _BatchWriter(_Writer, ABC):
             str: The database-specific string representation of an array
         """
-        raise NotImplementedError("Database writer must override '_write_array_string'")
+        msg = "Database writer must override '_write_array_string'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
     @abstractmethod
     def _write_node_headers(self):
-        """Abstract method that takes care of importing properties of a graph entity that is represented
-        as a node as per the definition in the `schema_config.yaml`
+        """Write header files for nodes.
+        Write header files (node properties) for nodes as per the
+        definition in the `schema_config.yaml`.
         Returns
         -------
             bool: The return value. True for success, False otherwise.
         """
-        raise NotImplementedError("Database writer must override '_write_node_headers'")
+        msg = "Database writer must override '_write_node_headers'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
     @abstractmethod
     def _write_edge_headers(self):
-        """Abstract method to write a database import-file for a graph entity that is represented
-        as an edge as per the definition in the `schema_config.yaml`,
-        containing only the header for this type of edge.
+        """Write a database import-file for an edge.
+        Write a database import-file for an edge as per the definition in
+        the `schema_config.yaml`, containing only the header for this type
+        of edge.
         Returns
         -------
             bool: The return value. True for success, False otherwise.
         """
-        raise NotImplementedError("Database writer must override '_write_edge_headers'")
+        msg = "Database writer must override '_write_edge_headers'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
     @abstractmethod
     def _construct_import_call(self) -> str:
-        """Function to construct the import call detailing folder and
-        individual node and edge headers and data files, as well as
-        delimiters and database name. Built after all data has been
-        processed to ensure that nodes are called before any edges.
+        """Construct the import call.
+        Construct the import call detailing folder and individual node and
+        edge headers and data files, as well as delimiters and database name.
+        Built after all data has been processed to ensure that nodes are
+        called before any edges.
         Returns
         -------
             str: A bash command for csv import.
         """
-        raise NotImplementedError("Database writer must override '_construct_import_call'")
+        msg = "Database writer must override '_construct_import_call'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
     @abstractmethod
     def _get_import_script_name(self) -> str:
-        """Returns the name of the import script.
+        """Return the name of the import script.
         The name will be chosen based on the used database.
         Returns
@@ -101,7 +125,9 @@ class _BatchWriter(_Writer, ABC):
             str: The name of the import script (ending in .sh)
         """
-        raise NotImplementedError("Database writer must override '_get_import_script_name'")
+        msg = "Database writer must override '_get_import_script_name'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
     def __init__(
         self,
@@ -122,10 +148,14 @@ class _BatchWriter(_Writer, ABC):
         db_password: str = None,
         db_host: str = None,
         db_port: str = None,
-        rdf_format: str = None,
+        file_format: str = None,
         rdf_namespaces: dict = {},
+        labels_order: str = "Ascending",
+        **kwargs,
     ):
-        """Abtract parent class for writing node and edge representations to disk
+        """Write node and edge representations to disk.
+        Abstract parent class for writing node and edge representations to disk
         using the format specified by each database type. The database-specific
         functions are implemented by the respective child-classes. This abstract
         class contains all methods expected by a batch writer instance, some of
@@ -179,7 +209,8 @@ class _BatchWriter(_Writer, ABC):
                 call.
             wipe:
-                Whether to force import (removing existing DB content). (Specific to Neo4j.)
+                Whether to force import (removing existing DB content).
+                    (Specific to Neo4j.)
             strict_mode:
                 Whether to enforce source, version, and license properties.
@@ -203,12 +234,16 @@ class _BatchWriter(_Writer, ABC):
             db_port:
                 The database port.
-            rdf_format:
+            file_format:
                 The format of RDF.
             rdf_namespaces:
                 The namespaces for RDF.
+            labels_order:
+                The order of labels, to reflect the hierarchy (or not).
+                Default: "Ascending" (from more specific to more generic).
         """
         super().__init__(
             translator=translator,
@@ -221,7 +256,7 @@ class _BatchWriter(_Writer, ABC):
         self.db_password = db_password
         self.db_host = db_host or "localhost"
         self.db_port = db_port
-        self.rdf_format = rdf_format
+        self.file_format = file_format
         self.rdf_namespaces = rdf_namespaces
         self.delim, self.escaped_delim = self._process_delimiter(delimiter)
@@ -251,6 +286,15 @@ class _BatchWriter(_Writer, ABC):
         self.parts = {}  # dict to store the paths of part files for each label
+        self._labels_orders = ["Alphabetical", "Ascending", "Descending", "Leaves"]
+        if labels_order not in self._labels_orders:
+            msg = (
+                f"neo4j's 'labels_order' parameter cannot be '{labels_order}',"
+                "must be one of: {' ,'.join(self._labels_orders)}",
+            )
+            raise ValueError(msg)
+        self.labels_order = labels_order
         # TODO not memory efficient, but should be fine for most cases; is
         # there a more elegant solution?
@@ -263,8 +307,16 @@ class _BatchWriter(_Writer, ABC):
             return self._import_call_file_prefix
     def _process_delimiter(self, delimiter: str) -> str:
-        """Return escaped characters in case of receiving their string
-        representation (e.g. tab for '\t').
+        """Process a delimiter to escape it correctly.
+        Args:
+        ----
+            delimiter (str): The delimiter to process.
+        Returns:
+        -------
+            tuple: The delimiter and its escaped representation.
         """
         if delimiter == "\\t":
             return "\t", "\\t"
@@ -273,7 +325,7 @@ class _BatchWriter(_Writer, ABC):
             return delimiter, delimiter
     def write_nodes(self, nodes, batch_size: int = int(1e6), force: bool = False):
-        """Wrapper for writing nodes and their headers.
+        """Write nodes and their headers.
         Args:
         ----
@@ -311,7 +363,7 @@ class _BatchWriter(_Writer, ABC):
         edges: list | GeneratorType,
         batch_size: int = int(1e6),
     ) -> bool:
-        """Wrapper for writing edges and their headers.
+        """Write edges and their headers.
         Args:
         ----
@@ -373,12 +425,13 @@ class _BatchWriter(_Writer, ABC):
         return True
     def _write_node_data(self, nodes, batch_size, force: bool = False):
-        """Writes biocypher nodes to CSV conforming to the headers created
-        with `_write_node_headers()`, and is actually required to be run
-        before calling `_write_node_headers()` to set the
-        :py:attr:`self.node_property_dict` for passing the node properties
-        to the instance. Expects list or generator of nodes from the
-        :py:class:`BioCypherNode` class.
+        """Write biocypher nodes to CSV.
+        Conforms to the headers created with `_write_node_headers()`, and
+        is actually required to be run before calling `_write_node_headers()`
+        to set the :py:attr:`self.node_property_dict` for passing the node
+        properties to the instance. Expects list or generator of nodes from
+        the :py:class:`BioCypherNode` class.
         Args:
         ----
@@ -472,8 +525,26 @@ class _BatchWriter(_Writer, ABC):
                         all_labels = [self.translator.name_sentence_to_pascal(label) for label in all_labels]
                         # remove duplicates
                         all_labels = list(OrderedDict.fromkeys(all_labels))
-                        # order alphabetically
-                        all_labels.sort()
+                        match self.labels_order:
+                            case "Ascending":
+                                pass  # Default from get_ancestors.
+                            case "Alphabetical":
+                                all_labels.sort()
+                            case "Descending":
+                                all_labels.reverse()
+                            case "Leaves":
+                                if len(all_labels) < 1:
+                                    msg = "Labels list cannot be empty when using 'Leaves' order."
+                                    raise ValueError(msg)
+                                all_labels = [all_labels[0]]
+                            case _:
+                                # In case someone touched _labels_orders after constructor.
+                                if self.labels_order not in self._labels_orders:
+                                    msg = (
+                                        f"Invalid labels_order: {self.labels_order}. "
+                                        f"Must be one of {self._labels_orders}"
+                                    )
+                                    raise ValueError(msg)
                         # concatenate with array delimiter
                         all_labels = self._write_array_string(all_labels)
                     else:
@@ -539,7 +610,9 @@ class _BatchWriter(_Writer, ABC):
         prop_dict: dict,
         labels: str,
     ):
-        """This function takes one list of biocypher nodes and writes them
+        """Write a list of biocypher nodes to a CSV file.
+        This function takes one list of biocypher nodes and writes them
         to a Neo4j admin import compatible CSV file.
         Args:
@@ -623,7 +696,9 @@ class _BatchWriter(_Writer, ABC):
         return True
     def _write_edge_data(self, edges, batch_size):
-        """Writes biocypher edges to CSV conforming to the headers created
+        """Write biocypher edges to CSV.
+        Writes biocypher edges to CSV conforming to the headers created
         with `_write_edge_headers()`, and is actually required to be run
         before calling `_write_node_headers()` to set the
         :py:attr:`self.edge_property_dict` for passing the edge
@@ -772,7 +847,9 @@ class _BatchWriter(_Writer, ABC):
         label: str,
         prop_dict: dict,
     ):
-        """This function takes one list of biocypher edges and writes them
+        """Write a list of biocypher edges to a CSV file.
+        This function takes one list of biocypher edges and writes them
         to a Neo4j admin import compatible CSV file.
         Args:
@@ -891,7 +968,7 @@ class _BatchWriter(_Writer, ABC):
         return True
     def _write_next_part(self, label: str, lines: list):
-        """This function writes a list of strings to a new part file.
+        """Write a list of strings to a new part file.
         Args:
         ----
@@ -943,9 +1020,10 @@ class _BatchWriter(_Writer, ABC):
             self.parts[label].append(part)
     def get_import_call(self) -> str:
-        """Function to return the import call detailing folder and
-        individual node and edge headers and data files, as well as
-        delimiters and database name.
+        """Return the import call.
+        Return the import call detailing folder and individual node and
+        edge headers and data files, as well as delimiters and database name.
         Returns
         -------
@@ -955,7 +1033,9 @@ class _BatchWriter(_Writer, ABC):
         return self._construct_import_call()
     def write_import_call(self) -> str:
-        """Function to write the import call detailing folder and
+        """Write the import call.
+        Function to write the import call detailing folder and
         individual node and edge headers and data files, as well as
         delimiters and database name, to the export folder as txt.
@@ -974,9 +1054,10 @@ class _BatchWriter(_Writer, ABC):
 def parse_label(label: str) -> str:
-    """Check if the label is compliant with Neo4j naming conventions,
-    https://neo4j.com/docs/cypher-manual/current/syntax/naming/, and if not,
-    remove non-compliant characters.
+    """Check if the label is compliant with Neo4j naming conventions.
+    Check against https://neo4j.com/docs/cypher-manual/current/syntax/naming/,
+    and if not compliant, remove non-compliant characters.
     Args:
     ----

biocypher/output/write/_get_writer.py CHANGED Viewed

@@ -1,15 +1,18 @@
-"""
-BioCypher 'offline' module. Handles the writing of node and edge representations
-suitable for import into a DBMS.
+"""Module to provide one of the available writer classes.
+The writer classes are responsible for writing the node and edge representations
+to disk in a format suitable for import into a DBMS.
 """
 from typing import TYPE_CHECKING
 from biocypher._config import config as _config
 from biocypher._logger import logger
+from biocypher.output.write._batch_writer import _BatchWriter
 from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
 from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
 from biocypher.output.write.graph._networkx import _NetworkXWriter
+from biocypher.output.write.graph._owl import _OWLWriter
 from biocypher.output.write.graph._rdf import _RDFWriter
 from biocypher.output.write.relational._csv import _PandasCSVWriter
 from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
@@ -37,6 +40,8 @@ DBMS_TO_CLASS = {
     "sqlite3": _SQLiteBatchWriter,
     "rdf": _RDFWriter,
     "RDF": _RDFWriter,
+    "owl": _OWLWriter,
+    "OWL": _OWLWriter,
     "csv": _PandasCSVWriter,
     "CSV": _PandasCSVWriter,
     "pandas": _PandasCSVWriter,
@@ -54,12 +59,11 @@ def get_writer(
     deduplicator: "Deduplicator",
     output_directory: str,
     strict_mode: bool,
-):
-    """
-    Function to return the writer class based on the selection in the config
-    file.
+) -> _BatchWriter | None:
+    """Return the writer class based on the selection in the config file.
     Args:
+    ----
         dbms: the database management system; for options, see DBMS_TO_CLASS.
         translator: the Translator object.
         deduplicator: the Deduplicator object.
@@ -67,15 +71,26 @@ def get_writer(
         strict_mode: whether to use strict mode.
     Returns:
+    -------
         instance: an instance of the selected writer class.
-    """
+    """
     dbms_config = _config(dbms)
     writer = DBMS_TO_CLASS[dbms]
+    if "rdf_format" in dbms_config:
+        logger.warning("The 'rdf_format' config option is deprecated, use 'file_format' instead.")
+        if "file_format" not in dbms_config:
+            format = dbms_config["rdf_format"]
+            logger.warning(f"I will set 'file_format: {format}' for you.")
+            dbms_config["file_format"] = format
+            dbms_config.pop("rdf_format")
+        logger.warning("NOTE: this warning will become an error in next versions.")
     if not writer:
-        raise ValueError(f"Unknown dbms: {dbms}")
+        msg = f"Unknown dbms: {dbms}"
+        raise ValueError(msg)
     if writer is not None:
         return writer(
@@ -95,6 +110,8 @@ def get_writer(
             db_user=dbms_config.get("user"),  # psql
             db_password=dbms_config.get("password"),  # psql
             db_port=dbms_config.get("port"),  # psql
-            rdf_format=dbms_config.get("rdf_format"),  # rdf
-            rdf_namespaces=dbms_config.get("rdf_namespaces"),  # rdf
+            file_format=dbms_config.get("file_format"),  # rdf, owl
+            rdf_namespaces=dbms_config.get("rdf_namespaces"),  # rdf, owl
+            edge_model=dbms_config.get("edge_model"),  # owl
         )
+    return None

biocypher/output/write/_writer.py CHANGED Viewed

@@ -2,7 +2,6 @@ import os
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
-from typing import Optional, Union
 from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
 from biocypher._deduplicate import Deduplicator
@@ -23,26 +22,28 @@ class _Writer(ABC):
     - _get_import_script_name
     Args:
+    ----
         translator (Translator): Instance of :py:class:`Translator` to enable translation of
             nodes and manipulation of properties.
         deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
             of nodes and edges.
         output_directory (str, optional): Path for exporting CSV files. Defaults to None.
         strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
-    strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
     Raises:
+    ------
         NotImplementedError: Writer implementation must override '_write_node_data'
         NotImplementedError: Writer implementation must override '_write_edge_data'
         NotImplementedError: Writer implementation must override '_construct_import_call'
         NotImplementedError: Writer implementation must override '_get_import_script_name'
     """
     def __init__(
         self,
         translator: Translator,
         deduplicator: Deduplicator,
-        output_directory: Optional[str] = None,
+        output_directory: str | None = None,
         strict_mode: bool = False,
         *args,
         **kwargs,
@@ -50,13 +51,14 @@ class _Writer(ABC):
         """Abstract class for writing node and edge representations to disk.
         Args:
+        ----
             translator (Translator): Instance of :py:class:`Translator` to enable translation of
                 nodes and manipulation of properties.
             deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
                 of nodes and edges.
             output_directory (str, optional): Path for exporting CSV files. Defaults to None.
             strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
-        strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
         """
         self.translator = translator
         self.deduplicator = deduplicator
@@ -67,7 +69,7 @@ class _Writer(ABC):
             if kwargs.get("write_to_file", True):
                 logger.warning(
                     f"Output directory `{self.output_directory}` already exists. "
-                    "If this is not planned, file consistency may be compromised."
+                    "If this is not planned, file consistency may be compromised.",
                 )
         else:
             logger.info(f"Creating output directory `{self.output_directory}`.")
@@ -76,43 +78,50 @@ class _Writer(ABC):
     @abstractmethod
     def _write_node_data(
         self,
-        nodes: Iterable[Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]],
+        nodes: Iterable[BioCypherNode | BioCypherEdge | BioCypherRelAsNode],
     ) -> bool:
         """Implement how to output.write nodes to disk.
         Args:
+        ----
             nodes (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
         Returns:
+        -------
             bool: The return value. True for success, False otherwise.
         """
         raise NotImplementedError("Writer implementation must override 'write_nodes'")
     @abstractmethod
     def _write_edge_data(
         self,
-        edges: Iterable[Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]],
+        edges: Iterable[BioCypherNode | BioCypherEdge | BioCypherRelAsNode],
     ) -> bool:
         """Implement how to output.write edges to disk.
         Args:
+        ----
             edges (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
         Returns:
+        -------
             bool: The return value. True for success, False otherwise.
         """
         raise NotImplementedError("Writer implementation must override 'write_edges'")
     @abstractmethod
     def _construct_import_call(self) -> str:
-        """
-        Function to construct the import call detailing folder and
+        """Function to construct the import call detailing folder and
         individual node and edge headers and data files, as well as
         delimiters and database name. Built after all data has been
         processed to ensure that nodes are called before any edges.
-        Returns:
+        Returns
+        -------
             str: command for importing the output files into a DBMS.
         """
         raise NotImplementedError("Writer implementation must override '_construct_import_call'")
@@ -120,8 +129,10 @@ class _Writer(ABC):
     def _get_import_script_name(self) -> str:
         """Returns the name of the import script.
-        Returns:
+        Returns
+        -------
             str: The name of the import script (ending in .sh)
         """
         raise NotImplementedError("Writer implementation must override '_get_import_script_name'")
@@ -129,6 +140,7 @@ class _Writer(ABC):
         """Wrapper for writing nodes.
         Args:
+        ----
             nodes (BioCypherNode): a list or generator of nodes in
                 :py:class:`BioCypherNode` format
             batch_size (int): The batch size for writing nodes.
@@ -136,7 +148,9 @@ class _Writer(ABC):
                 not present in the schema.
         Returns:
+        -------
             bool: The return value. True for success, False otherwise.
         """
         passed = self._write_node_data(nodes)
         if not passed:
@@ -148,6 +162,7 @@ class _Writer(ABC):
         """Wrapper for writing edges.
         Args:
+        ----
             nodes (BioCypherNode): a list or generator of nodes in
                 :py:class:`BioCypherNode` format
             batch_size (int): The batch size for writing nodes.
@@ -155,7 +170,9 @@ class _Writer(ABC):
                 not present in the schema.
         Returns:
+        -------
             bool: The return value. True for success, False otherwise.
         """
         passed = self._write_edge_data(edges)
         if not passed:
@@ -164,13 +181,14 @@ class _Writer(ABC):
         return True
     def write_import_call(self):
-        """
-        Function to output.write the import call detailing folder and
+        """Function to output.write the import call detailing folder and
         individual node and edge headers and data files, as well as
         delimiters and database name, to the export folder as txt.
-        Returns:
+        Returns
+        -------
             str: The path of the file holding the import call.
         """
         file_path = os.path.join(self.output_directory, self._get_import_script_name())
         logger.info(f"Writing {self.__class__.__name__} import call to `{file_path}`.")

biocypher 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

Potentially problematic release.

biocypher 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl