biocypher 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
- biocypher/__init__.py +3 -13
- biocypher/_config/__init__.py +6 -23
- biocypher/_core.py +360 -262
- biocypher/_create.py +13 -27
- biocypher/_deduplicate.py +4 -11
- biocypher/_get.py +21 -60
- biocypher/_logger.py +4 -16
- biocypher/_mapping.py +4 -17
- biocypher/_metadata.py +3 -15
- biocypher/_misc.py +14 -28
- biocypher/_ontology.py +127 -212
- biocypher/_translate.py +34 -58
- biocypher/output/connect/_get_connector.py +40 -0
- biocypher/output/connect/_neo4j_driver.py +9 -65
- biocypher/output/in_memory/_get_in_memory_kg.py +34 -0
- biocypher/output/in_memory/_in_memory_kg.py +40 -0
- biocypher/output/in_memory/_networkx.py +44 -0
- biocypher/output/in_memory/_pandas.py +20 -15
- biocypher/output/write/_batch_writer.py +137 -172
- biocypher/output/write/_get_writer.py +11 -24
- biocypher/output/write/_writer.py +14 -33
- biocypher/output/write/graph/_arangodb.py +7 -24
- biocypher/output/write/graph/_neo4j.py +59 -57
- biocypher/output/write/graph/_networkx.py +36 -43
- biocypher/output/write/graph/_rdf.py +114 -95
- biocypher/output/write/relational/_csv.py +6 -11
- biocypher/output/write/relational/_postgresql.py +12 -13
- biocypher/output/write/relational/_sqlite.py +3 -1
- {biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/LICENSE +1 -1
- {biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/METADATA +3 -3
- biocypher-0.7.0.dist-info/RECORD +43 -0
- {biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/WHEEL +1 -1
- biocypher-0.6.1.dist-info/RECORD +0 -39
biocypher/output/write/_get_writer.py

@@ -1,38 +1,27 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2021, Heidelberg University Clinic
-#
-# File author(s): Sebastian Lobentanzer
-#                 Michael Hartung
-#
-# Distributed under MIT licence, see the file `LICENSE`.
-#
 """
 BioCypher 'offline' module. Handles the writing of node and edge representations
 suitable for import into a DBMS.
 """
 
+from typing import TYPE_CHECKING
+
+from biocypher._config import config as _config
 from biocypher._logger import logger
-from biocypher.output.write.graph._rdf import _RDFWriter
-from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
 from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
+from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
 from biocypher.output.write.graph._networkx import _NetworkXWriter
+from biocypher.output.write.graph._rdf import _RDFWriter
 from biocypher.output.write.relational._csv import _PandasCSVWriter
-from biocypher.output.write.relational._sqlite import _SQLiteBatchWriter
 from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
+from biocypher.output.write.relational._sqlite import _SQLiteBatchWriter
 
 logger.debug(f"Loading module {__name__}.")
 
-from typing import TYPE_CHECKING
-
-from biocypher._config import config as _config
-
 __all__ = ["get_writer", "DBMS_TO_CLASS"]
 
 if TYPE_CHECKING:
-    from biocypher._translate import Translator
     from biocypher._deduplicate import Deduplicator
+    from biocypher._translate import Translator
 
 DBMS_TO_CLASS = {
     "neo": _Neo4jBatchWriter,
@@ -52,6 +41,8 @@ DBMS_TO_CLASS = {
     "CSV": _PandasCSVWriter,
     "pandas": _PandasCSVWriter,
     "Pandas": _PandasCSVWriter,
+    "tabular": _PandasCSVWriter,
+    "Tabular": _PandasCSVWriter,
     "networkx": _NetworkXWriter,
     "NetworkX": _NetworkXWriter,
 }
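For context, the two new keys are pure aliases. A quick sketch (reading the module's own `DBMS_TO_CLASS` dict, a private module, so for illustration only) shows they resolve to the same writer class as the existing spellings:

from biocypher.output.write._get_writer import DBMS_TO_CLASS

# "tabular"/"Tabular" map to the Pandas CSV writer, just like "CSV"/"pandas".
assert DBMS_TO_CLASS["tabular"] is DBMS_TO_CLASS["CSV"]
assert DBMS_TO_CLASS["Tabular"] is DBMS_TO_CLASS["pandas"]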
@@ -99,12 +90,8 @@ def get_writer(
         import_call_file_prefix=dbms_config.get("import_call_file_prefix"),
         wipe=dbms_config.get("wipe"),
         strict_mode=strict_mode,
-        skip_bad_relationships=dbms_config.get(
-            "skip_bad_relationships"
-        ),  # neo4j
-        skip_duplicate_nodes=dbms_config.get(
-            "skip_duplicate_nodes"
-        ),  # neo4j
+        skip_bad_relationships=dbms_config.get("skip_bad_relationships"),  # neo4j
+        skip_duplicate_nodes=dbms_config.get("skip_duplicate_nodes"),  # neo4j
         db_user=dbms_config.get("user"),  # psql
         db_password=dbms_config.get("password"),  # psql
         db_port=dbms_config.get("port"),  # psql
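The keys consumed here come from the per-DBMS configuration block. A hypothetical `dbms_config` fragment (values are illustrative placeholders, not documented defaults) covering the options this hunk forwards:

dbms_config = {
    "import_call_file_prefix": "/data/",  # hypothetical path
    "wipe": True,
    "skip_bad_relationships": True,  # neo4j
    "skip_duplicate_nodes": True,  # neo4j
    "user": "postgres",  # psql
    "password": "secret",  # psql, placeholder
    "port": 5432,  # psql
}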
biocypher/output/write/_writer.py

@@ -1,12 +1,13 @@
+import os
+
 from abc import ABC, abstractmethod
-from typing import Union, Optional
 from collections.abc import Iterable
-import os
+from typing import Optional, Union
 
 from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
+from biocypher._deduplicate import Deduplicator
 from biocypher._logger import logger
 from biocypher._translate import Translator
-from biocypher._deduplicate import Deduplicator
 
 __all__ = ["_Writer"]
 
@@ -75,9 +76,7 @@ class _Writer(ABC):
     @abstractmethod
     def _write_node_data(
         self,
-        nodes: Iterable[
-            Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
-        ],
+        nodes: Iterable[Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]],
     ) -> bool:
         """Implement how to output.write nodes to disk.
 
@@ -87,16 +86,12 @@ class _Writer(ABC):
         Returns:
             bool: The return value. True for success, False otherwise.
         """
-        raise NotImplementedError(
-            "Writer implementation must override 'write_nodes'"
-        )
+        raise NotImplementedError("Writer implementation must override 'write_nodes'")
 
     @abstractmethod
     def _write_edge_data(
         self,
-        edges: Iterable[
-            Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
-        ],
+        edges: Iterable[Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]],
     ) -> bool:
         """Implement how to output.write edges to disk.
 
@@ -106,9 +101,7 @@ class _Writer(ABC):
         Returns:
             bool: The return value. True for success, False otherwise.
         """
-        raise NotImplementedError(
-            "Writer implementation must override 'write_edges'"
-        )
+        raise NotImplementedError("Writer implementation must override 'write_edges'")
 
     @abstractmethod
     def _construct_import_call(self) -> str:
@@ -121,9 +114,7 @@ class _Writer(ABC):
         Returns:
             str: command for importing the output files into a DBMS.
         """
-        raise NotImplementedError(
-            "Writer implementation must override '_construct_import_call'"
-        )
+        raise NotImplementedError("Writer implementation must override '_construct_import_call'")
 
     @abstractmethod
     def _get_import_script_name(self) -> str:
@@ -132,13 +123,9 @@ class _Writer(ABC):
         Returns:
             str: The name of the import script (ending in .sh)
         """
-        raise NotImplementedError(
-            "Writer implementation must override '_get_import_script_name'"
-        )
+        raise NotImplementedError("Writer implementation must override '_get_import_script_name'")
 
-    def write_nodes(
-        self, nodes, batch_size: int = int(1e6), force: bool = False
-    ):
+    def write_nodes(self, nodes, batch_size: int = int(1e6), force: bool = False):
         """Wrapper for writing nodes.
 
         Args:
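Taken together, these hunks expose _Writer's contract plainly: four abstract methods that every backend must override, with the public write_nodes/write_edges wrappers batching on top. A self-contained sketch of that contract (a local stand-in mirroring the abstract surface shown above, not the real base class, whose constructor arguments are out of scope here):

from abc import ABC, abstractmethod


class MiniWriter(ABC):
    # Local mirror of the four abstract methods of _Writer.
    @abstractmethod
    def _write_node_data(self, nodes) -> bool: ...

    @abstractmethod
    def _write_edge_data(self, edges) -> bool: ...

    @abstractmethod
    def _construct_import_call(self) -> str: ...

    @abstractmethod
    def _get_import_script_name(self) -> str: ...


class EchoWriter(MiniWriter):
    # Toy subclass: satisfies the contract by printing instead of writing.
    def _write_node_data(self, nodes) -> bool:
        print(f"nodes: {list(nodes)}")
        return True

    def _write_edge_data(self, edges) -> bool:
        print(f"edges: {list(edges)}")
        return True

    def _construct_import_call(self) -> str:
        return "echo 'nothing to import'"

    def _get_import_script_name(self) -> str:
        return "echo-import-call.sh"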
@@ -157,9 +144,7 @@ class _Writer(ABC):
             return False
         return True
 
-    def write_edges(
-        self, edges, batch_size: int = int(1e6), force: bool = False
-    ):
+    def write_edges(self, edges, batch_size: int = int(1e6), force: bool = False):
         """Wrapper for writing edges.
 
         Args:
@@ -187,12 +172,8 @@ class _Writer(ABC):
         Returns:
             str: The path of the file holding the import call.
         """
-        file_path = os.path.join(
-            self.output_directory, self._get_import_script_name()
-        )
-        logger.info(
-            f"Writing {self.__class__.__name__} import call to `{file_path}`."
-        )
+        file_path = os.path.join(self.output_directory, self._get_import_script_name())
+        logger.info(f"Writing {self.__class__.__name__} import call to `{file_path}`.")
 
         with open(file_path, "w", encoding="utf-8") as f:
             f.write(self._construct_import_call())
biocypher/output/write/graph/_arangodb.py

@@ -61,9 +61,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
 
         # check if file already exists
         if os.path.exists(header_path):
-            logger.warning(
-                f"File {header_path} already exists. Overwriting."
-            )
+            logger.warning(f"File {header_path} already exists. Overwriting.")
 
         # concatenate key:value in props
         props_list = []
@@ -81,9 +79,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
                 f.write(row)
 
             # add collection from schema config
-            collection = self.translator.ontology.mapping.extended_schema[
-                label
-            ].get("db_collection_name", None)
+            collection = self.translator.ontology.mapping.extended_schema[label].get("db_collection_name", None)
 
             # add file path to neo4 admin import statement
             # do once for each part file
@@ -91,8 +87,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
 
             if not parts:
                 raise ValueError(
-                    f"No parts found for node label {label}. "
-                    f"Check that the data was parsed first.",
+                    f"No parts found for node label {label}. " f"Check that the data was parsed first.",
                 )
 
             for part in parts:
@@ -145,9 +140,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
 
         # check for file exists
        if os.path.exists(header_path):
-            logger.warning(
-                f"Header file {header_path} already exists. Overwriting."
-            )
+            logger.warning(f"Header file {header_path} already exists. Overwriting.")
 
         # concatenate key:value in props
         props_list = []
@@ -172,9 +165,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
                 break
 
         else:
-            collection = self.translator.ontology.mapping.extended_schema[
-                label
-            ].get("db_collection_name", None)
+            collection = self.translator.ontology.mapping.extended_schema[label].get("db_collection_name", None)
 
         # add file path to neo4 admin import statement (import call path
         # may be different from actual output path)
@@ -206,11 +197,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
         Returns:
             str: a bash command for neo4j-admin import
         """
-        import_call = (
-            f"{self.import_call_bin_prefix}arangoimp "
-            f"--type csv "
-            f'--separator="{self.escaped_delim}" '
-        )
+        import_call = f"{self.import_call_bin_prefix}arangoimp " f"--type csv " f'--separator="{self.escaped_delim}" '
 
         if self.quote == "'":
             import_call += f'--quote="{self.quote}" '
@@ -221,11 +208,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
 
         # node import calls: one line per node type
         for header_path, parts_path, collection in self.import_call_nodes:
-            line = (
-                f"{import_call} "
-                f"--headers-file {header_path} "
-                f"--file= {parts_path} "
-            )
+            line = f"{import_call} --headers-file {header_path} --file= {parts_path} "
 
             if collection:
                 line += f"--create-collection --collection {collection} "
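Putting the collapsed f-strings together, the assembled arangoimp line looks roughly like this (paths and collection name are hypothetical; the space after `--file=` is present in the source as shown above):

import_call_bin_prefix = ""  # assumption: empty prefix
escaped_delim = ","
import_call = f"{import_call_bin_prefix}arangoimp " f"--type csv " f'--separator="{escaped_delim}" '

header_path = "Protein-header.csv"  # hypothetical
parts_path = "Protein-part000.csv"  # hypothetical
collection = "proteins"  # hypothetical

line = f"{import_call} --headers-file {header_path} --file= {parts_path} "
if collection:
    line += f"--create-collection --collection {collection} "
print(line)
# arangoimp --type csv --separator=","  --headers-file Protein-header.csv --file= Protein-part000.csv --create-collection --collection proteins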
biocypher/output/write/graph/_neo4j.py

@@ -1,12 +1,11 @@
 import os
 
 from biocypher._logger import logger
-from biocypher.output.write._batch_writer import parse_label, _BatchWriter
+from biocypher.output.write._batch_writer import _BatchWriter, parse_label
 
 
 class _Neo4jBatchWriter(_BatchWriter):
-    """
-    Class for writing node and edge representations to disk using the
+    """Class for writing node and edge representations to disk using the
     format specified by Neo4j for the use of admin import. Each batch
     writer instance has a fixed representation that needs to be passed
     at instantiation via the :py:attr:`schema` argument. The instance
@@ -23,50 +22,60 @@ class _Neo4jBatchWriter(_BatchWriter):
     """
 
     def __init__(self, *args, **kwargs):
-        """
-        Constructor.
+        """Constructor.
 
         Check the version of Neo4j and adds a command scope if version >= 5.
 
-        Returns
+        Returns
+        -------
             _Neo4jBatchWriter: An instance of the writer.
-        """
 
+        """
         # Should read the configuration and setup import_call_bin_prefix.
         super().__init__(*args, **kwargs)
 
     def _get_default_import_call_bin_prefix(self):
-        """
-        Method to provide the default string for the import call bin prefix.
+        """Method to provide the default string for the import call bin prefix.
 
-        Returns
+        Returns
+        -------
             str: The default location for the neo4j admin import location
-        """
 
+        """
         return "bin/"
 
-    def _write_array_string(self, string_list):
+    def _quote_string(self, value: str) -> str:
+        """
+        Quote a string. Quote character is escaped by doubling it.
         """
-        Abstract method to output.write the string representation of an array into a .csv file
+
+        return f"{self.quote}{value.replace(self.quote, self.quote * 2)}{self.quote}"
+
+    def _write_array_string(self, string_list):
+        """Abstract method to output.write the string representation of an array into a .csv file
         as required by the neo4j admin-import.
 
         Args:
+        ----
             string_list (list): list of ontology strings
 
         Returns:
+        -------
             str: The string representation of an array for the neo4j admin import
+
         """
         string = self.adelim.join(string_list)
-        return f"{self.quote}{string}{self.quote}"
+        return self._quote_string(string)
 
     def _write_node_headers(self):
-        """
-        Writes single CSV file for a graph entity that is represented
+        """Writes single CSV file for a graph entity that is represented
         as a node as per the definition in the `schema_config.yaml`,
         containing only the header for this type of node.
 
-        Returns
+        Returns
+        -------
             bool: The return value. True for success, False otherwise.
+
         """
         # load headers from data parse
         if not self.node_property_dict:
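The new `_quote_string` centralizes the CSV quoting rule: the configured quote character is escaped by doubling it, as in RFC 4180-style CSV. A standalone sketch of that rule, detached from the writer class:

def quote_string(value: str, quote: str = '"') -> str:
    # Escape the quote character by doubling it, then wrap the value.
    return f"{quote}{value.replace(quote, quote * 2)}{quote}"


assert quote_string('say "hi"') == '"say ""hi"""'
assert quote_string("plain") == '"plain"'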
@@ -79,9 +88,7 @@ class _Neo4jBatchWriter(_BatchWriter):
             _id = ":ID"
 
             # translate label to PascalCase
-            pascal_label = self.translator.name_sentence_to_pascal(
-                parse_label(label)
-            )
+            pascal_label = self.translator.name_sentence_to_pascal(parse_label(label))
 
             header = f"{pascal_label}-header.csv"
             header_path = os.path.join(
@@ -136,20 +143,19 @@ class _Neo4jBatchWriter(_BatchWriter):
                 self.import_call_file_prefix,
                 parts,
             )
-            self.import_call_nodes.add(
-                (import_call_header_path, import_call_parts_path)
-            )
+            self.import_call_nodes.add((import_call_header_path, import_call_parts_path))
 
         return True
 
     def _write_edge_headers(self):
-        """
-        Writes single CSV file for a graph entity that is represented
+        """Writes single CSV file for a graph entity that is represented
         as an edge as per the definition in the `schema_config.yaml`,
         containing only the header for this type of edge.
 
-        Returns
+        Returns
+        -------
             bool: The return value. True for success, False otherwise.
+
         """
         # load headers from data parse
         if not self.edge_property_dict:
@@ -160,9 +166,7 @@ class _Neo4jBatchWriter(_BatchWriter):
 
         for label, props in self.edge_property_dict.items():
             # translate label to PascalCase
-            pascal_label = self.translator.name_sentence_to_pascal(
-                parse_label(label)
-            )
+            pascal_label = self.translator.name_sentence_to_pascal(parse_label(label))
 
             # paths
             header = f"{pascal_label}-header.csv"
@@ -174,9 +178,7 @@ class _Neo4jBatchWriter(_BatchWriter):
 
             # check for file exists
             if os.path.exists(header_path):
-                logger.warning(
-                    f"File {header_path} already exists. Overwriting."
-                )
+                logger.warning(f"File {header_path} already exists. Overwriting.")
 
             # concatenate key:value in props
             props_list = []
@@ -206,9 +208,7 @@ class _Neo4jBatchWriter(_BatchWriter):
 
             if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
                 skip_id = True
-            elif not self.translator.ontology.mapping.extended_schema.get(
-                label
-            ):
+            elif not self.translator.ontology.mapping.extended_schema.get(label):
                 # find label in schema by label_as_edge
                 for (
                     k,
@@ -224,10 +224,10 @@ class _Neo4jBatchWriter(_BatchWriter):
 
                 if schema_label:
                     if (
-                        self.translator.ontology.mapping.extended_schema.get(
-                            schema_label
+                        self.translator.ontology.mapping.extended_schema.get(  # (seems to not work with 'not')
+                            schema_label,
                         ).get("use_id")
-                        == False
+                        == False  # noqa: E712 (seems to not work with 'not')
                     ):
                         skip_id = True
 
@@ -252,54 +252,56 @@ class _Neo4jBatchWriter(_BatchWriter):
                 self.import_call_file_prefix,
                 parts,
             )
-            self.import_call_edges.add(
-                (import_call_header_path, import_call_parts_path)
-            )
+            self.import_call_edges.add((import_call_header_path, import_call_parts_path))
 
         return True
 
     def _get_import_script_name(self) -> str:
-        """
-        Returns the name of the neo4j admin import script
+        """Returns the name of the neo4j admin import script
 
-        Returns
+        Returns
+        -------
             str: The name of the import script (ending in .sh)
+
         """
         return "neo4j-admin-import-call.sh"
 
     def _construct_import_call(self) -> str:
-        """
-        Function to construct the import call detailing folder and
+        """Function to construct the import call detailing folder and
         individual node and edge headers and data files, as well as
         delimiters and database name. Built after all data has been
         processed to ensure that nodes are called before any edges.
 
-        Returns
+        Returns
+        -------
             str: a bash command for neo4j-admin import
+
         """
-        import_call_neo4j_v4 = self._get_import_call(
-            "import", "--database=", "--force="
-        )
-        import_call_neo4j_v5 = self._get_import_call(
-            "database import full", "", "--overwrite-destination="
+        import_call_neo4j_v4 = self._get_import_call("import", "--database=", "--force=")
+        import_call_neo4j_v5 = self._get_import_call("database import full", "", "--overwrite-destination=")
+        neo4j_version_check = (
+            f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"
         )
-        neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"
 
-        import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
+        import_script = (
+            f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; "
+            f"then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
+        )
         return import_script
 
-    def _get_import_call(
-        self, import_cmd: str, database_cmd: str, wipe_cmd: str
-    ) -> str:
+    def _get_import_call(self, import_cmd: str, database_cmd: str, wipe_cmd: str) -> str:
         """Get parametrized import call for Neo4j 4 or 5+.
 
         Args:
+        ----
             import_cmd (str): The import command to use.
             database_cmd (str): The database command to use.
             wipe_cmd (str): The wipe command to use.
 
         Returns:
+        -------
             str: The import call.
+
         """
         import_call = f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "
 
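The generated import script branches on the installed Neo4j major version at run time. A sketch of what `_construct_import_call` emits, with placeholder strings standing in for the output of `_get_import_call`:

bin_prefix = "bin/"  # default from _get_default_import_call_bin_prefix
v4_call = "bin/neo4j-admin import ..."  # placeholder v4 call
v5_call = "bin/neo4j-admin database import full ..."  # placeholder v5 call
version_check = f"version=$({bin_prefix}neo4j-admin --version | cut -d '.' -f 1)"

script = (
    f"#!/bin/bash\n{version_check}\nif [[ $version -ge 5 ]]; "
    f"then\n\t{v5_call}\nelse\n\t{v4_call}\nfi"
)
print(script)
# #!/bin/bash
# version=$(bin/neo4j-admin --version | cut -d '.' -f 1)
# if [[ $version -ge 5 ]]; then
#     bin/neo4j-admin database import full ...   (tab-indented in the real output)
# else
#     bin/neo4j-admin import ...
# fi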
biocypher/output/write/graph/_networkx.py

@@ -1,31 +1,34 @@
 import pickle
 
-import networkx as nx
-
 from biocypher._logger import logger
+from biocypher.output.in_memory._networkx import NetworkxKG
 from biocypher.output.write._writer import _Writer
-from biocypher.output.write.relational._csv import _PandasCSVWriter
 
 
 class _NetworkXWriter(_Writer):
     """
-    Class for writing
+    Class for writing the in-memory networkx DiGraph to file.
+
+    Call `_construct_import_call` to write the networkx DiGraph to a pickle
+    file and return the Python call to load it.
+
+    TODO: this is a non-intuitive name, should be adjusted.
     """
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.
-
+        self.in_memory_networkx_kg = NetworkxKG(
+            deduplicator=self.deduplicator,
+        )
 
     def _construct_import_call(self) -> str:
-        """
+        """Dump networkx graph to a pickle file and return Python call.
 
         Returns:
-            str: Python code to load the
+            str: Python code to load the networkx graph from a pickle file.
         """
-        logger.info(
-            f"Writing networkx {self.G} to pickle file networkx_graph.pkl."
-        )
+        self.G = self.in_memory_networkx_kg._create_networkx_kg()
+        logger.info(f"Writing networkx {self.G} to pickle file networkx_graph.pkl.")
         with open(f"{self.output_directory}/networkx_graph.pkl", "wb") as f:
             pickle.dump(self.G, f)
 
@@ -38,39 +41,29 @@ class _NetworkXWriter(_Writer):
         return "import_networkx.py"
 
     def _write_node_data(self, nodes) -> bool:
-
-
+        """Add nodes to the networkx graph.
+
+        TODO: this is not strictly writing, should be refactored.
+
+        Args:
+            nodes (list): List of nodes to add to the networkx graph.
+
+        Returns:
+            bool: True if the nodes were added successfully, False otherwise.
+        """
+        passed = self.in_memory_networkx_kg.add_nodes(nodes)
         return passed
 
     def _write_edge_data(self, edges) -> bool:
-
-
-
+        """Add edges to the networkx graph.
+
+        TODO: this is not strictly writing, should be refactored.
 
-
-
-
-
-
-
-
-
-            df
-            for df in all_dfs.values()
-            if df.columns.str.contains("source_id").any()
-            and df.columns.str.contains("target_id").any()
-        ]
-        for df in node_dfs:
-            nodes = df.set_index("node_id").to_dict(orient="index")
-            self.G.add_nodes_from(nodes.items())
-        for df in edge_dfs:
-            edges = df.set_index(["source_id", "target_id"]).to_dict(
-                orient="index"
-            )
-            self.G.add_edges_from(
-                (
-                    (source, target, attrs)
-                    for (source, target), attrs in edges.items()
-                )
-            )
-        return True
+        Args:
+            edges (list): List of edges to add to the networkx graph.
+
+        Returns:
+            bool: True if the edges were added successfully, False otherwise.
+        """
+        passed = self.in_memory_networkx_kg.add_edges(edges)
+        return passed
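On the consuming side, the pickle this writer produces can be loaded back as sketched below. The file name mirrors the `networkx_graph.pkl` used in `_construct_import_call` above; the exact contents of the generated `import_networkx.py` script are not shown in this hunk, so this is an assumption about its shape:

import pickle

with open("networkx_graph.pkl", "rb") as f:
    G = pickle.load(f)  # a networkx DiGraph assembled by NetworkxKG

print(G)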