PyPI - biocypher - Versions diffs - 0.5.40__py3-none-any.whl → 0.5.42__py3-none-any.whl - Mend

biocypher: biocypher-0.5.40-py3-none-any.whl → biocypher-0.5.42-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biocypher might be problematic. Click here for more details.

Files changed (13)

biocypher/_config/biocypher_config.yaml +7 -0
biocypher/_connect.py +36 -9
biocypher/_metadata.py +1 -1
biocypher/_misc.py +12 -3
biocypher/_ontology.py +133 -53
biocypher/write/_batch_writer.py +11 -0
biocypher/write/_write.py +6 -1
biocypher/write/graph/_neo4j.py +44 -3
biocypher/write/graph/_rdf.py +516 -0
{biocypher-0.5.40.dist-info → biocypher-0.5.42.dist-info}/METADATA +1 -1
{biocypher-0.5.40.dist-info → biocypher-0.5.42.dist-info}/RECORD +13 -12
{biocypher-0.5.40.dist-info → biocypher-0.5.42.dist-info}/LICENSE +0 -0
{biocypher-0.5.40.dist-info → biocypher-0.5.42.dist-info}/WHEEL +0 -0

biocypher/_config/biocypher_config.yaml CHANGED Viewed

@@ -27,6 +27,7 @@ biocypher:
   head_ontology:
     url: https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
     root_node: entity
+    # switch_label_and_id: true
   ### Optional parameters ###
@@ -53,10 +54,12 @@ biocypher:
   #     url: test/ontologies/so.owl
   #     head_join_node: sequence variant
   #     tail_join_node: sequence_variant
+  #     switch_label_and_id: true
   #   mondo:
   #     url: test/ontologies/mondo.owl
   #     head_join_node: disease
   #     tail_join_node: disease
+  #     switch_label_and_id: true
 ### DBMS configuration ###
@@ -113,6 +116,10 @@ postgresql:
   # import_call_bin_prefix: '' # path to "psql"
   # import_call_file_prefix: '/path/to/files'
+rdf:
+  ### RDF configuration ###
+  rdf_format: turtle
 sqlite:
   ### SQLite configuration ###

biocypher/_connect.py CHANGED Viewed

@@ -11,11 +11,12 @@
 """
 BioCypher 'online' mode. Handles connection and manipulation of a running DBMS.
 """
+import subprocess
 from ._logger import logger
 logger.debug(f"Loading module {__name__}.")
-from typing import Optional
 from collections.abc import Iterable
 import itertools
@@ -24,7 +25,6 @@ import neo4j_utils
 from . import _misc
 from ._config import config as _config
 from ._create import BioCypherEdge, BioCypherNode
-from ._ontology import Ontology
 from ._translate import Translator
 __all__ = ["_Neo4jDriver"]
@@ -137,16 +137,43 @@ class _Neo4jDriver:
         logger.info("Creating constraints for node types in config.")
+        major_neo4j_version = int(self._get_neo4j_version().split(".")[0])
         # get structure
         for leaf in self.translator.ontology.mapping.extended_schema.items():
-            label = _misc.sentencecase_to_pascalcase(leaf[0])
+            label = _misc.sentencecase_to_pascalcase(leaf[0], sep=r"\s\.")
             if leaf[1]["represented_as"] == "node":
-                s = (
-                    f"CREATE CONSTRAINT `{label}_id` "
-                    f"IF NOT EXISTS ON (n:`{label}`) "
-                    "ASSERT n.id IS UNIQUE"
-                )
-                self._driver.query(s)
+                if major_neo4j_version >= 5:
+                    s = (
+                        f"CREATE CONSTRAINT `{label}_id` "
+                        f"IF NOT EXISTS FOR (n:`{label}`) "
+                        "REQUIRE n.id IS UNIQUE"
+                    )
+                    self._driver.query(s)
+                else:
+                    s = (
+                        f"CREATE CONSTRAINT `{label}_id` "
+                        f"IF NOT EXISTS ON (n:`{label}`) "
+                        "ASSERT n.id IS UNIQUE"
+                    )
+                    self._driver.query(s)
+    def _get_neo4j_version(self):
+        """Get neo4j version."""
+        try:
+            neo4j_version = self._driver.query(
+                """
+                    CALL dbms.components()
+                    YIELD name, versions, edition
+                    UNWIND versions AS version
+                    RETURN version AS version
+                """,
+            )[0][0]["version"]
+            return neo4j_version
+        except Exception as e:
+            logger.warning(
+                f"Error detecting Neo4j version: {e} use default version 4.0.0."
+            )
+            return "4.0.0"
     def add_nodes(self, id_type_tuples: Iterable[tuple]) -> tuple:
         """

biocypher/_metadata.py CHANGED Viewed

@@ -19,7 +19,7 @@ import importlib.metadata
 import toml
-_VERSION = "0.5.40"
+_VERSION = "0.5.42"
 def get_metadata():

biocypher/_misc.py CHANGED Viewed

@@ -115,7 +115,12 @@ def _get_inheritance_tree(inheritance_graph: Union[dict, nx.Graph]) -> dict:
         )
         if multiple_parents_present:
             logger.warning(
-                "The ontology contains multiple inheritance (one child node has multiple parent nodes). This is not visualized in the following hierarchy tree (the child node is only added once). If you want to browse all relationships of the parsed ontology write a graphml file to disk and view this file."
+                "The ontology contains multiple inheritance (one child node "
+                "has multiple parent nodes). This is not visualized in the "
+                "following hierarchy tree (the child node is only added once). "
+                "If you wish to browse all relationships of the parsed "
+                "ontologies, write a graphml file to disk using "
+                "`to_disk = <directory>` and view this file."
             )
         # unlist values
@@ -205,7 +210,7 @@ def sentencecase_to_snakecase(s: str) -> str:
     return stringcase.snakecase(s).lower()
-def sentencecase_to_pascalcase(s: str) -> str:
+def sentencecase_to_pascalcase(s: str, sep: str = r"\s") -> str:
     """
     Convert sentence case to PascalCase.
@@ -215,7 +220,11 @@ def sentencecase_to_pascalcase(s: str) -> str:
     Returns:
         string in PascalCase form
     """
-    return re.sub(r"(?:^| )([a-zA-Z])", lambda match: match.group(1).upper(), s)
+    return re.sub(
+        r"(?:^|[" + sep + "])([a-zA-Z])",
+        lambda match: match.group(1).upper(),
+        s,
+    )
 def to_lower_sentence_case(s: str) -> str:

biocypher/_ontology.py CHANGED Viewed

@@ -43,19 +43,19 @@ class OntologyAdapter:
     ontology is represented by a networkx.DiGraph object; an RDFlib graph is
     also kept. By default, the DiGraph reverses the label and identifier of the
     nodes, such that the node name in the graph is the human-readable label. The
-    edges are oriented from child to parent. Going from the Biolink example,
-    labels are formatted in lower sentence case. In some cases, this means that
-    we replace underscores with spaces.
+    edges are oriented from child to parent.
+    Labels are formatted in lower sentence case and underscores are replaced by spaces.
+    Identifiers are taken as defined and the prefixes are removed by default.
     """
     def __init__(
         self,
         ontology_file: str,
         root_label: str,
-        format: Optional[str] = None,
-        head_join_node: Optional[str] = None,
+        ontology_file_format: Optional[str] = None,
+        head_join_node_label: Optional[str] = None,
         merge_nodes: Optional[bool] = True,
-        reverse_labels: bool = True,
+        switch_label_and_id: bool = True,
         remove_prefixes: bool = True,
     ):
         """
@@ -68,7 +68,10 @@ class OntologyAdapter:
             root_label (str): The label of the root node in the ontology. In
                 case of a tail ontology, this is the tail join node.
-            head_join_node (str): Optional variable to store the label of the
+            ontology_file_format (str): The format of the ontology file (e.g. "application/rdf+xml").
+                If the format is not passed, it is determined automatically.
+            head_join_node_label (str): Optional variable to store the label of the
                 node in the head ontology that should be used to join to the
                 root node of the tail ontology. Defaults to None.
@@ -77,7 +80,7 @@ class OntologyAdapter:
                 tail join node will be attached as a child of the head join
                 node.
-            reverse_labels (bool): If True, the node names in the graph will be
+            switch_label_and_id (bool): If True, the node names in the graph will be
                 the human-readable labels. If False, the node names will be the
                 identifiers. Defaults to True.
@@ -89,33 +92,37 @@ class OntologyAdapter:
         self._ontology_file = ontology_file
         self._root_label = root_label
-        self._format = format
+        self._format = ontology_file_format
         self._merge_nodes = merge_nodes
-        self._head_join_node = head_join_node
-        self._reverse_labels = reverse_labels
+        self._head_join_node = head_join_node_label
+        self._switch_label_and_id = switch_label_and_id
         self._remove_prefixes = remove_prefixes
         self._rdf_graph = self._load_rdf_graph(ontology_file)
         self._nx_graph = self._rdf_to_nx(
-            self._rdf_graph, root_label, reverse_labels
+            self._rdf_graph, root_label, switch_label_and_id
         )
     def _rdf_to_nx(
-        self, _rdf_graph: rdflib.Graph, root_label: str, reverse_labels: bool
+        self,
+        _rdf_graph: rdflib.Graph,
+        root_label: str,
+        switch_label_and_id: bool,
+        rename_nodes: bool = True,
     ) -> nx.DiGraph:
         one_to_one_triples, one_to_many_dict = self._get_relevant_rdf_triples(
             _rdf_graph
         )
         nx_graph = self._convert_to_nx(one_to_one_triples, one_to_many_dict)
-        nx_graph_with_labels = self._add_labels_to_nodes(
-            nx_graph, reverse_labels
+        nx_graph = self._add_labels_to_nodes(nx_graph, switch_label_and_id)
+        nx_graph = self._change_nodes_to_biocypher_format(
+            nx_graph, switch_label_and_id, rename_nodes
         )
-        renamed_graph = self._rename_nodes(nx_graph_with_labels, reverse_labels)
-        filtered_graph = self._get_all_ancestors(
-            renamed_graph, root_label, reverse_labels
+        nx_graph = self._get_all_ancestors(
+            nx_graph, root_label, switch_label_and_id, rename_nodes
         )
-        return nx.DiGraph(filtered_graph)
+        return nx.DiGraph(nx_graph)
     def _get_relevant_rdf_triples(self, g: rdflib.Graph) -> tuple:
         one_to_one_inheritance_graph = self._get_one_to_one_inheritance_triples(
@@ -239,19 +246,21 @@ class OntologyAdapter:
         return nx_graph
     def _add_labels_to_nodes(
-        self, nx_graph: nx.DiGraph, reverse_labels: bool
+        self, nx_graph: nx.DiGraph, switch_label_and_id: bool
     ) -> nx.DiGraph:
         """Add labels to the nodes in the networkx graph.
         Args:
             nx_graph (nx.DiGraph): The networkx graph
-            reverse_labels (bool): If True, id and label are switched
+            switch_label_and_id (bool): If True, id and label are switched
         Returns:
             nx.DiGraph: The networkx graph with labels
         """
         for node in list(nx_graph.nodes):
-            nx_id, nx_label = self._get_nx_id_and_label(node, reverse_labels)
+            nx_id, nx_label = self._get_nx_id_and_label(
+                node, switch_label_and_id
+            )
             if nx_id == "none":
                 # remove node if it has no id
                 nx_graph.remove_node(node)
@@ -260,39 +269,56 @@ class OntologyAdapter:
             nx_graph.nodes[node]["label"] = nx_label
         return nx_graph
-    def _rename_nodes(
-        self, nx_graph: nx.DiGraph, reverse_labels: bool
+    def _change_nodes_to_biocypher_format(
+        self,
+        nx_graph: nx.DiGraph,
+        switch_label_and_id: bool,
+        rename_nodes: bool = True,
     ) -> nx.DiGraph:
-        """Rename the nodes in the networkx graph (remove prefix and switch id and label).
+        """Change the nodes in the networkx graph to BioCypher format:
+            - remove the prefix of the identifier
+            - switch id and label
+            - adapt the labels (replace _ with space and convert to lower sentence case)
         Args:
             nx_graph (nx.DiGraph): The networkx graph
-            reverse_labels (bool): If True, id and label are switched
+            switch_label_and_id (bool): If True, id and label are switched
+            rename_nodes (bool): If True, the nodes are renamed
         Returns:
-            nx.DiGraph: The renamed networkx graph
+            nx.DiGraph: The networkx ontology graph in BioCypher format
         """
         mapping = {
-            node: self._get_nx_id_and_label(node, reverse_labels)[0]
+            node: self._get_nx_id_and_label(
+                node, switch_label_and_id, rename_nodes
+            )[0]
             for node in nx_graph.nodes
         }
         renamed = nx.relabel_nodes(nx_graph, mapping, copy=False)
         return renamed
     def _get_all_ancestors(
-        self, renamed: nx.DiGraph, root_label: str, reverse_labels: bool
+        self,
+        renamed: nx.DiGraph,
+        root_label: str,
+        switch_label_and_id: bool,
+        rename_nodes: bool = True,
     ) -> nx.DiGraph:
         """Get all ancestors of the root node in the networkx graph.
         Args:
             renamed (nx.DiGraph): The renamed networkx graph
             root_label (str): The label of the root node in the ontology
+            switch_label_and_id (bool): If True, id and label are switched
+            rename_nodes (bool): If True, the nodes are renamed
         Returns:
             nx.DiGraph: The filtered networkx graph
         """
         root = self._get_nx_id_and_label(
-            self._find_root_label(self._rdf_graph, root_label), reverse_labels
+            self._find_root_label(self._rdf_graph, root_label),
+            switch_label_and_id,
+            rename_nodes,
         )[0]
         ancestors = nx.ancestors(renamed, root)
         ancestors.add(root)
@@ -300,7 +326,7 @@ class OntologyAdapter:
         return filtered_graph
     def _get_nx_id_and_label(
-        self, node, switch_id_and_label: bool
+        self, node, switch_id_and_label: bool, rename_nodes: bool = True
     ) -> tuple[str, str]:
         """Rename node id and label for nx graph.
@@ -312,10 +338,10 @@ class OntologyAdapter:
             tuple[str, str]: The renamed node id and label
         """
         node_id_str = self._remove_prefix(str(node))
-        node_label_str = str(
-            self._rdf_graph.value(node, rdflib.RDFS.label)
-        ).replace("_", " ")
-        node_label_str = to_lower_sentence_case(node_label_str)
+        node_label_str = str(self._rdf_graph.value(node, rdflib.RDFS.label))
+        if rename_nodes:
+            node_label_str = node_label_str.replace("_", " ")
+            node_label_str = to_lower_sentence_case(node_label_str)
         nx_id = node_label_str if switch_id_and_label else node_id_str
         nx_label = node_id_str if switch_id_and_label else node_label_str
         return nx_id, nx_label
@@ -330,8 +356,14 @@ class OntologyAdapter:
                 root = label_subject
                 break
         else:
+            labels_in_ontology = []
+            for label_subject, _, label_in_ontology in g.triples(
+                (None, rdflib.RDFS.label, None)
+            ):
+                labels_in_ontology.append(str(label_in_ontology))
             raise ValueError(
-                f"Could not find root node with label {root_label}"
+                f"Could not find root node with label '{root_label}'. "
+                f"The ontology contains the following labels: {labels_in_ontology}"
             )
         return root
@@ -398,11 +430,29 @@ class OntologyAdapter:
         """
         return self._rdf_graph
-    def get_root_label(self):
+    def get_root_node(self):
         """
-        Get the label of the root node in the ontology.
+        Get root node in the ontology.
+        Returns:
+            root_node: If _switch_label_and_id is True, the root node label is returned,
+                otherwise the root node id is returned.
         """
-        return self._root_label
+        root_node = None
+        root_label = self._root_label.replace("_", " ")
+        if self._switch_label_and_id:
+            root_node = to_lower_sentence_case(root_label)
+        elif not self._switch_label_and_id:
+            for node, data in self.get_nx_graph().nodes(data=True):
+                if "label" in data and data["label"] == to_lower_sentence_case(
+                    root_label
+                ):
+                    root_node = node
+                    break
+        return root_node
     def get_ancestors(self, node_label):
         """
@@ -465,8 +515,8 @@ class Ontology:
         if self._tail_ontologies:
             for adapter in self._tail_ontologies.values():
-                self._assert_join_node(adapter)
-                self._join_ontologies(adapter)
+                head_join_node = self._get_head_join_node(adapter)
+                self._join_ontologies(adapter, head_join_node)
         else:
             self._nx_graph = self._head_ontology.get_nx_graph()
@@ -489,7 +539,10 @@ class Ontology:
         self._head_ontology = OntologyAdapter(
             ontology_file=self._head_ontology_meta["url"],
             root_label=self._head_ontology_meta["root_node"],
-            format=self._head_ontology_meta.get("format", None),
+            ontology_file_format=self._head_ontology_meta.get("format", None),
+            switch_label_and_id=self._head_ontology_meta.get(
+                "switch_label_and_id", True
+            ),
         )
         if self._tail_ontology_meta:
@@ -498,12 +551,13 @@ class Ontology:
                 self._tail_ontologies[key] = OntologyAdapter(
                     ontology_file=value["url"],
                     root_label=value["tail_join_node"],
-                    head_join_node=value["head_join_node"],
-                    format=value.get("format", None),
+                    head_join_node_label=value["head_join_node"],
+                    ontology_file_format=value.get("format", None),
                     merge_nodes=value.get("merge_nodes", True),
+                    switch_label_and_id=value.get("switch_label_and_id", True),
                 )
-    def _assert_join_node(self, adapter: OntologyAdapter) -> None:
+    def _get_head_join_node(self, adapter: OntologyAdapter) -> str:
         """
         Tries to find the head join node of the given ontology adapter in the
         head ontology. If the join node is not found, the method will raise an
@@ -514,15 +568,41 @@ class Ontology:
                 join node in the head ontology.
         """
-        head_join_node = adapter.get_head_join_node()
+        head_join_node = None
+        user_defined_head_join_node_label = adapter.get_head_join_node()
+        head_join_node_label_in_bc_format = to_lower_sentence_case(
+            user_defined_head_join_node_label.replace("_", " ")
+        )
+        if self._head_ontology._switch_label_and_id:
+            head_join_node = head_join_node_label_in_bc_format
+        elif not self._head_ontology._switch_label_and_id:
+            for node_id, data in self._head_ontology.get_nx_graph().nodes(
+                data=True
+            ):
+                if (
+                    "label" in data
+                    and data["label"] == head_join_node_label_in_bc_format
+                ):
+                    head_join_node = node_id
+                    break
         if head_join_node not in self._head_ontology.get_nx_graph().nodes:
+            head_ontology = self._head_ontology._rdf_to_nx(
+                self._head_ontology.get_rdf_graph(),
+                self._head_ontology._root_label,
+                self._head_ontology._switch_label_and_id,
+                rename_nodes=False,
+            )
             raise ValueError(
-                f"Head join node {head_join_node} not found in "
-                f"head ontology."
+                f"Head join node '{head_join_node}' not found in head ontology. "
+                f"The head ontology contains the following nodes: {head_ontology.nodes}."
             )
+        return head_join_node
-    def _join_ontologies(self, adapter: OntologyAdapter) -> None:
+    def _join_ontologies(
+        self, adapter: OntologyAdapter, head_join_node
+    ) -> None:
         """
         Joins the ontologies by adding the tail ontology as a subgraph to the
         head ontology at the specified join nodes.
@@ -535,8 +615,7 @@ class Ontology:
         if not self._nx_graph:
             self._nx_graph = self._head_ontology.get_nx_graph().copy()
-        head_join_node = to_lower_sentence_case(adapter.get_head_join_node())
-        tail_join_node = to_lower_sentence_case(adapter.get_root_label())
+        tail_join_node = adapter.get_root_node()
         tail_ontology = adapter.get_nx_graph()
         # subtree of tail ontology at join node
@@ -695,8 +774,9 @@ class Ontology:
         Args:
             to_disk (str): If specified, the ontology structure will be saved
-                to disk as a GRAPHML file, to be opened in your favourite
-                graph visualisation tool.
+                to disk as a GRAPHML file at the location (directory) specified
+                by the `to_disk` string, to be opened in your favourite graph
+                visualisation tool.
             full (bool): If True, the full ontology structure will be shown,
                 including all nodes and edges. If False, only the nodes and

biocypher/write/_batch_writer.py CHANGED Viewed

@@ -6,6 +6,7 @@ import os
 import re
 import glob
+from rdflib import Graph
 from more_itertools import peekable
 from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
@@ -117,6 +118,8 @@ class _BatchWriter(ABC):
         db_password: str = None,
         db_host: str = None,
         db_port: str = None,
+        rdf_format: str = None,
+        rdf_namespaces: dict = {},
     ):
         """
@@ -196,12 +199,20 @@ class _BatchWriter(ABC):
             db_port:
                 The database port.
+            rdf_format:
+                The format of RDF.
+            rdf_namespaces:
+                The namespaces for RDF.
         """
         self.db_name = db_name
         self.db_user = db_user
         self.db_password = db_password
         self.db_host = db_host or "localhost"
         self.db_port = db_port
+        self.rdf_format = rdf_format
+        self.rdf_namespaces = rdf_namespaces
         self.delim, self.escaped_delim = self._process_delimiter(delimiter)
         self.adelim, self.escaped_adelim = self._process_delimiter(

biocypher/write/_write.py CHANGED Viewed

@@ -14,6 +14,7 @@ suitable for import into a DBMS.
 """
 from biocypher._logger import logger
+from biocypher.write.graph._rdf import _RDFWriter
 from biocypher.write.graph._neo4j import _Neo4jBatchWriter
 from biocypher.write.graph._arangodb import _ArangoDBBatchWriter
 from biocypher.write.relational._sqlite import _SQLiteBatchWriter
@@ -25,7 +26,7 @@ from typing import TYPE_CHECKING
 from biocypher._config import config as _config
-__all__ = ["get_writer"]
+__all__ = ["get_writer", "DBMS_TO_CLASS"]
 if TYPE_CHECKING:
     from biocypher._translate import Translator
@@ -43,6 +44,8 @@ DBMS_TO_CLASS = {
     "ArangoDB": _ArangoDBBatchWriter,
     "sqlite": _SQLiteBatchWriter,
     "sqlite3": _SQLiteBatchWriter,
+    "rdf": _RDFWriter,
+    "RDF": _RDFWriter,
 }
@@ -102,4 +105,6 @@ def get_writer(
             db_user=dbms_config.get("user"),  # psql
             db_password=dbms_config.get("password"),  # psql
             db_port=dbms_config.get("port"),  # psql
+            rdf_format=dbms_config.get("rdf_format"),  # rdf
+            rdf_namespaces=dbms_config.get("rdf_namespaces"),  # rdf
         )

biocypher/write/graph/_neo4j.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import os
+import re
+import subprocess
 from biocypher._logger import logger
 from biocypher.write._batch_writer import parse_label, _BatchWriter
@@ -22,6 +24,19 @@ class _Neo4jBatchWriter(_BatchWriter):
         - _write_array_string
     """
+    def __init__(self, *args, **kwargs):
+        """
+        Constructor.
+        Check the version of Neo4j and adds a command scope if version >= 5.
+        Returns:
+            _Neo4jBatchWriter: An instance of the writer.
+        """
+        # Should read the configuration and setup import_call_bin_prefix.
+        super().__init__(*args, **kwargs)
     def _get_default_import_call_bin_prefix(self):
         """
         Method to provide the default string for the import call bin prefix.
@@ -29,6 +44,7 @@ class _Neo4jBatchWriter(_BatchWriter):
         Returns:
             str: The default location for the neo4j admin import location
         """
         return "bin/"
     def _write_array_string(self, string_list):
@@ -263,9 +279,32 @@ class _Neo4jBatchWriter(_BatchWriter):
         Returns:
             str: a bash command for neo4j-admin import
         """
+        import_call_neo4j_v4 = self._get_import_call(
+            "import", "--database=", "--force="
+        )
+        import_call_neo4j_v5 = self._get_import_call(
+            "database import full", "", "--overwrite-destination="
+        )
+        neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"
+        import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
+        return import_script
+    def _get_import_call(
+        self, import_cmd: str, database_cmd: str, wipe_cmd: str
+    ) -> str:
+        """Get parametrized import call for Neo4j 4 or 5+.
+        Args:
+            import_cmd (str): The import command to use.
+            database_cmd (str): The database command to use.
+            wipe_cmd (str): The wipe command to use.
+        Returns:
+            str: The import call.
+        """
         import_call = (
-            f"{self.import_call_bin_prefix}neo4j-admin import "
-            f"--database={self.db_name} "
+            f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "
             f'--delimiter="{self.escaped_delim}" '
             f'--array-delimiter="{self.escaped_adelim}" '
         )
@@ -276,7 +315,7 @@ class _Neo4jBatchWriter(_BatchWriter):
             import_call += f"--quote='{self.quote}' "
         if self.wipe:
-            import_call += f"--force=true "
+            import_call += f"{wipe_cmd}true "
         if self.skip_bad_relationships:
             import_call += "--skip-bad-relationships=true "
         if self.skip_duplicate_nodes:
@@ -290,4 +329,6 @@ class _Neo4jBatchWriter(_BatchWriter):
         for header_path, parts_path in self.import_call_edges:
             import_call += f'--relationships="{header_path},{parts_path}" '
+        # Database needs to be at the end starting with Neo4j 5.0+.
+        import_call += f"{database_cmd}{self.db_name} "
         return import_call

biocypher/write/graph/_rdf.py ADDED Viewed

@@ -0,0 +1,516 @@
+#!/usr/bin/env python
+#
+# Copyright 2021, Heidelberg University Clinic
+#
+# File author(s):  Loes van den Biggelaar
+#                  Sebastian Lobentanzer
+#
+# Distributed under MIT licence, see the file `LICENSE`.
+#
+"""
+BioCypher 'offline' module. Handles the writing of node and edge representations
+as RDF files, serialised with rdflib.
+"""
+from types import GeneratorType
+from typing import Union
+import os
+from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
+from rdflib.namespace import (
+    _NAMESPACE_PREFIXES_CORE,
+    _NAMESPACE_PREFIXES_RDFLIB,
+)
+from biocypher._create import BioCypherEdge, BioCypherNode
+from biocypher._logger import logger
+from biocypher.write._batch_writer import _BatchWriter
+class _RDFWriter(_BatchWriter):
+    """
+    Class to write BioCypher's property graph into an RDF format using
+    rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
+    N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
+    is done keeping only the minimum information about nodes and edges,
+    skipping all properties.
+    """
+    def _get_import_script_name(self) -> str:
+        """
+        Provide the file name of the import script associated with the RDF
+        export.
+        Returns:
+            str: The script file name, ``rdf-import-call.sh``.
+        """
+        script_name = "rdf-import-call.sh"
+        return script_name
+    def _get_default_import_call_bin_prefix(self):
+        """
+        Provide the default prefix that is put in front of the binary in the
+        import call.
+        Returns:
+            str: The default prefix, ``bin/``.
+        """
+        default_prefix = "bin/"
+        return default_prefix
+    def _is_rdf_format_supported(self, rdf_format: str) -> bool:
+        """
+        Check whether the requested RDF serialisation format is supported and
+        derive the file extension that goes with it.
+        ``ttl`` is accepted as an alias of ``turtle``: the format handed to the
+        serialiser is normalised to ``turtle`` while the written files keep the
+        preferred ``.ttl`` extension.
+        Args:
+            rdf_format (str): The RDF format to check.
+        Returns:
+            bool: True if the format is supported (``self.extension`` is set
+                and ``self.rdf_format`` is normalised), False otherwise.
+        """
+        supported_formats = (
+            "xml",
+            "n3",
+            "turtle",
+            "ttl",
+            "nt",
+            "pretty-xml",
+            "trix",
+            "trig",
+            "nquads",
+            "json-ld",
+        )
+        if rdf_format not in supported_formats:
+            # build the hint from the tuple so it cannot drift from the check
+            format_hint = ", ".join(f'"{name}"' for name in supported_formats)
+            logger.error(
+                f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
+                f"{format_hint}",
+            )
+            return False
+        # The serialiser is always asked for 'turtle'; the preferred file
+        # extension for that format is always '.ttl'.
+        if self.rdf_format in ("turtle", "ttl"):
+            self.rdf_format = "turtle"
+            self.extension = "ttl"
+        else:
+            self.extension = self.rdf_format
+        return True
+    def _write_single_edge_list_to_file(
+        self,
+        edge_list: list,
+        label: str,
+        prop_dict: dict,
+    ):
+        """
+        Write one list of BioCypherEdges to a single RDF file in the
+        configured format.
+        Each edge is reified: a resource named after the edge id is typed with
+        the edge's PascalCase label and linked to its source and target through
+        the ``subject`` and ``object`` predicates of the biocypher namespace.
+        Edge properties are attached to that resource.
+        Args:
+            edge_list (list): list of BioCypherEdges to be written
+            label (str): the label (type) of the edge; determines the file name
+            prop_dict (dict): properties of the edge class passed from parsing
+                function and their types. Not used by this writer.
+        Returns:
+            bool: The return value. True for success, False otherwise.
+        """
+        if not all(isinstance(n, BioCypherEdge) for n in edge_list):
+            logger.error("Edges must be passed as type BioCypherEdge.")
+            return False
+        # translate label to PascalCase
+        label_pascal = self.translator.name_sentence_to_pascal(label)
+        # create file name
+        file_name = os.path.join(
+            self._outdir, f"{label_pascal}.{self.extension}"
+        )
+        # write data in graph
+        graph = Graph()
+        self._init_namespaces(graph)
+        # _init_namespaces always registers the "biocypher" namespace
+        biocypher_ns = self.rdf_namespaces["biocypher"]
+        for edge in edge_list:
+            rdf_subject = edge.get_source_id()
+            rdf_object = edge.get_target_id()
+            rdf_predicate = edge.get_id()
+            rdf_properties = edge.get_properties()
+            # edges without an id get one built from their two endpoints
+            if rdf_predicate is None:
+                rdf_predicate = rdf_subject + rdf_object
+            edge_label = self.translator.name_sentence_to_pascal(
+                edge.get_label()
+            )
+            edge_uri = biocypher_ns[edge_label]
+            relation_uri = biocypher_ns[rdf_predicate]
+            graph.add((edge_uri, RDF.type, RDFS.Class))
+            graph.add((relation_uri, RDF.type, edge_uri))
+            graph.add(
+                (
+                    relation_uri,
+                    biocypher_ns["subject"],
+                    self.subject_to_uri(rdf_subject),
+                )
+            )
+            graph.add(
+                (
+                    relation_uri,
+                    biocypher_ns["object"],
+                    self.subject_to_uri(rdf_object),
+                )
+            )
+            # add properties to the transformed edge --> node
+            for key, value in rdf_properties.items():
+                # only truthy values are written (None, empty strings and
+                # empty lists are skipped, and so are 0 and False)
+                if value:
+                    self.add_property_to_graph(graph, rdf_predicate, value, key)
+        graph.serialize(destination=file_name, format=self.rdf_format)
+        # report the real file name: files are written with self.extension,
+        # which differs from self.rdf_format for turtle ('.ttl')
+        logger.info(
+            f"Writing {len(edge_list)} entries to {label_pascal}.{self.extension}",
+        )
+        return True
+    def add_property_to_graph(
+        self,
+        graph: Graph,
+        rdf_subject: str,
+        rdf_object: str,
+        rdf_predicate: str,
+    ):
+        """
+        Attach a property value to a subject in the RDF graph.
+        A list is written as one literal per element. A string that starts
+        with "[" and ends with "]" is first converted to a list and then
+        handled the same way. Any other value is written as a single literal.
+        Args:
+            graph (RDFLib.Graph): The RDF graph to add the triples to.
+            rdf_subject (str): The subject of the RDF triple.
+            rdf_object (str): The object (property value) of the RDF triple.
+            rdf_predicate (str): The predicate (property name) of the RDF triple.
+        Returns:
+            None
+        """
+        looks_like_list = (
+            isinstance(rdf_object, str)
+            and rdf_object.startswith("[")
+            and rdf_object.endswith("]")
+        )
+        if looks_like_list:
+            # parse the textual list and process it as a real list
+            self.add_property_to_graph(
+                graph,
+                rdf_subject,
+                self.transform_string_to_list(rdf_object),
+                rdf_predicate,
+            )
+            return
+        # a scalar is treated as a one-element sequence of literals
+        values = rdf_object if isinstance(rdf_object, list) else [rdf_object]
+        for item in values:
+            triple = (
+                self.subject_to_uri(rdf_subject),
+                self.property_to_uri(rdf_predicate),
+                Literal(item),
+            )
+            graph.add(triple)
+    def transform_string_to_list(self, string_list: str) -> list:
+        """
+        Transform the string representation of a list into a list of strings.
+        All square brackets and single quotes are removed and the remainder is
+        split on ", ". The representation of an empty list (e.g. "[]") yields
+        an empty list rather than a list holding one empty string.
+        Args:
+            string_list (str): The string representation of the list.
+        Returns:
+            list: The list representation of the input string.
+        """
+        # drop "[", "]" and "'" in a single pass
+        cleaned = string_list.translate(str.maketrans("", "", "[]'"))
+        # str.split would turn an empty remainder into [""], which ends up as
+        # a spurious empty literal in the graph
+        if not cleaned.strip():
+            return []
+        return cleaned.split(", ")
+    def _write_single_node_list_to_file(
+        self,
+        node_list: list,
+        label: str,
+        prop_dict: dict,
+        labels: str,
+    ):
+        """
+        Write one list of BioCypherNodes to a single RDF file in the
+        configured format.
+        Each node is typed with a class of the biocypher namespace named after
+        its PascalCase label, and its properties are attached to the node.
+        Args:
+            node_list (list): A list of BioCypherNodes to be written.
+            label (str): The label (type) of the nodes; determines the file name.
+            prop_dict (dict): A dictionary of properties and their types for the
+                node class. Not used by this writer.
+            labels (str): Not used by this writer.
+        Returns:
+            bool: True if the writing is successful, False otherwise.
+        """
+        if not all(isinstance(n, BioCypherNode) for n in node_list):
+            logger.error("Nodes must be passed as type BioCypherNode.")
+            return False
+        # translate label to PascalCase
+        label_pascal = self.translator.name_sentence_to_pascal(label)
+        # create file name
+        file_name = os.path.join(
+            self._outdir, f"{label_pascal}.{self.extension}"
+        )
+        # write data in graph
+        graph = Graph()
+        self._init_namespaces(graph)
+        # _init_namespaces always registers the "biocypher" namespace
+        biocypher_ns = self.rdf_namespaces["biocypher"]
+        for n in node_list:
+            rdf_subject = n.get_id()
+            rdf_object = n.get_label()
+            properties = n.get_properties()
+            class_uri = biocypher_ns[
+                self.translator.name_sentence_to_pascal(rdf_object)
+            ]
+            graph.add((class_uri, RDF.type, RDFS.Class))
+            graph.add((self.subject_to_uri(rdf_subject), RDF.type, class_uri))
+            for key, value in properties.items():
+                # only truthy values are written (None, empty strings and
+                # empty lists are skipped, and so are 0 and False)
+                if value:
+                    self.add_property_to_graph(graph, rdf_subject, value, key)
+        graph.serialize(destination=file_name, format=self.rdf_format)
+        # report the real file name: files are written with self.extension,
+        # which differs from self.rdf_format for turtle ('.ttl')
+        logger.info(
+            f"Writing {len(node_list)} entries to {label_pascal}.{self.extension}",
+        )
+        return True
+    def write_nodes(
+        self, nodes, batch_size: int = int(1e6), force: bool = False
+    ) -> bool:
+        """
+        Entry point for writing nodes as RDF. Validates the configured RDF
+        format and then hands the nodes to _write_node_data().
+        Args:
+            nodes (list or generator): A list or generator of nodes in BioCypherNode format.
+            batch_size (int): The number of nodes to write in each batch.
+            force (bool): Passed through unchanged to _write_node_data().
+        Returns:
+            bool: True if the writing is successful, False otherwise.
+        """
+        # refuse to write anything when the output format is not supported
+        if not self._is_rdf_format_supported(self.rdf_format):
+            logger.error("Error while writing node data, wrong RDF format")
+            return False
+        # delegate the actual writing
+        if not self._write_node_data(nodes, batch_size, force):
+            logger.error("Error while writing node data.")
+            return False
+        return True
+    def write_edges(
+        self,
+        edges: Union[list, GeneratorType],
+        batch_size: int = int(1e6),
+    ) -> bool:
+        """
+        Entry point for writing edges as RDF. Validates the configured RDF
+        format and then hands the edges to _write_edge_data().
+        Args:
+            edges (BioCypherEdge): a list or generator of edges in
+                :py:class:`BioCypherEdge` format
+            batch_size (int): The number of edges to write in each batch.
+        Returns:
+            bool: The return value. True for success, False otherwise.
+        """
+        # refuse to write anything when the output format is not supported
+        if not self._is_rdf_format_supported(self.rdf_format):
+            logger.error("Error while writing edge data, wrong RDF format")
+            return False
+        # delegate the actual writing
+        if not self._write_edge_data(edges, batch_size=batch_size):
+            logger.error("Error while writing edge data.")
+            return False
+        return True
+    def _construct_import_call(self) -> str:
+        """
+        Function to write the import call.
+        This function is not applicable for RDF, so no call is constructed.
+        Returns:
+            str: Always the empty string.
+        """
+        return ""
+    def _write_array_string(self, string_list):
+        """
+        Placeholder for the method that produces the string representation
+        of an array for a .csv based import.
+        This function is not applicable for RDF; the argument is ignored.
+        Args:
+            string_list (list): list of ontology strings (unused)
+        Returns:
+            bool: Always True.
+        """
+        return True
+    def _write_node_headers(self):
+        """
+        Placeholder for the method that writes header files for graph
+        entities represented as nodes in the `schema_config.yaml`.
+        This function is not applicable for RDF; nothing is written.
+        Returns:
+            bool: Always True.
+        """
+        return True
+    def _write_edge_headers(self):
+        """
+        Placeholder for the method that writes header files for graph
+        entities represented as edges in the `schema_config.yaml`.
+        This function is not applicable for RDF; nothing is written.
+        Returns:
+            bool: Always True.
+        """
+        return True
+    def subject_to_uri(self, subject: str) -> str:
+        """
+        Turn a subject identifier into a URI using the registered namespaces.
+        An identifier of the form ``prefix:id`` whose prefix is a registered
+        namespace is resolved in that namespace. Everything else (unknown
+        prefix, no colon, more than one colon) is resolved as a whole in the
+        biocypher namespace.
+        Args:
+            subject (str): The subject to be converted to a URI.
+        Returns:
+            str: The corresponding URI for the subject.
+        """
+        try:
+            prefix, local_id = subject.split(":")
+            if prefix in self.rdf_namespaces:
+                return self.rdf_namespaces[prefix][local_id]
+        except ValueError:
+            # not exactly one ":" in the subject; use the fallback below
+            pass
+        return self.rdf_namespaces["biocypher"][subject]
+    def property_to_uri(self, property_name: str) -> str:
+        """
+        Converts a property name to its corresponding URI.
+        The name is looked up in several namespaces, in this order: the rdflib
+        core namespaces (owl, rdf, rdfs, xsd and xml), then SKOS, DC and
+        DCTERMS, then the remaining namespaces shipped with rdflib. The first
+        namespace containing the name wins. The spelling "licence" is retried
+        as "license". If nothing matches, the URI is built in the biocypher
+        namespace.
+        Args:
+            property_name (str): The property name to be converted to a URI.
+        Returns:
+            str: The corresponding URI for the input property name.
+        """
+        # one ordered sequence instead of three identical loops
+        candidate_namespaces = (
+            *_NAMESPACE_PREFIXES_CORE.values(),
+            SKOS,
+            DC,
+            DCTERMS,
+            *_NAMESPACE_PREFIXES_RDFLIB.values(),
+        )
+        for namespace in candidate_namespaces:
+            if property_name in namespace:
+                return namespace[property_name]
+        # British spelling: retry the lookup with the American spelling.
+        if property_name == "licence":
+            return self.property_to_uri("license")
+        # TODO: add an option to search trough manually implemented namespaces
+        # Not found in any known namespace: fall back to the biocypher namespace.
+        # TODO: give a warning and try to prevent this option altogether
+        return self.rdf_namespaces["biocypher"][property_name]
+    def _init_namespaces(self, graph: Graph):
+        """
+        Prepare the namespaces used to convert nodes to URIs and bind them to
+        the given graph.
+        The biocypher standard namespace is merged into `rdf_namespaces` (it
+        becomes the only entry when `rdf_namespaces` is empty, and it replaces
+        an existing "biocypher" entry otherwise). Every entry is then turned
+        into an rdflib Namespace, stored back on the instance and bound to the
+        graph under its prefix.
+        Args:
+            graph (RDFLib.Graph): The RDF graph to bind the namespaces to.
+        Returns:
+            None
+        """
+        biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}
+        configured = self.rdf_namespaces
+        # the standard entry comes last so that it wins on a key clash
+        merged = configured | biocypher_standard if configured else biocypher_standard
+        self.rdf_namespaces = merged
+        # iterate over a snapshot of the prefixes while replacing the values
+        for prefix in list(merged):
+            namespace = Namespace(merged[prefix])
+            merged[prefix] = namespace
+            graph.bind(prefix, namespace)

{biocypher-0.5.40.dist-info → biocypher-0.5.42.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: biocypher
-Version: 0.5.40
+Version: 0.5.42
 Summary: A unifying framework for biomedical research knowledge graphs
 Home-page: https://github.com/biocypher/biocypher
 License: MIT

{biocypher-0.5.40.dist-info → biocypher-0.5.42.dist-info}/RECORD RENAMED Viewed

@@ -1,32 +1,33 @@
 biocypher/__init__.py,sha256=ejNY53vH_pE3ZbIN8G_ZBYxxPG9aERovRLD0XhDvt4k,942
 biocypher/_config/__init__.py,sha256=fFHRFYxE2MtDAQWL6upe--MJ1vw3Z8CwIPhF2gW8cRU,3698
-biocypher/_config/biocypher_config.yaml,sha256=TEvIOgRy9WMvsb2CrV1ywuKLZWbedYubCC2bpdBIalU,2713
+biocypher/_config/biocypher_config.yaml,sha256=VE_UH6POExAsuPpqWsahsT8-9k5jglMkuBfuszH1tiU,2868
 biocypher/_config/test_config.yaml,sha256=Np8jeS5_EP6HHOvMKb7B_Tkyqd5YaYlYz_DVsXypt-A,119
 biocypher/_config/test_schema_config.yaml,sha256=D1600WgEj3iTXrumVU9LIivJHJO36iaxfkOgyam9zVU,3129
 biocypher/_config/test_schema_config_disconnected.yaml,sha256=Qm8FLxEn2spHcyj_5F859KjcDvKSxNhxDvi4b4LLkvQ,68
 biocypher/_config/test_schema_config_extended.yaml,sha256=wn3A76142hhjnImhMF6RODbCFESTJ2TtPvcFdIFsAT0,3309
-biocypher/_connect.py,sha256=0oSyO6CEIlKL8rHo-HHE7y0FzGfSi4vnEXSDy1TnIUE,12456
+biocypher/_connect.py,sha256=7hk3J03hzZOPE48ISaoB6IgRun8GaUmDtIRnnD7vKiU,13453
 biocypher/_core.py,sha256=5rZKYie_vSjTYduH8oH-GxLMZuNqLAe3ZYAQ5nUp8Nc,22578
 biocypher/_create.py,sha256=vpUchUdEpWupZi1LgFLxAWMtqoBwnWbP7PwEDUCBS4A,10202
 biocypher/_deduplicate.py,sha256=BBvfpXzu6L5YDY5FdtXxnf8YlsbJpbCE8RdUoKsm0n0,4949
 biocypher/_get.py,sha256=3Kpky3blfNf1JwxKWLsZxTU2aTP_C4sUe8OpiyYj63I,10810
 biocypher/_logger.py,sha256=NGXe3hZA79WSujfOgpcxHBf8N2QAfrmvM1LFDpsGK2U,3185
 biocypher/_mapping.py,sha256=ERSNH2Bg19145KytxbFE4BInPaiP-LWW7osOBot29Eo,9304
-biocypher/_metadata.py,sha256=AffJrg0s1b35C6caoCs0DlcXtXlEzUXDDylHdUFa_rI,1658
-biocypher/_misc.py,sha256=g5B-PO_XJlYEJC7kEVRdCXeB2NW0ZSVr_5KqTEk2ldk,5877
-biocypher/_ontology.py,sha256=3Wu1ZZYmtLpWfopi-aY9BA8qZ-ltPMXN4Ok_diK1YdA,28410
+biocypher/_metadata.py,sha256=hTN9aStXCS7IzABrE7BmT5GZ-8YUt8gP9PG_P5Ix1Vw,1658
+biocypher/_misc.py,sha256=18EG2Bei3RnyWXDWc3qtZaT3gybvXI8opi0HvSaF7Lg,6066
+biocypher/_ontology.py,sha256=G5k-bnzvPZUqhLPxtoOPFa4OSQ4JpufgozVakLTjwLg,31789
 biocypher/_pandas.py,sha256=GVCFM68J7yBjh40MpkNVgD8qT1RFMrrIjMOtD3iKsf4,3040
 biocypher/_translate.py,sha256=JafvhtVaFSpruRfYh9BzjVbvDF1Mhg7LLKMDZHWkRjg,16496
 biocypher/write/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-biocypher/write/_batch_writer.py,sha256=Ta2DNjSnJcVtFDMOGTtH5nnbKwyqSGf7xXGpYzi1bDM,36826
-biocypher/write/_write.py,sha256=u7m2XaliEgOrKdLddTn-2z6kqICmUmuTDQIcRzIFNY4,3087
+biocypher/write/_batch_writer.py,sha256=x_fe2yndASNAvO-GaeVhjUVxnSNdDZ6-FB1mj572Jvw,37129
+biocypher/write/_write.py,sha256=4UYw-y3CevwcdVBq6ou1rTJXuXrcde7oraWeO8YXcK4,3330
 biocypher/write/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 biocypher/write/graph/_arangodb.py,sha256=du5pivCR7xKs8VyxeegxYsSBIcsXGrfSbM_AffFapwg,8071
-biocypher/write/graph/_neo4j.py,sha256=dcBQ-OzvMG4gSTlU98k2LeqxRswpirM2XMEX9UEeBjo,10483
+biocypher/write/graph/_neo4j.py,sha256=qSj1PryD4UmveS7ACs1R3eo2pegi53pVI7d7P0ihOKI,11930
+biocypher/write/graph/_rdf.py,sha256=9_u9usWhU7EKKDd1PgXyV99opS5IAeef2lhDNEN6fOw,17973
 biocypher/write/relational/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 biocypher/write/relational/_postgresql.py,sha256=NdI-ULP8valsqlkObOg50od-3-amVj5RzGnZ_7NW2ww,11945
 biocypher/write/relational/_sqlite.py,sha256=KLQpxQXF1B8qqTtKUFfjWdwHjd1Fhn9syK931Z0dsq0,2066
-biocypher-0.5.40.dist-info/LICENSE,sha256=SjUaQkq671iQUZOxEUpC4jvJxXOlfSiHTTueyz9kXJM,1065
-biocypher-0.5.40.dist-info/METADATA,sha256=ur2IINfCgegTsj3B_ZKdV6BVAomyNX-NF3nDmtmkDjw,10642
-biocypher-0.5.40.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
-biocypher-0.5.40.dist-info/RECORD,,
+biocypher-0.5.42.dist-info/LICENSE,sha256=SjUaQkq671iQUZOxEUpC4jvJxXOlfSiHTTueyz9kXJM,1065
+biocypher-0.5.42.dist-info/METADATA,sha256=3lT_thshGguJMnCeer-4JaJQfrsuKeWAd6oaYWhXPyk,10642
+biocypher-0.5.42.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
+biocypher-0.5.42.dist-info/RECORD,,

{biocypher-0.5.40.dist-info → biocypher-0.5.42.dist-info}/LICENSE RENAMED Viewed

File without changes

{biocypher-0.5.40.dist-info → biocypher-0.5.42.dist-info}/WHEEL RENAMED Viewed

File without changes

biocypher 0.5.40__py3-none-any.whl → 0.5.42__py3-none-any.whl

Potentially problematic release.

biocypher 0.5.40py3-none-any.whl → 0.5.42py3-none-any.whl