biocypher 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biocypher might be problematic; consult the registry's advisory page for more details.

biocypher/_translate.py CHANGED
@@ -1,31 +1,21 @@
1
- #!/usr/bin/env python
2
-
3
- #
4
- # Copyright 2021, Heidelberg University Clinic
5
- #
6
- # File author(s): Sebastian Lobentanzer
7
- # ...
8
- #
9
- # Distributed under MIT licence, see the file `LICENSE`.
10
- #
11
1
  """
12
2
  BioCypher 'translation' module. Responsible for translating between the raw
13
3
  input data and the BioCypherNode and BioCypherEdge objects.
14
4
  """
15
- from ._logger import logger
16
-
17
- logger.debug(f"Loading module {__name__}.")
18
5
 
19
- from typing import Any, Union, Optional
20
- from collections.abc import Iterable, Generator
6
+ from collections.abc import Generator, Iterable
7
+ from typing import Any, Optional, Union
21
8
 
22
9
  from more_itertools import peekable
23
10
 
24
11
  from . import _misc
25
12
  from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
13
+ from ._logger import logger
26
14
  from ._ontology import Ontology
27
15
 
28
- __all__ = ["BiolinkAdapter", "Translator"]
16
+ logger.debug(f"Loading module {__name__}.")
17
+
18
+ __all__ = ["Translator"]
29
19
 
30
20
 
31
21
  class Translator:
@@ -67,6 +57,20 @@ class Translator:
67
57
 
68
58
  self._update_ontology_types()
69
59
 
60
def translate_entities(self, entities):
    """Translate a mixed stream of raw tuples or BioCypher objects.

    Peeks at the first element to decide how to handle the stream:
    already-translated BioCypher objects are passed through unchanged;
    otherwise tuples with fewer than four fields are treated as node
    tuples and longer ones as edge tuples (edge tuples carry at least
    four fields — see ``translate_edges``).

    Args:
        entities: Iterable of raw input tuples or BioCypher entities.

    Returns:
        Iterable of translated entities (a generator unless the input
        was already translated).
    """
    entities = peekable(entities)
    # Peek once instead of three times; NOTE: peek() raises on an
    # empty iterable, unchanged from prior behavior.
    first = entities.peek()
    if isinstance(first, (BioCypherNode, BioCypherEdge, BioCypherRelAsNode)):
        # Already translated upstream; pass through untouched.
        return entities
    if len(first) < 4:
        return self.translate_nodes(entities)
    return self.translate_edges(entities)
73
+
70
74
  def translate_nodes(
71
75
  self,
72
76
  node_tuples: Iterable,
@@ -131,8 +135,7 @@ class Translator:
131
135
 
132
136
  return (
133
137
  self.ontology.mapping.extended_schema[_bl_type]["preferred_id"]
134
- if "preferred_id"
135
- in self.ontology.mapping.extended_schema.get(_bl_type, {})
138
+ if "preferred_id" in self.ontology.mapping.extended_schema.get(_bl_type, {})
136
139
  else "id"
137
140
  )
138
141
 
@@ -141,9 +144,7 @@ class Translator:
141
144
  Filters properties for those specified in schema_config if any.
142
145
  """
143
146
 
144
- filter_props = self.ontology.mapping.extended_schema[bl_type].get(
145
- "properties", {}
146
- )
147
+ filter_props = self.ontology.mapping.extended_schema[bl_type].get("properties", {})
147
148
 
148
149
  # strict mode: add required properties (only if there is a whitelist)
149
150
  if self.strict_mode and filter_props:
@@ -151,36 +152,24 @@ class Translator:
151
152
  {"source": "str", "licence": "str", "version": "str"},
152
153
  )
153
154
 
154
- exclude_props = self.ontology.mapping.extended_schema[bl_type].get(
155
- "exclude_properties", []
156
- )
155
+ exclude_props = self.ontology.mapping.extended_schema[bl_type].get("exclude_properties", [])
157
156
 
158
157
  if isinstance(exclude_props, str):
159
158
  exclude_props = [exclude_props]
160
159
 
161
160
  if filter_props and exclude_props:
162
- filtered_props = {
163
- k: v
164
- for k, v in props.items()
165
- if (k in filter_props.keys() and k not in exclude_props)
166
- }
161
+ filtered_props = {k: v for k, v in props.items() if (k in filter_props.keys() and k not in exclude_props)}
167
162
 
168
163
  elif filter_props:
169
- filtered_props = {
170
- k: v for k, v in props.items() if k in filter_props.keys()
171
- }
164
+ filtered_props = {k: v for k, v in props.items() if k in filter_props.keys()}
172
165
 
173
166
  elif exclude_props:
174
- filtered_props = {
175
- k: v for k, v in props.items() if k not in exclude_props
176
- }
167
+ filtered_props = {k: v for k, v in props.items() if k not in exclude_props}
177
168
 
178
169
  else:
179
170
  return props
180
171
 
181
- missing_props = [
182
- k for k in filter_props.keys() if k not in filtered_props.keys()
183
- ]
172
+ missing_props = [k for k in filter_props.keys() if k not in filtered_props.keys()]
184
173
  # add missing properties with default values
185
174
  for k in missing_props:
186
175
  filtered_props[k] = None
@@ -213,20 +202,17 @@ class Translator:
213
202
  # TODO remove for performance reasons once safe
214
203
  edge_tuples = peekable(edge_tuples)
215
204
  if len(edge_tuples.peek()) == 4:
216
- edge_tuples = [
217
- (None, src, tar, typ, props)
218
- for src, tar, typ, props in edge_tuples
219
- ]
205
+ edge_tuples = [(None, src, tar, typ, props) for src, tar, typ, props in edge_tuples]
220
206
 
221
207
  for _id, _src, _tar, _type, _props in edge_tuples:
222
208
  # check for strict mode requirements
223
209
  if self.strict_mode:
224
- if not "source" in _props:
210
+ if "source" not in _props:
225
211
  raise ValueError(
226
212
  f"Edge {_id if _id else (_src, _tar)} does not have a `source` property.",
227
213
  " This is required in strict mode.",
228
214
  )
229
- if not "licence" in _props:
215
+ if "licence" not in _props:
230
216
  raise ValueError(
231
217
  f"Edge {_id if _id else (_src, _tar)} does not have a `licence` property.",
232
218
  " This is required in strict mode.",
@@ -240,9 +226,7 @@ class Translator:
240
226
  # filter properties for those specified in schema_config if any
241
227
  _filtered_props = self._filter_props(bl_type, _props)
242
228
 
243
- rep = self.ontology.mapping.extended_schema[bl_type][
244
- "represented_as"
245
- ]
229
+ rep = self.ontology.mapping.extended_schema[bl_type]["represented_as"]
246
230
 
247
231
  if rep == "node":
248
232
  if _id:
@@ -251,13 +235,7 @@ class Translator:
251
235
 
252
236
  else:
253
237
  # source target concat
254
- node_id = (
255
- str(_src)
256
- + "_"
257
- + str(_tar)
258
- + "_"
259
- + "_".join(str(v) for v in _filtered_props.values())
260
- )
238
+ node_id = str(_src) + "_" + str(_tar) + "_" + "_".join(str(v) for v in _filtered_props.values())
261
239
 
262
240
  n = BioCypherNode(
263
241
  node_id=node_id,
@@ -268,7 +246,7 @@ class Translator:
268
246
  # directionality check TODO generalise to account for
269
247
  # different descriptions of directionality or find a
270
248
  # more consistent solution for indicating directionality
271
- if _filtered_props.get("directed") == True:
249
+ if _filtered_props.get("directed") == True: # noqa: E712 (seems to not work without '== True')
272
250
  l1 = "IS_SOURCE_OF"
273
251
  l2 = "IS_TARGET_OF"
274
252
 
@@ -298,9 +276,7 @@ class Translator:
298
276
  yield BioCypherRelAsNode(n, e_s, e_t)
299
277
 
300
278
  else:
301
- edge_label = self.ontology.mapping.extended_schema[
302
- bl_type
303
- ].get("label_as_edge")
279
+ edge_label = self.ontology.mapping.extended_schema[bl_type].get("label_as_edge")
304
280
 
305
281
  if edge_label is None:
306
282
  edge_label = bl_type
@@ -0,0 +1,40 @@
1
+ """BioCypher 'connect' module.
2
+
3
+ Handles the connecting and writing a Knowledge Graph to a database.
4
+ """
5
+
6
+ from biocypher._config import config as _config
7
+ from biocypher._logger import logger
8
+ from biocypher._translate import Translator
9
+ from biocypher.output.connect._neo4j_driver import _Neo4jDriver
10
+
11
+ logger.debug(f"Loading module {__name__}.")
12
+
13
+ __all__ = ["get_connector"]
14
+
15
+
16
+ def get_connector(
17
+ dbms: str,
18
+ translator: Translator,
19
+ ):
20
+ """
21
+ Function to return the connector class.
22
+
23
+ Returns:
24
+ class: the connector class
25
+ """
26
+
27
+ dbms_config = _config(dbms)
28
+
29
+ if dbms == "neo4j":
30
+ return _Neo4jDriver(
31
+ database_name=dbms_config["database_name"],
32
+ wipe=dbms_config["wipe"],
33
+ uri=dbms_config["uri"],
34
+ user=dbms_config["user"],
35
+ password=dbms_config["password"],
36
+ multi_db=dbms_config["multi_db"],
37
+ translator=translator,
38
+ )
39
+ else:
40
+ raise NotImplementedError(f"Online mode is not supported for the DBMS {dbms}.")
@@ -1,32 +1,19 @@
1
- #!/usr/bin/env python
2
-
3
- #
4
- # Copyright 2021, Heidelberg University Clinic
5
- #
6
- # File author(s): Sebastian Lobentanzer
7
- # ...
8
- #
9
- # Distributed under MIT licence, see the file `LICENSE`.
10
- #
11
1
  """
12
2
  BioCypher 'online' mode. Handles connection and manipulation of a running DBMS.
13
3
  """
14
- import subprocess
15
4
 
16
- from biocypher._logger import logger
17
-
18
- logger.debug(f"Loading module {__name__}.")
5
+ import itertools
19
6
 
20
7
  from collections.abc import Iterable
21
- import itertools
22
8
 
23
9
  import neo4j_utils
24
10
 
25
11
  from biocypher import _misc
26
- from biocypher._config import config as _config
27
12
  from biocypher._create import BioCypherEdge, BioCypherNode
13
+ from biocypher._logger import logger
28
14
  from biocypher._translate import Translator
29
15
 
16
+ logger.debug(f"Loading module {__name__}.")
30
17
  __all__ = ["_Neo4jDriver"]
31
18
 
32
19
 
@@ -96,7 +83,7 @@ class _Neo4jDriver:
96
83
 
97
84
  # find current version node
98
85
  db_version = self._driver.query(
99
- "MATCH (v:BioCypher) " "WHERE NOT (v)-[:PRECEDES]->() " "RETURN v",
86
+ "MATCH (v:BioCypher) WHERE NOT (v)-[:PRECEDES]->() RETURN v",
100
87
  )
101
88
  # add version node
102
89
  self.add_biocypher_nodes(self.translator.ontology)
@@ -143,18 +130,10 @@ class _Neo4jDriver:
143
130
  label = _misc.sentencecase_to_pascalcase(leaf[0], sep=r"\s\.")
144
131
  if leaf[1]["represented_as"] == "node":
145
132
  if major_neo4j_version >= 5:
146
- s = (
147
- f"CREATE CONSTRAINT `{label}_id` "
148
- f"IF NOT EXISTS FOR (n:`{label}`) "
149
- "REQUIRE n.id IS UNIQUE"
150
- )
133
+ s = f"CREATE CONSTRAINT `{label}_id` " f"IF NOT EXISTS FOR (n:`{label}`) " "REQUIRE n.id IS UNIQUE"
151
134
  self._driver.query(s)
152
135
  else:
153
- s = (
154
- f"CREATE CONSTRAINT `{label}_id` "
155
- f"IF NOT EXISTS ON (n:`{label}`) "
156
- "ASSERT n.id IS UNIQUE"
157
- )
136
+ s = f"CREATE CONSTRAINT `{label}_id` " f"IF NOT EXISTS ON (n:`{label}`) " "ASSERT n.id IS UNIQUE"
158
137
  self._driver.query(s)
159
138
 
160
139
  def _get_neo4j_version(self):
@@ -170,9 +149,7 @@ class _Neo4jDriver:
170
149
  )[0][0]["version"]
171
150
  return neo4j_version
172
151
  except Exception as e:
173
- logger.warning(
174
- f"Error detecting Neo4j version: {e} use default version 4.0.0."
175
- )
152
+ logger.warning(f"Error detecting Neo4j version: {e} use default version 4.0.0.")
176
153
  return "4.0.0"
177
154
 
178
155
  def add_nodes(self, id_type_tuples: Iterable[tuple]) -> tuple:
@@ -364,11 +341,7 @@ class _Neo4jDriver:
364
341
  # merging only on the ids of the entities, passing the
365
342
  # properties on match and on create;
366
343
  # TODO add node labels?
367
- node_query = (
368
- "UNWIND $rels AS r "
369
- "MERGE (src {id: r.source_id}) "
370
- "MERGE (tar {id: r.target_id}) "
371
- )
344
+ node_query = "UNWIND $rels AS r " "MERGE (src {id: r.source_id}) " "MERGE (tar {id: r.target_id}) "
372
345
 
373
346
  self._driver.query(node_query, parameters={"rels": rels})
374
347
 
@@ -386,37 +359,8 @@ class _Neo4jDriver:
386
359
 
387
360
  method = "explain" if explain else "profile" if profile else "query"
388
361
 
389
- result = getattr(self._driver, method)(
390
- edge_query, parameters={"rels": rels}
391
- )
362
+ result = getattr(self._driver, method)(edge_query, parameters={"rels": rels})
392
363
 
393
364
  logger.info("Finished merging edges.")
394
365
 
395
366
  return result
396
-
397
-
398
- def get_driver(
399
- dbms: str,
400
- translator: "Translator",
401
- ):
402
- """
403
- Function to return the writer class.
404
-
405
- Returns:
406
- class: the writer class
407
- """
408
-
409
- dbms_config = _config(dbms)
410
-
411
- if dbms == "neo4j":
412
- return _Neo4jDriver(
413
- database_name=dbms_config["database_name"],
414
- wipe=dbms_config["wipe"],
415
- uri=dbms_config["uri"],
416
- user=dbms_config["user"],
417
- password=dbms_config["password"],
418
- multi_db=dbms_config["multi_db"],
419
- translator=translator,
420
- )
421
-
422
- return None
@@ -0,0 +1,34 @@
1
+ """
2
+ BioCypher 'in_memory' module. Handles the in-memory Knowledge Graph instance.
3
+ """
4
+
5
+ from biocypher._deduplicate import Deduplicator
6
+ from biocypher._logger import logger
7
+ from biocypher.output.in_memory._networkx import NetworkxKG
8
+ from biocypher.output.in_memory._pandas import PandasKG
9
+
10
+ logger.debug(f"Loading module {__name__}.")
11
+
12
+ __all__ = ["get_in_memory_kg"]
13
+
14
+ IN_MEMORY_DBMS = ["csv", "pandas", "tabular", "networkx"]
15
+
16
+
17
+ def get_in_memory_kg(
18
+ dbms: str,
19
+ deduplicator: Deduplicator,
20
+ ):
21
+ """
22
+ Function to return the in-memory KG class.
23
+
24
+ Returns:
25
+ class: the in-memory KG class
26
+ """
27
+ if dbms in ["csv", "pandas", "tabular"]:
28
+ return PandasKG(deduplicator)
29
+ elif dbms == "networkx":
30
+ return NetworkxKG(deduplicator)
31
+ else:
32
+ raise NotImplementedError(
33
+ f"Getting the in memory BioCypher KG is not supported for the DBMS {dbms}. Supported: {IN_MEMORY_DBMS}."
34
+ )
@@ -0,0 +1,40 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+
4
+ class _InMemoryKG(ABC):
5
+ """Abstract class for handling the in-memory Knowledge Graph instance.
6
+ Specifics of the different in-memory implementations (e.g. csv, networkx)
7
+ are implemented in the child classes. Any concrete in-memory implementation
8
+ needs to implement at least:
9
+ - add_nodes
10
+ - add_edges
11
+ - get_kg
12
+
13
+ Raises:
14
+ NotImplementedError: InMemoryKG implementation must override 'add_nodes'
15
+ NotImplementedError: InMemoryKG implementation must override 'add_edges'
16
+ NotImplementedError: InMemoryKG implementation must override 'get_kg'
17
+ """
18
+
19
+ @abstractmethod
20
+ def add_nodes(self, nodes):
21
+ """Add nodes to the in-memory knowledge graph.
22
+
23
+ Args:
24
+ nodes (Iterable[BioCypherNode]): Iterable of BioCypherNode objects.
25
+ """
26
+ raise NotImplementedError("InMemoryKG implementation must override 'add_nodes'")
27
+
28
+ @abstractmethod
29
+ def add_edges(self, edges):
30
+ """Add edges to the in-memory knowledge graph.
31
+
32
+ Args:
33
+ edges (Iterable[BioCypherEdge]): Iterable of BioCypherEdge objects.
34
+ """
35
+ raise NotImplementedError("InMemoryKG implementation must override 'add_edges'")
36
+
37
+ @abstractmethod
38
+ def get_kg(self):
39
+ """Return the in-memory knowledge graph."""
40
+ raise NotImplementedError("InMemoryKG implementation must override 'get_kg'")
@@ -0,0 +1,44 @@
1
import networkx as nx

from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
from biocypher.output.in_memory._pandas import PandasKG


class NetworkxKG(_InMemoryKG):
    """In-memory KG backed by a networkx DiGraph.

    Nodes and edges are buffered in a delegated PandasKG; the graph is
    materialized lazily on the first call to ``get_kg``.
    """

    def __init__(self, deduplicator):
        super().__init__()  # keeping in spite of ABC not having __init__
        self.deduplicator = deduplicator
        # Delegate tabular accumulation to the pandas back-end.
        self._pd = PandasKG(
            deduplicator=self.deduplicator,
        )
        self.KG = None

    def get_kg(self):
        """Build the graph on first access and cache it."""
        if not self.KG:
            self.KG = self._create_networkx_kg()
        return self.KG

    def add_nodes(self, nodes):
        """Buffer nodes in the pandas back-end; graph is built in get_kg()."""
        self._pd.add_nodes(nodes)
        return True

    def add_edges(self, edges):
        """Buffer edges in the pandas back-end; graph is built in get_kg()."""
        self._pd.add_edges(edges)
        return True

    def _create_networkx_kg(self) -> nx.DiGraph:
        """Convert the buffered dataframes into a directed graph.

        A frame with a "node_id" column contributes nodes; a frame with
        both "source_id" and "target_id" columns contributes edges.
        Nodes are added first, then edges, mirroring insertion order.
        """
        self.KG = nx.DiGraph()
        frames = self._pd.dfs.values()
        for frame in frames:
            if frame.columns.str.contains("node_id").any():
                node_attrs = frame.set_index("node_id").to_dict(orient="index")
                self.KG.add_nodes_from(node_attrs.items())
        for frame in frames:
            cols = frame.columns
            if cols.str.contains("source_id").any() and cols.str.contains("target_id").any():
                edge_attrs = frame.set_index(["source_id", "target_id"]).to_dict(orient="index")
                self.KG.add_edges_from((src, tar, attrs) for (src, tar), attrs in edge_attrs.items())
        return self.KG
@@ -1,15 +1,25 @@
1
1
  import pandas as pd
2
2
 
3
3
  from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
4
+ from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
4
5
 
5
6
 
6
- class Pandas:
7
- def __init__(self, translator, deduplicator):
8
- self.translator = translator
7
+ class PandasKG(_InMemoryKG):
8
+ def __init__(self, deduplicator):
9
+ super().__init__() # keeping in spite of ABC not having __init__
9
10
  self.deduplicator = deduplicator
10
11
 
11
12
  self.dfs = {}
12
13
 
14
+ def get_kg(self):
15
+ return self.dfs
16
+
17
+ def add_nodes(self, nodes):
18
+ self.add_tables(nodes)
19
+
20
+ def add_edges(self, edges):
21
+ self.add_tables(edges)
22
+
13
23
  def _separate_entity_types(self, entities):
14
24
  """
15
25
  Given mixed iterable of BioCypher objects, separate them into lists by
@@ -23,8 +33,7 @@ class Pandas:
23
33
  and not isinstance(entity, BioCypherRelAsNode)
24
34
  ):
25
35
  raise TypeError(
26
- "Expected a BioCypherNode / BioCypherEdge / "
27
- f"BioCypherRelAsNode, got {type(entity)}."
36
+ "Expected a BioCypherNode / BioCypherEdge / " f"BioCypherRelAsNode, got {type(entity)}."
28
37
  )
29
38
 
30
39
  if isinstance(entity, BioCypherNode):
@@ -43,23 +52,23 @@ class Pandas:
43
52
  target_edge = entity.get_target_edge()
44
53
 
45
54
  _type = node.get_type()
46
- if not _type in lists:
55
+ if _type not in lists:
47
56
  lists[_type] = []
48
57
  lists[_type].append(node)
49
58
 
50
59
  _source_type = source_edge.get_type()
51
- if not _source_type in lists:
60
+ if _source_type not in lists:
52
61
  lists[_source_type] = []
53
62
  lists[_source_type].append(source_edge)
54
63
 
55
64
  _target_type = target_edge.get_type()
56
- if not _target_type in lists:
65
+ if _target_type not in lists:
57
66
  lists[_target_type] = []
58
67
  lists[_target_type].append(target_edge)
59
68
  continue
60
69
 
61
70
  _type = entity.get_type()
62
- if not _type in lists:
71
+ if _type not in lists:
63
72
  lists[_type] = []
64
73
  lists[_type].append(entity)
65
74
 
@@ -76,15 +85,11 @@ class Pandas:
76
85
  self._add_entity_df(_type, _entities)
77
86
 
78
87
  def _add_entity_df(self, _type, _entities):
79
- df = pd.DataFrame(
80
- pd.json_normalize([node.get_dict() for node in _entities])
81
- )
88
+ df = pd.DataFrame(pd.json_normalize([node.get_dict() for node in _entities]))
82
89
  # replace "properties." with "" in column names
83
90
  df.columns = [col.replace("properties.", "") for col in df.columns]
84
91
  if _type not in self.dfs:
85
92
  self.dfs[_type] = df
86
93
  else:
87
- self.dfs[_type] = pd.concat(
88
- [self.dfs[_type], df], ignore_index=True
89
- )
94
+ self.dfs[_type] = pd.concat([self.dfs[_type], df], ignore_index=True)
90
95
  return self.dfs[_type]