PyPI - biocypher - Versions diffs - 0.5.17__py3-none-any.whl → 0.5.20__py3-none-any.whl - Mend

biocypher 0.5.17__py3-none-any.whl → 0.5.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of biocypher might be problematic. Click here for more details.

Files changed (20)

biocypher/__init__.py +10 -11
biocypher/_config/__init__.py +25 -27
biocypher/_config/biocypher_config.yaml +1 -2
biocypher/_connect.py +59 -79
biocypher/_core.py +146 -78
biocypher/_create.py +55 -52
biocypher/_deduplicate.py +81 -36
biocypher/_logger.py +12 -13
biocypher/_mapping.py +69 -83
biocypher/_metadata.py +12 -17
biocypher/_misc.py +17 -28
biocypher/_ontology.py +85 -101
biocypher/_pandas.py +46 -11
biocypher/_translate.py +93 -113
biocypher/_write.py +457 -404
{biocypher-0.5.17.dist-info → biocypher-0.5.20.dist-info}/METADATA +16 -6
biocypher-0.5.20.dist-info/RECORD +23 -0
biocypher-0.5.17.dist-info/RECORD +0 -23
{biocypher-0.5.17.dist-info → biocypher-0.5.20.dist-info}/LICENSE +0 -0
{biocypher-0.5.17.dist-info → biocypher-0.5.20.dist-info}/WHEEL +0 -0

biocypher/_core.py CHANGED Viewed

@@ -12,34 +12,38 @@
 BioCypher core module. Interfaces with the user and distributes tasks to
 submodules.
 """
-from typing import Dict, List, Optional
+from typing import Optional
+import os
 from more_itertools import peekable
+import yaml
 import pandas as pd
 from ._logger import logger
-logger.debug(f'Loading module {__name__}.')
+logger.debug(f"Loading module {__name__}.")
 from ._write import get_writer
-from ._pandas import Pandas
 from ._config import config as _config
 from ._config import update_from_file as _file_update
-from ._create import BioCypherEdge, BioCypherNode
+from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
+from ._pandas import Pandas
 from ._connect import get_driver
 from ._mapping import OntologyMapping
 from ._ontology import Ontology
 from ._translate import Translator
 from ._deduplicate import Deduplicator
-__all__ = ['BioCypher']
+__all__ = ["BioCypher"]
-SUPPORTED_DBMS = ['neo4j', 'postgresql']
+SUPPORTED_DBMS = ["neo4j", "postgresql"]
 REQUIRED_CONFIG = [
-    'dbms',
-    'offline',
-    'strict_mode',
-    'head_ontology',
+    "dbms",
+    "offline",
+    "strict_mode",
+    "head_ontology",
 ]
@@ -75,6 +79,7 @@ class BioCypher:
             provided, the default value 'biocypher-out' will be used.
     """
     def __init__(
         self,
         dbms: str = None,
@@ -88,65 +93,64 @@ class BioCypher:
         # legacy params
         db_name: str = None,
     ):
         # Update configuration if custom path is provided
         if biocypher_config_path:
             _file_update(biocypher_config_path)
         if db_name:
             logger.warning(
-                'The parameter `db_name` is deprecated. Please set the '
-                '`database_name` setting in the `biocypher_config.yaml` file '
-                'instead.'
+                "The parameter `db_name` is deprecated. Please set the "
+                "`database_name` setting in the `biocypher_config.yaml` file "
+                "instead."
             )
-            _config(**{db_name: {'database_name': db_name}})
+            _config(**{db_name: {"database_name": db_name}})
         # Load configuration
-        self.base_config = _config('biocypher')
+        self.base_config = _config("biocypher")
         # Check for required configuration
         for key in REQUIRED_CONFIG:
             if key not in self.base_config:
-                raise ValueError(f'Configuration key {key} is required.')
+                raise ValueError(f"Configuration key {key} is required.")
         # Set configuration - mandatory
-        self._dbms = dbms or self.base_config['dbms']
+        self._dbms = dbms or self.base_config["dbms"]
         if offline is None:
-            self._offline = self.base_config['offline']
+            self._offline = self.base_config["offline"]
         else:
             self._offline = offline
         if strict_mode is None:
-            self._strict_mode = self.base_config['strict_mode']
+            self._strict_mode = self.base_config["strict_mode"]
         else:
             self._strict_mode = strict_mode
         self._schema_config_path = schema_config_path or self.base_config.get(
-            'schema_config_path'
+            "schema_config_path"
         )
         if not self._schema_config_path:
             raise ValueError(
-                'BioCypher requires a schema configuration; please provide a '
-                'path to the schema configuration YAML file via '
-                '`biocypher_config.yaml` or `BioCypher` class parameter.'
+                "BioCypher requires a schema configuration; please provide a "
+                "path to the schema configuration YAML file via "
+                "`biocypher_config.yaml` or `BioCypher` class parameter."
             )
-        self._head_ontology = head_ontology or self.base_config['head_ontology']
+        self._head_ontology = head_ontology or self.base_config["head_ontology"]
         # Set configuration - optional
         self._output_directory = output_directory or self.base_config.get(
-            'output_directory'
+            "output_directory"
         )
         self._tail_ontologies = tail_ontologies or self.base_config.get(
-            'tail_ontologies'
+            "tail_ontologies"
         )
         if self._dbms not in SUPPORTED_DBMS:
             raise ValueError(
-                f'DBMS {self._dbms} not supported. '
-                f'Please select from {SUPPORTED_DBMS}.'
+                f"DBMS {self._dbms} not supported. "
+                f"Please select from {SUPPORTED_DBMS}."
             )
         # Initialize
@@ -156,7 +160,7 @@ class BioCypher:
         self._ontology = None
         self._writer = None
         self._pd = None
     def _get_deduplicator(self) -> Deduplicator:
         """
         Create deduplicator if not exists and return.
@@ -179,19 +183,6 @@ class BioCypher:
         return self._ontology_mapping
-    def _get_translator(self) -> Translator:
-        """
-        Create translator if not exists and return.
-        """
-        if not self._translator:
-            self._translator = Translator(
-                ontology_mapping=self._get_ontology_mapping(),
-                strict_mode=self._strict_mode,
-            )
-        return self._translator
     def _get_ontology(self) -> Ontology:
         """
         Create ontology if not exists and return.
@@ -206,23 +197,34 @@ class BioCypher:
         return self._ontology
+    def _get_translator(self) -> Translator:
+        """
+        Create translator if not exists and return.
+        """
+        if not self._translator:
+            self._translator = Translator(
+                ontology=self._get_ontology(),
+                strict_mode=self._strict_mode,
+            )
+        return self._translator
     def _get_writer(self):
         """
         Create writer if not online. Set as instance variable `self._writer`.
         """
-        # Get worker
         if self._offline:
             self._writer = get_writer(
                 dbms=self._dbms,
                 translator=self._get_translator(),
-                ontology=self._get_ontology(),
                 deduplicator=self._get_deduplicator(),
                 output_directory=self._output_directory,
                 strict_mode=self._strict_mode,
             )
         else:
-            raise NotImplementedError('Cannot get writer in online mode.')
+            raise NotImplementedError("Cannot get writer in online mode.")
     def _get_driver(self):
         """
@@ -233,16 +235,15 @@ class BioCypher:
             self._driver = get_driver(
                 dbms=self._dbms,
                 translator=self._get_translator(),
-                ontology=self._get_ontology(),
                 deduplicator=self._get_deduplicator(),
             )
         else:
-            raise NotImplementedError('Cannot get driver in offline mode.')
+            raise NotImplementedError("Cannot get driver in offline mode.")
     def write_nodes(self, nodes, batch_size: int = int(1e6)) -> bool:
         """
         Write nodes to database. Either takes an iterable of tuples (if given,
-        translates to ``BioCypherNode`` objects) or an iterable of
+        translates to ``BioCypherNode`` objects) or an iterable of
         ``BioCypherNode`` objects.
         Args:
@@ -287,7 +288,7 @@ class BioCypher:
         # write edge files
         return self._writer.write_edges(tedges, batch_size=batch_size)
-    def to_df(self) -> List[pd.DataFrame]:
+    def to_df(self) -> list[pd.DataFrame]:
         """
         Convert entities to a pandas DataFrame for each entity type and return
         a list.
@@ -303,9 +304,8 @@ class BioCypher:
             raise ValueError(
                 "No pandas instance found. Please call `add()` first."
             )
         return self._pd.dfs
     def add(self, entities):
         """
@@ -317,13 +317,16 @@ class BioCypher:
         if not self._pd:
             self._pd = Pandas(
                 translator=self._get_translator(),
-                ontology=self._get_ontology(),
                 deduplicator=self._get_deduplicator(),
             )
         entities = peekable(entities)
-        if isinstance(entities.peek(), BioCypherNode) or isinstance(entities.peek(), BioCypherEdge):
+        if (
+            isinstance(entities.peek(), BioCypherNode)
+            or isinstance(entities.peek(), BioCypherEdge)
+            or isinstance(entities.peek(), BioCypherRelAsNode)
+        ):
             tentities = entities
         elif len(entities.peek()) < 4:
             tentities = self._translator.translate_nodes(entities)
@@ -367,11 +370,11 @@ class BioCypher:
         Merge edges into database. Either takes an iterable of tuples (if given,
         translates to ``BioCypherEdge`` objects) or an iterable of
         ``BioCypherEdge`` objects.
         Args:
-            edges (iterable): An iterable of edges to merge into the database.
+            edges (iterable): An iterable of edges to merge into the database.
-        Returns:
+        Returns:
             bool: True if successful.
         """
@@ -388,7 +391,7 @@ class BioCypher:
     # OVERVIEW AND CONVENIENCE METHODS ###
-    def log_missing_input_labels(self) -> Optional[Dict[str, List[str]]]:
+    def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]:
         """
         Get the set of input labels encountered without an entry in the
@@ -405,19 +408,19 @@ class BioCypher:
         if mt:
             msg = (
-                'Input entities not accounted for due to them not being '
-                'present in the `schema_config.yaml` configuration file '
-                '(this is not necessarily a problem, if you did not intend '
-                'to include them in the database; see the log for details): \n'
+                "Input entities not accounted for due to them not being "
+                "present in the `schema_config.yaml` configuration file "
+                "(this is not necessarily a problem, if you did not intend "
+                "to include them in the database; see the log for details): \n"
             )
             for k, v in mt.items():
-                msg += f'    {k}: {v} \n'
+                msg += f"    {k}: {v} \n"
             logger.info(msg)
             return mt
         else:
-            logger.info('No missing labels in input.')
+            logger.info("No missing labels in input.")
             return None
     def log_duplicates(self) -> None:
@@ -429,46 +432,44 @@ class BioCypher:
         dn = self._deduplicator.get_duplicate_nodes()
         if dn:
             ntypes = dn[0]
             nids = dn[1]
-            msg = ('Duplicate node types encountered (IDs in log): \n')
+            msg = "Duplicate node types encountered (IDs in log): \n"
             for typ in ntypes:
-                msg += f'    {typ}\n'
+                msg += f"    {typ}\n"
             logger.info(msg)
-            idmsg = ('Duplicate node IDs encountered: \n')
+            idmsg = "Duplicate node IDs encountered: \n"
             for _id in nids:
-                idmsg += f'    {_id}\n'
+                idmsg += f"    {_id}\n"
             logger.debug(idmsg)
         else:
-            logger.info('No duplicate nodes in input.')
+            logger.info("No duplicate nodes in input.")
         de = self._deduplicator.get_duplicate_edges()
         if de:
             etypes = de[0]
             eids = de[1]
-            msg = ('Duplicate edge types encountered (IDs in log): \n')
+            msg = "Duplicate edge types encountered (IDs in log): \n"
             for typ in etypes:
-                msg += f'    {typ}\n'
+                msg += f"    {typ}\n"
             logger.info(msg)
-            idmsg = ('Duplicate edge IDs encountered: \n')
+            idmsg = "Duplicate edge IDs encountered: \n"
             for _id in eids:
-                idmsg += f'    {_id}\n'
+                idmsg += f"    {_id}\n"
             logger.debug(idmsg)
         else:
-            logger.info('No duplicate edges in input.')
+            logger.info("No duplicate edges in input.")
     def show_ontology_structure(self, **kwargs) -> None:
         """
@@ -498,11 +499,78 @@ class BioCypher:
         if not self._offline:
             raise NotImplementedError(
-                'Cannot write import call in online mode.'
+                "Cannot write import call in online mode."
             )
         self._writer.write_import_call()
+    def write_schema_info(self) -> None:
+        """
+        Write an extended schema info YAML file that extends the
+        `schema_config.yaml` with run-time information of the built KG. For
+        instance, include information on whether something present in the actual
+        knowledge graph, whether it is a relationship (which is important in the
+        case of representing relationships as nodes) and the actual sources and
+        targets of edges. Since this file can be used in place of the original
+        `schema_config.yaml` file, it indicates that it is the extended schema
+        by setting `is_schema_info` to `true`.
+        We start by using the `extended_schema` dictionary from the ontology
+        class instance, which contains all expanded entities and relationships.
+        The information of whether something is a relationship can be gathered
+        from the deduplicator instance, which keeps track of all entities that
+        have been seen.
+        """
+        if not self._offline:
+            raise NotImplementedError(
+                "Cannot write schema info in online mode."
+            )
+        ontology = self._get_ontology()
+        schema = ontology.mapping.extended_schema
+        schema["is_schema_info"] = True
+        deduplicator = self._get_deduplicator()
+        for node in deduplicator.entity_types:
+            if node in schema.keys():
+                schema[node]["present_in_knowledge_graph"] = True
+                schema[node]["is_relationship"] = False
+            else:
+                logger.info(
+                    f"Node {node} not present in extended schema. "
+                    "Skipping schema info."
+                )
+        # find 'label_as_edge' cases in schema entries
+        changed_labels = {}
+        for k, v in schema.items():
+            if not isinstance(v, dict):
+                continue
+            if "label_as_edge" in v.keys():
+                if v["label_as_edge"] in deduplicator.seen_relationships.keys():
+                    changed_labels[v["label_as_edge"]] = k
+        for edge in deduplicator.seen_relationships.keys():
+            if edge in changed_labels.keys():
+                edge = changed_labels[edge]
+            if edge in schema.keys():
+                schema[edge]["present_in_knowledge_graph"] = True
+                schema[edge]["is_relationship"] = True
+                # TODO information about source and target nodes
+            else:
+                logger.info(
+                    f"Edge {edge} not present in extended schema. "
+                    "Skipping schema info."
+                )
+        # write to output directory as YAML file
+        path = os.path.join(self._output_directory, "schema_info.yaml")
+        with open(path, "w") as f:
+            f.write(yaml.dump(schema))
+        return schema
     # TRANSLATION METHODS ###
     def translate_term(self, term: str) -> str:
@@ -520,7 +588,7 @@ class BioCypher:
         self.start_ontology()
         return self._translator.translate_term(term)
     def summary(self) -> None:
         """
         Wrapper for showing ontology structure and logging duplicates and

biocypher/_create.py CHANGED Viewed

@@ -13,16 +13,16 @@ dataclasses.
 """
 from ._logger import logger
-logger.debug(f'Loading module {__name__}.')
+logger.debug(f"Loading module {__name__}.")
 from typing import Union
 from dataclasses import field, dataclass
 import os
 __all__ = [
-    'BioCypherEdge',
-    'BioCypherNode',
-    'BioCypherRelAsNode',
+    "BioCypherEdge",
+    "BioCypherNode",
+    "BioCypherRelAsNode",
 ]
@@ -53,7 +53,7 @@ class BioCypherNode:
     node_id: str
     node_label: str
-    preferred_id: str = 'id'
+    preferred_id: str = "id"
     properties: dict = field(default_factory=dict)
     def __post_init__(self):
@@ -64,47 +64,50 @@ class BioCypherNode:
         Replace unwanted characters in properties.
         """
-        self.properties['id'] = self.node_id
-        self.properties['preferred_id'] = self.preferred_id or None
+        self.properties["id"] = self.node_id
+        self.properties["preferred_id"] = self.preferred_id or None
         # TODO actually make None possible here; as is, "id" is the default in
         # the dataclass as well as in the configuration file
-        if ':TYPE' in self.properties.keys():
+        if ":TYPE" in self.properties.keys():
             logger.warning(
                 "Keyword ':TYPE' is reserved for Neo4j. "
-                'Removing from properties.',
+                "Removing from properties.",
                 # "Renaming to 'type'."
             )
             # self.properties["type"] = self.properties[":TYPE"]
-            del self.properties[':TYPE']
+            del self.properties[":TYPE"]
         for k, v in self.properties.items():
             if isinstance(v, str):
                 self.properties[k] = (
                     v.replace(
                         os.linesep,
-                        ' ',
-                    ).replace(
-                        '\n',
-                        ' ',
-                    ).replace(
-                        '\r',
-                        ' ',
+                        " ",
+                    )
+                    .replace(
+                        "\n",
+                        " ",
+                    )
+                    .replace(
+                        "\r",
+                        " ",
                     )
                 )
             elif isinstance(v, list):
-                self.properties[k] = (
-                    [
-                        val.replace(
-                            os.linesep,
-                            ' ',
-                        ).replace(
-                            '\n',
-                            ' ',
-                        ).replace('\r', ' ') for val in v
-                    ]
-                )
+                self.properties[k] = [
+                    val.replace(
+                        os.linesep,
+                        " ",
+                    )
+                    .replace(
+                        "\n",
+                        " ",
+                    )
+                    .replace("\r", " ")
+                    for val in v
+                ]
     def get_id(self) -> str:
         """
@@ -123,7 +126,7 @@ class BioCypherNode:
             str: node_label
         """
         return self.node_label
     def get_type(self) -> str:
         """
         Returns primary node label.
@@ -161,9 +164,9 @@ class BioCypherNode:
             properties as second-level dict.
         """
         return {
-            'node_id': self.node_id,
-            'node_label': self.node_label,
-            'properties': self.properties,
+            "node_id": self.node_id,
+            "node_label": self.node_label,
+            "properties": self.properties,
         }
@@ -204,30 +207,30 @@ class BioCypherEdge:
         Check for reserved keywords.
         """
-        if ':TYPE' in self.properties.keys():
+        if ":TYPE" in self.properties.keys():
             logger.debug(
                 "Keyword ':TYPE' is reserved for Neo4j. "
-                'Removing from properties.',
+                "Removing from properties.",
                 # "Renaming to 'type'."
             )
             # self.properties["type"] = self.properties[":TYPE"]
-            del self.properties[':TYPE']
-        elif 'id' in self.properties.keys():
+            del self.properties[":TYPE"]
+        elif "id" in self.properties.keys():
             logger.debug(
                 "Keyword 'id' is reserved for Neo4j. "
-                'Removing from properties.',
+                "Removing from properties.",
                 # "Renaming to 'type'."
             )
             # self.properties["type"] = self.properties[":TYPE"]
-            del self.properties['id']
-        elif '_ID' in self.properties.keys():
+            del self.properties["id"]
+        elif "_ID" in self.properties.keys():
             logger.debug(
                 "Keyword '_ID' is reserved for Postgres. "
-                'Removing from properties.',
+                "Removing from properties.",
                 # "Renaming to 'type'."
             )
             # self.properties["type"] = self.properties[":TYPE"]
-            del self.properties['_ID']
+            del self.properties["_ID"]
     def get_id(self) -> Union[str, None]:
         """
@@ -295,11 +298,11 @@ class BioCypherEdge:
                 dict.
         """
         return {
-            'relationship_id': self.relationship_id or None,
-            'source_id': self.source_id,
-            'target_id': self.target_id,
-            'relationship_label': self.relationship_label,
-            'properties': self.properties,
+            "relationship_id": self.relationship_id or None,
+            "source_id": self.source_id,
+            "target_id": self.target_id,
+            "relationship_label": self.relationship_label,
+            "properties": self.properties,
         }
@@ -331,20 +334,20 @@ class BioCypherRelAsNode:
     def __post_init__(self):
         if not isinstance(self.node, BioCypherNode):
             raise TypeError(
-                f'BioCypherRelAsNode.node must be a BioCypherNode, '
-                f'not {type(self.node)}.',
+                f"BioCypherRelAsNode.node must be a BioCypherNode, "
+                f"not {type(self.node)}.",
             )
         if not isinstance(self.source_edge, BioCypherEdge):
             raise TypeError(
-                f'BioCypherRelAsNode.source_edge must be a BioCypherEdge, '
-                f'not {type(self.source_edge)}.',
+                f"BioCypherRelAsNode.source_edge must be a BioCypherEdge, "
+                f"not {type(self.source_edge)}.",
             )
         if not isinstance(self.target_edge, BioCypherEdge):
             raise TypeError(
-                f'BioCypherRelAsNode.target_edge must be a BioCypherEdge, '
-                f'not {type(self.target_edge)}.',
+                f"BioCypherRelAsNode.target_edge must be a BioCypherEdge, "
+                f"not {type(self.target_edge)}.",
             )
     def get_node(self) -> BioCypherNode:

biocypher 0.5.17__py3-none-any.whl → 0.5.20__py3-none-any.whl

Potentially problematic release.

biocypher 0.5.17__py3-none-any.whl → 0.5.20__py3-none-any.whl