biocypher 0.5.39__py3-none-any.whl → 0.5.41__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

This version of biocypher might be problematic.

--- /dev/null
+++ biocypher/write/relational/_postgresql.py
@@ -0,0 +1,320 @@
+ import os
+ import glob
+
+ from biocypher._logger import logger
+ from biocypher.write._batch_writer import _BatchWriter
+
+
+ class _PostgreSQLBatchWriter(_BatchWriter):
+     """
+     Class for writing node and edge representations to disk using the
+     format specified by PostgreSQL for the use of "COPY FROM...". Each batch
+     writer instance has a fixed representation that needs to be passed
+     at instantiation via the :py:attr:`schema` argument. The instance
+     also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
+     to convert and extend the hierarchy.
+
+     This class inherits from the abstract class "_BatchWriter" and implements the
+     PostgreSQL-specific methods:
+
+         - _write_node_headers
+         - _write_edge_headers
+         - _construct_import_call
+         - _write_array_string
+     """
+
+     DATA_TYPE_LOOKUP = {
+         "str": "VARCHAR",  # VARCHAR needs limit
+         "int": "INTEGER",
+         "long": "BIGINT",
+         "float": "NUMERIC",
+         "double": "NUMERIC",
+         "dbl": "NUMERIC",
+         "boolean": "BOOLEAN",
+         "str[]": "VARCHAR[]",
+         "string[]": "VARCHAR[]",
+     }
+
+     def __init__(self, *args, **kwargs):
+         self._copy_from_csv_commands = set()
+         super().__init__(*args, **kwargs)
+
+     def _get_default_import_call_bin_prefix(self):
+         """
+         Method to provide the default string for the import call bin prefix.
+
+         Returns:
+             str: The default location for the psql command
+         """
+         return ""
+
+     def _get_data_type(self, string) -> str:
+         try:
+             return self.DATA_TYPE_LOOKUP[string]
+         except KeyError:
+             logger.info(
+                 f'Could not determine data type {string}. Using default "VARCHAR"'
+             )
+             return "VARCHAR"
+
+     def _write_array_string(self, string_list) -> str:
+         """
+         Method to write the string representation of an array into a .csv file
+         as required by the PostgreSQL COPY command, with '{','}' brackets and ',' separation.
+
+         Args:
+             string_list (list): list of ontology strings
+
+         Returns:
+             str: The string representation of an array for the postgres COPY command
+         """
+         string = ",".join(string_list)
+         string = f'"{{{string}}}"'
+         return string
+
+     def _get_import_script_name(self) -> str:
+         """
+         Returns the name of the psql import script.
+
+         Returns:
+             str: The name of the import script (ending in .sh)
+         """
+         return f"{self.db_name}-import-call.sh"
+
+     def _adjust_pascal_to_psql(self, string):
+         string = string.replace(".", "_")
+         string = string.lower()
+         return string
+
+     def _write_node_headers(self):
+         """
+         Writes a single table creation (SQL) file for each graph entity that
+         is represented as a node, as per the definition in the
+         `schema_config.yaml`, containing only the header for this type of node.
+
+         Returns:
+             bool: The return value. True for success, False otherwise.
+         """
+         # load headers from data parse
+         if not self.node_property_dict:
+             logger.error(
+                 "Header information not found. Was the data parsed first?",
+             )
+             return False
+
+         for label, props in self.node_property_dict.items():
+             # create header CSV with ID, properties, labels
+
+             # translate label to PascalCase
+             pascal_label = self.translator.name_sentence_to_pascal(label)
+
+             parts = f"{pascal_label}-part*.csv"
+             parts_paths = os.path.join(self.outdir, parts)
+             parts_paths = glob.glob(parts_paths)
+             parts_paths.sort()
+
+             # adjust label for import to psql
+             pascal_label = self._adjust_pascal_to_psql(pascal_label)
+             table_create_command_path = os.path.join(
+                 self.outdir,
+                 f"{pascal_label}-create_table.sql",
+             )
+
+             # check if file already exists
+             if os.path.exists(table_create_command_path):
+                 logger.warning(
+                     f"File {table_create_command_path} already exists. Overwriting.",
+                 )
+
+             # concatenate key:value in props
+             columns = ["_ID VARCHAR"]
+             for col_name, col_type in props.items():
+                 col_type = self._get_data_type(col_type)
+                 col_name = self._adjust_pascal_to_psql(col_name)
+                 columns.append(f"{col_name} {col_type}")
+             columns.append("_LABEL VARCHAR[]")
+
+             with open(table_create_command_path, "w", encoding="utf-8") as f:
+                 command = ""
+                 if self.wipe:
+                     command += f"DROP TABLE IF EXISTS {pascal_label};\n"
+
+                 # table creation requires comma separation
+                 command += (
+                     f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
+                 )
+                 f.write(command)
+
+             for parts_path in parts_paths:
+                 # if import_call_file_prefix is set, replace actual path
+                 # with prefix
+                 if self.import_call_file_prefix != self.outdir:
+                     parts_path = parts_path.replace(
+                         self.outdir,
+                         self.import_call_file_prefix,
+                     )
+
+                 self._copy_from_csv_commands.add(
+                     f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
+                 )
+
+             # add file path to import statement
+             # if import_call_file_prefix is set, replace actual path
+             # with prefix
+             if self.import_call_file_prefix != self.outdir:
+                 table_create_command_path = table_create_command_path.replace(
+                     self.outdir,
+                     self.import_call_file_prefix,
+                 )
+
+             self.import_call_nodes.add(table_create_command_path)
+
+         return True
+
+     def _write_edge_headers(self):
+         """
+         Writes a single table creation (SQL) file for each graph entity that
+         is represented as an edge, as per the definition in the
+         `schema_config.yaml`, containing only the header for this type of edge.
+
+         Returns:
+             bool: The return value. True for success, False otherwise.
+         """
+         # load headers from data parse
+         if not self.edge_property_dict:
+             logger.error(
+                 "Header information not found. Was the data parsed first?",
+             )
+             return False
+
+         for label, props in self.edge_property_dict.items():
+             # translate label to PascalCase
+             pascal_label = self.translator.name_sentence_to_pascal(label)
+
+             parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv")
+             parts_paths = glob.glob(parts_paths)
+             parts_paths.sort()
+
+             # adjust label for import to psql
+             pascal_label = self._adjust_pascal_to_psql(pascal_label)
+             table_create_command_path = os.path.join(
+                 self.outdir,
+                 f"{pascal_label}-create_table.sql",
+             )
+
+             # check whether the file already exists
+             if os.path.exists(table_create_command_path):
+                 logger.warning(
+                     f"File {table_create_command_path} already exists. Overwriting.",
+                 )
+
+             # concatenate key:value in props
+             columns = []
+             for col_name, col_type in props.items():
+                 col_type = self._get_data_type(col_type)
+                 col_name = self._adjust_pascal_to_psql(col_name)
+                 if col_name == "_id":
+                     # should ideally never happen
+                     raise ValueError(
+                         "Column name '_ID' is reserved for internal use, "
+                         "denoting the relationship ID. Please choose a "
+                         "different name for your column."
+                     )
+
+                 columns.append(f"{col_name} {col_type}")
+
+             # create list of lists and flatten
+             # removes need for empty check of property list
+             out_list = [
+                 "_START_ID VARCHAR",
+                 "_ID VARCHAR",
+                 *columns,
+                 "_END_ID VARCHAR",
+                 "_TYPE VARCHAR",
+             ]
+
+             with open(table_create_command_path, "w", encoding="utf-8") as f:
+                 command = ""
+                 if self.wipe:
+                     command += f"DROP TABLE IF EXISTS {pascal_label};\n"
+
+                 # table creation requires comma separation
+                 command += (
+                     f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
+                 )
+                 f.write(command)
+
+             for parts_path in parts_paths:
+                 # if import_call_file_prefix is set, replace actual path
+                 # with prefix
+                 if self.import_call_file_prefix != self.outdir:
+                     parts_path = parts_path.replace(
+                         self.outdir,
+                         self.import_call_file_prefix,
+                     )
+
+                 self._copy_from_csv_commands.add(
+                     f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
+                 )
+
+             # add file path to import statement
+             # if import_call_file_prefix is set, replace actual path
+             # with prefix
+             if self.import_call_file_prefix != self.outdir:
+                 table_create_command_path = table_create_command_path.replace(
+                     self.outdir,
+                     self.import_call_file_prefix,
+                 )
+
+             self.import_call_edges.add(table_create_command_path)
+
+         return True
+
+     def _construct_import_call(self) -> str:
+         """
+         Function to construct the import call detailing folder and
+         individual node and edge headers and data files, as well as
+         delimiters and database name. Built after all data has been
+         processed to ensure that nodes are called before any edges.
+
+         Returns:
+             str: a bash command for PostgreSQL import
+         """
+         import_call = ""
+
+         # create tables
+         # At this point, csv files of nodes and edges do not require differentiation
+         for import_file_path in [
+             *self.import_call_nodes,
+             *self.import_call_edges,
+         ]:
+             import_call += f'echo "Setup {import_file_path}..."\n'
+             if self.db_password:
+                 # set password variable inline
+                 import_call += f"PGPASSWORD={self.db_password} "
+             import_call += (
+                 f"{self.import_call_bin_prefix}psql -f {import_file_path}"
+             )
+             import_call += f" --dbname {self.db_name}"
+             import_call += f" --host {self.db_host}"
+             import_call += f" --port {self.db_port}"
+             import_call += f" --user {self.db_user}"
+             import_call += '\necho "Done!"\n'
+             import_call += "\n"
+
+         # copy data to tables
+         for command in self._copy_from_csv_commands:
+             table_part = command.split(" ")[3]
+             import_call += f'echo "Importing {table_part}..."\n'
+             if self.db_password:
+                 # set password variable inline
+                 import_call += f"PGPASSWORD={self.db_password} "
+             import_call += f'{self.import_call_bin_prefix}psql -c "{command}"'
+             import_call += f" --dbname {self.db_name}"
+             import_call += f" --host {self.db_host}"
+             import_call += f" --port {self.db_port}"
+             import_call += f" --user {self.db_user}"
+             import_call += '\necho "Done!"\n'
+             import_call += "\n"
+
+         return import_call
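
To illustrate what this writer emits, here is a sketch of the generated artifacts for a hypothetical node label "protein" with properties `name: str` and `score: float`, a tab delimiter, and illustrative connection settings (db_name `test`, host `localhost`, port `5432`, user `postgres`, password `secret`); none of these values, nor the `/outdir` path or `Protein-part000.csv` file name, come from the package itself. The table creation script `protein-create_table.sql` would contain roughly:

    -- emitted only when wipe is set
    DROP TABLE IF EXISTS protein;
    CREATE TABLE protein(_ID VARCHAR,name VARCHAR,score NUMERIC,_LABEL VARCHAR[]);

and the generated `test-import-call.sh` would chain, per file, commands of the form:

    echo "Setup /outdir/protein-create_table.sql..."
    PGPASSWORD=secret psql -f /outdir/protein-create_table.sql --dbname test --host localhost --port 5432 --user postgres
    echo "Done!"

    echo "Importing '/outdir/Protein-part000.csv'..."
    PGPASSWORD=secret psql -c "\copy protein FROM '/outdir/Protein-part000.csv' DELIMITER E'\t' CSV;" --dbname test --host localhost --port 5432 --user postgres
    echo "Done!"

Note that paths are rewritten when `import_call_file_prefix` differs from the output directory, and the `PGPASSWORD` prefix is only prepended when a password is configured.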
--- /dev/null
+++ biocypher/write/relational/_sqlite.py
@@ -0,0 +1,51 @@
+ from biocypher.write.relational._postgresql import _PostgreSQLBatchWriter
+
+
+ class _SQLiteBatchWriter(_PostgreSQLBatchWriter):
+     """
+     Class for writing node and edge representations to a SQLite database.
+     It uses the _PostgreSQLBatchWriter class under the hood, which already
+     implements the logic to write the nodes/edges to a relational DBMS.
+     Only the import bash script differs between PostgreSQL and SQLite
+     and is therefore implemented in this class:
+
+         - _construct_import_call
+     """
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def _construct_import_call(self) -> str:
+         """
+         Function to construct the import call detailing folder and
+         individual node and edge headers and data files, as well as
+         delimiters and database name. Built after all data has been
+         processed to ensure that nodes are called before any edges.
+
+         Returns:
+             str: a bash command for SQLite import
+         """
+         import_call = ""
+
+         # create tables
+         # At this point, csv files of nodes and edges do not require differentiation
+         for import_file_path in [
+             *self.import_call_nodes,
+             *self.import_call_edges,
+         ]:
+             import_call += f'echo "Setup {import_file_path}..."\n'
+             import_call += f"{self.import_call_bin_prefix}sqlite3 {self.db_name} < {import_file_path}"
+             import_call += '\necho "Done!"\n'
+             import_call += "\n"
+
+         for command in self._copy_from_csv_commands:
+             table_name = command.split(" ")[1]
+             table_part = command.split(" ")[3].replace("'", "")
+             import_call += f'echo "Importing {table_part}..."\n'
+             separator = self.delim
+             import_part = f".import {table_part} {table_name}"
+             import_call += f"{self.import_call_bin_prefix}sqlite3 -separator $'{separator}' {self.db_name} \"{import_part}\""
+             import_call += '\necho "Done!"\n'
+             import_call += "\n"
+
+         return import_call
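
For comparison, a sketch of the SQLite counterpart under the same illustrative assumptions as above: the script pipes each table creation file into the database, then uses the sqlite3 `.import` dot command, with the table name and file path parsed back out of the `\copy` commands accumulated by the parent class:

    echo "Setup /outdir/protein-create_table.sql..."
    sqlite3 test < /outdir/protein-create_table.sql
    echo "Done!"

    echo "Importing /outdir/Protein-part000.csv..."
    sqlite3 -separator $'\t' test ".import /outdir/Protein-part000.csv protein"
    echo "Done!"

Reusing the parent's `\copy` command set this way avoids duplicating the per-file bookkeeping; only the final command strings differ between the two backends.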
--- biocypher-0.5.39.dist-info/METADATA
+++ biocypher-0.5.41.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: biocypher
- Version: 0.5.39
+ Version: 0.5.41
  Summary: A unifying framework for biomedical research knowledge graphs
  Home-page: https://github.com/biocypher/biocypher
  License: MIT
--- /dev/null
+++ biocypher-0.5.41.dist-info/RECORD
@@ -0,0 +1,32 @@
+ biocypher/__init__.py,sha256=ejNY53vH_pE3ZbIN8G_ZBYxxPG9aERovRLD0XhDvt4k,942
+ biocypher/_config/__init__.py,sha256=fFHRFYxE2MtDAQWL6upe--MJ1vw3Z8CwIPhF2gW8cRU,3698
+ biocypher/_config/biocypher_config.yaml,sha256=TEvIOgRy9WMvsb2CrV1ywuKLZWbedYubCC2bpdBIalU,2713
+ biocypher/_config/test_config.yaml,sha256=Np8jeS5_EP6HHOvMKb7B_Tkyqd5YaYlYz_DVsXypt-A,119
+ biocypher/_config/test_schema_config.yaml,sha256=D1600WgEj3iTXrumVU9LIivJHJO36iaxfkOgyam9zVU,3129
+ biocypher/_config/test_schema_config_disconnected.yaml,sha256=Qm8FLxEn2spHcyj_5F859KjcDvKSxNhxDvi4b4LLkvQ,68
+ biocypher/_config/test_schema_config_extended.yaml,sha256=wn3A76142hhjnImhMF6RODbCFESTJ2TtPvcFdIFsAT0,3309
+ biocypher/_connect.py,sha256=7hk3J03hzZOPE48ISaoB6IgRun8GaUmDtIRnnD7vKiU,13453
+ biocypher/_core.py,sha256=5rZKYie_vSjTYduH8oH-GxLMZuNqLAe3ZYAQ5nUp8Nc,22578
+ biocypher/_create.py,sha256=vpUchUdEpWupZi1LgFLxAWMtqoBwnWbP7PwEDUCBS4A,10202
+ biocypher/_deduplicate.py,sha256=BBvfpXzu6L5YDY5FdtXxnf8YlsbJpbCE8RdUoKsm0n0,4949
+ biocypher/_get.py,sha256=3Kpky3blfNf1JwxKWLsZxTU2aTP_C4sUe8OpiyYj63I,10810
+ biocypher/_logger.py,sha256=NGXe3hZA79WSujfOgpcxHBf8N2QAfrmvM1LFDpsGK2U,3185
+ biocypher/_mapping.py,sha256=ERSNH2Bg19145KytxbFE4BInPaiP-LWW7osOBot29Eo,9304
+ biocypher/_metadata.py,sha256=GGh6YvKYrRWqdyZQYTaLnkYPaHgVHz00V6kpXQdjr2k,1658
+ biocypher/_misc.py,sha256=lUUbF13FdBlYq01C-Vit52IbeRehW0oSUWsQ9tFC-xo,5938
+ biocypher/_ontology.py,sha256=3Wu1ZZYmtLpWfopi-aY9BA8qZ-ltPMXN4Ok_diK1YdA,28410
+ biocypher/_pandas.py,sha256=GVCFM68J7yBjh40MpkNVgD8qT1RFMrrIjMOtD3iKsf4,3040
+ biocypher/_translate.py,sha256=JafvhtVaFSpruRfYh9BzjVbvDF1Mhg7LLKMDZHWkRjg,16496
+ biocypher/write/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ biocypher/write/_batch_writer.py,sha256=Ta2DNjSnJcVtFDMOGTtH5nnbKwyqSGf7xXGpYzi1bDM,36826
+ biocypher/write/_write.py,sha256=HLFQyGqLdkmIoBOjL9m81OUuSsHjvSfK9LY4jtrinv0,3104
+ biocypher/write/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ biocypher/write/graph/_arangodb.py,sha256=du5pivCR7xKs8VyxeegxYsSBIcsXGrfSbM_AffFapwg,8071
+ biocypher/write/graph/_neo4j.py,sha256=qSj1PryD4UmveS7ACs1R3eo2pegi53pVI7d7P0ihOKI,11930
+ biocypher/write/relational/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ biocypher/write/relational/_postgresql.py,sha256=NdI-ULP8valsqlkObOg50od-3-amVj5RzGnZ_7NW2ww,11945
+ biocypher/write/relational/_sqlite.py,sha256=KLQpxQXF1B8qqTtKUFfjWdwHjd1Fhn9syK931Z0dsq0,2066
+ biocypher-0.5.41.dist-info/LICENSE,sha256=SjUaQkq671iQUZOxEUpC4jvJxXOlfSiHTTueyz9kXJM,1065
+ biocypher-0.5.41.dist-info/METADATA,sha256=D23b4n9k_oEmXIMC3eKYJLP5MJVVYHkAeAWB8DAFpYk,10642
+ biocypher-0.5.41.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
+ biocypher-0.5.41.dist-info/RECORD,,
@@ -1,24 +0,0 @@
1
- biocypher/__init__.py,sha256=ejNY53vH_pE3ZbIN8G_ZBYxxPG9aERovRLD0XhDvt4k,942
2
- biocypher/_config/__init__.py,sha256=fFHRFYxE2MtDAQWL6upe--MJ1vw3Z8CwIPhF2gW8cRU,3698
3
- biocypher/_config/biocypher_config.yaml,sha256=H0TKBJun7pQmIfIAgEiMkDDgE3kKoCElBuMt8lkkQcU,2404
4
- biocypher/_config/test_config.yaml,sha256=Np8jeS5_EP6HHOvMKb7B_Tkyqd5YaYlYz_DVsXypt-A,119
5
- biocypher/_config/test_schema_config.yaml,sha256=D1600WgEj3iTXrumVU9LIivJHJO36iaxfkOgyam9zVU,3129
6
- biocypher/_config/test_schema_config_disconnected.yaml,sha256=Qm8FLxEn2spHcyj_5F859KjcDvKSxNhxDvi4b4LLkvQ,68
7
- biocypher/_config/test_schema_config_extended.yaml,sha256=wn3A76142hhjnImhMF6RODbCFESTJ2TtPvcFdIFsAT0,3309
8
- biocypher/_connect.py,sha256=0oSyO6CEIlKL8rHo-HHE7y0FzGfSi4vnEXSDy1TnIUE,12456
9
- biocypher/_core.py,sha256=W3qeuCwG0q5H_RRoYDGfKe1VWTQx_5J_WOfZQqsWQXI,22388
10
- biocypher/_create.py,sha256=vpUchUdEpWupZi1LgFLxAWMtqoBwnWbP7PwEDUCBS4A,10202
11
- biocypher/_deduplicate.py,sha256=BBvfpXzu6L5YDY5FdtXxnf8YlsbJpbCE8RdUoKsm0n0,4949
12
- biocypher/_get.py,sha256=3Kpky3blfNf1JwxKWLsZxTU2aTP_C4sUe8OpiyYj63I,10810
13
- biocypher/_logger.py,sha256=NGXe3hZA79WSujfOgpcxHBf8N2QAfrmvM1LFDpsGK2U,3185
14
- biocypher/_mapping.py,sha256=ERSNH2Bg19145KytxbFE4BInPaiP-LWW7osOBot29Eo,9304
15
- biocypher/_metadata.py,sha256=McndBOmaAbQBpOK_B4FnLKihoAfeiDvmjqJGpzDsX-k,1658
16
- biocypher/_misc.py,sha256=g5B-PO_XJlYEJC7kEVRdCXeB2NW0ZSVr_5KqTEk2ldk,5877
17
- biocypher/_ontology.py,sha256=3Wu1ZZYmtLpWfopi-aY9BA8qZ-ltPMXN4Ok_diK1YdA,28410
18
- biocypher/_pandas.py,sha256=GVCFM68J7yBjh40MpkNVgD8qT1RFMrrIjMOtD3iKsf4,3040
19
- biocypher/_translate.py,sha256=JafvhtVaFSpruRfYh9BzjVbvDF1Mhg7LLKMDZHWkRjg,16496
20
- biocypher/_write.py,sha256=5pW0gYj2QW--FNB4DK53gK6D7dNHhPxYnPBhB8NjaSo,69550
21
- biocypher-0.5.39.dist-info/LICENSE,sha256=SjUaQkq671iQUZOxEUpC4jvJxXOlfSiHTTueyz9kXJM,1065
22
- biocypher-0.5.39.dist-info/METADATA,sha256=-bmzjCWZTR9TYom3XUQoXGOP_X8-gG8H-u_kU7f28cw,10642
23
- biocypher-0.5.39.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
24
- biocypher-0.5.39.dist-info/RECORD,,