biocypher 0.5.39__py3-none-any.whl → 0.5.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biocypher has been flagged as potentially problematic; consult the advisory details in the package registry for more information.

@@ -0,0 +1,105 @@
1
+ #!/usr/bin/env python
2
+
3
+ #
4
+ # Copyright 2021, Heidelberg University Clinic
5
+ #
6
+ # File author(s): Sebastian Lobentanzer
7
+ # Michael Hartung
8
+ #
9
+ # Distributed under MIT licence, see the file `LICENSE`.
10
+ #
11
+ """
12
+ BioCypher 'offline' module. Handles the writing of node and edge representations
13
+ suitable for import into a DBMS.
14
+ """
15
+
16
+ from biocypher._logger import logger
17
+ from biocypher.write.graph._neo4j import _Neo4jBatchWriter
18
+ from biocypher.write.graph._arangodb import _ArangoDBBatchWriter
19
+ from biocypher.write.relational._sqlite import _SQLiteBatchWriter
20
+ from biocypher.write.relational._postgresql import _PostgreSQLBatchWriter
21
+
22
+ logger.debug(f"Loading module {__name__}.")
23
+
24
+ from typing import TYPE_CHECKING
25
+
26
+ from biocypher._config import config as _config
27
+
28
+ __all__ = ["get_writer", "DBMS_TO_CLASS"]
29
+
30
+ if TYPE_CHECKING:
31
+ from biocypher._translate import Translator
32
+ from biocypher._deduplicate import Deduplicator
33
+
34
# Mapping of user-facing DBMS identifiers — including common aliases and
# capitalisations — to the batch writer class that handles them.
DBMS_TO_CLASS = {
    alias: writer_cls
    for writer_cls, aliases in (
        (_Neo4jBatchWriter, ("neo", "neo4j", "Neo4j")),
        (_PostgreSQLBatchWriter, ("postgres", "postgresql", "PostgreSQL")),
        (_ArangoDBBatchWriter, ("arango", "arangodb", "ArangoDB")),
        (_SQLiteBatchWriter, ("sqlite", "sqlite3")),
    )
    for alias in aliases
}
47
+
48
+
49
def get_writer(
    dbms: str,
    translator: "Translator",
    deduplicator: "Deduplicator",
    output_directory: str,
    strict_mode: bool,
):
    """
    Return an instance of the batch writer class selected in the config file.

    Args:

        dbms: the database management system; for options, see DBMS_TO_CLASS.

        translator: the Translator object.

        deduplicator: the Deduplicator object.

        output_directory: the directory to write the output files to.

        strict_mode: whether to use strict mode.

    Returns:

        instance: an instance of the selected writer class.

    Raises:

        ValueError: if ``dbms`` is not a key of DBMS_TO_CLASS.
    """

    # Look the writer class up with .get() so an unknown DBMS raises the
    # intended ValueError; the original subscripting raised a KeyError first,
    # making the "Unknown dbms" branch unreachable.
    writer = DBMS_TO_CLASS.get(dbms)

    if writer is None:
        raise ValueError(f"Unknown dbms: {dbms}")

    dbms_config = _config(dbms)

    return writer(
        translator=translator,
        deduplicator=deduplicator,
        delimiter=dbms_config.get("delimiter"),
        array_delimiter=dbms_config.get("array_delimiter"),
        quote=dbms_config.get("quote_character"),
        output_directory=output_directory,
        db_name=dbms_config.get("database_name"),
        import_call_bin_prefix=dbms_config.get("import_call_bin_prefix"),
        import_call_file_prefix=dbms_config.get("import_call_file_prefix"),
        wipe=dbms_config.get("wipe"),
        strict_mode=strict_mode,
        skip_bad_relationships=dbms_config.get(
            "skip_bad_relationships"
        ),  # neo4j
        skip_duplicate_nodes=dbms_config.get(
            "skip_duplicate_nodes"
        ),  # neo4j
        db_user=dbms_config.get("user"),  # psql
        db_password=dbms_config.get("password"),  # psql
        db_port=dbms_config.get("port"),  # psql
    )
File without changes
@@ -0,0 +1,241 @@
1
+ import os
2
+
3
+ from biocypher._logger import logger
4
+ from biocypher.write.graph._neo4j import _Neo4jBatchWriter
5
+
6
+
7
class _ArangoDBBatchWriter(_Neo4jBatchWriter):
    """
    Class for writing node and edge representations to disk using the format
    specified by ArangoDB for the use of "arangoimport". Output files are
    similar to Neo4j, but with a different header format.
    """

    def _get_default_import_call_bin_prefix(self):
        """
        Method to provide the default string for the import call bin prefix.

        Returns:
            str: Empty string; "arangoimp" is expected to be found on the
                system PATH rather than in a fixed installation directory.
        """
        return ""

    def _get_import_script_name(self) -> str:
        """
        Returns the name of the ArangoDB import script.

        Returns:
            str: The name of the import script (ending in .sh)
        """
        return "arangodb-import-call.sh"

    def _write_node_headers(self):
        """
        Writes single CSV file for a graph entity that is represented
        as a node as per the definition in the `schema_config.yaml`,
        containing only the header for this type of node.

        Returns:
            bool: The return value. True for success, False otherwise.

        Raises:
            ValueError: if no part files were recorded for a node label.
        """
        # load headers from data parse
        if not self.node_property_dict:
            logger.error(
                "Header information not found. Was the data parsed first?",
            )
            return False

        for label, props in self.node_property_dict.items():
            # create header CSV with ID, properties, labels

            # ArangoDB documents use the reserved "_key" attribute as ID
            _id = "_key"

            # translate label to PascalCase
            pascal_label = self.translator.name_sentence_to_pascal(label)

            header = f"{pascal_label}-header.csv"
            header_path = os.path.join(
                self.outdir,
                header,
            )

            # check if file already exists
            if os.path.exists(header_path):
                logger.warning(
                    f"File {header_path} already exists. Overwriting."
                )

            # ArangoDB headers carry plain property names, no type suffixes
            props_list = [f"{k}" for k in props.keys()]

            # create list of lists and flatten
            # removes need for empty check of property list
            out_list = [[_id], props_list]
            out_list = [val for sublist in out_list for val in sublist]

            with open(header_path, "w", encoding="utf-8") as f:
                # concatenate with delimiter
                row = self.delim.join(out_list)
                f.write(row)

            # add collection from schema config
            collection = self.translator.ontology.mapping.extended_schema[
                label
            ].get("db_collection_name", None)

            # add file path to the import statement;
            # do once for each part file
            parts = self.parts.get(label, [])

            if not parts:
                raise ValueError(
                    f"No parts found for node label {label}. "
                    f"Check that the data was parsed first.",
                )

            for part in parts:
                import_call_header_path = os.path.join(
                    self.import_call_file_prefix,
                    header,
                )
                import_call_parts_path = os.path.join(
                    self.import_call_file_prefix,
                    part,
                )

                self.import_call_nodes.add(
                    (
                        import_call_header_path,
                        import_call_parts_path,
                        collection,
                    )
                )

        return True

    def _write_edge_headers(self):
        """
        Writes single CSV file for a graph entity that is represented
        as an edge as per the definition in the `schema_config.yaml`,
        containing only the header for this type of edge.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        # load headers from data parse
        if not self.edge_property_dict:
            logger.error(
                "Header information not found. Was the data parsed first?",
            )
            return False

        for label, props in self.edge_property_dict.items():
            # translate label to PascalCase
            pascal_label = self.translator.name_sentence_to_pascal(label)

            # paths
            header = f"{pascal_label}-header.csv"
            header_path = os.path.join(
                self.outdir,
                header,
            )
            parts = f"{pascal_label}-part.*"

            # check for file exists
            if os.path.exists(header_path):
                logger.warning(
                    f"Header file {header_path} already exists. Overwriting."
                )

            # ArangoDB edge headers: _from/_to endpoint references plus the
            # document key and the plain property names
            props_list = [f"{k}" for k in props.keys()]

            out_list = ["_from", "_key", *props_list, "_to"]

            with open(header_path, "w", encoding="utf-8") as f:
                # concatenate with delimiter
                row = self.delim.join(out_list)
                f.write(row)

            # add collection from schema config; initialise to None so a
            # label without a matching "label_as_edge" entry does not leave
            # `collection` unbound (NameError in the original)
            collection = None

            if not self.translator.ontology.mapping.extended_schema.get(label):
                for (
                    _,
                    v,
                ) in self.translator.ontology.mapping.extended_schema.items():
                    if v.get("label_as_edge") == label:
                        collection = v.get("db_collection_name", None)
                        break

            else:
                collection = self.translator.ontology.mapping.extended_schema[
                    label
                ].get("db_collection_name", None)

            # add file path to the import statement (import call path
            # may be different from actual output path)
            header_import_call_path = os.path.join(
                self.import_call_file_prefix,
                header,
            )
            parts_import_call_path = os.path.join(
                self.import_call_file_prefix,
                parts,
            )
            self.import_call_edges.add(
                (
                    header_import_call_path,
                    parts_import_call_path,
                    collection,
                )
            )

        return True

    def _construct_import_call(self) -> str:
        """
        Function to construct the import call detailing folder and
        individual node and edge headers and data files, as well as
        delimiters and database name. Built after all data has been
        processed to ensure that nodes are called before any edges.

        Returns:
            str: a bash script with one "arangoimp" invocation per node and
                edge type.
        """
        base_call = (
            f"{self.import_call_bin_prefix}arangoimp "
            f"--type csv "
            f'--separator="{self.escaped_delim}" '
        )

        # quote the quote character with the other kind of quote
        if self.quote == "'":
            base_call += f'--quote="{self.quote}" '
        else:
            base_call += f"--quote='{self.quote}' "

        # node import calls: one line per node type
        node_lines = "".join(
            self._render_import_line(base_call, *entry)
            for entry in self.import_call_nodes
        )

        # edge import calls: one line per edge type. The original appended a
        # Neo4j-style "--relationships" flag to an unused string and left
        # edge_lines empty, so edges were never imported.
        edge_lines = "".join(
            self._render_import_line(base_call, *entry)
            for entry in self.import_call_edges
        )

        return node_lines + edge_lines

    def _render_import_line(
        self, base_call: str, header_path: str, parts_path: str, collection
    ) -> str:
        """
        Render one "arangoimp" invocation for a single header/parts pair.

        Args:
            base_call (str): the shared command prefix with common flags.
            header_path (str): path of the header CSV file.
            parts_path (str): path (or glob) of the data part file(s).
            collection: target collection name, or None/empty to use the
                importer's default.

        Returns:
            str: one newline-terminated shell command.
        """
        # no space after "--file=": the original's stray space passed an
        # empty value to --file and the path as a separate argument
        line = (
            f"{base_call} "
            f"--headers-file {header_path} "
            f"--file={parts_path} "
        )

        if collection:
            line += f"--create-collection --collection {collection} "

        return f"{line}\n"
@@ -0,0 +1,334 @@
1
+ import os
2
+ import re
3
+ import subprocess
4
+
5
+ from biocypher._logger import logger
6
+ from biocypher.write._batch_writer import parse_label, _BatchWriter
7
+
8
+
9
class _Neo4jBatchWriter(_BatchWriter):
    """
    Class for writing node and edge representations to disk using the
    format specified by Neo4j for the use of admin import. Each batch
    writer instance has a fixed representation that needs to be passed
    at instantiation via the :py:attr:`schema` argument. The instance
    also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
    to convert and extend the hierarchy.

    This class inherits from the abstract class "_BatchWriter" and implements the
    Neo4j-specific methods:

        - _write_node_headers
        - _write_edge_headers
        - _construct_import_call
        - _write_array_string
    """

    # Mapping from schema-config type aliases to neo4j-admin header type
    # suffixes. Shared by node and edge header generation; the original
    # duplicated this mapping inconsistently ("dbl" was only recognised for
    # nodes, not for edges).
    _TYPE_SUFFIXES = {
        "int": ":long",
        "long": ":long",
        "integer": ":long",
        "int[]": ":long[]",
        "long[]": ":long[]",
        "integer[]": ":long[]",
        "float": ":double",
        "double": ":double",
        "dbl": ":double",
        "float[]": ":double[]",
        "double[]": ":double[]",
        "bool": ":boolean",  # TODO Neo4j boolean support / spelling?
        "boolean": ":boolean",
        "bool[]": ":boolean[]",
        "boolean[]": ":boolean[]",
        "str[]": ":string[]",
        "string[]": ":string[]",
    }

    def __init__(self, *args, **kwargs):
        """
        Constructor.

        Check the version of Neo4j and adds a command scope if version >= 5.

        Returns:
            _Neo4jBatchWriter: An instance of the writer.
        """

        # Should read the configuration and setup import_call_bin_prefix.
        super().__init__(*args, **kwargs)

    def _get_default_import_call_bin_prefix(self):
        """
        Method to provide the default string for the import call bin prefix.

        Returns:
            str: The default location for the neo4j admin import location
        """

        return "bin/"

    def _write_array_string(self, string_list):
        """
        Abstract method to write the string representation of an array into a .csv file
        as required by the neo4j admin-import.

        Args:
            string_list (list): list of ontology strings

        Returns:
            str: The string representation of an array for the neo4j admin import
        """
        string = self.adelim.join(string_list)
        return f"{self.quote}{string}{self.quote}"

    def _props_to_header_entries(self, props: dict) -> list:
        """
        Translate a property-name -> declared-type mapping into neo4j-admin
        header columns (e.g. ``count:long``). Unknown or non-string type
        declarations get no suffix and are imported as strings.

        Args:
            props (dict): property names mapped to schema type strings.

        Returns:
            list: header column entries, one per property.
        """
        entries = []
        for name, type_str in props.items():
            suffix = (
                self._TYPE_SUFFIXES.get(type_str, "")
                if isinstance(type_str, str)
                else ""
            )
            entries.append(f"{name}{suffix}")
        return entries

    def _write_node_headers(self):
        """
        Writes single CSV file for a graph entity that is represented
        as a node as per the definition in the `schema_config.yaml`,
        containing only the header for this type of node.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        # load headers from data parse
        if not self.node_property_dict:
            logger.error(
                "Header information not found. Was the data parsed first?",
            )
            return False

        for label, props in self.node_property_dict.items():
            _id = ":ID"

            # translate label to PascalCase
            pascal_label = self.translator.name_sentence_to_pascal(
                parse_label(label)
            )

            header = f"{pascal_label}-header.csv"
            header_path = os.path.join(
                self.outdir,
                header,
            )
            parts = f"{pascal_label}-part.*"

            # check if file already exists
            if os.path.exists(header_path):
                logger.warning(
                    f"Header file `{header_path}` already exists. Overwriting.",
                )

            # map declared property types to neo4j-admin header columns
            props_list = self._props_to_header_entries(props)

            # create list of lists and flatten
            out_list = [[_id], props_list, [":LABEL"]]
            out_list = [val for sublist in out_list for val in sublist]

            with open(header_path, "w", encoding="utf-8") as f:
                # concatenate with delimiter
                row = self.delim.join(out_list)
                f.write(row)

            # add file path to neo4j-admin import statement (import call file
            # path may be different from actual file path)
            import_call_header_path = os.path.join(
                self.import_call_file_prefix,
                header,
            )
            import_call_parts_path = os.path.join(
                self.import_call_file_prefix,
                parts,
            )
            self.import_call_nodes.add(
                (import_call_header_path, import_call_parts_path)
            )

        return True

    def _write_edge_headers(self):
        """
        Writes single CSV file for a graph entity that is represented
        as an edge as per the definition in the `schema_config.yaml`,
        containing only the header for this type of edge.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        # load headers from data parse
        if not self.edge_property_dict:
            logger.error(
                "Header information not found. Was the data parsed first?",
            )
            return False

        for label, props in self.edge_property_dict.items():
            # translate label to PascalCase
            pascal_label = self.translator.name_sentence_to_pascal(
                parse_label(label)
            )

            # paths
            header = f"{pascal_label}-header.csv"
            header_path = os.path.join(
                self.outdir,
                header,
            )
            parts = f"{pascal_label}-part.*"

            # check for file exists
            if os.path.exists(header_path):
                logger.warning(
                    f"File {header_path} already exists. Overwriting."
                )

            # map declared property types to neo4j-admin header columns
            props_list = self._props_to_header_entries(props)

            skip_id = False
            schema_label = None

            if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
                skip_id = True
            elif not self.translator.ontology.mapping.extended_schema.get(
                label
            ):
                # find label in schema by label_as_edge
                for (
                    k,
                    v,
                ) in self.translator.ontology.mapping.extended_schema.items():
                    if v.get("label_as_edge") == label:
                        schema_label = k
                        break
            else:
                schema_label = label

            out_list = [":START_ID"]

            if schema_label:
                schema_entry = (
                    self.translator.ontology.mapping.extended_schema.get(
                        schema_label
                    )
                )
                # guard against a missing schema entry (the original chained
                # .get() would raise AttributeError on None) and use
                # "is False" so only an explicit opt-out skips the id column
                if schema_entry and schema_entry.get("use_id") is False:
                    skip_id = True

            if not skip_id:
                out_list.append("id")

            out_list.extend(props_list)
            out_list.extend([":END_ID", ":TYPE"])

            with open(header_path, "w", encoding="utf-8") as f:
                # concatenate with delimiter
                row = self.delim.join(out_list)
                f.write(row)

            # add file path to neo4j-admin import statement (import call file
            # path may be different from actual file path)
            import_call_header_path = os.path.join(
                self.import_call_file_prefix,
                header,
            )
            import_call_parts_path = os.path.join(
                self.import_call_file_prefix,
                parts,
            )
            self.import_call_edges.add(
                (import_call_header_path, import_call_parts_path)
            )

        return True

    def _get_import_script_name(self) -> str:
        """
        Returns the name of the neo4j admin import script

        Returns:
            str: The name of the import script (ending in .sh)
        """
        return "neo4j-admin-import-call.sh"

    def _construct_import_call(self) -> str:
        """
        Function to construct the import call detailing folder and
        individual node and edge headers and data files, as well as
        delimiters and database name. Built after all data has been
        processed to ensure that nodes are called before any edges.

        Returns:
            str: a bash command for neo4j-admin import
        """
        import_call_neo4j_v4 = self._get_import_call(
            "import", "--database=", "--force="
        )
        import_call_neo4j_v5 = self._get_import_call(
            "database import full", "", "--overwrite-destination="
        )
        # detect the major Neo4j version at script runtime and dispatch to
        # the matching admin-import syntax (the CLI changed in Neo4j 5)
        neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"

        import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
        return import_script

    def _get_import_call(
        self, import_cmd: str, database_cmd: str, wipe_cmd: str
    ) -> str:
        """Get parametrized import call for Neo4j 4 or 5+.

        Args:
            import_cmd (str): The import command to use.
            database_cmd (str): The database command to use.
            wipe_cmd (str): The wipe command to use.

        Returns:
            str: The import call.
        """
        import_call = (
            f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "
            f'--delimiter="{self.escaped_delim}" '
            f'--array-delimiter="{self.escaped_adelim}" '
        )

        # quote the quote character with the other kind of quote
        if self.quote == "'":
            import_call += f'--quote="{self.quote}" '
        else:
            import_call += f"--quote='{self.quote}' "

        if self.wipe:
            import_call += f"{wipe_cmd}true "
        if self.skip_bad_relationships:
            import_call += "--skip-bad-relationships=true "
        if self.skip_duplicate_nodes:
            import_call += "--skip-duplicate-nodes=true "

        # append node import calls
        for header_path, parts_path in self.import_call_nodes:
            import_call += f'--nodes="{header_path},{parts_path}" '

        # append edge import calls
        for header_path, parts_path in self.import_call_edges:
            import_call += f'--relationships="{header_path},{parts_path}" '

        # Database needs to be at the end starting with Neo4j 5.0+.
        import_call += f"{database_cmd}{self.db_name} "
        return import_call
File without changes