biocypher 0.5.17__py3-none-any.whl → 0.5.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biocypher/__init__.py +10 -11
- biocypher/_config/__init__.py +25 -27
- biocypher/_config/biocypher_config.yaml +1 -2
- biocypher/_connect.py +59 -79
- biocypher/_core.py +146 -78
- biocypher/_create.py +55 -52
- biocypher/_deduplicate.py +81 -36
- biocypher/_logger.py +12 -13
- biocypher/_mapping.py +69 -83
- biocypher/_metadata.py +12 -17
- biocypher/_misc.py +17 -28
- biocypher/_ontology.py +85 -101
- biocypher/_pandas.py +46 -11
- biocypher/_translate.py +93 -113
- biocypher/_write.py +457 -404
- {biocypher-0.5.17.dist-info → biocypher-0.5.20.dist-info}/METADATA +16 -6
- biocypher-0.5.20.dist-info/RECORD +23 -0
- biocypher-0.5.17.dist-info/RECORD +0 -23
- {biocypher-0.5.17.dist-info → biocypher-0.5.20.dist-info}/LICENSE +0 -0
- {biocypher-0.5.17.dist-info → biocypher-0.5.20.dist-info}/WHEEL +0 -0
biocypher/_write.py
CHANGED
@@ -17,7 +17,7 @@ import glob
 
 from ._logger import logger
 
-logger.debug(f'Loading module {__name__}.')
+logger.debug(f"Loading module {__name__}.")
 
 from abc import ABC, abstractmethod
 from types import GeneratorType
@@ -31,83 +31,15 @@ from more_itertools import peekable
 from ._config import config as _config
 from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
 
-__all__ = ['get_writer']
+__all__ = ["get_writer"]
 
 if TYPE_CHECKING:
-
     from ._ontology import Ontology
     from ._translate import Translator
     from ._deduplicate import Deduplicator
 
 
 class _BatchWriter(ABC):
-    """
-    Abtract parent class for writing node and edge representations to disk using the
-    format specified by each database type. The database-specific functions are implemented
-    by the respective child-classes. This abstract class contains all methods expected by
-    a bach writer instance, some of which need to be overwritten by the child classes.
-
-    Each batch writer instance has a fixed representation that needs to be passed
-    at instantiation via the :py:attr:`schema` argument. The instance
-    also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
-    to convert and extend the hierarchy.
-
-    Requires the following methods to be overwritten by database-specific writer classes:
-        - _write_node_headers
-        - _write_edge_headers
-        - _construct_import_call
-        - _write_array_string
-        - _get_import_script_name
-
-    Args:
-        ontology:
-            Instance of :py:class:`Ontology` to enable translation and
-            ontology queries
-
-        translator:
-            Instance of :py:class:`Translator` to enable translation of
-            nodes and manipulation of properties.
-
-        deduplicator:
-            Instance of :py:class:`Deduplicator` to enable deduplication
-            of nodes and edges.
-
-        delimiter:
-            The delimiter to use for the CSV files.
-
-        array_delimiter:
-            The delimiter to use for array properties.
-
-        quote:
-            The quote character to use for the CSV files.
-
-        dirname:
-            Path for exporting CSV files.
-
-        db_name:
-            Name of the database that will be used in the generated
-            commands.
-
-        import_call_bin_prefix:
-            Path prefix for the admin import call binary.
-
-        import_call_file_prefix:
-            Path prefix for the data files (headers and parts) in the import
-            call.
-
-        wipe:
-            Whether to force import (removing existing DB content). (Specific to Neo4j.)
-
-        strict_mode:
-            Whether to enforce source, version, and license properties.
-
-        skip_bad_relationships:
-            Whether to skip relationships that do not have a valid
-            start and end node. (Specific to Neo4j.)
-
-        skip_duplicate_nodes:
-            Whether to skip duplicate nodes. (Specific to Neo4j.)
-    """
     @abstractmethod
     def _get_default_import_call_bin_prefix(self):
         """
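
Note (illustrative, not part of the diff): the docstring above lists the five hooks every concrete writer must override. A minimal, hypothetical subclass would look roughly like this; class name and return values are invented for illustration:

    class _MyDBBatchWriter(_BatchWriter):
        """Sketch only: supplies the five database-specific hooks."""

        def _get_default_import_call_bin_prefix(self):
            return ""  # assume the import binary is on PATH

        def _write_array_string(self, string_list):
            return self.adelim.join(string_list)  # join with the array delimiter

        def _write_node_headers(self):
            return True  # would write one header CSV per node label

        def _write_edge_headers(self):
            return True  # would write one header CSV per edge label

        def _construct_import_call(self) -> str:
            return "mydb-import --dir ."  # command written to the import script

        def _get_import_script_name(self) -> str:
            return "mydb-import-call.sh"
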
@@ -193,14 +125,13 @@ class _BatchWriter(ABC):
 
     def __init__(
         self,
-        ontology: 'Ontology',
-        translator: 'Translator',
-        deduplicator: 'Deduplicator',
+        translator: "Translator",
+        deduplicator: "Deduplicator",
         delimiter: str,
-        array_delimiter: str = ',',
+        array_delimiter: str = ",",
         quote: str = '"',
         output_directory: Optional[str] = None,
-        db_name: str = 'neo4j',
+        db_name: str = "neo4j",
         import_call_bin_prefix: Optional[str] = None,
         import_call_file_prefix: Optional[str] = None,
         wipe: bool = True,
@@ -209,11 +140,92 @@ class _BatchWriter(ABC):
         skip_duplicate_nodes: bool = False,
         db_user: str = None,
         db_password: str = None,
-        db_port: str = None,
+        db_host: str = None,
+        db_port: str = None,
     ):
+        """
+
+        Abtract parent class for writing node and edge representations to disk
+        using the format specified by each database type. The database-specific
+        functions are implemented by the respective child-classes. This abstract
+        class contains all methods expected by a bach writer instance, some of
+        which need to be overwritten by the child classes.
+
+        Each batch writer instance has a fixed representation that needs to be
+        passed at instantiation via the :py:attr:`schema` argument. The instance
+        also expects an ontology adapter via :py:attr:`ontology_adapter` to be
+        able to convert and extend the hierarchy.
+
+        Requires the following methods to be overwritten by database-specific
+        writer classes:
+
+            - _write_node_headers
+            - _write_edge_headers
+            - _construct_import_call
+            - _write_array_string
+            - _get_import_script_name
+
+        Args:
+            translator:
+                Instance of :py:class:`Translator` to enable translation of
+                nodes and manipulation of properties.
+
+            deduplicator:
+                Instance of :py:class:`Deduplicator` to enable deduplication
+                of nodes and edges.
+
+            delimiter:
+                The delimiter to use for the CSV files.
+
+            array_delimiter:
+                The delimiter to use for array properties.
+
+            quote:
+                The quote character to use for the CSV files.
+
+            dirname:
+                Path for exporting CSV files.
+
+            db_name:
+                Name of the database that will be used in the generated
+                commands.
+
+            import_call_bin_prefix:
+                Path prefix for the admin import call binary.
+
+            import_call_file_prefix:
+                Path prefix for the data files (headers and parts) in the import
+                call.
+
+            wipe:
+                Whether to force import (removing existing DB content). (Specific to Neo4j.)
+
+            strict_mode:
+                Whether to enforce source, version, and license properties.
+
+            skip_bad_relationships:
+                Whether to skip relationships that do not have a valid
+                start and end node. (Specific to Neo4j.)
+
+            skip_duplicate_nodes:
+                Whether to skip duplicate nodes. (Specific to Neo4j.)
+
+            db_user:
+                The database user.
+
+            db_password:
+                The database password.
+
+            db_host:
+                The database host. Defaults to localhost.
+
+            db_port:
+                The database port.
+        """
         self.db_name = db_name
         self.db_user = db_user
         self.db_password = db_password
+        self.db_host = db_host or "localhost"
         self.db_port = db_port
 
         self.delim, self.escaped_delim = self._process_delimiter(delimiter)
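
Note (illustrative, not from the diff): because of the `db_host or "localhost"` fallback added above, a writer constructed without an explicit host targets localhost. A hypothetical call, with made-up credential values and library objects assumed to exist:

    writer = _PostgreSQLBatchWriter(
        translator=translator,      # hypothetical Translator instance
        deduplicator=deduplicator,  # hypothetical Deduplicator instance
        delimiter="\\t",
        db_name="import_db",
        db_user="postgres",
        db_password="secret",
        db_port="5432",             # db_host omitted -> "localhost"
    )
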
@@ -225,7 +237,8 @@ class _BatchWriter(ABC):
         self.skip_duplicate_nodes = skip_duplicate_nodes
 
         if import_call_bin_prefix is None:
-            self.import_call_bin_prefix = self._get_default_import_call_bin_prefix(
+            self.import_call_bin_prefix = (
+                self._get_default_import_call_bin_prefix()
             )
         else:
             self.import_call_bin_prefix = import_call_bin_prefix
@@ -233,8 +246,6 @@ class _BatchWriter(ABC):
         self.wipe = wipe
         self.strict_mode = strict_mode
 
-        self.extended_schema = ontology.extended_schema
-        self.ontology = ontology
         self.translator = translator
         self.deduplicator = deduplicator
         self.node_property_dict = {}
@@ -248,11 +259,11 @@ class _BatchWriter(ABC):
 
         if os.path.exists(self.outdir):
             logger.warning(
-                f'Output directory `{self.outdir}` already exists. '
-                'If this is not planned, file consistency may be compromised.'
+                f"Output directory `{self.outdir}` already exists. "
+                "If this is not planned, file consistency may be compromised."
             )
         else:
-            logger.info(f'Creating output directory `{self.outdir}`.')
+            logger.info(f"Creating output directory `{self.outdir}`.")
             os.makedirs(self.outdir)
 
         self.parts = {}  # dict to store the paths of part files for each label
@@ -268,7 +279,6 @@ class _BatchWriter(ABC):
 
         return self._outdir
 
-
     @property
     def import_call_file_prefix(self):
         """
@@ -286,12 +296,10 @@ class _BatchWriter(ABC):
         representation (e.g. tab for '\t').
         """
 
-        if delimiter == '\\t':
-
-            return '\t', '\\t'
+        if delimiter == "\\t":
+            return "\t", "\\t"
 
         else:
-
             return delimiter, delimiter
 
     def write_nodes(self, nodes, batch_size: int = int(1e6)):
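
Note (illustrative, not part of the diff): the delimiter handling above keeps both the literal character and its escaped form, one for writing the CSV files and one for the admin-import command line. A minimal standalone sketch of the same idea:

    def process_delimiter(delimiter: str):
        # "\\t" in the config means a literal tab; keep the escaped form
        # for the import command, the real tab for writing CSVs
        if delimiter == "\\t":
            return "\t", "\\t"
        return delimiter, delimiter

    delim, escaped = process_delimiter("\\t")  # -> ("\t", "\\t")
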
@@ -310,12 +318,12 @@ class _BatchWriter(ABC):
         # write node data
         passed = self._write_node_data(nodes, batch_size)
         if not passed:
-            logger.error('Error while writing node data.')
+            logger.error("Error while writing node data.")
             return False
         # pass property data to header writer per node type written
         passed = self._write_node_headers()
         if not passed:
-            logger.error('Error while writing node headers.')
+            logger.error("Error while writing node headers.")
             return False
 
         return True
@@ -337,48 +345,50 @@ class _BatchWriter(ABC):
             bool: The return value. True for success, False otherwise.
         """
         passed = False
-        # unwrap generator in one step
         edges = list(edges)  # force evaluation to handle empty generator
         if edges:
-            z = zip(
-                *(
-                    (
-                        e.get_node(),
-                        [
-                            e.get_source_edge(),
-                            e.get_target_edge(),
-                        ],
-                    ) if isinstance(e, BioCypherRelAsNode) else (None, [e])
-                    for e in edges
-                )
-            )
-            nod, edg = (list(a) for a in z)
-            nod = [n for n in nod if n]
-            edg = [val for sublist in edg for val in sublist]  # flatten
+            nodes_flat = []
+            edges_flat = []
+            for edge in edges:
+                if isinstance(edge, BioCypherRelAsNode):
+                    # check if relationship has already been written, if so skip
+                    if self.deduplicator.rel_as_node_seen(edge):
+                        continue
 
-            if nod:
-                passed = self.write_nodes(nod) and self._write_edge_data(
-                    edg,
+                    nodes_flat.append(edge.get_node())
+                    edges_flat.append(edge.get_source_edge())
+                    edges_flat.append(edge.get_target_edge())
+
+                else:
+                    # check if relationship has already been written, if so skip
+                    if self.deduplicator.edge_seen(edge):
+                        continue
+
+                    edges_flat.append(edge)
+
+            if nodes_flat and edges_flat:
+                passed = self.write_nodes(nodes_flat) and self._write_edge_data(
+                    edges_flat,
                     batch_size,
                 )
             else:
-                passed = self._write_edge_data(edg, batch_size)
+                passed = self._write_edge_data(edges_flat, batch_size)
 
         else:
             # is this a problem? if the generator or list is empty, we
             # don't write anything.
             logger.debug(
-                'No edges to write, possibly due to no matched Biolink classes.',
+                "No edges to write, possibly due to no matched Biolink classes.",
             )
             pass
 
         if not passed:
-            logger.error('Error while writing edge data.')
+            logger.error("Error while writing edge data.")
             return False
         # pass property data to header writer per edge type written
         passed = self._write_edge_headers()
         if not passed:
-            logger.error('Error while writing edge headers.')
+            logger.error("Error while writing edge headers.")
            return False
 
         return True
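
Note (illustrative, not package code): the rewritten loop above splits each BioCypherRelAsNode into one node plus its two connecting edges, skipping anything the deduplicator has already seen. A standalone sketch of the same idea, with simplified names:

    def flatten(edges, deduplicator):
        """Split rel-as-node objects into one node and two plain edges."""
        nodes_flat, edges_flat = [], []
        for edge in edges:
            if isinstance(edge, BioCypherRelAsNode):
                if deduplicator.rel_as_node_seen(edge):
                    continue  # whole triple already written
                nodes_flat.append(edge.get_node())
                edges_flat.append(edge.get_source_edge())
                edges_flat.append(edge.get_target_edge())
            else:
                if deduplicator.edge_seen(edge):
                    continue
                edges_flat.append(edge)
        return nodes_flat, edges_flat
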
@@ -401,7 +411,7 @@ class _BatchWriter(ABC):
         """
 
         if isinstance(nodes, GeneratorType) or isinstance(nodes, peekable):
-            logger.debug('Writing node CSV from generator.')
+            logger.debug("Writing node CSV from generator.")
 
             bins = defaultdict(list)  # dict to store a list for each
             # label that is passed in
@@ -424,7 +434,7 @@ class _BatchWriter(ABC):
 
                 # check for non-id
                 if not _id:
-                    logger.warning(f'Node {label} has no id; skipping.')
+                    logger.warning(f"Node {label} has no id; skipping.")
                     continue
 
                 if not label in bins.keys():
@@ -434,20 +444,26 @@ class _BatchWriter(ABC):
                     bin_l[label] = 1
 
                     # get properties from config if present
-                    cprops = self.extended_schema.get(label).get('properties')
+                    cprops = (
+                        self.translator.ontology.mapping.extended_schema.get(
+                            label
+                        ).get(
+                            "properties",
+                        )
+                    )
                     if cprops:
                         d = dict(cprops)
 
                         # add id and preferred id to properties; these are
                         # created in node creation (`_create.BioCypherNode`)
-                        d['id'] = 'str'
-                        d['preferred_id'] = 'str'
+                        d["id"] = "str"
+                        d["preferred_id"] = "str"
 
                         # add strict mode properties
                         if self.strict_mode:
-                            d['source'] = 'str'
-                            d['version'] = 'str'
-                            d['licence'] = 'str'
+                            d["source"] = "str"
+                            d["version"] = "str"
+                            d["licence"] = "str"
 
                     else:
                         d = dict(node.get_properties())
@@ -467,7 +483,7 @@ class _BatchWriter(ABC):
 
                 # get label hierarchy
                 # multiple labels:
-                all_labels = self.ontology.get_ancestors(label)
+                all_labels = self.translator.ontology.get_ancestors(label)
 
                 if all_labels:
                     # convert to pascal case
@@ -531,7 +547,7 @@ class _BatchWriter(ABC):
             return True
         else:
             if type(nodes) is not list:
-                logger.error('Nodes must be passed as list or generator.')
+                logger.error("Nodes must be passed as list or generator.")
                 return False
             else:
 
@@ -563,14 +579,13 @@ class _BatchWriter(ABC):
             bool: The return value. True for success, False otherwise.
         """
         if not all(isinstance(n, BioCypherNode) for n in node_list):
-            logger.error('Nodes must be passed as type BioCypherNode.')
+            logger.error("Nodes must be passed as type BioCypherNode.")
             return False
 
         # from list of nodes to list of strings
         lines = []
 
         for n in node_list:
-
             # check for deviations in properties
             # node properties
             n_props = n.get_properties()
@@ -584,46 +599,45 @@ class _BatchWriter(ABC):
                 oprop1 = set(ref_props).difference(n_keys)
                 oprop2 = set(n_keys).difference(ref_props)
                 logger.error(
-                    f'At least one node of the class {n.get_label()} '
-                    f'has more or fewer properties than another. '
-                    f'Offending node: {onode!r}, offending property: '
-                    f'{max([oprop1, oprop2])}. '
-                    f'All reference properties: {ref_props}, '
-                    f'All node properties: {n_keys}.',
+                    f"At least one node of the class {n.get_label()} "
+                    f"has more or fewer properties than another. "
+                    f"Offending node: {onode!r}, offending property: "
+                    f"{max([oprop1, oprop2])}. "
+                    f"All reference properties: {ref_props}, "
+                    f"All node properties: {n_keys}.",
                 )
                 return False
 
             line = [n.get_id()]
 
             if ref_props:
-
                 plist = []
                 # make all into strings, put actual strings in quotes
                 for k, v in prop_dict.items():
                     p = n_props.get(k)
                     if p is None:  # TODO make field empty instead of ""?
-                        plist.append('')
+                        plist.append("")
                     elif v in [
-                        'int',
-                        'integer',
-                        'long',
-                        'float',
-                        'double',
-                        'dbl',
-                        'bool',
-                        'boolean',
+                        "int",
+                        "integer",
+                        "long",
+                        "float",
+                        "double",
+                        "dbl",
+                        "bool",
+                        "boolean",
                     ]:
                         plist.append(str(p))
                     else:
                         if isinstance(p, list):
                             plist.append(self._write_array_string(p))
                         else:
-                            plist.append(f'{self.quote}{str(p)}{self.quote}')
+                            plist.append(f"{self.quote}{str(p)}{self.quote}")
 
                 line.append(self.delim.join(plist))
             line.append(labels)
 
-            lines.append(self.delim.join(line) + '\n')
+            lines.append(self.delim.join(line) + "\n")
 
             # avoid writing empty files
             if lines:
@@ -653,7 +667,7 @@ class _BatchWriter(ABC):
         """
 
         if isinstance(edges, GeneratorType):
-            logger.debug('Writing edge CSV from generator.')
+            logger.debug("Writing edge CSV from generator.")
 
             bins = defaultdict(list)  # dict to store a list for each
             # label that is passed in
@@ -665,14 +679,10 @@ class _BatchWriter(ABC):
             # for each label to check for consistency and their type
             # for now, relevant for `int`
             for edge in edges:
-                # check for duplicates
-                if self.deduplicator.edge_seen(edge):
-                    continue
-
                 if not (edge.get_source_id() and edge.get_target_id()):
                     logger.error(
-                        'Edge must have source and target node. '
-                        f'Caused by: {edge}',
+                        "Edge must have source and target node. "
+                        f"Caused by: {edge}",
                     )
                     continue
 
@@ -689,25 +699,35 @@ class _BatchWriter(ABC):
                 # (may not be if it is an edge that carries the
                 # "label_as_edge" property)
                 cprops = None
-                if label in self.extended_schema:
-                    cprops = self.extended_schema.get(label).get(
-                        'properties',
+                if (
+                    label
+                    in self.translator.ontology.mapping.extended_schema
+                ):
+                    cprops = self.translator.ontology.mapping.extended_schema.get(
+                        label
+                    ).get(
+                        "properties",
                     )
                 else:
                     # try via "label_as_edge"
-                    for k, v in self.extended_schema.items():
+                    for (
+                        k,
+                        v,
+                    ) in (
+                        self.translator.ontology.mapping.extended_schema.items()
+                    ):
                         if isinstance(v, dict):
-                            if v.get('label_as_edge') == label:
-                                cprops = v.get('properties')
+                            if v.get("label_as_edge") == label:
+                                cprops = v.get("properties")
                                 break
                 if cprops:
                     d = cprops
 
                     # add strict mode properties
                     if self.strict_mode:
-                        d['source'] = 'str'
-                        d['version'] = 'str'
-                        d['licence'] = 'str'
+                        d["source"] = "str"
+                        d["version"] = "str"
+                        d["licence"] = "str"
 
                 else:
                     d = dict(edge.get_properties())
@@ -746,7 +766,6 @@ class _BatchWriter(ABC):
 
             # after generator depleted, write remainder of bins
             for label, nl in bins.items():
-
                 passed = self._write_single_edge_list_to_file(
                     nl,
                     label,
@@ -768,7 +787,7 @@ class _BatchWriter(ABC):
             return True
         else:
             if type(edges) is not list:
-                logger.error('Edges must be passed as list or generator.')
+                logger.error("Edges must be passed as list or generator.")
                 return False
             else:
 
@@ -800,8 +819,7 @@ class _BatchWriter(ABC):
         """
 
         if not all(isinstance(n, BioCypherEdge) for n in edge_list):
-
-            logger.error('Edges must be passed as type BioCypherEdge.')
+            logger.error("Edges must be passed as type BioCypherEdge.")
             return False
 
         # from list of edges to list of strings
@@ -815,16 +833,16 @@ class _BatchWriter(ABC):
 
             # compare list order invariant
             if not set(ref_props) == set(e_keys):
-                oedge = f'{e.get_source_id()}-{e.get_target_id()}'
+                oedge = f"{e.get_source_id()}-{e.get_target_id()}"
                 oprop1 = set(ref_props).difference(e_keys)
                 oprop2 = set(e_keys).difference(ref_props)
                 logger.error(
-                    f'At least one edge of the class {e.get_label()} '
-                    f'has more or fewer properties than another. '
-                    f'Offending edge: {oedge!r}, offending property: '
-                    f'{max([oprop1, oprop2])}. '
-                    f'All reference properties: {ref_props}, '
-                    f'All edge properties: {e_keys}.',
+                    f"At least one edge of the class {e.get_label()} "
+                    f"has more or fewer properties than another. "
+                    f"Offending edge: {oedge!r}, offending property: "
+                    f"{max([oprop1, oprop2])}. "
+                    f"All reference properties: {ref_props}, "
+                    f"All edge properties: {e_keys}.",
                 )
                 return False
 
@@ -833,16 +851,16 @@ class _BatchWriter(ABC):
             for k, v in prop_dict.items():
                 p = e_props.get(k)
                 if p is None:  # TODO make field empty instead of ""?
-                    plist.append('')
+                    plist.append("")
                 elif v in [
-                    'int',
-                    'integer',
-                    'long',
-                    'float',
-                    'double',
-                    'dbl',
-                    'bool',
-                    'boolean',
+                    "int",
+                    "integer",
+                    "long",
+                    "float",
+                    "double",
+                    "dbl",
+                    "bool",
+                    "boolean",
                 ]:
                     plist.append(str(p))
                 else:
@@ -850,7 +868,7 @@ class _BatchWriter(ABC):
                     plist.append(self._write_array_string(p))
                 else:
                     plist.append(self.quote + str(p) + self.quote)
-
+
             entries = [e.get_source_id()]
 
             skip_id = False
@@ -858,32 +876,44 @@ class _BatchWriter(ABC):
 
             if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
                 skip_id = True
-            elif not self.extended_schema.get(label):
+            elif not self.translator.ontology.mapping.extended_schema.get(
+                label
+            ):
                 # find label in schema by label_as_edge
-                for k, v in self.extended_schema.items():
-                    if v.get('label_as_edge') == label:
+                for (
+                    k,
+                    v,
+                ) in self.translator.ontology.mapping.extended_schema.items():
+                    if v.get("label_as_edge") == label:
                         schema_label = k
                         break
             else:
                 schema_label = label
 
             if schema_label:
-                if self.extended_schema.get(schema_label).get('use_id') == False:
+                if (
+                    self.translator.ontology.mapping.extended_schema.get(
+                        schema_label
+                    ).get("use_id")
+                    == False
+                ):
                     skip_id = True
 
             if not skip_id:
-                entries.append(e.get_id() or '')
+                entries.append(e.get_id() or "")
 
             if ref_props:
                 entries.append(self.delim.join(plist))
 
             entries.append(e.get_target_id())
-            entries.append(
-                self.translator.name_sentence_to_pascal(e.get_label()),
-            )
+            entries.append(
+                self.translator.name_sentence_to_pascal(
+                    e.get_label(),
+                )
+            )
 
             lines.append(
-                self.delim.join(entries) + '\n',
+                self.delim.join(entries) + "\n",
             )
 
             # avoid writing empty files
@@ -911,39 +941,34 @@ class _BatchWriter(ABC):
 
         # list files in self.outdir
         files = glob.glob(
-            os.path.join(self.outdir, f'{label_pascal}-part*.csv')
+            os.path.join(self.outdir, f"{label_pascal}-part*.csv")
         )
         # find file with highest part number
         if not files:
-
             next_part = 0
 
         else:
-
             next_part = (
                 max(
                     [
-                        int(
-                            f.split('.')[-2].split('-')[-1].replace('part', '')
-                        ) for f in files
+                        int(f.split(".")[-2].split("-")[-1].replace("part", ""))
+                        for f in files
                     ],
-                ) + 1
+                )
+                + 1
             )
 
         # write to file
         padded_part = str(next_part).zfill(3)
         logger.info(
-            f'Writing {len(lines)} entries to {label_pascal}-part{padded_part}.csv'
+            f"Writing {len(lines)} entries to {label_pascal}-part{padded_part}.csv",
         )
 
         # store name only in case import_call_file_prefix is set
-        part = f'{label_pascal}-part{padded_part}.csv'
-        file_path = os.path.join(
-            self.outdir, part
-        )
-
-        with open(file_path, 'w', encoding='utf-8') as f:
+        part = f"{label_pascal}-part{padded_part}.csv"
+        file_path = os.path.join(self.outdir, part)
 
+        with open(file_path, "w", encoding="utf-8") as f:
             # concatenate with delimiter
             f.writelines(lines)
 
@@ -975,10 +1000,9 @@ class _BatchWriter(ABC):
         """
 
         file_path = os.path.join(self.outdir, self._get_import_script_name())
-        logger.info(f'Writing {self.db_name} import call to `{file_path}`.')
-
-        with open(file_path, 'w', encoding='utf-8') as f:
+        logger.info(f"Writing {self.db_name} import call to `{file_path}`.")
 
+        with open(file_path, "w", encoding="utf-8") as f:
             f.write(self._construct_import_call())
 
         return True
@@ -995,11 +1019,13 @@ class _Neo4jBatchWriter(_BatchWriter):
 
     This class inherits from the abstract class "_BatchWriter" and implements the
     Neo4j-specific methods:
+
         - _write_node_headers
         - _write_edge_headers
         - _construct_import_call
         - _write_array_string
     """
+
     def _get_default_import_call_bin_prefix(self):
         """
         Method to provide the default string for the import call bin prefix.
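
Note (illustrative, not package code): the part-numbering logic above amounts to "find the highest existing -partNNN.csv index for a label and add one". A simplified standalone sketch:

    import glob
    import os

    def next_part_number(outdir: str, label_pascal: str) -> int:
        """Return the next free part index for a label's CSV files."""
        files = glob.glob(os.path.join(outdir, f"{label_pascal}-part*.csv"))
        if not files:
            return 0
        return max(
            int(f.split(".")[-2].split("-")[-1].replace("part", ""))
            for f in files
        ) + 1

    # with Protein-part000.csv and Protein-part001.csv present -> 2
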
@@ -1007,7 +1033,7 @@ class _Neo4jBatchWriter(_BatchWriter):
         Returns:
             str: The default location for the neo4j admin import location
         """
-        return 'bin/'
+        return "bin/"
 
     def _write_array_string(self, string_list):
         """
@@ -1021,7 +1047,7 @@ class _Neo4jBatchWriter(_BatchWriter):
             str: The string representation of an array for the neo4j admin import
         """
         string = self.adelim.join(string_list)
-        return f'{self.quote}{string}{self.quote}'
+        return f"{self.quote}{string}{self.quote}"
 
     def _write_node_headers(self):
         """
@@ -1035,56 +1061,55 @@ class _Neo4jBatchWriter(_BatchWriter):
         # load headers from data parse
         if not self.node_property_dict:
             logger.error(
-                'Header information not found. Was the data parsed first?',
+                "Header information not found. Was the data parsed first?",
             )
             return False
 
         for label, props in self.node_property_dict.items():
-
-            _id = ':ID'
+            _id = ":ID"
 
             # translate label to PascalCase
             pascal_label = self.translator.name_sentence_to_pascal(label)
 
-            header = f'{pascal_label}-header.csv'
+            header = f"{pascal_label}-header.csv"
             header_path = os.path.join(
                 self.outdir,
                 header,
             )
-            parts = f'{pascal_label}-part.*'
+            parts = f"{pascal_label}-part.*"
 
             # check if file already exists
             if os.path.exists(header_path):
                 logger.warning(
-                    f'Header file `{header_path}` already exists. Overwriting.',
+                    f"Header file `{header_path}` already exists. Overwriting.",
                 )
 
             # concatenate key:value in props
             props_list = []
             for k, v in props.items():
-                if v in ['int', 'long', 'integer']:
-                    props_list.append(f'{k}:long')
-                elif v in ['int[]', 'long[]', 'integer[]']:
-                    props_list.append(f'{k}:long[]')
-                elif v in ['float', 'double', 'dbl']:
-                    props_list.append(f'{k}:double')
-                elif v in ['float[]', 'double[]']:
-                    props_list.append(f'{k}:double[]')
-                elif v in ['bool', 'boolean']:
+                if v in ["int", "long", "integer"]:
+                    props_list.append(f"{k}:long")
+                elif v in ["int[]", "long[]", "integer[]"]:
+                    props_list.append(f"{k}:long[]")
+                elif v in ["float", "double", "dbl"]:
+                    props_list.append(f"{k}:double")
+                elif v in ["float[]", "double[]"]:
+                    props_list.append(f"{k}:double[]")
+                elif v in ["bool", "boolean"]:
                     # TODO Neo4j boolean support / spelling?
-                    props_list.append(f'{k}:boolean')
-                elif v in ['bool[]', 'boolean[]']:
-                    props_list.append(f'{k}:boolean[]')
-                elif v in ['str[]', 'string[]']:
-                    props_list.append(f'{k}:string[]')
+                    props_list.append(f"{k}:boolean")
+                elif v in ["bool[]", "boolean[]"]:
+                    props_list.append(f"{k}:boolean[]")
+                elif v in ["str[]", "string[]"]:
+                    props_list.append(f"{k}:string[]")
                 else:
-                    props_list.append(f'{k}')
+                    props_list.append(f"{k}")
 
             # create list of lists and flatten
-            out_list = [[_id], props_list, [':LABEL']]
+            out_list = [[_id], props_list, [":LABEL"]]
             out_list = [val for sublist in out_list for val in sublist]
 
-            with open(header_path, 'w', encoding='utf-8') as f:
+            with open(header_path, "w", encoding="utf-8") as f:
                 # concatenate with delimiter
                 row = self.delim.join(out_list)
                 f.write(row)
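
Note (illustrative, not package code): the branchy mapping above is effectively a lookup from schema property types to neo4j-admin header type suffixes. A condensed sketch assumed to be equivalent:

    NEO4J_TYPE_SUFFIX = {
        "int": "long", "long": "long", "integer": "long",
        "int[]": "long[]", "long[]": "long[]", "integer[]": "long[]",
        "float": "double", "double": "double", "dbl": "double",
        "float[]": "double[]", "double[]": "double[]",
        "bool": "boolean", "boolean": "boolean",
        "bool[]": "boolean[]", "boolean[]": "boolean[]",
        "str[]": "string[]", "string[]": "string[]",
    }

    def header_column(name: str, schema_type: str) -> str:
        suffix = NEO4J_TYPE_SUFFIX.get(schema_type)
        return f"{name}:{suffix}" if suffix else name

    # header_column("score", "float") -> "score:double"
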
@@ -1099,7 +1124,9 @@ class _Neo4jBatchWriter(_BatchWriter):
                 self.import_call_file_prefix,
                 parts,
             )
-            self.import_call_nodes.add((import_call_header_path, import_call_parts_path))
+            self.import_call_nodes.add(
+                (import_call_header_path, import_call_parts_path)
+            )
 
         return True
 
@@ -1115,79 +1142,88 @@ class _Neo4jBatchWriter(_BatchWriter):
         # load headers from data parse
         if not self.edge_property_dict:
             logger.error(
-                'Header information not found. Was the data parsed first?',
+                "Header information not found. Was the data parsed first?",
            )
             return False
 
         for label, props in self.edge_property_dict.items():
-
             # translate label to PascalCase
             pascal_label = self.translator.name_sentence_to_pascal(label)
 
             # paths
-            header = f'{pascal_label}-header.csv'
+            header = f"{pascal_label}-header.csv"
             header_path = os.path.join(
                 self.outdir,
                 header,
             )
-            parts = f'{pascal_label}-part.*'
+            parts = f"{pascal_label}-part.*"
 
             # check for file exists
             if os.path.exists(header_path):
                 logger.warning(
-                    f'File {header_path} already exists. Overwriting.'
+                    f"File {header_path} already exists. Overwriting."
                 )
 
             # concatenate key:value in props
             props_list = []
             for k, v in props.items():
-                if v in ['int', 'long', 'integer']:
-                    props_list.append(f'{k}:long')
-                elif v in ['int[]', 'long[]', 'integer[]']:
-                    props_list.append(f'{k}:long[]')
-                elif v in ['float', 'double']:
-                    props_list.append(f'{k}:double')
-                elif v in ['float[]', 'double[]']:
-                    props_list.append(f'{k}:double[]')
+                if v in ["int", "long", "integer"]:
+                    props_list.append(f"{k}:long")
+                elif v in ["int[]", "long[]", "integer[]"]:
+                    props_list.append(f"{k}:long[]")
+                elif v in ["float", "double"]:
+                    props_list.append(f"{k}:double")
+                elif v in ["float[]", "double[]"]:
+                    props_list.append(f"{k}:double[]")
                 elif v in [
-                    'bool',
-                    'boolean',
+                    "bool",
+                    "boolean",
                 ]:  # TODO does Neo4j support bool?
-                    props_list.append(f'{k}:boolean')
-                elif v in ['bool[]', 'boolean[]']:
-                    props_list.append(f'{k}:boolean[]')
-                elif v in ['str[]', 'string[]']:
-                    props_list.append(f'{k}:string[]')
+                    props_list.append(f"{k}:boolean")
+                elif v in ["bool[]", "boolean[]"]:
+                    props_list.append(f"{k}:boolean[]")
+                elif v in ["str[]", "string[]"]:
+                    props_list.append(f"{k}:string[]")
                 else:
-                    props_list.append(f'{k}')
+                    props_list.append(f"{k}")
 
             skip_id = False
             schema_label = None
 
             if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
                 skip_id = True
-            elif not self.extended_schema.get(label):
+            elif not self.translator.ontology.mapping.extended_schema.get(
+                label
+            ):
                 # find label in schema by label_as_edge
-                for k, v in self.extended_schema.items():
-                    if v.get('label_as_edge') == label:
+                for (
+                    k,
+                    v,
+                ) in self.translator.ontology.mapping.extended_schema.items():
+                    if v.get("label_as_edge") == label:
                         schema_label = k
                         break
             else:
                 schema_label = label
 
-            out_list = [':START_ID']
+            out_list = [":START_ID"]
 
             if schema_label:
-                if self.extended_schema.get(schema_label).get('use_id') == False:
+                if (
+                    self.translator.ontology.mapping.extended_schema.get(
+                        schema_label
+                    ).get("use_id")
+                    == False
+                ):
                     skip_id = True
 
             if not skip_id:
-                out_list.append('id')
+                out_list.append("id")
 
             out_list.extend(props_list)
-            out_list.extend([':END_ID', ':TYPE'])
+            out_list.extend([":END_ID", ":TYPE"])
 
-            with open(header_path, 'w', encoding='utf-8') as f:
+            with open(header_path, "w", encoding="utf-8") as f:
                 # concatenate with delimiter
                 row = self.delim.join(out_list)
                 f.write(row)
@@ -1202,7 +1238,9 @@ class _Neo4jBatchWriter(_BatchWriter):
                 self.import_call_file_prefix,
                 parts,
             )
-            self.import_call_edges.add((import_call_header_path, import_call_parts_path))
+            self.import_call_edges.add(
+                (import_call_header_path, import_call_parts_path)
+            )
 
         return True
 
@@ -1213,7 +1251,7 @@ class _Neo4jBatchWriter(_BatchWriter):
         Returns:
             str: The name of the import script (ending in .sh)
         """
-        return 'neo4j-admin-import-call.sh'
+        return "neo4j-admin-import-call.sh"
 
     def _construct_import_call(self) -> str:
         """
@@ -1226,8 +1264,8 @@ class _Neo4jBatchWriter(_BatchWriter):
             str: a bash command for neo4j-admin import
         """
         import_call = (
-            f'{self.import_call_bin_prefix}neo4j-admin import '
-            f'--database={self.db_name} '
+            f"{self.import_call_bin_prefix}neo4j-admin import "
+            f"--database={self.db_name} "
             f'--delimiter="{self.escaped_delim}" '
             f'--array-delimiter="{self.escaped_adelim}" '
         )
@@ -1238,11 +1276,11 @@ class _Neo4jBatchWriter(_BatchWriter):
         import_call += f"--quote='{self.quote}' "
 
         if self.wipe:
-            import_call += f'--force=true '
+            import_call += f"--force=true "
         if self.skip_bad_relationships:
-            import_call += '--skip-bad-relationships=true '
+            import_call += "--skip-bad-relationships=true "
         if self.skip_duplicate_nodes:
-            import_call += '--skip-duplicate-nodes=true '
+            import_call += "--skip-duplicate-nodes=true "
 
         # append node import calls
         for header_path, parts_path in self.import_call_nodes:
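
Note (illustrative, not part of the diff): assembled from the fragments above, the resulting import command string looks roughly like the following; database name, delimiters, and file names are invented for the example:

    bin/neo4j-admin import --database=neo4j --delimiter=";" --array-delimiter="|" \
        --quote='"' --force=true --skip-bad-relationships=true --skip-duplicate-nodes=true \
        --nodes="Protein-header.csv,Protein-part.*" \
        --relationships="Interaction-header.csv,Interaction-part.*"

The exact node/relationship arguments appended per label are not shown in this hunk; the pairs of header and part paths collected in `import_call_nodes` / `import_call_edges` are what feed them.
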
@@ -1261,6 +1299,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
     specified by ArangoDB for the use of "arangoimport". Output files are
     similar to Neo4j, but with a different header format.
     """
+
     def _get_default_import_call_bin_prefix(self):
         """
         Method to provide the default string for the import call bin prefix.
@@ -1268,7 +1307,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
         Returns:
             str: The default location for the neo4j admin import location
         """
-        return ''
+        return ""
 
     def _get_import_script_name(self) -> str:
         """
@@ -1277,7 +1316,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
         Returns:
             str: The name of the import script (ending in .sh)
         """
-        return 'arangodb-import-call.sh'
+        return "arangodb-import-call.sh"
 
     def _write_node_headers(self):
         """
@@ -1291,19 +1330,19 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
         # load headers from data parse
         if not self.node_property_dict:
             logger.error(
-                'Header information not found. Was the data parsed first?',
+                "Header information not found. Was the data parsed first?",
             )
             return False
 
         for label, props in self.node_property_dict.items():
             # create header CSV with ID, properties, labels
 
-            _id = '_key'
+            _id = "_key"
 
             # translate label to PascalCase
             pascal_label = self.translator.name_sentence_to_pascal(label)
 
-            header = f'{pascal_label}-header.csv'
+            header = f"{pascal_label}-header.csv"
             header_path = os.path.join(
                 self.outdir,
                 header,
@@ -1312,43 +1351,40 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
             # check if file already exists
             if os.path.exists(header_path):
                 logger.warning(
-                    f'File {header_path} already exists. Overwriting.'
+                    f"File {header_path} already exists. Overwriting."
                 )
 
             # concatenate key:value in props
             props_list = []
             for k in props.keys():
-
-                props_list.append(f'{k}')
+                props_list.append(f"{k}")
 
             # create list of lists and flatten
             # removes need for empty check of property list
             out_list = [[_id], props_list]
             out_list = [val for sublist in out_list for val in sublist]
 
-            with open(header_path, 'w', encoding='utf-8') as f:
+            with open(header_path, "w", encoding="utf-8") as f:
                 # concatenate with delimiter
                 row = self.delim.join(out_list)
                 f.write(row)
 
             # add collection from schema config
-            collection = self.extended_schema[label].get(
-                'db_collection_name', None,
-            )
+            collection = self.translator.ontology.mapping.extended_schema[
+                label
+            ].get("db_collection_name", None)
 
             # add file path to neo4 admin import statement
             # do once for each part file
             parts = self.parts.get(label, [])
 
             if not parts:
-
                 raise ValueError(
-                    f'No parts found for node label {label}. '
-                    f'Check that the data was parsed first.',
+                    f"No parts found for node label {label}. "
+                    f"Check that the data was parsed first.",
                 )
 
             for part in parts:
-
                 import_call_header_path = os.path.join(
                     self.import_call_file_prefix,
                     header,
@@ -1358,7 +1394,13 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
                     part,
                 )
 
-                self.import_call_nodes.add((import_call_header_path, import_call_parts_path, collection))
+                self.import_call_nodes.add(
+                    (
+                        import_call_header_path,
+                        import_call_parts_path,
+                        collection,
+                    )
+                )
 
         return True
 
@@ -1374,55 +1416,54 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
         # load headers from data parse
         if not self.edge_property_dict:
             logger.error(
-                'Header information not found. Was the data parsed first?',
+                "Header information not found. Was the data parsed first?",
             )
             return False
 
         for label, props in self.edge_property_dict.items():
-
             # translate label to PascalCase
             pascal_label = self.translator.name_sentence_to_pascal(label)
 
             # paths
-            header = f'{pascal_label}-header.csv'
+            header = f"{pascal_label}-header.csv"
             header_path = os.path.join(
                 self.outdir,
                 header,
             )
-            parts = f'{pascal_label}-part.*'
+            parts = f"{pascal_label}-part.*"
 
             # check for file exists
             if os.path.exists(header_path):
                 logger.warning(
-                    f'Header file {header_path} already exists. Overwriting.'
+                    f"Header file {header_path} already exists. Overwriting."
                 )
 
             # concatenate key:value in props
             props_list = []
             for k in props.keys():
-                props_list.append(f'{k}')
+                props_list.append(f"{k}")
 
-            out_list = ['_from', '_key', *props_list, '_to']
-
+            out_list = ["_from", "_key", *props_list, "_to"]
 
-            with open(header_path, 'w', encoding='utf-8') as f:
+            with open(header_path, "w", encoding="utf-8") as f:
                 # concatenate with delimiter
                 row = self.delim.join(out_list)
                 f.write(row)
 
             # add collection from schema config
-            if not self.extended_schema.get(label):
-                for _, v in self.extended_schema.items():
-                    if v.get('label_as_edge') == label:
-                        collection = v.get('db_collection_name', None)
+            if not self.translator.ontology.mapping.extended_schema.get(label):
+                for (
+                    _,
+                    v,
+                ) in self.translator.ontology.mapping.extended_schema.items():
+                    if v.get("label_as_edge") == label:
+                        collection = v.get("db_collection_name", None)
                         break
 
             else:
-                collection = self.extended_schema[label].get(
-                    'db_collection_name', None,
-                )
+                collection = self.translator.ontology.mapping.extended_schema[
+                    label
+                ].get("db_collection_name", None)
 
             # add file path to neo4 admin import statement (import call path
             # may be different from actual output path)
@@ -1434,7 +1475,13 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
                 self.import_call_file_prefix,
                 parts,
             )
-            self.import_call_edges.add((header_import_call_path, parts_import_call_path, collection))
+            self.import_call_edges.add(
+                (
+                    header_import_call_path,
+                    parts_import_call_path,
+                    collection,
+                )
+            )
 
         return True
@@ -1449,8 +1496,8 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
             str: a bash command for neo4j-admin import
         """
         import_call = (
-            f'{self.import_call_bin_prefix}arangoimp '
-            f'--type csv '
+            f"{self.import_call_bin_prefix}arangoimp "
+            f"--type csv "
             f'--separator="{self.escaped_delim}" '
         )
 
@@ -1459,23 +1506,22 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
         else:
             import_call += f"--quote='{self.quote}' "
 
-        node_lines = ''
+        node_lines = ""
 
         # node import calls: one line per node type
         for header_path, parts_path, collection in self.import_call_nodes:
-
             line = (
-                f'{import_call} '
-                f'--headers-file {header_path} '
-                f'--file= {parts_path} '
+                f"{import_call} "
+                f"--headers-file {header_path} "
+                f"--file= {parts_path} "
             )
 
             if collection:
-                line += f'--create-collection --collection {collection} '
+                line += f"--create-collection --collection {collection} "
 
-            node_lines += f'{line}\n'
+            node_lines += f"{line}\n"
 
-        edge_lines = ''
+        edge_lines = ""
 
         # edge import calls: one line per edge type
         for header_path, parts_path, collection in self.import_call_edges:
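
Note (illustrative, not part of the diff): each node line produced above is an `arangoimp` invocation of roughly this shape; file and collection names are invented, and the `--file= {parts_path}` spacing is carried over verbatim from the source:

    arangoimp --type csv --separator=";" --quote='"' \
        --headers-file Protein-header.csv --file= Protein-part.* \
        --create-collection --collection proteins
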
@@ -1495,6 +1541,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
 
     This class inherits from the abstract class "_BatchWriter" and implements the
     PostgreSQL-specific methods:
+
         - _write_node_headers
         - _write_edge_headers
         - _construct_import_call
@@ -1502,15 +1549,15 @@ class _PostgreSQLBatchWriter(_BatchWriter):
     """
 
     DATA_TYPE_LOOKUP = {
-        'str': 'VARCHAR',  # VARCHAR needs limit
-        'int': 'INTEGER',
-        'long': 'BIGINT',
-        'float': 'NUMERIC',
-        'double': 'NUMERIC',
-        'dbl': 'NUMERIC',
-        'boolean': 'BOOLEAN',
-        'str[]': 'VARCHAR[]',
-        'string[]': 'VARCHAR[]',
+        "str": "VARCHAR",  # VARCHAR needs limit
+        "int": "INTEGER",
+        "long": "BIGINT",
+        "float": "NUMERIC",
+        "double": "NUMERIC",
+        "dbl": "NUMERIC",
+        "boolean": "BOOLEAN",
+        "str[]": "VARCHAR[]",
+        "string[]": "VARCHAR[]",
     }
 
     def __init__(self, *args, **kwargs):
@@ -1524,7 +1571,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
         Returns:
             str: The default location for the psql command
         """
-        return ''
+        return ""
 
     def _get_data_type(self, string) -> str:
         try:
@@ -1533,7 +1580,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
             logger.info(
                 'Could not determine data type {string}. Using default "VARCHAR"'
             )
-            return 'VARCHAR'
+            return "VARCHAR"
 
     def _write_array_string(self, string_list) -> str:
         """
@@ -1546,7 +1593,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
         Returns:
             str: The string representation of an array for postgres COPY
         """
-        string = ','.join(string_list)
+        string = ",".join(string_list)
         string = f'"{{{string}}}"'
         return string
 
@@ -1557,10 +1604,10 @@ class _PostgreSQLBatchWriter(_BatchWriter):
         Returns:
             str: The name of the import script (ending in .sh)
         """
-        return f'{self.db_name}-import-call.sh'
+        return f"{self.db_name}-import-call.sh"
 
     def _adjust_pascal_to_psql(self, string):
-        string = string.replace('.', '_')
+        string = string.replace(".", "_")
         string = string.lower()
         return string
 
@@ -1576,7 +1623,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
         # load headers from data parse
         if not self.node_property_dict:
             logger.error(
-                'Header information not found. Was the data parsed first?',
+                "Header information not found. Was the data parsed first?",
             )
             return False
 
@@ -1586,7 +1633,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
             # translate label to PascalCase
             pascal_label = self.translator.name_sentence_to_pascal(label)
 
-            parts = f'{pascal_label}-part*.csv'
+            parts = f"{pascal_label}-part*.csv"
             parts_paths = os.path.join(self.outdir, parts)
             parts_paths = glob.glob(parts_paths)
             parts_paths.sort()
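
Note (illustrative, not package code): the PostgreSQL type resolution above is essentially a dictionary lookup over `DATA_TYPE_LOOKUP` with a VARCHAR fallback. A small sketch of the assumed behaviour:

    DATA_TYPE_LOOKUP = {
        "str": "VARCHAR",
        "int": "INTEGER",
        "long": "BIGINT",
        "float": "NUMERIC",
        "double": "NUMERIC",
        "dbl": "NUMERIC",
        "boolean": "BOOLEAN",
        "str[]": "VARCHAR[]",
        "string[]": "VARCHAR[]",
    }

    def get_data_type(schema_type: str) -> str:
        # unknown types fall back to VARCHAR, as in the writer above
        return DATA_TYPE_LOOKUP.get(schema_type, "VARCHAR")

    # get_data_type("long") -> "BIGINT"; get_data_type("json") -> "VARCHAR"
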
@@ -1595,36 +1642,36 @@ class _PostgreSQLBatchWriter(_BatchWriter):
             pascal_label = self._adjust_pascal_to_psql(pascal_label)
             table_create_command_path = os.path.join(
                 self.outdir,
-                f'{pascal_label}-create_table.sql',
+                f"{pascal_label}-create_table.sql",
             )
 
             # check if file already exists
             if os.path.exists(table_create_command_path):
                 logger.warning(
-                    f'File {table_create_command_path} already exists. Overwriting.',
+                    f"File {table_create_command_path} already exists. Overwriting.",
                 )
 
             # concatenate key:value in props
-            columns = ['_ID VARCHAR']
+            columns = ["_ID VARCHAR"]
             for col_name, col_type in props.items():
                 col_type = self._get_data_type(col_type)
                 col_name = self._adjust_pascal_to_psql(col_name)
-                columns.append(f'{col_name} {col_type}')
-            columns.append('_LABEL VARCHAR[]')
-
-            with open(table_create_command_path, 'w', encoding='utf-8') as f:
-                command = ''
+                columns.append(f"{col_name} {col_type}")
+            columns.append("_LABEL VARCHAR[]")
 
+            with open(table_create_command_path, "w", encoding="utf-8") as f:
+                command = ""
                 if self.wipe:
-                    command += f'DROP TABLE IF EXISTS {pascal_label};\n'
+                    command += f"DROP TABLE IF EXISTS {pascal_label};\n"
 
                 # table creation requires comma separation
-                command += f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
+                command += (
+                    f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
+                )
                 f.write(command)
 
             for parts_path in parts_paths:
-
-                # if import_call_file_prefix is set, replace actual path
+                # if import_call_file_prefix is set, replace actual path
                 # with prefix
                 if self.import_call_file_prefix != self.outdir:
                     parts_path = parts_path.replace(
@@ -1633,7 +1680,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
                     )
 
                 self._copy_from_csv_commands.add(
-                    f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
+                    f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
                 )
 
                 # add file path to import statement
@@ -1661,16 +1708,15 @@ class _PostgreSQLBatchWriter(_BatchWriter):
         # load headers from data parse
         if not self.edge_property_dict:
             logger.error(
-                'Header information not found. Was the data parsed first?',
+                "Header information not found. Was the data parsed first?",
             )
             return False
 
         for label, props in self.edge_property_dict.items():
-
             # translate label to PascalCase
             pascal_label = self.translator.name_sentence_to_pascal(label)
 
-            parts_paths = os.path.join(self.outdir, f'{pascal_label}-part*.csv')
+            parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv")
             parts_paths = glob.glob(parts_paths)
             parts_paths.sort()
 
@@ -1678,13 +1724,13 @@ class _PostgreSQLBatchWriter(_BatchWriter):
             pascal_label = self._adjust_pascal_to_psql(pascal_label)
             table_create_command_path = os.path.join(
                 self.outdir,
-                f'{pascal_label}-create_table.sql',
+                f"{pascal_label}-create_table.sql",
             )
 
             # check for file exists
             if os.path.exists(table_create_command_path):
                 logger.warning(
-                    f'File {table_create_command_path} already exists. Overwriting.',
+                    f"File {table_create_command_path} already exists. Overwriting.",
                 )
 
             # concatenate key:value in props
@@ -1692,7 +1738,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
             for col_name, col_type in props.items():
                 col_type = self._get_data_type(col_type)
                 col_name = self._adjust_pascal_to_psql(col_name)
-                if col_name == '_ID':
+                if col_name == "_ID":
                     # should ideally never happen
                     raise ValueError(
                         "Column name '_ID' is reserved for internal use, "
@@ -1700,26 +1746,30 @@ class _PostgreSQLBatchWriter(_BatchWriter):
                         "different name for your column."
                     )
 
-                columns.append(f'{col_name} {col_type}')
+                columns.append(f"{col_name} {col_type}")
 
             # create list of lists and flatten
             # removes need for empty check of property list
             out_list = [
-                '_START_ID VARCHAR', '_ID VARCHAR', *columns,
-                '_END_ID VARCHAR', '_TYPE VARCHAR',
+                "_START_ID VARCHAR",
+                "_ID VARCHAR",
+                *columns,
+                "_END_ID VARCHAR",
+                "_TYPE VARCHAR",
             ]
 
-            with open(table_create_command_path, 'w', encoding='utf-8') as f:
-                command = ''
+            with open(table_create_command_path, "w", encoding="utf-8") as f:
+                command = ""
                 if self.wipe:
-                    command += f'DROP TABLE IF EXISTS {pascal_label};\n'
+                    command += f"DROP TABLE IF EXISTS {pascal_label};\n"
 
                 # table creation requires comma separation
-                command += f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
+                command += (
+                    f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
+                )
                 f.write(command)
 
             for parts_path in parts_paths:
-
                 # if import_call_file_prefix is set, replace actual path
                 # with prefix
                 if self.import_call_file_prefix != self.outdir:
@@ -1729,7 +1779,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
                 )
 
                 self._copy_from_csv_commands.add(
-                    f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
+                    f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
                 )
 
                 # add file path to import statement
@@ -1740,7 +1790,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
                 self.outdir,
                 self.import_call_file_prefix,
             )
-
+
             self.import_call_edges.add(table_create_command_path)
 
         return True
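
Note (illustrative, not part of the diff): for a hypothetical `interacts_with` edge type with one `score` property, the create-table file written above would contain roughly:

    DROP TABLE IF EXISTS interacts_with;
    CREATE TABLE interacts_with(_START_ID VARCHAR,_ID VARCHAR,score NUMERIC,_END_ID VARCHAR,_TYPE VARCHAR);

and the queued copy command would look like
`\copy interacts_with FROM '/out/InteractsWith-part000.csv' DELIMITER E'\t' CSV;`
(paths and the delimiter are made up for the example).
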
@@ -1755,59 +1805,63 @@ class _PostgreSQLBatchWriter(_BatchWriter):
         Returns:
             str: a bash command for postgresql import
         """
-        import_call = ''
+        import_call = ""
 
         # create tables
         # At this point, csv files of nodes and edges do not require differentiation
         for import_file_path in [
-            *self.import_call_nodes, *self.import_call_edges,
+            *self.import_call_nodes,
+            *self.import_call_edges,
         ]:
             import_call += f'echo "Setup {import_file_path}..."\n'
             if {self.db_password}:
                 # set password variable inline
-                import_call += f'PGPASSWORD={self.db_password} '
-            import_call += f'{self.import_call_bin_prefix}psql -f {import_file_path}'
-            import_call += f' --dbname {self.db_name}'
-            import_call += f' --port {self.db_port}'
-            import_call += f' --user {self.db_user}'
+                import_call += f"PGPASSWORD={self.db_password} "
+            import_call += (
+                f"{self.import_call_bin_prefix}psql -f {import_file_path}"
+            )
+            import_call += f" --dbname {self.db_name}"
+            import_call += f" --host {self.db_host}"
+            import_call += f" --port {self.db_port}"
+            import_call += f" --user {self.db_user}"
             import_call += '\necho "Done!"\n'
-            import_call += '\n'
+            import_call += "\n"
 
         # copy data to tables
         for command in self._copy_from_csv_commands:
-            table_part = command.split(' ')[3]
+            table_part = command.split(" ")[3]
             import_call += f'echo "Importing {table_part}..."\n'
             if {self.db_password}:
                 # set password variable inline
-                import_call += f'PGPASSWORD={self.db_password} '
+                import_call += f"PGPASSWORD={self.db_password} "
             import_call += f'{self.import_call_bin_prefix}psql -c "{command}"'
-            import_call += f' --dbname {self.db_name}'
-            import_call += f' --port {self.db_port}'
-            import_call += f' --user {self.db_user}'
+            import_call += f" --dbname {self.db_name}"
+            import_call += f" --host {self.db_host}"
+            import_call += f" --port {self.db_port}"
+            import_call += f" --user {self.db_user}"
             import_call += '\necho "Done!"\n'
-            import_call += '\n'
+            import_call += "\n"
 
         return import_call
 
 
 DBMS_TO_CLASS = {
-    'neo': _Neo4jBatchWriter,
-    'neo4j': _Neo4jBatchWriter,
-    'Neo4j': _Neo4jBatchWriter,
-    'postgres': _PostgreSQLBatchWriter,
-    'postgresql': _PostgreSQLBatchWriter,
-    'PostgreSQL': _PostgreSQLBatchWriter,
-    'arango': _ArangoDBBatchWriter,
-    'arangodb': _ArangoDBBatchWriter,
-    'ArangoDB': _ArangoDBBatchWriter,
+    "neo": _Neo4jBatchWriter,
+    "neo4j": _Neo4jBatchWriter,
+    "Neo4j": _Neo4jBatchWriter,
+    "postgres": _PostgreSQLBatchWriter,
+    "postgresql": _PostgreSQLBatchWriter,
+    "PostgreSQL": _PostgreSQLBatchWriter,
+    "arango": _ArangoDBBatchWriter,
+    "arangodb": _ArangoDBBatchWriter,
+    "ArangoDB": _ArangoDBBatchWriter,
 }
 
 
 def get_writer(
     dbms: str,
-    translator: 'Translator',
-    ontology: 'Ontology',
-    deduplicator: 'Deduplicator',
+    translator: "Translator",
+    deduplicator: "Deduplicator",
     output_directory: str,
     strict_mode: bool,
 ):
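
Note (illustrative, not part of the diff): with the `--host` flag added above, each generated block of the PostgreSQL import script looks roughly like the following; the path, database name, and credentials are invented for the example:

    echo "Setup /out/protein-create_table.sql..."
    PGPASSWORD=secret psql -f /out/protein-create_table.sql --dbname import_db --host localhost --port 5432 --user postgres
    echo "Done!"
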
@@ -1821,8 +1875,6 @@ def get_writer(
 
         translator: the Translator object.
 
-        ontology: the Ontology object.
-
         output_directory: the directory to write the output files to.
 
         strict_mode: whether to use strict mode.
@@ -1835,34 +1887,35 @@ def get_writer(
 
     dbms_config = _config(dbms)
 
-    timestamp = lambda: datetime.now().strftime('%Y%m%d%H%M%S')
-    outdir = output_directory or os.path.join('biocypher-out', timestamp())
+    timestamp = lambda: datetime.now().strftime("%Y%m%d%H%M%S")
+    outdir = output_directory or os.path.join("biocypher-out", timestamp())
     outdir = os.path.abspath(outdir)
 
     writer = DBMS_TO_CLASS[dbms]
 
     if not writer:
-        raise ValueError(f'Unknown dbms: {dbms}')
+        raise ValueError(f"Unknown dbms: {dbms}")
 
     if writer is not None:
         return writer(
-            ontology=ontology,
             translator=translator,
             deduplicator=deduplicator,
-            delimiter=dbms_config.get('delimiter'),
-            array_delimiter=dbms_config.get('array_delimiter'),
-            quote=dbms_config.get('quote_character'),
+            delimiter=dbms_config.get("delimiter"),
+            array_delimiter=dbms_config.get("array_delimiter"),
+            quote=dbms_config.get("quote_character"),
             output_directory=outdir,
-            db_name=dbms_config.get('database_name'),
-            import_call_bin_prefix=dbms_config.get('import_call_bin_prefix'),
-            import_call_file_prefix=dbms_config.get('import_call_file_prefix'),
-            wipe=dbms_config.get('wipe'),
+            db_name=dbms_config.get("database_name"),
+            import_call_bin_prefix=dbms_config.get("import_call_bin_prefix"),
+            import_call_file_prefix=dbms_config.get("import_call_file_prefix"),
+            wipe=dbms_config.get("wipe"),
             strict_mode=strict_mode,
-            skip_bad_relationships=dbms_config.get('skip_bad_relationships'),  # neo4j
-            skip_duplicate_nodes=dbms_config.get('skip_duplicate_nodes'),  # neo4j
-            db_user=dbms_config.get('user'),  # psql
-            db_password=dbms_config.get('password'),  # psql
-            db_port=dbms_config.get('port'),  # psql
+            skip_bad_relationships=dbms_config.get(
+                "skip_bad_relationships"
+            ),  # neo4j
+            skip_duplicate_nodes=dbms_config.get(
+                "skip_duplicate_nodes"
+            ),  # neo4j
+            db_user=dbms_config.get("user"),  # psql
+            db_password=dbms_config.get("password"),  # psql
+            db_port=dbms_config.get("port"),  # psql
         )
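
Note (illustrative, not part of the diff): a hedged usage sketch of the updated `get_writer` signature; the `ontology` argument is gone in 0.5.20, since the translator now carries the ontology. The translator and deduplicator objects are assumed to come from the rest of the library, and the final `write_import_call` name is an assumption based on the import-script method shown above, not confirmed by this hunk:

    from biocypher._write import get_writer

    writer = get_writer(
        dbms="postgres",
        translator=translator,      # biocypher Translator, carries the ontology
        deduplicator=deduplicator,  # biocypher Deduplicator
        output_directory="biocypher-out/run-001",
        strict_mode=False,
    )
    writer.write_nodes(nodes)
    writer.write_edges(edges)
    writer.write_import_call()  # assumed public name for the script writer
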