biocypher 0.5.17__py3-none-any.whl → 0.5.20__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.



biocypher/_write.py CHANGED
@@ -17,7 +17,7 @@ import glob
17
17
 
18
18
  from ._logger import logger
19
19
 
20
- logger.debug(f'Loading module {__name__}.')
20
+ logger.debug(f"Loading module {__name__}.")
21
21
 
22
22
  from abc import ABC, abstractmethod
23
23
  from types import GeneratorType
@@ -31,83 +31,15 @@ from more_itertools import peekable
31
31
  from ._config import config as _config
32
32
  from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
33
33
 
34
- __all__ = ['get_writer']
34
+ __all__ = ["get_writer"]
35
35
 
36
36
  if TYPE_CHECKING:
37
-
38
37
  from ._ontology import Ontology
39
38
  from ._translate import Translator
40
39
  from ._deduplicate import Deduplicator
41
40
 
42
41
 
43
42
  class _BatchWriter(ABC):
44
- """
45
- Abtract parent class for writing node and edge representations to disk using the
46
- format specified by each database type. The database-specific functions are implemented
47
- by the respective child-classes. This abstract class contains all methods expected by
48
- a bach writer instance, some of which need to be overwritten by the child classes.
49
-
50
- Each batch writer instance has a fixed representation that needs to be passed
51
- at instantiation via the :py:attr:`schema` argument. The instance
52
- also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
53
- to convert and extend the hierarchy.
54
-
55
- Requires the following methods to be overwritten by database-specific writer classes:
56
- - _write_node_headers
57
- - _write_edge_headers
58
- - _construct_import_call
59
- - _write_array_string
60
- - _get_import_script_name
61
-
62
- Args:
63
- ontology:
64
- Instance of :py:class:`Ontology` to enable translation and
65
- ontology queries
66
-
67
- translator:
68
- Instance of :py:class:`Translator` to enable translation of
69
- nodes and manipulation of properties.
70
-
71
- deduplicator:
72
- Instance of :py:class:`Deduplicator` to enable deduplication
73
- of nodes and edges.
74
-
75
- delimiter:
76
- The delimiter to use for the CSV files.
77
-
78
- array_delimiter:
79
- The delimiter to use for array properties.
80
-
81
- quote:
82
- The quote character to use for the CSV files.
83
-
84
- dirname:
85
- Path for exporting CSV files.
86
-
87
- db_name:
88
- Name of the database that will be used in the generated
89
- commands.
90
-
91
- import_call_bin_prefix:
92
- Path prefix for the admin import call binary.
93
-
94
- import_call_file_prefix:
95
- Path prefix for the data files (headers and parts) in the import
96
- call.
97
-
98
- wipe:
99
- Whether to force import (removing existing DB content). (Specific to Neo4j.)
100
-
101
- strict_mode:
102
- Whether to enforce source, version, and license properties.
103
-
104
- skip_bad_relationships:
105
- Whether to skip relationships that do not have a valid
106
- start and end node. (Specific to Neo4j.)
107
-
108
- skip_duplicate_nodes:
109
- Whether to skip duplicate nodes. (Specific to Neo4j.)
110
- """
111
43
  @abstractmethod
112
44
  def _get_default_import_call_bin_prefix(self):
113
45
  """
@@ -193,14 +125,13 @@ class _BatchWriter(ABC):
193
125
 
194
126
  def __init__(
195
127
  self,
196
- ontology: 'Ontology',
197
- translator: 'Translator',
198
- deduplicator: 'Deduplicator',
128
+ translator: "Translator",
129
+ deduplicator: "Deduplicator",
199
130
  delimiter: str,
200
- array_delimiter: str = ',',
131
+ array_delimiter: str = ",",
201
132
  quote: str = '"',
202
133
  output_directory: Optional[str] = None,
203
- db_name: str = 'neo4j',
134
+ db_name: str = "neo4j",
204
135
  import_call_bin_prefix: Optional[str] = None,
205
136
  import_call_file_prefix: Optional[str] = None,
206
137
  wipe: bool = True,
@@ -209,11 +140,92 @@ class _BatchWriter(ABC):
209
140
  skip_duplicate_nodes: bool = False,
210
141
  db_user: str = None,
211
142
  db_password: str = None,
212
- db_port: str = None
143
+ db_host: str = None,
144
+ db_port: str = None,
213
145
  ):
146
+ """
147
+
148
+ Abtract parent class for writing node and edge representations to disk
149
+ using the format specified by each database type. The database-specific
150
+ functions are implemented by the respective child-classes. This abstract
151
+ class contains all methods expected by a bach writer instance, some of
152
+ which need to be overwritten by the child classes.
153
+
154
+ Each batch writer instance has a fixed representation that needs to be
155
+ passed at instantiation via the :py:attr:`schema` argument. The instance
156
+ also expects an ontology adapter via :py:attr:`ontology_adapter` to be
157
+ able to convert and extend the hierarchy.
158
+
159
+ Requires the following methods to be overwritten by database-specific
160
+ writer classes:
161
+
162
+ - _write_node_headers
163
+ - _write_edge_headers
164
+ - _construct_import_call
165
+ - _write_array_string
166
+ - _get_import_script_name
167
+
168
+ Args:
169
+ translator:
170
+ Instance of :py:class:`Translator` to enable translation of
171
+ nodes and manipulation of properties.
172
+
173
+ deduplicator:
174
+ Instance of :py:class:`Deduplicator` to enable deduplication
175
+ of nodes and edges.
176
+
177
+ delimiter:
178
+ The delimiter to use for the CSV files.
179
+
180
+ array_delimiter:
181
+ The delimiter to use for array properties.
182
+
183
+ quote:
184
+ The quote character to use for the CSV files.
185
+
186
+ dirname:
187
+ Path for exporting CSV files.
188
+
189
+ db_name:
190
+ Name of the database that will be used in the generated
191
+ commands.
192
+
193
+ import_call_bin_prefix:
194
+ Path prefix for the admin import call binary.
195
+
196
+ import_call_file_prefix:
197
+ Path prefix for the data files (headers and parts) in the import
198
+ call.
199
+
200
+ wipe:
201
+ Whether to force import (removing existing DB content). (Specific to Neo4j.)
202
+
203
+ strict_mode:
204
+ Whether to enforce source, version, and license properties.
205
+
206
+ skip_bad_relationships:
207
+ Whether to skip relationships that do not have a valid
208
+ start and end node. (Specific to Neo4j.)
209
+
210
+ skip_duplicate_nodes:
211
+ Whether to skip duplicate nodes. (Specific to Neo4j.)
212
+
213
+ db_user:
214
+ The database user.
215
+
216
+ db_password:
217
+ The database password.
218
+
219
+ db_host:
220
+ The database host. Defaults to localhost.
221
+
222
+ db_port:
223
+ The database port.
224
+ """
214
225
  self.db_name = db_name
215
226
  self.db_user = db_user
216
227
  self.db_password = db_password
228
+ self.db_host = db_host or "localhost"
217
229
  self.db_port = db_port
218
230
 
219
231
  self.delim, self.escaped_delim = self._process_delimiter(delimiter)
@@ -225,7 +237,8 @@ class _BatchWriter(ABC):
225
237
  self.skip_duplicate_nodes = skip_duplicate_nodes
226
238
 
227
239
  if import_call_bin_prefix is None:
228
- self.import_call_bin_prefix = self._get_default_import_call_bin_prefix(
240
+ self.import_call_bin_prefix = (
241
+ self._get_default_import_call_bin_prefix()
229
242
  )
230
243
  else:
231
244
  self.import_call_bin_prefix = import_call_bin_prefix
@@ -233,8 +246,6 @@ class _BatchWriter(ABC):
233
246
  self.wipe = wipe
234
247
  self.strict_mode = strict_mode
235
248
 
236
- self.extended_schema = ontology.extended_schema
237
- self.ontology = ontology
238
249
  self.translator = translator
239
250
  self.deduplicator = deduplicator
240
251
  self.node_property_dict = {}
@@ -248,11 +259,11 @@ class _BatchWriter(ABC):
248
259
 
249
260
  if os.path.exists(self.outdir):
250
261
  logger.warning(
251
- f'Output directory `{self.outdir}` already exists. '
252
- 'If this is not planned, file consistency may be compromised.'
262
+ f"Output directory `{self.outdir}` already exists. "
263
+ "If this is not planned, file consistency may be compromised."
253
264
  )
254
265
  else:
255
- logger.info(f'Creating output directory `{self.outdir}`.')
266
+ logger.info(f"Creating output directory `{self.outdir}`.")
256
267
  os.makedirs(self.outdir)
257
268
 
258
269
  self.parts = {} # dict to store the paths of part files for each label
@@ -268,7 +279,6 @@ class _BatchWriter(ABC):
268
279
 
269
280
  return self._outdir
270
281
 
271
-
272
282
  @property
273
283
  def import_call_file_prefix(self):
274
284
  """
@@ -286,12 +296,10 @@ class _BatchWriter(ABC):
286
296
  representation (e.g. tab for '\t').
287
297
  """
288
298
 
289
- if delimiter == '\\t':
290
-
291
- return '\t', '\\t'
299
+ if delimiter == "\\t":
300
+ return "\t", "\\t"
292
301
 
293
302
  else:
294
-
295
303
  return delimiter, delimiter
296
304
 
297
305
  def write_nodes(self, nodes, batch_size: int = int(1e6)):
@@ -310,12 +318,12 @@ class _BatchWriter(ABC):
310
318
  # write node data
311
319
  passed = self._write_node_data(nodes, batch_size)
312
320
  if not passed:
313
- logger.error('Error while writing node data.')
321
+ logger.error("Error while writing node data.")
314
322
  return False
315
323
  # pass property data to header writer per node type written
316
324
  passed = self._write_node_headers()
317
325
  if not passed:
318
- logger.error('Error while writing node headers.')
326
+ logger.error("Error while writing node headers.")
319
327
  return False
320
328
 
321
329
  return True
@@ -337,48 +345,50 @@ class _BatchWriter(ABC):
337
345
  bool: The return value. True for success, False otherwise.
338
346
  """
339
347
  passed = False
340
- # unwrap generator in one step
341
348
  edges = list(edges) # force evaluation to handle empty generator
342
349
  if edges:
343
- z = zip(
344
- *(
345
- (
346
- e.get_node(),
347
- [
348
- e.get_source_edge(),
349
- e.get_target_edge(),
350
- ],
351
- ) if isinstance(e, BioCypherRelAsNode) else (None, [e])
352
- for e in edges
353
- )
354
- )
355
- nod, edg = (list(a) for a in z)
356
- nod = [n for n in nod if n]
357
- edg = [val for sublist in edg for val in sublist] # flatten
350
+ nodes_flat = []
351
+ edges_flat = []
352
+ for edge in edges:
353
+ if isinstance(edge, BioCypherRelAsNode):
354
+ # check if relationship has already been written, if so skip
355
+ if self.deduplicator.rel_as_node_seen(edge):
356
+ continue
358
357
 
359
- if nod and edg:
360
- passed = self.write_nodes(nod) and self._write_edge_data(
361
- edg,
358
+ nodes_flat.append(edge.get_node())
359
+ edges_flat.append(edge.get_source_edge())
360
+ edges_flat.append(edge.get_target_edge())
361
+
362
+ else:
363
+ # check if relationship has already been written, if so skip
364
+ if self.deduplicator.edge_seen(edge):
365
+ continue
366
+
367
+ edges_flat.append(edge)
368
+
369
+ if nodes_flat and edges_flat:
370
+ passed = self.write_nodes(nodes_flat) and self._write_edge_data(
371
+ edges_flat,
362
372
  batch_size,
363
373
  )
364
374
  else:
365
- passed = self._write_edge_data(edg, batch_size)
375
+ passed = self._write_edge_data(edges_flat, batch_size)
366
376
 
367
377
  else:
368
378
  # is this a problem? if the generator or list is empty, we
369
379
  # don't write anything.
370
380
  logger.debug(
371
- 'No edges to write, possibly due to no matched Biolink classes.',
381
+ "No edges to write, possibly due to no matched Biolink classes.",
372
382
  )
373
383
  pass
374
384
 
375
385
  if not passed:
376
- logger.error('Error while writing edge data.')
386
+ logger.error("Error while writing edge data.")
377
387
  return False
378
388
  # pass property data to header writer per edge type written
379
389
  passed = self._write_edge_headers()
380
390
  if not passed:
381
- logger.error('Error while writing edge headers.')
391
+ logger.error("Error while writing edge headers.")
382
392
  return False
383
393
 
384
394
  return True
@@ -401,7 +411,7 @@ class _BatchWriter(ABC):
401
411
  """
402
412
 
403
413
  if isinstance(nodes, GeneratorType) or isinstance(nodes, peekable):
404
- logger.debug('Writing node CSV from generator.')
414
+ logger.debug("Writing node CSV from generator.")
405
415
 
406
416
  bins = defaultdict(list) # dict to store a list for each
407
417
  # label that is passed in
@@ -424,7 +434,7 @@ class _BatchWriter(ABC):
424
434
 
425
435
  # check for non-id
426
436
  if not _id:
427
- logger.warning(f'Node {label} has no id; skipping.')
437
+ logger.warning(f"Node {label} has no id; skipping.")
428
438
  continue
429
439
 
430
440
  if not label in bins.keys():
@@ -434,20 +444,26 @@ class _BatchWriter(ABC):
434
444
  bin_l[label] = 1
435
445
 
436
446
  # get properties from config if present
437
- cprops = self.extended_schema.get(label).get('properties', )
447
+ cprops = (
448
+ self.translator.ontology.mapping.extended_schema.get(
449
+ label
450
+ ).get(
451
+ "properties",
452
+ )
453
+ )
438
454
  if cprops:
439
455
  d = dict(cprops)
440
456
 
441
457
  # add id and preferred id to properties; these are
442
458
  # created in node creation (`_create.BioCypherNode`)
443
- d['id'] = 'str'
444
- d['preferred_id'] = 'str'
459
+ d["id"] = "str"
460
+ d["preferred_id"] = "str"
445
461
 
446
462
  # add strict mode properties
447
463
  if self.strict_mode:
448
- d['source'] = 'str'
449
- d['version'] = 'str'
450
- d['licence'] = 'str'
464
+ d["source"] = "str"
465
+ d["version"] = "str"
466
+ d["licence"] = "str"
451
467
 
452
468
  else:
453
469
  d = dict(node.get_properties())
@@ -467,7 +483,7 @@ class _BatchWriter(ABC):
467
483
 
468
484
  # get label hierarchy
469
485
  # multiple labels:
470
- all_labels = self.ontology.get_ancestors(label)
486
+ all_labels = self.translator.ontology.get_ancestors(label)
471
487
 
472
488
  if all_labels:
473
489
  # convert to pascal case
@@ -531,7 +547,7 @@ class _BatchWriter(ABC):
531
547
  return True
532
548
  else:
533
549
  if type(nodes) is not list:
534
- logger.error('Nodes must be passed as list or generator.')
550
+ logger.error("Nodes must be passed as list or generator.")
535
551
  return False
536
552
  else:
537
553
 
@@ -563,14 +579,13 @@ class _BatchWriter(ABC):
563
579
  bool: The return value. True for success, False otherwise.
564
580
  """
565
581
  if not all(isinstance(n, BioCypherNode) for n in node_list):
566
- logger.error('Nodes must be passed as type BioCypherNode.')
582
+ logger.error("Nodes must be passed as type BioCypherNode.")
567
583
  return False
568
584
 
569
585
  # from list of nodes to list of strings
570
586
  lines = []
571
587
 
572
588
  for n in node_list:
573
-
574
589
  # check for deviations in properties
575
590
  # node properties
576
591
  n_props = n.get_properties()
@@ -584,46 +599,45 @@ class _BatchWriter(ABC):
584
599
  oprop1 = set(ref_props).difference(n_keys)
585
600
  oprop2 = set(n_keys).difference(ref_props)
586
601
  logger.error(
587
- f'At least one node of the class {n.get_label()} '
588
- f'has more or fewer properties than another. '
589
- f'Offending node: {onode!r}, offending property: '
590
- f'{max([oprop1, oprop2])}. '
591
- f'All reference properties: {ref_props}, '
592
- f'All node properties: {n_keys}.',
602
+ f"At least one node of the class {n.get_label()} "
603
+ f"has more or fewer properties than another. "
604
+ f"Offending node: {onode!r}, offending property: "
605
+ f"{max([oprop1, oprop2])}. "
606
+ f"All reference properties: {ref_props}, "
607
+ f"All node properties: {n_keys}.",
593
608
  )
594
609
  return False
595
610
 
596
611
  line = [n.get_id()]
597
612
 
598
613
  if ref_props:
599
-
600
614
  plist = []
601
615
  # make all into strings, put actual strings in quotes
602
616
  for k, v in prop_dict.items():
603
617
  p = n_props.get(k)
604
618
  if p is None: # TODO make field empty instead of ""?
605
- plist.append('')
619
+ plist.append("")
606
620
  elif v in [
607
- 'int',
608
- 'integer',
609
- 'long',
610
- 'float',
611
- 'double',
612
- 'dbl',
613
- 'bool',
614
- 'boolean',
621
+ "int",
622
+ "integer",
623
+ "long",
624
+ "float",
625
+ "double",
626
+ "dbl",
627
+ "bool",
628
+ "boolean",
615
629
  ]:
616
630
  plist.append(str(p))
617
631
  else:
618
632
  if isinstance(p, list):
619
633
  plist.append(self._write_array_string(p))
620
634
  else:
621
- plist.append(f'{self.quote}{str(p)}{self.quote}')
635
+ plist.append(f"{self.quote}{str(p)}{self.quote}")
622
636
 
623
637
  line.append(self.delim.join(plist))
624
638
  line.append(labels)
625
639
 
626
- lines.append(self.delim.join(line) + '\n')
640
+ lines.append(self.delim.join(line) + "\n")
627
641
 
628
642
  # avoid writing empty files
629
643
  if lines:
@@ -653,7 +667,7 @@ class _BatchWriter(ABC):
653
667
  """
654
668
 
655
669
  if isinstance(edges, GeneratorType):
656
- logger.debug('Writing edge CSV from generator.')
670
+ logger.debug("Writing edge CSV from generator.")
657
671
 
658
672
  bins = defaultdict(list) # dict to store a list for each
659
673
  # label that is passed in
@@ -665,14 +679,10 @@ class _BatchWriter(ABC):
665
679
  # for each label to check for consistency and their type
666
680
  # for now, relevant for `int`
667
681
  for edge in edges:
668
- # check for duplicates
669
- if self.deduplicator.edge_seen(edge):
670
- continue
671
-
672
682
  if not (edge.get_source_id() and edge.get_target_id()):
673
683
  logger.error(
674
- 'Edge must have source and target node. '
675
- f'Caused by: {edge}',
684
+ "Edge must have source and target node. "
685
+ f"Caused by: {edge}",
676
686
  )
677
687
  continue
678
688
 
@@ -689,25 +699,35 @@ class _BatchWriter(ABC):
689
699
  # (may not be if it is an edge that carries the
690
700
  # "label_as_edge" property)
691
701
  cprops = None
692
- if label in self.extended_schema:
693
- cprops = self.extended_schema.get(label).get(
694
- 'properties',
702
+ if (
703
+ label
704
+ in self.translator.ontology.mapping.extended_schema
705
+ ):
706
+ cprops = self.translator.ontology.mapping.extended_schema.get(
707
+ label
708
+ ).get(
709
+ "properties",
695
710
  )
696
711
  else:
697
712
  # try via "label_as_edge"
698
- for k, v in self.extended_schema.items():
713
+ for (
714
+ k,
715
+ v,
716
+ ) in (
717
+ self.translator.ontology.mapping.extended_schema.items()
718
+ ):
699
719
  if isinstance(v, dict):
700
- if v.get('label_as_edge') == label:
701
- cprops = v.get('properties')
720
+ if v.get("label_as_edge") == label:
721
+ cprops = v.get("properties")
702
722
  break
703
723
  if cprops:
704
724
  d = cprops
705
725
 
706
726
  # add strict mode properties
707
727
  if self.strict_mode:
708
- d['source'] = 'str'
709
- d['version'] = 'str'
710
- d['licence'] = 'str'
728
+ d["source"] = "str"
729
+ d["version"] = "str"
730
+ d["licence"] = "str"
711
731
 
712
732
  else:
713
733
  d = dict(edge.get_properties())
@@ -746,7 +766,6 @@ class _BatchWriter(ABC):
746
766
 
747
767
  # after generator depleted, write remainder of bins
748
768
  for label, nl in bins.items():
749
-
750
769
  passed = self._write_single_edge_list_to_file(
751
770
  nl,
752
771
  label,
@@ -768,7 +787,7 @@ class _BatchWriter(ABC):
768
787
  return True
769
788
  else:
770
789
  if type(edges) is not list:
771
- logger.error('Edges must be passed as list or generator.')
790
+ logger.error("Edges must be passed as list or generator.")
772
791
  return False
773
792
  else:
774
793
 
@@ -800,8 +819,7 @@ class _BatchWriter(ABC):
800
819
  """
801
820
 
802
821
  if not all(isinstance(n, BioCypherEdge) for n in edge_list):
803
-
804
- logger.error('Edges must be passed as type BioCypherEdge.')
822
+ logger.error("Edges must be passed as type BioCypherEdge.")
805
823
  return False
806
824
 
807
825
  # from list of edges to list of strings
@@ -815,16 +833,16 @@ class _BatchWriter(ABC):
815
833
 
816
834
  # compare list order invariant
817
835
  if not set(ref_props) == set(e_keys):
818
- oedge = f'{e.get_source_id()}-{e.get_target_id()}'
836
+ oedge = f"{e.get_source_id()}-{e.get_target_id()}"
819
837
  oprop1 = set(ref_props).difference(e_keys)
820
838
  oprop2 = set(e_keys).difference(ref_props)
821
839
  logger.error(
822
- f'At least one edge of the class {e.get_label()} '
823
- f'has more or fewer properties than another. '
824
- f'Offending edge: {oedge!r}, offending property: '
825
- f'{max([oprop1, oprop2])}. '
826
- f'All reference properties: {ref_props}, '
827
- f'All edge properties: {e_keys}.',
840
+ f"At least one edge of the class {e.get_label()} "
841
+ f"has more or fewer properties than another. "
842
+ f"Offending edge: {oedge!r}, offending property: "
843
+ f"{max([oprop1, oprop2])}. "
844
+ f"All reference properties: {ref_props}, "
845
+ f"All edge properties: {e_keys}.",
828
846
  )
829
847
  return False
830
848
 
@@ -833,16 +851,16 @@ class _BatchWriter(ABC):
833
851
  for k, v in prop_dict.items():
834
852
  p = e_props.get(k)
835
853
  if p is None: # TODO make field empty instead of ""?
836
- plist.append('')
854
+ plist.append("")
837
855
  elif v in [
838
- 'int',
839
- 'integer',
840
- 'long',
841
- 'float',
842
- 'double',
843
- 'dbl',
844
- 'bool',
845
- 'boolean',
856
+ "int",
857
+ "integer",
858
+ "long",
859
+ "float",
860
+ "double",
861
+ "dbl",
862
+ "bool",
863
+ "boolean",
846
864
  ]:
847
865
  plist.append(str(p))
848
866
  else:
@@ -850,7 +868,7 @@ class _BatchWriter(ABC):
850
868
  plist.append(self._write_array_string(p))
851
869
  else:
852
870
  plist.append(self.quote + str(p) + self.quote)
853
-
871
+
854
872
  entries = [e.get_source_id()]
855
873
 
856
874
  skip_id = False
@@ -858,32 +876,44 @@ class _BatchWriter(ABC):
858
876
 
859
877
  if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
860
878
  skip_id = True
861
- elif not self.extended_schema.get(label):
879
+ elif not self.translator.ontology.mapping.extended_schema.get(
880
+ label
881
+ ):
862
882
  # find label in schema by label_as_edge
863
- for k, v in self.extended_schema.items():
864
- if v.get('label_as_edge') == label:
883
+ for (
884
+ k,
885
+ v,
886
+ ) in self.translator.ontology.mapping.extended_schema.items():
887
+ if v.get("label_as_edge") == label:
865
888
  schema_label = k
866
889
  break
867
890
  else:
868
891
  schema_label = label
869
892
 
870
893
  if schema_label:
871
- if self.extended_schema.get(schema_label).get('use_id') == False:
894
+ if (
895
+ self.translator.ontology.mapping.extended_schema.get(
896
+ schema_label
897
+ ).get("use_id")
898
+ == False
899
+ ):
872
900
  skip_id = True
873
901
 
874
902
  if not skip_id:
875
- entries.append(e.get_id() or '')
903
+ entries.append(e.get_id() or "")
876
904
 
877
905
  if ref_props:
878
906
  entries.append(self.delim.join(plist))
879
907
 
880
908
  entries.append(e.get_target_id())
881
- entries.append(self.translator.name_sentence_to_pascal(
882
- e.get_label(),
883
- ))
909
+ entries.append(
910
+ self.translator.name_sentence_to_pascal(
911
+ e.get_label(),
912
+ )
913
+ )
884
914
 
885
915
  lines.append(
886
- self.delim.join(entries) + '\n',
916
+ self.delim.join(entries) + "\n",
887
917
  )
888
918
 
889
919
  # avoid writing empty files
@@ -911,39 +941,34 @@ class _BatchWriter(ABC):
911
941
 
912
942
  # list files in self.outdir
913
943
  files = glob.glob(
914
- os.path.join(self.outdir, f'{label_pascal}-part*.csv')
944
+ os.path.join(self.outdir, f"{label_pascal}-part*.csv")
915
945
  )
916
946
  # find file with highest part number
917
947
  if not files:
918
-
919
948
  next_part = 0
920
949
 
921
950
  else:
922
-
923
951
  next_part = (
924
952
  max(
925
953
  [
926
- int(
927
- f.split('.')[-2].split('-')[-1].replace('part', '')
928
- ) for f in files
954
+ int(f.split(".")[-2].split("-")[-1].replace("part", ""))
955
+ for f in files
929
956
  ],
930
- ) + 1
957
+ )
958
+ + 1
931
959
  )
932
960
 
933
961
  # write to file
934
962
  padded_part = str(next_part).zfill(3)
935
963
  logger.info(
936
- f'Writing {len(lines)} entries to {label_pascal}-part{padded_part}.csv',
964
+ f"Writing {len(lines)} entries to {label_pascal}-part{padded_part}.csv",
937
965
  )
938
966
 
939
967
  # store name only in case import_call_file_prefix is set
940
- part = f'{label_pascal}-part{padded_part}.csv'
941
- file_path = os.path.join(
942
- self.outdir, part
943
- )
944
-
945
- with open(file_path, 'w', encoding='utf-8') as f:
968
+ part = f"{label_pascal}-part{padded_part}.csv"
969
+ file_path = os.path.join(self.outdir, part)
946
970
 
971
+ with open(file_path, "w", encoding="utf-8") as f:
947
972
  # concatenate with delimiter
948
973
  f.writelines(lines)
949
974
 
@@ -975,10 +1000,9 @@ class _BatchWriter(ABC):
975
1000
  """
976
1001
 
977
1002
  file_path = os.path.join(self.outdir, self._get_import_script_name())
978
- logger.info(f'Writing {self.db_name} import call to `{file_path}`.')
979
-
980
- with open(file_path, 'w', encoding='utf-8') as f:
1003
+ logger.info(f"Writing {self.db_name} import call to `{file_path}`.")
981
1004
 
1005
+ with open(file_path, "w", encoding="utf-8") as f:
982
1006
  f.write(self._construct_import_call())
983
1007
 
984
1008
  return True
@@ -995,11 +1019,13 @@ class _Neo4jBatchWriter(_BatchWriter):
995
1019
 
996
1020
  This class inherits from the abstract class "_BatchWriter" and implements the
997
1021
  Neo4j-specific methods:
1022
+
998
1023
  - _write_node_headers
999
1024
  - _write_edge_headers
1000
1025
  - _construct_import_call
1001
1026
  - _write_array_string
1002
1027
  """
1028
+
1003
1029
  def _get_default_import_call_bin_prefix(self):
1004
1030
  """
1005
1031
  Method to provide the default string for the import call bin prefix.
@@ -1007,7 +1033,7 @@ class _Neo4jBatchWriter(_BatchWriter):
1007
1033
  Returns:
1008
1034
  str: The default location for the neo4j admin import location
1009
1035
  """
1010
- return 'bin/'
1036
+ return "bin/"
1011
1037
 
1012
1038
  def _write_array_string(self, string_list):
1013
1039
  """
@@ -1021,7 +1047,7 @@ class _Neo4jBatchWriter(_BatchWriter):
1021
1047
  str: The string representation of an array for the neo4j admin import
1022
1048
  """
1023
1049
  string = self.adelim.join(string_list)
1024
- return f'{self.quote}{string}{self.quote}'
1050
+ return f"{self.quote}{string}{self.quote}"
1025
1051
 
1026
1052
  def _write_node_headers(self):
1027
1053
  """
@@ -1035,56 +1061,55 @@ class _Neo4jBatchWriter(_BatchWriter):
1035
1061
  # load headers from data parse
1036
1062
  if not self.node_property_dict:
1037
1063
  logger.error(
1038
- 'Header information not found. Was the data parsed first?',
1064
+ "Header information not found. Was the data parsed first?",
1039
1065
  )
1040
1066
  return False
1041
1067
 
1042
1068
  for label, props in self.node_property_dict.items():
1043
-
1044
- _id = ':ID'
1069
+ _id = ":ID"
1045
1070
 
1046
1071
  # translate label to PascalCase
1047
1072
  pascal_label = self.translator.name_sentence_to_pascal(label)
1048
1073
 
1049
- header = f'{pascal_label}-header.csv'
1074
+ header = f"{pascal_label}-header.csv"
1050
1075
  header_path = os.path.join(
1051
1076
  self.outdir,
1052
1077
  header,
1053
1078
  )
1054
- parts = f'{pascal_label}-part.*'
1079
+ parts = f"{pascal_label}-part.*"
1055
1080
 
1056
1081
  # check if file already exists
1057
1082
  if os.path.exists(header_path):
1058
1083
  logger.warning(
1059
- f'Header file `{header_path}` already exists. Overwriting.',
1084
+ f"Header file `{header_path}` already exists. Overwriting.",
1060
1085
  )
1061
1086
 
1062
1087
  # concatenate key:value in props
1063
1088
  props_list = []
1064
1089
  for k, v in props.items():
1065
- if v in ['int', 'long', 'integer']:
1066
- props_list.append(f'{k}:long')
1067
- elif v in ['int[]', 'long[]', 'integer[]']:
1068
- props_list.append(f'{k}:long[]')
1069
- elif v in ['float', 'double', 'dbl']:
1070
- props_list.append(f'{k}:double')
1071
- elif v in ['float[]', 'double[]']:
1072
- props_list.append(f'{k}:double[]')
1073
- elif v in ['bool', 'boolean']:
1090
+ if v in ["int", "long", "integer"]:
1091
+ props_list.append(f"{k}:long")
1092
+ elif v in ["int[]", "long[]", "integer[]"]:
1093
+ props_list.append(f"{k}:long[]")
1094
+ elif v in ["float", "double", "dbl"]:
1095
+ props_list.append(f"{k}:double")
1096
+ elif v in ["float[]", "double[]"]:
1097
+ props_list.append(f"{k}:double[]")
1098
+ elif v in ["bool", "boolean"]:
1074
1099
  # TODO Neo4j boolean support / spelling?
1075
- props_list.append(f'{k}:boolean')
1076
- elif v in ['bool[]', 'boolean[]']:
1077
- props_list.append(f'{k}:boolean[]')
1078
- elif v in ['str[]', 'string[]']:
1079
- props_list.append(f'{k}:string[]')
1100
+ props_list.append(f"{k}:boolean")
1101
+ elif v in ["bool[]", "boolean[]"]:
1102
+ props_list.append(f"{k}:boolean[]")
1103
+ elif v in ["str[]", "string[]"]:
1104
+ props_list.append(f"{k}:string[]")
1080
1105
  else:
1081
- props_list.append(f'{k}')
1106
+ props_list.append(f"{k}")
1082
1107
 
1083
1108
  # create list of lists and flatten
1084
- out_list = [[_id], props_list, [':LABEL']]
1109
+ out_list = [[_id], props_list, [":LABEL"]]
1085
1110
  out_list = [val for sublist in out_list for val in sublist]
1086
1111
 
1087
- with open(header_path, 'w', encoding='utf-8') as f:
1112
+ with open(header_path, "w", encoding="utf-8") as f:
1088
1113
  # concatenate with delimiter
1089
1114
  row = self.delim.join(out_list)
1090
1115
  f.write(row)
@@ -1099,7 +1124,9 @@ class _Neo4jBatchWriter(_BatchWriter):
1099
1124
  self.import_call_file_prefix,
1100
1125
  parts,
1101
1126
  )
1102
- self.import_call_nodes.add((import_call_header_path, import_call_parts_path))
1127
+ self.import_call_nodes.add(
1128
+ (import_call_header_path, import_call_parts_path)
1129
+ )
1103
1130
 
1104
1131
  return True
1105
1132
 
@@ -1115,79 +1142,88 @@ class _Neo4jBatchWriter(_BatchWriter):
1115
1142
  # load headers from data parse
1116
1143
  if not self.edge_property_dict:
1117
1144
  logger.error(
1118
- 'Header information not found. Was the data parsed first?',
1145
+ "Header information not found. Was the data parsed first?",
1119
1146
  )
1120
1147
  return False
1121
1148
 
1122
1149
  for label, props in self.edge_property_dict.items():
1123
-
1124
1150
  # translate label to PascalCase
1125
1151
  pascal_label = self.translator.name_sentence_to_pascal(label)
1126
1152
 
1127
1153
  # paths
1128
- header = f'{pascal_label}-header.csv'
1154
+ header = f"{pascal_label}-header.csv"
1129
1155
  header_path = os.path.join(
1130
1156
  self.outdir,
1131
1157
  header,
1132
1158
  )
1133
- parts = f'{pascal_label}-part.*'
1159
+ parts = f"{pascal_label}-part.*"
1134
1160
 
1135
1161
  # check for file exists
1136
1162
  if os.path.exists(header_path):
1137
1163
  logger.warning(
1138
- f'File {header_path} already exists. Overwriting.'
1164
+ f"File {header_path} already exists. Overwriting."
1139
1165
  )
1140
1166
 
1141
1167
  # concatenate key:value in props
1142
1168
  props_list = []
1143
1169
  for k, v in props.items():
1144
- if v in ['int', 'long', 'integer']:
1145
- props_list.append(f'{k}:long')
1146
- elif v in ['int[]', 'long[]', 'integer[]']:
1147
- props_list.append(f'{k}:long[]')
1148
- elif v in ['float', 'double']:
1149
- props_list.append(f'{k}:double')
1150
- elif v in ['float[]', 'double[]']:
1151
- props_list.append(f'{k}:double[]')
1170
+ if v in ["int", "long", "integer"]:
1171
+ props_list.append(f"{k}:long")
1172
+ elif v in ["int[]", "long[]", "integer[]"]:
1173
+ props_list.append(f"{k}:long[]")
1174
+ elif v in ["float", "double"]:
1175
+ props_list.append(f"{k}:double")
1176
+ elif v in ["float[]", "double[]"]:
1177
+ props_list.append(f"{k}:double[]")
1152
1178
  elif v in [
1153
- 'bool',
1154
- 'boolean',
1179
+ "bool",
1180
+ "boolean",
1155
1181
  ]: # TODO does Neo4j support bool?
1156
- props_list.append(f'{k}:boolean')
1157
- elif v in ['bool[]', 'boolean[]']:
1158
- props_list.append(f'{k}:boolean[]')
1159
- elif v in ['str[]', 'string[]']:
1160
- props_list.append(f'{k}:string[]')
1182
+ props_list.append(f"{k}:boolean")
1183
+ elif v in ["bool[]", "boolean[]"]:
1184
+ props_list.append(f"{k}:boolean[]")
1185
+ elif v in ["str[]", "string[]"]:
1186
+ props_list.append(f"{k}:string[]")
1161
1187
  else:
1162
- props_list.append(f'{k}')
1188
+ props_list.append(f"{k}")
1163
1189
 
1164
1190
  skip_id = False
1165
1191
  schema_label = None
1166
1192
 
1167
1193
  if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
1168
1194
  skip_id = True
1169
- elif not self.extended_schema.get(label):
1195
+ elif not self.translator.ontology.mapping.extended_schema.get(
1196
+ label
1197
+ ):
1170
1198
  # find label in schema by label_as_edge
1171
- for k, v in self.extended_schema.items():
1172
- if v.get('label_as_edge') == label:
1199
+ for (
1200
+ k,
1201
+ v,
1202
+ ) in self.translator.ontology.mapping.extended_schema.items():
1203
+ if v.get("label_as_edge") == label:
1173
1204
  schema_label = k
1174
1205
  break
1175
1206
  else:
1176
1207
  schema_label = label
1177
1208
 
1178
- out_list = [':START_ID']
1209
+ out_list = [":START_ID"]
1179
1210
 
1180
1211
  if schema_label:
1181
- if self.extended_schema.get(schema_label).get('use_id') == False:
1212
+ if (
1213
+ self.translator.ontology.mapping.extended_schema.get(
1214
+ schema_label
1215
+ ).get("use_id")
1216
+ == False
1217
+ ):
1182
1218
  skip_id = True
1183
1219
 
1184
1220
  if not skip_id:
1185
- out_list.append('id')
1221
+ out_list.append("id")
1186
1222
 
1187
1223
  out_list.extend(props_list)
1188
- out_list.extend([':END_ID', ':TYPE'])
1224
+ out_list.extend([":END_ID", ":TYPE"])
1189
1225
 
1190
- with open(header_path, 'w', encoding='utf-8') as f:
1226
+ with open(header_path, "w", encoding="utf-8") as f:
1191
1227
  # concatenate with delimiter
1192
1228
  row = self.delim.join(out_list)
1193
1229
  f.write(row)
@@ -1202,7 +1238,9 @@ class _Neo4jBatchWriter(_BatchWriter):
1202
1238
  self.import_call_file_prefix,
1203
1239
  parts,
1204
1240
  )
1205
- self.import_call_edges.add((import_call_header_path, import_call_parts_path))
1241
+ self.import_call_edges.add(
1242
+ (import_call_header_path, import_call_parts_path)
1243
+ )
1206
1244
 
1207
1245
  return True
1208
1246
 
@@ -1213,7 +1251,7 @@ class _Neo4jBatchWriter(_BatchWriter):
1213
1251
  Returns:
1214
1252
  str: The name of the import script (ending in .sh)
1215
1253
  """
1216
- return 'neo4j-admin-import-call.sh'
1254
+ return "neo4j-admin-import-call.sh"
1217
1255
 
1218
1256
  def _construct_import_call(self) -> str:
1219
1257
  """
@@ -1226,8 +1264,8 @@ class _Neo4jBatchWriter(_BatchWriter):
1226
1264
  str: a bash command for neo4j-admin import
1227
1265
  """
1228
1266
  import_call = (
1229
- f'{self.import_call_bin_prefix}neo4j-admin import '
1230
- f'--database={self.db_name} '
1267
+ f"{self.import_call_bin_prefix}neo4j-admin import "
1268
+ f"--database={self.db_name} "
1231
1269
  f'--delimiter="{self.escaped_delim}" '
1232
1270
  f'--array-delimiter="{self.escaped_adelim}" '
1233
1271
  )
@@ -1238,11 +1276,11 @@ class _Neo4jBatchWriter(_BatchWriter):
1238
1276
  import_call += f"--quote='{self.quote}' "
1239
1277
 
1240
1278
  if self.wipe:
1241
- import_call += f'--force=true '
1279
+ import_call += f"--force=true "
1242
1280
  if self.skip_bad_relationships:
1243
- import_call += '--skip-bad-relationships=true '
1281
+ import_call += "--skip-bad-relationships=true "
1244
1282
  if self.skip_duplicate_nodes:
1245
- import_call += '--skip-duplicate-nodes=true '
1283
+ import_call += "--skip-duplicate-nodes=true "
1246
1284
 
1247
1285
  # append node import calls
1248
1286
  for header_path, parts_path in self.import_call_nodes:
@@ -1261,6 +1299,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
1261
1299
  specified by ArangoDB for the use of "arangoimport". Output files are
1262
1300
  similar to Neo4j, but with a different header format.
1263
1301
  """
1302
+
1264
1303
  def _get_default_import_call_bin_prefix(self):
1265
1304
  """
1266
1305
  Method to provide the default string for the import call bin prefix.
@@ -1268,7 +1307,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
1268
1307
  Returns:
1269
1308
  str: The default location for the neo4j admin import location
1270
1309
  """
1271
- return ''
1310
+ return ""
1272
1311
 
1273
1312
  def _get_import_script_name(self) -> str:
1274
1313
  """
@@ -1277,7 +1316,7 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
1277
1316
  Returns:
1278
1317
  str: The name of the import script (ending in .sh)
1279
1318
  """
1280
- return 'arangodb-import-call.sh'
1319
+ return "arangodb-import-call.sh"
1281
1320
 
1282
1321
  def _write_node_headers(self):
1283
1322
  """
@@ -1291,19 +1330,19 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
1291
1330
  # load headers from data parse
1292
1331
  if not self.node_property_dict:
1293
1332
  logger.error(
1294
- 'Header information not found. Was the data parsed first?',
1333
+ "Header information not found. Was the data parsed first?",
1295
1334
  )
1296
1335
  return False
1297
1336
 
1298
1337
  for label, props in self.node_property_dict.items():
1299
1338
  # create header CSV with ID, properties, labels
1300
1339
 
1301
- _id = '_key'
1340
+ _id = "_key"
1302
1341
 
1303
1342
  # translate label to PascalCase
1304
1343
  pascal_label = self.translator.name_sentence_to_pascal(label)
1305
1344
 
1306
- header = f'{pascal_label}-header.csv'
1345
+ header = f"{pascal_label}-header.csv"
1307
1346
  header_path = os.path.join(
1308
1347
  self.outdir,
1309
1348
  header,
@@ -1312,43 +1351,40 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
1312
1351
  # check if file already exists
1313
1352
  if os.path.exists(header_path):
1314
1353
  logger.warning(
1315
- f'File {header_path} already exists. Overwriting.'
1354
+ f"File {header_path} already exists. Overwriting."
1316
1355
  )
1317
1356
 
1318
1357
  # concatenate key:value in props
1319
1358
  props_list = []
1320
1359
  for k in props.keys():
1321
-
1322
- props_list.append(f'{k}')
1360
+ props_list.append(f"{k}")
1323
1361
 
1324
1362
  # create list of lists and flatten
1325
1363
  # removes need for empty check of property list
1326
1364
  out_list = [[_id], props_list]
1327
1365
  out_list = [val for sublist in out_list for val in sublist]
1328
1366
 
1329
- with open(header_path, 'w', encoding='utf-8') as f:
1367
+ with open(header_path, "w", encoding="utf-8") as f:
1330
1368
  # concatenate with delimiter
1331
1369
  row = self.delim.join(out_list)
1332
1370
  f.write(row)
1333
1371
 
1334
1372
  # add collection from schema config
1335
- collection = self.extended_schema[label].get(
1336
- 'db_collection_name', None
1337
- )
1373
+ collection = self.translator.ontology.mapping.extended_schema[
1374
+ label
1375
+ ].get("db_collection_name", None)
1338
1376
 
1339
1377
  # add file path to neo4 admin import statement
1340
1378
  # do once for each part file
1341
1379
  parts = self.parts.get(label, [])
1342
1380
 
1343
1381
  if not parts:
1344
-
1345
1382
  raise ValueError(
1346
- f'No parts found for node label {label}. '
1347
- f'Check that the data was parsed first.',
1383
+ f"No parts found for node label {label}. "
1384
+ f"Check that the data was parsed first.",
1348
1385
  )
1349
1386
 
1350
1387
  for part in parts:
1351
-
1352
1388
  import_call_header_path = os.path.join(
1353
1389
  self.import_call_file_prefix,
1354
1390
  header,
@@ -1358,7 +1394,13 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
1358
1394
  part,
1359
1395
  )
1360
1396
 
1361
- self.import_call_nodes.add((import_call_header_path, import_call_parts_path, collection))
1397
+ self.import_call_nodes.add(
1398
+ (
1399
+ import_call_header_path,
1400
+ import_call_parts_path,
1401
+ collection,
1402
+ )
1403
+ )
1362
1404
 
1363
1405
  return True
1364
1406
 
@@ -1374,55 +1416,54 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
1374
1416
  # load headers from data parse
1375
1417
  if not self.edge_property_dict:
1376
1418
  logger.error(
1377
- 'Header information not found. Was the data parsed first?',
1419
+ "Header information not found. Was the data parsed first?",
1378
1420
  )
1379
1421
  return False
1380
1422
 
1381
1423
  for label, props in self.edge_property_dict.items():
1382
-
1383
1424
  # translate label to PascalCase
1384
1425
  pascal_label = self.translator.name_sentence_to_pascal(label)
1385
1426
 
1386
1427
  # paths
1387
- header = f'{pascal_label}-header.csv'
1428
+ header = f"{pascal_label}-header.csv"
1388
1429
  header_path = os.path.join(
1389
1430
  self.outdir,
1390
1431
  header,
1391
1432
  )
1392
- parts = f'{pascal_label}-part.*'
1433
+ parts = f"{pascal_label}-part.*"
1393
1434
 
1394
1435
  # check for file exists
1395
1436
  if os.path.exists(header_path):
1396
1437
  logger.warning(
1397
- f'Header file {header_path} already exists. Overwriting.'
1438
+ f"Header file {header_path} already exists. Overwriting."
1398
1439
  )
1399
1440
 
1400
1441
  # concatenate key:value in props
1401
1442
  props_list = []
1402
1443
  for k in props.keys():
1444
+ props_list.append(f"{k}")
1403
1445
 
1404
- props_list.append(f'{k}')
1446
+ out_list = ["_from", "_key", *props_list, "_to"]
1405
1447
 
1406
- out_list = ['_from', '_key', *props_list, '_to']
1407
-
1408
- with open(header_path, 'w', encoding='utf-8') as f:
1448
+ with open(header_path, "w", encoding="utf-8") as f:
1409
1449
  # concatenate with delimiter
1410
1450
  row = self.delim.join(out_list)
1411
1451
  f.write(row)
1412
1452
 
1413
1453
  # add collection from schema config
1414
- if not self.extended_schema.get(label):
1415
-
1416
- for _, v in self.extended_schema.items():
1417
- if v.get('label_as_edge') == label:
1418
- collection = v.get('db_collection_name', None)
1454
+ if not self.translator.ontology.mapping.extended_schema.get(label):
1455
+ for (
1456
+ _,
1457
+ v,
1458
+ ) in self.translator.ontology.mapping.extended_schema.items():
1459
+ if v.get("label_as_edge") == label:
1460
+ collection = v.get("db_collection_name", None)
1419
1461
  break
1420
1462
 
1421
1463
  else:
1422
-
1423
- collection = self.extended_schema[label].get(
1424
- 'db_collection_name', None
1425
- )
1464
+ collection = self.translator.ontology.mapping.extended_schema[
1465
+ label
1466
+ ].get("db_collection_name", None)
1426
1467
 
1427
1468
  # add file path to neo4j admin import statement (import call path
1428
1469
  # may be different from actual output path)
@@ -1434,7 +1475,13 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
1434
1475
  self.import_call_file_prefix,
1435
1476
  parts,
1436
1477
  )
1437
- self.import_call_edges.add((header_import_call_path, parts_import_call_path, collection,))
1478
+ self.import_call_edges.add(
1479
+ (
1480
+ header_import_call_path,
1481
+ parts_import_call_path,
1482
+ collection,
1483
+ )
1484
+ )
1438
1485
 
1439
1486
  return True
1440
1487
 
@@ -1449,8 +1496,8 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
1449
1496
  str: a bash command for neo4j-admin import
1450
1497
  """
1451
1498
  import_call = (
1452
- f'{self.import_call_bin_prefix}arangoimp '
1453
- f'--type csv '
1499
+ f"{self.import_call_bin_prefix}arangoimp "
1500
+ f"--type csv "
1454
1501
  f'--separator="{self.escaped_delim}" '
1455
1502
  )
1456
1503
 
@@ -1459,23 +1506,22 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
1459
1506
  else:
1460
1507
  import_call += f"--quote='{self.quote}' "
1461
1508
 
1462
- node_lines = ''
1509
+ node_lines = ""
1463
1510
 
1464
1511
  # node import calls: one line per node type
1465
1512
  for header_path, parts_path, collection in self.import_call_nodes:
1466
-
1467
1513
  line = (
1468
- f'{import_call} '
1469
- f'--headers-file {header_path} '
1470
- f'--file= {parts_path} '
1514
+ f"{import_call} "
1515
+ f"--headers-file {header_path} "
1516
+ f"--file= {parts_path} "
1471
1517
  )
1472
1518
 
1473
1519
  if collection:
1474
- line += f'--create-collection --collection {collection} '
1520
+ line += f"--create-collection --collection {collection} "
1475
1521
 
1476
- node_lines += f'{line}\n'
1522
+ node_lines += f"{line}\n"
1477
1523
 
1478
- edge_lines = ''
1524
+ edge_lines = ""
1479
1525
 
1480
1526
  # edge import calls: one line per edge type
1481
1527
  for header_path, parts_path, collection in self.import_call_edges:
@@ -1495,6 +1541,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1495
1541
 
1496
1542
  This class inherits from the abstract class "_BatchWriter" and implements the
1497
1543
  PostgreSQL-specific methods:
1544
+
1498
1545
  - _write_node_headers
1499
1546
  - _write_edge_headers
1500
1547
  - _construct_import_call
@@ -1502,15 +1549,15 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1502
1549
  """
1503
1550
 
1504
1551
  DATA_TYPE_LOOKUP = {
1505
- 'str': 'VARCHAR', # VARCHAR needs limit
1506
- 'int': 'INTEGER',
1507
- 'long': 'BIGINT',
1508
- 'float': 'NUMERIC',
1509
- 'double': 'NUMERIC',
1510
- 'dbl': 'NUMERIC',
1511
- 'boolean': 'BOOLEAN',
1512
- 'str[]': 'VARCHAR[]',
1513
- 'string[]': 'VARCHAR[]'
1552
+ "str": "VARCHAR", # VARCHAR needs limit
1553
+ "int": "INTEGER",
1554
+ "long": "BIGINT",
1555
+ "float": "NUMERIC",
1556
+ "double": "NUMERIC",
1557
+ "dbl": "NUMERIC",
1558
+ "boolean": "BOOLEAN",
1559
+ "str[]": "VARCHAR[]",
1560
+ "string[]": "VARCHAR[]",
1514
1561
  }
1515
1562
 
1516
1563
  def __init__(self, *args, **kwargs):
@@ -1524,7 +1571,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1524
1571
  Returns:
1525
1572
  str: The default location for the psql command
1526
1573
  """
1527
- return ''
1574
+ return ""
1528
1575
 
1529
1576
  def _get_data_type(self, string) -> str:
1530
1577
  try:
@@ -1533,7 +1580,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1533
1580
  logger.info(
1534
1581
  'Could not determine data type {string}. Using default "VARCHAR"'
1535
1582
  )
1536
- return 'VARCHAR'
1583
+ return "VARCHAR"
1537
1584
 
1538
1585
  def _write_array_string(self, string_list) -> str:
1539
1586
  """
@@ -1546,7 +1593,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1546
1593
  Returns:
1547
1594
  str: The string representation of an array for postgres COPY
1548
1595
  """
1549
- string = ','.join(string_list)
1596
+ string = ",".join(string_list)
1550
1597
  string = f'"{{{string}}}"'
1551
1598
  return string
1552
1599
 
@@ -1557,10 +1604,10 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1557
1604
  Returns:
1558
1605
  str: The name of the import script (ending in .sh)
1559
1606
  """
1560
- return f'{self.db_name}-import-call.sh'
1607
+ return f"{self.db_name}-import-call.sh"
1561
1608
 
1562
1609
  def _adjust_pascal_to_psql(self, string):
1563
- string = string.replace('.', '_')
1610
+ string = string.replace(".", "_")
1564
1611
  string = string.lower()
1565
1612
  return string
1566
1613
 
@@ -1576,7 +1623,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1576
1623
  # load headers from data parse
1577
1624
  if not self.node_property_dict:
1578
1625
  logger.error(
1579
- 'Header information not found. Was the data parsed first?',
1626
+ "Header information not found. Was the data parsed first?",
1580
1627
  )
1581
1628
  return False
1582
1629
 
@@ -1586,7 +1633,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1586
1633
  # translate label to PascalCase
1587
1634
  pascal_label = self.translator.name_sentence_to_pascal(label)
1588
1635
 
1589
- parts = f'{pascal_label}-part*.csv'
1636
+ parts = f"{pascal_label}-part*.csv"
1590
1637
  parts_paths = os.path.join(self.outdir, parts)
1591
1638
  parts_paths = glob.glob(parts_paths)
1592
1639
  parts_paths.sort()
@@ -1595,36 +1642,36 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1595
1642
  pascal_label = self._adjust_pascal_to_psql(pascal_label)
1596
1643
  table_create_command_path = os.path.join(
1597
1644
  self.outdir,
1598
- f'{pascal_label}-create_table.sql',
1645
+ f"{pascal_label}-create_table.sql",
1599
1646
  )
1600
1647
 
1601
1648
  # check if file already exists
1602
1649
  if os.path.exists(table_create_command_path):
1603
1650
  logger.warning(
1604
- f'File {table_create_command_path} already exists. Overwriting.',
1651
+ f"File {table_create_command_path} already exists. Overwriting.",
1605
1652
  )
1606
1653
 
1607
1654
  # concatenate key:value in props
1608
- columns = ['_ID VARCHAR']
1655
+ columns = ["_ID VARCHAR"]
1609
1656
  for col_name, col_type in props.items():
1610
1657
  col_type = self._get_data_type(col_type)
1611
1658
  col_name = self._adjust_pascal_to_psql(col_name)
1612
- columns.append(f'{col_name} {col_type}')
1613
- columns.append('_LABEL VARCHAR[]')
1614
-
1615
- with open(table_create_command_path, 'w', encoding='utf-8') as f:
1659
+ columns.append(f"{col_name} {col_type}")
1660
+ columns.append("_LABEL VARCHAR[]")
1616
1661
 
1617
- command = ''
1662
+ with open(table_create_command_path, "w", encoding="utf-8") as f:
1663
+ command = ""
1618
1664
  if self.wipe:
1619
- command += f'DROP TABLE IF EXISTS {pascal_label};\n'
1665
+ command += f"DROP TABLE IF EXISTS {pascal_label};\n"
1620
1666
 
1621
1667
  # table creation requires comma separation
1622
- command += f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
1668
+ command += (
1669
+ f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
1670
+ )
1623
1671
  f.write(command)
1624
1672
 
1625
1673
  for parts_path in parts_paths:
1626
-
1627
- # if import_call_file_prefix is set, replace actual path
1674
+ # if import_call_file_prefix is set, replace actual path
1628
1675
  # with prefix
1629
1676
  if self.import_call_file_prefix != self.outdir:
1630
1677
  parts_path = parts_path.replace(
@@ -1633,7 +1680,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1633
1680
  )
1634
1681
 
1635
1682
  self._copy_from_csv_commands.add(
1636
- f'\\copy {pascal_label} FROM \'{parts_path}\' DELIMITER E\'{self.delim}\' CSV;'
1683
+ f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
1637
1684
  )
1638
1685
 
1639
1686
  # add file path to import statement
@@ -1661,16 +1708,15 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1661
1708
  # load headers from data parse
1662
1709
  if not self.edge_property_dict:
1663
1710
  logger.error(
1664
- 'Header information not found. Was the data parsed first?',
1711
+ "Header information not found. Was the data parsed first?",
1665
1712
  )
1666
1713
  return False
1667
1714
 
1668
1715
  for label, props in self.edge_property_dict.items():
1669
-
1670
1716
  # translate label to PascalCase
1671
1717
  pascal_label = self.translator.name_sentence_to_pascal(label)
1672
1718
 
1673
- parts_paths = os.path.join(self.outdir, f'{pascal_label}-part*.csv')
1719
+ parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv")
1674
1720
  parts_paths = glob.glob(parts_paths)
1675
1721
  parts_paths.sort()
1676
1722
 
@@ -1678,13 +1724,13 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1678
1724
  pascal_label = self._adjust_pascal_to_psql(pascal_label)
1679
1725
  table_create_command_path = os.path.join(
1680
1726
  self.outdir,
1681
- f'{pascal_label}-create_table.sql',
1727
+ f"{pascal_label}-create_table.sql",
1682
1728
  )
1683
1729
 
1684
1730
  # check for file exists
1685
1731
  if os.path.exists(table_create_command_path):
1686
1732
  logger.warning(
1687
- f'File {table_create_command_path} already exists. Overwriting.',
1733
+ f"File {table_create_command_path} already exists. Overwriting.",
1688
1734
  )
1689
1735
 
1690
1736
  # concatenate key:value in props
@@ -1692,7 +1738,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1692
1738
  for col_name, col_type in props.items():
1693
1739
  col_type = self._get_data_type(col_type)
1694
1740
  col_name = self._adjust_pascal_to_psql(col_name)
1695
- if col_name == '_ID':
1741
+ if col_name == "_ID":
1696
1742
  # should ideally never happen
1697
1743
  raise ValueError(
1698
1744
  "Column name '_ID' is reserved for internal use, "
@@ -1700,26 +1746,30 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1700
1746
  "different name for your column."
1701
1747
  )
1702
1748
 
1703
- columns.append(f'{col_name} {col_type}')
1749
+ columns.append(f"{col_name} {col_type}")
1704
1750
 
1705
1751
  # create list of lists and flatten
1706
1752
  # removes need for empty check of property list
1707
1753
  out_list = [
1708
- '_START_ID VARCHAR', '_ID VARCHAR', *columns, '_END_ID VARCHAR',
1709
- '_TYPE VARCHAR'
1754
+ "_START_ID VARCHAR",
1755
+ "_ID VARCHAR",
1756
+ *columns,
1757
+ "_END_ID VARCHAR",
1758
+ "_TYPE VARCHAR",
1710
1759
  ]
1711
1760
 
1712
- with open(table_create_command_path, 'w', encoding='utf-8') as f:
1713
- command = ''
1761
+ with open(table_create_command_path, "w", encoding="utf-8") as f:
1762
+ command = ""
1714
1763
  if self.wipe:
1715
- command += f'DROP TABLE IF EXISTS {pascal_label};\n'
1764
+ command += f"DROP TABLE IF EXISTS {pascal_label};\n"
1716
1765
 
1717
1766
  # table creation requires comma separation
1718
- command += f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
1767
+ command += (
1768
+ f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
1769
+ )
1719
1770
  f.write(command)
1720
1771
 
1721
1772
  for parts_path in parts_paths:
1722
-
1723
1773
  # if import_call_file_prefix is set, replace actual path
1724
1774
  # with prefix
1725
1775
  if self.import_call_file_prefix != self.outdir:
@@ -1729,7 +1779,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1729
1779
  )
1730
1780
 
1731
1781
  self._copy_from_csv_commands.add(
1732
- f'\\copy {pascal_label} FROM \'{parts_path}\' DELIMITER E\'{self.delim}\' CSV;'
1782
+ f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
1733
1783
  )
1734
1784
 
1735
1785
  # add file path to import statement
@@ -1740,7 +1790,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1740
1790
  self.outdir,
1741
1791
  self.import_call_file_prefix,
1742
1792
  )
1743
-
1793
+
1744
1794
  self.import_call_edges.add(table_create_command_path)
1745
1795
 
1746
1796
  return True
@@ -1755,59 +1805,63 @@ class _PostgreSQLBatchWriter(_BatchWriter):
1755
1805
  Returns:
1756
1806
  str: a bash command for postgresql import
1757
1807
  """
1758
- import_call = ''
1808
+ import_call = ""
1759
1809
 
1760
1810
  # create tables
1761
1811
  # At this point, csv files of nodes and edges do not require differentiation
1762
1812
  for import_file_path in [
1763
- *self.import_call_nodes, *self.import_call_edges
1813
+ *self.import_call_nodes,
1814
+ *self.import_call_edges,
1764
1815
  ]:
1765
1816
  import_call += f'echo "Setup {import_file_path}..."\n'
1766
1817
  if {self.db_password}:
1767
1818
  # set password variable inline
1768
- import_call += f'PGPASSWORD={self.db_password} '
1769
- import_call += f'{self.import_call_bin_prefix}psql -f {import_file_path}'
1770
- import_call += f' --dbname {self.db_name}'
1771
- import_call += f' --port {self.db_port}'
1772
- import_call += f' --user {self.db_user}'
1819
+ import_call += f"PGPASSWORD={self.db_password} "
1820
+ import_call += (
1821
+ f"{self.import_call_bin_prefix}psql -f {import_file_path}"
1822
+ )
1823
+ import_call += f" --dbname {self.db_name}"
1824
+ import_call += f" --host {self.db_host}"
1825
+ import_call += f" --port {self.db_port}"
1826
+ import_call += f" --user {self.db_user}"
1773
1827
  import_call += '\necho "Done!"\n'
1774
- import_call += '\n'
1828
+ import_call += "\n"
1775
1829
 
1776
1830
  # copy data to tables
1777
1831
  for command in self._copy_from_csv_commands:
1778
- table_part = command.split(' ')[3]
1832
+ table_part = command.split(" ")[3]
1779
1833
  import_call += f'echo "Importing {table_part}..."\n'
1780
1834
  if {self.db_password}:
1781
1835
  # set password variable inline
1782
- import_call += f'PGPASSWORD={self.db_password} '
1836
+ import_call += f"PGPASSWORD={self.db_password} "
1783
1837
  import_call += f'{self.import_call_bin_prefix}psql -c "{command}"'
1784
- import_call += f' --dbname {self.db_name}'
1785
- import_call += f' --port {self.db_port}'
1786
- import_call += f' --user {self.db_user}'
1838
+ import_call += f" --dbname {self.db_name}"
1839
+ import_call += f" --host {self.db_host}"
1840
+ import_call += f" --port {self.db_port}"
1841
+ import_call += f" --user {self.db_user}"
1787
1842
  import_call += '\necho "Done!"\n'
1788
- import_call += '\n'
1843
+ import_call += "\n"
1789
1844
 
1790
1845
  return import_call
1791
1846
 
1792
1847
 
1793
1848
  DBMS_TO_CLASS = {
1794
- 'neo': _Neo4jBatchWriter,
1795
- 'neo4j': _Neo4jBatchWriter,
1796
- 'Neo4j': _Neo4jBatchWriter,
1797
- 'postgres': _PostgreSQLBatchWriter,
1798
- 'postgresql': _PostgreSQLBatchWriter,
1799
- 'PostgreSQL': _PostgreSQLBatchWriter,
1800
- 'arango': _ArangoDBBatchWriter,
1801
- 'arangodb': _ArangoDBBatchWriter,
1802
- 'ArangoDB': _ArangoDBBatchWriter,
1849
+ "neo": _Neo4jBatchWriter,
1850
+ "neo4j": _Neo4jBatchWriter,
1851
+ "Neo4j": _Neo4jBatchWriter,
1852
+ "postgres": _PostgreSQLBatchWriter,
1853
+ "postgresql": _PostgreSQLBatchWriter,
1854
+ "PostgreSQL": _PostgreSQLBatchWriter,
1855
+ "arango": _ArangoDBBatchWriter,
1856
+ "arangodb": _ArangoDBBatchWriter,
1857
+ "ArangoDB": _ArangoDBBatchWriter,
1803
1858
  }
1804
1859
 
1805
1860
 
1806
1861
  def get_writer(
1807
1862
  dbms: str,
1808
- translator: 'Translator',
1809
- ontology: 'Ontology',
1810
- deduplicator: 'Deduplicator',
1863
+ translator: "Translator",
1864
+ deduplicator: "Deduplicator",
1811
1865
  output_directory: str,
1812
1866
  strict_mode: bool,
1813
1867
  ):
@@ -1821,8 +1875,6 @@ def get_writer(
1821
1875
 
1822
1876
  translator: the Translator object.
1823
1877
 
1824
- ontology: the Ontology object.
1825
-
1826
1878
  output_directory: the directory to write the output files to.
1827
1879
 
1828
1880
  strict_mode: whether to use strict mode.
@@ -1835,34 +1887,35 @@ def get_writer(
1835
1887
 
1836
1888
  dbms_config = _config(dbms)
1837
1889
 
1838
- timestamp = lambda: datetime.now().strftime('%Y%m%d%H%M%S')
1839
- outdir = output_directory or os.path.join('biocypher-out', timestamp())
1890
+ timestamp = lambda: datetime.now().strftime("%Y%m%d%H%M%S")
1891
+ outdir = output_directory or os.path.join("biocypher-out", timestamp())
1840
1892
  outdir = os.path.abspath(outdir)
1841
1893
 
1842
1894
  writer = DBMS_TO_CLASS[dbms]
1843
1895
 
1844
1896
  if not writer:
1845
- raise ValueError(f'Unknown dbms: {dbms}')
1897
+ raise ValueError(f"Unknown dbms: {dbms}")
1846
1898
 
1847
1899
  if writer is not None:
1848
1900
  return writer(
1849
- ontology=ontology,
1850
1901
  translator=translator,
1851
1902
  deduplicator=deduplicator,
1852
- delimiter=dbms_config.get('delimiter'),
1853
- array_delimiter=dbms_config.get('array_delimiter'),
1854
- quote=dbms_config.get('quote_character'),
1903
+ delimiter=dbms_config.get("delimiter"),
1904
+ array_delimiter=dbms_config.get("array_delimiter"),
1905
+ quote=dbms_config.get("quote_character"),
1855
1906
  output_directory=outdir,
1856
- db_name=dbms_config.get('database_name'),
1857
- import_call_bin_prefix=dbms_config.get('import_call_bin_prefix'),
1858
- import_call_file_prefix=dbms_config.get('import_call_file_prefix'),
1859
- wipe=dbms_config.get('wipe'),
1907
+ db_name=dbms_config.get("database_name"),
1908
+ import_call_bin_prefix=dbms_config.get("import_call_bin_prefix"),
1909
+ import_call_file_prefix=dbms_config.get("import_call_file_prefix"),
1910
+ wipe=dbms_config.get("wipe"),
1860
1911
  strict_mode=strict_mode,
1861
- skip_bad_relationships=dbms_config.get('skip_bad_relationships'
1862
- ), # neo4j
1863
- skip_duplicate_nodes=dbms_config.get('skip_duplicate_nodes'
1864
- ), # neo4j
1865
- db_user=dbms_config.get('user'), # psql
1866
- db_password=dbms_config.get('password'), # psql
1867
- db_port=dbms_config.get('port'), # psql
1912
+ skip_bad_relationships=dbms_config.get(
1913
+ "skip_bad_relationships"
1914
+ ), # neo4j
1915
+ skip_duplicate_nodes=dbms_config.get(
1916
+ "skip_duplicate_nodes"
1917
+ ), # neo4j
1918
+ db_user=dbms_config.get("user"), # psql
1919
+ db_password=dbms_config.get("password"), # psql
1920
+ db_port=dbms_config.get("port"), # psql
1868
1921
  )
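
Taken together, the visible interface change in this release is that the batch writers no longer take an `ontology` argument: `get_writer` now accepts only `translator` and `deduplicator`, and the writer reaches the schema and ontology through `translator.ontology` (e.g. `translator.ontology.mapping.extended_schema`). Below is a minimal usage sketch of the new signature; the `translator`, `deduplicator`, and node/edge generators are placeholders assumed to exist already, not the library's documented setup.

from biocypher._write import get_writer

# Sketch only: `translator`, `deduplicator`, `node_generator` and
# `edge_generator` are assumed to be constructed elsewhere; their setup
# is outside the scope of this diff.
writer = get_writer(
    dbms="neo4j",                   # other keys in DBMS_TO_CLASS: "postgres", "arango", ...
    translator=translator,          # ontology is now reached via translator.ontology
    deduplicator=deduplicator,
    output_directory="biocypher-out/example",
    strict_mode=False,
)

# Batch-write CSV part files and the per-label header files.
writer.write_nodes(node_generator)
writer.write_edges(edge_generator)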