biocypher 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biocypher might be problematic. Click here for more details.
- biocypher/_config/biocypher_config.yaml +21 -4
- biocypher/_metadata.py +1 -1
- biocypher/_ontology.py +144 -51
- biocypher/_translate.py +84 -79
- biocypher/output/write/_batch_writer.py +133 -52
- biocypher/output/write/_get_writer.py +28 -11
- biocypher/output/write/_writer.py +32 -14
- biocypher/output/write/graph/_arangodb.py +44 -32
- biocypher/output/write/graph/_neo4j.py +3 -4
- biocypher/output/write/graph/_owl.py +569 -0
- biocypher/output/write/graph/_rdf.py +234 -97
- {biocypher-0.7.0.dist-info → biocypher-0.9.0.dist-info}/METADATA +1 -1
- {biocypher-0.7.0.dist-info → biocypher-0.9.0.dist-info}/RECORD +15 -14
- {biocypher-0.7.0.dist-info → biocypher-0.9.0.dist-info}/LICENSE +0 -0
- {biocypher-0.7.0.dist-info → biocypher-0.9.0.dist-info}/WHEEL +0 -0
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
"""Abstract base class for all batch writers."""
|
|
2
|
+
|
|
1
3
|
import glob
|
|
2
4
|
import os
|
|
3
5
|
import re
|
|
@@ -16,30 +18,37 @@ from biocypher.output.write._writer import _Writer
|
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
class _BatchWriter(_Writer, ABC):
|
|
19
|
-
"""Abstract batch writer class"""
|
|
21
|
+
"""Abstract batch writer class."""
|
|
20
22
|
|
|
21
23
|
@abstractmethod
|
|
22
24
|
def _quote_string(self, value: str) -> str:
|
|
23
|
-
"""
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
"""Quote a string.
|
|
26
|
+
|
|
27
|
+
Escaping is handled by the database-specific writer.
|
|
28
|
+
"""
|
|
29
|
+
msg = "Database writer must override '_quote_string'"
|
|
30
|
+
logger.error(msg)
|
|
31
|
+
raise NotImplementedError(msg)
|
|
27
32
|
|
|
28
33
|
@abstractmethod
|
|
29
34
|
def _get_default_import_call_bin_prefix(self):
|
|
30
|
-
"""
|
|
35
|
+
"""Provide the default string for the import call bin prefix.
|
|
31
36
|
|
|
32
37
|
Returns
|
|
33
38
|
-------
|
|
34
39
|
str: The database-specific string for the path to the import call bin prefix
|
|
35
40
|
|
|
36
41
|
"""
|
|
37
|
-
|
|
42
|
+
msg = "Database writer must override '_get_default_import_call_bin_prefix'"
|
|
43
|
+
logger.error(msg)
|
|
44
|
+
raise NotImplementedError(msg)
|
|
38
45
|
|
|
39
46
|
@abstractmethod
|
|
40
47
|
def _write_array_string(self, string_list):
|
|
41
|
-
"""
|
|
42
|
-
|
|
48
|
+
"""Write the string representation of an array into a .csv file.
|
|
49
|
+
|
|
50
|
+
Different databases require different formats of array to optimize
|
|
51
|
+
import speed.
|
|
43
52
|
|
|
44
53
|
Args:
|
|
45
54
|
----
|
|
@@ -50,50 +59,65 @@ class _BatchWriter(_Writer, ABC):
|
|
|
50
59
|
str: The database-specific string representation of an array
|
|
51
60
|
|
|
52
61
|
"""
|
|
53
|
-
|
|
62
|
+
msg = "Database writer must override '_write_array_string'"
|
|
63
|
+
logger.error(msg)
|
|
64
|
+
raise NotImplementedError(msg)
|
|
54
65
|
|
|
55
66
|
@abstractmethod
|
|
56
67
|
def _write_node_headers(self):
|
|
57
|
-
"""
|
|
58
|
-
|
|
68
|
+
"""Write header files for nodes.
|
|
69
|
+
|
|
70
|
+
Write header files (node properties) for nodes as per the
|
|
71
|
+
definition in the `schema_config.yaml`.
|
|
59
72
|
|
|
60
73
|
Returns
|
|
61
74
|
-------
|
|
62
75
|
bool: The return value. True for success, False otherwise.
|
|
63
76
|
|
|
64
77
|
"""
|
|
65
|
-
|
|
78
|
+
msg = "Database writer must override '_write_node_headers'"
|
|
79
|
+
logger.error(msg)
|
|
80
|
+
raise NotImplementedError(msg)
|
|
66
81
|
|
|
67
82
|
@abstractmethod
|
|
68
83
|
def _write_edge_headers(self):
|
|
69
|
-
"""
|
|
70
|
-
|
|
71
|
-
|
|
84
|
+
"""Write a database import-file for an edge.
|
|
85
|
+
|
|
86
|
+
Write a database import-file for an edge as per the definition in
|
|
87
|
+
the `schema_config.yaml`, containing only the header for this type
|
|
88
|
+
of edge.
|
|
72
89
|
|
|
73
90
|
Returns
|
|
74
91
|
-------
|
|
75
92
|
bool: The return value. True for success, False otherwise.
|
|
76
93
|
|
|
77
94
|
"""
|
|
78
|
-
|
|
95
|
+
msg = "Database writer must override '_write_edge_headers'"
|
|
96
|
+
logger.error(msg)
|
|
97
|
+
raise NotImplementedError(msg)
|
|
79
98
|
|
|
80
99
|
@abstractmethod
|
|
81
100
|
def _construct_import_call(self) -> str:
|
|
82
|
-
"""
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
101
|
+
"""Construct the import call.
|
|
102
|
+
|
|
103
|
+
Construct the import call detailing folder and individual node and
|
|
104
|
+
edge headers and data files, as well as delimiters and database name.
|
|
105
|
+
Built after all data has been processed to ensure that nodes are
|
|
106
|
+
called before any edges.
|
|
86
107
|
|
|
87
108
|
Returns
|
|
88
109
|
-------
|
|
89
110
|
str: A bash command for csv import.
|
|
90
111
|
|
|
91
112
|
"""
|
|
92
|
-
|
|
113
|
+
msg = "Database writer must override '_construct_import_call'"
|
|
114
|
+
logger.error(msg)
|
|
115
|
+
raise NotImplementedError(msg)
|
|
93
116
|
|
|
94
117
|
@abstractmethod
|
|
95
118
|
def _get_import_script_name(self) -> str:
|
|
96
|
-
"""
|
|
119
|
+
"""Return the name of the import script.
|
|
120
|
+
|
|
97
121
|
The name will be chosen based on the used database.
|
|
98
122
|
|
|
99
123
|
Returns
|
|
@@ -101,7 +125,9 @@ class _BatchWriter(_Writer, ABC):
|
|
|
101
125
|
str: The name of the import script (ending in .sh)
|
|
102
126
|
|
|
103
127
|
"""
|
|
104
|
-
|
|
128
|
+
msg = "Database writer must override '_get_import_script_name'"
|
|
129
|
+
logger.error(msg)
|
|
130
|
+
raise NotImplementedError(msg)
|
|
105
131
|
|
|
106
132
|
def __init__(
|
|
107
133
|
self,
|
|
@@ -122,10 +148,14 @@ class _BatchWriter(_Writer, ABC):
|
|
|
122
148
|
db_password: str = None,
|
|
123
149
|
db_host: str = None,
|
|
124
150
|
db_port: str = None,
|
|
125
|
-
|
|
151
|
+
file_format: str = None,
|
|
126
152
|
rdf_namespaces: dict = {},
|
|
153
|
+
labels_order: str = "Ascending",
|
|
154
|
+
**kwargs,
|
|
127
155
|
):
|
|
128
|
-
"""
|
|
156
|
+
"""Write node and edge representations to disk.
|
|
157
|
+
|
|
158
|
+
Abstract parent class for writing node and edge representations to disk
|
|
129
159
|
using the format specified by each database type. The database-specific
|
|
130
160
|
functions are implemented by the respective child-classes. This abstract
|
|
131
161
|
class contains all methods expected by a batch writer instance, some of
|
|
@@ -179,7 +209,8 @@ class _BatchWriter(_Writer, ABC):
|
|
|
179
209
|
call.
|
|
180
210
|
|
|
181
211
|
wipe:
|
|
182
|
-
Whether to force import (removing existing DB content).
|
|
212
|
+
Whether to force import (removing existing DB content).
|
|
213
|
+
(Specific to Neo4j.)
|
|
183
214
|
|
|
184
215
|
strict_mode:
|
|
185
216
|
Whether to enforce source, version, and license properties.
|
|
@@ -203,12 +234,16 @@ class _BatchWriter(_Writer, ABC):
|
|
|
203
234
|
db_port:
|
|
204
235
|
The database port.
|
|
205
236
|
|
|
206
|
-
|
|
237
|
+
file_format:
|
|
207
238
|
The format of RDF.
|
|
208
239
|
|
|
209
240
|
rdf_namespaces:
|
|
210
241
|
The namespaces for RDF.
|
|
211
242
|
|
|
243
|
+
labels_order:
|
|
244
|
+
The order of labels, to reflect the hierarchy (or not).
|
|
245
|
+
Default: "Ascending" (from more specific to more generic).
|
|
246
|
+
|
|
212
247
|
"""
|
|
213
248
|
super().__init__(
|
|
214
249
|
translator=translator,
|
|
@@ -221,7 +256,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
221
256
|
self.db_password = db_password
|
|
222
257
|
self.db_host = db_host or "localhost"
|
|
223
258
|
self.db_port = db_port
|
|
224
|
-
self.
|
|
259
|
+
self.file_format = file_format
|
|
225
260
|
self.rdf_namespaces = rdf_namespaces
|
|
226
261
|
|
|
227
262
|
self.delim, self.escaped_delim = self._process_delimiter(delimiter)
|
|
@@ -251,6 +286,15 @@ class _BatchWriter(_Writer, ABC):
|
|
|
251
286
|
|
|
252
287
|
self.parts = {} # dict to store the paths of part files for each label
|
|
253
288
|
|
|
289
|
+
self._labels_orders = ["Alphabetical", "Ascending", "Descending", "Leaves"]
|
|
290
|
+
if labels_order not in self._labels_orders:
|
|
291
|
+
msg = (
|
|
292
|
+
f"neo4j's 'labels_order' parameter cannot be '{labels_order}',"
|
|
293
|
+
"must be one of: {' ,'.join(self._labels_orders)}",
|
|
294
|
+
)
|
|
295
|
+
raise ValueError(msg)
|
|
296
|
+
self.labels_order = labels_order
|
|
297
|
+
|
|
254
298
|
# TODO not memory efficient, but should be fine for most cases; is
|
|
255
299
|
# there a more elegant solution?
|
|
256
300
|
|
|
@@ -263,8 +307,16 @@ class _BatchWriter(_Writer, ABC):
|
|
|
263
307
|
return self._import_call_file_prefix
|
|
264
308
|
|
|
265
309
|
def _process_delimiter(self, delimiter: str) -> str:
|
|
266
|
-
"""
|
|
267
|
-
|
|
310
|
+
"""Process a delimiter to escape correctly.
|
|
311
|
+
|
|
312
|
+
Args:
|
|
313
|
+
----
|
|
314
|
+
delimiter (str): The delimiter to process.
|
|
315
|
+
|
|
316
|
+
Returns:
|
|
317
|
+
-------
|
|
318
|
+
tuple: The delimiter and its escaped representation.
|
|
319
|
+
|
|
268
320
|
"""
|
|
269
321
|
if delimiter == "\\t":
|
|
270
322
|
return "\t", "\\t"
|
|
@@ -273,7 +325,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
273
325
|
return delimiter, delimiter
|
|
274
326
|
|
|
275
327
|
def write_nodes(self, nodes, batch_size: int = int(1e6), force: bool = False):
|
|
276
|
-
"""
|
|
328
|
+
"""Write nodes and their headers.
|
|
277
329
|
|
|
278
330
|
Args:
|
|
279
331
|
----
|
|
@@ -311,7 +363,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
311
363
|
edges: list | GeneratorType,
|
|
312
364
|
batch_size: int = int(1e6),
|
|
313
365
|
) -> bool:
|
|
314
|
-
"""
|
|
366
|
+
"""Write edges and their headers.
|
|
315
367
|
|
|
316
368
|
Args:
|
|
317
369
|
----
|
|
@@ -373,12 +425,13 @@ class _BatchWriter(_Writer, ABC):
|
|
|
373
425
|
return True
|
|
374
426
|
|
|
375
427
|
def _write_node_data(self, nodes, batch_size, force: bool = False):
|
|
376
|
-
"""
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
to the
|
|
381
|
-
|
|
428
|
+
"""Write biocypher nodes to CSV.
|
|
429
|
+
|
|
430
|
+
Conforms to the headers created with `_write_node_headers()`, and
|
|
431
|
+
is actually required to be run before calling `_write_node_headers()`
|
|
432
|
+
to set the :py:attr:`self.node_property_dict` for passing the node
|
|
433
|
+
properties to the instance. Expects list or generator of nodes from
|
|
434
|
+
the :py:class:`BioCypherNode` class.
|
|
382
435
|
|
|
383
436
|
Args:
|
|
384
437
|
----
|
|
@@ -472,8 +525,26 @@ class _BatchWriter(_Writer, ABC):
|
|
|
472
525
|
all_labels = [self.translator.name_sentence_to_pascal(label) for label in all_labels]
|
|
473
526
|
# remove duplicates
|
|
474
527
|
all_labels = list(OrderedDict.fromkeys(all_labels))
|
|
475
|
-
|
|
476
|
-
|
|
528
|
+
match self.labels_order:
|
|
529
|
+
case "Ascending":
|
|
530
|
+
pass # Default from get_ancestors.
|
|
531
|
+
case "Alphabetical":
|
|
532
|
+
all_labels.sort()
|
|
533
|
+
case "Descending":
|
|
534
|
+
all_labels.reverse()
|
|
535
|
+
case "Leaves":
|
|
536
|
+
if len(all_labels) < 1:
|
|
537
|
+
msg = "Labels list cannot be empty when using 'Leaves' order."
|
|
538
|
+
raise ValueError(msg)
|
|
539
|
+
all_labels = [all_labels[0]]
|
|
540
|
+
case _:
|
|
541
|
+
# In case someone touched _labels_orders after constructor.
|
|
542
|
+
if self.labels_order not in self._labels_orders:
|
|
543
|
+
msg = (
|
|
544
|
+
f"Invalid labels_order: {self.labels_order}. "
|
|
545
|
+
f"Must be one of {self._labels_orders}"
|
|
546
|
+
)
|
|
547
|
+
raise ValueError(msg)
|
|
477
548
|
# concatenate with array delimiter
|
|
478
549
|
all_labels = self._write_array_string(all_labels)
|
|
479
550
|
else:
|
|
@@ -539,7 +610,9 @@ class _BatchWriter(_Writer, ABC):
|
|
|
539
610
|
prop_dict: dict,
|
|
540
611
|
labels: str,
|
|
541
612
|
):
|
|
542
|
-
"""
|
|
613
|
+
"""Write a list of biocypher nodes to a CSV file.
|
|
614
|
+
|
|
615
|
+
This function takes one list of biocypher nodes and writes them
|
|
543
616
|
to a Neo4j admin import compatible CSV file.
|
|
544
617
|
|
|
545
618
|
Args:
|
|
@@ -623,7 +696,9 @@ class _BatchWriter(_Writer, ABC):
|
|
|
623
696
|
return True
|
|
624
697
|
|
|
625
698
|
def _write_edge_data(self, edges, batch_size):
|
|
626
|
-
"""
|
|
699
|
+
"""Write biocypher edges to CSV.
|
|
700
|
+
|
|
701
|
+
Writes biocypher edges to CSV conforming to the headers created
|
|
627
702
|
with `_write_edge_headers()`, and is actually required to be run
|
|
628
703
|
before calling `_write_edge_headers()` to set the
|
|
629
704
|
:py:attr:`self.edge_property_dict` for passing the edge
|
|
@@ -772,7 +847,9 @@ class _BatchWriter(_Writer, ABC):
|
|
|
772
847
|
label: str,
|
|
773
848
|
prop_dict: dict,
|
|
774
849
|
):
|
|
775
|
-
"""
|
|
850
|
+
"""Write a list of biocypher edges to a CSV file.
|
|
851
|
+
|
|
852
|
+
This function takes one list of biocypher edges and writes them
|
|
776
853
|
to a Neo4j admin import compatible CSV file.
|
|
777
854
|
|
|
778
855
|
Args:
|
|
@@ -891,7 +968,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
891
968
|
return True
|
|
892
969
|
|
|
893
970
|
def _write_next_part(self, label: str, lines: list):
|
|
894
|
-
"""
|
|
971
|
+
"""Write a list of strings to a new part file.
|
|
895
972
|
|
|
896
973
|
Args:
|
|
897
974
|
----
|
|
@@ -943,9 +1020,10 @@ class _BatchWriter(_Writer, ABC):
|
|
|
943
1020
|
self.parts[label].append(part)
|
|
944
1021
|
|
|
945
1022
|
def get_import_call(self) -> str:
|
|
946
|
-
"""
|
|
947
|
-
|
|
948
|
-
|
|
1023
|
+
"""Return the import call.
|
|
1024
|
+
|
|
1025
|
+
Return the import call detailing folder and individual node and
|
|
1026
|
+
edge headers and data files, as well as delimiters and database name.
|
|
949
1027
|
|
|
950
1028
|
Returns
|
|
951
1029
|
-------
|
|
@@ -955,7 +1033,9 @@ class _BatchWriter(_Writer, ABC):
|
|
|
955
1033
|
return self._construct_import_call()
|
|
956
1034
|
|
|
957
1035
|
def write_import_call(self) -> str:
|
|
958
|
-
"""
|
|
1036
|
+
"""Write the import call.
|
|
1037
|
+
|
|
1038
|
+
Function to write the import call detailing folder and
|
|
959
1039
|
individual node and edge headers and data files, as well as
|
|
960
1040
|
delimiters and database name, to the export folder as txt.
|
|
961
1041
|
|
|
@@ -974,9 +1054,10 @@ class _BatchWriter(_Writer, ABC):
|
|
|
974
1054
|
|
|
975
1055
|
|
|
976
1056
|
def parse_label(label: str) -> str:
|
|
977
|
-
"""Check if the label is compliant with Neo4j naming conventions
|
|
978
|
-
|
|
979
|
-
|
|
1057
|
+
"""Check if the label is compliant with Neo4j naming conventions.
|
|
1058
|
+
|
|
1059
|
+
Check against https://neo4j.com/docs/cypher-manual/current/syntax/naming/,
|
|
1060
|
+
and if not compliant, remove non-compliant characters.
|
|
980
1061
|
|
|
981
1062
|
Args:
|
|
982
1063
|
----
|
|
@@ -1,15 +1,18 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
3
|
-
|
|
1
|
+
"""Module to provide one of the available writer classes.
|
|
2
|
+
|
|
3
|
+
The writer classes are responsible for writing the node and edge representations
|
|
4
|
+
to disk in a format suitable for import into a DBMS.
|
|
4
5
|
"""
|
|
5
6
|
|
|
6
7
|
from typing import TYPE_CHECKING
|
|
7
8
|
|
|
8
9
|
from biocypher._config import config as _config
|
|
9
10
|
from biocypher._logger import logger
|
|
11
|
+
from biocypher.output.write._batch_writer import _BatchWriter
|
|
10
12
|
from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
|
|
11
13
|
from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
|
|
12
14
|
from biocypher.output.write.graph._networkx import _NetworkXWriter
|
|
15
|
+
from biocypher.output.write.graph._owl import _OWLWriter
|
|
13
16
|
from biocypher.output.write.graph._rdf import _RDFWriter
|
|
14
17
|
from biocypher.output.write.relational._csv import _PandasCSVWriter
|
|
15
18
|
from biocypher.output.write.relational._postgresql import _PostgreSQLBatchWriter
|
|
@@ -37,6 +40,8 @@ DBMS_TO_CLASS = {
|
|
|
37
40
|
"sqlite3": _SQLiteBatchWriter,
|
|
38
41
|
"rdf": _RDFWriter,
|
|
39
42
|
"RDF": _RDFWriter,
|
|
43
|
+
"owl": _OWLWriter,
|
|
44
|
+
"OWL": _OWLWriter,
|
|
40
45
|
"csv": _PandasCSVWriter,
|
|
41
46
|
"CSV": _PandasCSVWriter,
|
|
42
47
|
"pandas": _PandasCSVWriter,
|
|
@@ -54,12 +59,11 @@ def get_writer(
|
|
|
54
59
|
deduplicator: "Deduplicator",
|
|
55
60
|
output_directory: str,
|
|
56
61
|
strict_mode: bool,
|
|
57
|
-
):
|
|
58
|
-
"""
|
|
59
|
-
Function to return the writer class based on the selection in the config
|
|
60
|
-
file.
|
|
62
|
+
) -> _BatchWriter | None:
|
|
63
|
+
"""Return the writer class based on the selection in the config file.
|
|
61
64
|
|
|
62
65
|
Args:
|
|
66
|
+
----
|
|
63
67
|
dbms: the database management system; for options, see DBMS_TO_CLASS.
|
|
64
68
|
translator: the Translator object.
|
|
65
69
|
deduplicator: the Deduplicator object.
|
|
@@ -67,15 +71,26 @@ def get_writer(
|
|
|
67
71
|
strict_mode: whether to use strict mode.
|
|
68
72
|
|
|
69
73
|
Returns:
|
|
74
|
+
-------
|
|
70
75
|
instance: an instance of the selected writer class.
|
|
71
|
-
"""
|
|
72
76
|
|
|
77
|
+
"""
|
|
73
78
|
dbms_config = _config(dbms)
|
|
74
79
|
|
|
75
80
|
writer = DBMS_TO_CLASS[dbms]
|
|
76
81
|
|
|
82
|
+
if "rdf_format" in dbms_config:
|
|
83
|
+
logger.warning("The 'rdf_format' config option is deprecated, use 'file_format' instead.")
|
|
84
|
+
if "file_format" not in dbms_config:
|
|
85
|
+
format = dbms_config["rdf_format"]
|
|
86
|
+
logger.warning(f"I will set 'file_format: {format}' for you.")
|
|
87
|
+
dbms_config["file_format"] = format
|
|
88
|
+
dbms_config.pop("rdf_format")
|
|
89
|
+
logger.warning("NOTE: this warning will become an error in next versions.")
|
|
90
|
+
|
|
77
91
|
if not writer:
|
|
78
|
-
|
|
92
|
+
msg = f"Unknown dbms: {dbms}"
|
|
93
|
+
raise ValueError(msg)
|
|
79
94
|
|
|
80
95
|
if writer is not None:
|
|
81
96
|
return writer(
|
|
@@ -95,6 +110,8 @@ def get_writer(
|
|
|
95
110
|
db_user=dbms_config.get("user"), # psql
|
|
96
111
|
db_password=dbms_config.get("password"), # psql
|
|
97
112
|
db_port=dbms_config.get("port"), # psql
|
|
98
|
-
|
|
99
|
-
rdf_namespaces=dbms_config.get("rdf_namespaces"), # rdf
|
|
113
|
+
file_format=dbms_config.get("file_format"), # rdf, owl
|
|
114
|
+
rdf_namespaces=dbms_config.get("rdf_namespaces"), # rdf, owl
|
|
115
|
+
edge_model=dbms_config.get("edge_model"), # owl
|
|
100
116
|
)
|
|
117
|
+
return None
|
|
@@ -2,7 +2,6 @@ import os
|
|
|
2
2
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
4
|
from collections.abc import Iterable
|
|
5
|
-
from typing import Optional, Union
|
|
6
5
|
|
|
7
6
|
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
8
7
|
from biocypher._deduplicate import Deduplicator
|
|
@@ -23,26 +22,28 @@ class _Writer(ABC):
|
|
|
23
22
|
- _get_import_script_name
|
|
24
23
|
|
|
25
24
|
Args:
|
|
25
|
+
----
|
|
26
26
|
translator (Translator): Instance of :py:class:`Translator` to enable translation of
|
|
27
27
|
nodes and manipulation of properties.
|
|
28
28
|
deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
|
|
29
29
|
of nodes and edges.
|
|
30
30
|
output_directory (str, optional): Path for exporting CSV files. Defaults to None.
|
|
31
31
|
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
|
|
32
|
-
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
|
|
33
32
|
|
|
34
33
|
Raises:
|
|
34
|
+
------
|
|
35
35
|
NotImplementedError: Writer implementation must override '_write_node_data'
|
|
36
36
|
NotImplementedError: Writer implementation must override '_write_edge_data'
|
|
37
37
|
NotImplementedError: Writer implementation must override '_construct_import_call'
|
|
38
38
|
NotImplementedError: Writer implementation must override '_get_import_script_name'
|
|
39
|
+
|
|
39
40
|
"""
|
|
40
41
|
|
|
41
42
|
def __init__(
|
|
42
43
|
self,
|
|
43
44
|
translator: Translator,
|
|
44
45
|
deduplicator: Deduplicator,
|
|
45
|
-
output_directory:
|
|
46
|
+
output_directory: str | None = None,
|
|
46
47
|
strict_mode: bool = False,
|
|
47
48
|
*args,
|
|
48
49
|
**kwargs,
|
|
@@ -50,13 +51,14 @@ class _Writer(ABC):
|
|
|
50
51
|
"""Abstract class for writing node and edge representations to disk.
|
|
51
52
|
|
|
52
53
|
Args:
|
|
54
|
+
----
|
|
53
55
|
translator (Translator): Instance of :py:class:`Translator` to enable translation of
|
|
54
56
|
nodes and manipulation of properties.
|
|
55
57
|
deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to enable deduplication
|
|
56
58
|
of nodes and edges.
|
|
57
59
|
output_directory (str, optional): Path for exporting CSV files. Defaults to None.
|
|
58
60
|
strict_mode (bool, optional): Whether to enforce source, version, and license properties. Defaults to False.
|
|
59
|
-
|
|
61
|
+
|
|
60
62
|
"""
|
|
61
63
|
self.translator = translator
|
|
62
64
|
self.deduplicator = deduplicator
|
|
@@ -67,7 +69,7 @@ class _Writer(ABC):
|
|
|
67
69
|
if kwargs.get("write_to_file", True):
|
|
68
70
|
logger.warning(
|
|
69
71
|
f"Output directory `{self.output_directory}` already exists. "
|
|
70
|
-
"If this is not planned, file consistency may be compromised."
|
|
72
|
+
"If this is not planned, file consistency may be compromised.",
|
|
71
73
|
)
|
|
72
74
|
else:
|
|
73
75
|
logger.info(f"Creating output directory `{self.output_directory}`.")
|
|
@@ -76,43 +78,50 @@ class _Writer(ABC):
|
|
|
76
78
|
@abstractmethod
|
|
77
79
|
def _write_node_data(
|
|
78
80
|
self,
|
|
79
|
-
nodes: Iterable[
|
|
81
|
+
nodes: Iterable[BioCypherNode | BioCypherEdge | BioCypherRelAsNode],
|
|
80
82
|
) -> bool:
|
|
81
83
|
"""Implement how to output.write nodes to disk.
|
|
82
84
|
|
|
83
85
|
Args:
|
|
86
|
+
----
|
|
84
87
|
nodes (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
|
|
85
88
|
|
|
86
89
|
Returns:
|
|
90
|
+
-------
|
|
87
91
|
bool: The return value. True for success, False otherwise.
|
|
92
|
+
|
|
88
93
|
"""
|
|
89
94
|
raise NotImplementedError("Writer implementation must override 'write_nodes'")
|
|
90
95
|
|
|
91
96
|
@abstractmethod
|
|
92
97
|
def _write_edge_data(
|
|
93
98
|
self,
|
|
94
|
-
edges: Iterable[
|
|
99
|
+
edges: Iterable[BioCypherNode | BioCypherEdge | BioCypherRelAsNode],
|
|
95
100
|
) -> bool:
|
|
96
101
|
"""Implement how to output.write edges to disk.
|
|
97
102
|
|
|
98
103
|
Args:
|
|
104
|
+
----
|
|
99
105
|
edges (Iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
|
|
100
106
|
|
|
101
107
|
Returns:
|
|
108
|
+
-------
|
|
102
109
|
bool: The return value. True for success, False otherwise.
|
|
110
|
+
|
|
103
111
|
"""
|
|
104
112
|
raise NotImplementedError("Writer implementation must override 'write_edges'")
|
|
105
113
|
|
|
106
114
|
@abstractmethod
|
|
107
115
|
def _construct_import_call(self) -> str:
|
|
108
|
-
"""
|
|
109
|
-
Function to construct the import call detailing folder and
|
|
116
|
+
"""Function to construct the import call detailing folder and
|
|
110
117
|
individual node and edge headers and data files, as well as
|
|
111
118
|
delimiters and database name. Built after all data has been
|
|
112
119
|
processed to ensure that nodes are called before any edges.
|
|
113
120
|
|
|
114
|
-
Returns
|
|
121
|
+
Returns
|
|
122
|
+
-------
|
|
115
123
|
str: command for importing the output files into a DBMS.
|
|
124
|
+
|
|
116
125
|
"""
|
|
117
126
|
raise NotImplementedError("Writer implementation must override '_construct_import_call'")
|
|
118
127
|
|
|
@@ -120,8 +129,10 @@ class _Writer(ABC):
|
|
|
120
129
|
def _get_import_script_name(self) -> str:
|
|
121
130
|
"""Returns the name of the import script.
|
|
122
131
|
|
|
123
|
-
Returns
|
|
132
|
+
Returns
|
|
133
|
+
-------
|
|
124
134
|
str: The name of the import script (ending in .sh)
|
|
135
|
+
|
|
125
136
|
"""
|
|
126
137
|
raise NotImplementedError("Writer implementation must override '_get_import_script_name'")
|
|
127
138
|
|
|
@@ -129,6 +140,7 @@ class _Writer(ABC):
|
|
|
129
140
|
"""Wrapper for writing nodes.
|
|
130
141
|
|
|
131
142
|
Args:
|
|
143
|
+
----
|
|
132
144
|
nodes (BioCypherNode): a list or generator of nodes in
|
|
133
145
|
:py:class:`BioCypherNode` format
|
|
134
146
|
batch_size (int): The batch size for writing nodes.
|
|
@@ -136,7 +148,9 @@ class _Writer(ABC):
|
|
|
136
148
|
not present in the schema.
|
|
137
149
|
|
|
138
150
|
Returns:
|
|
151
|
+
-------
|
|
139
152
|
bool: The return value. True for success, False otherwise.
|
|
153
|
+
|
|
140
154
|
"""
|
|
141
155
|
passed = self._write_node_data(nodes)
|
|
142
156
|
if not passed:
|
|
@@ -148,6 +162,7 @@ class _Writer(ABC):
|
|
|
148
162
|
"""Wrapper for writing edges.
|
|
149
163
|
|
|
150
164
|
Args:
|
|
165
|
+
----
|
|
151
166
|
nodes (BioCypherNode): a list or generator of nodes in
|
|
152
167
|
:py:class:`BioCypherNode` format
|
|
153
168
|
batch_size (int): The batch size for writing nodes.
|
|
@@ -155,7 +170,9 @@ class _Writer(ABC):
|
|
|
155
170
|
not present in the schema.
|
|
156
171
|
|
|
157
172
|
Returns:
|
|
173
|
+
-------
|
|
158
174
|
bool: The return value. True for success, False otherwise.
|
|
175
|
+
|
|
159
176
|
"""
|
|
160
177
|
passed = self._write_edge_data(edges)
|
|
161
178
|
if not passed:
|
|
@@ -164,13 +181,14 @@ class _Writer(ABC):
|
|
|
164
181
|
return True
|
|
165
182
|
|
|
166
183
|
def write_import_call(self):
|
|
167
|
-
"""
|
|
168
|
-
Function to output.write the import call detailing folder and
|
|
184
|
+
"""Function to output.write the import call detailing folder and
|
|
169
185
|
individual node and edge headers and data files, as well as
|
|
170
186
|
delimiters and database name, to the export folder as txt.
|
|
171
187
|
|
|
172
|
-
Returns
|
|
188
|
+
Returns
|
|
189
|
+
-------
|
|
173
190
|
str: The path of the file holding the import call.
|
|
191
|
+
|
|
174
192
|
"""
|
|
175
193
|
file_path = os.path.join(self.output_directory, self._get_import_script_name())
|
|
176
194
|
logger.info(f"Writing {self.__class__.__name__} import call to `{file_path}`.")
|