biocypher 0.5.39__py3-none-any.whl → 0.5.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biocypher might be problematic. Click here for more details.
- biocypher/_config/biocypher_config.yaml +18 -8
- biocypher/_connect.py +36 -9
- biocypher/_core.py +7 -3
- biocypher/_metadata.py +1 -1
- biocypher/_misc.py +6 -2
- biocypher/write/__init__.py +0 -0
- biocypher/{_write.py → write/_batch_writer.py} +7 -944
- biocypher/write/_write.py +105 -0
- biocypher/write/graph/__init__.py +0 -0
- biocypher/write/graph/_arangodb.py +241 -0
- biocypher/write/graph/_neo4j.py +334 -0
- biocypher/write/relational/__init__.py +0 -0
- biocypher/write/relational/_postgresql.py +320 -0
- biocypher/write/relational/_sqlite.py +51 -0
- {biocypher-0.5.39.dist-info → biocypher-0.5.41.dist-info}/METADATA +1 -1
- biocypher-0.5.41.dist-info/RECORD +32 -0
- biocypher-0.5.39.dist-info/RECORD +0 -24
- {biocypher-0.5.39.dist-info → biocypher-0.5.41.dist-info}/LICENSE +0 -0
- {biocypher-0.5.39.dist-info → biocypher-0.5.41.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
#
|
|
4
|
+
# Copyright 2021, Heidelberg University Clinic
|
|
5
|
+
#
|
|
6
|
+
# File author(s): Sebastian Lobentanzer
|
|
7
|
+
# Michael Hartung
|
|
8
|
+
#
|
|
9
|
+
# Distributed under MIT licence, see the file `LICENSE`.
|
|
10
|
+
#
|
|
11
|
+
"""
|
|
12
|
+
BioCypher 'offline' module. Handles the writing of node and edge representations
|
|
13
|
+
suitable for import into a DBMS.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from biocypher._logger import logger
|
|
17
|
+
from biocypher.write.graph._neo4j import _Neo4jBatchWriter
|
|
18
|
+
from biocypher.write.graph._arangodb import _ArangoDBBatchWriter
|
|
19
|
+
from biocypher.write.relational._sqlite import _SQLiteBatchWriter
|
|
20
|
+
from biocypher.write.relational._postgresql import _PostgreSQLBatchWriter
|
|
21
|
+
|
|
22
|
+
logger.debug(f"Loading module {__name__}.")
|
|
23
|
+
|
|
24
|
+
from typing import TYPE_CHECKING
|
|
25
|
+
|
|
26
|
+
from biocypher._config import config as _config
|
|
27
|
+
|
|
28
|
+
__all__ = ["get_writer", "DBMS_TO_CLASS"]
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from biocypher._translate import Translator
|
|
32
|
+
from biocypher._deduplicate import Deduplicator
|
|
33
|
+
|
|
34
|
+
# Lookup table from every accepted DBMS identifier (including spelling
# variants) to the batch writer class used for that DBMS.
DBMS_TO_CLASS = {
    alias: writer_class
    for writer_class, aliases in (
        (_Neo4jBatchWriter, ("neo", "neo4j", "Neo4j")),
        (_PostgreSQLBatchWriter, ("postgres", "postgresql", "PostgreSQL")),
        (_ArangoDBBatchWriter, ("arango", "arangodb", "ArangoDB")),
        (_SQLiteBatchWriter, ("sqlite", "sqlite3")),
    )
    for alias in aliases
}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_writer(
    dbms: str,
    translator: "Translator",
    deduplicator: "Deduplicator",
    output_directory: str,
    strict_mode: bool,
):
    """
    Function to return an instance of the writer class selected by ``dbms``.
    The writer settings (delimiters, quote character, database name, ...)
    are read from the ``dbms`` section of the configuration.

    Args:

        dbms: the database management system; for options, see DBMS_TO_CLASS.

        translator: the Translator object.

        deduplicator: the Deduplicator object.

        output_directory: the directory to write the output files to.

        strict_mode: whether to use strict mode.

    Returns:

        instance: an instance of the selected writer class.

    Raises:

        ValueError: if ``dbms`` is not a key of DBMS_TO_CLASS.

    """

    # Use ``.get`` instead of subscripting: subscripting raised a bare
    # KeyError for an unsupported name, which made the ValueError below
    # unreachable.
    writer = DBMS_TO_CLASS.get(dbms)

    if writer is None:
        raise ValueError(f"Unknown dbms: {dbms}")

    # Only look up the configuration section once the name is known to be
    # supported.
    dbms_config = _config(dbms)

    return writer(
        translator=translator,
        deduplicator=deduplicator,
        delimiter=dbms_config.get("delimiter"),
        array_delimiter=dbms_config.get("array_delimiter"),
        quote=dbms_config.get("quote_character"),
        output_directory=output_directory,
        db_name=dbms_config.get("database_name"),
        import_call_bin_prefix=dbms_config.get("import_call_bin_prefix"),
        import_call_file_prefix=dbms_config.get("import_call_file_prefix"),
        wipe=dbms_config.get("wipe"),
        strict_mode=strict_mode,
        skip_bad_relationships=dbms_config.get(
            "skip_bad_relationships"
        ),  # neo4j
        skip_duplicate_nodes=dbms_config.get(
            "skip_duplicate_nodes"
        ),  # neo4j
        db_user=dbms_config.get("user"),  # psql
        db_password=dbms_config.get("password"),  # psql
        db_port=dbms_config.get("port"),  # psql
    )
|
|
File without changes
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from biocypher._logger import logger
|
|
4
|
+
from biocypher.write.graph._neo4j import _Neo4jBatchWriter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class _ArangoDBBatchWriter(_Neo4jBatchWriter):
|
|
8
|
+
"""
|
|
9
|
+
Class for writing node and edge representations to disk using the format
|
|
10
|
+
specified by ArangoDB for the use of "arangoimport". Output files are
|
|
11
|
+
similar to Neo4j, but with a different header format.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
def _get_default_import_call_bin_prefix(self):
|
|
15
|
+
"""
|
|
16
|
+
Method to provide the default string for the import call bin prefix.
|
|
17
|
+
|
|
18
|
+
Returns:
|
|
19
|
+
str: The default location for the neo4j admin import location
|
|
20
|
+
"""
|
|
21
|
+
return ""
|
|
22
|
+
|
|
23
|
+
def _get_import_script_name(self) -> str:
|
|
24
|
+
"""
|
|
25
|
+
Returns the name of the neo4j admin import script
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
str: The name of the import script (ending in .sh)
|
|
29
|
+
"""
|
|
30
|
+
return "arangodb-import-call.sh"
|
|
31
|
+
|
|
32
|
+
def _write_node_headers(self):
|
|
33
|
+
"""
|
|
34
|
+
Writes single CSV file for a graph entity that is represented
|
|
35
|
+
as a node as per the definition in the `schema_config.yaml`,
|
|
36
|
+
containing only the header for this type of node.
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
bool: The return value. True for success, False otherwise.
|
|
40
|
+
"""
|
|
41
|
+
# load headers from data parse
|
|
42
|
+
if not self.node_property_dict:
|
|
43
|
+
logger.error(
|
|
44
|
+
"Header information not found. Was the data parsed first?",
|
|
45
|
+
)
|
|
46
|
+
return False
|
|
47
|
+
|
|
48
|
+
for label, props in self.node_property_dict.items():
|
|
49
|
+
# create header CSV with ID, properties, labels
|
|
50
|
+
|
|
51
|
+
_id = "_key"
|
|
52
|
+
|
|
53
|
+
# translate label to PascalCase
|
|
54
|
+
pascal_label = self.translator.name_sentence_to_pascal(label)
|
|
55
|
+
|
|
56
|
+
header = f"{pascal_label}-header.csv"
|
|
57
|
+
header_path = os.path.join(
|
|
58
|
+
self.outdir,
|
|
59
|
+
header,
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
# check if file already exists
|
|
63
|
+
if os.path.exists(header_path):
|
|
64
|
+
logger.warning(
|
|
65
|
+
f"File {header_path} already exists. Overwriting."
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
# concatenate key:value in props
|
|
69
|
+
props_list = []
|
|
70
|
+
for k in props.keys():
|
|
71
|
+
props_list.append(f"{k}")
|
|
72
|
+
|
|
73
|
+
# create list of lists and flatten
|
|
74
|
+
# removes need for empty check of property list
|
|
75
|
+
out_list = [[_id], props_list]
|
|
76
|
+
out_list = [val for sublist in out_list for val in sublist]
|
|
77
|
+
|
|
78
|
+
with open(header_path, "w", encoding="utf-8") as f:
|
|
79
|
+
# concatenate with delimiter
|
|
80
|
+
row = self.delim.join(out_list)
|
|
81
|
+
f.write(row)
|
|
82
|
+
|
|
83
|
+
# add collection from schema config
|
|
84
|
+
collection = self.translator.ontology.mapping.extended_schema[
|
|
85
|
+
label
|
|
86
|
+
].get("db_collection_name", None)
|
|
87
|
+
|
|
88
|
+
# add file path to neo4 admin import statement
|
|
89
|
+
# do once for each part file
|
|
90
|
+
parts = self.parts.get(label, [])
|
|
91
|
+
|
|
92
|
+
if not parts:
|
|
93
|
+
raise ValueError(
|
|
94
|
+
f"No parts found for node label {label}. "
|
|
95
|
+
f"Check that the data was parsed first.",
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
for part in parts:
|
|
99
|
+
import_call_header_path = os.path.join(
|
|
100
|
+
self.import_call_file_prefix,
|
|
101
|
+
header,
|
|
102
|
+
)
|
|
103
|
+
import_call_parts_path = os.path.join(
|
|
104
|
+
self.import_call_file_prefix,
|
|
105
|
+
part,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
self.import_call_nodes.add(
|
|
109
|
+
(
|
|
110
|
+
import_call_header_path,
|
|
111
|
+
import_call_parts_path,
|
|
112
|
+
collection,
|
|
113
|
+
)
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
return True
|
|
117
|
+
|
|
118
|
+
def _write_edge_headers(self):
|
|
119
|
+
"""
|
|
120
|
+
Writes single CSV file for a graph entity that is represented
|
|
121
|
+
as an edge as per the definition in the `schema_config.yaml`,
|
|
122
|
+
containing only the header for this type of edge.
|
|
123
|
+
|
|
124
|
+
Returns:
|
|
125
|
+
bool: The return value. True for success, False otherwise.
|
|
126
|
+
"""
|
|
127
|
+
# load headers from data parse
|
|
128
|
+
if not self.edge_property_dict:
|
|
129
|
+
logger.error(
|
|
130
|
+
"Header information not found. Was the data parsed first?",
|
|
131
|
+
)
|
|
132
|
+
return False
|
|
133
|
+
|
|
134
|
+
for label, props in self.edge_property_dict.items():
|
|
135
|
+
# translate label to PascalCase
|
|
136
|
+
pascal_label = self.translator.name_sentence_to_pascal(label)
|
|
137
|
+
|
|
138
|
+
# paths
|
|
139
|
+
header = f"{pascal_label}-header.csv"
|
|
140
|
+
header_path = os.path.join(
|
|
141
|
+
self.outdir,
|
|
142
|
+
header,
|
|
143
|
+
)
|
|
144
|
+
parts = f"{pascal_label}-part.*"
|
|
145
|
+
|
|
146
|
+
# check for file exists
|
|
147
|
+
if os.path.exists(header_path):
|
|
148
|
+
logger.warning(
|
|
149
|
+
f"Header file {header_path} already exists. Overwriting."
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
# concatenate key:value in props
|
|
153
|
+
props_list = []
|
|
154
|
+
for k in props.keys():
|
|
155
|
+
props_list.append(f"{k}")
|
|
156
|
+
|
|
157
|
+
out_list = ["_from", "_key", *props_list, "_to"]
|
|
158
|
+
|
|
159
|
+
with open(header_path, "w", encoding="utf-8") as f:
|
|
160
|
+
# concatenate with delimiter
|
|
161
|
+
row = self.delim.join(out_list)
|
|
162
|
+
f.write(row)
|
|
163
|
+
|
|
164
|
+
# add collection from schema config
|
|
165
|
+
if not self.translator.ontology.mapping.extended_schema.get(label):
|
|
166
|
+
for (
|
|
167
|
+
_,
|
|
168
|
+
v,
|
|
169
|
+
) in self.translator.ontology.mapping.extended_schema.items():
|
|
170
|
+
if v.get("label_as_edge") == label:
|
|
171
|
+
collection = v.get("db_collection_name", None)
|
|
172
|
+
break
|
|
173
|
+
|
|
174
|
+
else:
|
|
175
|
+
collection = self.translator.ontology.mapping.extended_schema[
|
|
176
|
+
label
|
|
177
|
+
].get("db_collection_name", None)
|
|
178
|
+
|
|
179
|
+
# add file path to neo4 admin import statement (import call path
|
|
180
|
+
# may be different from actual output path)
|
|
181
|
+
header_import_call_path = os.path.join(
|
|
182
|
+
self.import_call_file_prefix,
|
|
183
|
+
header,
|
|
184
|
+
)
|
|
185
|
+
parts_import_call_path = os.path.join(
|
|
186
|
+
self.import_call_file_prefix,
|
|
187
|
+
parts,
|
|
188
|
+
)
|
|
189
|
+
self.import_call_edges.add(
|
|
190
|
+
(
|
|
191
|
+
header_import_call_path,
|
|
192
|
+
parts_import_call_path,
|
|
193
|
+
collection,
|
|
194
|
+
)
|
|
195
|
+
)
|
|
196
|
+
|
|
197
|
+
return True
|
|
198
|
+
|
|
199
|
+
def _construct_import_call(self) -> str:
|
|
200
|
+
"""
|
|
201
|
+
Function to construct the import call detailing folder and
|
|
202
|
+
individual node and edge headers and data files, as well as
|
|
203
|
+
delimiters and database name. Built after all data has been
|
|
204
|
+
processed to ensure that nodes are called before any edges.
|
|
205
|
+
|
|
206
|
+
Returns:
|
|
207
|
+
str: a bash command for neo4j-admin import
|
|
208
|
+
"""
|
|
209
|
+
import_call = (
|
|
210
|
+
f"{self.import_call_bin_prefix}arangoimp "
|
|
211
|
+
f"--type csv "
|
|
212
|
+
f'--separator="{self.escaped_delim}" '
|
|
213
|
+
)
|
|
214
|
+
|
|
215
|
+
if self.quote == "'":
|
|
216
|
+
import_call += f'--quote="{self.quote}" '
|
|
217
|
+
else:
|
|
218
|
+
import_call += f"--quote='{self.quote}' "
|
|
219
|
+
|
|
220
|
+
node_lines = ""
|
|
221
|
+
|
|
222
|
+
# node import calls: one line per node type
|
|
223
|
+
for header_path, parts_path, collection in self.import_call_nodes:
|
|
224
|
+
line = (
|
|
225
|
+
f"{import_call} "
|
|
226
|
+
f"--headers-file {header_path} "
|
|
227
|
+
f"--file= {parts_path} "
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
if collection:
|
|
231
|
+
line += f"--create-collection --collection {collection} "
|
|
232
|
+
|
|
233
|
+
node_lines += f"{line}\n"
|
|
234
|
+
|
|
235
|
+
edge_lines = ""
|
|
236
|
+
|
|
237
|
+
# edge import calls: one line per edge type
|
|
238
|
+
for header_path, parts_path, collection in self.import_call_edges:
|
|
239
|
+
import_call += f'--relationships="{header_path},{parts_path}" '
|
|
240
|
+
|
|
241
|
+
return node_lines + edge_lines
|
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import subprocess
|
|
4
|
+
|
|
5
|
+
from biocypher._logger import logger
|
|
6
|
+
from biocypher.write._batch_writer import parse_label, _BatchWriter
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class _Neo4jBatchWriter(_BatchWriter):
|
|
10
|
+
"""
|
|
11
|
+
Class for writing node and edge representations to disk using the
|
|
12
|
+
format specified by Neo4j for the use of admin import. Each batch
|
|
13
|
+
writer instance has a fixed representation that needs to be passed
|
|
14
|
+
at instantiation via the :py:attr:`schema` argument. The instance
|
|
15
|
+
also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
|
|
16
|
+
to convert and extend the hierarchy.
|
|
17
|
+
|
|
18
|
+
This class inherits from the abstract class "_BatchWriter" and implements the
|
|
19
|
+
Neo4j-specific methods:
|
|
20
|
+
|
|
21
|
+
- _write_node_headers
|
|
22
|
+
- _write_edge_headers
|
|
23
|
+
- _construct_import_call
|
|
24
|
+
- _write_array_string
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
def __init__(self, *args, **kwargs):
    """
    Constructor.

    Forwards all positional and keyword arguments unchanged to the parent
    class constructor; no additional setup is performed here.

    Returns:
        _Neo4jBatchWriter: An instance of the writer.
    """
    # The parent constructor should read the configuration and setup
    # import_call_bin_prefix.
    super().__init__(*args, **kwargs)
|
|
39
|
+
|
|
40
|
+
def _get_default_import_call_bin_prefix(self):
|
|
41
|
+
"""
|
|
42
|
+
Method to provide the default string for the import call bin prefix.
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
str: The default location for the neo4j admin import location
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
return "bin/"
|
|
49
|
+
|
|
50
|
+
def _write_array_string(self, string_list):
|
|
51
|
+
"""
|
|
52
|
+
Abstract method to write the string representation of an array into a .csv file
|
|
53
|
+
as required by the neo4j admin-import.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
string_list (list): list of ontology strings
|
|
57
|
+
|
|
58
|
+
Returns:
|
|
59
|
+
str: The string representation of an array for the neo4j admin import
|
|
60
|
+
"""
|
|
61
|
+
string = self.adelim.join(string_list)
|
|
62
|
+
return f"{self.quote}{string}{self.quote}"
|
|
63
|
+
|
|
64
|
+
def _write_node_headers(self):
    """
    Write one header-only CSV file per node label found in
    ``self.node_property_dict`` and register it for the import call.

    Each header consists of ``:ID``, the property columns (suffixed with a
    Neo4j type where the declared type is a known alias) and ``:LABEL``,
    joined with ``self.delim``. The header path and a ``<Label>-part.*``
    pattern are added to ``self.import_call_nodes``.

    Returns:
        bool: True for success, False if no node properties are recorded.
    """
    # headers can only be derived after the data has been parsed
    if not self.node_property_dict:
        logger.error(
            "Header information not found. Was the data parsed first?",
        )
        return False

    # declared property type -> type suffix used in the header; types not
    # listed here produce an untyped column
    neo4j_type_by_alias = {
        "int": "long",
        "long": "long",
        "integer": "long",
        "int[]": "long[]",
        "long[]": "long[]",
        "integer[]": "long[]",
        "float": "double",
        "double": "double",
        "dbl": "double",
        "float[]": "double[]",
        "double[]": "double[]",
        # TODO Neo4j boolean support / spelling?
        "bool": "boolean",
        "boolean": "boolean",
        "bool[]": "boolean[]",
        "boolean[]": "boolean[]",
        "str[]": "string[]",
        "string[]": "string[]",
    }

    for label, props in self.node_property_dict.items():
        # file names are derived from the PascalCase form of the label
        pascal_label = self.translator.name_sentence_to_pascal(
            parse_label(label)
        )
        header = f"{pascal_label}-header.csv"
        header_path = os.path.join(self.outdir, header)
        parts = f"{pascal_label}-part.*"

        if os.path.exists(header_path):
            logger.warning(
                f"Header file `{header_path}` already exists. Overwriting.",
            )

        columns = [":ID"]
        for prop_name, prop_type in props.items():
            # only strings can be aliases; anything else stays untyped
            neo4j_type = (
                neo4j_type_by_alias.get(prop_type)
                if isinstance(prop_type, str)
                else None
            )
            if neo4j_type:
                columns.append(f"{prop_name}:{neo4j_type}")
            else:
                columns.append(f"{prop_name}")
        columns.append(":LABEL")

        with open(header_path, "w", encoding="utf-8") as header_file:
            header_file.write(self.delim.join(columns))

        # the import call file path may be different from the actual
        # file path
        import_call_header_path = os.path.join(
            self.import_call_file_prefix,
            header,
        )
        import_call_parts_path = os.path.join(
            self.import_call_file_prefix,
            parts,
        )
        self.import_call_nodes.add(
            (import_call_header_path, import_call_parts_path)
        )

    return True
|
|
146
|
+
|
|
147
|
+
def _write_edge_headers(self):
    """
    Writes single CSV file for a graph entity that is represented
    as an edge as per the definition in the `schema_config.yaml`,
    containing only the header for this type of edge.

    Each header consists of ``:START_ID``, an optional ``id`` column, the
    property columns (suffixed with a Neo4j type where the declared type is
    a known alias), ``:END_ID`` and ``:TYPE``, joined with ``self.delim``.
    The ``id`` column is left out for the labels ``IS_SOURCE_OF``,
    ``IS_TARGET_OF`` and ``IS_PART_OF`` and for schema entries whose
    ``use_id`` equals False. The header path and a ``<Label>-part.*``
    pattern are added to ``self.import_call_edges``.

    Returns:
        bool: The return value. True for success, False otherwise.
    """
    # load headers from data parse
    if not self.edge_property_dict:
        logger.error(
            "Header information not found. Was the data parsed first?",
        )
        return False

    for label, props in self.edge_property_dict.items():
        # translate label to PascalCase
        pascal_label = self.translator.name_sentence_to_pascal(
            parse_label(label)
        )

        # paths
        header = f"{pascal_label}-header.csv"
        header_path = os.path.join(
            self.outdir,
            header,
        )
        parts = f"{pascal_label}-part.*"

        # check for file exists
        if os.path.exists(header_path):
            logger.warning(
                f"File {header_path} already exists. Overwriting."
            )

        # concatenate key:value in props
        props_list = []
        for k, v in props.items():
            if v in ["int", "long", "integer"]:
                props_list.append(f"{k}:long")
            elif v in ["int[]", "long[]", "integer[]"]:
                props_list.append(f"{k}:long[]")
            # "dbl" is accepted here as well, matching the alias list used
            # for node headers
            elif v in ["float", "double", "dbl"]:
                props_list.append(f"{k}:double")
            elif v in ["float[]", "double[]"]:
                props_list.append(f"{k}:double[]")
            elif v in [
                "bool",
                "boolean",
            ]:  # TODO does Neo4j support bool?
                props_list.append(f"{k}:boolean")
            elif v in ["bool[]", "boolean[]"]:
                props_list.append(f"{k}:boolean[]")
            elif v in ["str[]", "string[]"]:
                props_list.append(f"{k}:string[]")
            else:
                props_list.append(f"{k}")

        skip_id = False
        schema_label = None

        if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
            skip_id = True
        elif not self.translator.ontology.mapping.extended_schema.get(
            label
        ):
            # find label in schema by label_as_edge
            for (
                k,
                v,
            ) in self.translator.ontology.mapping.extended_schema.items():
                if v.get("label_as_edge") == label:
                    schema_label = k
                    break
        else:
            schema_label = label

        out_list = [":START_ID"]

        if schema_label:
            # equality (not identity) on purpose: keeps the original
            # comparison semantics for the configured value
            if (
                self.translator.ontology.mapping.extended_schema.get(
                    schema_label
                ).get("use_id")
                == False  # noqa: E712
            ):
                skip_id = True

        if not skip_id:
            out_list.append("id")

        out_list.extend(props_list)
        out_list.extend([":END_ID", ":TYPE"])

        with open(header_path, "w", encoding="utf-8") as f:
            # concatenate with delimiter
            row = self.delim.join(out_list)
            f.write(row)

        # add file path to neo4 admin import statement (import call file
        # path may be different from actual file path)
        import_call_header_path = os.path.join(
            self.import_call_file_prefix,
            header,
        )
        import_call_parts_path = os.path.join(
            self.import_call_file_prefix,
            parts,
        )
        self.import_call_edges.add(
            (import_call_header_path, import_call_parts_path)
        )

    return True
|
|
262
|
+
|
|
263
|
+
def _get_import_script_name(self) -> str:
|
|
264
|
+
"""
|
|
265
|
+
Returns the name of the neo4j admin import script
|
|
266
|
+
|
|
267
|
+
Returns:
|
|
268
|
+
str: The name of the import script (ending in .sh)
|
|
269
|
+
"""
|
|
270
|
+
return "neo4j-admin-import-call.sh"
|
|
271
|
+
|
|
272
|
+
def _construct_import_call(self) -> str:
|
|
273
|
+
"""
|
|
274
|
+
Function to construct the import call detailing folder and
|
|
275
|
+
individual node and edge headers and data files, as well as
|
|
276
|
+
delimiters and database name. Built after all data has been
|
|
277
|
+
processed to ensure that nodes are called before any edges.
|
|
278
|
+
|
|
279
|
+
Returns:
|
|
280
|
+
str: a bash command for neo4j-admin import
|
|
281
|
+
"""
|
|
282
|
+
import_call_neo4j_v4 = self._get_import_call(
|
|
283
|
+
"import", "--database=", "--force="
|
|
284
|
+
)
|
|
285
|
+
import_call_neo4j_v5 = self._get_import_call(
|
|
286
|
+
"database import full", "", "--overwrite-destination="
|
|
287
|
+
)
|
|
288
|
+
neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"
|
|
289
|
+
|
|
290
|
+
import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
|
|
291
|
+
return import_script
|
|
292
|
+
|
|
293
|
+
def _get_import_call(
|
|
294
|
+
self, import_cmd: str, database_cmd: str, wipe_cmd: str
|
|
295
|
+
) -> str:
|
|
296
|
+
"""Get parametrized import call for Neo4j 4 or 5+.
|
|
297
|
+
|
|
298
|
+
Args:
|
|
299
|
+
import_cmd (str): The import command to use.
|
|
300
|
+
database_cmd (str): The database command to use.
|
|
301
|
+
wipe_cmd (str): The wipe command to use.
|
|
302
|
+
|
|
303
|
+
Returns:
|
|
304
|
+
str: The import call.
|
|
305
|
+
"""
|
|
306
|
+
import_call = (
|
|
307
|
+
f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "
|
|
308
|
+
f'--delimiter="{self.escaped_delim}" '
|
|
309
|
+
f'--array-delimiter="{self.escaped_adelim}" '
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
if self.quote == "'":
|
|
313
|
+
import_call += f'--quote="{self.quote}" '
|
|
314
|
+
else:
|
|
315
|
+
import_call += f"--quote='{self.quote}' "
|
|
316
|
+
|
|
317
|
+
if self.wipe:
|
|
318
|
+
import_call += f"{wipe_cmd}true "
|
|
319
|
+
if self.skip_bad_relationships:
|
|
320
|
+
import_call += "--skip-bad-relationships=true "
|
|
321
|
+
if self.skip_duplicate_nodes:
|
|
322
|
+
import_call += "--skip-duplicate-nodes=true "
|
|
323
|
+
|
|
324
|
+
# append node import calls
|
|
325
|
+
for header_path, parts_path in self.import_call_nodes:
|
|
326
|
+
import_call += f'--nodes="{header_path},{parts_path}" '
|
|
327
|
+
|
|
328
|
+
# append edge import calls
|
|
329
|
+
for header_path, parts_path in self.import_call_edges:
|
|
330
|
+
import_call += f'--relationships="{header_path},{parts_path}" '
|
|
331
|
+
|
|
332
|
+
# Database needs to be at the end starting with Neo4j 5.0+.
|
|
333
|
+
import_call += f"{database_cmd}{self.db_name} "
|
|
334
|
+
return import_call
|
|
File without changes
|