biocypher 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biocypher might be problematic.
- biocypher/_config/biocypher_config.yaml +7 -1
- biocypher/_core.py +25 -4
- biocypher/_metadata.py +1 -1
- biocypher/_ontology.py +144 -51
- biocypher/_translate.py +84 -79
- biocypher/output/write/_batch_writer.py +99 -50
- biocypher/output/write/_get_writer.py +29 -12
- biocypher/output/write/graph/_arangodb.py +44 -32
- biocypher/output/write/graph/_neo4j.py +3 -4
- biocypher/output/write/graph/_owl.py +569 -0
- biocypher/output/write/graph/_rdf.py +234 -97
- {biocypher-0.8.0.dist-info → biocypher-0.9.1.dist-info}/METADATA +1 -1
- {biocypher-0.8.0.dist-info → biocypher-0.9.1.dist-info}/RECORD +15 -14
- {biocypher-0.8.0.dist-info → biocypher-0.9.1.dist-info}/LICENSE +0 -0
- {biocypher-0.8.0.dist-info → biocypher-0.9.1.dist-info}/WHEEL +0 -0
biocypher/_translate.py
CHANGED
@@ -1,10 +1,11 @@
-"""
-
-
+"""BioCypher 'translation' module.
+
+Responsible for translating between the raw input data and the
+BioCypherNode and BioCypherEdge objects.
 """
 
 from collections.abc import Generator, Iterable
-from typing import Any
+from typing import Any
 
 from more_itertools import peekable
 
@@ -19,21 +20,23 @@ __all__ = ["Translator"]
 
 
 class Translator:
-    """
-
-    the schema_config.yaml file. Creates a mapping
-    and, given nodes and edges, translates them into
-    BioCypherEdges. During this process, can also filter the
-    entities if the schema_config.yaml file specifies a property
-    blacklist.
+    """Class responsible for exacting the translation process.
+
+    Translation is configured in the schema_config.yaml file. Creates a mapping
+    dictionary from that file, and, given nodes and edges, translates them into
+    BioCypherNodes and BioCypherEdges. During this process, can also filter the
+    properties of the entities if the schema_config.yaml file specifies a property
+    whitelist or blacklist.
 
     Provides utility functions for translating between input and output labels
    and cypher queries.
    """

    def __init__(self, ontology: "Ontology", strict_mode: bool = False):
-        """
+        """Initialise the translator.
+
        Args:
+        ----
            leaves:
                Dictionary detailing the leaves of the hierarchy
                tree representing the structure of the graph; the leaves are
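For orientation on what the reworded class docstring describes: once schema_config.yaml is parsed, each schema entry maps an input label to an ontology class, optionally with a preferred identifier and a property whitelist. The fragment below is a hypothetical, minimal illustration of such a parsed entry (the field names follow the docstrings quoted in this diff; the data itself is invented):

# Hypothetical parsed schema entry (illustrative data, not from the package).
extended_schema = {
    "protein": {
        "represented_as": "node",
        "preferred_id": "uniprot",
        "input_label": "uniprot_protein",
        "properties": {"name": "str", "taxon": "int"},  # property whitelist
    },
}

# An input label found in the schema resolves to its ontology class.
input_to_ontology = {entry["input_label"]: cls for cls, entry in extended_schema.items()}
assert input_to_ontology["uniprot_protein"] == "protein"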
@@ -43,8 +46,8 @@ class Translator:
             strict_mode:
                 If True, the translator will raise an error if input data do not
                 carry source, licence, and version information.
-        """
 
+        """
         self.ontology = ontology
         self.strict_mode = strict_mode
 
@@ -59,11 +62,7 @@ class Translator:
 
     def translate_entities(self, entities):
         entities = peekable(entities)
-        if (
-            isinstance(entities.peek(), BioCypherNode)
-            or isinstance(entities.peek(), BioCypherEdge)
-            or isinstance(entities.peek(), BioCypherRelAsNode)
-        ):
+        if isinstance(entities.peek(), BioCypherEdge | BioCypherNode | BioCypherRelAsNode):
             translated_entities = entities
         elif len(entities.peek()) < 4:
             translated_entities = self.translate_nodes(entities)
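A side note on the collapsed isinstance check above: passing a union type (BioCypherEdge | BioCypherNode | BioCypherRelAsNode) directly to isinstance relies on PEP 604 union support in isinstance, which is available from Python 3.10 onward. A minimal sketch with stand-in classes (not the package's own):

class Node: ...
class Edge: ...

entity = Node()
# Python 3.10+: a union type is accepted by isinstance
assert isinstance(entity, Node | Edge)
# Tuple spelling, equivalent and also valid on older interpreters
assert isinstance(entity, (Node, Edge))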
@@ -75,19 +74,20 @@ class Translator:
         self,
         node_tuples: Iterable,
     ) -> Generator[BioCypherNode, None, None]:
-        """
-
-
-
+        """Translate input node representation.
+
+        Translate the node tuples to a representation that conforms to the
+        schema of the given BioCypher graph. For now requires explicit
+        statement of node type on pass.
 
         Args:
+        ----
             node_tuples (list of tuples): collection of tuples
                 representing individual nodes by their unique id and a type
                 that is translated from the original database notation to
                 the corresponding BioCypher notation.
 
         """
-
         self._log_begin_translate(node_tuples, "nodes")
 
         for _id, _type, _props in node_tuples:
@@ -101,10 +101,12 @@ class Translator:
 
                 for prop in required_props:
                     if prop not in _props:
-
+                        msg = (
                             f"Property `{prop}` missing from node {_id}. "
-                            "Strict mode is enabled, so this is not allowed."
+                            "Strict mode is enabled, so this is not allowed.",
                         )
+                        logger.error(msg)
+                        raise ValueError(msg)
 
             # find the node in leaves that represents ontology node type
             _ontology_class = self._get_ontology_mapping(_type)
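The hunks above spell out the expected input shape for translate_nodes: each node arrives as an (id, type, properties) tuple, and in strict mode the properties must include source, licence, and version. A hypothetical tuple satisfying that check (all values invented for illustration):

node_tuple = (
    "P12345",            # unique id
    "uniprot_protein",   # input label, mapped via schema_config.yaml
    {
        "name": "example protein",
        "source": "UniProt",     # required when strict_mode=True
        "licence": "CC BY 4.0",  # required when strict_mode=True
        "version": "2024_01",    # required when strict_mode=True
    },
)
_id, _type, _props = node_tuple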
@@ -129,10 +131,11 @@ class Translator:
         self._log_finish_translate("nodes")
 
     def _get_preferred_id(self, _bl_type: str) -> str:
-        """
-        Returns the preferred id for the given Biolink type.
-        """
+        """Return the preferred id for the given Biolink type.
 
+        If the preferred id is not specified in the schema_config.yaml file,
+        return "id".
+        """
         return (
             self.ontology.mapping.extended_schema[_bl_type]["preferred_id"]
             if "preferred_id" in self.ontology.mapping.extended_schema.get(_bl_type, {})
@@ -140,10 +143,11 @@ class Translator:
         )
 
     def _filter_props(self, bl_type: str, props: dict) -> dict:
-        """
-        Filters properties for those specified in schema_config if any.
-        """
+        """Filter properties for those specified in schema_config if any.
 
+        If the properties are not specified in the schema_config.yaml file,
+        return the original properties.
+        """
         filter_props = self.ontology.mapping.extended_schema[bl_type].get("properties", {})
 
         # strict mode: add required properties (only if there is a whitelist)
@@ -179,14 +183,15 @@ class Translator:
     def translate_edges(
         self,
         edge_tuples: Iterable,
-    ) -> Generator[
-        """
-        Translates input edge representation to a representation that
-        conforms to the schema of the given BioCypher graph. For now
-        requires explicit statement of edge type on pass.
+    ) -> Generator[BioCypherEdge | BioCypherRelAsNode, None, None]:
+        """Translate input edge representation.
 
-
+        Translate the edge tuples to a representation that conforms to the
+        schema of the given BioCypher graph. For now requires explicit
+        statement of edge type on pass.
 
+        Args:
+        ----
             edge_tuples (list of tuples):
 
                 collection of tuples representing source and target of
@@ -194,8 +199,8 @@ class Translator:
                 of interaction in the original database notation, which
                 is translated to BioCypher notation using the `leaves`.
                 Can optionally possess its own ID.
-        """
 
+        """
         self._log_begin_translate(edge_tuples, "edges")
 
         # legacy: deal with 4-tuples (no edge id)
@@ -208,18 +213,22 @@ class Translator:
             # check for strict mode requirements
             if self.strict_mode:
                 if "source" not in _props:
-
-                        f"Edge {_id if _id else (_src, _tar)} does not have a `source` property."
+                    msg = (
+                        f"Edge {_id if _id else (_src, _tar)} does not have a `source` property."
                         " This is required in strict mode.",
                     )
+                    logger.error(msg)
+                    raise ValueError(msg)
                 if "licence" not in _props:
-
-                        f"Edge {_id if _id else (_src, _tar)} does not have a `licence` property."
+                    msg = (
+                        f"Edge {_id if _id else (_src, _tar)} does not have a `licence` property."
                         " This is required in strict mode.",
                     )
+                    logger.error(msg)
+                    raise ValueError(msg)
 
             # match the input label (_type) to
-            #
+            # an ontology label from schema_config
             bl_type = self._get_ontology_mapping(_type)
 
             if bl_type:
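Analogously for translate_edges: edges arrive as tuples of source, target, type and properties, optionally preceded by an edge id (the "legacy" 4-tuple branch mentioned above handles the form without an id), and strict mode requires source and licence properties. The exact field order shown here is an assumption for illustration, not taken from the package:

edge_with_id = (
    "rel_001",                      # optional edge id
    "P12345",                       # source node id
    "P67890",                       # target node id
    "protein_protein_interaction",  # input label
    {"source": "IntAct", "licence": "CC BY 4.0", "version": "2024_01"},
)
legacy_edge = edge_with_id[1:]  # 4-tuple form without an edge id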
@@ -295,12 +304,12 @@ class Translator:
         self._log_finish_translate("edges")
 
     def _record_no_type(self, _type: Any, what: Any) -> None:
-        """
-        Records the type of a node or edge that is not represented in the
-        schema_config.
-        """
+        """Record the type of a non-represented node or edge.
 
-
+        In case of an entity that is not represented in the schema_config,
+        record the type and the entity.
+        """
+        logger.error(f"No ontology type defined for `{_type}`: {what}")
 
         if self.notype.get(_type, None):
             self.notype[_type] += 1
@@ -309,11 +318,11 @@ class Translator:
             self.notype[_type] = 1
 
     def get_missing_biolink_types(self) -> dict:
-        """
-        Returns a dictionary of types that were not represented in the
-        schema_config.
-        """
+        """Return a dictionary of non-represented types.
 
+        The dictionary contains the type as the key and the number of
+        occurrences as the value.
+        """
         return self.notype
 
     @staticmethod
@@ -327,12 +336,10 @@ class Translator:
         logger.debug(f"Finished translating {what} to BioCypher.")
 
     def _update_ontology_types(self):
-        """
-        Creates a dictionary to translate from input labels to ontology labels.
+        """Create a dictionary to translate from input to ontology labels.
 
         If multiple input labels, creates mapping for each.
         """
-
         self._ontology_mapping = {}
 
         for key, value in self.ontology.mapping.extended_schema.items():
@@ -351,47 +358,45 @@ class Translator:
             else:
                 self._add_translation_mappings(labels, key)
 
-    def _get_ontology_mapping(self, label: str) ->
-        """
+    def _get_ontology_mapping(self, label: str) -> str | None:
+        """Find the ontology class for the given input type.
+
         For each given input type ("input_label" or "label_in_input"), find the
         corresponding ontology class in the leaves dictionary (from the
         `schema_config.yam`).
 
         Args:
+        ----
             label:
                 The input type to find (`input_label` or `label_in_input` in
                 `schema_config.yaml`).
-        """
 
+        """
+        # FIXME does not seem like a necessary function.
         # commented out until behaviour of _update_bl_types is fixed
         return self._ontology_mapping.get(label, None)
 
     def translate_term(self, term):
-        """
-        Translate a single term.
-        """
-
+        """Translate a single term."""
         return self.mappings.get(term, None)
 
     def reverse_translate_term(self, term):
-        """
-        Reverse translate a single term.
-        """
-
+        """Reverse translate a single term."""
         return self.reverse_mappings.get(term, None)
 
     def translate(self, query):
-        """
-
+        """Translate a cypher query.
+
+        Only translates labels as of now.
         """
         for key in self.mappings:
             query = query.replace(":" + key, ":" + self.mappings[key])
         return query
 
     def reverse_translate(self, query):
-        """
-
-        now.
+        """Reverse translate a cypher query.
+
+        Only translates labels as of now.
         """
         for key in self.reverse_mappings:
             a = ":" + key + ")"
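The translate and reverse_translate methods shown above perform plain string substitution of labels inside a Cypher query. A minimal sketch of that substitution with an invented mapping:

mappings = {"uniprot_protein": "Protein"}  # invented input -> ontology label

query = "MATCH (p:uniprot_protein) RETURN p"
for key, value in mappings.items():
    query = query.replace(":" + key, ":" + value)

assert query == "MATCH (p:Protein) RETURN p"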
@@ -399,12 +404,14 @@ class Translator:
             # TODO this conditional probably does not cover all cases
             if a in query or b in query:
                 if isinstance(self.reverse_mappings[key], list):
-
+                    msg = (
                         "Reverse translation of multiple inputs not "
                         "implemented yet. Many-to-one mappings are "
                         "not reversible. "
                         f"({key} -> {self.reverse_mappings[key]})",
                     )
+                    logger.error(msg)
+                    raise NotImplementedError(msg)
                 else:
                     query = query.replace(
                         a,
@@ -413,10 +420,10 @@ class Translator:
         return query
 
     def _add_translation_mappings(self, original_name, biocypher_name):
-        """
-
-        PascalCase version of the BioCypher name, since
-        not useful for Cypher queries.
+        """Add translation mappings for a label and name.
+
+        We use here the PascalCase version of the BioCypher name, since
+        sentence case is not useful for Cypher queries.
         """
         if isinstance(original_name, list):
             for on in original_name:
@@ -444,9 +451,7 @@ class Translator:
 
     @staticmethod
     def name_sentence_to_pascal(name: str) -> str:
-        """
-        Converts a name in sentence case to pascal case.
-        """
+        """Convert a name in sentence case to pascal case."""
         # split on dots if dot is present
         if "." in name:
             return ".".join(
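The last hunk touches name_sentence_to_pascal, which turns sentence-case ontology class names into the PascalCase labels used in Cypher, handling dotted names component by component. A standalone re-implementation sketch of that described behaviour (not the package's own code):

def sentence_to_pascal(name: str) -> str:
    # Illustrative re-implementation: each dot-separated component is
    # converted to PascalCase independently.
    def pascal(part: str) -> str:
        return "".join(word.capitalize() for word in part.split(" "))
    return ".".join(pascal(part) for part in name.split("."))

assert sentence_to_pascal("small molecule") == "SmallMolecule"
assert sentence_to_pascal("gene to disease association") == "GeneToDiseaseAssociation"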
biocypher/output/write/_batch_writer.py
CHANGED
@@ -1,3 +1,5 @@
+"""Abstract base class for all batch writers."""
+
 import glob
 import os
 import re
@@ -16,30 +18,37 @@ from biocypher.output.write._writer import _Writer
 
 
 class _BatchWriter(_Writer, ABC):
-    """Abstract batch writer class"""
+    """Abstract batch writer class."""
 
     @abstractmethod
     def _quote_string(self, value: str) -> str:
-        """
-
-
-
+        """Quote a string.
+
+        Escaping is handled by the database-specific writer.
+        """
+        msg = "Database writer must override '_quote_string'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
 
     @abstractmethod
     def _get_default_import_call_bin_prefix(self):
-        """
+        """Provide the default string for the import call bin prefix.
 
         Returns
         -------
            str: The database-specific string for the path to the import call bin prefix
 
        """
-
+        msg = "Database writer must override '_get_default_import_call_bin_prefix'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
 
     @abstractmethod
     def _write_array_string(self, string_list):
-        """
-
+        """Write the string representation of an array into a .csv file.
+
+        Different databases require different formats of array to optimize
+        import speed.
 
         Args:
         ----
@@ -50,50 +59,65 @@ class _BatchWriter(_Writer, ABC):
            str: The database-specific string representation of an array
 
        """
-
+        msg = "Database writer must override '_write_array_string'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
 
     @abstractmethod
     def _write_node_headers(self):
-        """
-
+        """Write header files for nodes.
+
+        Write header files (node properties) for nodes as per the
+        definition in the `schema_config.yaml`.
 
         Returns
         -------
            bool: The return value. True for success, False otherwise.
 
        """
-
+        msg = "Database writer must override '_write_node_headers'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
 
     @abstractmethod
     def _write_edge_headers(self):
-        """
-
-
+        """Write a database import-file for an edge.
+
+        Write a database import-file for an edge as per the definition in
+        the `schema_config.yaml`, containing only the header for this type
+        of edge.
 
         Returns
         -------
            bool: The return value. True for success, False otherwise.
 
        """
-
+        msg = "Database writer must override '_write_edge_headers'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
 
     @abstractmethod
     def _construct_import_call(self) -> str:
-        """
-
-
-
+        """Construct the import call.
+
+        Construct the import call detailing folder and individual node and
+        edge headers and data files, as well as delimiters and database name.
+        Built after all data has been processed to ensure that nodes are
+        called before any edges.
 
         Returns
         -------
            str: A bash command for csv import.
 
        """
-
+        msg = "Database writer must override '_construct_import_call'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
 
     @abstractmethod
     def _get_import_script_name(self) -> str:
-        """
+        """Return the name of the import script.
+
         The name will be chosen based on the used database.
 
         Returns
@@ -101,7 +125,9 @@ class _BatchWriter(_Writer, ABC):
            str: The name of the import script (ending in .sh)
 
        """
-
+        msg = "Database writer must override '_get_import_script_name'"
+        logger.error(msg)
+        raise NotImplementedError(msg)
 
     def __init__(
         self,
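The recurring change through the hunks above is that every abstract _BatchWriter method now logs a message and raises NotImplementedError instead of silently passing, so a concrete writer must override each of them. A simplified, self-contained sketch of that pattern with invented class names (not the package's real writer classes):

from abc import ABC, abstractmethod
import logging

logger = logging.getLogger(__name__)


class _SketchBatchWriter(ABC):
    # Simplified stand-in for the abstract pattern above, not the real class.
    @abstractmethod
    def _quote_string(self, value: str) -> str:
        msg = "Database writer must override '_quote_string'"
        logger.error(msg)
        raise NotImplementedError(msg)


class SketchCsvWriter(_SketchBatchWriter):
    def _quote_string(self, value: str) -> str:
        # Example override: wrap in single quotes, escaping embedded quotes.
        return "'" + value.replace("'", "\\'") + "'"


print(SketchCsvWriter()._quote_string("it's"))  # prints 'it\'s'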
@@ -122,11 +148,14 @@ class _BatchWriter(_Writer, ABC):
         db_password: str = None,
         db_host: str = None,
         db_port: str = None,
-
+        file_format: str = None,
         rdf_namespaces: dict = {},
         labels_order: str = "Ascending",
+        **kwargs,
     ):
-        """
+        """Write node and edge representations to disk.
+
+        Abstract parent class for writing node and edge representations to disk
         using the format specified by each database type. The database-specific
         functions are implemented by the respective child-classes. This abstract
         class contains all methods expected by a bach writer instance, some of
@@ -180,7 +209,8 @@ class _BatchWriter(_Writer, ABC):
                 call.
 
             wipe:
-                Whether to force import (removing existing DB content).
+                Whether to force import (removing existing DB content).
+                (Specific to Neo4j.)
 
             strict_mode:
                 Whether to enforce source, version, and license properties.
@@ -204,7 +234,7 @@ class _BatchWriter(_Writer, ABC):
             db_port:
                 The database port.
 
-
+            file_format:
                 The format of RDF.
 
             rdf_namespaces:
@@ -226,7 +256,7 @@ class _BatchWriter(_Writer, ABC):
         self.db_password = db_password
         self.db_host = db_host or "localhost"
         self.db_port = db_port
-        self.
+        self.file_format = file_format
         self.rdf_namespaces = rdf_namespaces
 
         self.delim, self.escaped_delim = self._process_delimiter(delimiter)
@@ -277,8 +307,16 @@ class _BatchWriter(_Writer, ABC):
         return self._import_call_file_prefix
 
     def _process_delimiter(self, delimiter: str) -> str:
-        """
-
+        """Process a delimited to escape correctly.
+
+        Args:
+        ----
+            delimiter (str): The delimiter to process.
+
+        Returns:
+        -------
+            tuple: The delimiter and its escaped representation.
+
         """
         if delimiter == "\\t":
             return "\t", "\\t"
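The new _process_delimiter docstring documents that the method returns both the usable delimiter and its escaped form, turning the literal two-character sequence \t into a real tab. A small sketch mirroring the behaviour visible in the context lines above:

def process_delimiter(delimiter: str) -> tuple[str, str]:
    # Sketch of the documented behaviour: the literal two-character sequence
    # "\t" becomes a real tab plus its escaped form; anything else is
    # returned unchanged in both positions.
    if delimiter == "\\t":
        return "\t", "\\t"
    return delimiter, delimiter

assert process_delimiter("\\t") == ("\t", "\\t")
assert process_delimiter(";") == (";", ";")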
@@ -287,7 +325,7 @@ class _BatchWriter(_Writer, ABC):
             return delimiter, delimiter
 
     def write_nodes(self, nodes, batch_size: int = int(1e6), force: bool = False):
-        """
+        """Write nodes and their headers.
 
         Args:
         ----
@@ -325,7 +363,7 @@ class _BatchWriter(_Writer, ABC):
         edges: list | GeneratorType,
         batch_size: int = int(1e6),
     ) -> bool:
-        """
+        """Write edges and their headers.
 
         Args:
         ----
@@ -387,12 +425,13 @@ class _BatchWriter(_Writer, ABC):
         return True
 
     def _write_node_data(self, nodes, batch_size, force: bool = False):
-        """
-
-
-
-        to the
-
+        """Write biocypher nodes to CSV.
+
+        Conforms to the headers created with `_write_node_headers()`, and
+        is actually required to be run before calling `_write_node_headers()`
+        to set the :py:attr:`self.node_property_dict` for passing the node
+        properties to the instance. Expects list or generator of nodes from
+        the :py:class:`BioCypherNode` class.
 
         Args:
         ----
@@ -571,7 +610,9 @@ class _BatchWriter(_Writer, ABC):
         prop_dict: dict,
         labels: str,
     ):
-        """
+        """Write a list of biocypher nodes to a CSV file.
+
+        This function takes one list of biocypher nodes and writes them
         to a Neo4j admin import compatible CSV file.
 
         Args:
@@ -655,7 +696,9 @@ class _BatchWriter(_Writer, ABC):
         return True
 
     def _write_edge_data(self, edges, batch_size):
-        """
+        """Write biocypher edges to CSV.
+
+        Writes biocypher edges to CSV conforming to the headers created
         with `_write_edge_headers()`, and is actually required to be run
         before calling `_write_node_headers()` to set the
         :py:attr:`self.edge_property_dict` for passing the edge
@@ -804,7 +847,9 @@ class _BatchWriter(_Writer, ABC):
         label: str,
         prop_dict: dict,
     ):
-        """
+        """Write a list of biocypher edges to a CSV file.
+
+        This function takes one list of biocypher edges and writes them
         to a Neo4j admin import compatible CSV file.
 
         Args:
@@ -923,7 +968,7 @@ class _BatchWriter(_Writer, ABC):
         return True
 
     def _write_next_part(self, label: str, lines: list):
-        """
+        """Write a list of strings to a new part file.
 
         Args:
         ----
@@ -975,9 +1020,10 @@ class _BatchWriter(_Writer, ABC):
         self.parts[label].append(part)
 
     def get_import_call(self) -> str:
-        """
-
-
+        """Eeturn the import call.
+
+        Return the import call detailing folder and individual node and
+        edge headers and data files, as well as delimiters and database name.
 
         Returns
         -------
@@ -987,7 +1033,9 @@ class _BatchWriter(_Writer, ABC):
         return self._construct_import_call()
 
     def write_import_call(self) -> str:
-        """
+        """Write the import call.
+
+        Function to write the import call detailing folder and
         individual node and edge headers and data files, as well as
         delimiters and database name, to the export folder as txt.
 
@@ -1006,9 +1054,10 @@ class _BatchWriter(_Writer, ABC):
 
 
 def parse_label(label: str) -> str:
-    """Check if the label is compliant with Neo4j naming conventions
-
-
+    """Check if the label is compliant with Neo4j naming conventions.
+
+    Check against https://neo4j.com/docs/cypher-manual/current/syntax/naming/,
+    and if not compliant, remove non-compliant characters.
 
     Args:
    ----