biocypher 0.6.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the changes between the two package versions as published to their public registry. It is provided for informational purposes only.
- biocypher/__init__.py +3 -13
- biocypher/_config/__init__.py +6 -23
- biocypher/_config/biocypher_config.yaml +14 -3
- biocypher/_core.py +360 -262
- biocypher/_create.py +13 -27
- biocypher/_deduplicate.py +4 -11
- biocypher/_get.py +21 -60
- biocypher/_logger.py +4 -16
- biocypher/_mapping.py +4 -17
- biocypher/_metadata.py +3 -15
- biocypher/_misc.py +14 -28
- biocypher/_ontology.py +127 -212
- biocypher/_translate.py +34 -58
- biocypher/output/connect/_get_connector.py +40 -0
- biocypher/output/connect/_neo4j_driver.py +9 -65
- biocypher/output/in_memory/_get_in_memory_kg.py +34 -0
- biocypher/output/in_memory/_in_memory_kg.py +40 -0
- biocypher/output/in_memory/_networkx.py +44 -0
- biocypher/output/in_memory/_pandas.py +20 -15
- biocypher/output/write/_batch_writer.py +166 -179
- biocypher/output/write/_get_writer.py +11 -24
- biocypher/output/write/_writer.py +43 -44
- biocypher/output/write/graph/_arangodb.py +7 -24
- biocypher/output/write/graph/_neo4j.py +51 -56
- biocypher/output/write/graph/_networkx.py +36 -43
- biocypher/output/write/graph/_rdf.py +107 -95
- biocypher/output/write/relational/_csv.py +6 -11
- biocypher/output/write/relational/_postgresql.py +5 -13
- biocypher/output/write/relational/_sqlite.py +3 -1
- {biocypher-0.6.2.dist-info → biocypher-0.8.0.dist-info}/LICENSE +1 -1
- {biocypher-0.6.2.dist-info → biocypher-0.8.0.dist-info}/METADATA +3 -3
- biocypher-0.8.0.dist-info/RECORD +43 -0
- {biocypher-0.6.2.dist-info → biocypher-0.8.0.dist-info}/WHEEL +1 -1
- biocypher-0.6.2.dist-info/RECORD +0 -39
biocypher/output/write/_batch_writer.py

@@ -1,17 +1,17 @@
-
-from types import GeneratorType
-from typing import Union, Optional
-from collections import OrderedDict, defaultdict
+import glob
 import os
 import re
-
+
+from abc import ABC, abstractmethod
+from collections import OrderedDict, defaultdict
+from types import GeneratorType
 
 from more_itertools import peekable
 
 from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
+from biocypher._deduplicate import Deduplicator
 from biocypher._logger import logger
 from biocypher._translate import Translator
-from biocypher._deduplicate import Deduplicator
 from biocypher.output.write._writer import _Writer
 
 
@@ -20,96 +20,88 @@ class _BatchWriter(_Writer, ABC):
 
     @abstractmethod
     def _quote_string(self, value: str) -> str:
-        """
-        Abstract method to quote a string. Escaping is handled by the database-specific writer.
-        """
-
+        """Abstract method to quote a string. Escaping is handled by the database-specific writer."""
         raise NotImplementedError(
-            "Database writer must override '_quote_string'"
+            "Database writer must override '_quote_string'",
         )
 
     @abstractmethod
     def _get_default_import_call_bin_prefix(self):
-        """
-        Abstract method to provide the default string for the import call bin prefix.
+        """Abstract method to provide the default string for the import call bin prefix.
 
-        Returns
+        Returns
+        -------
            str: The database-specific string for the path to the import call bin prefix
+
        """
-        raise NotImplementedError(
-            "Database writer must override '_get_default_import_call_bin_prefix'"
-        )
+        raise NotImplementedError("Database writer must override '_get_default_import_call_bin_prefix'")
 
     @abstractmethod
     def _write_array_string(self, string_list):
-        """
-        Abstract method to write the string representation of an array into a .csv file.
+        """Abstract method to write the string representation of an array into a .csv file.
         Different databases require different formats of array to optimize import speed.
 
         Args:
+        ----
            string_list (list): list of ontology strings
 
         Returns:
+        -------
            str: The database-specific string representation of an array
+
        """
-        raise NotImplementedError(
-            "Database writer must override '_write_array_string'"
-        )
+        raise NotImplementedError("Database writer must override '_write_array_string'")
 
     @abstractmethod
     def _write_node_headers(self):
-        """
-        Abstract method that takes care of importing properties of a graph entity that is represented
+        """Abstract method that takes care of importing properties of a graph entity that is represented
         as a node as per the definition in the `schema_config.yaml`
 
-        Returns
+        Returns
+        -------
            bool: The return value. True for success, False otherwise.
+
        """
-        raise NotImplementedError(
-            "Database writer must override '_write_node_headers'"
-        )
+        raise NotImplementedError("Database writer must override '_write_node_headers'")
 
     @abstractmethod
     def _write_edge_headers(self):
-        """
-        Abstract method to write a database import-file for a graph entity that is represented
+        """Abstract method to write a database import-file for a graph entity that is represented
         as an edge as per the definition in the `schema_config.yaml`,
         containing only the header for this type of edge.
 
-        Returns
+        Returns
+        -------
            bool: The return value. True for success, False otherwise.
+
        """
-        raise NotImplementedError(
-            "Database writer must override '_write_edge_headers'"
-        )
+        raise NotImplementedError("Database writer must override '_write_edge_headers'")
 
     @abstractmethod
     def _construct_import_call(self) -> str:
-        """
-        Function to construct the import call detailing folder and
+        """Function to construct the import call detailing folder and
         individual node and edge headers and data files, as well as
         delimiters and database name. Built after all data has been
         processed to ensure that nodes are called before any edges.
 
-        Returns
+        Returns
+        -------
            str: A bash command for csv import.
+
        """
-        raise NotImplementedError(
-            "Database writer must override '_construct_import_call'"
-        )
+        raise NotImplementedError("Database writer must override '_construct_import_call'")
 
     @abstractmethod
     def _get_import_script_name(self) -> str:
-        """
-        Returns the name of the import script.
+        """Returns the name of the import script.
         The name will be chosen based on the used database.
 
-        Returns
+        Returns
+        -------
            str: The name of the import script (ending in .sh)
+
        """
-        raise NotImplementedError(
-            "Database writer must override '_get_import_script_name'"
-        )
+        raise NotImplementedError("Database writer must override '_get_import_script_name'")
 
     def __init__(
         self,
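
Every concrete writer supplies these seven hooks. A minimal sketch of the required surface (illustrative only: the quoting and array conventions below are assumptions, not those of any shipped writer, and `_Writer` or `_BatchWriter.__init__` may impose further requirements before such a class is usable):

    class ToyCsvWriter(_BatchWriter):
        """Illustrative only: shows which hooks a concrete writer must override."""

        def _quote_string(self, value: str) -> str:
            return f"{self.quote}{value}{self.quote}"  # assumed: no escaping needed

        def _get_default_import_call_bin_prefix(self):
            return ""  # assumed: import tool is on PATH

        def _write_array_string(self, string_list):
            # join on the configured array delimiter, then quote
            return self._quote_string(self.adelim.join(string_list))

        def _write_node_headers(self):
            return True  # stub: would write one header CSV per node label

        def _write_edge_headers(self):
            return True  # stub: would write one header CSV per edge label

        def _construct_import_call(self) -> str:
            return "echo 'bulk import command goes here'"  # stub

        def _get_import_script_name(self) -> str:
            return "toy-csv-import.sh"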
@@ -118,10 +110,10 @@ class _BatchWriter(_Writer, ABC):
         delimiter: str,
         array_delimiter: str = ",",
         quote: str = '"',
-        output_directory: Optional[str] = None,
+        output_directory: str | None = None,
         db_name: str = "neo4j",
-        import_call_bin_prefix: Optional[str] = None,
-        import_call_file_prefix: Optional[str] = None,
+        import_call_bin_prefix: str | None = None,
+        import_call_file_prefix: str | None = None,
         wipe: bool = True,
         strict_mode: bool = False,
         skip_bad_relationships: bool = False,
@@ -132,10 +124,9 @@ class _BatchWriter(_Writer, ABC):
         db_port: str = None,
         rdf_format: str = None,
         rdf_namespaces: dict = {},
+        labels_order: str = "Ascending",
     ):
-        """
-
-        Abtract parent class for writing node and edge representations to disk
+        """Abtract parent class for writing node and edge representations to disk
         using the format specified by each database type. The database-specific
         functions are implemented by the respective child-classes. This abstract
         class contains all methods expected by a bach writer instance, some of
@@ -156,6 +147,7 @@ class _BatchWriter(_Writer, ABC):
         - _get_import_script_name
 
         Args:
+        ----
            translator:
                Instance of :py:class:`Translator` to enable translation of
                nodes and manipulation of properties.
@@ -217,6 +209,11 @@ class _BatchWriter(_Writer, ABC):
 
            rdf_namespaces:
                The namespaces for RDF.
+
+           labels_order:
+               The order of labels, to reflect the hierarchy (or not).
+               Default: "Ascending" (from more specific to more generic).
+
        """
         super().__init__(
             translator=translator,
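
The modernized signatures use PEP 604 union syntax instead of `typing.Optional`, which is why the `from typing import Union, Optional` import disappears in the first hunk. Both spellings are equivalent on Python 3.10+; a minimal illustration (hypothetical function names):

    from typing import Optional

    def old_style(output_directory: Optional[str] = None) -> None: ...

    def new_style(output_directory: str | None = None) -> None: ...  # PEP 604, Python 3.10+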
@@ -233,17 +230,13 @@ class _BatchWriter(_Writer, ABC):
         self.rdf_namespaces = rdf_namespaces
 
         self.delim, self.escaped_delim = self._process_delimiter(delimiter)
-        self.adelim, self.escaped_adelim = self._process_delimiter(
-            array_delimiter
-        )
+        self.adelim, self.escaped_adelim = self._process_delimiter(array_delimiter)
         self.quote = quote
         self.skip_bad_relationships = skip_bad_relationships
         self.skip_duplicate_nodes = skip_duplicate_nodes
 
         if import_call_bin_prefix is None:
-            self.import_call_bin_prefix = (
-                self._get_default_import_call_bin_prefix()
-            )
+            self.import_call_bin_prefix = self._get_default_import_call_bin_prefix()
         else:
             self.import_call_bin_prefix = import_call_bin_prefix
 
@@ -263,39 +256,41 @@ class _BatchWriter(_Writer, ABC):
 
         self.parts = {}  # dict to store the paths of part files for each label
 
+        self._labels_orders = ["Alphabetical", "Ascending", "Descending", "Leaves"]
+        if labels_order not in self._labels_orders:
+            msg = (
+                f"neo4j's 'labels_order' parameter cannot be '{labels_order}',"
+                "must be one of: {' ,'.join(self._labels_orders)}",
+            )
+            raise ValueError(msg)
+        self.labels_order = labels_order
+
         # TODO not memory efficient, but should be fine for most cases; is
         # there a more elegant solution?
 
     @property
     def import_call_file_prefix(self):
-        """
-        Property for output directory path.
-        """
-
+        """Property for output directory path."""
         if self._import_call_file_prefix is None:
             return self.outdir
         else:
             return self._import_call_file_prefix
 
     def _process_delimiter(self, delimiter: str) -> str:
-        """
-        Return escaped characters in case of receiving their string
+        """Return escaped characters in case of receiving their string
         representation (e.g. tab for '\t').
         """
-
         if delimiter == "\\t":
             return "\t", "\\t"
 
         else:
             return delimiter, delimiter
 
-    def write_nodes(
-        self, nodes, batch_size: int = int(1e6), force: bool = False
-    ):
-        """
-        Wrapper for writing nodes and their headers.
+    def write_nodes(self, nodes, batch_size: int = int(1e6), force: bool = False):
+        """Wrapper for writing nodes and their headers.
 
         Args:
+        ----
            nodes (BioCypherNode): a list or generator of nodes in
               :py:class:`BioCypherNode` format
 
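
The new `labels_order` argument is validated eagerly in `__init__`. Note that the second line of the added error message is a plain string rather than an f-string, so `{' ,'.join(self._labels_orders)}` will appear literally in the exception text. A corrected sketch of the intended pattern (not the shipped code):

    allowed = ["Alphabetical", "Ascending", "Descending", "Leaves"]
    labels_order = "Sideways"  # example invalid value
    if labels_order not in allowed:
        msg = (
            f"neo4j's 'labels_order' parameter cannot be '{labels_order}', "
            f"must be one of: {', '.join(allowed)}"  # f-prefix makes the join interpolate
        )
        raise ValueError(msg)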
@@ -306,7 +301,9 @@ class _BatchWriter(_Writer, ABC):
 
 
         Returns:
+        -------
            bool: The return value. True for success, False otherwise.
+
        """
         # TODO check represented_as
 
@@ -325,19 +322,21 @@ class _BatchWriter(_Writer, ABC):
 
     def write_edges(
         self,
-        edges: Union[list, GeneratorType],
+        edges: list | GeneratorType,
         batch_size: int = int(1e6),
     ) -> bool:
-        """
-        Wrapper for writing edges and their headers.
+        """Wrapper for writing edges and their headers.
 
         Args:
+        ----
            edges (BioCypherEdge): a list or generator of edges in
               :py:class:`BioCypherEdge` or :py:class:`BioCypherRelAsNode`
               format
 
         Returns:
+        -------
            bool: The return value. True for success, False otherwise.
+
        """
         passed = False
         edges = list(edges)  # force evaluation to handle empty generator
@@ -375,7 +374,6 @@ class _BatchWriter(_Writer, ABC):
            logger.debug(
                "No edges to write, possibly due to no matched Biolink classes.",
            )
-            pass
 
         if not passed:
             logger.error("Error while writing edge data.")
@@ -389,8 +387,7 @@ class _BatchWriter(_Writer, ABC):
         return True
 
     def _write_node_data(self, nodes, batch_size, force: bool = False):
-        """
-        Writes biocypher nodes to CSV conforming to the headers created
+        """Writes biocypher nodes to CSV conforming to the headers created
         with `_write_node_headers()`, and is actually required to be run
         before calling `_write_node_headers()` to set the
         :py:attr:`self.node_property_dict` for passing the node properties
@@ -398,14 +395,16 @@ class _BatchWriter(_Writer, ABC):
         :py:class:`BioCypherNode` class.
 
         Args:
+        ----
            nodes (BioCypherNode): a list or generator of nodes in
               :py:class:`BioCypherNode` format
 
         Returns:
+        -------
            bool: The return value. True for success, False otherwise.
-        """
 
-        if isinstance(nodes, (GeneratorType, peekable)):
+        """
+        if isinstance(nodes, GeneratorType | peekable):
             logger.debug("Writing node CSV from generator.")
 
             bins = defaultdict(list)  # dict to store a list for each
@@ -432,20 +431,15 @@ class _BatchWriter(_Writer, ABC):
                    logger.warning(f"Node {label} has no id; skipping.")
                    continue
 
-                if not label in bins.keys():
+                if label not in bins.keys():
                    # start new list
                    all_labels = None
                    bins[label].append(node)
                    bin_l[label] = 1
 
                    # get properties from config if present
-                    if (
-                        label
-                        in self.translator.ontology.mapping.extended_schema
-                    ):
-                        cprops = self.translator.ontology.mapping.extended_schema.get(
-                            label
-                        ).get(
+                    if label in self.translator.ontology.mapping.extended_schema:
+                        cprops = self.translator.ontology.mapping.extended_schema.get(label).get(
                            "properties",
                        )
                    else:
@@ -483,28 +477,39 @@ class _BatchWriter(_Writer, ABC):
                    # get label hierarchy
                    # multiple labels:
                    if not force:
-                        all_labels = self.translator.ontology.get_ancestors(
-                            label
-                        )
+                        all_labels = self.translator.ontology.get_ancestors(label)
                    else:
                        all_labels = None
 
                    if all_labels:
                        # convert to pascal case
-                        all_labels = [
-                            self.translator.name_sentence_to_pascal(label)
-                            for label in all_labels
-                        ]
+                        all_labels = [self.translator.name_sentence_to_pascal(label) for label in all_labels]
                        # remove duplicates
                        all_labels = list(OrderedDict.fromkeys(all_labels))
-
-
+                        match self.labels_order:
+                            case "Ascending":
+                                pass  # Default from get_ancestors.
+                            case "Alphabetical":
+                                all_labels.sort()
+                            case "Descending":
+                                all_labels.reverse()
+                            case "Leaves":
+                                if len(all_labels) < 1:
+                                    msg = "Labels list cannot be empty when using 'Leaves' order."
+                                    raise ValueError(msg)
+                                all_labels = [all_labels[0]]
+                            case _:
+                                # In case someone touched _label_orders after constructor.
+                                if self.labels_order not in self._labels_orders:
+                                    msg = (
+                                        f"Invalid labels_order: {self.labels_order}. "
+                                        f"Must be one of {self._labels_orders}"
+                                    )
+                                    raise ValueError(msg)
                        # concatenate with array delimiter
                        all_labels = self._write_array_string(all_labels)
                    else:
-                        all_labels = self.translator.name_sentence_to_pascal(
-                            label
-                        )
+                        all_labels = self.translator.name_sentence_to_pascal(label)
 
                    labels[label] = all_labels
 
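
Applied to the ancestor list returned by `get_ancestors` (most specific label first), the four orderings accepted by the new `match` block behave as follows. A small sketch; the ancestor chain and helper name are assumed for illustration (requires Python 3.10+ for `match`):

    labels = ["Protein", "Polypeptide", "BiologicalEntity"]  # assumed ancestor chain

    def order(labels: list[str], labels_order: str) -> list[str]:
        match labels_order:
            case "Ascending":      # specific -> generic, as produced by get_ancestors
                return labels
            case "Alphabetical":
                return sorted(labels)
            case "Descending":     # generic -> specific
                return list(reversed(labels))
            case "Leaves":         # keep only the most specific label
                return [labels[0]]
        raise ValueError(labels_order)

    assert order(labels, "Leaves") == ["Protein"]
    assert order(labels, "Descending") == ["BiologicalEntity", "Polypeptide", "Protein"]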
@@ -549,16 +554,15 @@ class _BatchWriter(_Writer, ABC):
                self.node_property_dict[label] = reference_props[label]
 
             return True
+        elif not isinstance(nodes, list):
+            logger.error("Nodes must be passed as list or generator.")
+            return False
         else:
-            if type(nodes) is not list:
-                logger.error("Nodes must be passed as list or generator.")
-                return False
-            else:
 
-                def gen(nodes):
-                    yield from nodes
+            def gen(nodes):
+                yield from nodes
 
-                return self._write_node_data(gen(nodes), batch_size=batch_size)
+            return self._write_node_data(gen(nodes), batch_size=batch_size)
 
     def _write_single_node_list_to_file(
         self,
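
Both `_write_node_data` and, further down, `_write_edge_data` normalize list input the same way: a local `gen()` wrapper turns the list into a generator and the method re-enters itself, so only the streaming branch does real work. The pattern in isolation:

    from types import GeneratorType

    def gen(items):
        yield from items  # wraps a list in a generator

    assert isinstance(gen([1, 2, 3]), GeneratorType)  # list input re-enters the generator branch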
@@ -567,11 +571,11 @@ class _BatchWriter(_Writer, ABC):
         prop_dict: dict,
         labels: str,
     ):
-        """
-        This function takes one list of biocypher nodes and writes them
+        """This function takes one list of biocypher nodes and writes them
         to a Neo4j admin import compatible CSV file.
 
         Args:
+        ----
            node_list (list): list of BioCypherNodes to be written
            label (str): the primary label of the node
            prop_dict (dict): properties of node class passed from parsing
@@ -580,7 +584,9 @@ class _BatchWriter(_Writer, ABC):
               for the node class
 
         Returns:
+        -------
            bool: The return value. True for success, False otherwise.
+
        """
         if not all(isinstance(n, BioCypherNode) for n in node_list):
             logger.error("Nodes must be passed as type BioCypherNode.")
@@ -598,7 +604,7 @@ class _BatchWriter(_Writer, ABC):
         ref_props = list(prop_dict.keys())
 
            # compare lists order invariant
-            if not set(ref_props) == set(n_keys):
+            if set(ref_props) != set(n_keys):
                onode = n.get_id()
                oprop1 = set(ref_props).difference(n_keys)
                oprop2 = set(n_keys).difference(ref_props)
@@ -632,11 +638,10 @@ class _BatchWriter(_Writer, ABC):
                        "boolean",
                    ]:
                        plist.append(str(p))
+                    elif isinstance(p, list):
+                        plist.append(self._write_array_string(p))
                    else:
-                        if isinstance(p, list):
-                            plist.append(self._write_array_string(p))
-                        else:
-                            plist.append(self._quote_string(str(p)))
+                        plist.append(f"{self.quote}{p!s}{self.quote}")
 
                line.append(self.delim.join(plist))
                line.append(labels)
@@ -650,8 +655,7 @@ class _BatchWriter(_Writer, ABC):
         return True
 
     def _write_edge_data(self, edges, batch_size):
-        """
-        Writes biocypher edges to CSV conforming to the headers created
+        """Writes biocypher edges to CSV conforming to the headers created
         with `_write_edge_headers()`, and is actually required to be run
         before calling `_write_node_headers()` to set the
         :py:attr:`self.edge_property_dict` for passing the edge
@@ -659,17 +663,20 @@ class _BatchWriter(_Writer, ABC):
         from the :py:class:`BioCypherEdge` class.
 
         Args:
+        ----
            edges (BioCypherEdge): a list or generator of edges in
               :py:class:`BioCypherEdge` format
 
         Returns:
+        -------
            bool: The return value. True for success, False otherwise.
 
         Todo:
+        ----
            - currently works for mixed edges but in practice often is
              called on one iterable containing one type of edge only
-        """
 
+        """
         if isinstance(edges, GeneratorType):
             logger.debug("Writing edge CSV from generator.")
 
@@ -685,14 +692,13 @@ class _BatchWriter(_Writer, ABC):
             for edge in edges:
                if not (edge.get_source_id() and edge.get_target_id()):
                    logger.error(
-                        "Edge must have source and target node. "
-                        f"Caused by: {edge}",
+                        f"Edge must have source and target node. Caused by: {edge}",
                    )
                    continue
 
                label = edge.get_label()
 
-                if not label in bins.keys():
+                if label not in bins.keys():
                    # start new list
                    bins[label].append(edge)
                    bin_l[label] = 1
@@ -703,13 +709,8 @@ class _BatchWriter(_Writer, ABC):
                    # (may not be if it is an edge that carries the
                    # "label_as_edge" property)
                    cprops = None
-                    if (
-                        label
-                        in self.translator.ontology.mapping.extended_schema
-                    ):
-                        cprops = self.translator.ontology.mapping.extended_schema.get(
-                            label
-                        ).get(
+                    if label in self.translator.ontology.mapping.extended_schema:
+                        cprops = self.translator.ontology.mapping.extended_schema.get(label).get(
                            "properties",
                        )
                    else:
@@ -717,9 +718,7 @@ class _BatchWriter(_Writer, ABC):
                        for (
                            k,
                            v,
-                        ) in (
-                            self.translator.ontology.mapping.extended_schema.items()
-                        ):
+                        ) in self.translator.ontology.mapping.extended_schema.items():
                            if isinstance(v, dict):
                                if v.get("label_as_edge") == label:
                                    cprops = v.get("properties")
@@ -789,16 +788,15 @@ class _BatchWriter(_Writer, ABC):
                self.edge_property_dict[label] = reference_props[label]
 
             return True
+        elif not isinstance(edges, list):
+            logger.error("Edges must be passed as list or generator.")
+            return False
         else:
-            if type(edges) is not list:
-                logger.error("Edges must be passed as list or generator.")
-                return False
-            else:
 
-                def gen(edges):
-                    yield from edges
+            def gen(edges):
+                yield from edges
 
-                return self._write_edge_data(gen(edges), batch_size=batch_size)
+            return self._write_edge_data(gen(edges), batch_size=batch_size)
 
     def _write_single_edge_list_to_file(
         self,
@@ -806,11 +804,11 @@ class _BatchWriter(_Writer, ABC):
         label: str,
         prop_dict: dict,
     ):
-        """
-        This function takes one list of biocypher edges and writes them
+        """This function takes one list of biocypher edges and writes them
         to a Neo4j admin import compatible CSV file.
 
         Args:
+        ----
            edge_list (list): list of BioCypherEdges to be written
 
            label (str): the label (type) of the edge
@@ -819,9 +817,10 @@ class _BatchWriter(_Writer, ABC):
               function and their types
 
         Returns:
+        -------
            bool: The return value. True for success, False otherwise.
-        """
 
+        """
         if not all(isinstance(n, BioCypherEdge) for n in edge_list):
             logger.error("Edges must be passed as type BioCypherEdge.")
             return False
@@ -836,7 +835,7 @@ class _BatchWriter(_Writer, ABC):
         ref_props = list(prop_dict.keys())
 
            # compare list order invariant
-            if not set(ref_props) == set(e_keys):
+            if set(ref_props) != set(e_keys):
                oedge = f"{e.get_source_id()}-{e.get_target_id()}"
                oprop1 = set(ref_props).difference(e_keys)
                oprop2 = set(e_keys).difference(ref_props)
@@ -867,11 +866,10 @@ class _BatchWriter(_Writer, ABC):
                        "boolean",
                    ]:
                        plist.append(str(p))
+                    elif isinstance(p, list):
+                        plist.append(self._write_array_string(p))
                    else:
-                        if isinstance(p, list):
-                            plist.append(self._write_array_string(p))
-                        else:
-                            plist.append(self._quote_string(str(p)))
+                        plist.append(self.quote + str(p) + self.quote)
 
                entries = [e.get_source_id()]
 
@@ -880,9 +878,7 @@ class _BatchWriter(_Writer, ABC):
 
                if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
                    skip_id = True
-                elif not self.translator.ontology.mapping.extended_schema.get(
-                    label
-                ):
+                elif not self.translator.ontology.mapping.extended_schema.get(label):
                    # find label in schema by label_as_edge
                    for (
                        k,
@@ -897,9 +893,9 @@ class _BatchWriter(_Writer, ABC):
                if schema_label:
                    if (
                        self.translator.ontology.mapping.extended_schema.get(
-                            schema_label
+                            schema_label,
                        ).get("use_id")
-                        == False
+                        == False  # noqa: E712 (seems to not work with 'not')
                    ):
                        skip_id = True
 
@@ -913,7 +909,7 @@ class _BatchWriter(_Writer, ABC):
                    entries.append(
                        self.translator.name_sentence_to_pascal(
                            e.get_label(),
-                        )
+                        ),
                    )
 
                lines.append(
@@ -927,10 +923,10 @@ class _BatchWriter(_Writer, ABC):
         return True
 
     def _write_next_part(self, label: str, lines: list):
-        """
-        This function writes a list of strings to a new part file.
+        """This function writes a list of strings to a new part file.
 
         Args:
+        ----
            label (str): the label (type) of the edge; internal
               representation sentence case -> needs to become PascalCase
               for disk representation
@@ -938,17 +934,15 @@ class _BatchWriter(_Writer, ABC):
            lines (list): list of strings to be written
 
         Returns:
+        -------
            bool: The return value. True for success, False otherwise.
+
        """
         # translate label to PascalCase
-        label_pascal = self.translator.name_sentence_to_pascal(
-            parse_label(label)
-        )
+        label_pascal = self.translator.name_sentence_to_pascal(parse_label(label))
 
         # list files in self.outdir
-        files = glob.glob(
-            os.path.join(self.outdir, f"{label_pascal}-part*.csv")
-        )
+        files = glob.glob(os.path.join(self.outdir, f"{label_pascal}-part*.csv"))
         # find file with highest part number
         if not files:
             next_part = 0
@@ -956,10 +950,7 @@ class _BatchWriter(_Writer, ABC):
         else:
             next_part = (
                max(
-                    [
-                        int(f.split(".")[-2].split("-")[-1].replace("part", ""))
-                        for f in files
-                    ],
+                    [int(f.split(".")[-2].split("-")[-1].replace("part", "")) for f in files],
                )
                + 1
             )
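
Part files are numbered by parsing existing file names of the form `<LabelPascal>-part<N>.csv`: split on dots to drop the extension, split on dashes to isolate the `part<N>` segment, strip the `part` prefix, and take the maximum plus one. A sketch of the same parsing with assumed file names:

    files = ["Protein-part000.csv", "Protein-part001.csv"]  # assumed existing part files
    next_part = max([int(f.split(".")[-2].split("-")[-1].replace("part", "")) for f in files]) + 1
    assert next_part == 2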
@@ -984,31 +975,29 @@ class _BatchWriter(_Writer, ABC):
         self.parts[label].append(part)
 
     def get_import_call(self) -> str:
-        """
-        Function to return the import call detailing folder and
+        """Function to return the import call detailing folder and
         individual node and edge headers and data files, as well as
         delimiters and database name.
 
-        Returns
+        Returns
+        -------
            str: a bash command for the database import
-        """
 
+        """
         return self._construct_import_call()
 
     def write_import_call(self) -> str:
-        """
-        Function to write the import call detailing folder and
+        """Function to write the import call detailing folder and
         individual node and edge headers and data files, as well as
         delimiters and database name, to the export folder as txt.
 
-        Returns
+        Returns
+        -------
            str: The path of the file holding the import call.
-        """
 
+        """
         file_path = os.path.join(self.outdir, self._get_import_script_name())
-        logger.info(
-            f"Writing {self.db_name + ' ' if self.db_name else ''}import call to `{file_path}`."
-        )
+        logger.info(f"Writing {self.db_name + ' ' if self.db_name else ''}import call to `{file_path}`.")
 
         with open(file_path, "w", encoding="utf-8") as f:
             f.write(self._construct_import_call())
@@ -1017,16 +1006,16 @@ class _BatchWriter(_Writer, ABC):
 
 
 def parse_label(label: str) -> str:
-    """
-
-    Check if the label is compliant with Neo4j naming conventions,
+    """Check if the label is compliant with Neo4j naming conventions,
     https://neo4j.com/docs/cypher-manual/current/syntax/naming/, and if not,
     remove non-compliant characters.
 
     Args:
+    ----
        label (str): The label to check
     Returns:
        str: The compliant label
+
    """
     # Check if the name contains only alphanumeric characters, underscore, or dollar sign
     # and dot (for class hierarchy of BioCypher)
@@ -1036,7 +1025,7 @@ def parse_label(label: str) -> str:
     if non_matches:
         non_matches = list(set(non_matches))
         logger.warning(
-            f"Label is not compliant with Neo4j naming rules. Removed non compliant characters: {non_matches}"
+            f"Label is not compliant with Neo4j naming rules. Removed non compliant characters: {non_matches}",
        )
 
     def first_character_compliant(character: str) -> bool:
@@ -1047,7 +1036,5 @@ def parse_label(label: str) -> str:
         if first_character_compliant(c):
             matches = matches[matches.index(c) :]
             break
-        logger.warning(
-            "Label does not start with an alphabetic character or with $. Removed non compliant characters."
-        )
+        logger.warning("Label does not start with an alphabetic character or with $. Removed non compliant characters.")
     return "".join(matches).strip()