biocypher 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biocypher might be problematic. Click here for more details.
- biocypher/__init__.py +3 -13
- biocypher/_config/__init__.py +6 -23
- biocypher/_core.py +360 -262
- biocypher/_create.py +13 -27
- biocypher/_deduplicate.py +4 -11
- biocypher/_get.py +21 -60
- biocypher/_logger.py +4 -16
- biocypher/_mapping.py +4 -17
- biocypher/_metadata.py +3 -15
- biocypher/_misc.py +14 -28
- biocypher/_ontology.py +127 -212
- biocypher/_translate.py +34 -58
- biocypher/output/connect/_get_connector.py +40 -0
- biocypher/output/connect/_neo4j_driver.py +9 -65
- biocypher/output/in_memory/_get_in_memory_kg.py +34 -0
- biocypher/output/in_memory/_in_memory_kg.py +40 -0
- biocypher/output/in_memory/_networkx.py +44 -0
- biocypher/output/in_memory/_pandas.py +20 -15
- biocypher/output/write/_batch_writer.py +132 -177
- biocypher/output/write/_get_writer.py +11 -24
- biocypher/output/write/_writer.py +14 -33
- biocypher/output/write/graph/_arangodb.py +7 -24
- biocypher/output/write/graph/_neo4j.py +51 -56
- biocypher/output/write/graph/_networkx.py +36 -43
- biocypher/output/write/graph/_rdf.py +107 -95
- biocypher/output/write/relational/_csv.py +6 -11
- biocypher/output/write/relational/_postgresql.py +5 -13
- biocypher/output/write/relational/_sqlite.py +3 -1
- {biocypher-0.6.2.dist-info → biocypher-0.7.0.dist-info}/LICENSE +1 -1
- {biocypher-0.6.2.dist-info → biocypher-0.7.0.dist-info}/METADATA +3 -3
- biocypher-0.7.0.dist-info/RECORD +43 -0
- {biocypher-0.6.2.dist-info → biocypher-0.7.0.dist-info}/WHEEL +1 -1
- biocypher-0.6.2.dist-info/RECORD +0 -39
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
|
|
2
|
-
from types import GeneratorType
|
|
3
|
-
from typing import Union, Optional
|
|
4
|
-
from collections import OrderedDict, defaultdict
|
|
1
|
+
import glob
|
|
5
2
|
import os
|
|
6
3
|
import re
|
|
7
|
-
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from collections import OrderedDict, defaultdict
|
|
7
|
+
from types import GeneratorType
|
|
8
8
|
|
|
9
9
|
from more_itertools import peekable
|
|
10
10
|
|
|
11
11
|
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
12
|
+
from biocypher._deduplicate import Deduplicator
|
|
12
13
|
from biocypher._logger import logger
|
|
13
14
|
from biocypher._translate import Translator
|
|
14
|
-
from biocypher._deduplicate import Deduplicator
|
|
15
15
|
from biocypher.output.write._writer import _Writer
|
|
16
16
|
|
|
17
17
|
|
|
@@ -20,96 +20,88 @@ class _BatchWriter(_Writer, ABC):
|
|
|
20
20
|
|
|
21
21
|
@abstractmethod
|
|
22
22
|
def _quote_string(self, value: str) -> str:
|
|
23
|
-
"""
|
|
24
|
-
Abstract method to quote a string. Escaping is handled by the database-specific writer.
|
|
25
|
-
"""
|
|
26
|
-
|
|
23
|
+
"""Abstract method to quote a string. Escaping is handled by the database-specific writer."""
|
|
27
24
|
raise NotImplementedError(
|
|
28
|
-
"Database writer must override '_quote_string'"
|
|
25
|
+
"Database writer must override '_quote_string'",
|
|
29
26
|
)
|
|
30
27
|
|
|
31
28
|
@abstractmethod
|
|
32
29
|
def _get_default_import_call_bin_prefix(self):
|
|
33
|
-
"""
|
|
34
|
-
Abstract method to provide the default string for the import call bin prefix.
|
|
30
|
+
"""Abstract method to provide the default string for the import call bin prefix.
|
|
35
31
|
|
|
36
|
-
Returns
|
|
32
|
+
Returns
|
|
33
|
+
-------
|
|
37
34
|
str: The database-specific string for the path to the import call bin prefix
|
|
35
|
+
|
|
38
36
|
"""
|
|
39
|
-
raise NotImplementedError(
|
|
40
|
-
"Database writer must override '_get_default_import_call_bin_prefix'"
|
|
41
|
-
)
|
|
37
|
+
raise NotImplementedError("Database writer must override '_get_default_import_call_bin_prefix'")
|
|
42
38
|
|
|
43
39
|
@abstractmethod
|
|
44
40
|
def _write_array_string(self, string_list):
|
|
45
|
-
"""
|
|
46
|
-
Abstract method to write the string representation of an array into a .csv file.
|
|
41
|
+
"""Abstract method to write the string representation of an array into a .csv file.
|
|
47
42
|
Different databases require different formats of array to optimize import speed.
|
|
48
43
|
|
|
49
44
|
Args:
|
|
45
|
+
----
|
|
50
46
|
string_list (list): list of ontology strings
|
|
51
47
|
|
|
52
48
|
Returns:
|
|
49
|
+
-------
|
|
53
50
|
str: The database-specific string representation of an array
|
|
51
|
+
|
|
54
52
|
"""
|
|
55
|
-
raise NotImplementedError(
|
|
56
|
-
"Database writer must override '_write_array_string'"
|
|
57
|
-
)
|
|
53
|
+
raise NotImplementedError("Database writer must override '_write_array_string'")
|
|
58
54
|
|
|
59
55
|
@abstractmethod
|
|
60
56
|
def _write_node_headers(self):
|
|
61
|
-
"""
|
|
62
|
-
Abstract method that takes care of importing properties of a graph entity that is represented
|
|
57
|
+
"""Abstract method that takes care of importing properties of a graph entity that is represented
|
|
63
58
|
as a node as per the definition in the `schema_config.yaml`
|
|
64
59
|
|
|
65
|
-
Returns
|
|
60
|
+
Returns
|
|
61
|
+
-------
|
|
66
62
|
bool: The return value. True for success, False otherwise.
|
|
63
|
+
|
|
67
64
|
"""
|
|
68
|
-
raise NotImplementedError(
|
|
69
|
-
"Database writer must override '_write_node_headers'"
|
|
70
|
-
)
|
|
65
|
+
raise NotImplementedError("Database writer must override '_write_node_headers'")
|
|
71
66
|
|
|
72
67
|
@abstractmethod
|
|
73
68
|
def _write_edge_headers(self):
|
|
74
|
-
"""
|
|
75
|
-
Abstract method to write a database import-file for a graph entity that is represented
|
|
69
|
+
"""Abstract method to write a database import-file for a graph entity that is represented
|
|
76
70
|
as an edge as per the definition in the `schema_config.yaml`,
|
|
77
71
|
containing only the header for this type of edge.
|
|
78
72
|
|
|
79
|
-
Returns
|
|
73
|
+
Returns
|
|
74
|
+
-------
|
|
80
75
|
bool: The return value. True for success, False otherwise.
|
|
76
|
+
|
|
81
77
|
"""
|
|
82
|
-
raise NotImplementedError(
|
|
83
|
-
"Database writer must override '_write_edge_headers'"
|
|
84
|
-
)
|
|
78
|
+
raise NotImplementedError("Database writer must override '_write_edge_headers'")
|
|
85
79
|
|
|
86
80
|
@abstractmethod
|
|
87
81
|
def _construct_import_call(self) -> str:
|
|
88
|
-
"""
|
|
89
|
-
Function to construct the import call detailing folder and
|
|
82
|
+
"""Function to construct the import call detailing folder and
|
|
90
83
|
individual node and edge headers and data files, as well as
|
|
91
84
|
delimiters and database name. Built after all data has been
|
|
92
85
|
processed to ensure that nodes are called before any edges.
|
|
93
86
|
|
|
94
|
-
Returns
|
|
87
|
+
Returns
|
|
88
|
+
-------
|
|
95
89
|
str: A bash command for csv import.
|
|
90
|
+
|
|
96
91
|
"""
|
|
97
|
-
raise NotImplementedError(
|
|
98
|
-
"Database writer must override '_construct_import_call'"
|
|
99
|
-
)
|
|
92
|
+
raise NotImplementedError("Database writer must override '_construct_import_call'")
|
|
100
93
|
|
|
101
94
|
@abstractmethod
|
|
102
95
|
def _get_import_script_name(self) -> str:
|
|
103
|
-
"""
|
|
104
|
-
Returns the name of the import script.
|
|
96
|
+
"""Returns the name of the import script.
|
|
105
97
|
The name will be chosen based on the used database.
|
|
106
98
|
|
|
107
|
-
Returns
|
|
99
|
+
Returns
|
|
100
|
+
-------
|
|
108
101
|
str: The name of the import script (ending in .sh)
|
|
102
|
+
|
|
109
103
|
"""
|
|
110
|
-
raise NotImplementedError(
|
|
111
|
-
"Database writer must override '_get_import_script_name'"
|
|
112
|
-
)
|
|
104
|
+
raise NotImplementedError("Database writer must override '_get_import_script_name'")
|
|
113
105
|
|
|
114
106
|
def __init__(
|
|
115
107
|
self,
|
|
@@ -118,10 +110,10 @@ class _BatchWriter(_Writer, ABC):
|
|
|
118
110
|
delimiter: str,
|
|
119
111
|
array_delimiter: str = ",",
|
|
120
112
|
quote: str = '"',
|
|
121
|
-
output_directory:
|
|
113
|
+
output_directory: str | None = None,
|
|
122
114
|
db_name: str = "neo4j",
|
|
123
|
-
import_call_bin_prefix:
|
|
124
|
-
import_call_file_prefix:
|
|
115
|
+
import_call_bin_prefix: str | None = None,
|
|
116
|
+
import_call_file_prefix: str | None = None,
|
|
125
117
|
wipe: bool = True,
|
|
126
118
|
strict_mode: bool = False,
|
|
127
119
|
skip_bad_relationships: bool = False,
|
|
@@ -133,9 +125,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
133
125
|
rdf_format: str = None,
|
|
134
126
|
rdf_namespaces: dict = {},
|
|
135
127
|
):
|
|
136
|
-
"""
|
|
137
|
-
|
|
138
|
-
Abtract parent class for writing node and edge representations to disk
|
|
128
|
+
"""Abtract parent class for writing node and edge representations to disk
|
|
139
129
|
using the format specified by each database type. The database-specific
|
|
140
130
|
functions are implemented by the respective child-classes. This abstract
|
|
141
131
|
class contains all methods expected by a bach writer instance, some of
|
|
@@ -156,6 +146,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
156
146
|
- _get_import_script_name
|
|
157
147
|
|
|
158
148
|
Args:
|
|
149
|
+
----
|
|
159
150
|
translator:
|
|
160
151
|
Instance of :py:class:`Translator` to enable translation of
|
|
161
152
|
nodes and manipulation of properties.
|
|
@@ -217,6 +208,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
217
208
|
|
|
218
209
|
rdf_namespaces:
|
|
219
210
|
The namespaces for RDF.
|
|
211
|
+
|
|
220
212
|
"""
|
|
221
213
|
super().__init__(
|
|
222
214
|
translator=translator,
|
|
@@ -233,17 +225,13 @@ class _BatchWriter(_Writer, ABC):
|
|
|
233
225
|
self.rdf_namespaces = rdf_namespaces
|
|
234
226
|
|
|
235
227
|
self.delim, self.escaped_delim = self._process_delimiter(delimiter)
|
|
236
|
-
self.adelim, self.escaped_adelim = self._process_delimiter(
|
|
237
|
-
array_delimiter
|
|
238
|
-
)
|
|
228
|
+
self.adelim, self.escaped_adelim = self._process_delimiter(array_delimiter)
|
|
239
229
|
self.quote = quote
|
|
240
230
|
self.skip_bad_relationships = skip_bad_relationships
|
|
241
231
|
self.skip_duplicate_nodes = skip_duplicate_nodes
|
|
242
232
|
|
|
243
233
|
if import_call_bin_prefix is None:
|
|
244
|
-
self.import_call_bin_prefix = (
|
|
245
|
-
self._get_default_import_call_bin_prefix()
|
|
246
|
-
)
|
|
234
|
+
self.import_call_bin_prefix = self._get_default_import_call_bin_prefix()
|
|
247
235
|
else:
|
|
248
236
|
self.import_call_bin_prefix = import_call_bin_prefix
|
|
249
237
|
|
|
@@ -268,34 +256,27 @@ class _BatchWriter(_Writer, ABC):
|
|
|
268
256
|
|
|
269
257
|
@property
|
|
270
258
|
def import_call_file_prefix(self):
|
|
271
|
-
"""
|
|
272
|
-
Property for output directory path.
|
|
273
|
-
"""
|
|
274
|
-
|
|
259
|
+
"""Property for output directory path."""
|
|
275
260
|
if self._import_call_file_prefix is None:
|
|
276
261
|
return self.outdir
|
|
277
262
|
else:
|
|
278
263
|
return self._import_call_file_prefix
|
|
279
264
|
|
|
280
265
|
def _process_delimiter(self, delimiter: str) -> str:
|
|
281
|
-
"""
|
|
282
|
-
Return escaped characters in case of receiving their string
|
|
266
|
+
"""Return escaped characters in case of receiving their string
|
|
283
267
|
representation (e.g. tab for '\t').
|
|
284
268
|
"""
|
|
285
|
-
|
|
286
269
|
if delimiter == "\\t":
|
|
287
270
|
return "\t", "\\t"
|
|
288
271
|
|
|
289
272
|
else:
|
|
290
273
|
return delimiter, delimiter
|
|
291
274
|
|
|
292
|
-
def write_nodes(
|
|
293
|
-
|
|
294
|
-
):
|
|
295
|
-
"""
|
|
296
|
-
Wrapper for writing nodes and their headers.
|
|
275
|
+
def write_nodes(self, nodes, batch_size: int = int(1e6), force: bool = False):
|
|
276
|
+
"""Wrapper for writing nodes and their headers.
|
|
297
277
|
|
|
298
278
|
Args:
|
|
279
|
+
----
|
|
299
280
|
nodes (BioCypherNode): a list or generator of nodes in
|
|
300
281
|
:py:class:`BioCypherNode` format
|
|
301
282
|
|
|
@@ -306,7 +287,9 @@ class _BatchWriter(_Writer, ABC):
|
|
|
306
287
|
|
|
307
288
|
|
|
308
289
|
Returns:
|
|
290
|
+
-------
|
|
309
291
|
bool: The return value. True for success, False otherwise.
|
|
292
|
+
|
|
310
293
|
"""
|
|
311
294
|
# TODO check represented_as
|
|
312
295
|
|
|
@@ -325,19 +308,21 @@ class _BatchWriter(_Writer, ABC):
|
|
|
325
308
|
|
|
326
309
|
def write_edges(
|
|
327
310
|
self,
|
|
328
|
-
edges:
|
|
311
|
+
edges: list | GeneratorType,
|
|
329
312
|
batch_size: int = int(1e6),
|
|
330
313
|
) -> bool:
|
|
331
|
-
"""
|
|
332
|
-
Wrapper for writing edges and their headers.
|
|
314
|
+
"""Wrapper for writing edges and their headers.
|
|
333
315
|
|
|
334
316
|
Args:
|
|
317
|
+
----
|
|
335
318
|
edges (BioCypherEdge): a list or generator of edges in
|
|
336
319
|
:py:class:`BioCypherEdge` or :py:class:`BioCypherRelAsNode`
|
|
337
320
|
format
|
|
338
321
|
|
|
339
322
|
Returns:
|
|
323
|
+
-------
|
|
340
324
|
bool: The return value. True for success, False otherwise.
|
|
325
|
+
|
|
341
326
|
"""
|
|
342
327
|
passed = False
|
|
343
328
|
edges = list(edges) # force evaluation to handle empty generator
|
|
@@ -375,7 +360,6 @@ class _BatchWriter(_Writer, ABC):
|
|
|
375
360
|
logger.debug(
|
|
376
361
|
"No edges to write, possibly due to no matched Biolink classes.",
|
|
377
362
|
)
|
|
378
|
-
pass
|
|
379
363
|
|
|
380
364
|
if not passed:
|
|
381
365
|
logger.error("Error while writing edge data.")
|
|
@@ -389,8 +373,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
389
373
|
return True
|
|
390
374
|
|
|
391
375
|
def _write_node_data(self, nodes, batch_size, force: bool = False):
|
|
392
|
-
"""
|
|
393
|
-
Writes biocypher nodes to CSV conforming to the headers created
|
|
376
|
+
"""Writes biocypher nodes to CSV conforming to the headers created
|
|
394
377
|
with `_write_node_headers()`, and is actually required to be run
|
|
395
378
|
before calling `_write_node_headers()` to set the
|
|
396
379
|
:py:attr:`self.node_property_dict` for passing the node properties
|
|
@@ -398,14 +381,16 @@ class _BatchWriter(_Writer, ABC):
|
|
|
398
381
|
:py:class:`BioCypherNode` class.
|
|
399
382
|
|
|
400
383
|
Args:
|
|
384
|
+
----
|
|
401
385
|
nodes (BioCypherNode): a list or generator of nodes in
|
|
402
386
|
:py:class:`BioCypherNode` format
|
|
403
387
|
|
|
404
388
|
Returns:
|
|
389
|
+
-------
|
|
405
390
|
bool: The return value. True for success, False otherwise.
|
|
406
|
-
"""
|
|
407
391
|
|
|
408
|
-
|
|
392
|
+
"""
|
|
393
|
+
if isinstance(nodes, GeneratorType | peekable):
|
|
409
394
|
logger.debug("Writing node CSV from generator.")
|
|
410
395
|
|
|
411
396
|
bins = defaultdict(list) # dict to store a list for each
|
|
@@ -432,20 +417,15 @@ class _BatchWriter(_Writer, ABC):
|
|
|
432
417
|
logger.warning(f"Node {label} has no id; skipping.")
|
|
433
418
|
continue
|
|
434
419
|
|
|
435
|
-
if not
|
|
420
|
+
if label not in bins.keys():
|
|
436
421
|
# start new list
|
|
437
422
|
all_labels = None
|
|
438
423
|
bins[label].append(node)
|
|
439
424
|
bin_l[label] = 1
|
|
440
425
|
|
|
441
426
|
# get properties from config if present
|
|
442
|
-
if
|
|
443
|
-
label
|
|
444
|
-
in self.translator.ontology.mapping.extended_schema
|
|
445
|
-
):
|
|
446
|
-
cprops = self.translator.ontology.mapping.extended_schema.get(
|
|
447
|
-
label
|
|
448
|
-
).get(
|
|
427
|
+
if label in self.translator.ontology.mapping.extended_schema:
|
|
428
|
+
cprops = self.translator.ontology.mapping.extended_schema.get(label).get(
|
|
449
429
|
"properties",
|
|
450
430
|
)
|
|
451
431
|
else:
|
|
@@ -483,18 +463,13 @@ class _BatchWriter(_Writer, ABC):
|
|
|
483
463
|
# get label hierarchy
|
|
484
464
|
# multiple labels:
|
|
485
465
|
if not force:
|
|
486
|
-
all_labels = self.translator.ontology.get_ancestors(
|
|
487
|
-
label
|
|
488
|
-
)
|
|
466
|
+
all_labels = self.translator.ontology.get_ancestors(label)
|
|
489
467
|
else:
|
|
490
468
|
all_labels = None
|
|
491
469
|
|
|
492
470
|
if all_labels:
|
|
493
471
|
# convert to pascal case
|
|
494
|
-
all_labels = [
|
|
495
|
-
self.translator.name_sentence_to_pascal(label)
|
|
496
|
-
for label in all_labels
|
|
497
|
-
]
|
|
472
|
+
all_labels = [self.translator.name_sentence_to_pascal(label) for label in all_labels]
|
|
498
473
|
# remove duplicates
|
|
499
474
|
all_labels = list(OrderedDict.fromkeys(all_labels))
|
|
500
475
|
# order alphabetically
|
|
@@ -502,9 +477,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
502
477
|
# concatenate with array delimiter
|
|
503
478
|
all_labels = self._write_array_string(all_labels)
|
|
504
479
|
else:
|
|
505
|
-
all_labels = self.translator.name_sentence_to_pascal(
|
|
506
|
-
label
|
|
507
|
-
)
|
|
480
|
+
all_labels = self.translator.name_sentence_to_pascal(label)
|
|
508
481
|
|
|
509
482
|
labels[label] = all_labels
|
|
510
483
|
|
|
@@ -549,16 +522,15 @@ class _BatchWriter(_Writer, ABC):
|
|
|
549
522
|
self.node_property_dict[label] = reference_props[label]
|
|
550
523
|
|
|
551
524
|
return True
|
|
525
|
+
elif not isinstance(nodes, list):
|
|
526
|
+
logger.error("Nodes must be passed as list or generator.")
|
|
527
|
+
return False
|
|
552
528
|
else:
|
|
553
|
-
if type(nodes) is not list:
|
|
554
|
-
logger.error("Nodes must be passed as list or generator.")
|
|
555
|
-
return False
|
|
556
|
-
else:
|
|
557
529
|
|
|
558
|
-
|
|
559
|
-
|
|
530
|
+
def gen(nodes):
|
|
531
|
+
yield from nodes
|
|
560
532
|
|
|
561
|
-
|
|
533
|
+
return self._write_node_data(gen(nodes), batch_size=batch_size)
|
|
562
534
|
|
|
563
535
|
def _write_single_node_list_to_file(
|
|
564
536
|
self,
|
|
@@ -567,11 +539,11 @@ class _BatchWriter(_Writer, ABC):
|
|
|
567
539
|
prop_dict: dict,
|
|
568
540
|
labels: str,
|
|
569
541
|
):
|
|
570
|
-
"""
|
|
571
|
-
This function takes one list of biocypher nodes and writes them
|
|
542
|
+
"""This function takes one list of biocypher nodes and writes them
|
|
572
543
|
to a Neo4j admin import compatible CSV file.
|
|
573
544
|
|
|
574
545
|
Args:
|
|
546
|
+
----
|
|
575
547
|
node_list (list): list of BioCypherNodes to be written
|
|
576
548
|
label (str): the primary label of the node
|
|
577
549
|
prop_dict (dict): properties of node class passed from parsing
|
|
@@ -580,7 +552,9 @@ class _BatchWriter(_Writer, ABC):
|
|
|
580
552
|
for the node class
|
|
581
553
|
|
|
582
554
|
Returns:
|
|
555
|
+
-------
|
|
583
556
|
bool: The return value. True for success, False otherwise.
|
|
557
|
+
|
|
584
558
|
"""
|
|
585
559
|
if not all(isinstance(n, BioCypherNode) for n in node_list):
|
|
586
560
|
logger.error("Nodes must be passed as type BioCypherNode.")
|
|
@@ -598,7 +572,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
598
572
|
ref_props = list(prop_dict.keys())
|
|
599
573
|
|
|
600
574
|
# compare lists order invariant
|
|
601
|
-
if
|
|
575
|
+
if set(ref_props) != set(n_keys):
|
|
602
576
|
onode = n.get_id()
|
|
603
577
|
oprop1 = set(ref_props).difference(n_keys)
|
|
604
578
|
oprop2 = set(n_keys).difference(ref_props)
|
|
@@ -632,11 +606,10 @@ class _BatchWriter(_Writer, ABC):
|
|
|
632
606
|
"boolean",
|
|
633
607
|
]:
|
|
634
608
|
plist.append(str(p))
|
|
609
|
+
elif isinstance(p, list):
|
|
610
|
+
plist.append(self._write_array_string(p))
|
|
635
611
|
else:
|
|
636
|
-
|
|
637
|
-
plist.append(self._write_array_string(p))
|
|
638
|
-
else:
|
|
639
|
-
plist.append(self._quote_string(str(p)))
|
|
612
|
+
plist.append(f"{self.quote}{p!s}{self.quote}")
|
|
640
613
|
|
|
641
614
|
line.append(self.delim.join(plist))
|
|
642
615
|
line.append(labels)
|
|
@@ -650,8 +623,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
650
623
|
return True
|
|
651
624
|
|
|
652
625
|
def _write_edge_data(self, edges, batch_size):
|
|
653
|
-
"""
|
|
654
|
-
Writes biocypher edges to CSV conforming to the headers created
|
|
626
|
+
"""Writes biocypher edges to CSV conforming to the headers created
|
|
655
627
|
with `_write_edge_headers()`, and is actually required to be run
|
|
656
628
|
before calling `_write_node_headers()` to set the
|
|
657
629
|
:py:attr:`self.edge_property_dict` for passing the edge
|
|
@@ -659,17 +631,20 @@ class _BatchWriter(_Writer, ABC):
|
|
|
659
631
|
from the :py:class:`BioCypherEdge` class.
|
|
660
632
|
|
|
661
633
|
Args:
|
|
634
|
+
----
|
|
662
635
|
edges (BioCypherEdge): a list or generator of edges in
|
|
663
636
|
:py:class:`BioCypherEdge` format
|
|
664
637
|
|
|
665
638
|
Returns:
|
|
639
|
+
-------
|
|
666
640
|
bool: The return value. True for success, False otherwise.
|
|
667
641
|
|
|
668
642
|
Todo:
|
|
643
|
+
----
|
|
669
644
|
- currently works for mixed edges but in practice often is
|
|
670
645
|
called on one iterable containing one type of edge only
|
|
671
|
-
"""
|
|
672
646
|
|
|
647
|
+
"""
|
|
673
648
|
if isinstance(edges, GeneratorType):
|
|
674
649
|
logger.debug("Writing edge CSV from generator.")
|
|
675
650
|
|
|
@@ -685,14 +660,13 @@ class _BatchWriter(_Writer, ABC):
|
|
|
685
660
|
for edge in edges:
|
|
686
661
|
if not (edge.get_source_id() and edge.get_target_id()):
|
|
687
662
|
logger.error(
|
|
688
|
-
"Edge must have source and target node. "
|
|
689
|
-
f"Caused by: {edge}",
|
|
663
|
+
f"Edge must have source and target node. Caused by: {edge}",
|
|
690
664
|
)
|
|
691
665
|
continue
|
|
692
666
|
|
|
693
667
|
label = edge.get_label()
|
|
694
668
|
|
|
695
|
-
if not
|
|
669
|
+
if label not in bins.keys():
|
|
696
670
|
# start new list
|
|
697
671
|
bins[label].append(edge)
|
|
698
672
|
bin_l[label] = 1
|
|
@@ -703,13 +677,8 @@ class _BatchWriter(_Writer, ABC):
|
|
|
703
677
|
# (may not be if it is an edge that carries the
|
|
704
678
|
# "label_as_edge" property)
|
|
705
679
|
cprops = None
|
|
706
|
-
if
|
|
707
|
-
label
|
|
708
|
-
in self.translator.ontology.mapping.extended_schema
|
|
709
|
-
):
|
|
710
|
-
cprops = self.translator.ontology.mapping.extended_schema.get(
|
|
711
|
-
label
|
|
712
|
-
).get(
|
|
680
|
+
if label in self.translator.ontology.mapping.extended_schema:
|
|
681
|
+
cprops = self.translator.ontology.mapping.extended_schema.get(label).get(
|
|
713
682
|
"properties",
|
|
714
683
|
)
|
|
715
684
|
else:
|
|
@@ -717,9 +686,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
717
686
|
for (
|
|
718
687
|
k,
|
|
719
688
|
v,
|
|
720
|
-
) in (
|
|
721
|
-
self.translator.ontology.mapping.extended_schema.items()
|
|
722
|
-
):
|
|
689
|
+
) in self.translator.ontology.mapping.extended_schema.items():
|
|
723
690
|
if isinstance(v, dict):
|
|
724
691
|
if v.get("label_as_edge") == label:
|
|
725
692
|
cprops = v.get("properties")
|
|
@@ -789,16 +756,15 @@ class _BatchWriter(_Writer, ABC):
|
|
|
789
756
|
self.edge_property_dict[label] = reference_props[label]
|
|
790
757
|
|
|
791
758
|
return True
|
|
759
|
+
elif not isinstance(edges, list):
|
|
760
|
+
logger.error("Edges must be passed as list or generator.")
|
|
761
|
+
return False
|
|
792
762
|
else:
|
|
793
|
-
if type(edges) is not list:
|
|
794
|
-
logger.error("Edges must be passed as list or generator.")
|
|
795
|
-
return False
|
|
796
|
-
else:
|
|
797
763
|
|
|
798
|
-
|
|
799
|
-
|
|
764
|
+
def gen(edges):
|
|
765
|
+
yield from edges
|
|
800
766
|
|
|
801
|
-
|
|
767
|
+
return self._write_edge_data(gen(edges), batch_size=batch_size)
|
|
802
768
|
|
|
803
769
|
def _write_single_edge_list_to_file(
|
|
804
770
|
self,
|
|
@@ -806,11 +772,11 @@ class _BatchWriter(_Writer, ABC):
|
|
|
806
772
|
label: str,
|
|
807
773
|
prop_dict: dict,
|
|
808
774
|
):
|
|
809
|
-
"""
|
|
810
|
-
This function takes one list of biocypher edges and writes them
|
|
775
|
+
"""This function takes one list of biocypher edges and writes them
|
|
811
776
|
to a Neo4j admin import compatible CSV file.
|
|
812
777
|
|
|
813
778
|
Args:
|
|
779
|
+
----
|
|
814
780
|
edge_list (list): list of BioCypherEdges to be written
|
|
815
781
|
|
|
816
782
|
label (str): the label (type) of the edge
|
|
@@ -819,9 +785,10 @@ class _BatchWriter(_Writer, ABC):
|
|
|
819
785
|
function and their types
|
|
820
786
|
|
|
821
787
|
Returns:
|
|
788
|
+
-------
|
|
822
789
|
bool: The return value. True for success, False otherwise.
|
|
823
|
-
"""
|
|
824
790
|
|
|
791
|
+
"""
|
|
825
792
|
if not all(isinstance(n, BioCypherEdge) for n in edge_list):
|
|
826
793
|
logger.error("Edges must be passed as type BioCypherEdge.")
|
|
827
794
|
return False
|
|
@@ -836,7 +803,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
836
803
|
ref_props = list(prop_dict.keys())
|
|
837
804
|
|
|
838
805
|
# compare list order invariant
|
|
839
|
-
if
|
|
806
|
+
if set(ref_props) != set(e_keys):
|
|
840
807
|
oedge = f"{e.get_source_id()}-{e.get_target_id()}"
|
|
841
808
|
oprop1 = set(ref_props).difference(e_keys)
|
|
842
809
|
oprop2 = set(e_keys).difference(ref_props)
|
|
@@ -867,11 +834,10 @@ class _BatchWriter(_Writer, ABC):
|
|
|
867
834
|
"boolean",
|
|
868
835
|
]:
|
|
869
836
|
plist.append(str(p))
|
|
837
|
+
elif isinstance(p, list):
|
|
838
|
+
plist.append(self._write_array_string(p))
|
|
870
839
|
else:
|
|
871
|
-
|
|
872
|
-
plist.append(self._write_array_string(p))
|
|
873
|
-
else:
|
|
874
|
-
plist.append(self._quote_string(str(p)))
|
|
840
|
+
plist.append(self.quote + str(p) + self.quote)
|
|
875
841
|
|
|
876
842
|
entries = [e.get_source_id()]
|
|
877
843
|
|
|
@@ -880,9 +846,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
880
846
|
|
|
881
847
|
if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
|
|
882
848
|
skip_id = True
|
|
883
|
-
elif not self.translator.ontology.mapping.extended_schema.get(
|
|
884
|
-
label
|
|
885
|
-
):
|
|
849
|
+
elif not self.translator.ontology.mapping.extended_schema.get(label):
|
|
886
850
|
# find label in schema by label_as_edge
|
|
887
851
|
for (
|
|
888
852
|
k,
|
|
@@ -897,9 +861,9 @@ class _BatchWriter(_Writer, ABC):
|
|
|
897
861
|
if schema_label:
|
|
898
862
|
if (
|
|
899
863
|
self.translator.ontology.mapping.extended_schema.get(
|
|
900
|
-
schema_label
|
|
864
|
+
schema_label,
|
|
901
865
|
).get("use_id")
|
|
902
|
-
== False
|
|
866
|
+
== False # noqa: E712 (seems to not work with 'not')
|
|
903
867
|
):
|
|
904
868
|
skip_id = True
|
|
905
869
|
|
|
@@ -913,7 +877,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
913
877
|
entries.append(
|
|
914
878
|
self.translator.name_sentence_to_pascal(
|
|
915
879
|
e.get_label(),
|
|
916
|
-
)
|
|
880
|
+
),
|
|
917
881
|
)
|
|
918
882
|
|
|
919
883
|
lines.append(
|
|
@@ -927,10 +891,10 @@ class _BatchWriter(_Writer, ABC):
|
|
|
927
891
|
return True
|
|
928
892
|
|
|
929
893
|
def _write_next_part(self, label: str, lines: list):
|
|
930
|
-
"""
|
|
931
|
-
This function writes a list of strings to a new part file.
|
|
894
|
+
"""This function writes a list of strings to a new part file.
|
|
932
895
|
|
|
933
896
|
Args:
|
|
897
|
+
----
|
|
934
898
|
label (str): the label (type) of the edge; internal
|
|
935
899
|
representation sentence case -> needs to become PascalCase
|
|
936
900
|
for disk representation
|
|
@@ -938,17 +902,15 @@ class _BatchWriter(_Writer, ABC):
|
|
|
938
902
|
lines (list): list of strings to be written
|
|
939
903
|
|
|
940
904
|
Returns:
|
|
905
|
+
-------
|
|
941
906
|
bool: The return value. True for success, False otherwise.
|
|
907
|
+
|
|
942
908
|
"""
|
|
943
909
|
# translate label to PascalCase
|
|
944
|
-
label_pascal = self.translator.name_sentence_to_pascal(
|
|
945
|
-
parse_label(label)
|
|
946
|
-
)
|
|
910
|
+
label_pascal = self.translator.name_sentence_to_pascal(parse_label(label))
|
|
947
911
|
|
|
948
912
|
# list files in self.outdir
|
|
949
|
-
files = glob.glob(
|
|
950
|
-
os.path.join(self.outdir, f"{label_pascal}-part*.csv")
|
|
951
|
-
)
|
|
913
|
+
files = glob.glob(os.path.join(self.outdir, f"{label_pascal}-part*.csv"))
|
|
952
914
|
# find file with highest part number
|
|
953
915
|
if not files:
|
|
954
916
|
next_part = 0
|
|
@@ -956,10 +918,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
956
918
|
else:
|
|
957
919
|
next_part = (
|
|
958
920
|
max(
|
|
959
|
-
[
|
|
960
|
-
int(f.split(".")[-2].split("-")[-1].replace("part", ""))
|
|
961
|
-
for f in files
|
|
962
|
-
],
|
|
921
|
+
[int(f.split(".")[-2].split("-")[-1].replace("part", "")) for f in files],
|
|
963
922
|
)
|
|
964
923
|
+ 1
|
|
965
924
|
)
|
|
@@ -984,31 +943,29 @@ class _BatchWriter(_Writer, ABC):
|
|
|
984
943
|
self.parts[label].append(part)
|
|
985
944
|
|
|
986
945
|
def get_import_call(self) -> str:
|
|
987
|
-
"""
|
|
988
|
-
Function to return the import call detailing folder and
|
|
946
|
+
"""Function to return the import call detailing folder and
|
|
989
947
|
individual node and edge headers and data files, as well as
|
|
990
948
|
delimiters and database name.
|
|
991
949
|
|
|
992
|
-
Returns
|
|
950
|
+
Returns
|
|
951
|
+
-------
|
|
993
952
|
str: a bash command for the database import
|
|
994
|
-
"""
|
|
995
953
|
|
|
954
|
+
"""
|
|
996
955
|
return self._construct_import_call()
|
|
997
956
|
|
|
998
957
|
def write_import_call(self) -> str:
|
|
999
|
-
"""
|
|
1000
|
-
Function to write the import call detailing folder and
|
|
958
|
+
"""Function to write the import call detailing folder and
|
|
1001
959
|
individual node and edge headers and data files, as well as
|
|
1002
960
|
delimiters and database name, to the export folder as txt.
|
|
1003
961
|
|
|
1004
|
-
Returns
|
|
962
|
+
Returns
|
|
963
|
+
-------
|
|
1005
964
|
str: The path of the file holding the import call.
|
|
1006
|
-
"""
|
|
1007
965
|
|
|
966
|
+
"""
|
|
1008
967
|
file_path = os.path.join(self.outdir, self._get_import_script_name())
|
|
1009
|
-
logger.info(
|
|
1010
|
-
f"Writing {self.db_name + ' ' if self.db_name else ''}import call to `{file_path}`."
|
|
1011
|
-
)
|
|
968
|
+
logger.info(f"Writing {self.db_name + ' ' if self.db_name else ''}import call to `{file_path}`.")
|
|
1012
969
|
|
|
1013
970
|
with open(file_path, "w", encoding="utf-8") as f:
|
|
1014
971
|
f.write(self._construct_import_call())
|
|
@@ -1017,16 +974,16 @@ class _BatchWriter(_Writer, ABC):
|
|
|
1017
974
|
|
|
1018
975
|
|
|
1019
976
|
def parse_label(label: str) -> str:
|
|
1020
|
-
"""
|
|
1021
|
-
|
|
1022
|
-
Check if the label is compliant with Neo4j naming conventions,
|
|
977
|
+
"""Check if the label is compliant with Neo4j naming conventions,
|
|
1023
978
|
https://neo4j.com/docs/cypher-manual/current/syntax/naming/, and if not,
|
|
1024
979
|
remove non-compliant characters.
|
|
1025
980
|
|
|
1026
981
|
Args:
|
|
982
|
+
----
|
|
1027
983
|
label (str): The label to check
|
|
1028
984
|
Returns:
|
|
1029
985
|
str: The compliant label
|
|
986
|
+
|
|
1030
987
|
"""
|
|
1031
988
|
# Check if the name contains only alphanumeric characters, underscore, or dollar sign
|
|
1032
989
|
# and dot (for class hierarchy of BioCypher)
|
|
@@ -1036,7 +993,7 @@ def parse_label(label: str) -> str:
|
|
|
1036
993
|
if non_matches:
|
|
1037
994
|
non_matches = list(set(non_matches))
|
|
1038
995
|
logger.warning(
|
|
1039
|
-
f"Label is not compliant with Neo4j naming rules. Removed non compliant characters: {non_matches}"
|
|
996
|
+
f"Label is not compliant with Neo4j naming rules. Removed non compliant characters: {non_matches}",
|
|
1040
997
|
)
|
|
1041
998
|
|
|
1042
999
|
def first_character_compliant(character: str) -> bool:
|
|
@@ -1047,7 +1004,5 @@ def parse_label(label: str) -> str:
|
|
|
1047
1004
|
if first_character_compliant(c):
|
|
1048
1005
|
matches = matches[matches.index(c) :]
|
|
1049
1006
|
break
|
|
1050
|
-
logger.warning(
|
|
1051
|
-
"Label does not start with an alphabetic character or with $. Removed non compliant characters."
|
|
1052
|
-
)
|
|
1007
|
+
logger.warning("Label does not start with an alphabetic character or with $. Removed non compliant characters.")
|
|
1053
1008
|
return "".join(matches).strip()
|