biocypher 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biocypher might be problematic. Click here for more details.
- biocypher/__init__.py +3 -13
- biocypher/_config/__init__.py +6 -23
- biocypher/_core.py +360 -262
- biocypher/_create.py +13 -27
- biocypher/_deduplicate.py +4 -11
- biocypher/_get.py +21 -60
- biocypher/_logger.py +4 -16
- biocypher/_mapping.py +4 -17
- biocypher/_metadata.py +3 -15
- biocypher/_misc.py +14 -28
- biocypher/_ontology.py +127 -212
- biocypher/_translate.py +34 -58
- biocypher/output/connect/_get_connector.py +40 -0
- biocypher/output/connect/_neo4j_driver.py +9 -65
- biocypher/output/in_memory/_get_in_memory_kg.py +34 -0
- biocypher/output/in_memory/_in_memory_kg.py +40 -0
- biocypher/output/in_memory/_networkx.py +44 -0
- biocypher/output/in_memory/_pandas.py +20 -15
- biocypher/output/write/_batch_writer.py +137 -172
- biocypher/output/write/_get_writer.py +11 -24
- biocypher/output/write/_writer.py +14 -33
- biocypher/output/write/graph/_arangodb.py +7 -24
- biocypher/output/write/graph/_neo4j.py +59 -57
- biocypher/output/write/graph/_networkx.py +36 -43
- biocypher/output/write/graph/_rdf.py +114 -95
- biocypher/output/write/relational/_csv.py +6 -11
- biocypher/output/write/relational/_postgresql.py +12 -13
- biocypher/output/write/relational/_sqlite.py +3 -1
- {biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/LICENSE +1 -1
- {biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/METADATA +3 -3
- biocypher-0.7.0.dist-info/RECORD +43 -0
- {biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/WHEEL +1 -1
- biocypher-0.6.1.dist-info/RECORD +0 -39
|
@@ -1,105 +1,107 @@
|
|
|
1
|
-
|
|
2
|
-
from types import GeneratorType
|
|
3
|
-
from typing import Union, Optional
|
|
4
|
-
from collections import OrderedDict, defaultdict
|
|
1
|
+
import glob
|
|
5
2
|
import os
|
|
6
3
|
import re
|
|
7
|
-
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from collections import OrderedDict, defaultdict
|
|
7
|
+
from types import GeneratorType
|
|
8
8
|
|
|
9
9
|
from more_itertools import peekable
|
|
10
10
|
|
|
11
11
|
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
12
|
+
from biocypher._deduplicate import Deduplicator
|
|
12
13
|
from biocypher._logger import logger
|
|
13
14
|
from biocypher._translate import Translator
|
|
14
|
-
from biocypher._deduplicate import Deduplicator
|
|
15
15
|
from biocypher.output.write._writer import _Writer
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class _BatchWriter(_Writer, ABC):
|
|
19
19
|
"""Abstract batch writer class"""
|
|
20
20
|
|
|
21
|
+
@abstractmethod
|
|
22
|
+
def _quote_string(self, value: str) -> str:
|
|
23
|
+
"""Abstract method to quote a string. Escaping is handled by the database-specific writer."""
|
|
24
|
+
raise NotImplementedError(
|
|
25
|
+
"Database writer must override '_quote_string'",
|
|
26
|
+
)
|
|
27
|
+
|
|
21
28
|
@abstractmethod
|
|
22
29
|
def _get_default_import_call_bin_prefix(self):
|
|
23
|
-
"""
|
|
24
|
-
Abstract method to provide the default string for the import call bin prefix.
|
|
30
|
+
"""Abstract method to provide the default string for the import call bin prefix.
|
|
25
31
|
|
|
26
|
-
Returns
|
|
32
|
+
Returns
|
|
33
|
+
-------
|
|
27
34
|
str: The database-specific string for the path to the import call bin prefix
|
|
35
|
+
|
|
28
36
|
"""
|
|
29
|
-
raise NotImplementedError(
|
|
30
|
-
"Database writer must override '_get_default_import_call_bin_prefix'"
|
|
31
|
-
)
|
|
37
|
+
raise NotImplementedError("Database writer must override '_get_default_import_call_bin_prefix'")
|
|
32
38
|
|
|
33
39
|
@abstractmethod
|
|
34
40
|
def _write_array_string(self, string_list):
|
|
35
|
-
"""
|
|
36
|
-
Abstract method to write the string representation of an array into a .csv file.
|
|
41
|
+
"""Abstract method to write the string representation of an array into a .csv file.
|
|
37
42
|
Different databases require different formats of array to optimize import speed.
|
|
38
43
|
|
|
39
44
|
Args:
|
|
45
|
+
----
|
|
40
46
|
string_list (list): list of ontology strings
|
|
41
47
|
|
|
42
48
|
Returns:
|
|
49
|
+
-------
|
|
43
50
|
str: The database-specific string representation of an array
|
|
51
|
+
|
|
44
52
|
"""
|
|
45
|
-
raise NotImplementedError(
|
|
46
|
-
"Database writer must override '_write_array_string'"
|
|
47
|
-
)
|
|
53
|
+
raise NotImplementedError("Database writer must override '_write_array_string'")
|
|
48
54
|
|
|
49
55
|
@abstractmethod
|
|
50
56
|
def _write_node_headers(self):
|
|
51
|
-
"""
|
|
52
|
-
Abstract method that takes care of importing properties of a graph entity that is represented
|
|
57
|
+
"""Abstract method that takes care of importing properties of a graph entity that is represented
|
|
53
58
|
as a node as per the definition in the `schema_config.yaml`
|
|
54
59
|
|
|
55
|
-
Returns
|
|
60
|
+
Returns
|
|
61
|
+
-------
|
|
56
62
|
bool: The return value. True for success, False otherwise.
|
|
63
|
+
|
|
57
64
|
"""
|
|
58
|
-
raise NotImplementedError(
|
|
59
|
-
"Database writer must override '_write_node_headers'"
|
|
60
|
-
)
|
|
65
|
+
raise NotImplementedError("Database writer must override '_write_node_headers'")
|
|
61
66
|
|
|
62
67
|
@abstractmethod
|
|
63
68
|
def _write_edge_headers(self):
|
|
64
|
-
"""
|
|
65
|
-
Abstract method to write a database import-file for a graph entity that is represented
|
|
69
|
+
"""Abstract method to write a database import-file for a graph entity that is represented
|
|
66
70
|
as an edge as per the definition in the `schema_config.yaml`,
|
|
67
71
|
containing only the header for this type of edge.
|
|
68
72
|
|
|
69
|
-
Returns
|
|
73
|
+
Returns
|
|
74
|
+
-------
|
|
70
75
|
bool: The return value. True for success, False otherwise.
|
|
76
|
+
|
|
71
77
|
"""
|
|
72
|
-
raise NotImplementedError(
|
|
73
|
-
"Database writer must override '_write_edge_headers'"
|
|
74
|
-
)
|
|
78
|
+
raise NotImplementedError("Database writer must override '_write_edge_headers'")
|
|
75
79
|
|
|
76
80
|
@abstractmethod
|
|
77
81
|
def _construct_import_call(self) -> str:
|
|
78
|
-
"""
|
|
79
|
-
Function to construct the import call detailing folder and
|
|
82
|
+
"""Function to construct the import call detailing folder and
|
|
80
83
|
individual node and edge headers and data files, as well as
|
|
81
84
|
delimiters and database name. Built after all data has been
|
|
82
85
|
processed to ensure that nodes are called before any edges.
|
|
83
86
|
|
|
84
|
-
Returns
|
|
87
|
+
Returns
|
|
88
|
+
-------
|
|
85
89
|
str: A bash command for csv import.
|
|
90
|
+
|
|
86
91
|
"""
|
|
87
|
-
raise NotImplementedError(
|
|
88
|
-
"Database writer must override '_construct_import_call'"
|
|
89
|
-
)
|
|
92
|
+
raise NotImplementedError("Database writer must override '_construct_import_call'")
|
|
90
93
|
|
|
91
94
|
@abstractmethod
|
|
92
95
|
def _get_import_script_name(self) -> str:
|
|
93
|
-
"""
|
|
94
|
-
Returns the name of the import script.
|
|
96
|
+
"""Returns the name of the import script.
|
|
95
97
|
The name will be chosen based on the used database.
|
|
96
98
|
|
|
97
|
-
Returns
|
|
99
|
+
Returns
|
|
100
|
+
-------
|
|
98
101
|
str: The name of the import script (ending in .sh)
|
|
102
|
+
|
|
99
103
|
"""
|
|
100
|
-
raise NotImplementedError(
|
|
101
|
-
"Database writer must override '_get_import_script_name'"
|
|
102
|
-
)
|
|
104
|
+
raise NotImplementedError("Database writer must override '_get_import_script_name'")
|
|
103
105
|
|
|
104
106
|
def __init__(
|
|
105
107
|
self,
|
|
@@ -108,10 +110,10 @@ class _BatchWriter(_Writer, ABC):
|
|
|
108
110
|
delimiter: str,
|
|
109
111
|
array_delimiter: str = ",",
|
|
110
112
|
quote: str = '"',
|
|
111
|
-
output_directory:
|
|
113
|
+
output_directory: str | None = None,
|
|
112
114
|
db_name: str = "neo4j",
|
|
113
|
-
import_call_bin_prefix:
|
|
114
|
-
import_call_file_prefix:
|
|
115
|
+
import_call_bin_prefix: str | None = None,
|
|
116
|
+
import_call_file_prefix: str | None = None,
|
|
115
117
|
wipe: bool = True,
|
|
116
118
|
strict_mode: bool = False,
|
|
117
119
|
skip_bad_relationships: bool = False,
|
|
@@ -123,9 +125,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
123
125
|
rdf_format: str = None,
|
|
124
126
|
rdf_namespaces: dict = {},
|
|
125
127
|
):
|
|
126
|
-
"""
|
|
127
|
-
|
|
128
|
-
Abtract parent class for writing node and edge representations to disk
|
|
128
|
+
"""Abtract parent class for writing node and edge representations to disk
|
|
129
129
|
using the format specified by each database type. The database-specific
|
|
130
130
|
functions are implemented by the respective child-classes. This abstract
|
|
131
131
|
class contains all methods expected by a bach writer instance, some of
|
|
@@ -146,6 +146,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
146
146
|
- _get_import_script_name
|
|
147
147
|
|
|
148
148
|
Args:
|
|
149
|
+
----
|
|
149
150
|
translator:
|
|
150
151
|
Instance of :py:class:`Translator` to enable translation of
|
|
151
152
|
nodes and manipulation of properties.
|
|
@@ -207,6 +208,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
207
208
|
|
|
208
209
|
rdf_namespaces:
|
|
209
210
|
The namespaces for RDF.
|
|
211
|
+
|
|
210
212
|
"""
|
|
211
213
|
super().__init__(
|
|
212
214
|
translator=translator,
|
|
@@ -223,17 +225,13 @@ class _BatchWriter(_Writer, ABC):
|
|
|
223
225
|
self.rdf_namespaces = rdf_namespaces
|
|
224
226
|
|
|
225
227
|
self.delim, self.escaped_delim = self._process_delimiter(delimiter)
|
|
226
|
-
self.adelim, self.escaped_adelim = self._process_delimiter(
|
|
227
|
-
array_delimiter
|
|
228
|
-
)
|
|
228
|
+
self.adelim, self.escaped_adelim = self._process_delimiter(array_delimiter)
|
|
229
229
|
self.quote = quote
|
|
230
230
|
self.skip_bad_relationships = skip_bad_relationships
|
|
231
231
|
self.skip_duplicate_nodes = skip_duplicate_nodes
|
|
232
232
|
|
|
233
233
|
if import_call_bin_prefix is None:
|
|
234
|
-
self.import_call_bin_prefix = (
|
|
235
|
-
self._get_default_import_call_bin_prefix()
|
|
236
|
-
)
|
|
234
|
+
self.import_call_bin_prefix = self._get_default_import_call_bin_prefix()
|
|
237
235
|
else:
|
|
238
236
|
self.import_call_bin_prefix = import_call_bin_prefix
|
|
239
237
|
|
|
@@ -258,34 +256,27 @@ class _BatchWriter(_Writer, ABC):
|
|
|
258
256
|
|
|
259
257
|
@property
|
|
260
258
|
def import_call_file_prefix(self):
|
|
261
|
-
"""
|
|
262
|
-
Property for output directory path.
|
|
263
|
-
"""
|
|
264
|
-
|
|
259
|
+
"""Property for output directory path."""
|
|
265
260
|
if self._import_call_file_prefix is None:
|
|
266
261
|
return self.outdir
|
|
267
262
|
else:
|
|
268
263
|
return self._import_call_file_prefix
|
|
269
264
|
|
|
270
265
|
def _process_delimiter(self, delimiter: str) -> str:
|
|
271
|
-
"""
|
|
272
|
-
Return escaped characters in case of receiving their string
|
|
266
|
+
"""Return escaped characters in case of receiving their string
|
|
273
267
|
representation (e.g. tab for '\t').
|
|
274
268
|
"""
|
|
275
|
-
|
|
276
269
|
if delimiter == "\\t":
|
|
277
270
|
return "\t", "\\t"
|
|
278
271
|
|
|
279
272
|
else:
|
|
280
273
|
return delimiter, delimiter
|
|
281
274
|
|
|
282
|
-
def write_nodes(
|
|
283
|
-
|
|
284
|
-
):
|
|
285
|
-
"""
|
|
286
|
-
Wrapper for writing nodes and their headers.
|
|
275
|
+
def write_nodes(self, nodes, batch_size: int = int(1e6), force: bool = False):
|
|
276
|
+
"""Wrapper for writing nodes and their headers.
|
|
287
277
|
|
|
288
278
|
Args:
|
|
279
|
+
----
|
|
289
280
|
nodes (BioCypherNode): a list or generator of nodes in
|
|
290
281
|
:py:class:`BioCypherNode` format
|
|
291
282
|
|
|
@@ -296,7 +287,9 @@ class _BatchWriter(_Writer, ABC):
|
|
|
296
287
|
|
|
297
288
|
|
|
298
289
|
Returns:
|
|
290
|
+
-------
|
|
299
291
|
bool: The return value. True for success, False otherwise.
|
|
292
|
+
|
|
300
293
|
"""
|
|
301
294
|
# TODO check represented_as
|
|
302
295
|
|
|
@@ -315,19 +308,21 @@ class _BatchWriter(_Writer, ABC):
|
|
|
315
308
|
|
|
316
309
|
def write_edges(
|
|
317
310
|
self,
|
|
318
|
-
edges:
|
|
311
|
+
edges: list | GeneratorType,
|
|
319
312
|
batch_size: int = int(1e6),
|
|
320
313
|
) -> bool:
|
|
321
|
-
"""
|
|
322
|
-
Wrapper for writing edges and their headers.
|
|
314
|
+
"""Wrapper for writing edges and their headers.
|
|
323
315
|
|
|
324
316
|
Args:
|
|
317
|
+
----
|
|
325
318
|
edges (BioCypherEdge): a list or generator of edges in
|
|
326
319
|
:py:class:`BioCypherEdge` or :py:class:`BioCypherRelAsNode`
|
|
327
320
|
format
|
|
328
321
|
|
|
329
322
|
Returns:
|
|
323
|
+
-------
|
|
330
324
|
bool: The return value. True for success, False otherwise.
|
|
325
|
+
|
|
331
326
|
"""
|
|
332
327
|
passed = False
|
|
333
328
|
edges = list(edges) # force evaluation to handle empty generator
|
|
@@ -365,7 +360,6 @@ class _BatchWriter(_Writer, ABC):
|
|
|
365
360
|
logger.debug(
|
|
366
361
|
"No edges to write, possibly due to no matched Biolink classes.",
|
|
367
362
|
)
|
|
368
|
-
pass
|
|
369
363
|
|
|
370
364
|
if not passed:
|
|
371
365
|
logger.error("Error while writing edge data.")
|
|
@@ -379,8 +373,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
379
373
|
return True
|
|
380
374
|
|
|
381
375
|
def _write_node_data(self, nodes, batch_size, force: bool = False):
|
|
382
|
-
"""
|
|
383
|
-
Writes biocypher nodes to CSV conforming to the headers created
|
|
376
|
+
"""Writes biocypher nodes to CSV conforming to the headers created
|
|
384
377
|
with `_write_node_headers()`, and is actually required to be run
|
|
385
378
|
before calling `_write_node_headers()` to set the
|
|
386
379
|
:py:attr:`self.node_property_dict` for passing the node properties
|
|
@@ -388,14 +381,16 @@ class _BatchWriter(_Writer, ABC):
|
|
|
388
381
|
:py:class:`BioCypherNode` class.
|
|
389
382
|
|
|
390
383
|
Args:
|
|
384
|
+
----
|
|
391
385
|
nodes (BioCypherNode): a list or generator of nodes in
|
|
392
386
|
:py:class:`BioCypherNode` format
|
|
393
387
|
|
|
394
388
|
Returns:
|
|
389
|
+
-------
|
|
395
390
|
bool: The return value. True for success, False otherwise.
|
|
396
|
-
"""
|
|
397
391
|
|
|
398
|
-
|
|
392
|
+
"""
|
|
393
|
+
if isinstance(nodes, GeneratorType | peekable):
|
|
399
394
|
logger.debug("Writing node CSV from generator.")
|
|
400
395
|
|
|
401
396
|
bins = defaultdict(list) # dict to store a list for each
|
|
@@ -422,20 +417,15 @@ class _BatchWriter(_Writer, ABC):
|
|
|
422
417
|
logger.warning(f"Node {label} has no id; skipping.")
|
|
423
418
|
continue
|
|
424
419
|
|
|
425
|
-
if not
|
|
420
|
+
if label not in bins.keys():
|
|
426
421
|
# start new list
|
|
427
422
|
all_labels = None
|
|
428
423
|
bins[label].append(node)
|
|
429
424
|
bin_l[label] = 1
|
|
430
425
|
|
|
431
426
|
# get properties from config if present
|
|
432
|
-
if
|
|
433
|
-
label
|
|
434
|
-
in self.translator.ontology.mapping.extended_schema
|
|
435
|
-
):
|
|
436
|
-
cprops = self.translator.ontology.mapping.extended_schema.get(
|
|
437
|
-
label
|
|
438
|
-
).get(
|
|
427
|
+
if label in self.translator.ontology.mapping.extended_schema:
|
|
428
|
+
cprops = self.translator.ontology.mapping.extended_schema.get(label).get(
|
|
439
429
|
"properties",
|
|
440
430
|
)
|
|
441
431
|
else:
|
|
@@ -473,18 +463,13 @@ class _BatchWriter(_Writer, ABC):
|
|
|
473
463
|
# get label hierarchy
|
|
474
464
|
# multiple labels:
|
|
475
465
|
if not force:
|
|
476
|
-
all_labels = self.translator.ontology.get_ancestors(
|
|
477
|
-
label
|
|
478
|
-
)
|
|
466
|
+
all_labels = self.translator.ontology.get_ancestors(label)
|
|
479
467
|
else:
|
|
480
468
|
all_labels = None
|
|
481
469
|
|
|
482
470
|
if all_labels:
|
|
483
471
|
# convert to pascal case
|
|
484
|
-
all_labels = [
|
|
485
|
-
self.translator.name_sentence_to_pascal(label)
|
|
486
|
-
for label in all_labels
|
|
487
|
-
]
|
|
472
|
+
all_labels = [self.translator.name_sentence_to_pascal(label) for label in all_labels]
|
|
488
473
|
# remove duplicates
|
|
489
474
|
all_labels = list(OrderedDict.fromkeys(all_labels))
|
|
490
475
|
# order alphabetically
|
|
@@ -492,9 +477,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
492
477
|
# concatenate with array delimiter
|
|
493
478
|
all_labels = self._write_array_string(all_labels)
|
|
494
479
|
else:
|
|
495
|
-
all_labels = self.translator.name_sentence_to_pascal(
|
|
496
|
-
label
|
|
497
|
-
)
|
|
480
|
+
all_labels = self.translator.name_sentence_to_pascal(label)
|
|
498
481
|
|
|
499
482
|
labels[label] = all_labels
|
|
500
483
|
|
|
@@ -539,16 +522,15 @@ class _BatchWriter(_Writer, ABC):
|
|
|
539
522
|
self.node_property_dict[label] = reference_props[label]
|
|
540
523
|
|
|
541
524
|
return True
|
|
525
|
+
elif not isinstance(nodes, list):
|
|
526
|
+
logger.error("Nodes must be passed as list or generator.")
|
|
527
|
+
return False
|
|
542
528
|
else:
|
|
543
|
-
if type(nodes) is not list:
|
|
544
|
-
logger.error("Nodes must be passed as list or generator.")
|
|
545
|
-
return False
|
|
546
|
-
else:
|
|
547
529
|
|
|
548
|
-
|
|
549
|
-
|
|
530
|
+
def gen(nodes):
|
|
531
|
+
yield from nodes
|
|
550
532
|
|
|
551
|
-
|
|
533
|
+
return self._write_node_data(gen(nodes), batch_size=batch_size)
|
|
552
534
|
|
|
553
535
|
def _write_single_node_list_to_file(
|
|
554
536
|
self,
|
|
@@ -557,11 +539,11 @@ class _BatchWriter(_Writer, ABC):
|
|
|
557
539
|
prop_dict: dict,
|
|
558
540
|
labels: str,
|
|
559
541
|
):
|
|
560
|
-
"""
|
|
561
|
-
This function takes one list of biocypher nodes and writes them
|
|
542
|
+
"""This function takes one list of biocypher nodes and writes them
|
|
562
543
|
to a Neo4j admin import compatible CSV file.
|
|
563
544
|
|
|
564
545
|
Args:
|
|
546
|
+
----
|
|
565
547
|
node_list (list): list of BioCypherNodes to be written
|
|
566
548
|
label (str): the primary label of the node
|
|
567
549
|
prop_dict (dict): properties of node class passed from parsing
|
|
@@ -570,7 +552,9 @@ class _BatchWriter(_Writer, ABC):
|
|
|
570
552
|
for the node class
|
|
571
553
|
|
|
572
554
|
Returns:
|
|
555
|
+
-------
|
|
573
556
|
bool: The return value. True for success, False otherwise.
|
|
557
|
+
|
|
574
558
|
"""
|
|
575
559
|
if not all(isinstance(n, BioCypherNode) for n in node_list):
|
|
576
560
|
logger.error("Nodes must be passed as type BioCypherNode.")
|
|
@@ -588,7 +572,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
588
572
|
ref_props = list(prop_dict.keys())
|
|
589
573
|
|
|
590
574
|
# compare lists order invariant
|
|
591
|
-
if
|
|
575
|
+
if set(ref_props) != set(n_keys):
|
|
592
576
|
onode = n.get_id()
|
|
593
577
|
oprop1 = set(ref_props).difference(n_keys)
|
|
594
578
|
oprop2 = set(n_keys).difference(ref_props)
|
|
@@ -622,11 +606,10 @@ class _BatchWriter(_Writer, ABC):
|
|
|
622
606
|
"boolean",
|
|
623
607
|
]:
|
|
624
608
|
plist.append(str(p))
|
|
609
|
+
elif isinstance(p, list):
|
|
610
|
+
plist.append(self._write_array_string(p))
|
|
625
611
|
else:
|
|
626
|
-
|
|
627
|
-
plist.append(self._write_array_string(p))
|
|
628
|
-
else:
|
|
629
|
-
plist.append(f"{self.quote}{str(p)}{self.quote}")
|
|
612
|
+
plist.append(f"{self.quote}{p!s}{self.quote}")
|
|
630
613
|
|
|
631
614
|
line.append(self.delim.join(plist))
|
|
632
615
|
line.append(labels)
|
|
@@ -640,8 +623,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
640
623
|
return True
|
|
641
624
|
|
|
642
625
|
def _write_edge_data(self, edges, batch_size):
|
|
643
|
-
"""
|
|
644
|
-
Writes biocypher edges to CSV conforming to the headers created
|
|
626
|
+
"""Writes biocypher edges to CSV conforming to the headers created
|
|
645
627
|
with `_write_edge_headers()`, and is actually required to be run
|
|
646
628
|
before calling `_write_node_headers()` to set the
|
|
647
629
|
:py:attr:`self.edge_property_dict` for passing the edge
|
|
@@ -649,17 +631,20 @@ class _BatchWriter(_Writer, ABC):
|
|
|
649
631
|
from the :py:class:`BioCypherEdge` class.
|
|
650
632
|
|
|
651
633
|
Args:
|
|
634
|
+
----
|
|
652
635
|
edges (BioCypherEdge): a list or generator of edges in
|
|
653
636
|
:py:class:`BioCypherEdge` format
|
|
654
637
|
|
|
655
638
|
Returns:
|
|
639
|
+
-------
|
|
656
640
|
bool: The return value. True for success, False otherwise.
|
|
657
641
|
|
|
658
642
|
Todo:
|
|
643
|
+
----
|
|
659
644
|
- currently works for mixed edges but in practice often is
|
|
660
645
|
called on one iterable containing one type of edge only
|
|
661
|
-
"""
|
|
662
646
|
|
|
647
|
+
"""
|
|
663
648
|
if isinstance(edges, GeneratorType):
|
|
664
649
|
logger.debug("Writing edge CSV from generator.")
|
|
665
650
|
|
|
@@ -675,14 +660,13 @@ class _BatchWriter(_Writer, ABC):
|
|
|
675
660
|
for edge in edges:
|
|
676
661
|
if not (edge.get_source_id() and edge.get_target_id()):
|
|
677
662
|
logger.error(
|
|
678
|
-
"Edge must have source and target node. "
|
|
679
|
-
f"Caused by: {edge}",
|
|
663
|
+
f"Edge must have source and target node. Caused by: {edge}",
|
|
680
664
|
)
|
|
681
665
|
continue
|
|
682
666
|
|
|
683
667
|
label = edge.get_label()
|
|
684
668
|
|
|
685
|
-
if not
|
|
669
|
+
if label not in bins.keys():
|
|
686
670
|
# start new list
|
|
687
671
|
bins[label].append(edge)
|
|
688
672
|
bin_l[label] = 1
|
|
@@ -693,13 +677,8 @@ class _BatchWriter(_Writer, ABC):
|
|
|
693
677
|
# (may not be if it is an edge that carries the
|
|
694
678
|
# "label_as_edge" property)
|
|
695
679
|
cprops = None
|
|
696
|
-
if
|
|
697
|
-
label
|
|
698
|
-
in self.translator.ontology.mapping.extended_schema
|
|
699
|
-
):
|
|
700
|
-
cprops = self.translator.ontology.mapping.extended_schema.get(
|
|
701
|
-
label
|
|
702
|
-
).get(
|
|
680
|
+
if label in self.translator.ontology.mapping.extended_schema:
|
|
681
|
+
cprops = self.translator.ontology.mapping.extended_schema.get(label).get(
|
|
703
682
|
"properties",
|
|
704
683
|
)
|
|
705
684
|
else:
|
|
@@ -707,9 +686,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
707
686
|
for (
|
|
708
687
|
k,
|
|
709
688
|
v,
|
|
710
|
-
) in (
|
|
711
|
-
self.translator.ontology.mapping.extended_schema.items()
|
|
712
|
-
):
|
|
689
|
+
) in self.translator.ontology.mapping.extended_schema.items():
|
|
713
690
|
if isinstance(v, dict):
|
|
714
691
|
if v.get("label_as_edge") == label:
|
|
715
692
|
cprops = v.get("properties")
|
|
@@ -779,16 +756,15 @@ class _BatchWriter(_Writer, ABC):
|
|
|
779
756
|
self.edge_property_dict[label] = reference_props[label]
|
|
780
757
|
|
|
781
758
|
return True
|
|
759
|
+
elif not isinstance(edges, list):
|
|
760
|
+
logger.error("Edges must be passed as list or generator.")
|
|
761
|
+
return False
|
|
782
762
|
else:
|
|
783
|
-
if type(edges) is not list:
|
|
784
|
-
logger.error("Edges must be passed as list or generator.")
|
|
785
|
-
return False
|
|
786
|
-
else:
|
|
787
763
|
|
|
788
|
-
|
|
789
|
-
|
|
764
|
+
def gen(edges):
|
|
765
|
+
yield from edges
|
|
790
766
|
|
|
791
|
-
|
|
767
|
+
return self._write_edge_data(gen(edges), batch_size=batch_size)
|
|
792
768
|
|
|
793
769
|
def _write_single_edge_list_to_file(
|
|
794
770
|
self,
|
|
@@ -796,11 +772,11 @@ class _BatchWriter(_Writer, ABC):
|
|
|
796
772
|
label: str,
|
|
797
773
|
prop_dict: dict,
|
|
798
774
|
):
|
|
799
|
-
"""
|
|
800
|
-
This function takes one list of biocypher edges and writes them
|
|
775
|
+
"""This function takes one list of biocypher edges and writes them
|
|
801
776
|
to a Neo4j admin import compatible CSV file.
|
|
802
777
|
|
|
803
778
|
Args:
|
|
779
|
+
----
|
|
804
780
|
edge_list (list): list of BioCypherEdges to be written
|
|
805
781
|
|
|
806
782
|
label (str): the label (type) of the edge
|
|
@@ -809,9 +785,10 @@ class _BatchWriter(_Writer, ABC):
|
|
|
809
785
|
function and their types
|
|
810
786
|
|
|
811
787
|
Returns:
|
|
788
|
+
-------
|
|
812
789
|
bool: The return value. True for success, False otherwise.
|
|
813
|
-
"""
|
|
814
790
|
|
|
791
|
+
"""
|
|
815
792
|
if not all(isinstance(n, BioCypherEdge) for n in edge_list):
|
|
816
793
|
logger.error("Edges must be passed as type BioCypherEdge.")
|
|
817
794
|
return False
|
|
@@ -826,7 +803,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
826
803
|
ref_props = list(prop_dict.keys())
|
|
827
804
|
|
|
828
805
|
# compare list order invariant
|
|
829
|
-
if
|
|
806
|
+
if set(ref_props) != set(e_keys):
|
|
830
807
|
oedge = f"{e.get_source_id()}-{e.get_target_id()}"
|
|
831
808
|
oprop1 = set(ref_props).difference(e_keys)
|
|
832
809
|
oprop2 = set(e_keys).difference(ref_props)
|
|
@@ -857,11 +834,10 @@ class _BatchWriter(_Writer, ABC):
|
|
|
857
834
|
"boolean",
|
|
858
835
|
]:
|
|
859
836
|
plist.append(str(p))
|
|
837
|
+
elif isinstance(p, list):
|
|
838
|
+
plist.append(self._write_array_string(p))
|
|
860
839
|
else:
|
|
861
|
-
|
|
862
|
-
plist.append(self._write_array_string(p))
|
|
863
|
-
else:
|
|
864
|
-
plist.append(self.quote + str(p) + self.quote)
|
|
840
|
+
plist.append(self.quote + str(p) + self.quote)
|
|
865
841
|
|
|
866
842
|
entries = [e.get_source_id()]
|
|
867
843
|
|
|
@@ -870,9 +846,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
870
846
|
|
|
871
847
|
if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
|
|
872
848
|
skip_id = True
|
|
873
|
-
elif not self.translator.ontology.mapping.extended_schema.get(
|
|
874
|
-
label
|
|
875
|
-
):
|
|
849
|
+
elif not self.translator.ontology.mapping.extended_schema.get(label):
|
|
876
850
|
# find label in schema by label_as_edge
|
|
877
851
|
for (
|
|
878
852
|
k,
|
|
@@ -887,9 +861,9 @@ class _BatchWriter(_Writer, ABC):
|
|
|
887
861
|
if schema_label:
|
|
888
862
|
if (
|
|
889
863
|
self.translator.ontology.mapping.extended_schema.get(
|
|
890
|
-
schema_label
|
|
864
|
+
schema_label,
|
|
891
865
|
).get("use_id")
|
|
892
|
-
== False
|
|
866
|
+
== False # noqa: E712 (seems to not work with 'not')
|
|
893
867
|
):
|
|
894
868
|
skip_id = True
|
|
895
869
|
|
|
@@ -903,7 +877,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
903
877
|
entries.append(
|
|
904
878
|
self.translator.name_sentence_to_pascal(
|
|
905
879
|
e.get_label(),
|
|
906
|
-
)
|
|
880
|
+
),
|
|
907
881
|
)
|
|
908
882
|
|
|
909
883
|
lines.append(
|
|
@@ -917,10 +891,10 @@ class _BatchWriter(_Writer, ABC):
|
|
|
917
891
|
return True
|
|
918
892
|
|
|
919
893
|
def _write_next_part(self, label: str, lines: list):
|
|
920
|
-
"""
|
|
921
|
-
This function writes a list of strings to a new part file.
|
|
894
|
+
"""This function writes a list of strings to a new part file.
|
|
922
895
|
|
|
923
896
|
Args:
|
|
897
|
+
----
|
|
924
898
|
label (str): the label (type) of the edge; internal
|
|
925
899
|
representation sentence case -> needs to become PascalCase
|
|
926
900
|
for disk representation
|
|
@@ -928,17 +902,15 @@ class _BatchWriter(_Writer, ABC):
|
|
|
928
902
|
lines (list): list of strings to be written
|
|
929
903
|
|
|
930
904
|
Returns:
|
|
905
|
+
-------
|
|
931
906
|
bool: The return value. True for success, False otherwise.
|
|
907
|
+
|
|
932
908
|
"""
|
|
933
909
|
# translate label to PascalCase
|
|
934
|
-
label_pascal = self.translator.name_sentence_to_pascal(
|
|
935
|
-
parse_label(label)
|
|
936
|
-
)
|
|
910
|
+
label_pascal = self.translator.name_sentence_to_pascal(parse_label(label))
|
|
937
911
|
|
|
938
912
|
# list files in self.outdir
|
|
939
|
-
files = glob.glob(
|
|
940
|
-
os.path.join(self.outdir, f"{label_pascal}-part*.csv")
|
|
941
|
-
)
|
|
913
|
+
files = glob.glob(os.path.join(self.outdir, f"{label_pascal}-part*.csv"))
|
|
942
914
|
# find file with highest part number
|
|
943
915
|
if not files:
|
|
944
916
|
next_part = 0
|
|
@@ -946,10 +918,7 @@ class _BatchWriter(_Writer, ABC):
|
|
|
946
918
|
else:
|
|
947
919
|
next_part = (
|
|
948
920
|
max(
|
|
949
|
-
[
|
|
950
|
-
int(f.split(".")[-2].split("-")[-1].replace("part", ""))
|
|
951
|
-
for f in files
|
|
952
|
-
],
|
|
921
|
+
[int(f.split(".")[-2].split("-")[-1].replace("part", "")) for f in files],
|
|
953
922
|
)
|
|
954
923
|
+ 1
|
|
955
924
|
)
|
|
@@ -974,31 +943,29 @@ class _BatchWriter(_Writer, ABC):
|
|
|
974
943
|
self.parts[label].append(part)
|
|
975
944
|
|
|
976
945
|
def get_import_call(self) -> str:
|
|
977
|
-
"""
|
|
978
|
-
Function to return the import call detailing folder and
|
|
946
|
+
"""Function to return the import call detailing folder and
|
|
979
947
|
individual node and edge headers and data files, as well as
|
|
980
948
|
delimiters and database name.
|
|
981
949
|
|
|
982
|
-
Returns
|
|
950
|
+
Returns
|
|
951
|
+
-------
|
|
983
952
|
str: a bash command for the database import
|
|
984
|
-
"""
|
|
985
953
|
|
|
954
|
+
"""
|
|
986
955
|
return self._construct_import_call()
|
|
987
956
|
|
|
988
957
|
def write_import_call(self) -> str:
|
|
989
|
-
"""
|
|
990
|
-
Function to write the import call detailing folder and
|
|
958
|
+
"""Function to write the import call detailing folder and
|
|
991
959
|
individual node and edge headers and data files, as well as
|
|
992
960
|
delimiters and database name, to the export folder as txt.
|
|
993
961
|
|
|
994
|
-
Returns
|
|
962
|
+
Returns
|
|
963
|
+
-------
|
|
995
964
|
str: The path of the file holding the import call.
|
|
996
|
-
"""
|
|
997
965
|
|
|
966
|
+
"""
|
|
998
967
|
file_path = os.path.join(self.outdir, self._get_import_script_name())
|
|
999
|
-
logger.info(
|
|
1000
|
-
f"Writing {self.db_name + ' ' if self.db_name else ''}import call to `{file_path}`."
|
|
1001
|
-
)
|
|
968
|
+
logger.info(f"Writing {self.db_name + ' ' if self.db_name else ''}import call to `{file_path}`.")
|
|
1002
969
|
|
|
1003
970
|
with open(file_path, "w", encoding="utf-8") as f:
|
|
1004
971
|
f.write(self._construct_import_call())
|
|
@@ -1007,16 +974,16 @@ class _BatchWriter(_Writer, ABC):
|
|
|
1007
974
|
|
|
1008
975
|
|
|
1009
976
|
def parse_label(label: str) -> str:
|
|
1010
|
-
"""
|
|
1011
|
-
|
|
1012
|
-
Check if the label is compliant with Neo4j naming conventions,
|
|
977
|
+
"""Check if the label is compliant with Neo4j naming conventions,
|
|
1013
978
|
https://neo4j.com/docs/cypher-manual/current/syntax/naming/, and if not,
|
|
1014
979
|
remove non-compliant characters.
|
|
1015
980
|
|
|
1016
981
|
Args:
|
|
982
|
+
----
|
|
1017
983
|
label (str): The label to check
|
|
1018
984
|
Returns:
|
|
1019
985
|
str: The compliant label
|
|
986
|
+
|
|
1020
987
|
"""
|
|
1021
988
|
# Check if the name contains only alphanumeric characters, underscore, or dollar sign
|
|
1022
989
|
# and dot (for class hierarchy of BioCypher)
|
|
@@ -1026,7 +993,7 @@ def parse_label(label: str) -> str:
|
|
|
1026
993
|
if non_matches:
|
|
1027
994
|
non_matches = list(set(non_matches))
|
|
1028
995
|
logger.warning(
|
|
1029
|
-
f"Label is not compliant with Neo4j naming rules. Removed non compliant characters: {non_matches}"
|
|
996
|
+
f"Label is not compliant with Neo4j naming rules. Removed non compliant characters: {non_matches}",
|
|
1030
997
|
)
|
|
1031
998
|
|
|
1032
999
|
def first_character_compliant(character: str) -> bool:
|
|
@@ -1037,7 +1004,5 @@ def parse_label(label: str) -> str:
|
|
|
1037
1004
|
if first_character_compliant(c):
|
|
1038
1005
|
matches = matches[matches.index(c) :]
|
|
1039
1006
|
break
|
|
1040
|
-
logger.warning(
|
|
1041
|
-
"Label does not start with an alphabetic character or with $. Removed non compliant characters."
|
|
1042
|
-
)
|
|
1007
|
+
logger.warning("Label does not start with an alphabetic character or with $. Removed non compliant characters.")
|
|
1043
1008
|
return "".join(matches).strip()
|