biocypher 0.5.40__py3-none-any.whl → 0.5.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biocypher might be problematic. Click here for more details.
- biocypher/_config/biocypher_config.yaml +7 -0
- biocypher/_connect.py +36 -9
- biocypher/_metadata.py +1 -1
- biocypher/_misc.py +12 -3
- biocypher/_ontology.py +133 -53
- biocypher/write/_batch_writer.py +11 -0
- biocypher/write/_write.py +6 -1
- biocypher/write/graph/_neo4j.py +44 -3
- biocypher/write/graph/_rdf.py +516 -0
- {biocypher-0.5.40.dist-info → biocypher-0.5.42.dist-info}/METADATA +1 -1
- {biocypher-0.5.40.dist-info → biocypher-0.5.42.dist-info}/RECORD +13 -12
- {biocypher-0.5.40.dist-info → biocypher-0.5.42.dist-info}/LICENSE +0 -0
- {biocypher-0.5.40.dist-info → biocypher-0.5.42.dist-info}/WHEEL +0 -0
|
@@ -27,6 +27,7 @@ biocypher:
|
|
|
27
27
|
head_ontology:
|
|
28
28
|
url: https://github.com/biolink/biolink-model/raw/v3.2.1/biolink-model.owl.ttl
|
|
29
29
|
root_node: entity
|
|
30
|
+
# switch_label_and_id: true
|
|
30
31
|
|
|
31
32
|
### Optional parameters ###
|
|
32
33
|
|
|
@@ -53,10 +54,12 @@ biocypher:
|
|
|
53
54
|
# url: test/ontologies/so.owl
|
|
54
55
|
# head_join_node: sequence variant
|
|
55
56
|
# tail_join_node: sequence_variant
|
|
57
|
+
# switch_label_and_id: true
|
|
56
58
|
# mondo:
|
|
57
59
|
# url: test/ontologies/mondo.owl
|
|
58
60
|
# head_join_node: disease
|
|
59
61
|
# tail_join_node: disease
|
|
62
|
+
# switch_label_and_id: true
|
|
60
63
|
|
|
61
64
|
### DBMS configuration ###
|
|
62
65
|
|
|
@@ -113,6 +116,10 @@ postgresql:
|
|
|
113
116
|
# import_call_bin_prefix: '' # path to "psql"
|
|
114
117
|
# import_call_file_prefix: '/path/to/files'
|
|
115
118
|
|
|
119
|
+
rdf:
|
|
120
|
+
### RDF configuration ###
|
|
121
|
+
rdf_format: turtle
|
|
122
|
+
|
|
116
123
|
sqlite:
|
|
117
124
|
### SQLite configuration ###
|
|
118
125
|
|
biocypher/_connect.py
CHANGED
|
@@ -11,11 +11,12 @@
|
|
|
11
11
|
"""
|
|
12
12
|
BioCypher 'online' mode. Handles connection and manipulation of a running DBMS.
|
|
13
13
|
"""
|
|
14
|
+
import subprocess
|
|
15
|
+
|
|
14
16
|
from ._logger import logger
|
|
15
17
|
|
|
16
18
|
logger.debug(f"Loading module {__name__}.")
|
|
17
19
|
|
|
18
|
-
from typing import Optional
|
|
19
20
|
from collections.abc import Iterable
|
|
20
21
|
import itertools
|
|
21
22
|
|
|
@@ -24,7 +25,6 @@ import neo4j_utils
|
|
|
24
25
|
from . import _misc
|
|
25
26
|
from ._config import config as _config
|
|
26
27
|
from ._create import BioCypherEdge, BioCypherNode
|
|
27
|
-
from ._ontology import Ontology
|
|
28
28
|
from ._translate import Translator
|
|
29
29
|
|
|
30
30
|
__all__ = ["_Neo4jDriver"]
|
|
@@ -137,16 +137,43 @@ class _Neo4jDriver:
|
|
|
137
137
|
|
|
138
138
|
logger.info("Creating constraints for node types in config.")
|
|
139
139
|
|
|
140
|
+
major_neo4j_version = int(self._get_neo4j_version().split(".")[0])
|
|
140
141
|
# get structure
|
|
141
142
|
for leaf in self.translator.ontology.mapping.extended_schema.items():
|
|
142
|
-
label = _misc.sentencecase_to_pascalcase(leaf[0])
|
|
143
|
+
label = _misc.sentencecase_to_pascalcase(leaf[0], sep=r"\s\.")
|
|
143
144
|
if leaf[1]["represented_as"] == "node":
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
145
|
+
if major_neo4j_version >= 5:
|
|
146
|
+
s = (
|
|
147
|
+
f"CREATE CONSTRAINT `{label}_id` "
|
|
148
|
+
f"IF NOT EXISTS FOR (n:`{label}`) "
|
|
149
|
+
"REQUIRE n.id IS UNIQUE"
|
|
150
|
+
)
|
|
151
|
+
self._driver.query(s)
|
|
152
|
+
else:
|
|
153
|
+
s = (
|
|
154
|
+
f"CREATE CONSTRAINT `{label}_id` "
|
|
155
|
+
f"IF NOT EXISTS ON (n:`{label}`) "
|
|
156
|
+
"ASSERT n.id IS UNIQUE"
|
|
157
|
+
)
|
|
158
|
+
self._driver.query(s)
|
|
159
|
+
|
|
160
|
+
def _get_neo4j_version(self):
|
|
161
|
+
"""Get neo4j version."""
|
|
162
|
+
try:
|
|
163
|
+
neo4j_version = self._driver.query(
|
|
164
|
+
"""
|
|
165
|
+
CALL dbms.components()
|
|
166
|
+
YIELD name, versions, edition
|
|
167
|
+
UNWIND versions AS version
|
|
168
|
+
RETURN version AS version
|
|
169
|
+
""",
|
|
170
|
+
)[0][0]["version"]
|
|
171
|
+
return neo4j_version
|
|
172
|
+
except Exception as e:
|
|
173
|
+
logger.warning(
|
|
174
|
+
f"Error detecting Neo4j version: {e} use default version 4.0.0."
|
|
175
|
+
)
|
|
176
|
+
return "4.0.0"
|
|
150
177
|
|
|
151
178
|
def add_nodes(self, id_type_tuples: Iterable[tuple]) -> tuple:
|
|
152
179
|
"""
|
biocypher/_metadata.py
CHANGED
biocypher/_misc.py
CHANGED
|
@@ -115,7 +115,12 @@ def _get_inheritance_tree(inheritance_graph: Union[dict, nx.Graph]) -> dict:
|
|
|
115
115
|
)
|
|
116
116
|
if multiple_parents_present:
|
|
117
117
|
logger.warning(
|
|
118
|
-
"The ontology contains multiple inheritance (one child node
|
|
118
|
+
"The ontology contains multiple inheritance (one child node "
|
|
119
|
+
"has multiple parent nodes). This is not visualized in the "
|
|
120
|
+
"following hierarchy tree (the child node is only added once). "
|
|
121
|
+
"If you wish to browse all relationships of the parsed "
|
|
122
|
+
"ontologies, write a graphml file to disk using "
|
|
123
|
+
"`to_disk = <directory>` and view this file."
|
|
119
124
|
)
|
|
120
125
|
|
|
121
126
|
# unlist values
|
|
@@ -205,7 +210,7 @@ def sentencecase_to_snakecase(s: str) -> str:
|
|
|
205
210
|
return stringcase.snakecase(s).lower()
|
|
206
211
|
|
|
207
212
|
|
|
208
|
-
def sentencecase_to_pascalcase(s: str) -> str:
|
|
213
|
+
def sentencecase_to_pascalcase(s: str, sep: str = r"\s") -> str:
|
|
209
214
|
"""
|
|
210
215
|
Convert sentence case to PascalCase.
|
|
211
216
|
|
|
@@ -215,7 +220,11 @@ def sentencecase_to_pascalcase(s: str) -> str:
|
|
|
215
220
|
Returns:
|
|
216
221
|
string in PascalCase form
|
|
217
222
|
"""
|
|
218
|
-
return re.sub(
|
|
223
|
+
return re.sub(
|
|
224
|
+
r"(?:^|[" + sep + "])([a-zA-Z])",
|
|
225
|
+
lambda match: match.group(1).upper(),
|
|
226
|
+
s,
|
|
227
|
+
)
|
|
219
228
|
|
|
220
229
|
|
|
221
230
|
def to_lower_sentence_case(s: str) -> str:
|
biocypher/_ontology.py
CHANGED
|
@@ -43,19 +43,19 @@ class OntologyAdapter:
|
|
|
43
43
|
ontology is represented by a networkx.DiGraph object; an RDFlib graph is
|
|
44
44
|
also kept. By default, the DiGraph reverses the label and identifier of the
|
|
45
45
|
nodes, such that the node name in the graph is the human-readable label. The
|
|
46
|
-
edges are oriented from child to parent.
|
|
47
|
-
|
|
48
|
-
|
|
46
|
+
edges are oriented from child to parent.
|
|
47
|
+
Labels are formatted in lower sentence case and underscores are replaced by spaces.
|
|
48
|
+
Identifiers are taken as defined and the prefixes are removed by default.
|
|
49
49
|
"""
|
|
50
50
|
|
|
51
51
|
def __init__(
|
|
52
52
|
self,
|
|
53
53
|
ontology_file: str,
|
|
54
54
|
root_label: str,
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
ontology_file_format: Optional[str] = None,
|
|
56
|
+
head_join_node_label: Optional[str] = None,
|
|
57
57
|
merge_nodes: Optional[bool] = True,
|
|
58
|
-
|
|
58
|
+
switch_label_and_id: bool = True,
|
|
59
59
|
remove_prefixes: bool = True,
|
|
60
60
|
):
|
|
61
61
|
"""
|
|
@@ -68,7 +68,10 @@ class OntologyAdapter:
|
|
|
68
68
|
root_label (str): The label of the root node in the ontology. In
|
|
69
69
|
case of a tail ontology, this is the tail join node.
|
|
70
70
|
|
|
71
|
-
|
|
71
|
+
ontology_file_format (str): The format of the ontology file (e.g. "application/rdf+xml").
|
|
72
|
+
If format is not passed, it is determined automatically.
|
|
73
|
+
|
|
74
|
+
head_join_node_label (str): Optional variable to store the label of the
|
|
72
75
|
node in the head ontology that should be used to join to the
|
|
73
76
|
root node of the tail ontology. Defaults to None.
|
|
74
77
|
|
|
@@ -77,7 +80,7 @@ class OntologyAdapter:
|
|
|
77
80
|
tail join node will be attached as a child of the head join
|
|
78
81
|
node.
|
|
79
82
|
|
|
80
|
-
|
|
83
|
+
switch_label_and_id (bool): If True, the node names in the graph will be
|
|
81
84
|
the human-readable labels. If False, the node names will be the
|
|
82
85
|
identifiers. Defaults to True.
|
|
83
86
|
|
|
@@ -89,33 +92,37 @@ class OntologyAdapter:
|
|
|
89
92
|
|
|
90
93
|
self._ontology_file = ontology_file
|
|
91
94
|
self._root_label = root_label
|
|
92
|
-
self._format =
|
|
95
|
+
self._format = ontology_file_format
|
|
93
96
|
self._merge_nodes = merge_nodes
|
|
94
|
-
self._head_join_node =
|
|
95
|
-
self.
|
|
97
|
+
self._head_join_node = head_join_node_label
|
|
98
|
+
self._switch_label_and_id = switch_label_and_id
|
|
96
99
|
self._remove_prefixes = remove_prefixes
|
|
97
100
|
|
|
98
101
|
self._rdf_graph = self._load_rdf_graph(ontology_file)
|
|
99
102
|
|
|
100
103
|
self._nx_graph = self._rdf_to_nx(
|
|
101
|
-
self._rdf_graph, root_label,
|
|
104
|
+
self._rdf_graph, root_label, switch_label_and_id
|
|
102
105
|
)
|
|
103
106
|
|
|
104
107
|
def _rdf_to_nx(
|
|
105
|
-
self,
|
|
108
|
+
self,
|
|
109
|
+
_rdf_graph: rdflib.Graph,
|
|
110
|
+
root_label: str,
|
|
111
|
+
switch_label_and_id: bool,
|
|
112
|
+
rename_nodes: bool = True,
|
|
106
113
|
) -> nx.DiGraph:
|
|
107
114
|
one_to_one_triples, one_to_many_dict = self._get_relevant_rdf_triples(
|
|
108
115
|
_rdf_graph
|
|
109
116
|
)
|
|
110
117
|
nx_graph = self._convert_to_nx(one_to_one_triples, one_to_many_dict)
|
|
111
|
-
|
|
112
|
-
|
|
118
|
+
nx_graph = self._add_labels_to_nodes(nx_graph, switch_label_and_id)
|
|
119
|
+
nx_graph = self._change_nodes_to_biocypher_format(
|
|
120
|
+
nx_graph, switch_label_and_id, rename_nodes
|
|
113
121
|
)
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
renamed_graph, root_label, reverse_labels
|
|
122
|
+
nx_graph = self._get_all_ancestors(
|
|
123
|
+
nx_graph, root_label, switch_label_and_id, rename_nodes
|
|
117
124
|
)
|
|
118
|
-
return nx.DiGraph(
|
|
125
|
+
return nx.DiGraph(nx_graph)
|
|
119
126
|
|
|
120
127
|
def _get_relevant_rdf_triples(self, g: rdflib.Graph) -> tuple:
|
|
121
128
|
one_to_one_inheritance_graph = self._get_one_to_one_inheritance_triples(
|
|
@@ -239,19 +246,21 @@ class OntologyAdapter:
|
|
|
239
246
|
return nx_graph
|
|
240
247
|
|
|
241
248
|
def _add_labels_to_nodes(
|
|
242
|
-
self, nx_graph: nx.DiGraph,
|
|
249
|
+
self, nx_graph: nx.DiGraph, switch_label_and_id: bool
|
|
243
250
|
) -> nx.DiGraph:
|
|
244
251
|
"""Add labels to the nodes in the networkx graph.
|
|
245
252
|
|
|
246
253
|
Args:
|
|
247
254
|
nx_graph (nx.DiGraph): The networkx graph
|
|
248
|
-
|
|
255
|
+
switch_label_and_id (bool): If True, id and label are switched
|
|
249
256
|
|
|
250
257
|
Returns:
|
|
251
258
|
nx.DiGraph: The networkx graph with labels
|
|
252
259
|
"""
|
|
253
260
|
for node in list(nx_graph.nodes):
|
|
254
|
-
nx_id, nx_label = self._get_nx_id_and_label(
|
|
261
|
+
nx_id, nx_label = self._get_nx_id_and_label(
|
|
262
|
+
node, switch_label_and_id
|
|
263
|
+
)
|
|
255
264
|
if nx_id == "none":
|
|
256
265
|
# remove node if it has no id
|
|
257
266
|
nx_graph.remove_node(node)
|
|
@@ -260,39 +269,56 @@ class OntologyAdapter:
|
|
|
260
269
|
nx_graph.nodes[node]["label"] = nx_label
|
|
261
270
|
return nx_graph
|
|
262
271
|
|
|
263
|
-
def
|
|
264
|
-
self,
|
|
272
|
+
def _change_nodes_to_biocypher_format(
|
|
273
|
+
self,
|
|
274
|
+
nx_graph: nx.DiGraph,
|
|
275
|
+
switch_label_and_id: bool,
|
|
276
|
+
rename_nodes: bool = True,
|
|
265
277
|
) -> nx.DiGraph:
|
|
266
|
-
"""
|
|
278
|
+
"""Change the nodes in the networkx graph to BioCypher format:
|
|
279
|
+
- remove the prefix of the identifier
|
|
280
|
+
- switch id and label
|
|
281
|
+
- adapt the labels (replace _ with space and convert to lower sentence case)
|
|
267
282
|
|
|
268
283
|
Args:
|
|
269
284
|
nx_graph (nx.DiGraph): The networkx graph
|
|
270
|
-
|
|
285
|
+
switch_label_and_id (bool): If True, id and label are switched
|
|
286
|
+
rename_nodes (bool): If True, the nodes are renamed
|
|
271
287
|
|
|
272
288
|
Returns:
|
|
273
|
-
nx.DiGraph: The
|
|
289
|
+
nx.DiGraph: The networkx ontology graph in BioCypher format
|
|
274
290
|
"""
|
|
275
291
|
mapping = {
|
|
276
|
-
node: self._get_nx_id_and_label(
|
|
292
|
+
node: self._get_nx_id_and_label(
|
|
293
|
+
node, switch_label_and_id, rename_nodes
|
|
294
|
+
)[0]
|
|
277
295
|
for node in nx_graph.nodes
|
|
278
296
|
}
|
|
279
297
|
renamed = nx.relabel_nodes(nx_graph, mapping, copy=False)
|
|
280
298
|
return renamed
|
|
281
299
|
|
|
282
300
|
def _get_all_ancestors(
|
|
283
|
-
self,
|
|
301
|
+
self,
|
|
302
|
+
renamed: nx.DiGraph,
|
|
303
|
+
root_label: str,
|
|
304
|
+
switch_label_and_id: bool,
|
|
305
|
+
rename_nodes: bool = True,
|
|
284
306
|
) -> nx.DiGraph:
|
|
285
307
|
"""Get all ancestors of the root node in the networkx graph.
|
|
286
308
|
|
|
287
309
|
Args:
|
|
288
310
|
renamed (nx.DiGraph): The renamed networkx graph
|
|
289
311
|
root_label (str): The label of the root node in the ontology
|
|
312
|
+
switch_label_and_id (bool): If True, id and label are switched
|
|
313
|
+
rename_nodes (bool): If True, the nodes are renamed
|
|
290
314
|
|
|
291
315
|
Returns:
|
|
292
316
|
nx.DiGraph: The filtered networkx graph
|
|
293
317
|
"""
|
|
294
318
|
root = self._get_nx_id_and_label(
|
|
295
|
-
self._find_root_label(self._rdf_graph, root_label),
|
|
319
|
+
self._find_root_label(self._rdf_graph, root_label),
|
|
320
|
+
switch_label_and_id,
|
|
321
|
+
rename_nodes,
|
|
296
322
|
)[0]
|
|
297
323
|
ancestors = nx.ancestors(renamed, root)
|
|
298
324
|
ancestors.add(root)
|
|
@@ -300,7 +326,7 @@ class OntologyAdapter:
|
|
|
300
326
|
return filtered_graph
|
|
301
327
|
|
|
302
328
|
def _get_nx_id_and_label(
|
|
303
|
-
self, node, switch_id_and_label: bool
|
|
329
|
+
self, node, switch_id_and_label: bool, rename_nodes: bool = True
|
|
304
330
|
) -> tuple[str, str]:
|
|
305
331
|
"""Rename node id and label for nx graph.
|
|
306
332
|
|
|
@@ -312,10 +338,10 @@ class OntologyAdapter:
|
|
|
312
338
|
tuple[str, str]: The renamed node id and label
|
|
313
339
|
"""
|
|
314
340
|
node_id_str = self._remove_prefix(str(node))
|
|
315
|
-
node_label_str = str(
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
341
|
+
node_label_str = str(self._rdf_graph.value(node, rdflib.RDFS.label))
|
|
342
|
+
if rename_nodes:
|
|
343
|
+
node_label_str = node_label_str.replace("_", " ")
|
|
344
|
+
node_label_str = to_lower_sentence_case(node_label_str)
|
|
319
345
|
nx_id = node_label_str if switch_id_and_label else node_id_str
|
|
320
346
|
nx_label = node_id_str if switch_id_and_label else node_label_str
|
|
321
347
|
return nx_id, nx_label
|
|
@@ -330,8 +356,14 @@ class OntologyAdapter:
|
|
|
330
356
|
root = label_subject
|
|
331
357
|
break
|
|
332
358
|
else:
|
|
359
|
+
labels_in_ontology = []
|
|
360
|
+
for label_subject, _, label_in_ontology in g.triples(
|
|
361
|
+
(None, rdflib.RDFS.label, None)
|
|
362
|
+
):
|
|
363
|
+
labels_in_ontology.append(str(label_in_ontology))
|
|
333
364
|
raise ValueError(
|
|
334
|
-
f"Could not find root node with label {root_label}"
|
|
365
|
+
f"Could not find root node with label '{root_label}'. "
|
|
366
|
+
f"The ontology contains the following labels: {labels_in_ontology}"
|
|
335
367
|
)
|
|
336
368
|
return root
|
|
337
369
|
|
|
@@ -398,11 +430,29 @@ class OntologyAdapter:
|
|
|
398
430
|
"""
|
|
399
431
|
return self._rdf_graph
|
|
400
432
|
|
|
401
|
-
def
|
|
433
|
+
def get_root_node(self):
|
|
402
434
|
"""
|
|
403
|
-
Get
|
|
435
|
+
Get root node in the ontology.
|
|
436
|
+
|
|
437
|
+
Returns:
|
|
438
|
+
root_node: If _switch_label_and_id is True, the root node label is returned,
|
|
439
|
+
otherwise the root node id is returned.
|
|
404
440
|
"""
|
|
405
|
-
|
|
441
|
+
|
|
442
|
+
root_node = None
|
|
443
|
+
root_label = self._root_label.replace("_", " ")
|
|
444
|
+
|
|
445
|
+
if self._switch_label_and_id:
|
|
446
|
+
root_node = to_lower_sentence_case(root_label)
|
|
447
|
+
elif not self._switch_label_and_id:
|
|
448
|
+
for node, data in self.get_nx_graph().nodes(data=True):
|
|
449
|
+
if "label" in data and data["label"] == to_lower_sentence_case(
|
|
450
|
+
root_label
|
|
451
|
+
):
|
|
452
|
+
root_node = node
|
|
453
|
+
break
|
|
454
|
+
|
|
455
|
+
return root_node
|
|
406
456
|
|
|
407
457
|
def get_ancestors(self, node_label):
|
|
408
458
|
"""
|
|
@@ -465,8 +515,8 @@ class Ontology:
|
|
|
465
515
|
|
|
466
516
|
if self._tail_ontologies:
|
|
467
517
|
for adapter in self._tail_ontologies.values():
|
|
468
|
-
self.
|
|
469
|
-
self._join_ontologies(adapter)
|
|
518
|
+
head_join_node = self._get_head_join_node(adapter)
|
|
519
|
+
self._join_ontologies(adapter, head_join_node)
|
|
470
520
|
else:
|
|
471
521
|
self._nx_graph = self._head_ontology.get_nx_graph()
|
|
472
522
|
|
|
@@ -489,7 +539,10 @@ class Ontology:
|
|
|
489
539
|
self._head_ontology = OntologyAdapter(
|
|
490
540
|
ontology_file=self._head_ontology_meta["url"],
|
|
491
541
|
root_label=self._head_ontology_meta["root_node"],
|
|
492
|
-
|
|
542
|
+
ontology_file_format=self._head_ontology_meta.get("format", None),
|
|
543
|
+
switch_label_and_id=self._head_ontology_meta.get(
|
|
544
|
+
"switch_label_and_id", True
|
|
545
|
+
),
|
|
493
546
|
)
|
|
494
547
|
|
|
495
548
|
if self._tail_ontology_meta:
|
|
@@ -498,12 +551,13 @@ class Ontology:
|
|
|
498
551
|
self._tail_ontologies[key] = OntologyAdapter(
|
|
499
552
|
ontology_file=value["url"],
|
|
500
553
|
root_label=value["tail_join_node"],
|
|
501
|
-
|
|
502
|
-
|
|
554
|
+
head_join_node_label=value["head_join_node"],
|
|
555
|
+
ontology_file_format=value.get("format", None),
|
|
503
556
|
merge_nodes=value.get("merge_nodes", True),
|
|
557
|
+
switch_label_and_id=value.get("switch_label_and_id", True),
|
|
504
558
|
)
|
|
505
559
|
|
|
506
|
-
def
|
|
560
|
+
def _get_head_join_node(self, adapter: OntologyAdapter) -> str:
|
|
507
561
|
"""
|
|
508
562
|
Tries to find the head join node of the given ontology adapter in the
|
|
509
563
|
head ontology. If the join node is not found, the method will raise an
|
|
@@ -514,15 +568,41 @@ class Ontology:
|
|
|
514
568
|
join node in the head ontology.
|
|
515
569
|
"""
|
|
516
570
|
|
|
517
|
-
head_join_node =
|
|
571
|
+
head_join_node = None
|
|
572
|
+
user_defined_head_join_node_label = adapter.get_head_join_node()
|
|
573
|
+
head_join_node_label_in_bc_format = to_lower_sentence_case(
|
|
574
|
+
user_defined_head_join_node_label.replace("_", " ")
|
|
575
|
+
)
|
|
576
|
+
|
|
577
|
+
if self._head_ontology._switch_label_and_id:
|
|
578
|
+
head_join_node = head_join_node_label_in_bc_format
|
|
579
|
+
elif not self._head_ontology._switch_label_and_id:
|
|
580
|
+
for node_id, data in self._head_ontology.get_nx_graph().nodes(
|
|
581
|
+
data=True
|
|
582
|
+
):
|
|
583
|
+
if (
|
|
584
|
+
"label" in data
|
|
585
|
+
and data["label"] == head_join_node_label_in_bc_format
|
|
586
|
+
):
|
|
587
|
+
head_join_node = node_id
|
|
588
|
+
break
|
|
518
589
|
|
|
519
590
|
if head_join_node not in self._head_ontology.get_nx_graph().nodes:
|
|
591
|
+
head_ontology = self._head_ontology._rdf_to_nx(
|
|
592
|
+
self._head_ontology.get_rdf_graph(),
|
|
593
|
+
self._head_ontology._root_label,
|
|
594
|
+
self._head_ontology._switch_label_and_id,
|
|
595
|
+
rename_nodes=False,
|
|
596
|
+
)
|
|
520
597
|
raise ValueError(
|
|
521
|
-
f"Head join node {head_join_node} not found in "
|
|
522
|
-
f"head ontology."
|
|
598
|
+
f"Head join node '{head_join_node}' not found in head ontology. "
|
|
599
|
+
f"The head ontology contains the following nodes: {head_ontology.nodes}."
|
|
523
600
|
)
|
|
601
|
+
return head_join_node
|
|
524
602
|
|
|
525
|
-
def _join_ontologies(
|
|
603
|
+
def _join_ontologies(
|
|
604
|
+
self, adapter: OntologyAdapter, head_join_node
|
|
605
|
+
) -> None:
|
|
526
606
|
"""
|
|
527
607
|
Joins the ontologies by adding the tail ontology as a subgraph to the
|
|
528
608
|
head ontology at the specified join nodes.
|
|
@@ -535,8 +615,7 @@ class Ontology:
|
|
|
535
615
|
if not self._nx_graph:
|
|
536
616
|
self._nx_graph = self._head_ontology.get_nx_graph().copy()
|
|
537
617
|
|
|
538
|
-
|
|
539
|
-
tail_join_node = to_lower_sentence_case(adapter.get_root_label())
|
|
618
|
+
tail_join_node = adapter.get_root_node()
|
|
540
619
|
tail_ontology = adapter.get_nx_graph()
|
|
541
620
|
|
|
542
621
|
# subtree of tail ontology at join node
|
|
@@ -695,8 +774,9 @@ class Ontology:
|
|
|
695
774
|
Args:
|
|
696
775
|
|
|
697
776
|
to_disk (str): If specified, the ontology structure will be saved
|
|
698
|
-
to disk as a GRAPHML file
|
|
699
|
-
|
|
777
|
+
to disk as a GRAPHML file at the location (directory) specified
|
|
778
|
+
by the `to_disk` string, to be opened in your favourite graph
|
|
779
|
+
visualisation tool.
|
|
700
780
|
|
|
701
781
|
full (bool): If True, the full ontology structure will be shown,
|
|
702
782
|
including all nodes and edges. If False, only the nodes and
|
biocypher/write/_batch_writer.py
CHANGED
|
@@ -6,6 +6,7 @@ import os
|
|
|
6
6
|
import re
|
|
7
7
|
import glob
|
|
8
8
|
|
|
9
|
+
from rdflib import Graph
|
|
9
10
|
from more_itertools import peekable
|
|
10
11
|
|
|
11
12
|
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
@@ -117,6 +118,8 @@ class _BatchWriter(ABC):
|
|
|
117
118
|
db_password: str = None,
|
|
118
119
|
db_host: str = None,
|
|
119
120
|
db_port: str = None,
|
|
121
|
+
rdf_format: str = None,
|
|
122
|
+
rdf_namespaces: dict = {},
|
|
120
123
|
):
|
|
121
124
|
"""
|
|
122
125
|
|
|
@@ -196,12 +199,20 @@ class _BatchWriter(ABC):
|
|
|
196
199
|
|
|
197
200
|
db_port:
|
|
198
201
|
The database port.
|
|
202
|
+
|
|
203
|
+
rdf_format:
|
|
204
|
+
The format of RDF.
|
|
205
|
+
|
|
206
|
+
rdf_namespaces:
|
|
207
|
+
The namespaces for RDF.
|
|
199
208
|
"""
|
|
200
209
|
self.db_name = db_name
|
|
201
210
|
self.db_user = db_user
|
|
202
211
|
self.db_password = db_password
|
|
203
212
|
self.db_host = db_host or "localhost"
|
|
204
213
|
self.db_port = db_port
|
|
214
|
+
self.rdf_format = rdf_format
|
|
215
|
+
self.rdf_namespaces = rdf_namespaces
|
|
205
216
|
|
|
206
217
|
self.delim, self.escaped_delim = self._process_delimiter(delimiter)
|
|
207
218
|
self.adelim, self.escaped_adelim = self._process_delimiter(
|
biocypher/write/_write.py
CHANGED
|
@@ -14,6 +14,7 @@ suitable for import into a DBMS.
|
|
|
14
14
|
"""
|
|
15
15
|
|
|
16
16
|
from biocypher._logger import logger
|
|
17
|
+
from biocypher.write.graph._rdf import _RDFWriter
|
|
17
18
|
from biocypher.write.graph._neo4j import _Neo4jBatchWriter
|
|
18
19
|
from biocypher.write.graph._arangodb import _ArangoDBBatchWriter
|
|
19
20
|
from biocypher.write.relational._sqlite import _SQLiteBatchWriter
|
|
@@ -25,7 +26,7 @@ from typing import TYPE_CHECKING
|
|
|
25
26
|
|
|
26
27
|
from biocypher._config import config as _config
|
|
27
28
|
|
|
28
|
-
__all__ = ["get_writer"]
|
|
29
|
+
__all__ = ["get_writer", "DBMS_TO_CLASS"]
|
|
29
30
|
|
|
30
31
|
if TYPE_CHECKING:
|
|
31
32
|
from biocypher._translate import Translator
|
|
@@ -43,6 +44,8 @@ DBMS_TO_CLASS = {
|
|
|
43
44
|
"ArangoDB": _ArangoDBBatchWriter,
|
|
44
45
|
"sqlite": _SQLiteBatchWriter,
|
|
45
46
|
"sqlite3": _SQLiteBatchWriter,
|
|
47
|
+
"rdf": _RDFWriter,
|
|
48
|
+
"RDF": _RDFWriter,
|
|
46
49
|
}
|
|
47
50
|
|
|
48
51
|
|
|
@@ -102,4 +105,6 @@ def get_writer(
|
|
|
102
105
|
db_user=dbms_config.get("user"), # psql
|
|
103
106
|
db_password=dbms_config.get("password"), # psql
|
|
104
107
|
db_port=dbms_config.get("port"), # psql
|
|
108
|
+
rdf_format=dbms_config.get("rdf_format"), # rdf
|
|
109
|
+
rdf_namespaces=dbms_config.get("rdf_namespaces"), # rdf
|
|
105
110
|
)
|
biocypher/write/graph/_neo4j.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
|
+
import re
|
|
3
|
+
import subprocess
|
|
2
4
|
|
|
3
5
|
from biocypher._logger import logger
|
|
4
6
|
from biocypher.write._batch_writer import parse_label, _BatchWriter
|
|
@@ -22,6 +24,19 @@ class _Neo4jBatchWriter(_BatchWriter):
|
|
|
22
24
|
- _write_array_string
|
|
23
25
|
"""
|
|
24
26
|
|
|
27
|
+
def __init__(self, *args, **kwargs):
|
|
28
|
+
"""
|
|
29
|
+
Constructor.
|
|
30
|
+
|
|
31
|
+
Checks the version of Neo4j and adds a command scope if version >= 5.
|
|
32
|
+
|
|
33
|
+
Returns:
|
|
34
|
+
_Neo4jBatchWriter: An instance of the writer.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
# Should read the configuration and setup import_call_bin_prefix.
|
|
38
|
+
super().__init__(*args, **kwargs)
|
|
39
|
+
|
|
25
40
|
def _get_default_import_call_bin_prefix(self):
|
|
26
41
|
"""
|
|
27
42
|
Method to provide the default string for the import call bin prefix.
|
|
@@ -29,6 +44,7 @@ class _Neo4jBatchWriter(_BatchWriter):
|
|
|
29
44
|
Returns:
|
|
30
45
|
str: The default location for the neo4j admin import location
|
|
31
46
|
"""
|
|
47
|
+
|
|
32
48
|
return "bin/"
|
|
33
49
|
|
|
34
50
|
def _write_array_string(self, string_list):
|
|
@@ -263,9 +279,32 @@ class _Neo4jBatchWriter(_BatchWriter):
|
|
|
263
279
|
Returns:
|
|
264
280
|
str: a bash command for neo4j-admin import
|
|
265
281
|
"""
|
|
282
|
+
import_call_neo4j_v4 = self._get_import_call(
|
|
283
|
+
"import", "--database=", "--force="
|
|
284
|
+
)
|
|
285
|
+
import_call_neo4j_v5 = self._get_import_call(
|
|
286
|
+
"database import full", "", "--overwrite-destination="
|
|
287
|
+
)
|
|
288
|
+
neo4j_version_check = f"version=$({self._get_default_import_call_bin_prefix()}neo4j-admin --version | cut -d '.' -f 1)"
|
|
289
|
+
|
|
290
|
+
import_script = f"#!/bin/bash\n{neo4j_version_check}\nif [[ $version -ge 5 ]]; then\n\t{import_call_neo4j_v5}\nelse\n\t{import_call_neo4j_v4}\nfi"
|
|
291
|
+
return import_script
|
|
292
|
+
|
|
293
|
+
def _get_import_call(
|
|
294
|
+
self, import_cmd: str, database_cmd: str, wipe_cmd: str
|
|
295
|
+
) -> str:
|
|
296
|
+
"""Get parametrized import call for Neo4j 4 or 5+.
|
|
297
|
+
|
|
298
|
+
Args:
|
|
299
|
+
import_cmd (str): The import command to use.
|
|
300
|
+
database_cmd (str): The database command to use.
|
|
301
|
+
wipe_cmd (str): The wipe command to use.
|
|
302
|
+
|
|
303
|
+
Returns:
|
|
304
|
+
str: The import call.
|
|
305
|
+
"""
|
|
266
306
|
import_call = (
|
|
267
|
-
f"{self.import_call_bin_prefix}neo4j-admin
|
|
268
|
-
f"--database={self.db_name} "
|
|
307
|
+
f"{self.import_call_bin_prefix}neo4j-admin {import_cmd} "
|
|
269
308
|
f'--delimiter="{self.escaped_delim}" '
|
|
270
309
|
f'--array-delimiter="{self.escaped_adelim}" '
|
|
271
310
|
)
|
|
@@ -276,7 +315,7 @@ class _Neo4jBatchWriter(_BatchWriter):
|
|
|
276
315
|
import_call += f"--quote='{self.quote}' "
|
|
277
316
|
|
|
278
317
|
if self.wipe:
|
|
279
|
-
import_call += f"
|
|
318
|
+
import_call += f"{wipe_cmd}true "
|
|
280
319
|
if self.skip_bad_relationships:
|
|
281
320
|
import_call += "--skip-bad-relationships=true "
|
|
282
321
|
if self.skip_duplicate_nodes:
|
|
@@ -290,4 +329,6 @@ class _Neo4jBatchWriter(_BatchWriter):
|
|
|
290
329
|
for header_path, parts_path in self.import_call_edges:
|
|
291
330
|
import_call += f'--relationships="{header_path},{parts_path}" '
|
|
292
331
|
|
|
332
|
+
# Database needs to be at the end starting with Neo4j 5.0+.
|
|
333
|
+
import_call += f"{database_cmd}{self.db_name} "
|
|
293
334
|
return import_call
|
|
@@ -0,0 +1,516 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
#
|
|
4
|
+
# Copyright 2021, Heidelberg University Clinic
|
|
5
|
+
#
|
|
6
|
+
# File author(s): Loes van den Biggelaar
|
|
7
|
+
# Sebastian Lobentanzer
|
|
8
|
+
#
|
|
9
|
+
# Distributed under MIT licence, see the file `LICENSE`.
|
|
10
|
+
#
|
|
11
|
+
"""
|
|
12
|
+
BioCypher 'offline' module. Handles the writing of node and edge representations
|
|
13
|
+
suitable for import into a DBMS.
|
|
14
|
+
"""
|
|
15
|
+
from types import GeneratorType
|
|
16
|
+
from typing import Union
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
|
|
20
|
+
from rdflib.namespace import (
|
|
21
|
+
_NAMESPACE_PREFIXES_CORE,
|
|
22
|
+
_NAMESPACE_PREFIXES_RDFLIB,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
from biocypher._create import BioCypherEdge, BioCypherNode
|
|
26
|
+
from biocypher._logger import logger
|
|
27
|
+
from biocypher.write._batch_writer import _BatchWriter
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class _RDFWriter(_BatchWriter):
|
|
31
|
+
|
|
32
|
+
"""
|
|
33
|
+
Class to write BioCypher's property graph into an RDF format using
|
|
34
|
+
rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
|
|
35
|
+
N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
|
|
36
|
+
is done keeping only the minimum information about nodes and edges,
|
|
37
|
+
skipping all properties.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
def _get_import_script_name(self) -> str:
|
|
41
|
+
"""
|
|
42
|
+
Returns the name of the RDF admin import script.
|
|
43
|
+
This function is applicable for RDF export.
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
str: The name of the import script (ending in .sh)
|
|
47
|
+
"""
|
|
48
|
+
return "rdf-import-call.sh"
|
|
49
|
+
|
|
50
|
+
def _get_default_import_call_bin_prefix(self):
|
|
51
|
+
"""
|
|
52
|
+
Method to provide the default string for the import call bin prefix.
|
|
53
|
+
|
|
54
|
+
Returns:
|
|
55
|
+
str: The default location for the RDF admin import location
|
|
56
|
+
"""
|
|
57
|
+
return "bin/"
|
|
58
|
+
|
|
59
|
+
def _is_rdf_format_supported(self, rdf_format: str) -> bool:
|
|
60
|
+
"""
|
|
61
|
+
Function to check if the specified RDF format is supported.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
rdf_format (str): The RDF format to check.
|
|
65
|
+
|
|
66
|
+
Returns:
|
|
67
|
+
bool: Returns True if rdf format supported, False otherwise.
|
|
68
|
+
"""
|
|
69
|
+
supported_formats = [
|
|
70
|
+
"xml",
|
|
71
|
+
"n3",
|
|
72
|
+
"turtle",
|
|
73
|
+
"nt",
|
|
74
|
+
"pretty-xml",
|
|
75
|
+
"trix",
|
|
76
|
+
"trig",
|
|
77
|
+
"nquads",
|
|
78
|
+
"json-ld",
|
|
79
|
+
]
|
|
80
|
+
if rdf_format not in supported_formats:
|
|
81
|
+
logger.error(
|
|
82
|
+
f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
|
|
83
|
+
f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
|
|
84
|
+
)
|
|
85
|
+
return False
|
|
86
|
+
else:
|
|
87
|
+
# RDF graph does not support 'ttl' format, only 'turtle' format. however, the preferred file extension is always '.ttl'
|
|
88
|
+
if self.rdf_format == "turtle":
|
|
89
|
+
self.extension = "ttl"
|
|
90
|
+
elif self.rdf_format == "ttl":
|
|
91
|
+
self.rdf_format = "turtle"
|
|
92
|
+
self.extension = "ttl"
|
|
93
|
+
else:
|
|
94
|
+
self.extension = self.rdf_format
|
|
95
|
+
return True
|
|
96
|
+
|
|
97
|
+
def _write_single_edge_list_to_file(
    self,
    edge_list: list,
    label: str,
    prop_dict: dict,
):
    """
    This function takes one list of biocypher edges and writes them
    to an RDF file with the given format.

    Every edge is written as a node of its own: the edge id (or, when the
    edge has no id, the concatenation of source and target id) becomes a
    resource in the biocypher namespace that is typed with the PascalCase
    edge label and linked to its endpoints through ``biocypher:subject``
    and ``biocypher:object``. Edge properties with a falsy value are
    skipped.

    Args:
        edge_list (list): list of BioCypherEdges to be written

        label (str): the label (type) of the edge; determines the file name

        prop_dict (dict): properties of the edge class passed from parsing
            function and their types (not used by this writer)

    Returns:
        bool: The return value. True for success, False otherwise.
    """

    if not all(isinstance(n, BioCypherEdge) for n in edge_list):
        logger.error("Edges must be passed as type BioCypherEdge.")
        return False

    # translate label to PascalCase
    label_pascal = self.translator.name_sentence_to_pascal(label)

    # create file name
    file_name = os.path.join(
        self._outdir, f"{label_pascal}.{self.extension}"
    )

    # write data in graph
    graph = Graph()
    self._init_namespaces(graph)

    for edge in edge_list:
        rdf_subject = edge.get_source_id()
        rdf_object = edge.get_target_id()
        rdf_predicate = edge.get_id()
        rdf_properties = edge.get_properties()
        # edges without an own id are identified by their endpoints
        if rdf_predicate is None:
            rdf_predicate = rdf_subject + rdf_object

        edge_label = self.translator.name_sentence_to_pascal(
            edge.get_label()
        )
        edge_uri = self.rdf_namespaces["biocypher"][edge_label]
        graph.add((edge_uri, RDF.type, RDFS.Class))
        graph.add(
            (
                self.rdf_namespaces["biocypher"][rdf_predicate],
                RDF.type,
                edge_uri,
            )
        )
        graph.add(
            (
                self.rdf_namespaces["biocypher"][rdf_predicate],
                self.rdf_namespaces["biocypher"]["subject"],
                self.subject_to_uri(rdf_subject),
            )
        )
        graph.add(
            (
                self.rdf_namespaces["biocypher"][rdf_predicate],
                self.rdf_namespaces["biocypher"]["object"],
                self.subject_to_uri(rdf_object),
            )
        )

        # add properties to the transformed edge --> node
        for key, value in rdf_properties.items():
            # only write value if it exists.
            if value:
                self.add_property_to_graph(graph, rdf_predicate, value, key)

    # NOTE(review): the file name depends only on the label; confirm that
    # the caller never passes several batches with the same label.
    graph.serialize(destination=file_name, format=self.rdf_format)

    # report the file that was actually written (extension, not format name)
    logger.info(
        f"Writing {len(edge_list)} entries to {label_pascal}.{self.extension}",
    )

    return True
|
|
183
|
+
|
|
184
|
+
def add_property_to_graph(
    self,
    graph: Graph,
    rdf_subject: str,
    rdf_object: str,
    rdf_predicate: str,
):
    """
    Attach one property of a node to the RDF graph as literal triple(s).

    A list value produces one triple per element. A string that starts with
    "[" and ends with "]" is treated as the string representation of a list:
    it is converted with `transform_string_to_list` and then handled like a
    list. Every other value (plain string or any other type) is added as a
    single literal.

    Args:
        graph (RDFLib.Graph): The RDF graph to add the nodes to.

        rdf_subject (str): The subject of the RDF triple.

        rdf_object (str): The object of the RDF triple.

        rdf_predicate (str): The predicate of the RDF triple.

    Returns:
        None
    """
    is_stringified_list = (
        isinstance(rdf_object, str)
        and rdf_object.startswith("[")
        and rdf_object.endswith("]")
    )
    if is_stringified_list:
        # parse the string and run through this method again with the list
        self.add_property_to_graph(
            graph,
            rdf_subject,
            self.transform_string_to_list(rdf_object),
            rdf_predicate,
        )
        return

    # scalars are handled as a one-element sequence
    if isinstance(rdf_object, list):
        literal_values = rdf_object
    else:
        literal_values = [rdf_object]

    for item in literal_values:
        triple = (
            self.subject_to_uri(rdf_subject),
            self.property_to_uri(rdf_predicate),
            Literal(item),
        )
        graph.add(triple)
|
|
242
|
+
|
|
243
|
+
def transform_string_to_list(self, string_list: str) -> list:
    """
    Function to transform a string representation of a list into a list.

    All square brackets and single quotes are removed from the input and
    the remainder is split on ", ". The elements are always returned as
    strings. The representation of an empty list ("[]") yields an empty
    list instead of a list holding one empty string.

    Args:
        string_list (str): The string representation of the list.

    Returns:
        list: The list representation of the input string.
    """
    # remove brackets and single quotes in a single pass
    stripped = string_list.translate(str.maketrans("", "", "[]'"))

    # nothing left means the list was empty; "".split(", ") would give [""]
    if not stripped.strip():
        return []

    return stripped.split(", ")
|
|
259
|
+
|
|
260
|
+
def _write_single_node_list_to_file(
    self,
    node_list: list,
    label: str,
    prop_dict: dict,
    labels: str,
):
    """
    This function takes a list of BioCypherNodes and writes them
    to an RDF file in the specified format.

    For every node the PascalCase node label is declared as an
    ``rdfs:Class`` in the biocypher namespace, the node itself is typed
    with that class, and each property with a truthy value is added via
    `add_property_to_graph`.

    Args:
        node_list (list): A list of BioCypherNodes to be written.

        label (str): The label (type) of the nodes; determines the file name.

        prop_dict (dict): A dictionary of properties and their types for the node class
            (not used by this writer).

        labels (str): Not used by this writer.

    Returns:
        bool: True if the writing is successful, False otherwise.
    """
    if not all(isinstance(n, BioCypherNode) for n in node_list):
        logger.error("Nodes must be passed as type BioCypherNode.")
        return False

    # translate label to PascalCase
    label_pascal = self.translator.name_sentence_to_pascal(label)

    # create file name
    file_name = os.path.join(
        self._outdir, f"{label_pascal}.{self.extension}"
    )

    # write data in graph
    graph = Graph()
    self._init_namespaces(graph)

    for n in node_list:
        rdf_subject = n.get_id()
        rdf_object = n.get_label()
        properties = n.get_properties()
        class_name = self.translator.name_sentence_to_pascal(rdf_object)
        graph.add(
            (
                self.rdf_namespaces["biocypher"][class_name],
                RDF.type,
                RDFS.Class,
            )
        )
        graph.add(
            (
                self.subject_to_uri(rdf_subject),
                RDF.type,
                self.rdf_namespaces["biocypher"][class_name],
            )
        )
        for key, value in properties.items():
            # only write value if it exists.
            if value:
                self.add_property_to_graph(graph, rdf_subject, value, key)

    graph.serialize(destination=file_name, format=self.rdf_format)

    # report the file that was actually written (extension, not format name)
    logger.info(
        f"Writing {len(node_list)} entries to {label_pascal}.{self.extension}",
    )

    return True
|
|
328
|
+
|
|
329
|
+
def write_nodes(
    self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
    """
    Entry point for writing nodes as RDF.

    The configured RDF format is validated first; afterwards the actual
    work is handed over to `_write_node_data()`.

    Args:
        nodes (list or generator): A list or generator of nodes in BioCypherNode format.
        batch_size (int): The number of nodes to write in each batch.
        force (bool): Flag to force the writing even if the output file already exists.

    Returns:
        bool: True if the writing is successful, False otherwise.
    """
    # bail out early when the requested serialisation is unknown
    if not self._is_rdf_format_supported(self.rdf_format):
        logger.error("Error while writing node data, wrong RDF format")
        return False

    # delegate the batching and writing of the nodes
    if self._write_node_data(nodes, batch_size, force):
        return True

    logger.error("Error while writing node data.")
    return False
|
|
354
|
+
|
|
355
|
+
def write_edges(
    self,
    edges: Union[list, GeneratorType],
    batch_size: int = int(1e6),
) -> bool:
    """
    Entry point for writing edges as RDF.

    The configured RDF format is validated first; afterwards the actual
    work is handed over to `_write_edge_data()`.

    Args:
        edges (BioCypherEdge): a list or generator of edges in
            :py:class:`BioCypherEdge` format
        batch_size (int): The number of edges to write in each batch.

    Returns:
        bool: The return value. True for success, False otherwise.
    """
    # bail out early when the requested serialisation is unknown
    if not self._is_rdf_format_supported(self.rdf_format):
        logger.error("Error while writing edge data, wrong RDF format")
        return False

    # delegate the batching and writing of the edges
    if self._write_edge_data(edges, batch_size=batch_size):
        return True

    logger.error("Error while writing edge data.")
    return False
|
|
384
|
+
|
|
385
|
+
def _construct_import_call(self) -> bool:
|
|
386
|
+
"""
|
|
387
|
+
Function to write the import call.
|
|
388
|
+
This function is not applicable for RDF.
|
|
389
|
+
|
|
390
|
+
Returns:
|
|
391
|
+
bool: The return value. True for success, False otherwise.
|
|
392
|
+
"""
|
|
393
|
+
return ""
|
|
394
|
+
|
|
395
|
+
def _write_array_string(self, string_list):
|
|
396
|
+
"""
|
|
397
|
+
Abstract method to write the string representation of an array into a .csv file
|
|
398
|
+
as required by the RDF admin-import.
|
|
399
|
+
This function is not applicable for RDF.
|
|
400
|
+
|
|
401
|
+
Args:
|
|
402
|
+
string_list (list): list of ontology strings
|
|
403
|
+
|
|
404
|
+
Returns:
|
|
405
|
+
str: The string representation of an array for the neo4j admin import
|
|
406
|
+
"""
|
|
407
|
+
|
|
408
|
+
return True
|
|
409
|
+
|
|
410
|
+
def _write_node_headers(self):
|
|
411
|
+
"""
|
|
412
|
+
Abstract method that takes care of importing properties of a graph entity that is represented
|
|
413
|
+
as a node as per the definition in the `schema_config.yaml`
|
|
414
|
+
This function is not applicable for RDF.
|
|
415
|
+
|
|
416
|
+
Returns:
|
|
417
|
+
bool: The return value. True for success, False otherwise.
|
|
418
|
+
"""
|
|
419
|
+
return True
|
|
420
|
+
|
|
421
|
+
def _write_edge_headers(self):
|
|
422
|
+
"""
|
|
423
|
+
Abstract method to write a database import-file for a graph entity that is represented
|
|
424
|
+
as an edge as per the definition in the `schema_config.yaml`,
|
|
425
|
+
containing only the header for this type of edge.
|
|
426
|
+
This function is not applicable for RDF.
|
|
427
|
+
|
|
428
|
+
Returns:
|
|
429
|
+
bool: The return value. True for success, False otherwise.
|
|
430
|
+
"""
|
|
431
|
+
return True
|
|
432
|
+
|
|
433
|
+
def subject_to_uri(self, subject: str) -> str:
    """
    Build the URI of a subject from the known namespaces.

    A subject of the form "prefix:id" (exactly one colon) whose prefix is a
    key of ``self.rdf_namespaces`` is resolved in that namespace. In every
    other case the complete subject is placed in the biocypher namespace.

    Args:
        subject (str): The subject to be converted to a URI.

    Returns:
        str: The corresponding URI for the subject.
    """
    pieces = subject.split(":")

    # only an unambiguous "prefix:id" pair can be mapped to a namespace
    if len(pieces) == 2:
        prefix, local_id = pieces
        if prefix in self.rdf_namespaces.keys():
            return self.rdf_namespaces[prefix][local_id]

    # fallback: unknown prefix, no colon, or more than one colon
    return self.rdf_namespaces["biocypher"][subject]
|
|
453
|
+
|
|
454
|
+
def property_to_uri(self, property_name: str) -> str:
    """
    Converts a property name to its corresponding URI.

    The namespaces are searched in a fixed order and the first one that
    contains the property name wins:

    1. the namespaces in ``_NAMESPACE_PREFIXES_CORE``
       (core for rdflib; owl, rdf, rdfs, xsd and xml),
    2. SKOS, DC and DCTERMS,
    3. the namespaces in ``_NAMESPACE_PREFIXES_RDFLIB``.

    If nothing matches, the spelling "licence" is retried as "license";
    any other name is placed in the biocypher namespace.

    Args:
        property_name (str): The property name to be converted to a URI.

    Returns:
        str: The corresponding URI for the input property name.
    """
    # one ordered search list instead of three identical loops
    candidate_namespaces = (
        *_NAMESPACE_PREFIXES_CORE.values(),
        SKOS,
        DC,
        DCTERMS,
        *_NAMESPACE_PREFIXES_RDFLIB.values(),
    )
    for namespace in candidate_namespaces:
        if property_name in namespace:
            return namespace[property_name]

    # If the property name is "licence", it recursively calls the function with "license" as the input.
    if property_name == "licence":
        return self.property_to_uri("license")

    # TODO: add an option to search trough manually implemented namespaces

    # If the input is not found in any of the namespaces, it returns the corresponding URI from the biocypher namespace.
    # TODO: give a warning and try to prevent this option altogether
    return self.rdf_namespaces["biocypher"][property_name]
|
|
491
|
+
|
|
492
|
+
def _init_namespaces(self, graph: Graph):
    """
    Prepare the namespaces that are used to convert nodes to URIs.

    The biocypher standard namespace is stored under the key "biocypher" in
    ``self.rdf_namespaces``: it becomes the only entry when no namespaces
    are set, otherwise it is merged into the existing ones (replacing an
    existing "biocypher" entry). Afterwards every value is wrapped in a
    `Namespace`, written back to ``self.rdf_namespaces`` and bound to the
    graph under its key.

    Args:
        graph (RDFLib.Graph): The RDF graph to bind the namespaces to.

    Returns:
        None
    """
    default_namespace = {"biocypher": "https://biocypher.org/biocypher#"}
    configured = self.rdf_namespaces

    # the standard entry is listed last in the merge, so it takes precedence
    if configured:
        self.rdf_namespaces = configured | default_namespace
    else:
        self.rdf_namespaces = default_namespace

    # only values of existing keys are replaced, the key set stays untouched
    for prefix in self.rdf_namespaces:
        wrapped = Namespace(self.rdf_namespaces[prefix])
        self.rdf_namespaces[prefix] = wrapped
        graph.bind(prefix, wrapped)
|
|
@@ -1,32 +1,33 @@
|
|
|
1
1
|
biocypher/__init__.py,sha256=ejNY53vH_pE3ZbIN8G_ZBYxxPG9aERovRLD0XhDvt4k,942
|
|
2
2
|
biocypher/_config/__init__.py,sha256=fFHRFYxE2MtDAQWL6upe--MJ1vw3Z8CwIPhF2gW8cRU,3698
|
|
3
|
-
biocypher/_config/biocypher_config.yaml,sha256=
|
|
3
|
+
biocypher/_config/biocypher_config.yaml,sha256=VE_UH6POExAsuPpqWsahsT8-9k5jglMkuBfuszH1tiU,2868
|
|
4
4
|
biocypher/_config/test_config.yaml,sha256=Np8jeS5_EP6HHOvMKb7B_Tkyqd5YaYlYz_DVsXypt-A,119
|
|
5
5
|
biocypher/_config/test_schema_config.yaml,sha256=D1600WgEj3iTXrumVU9LIivJHJO36iaxfkOgyam9zVU,3129
|
|
6
6
|
biocypher/_config/test_schema_config_disconnected.yaml,sha256=Qm8FLxEn2spHcyj_5F859KjcDvKSxNhxDvi4b4LLkvQ,68
|
|
7
7
|
biocypher/_config/test_schema_config_extended.yaml,sha256=wn3A76142hhjnImhMF6RODbCFESTJ2TtPvcFdIFsAT0,3309
|
|
8
|
-
biocypher/_connect.py,sha256=
|
|
8
|
+
biocypher/_connect.py,sha256=7hk3J03hzZOPE48ISaoB6IgRun8GaUmDtIRnnD7vKiU,13453
|
|
9
9
|
biocypher/_core.py,sha256=5rZKYie_vSjTYduH8oH-GxLMZuNqLAe3ZYAQ5nUp8Nc,22578
|
|
10
10
|
biocypher/_create.py,sha256=vpUchUdEpWupZi1LgFLxAWMtqoBwnWbP7PwEDUCBS4A,10202
|
|
11
11
|
biocypher/_deduplicate.py,sha256=BBvfpXzu6L5YDY5FdtXxnf8YlsbJpbCE8RdUoKsm0n0,4949
|
|
12
12
|
biocypher/_get.py,sha256=3Kpky3blfNf1JwxKWLsZxTU2aTP_C4sUe8OpiyYj63I,10810
|
|
13
13
|
biocypher/_logger.py,sha256=NGXe3hZA79WSujfOgpcxHBf8N2QAfrmvM1LFDpsGK2U,3185
|
|
14
14
|
biocypher/_mapping.py,sha256=ERSNH2Bg19145KytxbFE4BInPaiP-LWW7osOBot29Eo,9304
|
|
15
|
-
biocypher/_metadata.py,sha256=
|
|
16
|
-
biocypher/_misc.py,sha256=
|
|
17
|
-
biocypher/_ontology.py,sha256=
|
|
15
|
+
biocypher/_metadata.py,sha256=hTN9aStXCS7IzABrE7BmT5GZ-8YUt8gP9PG_P5Ix1Vw,1658
|
|
16
|
+
biocypher/_misc.py,sha256=18EG2Bei3RnyWXDWc3qtZaT3gybvXI8opi0HvSaF7Lg,6066
|
|
17
|
+
biocypher/_ontology.py,sha256=G5k-bnzvPZUqhLPxtoOPFa4OSQ4JpufgozVakLTjwLg,31789
|
|
18
18
|
biocypher/_pandas.py,sha256=GVCFM68J7yBjh40MpkNVgD8qT1RFMrrIjMOtD3iKsf4,3040
|
|
19
19
|
biocypher/_translate.py,sha256=JafvhtVaFSpruRfYh9BzjVbvDF1Mhg7LLKMDZHWkRjg,16496
|
|
20
20
|
biocypher/write/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
-
biocypher/write/_batch_writer.py,sha256=
|
|
22
|
-
biocypher/write/_write.py,sha256=
|
|
21
|
+
biocypher/write/_batch_writer.py,sha256=x_fe2yndASNAvO-GaeVhjUVxnSNdDZ6-FB1mj572Jvw,37129
|
|
22
|
+
biocypher/write/_write.py,sha256=4UYw-y3CevwcdVBq6ou1rTJXuXrcde7oraWeO8YXcK4,3330
|
|
23
23
|
biocypher/write/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
24
|
biocypher/write/graph/_arangodb.py,sha256=du5pivCR7xKs8VyxeegxYsSBIcsXGrfSbM_AffFapwg,8071
|
|
25
|
-
biocypher/write/graph/_neo4j.py,sha256=
|
|
25
|
+
biocypher/write/graph/_neo4j.py,sha256=qSj1PryD4UmveS7ACs1R3eo2pegi53pVI7d7P0ihOKI,11930
|
|
26
|
+
biocypher/write/graph/_rdf.py,sha256=9_u9usWhU7EKKDd1PgXyV99opS5IAeef2lhDNEN6fOw,17973
|
|
26
27
|
biocypher/write/relational/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
27
28
|
biocypher/write/relational/_postgresql.py,sha256=NdI-ULP8valsqlkObOg50od-3-amVj5RzGnZ_7NW2ww,11945
|
|
28
29
|
biocypher/write/relational/_sqlite.py,sha256=KLQpxQXF1B8qqTtKUFfjWdwHjd1Fhn9syK931Z0dsq0,2066
|
|
29
|
-
biocypher-0.5.
|
|
30
|
-
biocypher-0.5.
|
|
31
|
-
biocypher-0.5.
|
|
32
|
-
biocypher-0.5.
|
|
30
|
+
biocypher-0.5.42.dist-info/LICENSE,sha256=SjUaQkq671iQUZOxEUpC4jvJxXOlfSiHTTueyz9kXJM,1065
|
|
31
|
+
biocypher-0.5.42.dist-info/METADATA,sha256=3lT_thshGguJMnCeer-4JaJQfrsuKeWAd6oaYWhXPyk,10642
|
|
32
|
+
biocypher-0.5.42.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
|
|
33
|
+
biocypher-0.5.42.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|