biocypher 0.6.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biocypher might be problematic. Click here for more details.
- biocypher/__init__.py +3 -13
- biocypher/_config/__init__.py +6 -23
- biocypher/_config/biocypher_config.yaml +14 -3
- biocypher/_core.py +360 -262
- biocypher/_create.py +13 -27
- biocypher/_deduplicate.py +4 -11
- biocypher/_get.py +21 -60
- biocypher/_logger.py +4 -16
- biocypher/_mapping.py +4 -17
- biocypher/_metadata.py +3 -15
- biocypher/_misc.py +14 -28
- biocypher/_ontology.py +127 -212
- biocypher/_translate.py +34 -58
- biocypher/output/connect/_get_connector.py +40 -0
- biocypher/output/connect/_neo4j_driver.py +9 -65
- biocypher/output/in_memory/_get_in_memory_kg.py +34 -0
- biocypher/output/in_memory/_in_memory_kg.py +40 -0
- biocypher/output/in_memory/_networkx.py +44 -0
- biocypher/output/in_memory/_pandas.py +20 -15
- biocypher/output/write/_batch_writer.py +166 -179
- biocypher/output/write/_get_writer.py +11 -24
- biocypher/output/write/_writer.py +43 -44
- biocypher/output/write/graph/_arangodb.py +7 -24
- biocypher/output/write/graph/_neo4j.py +51 -56
- biocypher/output/write/graph/_networkx.py +36 -43
- biocypher/output/write/graph/_rdf.py +107 -95
- biocypher/output/write/relational/_csv.py +6 -11
- biocypher/output/write/relational/_postgresql.py +5 -13
- biocypher/output/write/relational/_sqlite.py +3 -1
- {biocypher-0.6.2.dist-info → biocypher-0.8.0.dist-info}/LICENSE +1 -1
- {biocypher-0.6.2.dist-info → biocypher-0.8.0.dist-info}/METADATA +3 -3
- biocypher-0.8.0.dist-info/RECORD +43 -0
- {biocypher-0.6.2.dist-info → biocypher-0.8.0.dist-info}/WHEEL +1 -1
- biocypher-0.6.2.dist-info/RECORD +0 -39
|
@@ -1,31 +1,34 @@
|
|
|
1
1
|
import pickle
|
|
2
2
|
|
|
3
|
-
import networkx as nx
|
|
4
|
-
|
|
5
3
|
from biocypher._logger import logger
|
|
4
|
+
from biocypher.output.in_memory._networkx import NetworkxKG
|
|
6
5
|
from biocypher.output.write._writer import _Writer
|
|
7
|
-
from biocypher.output.write.relational._csv import _PandasCSVWriter
|
|
8
6
|
|
|
9
7
|
|
|
10
8
|
class _NetworkXWriter(_Writer):
|
|
11
9
|
"""
|
|
12
|
-
Class for writing
|
|
10
|
+
Class for writing the in-memory networkx DiGraph to file.
|
|
11
|
+
|
|
12
|
+
Call `_construct_import_call` to write the networkx DiGraph to a pickle
|
|
13
|
+
file and return the Python call to load it.
|
|
14
|
+
|
|
15
|
+
TODO: this is a non-intuitive name, should be adjusted.
|
|
13
16
|
"""
|
|
14
17
|
|
|
15
18
|
def __init__(self, *args, **kwargs):
|
|
16
19
|
super().__init__(*args, **kwargs)
|
|
17
|
-
self.
|
|
18
|
-
|
|
20
|
+
self.in_memory_networkx_kg = NetworkxKG(
|
|
21
|
+
deduplicator=self.deduplicator,
|
|
22
|
+
)
|
|
19
23
|
|
|
20
24
|
def _construct_import_call(self) -> str:
|
|
21
|
-
"""
|
|
25
|
+
"""Dump networkx graph to a pickle file and return Python call.
|
|
22
26
|
|
|
23
27
|
Returns:
|
|
24
|
-
str: Python code to load the
|
|
28
|
+
str: Python code to load the networkx graph from a pickle file.
|
|
25
29
|
"""
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
)
|
|
30
|
+
self.G = self.in_memory_networkx_kg._create_networkx_kg()
|
|
31
|
+
logger.info(f"Writing networkx {self.G} to pickle file networkx_graph.pkl.")
|
|
29
32
|
with open(f"{self.output_directory}/networkx_graph.pkl", "wb") as f:
|
|
30
33
|
pickle.dump(self.G, f)
|
|
31
34
|
|
|
@@ -38,39 +41,29 @@ class _NetworkXWriter(_Writer):
|
|
|
38
41
|
return "import_networkx.py"
|
|
39
42
|
|
|
40
43
|
def _write_node_data(self, nodes) -> bool:
|
|
41
|
-
|
|
42
|
-
|
|
44
|
+
"""Add nodes to the networkx graph.
|
|
45
|
+
|
|
46
|
+
TODO: this is not strictly writing, should be refactored.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
nodes (list): List of nodes to add to the networkx graph.
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
bool: True if the nodes were added successfully, False otherwise.
|
|
53
|
+
"""
|
|
54
|
+
passed = self.in_memory_networkx_kg.add_nodes(nodes)
|
|
43
55
|
return passed
|
|
44
56
|
|
|
45
57
|
def _write_edge_data(self, edges) -> bool:
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
58
|
+
"""Add edges to the networkx graph.
|
|
59
|
+
|
|
60
|
+
TODO: this is not strictly writing, should be refactored.
|
|
49
61
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
df
|
|
59
|
-
for df in all_dfs.values()
|
|
60
|
-
if df.columns.str.contains("source_id").any()
|
|
61
|
-
and df.columns.str.contains("target_id").any()
|
|
62
|
-
]
|
|
63
|
-
for df in node_dfs:
|
|
64
|
-
nodes = df.set_index("node_id").to_dict(orient="index")
|
|
65
|
-
self.G.add_nodes_from(nodes.items())
|
|
66
|
-
for df in edge_dfs:
|
|
67
|
-
edges = df.set_index(["source_id", "target_id"]).to_dict(
|
|
68
|
-
orient="index"
|
|
69
|
-
)
|
|
70
|
-
self.G.add_edges_from(
|
|
71
|
-
(
|
|
72
|
-
(source, target, attrs)
|
|
73
|
-
for (source, target), attrs in edges.items()
|
|
74
|
-
)
|
|
75
|
-
)
|
|
76
|
-
return True
|
|
62
|
+
Args:
|
|
63
|
+
edges (list): List of edges to add to the networkx graph.
|
|
64
|
+
|
|
65
|
+
Returns:
|
|
66
|
+
bool: True if the edges were added successfully, False otherwise.
|
|
67
|
+
"""
|
|
68
|
+
passed = self.in_memory_networkx_kg.add_edges(edges)
|
|
69
|
+
return passed
|
|
@@ -1,22 +1,12 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
#
|
|
4
|
-
# Copyright 2021, Heidelberg University Clinic
|
|
5
|
-
#
|
|
6
|
-
# File author(s): Loes van den Biggelaar
|
|
7
|
-
# Sebastian Lobentanzer
|
|
8
|
-
#
|
|
9
|
-
# Distributed under MIT licence, see the file `LICENSE`.
|
|
10
|
-
#
|
|
11
|
-
"""
|
|
12
|
-
BioCypher 'offline' module. Handles the writing of node and edge representations
|
|
1
|
+
"""BioCypher 'offline' module. Handles the writing of node and edge representations
|
|
13
2
|
suitable for import into a DBMS.
|
|
14
3
|
"""
|
|
15
|
-
|
|
16
|
-
from typing import Union
|
|
4
|
+
|
|
17
5
|
import os
|
|
18
6
|
|
|
19
|
-
from
|
|
7
|
+
from types import GeneratorType
|
|
8
|
+
|
|
9
|
+
from rdflib import DC, DCTERMS, RDF, RDFS, SKOS, Graph, Literal, Namespace
|
|
20
10
|
from rdflib.namespace import (
|
|
21
11
|
_NAMESPACE_PREFIXES_CORE,
|
|
22
12
|
_NAMESPACE_PREFIXES_RDFLIB,
|
|
@@ -28,8 +18,7 @@ from biocypher.output.write._batch_writer import _BatchWriter
|
|
|
28
18
|
|
|
29
19
|
|
|
30
20
|
class _RDFWriter(_BatchWriter):
|
|
31
|
-
"""
|
|
32
|
-
Class to write BioCypher's property graph into an RDF format using
|
|
21
|
+
"""Class to write BioCypher's property graph into an RDF format using
|
|
33
22
|
rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
|
|
34
23
|
N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
|
|
35
24
|
is done keeping only the minimum information about node and edges,
|
|
@@ -37,33 +26,37 @@ class _RDFWriter(_BatchWriter):
|
|
|
37
26
|
"""
|
|
38
27
|
|
|
39
28
|
def _get_import_script_name(self) -> str:
|
|
40
|
-
"""
|
|
41
|
-
Returns the name of the RDF admin import script.
|
|
29
|
+
"""Returns the name of the RDF admin import script.
|
|
42
30
|
This function applicable for RDF export.
|
|
43
31
|
|
|
44
|
-
Returns
|
|
32
|
+
Returns
|
|
33
|
+
-------
|
|
45
34
|
str: The name of the import script (ending in .sh)
|
|
35
|
+
|
|
46
36
|
"""
|
|
47
37
|
return "rdf-import-call.sh"
|
|
48
38
|
|
|
49
39
|
def _get_default_import_call_bin_prefix(self):
|
|
50
|
-
"""
|
|
51
|
-
Method to provide the default string for the import call bin prefix.
|
|
40
|
+
"""Method to provide the default string for the import call bin prefix.
|
|
52
41
|
|
|
53
|
-
Returns
|
|
42
|
+
Returns
|
|
43
|
+
-------
|
|
54
44
|
str: The default location for the RDF admin import location
|
|
45
|
+
|
|
55
46
|
"""
|
|
56
47
|
return "bin/"
|
|
57
48
|
|
|
58
49
|
def _is_rdf_format_supported(self, rdf_format: str) -> bool:
|
|
59
|
-
"""
|
|
60
|
-
Function to check if the specified RDF format is supported.
|
|
50
|
+
"""Function to check if the specified RDF format is supported.
|
|
61
51
|
|
|
62
52
|
Args:
|
|
53
|
+
----
|
|
63
54
|
rdf_format (str): The RDF format to check.
|
|
64
55
|
|
|
65
56
|
Returns:
|
|
57
|
+
-------
|
|
66
58
|
bool: Returns True if rdf format supported, False otherwise.
|
|
59
|
+
|
|
67
60
|
"""
|
|
68
61
|
supported_formats = [
|
|
69
62
|
"xml",
|
|
@@ -83,7 +76,8 @@ class _RDFWriter(_BatchWriter):
|
|
|
83
76
|
)
|
|
84
77
|
return False
|
|
85
78
|
else:
|
|
86
|
-
# RDF graph does not support 'ttl' format, only 'turtle' format.
|
|
79
|
+
# RDF graph does not support 'ttl' format, only 'turtle' format.
|
|
80
|
+
# however, the preferred file extension is always '.ttl'
|
|
87
81
|
if self.rdf_format == "turtle":
|
|
88
82
|
self.extension = "ttl"
|
|
89
83
|
elif self.rdf_format == "ttl":
|
|
@@ -99,11 +93,11 @@ class _RDFWriter(_BatchWriter):
|
|
|
99
93
|
label: str,
|
|
100
94
|
prop_dict: dict,
|
|
101
95
|
):
|
|
102
|
-
"""
|
|
103
|
-
This function takes one list of biocypher edges and writes them
|
|
96
|
+
"""This function takes one list of biocypher edges and writes them
|
|
104
97
|
to an RDF file with the given format.
|
|
105
98
|
|
|
106
99
|
Args:
|
|
100
|
+
----
|
|
107
101
|
edge_list (list): list of BioCypherEdges to be written
|
|
108
102
|
|
|
109
103
|
label (str): the label (type) of the edge
|
|
@@ -112,9 +106,10 @@ class _RDFWriter(_BatchWriter):
|
|
|
112
106
|
function and their types
|
|
113
107
|
|
|
114
108
|
Returns:
|
|
109
|
+
-------
|
|
115
110
|
bool: The return value. True for success, False otherwise.
|
|
116
|
-
"""
|
|
117
111
|
|
|
112
|
+
"""
|
|
118
113
|
if not all(isinstance(n, BioCypherEdge) for n in edge_list):
|
|
119
114
|
logger.error("Edges must be passed as type BioCypherEdge.")
|
|
120
115
|
return False
|
|
@@ -123,9 +118,7 @@ class _RDFWriter(_BatchWriter):
|
|
|
123
118
|
label_pascal = self.translator.name_sentence_to_pascal(label)
|
|
124
119
|
|
|
125
120
|
# create file name
|
|
126
|
-
file_name = os.path.join(
|
|
127
|
-
self.outdir, f"{label_pascal}.{self.extension}"
|
|
128
|
-
)
|
|
121
|
+
file_name = os.path.join(self.outdir, f"{label_pascal}.{self.extension}")
|
|
129
122
|
|
|
130
123
|
# write data in graph
|
|
131
124
|
graph = Graph()
|
|
@@ -136,12 +129,10 @@ class _RDFWriter(_BatchWriter):
|
|
|
136
129
|
rdf_object = edge.get_target_id()
|
|
137
130
|
rdf_predicate = edge.get_id()
|
|
138
131
|
rdf_properties = edge.get_properties()
|
|
139
|
-
if rdf_predicate
|
|
132
|
+
if rdf_predicate is None:
|
|
140
133
|
rdf_predicate = rdf_subject + rdf_object
|
|
141
134
|
|
|
142
|
-
edge_label = self.translator.name_sentence_to_pascal(
|
|
143
|
-
edge.get_label()
|
|
144
|
-
)
|
|
135
|
+
edge_label = self.translator.name_sentence_to_pascal(edge.get_label())
|
|
145
136
|
edge_uri = self.rdf_namespaces["biocypher"][edge_label]
|
|
146
137
|
graph.add((edge_uri, RDF.type, RDFS.Class))
|
|
147
138
|
graph.add(
|
|
@@ -149,21 +140,21 @@ class _RDFWriter(_BatchWriter):
|
|
|
149
140
|
self.rdf_namespaces["biocypher"][rdf_predicate],
|
|
150
141
|
RDF.type,
|
|
151
142
|
edge_uri,
|
|
152
|
-
)
|
|
143
|
+
),
|
|
153
144
|
)
|
|
154
145
|
graph.add(
|
|
155
146
|
(
|
|
156
147
|
self.rdf_namespaces["biocypher"][rdf_predicate],
|
|
157
148
|
self.rdf_namespaces["biocypher"]["subject"],
|
|
158
149
|
self.subject_to_uri(rdf_subject),
|
|
159
|
-
)
|
|
150
|
+
),
|
|
160
151
|
)
|
|
161
152
|
graph.add(
|
|
162
153
|
(
|
|
163
154
|
self.rdf_namespaces["biocypher"][rdf_predicate],
|
|
164
155
|
self.rdf_namespaces["biocypher"]["object"],
|
|
165
156
|
self.subject_to_uri(rdf_object),
|
|
166
|
-
)
|
|
157
|
+
),
|
|
167
158
|
)
|
|
168
159
|
|
|
169
160
|
# add properties to the transformed edge --> node
|
|
@@ -187,13 +178,17 @@ class _RDFWriter(_BatchWriter):
|
|
|
187
178
|
rdf_object: str,
|
|
188
179
|
rdf_predicate: str,
|
|
189
180
|
):
|
|
190
|
-
"""
|
|
191
|
-
|
|
192
|
-
It
|
|
193
|
-
|
|
194
|
-
|
|
181
|
+
"""Add the properties to an RDF node.
|
|
182
|
+
|
|
183
|
+
It takes the graph, the subject, object, and predicate of the RDF
|
|
184
|
+
triple. It checks if the property is a list and adds it to the graph
|
|
185
|
+
accordingly. Otherwise it checks if the string represents a list. If it
|
|
186
|
+
does, it transforms it to a list and adds it to the graph. If not, it
|
|
187
|
+
adds the property to the graph as a literal. If the property is neither
|
|
188
|
+
a list or string, it will also be added as a literal.
|
|
195
189
|
|
|
196
190
|
Args:
|
|
191
|
+
----
|
|
197
192
|
graph (RDFLib.Graph): The RDF graph to add the nodes to.
|
|
198
193
|
|
|
199
194
|
rdf_subject (str): The subject of the RDF triple.
|
|
@@ -203,7 +198,9 @@ class _RDFWriter(_BatchWriter):
|
|
|
203
198
|
rdf_predicate (str): The predicate of the RDF triple.
|
|
204
199
|
|
|
205
200
|
Returns:
|
|
201
|
+
-------
|
|
206
202
|
None
|
|
203
|
+
|
|
207
204
|
"""
|
|
208
205
|
if isinstance(rdf_object, list):
|
|
209
206
|
for obj in rdf_object:
|
|
@@ -212,7 +209,7 @@ class _RDFWriter(_BatchWriter):
|
|
|
212
209
|
self.subject_to_uri(rdf_subject),
|
|
213
210
|
self.property_to_uri(rdf_predicate),
|
|
214
211
|
Literal(obj),
|
|
215
|
-
)
|
|
212
|
+
),
|
|
216
213
|
)
|
|
217
214
|
elif isinstance(rdf_object, str):
|
|
218
215
|
if rdf_object.startswith("[") and rdf_object.endswith("]"):
|
|
@@ -228,7 +225,7 @@ class _RDFWriter(_BatchWriter):
|
|
|
228
225
|
self.subject_to_uri(rdf_subject),
|
|
229
226
|
self.property_to_uri(rdf_predicate),
|
|
230
227
|
Literal(rdf_object),
|
|
231
|
-
)
|
|
228
|
+
),
|
|
232
229
|
)
|
|
233
230
|
else:
|
|
234
231
|
graph.add(
|
|
@@ -236,25 +233,22 @@ class _RDFWriter(_BatchWriter):
|
|
|
236
233
|
self.subject_to_uri(rdf_subject),
|
|
237
234
|
self.property_to_uri(rdf_predicate),
|
|
238
235
|
Literal(rdf_object),
|
|
239
|
-
)
|
|
236
|
+
),
|
|
240
237
|
)
|
|
241
238
|
|
|
242
239
|
def transform_string_to_list(self, string_list: str) -> list:
|
|
243
|
-
"""
|
|
244
|
-
Function to transform a string representation of a list into a list.
|
|
240
|
+
"""Function to transform a string representation of a list into a list.
|
|
245
241
|
|
|
246
242
|
Args:
|
|
243
|
+
----
|
|
247
244
|
string_list (str): The string representation of the list.
|
|
248
245
|
|
|
249
246
|
Returns:
|
|
247
|
+
-------
|
|
250
248
|
list: The list representation of the input string.
|
|
249
|
+
|
|
251
250
|
"""
|
|
252
|
-
return (
|
|
253
|
-
string_list.replace("[", "")
|
|
254
|
-
.replace("]", "")
|
|
255
|
-
.replace("'", "")
|
|
256
|
-
.split(", ")
|
|
257
|
-
)
|
|
251
|
+
return string_list.replace("[", "").replace("]", "").replace("'", "").split(", ")
|
|
258
252
|
|
|
259
253
|
def _write_single_node_list_to_file(
|
|
260
254
|
self,
|
|
@@ -263,11 +257,11 @@ class _RDFWriter(_BatchWriter):
|
|
|
263
257
|
prop_dict: dict,
|
|
264
258
|
labels: str,
|
|
265
259
|
):
|
|
266
|
-
"""
|
|
267
|
-
This function takes a list of BioCypherNodes and writes them
|
|
260
|
+
"""This function takes a list of BioCypherNodes and writes them
|
|
268
261
|
to an RDF file in the specified format.
|
|
269
262
|
|
|
270
263
|
Args:
|
|
264
|
+
----
|
|
271
265
|
node_list (list): A list of BioCypherNodes to be written.
|
|
272
266
|
|
|
273
267
|
label (str): The label (type) of the nodes.
|
|
@@ -275,7 +269,9 @@ class _RDFWriter(_BatchWriter):
|
|
|
275
269
|
prop_dict (dict): A dictionary of properties and their types for the node class.
|
|
276
270
|
|
|
277
271
|
Returns:
|
|
272
|
+
-------
|
|
278
273
|
bool: True if the writing is successful, False otherwise.
|
|
274
|
+
|
|
279
275
|
"""
|
|
280
276
|
if not all(isinstance(n, BioCypherNode) for n in node_list):
|
|
281
277
|
logger.error("Nodes must be passed as type BioCypherNode.")
|
|
@@ -285,9 +281,7 @@ class _RDFWriter(_BatchWriter):
|
|
|
285
281
|
label_pascal = self.translator.name_sentence_to_pascal(label)
|
|
286
282
|
|
|
287
283
|
# create file name
|
|
288
|
-
file_name = os.path.join(
|
|
289
|
-
self.outdir, f"{label_pascal}.{self.extension}"
|
|
290
|
-
)
|
|
284
|
+
file_name = os.path.join(self.outdir, f"{label_pascal}.{self.extension}")
|
|
291
285
|
|
|
292
286
|
# write data in graph
|
|
293
287
|
graph = Graph()
|
|
@@ -303,14 +297,14 @@ class _RDFWriter(_BatchWriter):
|
|
|
303
297
|
self.rdf_namespaces["biocypher"][class_name],
|
|
304
298
|
RDF.type,
|
|
305
299
|
RDFS.Class,
|
|
306
|
-
)
|
|
300
|
+
),
|
|
307
301
|
)
|
|
308
302
|
graph.add(
|
|
309
303
|
(
|
|
310
304
|
self.subject_to_uri(rdf_subject),
|
|
311
305
|
RDF.type,
|
|
312
306
|
self.rdf_namespaces["biocypher"][class_name],
|
|
313
|
-
)
|
|
307
|
+
),
|
|
314
308
|
)
|
|
315
309
|
for key, value in properties.items():
|
|
316
310
|
# only write value if it exists.
|
|
@@ -325,19 +319,19 @@ class _RDFWriter(_BatchWriter):
|
|
|
325
319
|
|
|
326
320
|
return True
|
|
327
321
|
|
|
328
|
-
def write_nodes(
|
|
329
|
-
|
|
330
|
-
) -> bool:
|
|
331
|
-
"""
|
|
332
|
-
Wrapper for writing nodes in RDF format. It calls the _write_node_data() function, specifying the node data.
|
|
322
|
+
def write_nodes(self, nodes, batch_size: int = int(1e6), force: bool = False) -> bool:
|
|
323
|
+
"""Wrapper for writing nodes in RDF format. It calls the _write_node_data() function, specifying the node data.
|
|
333
324
|
|
|
334
325
|
Args:
|
|
326
|
+
----
|
|
335
327
|
nodes (list or generator): A list or generator of nodes in BioCypherNode format.
|
|
336
328
|
batch_size (int): The number of nodes to write in each batch.
|
|
337
329
|
force (bool): Flag to force the writing even if the output file already exists.
|
|
338
330
|
|
|
339
331
|
Returns:
|
|
332
|
+
-------
|
|
340
333
|
bool: True if the writing is successful, False otherwise.
|
|
334
|
+
|
|
341
335
|
"""
|
|
342
336
|
# check if specified output format is correct
|
|
343
337
|
passed = self._is_rdf_format_supported(self.rdf_format)
|
|
@@ -353,20 +347,22 @@ class _RDFWriter(_BatchWriter):
|
|
|
353
347
|
|
|
354
348
|
def write_edges(
|
|
355
349
|
self,
|
|
356
|
-
edges:
|
|
350
|
+
edges: list | GeneratorType,
|
|
357
351
|
batch_size: int = int(1e6),
|
|
358
352
|
) -> bool:
|
|
359
|
-
"""
|
|
360
|
-
Wrapper for writing edges in RDF format. It calls _write_edge_data()
|
|
353
|
+
"""Wrapper for writing edges in RDF format. It calls _write_edge_data()
|
|
361
354
|
functions specifying it's edge data.
|
|
362
355
|
|
|
363
356
|
Args:
|
|
357
|
+
----
|
|
364
358
|
edges (BioCypherEdge): a list or generator of edges in
|
|
365
359
|
:py:class:`BioCypherEdge` format
|
|
366
360
|
batch_size (int): The number of edges to write in each batch.
|
|
367
361
|
|
|
368
362
|
Returns:
|
|
363
|
+
-------
|
|
369
364
|
bool: The return value. True for success, False otherwise.
|
|
365
|
+
|
|
370
366
|
"""
|
|
371
367
|
# check if specified output format is correct
|
|
372
368
|
passed = self._is_rdf_format_supported(self.rdf_format)
|
|
@@ -382,12 +378,13 @@ class _RDFWriter(_BatchWriter):
|
|
|
382
378
|
return True
|
|
383
379
|
|
|
384
380
|
def _construct_import_call(self) -> bool:
|
|
385
|
-
"""
|
|
386
|
-
Function to write the import call.
|
|
381
|
+
"""Function to write the import call.
|
|
387
382
|
This function is not applicable for RDF.
|
|
388
383
|
|
|
389
|
-
Returns
|
|
384
|
+
Returns
|
|
385
|
+
-------
|
|
390
386
|
bool: The return value. True for success, False otherwise.
|
|
387
|
+
|
|
391
388
|
"""
|
|
392
389
|
return ""
|
|
393
390
|
|
|
@@ -399,53 +396,58 @@ class _RDFWriter(_BatchWriter):
|
|
|
399
396
|
return f"{self.quote}{value}{self.quote}"
|
|
400
397
|
|
|
401
398
|
def _write_array_string(self, string_list):
|
|
402
|
-
"""
|
|
403
|
-
Abstract method to write the string representation of an array into a .csv file
|
|
399
|
+
"""Abstract method to write the string representation of an array into a .csv file
|
|
404
400
|
as required by the RDF admin-import.
|
|
405
401
|
This function is not applicable for RDF.
|
|
406
402
|
|
|
407
403
|
Args:
|
|
404
|
+
----
|
|
408
405
|
string_list (list): list of ontology strings
|
|
409
406
|
|
|
410
407
|
Returns:
|
|
408
|
+
-------
|
|
411
409
|
str: The string representation of an array for the neo4j admin import
|
|
412
|
-
"""
|
|
413
410
|
|
|
411
|
+
"""
|
|
414
412
|
return True
|
|
415
413
|
|
|
416
414
|
def _write_node_headers(self):
|
|
417
|
-
"""
|
|
418
|
-
Abstract method that takes care of importing properties of a graph entity that is represented
|
|
415
|
+
"""Abstract method that takes care of importing properties of a graph entity that is represented
|
|
419
416
|
as a node as per the definition in the `schema_config.yaml`
|
|
420
417
|
This function is not applicable for RDF.
|
|
421
418
|
|
|
422
|
-
Returns
|
|
419
|
+
Returns
|
|
420
|
+
-------
|
|
423
421
|
bool: The return value. True for success, False otherwise.
|
|
422
|
+
|
|
424
423
|
"""
|
|
425
424
|
return True
|
|
426
425
|
|
|
427
426
|
def _write_edge_headers(self):
|
|
428
|
-
"""
|
|
429
|
-
Abstract method to write a database import-file for a graph entity that is represented
|
|
427
|
+
"""Abstract method to write a database import-file for a graph entity that is represented
|
|
430
428
|
as an edge as per the definition in the `schema_config.yaml`,
|
|
431
429
|
containing only the header for this type of edge.
|
|
432
430
|
This function is not applicable for RDF.
|
|
433
431
|
|
|
434
|
-
Returns
|
|
432
|
+
Returns
|
|
433
|
+
-------
|
|
435
434
|
bool: The return value. True for success, False otherwise.
|
|
435
|
+
|
|
436
436
|
"""
|
|
437
437
|
return True
|
|
438
438
|
|
|
439
439
|
def subject_to_uri(self, subject: str) -> str:
|
|
440
|
-
"""
|
|
441
|
-
Converts the subject to a proper URI using the available namespaces.
|
|
440
|
+
"""Converts the subject to a proper URI using the available namespaces.
|
|
442
441
|
If the conversion fails, it defaults to the biocypher prefix.
|
|
443
442
|
|
|
444
443
|
Args:
|
|
444
|
+
----
|
|
445
445
|
subject (str): The subject to be converted to a URI.
|
|
446
446
|
|
|
447
447
|
Returns:
|
|
448
|
+
-------
|
|
448
449
|
str: The corresponding URI for the subject.
|
|
450
|
+
|
|
449
451
|
"""
|
|
450
452
|
try:
|
|
451
453
|
_pref, _id = subject.split(":")
|
|
@@ -458,56 +460,66 @@ class _RDFWriter(_BatchWriter):
|
|
|
458
460
|
return self.rdf_namespaces["biocypher"][subject]
|
|
459
461
|
|
|
460
462
|
def property_to_uri(self, property_name: str) -> dict[str, str]:
|
|
461
|
-
"""
|
|
462
|
-
Converts a property name to its corresponding URI.
|
|
463
|
+
"""Converts a property name to its corresponding URI.
|
|
463
464
|
|
|
464
465
|
This function takes a property name and searches for its corresponding URI in various namespaces.
|
|
465
466
|
It first checks the core namespaces for rdflib, including owl, rdf, rdfs, xsd, and xml.
|
|
466
467
|
|
|
467
468
|
Args:
|
|
469
|
+
----
|
|
468
470
|
property_name (str): The property name to be converted to a URI.
|
|
469
471
|
|
|
470
472
|
Returns:
|
|
473
|
+
-------
|
|
471
474
|
str: The corresponding URI for the input property name.
|
|
475
|
+
|
|
472
476
|
"""
|
|
473
477
|
# These namespaces are core for rdflib; owl, rdf, rdfs, xsd and xml
|
|
474
478
|
for namespace in _NAMESPACE_PREFIXES_CORE.values():
|
|
475
479
|
if property_name in namespace:
|
|
476
480
|
return namespace[property_name]
|
|
477
481
|
|
|
478
|
-
# If the property name is not found in the core namespaces, search in
|
|
482
|
+
# If the property name is not found in the core namespaces, search in
|
|
483
|
+
# the SKOS, DC, and DCTERMS namespaces
|
|
479
484
|
for namespace in [SKOS, DC, DCTERMS]:
|
|
480
485
|
if property_name in namespace:
|
|
481
486
|
return namespace[property_name]
|
|
482
487
|
|
|
483
|
-
# If the property name is still not found, try other namespaces from
|
|
488
|
+
# If the property name is still not found, try other namespaces from
|
|
489
|
+
# rdflib.
|
|
484
490
|
for namespace in _NAMESPACE_PREFIXES_RDFLIB.values():
|
|
485
491
|
if property_name in namespace:
|
|
486
492
|
return namespace[property_name]
|
|
487
493
|
|
|
488
|
-
# If the property name is "licence", it recursively calls the function
|
|
494
|
+
# If the property name is "licence", it recursively calls the function
|
|
495
|
+
# with "license" as the input.
|
|
489
496
|
if property_name == "licence":
|
|
490
497
|
return self.property_to_uri("license")
|
|
491
498
|
|
|
492
499
|
# TODO: add an option to search trough manually implemented namespaces
|
|
493
500
|
|
|
494
|
-
# If the input is not found in any of the namespaces, it returns
|
|
501
|
+
# If the input is not found in any of the namespaces, it returns
|
|
502
|
+
# the corresponding URI from the biocypher namespace.
|
|
495
503
|
# TODO: give a warning and try to prevent this option altogether
|
|
496
504
|
return self.rdf_namespaces["biocypher"][property_name]
|
|
497
505
|
|
|
498
506
|
def _init_namespaces(self, graph: Graph):
|
|
499
|
-
"""
|
|
500
|
-
Initializes the namespaces for the RDF graph. These namespaces are used to convert nodes to URIs.
|
|
507
|
+
"""Initialise the namespaces for the RDF graph.
|
|
501
508
|
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
the
|
|
509
|
+
These namespaces are used to convert nodes to URIs. This function adds
|
|
510
|
+
the biocypher standard namespace to the `rdf_namespaces` attribute of
|
|
511
|
+
the class. If `rdf_namespaces` is empty, it sets it to the biocypher
|
|
512
|
+
standard namespace. Otherwise, it merges the biocypher standard
|
|
513
|
+
namespace with the namespaces defined in the biocypher_config.yaml.
|
|
505
514
|
|
|
506
515
|
Args:
|
|
516
|
+
----
|
|
507
517
|
graph (RDFLib.Graph): The RDF graph to bind the namespaces to.
|
|
508
518
|
|
|
509
519
|
Returns:
|
|
520
|
+
-------
|
|
510
521
|
None
|
|
522
|
+
|
|
511
523
|
"""
|
|
512
524
|
# add biocypher standard to self.rdf_namespaces
|
|
513
525
|
biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
from more_itertools import peekable
|
|
2
2
|
|
|
3
3
|
from biocypher._logger import logger
|
|
4
|
+
from biocypher.output.in_memory._pandas import PandasKG
|
|
4
5
|
from biocypher.output.write._writer import _Writer
|
|
5
|
-
from biocypher.output.in_memory._pandas import Pandas
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class _PandasCSVWriter(_Writer):
|
|
9
9
|
"""
|
|
10
|
-
Class for writing node and edge representations to
|
|
10
|
+
Class for writing node and edge representations to CSV files.
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
def __init__(self, *args, write_to_file: bool = True, **kwargs):
|
|
@@ -15,8 +15,7 @@ class _PandasCSVWriter(_Writer):
|
|
|
15
15
|
super().__init__(*args, **kwargs)
|
|
16
16
|
self.in_memory_dfs = {}
|
|
17
17
|
self.stored_dfs = {}
|
|
18
|
-
self.pandas_in_memory =
|
|
19
|
-
translator=self.translator,
|
|
18
|
+
self.pandas_in_memory = PandasKG(
|
|
20
19
|
deduplicator=self.deduplicator,
|
|
21
20
|
)
|
|
22
21
|
self.delimiter = kwargs.get("delimiter")
|
|
@@ -48,7 +47,7 @@ class _PandasCSVWriter(_Writer):
|
|
|
48
47
|
return passed
|
|
49
48
|
|
|
50
49
|
def _write_entities_to_file(self, entities: iter) -> bool:
|
|
51
|
-
"""Function to
|
|
50
|
+
"""Function to write the entities to a CSV file.
|
|
52
51
|
|
|
53
52
|
Args:
|
|
54
53
|
entities (iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
|
|
@@ -56,17 +55,13 @@ class _PandasCSVWriter(_Writer):
|
|
|
56
55
|
entities = peekable(entities)
|
|
57
56
|
entity_list = self.pandas_in_memory._separate_entity_types(entities)
|
|
58
57
|
for entity_type, entities in entity_list.items():
|
|
59
|
-
self.in_memory_dfs[
|
|
60
|
-
entity_type
|
|
61
|
-
] = self.pandas_in_memory._add_entity_df(entity_type, entities)
|
|
58
|
+
self.in_memory_dfs[entity_type] = self.pandas_in_memory._add_entity_df(entity_type, entities)
|
|
62
59
|
for entity_type in self.in_memory_dfs.keys():
|
|
63
60
|
entity_df = self.in_memory_dfs[entity_type]
|
|
64
61
|
if " " in entity_type or "." in entity_type:
|
|
65
62
|
entity_type = entity_type.replace(" ", "_").replace(".", "_")
|
|
66
63
|
if self.write_to_file:
|
|
67
|
-
logger.info(
|
|
68
|
-
f"Writing {entity_df.shape[0]} entries to {entity_type}.csv."
|
|
69
|
-
)
|
|
64
|
+
logger.info(f"Writing {entity_df.shape[0]} entries to {entity_type}.csv.")
|
|
70
65
|
entity_df.to_csv(
|
|
71
66
|
f"{self.output_directory}/{entity_type}.csv",
|
|
72
67
|
sep=self.delimiter,
|