biocypher 0.5.41__py3-none-any.whl → 0.5.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biocypher might be problematic. Click here for more details.
- biocypher/_config/biocypher_config.yaml +15 -0
- biocypher/_core.py +3 -3
- biocypher/_metadata.py +1 -1
- biocypher/_misc.py +6 -1
- biocypher/_ontology.py +133 -53
- biocypher/{_connect.py → output/connect/_neo4j_driver.py} +5 -5
- biocypher/{_pandas.py → output/in_memory/_pandas.py} +2 -1
- biocypher/output/write/__init__.py +0 -0
- biocypher/{write → output/write}/_batch_writer.py +26 -22
- biocypher/{write/_write.py → output/write/_get_writer.py} +19 -11
- biocypher/output/write/_writer.py +200 -0
- biocypher/output/write/graph/__init__.py +0 -0
- biocypher/{write → output/write}/graph/_arangodb.py +1 -1
- biocypher/{write → output/write}/graph/_neo4j.py +2 -4
- biocypher/output/write/graph/_networkx.py +76 -0
- biocypher/output/write/graph/_rdf.py +515 -0
- biocypher/output/write/relational/__init__.py +0 -0
- biocypher/output/write/relational/_csv.py +76 -0
- biocypher/{write → output/write}/relational/_postgresql.py +2 -2
- biocypher/{write → output/write}/relational/_sqlite.py +1 -1
- {biocypher-0.5.41.dist-info → biocypher-0.5.43.dist-info}/METADATA +1 -1
- biocypher-0.5.43.dist-info/RECORD +39 -0
- biocypher-0.5.41.dist-info/RECORD +0 -32
- /biocypher/{write → output}/__init__.py +0 -0
- /biocypher/{write/graph → output/connect}/__init__.py +0 -0
- /biocypher/{write/relational → output/in_memory}/__init__.py +0 -0
- {biocypher-0.5.41.dist-info → biocypher-0.5.43.dist-info}/LICENSE +0 -0
- {biocypher-0.5.41.dist-info → biocypher-0.5.43.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,515 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
#
|
|
4
|
+
# Copyright 2021, Heidelberg University Clinic
|
|
5
|
+
#
|
|
6
|
+
# File author(s): Loes van den Biggelaar
|
|
7
|
+
# Sebastian Lobentanzer
|
|
8
|
+
#
|
|
9
|
+
# Distributed under MIT licence, see the file `LICENSE`.
|
|
10
|
+
#
|
|
11
|
+
"""
|
|
12
|
+
BioCypher 'offline' module. Handles the writing of node and edge representations
|
|
13
|
+
suitable for import into a DBMS.
|
|
14
|
+
"""
|
|
15
|
+
from types import GeneratorType
|
|
16
|
+
from typing import Union
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
from rdflib import DC, RDF, RDFS, SKOS, DCTERMS, Graph, Literal, Namespace
|
|
20
|
+
from rdflib.namespace import (
|
|
21
|
+
_NAMESPACE_PREFIXES_CORE,
|
|
22
|
+
_NAMESPACE_PREFIXES_RDFLIB,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
from biocypher._create import BioCypherEdge, BioCypherNode
|
|
26
|
+
from biocypher._logger import logger
|
|
27
|
+
from biocypher.output.write._batch_writer import _BatchWriter
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class _RDFWriter(_BatchWriter):
|
|
31
|
+
"""
|
|
32
|
+
Class to write BioCypher's property graph into an RDF format using
|
|
33
|
+
rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
|
|
34
|
+
N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
|
|
35
|
+
is done keeping only the minimum information about node and edges,
|
|
36
|
+
skipping all properties.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
def _get_import_script_name(self) -> str:
|
|
40
|
+
"""
|
|
41
|
+
Returns the name of the RDF admin import script.
|
|
42
|
+
This function applicable for RDF export.
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
str: The name of the import script (ending in .sh)
|
|
46
|
+
"""
|
|
47
|
+
return "rdf-import-call.sh"
|
|
48
|
+
|
|
49
|
+
def _get_default_import_call_bin_prefix(self):
|
|
50
|
+
"""
|
|
51
|
+
Method to provide the default string for the import call bin prefix.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
str: The default location for the RDF admin import location
|
|
55
|
+
"""
|
|
56
|
+
return "bin/"
|
|
57
|
+
|
|
58
|
+
def _is_rdf_format_supported(self, rdf_format: str) -> bool:
|
|
59
|
+
"""
|
|
60
|
+
Function to check if the specified RDF format is supported.
|
|
61
|
+
|
|
62
|
+
Args:
|
|
63
|
+
rdf_format (str): The RDF format to check.
|
|
64
|
+
|
|
65
|
+
Returns:
|
|
66
|
+
bool: Returns True if rdf format supported, False otherwise.
|
|
67
|
+
"""
|
|
68
|
+
supported_formats = [
|
|
69
|
+
"xml",
|
|
70
|
+
"n3",
|
|
71
|
+
"turtle",
|
|
72
|
+
"nt",
|
|
73
|
+
"pretty-xml",
|
|
74
|
+
"trix",
|
|
75
|
+
"trig",
|
|
76
|
+
"nquads",
|
|
77
|
+
"json-ld",
|
|
78
|
+
]
|
|
79
|
+
if rdf_format not in supported_formats:
|
|
80
|
+
logger.error(
|
|
81
|
+
f"{rdf_format}; Incorrect or unsupported RDF format, use one of the following: "
|
|
82
|
+
f'"xml", "n3", "turtle", "nt", "pretty-xml", "trix", "trig", "nquads", "json-ld" ',
|
|
83
|
+
)
|
|
84
|
+
return False
|
|
85
|
+
else:
|
|
86
|
+
# RDF graph does not support 'ttl' format, only 'turtle' format. however, the preferred file extension is always '.ttl'
|
|
87
|
+
if self.rdf_format == "turtle":
|
|
88
|
+
self.extension = "ttl"
|
|
89
|
+
elif self.rdf_format == "ttl":
|
|
90
|
+
self.rdf_format = "turtle"
|
|
91
|
+
self.extension = "ttl"
|
|
92
|
+
else:
|
|
93
|
+
self.extension = self.rdf_format
|
|
94
|
+
return True
|
|
95
|
+
|
|
96
|
+
def _write_single_edge_list_to_file(
    self,
    edge_list: list,
    label: str,
    prop_dict: dict,
):
    """
    Write one list of BioCypher edges to an RDF file in the configured
    format. Each edge is written as its own resource that points to its
    source ("subject") and target ("object") and carries the edge
    properties.

    Args:
        edge_list (list): list of BioCypherEdges to be written

        label (str): the label (type) of the edge

        prop_dict (dict): properties of node class passed from parsing
            function and their types (not used by this method)

    Returns:
        bool: The return value. True for success, False otherwise.
    """

    if not all(isinstance(n, BioCypherEdge) for n in edge_list):
        logger.error("Edges must be passed as type BioCypherEdge.")
        return False

    # translate label to PascalCase
    label_pascal = self.translator.name_sentence_to_pascal(label)

    # create file name
    file_name = os.path.join(
        self.outdir, f"{label_pascal}.{self.extension}"
    )

    # write data in graph
    graph = Graph()
    self._init_namespaces(graph)
    biocypher_ns = self.rdf_namespaces["biocypher"]

    for edge in edge_list:
        rdf_subject = edge.get_source_id()
        rdf_object = edge.get_target_id()
        rdf_predicate = edge.get_id()
        rdf_properties = edge.get_properties()
        # edges without their own id are identified by source + target
        if rdf_predicate is None:
            rdf_predicate = rdf_subject + rdf_object

        edge_label = self.translator.name_sentence_to_pascal(
            edge.get_label()
        )
        edge_uri = biocypher_ns[edge_label]
        edge_resource = biocypher_ns[rdf_predicate]

        graph.add((edge_uri, RDF.type, RDFS.Class))
        graph.add((edge_resource, RDF.type, edge_uri))
        graph.add(
            (
                edge_resource,
                biocypher_ns["subject"],
                self.subject_to_uri(rdf_subject),
            )
        )
        graph.add(
            (
                edge_resource,
                biocypher_ns["object"],
                self.subject_to_uri(rdf_object),
            )
        )

        # add properties to the transformed edge --> node
        for key, value in rdf_properties.items():
            # only write value if it exists (falsy values are skipped).
            if value:
                self.add_property_to_graph(graph, rdf_predicate, value, key)

    graph.serialize(destination=file_name, format=self.rdf_format)

    # report the name of the file that was actually written
    logger.info(
        f"Writing {len(edge_list)} entries to {label_pascal}.{self.extension}",
    )

    return True
|
|
182
|
+
|
|
183
|
+
def add_property_to_graph(
    self,
    graph: Graph,
    rdf_subject: str,
    rdf_object: str,
    rdf_predicate: str,
):
    """
    Add a property of an RDF node to the graph as one or more literals.

    A list value yields one triple per element. A string that looks like a
    list (starts with "[" and ends with "]") is first converted with
    ``transform_string_to_list`` and then handled like a list. Any other
    value (string or not) is added as a single literal.

    Args:
        graph (RDFLib.Graph): The RDF graph to add the nodes to.

        rdf_subject (str): The subject of the RDF triple.

        rdf_object (str): The object of the RDF triple.

        rdf_predicate (str): The predicate of the RDF triple.

    Returns:
        None
    """
    looks_like_list = (
        isinstance(rdf_object, str)
        and rdf_object.startswith("[")
        and rdf_object.endswith("]")
    )
    if looks_like_list:
        rdf_object = self.transform_string_to_list(rdf_object)

    # normalise to a list of values so a single code path adds the triples
    values = rdf_object if isinstance(rdf_object, list) else [rdf_object]
    for single_value in values:
        triple = (
            self.subject_to_uri(rdf_subject),
            self.property_to_uri(rdf_predicate),
            Literal(single_value),
        )
        graph.add(triple)
|
|
241
|
+
|
|
242
|
+
def transform_string_to_list(self, string_list: str) -> list:
    """
    Transform a string representation of a list into a list of strings.

    Square brackets and single quotes are removed and the remainder is
    split on ", ". An empty list representation (e.g. "[]") gives an empty
    list instead of a list holding one empty string.

    Args:
        string_list (str): The string representation of the list.

    Returns:
        list: The list representation of the input string.
    """
    stripped = (
        string_list.replace("[", "").replace("]", "").replace("'", "")
    )
    # "".split(", ") would return [""], i.e. one bogus empty element
    if not stripped.strip():
        return []
    return stripped.split(", ")
|
|
258
|
+
|
|
259
|
+
def _write_single_node_list_to_file(
    self,
    node_list: list,
    label: str,
    prop_dict: dict,
    labels: str,
):
    """
    Write a list of BioCypherNodes to an RDF file in the configured format.
    Each node is typed with a class derived from its label and its
    properties are added as literals.

    Args:
        node_list (list): A list of BioCypherNodes to be written.

        label (str): The label (type) of the nodes.

        prop_dict (dict): A dictionary of properties and their types for
            the node class (not used by this method).

        labels (str): Not used by this method.

    Returns:
        bool: True if the writing is successful, False otherwise.
    """
    if not all(isinstance(n, BioCypherNode) for n in node_list):
        logger.error("Nodes must be passed as type BioCypherNode.")
        return False

    # translate label to PascalCase
    label_pascal = self.translator.name_sentence_to_pascal(label)

    # create file name
    file_name = os.path.join(
        self.outdir, f"{label_pascal}.{self.extension}"
    )

    # write data in graph
    graph = Graph()
    self._init_namespaces(graph)

    for n in node_list:
        rdf_subject = n.get_id()
        rdf_object = n.get_label()
        properties = n.get_properties()
        class_name = self.translator.name_sentence_to_pascal(rdf_object)
        class_uri = self.rdf_namespaces["biocypher"][class_name]

        graph.add((class_uri, RDF.type, RDFS.Class))
        graph.add((self.subject_to_uri(rdf_subject), RDF.type, class_uri))

        for key, value in properties.items():
            # only write value if it exists (falsy values are skipped).
            if value:
                self.add_property_to_graph(graph, rdf_subject, value, key)

    graph.serialize(destination=file_name, format=self.rdf_format)

    # report the name of the file that was actually written
    logger.info(
        f"Writing {len(node_list)} entries to {label_pascal}.{self.extension}",
    )

    return True
|
|
327
|
+
|
|
328
|
+
def write_nodes(
    self, nodes, batch_size: int = int(1e6), force: bool = False
) -> bool:
    """
    Wrapper for writing nodes in RDF format. Validates the configured RDF
    format and then delegates to ``_write_node_data()``.

    Args:
        nodes (list or generator): A list or generator of nodes in BioCypherNode format.
        batch_size (int): The number of nodes to write in each batch.
        force (bool): Passed through to ``_write_node_data()``.

    Returns:
        bool: True if the writing is successful, False otherwise.
    """
    # refuse to write anything when the output format is not supported
    if not self._is_rdf_format_supported(self.rdf_format):
        logger.error("Error while writing node data, wrong RDF format")
        return False

    if self._write_node_data(nodes, batch_size, force):
        return True

    logger.error("Error while writing node data.")
    return False
|
|
353
|
+
|
|
354
|
+
def write_edges(
    self,
    edges: Union[list, GeneratorType],
    batch_size: int = int(1e6),
) -> bool:
    """
    Wrapper for writing edges in RDF format. Validates the configured RDF
    format and then delegates to ``_write_edge_data()``.

    Args:
        edges (BioCypherEdge): a list or generator of edges in
            :py:class:`BioCypherEdge` format
        batch_size (int): The number of edges to write in each batch.

    Returns:
        bool: The return value. True for success, False otherwise.
    """
    # refuse to write anything when the output format is not supported
    if not self._is_rdf_format_supported(self.rdf_format):
        logger.error("Error while writing edge data, wrong RDF format")
        return False

    if self._write_edge_data(edges, batch_size=batch_size):
        return True

    logger.error("Error while writing edge data.")
    return False
|
|
383
|
+
|
|
384
|
+
def _construct_import_call(self) -> bool:
|
|
385
|
+
"""
|
|
386
|
+
Function to write the import call.
|
|
387
|
+
This function is not applicable for RDF.
|
|
388
|
+
|
|
389
|
+
Returns:
|
|
390
|
+
bool: The return value. True for success, False otherwise.
|
|
391
|
+
"""
|
|
392
|
+
return ""
|
|
393
|
+
|
|
394
|
+
def _write_array_string(self, string_list):
|
|
395
|
+
"""
|
|
396
|
+
Abstract method to write the string representation of an array into a .csv file
|
|
397
|
+
as required by the RDF admin-import.
|
|
398
|
+
This function is not applicable for RDF.
|
|
399
|
+
|
|
400
|
+
Args:
|
|
401
|
+
string_list (list): list of ontology strings
|
|
402
|
+
|
|
403
|
+
Returns:
|
|
404
|
+
str: The string representation of an array for the neo4j admin import
|
|
405
|
+
"""
|
|
406
|
+
|
|
407
|
+
return True
|
|
408
|
+
|
|
409
|
+
def _write_node_headers(self):
|
|
410
|
+
"""
|
|
411
|
+
Abstract method that takes care of importing properties of a graph entity that is represented
|
|
412
|
+
as a node as per the definition in the `schema_config.yaml`
|
|
413
|
+
This function is not applicable for RDF.
|
|
414
|
+
|
|
415
|
+
Returns:
|
|
416
|
+
bool: The return value. True for success, False otherwise.
|
|
417
|
+
"""
|
|
418
|
+
return True
|
|
419
|
+
|
|
420
|
+
def _write_edge_headers(self):
|
|
421
|
+
"""
|
|
422
|
+
Abstract method to write a database import-file for a graph entity that is represented
|
|
423
|
+
as an edge as per the definition in the `schema_config.yaml`,
|
|
424
|
+
containing only the header for this type of edge.
|
|
425
|
+
This function is not applicable for RDF.
|
|
426
|
+
|
|
427
|
+
Returns:
|
|
428
|
+
bool: The return value. True for success, False otherwise.
|
|
429
|
+
"""
|
|
430
|
+
return True
|
|
431
|
+
|
|
432
|
+
def subject_to_uri(self, subject: str) -> str:
    """
    Convert a subject identifier to a URI using the available namespaces.

    A subject of the form "prefix:id" whose prefix is a key of
    ``self.rdf_namespaces`` is resolved in that namespace. Everything else
    (unknown prefix, no colon, or more than one colon) is resolved in the
    biocypher namespace using the full subject string.

    Args:
        subject (str): The subject to be converted to a URI.

    Returns:
        str: The corresponding URI for the subject.
    """
    try:
        # unpacking raises ValueError unless there is exactly one ":"
        prefix, local_id = subject.split(":")
        if prefix in self.rdf_namespaces.keys():
            return self.rdf_namespaces[prefix][local_id]
    except ValueError:
        pass
    return self.rdf_namespaces["biocypher"][subject]
|
|
452
|
+
|
|
453
|
+
def property_to_uri(self, property_name: str) -> str:
    """
    Convert a property name to its corresponding URI.

    The property name is looked up, in this order, in rdflib's core
    namespaces (per the original comment: owl, rdf, rdfs, xsd and xml), in
    SKOS, DC and DCTERMS, and then in the remaining namespaces shipped
    with rdflib. The first namespace that contains the name wins. The
    spelling "licence" is retried as "license". If nothing matches, the
    URI is built in the biocypher namespace.

    Args:
        property_name (str): The property name to be converted to a URI.

    Returns:
        str: The corresponding URI for the input property name.
    """
    search_order = [
        *_NAMESPACE_PREFIXES_CORE.values(),
        SKOS,
        DC,
        DCTERMS,
        *_NAMESPACE_PREFIXES_RDFLIB.values(),
    ]
    for namespace in search_order:
        if property_name in namespace:
            return namespace[property_name]

    # British spelling is not part of the searched vocabularies; retry
    # with the American one.
    if property_name == "licence":
        return self.property_to_uri("license")

    # TODO: add an option to search trough manually implemented namespaces

    # Fall back to the biocypher namespace.
    # TODO: give a warning and try to prevent this option altogether
    return self.rdf_namespaces["biocypher"][property_name]
|
|
490
|
+
|
|
491
|
+
def _init_namespaces(self, graph: Graph):
    """
    Initialise the namespaces that are used to convert nodes to URIs.

    The biocypher standard namespace is merged into the namespaces already
    present in ``self.rdf_namespaces`` (taking precedence over an existing
    "biocypher" entry). Every entry is then converted to a ``Namespace``
    object, stored back in ``self.rdf_namespaces`` and bound to the graph
    under its prefix.

    Args:
        graph (RDFLib.Graph): The RDF graph to bind the namespaces to.

    Returns:
        None
    """
    biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}
    configured = self.rdf_namespaces if self.rdf_namespaces else {}
    merged = {**configured, **biocypher_standard}

    self.rdf_namespaces = merged
    for prefix in list(merged):
        namespace = Namespace(merged[prefix])
        merged[prefix] = namespace
        graph.bind(prefix, namespace)
|
|
File without changes
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from more_itertools import peekable
|
|
2
|
+
|
|
3
|
+
from biocypher._logger import logger
|
|
4
|
+
from biocypher.output.write._writer import _Writer
|
|
5
|
+
from biocypher.output.in_memory._pandas import Pandas
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class _PandasCSVWriter(_Writer):
|
|
9
|
+
"""
|
|
10
|
+
Class for writing node and edge representations to a CSV file.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
def __init__(self, *args, write_to_file: bool = True, **kwargs):
    """Set up the CSV writer.

    Args:
        write_to_file (bool): Whether the data frames are written to CSV
            files; also forwarded to the parent initialiser.
        *args, **kwargs: Passed on to the parent initialiser. A
            "delimiter" keyword, if given and non-empty, is used as the
            CSV separator; otherwise "," is used.
    """
    # the parent initialiser receives the flag as a keyword argument too
    kwargs["write_to_file"] = write_to_file
    super().__init__(*args, **kwargs)

    self.write_to_file = write_to_file
    self.delimiter = kwargs.get("delimiter") or ","
    self.in_memory_dfs = {}
    self.stored_dfs = {}
    self.pandas_in_memory = Pandas(
        translator=self.translator,
        deduplicator=self.deduplicator,
    )
|
|
26
|
+
|
|
27
|
+
def _construct_import_call(self) -> str:
|
|
28
|
+
"""Function to construct the Python code to load all node and edge csv files again into Pandas dfs.
|
|
29
|
+
|
|
30
|
+
Returns:
|
|
31
|
+
str: Python code to load the csv files into Pandas dfs.
|
|
32
|
+
"""
|
|
33
|
+
import_call = "import pandas as pd\n\n"
|
|
34
|
+
for df_name in self.stored_dfs.keys():
|
|
35
|
+
import_call += f"{df_name} = pd.read_csv('./{df_name}.csv', header=0, index_col=0)\n"
|
|
36
|
+
return import_call
|
|
37
|
+
|
|
38
|
+
def _get_import_script_name(self) -> str:
|
|
39
|
+
"""Function to return the name of the import script."""
|
|
40
|
+
return "import_pandas_csv.py"
|
|
41
|
+
|
|
42
|
+
def _write_node_data(self, nodes) -> bool:
|
|
43
|
+
passed = self._write_entities_to_file(nodes)
|
|
44
|
+
return passed
|
|
45
|
+
|
|
46
|
+
def _write_edge_data(self, edges) -> bool:
|
|
47
|
+
passed = self._write_entities_to_file(edges)
|
|
48
|
+
return passed
|
|
49
|
+
|
|
50
|
+
def _write_entities_to_file(self, entities: iter) -> bool:
    """Function to write the entities to a CSV file.

    The entities are grouped by type, one data frame is built per type,
    and (when ``self.write_to_file`` is set) each data frame is written to
    ``<output_directory>/<entity_type>.csv``. Spaces and dots in the
    entity type are replaced by underscores for the file name. Every data
    frame is also kept in ``self.stored_dfs`` under that sanitised name.

    Args:
        entities (iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.

    Returns:
        bool: Always True.
    """
    grouped = self.pandas_in_memory._separate_entity_types(
        peekable(entities)
    )
    for entity_type, members in grouped.items():
        self.in_memory_dfs[
            entity_type
        ] = self.pandas_in_memory._add_entity_df(entity_type, members)

    for entity_type, entity_df in self.in_memory_dfs.items():
        # replacing is a no-op when the name has no spaces or dots
        safe_name = entity_type.replace(" ", "_").replace(".", "_")
        if self.write_to_file:
            logger.info(
                f"Writing {entity_df.shape[0]} entries to {safe_name}.csv."
            )
            entity_df.to_csv(
                f"{self.output_directory}/{safe_name}.csv",
                sep=self.delimiter,
            )
        self.stored_dfs[safe_name] = entity_df

    # the buffered frames have been handed over to stored_dfs
    self.in_memory_dfs = {}
    return True
|
|
@@ -2,7 +2,7 @@ import os
|
|
|
2
2
|
import glob
|
|
3
3
|
|
|
4
4
|
from biocypher._logger import logger
|
|
5
|
-
from biocypher.write._batch_writer import _BatchWriter
|
|
5
|
+
from biocypher.output.write._batch_writer import _BatchWriter
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class _PostgreSQLBatchWriter(_BatchWriter):
|
|
@@ -59,7 +59,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
|
|
|
59
59
|
|
|
60
60
|
def _write_array_string(self, string_list) -> str:
|
|
61
61
|
"""
|
|
62
|
-
Abstract method to write the string representation of an array into a .csv file
|
|
62
|
+
Abstract method to output.write the string representation of an array into a .csv file
|
|
63
63
|
as required by the postgresql COPY command, with '{','}' brackets and ',' separation.
|
|
64
64
|
|
|
65
65
|
Args:
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
biocypher/__init__.py,sha256=ejNY53vH_pE3ZbIN8G_ZBYxxPG9aERovRLD0XhDvt4k,942
|
|
2
|
+
biocypher/_config/__init__.py,sha256=fFHRFYxE2MtDAQWL6upe--MJ1vw3Z8CwIPhF2gW8cRU,3698
|
|
3
|
+
biocypher/_config/biocypher_config.yaml,sha256=pusj0IjJM3uWRcm0N7U7mb1IX257HCV2reZV3YKFCk0,3037
|
|
4
|
+
biocypher/_config/test_config.yaml,sha256=Np8jeS5_EP6HHOvMKb7B_Tkyqd5YaYlYz_DVsXypt-A,119
|
|
5
|
+
biocypher/_config/test_schema_config.yaml,sha256=D1600WgEj3iTXrumVU9LIivJHJO36iaxfkOgyam9zVU,3129
|
|
6
|
+
biocypher/_config/test_schema_config_disconnected.yaml,sha256=Qm8FLxEn2spHcyj_5F859KjcDvKSxNhxDvi4b4LLkvQ,68
|
|
7
|
+
biocypher/_config/test_schema_config_extended.yaml,sha256=wn3A76142hhjnImhMF6RODbCFESTJ2TtPvcFdIFsAT0,3309
|
|
8
|
+
biocypher/_core.py,sha256=m4o4Szv2xY2gl3PnNAA9m7Gg5Sgd8iR9THv3RDyZlQ8,22618
|
|
9
|
+
biocypher/_create.py,sha256=vpUchUdEpWupZi1LgFLxAWMtqoBwnWbP7PwEDUCBS4A,10202
|
|
10
|
+
biocypher/_deduplicate.py,sha256=BBvfpXzu6L5YDY5FdtXxnf8YlsbJpbCE8RdUoKsm0n0,4949
|
|
11
|
+
biocypher/_get.py,sha256=3Kpky3blfNf1JwxKWLsZxTU2aTP_C4sUe8OpiyYj63I,10810
|
|
12
|
+
biocypher/_logger.py,sha256=NGXe3hZA79WSujfOgpcxHBf8N2QAfrmvM1LFDpsGK2U,3185
|
|
13
|
+
biocypher/_mapping.py,sha256=ERSNH2Bg19145KytxbFE4BInPaiP-LWW7osOBot29Eo,9304
|
|
14
|
+
biocypher/_metadata.py,sha256=BDzCUMFoSAQx4kqvmfcYYR0xHnLv17AoCr1sKJMS7XE,1658
|
|
15
|
+
biocypher/_misc.py,sha256=18EG2Bei3RnyWXDWc3qtZaT3gybvXI8opi0HvSaF7Lg,6066
|
|
16
|
+
biocypher/_ontology.py,sha256=G5k-bnzvPZUqhLPxtoOPFa4OSQ4JpufgozVakLTjwLg,31789
|
|
17
|
+
biocypher/_translate.py,sha256=JafvhtVaFSpruRfYh9BzjVbvDF1Mhg7LLKMDZHWkRjg,16496
|
|
18
|
+
biocypher/output/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
+
biocypher/output/connect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
|
+
biocypher/output/connect/_neo4j_driver.py,sha256=jzF5sDhs_WnYEfXiSjQ1P3wNgoadl4Cg80EUYYOk0Ro,13497
|
|
21
|
+
biocypher/output/in_memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
|
+
biocypher/output/in_memory/_pandas.py,sha256=lsYQKjfxUy0O-ae4-YpsCJX-l85bxyc60WOj8gKfMfU,3080
|
|
23
|
+
biocypher/output/write/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
+
biocypher/output/write/_batch_writer.py,sha256=3pdS8ZLN4sBwATXaXFaSrfPQmejjFjo0avHkPQavFSU,36959
|
|
25
|
+
biocypher/output/write/_get_writer.py,sha256=AeQcHQTrz68ZvtxsZl4W0ymc8cOxe3Qfq5PJRY7kq_I,3736
|
|
26
|
+
biocypher/output/write/_writer.py,sha256=v4-c8yME1UCJeqy8Lfmv7KtY7_B4QkWgADt5xkFNJFQ,7453
|
|
27
|
+
biocypher/output/write/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
|
+
biocypher/output/write/graph/_arangodb.py,sha256=aUa_CNZyunFaPrJHc9RtVHRo0Fca9xJ-ZmRz4PxPO8c,8078
|
|
29
|
+
biocypher/output/write/graph/_neo4j.py,sha256=MkXW2wkyxR110R6RncpEcj-ztxK66jpuoaF_Q1iPTDY,11916
|
|
30
|
+
biocypher/output/write/graph/_networkx.py,sha256=EW2we3FlqQ8KfLv4l_2wE27KBUlhXJyD5ORvowSjlaA,2545
|
|
31
|
+
biocypher/output/write/graph/_rdf.py,sha256=BtunVo0iaCVM9I2tWOYwGpB9itbngHBjP0RhwgcJUiM,17977
|
|
32
|
+
biocypher/output/write/relational/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
|
+
biocypher/output/write/relational/_csv.py,sha256=eyAtmwfCNYnuVbkpd0rUoo9KgG2KPgopZVA3X97tRLU,2919
|
|
34
|
+
biocypher/output/write/relational/_postgresql.py,sha256=6sABZaELzmV7a2aUy2iRksf28WFsc3EA9mdQ2mShPeM,11959
|
|
35
|
+
biocypher/output/write/relational/_sqlite.py,sha256=ozElhca1YCYq8R-VFh-LDsnPBaXVJm2cvEboBK2LVVY,2073
|
|
36
|
+
biocypher-0.5.43.dist-info/LICENSE,sha256=SjUaQkq671iQUZOxEUpC4jvJxXOlfSiHTTueyz9kXJM,1065
|
|
37
|
+
biocypher-0.5.43.dist-info/METADATA,sha256=GteqAnDfyteLf4OrR8JFXRF3rfl_lmrFiZTALsdmfJs,10642
|
|
38
|
+
biocypher-0.5.43.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
|
|
39
|
+
biocypher-0.5.43.dist-info/RECORD,,
|