biocypher 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in the public registry.
Potentially problematic release: this version of biocypher might be problematic.
- biocypher/__init__.py +3 -13
- biocypher/_config/__init__.py +6 -23
- biocypher/_core.py +360 -262
- biocypher/_create.py +13 -27
- biocypher/_deduplicate.py +4 -11
- biocypher/_get.py +21 -60
- biocypher/_logger.py +4 -16
- biocypher/_mapping.py +4 -17
- biocypher/_metadata.py +3 -15
- biocypher/_misc.py +14 -28
- biocypher/_ontology.py +127 -212
- biocypher/_translate.py +34 -58
- biocypher/output/connect/_get_connector.py +40 -0
- biocypher/output/connect/_neo4j_driver.py +9 -65
- biocypher/output/in_memory/_get_in_memory_kg.py +34 -0
- biocypher/output/in_memory/_in_memory_kg.py +40 -0
- biocypher/output/in_memory/_networkx.py +44 -0
- biocypher/output/in_memory/_pandas.py +20 -15
- biocypher/output/write/_batch_writer.py +137 -172
- biocypher/output/write/_get_writer.py +11 -24
- biocypher/output/write/_writer.py +14 -33
- biocypher/output/write/graph/_arangodb.py +7 -24
- biocypher/output/write/graph/_neo4j.py +59 -57
- biocypher/output/write/graph/_networkx.py +36 -43
- biocypher/output/write/graph/_rdf.py +114 -95
- biocypher/output/write/relational/_csv.py +6 -11
- biocypher/output/write/relational/_postgresql.py +12 -13
- biocypher/output/write/relational/_sqlite.py +3 -1
- {biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/LICENSE +1 -1
- {biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/METADATA +3 -3
- biocypher-0.7.0.dist-info/RECORD +43 -0
- {biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/WHEEL +1 -1
- biocypher-0.6.1.dist-info/RECORD +0 -39
biocypher/output/write/graph/_rdf.py

@@ -1,22 +1,12 @@
-
-
-#
-# Copyright 2021, Heidelberg University Clinic
-#
-# File author(s): Loes van den Biggelaar
-#                 Sebastian Lobentanzer
-#
-# Distributed under MIT licence, see the file `LICENSE`.
-#
-"""
-BioCypher 'offline' module. Handles the writing of node and edge representations
+"""BioCypher 'offline' module. Handles the writing of node and edge representations
 suitable for import into a DBMS.
 """
-
-from typing import Union
+
 import os
 
-from
+from types import GeneratorType
+
+from rdflib import DC, DCTERMS, RDF, RDFS, SKOS, Graph, Literal, Namespace
 from rdflib.namespace import (
     _NAMESPACE_PREFIXES_CORE,
     _NAMESPACE_PREFIXES_RDFLIB,
@@ -28,8 +18,7 @@ from biocypher.output.write._batch_writer import _BatchWriter
 
 
 class _RDFWriter(_BatchWriter):
-    """
-    Class to write BioCypher's property graph into an RDF format using
+    """Class to write BioCypher's property graph into an RDF format using
     rdflib and all the extensions it supports (RDF/XML, N3, NTriples,
     N-Quads, Turtle, TriX, Trig and JSON-LD). By default the conversion
     is done keeping only the minimum information about node and edges,
@@ -37,33 +26,37 @@ class _RDFWriter(_BatchWriter):
     """
 
     def _get_import_script_name(self) -> str:
-        """
-        Returns the name of the RDF admin import script.
+        """Returns the name of the RDF admin import script.
         This function applicable for RDF export.
 
-        Returns
+        Returns
+        -------
            str: The name of the import script (ending in .sh)
+
        """
        return "rdf-import-call.sh"

    def _get_default_import_call_bin_prefix(self):
-        """
-        Method to provide the default string for the import call bin prefix.
+        """Method to provide the default string for the import call bin prefix.
 
-        Returns
+        Returns
+        -------
            str: The default location for the RDF admin import location
+
        """
        return "bin/"

    def _is_rdf_format_supported(self, rdf_format: str) -> bool:
-        """
-        Function to check if the specified RDF format is supported.
+        """Function to check if the specified RDF format is supported.
 
        Args:
+        ----
            rdf_format (str): The RDF format to check.

        Returns:
+        -------
            bool: Returns True if rdf format supported, False otherwise.
+
        """
        supported_formats = [
            "xml",
@@ -83,7 +76,8 @@ class _RDFWriter(_BatchWriter):
            )
            return False
        else:
-            # RDF graph does not support 'ttl' format, only 'turtle' format.
+            # RDF graph does not support 'ttl' format, only 'turtle' format.
+            # however, the preferred file extension is always '.ttl'
            if self.rdf_format == "turtle":
                self.extension = "ttl"
            elif self.rdf_format == "ttl":
@@ -99,11 +93,11 @@ class _RDFWriter(_BatchWriter):
        label: str,
        prop_dict: dict,
    ):
-        """
-        This function takes one list of biocypher edges and writes them
+        """This function takes one list of biocypher edges and writes them
        to an RDF file with the given format.

        Args:
+        ----
            edge_list (list): list of BioCypherEdges to be written

            label (str): the label (type) of the edge
@@ -112,9 +106,10 @@ class _RDFWriter(_BatchWriter):
            function and their types

        Returns:
+        -------
            bool: The return value. True for success, False otherwise.
-        """
 
+        """
        if not all(isinstance(n, BioCypherEdge) for n in edge_list):
            logger.error("Edges must be passed as type BioCypherEdge.")
            return False
@@ -123,9 +118,7 @@ class _RDFWriter(_BatchWriter):
        label_pascal = self.translator.name_sentence_to_pascal(label)

        # create file name
-        file_name = os.path.join(
-            self.outdir, f"{label_pascal}.{self.extension}"
-        )
+        file_name = os.path.join(self.outdir, f"{label_pascal}.{self.extension}")

        # write data in graph
        graph = Graph()
@@ -136,12 +129,10 @@ class _RDFWriter(_BatchWriter):
            rdf_object = edge.get_target_id()
            rdf_predicate = edge.get_id()
            rdf_properties = edge.get_properties()
-            if rdf_predicate
+            if rdf_predicate is None:
                rdf_predicate = rdf_subject + rdf_object

-            edge_label = self.translator.name_sentence_to_pascal(
-                edge.get_label()
-            )
+            edge_label = self.translator.name_sentence_to_pascal(edge.get_label())
            edge_uri = self.rdf_namespaces["biocypher"][edge_label]
            graph.add((edge_uri, RDF.type, RDFS.Class))
            graph.add(
@@ -149,21 +140,21 @@ class _RDFWriter(_BatchWriter):
                    self.rdf_namespaces["biocypher"][rdf_predicate],
                    RDF.type,
                    edge_uri,
-                )
+                ),
            )
            graph.add(
                (
                    self.rdf_namespaces["biocypher"][rdf_predicate],
                    self.rdf_namespaces["biocypher"]["subject"],
                    self.subject_to_uri(rdf_subject),
-                )
+                ),
            )
            graph.add(
                (
                    self.rdf_namespaces["biocypher"][rdf_predicate],
                    self.rdf_namespaces["biocypher"]["object"],
                    self.subject_to_uri(rdf_object),
-                )
+                ),
            )

            # add properties to the transformed edge --> node
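A note on the edge hunks above: the writer reifies every BioCypherEdge as its own RDF resource, types it with a class derived from the edge label, and attaches the two endpoints through the `biocypher:subject` and `biocypher:object` predicates. A minimal standalone rdflib sketch of that pattern follows; the namespace URI matches the one bound later in `_init_namespaces`, while the gene/disease identifiers and the label are made up for illustration.

    from rdflib import RDF, RDFS, Graph, Namespace

    # biocypher standard namespace, as bound by the writer; everything else here is illustrative
    BC = Namespace("https://biocypher.org/biocypher#")

    graph = Graph()
    graph.bind("biocypher", BC)

    # reify one edge "gene123 -> disease456" with the made-up label "GeneToDiseaseAssociation"
    edge_class = BC["GeneToDiseaseAssociation"]
    edge_node = BC["gene123disease456"]  # the writer falls back to subject + object when the edge has no id
    graph.add((edge_class, RDF.type, RDFS.Class))
    graph.add((edge_node, RDF.type, edge_class))
    graph.add((edge_node, BC["subject"], BC["gene123"]))
    graph.add((edge_node, BC["object"], BC["disease456"]))

    print(graph.serialize(format="turtle"))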
@@ -187,13 +178,17 @@ class _RDFWriter(_BatchWriter):
        rdf_object: str,
        rdf_predicate: str,
    ):
-        """
-
-        It
-
-
+        """Add the properties to an RDF node.
+
+        It takes the graph, the subject, object, and predicate of the RDF
+        triple. It checks if the property is a list and adds it to the graph
+        accordingly. Otherwise it checks if the string represents a list. If it
+        does, it transforms it to a list and adds it to the graph. If not, it
+        adds the property to the graph as a literal. If the property is neither
+        a list or string, it will also be added as a literal.

        Args:
+        ----
            graph (RDFLib.Graph): The RDF graph to add the nodes to.

            rdf_subject (str): The subject of the RDF triple.
@@ -203,7 +198,9 @@ class _RDFWriter(_BatchWriter):
            rdf_predicate (str): The predicate of the RDF triple.

        Returns:
+        -------
            None
+
        """
        if isinstance(rdf_object, list):
            for obj in rdf_object:
@@ -212,7 +209,7 @@ class _RDFWriter(_BatchWriter):
                        self.subject_to_uri(rdf_subject),
                        self.property_to_uri(rdf_predicate),
                        Literal(obj),
-                    )
+                    ),
                )
        elif isinstance(rdf_object, str):
            if rdf_object.startswith("[") and rdf_object.endswith("]"):
@@ -228,7 +225,7 @@ class _RDFWriter(_BatchWriter):
                        self.subject_to_uri(rdf_subject),
                        self.property_to_uri(rdf_predicate),
                        Literal(rdf_object),
-                    )
+                    ),
                )
            else:
                graph.add(
@@ -236,25 +233,22 @@ class _RDFWriter(_BatchWriter):
                        self.subject_to_uri(rdf_subject),
                        self.property_to_uri(rdf_predicate),
                        Literal(rdf_object),
-                    )
+                    ),
                )

    def transform_string_to_list(self, string_list: str) -> list:
-        """
-        Function to transform a string representation of a list into a list.
+        """Function to transform a string representation of a list into a list.

        Args:
+        ----
            string_list (str): The string representation of the list.

        Returns:
+        -------
            list: The list representation of the input string.
+
        """
-        return (
-            string_list.replace("[", "")
-            .replace("]", "")
-            .replace("'", "")
-            .split(", ")
-        )
+        return string_list.replace("[", "").replace("]", "").replace("'", "").split(", ")

    def _write_single_node_list_to_file(
        self,
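The `transform_string_to_list` reformat above collapses the chained `replace` calls into one line without changing behaviour; a quick standalone check of what the helper does (the sample value is invented):

    def transform_string_to_list(string_list: str) -> list:
        # same logic as the method above, without the class around it
        return string_list.replace("[", "").replace("]", "").replace("'", "").split(", ")

    print(transform_string_to_list("['alpha', 'beta', 'gamma']"))
    # ['alpha', 'beta', 'gamma']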
@@ -263,11 +257,11 @@ class _RDFWriter(_BatchWriter):
        prop_dict: dict,
        labels: str,
    ):
-        """
-        This function takes a list of BioCypherNodes and writes them
+        """This function takes a list of BioCypherNodes and writes them
        to an RDF file in the specified format.

        Args:
+        ----
            node_list (list): A list of BioCypherNodes to be written.

            label (str): The label (type) of the nodes.
@@ -275,7 +269,9 @@ class _RDFWriter(_BatchWriter):
            prop_dict (dict): A dictionary of properties and their types for the node class.

        Returns:
+        -------
            bool: True if the writing is successful, False otherwise.
+
        """
        if not all(isinstance(n, BioCypherNode) for n in node_list):
            logger.error("Nodes must be passed as type BioCypherNode.")
@@ -285,9 +281,7 @@ class _RDFWriter(_BatchWriter):
        label_pascal = self.translator.name_sentence_to_pascal(label)

        # create file name
-        file_name = os.path.join(
-            self.outdir, f"{label_pascal}.{self.extension}"
-        )
+        file_name = os.path.join(self.outdir, f"{label_pascal}.{self.extension}")

        # write data in graph
        graph = Graph()
@@ -303,14 +297,14 @@ class _RDFWriter(_BatchWriter):
                    self.rdf_namespaces["biocypher"][class_name],
                    RDF.type,
                    RDFS.Class,
-                )
+                ),
            )
            graph.add(
                (
                    self.subject_to_uri(rdf_subject),
                    RDF.type,
                    self.rdf_namespaces["biocypher"][class_name],
-                )
+                ),
            )
            for key, value in properties.items():
                # only write value if it exists.
@@ -325,19 +319,19 @@ class _RDFWriter(_BatchWriter):

        return True

-    def write_nodes(
-
-    ) -> bool:
-        """
-        Wrapper for writing nodes in RDF format. It calls the _write_node_data() function, specifying the node data.
+    def write_nodes(self, nodes, batch_size: int = int(1e6), force: bool = False) -> bool:
+        """Wrapper for writing nodes in RDF format. It calls the _write_node_data() function, specifying the node data.

        Args:
+        ----
            nodes (list or generator): A list or generator of nodes in BioCypherNode format.
            batch_size (int): The number of nodes to write in each batch.
            force (bool): Flag to force the writing even if the output file already exists.

        Returns:
+        -------
            bool: True if the writing is successful, False otherwise.
+
        """
        # check if specified output format is correct
        passed = self._is_rdf_format_supported(self.rdf_format)
@@ -353,20 +347,22 @@ class _RDFWriter(_BatchWriter):

    def write_edges(
        self,
-        edges:
+        edges: list | GeneratorType,
        batch_size: int = int(1e6),
    ) -> bool:
-        """
-        Wrapper for writing edges in RDF format. It calls _write_edge_data()
+        """Wrapper for writing edges in RDF format. It calls _write_edge_data()
        functions specifying it's edge data.

        Args:
+        ----
            edges (BioCypherEdge): a list or generator of edges in
                :py:class:`BioCypherEdge` format
            batch_size (int): The number of edges to write in each batch.

        Returns:
+        -------
            bool: The return value. True for success, False otherwise.
+
        """
        # check if specified output format is correct
        passed = self._is_rdf_format_supported(self.rdf_format)
@@ -382,63 +378,76 @@ class _RDFWriter(_BatchWriter):
        return True

    def _construct_import_call(self) -> bool:
-        """
-        Function to write the import call.
+        """Function to write the import call.
        This function is not applicable for RDF.

-        Returns
+        Returns
+        -------
            bool: The return value. True for success, False otherwise.
+
        """
        return ""

-    def
+    def _quote_string(self, value: str) -> str:
        """
-
+        Quote a string.
+        """
+
+        return f"{self.quote}{value}{self.quote}"
+
+    def _write_array_string(self, string_list):
+        """Abstract method to write the string representation of an array into a .csv file
        as required by the RDF admin-import.
        This function is not applicable for RDF.

        Args:
+        ----
            string_list (list): list of ontology strings

        Returns:
+        -------
            str: The string representation of an array for the neo4j admin import
-        """

+        """
        return True

    def _write_node_headers(self):
-        """
-        Abstract method that takes care of importing properties of a graph entity that is represented
+        """Abstract method that takes care of importing properties of a graph entity that is represented
        as a node as per the definition in the `schema_config.yaml`
        This function is not applicable for RDF.

-        Returns
+        Returns
+        -------
            bool: The return value. True for success, False otherwise.
+
        """
        return True

    def _write_edge_headers(self):
-        """
-        Abstract method to write a database import-file for a graph entity that is represented
+        """Abstract method to write a database import-file for a graph entity that is represented
        as an edge as per the definition in the `schema_config.yaml`,
        containing only the header for this type of edge.
        This function is not applicable for RDF.

-        Returns
+        Returns
+        -------
            bool: The return value. True for success, False otherwise.
+
        """
        return True

    def subject_to_uri(self, subject: str) -> str:
-        """
-        Converts the subject to a proper URI using the available namespaces.
+        """Converts the subject to a proper URI using the available namespaces.
        If the conversion fails, it defaults to the biocypher prefix.

        Args:
+        ----
            subject (str): The subject to be converted to a URI.

        Returns:
+        -------
            str: The corresponding URI for the subject.
+
        """
        try:
            _pref, _id = subject.split(":")
@@ -451,56 +460,66 @@ class _RDFWriter(_BatchWriter):
            return self.rdf_namespaces["biocypher"][subject]

    def property_to_uri(self, property_name: str) -> dict[str, str]:
-        """
-        Converts a property name to its corresponding URI.
+        """Converts a property name to its corresponding URI.

        This function takes a property name and searches for its corresponding URI in various namespaces.
        It first checks the core namespaces for rdflib, including owl, rdf, rdfs, xsd, and xml.

        Args:
+        ----
            property_name (str): The property name to be converted to a URI.

        Returns:
+        -------
            str: The corresponding URI for the input property name.
+
        """
        # These namespaces are core for rdflib; owl, rdf, rdfs, xsd and xml
        for namespace in _NAMESPACE_PREFIXES_CORE.values():
            if property_name in namespace:
                return namespace[property_name]

-        # If the property name is not found in the core namespaces, search in
+        # If the property name is not found in the core namespaces, search in
+        # the SKOS, DC, and DCTERMS namespaces
        for namespace in [SKOS, DC, DCTERMS]:
            if property_name in namespace:
                return namespace[property_name]

-        # If the property name is still not found, try other namespaces from
+        # If the property name is still not found, try other namespaces from
+        # rdflib.
        for namespace in _NAMESPACE_PREFIXES_RDFLIB.values():
            if property_name in namespace:
                return namespace[property_name]

-        # If the property name is "licence", it recursively calls the function
+        # If the property name is "licence", it recursively calls the function
+        # with "license" as the input.
        if property_name == "licence":
            return self.property_to_uri("license")

        # TODO: add an option to search trough manually implemented namespaces

-        # If the input is not found in any of the namespaces, it returns
+        # If the input is not found in any of the namespaces, it returns
+        # the corresponding URI from the biocypher namespace.
        # TODO: give a warning and try to prevent this option altogether
        return self.rdf_namespaces["biocypher"][property_name]

    def _init_namespaces(self, graph: Graph):
-        """
-        Initializes the namespaces for the RDF graph. These namespaces are used to convert nodes to URIs.
+        """Initialise the namespaces for the RDF graph.

-
-
-        the
+        These namespaces are used to convert nodes to URIs. This function adds
+        the biocypher standard namespace to the `rdf_namespaces` attribute of
+        the class. If `rdf_namespaces` is empty, it sets it to the biocypher
+        standard namespace. Otherwise, it merges the biocypher standard
+        namespace with the namespaces defined in the biocypher_config.yaml.

        Args:
+        ----
            graph (RDFLib.Graph): The RDF graph to bind the namespaces to.

        Returns:
+        -------
            None
+
        """
        # add biocypher standard to self.rdf_namespaces
        biocypher_standard = {"biocypher": "https://biocypher.org/biocypher#"}
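The `property_to_uri` hunk above mostly reflows comments, but it spells out a lookup order worth keeping in mind: core rdflib namespaces first, then SKOS/DC/DCTERMS, then the remaining rdflib prefixes, a "licence" → "license" alias, and finally the biocypher namespace as a catch-all. A condensed standalone sketch of that resolution order (the fallback namespace URI is taken from `_init_namespaces`; the example lookups are illustrative):

    from rdflib import DC, DCTERMS, SKOS, Namespace
    from rdflib.namespace import _NAMESPACE_PREFIXES_CORE, _NAMESPACE_PREFIXES_RDFLIB

    BIOCYPHER = Namespace("https://biocypher.org/biocypher#")  # catch-all fallback

    def property_to_uri(property_name: str):
        # 1. core rdflib namespaces (owl, rdf, rdfs, xsd, xml)
        for namespace in _NAMESPACE_PREFIXES_CORE.values():
            if property_name in namespace:
                return namespace[property_name]
        # 2. common annotation vocabularies
        for namespace in [SKOS, DC, DCTERMS]:
            if property_name in namespace:
                return namespace[property_name]
        # 3. every other namespace known to rdflib
        for namespace in _NAMESPACE_PREFIXES_RDFLIB.values():
            if property_name in namespace:
                return namespace[property_name]
        # 4. spelling alias
        if property_name == "licence":
            return property_to_uri("license")
        # 5. fall back to the biocypher namespace
        return BIOCYPHER[property_name]

    print(property_to_uri("label"))    # expected to resolve to rdfs:label
    print(property_to_uri("licence"))  # expected to resolve via the "license" alias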
biocypher/output/write/relational/_csv.py

@@ -1,13 +1,13 @@
 from more_itertools import peekable
 
 from biocypher._logger import logger
+from biocypher.output.in_memory._pandas import PandasKG
 from biocypher.output.write._writer import _Writer
-from biocypher.output.in_memory._pandas import Pandas
 
 
 class _PandasCSVWriter(_Writer):
     """
-    Class for writing node and edge representations to
+    Class for writing node and edge representations to CSV files.
     """
 
     def __init__(self, *args, write_to_file: bool = True, **kwargs):
@@ -15,8 +15,7 @@ class _PandasCSVWriter(_Writer):
        super().__init__(*args, **kwargs)
        self.in_memory_dfs = {}
        self.stored_dfs = {}
-        self.pandas_in_memory =
-            translator=self.translator,
+        self.pandas_in_memory = PandasKG(
            deduplicator=self.deduplicator,
        )
        self.delimiter = kwargs.get("delimiter")
@@ -48,7 +47,7 @@ class _PandasCSVWriter(_Writer):
        return passed

    def _write_entities_to_file(self, entities: iter) -> bool:
-        """Function to
+        """Function to write the entities to a CSV file.

        Args:
            entities (iterable): An iterable of BioCypherNode / BioCypherEdge / BioCypherRelAsNode objects.
@@ -56,17 +55,13 @@ class _PandasCSVWriter(_Writer):
        entities = peekable(entities)
        entity_list = self.pandas_in_memory._separate_entity_types(entities)
        for entity_type, entities in entity_list.items():
-            self.in_memory_dfs[
-                entity_type
-            ] = self.pandas_in_memory._add_entity_df(entity_type, entities)
+            self.in_memory_dfs[entity_type] = self.pandas_in_memory._add_entity_df(entity_type, entities)
        for entity_type in self.in_memory_dfs.keys():
            entity_df = self.in_memory_dfs[entity_type]
            if " " in entity_type or "." in entity_type:
                entity_type = entity_type.replace(" ", "_").replace(".", "_")
            if self.write_to_file:
-                logger.info(
-                    f"Writing {entity_df.shape[0]} entries to {entity_type}.csv."
-                )
+                logger.info(f"Writing {entity_df.shape[0]} entries to {entity_type}.csv.")
                entity_df.to_csv(
                    f"{self.output_directory}/{entity_type}.csv",
                    sep=self.delimiter,
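`_PandasCSVWriter` now delegates in-memory collection to the renamed `PandasKG` class and then writes one CSV per entity type via `DataFrame.to_csv`. A small pandas-only sketch of that final step; the sample frame, the output directory, and the `index=False` choice are illustrative and not taken from the diff:

    import os

    import pandas as pd

    delimiter = "\t"
    output_directory = "biocypher-out"
    os.makedirs(output_directory, exist_ok=True)

    # one DataFrame per entity type, as collected by the in-memory KG
    in_memory_dfs = {
        "protein": pd.DataFrame([{"node_id": "p1", "name": "TP53"}, {"node_id": "p2", "name": "EGFR"}]),
    }

    for entity_type, entity_df in in_memory_dfs.items():
        # mirror the writer's sanitisation of labels containing spaces or dots
        entity_type = entity_type.replace(" ", "_").replace(".", "_")
        entity_df.to_csv(f"{output_directory}/{entity_type}.csv", sep=delimiter, index=False)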
biocypher/output/write/relational/_postgresql.py

@@ -1,5 +1,5 @@
-import os
 import glob
+import os
 
 from biocypher._logger import logger
 from biocypher.output.write._batch_writer import _BatchWriter
@@ -52,11 +52,16 @@ class _PostgreSQLBatchWriter(_BatchWriter):
        try:
            return self.DATA_TYPE_LOOKUP[string]
        except KeyError:
-            logger.info(
-                'Could not determine data type {string}. Using default "VARCHAR"'
-            )
+            logger.info('Could not determine data type {string}. Using default "VARCHAR"')
            return "VARCHAR"

+    def _quote_string(self, value: str) -> str:
+        """
+        Quote a string.
+        """
+
+        return f"{self.quote}{value}{self.quote}"
+
    def _write_array_string(self, string_list) -> str:
        """
        Abstract method to output.write the string representation of an array into a .csv file
@@ -140,9 +145,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
            command += f"DROP TABLE IF EXISTS {pascal_label};\n"

            # table creation requires comma separation
-            command += (
-                f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
-            )
+            command += f"CREATE TABLE {pascal_label}({','.join(columns)});\n"
            f.write(command)

            for parts_path in parts_paths:
@@ -239,9 +242,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
            command += f"DROP TABLE IF EXISTS {pascal_label};\n"

            # table creation requires comma separation
-            command += (
-                f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
-            )
+            command += f"CREATE TABLE {pascal_label}({','.join(out_list)});\n"
            f.write(command)

            for parts_path in parts_paths:
@@ -292,9 +293,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
        if {self.db_password}:
            # set password variable inline
            import_call += f"PGPASSWORD={self.db_password} "
-        import_call +=
-            f"{self.import_call_bin_prefix}psql -f {import_file_path}"
-        )
+        import_call += f"{self.import_call_bin_prefix}psql -f {import_file_path}"
        import_call += f" --dbname {self.db_name}"
        import_call += f" --host {self.db_host}"
        import_call += f" --port {self.db_port}"
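The PostgreSQL hunks above only collapse string concatenations; the resulting import call is a single shell line that sets `PGPASSWORD` inline and points `psql` at the generated SQL file. A sketch of the string being built, with placeholder connection values:

    # placeholder connection settings; the real values come from the writer's configuration
    db_password = "secret"
    db_name = "biocypher"
    db_host = "localhost"
    db_port = "5432"
    import_call_bin_prefix = ""
    import_file_path = "biocypher-out/postgres-import.sql"

    import_call = ""
    if db_password:
        # set password variable inline
        import_call += f"PGPASSWORD={db_password} "
    import_call += f"{import_call_bin_prefix}psql -f {import_file_path}"
    import_call += f" --dbname {db_name}"
    import_call += f" --host {db_host}"
    import_call += f" --port {db_port}"

    print(import_call)
    # PGPASSWORD=secret psql -f biocypher-out/postgres-import.sql --dbname biocypher --host localhost --port 5432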
biocypher/output/write/relational/_sqlite.py

@@ -44,7 +44,9 @@ class _SQLiteBatchWriter(_PostgreSQLBatchWriter):
            import_call += f'echo "Importing {table_part}..."\n'
            separator = self.delim
            import_part = f".import {table_part} {table_name}"
-            import_call +=
+            import_call += (
+                f"{self.import_call_bin_prefix}sqlite3 -separator $'{separator}' {self.db_name} \"{import_part}\""
+            )
            import_call += '\necho "Done!"\n'
            import_call += "\n"

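The SQLite writer takes the same shell-out approach, feeding the `sqlite3` CLI one `.import` command per batch file; the hunk above only wraps the long f-string in parentheses. A sketch of the command it produces, with placeholder names and a visible separator:

    # placeholder values; the writer fills these from its configuration and batch files
    import_call_bin_prefix = ""
    db_name = "biocypher.sqlite"
    separator = "|"
    table_part = "Protein-part000.csv"
    table_name = "Protein"

    import_part = f".import {table_part} {table_name}"
    import_call = f"{import_call_bin_prefix}sqlite3 -separator $'{separator}' {db_name} \"{import_part}\""
    print(import_call)
    # sqlite3 -separator $'|' biocypher.sqlite ".import Protein-part000.csv Protein"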
{biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/METADATA

@@ -1,12 +1,12 @@
 Metadata-Version: 2.1
 Name: biocypher
-Version: 0.6.1
+Version: 0.7.0
 Summary: A unifying framework for biomedical research knowledge graphs
 Home-page: https://github.com/biocypher/biocypher
 License: MIT
 Author: Sebastian Lobentanzer
 Author-email: sebastian.lobentanzer@gmail.com
-Requires-Python: >=3.
+Requires-Python: >=3.10,<4.0
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
@@ -15,9 +15,9 @@ Classifier: Natural Language :: English
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
 Requires-Dist: PyYAML (>=5.0)
 Requires-Dist: appdirs