biocypher 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biocypher/__init__.py +3 -13
- biocypher/_config/__init__.py +6 -23
- biocypher/_core.py +360 -262
- biocypher/_create.py +13 -27
- biocypher/_deduplicate.py +4 -11
- biocypher/_get.py +21 -60
- biocypher/_logger.py +4 -16
- biocypher/_mapping.py +4 -17
- biocypher/_metadata.py +3 -15
- biocypher/_misc.py +14 -28
- biocypher/_ontology.py +127 -212
- biocypher/_translate.py +34 -58
- biocypher/output/connect/_get_connector.py +40 -0
- biocypher/output/connect/_neo4j_driver.py +9 -65
- biocypher/output/in_memory/_get_in_memory_kg.py +34 -0
- biocypher/output/in_memory/_in_memory_kg.py +40 -0
- biocypher/output/in_memory/_networkx.py +44 -0
- biocypher/output/in_memory/_pandas.py +20 -15
- biocypher/output/write/_batch_writer.py +137 -172
- biocypher/output/write/_get_writer.py +11 -24
- biocypher/output/write/_writer.py +14 -33
- biocypher/output/write/graph/_arangodb.py +7 -24
- biocypher/output/write/graph/_neo4j.py +59 -57
- biocypher/output/write/graph/_networkx.py +36 -43
- biocypher/output/write/graph/_rdf.py +114 -95
- biocypher/output/write/relational/_csv.py +6 -11
- biocypher/output/write/relational/_postgresql.py +12 -13
- biocypher/output/write/relational/_sqlite.py +3 -1
- {biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/LICENSE +1 -1
- {biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/METADATA +3 -3
- biocypher-0.7.0.dist-info/RECORD +43 -0
- {biocypher-0.6.1.dist-info → biocypher-0.7.0.dist-info}/WHEEL +1 -1
- biocypher-0.6.1.dist-info/RECORD +0 -39
biocypher/_core.py
CHANGED
@@ -1,43 +1,32 @@
-
-
-#
-# Copyright 2021, Heidelberg University Clinic
-#
-# File author(s): Sebastian Lobentanzer
-# ...
-#
-# Distributed under MIT licence, see the file `LICENSE`.
-#
-"""
-BioCypher core module. Interfaces with the user and distributes tasks to
-submodules.
-"""
-from typing import Optional
-from datetime import datetime
-import os
-import json
+"""BioCypher core module.

-
-
+Interfaces with the user and distributes tasks to submodules.
+"""

-import
+import itertools
+import json
+import os

-from
+from datetime import datetime

-
+import yaml

+from ._config import (
+    config as _config,
+    update_from_file as _file_update,
+)
+from ._create import BioCypherNode
+from ._deduplicate import Deduplicator
 from ._get import Downloader
-from .
-from ._config import update_from_file as _file_update
-from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
+from ._logger import logger
 from ._mapping import OntologyMapping
 from ._ontology import Ontology
 from ._translate import Translator
-from .
-from .output.in_memory.
+from .output.connect._get_connector import get_connector
+from .output.in_memory._get_in_memory_kg import IN_MEMORY_DBMS, get_in_memory_kg
 from .output.write._get_writer import DBMS_TO_CLASS, get_writer
-from .output.connect._neo4j_driver import get_driver

+logger.debug(f"Loading module {__name__}.")
 __all__ = ["BioCypher"]

 SUPPORTED_DBMS = DBMS_TO_CLASS.keys()
@@ -51,17 +40,18 @@ REQUIRED_CONFIG = [


 class BioCypher:
-    """
-    Orchestration of BioCypher operations. Instantiate this class to interact
-    with BioCypher.
+    """Orchestration of BioCypher operations.

-
+    Instantiate this class to interact with BioCypher.

+    Args:
+    ----
         dbms (str): The database management system to use. For supported
             systems see SUPPORTED_DBMS.

-        offline (bool): Whether to run in offline mode.
-
+        offline (bool): Whether to run in offline mode. In offline mode
+            the Knowledge Graph is written to files. In online mode, it
+            is written to a database or hold in memory.

         strict_mode (bool): Whether to run in strict mode. If True, the
             translator will raise an error if a node or edge does not
@@ -81,6 +71,8 @@ class BioCypher:
         output_directory (str): Path to the output directory. If not
             provided, the default value 'biocypher-out' will be used.

+        cache_directory (str): Path to the cache directory.
+
     """

     def __init__(
@@ -105,7 +97,7 @@ class BioCypher:
             logger.warning(
                 "The parameter `db_name` is deprecated. Please set the "
                 "`database_name` setting in the `biocypher_config.yaml` file "
-                "instead."
+                "instead.",
             )
             _config(**{db_name: {"database_name": db_name}})

@@ -115,7 +107,8 @@ class BioCypher:
        # Check for required configuration
        for key in REQUIRED_CONFIG:
            if key not in self.base_config:
-
+                msg = f"Configuration key {key} is required."
+                raise ValueError(msg)

        # Set configuration - mandatory
        self._dbms = dbms or self.base_config["dbms"]
@@ -131,34 +124,32 @@ class BioCypher:
        self._strict_mode = strict_mode

        self._schema_config_path = schema_config_path or self.base_config.get(
-            "schema_config_path"
+            "schema_config_path",
        )

        if not self._schema_config_path:
            logger.warning("Running BioCypher without schema configuration.")
        else:
            logger.info(
-                f"Running BioCypher with schema configuration from {self._schema_config_path}."
+                f"Running BioCypher with schema configuration from {self._schema_config_path}.",
            )

        self._head_ontology = head_ontology or self.base_config["head_ontology"]

        # Set configuration - optional
        self._output_directory = output_directory or self.base_config.get(
-            "output_directory"
+            "output_directory",
        )
        self._cache_directory = cache_directory or self.base_config.get(
-            "cache_directory"
+            "cache_directory",
        )
        self._tail_ontologies = tail_ontologies or self.base_config.get(
-            "tail_ontologies"
+            "tail_ontologies",
        )

        if self._dbms not in SUPPORTED_DBMS:
-
-
-                f"Please select from {SUPPORTED_DBMS}."
-            )
+            msg = f"DBMS {self._dbms} not supported. Please select from {SUPPORTED_DBMS}."
+            raise ValueError(msg)

        # Initialize
        self._ontology_mapping = None
@@ -167,23 +158,97 @@ class BioCypher:
        self._downloader = None
        self._ontology = None
        self._writer = None
-        self.
+        self._driver = None
+        self._in_memory_kg = None

-
+        self._in_memory_kg = None
+        self._nodes = None
+        self._edges = None
+
+    def _initialize_in_memory_kg(self) -> None:
+        """Create in-memory KG instance.
+
+        Set as instance variable `self._in_memory_kg`.
        """
-
+        if not self._in_memory_kg:
+            self._in_memory_kg = get_in_memory_kg(
+                dbms=self._dbms,
+                deduplicator=self._get_deduplicator(),
+            )
+
+    def add_nodes(self, nodes) -> None:
+        """Add new nodes to the internal representation.
+
+        Initially, receive nodes data from adaptor and create internal
+        representation for nodes.
+
+        Args:
+        ----
+            nodes(iterable): An iterable of nodes
+
+        """
+        if isinstance(nodes, list):
+            self._nodes = list(itertools.chain(self._nodes, nodes))
+        else:
+            self._nodes = itertools.chain(self._nodes, nodes)
+
+    def add_edges(self, edges) -> None:
+        """Add new edges to the internal representation.
+
+        Initially, receive edges data from adaptor and create internal
+        representation for edges.
+
+        Args:
+        ----
+            edges(iterable): An iterable of edges.
+
        """
+        if isinstance(edges, list):
+            self._edges = list(itertools.chain(self._edges, edges))
+        else:
+            self._edges = itertools.chain(self._edges, edges)
+
+    def to_df(self):
+        """Create DataFrame using internal representation.
+
+        TODO: to_df implies data frame, should be specifically that use case
+        """
+        return self._to_KG()
+
+    def to_networkx(self):
+        """Create networkx using internal representation."""
+        return self._to_KG()
+
+    def _to_KG(self):
+        """Convert the internal representation to knowledge graph.
+
+        The knowledge graph is returned based on the `dbms` parameter in
+        the biocypher configuration file.
+
+        Returns
+        -------
+            Any: knowledge graph.
+
+        """
+        if not self._in_memory_kg:
+            self._initialize_in_memory_kg()
+        if not self._translator:
+            self._get_translator()
+        tnodes = self._translator.translate_entities(self._nodes)
+        tedges = self._translator.translate_entities(self._edges)
+        self._in_memory_kg.add_nodes(tnodes)
+        self._in_memory_kg.add_edges(tedges)
+        return self._in_memory_kg.get_kg()

+    def _get_deduplicator(self) -> Deduplicator:
+        """Create deduplicator if not exists and return."""
        if not self._deduplicator:
            self._deduplicator = Deduplicator()

        return self._deduplicator

    def _get_ontology_mapping(self) -> OntologyMapping:
-        """
-        Create ontology mapping if not exists and return.
-        """
-
+        """Create ontology mapping if not exists and return."""
        if not self._schema_config_path:
            self._ontology_mapping = OntologyMapping()

@@ -195,10 +260,7 @@ class BioCypher:
        return self._ontology_mapping

    def _get_ontology(self) -> Ontology:
-        """
-        Create ontology if not exists and return.
-        """
-
+        """Create ontology if not exists and return."""
        if not self._ontology:
            self._ontology = Ontology(
                ontology_mapping=self._get_ontology_mapping(),
@@ -209,10 +271,7 @@ class BioCypher:
        return self._ontology

    def _get_translator(self) -> Translator:
-        """
-        Create translator if not exists and return.
-        """
-
+        """Create translator if not exists and return."""
        if not self._translator:
            self._translator = Translator(
                ontology=self._get_ontology(),
@@ -222,14 +281,18 @@ class BioCypher:
        return self._translator

    def _get_writer(self):
-        """
-        Create writer if not online. Set as instance variable `self._writer`.
-        """
+        """Create writer if not online.

+        Set as instance variable `self._writer`.
+        """
        if self._offline:
-
+
+            def timestamp() -> str:
+                return datetime.now().strftime("%Y%m%d%H%M%S")
+
            outdir = self._output_directory or os.path.join(
-                "biocypher-out",
+                "biocypher-out",
+                timestamp(),
            )
            self._output_directory = os.path.abspath(outdir)

@@ -241,240 +304,270 @@ class BioCypher:
                strict_mode=self._strict_mode,
            )
        else:
-
+            msg = "Cannot get writer in online mode."
+            raise NotImplementedError(msg)
+
+        return self._writer

    def _get_driver(self):
-        """
-        Create driver if not exists. Set as instance variable `self._driver`.
-        """
+        """Create driver if not exists.

+        Set as instance variable `self._driver`.
+        """
        if not self._offline:
-            self._driver =
+            self._driver = get_connector(
                dbms=self._dbms,
                translator=self._get_translator(),
-                deduplicator=self._get_deduplicator(),
            )
        else:
-
+            msg = "Cannot get driver in offline mode."
+            raise NotImplementedError(msg)

-
-
-            )
+        return self._driver
+
+    def _get_in_memory_kg(self):
+        """Create in-memory KG instance.
+
+        Set as instance variable `self._in_memory_kg`.
        """
-
-
-
+        if not self._in_memory_kg:
+            self._in_memory_kg = get_in_memory_kg(
+                dbms=self._dbms,
+                deduplicator=self._get_deduplicator(),
+            )

-
-            nodes (iterable): An iterable of nodes to write to the database.
+        return self._in_memory_kg

-
+    def _add_nodes(
+        self,
+        nodes,
+        batch_size: int = int(1e6),
+        force: bool = False,
+    ):
+        """Add nodes to the BioCypher KG.

-
-
+        First uses the `_translator` to translate the nodes to `BioCypherNode`
+        objects. Depending on the configuration the translated nodes are then
+        passed to the

-
-            bool: True if successful.
-        """
+        - `_writer`: if `_offline` is set to `False`

-        if
-
+        - `_in_memory_kg`: if `_offline` is set to `False` and the `_dbms` is an
+          `IN_MEMORY_DBMS`

-
-
-            tnodes = self._translator.translate_nodes(nodes)
-        else:
-            tnodes = nodes
-        # write node files
-        return self._writer.write_nodes(
-            tnodes, batch_size=batch_size, force=force
-        )
+        - `_driver`: if `_offline` is set to `True` and the `_dbms` is not an
+          `IN_MEMORY_DBMS`

-    def write_edges(self, edges, batch_size: int = int(1e6)) -> bool:
        """
-
-
-
+        if not self._translator:
+            self._get_translator()
+        translated_nodes = self._translator.translate_entities(nodes)

-
-
+        if self._offline:
+            passed = self._get_writer().write_nodes(
+                translated_nodes,
+                batch_size=batch_size,
+                force=force,
+            )
+        elif self._is_online_and_in_memory():
+            passed = self._get_in_memory_kg().add_nodes(translated_nodes)
+        else:
+            passed = self._get_driver().add_biocypher_nodes(translated_nodes)

-
-            bool: True if successful.
-        """
+        return passed

-
-
+    def _add_edges(self, edges, batch_size: int = int(1e6)):
+        """Add edges to the BioCypher KG.

-        edges
-
-
-        else:
-            tedges = edges
-        # write edge files
-        return self._writer.write_edges(tedges, batch_size=batch_size)
+        First uses the `_translator` to translate the edges to `BioCypherEdge`
+        objects. Depending on the configuration the translated edges are then
+        passed to the

-
-        """
-        Convert entities to a pandas DataFrame for each entity type and return
-        a list.
+        - `_writer`: if `_offline` is set to `False`

-
-
-
+        - `_in_memory_kg`: if `_offline` is set to `False` and the `_dbms` is an
+          `IN_MEMORY_DBMS`
+
+        - `_driver`: if `_offline` is set to `True` and the `_dbms` is not an
+          `IN_MEMORY_DBMS`

-        Returns:
-            pd.DataFrame: A pandas DataFrame.
        """
-        if not self.
-
-
+        if not self._translator:
+            self._get_translator()
+        translated_edges = self._translator.translate_entities(edges)
+
+        if self._offline:
+            if not self._writer:
+                self._initialize_writer()
+            passed = self._writer.write_edges(
+                translated_edges,
+                batch_size=batch_size,
            )
+        elif self._is_online_and_in_memory():
+            if not self._in_memory_kg:
+                self._initialize_in_memory_kg()
+            passed = self._in_memory_kg.add_edges(translated_edges)
+        else:
+            if not self._driver:
+                self._initialize_driver()
+            passed = self._driver.add_biocypher_nodes(translated_edges)

-        return
+        return passed

-    def
-        """
-
-
-
-
+    def _is_online_and_in_memory(self) -> bool:
+        """Return True if in online mode and in-memory dbms is used."""
+        return (not self._offline) & (self._dbms in IN_MEMORY_DBMS)
+
+    def write_nodes(
+        self,
+        nodes,
+        batch_size: int = int(1e6),
+        force: bool = False,
+    ) -> bool:
+        """Write nodes to database.
+
+        Either takes an iterable of tuples (if given, translates to
+        ``BioCypherNode`` objects) or an iterable of ``BioCypherNode`` objects.

        Args:
-
-
-
+        ----
+            nodes (iterable): An iterable of nodes to write to the database.
+            batch_size (int): The batch size to use when writing to disk.
+            force (bool): Whether to force writing to the output directory even
+                if the node type is not present in the schema config file.

        Returns:
-
-
-        if not self._pd:
-            self._pd = Pandas(
-                translator=self._get_translator(),
-                deduplicator=self._get_deduplicator(),
-            )
-
-        entities = peekable(entities)
+        -------
+            bool: True if successful.

-
-
-            or isinstance(entities.peek(), BioCypherEdge)
-            or isinstance(entities.peek(), BioCypherRelAsNode)
-        ):
-            tentities = entities
-        elif len(entities.peek()) < 4:
-            tentities = self._translator.translate_nodes(entities)
-        else:
-            tentities = self._translator.translate_edges(entities)
+        """
+        return self._add_nodes(nodes, batch_size=batch_size, force=force)

-
+    def write_edges(self, edges, batch_size: int = int(1e6)) -> bool:
+        """Write edges to database.

-
-
-        Wrapper for ``add()`` to add nodes to the in-memory database.
+        Either takes an iterable of tuples (if given, translates to
+        ``BioCypherEdge`` objects) or an iterable of ``BioCypherEdge`` objects.

        Args:
-
+        ----
+            edges (iterable): An iterable of edges to write to the database.

        Returns:
-
-
-        self.add(nodes)
+        -------
+            bool: True if successful.

-    def add_edges(self, edges) -> None:
        """
-
+        return self._add_edges(edges, batch_size=batch_size)
+
+    def add(self, entities) -> None:
+        """Add entities to the in-memory database.
+
+        Accepts an iterable of tuples (if given, translates to
+        ``BioCypherNode`` or ``BioCypherEdge`` objects) or an iterable of
+        ``BioCypherNode`` or ``BioCypherEdge`` objects.

        Args:
-
+        ----
+            entities (iterable): An iterable of entities to add to the database.
+                Can be 3-tuples (nodes) or 5-tuples (edges); also accepts
+                4-tuples for edges (deprecated).

        Returns:
+        -------
            None
+
        """
-        self.
+        return self._add_nodes(entities)

    def merge_nodes(self, nodes) -> bool:
-        """
-
-
-        ``BioCypherNode`` objects.
+        """Merge nodes into database.
+
+        Either takes an iterable of tuples (if given, translates to
+        ``BioCypherNode`` objects) or an iterable of ``BioCypherNode`` objects.

        Args:
+        ----
            nodes (iterable): An iterable of nodes to merge into the database.

        Returns:
+        -------
            bool: True if successful.
-        """
-
-        if not self._driver:
-            self._get_driver()

-
-
-            tnodes = self._translator.translate_nodes(nodes)
-        else:
-            tnodes = nodes
-        # write node files
-        return self._driver.add_biocypher_nodes(tnodes)
+        """
+        return self._add_nodes(nodes)

    def merge_edges(self, edges) -> bool:
-        """
-
-
-        ``BioCypherEdge`` objects.
+        """Merge edges into database.
+
+        Either takes an iterable of tuples (if given, translates to
+        ``BioCypherEdge`` objects) or an iterable of ``BioCypherEdge`` objects.

        Args:
+        ----
            edges (iterable): An iterable of edges to merge into the database.

        Returns:
+        -------
            bool: True if successful.
+
        """
+        return self._add_edges(edges)

-
-
+    def get_kg(self):
+        """Get the in-memory KG instance.

-
-
-
-
-
-
-
+        Depending on the specified `dbms` this could either be a list of Pandas
+        dataframes or a NetworkX DiGraph.
+        """
+        if not self._is_online_and_in_memory():
+            msg = (f"Getting the in-memory KG is only available in online mode for {IN_MEMORY_DBMS}.",)
+            raise ValueError(msg)
+        if not self._in_memory_kg:
+            msg = "No in-memory KG instance found. Please call `add()` first."
+            raise ValueError(msg)

-
+        if not self._in_memory_kg:
+            self._initialize_in_memory_kg()
+        return self._in_memory_kg.get_kg()

-
-        """
-        Create downloader if not exists.
-        """
+    # DOWNLOAD AND CACHE MANAGEMENT METHODS ###

+    def _get_downloader(self, cache_dir: str | None = None):
+        """Create downloader if not exists."""
        if not self._downloader:
            self._downloader = Downloader(self._cache_directory)

    def download(self, *resources) -> None:
-        """
-        Use the :class:`Downloader` class to download or load from cache the
-        resources given by the adapter.
-        """
+        """Download or load from cache the resources given by the adapter.

+        Args:
+        ----
+            resources (iterable): An iterable of resources to download or load
+                from cache.
+
+        Returns:
+        -------
+            None
+
+        """
        self._get_downloader()
        return self._downloader.download(*resources)

    # OVERVIEW AND CONVENIENCE METHODS ###

-    def log_missing_input_labels(self) ->
-        """
+    def log_missing_input_labels(self) -> dict[str, list[str]] | None:
+        """Log missing input labels.

        Get the set of input labels encountered without an entry in the
        `schema_config.yaml` and print them to the logger.

-        Returns
-
+        Returns
+        -------
            Optional[Dict[str, List[str]]]: A dictionary of Biolink types
            encountered without an entry in the `schema_config.yaml` file.

        """
-
        mt = self._translator.get_missing_biolink_types()

        if mt:
@@ -495,11 +588,11 @@ class BioCypher:
        return None

    def log_duplicates(self) -> None:
-        """
+        """Log duplicate nodes and edges.
+
        Get the set of duplicate nodes and edges encountered and print them to
        the logger.
        """
-
        dn = self._deduplicator.get_duplicate_nodes()

        if dn:
@@ -543,11 +636,10 @@ class BioCypher:
            logger.info("No duplicate edges in input.")

    def show_ontology_structure(self, **kwargs) -> None:
-        """
-        Show the ontology structure using treelib or write to GRAPHML file.
+        """Show the ontology structure using treelib or write to GRAPHML file.

        Args:
-
+        ----
            to_disk (str): If specified, the ontology structure will be saved
                to disk as a GRAPHML file, to be opened in your favourite
                graph visualisation tool.
@@ -555,36 +647,37 @@ class BioCypher:
            full (bool): If True, the full ontology structure will be shown,
                including all nodes and edges. If False, only the nodes and
                edges that are relevant to the extended schema will be shown.
-        """

+        """
        if not self._ontology:
            self._get_ontology()

        return self._ontology.show_ontology_structure(**kwargs)

    def write_import_call(self) -> str:
-        """
-        Write a shell script to import the database depending on the chosen
-        DBMS.
+        """Write a shell script to import the database.

-
+        Shell script is written depending on the chosen DBMS.
+
+        Returns
+        -------
            str: path toward the file holding the import call.
-        """

+        """
        if not self._offline:
-
-
-            )
+            msg = "Cannot write import call in online mode."
+            raise NotImplementedError(msg)

        return self._writer.write_import_call()

    def write_schema_info(self, as_node: bool = False) -> None:
-        """
-
-
-
-
-
+        """Write an extended schema info to file or node.
+
+        Creates a YAML file or KG node that extends the `schema_config.yaml`
+        with run-time information of the built KG. For instance, include
+        information on whether something present in the actual knowledge graph,
+        whether it is a relationship (which is important in the case of
+        representing relationships as nodes) and the actual sources and
        targets of edges. Since this file can be used in place of the original
        `schema_config.yaml` file, it indicates that it is the extended schema
        by setting `is_schema_info` to `true`.
@@ -594,12 +687,16 @@ class BioCypher:
        The information of whether something is a relationship can be gathered
        from the deduplicator instance, which keeps track of all entities that
        have been seen.
-        """

-
-
-
-
+        Args:
+        ----
+            as_node (bool): If True, the schema info is written as a KG node.
+                If False, the schema info is written to a YAML file.
+
+        """
+        if (not self._offline) and self._dbms not in IN_MEMORY_DBMS:
+            msg = "Cannot write schema info in online mode."
+            raise NotImplementedError(msg)

        ontology = self._get_ontology()
        schema = ontology.mapping.extended_schema.copy()
@@ -607,13 +704,12 @@ class BioCypher:

        deduplicator = self._get_deduplicator()
        for node in deduplicator.entity_types:
-            if node in schema
+            if node in schema:
                schema[node]["present_in_knowledge_graph"] = True
                schema[node]["is_relationship"] = False
            else:
                logger.info(
-                    f"Node {node} not present in extended schema. "
-                    "Skipping schema info."
+                    f"Node {node} not present in extended schema. Skipping schema info.",
                )

        # find 'label_as_edge' cases in schema entries
@@ -621,21 +717,20 @@ class BioCypher:
        for k, v in schema.items():
            if not isinstance(v, dict):
                continue
-            if "label_as_edge" in v
-                if v["label_as_edge"] in deduplicator.seen_relationships
+            if "label_as_edge" in v:
+                if v["label_as_edge"] in deduplicator.seen_relationships:
                    changed_labels[v["label_as_edge"]] = k

-        for edge in deduplicator.seen_relationships
-            if edge in changed_labels
+        for edge in deduplicator.seen_relationships:
+            if edge in changed_labels:
                edge = changed_labels[edge]
-            if edge in schema
+            if edge in schema:
                schema[edge]["present_in_knowledge_graph"] = True
                schema[edge]["is_relationship"] = True
                # TODO information about source and target nodes
            else:
                logger.info(
-                    f"Edge {edge} not present in extended schema. "
-                    "Skipping schema info."
+                    f"Edge {edge} not present in extended schema. Skipping schema info.",
                )

        # write to output directory as YAML file
@@ -660,74 +755,77 @@ class BioCypher:
    # TRANSLATION METHODS ###

    def translate_term(self, term: str) -> str:
-        """
-        Translate a term to its BioCypher equivalent.
+        """Translate a term to its BioCypher equivalent.

        Args:
+        ----
            term (str): The term to translate.

        Returns:
+        -------
            str: The BioCypher equivalent of the term.
-        """

+        """
        # instantiate adapter if not exists
        self.start_ontology()

        return self._translator.translate_term(term)

    def summary(self) -> None:
-        """
-        Wrapper for showing ontology structure and logging duplicates and
-        missing input types.
-        """
+        """Call convenience and reporting methods.

+        Shows ontology structure and logs duplicates and missing input types.
+        """
        self.show_ontology_structure()
        self.log_duplicates()
        self.log_missing_input_labels()

    def reverse_translate_term(self, term: str) -> str:
-        """
-        Reverse translate a term from its BioCypher equivalent.
+        """Reverse translate a term from its BioCypher equivalent.

        Args:
+        ----
            term (str): The BioCypher term to reverse translate.

        Returns:
+        -------
            str: The original term.
-        """

+        """
        # instantiate adapter if not exists
        self.start_ontology()

        return self._translator.reverse_translate_term(term)

    def translate_query(self, query: str) -> str:
-        """
-        Translate a query to its BioCypher equivalent.
+        """Translate a query to its BioCypher equivalent.

        Args:
+        ----
            query (str): The query to translate.

        Returns:
+        -------
            str: The BioCypher equivalent of the query.
-        """

+        """
        # instantiate adapter if not exists
        self.start_ontology()

        return self._translator.translate(query)

    def reverse_translate_query(self, query: str) -> str:
-        """
-        Reverse translate a query from its BioCypher equivalent.
+        """Reverse translate a query from its BioCypher equivalent.

        Args:
+        ----
            query (str): The BioCypher query to reverse translate.

        Returns:
+        -------
            str: The original query.
-        """

+        """
        # instantiate adapter if not exists
        self.start_ontology()

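The `_core.py` changes above split the output path three ways: the batch writer in offline mode, a database connector in online mode, and the new in-memory knowledge graph (`get_in_memory_kg`, `add()`, `get_kg()`). Below is a minimal, hedged sketch of how that 0.7.0 surface might be used. The constructor arguments and method names come from the diff; the `dbms="networkx"` value, the config path, and the exact node/edge tuple layout are assumptions for illustration, not taken from this diff.

```python
# Sketch of the 0.7.0 API shown in the diff above (assumptions noted inline).
from biocypher import BioCypher

bc = BioCypher(
    dbms="networkx",   # assumed to be one of IN_MEMORY_DBMS
    offline=False,     # online + in-memory dbms -> add()/get_kg() path
    schema_config_path="config/schema_config.yaml",  # hypothetical path
)

# add() accepts 3-tuples for nodes and 5-tuples for edges (per its docstring);
# the field order below is the conventional BioCypher layout, assumed here.
nodes = [("p1", "protein", {"name": "EGFR"})]
edges = [("e1", "p1", "p1", "protein_protein_interaction", {})]

bc.add(nodes)        # entities are translated via the Translator internally
bc.add(edges)
kg = bc.get_kg()     # per the diff: Pandas DataFrames or a NetworkX DiGraph

# Offline mode instead routes through the batch writer:
# bc_off = BioCypher(dbms="neo4j", offline=True)
# bc_off.write_nodes(nodes)
# bc_off.write_edges(edges)
# bc_off.write_import_call()  # shell script for the chosen DBMS
```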