biocypher 0.6.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biocypher might be problematic. Click here for more details.

Files changed (34)
  1. biocypher/__init__.py +3 -13
  2. biocypher/_config/__init__.py +6 -23
  3. biocypher/_config/biocypher_config.yaml +14 -3
  4. biocypher/_core.py +360 -262
  5. biocypher/_create.py +13 -27
  6. biocypher/_deduplicate.py +4 -11
  7. biocypher/_get.py +21 -60
  8. biocypher/_logger.py +4 -16
  9. biocypher/_mapping.py +4 -17
  10. biocypher/_metadata.py +3 -15
  11. biocypher/_misc.py +14 -28
  12. biocypher/_ontology.py +127 -212
  13. biocypher/_translate.py +34 -58
  14. biocypher/output/connect/_get_connector.py +40 -0
  15. biocypher/output/connect/_neo4j_driver.py +9 -65
  16. biocypher/output/in_memory/_get_in_memory_kg.py +34 -0
  17. biocypher/output/in_memory/_in_memory_kg.py +40 -0
  18. biocypher/output/in_memory/_networkx.py +44 -0
  19. biocypher/output/in_memory/_pandas.py +20 -15
  20. biocypher/output/write/_batch_writer.py +166 -179
  21. biocypher/output/write/_get_writer.py +11 -24
  22. biocypher/output/write/_writer.py +43 -44
  23. biocypher/output/write/graph/_arangodb.py +7 -24
  24. biocypher/output/write/graph/_neo4j.py +51 -56
  25. biocypher/output/write/graph/_networkx.py +36 -43
  26. biocypher/output/write/graph/_rdf.py +107 -95
  27. biocypher/output/write/relational/_csv.py +6 -11
  28. biocypher/output/write/relational/_postgresql.py +5 -13
  29. biocypher/output/write/relational/_sqlite.py +3 -1
  30. {biocypher-0.6.2.dist-info → biocypher-0.8.0.dist-info}/LICENSE +1 -1
  31. {biocypher-0.6.2.dist-info → biocypher-0.8.0.dist-info}/METADATA +3 -3
  32. biocypher-0.8.0.dist-info/RECORD +43 -0
  33. {biocypher-0.6.2.dist-info → biocypher-0.8.0.dist-info}/WHEEL +1 -1
  34. biocypher-0.6.2.dist-info/RECORD +0 -39
biocypher/_core.py CHANGED
@@ -1,43 +1,32 @@
1
- #!/usr/bin/env python
2
-
3
- #
4
- # Copyright 2021, Heidelberg University Clinic
5
- #
6
- # File author(s): Sebastian Lobentanzer
7
- # ...
8
- #
9
- # Distributed under MIT licence, see the file `LICENSE`.
10
- #
11
- """
12
- BioCypher core module. Interfaces with the user and distributes tasks to
13
- submodules.
14
- """
15
- from typing import Optional
16
- from datetime import datetime
17
- import os
18
- import json
1
+ """BioCypher core module.
19
2
 
20
- from more_itertools import peekable
21
- import yaml
3
+ Interfaces with the user and distributes tasks to submodules.
4
+ """
22
5
 
23
- import pandas as pd
6
+ import itertools
7
+ import json
8
+ import os
24
9
 
25
- from ._logger import logger
10
+ from datetime import datetime
26
11
 
27
- logger.debug(f"Loading module {__name__}.")
12
+ import yaml
28
13
 
14
+ from ._config import (
15
+ config as _config,
16
+ update_from_file as _file_update,
17
+ )
18
+ from ._create import BioCypherNode
19
+ from ._deduplicate import Deduplicator
29
20
  from ._get import Downloader
30
- from ._config import config as _config
31
- from ._config import update_from_file as _file_update
32
- from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
21
+ from ._logger import logger
33
22
  from ._mapping import OntologyMapping
34
23
  from ._ontology import Ontology
35
24
  from ._translate import Translator
36
- from ._deduplicate import Deduplicator
37
- from .output.in_memory._pandas import Pandas
25
+ from .output.connect._get_connector import get_connector
26
+ from .output.in_memory._get_in_memory_kg import IN_MEMORY_DBMS, get_in_memory_kg
38
27
  from .output.write._get_writer import DBMS_TO_CLASS, get_writer
39
- from .output.connect._neo4j_driver import get_driver
40
28
 
29
+ logger.debug(f"Loading module {__name__}.")
41
30
  __all__ = ["BioCypher"]
42
31
 
43
32
  SUPPORTED_DBMS = DBMS_TO_CLASS.keys()
@@ -51,17 +40,18 @@ REQUIRED_CONFIG = [
51
40
 
52
41
 
53
42
  class BioCypher:
54
- """
55
- Orchestration of BioCypher operations. Instantiate this class to interact
56
- with BioCypher.
43
+ """Orchestration of BioCypher operations.
57
44
 
58
- Args:
45
+ Instantiate this class to interact with BioCypher.
59
46
 
47
+ Args:
48
+ ----
60
49
  dbms (str): The database management system to use. For supported
61
50
  systems see SUPPORTED_DBMS.
62
51
 
63
- offline (bool): Whether to run in offline mode. If True, no
64
- connection to the database will be made.
52
+ offline (bool): Whether to run in offline mode. In offline mode
53
+ the Knowledge Graph is written to files. In online mode, it
54
+ is written to a database or held in memory.
65
55
 
66
56
  strict_mode (bool): Whether to run in strict mode. If True, the
67
57
  translator will raise an error if a node or edge does not
@@ -81,6 +71,8 @@ class BioCypher:
81
71
  output_directory (str): Path to the output directory. If not
82
72
  provided, the default value 'biocypher-out' will be used.
83
73
 
74
+ cache_directory (str): Path to the cache directory.
75
+
84
76
  """
85
77
 
86
78
  def __init__(
@@ -105,7 +97,7 @@ class BioCypher:
105
97
  logger.warning(
106
98
  "The parameter `db_name` is deprecated. Please set the "
107
99
  "`database_name` setting in the `biocypher_config.yaml` file "
108
- "instead."
100
+ "instead.",
109
101
  )
110
102
  _config(**{db_name: {"database_name": db_name}})
111
103
 
@@ -115,7 +107,8 @@ class BioCypher:
115
107
  # Check for required configuration
116
108
  for key in REQUIRED_CONFIG:
117
109
  if key not in self.base_config:
118
- raise ValueError(f"Configuration key {key} is required.")
110
+ msg = f"Configuration key {key} is required."
111
+ raise ValueError(msg)
119
112
 
120
113
  # Set configuration - mandatory
121
114
  self._dbms = dbms or self.base_config["dbms"]
@@ -131,34 +124,32 @@ class BioCypher:
131
124
  self._strict_mode = strict_mode
132
125
 
133
126
  self._schema_config_path = schema_config_path or self.base_config.get(
134
- "schema_config_path"
127
+ "schema_config_path",
135
128
  )
136
129
 
137
130
  if not self._schema_config_path:
138
131
  logger.warning("Running BioCypher without schema configuration.")
139
132
  else:
140
133
  logger.info(
141
- f"Running BioCypher with schema configuration from {self._schema_config_path}."
134
+ f"Running BioCypher with schema configuration from {self._schema_config_path}.",
142
135
  )
143
136
 
144
137
  self._head_ontology = head_ontology or self.base_config["head_ontology"]
145
138
 
146
139
  # Set configuration - optional
147
140
  self._output_directory = output_directory or self.base_config.get(
148
- "output_directory"
141
+ "output_directory",
149
142
  )
150
143
  self._cache_directory = cache_directory or self.base_config.get(
151
- "cache_directory"
144
+ "cache_directory",
152
145
  )
153
146
  self._tail_ontologies = tail_ontologies or self.base_config.get(
154
- "tail_ontologies"
147
+ "tail_ontologies",
155
148
  )
156
149
 
157
150
  if self._dbms not in SUPPORTED_DBMS:
158
- raise ValueError(
159
- f"DBMS {self._dbms} not supported. "
160
- f"Please select from {SUPPORTED_DBMS}."
161
- )
151
+ msg = f"DBMS {self._dbms} not supported. Please select from {SUPPORTED_DBMS}."
152
+ raise ValueError(msg)
162
153
 
163
154
  # Initialize
164
155
  self._ontology_mapping = None
@@ -167,23 +158,97 @@ class BioCypher:
167
158
  self._downloader = None
168
159
  self._ontology = None
169
160
  self._writer = None
170
- self._pd = None
161
+ self._driver = None
162
+ self._in_memory_kg = None
171
163
 
172
- def _get_deduplicator(self) -> Deduplicator:
164
+ self._in_memory_kg = None
165
+ self._nodes = None
166
+ self._edges = None
167
+
168
+ def _initialize_in_memory_kg(self) -> None:
169
+ """Create in-memory KG instance.
170
+
171
+ Set as instance variable `self._in_memory_kg`.
173
172
  """
174
- Create deduplicator if not exists and return.
173
+ if not self._in_memory_kg:
174
+ self._in_memory_kg = get_in_memory_kg(
175
+ dbms=self._dbms,
176
+ deduplicator=self._get_deduplicator(),
177
+ )
178
+
179
+ def add_nodes(self, nodes) -> None:
180
+ """Add new nodes to the internal representation.
181
+
182
+ Initially, receive nodes data from adaptor and create internal
183
+ representation for nodes.
184
+
185
+ Args:
186
+ ----
187
+ nodes(iterable): An iterable of nodes
188
+
189
+ """
190
+ if isinstance(nodes, list):
191
+ self._nodes = list(itertools.chain(self._nodes, nodes))
192
+ else:
193
+ self._nodes = itertools.chain(self._nodes, nodes)
194
+
195
+ def add_edges(self, edges) -> None:
196
+ """Add new edges to the internal representation.
197
+
198
+ Initially, receive edges data from adaptor and create internal
199
+ representation for edges.
200
+
201
+ Args:
202
+ ----
203
+ edges(iterable): An iterable of edges.
204
+
175
205
  """
206
+ if isinstance(edges, list):
207
+ self._edges = list(itertools.chain(self._edges, edges))
208
+ else:
209
+ self._edges = itertools.chain(self._edges, edges)
210
+
211
+ def to_df(self):
212
+ """Create DataFrame using internal representation.
213
+
214
+ TODO: to_df implies data frame, should be specifically that use case
215
+ """
216
+ return self._to_KG()
217
+
218
+ def to_networkx(self):
219
+ """Create networkx using internal representation."""
220
+ return self._to_KG()
221
+
222
+ def _to_KG(self):
223
+ """Convert the internal representation to knowledge graph.
224
+
225
+ The knowledge graph is returned based on the `dbms` parameter in
226
+ the biocypher configuration file.
227
+
228
+ Returns
229
+ -------
230
+ Any: knowledge graph.
231
+
232
+ """
233
+ if not self._in_memory_kg:
234
+ self._initialize_in_memory_kg()
235
+ if not self._translator:
236
+ self._get_translator()
237
+ tnodes = self._translator.translate_entities(self._nodes)
238
+ tedges = self._translator.translate_entities(self._edges)
239
+ self._in_memory_kg.add_nodes(tnodes)
240
+ self._in_memory_kg.add_edges(tedges)
241
+ return self._in_memory_kg.get_kg()
176
242
 
243
+ def _get_deduplicator(self) -> Deduplicator:
244
+ """Create deduplicator if not exists and return."""
177
245
  if not self._deduplicator:
178
246
  self._deduplicator = Deduplicator()
179
247
 
180
248
  return self._deduplicator
181
249
 
182
250
  def _get_ontology_mapping(self) -> OntologyMapping:
183
- """
184
- Create ontology mapping if not exists and return.
185
- """
186
-
251
+ """Create ontology mapping if not exists and return."""
187
252
  if not self._schema_config_path:
188
253
  self._ontology_mapping = OntologyMapping()
189
254
 
@@ -195,10 +260,7 @@ class BioCypher:
195
260
  return self._ontology_mapping
196
261
 
197
262
  def _get_ontology(self) -> Ontology:
198
- """
199
- Create ontology if not exists and return.
200
- """
201
-
263
+ """Create ontology if not exists and return."""
202
264
  if not self._ontology:
203
265
  self._ontology = Ontology(
204
266
  ontology_mapping=self._get_ontology_mapping(),
@@ -209,10 +271,7 @@ class BioCypher:
209
271
  return self._ontology
210
272
 
211
273
  def _get_translator(self) -> Translator:
212
- """
213
- Create translator if not exists and return.
214
- """
215
-
274
+ """Create translator if not exists and return."""
216
275
  if not self._translator:
217
276
  self._translator = Translator(
218
277
  ontology=self._get_ontology(),
@@ -222,14 +281,18 @@ class BioCypher:
222
281
  return self._translator
223
282
 
224
283
  def _get_writer(self):
225
- """
226
- Create writer if not online. Set as instance variable `self._writer`.
227
- """
284
+ """Create writer if not online.
228
285
 
286
+ Set as instance variable `self._writer`.
287
+ """
229
288
  if self._offline:
230
- timestamp = lambda: datetime.now().strftime("%Y%m%d%H%M%S")
289
+
290
+ def timestamp() -> str:
291
+ return datetime.now().strftime("%Y%m%d%H%M%S")
292
+
231
293
  outdir = self._output_directory or os.path.join(
232
- "biocypher-out", timestamp()
294
+ "biocypher-out",
295
+ timestamp(),
233
296
  )
234
297
  self._output_directory = os.path.abspath(outdir)
235
298
 
@@ -241,240 +304,270 @@ class BioCypher:
241
304
  strict_mode=self._strict_mode,
242
305
  )
243
306
  else:
244
- raise NotImplementedError("Cannot get writer in online mode.")
307
+ msg = "Cannot get writer in online mode."
308
+ raise NotImplementedError(msg)
309
+
310
+ return self._writer
245
311
 
246
312
  def _get_driver(self):
247
- """
248
- Create driver if not exists. Set as instance variable `self._driver`.
249
- """
313
+ """Create driver if not exists.
250
314
 
315
+ Set as instance variable `self._driver`.
316
+ """
251
317
  if not self._offline:
252
- self._driver = get_driver(
318
+ self._driver = get_connector(
253
319
  dbms=self._dbms,
254
320
  translator=self._get_translator(),
255
- deduplicator=self._get_deduplicator(),
256
321
  )
257
322
  else:
258
- raise NotImplementedError("Cannot get driver in offline mode.")
323
+ msg = "Cannot get driver in offline mode."
324
+ raise NotImplementedError(msg)
259
325
 
260
- def write_nodes(
261
- self, nodes, batch_size: int = int(1e6), force: bool = False
262
- ) -> bool:
326
+ return self._driver
327
+
328
+ def _get_in_memory_kg(self):
329
+ """Create in-memory KG instance.
330
+
331
+ Set as instance variable `self._in_memory_kg`.
263
332
  """
264
- Write nodes to database. Either takes an iterable of tuples (if given,
265
- translates to ``BioCypherNode`` objects) or an iterable of
266
- ``BioCypherNode`` objects.
333
+ if not self._in_memory_kg:
334
+ self._in_memory_kg = get_in_memory_kg(
335
+ dbms=self._dbms,
336
+ deduplicator=self._get_deduplicator(),
337
+ )
267
338
 
268
- Args:
269
- nodes (iterable): An iterable of nodes to write to the database.
339
+ return self._in_memory_kg
270
340
 
271
- batch_size (int): The batch size to use when writing to disk.
341
+ def _add_nodes(
342
+ self,
343
+ nodes,
344
+ batch_size: int = int(1e6),
345
+ force: bool = False,
346
+ ):
347
+ """Add nodes to the BioCypher KG.
272
348
 
273
- force (bool): Whether to force writing to the output directory even
274
- if the node type is not present in the schema config file.
349
+ First uses the `_translator` to translate the nodes to `BioCypherNode`
350
+ objects. Depending on the configuration the translated nodes are then
351
+ passed to the
275
352
 
276
- Returns:
277
- bool: True if successful.
278
- """
353
+ - `_writer`: if `_offline` is set to `True`
279
354
 
280
- if not self._writer:
281
- self._get_writer()
355
+ - `_in_memory_kg`: if `_offline` is set to `False` and the `_dbms` is an
356
+ `IN_MEMORY_DBMS`
282
357
 
283
- nodes = peekable(nodes)
284
- if not isinstance(nodes.peek(), BioCypherNode):
285
- tnodes = self._translator.translate_nodes(nodes)
286
- else:
287
- tnodes = nodes
288
- # write node files
289
- return self._writer.write_nodes(
290
- tnodes, batch_size=batch_size, force=force
291
- )
358
+ - `_driver`: if `_offline` is set to `False` and the `_dbms` is not an
359
+ `IN_MEMORY_DBMS`
292
360
 
293
- def write_edges(self, edges, batch_size: int = int(1e6)) -> bool:
294
361
  """
295
- Write edges to database. Either takes an iterable of tuples (if given,
296
- translates to ``BioCypherEdge`` objects) or an iterable of
297
- ``BioCypherEdge`` objects.
362
+ if not self._translator:
363
+ self._get_translator()
364
+ translated_nodes = self._translator.translate_entities(nodes)
298
365
 
299
- Args:
300
- edges (iterable): An iterable of edges to write to the database.
366
+ if self._offline:
367
+ passed = self._get_writer().write_nodes(
368
+ translated_nodes,
369
+ batch_size=batch_size,
370
+ force=force,
371
+ )
372
+ elif self._is_online_and_in_memory():
373
+ passed = self._get_in_memory_kg().add_nodes(translated_nodes)
374
+ else:
375
+ passed = self._get_driver().add_biocypher_nodes(translated_nodes)
301
376
 
302
- Returns:
303
- bool: True if successful.
304
- """
377
+ return passed
305
378
 
306
- if not self._writer:
307
- self._get_writer()
379
+ def _add_edges(self, edges, batch_size: int = int(1e6)):
380
+ """Add edges to the BioCypher KG.
308
381
 
309
- edges = peekable(edges)
310
- if not isinstance(edges.peek(), BioCypherEdge):
311
- tedges = self._translator.translate_edges(edges)
312
- else:
313
- tedges = edges
314
- # write edge files
315
- return self._writer.write_edges(tedges, batch_size=batch_size)
382
+ First uses the `_translator` to translate the edges to `BioCypherEdge`
383
+ objects. Depending on the configuration the translated edges are then
384
+ passed to the
316
385
 
317
- def to_df(self) -> list[pd.DataFrame]:
318
- """
319
- Convert entities to a pandas DataFrame for each entity type and return
320
- a list.
386
+ - `_writer`: if `_offline` is set to `True`
321
387
 
322
- Args:
323
- entities (iterable): An iterable of entities to convert to a
324
- DataFrame.
388
+ - `_in_memory_kg`: if `_offline` is set to `False` and the `_dbms` is an
389
+ `IN_MEMORY_DBMS`
390
+
391
+ - `_driver`: if `_offline` is set to `False` and the `_dbms` is not an
392
+ `IN_MEMORY_DBMS`
325
393
 
326
- Returns:
327
- pd.DataFrame: A pandas DataFrame.
328
394
  """
329
- if not self._pd:
330
- raise ValueError(
331
- "No pandas instance found. Please call `add()` first."
395
+ if not self._translator:
396
+ self._get_translator()
397
+ translated_edges = self._translator.translate_entities(edges)
398
+
399
+ if self._offline:
400
+ if not self._writer:
401
+ self._initialize_writer()
402
+ passed = self._writer.write_edges(
403
+ translated_edges,
404
+ batch_size=batch_size,
332
405
  )
406
+ elif self._is_online_and_in_memory():
407
+ if not self._in_memory_kg:
408
+ self._initialize_in_memory_kg()
409
+ passed = self._in_memory_kg.add_edges(translated_edges)
410
+ else:
411
+ if not self._driver:
412
+ self._initialize_driver()
413
+ passed = self._driver.add_biocypher_nodes(translated_edges)
333
414
 
334
- return self._pd.dfs
415
+ return passed
335
416
 
336
- def add(self, entities) -> None:
337
- """
338
- Function to add entities to the in-memory database. Accepts an iterable
339
- of tuples (if given, translates to ``BioCypherNode`` or
340
- ``BioCypherEdge`` objects) or an iterable of ``BioCypherNode`` or
341
- ``BioCypherEdge`` objects.
417
+ def _is_online_and_in_memory(self) -> bool:
418
+ """Return True if in online mode and in-memory dbms is used."""
419
+ return (not self._offline) & (self._dbms in IN_MEMORY_DBMS)
420
+
421
+ def write_nodes(
422
+ self,
423
+ nodes,
424
+ batch_size: int = int(1e6),
425
+ force: bool = False,
426
+ ) -> bool:
427
+ """Write nodes to database.
428
+
429
+ Either takes an iterable of tuples (if given, translates to
430
+ ``BioCypherNode`` objects) or an iterable of ``BioCypherNode`` objects.
342
431
 
343
432
  Args:
344
- entities (iterable): An iterable of entities to add to the database.
345
- Can be 3-tuples (nodes) or 5-tuples (edges); also accepts
346
- 4-tuples for edges (deprecated).
433
+ ----
434
+ nodes (iterable): An iterable of nodes to write to the database.
435
+ batch_size (int): The batch size to use when writing to disk.
436
+ force (bool): Whether to force writing to the output directory even
437
+ if the node type is not present in the schema config file.
347
438
 
348
439
  Returns:
349
- None
350
- """
351
- if not self._pd:
352
- self._pd = Pandas(
353
- translator=self._get_translator(),
354
- deduplicator=self._get_deduplicator(),
355
- )
356
-
357
- entities = peekable(entities)
440
+ -------
441
+ bool: True if successful.
358
442
 
359
- if (
360
- isinstance(entities.peek(), BioCypherNode)
361
- or isinstance(entities.peek(), BioCypherEdge)
362
- or isinstance(entities.peek(), BioCypherRelAsNode)
363
- ):
364
- tentities = entities
365
- elif len(entities.peek()) < 4:
366
- tentities = self._translator.translate_nodes(entities)
367
- else:
368
- tentities = self._translator.translate_edges(entities)
443
+ """
444
+ return self._add_nodes(nodes, batch_size=batch_size, force=force)
369
445
 
370
- self._pd.add_tables(tentities)
446
+ def write_edges(self, edges, batch_size: int = int(1e6)) -> bool:
447
+ """Write edges to database.
371
448
 
372
- def add_nodes(self, nodes) -> None:
373
- """
374
- Wrapper for ``add()`` to add nodes to the in-memory database.
449
+ Either takes an iterable of tuples (if given, translates to
450
+ ``BioCypherEdge`` objects) or an iterable of ``BioCypherEdge`` objects.
375
451
 
376
452
  Args:
377
- nodes (iterable): An iterable of node tuples to add to the database.
453
+ ----
454
+ edges (iterable): An iterable of edges to write to the database.
378
455
 
379
456
  Returns:
380
- None
381
- """
382
- self.add(nodes)
457
+ -------
458
+ bool: True if successful.
383
459
 
384
- def add_edges(self, edges) -> None:
385
460
  """
386
- Wrapper for ``add()`` to add edges to the in-memory database.
461
+ return self._add_edges(edges, batch_size=batch_size)
462
+
463
+ def add(self, entities) -> None:
464
+ """Add entities to the in-memory database.
465
+
466
+ Accepts an iterable of tuples (if given, translates to
467
+ ``BioCypherNode`` or ``BioCypherEdge`` objects) or an iterable of
468
+ ``BioCypherNode`` or ``BioCypherEdge`` objects.
387
469
 
388
470
  Args:
389
- edges (iterable): An iterable of edge tuples to add to the database.
471
+ ----
472
+ entities (iterable): An iterable of entities to add to the database.
473
+ Can be 3-tuples (nodes) or 5-tuples (edges); also accepts
474
+ 4-tuples for edges (deprecated).
390
475
 
391
476
  Returns:
477
+ -------
392
478
  None
479
+
393
480
  """
394
- self.add(edges)
481
+ return self._add_nodes(entities)
395
482
 
396
483
  def merge_nodes(self, nodes) -> bool:
397
- """
398
- Merge nodes into database. Either takes an iterable of tuples (if given,
399
- translates to ``BioCypherNode`` objects) or an iterable of
400
- ``BioCypherNode`` objects.
484
+ """Merge nodes into database.
485
+
486
+ Either takes an iterable of tuples (if given, translates to
487
+ ``BioCypherNode`` objects) or an iterable of ``BioCypherNode`` objects.
401
488
 
402
489
  Args:
490
+ ----
403
491
  nodes (iterable): An iterable of nodes to merge into the database.
404
492
 
405
493
  Returns:
494
+ -------
406
495
  bool: True if successful.
407
- """
408
-
409
- if not self._driver:
410
- self._get_driver()
411
496
 
412
- nodes = peekable(nodes)
413
- if not isinstance(nodes.peek(), BioCypherNode):
414
- tnodes = self._translator.translate_nodes(nodes)
415
- else:
416
- tnodes = nodes
417
- # write node files
418
- return self._driver.add_biocypher_nodes(tnodes)
497
+ """
498
+ return self._add_nodes(nodes)
419
499
 
420
500
  def merge_edges(self, edges) -> bool:
421
- """
422
- Merge edges into database. Either takes an iterable of tuples (if given,
423
- translates to ``BioCypherEdge`` objects) or an iterable of
424
- ``BioCypherEdge`` objects.
501
+ """Merge edges into database.
502
+
503
+ Either takes an iterable of tuples (if given, translates to
504
+ ``BioCypherEdge`` objects) or an iterable of ``BioCypherEdge`` objects.
425
505
 
426
506
  Args:
507
+ ----
427
508
  edges (iterable): An iterable of edges to merge into the database.
428
509
 
429
510
  Returns:
511
+ -------
430
512
  bool: True if successful.
513
+
431
514
  """
515
+ return self._add_edges(edges)
432
516
 
433
- if not self._driver:
434
- self._get_driver()
517
+ def get_kg(self):
518
+ """Get the in-memory KG instance.
435
519
 
436
- edges = peekable(edges)
437
- if not isinstance(edges.peek(), BioCypherEdge):
438
- tedges = self._translator.translate_edges(edges)
439
- else:
440
- tedges = edges
441
- # write edge files
442
- return self._driver.add_biocypher_edges(tedges)
520
+ Depending on the specified `dbms` this could either be a list of Pandas
521
+ dataframes or a NetworkX DiGraph.
522
+ """
523
+ if not self._is_online_and_in_memory():
524
+ msg = (f"Getting the in-memory KG is only available in online mode for {IN_MEMORY_DBMS}.",)
525
+ raise ValueError(msg)
526
+ if not self._in_memory_kg:
527
+ msg = "No in-memory KG instance found. Please call `add()` first."
528
+ raise ValueError(msg)
443
529
 
444
- # DOWNLOAD AND CACHE MANAGEMENT METHODS ###
530
+ if not self._in_memory_kg:
531
+ self._initialize_in_memory_kg()
532
+ return self._in_memory_kg.get_kg()
445
533
 
446
- def _get_downloader(self, cache_dir: Optional[str] = None):
447
- """
448
- Create downloader if not exists.
449
- """
534
+ # DOWNLOAD AND CACHE MANAGEMENT METHODS ###
450
535
 
536
+ def _get_downloader(self, cache_dir: str | None = None):
537
+ """Create downloader if not exists."""
451
538
  if not self._downloader:
452
539
  self._downloader = Downloader(self._cache_directory)
453
540
 
454
541
  def download(self, *resources) -> None:
455
- """
456
- Use the :class:`Downloader` class to download or load from cache the
457
- resources given by the adapter.
458
- """
542
+ """Download or load from cache the resources given by the adapter.
459
543
 
544
+ Args:
545
+ ----
546
+ resources (iterable): An iterable of resources to download or load
547
+ from cache.
548
+
549
+ Returns:
550
+ -------
551
+ None
552
+
553
+ """
460
554
  self._get_downloader()
461
555
  return self._downloader.download(*resources)
462
556
 
463
557
  # OVERVIEW AND CONVENIENCE METHODS ###
464
558
 
465
- def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]:
466
- """
559
+ def log_missing_input_labels(self) -> dict[str, list[str]] | None:
560
+ """Log missing input labels.
467
561
 
468
562
  Get the set of input labels encountered without an entry in the
469
563
  `schema_config.yaml` and print them to the logger.
470
564
 
471
- Returns:
472
-
565
+ Returns
566
+ -------
473
567
  Optional[Dict[str, List[str]]]: A dictionary of Biolink types
474
568
  encountered without an entry in the `schema_config.yaml` file.
475
569
 
476
570
  """
477
-
478
571
  mt = self._translator.get_missing_biolink_types()
479
572
 
480
573
  if mt:
@@ -495,11 +588,11 @@ class BioCypher:
495
588
  return None
496
589
 
497
590
  def log_duplicates(self) -> None:
498
- """
591
+ """Log duplicate nodes and edges.
592
+
499
593
  Get the set of duplicate nodes and edges encountered and print them to
500
594
  the logger.
501
595
  """
502
-
503
596
  dn = self._deduplicator.get_duplicate_nodes()
504
597
 
505
598
  if dn:
@@ -543,11 +636,10 @@ class BioCypher:
543
636
  logger.info("No duplicate edges in input.")
544
637
 
545
638
  def show_ontology_structure(self, **kwargs) -> None:
546
- """
547
- Show the ontology structure using treelib or write to GRAPHML file.
639
+ """Show the ontology structure using treelib or write to GRAPHML file.
548
640
 
549
641
  Args:
550
-
642
+ ----
551
643
  to_disk (str): If specified, the ontology structure will be saved
552
644
  to disk as a GRAPHML file, to be opened in your favourite
553
645
  graph visualisation tool.
@@ -555,36 +647,37 @@ class BioCypher:
555
647
  full (bool): If True, the full ontology structure will be shown,
556
648
  including all nodes and edges. If False, only the nodes and
557
649
  edges that are relevant to the extended schema will be shown.
558
- """
559
650
 
651
+ """
560
652
  if not self._ontology:
561
653
  self._get_ontology()
562
654
 
563
655
  return self._ontology.show_ontology_structure(**kwargs)
564
656
 
565
657
  def write_import_call(self) -> str:
566
- """
567
- Write a shell script to import the database depending on the chosen
568
- DBMS.
658
+ """Write a shell script to import the database.
569
659
 
570
- Returns:
660
+ Shell script is written depending on the chosen DBMS.
661
+
662
+ Returns
663
+ -------
571
664
  str: path toward the file holding the import call.
572
- """
573
665
 
666
+ """
574
667
  if not self._offline:
575
- raise NotImplementedError(
576
- "Cannot write import call in online mode."
577
- )
668
+ msg = "Cannot write import call in online mode."
669
+ raise NotImplementedError(msg)
578
670
 
579
671
  return self._writer.write_import_call()
580
672
 
581
673
  def write_schema_info(self, as_node: bool = False) -> None:
582
- """
583
- Write an extended schema info YAML file that extends the
584
- `schema_config.yaml` with run-time information of the built KG. For
585
- instance, include information on whether something present in the actual
586
- knowledge graph, whether it is a relationship (which is important in the
587
- case of representing relationships as nodes) and the actual sources and
674
+ """Write an extended schema info to file or node.
675
+
676
+ Creates a YAML file or KG node that extends the `schema_config.yaml`
677
+ with run-time information of the built KG. For instance, include
678
+ information on whether something present in the actual knowledge graph,
679
+ whether it is a relationship (which is important in the case of
680
+ representing relationships as nodes) and the actual sources and
588
681
  targets of edges. Since this file can be used in place of the original
589
682
  `schema_config.yaml` file, it indicates that it is the extended schema
590
683
  by setting `is_schema_info` to `true`.
@@ -594,12 +687,16 @@ class BioCypher:
594
687
  The information of whether something is a relationship can be gathered
595
688
  from the deduplicator instance, which keeps track of all entities that
596
689
  have been seen.
597
- """
598
690
 
599
- if not self._offline:
600
- raise NotImplementedError(
601
- "Cannot write schema info in online mode."
602
- )
691
+ Args:
692
+ ----
693
+ as_node (bool): If True, the schema info is written as a KG node.
694
+ If False, the schema info is written to a YAML file.
695
+
696
+ """
697
+ if (not self._offline) and self._dbms not in IN_MEMORY_DBMS:
698
+ msg = "Cannot write schema info in online mode."
699
+ raise NotImplementedError(msg)
603
700
 
604
701
  ontology = self._get_ontology()
605
702
  schema = ontology.mapping.extended_schema.copy()
@@ -607,13 +704,12 @@ class BioCypher:
607
704
 
608
705
  deduplicator = self._get_deduplicator()
609
706
  for node in deduplicator.entity_types:
610
- if node in schema.keys():
707
+ if node in schema:
611
708
  schema[node]["present_in_knowledge_graph"] = True
612
709
  schema[node]["is_relationship"] = False
613
710
  else:
614
711
  logger.info(
615
- f"Node {node} not present in extended schema. "
616
- "Skipping schema info."
712
+ f"Node {node} not present in extended schema. Skipping schema info.",
617
713
  )
618
714
 
619
715
  # find 'label_as_edge' cases in schema entries
@@ -621,21 +717,20 @@ class BioCypher:
621
717
  for k, v in schema.items():
622
718
  if not isinstance(v, dict):
623
719
  continue
624
- if "label_as_edge" in v.keys():
625
- if v["label_as_edge"] in deduplicator.seen_relationships.keys():
720
+ if "label_as_edge" in v:
721
+ if v["label_as_edge"] in deduplicator.seen_relationships:
626
722
  changed_labels[v["label_as_edge"]] = k
627
723
 
628
- for edge in deduplicator.seen_relationships.keys():
629
- if edge in changed_labels.keys():
724
+ for edge in deduplicator.seen_relationships:
725
+ if edge in changed_labels:
630
726
  edge = changed_labels[edge]
631
- if edge in schema.keys():
727
+ if edge in schema:
632
728
  schema[edge]["present_in_knowledge_graph"] = True
633
729
  schema[edge]["is_relationship"] = True
634
730
  # TODO information about source and target nodes
635
731
  else:
636
732
  logger.info(
637
- f"Edge {edge} not present in extended schema. "
638
- "Skipping schema info."
733
+ f"Edge {edge} not present in extended schema. Skipping schema info.",
639
734
  )
640
735
 
641
736
  # write to output directory as YAML file
@@ -660,74 +755,77 @@ class BioCypher:
660
755
  # TRANSLATION METHODS ###
661
756
 
662
757
  def translate_term(self, term: str) -> str:
663
- """
664
- Translate a term to its BioCypher equivalent.
758
+ """Translate a term to its BioCypher equivalent.
665
759
 
666
760
  Args:
761
+ ----
667
762
  term (str): The term to translate.
668
763
 
669
764
  Returns:
765
+ -------
670
766
  str: The BioCypher equivalent of the term.
671
- """
672
767
 
768
+ """
673
769
  # instantiate adapter if not exists
674
770
  self.start_ontology()
675
771
 
676
772
  return self._translator.translate_term(term)
677
773
 
678
774
  def summary(self) -> None:
679
- """
680
- Wrapper for showing ontology structure and logging duplicates and
681
- missing input types.
682
- """
775
+ """Call convenience and reporting methods.
683
776
 
777
+ Shows ontology structure and logs duplicates and missing input types.
778
+ """
684
779
  self.show_ontology_structure()
685
780
  self.log_duplicates()
686
781
  self.log_missing_input_labels()
687
782
 
688
783
  def reverse_translate_term(self, term: str) -> str:
689
- """
690
- Reverse translate a term from its BioCypher equivalent.
784
+ """Reverse translate a term from its BioCypher equivalent.
691
785
 
692
786
  Args:
787
+ ----
693
788
  term (str): The BioCypher term to reverse translate.
694
789
 
695
790
  Returns:
791
+ -------
696
792
  str: The original term.
697
- """
698
793
 
794
+ """
699
795
  # instantiate adapter if not exists
700
796
  self.start_ontology()
701
797
 
702
798
  return self._translator.reverse_translate_term(term)
703
799
 
704
800
  def translate_query(self, query: str) -> str:
705
- """
706
- Translate a query to its BioCypher equivalent.
801
+ """Translate a query to its BioCypher equivalent.
707
802
 
708
803
  Args:
804
+ ----
709
805
  query (str): The query to translate.
710
806
 
711
807
  Returns:
808
+ -------
712
809
  str: The BioCypher equivalent of the query.
713
- """
714
810
 
811
+ """
715
812
  # instantiate adapter if not exists
716
813
  self.start_ontology()
717
814
 
718
815
  return self._translator.translate(query)
719
816
 
720
817
  def reverse_translate_query(self, query: str) -> str:
721
- """
722
- Reverse translate a query from its BioCypher equivalent.
818
+ """Reverse translate a query from its BioCypher equivalent.
723
819
 
724
820
  Args:
821
+ ----
725
822
  query (str): The BioCypher query to reverse translate.
726
823
 
727
824
  Returns:
825
+ -------
728
826
  str: The original query.
729
- """
730
827
 
828
+ """
731
829
  # instantiate adapter if not exists
732
830
  self.start_ontology()
733
831