biocypher-0.5.19-py3-none-any.whl → biocypher-0.5.21-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


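This version of biocypher has been flagged as potentially problematic. More details are available on the release's page in the package registry (the original "Click here" link target was not preserved in this text).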

biocypher/_connect.py CHANGED
@@ -53,8 +53,6 @@ class _Neo4jDriver:
53
53
 
54
54
  increment_version (bool): Whether to increment the version number.
55
55
 
56
- ontology (Ontology): The ontology to use for mapping.
57
-
58
56
  translator (Translator): The translator to use for mapping.
59
57
 
60
58
  """
@@ -66,14 +64,12 @@ class _Neo4jDriver:
66
64
  user: str,
67
65
  password: str,
68
66
  multi_db: bool,
69
- ontology: Ontology,
70
67
  translator: Translator,
71
68
  wipe: bool = False,
72
69
  fetch_size: int = 1000,
73
70
  increment_version: bool = True,
74
71
  ):
75
- self._ontology = ontology
76
- self._translator = translator
72
+ self.translator = translator
77
73
 
78
74
  self._driver = neo4j_utils.Driver(
79
75
  db_name=database_name,
@@ -103,7 +99,7 @@ class _Neo4jDriver:
103
99
  "MATCH (v:BioCypher) " "WHERE NOT (v)-[:PRECEDES]->() " "RETURN v",
104
100
  )
105
101
  # add version node
106
- self.add_biocypher_nodes(self._ontology)
102
+ self.add_biocypher_nodes(self.translator.ontology)
107
103
 
108
104
  # connect version node to previous
109
105
  if db_version[0]:
@@ -111,7 +107,7 @@ class _Neo4jDriver:
111
107
  previous_id = previous["v"]["id"]
112
108
  e_meta = BioCypherEdge(
113
109
  previous_id,
114
- self._ontology.get_dict().get("node_id"),
110
+ self.translator.ontology.get_dict().get("node_id"),
115
111
  "PRECEDES",
116
112
  )
117
113
  self.add_biocypher_edges(e_meta)
@@ -142,7 +138,7 @@ class _Neo4jDriver:
142
138
  logger.info("Creating constraints for node types in config.")
143
139
 
144
140
  # get structure
145
- for leaf in self._ontology.extended_schema.items():
141
+ for leaf in self.translator.ontology.mapping.extended_schema.items():
146
142
  label = _misc.sentencecase_to_pascalcase(leaf[0])
147
143
  if leaf[1]["represented_as"] == "node":
148
144
  s = (
@@ -172,7 +168,7 @@ class _Neo4jDriver:
172
168
  - second entry: Neo4j summary.
173
169
  """
174
170
 
175
- bn = self._translator.translate_nodes(id_type_tuples)
171
+ bn = self.translator.translate_nodes(id_type_tuples)
176
172
  return self.add_biocypher_nodes(bn)
177
173
 
178
174
  def add_edges(self, id_src_tar_type_tuples: Iterable[tuple]) -> tuple:
@@ -204,7 +200,7 @@ class _Neo4jDriver:
204
200
  - second entry: Neo4j summary.
205
201
  """
206
202
 
207
- bn = self._translator.translate_edges(id_src_tar_type_tuples)
203
+ bn = self.translator.translate_edges(id_src_tar_type_tuples)
208
204
  return self.add_biocypher_edges(bn)
209
205
 
210
206
  def add_biocypher_nodes(
@@ -375,7 +371,6 @@ class _Neo4jDriver:
375
371
  def get_driver(
376
372
  dbms: str,
377
373
  translator: "Translator",
378
- ontology: "Ontology",
379
374
  ):
380
375
  """
381
376
  Function to return the writer class.
@@ -394,7 +389,6 @@ def get_driver(
394
389
  user=dbms_config["user"],
395
390
  password=dbms_config["password"],
396
391
  multi_db=dbms_config["multi_db"],
397
- ontology=ontology,
398
392
  translator=translator,
399
393
  )
400
394
 
biocypher/_core.py CHANGED
@@ -13,8 +13,10 @@ BioCypher core module. Interfaces with the user and distributes tasks to
13
13
  submodules.
14
14
  """
15
15
  from typing import Optional
16
+ import os
16
17
 
17
18
  from more_itertools import peekable
19
+ import yaml
18
20
 
19
21
  import pandas as pd
20
22
 
@@ -22,10 +24,11 @@ from ._logger import logger
22
24
 
23
25
  logger.debug(f"Loading module {__name__}.")
24
26
 
27
+ from ._get import Downloader
25
28
  from ._write import get_writer
26
29
  from ._config import config as _config
27
30
  from ._config import update_from_file as _file_update
28
- from ._create import BioCypherEdge, BioCypherNode
31
+ from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
29
32
  from ._pandas import Pandas
30
33
  from ._connect import get_driver
31
34
  from ._mapping import OntologyMapping
@@ -181,19 +184,6 @@ class BioCypher:
181
184
 
182
185
  return self._ontology_mapping
183
186
 
184
- def _get_translator(self) -> Translator:
185
- """
186
- Create translator if not exists and return.
187
- """
188
-
189
- if not self._translator:
190
- self._translator = Translator(
191
- ontology_mapping=self._get_ontology_mapping(),
192
- strict_mode=self._strict_mode,
193
- )
194
-
195
- return self._translator
196
-
197
187
  def _get_ontology(self) -> Ontology:
198
188
  """
199
189
  Create ontology if not exists and return.
@@ -208,17 +198,28 @@ class BioCypher:
208
198
 
209
199
  return self._ontology
210
200
 
201
+ def _get_translator(self) -> Translator:
202
+ """
203
+ Create translator if not exists and return.
204
+ """
205
+
206
+ if not self._translator:
207
+ self._translator = Translator(
208
+ ontology=self._get_ontology(),
209
+ strict_mode=self._strict_mode,
210
+ )
211
+
212
+ return self._translator
213
+
211
214
  def _get_writer(self):
212
215
  """
213
216
  Create writer if not online. Set as instance variable `self._writer`.
214
217
  """
215
218
 
216
- # Get worker
217
219
  if self._offline:
218
220
  self._writer = get_writer(
219
221
  dbms=self._dbms,
220
222
  translator=self._get_translator(),
221
- ontology=self._get_ontology(),
222
223
  deduplicator=self._get_deduplicator(),
223
224
  output_directory=self._output_directory,
224
225
  strict_mode=self._strict_mode,
@@ -235,7 +236,6 @@ class BioCypher:
235
236
  self._driver = get_driver(
236
237
  dbms=self._dbms,
237
238
  translator=self._get_translator(),
238
- ontology=self._get_ontology(),
239
239
  deduplicator=self._get_deduplicator(),
240
240
  )
241
241
  else:
@@ -308,24 +308,33 @@ class BioCypher:
308
308
 
309
309
  return self._pd.dfs
310
310
 
311
- def add(self, entities):
311
+ def add(self, entities) -> None:
312
312
  """
313
313
  Function to add entities to the in-memory database. Accepts an iterable
314
314
  of tuples (if given, translates to ``BioCypherNode`` or
315
315
  ``BioCypherEdge`` objects) or an iterable of ``BioCypherNode`` or
316
316
  ``BioCypherEdge`` objects.
317
+
318
+ Args:
319
+ entities (iterable): An iterable of entities to add to the database.
320
+ Can be 3-tuples (nodes) or 5-tuples (edges); also accepts
321
+ 4-tuples for edges (deprecated).
322
+
323
+ Returns:
324
+ None
317
325
  """
318
326
  if not self._pd:
319
327
  self._pd = Pandas(
320
328
  translator=self._get_translator(),
321
- ontology=self._get_ontology(),
322
329
  deduplicator=self._get_deduplicator(),
323
330
  )
324
331
 
325
332
  entities = peekable(entities)
326
333
 
327
- if isinstance(entities.peek(), BioCypherNode) or isinstance(
328
- entities.peek(), BioCypherEdge
334
+ if (
335
+ isinstance(entities.peek(), BioCypherNode)
336
+ or isinstance(entities.peek(), BioCypherEdge)
337
+ or isinstance(entities.peek(), BioCypherRelAsNode)
329
338
  ):
330
339
  tentities = entities
331
340
  elif len(entities.peek()) < 4:
@@ -335,10 +344,28 @@ class BioCypher:
335
344
 
336
345
  self._pd.add_tables(tentities)
337
346
 
338
- def add_nodes(self, nodes):
347
+ def add_nodes(self, nodes) -> None:
348
+ """
349
+ Wrapper for ``add()`` to add nodes to the in-memory database.
350
+
351
+ Args:
352
+ nodes (iterable): An iterable of node tuples to add to the database.
353
+
354
+ Returns:
355
+ None
356
+ """
339
357
  self.add(nodes)
340
358
 
341
- def add_edges(self, edges):
359
+ def add_edges(self, edges) -> None:
360
+ """
361
+ Wrapper for ``add()`` to add edges to the in-memory database.
362
+
363
+ Args:
364
+ edges (iterable): An iterable of edge tuples to add to the database.
365
+
366
+ Returns:
367
+ None
368
+ """
342
369
  self.add(edges)
343
370
 
344
371
  def merge_nodes(self, nodes) -> bool:
@@ -389,6 +416,24 @@ class BioCypher:
389
416
  # write edge files
390
417
  return self._driver.add_biocypher_edges(tedges)
391
418
 
419
+ # DOWNLOAD AND CACHE MANAGEMENT METHODS ###
420
+
421
+ def _get_downloader(self):
422
+ """
423
+ Create downloader if not exists.
424
+ """
425
+
426
+ if not self._downloader:
427
+ self._downloader = Downloader()
428
+
429
+ def download(self, force: bool = False) -> None:
430
+ """
431
+ Use the :class:`Downloader` class to download or load from cache the
432
+ resources given by the adapter.
433
+ """
434
+
435
+ self._get_downloader()
436
+
392
437
  # OVERVIEW AND CONVENIENCE METHODS ###
393
438
 
394
439
  def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]:
@@ -504,6 +549,73 @@ class BioCypher:
504
549
 
505
550
  self._writer.write_import_call()
506
551
 
552
+ def write_schema_info(self) -> None:
553
+ """
554
+ Write an extended schema info YAML file that extends the
555
+ `schema_config.yaml` with run-time information of the built KG. For
556
+ instance, include information on whether something present in the actual
557
+ knowledge graph, whether it is a relationship (which is important in the
558
+ case of representing relationships as nodes) and the actual sources and
559
+ targets of edges. Since this file can be used in place of the original
560
+ `schema_config.yaml` file, it indicates that it is the extended schema
561
+ by setting `is_schema_info` to `true`.
562
+
563
+ We start by using the `extended_schema` dictionary from the ontology
564
+ class instance, which contains all expanded entities and relationships.
565
+ The information of whether something is a relationship can be gathered
566
+ from the deduplicator instance, which keeps track of all entities that
567
+ have been seen.
568
+ """
569
+
570
+ if not self._offline:
571
+ raise NotImplementedError(
572
+ "Cannot write schema info in online mode."
573
+ )
574
+
575
+ ontology = self._get_ontology()
576
+ schema = ontology.mapping.extended_schema
577
+ schema["is_schema_info"] = True
578
+
579
+ deduplicator = self._get_deduplicator()
580
+ for node in deduplicator.entity_types:
581
+ if node in schema.keys():
582
+ schema[node]["present_in_knowledge_graph"] = True
583
+ schema[node]["is_relationship"] = False
584
+ else:
585
+ logger.info(
586
+ f"Node {node} not present in extended schema. "
587
+ "Skipping schema info."
588
+ )
589
+
590
+ # find 'label_as_edge' cases in schema entries
591
+ changed_labels = {}
592
+ for k, v in schema.items():
593
+ if not isinstance(v, dict):
594
+ continue
595
+ if "label_as_edge" in v.keys():
596
+ if v["label_as_edge"] in deduplicator.seen_relationships.keys():
597
+ changed_labels[v["label_as_edge"]] = k
598
+
599
+ for edge in deduplicator.seen_relationships.keys():
600
+ if edge in changed_labels.keys():
601
+ edge = changed_labels[edge]
602
+ if edge in schema.keys():
603
+ schema[edge]["present_in_knowledge_graph"] = True
604
+ schema[edge]["is_relationship"] = True
605
+ # TODO information about source and target nodes
606
+ else:
607
+ logger.info(
608
+ f"Edge {edge} not present in extended schema. "
609
+ "Skipping schema info."
610
+ )
611
+
612
+ # write to output directory as YAML file
613
+ path = os.path.join(self._output_directory, "schema_info.yaml")
614
+ with open(path, "w") as f:
615
+ f.write(yaml.dump(schema))
616
+
617
+ return schema
618
+
507
619
  # TRANSLATION METHODS ###
508
620
 
509
621
  def translate_term(self, term: str) -> str:
biocypher/_deduplicate.py CHANGED
@@ -2,7 +2,7 @@ from ._logger import logger
2
2
 
3
3
  logger.debug(f"Loading module {__name__}.")
4
4
 
5
- from ._create import BioCypherEdge, BioCypherNode
5
+ from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
6
6
 
7
7
 
8
8
  class Deduplicator:
@@ -19,15 +19,17 @@ class Deduplicator:
19
19
  """
20
20
 
21
21
  def __init__(self):
22
- self.seen_node_ids = set()
23
- self.duplicate_node_ids = set()
24
- self.duplicate_node_types = set()
22
+ self.seen_entity_ids = set()
23
+ self.duplicate_entity_ids = set()
25
24
 
26
- self.seen_edges = {}
27
- self.duplicate_edge_ids = set()
28
- self.duplicate_edge_types = set()
25
+ self.entity_types = set()
26
+ self.duplicate_entity_types = set()
29
27
 
30
- def node_seen(self, node: BioCypherNode) -> bool:
28
+ self.seen_relationships = {}
29
+ self.duplicate_relationship_ids = set()
30
+ self.duplicate_relationship_types = set()
31
+
32
+ def node_seen(self, entity: BioCypherNode) -> bool:
31
33
  """
32
34
  Adds a node to the instance and checks if it has been seen before.
33
35
 
@@ -37,19 +39,22 @@ class Deduplicator:
37
39
  Returns:
38
40
  True if the node has been seen before, False otherwise.
39
41
  """
40
- if node.get_id() in self.seen_node_ids:
41
- self.duplicate_node_ids.add(node.get_id())
42
- if node.get_label() not in self.duplicate_node_types:
42
+ if entity.get_label() not in self.entity_types:
43
+ self.entity_types.add(entity.get_label())
44
+
45
+ if entity.get_id() in self.seen_entity_ids:
46
+ self.duplicate_entity_ids.add(entity.get_id())
47
+ if entity.get_label() not in self.duplicate_entity_types:
43
48
  logger.warning(
44
- f"Duplicate node type {node.get_label()} found. "
49
+ f"Duplicate node type {entity.get_label()} found. "
45
50
  )
46
- self.duplicate_node_types.add(node.get_label())
51
+ self.duplicate_entity_types.add(entity.get_label())
47
52
  return True
48
53
 
49
- self.seen_node_ids.add(node.get_id())
54
+ self.seen_entity_ids.add(entity.get_id())
50
55
  return False
51
56
 
52
- def edge_seen(self, edge: BioCypherEdge) -> bool:
57
+ def edge_seen(self, relationship: BioCypherEdge) -> bool:
53
58
  """
54
59
  Adds an edge to the instance and checks if it has been seen before.
55
60
 
@@ -59,23 +64,57 @@ class Deduplicator:
59
64
  Returns:
60
65
  True if the edge has been seen before, False otherwise.
61
66
  """
62
- if edge.get_type() not in self.seen_edges:
63
- self.seen_edges[edge.get_type()] = set()
67
+ if relationship.get_type() not in self.seen_relationships:
68
+ self.seen_relationships[relationship.get_type()] = set()
64
69
 
65
70
  # concatenate source and target if no id is present
66
- if not edge.get_id():
67
- _id = f"{edge.get_source_id()}_{edge.get_target_id()}"
71
+ if not relationship.get_id():
72
+ _id = (
73
+ f"{relationship.get_source_id()}_{relationship.get_target_id()}"
74
+ )
68
75
  else:
69
- _id = edge.get_id()
76
+ _id = relationship.get_id()
77
+
78
+ if _id in self.seen_relationships[relationship.get_type()]:
79
+ self.duplicate_relationship_ids.add(_id)
80
+ if relationship.get_type() not in self.duplicate_relationship_types:
81
+ logger.warning(
82
+ f"Duplicate edge type {relationship.get_type()} found. "
83
+ )
84
+ self.duplicate_relationship_types.add(relationship.get_type())
85
+ return True
86
+
87
+ self.seen_relationships[relationship.get_type()].add(_id)
88
+ return False
89
+
90
+ def rel_as_node_seen(self, rel_as_node: BioCypherRelAsNode) -> bool:
91
+ """
92
+ Adds a rel_as_node to the instance (one entity and two relationships)
93
+ and checks if it has been seen before. Only the node is relevant for
94
+ identifying the rel_as_node as a duplicate.
95
+
96
+ Args:
97
+ rel_as_node: BioCypherRelAsNode to be added.
98
+
99
+ Returns:
100
+ True if the rel_as_node has been seen before, False otherwise.
101
+ """
102
+ node = rel_as_node.get_node()
103
+
104
+ if node.get_label() not in self.seen_relationships:
105
+ self.seen_relationships[node.get_label()] = set()
106
+
107
+ # rel as node always has an id
108
+ _id = node.get_id()
70
109
 
71
- if _id in self.seen_edges[edge.get_type()]:
72
- self.duplicate_edge_ids.add(_id)
73
- if edge.get_type() not in self.duplicate_edge_types:
74
- logger.warning(f"Duplicate edge type {edge.get_type()} found. ")
75
- self.duplicate_edge_types.add(edge.get_type())
110
+ if _id in self.seen_relationships[node.get_type()]:
111
+ self.duplicate_relationship_ids.add(_id)
112
+ if node.get_type() not in self.duplicate_relationship_types:
113
+ logger.warning(f"Duplicate edge type {node.get_type()} found. ")
114
+ self.duplicate_relationship_types.add(node.get_type())
76
115
  return True
77
116
 
78
- self.seen_edges[edge.get_type()].add(_id)
117
+ self.seen_relationships[node.get_type()].add(_id)
79
118
  return False
80
119
 
81
120
  def get_duplicate_nodes(self):
@@ -86,8 +125,8 @@ class Deduplicator:
86
125
  list: list of duplicate nodes
87
126
  """
88
127
 
89
- if self.duplicate_node_types:
90
- return (self.duplicate_node_types, self.duplicate_node_ids)
128
+ if self.duplicate_entity_types:
129
+ return (self.duplicate_entity_types, self.duplicate_entity_ids)
91
130
  else:
92
131
  return None
93
132
 
@@ -99,7 +138,10 @@ class Deduplicator:
99
138
  list: list of duplicate edges
100
139
  """
101
140
 
102
- if self.duplicate_edge_types:
103
- return (self.duplicate_edge_types, self.duplicate_edge_ids)
141
+ if self.duplicate_relationship_types:
142
+ return (
143
+ self.duplicate_relationship_types,
144
+ self.duplicate_relationship_ids,
145
+ )
104
146
  else:
105
147
  return None