biocypher 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biocypher might be problematic; consult the registry's advisory page for more details.

biocypher/_translate.py CHANGED
@@ -1,31 +1,21 @@
1
- #!/usr/bin/env python
2
-
3
- #
4
- # Copyright 2021, Heidelberg University Clinic
5
- #
6
- # File author(s): Sebastian Lobentanzer
7
- # ...
8
- #
9
- # Distributed under MIT licence, see the file `LICENSE`.
10
- #
11
1
  """
12
2
  BioCypher 'translation' module. Responsible for translating between the raw
13
3
  input data and the BioCypherNode and BioCypherEdge objects.
14
4
  """
15
- from ._logger import logger
16
-
17
- logger.debug(f"Loading module {__name__}.")
18
5
 
19
- from typing import Any, Union, Optional
20
- from collections.abc import Iterable, Generator
6
+ from collections.abc import Generator, Iterable
7
+ from typing import Any, Optional, Union
21
8
 
22
9
  from more_itertools import peekable
23
10
 
24
11
  from . import _misc
25
12
  from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
13
+ from ._logger import logger
26
14
  from ._ontology import Ontology
27
15
 
28
- __all__ = ["BiolinkAdapter", "Translator"]
16
+ logger.debug(f"Loading module {__name__}.")
17
+
18
+ __all__ = ["Translator"]
29
19
 
30
20
 
31
21
  class Translator:
@@ -67,6 +57,20 @@ class Translator:
67
57
 
68
58
  self._update_ontology_types()
69
59
 
60
def translate_entities(self, entities):
    """Translate a mixed stream of raw tuples or BioCypher objects.

    Peeks at the first element to decide how to handle the stream:
    already-translated BioCypher objects are passed through unchanged;
    otherwise tuples with fewer than four fields are treated as node
    tuples and longer ones as edge tuples (edge tuples carry at least
    four fields — see ``translate_edges``).

    Args:
        entities: Iterable of raw input tuples or BioCypher entities.

    Returns:
        Iterable of translated entities (a generator unless the input
        was already translated).
    """
    entities = peekable(entities)
    # Peek once instead of three times; NOTE: peek() raises on an
    # empty iterable, unchanged from prior behavior.
    first = entities.peek()
    if isinstance(first, (BioCypherNode, BioCypherEdge, BioCypherRelAsNode)):
        # Already translated upstream; pass through untouched.
        return entities
    if len(first) < 4:
        return self.translate_nodes(entities)
    return self.translate_edges(entities)
73
+
70
74
  def translate_nodes(
71
75
  self,
72
76
  node_tuples: Iterable,
@@ -131,8 +135,7 @@ class Translator:
131
135
 
132
136
  return (
133
137
  self.ontology.mapping.extended_schema[_bl_type]["preferred_id"]
134
- if "preferred_id"
135
- in self.ontology.mapping.extended_schema.get(_bl_type, {})
138
+ if "preferred_id" in self.ontology.mapping.extended_schema.get(_bl_type, {})
136
139
  else "id"
137
140
  )
138
141
 
@@ -141,9 +144,7 @@ class Translator:
141
144
  Filters properties for those specified in schema_config if any.
142
145
  """
143
146
 
144
- filter_props = self.ontology.mapping.extended_schema[bl_type].get(
145
- "properties", {}
146
- )
147
+ filter_props = self.ontology.mapping.extended_schema[bl_type].get("properties", {})
147
148
 
148
149
  # strict mode: add required properties (only if there is a whitelist)
149
150
  if self.strict_mode and filter_props:
@@ -151,36 +152,24 @@ class Translator:
151
152
  {"source": "str", "licence": "str", "version": "str"},
152
153
  )
153
154
 
154
- exclude_props = self.ontology.mapping.extended_schema[bl_type].get(
155
- "exclude_properties", []
156
- )
155
+ exclude_props = self.ontology.mapping.extended_schema[bl_type].get("exclude_properties", [])
157
156
 
158
157
  if isinstance(exclude_props, str):
159
158
  exclude_props = [exclude_props]
160
159
 
161
160
  if filter_props and exclude_props:
162
- filtered_props = {
163
- k: v
164
- for k, v in props.items()
165
- if (k in filter_props.keys() and k not in exclude_props)
166
- }
161
+ filtered_props = {k: v for k, v in props.items() if (k in filter_props.keys() and k not in exclude_props)}
167
162
 
168
163
  elif filter_props:
169
- filtered_props = {
170
- k: v for k, v in props.items() if k in filter_props.keys()
171
- }
164
+ filtered_props = {k: v for k, v in props.items() if k in filter_props.keys()}
172
165
 
173
166
  elif exclude_props:
174
- filtered_props = {
175
- k: v for k, v in props.items() if k not in exclude_props
176
- }
167
+ filtered_props = {k: v for k, v in props.items() if k not in exclude_props}
177
168
 
178
169
  else:
179
170
  return props
180
171
 
181
- missing_props = [
182
- k for k in filter_props.keys() if k not in filtered_props.keys()
183
- ]
172
+ missing_props = [k for k in filter_props.keys() if k not in filtered_props.keys()]
184
173
  # add missing properties with default values
185
174
  for k in missing_props:
186
175
  filtered_props[k] = None
@@ -213,20 +202,17 @@ class Translator:
213
202
  # TODO remove for performance reasons once safe
214
203
  edge_tuples = peekable(edge_tuples)
215
204
  if len(edge_tuples.peek()) == 4:
216
- edge_tuples = [
217
- (None, src, tar, typ, props)
218
- for src, tar, typ, props in edge_tuples
219
- ]
205
+ edge_tuples = [(None, src, tar, typ, props) for src, tar, typ, props in edge_tuples]
220
206
 
221
207
  for _id, _src, _tar, _type, _props in edge_tuples:
222
208
  # check for strict mode requirements
223
209
  if self.strict_mode:
224
- if not "source" in _props:
210
+ if "source" not in _props:
225
211
  raise ValueError(
226
212
  f"Edge {_id if _id else (_src, _tar)} does not have a `source` property.",
227
213
  " This is required in strict mode.",
228
214
  )
229
- if not "licence" in _props:
215
+ if "licence" not in _props:
230
216
  raise ValueError(
231
217
  f"Edge {_id if _id else (_src, _tar)} does not have a `licence` property.",
232
218
  " This is required in strict mode.",
@@ -240,9 +226,7 @@ class Translator:
240
226
  # filter properties for those specified in schema_config if any
241
227
  _filtered_props = self._filter_props(bl_type, _props)
242
228
 
243
- rep = self.ontology.mapping.extended_schema[bl_type][
244
- "represented_as"
245
- ]
229
+ rep = self.ontology.mapping.extended_schema[bl_type]["represented_as"]
246
230
 
247
231
  if rep == "node":
248
232
  if _id:
@@ -251,13 +235,7 @@ class Translator:
251
235
 
252
236
  else:
253
237
  # source target concat
254
- node_id = (
255
- str(_src)
256
- + "_"
257
- + str(_tar)
258
- + "_"
259
- + "_".join(str(v) for v in _filtered_props.values())
260
- )
238
+ node_id = str(_src) + "_" + str(_tar) + "_" + "_".join(str(v) for v in _filtered_props.values())
261
239
 
262
240
  n = BioCypherNode(
263
241
  node_id=node_id,
@@ -268,7 +246,7 @@ class Translator:
268
246
  # directionality check TODO generalise to account for
269
247
  # different descriptions of directionality or find a
270
248
  # more consistent solution for indicating directionality
271
- if _filtered_props.get("directed") == True:
249
+ if _filtered_props.get("directed") == True: # noqa: E712 (seems to not work without '== True')
272
250
  l1 = "IS_SOURCE_OF"
273
251
  l2 = "IS_TARGET_OF"
274
252
 
@@ -298,9 +276,7 @@ class Translator:
298
276
  yield BioCypherRelAsNode(n, e_s, e_t)
299
277
 
300
278
  else:
301
- edge_label = self.ontology.mapping.extended_schema[
302
- bl_type
303
- ].get("label_as_edge")
279
+ edge_label = self.ontology.mapping.extended_schema[bl_type].get("label_as_edge")
304
280
 
305
281
  if edge_label is None:
306
282
  edge_label = bl_type
@@ -0,0 +1,40 @@
1
+ """BioCypher 'connect' module.
2
+
3
+ Handles the connecting and writing a Knowledge Graph to a database.
4
+ """
5
+
6
+ from biocypher._config import config as _config
7
+ from biocypher._logger import logger
8
+ from biocypher._translate import Translator
9
+ from biocypher.output.connect._neo4j_driver import _Neo4jDriver
10
+
11
+ logger.debug(f"Loading module {__name__}.")
12
+
13
+ __all__ = ["get_connector"]
14
+
15
+
16
+ def get_connector(
17
+ dbms: str,
18
+ translator: Translator,
19
+ ):
20
+ """
21
+ Function to return the connector class.
22
+
23
+ Returns:
24
+ class: the connector class
25
+ """
26
+
27
+ dbms_config = _config(dbms)
28
+
29
+ if dbms == "neo4j":
30
+ return _Neo4jDriver(
31
+ database_name=dbms_config["database_name"],
32
+ wipe=dbms_config["wipe"],
33
+ uri=dbms_config["uri"],
34
+ user=dbms_config["user"],
35
+ password=dbms_config["password"],
36
+ multi_db=dbms_config["multi_db"],
37
+ translator=translator,
38
+ )
39
+ else:
40
+ raise NotImplementedError(f"Online mode is not supported for the DBMS {dbms}.")
@@ -1,32 +1,19 @@
1
- #!/usr/bin/env python
2
-
3
- #
4
- # Copyright 2021, Heidelberg University Clinic
5
- #
6
- # File author(s): Sebastian Lobentanzer
7
- # ...
8
- #
9
- # Distributed under MIT licence, see the file `LICENSE`.
10
- #
11
1
  """
12
2
  BioCypher 'online' mode. Handles connection and manipulation of a running DBMS.
13
3
  """
14
- import subprocess
15
4
 
16
- from biocypher._logger import logger
17
-
18
- logger.debug(f"Loading module {__name__}.")
5
+ import itertools
19
6
 
20
7
  from collections.abc import Iterable
21
- import itertools
22
8
 
23
9
  import neo4j_utils
24
10
 
25
11
  from biocypher import _misc
26
- from biocypher._config import config as _config
27
12
  from biocypher._create import BioCypherEdge, BioCypherNode
13
+ from biocypher._logger import logger
28
14
  from biocypher._translate import Translator
29
15
 
16
+ logger.debug(f"Loading module {__name__}.")
30
17
  __all__ = ["_Neo4jDriver"]
31
18
 
32
19
 
@@ -96,7 +83,7 @@ class _Neo4jDriver:
96
83
 
97
84
  # find current version node
98
85
  db_version = self._driver.query(
99
- "MATCH (v:BioCypher) " "WHERE NOT (v)-[:PRECEDES]->() " "RETURN v",
86
+ "MATCH (v:BioCypher) WHERE NOT (v)-[:PRECEDES]->() RETURN v",
100
87
  )
101
88
  # add version node
102
89
  self.add_biocypher_nodes(self.translator.ontology)
@@ -143,18 +130,10 @@ class _Neo4jDriver:
143
130
  label = _misc.sentencecase_to_pascalcase(leaf[0], sep=r"\s\.")
144
131
  if leaf[1]["represented_as"] == "node":
145
132
  if major_neo4j_version >= 5:
146
- s = (
147
- f"CREATE CONSTRAINT `{label}_id` "
148
- f"IF NOT EXISTS FOR (n:`{label}`) "
149
- "REQUIRE n.id IS UNIQUE"
150
- )
133
+ s = f"CREATE CONSTRAINT `{label}_id` " f"IF NOT EXISTS FOR (n:`{label}`) " "REQUIRE n.id IS UNIQUE"
151
134
  self._driver.query(s)
152
135
  else:
153
- s = (
154
- f"CREATE CONSTRAINT `{label}_id` "
155
- f"IF NOT EXISTS ON (n:`{label}`) "
156
- "ASSERT n.id IS UNIQUE"
157
- )
136
+ s = f"CREATE CONSTRAINT `{label}_id` " f"IF NOT EXISTS ON (n:`{label}`) " "ASSERT n.id IS UNIQUE"
158
137
  self._driver.query(s)
159
138
 
160
139
  def _get_neo4j_version(self):
@@ -170,9 +149,7 @@ class _Neo4jDriver:
170
149
  )[0][0]["version"]
171
150
  return neo4j_version
172
151
  except Exception as e:
173
- logger.warning(
174
- f"Error detecting Neo4j version: {e} use default version 4.0.0."
175
- )
152
+ logger.warning(f"Error detecting Neo4j version: {e} use default version 4.0.0.")
176
153
  return "4.0.0"
177
154
 
178
155
  def add_nodes(self, id_type_tuples: Iterable[tuple]) -> tuple:
@@ -364,11 +341,7 @@ class _Neo4jDriver:
364
341
  # merging only on the ids of the entities, passing the
365
342
  # properties on match and on create;
366
343
  # TODO add node labels?
367
- node_query = (
368
- "UNWIND $rels AS r "
369
- "MERGE (src {id: r.source_id}) "
370
- "MERGE (tar {id: r.target_id}) "
371
- )
344
+ node_query = "UNWIND $rels AS r " "MERGE (src {id: r.source_id}) " "MERGE (tar {id: r.target_id}) "
372
345
 
373
346
  self._driver.query(node_query, parameters={"rels": rels})
374
347
 
@@ -386,37 +359,8 @@ class _Neo4jDriver:
386
359
 
387
360
  method = "explain" if explain else "profile" if profile else "query"
388
361
 
389
- result = getattr(self._driver, method)(
390
- edge_query, parameters={"rels": rels}
391
- )
362
+ result = getattr(self._driver, method)(edge_query, parameters={"rels": rels})
392
363
 
393
364
  logger.info("Finished merging edges.")
394
365
 
395
366
  return result
396
-
397
-
398
- def get_driver(
399
- dbms: str,
400
- translator: "Translator",
401
- ):
402
- """
403
- Function to return the writer class.
404
-
405
- Returns:
406
- class: the writer class
407
- """
408
-
409
- dbms_config = _config(dbms)
410
-
411
- if dbms == "neo4j":
412
- return _Neo4jDriver(
413
- database_name=dbms_config["database_name"],
414
- wipe=dbms_config["wipe"],
415
- uri=dbms_config["uri"],
416
- user=dbms_config["user"],
417
- password=dbms_config["password"],
418
- multi_db=dbms_config["multi_db"],
419
- translator=translator,
420
- )
421
-
422
- return None
@@ -0,0 +1,34 @@
1
+ """
2
+ BioCypher 'in_memory' module. Handles the in-memory Knowledge Graph instance.
3
+ """
4
+
5
+ from biocypher._deduplicate import Deduplicator
6
+ from biocypher._logger import logger
7
+ from biocypher.output.in_memory._networkx import NetworkxKG
8
+ from biocypher.output.in_memory._pandas import PandasKG
9
+
10
+ logger.debug(f"Loading module {__name__}.")
11
+
12
+ __all__ = ["get_in_memory_kg"]
13
+
14
+ IN_MEMORY_DBMS = ["csv", "pandas", "tabular", "networkx"]
15
+
16
+
17
+ def get_in_memory_kg(
18
+ dbms: str,
19
+ deduplicator: Deduplicator,
20
+ ):
21
+ """
22
+ Function to return the in-memory KG class.
23
+
24
+ Returns:
25
+ class: the in-memory KG class
26
+ """
27
+ if dbms in ["csv", "pandas", "tabular"]:
28
+ return PandasKG(deduplicator)
29
+ elif dbms == "networkx":
30
+ return NetworkxKG(deduplicator)
31
+ else:
32
+ raise NotImplementedError(
33
+ f"Getting the in memory BioCypher KG is not supported for the DBMS {dbms}. Supported: {IN_MEMORY_DBMS}."
34
+ )
@@ -0,0 +1,40 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+
4
+ class _InMemoryKG(ABC):
5
+ """Abstract class for handling the in-memory Knowledge Graph instance.
6
+ Specifics of the different in-memory implementations (e.g. csv, networkx)
7
+ are implemented in the child classes. Any concrete in-memory implementation
8
+ needs to implement at least:
9
+ - add_nodes
10
+ - add_edges
11
+ - get_kg
12
+
13
+ Raises:
14
+ NotImplementedError: InMemoryKG implementation must override 'add_nodes'
15
+ NotImplementedError: InMemoryKG implementation must override 'add_edges'
16
+ NotImplementedError: InMemoryKG implementation must override 'get_kg'
17
+ """
18
+
19
+ @abstractmethod
20
+ def add_nodes(self, nodes):
21
+ """Add nodes to the in-memory knowledge graph.
22
+
23
+ Args:
24
+ nodes (Iterable[BioCypherNode]): Iterable of BioCypherNode objects.
25
+ """
26
+ raise NotImplementedError("InMemoryKG implementation must override 'add_nodes'")
27
+
28
+ @abstractmethod
29
+ def add_edges(self, edges):
30
+ """Add edges to the in-memory knowledge graph.
31
+
32
+ Args:
33
+ edges (Iterable[BioCypherEdge]): Iterable of BioCypherEdge objects.
34
+ """
35
+ raise NotImplementedError("InMemoryKG implementation must override 'add_edges'")
36
+
37
+ @abstractmethod
38
+ def get_kg(self):
39
+ """Return the in-memory knowledge graph."""
40
+ raise NotImplementedError("InMemoryKG implementation must override 'get_kg'")
@@ -0,0 +1,44 @@
1
import networkx as nx

from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
from biocypher.output.in_memory._pandas import PandasKG


class NetworkxKG(_InMemoryKG):
    """In-memory KG backed by a networkx DiGraph.

    Nodes and edges are buffered in a delegated PandasKG; the graph is
    materialized lazily on the first call to ``get_kg``.
    """

    def __init__(self, deduplicator):
        super().__init__()  # keeping in spite of ABC not having __init__
        self.deduplicator = deduplicator
        # Delegate tabular accumulation to the pandas back-end.
        self._pd = PandasKG(
            deduplicator=self.deduplicator,
        )
        self.KG = None

    def get_kg(self):
        """Build the graph on first access and cache it."""
        if not self.KG:
            self.KG = self._create_networkx_kg()
        return self.KG

    def add_nodes(self, nodes):
        """Buffer nodes in the pandas back-end; graph is built in get_kg()."""
        self._pd.add_nodes(nodes)
        return True

    def add_edges(self, edges):
        """Buffer edges in the pandas back-end; graph is built in get_kg()."""
        self._pd.add_edges(edges)
        return True

    def _create_networkx_kg(self) -> nx.DiGraph:
        """Convert the buffered dataframes into a directed graph.

        A frame with a "node_id" column contributes nodes; a frame with
        both "source_id" and "target_id" columns contributes edges.
        Nodes are added first, then edges, mirroring insertion order.
        """
        self.KG = nx.DiGraph()
        frames = self._pd.dfs.values()
        for frame in frames:
            if frame.columns.str.contains("node_id").any():
                node_attrs = frame.set_index("node_id").to_dict(orient="index")
                self.KG.add_nodes_from(node_attrs.items())
        for frame in frames:
            cols = frame.columns
            if cols.str.contains("source_id").any() and cols.str.contains("target_id").any():
                edge_attrs = frame.set_index(["source_id", "target_id"]).to_dict(orient="index")
                self.KG.add_edges_from((src, tar, attrs) for (src, tar), attrs in edge_attrs.items())
        return self.KG
@@ -1,15 +1,25 @@
1
1
  import pandas as pd
2
2
 
3
3
  from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
4
+ from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
4
5
 
5
6
 
6
- class Pandas:
7
- def __init__(self, translator, deduplicator):
8
- self.translator = translator
7
+ class PandasKG(_InMemoryKG):
8
+ def __init__(self, deduplicator):
9
+ super().__init__() # keeping in spite of ABC not having __init__
9
10
  self.deduplicator = deduplicator
10
11
 
11
12
  self.dfs = {}
12
13
 
14
+ def get_kg(self):
15
+ return self.dfs
16
+
17
+ def add_nodes(self, nodes):
18
+ self.add_tables(nodes)
19
+
20
+ def add_edges(self, edges):
21
+ self.add_tables(edges)
22
+
13
23
  def _separate_entity_types(self, entities):
14
24
  """
15
25
  Given mixed iterable of BioCypher objects, separate them into lists by
@@ -23,8 +33,7 @@ class Pandas:
23
33
  and not isinstance(entity, BioCypherRelAsNode)
24
34
  ):
25
35
  raise TypeError(
26
- "Expected a BioCypherNode / BioCypherEdge / "
27
- f"BioCypherRelAsNode, got {type(entity)}."
36
+ "Expected a BioCypherNode / BioCypherEdge / " f"BioCypherRelAsNode, got {type(entity)}."
28
37
  )
29
38
 
30
39
  if isinstance(entity, BioCypherNode):
@@ -43,23 +52,23 @@ class Pandas:
43
52
  target_edge = entity.get_target_edge()
44
53
 
45
54
  _type = node.get_type()
46
- if not _type in lists:
55
+ if _type not in lists:
47
56
  lists[_type] = []
48
57
  lists[_type].append(node)
49
58
 
50
59
  _source_type = source_edge.get_type()
51
- if not _source_type in lists:
60
+ if _source_type not in lists:
52
61
  lists[_source_type] = []
53
62
  lists[_source_type].append(source_edge)
54
63
 
55
64
  _target_type = target_edge.get_type()
56
- if not _target_type in lists:
65
+ if _target_type not in lists:
57
66
  lists[_target_type] = []
58
67
  lists[_target_type].append(target_edge)
59
68
  continue
60
69
 
61
70
  _type = entity.get_type()
62
- if not _type in lists:
71
+ if _type not in lists:
63
72
  lists[_type] = []
64
73
  lists[_type].append(entity)
65
74
 
@@ -76,15 +85,11 @@ class Pandas:
76
85
  self._add_entity_df(_type, _entities)
77
86
 
78
87
  def _add_entity_df(self, _type, _entities):
79
- df = pd.DataFrame(
80
- pd.json_normalize([node.get_dict() for node in _entities])
81
- )
88
+ df = pd.DataFrame(pd.json_normalize([node.get_dict() for node in _entities]))
82
89
  # replace "properties." with "" in column names
83
90
  df.columns = [col.replace("properties.", "") for col in df.columns]
84
91
  if _type not in self.dfs:
85
92
  self.dfs[_type] = df
86
93
  else:
87
- self.dfs[_type] = pd.concat(
88
- [self.dfs[_type], df], ignore_index=True
89
- )
94
+ self.dfs[_type] = pd.concat([self.dfs[_type], df], ignore_index=True)
90
95
  return self.dfs[_type]