biocypher-0.6.2-py3-none-any.whl → biocypher-0.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biocypher might be problematic. Click here for more details.
- biocypher/__init__.py +3 -13
- biocypher/_config/__init__.py +6 -23
- biocypher/_core.py +360 -262
- biocypher/_create.py +13 -27
- biocypher/_deduplicate.py +4 -11
- biocypher/_get.py +21 -60
- biocypher/_logger.py +4 -16
- biocypher/_mapping.py +4 -17
- biocypher/_metadata.py +3 -15
- biocypher/_misc.py +14 -28
- biocypher/_ontology.py +127 -212
- biocypher/_translate.py +34 -58
- biocypher/output/connect/_get_connector.py +40 -0
- biocypher/output/connect/_neo4j_driver.py +9 -65
- biocypher/output/in_memory/_get_in_memory_kg.py +34 -0
- biocypher/output/in_memory/_in_memory_kg.py +40 -0
- biocypher/output/in_memory/_networkx.py +44 -0
- biocypher/output/in_memory/_pandas.py +20 -15
- biocypher/output/write/_batch_writer.py +132 -177
- biocypher/output/write/_get_writer.py +11 -24
- biocypher/output/write/_writer.py +14 -33
- biocypher/output/write/graph/_arangodb.py +7 -24
- biocypher/output/write/graph/_neo4j.py +51 -56
- biocypher/output/write/graph/_networkx.py +36 -43
- biocypher/output/write/graph/_rdf.py +107 -95
- biocypher/output/write/relational/_csv.py +6 -11
- biocypher/output/write/relational/_postgresql.py +5 -13
- biocypher/output/write/relational/_sqlite.py +3 -1
- {biocypher-0.6.2.dist-info → biocypher-0.7.0.dist-info}/LICENSE +1 -1
- {biocypher-0.6.2.dist-info → biocypher-0.7.0.dist-info}/METADATA +3 -3
- biocypher-0.7.0.dist-info/RECORD +43 -0
- {biocypher-0.6.2.dist-info → biocypher-0.7.0.dist-info}/WHEEL +1 -1
- biocypher-0.6.2.dist-info/RECORD +0 -39
biocypher/_translate.py
CHANGED
|
@@ -1,31 +1,21 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
|
|
3
|
-
#
|
|
4
|
-
# Copyright 2021, Heidelberg University Clinic
|
|
5
|
-
#
|
|
6
|
-
# File author(s): Sebastian Lobentanzer
|
|
7
|
-
# ...
|
|
8
|
-
#
|
|
9
|
-
# Distributed under MIT licence, see the file `LICENSE`.
|
|
10
|
-
#
|
|
11
1
|
"""
|
|
12
2
|
BioCypher 'translation' module. Responsible for translating between the raw
|
|
13
3
|
input data and the BioCypherNode and BioCypherEdge objects.
|
|
14
4
|
"""
|
|
15
|
-
from ._logger import logger
|
|
16
|
-
|
|
17
|
-
logger.debug(f"Loading module {__name__}.")
|
|
18
5
|
|
|
19
|
-
from
|
|
20
|
-
from
|
|
6
|
+
from collections.abc import Generator, Iterable
|
|
7
|
+
from typing import Any, Optional, Union
|
|
21
8
|
|
|
22
9
|
from more_itertools import peekable
|
|
23
10
|
|
|
24
11
|
from . import _misc
|
|
25
12
|
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
13
|
+
from ._logger import logger
|
|
26
14
|
from ._ontology import Ontology
|
|
27
15
|
|
|
28
|
-
|
|
16
|
+
logger.debug(f"Loading module {__name__}.")
|
|
17
|
+
|
|
18
|
+
__all__ = ["Translator"]
|
|
29
19
|
|
|
30
20
|
|
|
31
21
|
class Translator:
|
|
@@ -67,6 +57,20 @@ class Translator:
|
|
|
67
57
|
|
|
68
58
|
self._update_ontology_types()
|
|
69
59
|
|
|
60
|
+
def translate_entities(self, entities):
|
|
61
|
+
entities = peekable(entities)
|
|
62
|
+
if (
|
|
63
|
+
isinstance(entities.peek(), BioCypherNode)
|
|
64
|
+
or isinstance(entities.peek(), BioCypherEdge)
|
|
65
|
+
or isinstance(entities.peek(), BioCypherRelAsNode)
|
|
66
|
+
):
|
|
67
|
+
translated_entities = entities
|
|
68
|
+
elif len(entities.peek()) < 4:
|
|
69
|
+
translated_entities = self.translate_nodes(entities)
|
|
70
|
+
else:
|
|
71
|
+
translated_entities = self.translate_edges(entities)
|
|
72
|
+
return translated_entities
|
|
73
|
+
|
|
70
74
|
def translate_nodes(
|
|
71
75
|
self,
|
|
72
76
|
node_tuples: Iterable,
|
|
@@ -131,8 +135,7 @@ class Translator:
|
|
|
131
135
|
|
|
132
136
|
return (
|
|
133
137
|
self.ontology.mapping.extended_schema[_bl_type]["preferred_id"]
|
|
134
|
-
if "preferred_id"
|
|
135
|
-
in self.ontology.mapping.extended_schema.get(_bl_type, {})
|
|
138
|
+
if "preferred_id" in self.ontology.mapping.extended_schema.get(_bl_type, {})
|
|
136
139
|
else "id"
|
|
137
140
|
)
|
|
138
141
|
|
|
@@ -141,9 +144,7 @@ class Translator:
|
|
|
141
144
|
Filters properties for those specified in schema_config if any.
|
|
142
145
|
"""
|
|
143
146
|
|
|
144
|
-
filter_props = self.ontology.mapping.extended_schema[bl_type].get(
|
|
145
|
-
"properties", {}
|
|
146
|
-
)
|
|
147
|
+
filter_props = self.ontology.mapping.extended_schema[bl_type].get("properties", {})
|
|
147
148
|
|
|
148
149
|
# strict mode: add required properties (only if there is a whitelist)
|
|
149
150
|
if self.strict_mode and filter_props:
|
|
@@ -151,36 +152,24 @@ class Translator:
|
|
|
151
152
|
{"source": "str", "licence": "str", "version": "str"},
|
|
152
153
|
)
|
|
153
154
|
|
|
154
|
-
exclude_props = self.ontology.mapping.extended_schema[bl_type].get(
|
|
155
|
-
"exclude_properties", []
|
|
156
|
-
)
|
|
155
|
+
exclude_props = self.ontology.mapping.extended_schema[bl_type].get("exclude_properties", [])
|
|
157
156
|
|
|
158
157
|
if isinstance(exclude_props, str):
|
|
159
158
|
exclude_props = [exclude_props]
|
|
160
159
|
|
|
161
160
|
if filter_props and exclude_props:
|
|
162
|
-
filtered_props = {
|
|
163
|
-
k: v
|
|
164
|
-
for k, v in props.items()
|
|
165
|
-
if (k in filter_props.keys() and k not in exclude_props)
|
|
166
|
-
}
|
|
161
|
+
filtered_props = {k: v for k, v in props.items() if (k in filter_props.keys() and k not in exclude_props)}
|
|
167
162
|
|
|
168
163
|
elif filter_props:
|
|
169
|
-
filtered_props = {
|
|
170
|
-
k: v for k, v in props.items() if k in filter_props.keys()
|
|
171
|
-
}
|
|
164
|
+
filtered_props = {k: v for k, v in props.items() if k in filter_props.keys()}
|
|
172
165
|
|
|
173
166
|
elif exclude_props:
|
|
174
|
-
filtered_props = {
|
|
175
|
-
k: v for k, v in props.items() if k not in exclude_props
|
|
176
|
-
}
|
|
167
|
+
filtered_props = {k: v for k, v in props.items() if k not in exclude_props}
|
|
177
168
|
|
|
178
169
|
else:
|
|
179
170
|
return props
|
|
180
171
|
|
|
181
|
-
missing_props = [
|
|
182
|
-
k for k in filter_props.keys() if k not in filtered_props.keys()
|
|
183
|
-
]
|
|
172
|
+
missing_props = [k for k in filter_props.keys() if k not in filtered_props.keys()]
|
|
184
173
|
# add missing properties with default values
|
|
185
174
|
for k in missing_props:
|
|
186
175
|
filtered_props[k] = None
|
|
@@ -213,20 +202,17 @@ class Translator:
|
|
|
213
202
|
# TODO remove for performance reasons once safe
|
|
214
203
|
edge_tuples = peekable(edge_tuples)
|
|
215
204
|
if len(edge_tuples.peek()) == 4:
|
|
216
|
-
edge_tuples = [
|
|
217
|
-
(None, src, tar, typ, props)
|
|
218
|
-
for src, tar, typ, props in edge_tuples
|
|
219
|
-
]
|
|
205
|
+
edge_tuples = [(None, src, tar, typ, props) for src, tar, typ, props in edge_tuples]
|
|
220
206
|
|
|
221
207
|
for _id, _src, _tar, _type, _props in edge_tuples:
|
|
222
208
|
# check for strict mode requirements
|
|
223
209
|
if self.strict_mode:
|
|
224
|
-
if
|
|
210
|
+
if "source" not in _props:
|
|
225
211
|
raise ValueError(
|
|
226
212
|
f"Edge {_id if _id else (_src, _tar)} does not have a `source` property.",
|
|
227
213
|
" This is required in strict mode.",
|
|
228
214
|
)
|
|
229
|
-
if
|
|
215
|
+
if "licence" not in _props:
|
|
230
216
|
raise ValueError(
|
|
231
217
|
f"Edge {_id if _id else (_src, _tar)} does not have a `licence` property.",
|
|
232
218
|
" This is required in strict mode.",
|
|
@@ -240,9 +226,7 @@ class Translator:
|
|
|
240
226
|
# filter properties for those specified in schema_config if any
|
|
241
227
|
_filtered_props = self._filter_props(bl_type, _props)
|
|
242
228
|
|
|
243
|
-
rep = self.ontology.mapping.extended_schema[bl_type][
|
|
244
|
-
"represented_as"
|
|
245
|
-
]
|
|
229
|
+
rep = self.ontology.mapping.extended_schema[bl_type]["represented_as"]
|
|
246
230
|
|
|
247
231
|
if rep == "node":
|
|
248
232
|
if _id:
|
|
@@ -251,13 +235,7 @@ class Translator:
|
|
|
251
235
|
|
|
252
236
|
else:
|
|
253
237
|
# source target concat
|
|
254
|
-
node_id = (
|
|
255
|
-
str(_src)
|
|
256
|
-
+ "_"
|
|
257
|
-
+ str(_tar)
|
|
258
|
-
+ "_"
|
|
259
|
-
+ "_".join(str(v) for v in _filtered_props.values())
|
|
260
|
-
)
|
|
238
|
+
node_id = str(_src) + "_" + str(_tar) + "_" + "_".join(str(v) for v in _filtered_props.values())
|
|
261
239
|
|
|
262
240
|
n = BioCypherNode(
|
|
263
241
|
node_id=node_id,
|
|
@@ -268,7 +246,7 @@ class Translator:
|
|
|
268
246
|
# directionality check TODO generalise to account for
|
|
269
247
|
# different descriptions of directionality or find a
|
|
270
248
|
# more consistent solution for indicating directionality
|
|
271
|
-
if _filtered_props.get("directed") == True:
|
|
249
|
+
if _filtered_props.get("directed") == True: # noqa: E712 (seems to not work without '== True')
|
|
272
250
|
l1 = "IS_SOURCE_OF"
|
|
273
251
|
l2 = "IS_TARGET_OF"
|
|
274
252
|
|
|
@@ -298,9 +276,7 @@ class Translator:
|
|
|
298
276
|
yield BioCypherRelAsNode(n, e_s, e_t)
|
|
299
277
|
|
|
300
278
|
else:
|
|
301
|
-
edge_label = self.ontology.mapping.extended_schema[
|
|
302
|
-
bl_type
|
|
303
|
-
].get("label_as_edge")
|
|
279
|
+
edge_label = self.ontology.mapping.extended_schema[bl_type].get("label_as_edge")
|
|
304
280
|
|
|
305
281
|
if edge_label is None:
|
|
306
282
|
edge_label = bl_type
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""BioCypher 'connect' module.
|
|
2
|
+
|
|
3
|
+
Handles the connecting and writing a Knowledge Graph to a database.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from biocypher._config import config as _config
|
|
7
|
+
from biocypher._logger import logger
|
|
8
|
+
from biocypher._translate import Translator
|
|
9
|
+
from biocypher.output.connect._neo4j_driver import _Neo4jDriver
|
|
10
|
+
|
|
11
|
+
logger.debug(f"Loading module {__name__}.")
|
|
12
|
+
|
|
13
|
+
__all__ = ["get_connector"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_connector(
|
|
17
|
+
dbms: str,
|
|
18
|
+
translator: Translator,
|
|
19
|
+
):
|
|
20
|
+
"""
|
|
21
|
+
Function to return the connector class.
|
|
22
|
+
|
|
23
|
+
Returns:
|
|
24
|
+
class: the connector class
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
dbms_config = _config(dbms)
|
|
28
|
+
|
|
29
|
+
if dbms == "neo4j":
|
|
30
|
+
return _Neo4jDriver(
|
|
31
|
+
database_name=dbms_config["database_name"],
|
|
32
|
+
wipe=dbms_config["wipe"],
|
|
33
|
+
uri=dbms_config["uri"],
|
|
34
|
+
user=dbms_config["user"],
|
|
35
|
+
password=dbms_config["password"],
|
|
36
|
+
multi_db=dbms_config["multi_db"],
|
|
37
|
+
translator=translator,
|
|
38
|
+
)
|
|
39
|
+
else:
|
|
40
|
+
raise NotImplementedError(f"Online mode is not supported for the DBMS {dbms}.")
|
|
@@ -1,32 +1,19 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
|
|
3
|
-
#
|
|
4
|
-
# Copyright 2021, Heidelberg University Clinic
|
|
5
|
-
#
|
|
6
|
-
# File author(s): Sebastian Lobentanzer
|
|
7
|
-
# ...
|
|
8
|
-
#
|
|
9
|
-
# Distributed under MIT licence, see the file `LICENSE`.
|
|
10
|
-
#
|
|
11
1
|
"""
|
|
12
2
|
BioCypher 'online' mode. Handles connection and manipulation of a running DBMS.
|
|
13
3
|
"""
|
|
14
|
-
import subprocess
|
|
15
4
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
logger.debug(f"Loading module {__name__}.")
|
|
5
|
+
import itertools
|
|
19
6
|
|
|
20
7
|
from collections.abc import Iterable
|
|
21
|
-
import itertools
|
|
22
8
|
|
|
23
9
|
import neo4j_utils
|
|
24
10
|
|
|
25
11
|
from biocypher import _misc
|
|
26
|
-
from biocypher._config import config as _config
|
|
27
12
|
from biocypher._create import BioCypherEdge, BioCypherNode
|
|
13
|
+
from biocypher._logger import logger
|
|
28
14
|
from biocypher._translate import Translator
|
|
29
15
|
|
|
16
|
+
logger.debug(f"Loading module {__name__}.")
|
|
30
17
|
__all__ = ["_Neo4jDriver"]
|
|
31
18
|
|
|
32
19
|
|
|
@@ -96,7 +83,7 @@ class _Neo4jDriver:
|
|
|
96
83
|
|
|
97
84
|
# find current version node
|
|
98
85
|
db_version = self._driver.query(
|
|
99
|
-
"MATCH (v:BioCypher)
|
|
86
|
+
"MATCH (v:BioCypher) WHERE NOT (v)-[:PRECEDES]->() RETURN v",
|
|
100
87
|
)
|
|
101
88
|
# add version node
|
|
102
89
|
self.add_biocypher_nodes(self.translator.ontology)
|
|
@@ -143,18 +130,10 @@ class _Neo4jDriver:
|
|
|
143
130
|
label = _misc.sentencecase_to_pascalcase(leaf[0], sep=r"\s\.")
|
|
144
131
|
if leaf[1]["represented_as"] == "node":
|
|
145
132
|
if major_neo4j_version >= 5:
|
|
146
|
-
s = (
|
|
147
|
-
f"CREATE CONSTRAINT `{label}_id` "
|
|
148
|
-
f"IF NOT EXISTS FOR (n:`{label}`) "
|
|
149
|
-
"REQUIRE n.id IS UNIQUE"
|
|
150
|
-
)
|
|
133
|
+
s = f"CREATE CONSTRAINT `{label}_id` " f"IF NOT EXISTS FOR (n:`{label}`) " "REQUIRE n.id IS UNIQUE"
|
|
151
134
|
self._driver.query(s)
|
|
152
135
|
else:
|
|
153
|
-
s = (
|
|
154
|
-
f"CREATE CONSTRAINT `{label}_id` "
|
|
155
|
-
f"IF NOT EXISTS ON (n:`{label}`) "
|
|
156
|
-
"ASSERT n.id IS UNIQUE"
|
|
157
|
-
)
|
|
136
|
+
s = f"CREATE CONSTRAINT `{label}_id` " f"IF NOT EXISTS ON (n:`{label}`) " "ASSERT n.id IS UNIQUE"
|
|
158
137
|
self._driver.query(s)
|
|
159
138
|
|
|
160
139
|
def _get_neo4j_version(self):
|
|
@@ -170,9 +149,7 @@ class _Neo4jDriver:
|
|
|
170
149
|
)[0][0]["version"]
|
|
171
150
|
return neo4j_version
|
|
172
151
|
except Exception as e:
|
|
173
|
-
logger.warning(
|
|
174
|
-
f"Error detecting Neo4j version: {e} use default version 4.0.0."
|
|
175
|
-
)
|
|
152
|
+
logger.warning(f"Error detecting Neo4j version: {e} use default version 4.0.0.")
|
|
176
153
|
return "4.0.0"
|
|
177
154
|
|
|
178
155
|
def add_nodes(self, id_type_tuples: Iterable[tuple]) -> tuple:
|
|
@@ -364,11 +341,7 @@ class _Neo4jDriver:
|
|
|
364
341
|
# merging only on the ids of the entities, passing the
|
|
365
342
|
# properties on match and on create;
|
|
366
343
|
# TODO add node labels?
|
|
367
|
-
node_query = (
|
|
368
|
-
"UNWIND $rels AS r "
|
|
369
|
-
"MERGE (src {id: r.source_id}) "
|
|
370
|
-
"MERGE (tar {id: r.target_id}) "
|
|
371
|
-
)
|
|
344
|
+
node_query = "UNWIND $rels AS r " "MERGE (src {id: r.source_id}) " "MERGE (tar {id: r.target_id}) "
|
|
372
345
|
|
|
373
346
|
self._driver.query(node_query, parameters={"rels": rels})
|
|
374
347
|
|
|
@@ -386,37 +359,8 @@ class _Neo4jDriver:
|
|
|
386
359
|
|
|
387
360
|
method = "explain" if explain else "profile" if profile else "query"
|
|
388
361
|
|
|
389
|
-
result = getattr(self._driver, method)(
|
|
390
|
-
edge_query, parameters={"rels": rels}
|
|
391
|
-
)
|
|
362
|
+
result = getattr(self._driver, method)(edge_query, parameters={"rels": rels})
|
|
392
363
|
|
|
393
364
|
logger.info("Finished merging edges.")
|
|
394
365
|
|
|
395
366
|
return result
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
def get_driver(
|
|
399
|
-
dbms: str,
|
|
400
|
-
translator: "Translator",
|
|
401
|
-
):
|
|
402
|
-
"""
|
|
403
|
-
Function to return the writer class.
|
|
404
|
-
|
|
405
|
-
Returns:
|
|
406
|
-
class: the writer class
|
|
407
|
-
"""
|
|
408
|
-
|
|
409
|
-
dbms_config = _config(dbms)
|
|
410
|
-
|
|
411
|
-
if dbms == "neo4j":
|
|
412
|
-
return _Neo4jDriver(
|
|
413
|
-
database_name=dbms_config["database_name"],
|
|
414
|
-
wipe=dbms_config["wipe"],
|
|
415
|
-
uri=dbms_config["uri"],
|
|
416
|
-
user=dbms_config["user"],
|
|
417
|
-
password=dbms_config["password"],
|
|
418
|
-
multi_db=dbms_config["multi_db"],
|
|
419
|
-
translator=translator,
|
|
420
|
-
)
|
|
421
|
-
|
|
422
|
-
return None
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BioCypher 'in_memory' module. Handles the in-memory Knowledge Graph instance.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from biocypher._deduplicate import Deduplicator
|
|
6
|
+
from biocypher._logger import logger
|
|
7
|
+
from biocypher.output.in_memory._networkx import NetworkxKG
|
|
8
|
+
from biocypher.output.in_memory._pandas import PandasKG
|
|
9
|
+
|
|
10
|
+
logger.debug(f"Loading module {__name__}.")
|
|
11
|
+
|
|
12
|
+
__all__ = ["get_in_memory_kg"]
|
|
13
|
+
|
|
14
|
+
IN_MEMORY_DBMS = ["csv", "pandas", "tabular", "networkx"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def get_in_memory_kg(
|
|
18
|
+
dbms: str,
|
|
19
|
+
deduplicator: Deduplicator,
|
|
20
|
+
):
|
|
21
|
+
"""
|
|
22
|
+
Function to return the in-memory KG class.
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
class: the in-memory KG class
|
|
26
|
+
"""
|
|
27
|
+
if dbms in ["csv", "pandas", "tabular"]:
|
|
28
|
+
return PandasKG(deduplicator)
|
|
29
|
+
elif dbms == "networkx":
|
|
30
|
+
return NetworkxKG(deduplicator)
|
|
31
|
+
else:
|
|
32
|
+
raise NotImplementedError(
|
|
33
|
+
f"Getting the in memory BioCypher KG is not supported for the DBMS {dbms}. Supported: {IN_MEMORY_DBMS}."
|
|
34
|
+
)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class _InMemoryKG(ABC):
|
|
5
|
+
"""Abstract class for handling the in-memory Knowledge Graph instance.
|
|
6
|
+
Specifics of the different in-memory implementations (e.g. csv, networkx)
|
|
7
|
+
are implemented in the child classes. Any concrete in-memory implementation
|
|
8
|
+
needs to implement at least:
|
|
9
|
+
- add_nodes
|
|
10
|
+
- add_edges
|
|
11
|
+
- get_kg
|
|
12
|
+
|
|
13
|
+
Raises:
|
|
14
|
+
NotImplementedError: InMemoryKG implementation must override 'add_nodes'
|
|
15
|
+
NotImplementedError: InMemoryKG implementation must override 'add_edges'
|
|
16
|
+
NotImplementedError: InMemoryKG implementation must override 'get_kg'
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
@abstractmethod
|
|
20
|
+
def add_nodes(self, nodes):
|
|
21
|
+
"""Add nodes to the in-memory knowledge graph.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
nodes (Iterable[BioCypherNode]): Iterable of BioCypherNode objects.
|
|
25
|
+
"""
|
|
26
|
+
raise NotImplementedError("InMemoryKG implementation must override 'add_nodes'")
|
|
27
|
+
|
|
28
|
+
@abstractmethod
|
|
29
|
+
def add_edges(self, edges):
|
|
30
|
+
"""Add edges to the in-memory knowledge graph.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
edges (Iterable[BioCypherEdge]): Iterable of BioCypherEdge objects.
|
|
34
|
+
"""
|
|
35
|
+
raise NotImplementedError("InMemoryKG implementation must override 'add_edges'")
|
|
36
|
+
|
|
37
|
+
@abstractmethod
|
|
38
|
+
def get_kg(self):
|
|
39
|
+
"""Return the in-memory knowledge graph."""
|
|
40
|
+
raise NotImplementedError("InMemoryKG implementation must override 'get_kg'")
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import networkx as nx
|
|
2
|
+
|
|
3
|
+
from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
|
|
4
|
+
from biocypher.output.in_memory._pandas import PandasKG
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class NetworkxKG(_InMemoryKG):
|
|
8
|
+
def __init__(self, deduplicator):
|
|
9
|
+
super().__init__() # keeping in spite of ABC not having __init__
|
|
10
|
+
self.deduplicator = deduplicator
|
|
11
|
+
self._pd = PandasKG(
|
|
12
|
+
deduplicator=self.deduplicator,
|
|
13
|
+
)
|
|
14
|
+
self.KG = None
|
|
15
|
+
|
|
16
|
+
def get_kg(self):
|
|
17
|
+
if not self.KG:
|
|
18
|
+
self.KG = self._create_networkx_kg()
|
|
19
|
+
return self.KG
|
|
20
|
+
|
|
21
|
+
def add_nodes(self, nodes):
|
|
22
|
+
self._pd.add_nodes(nodes)
|
|
23
|
+
return True
|
|
24
|
+
|
|
25
|
+
def add_edges(self, edges):
|
|
26
|
+
self._pd.add_edges(edges)
|
|
27
|
+
return True
|
|
28
|
+
|
|
29
|
+
def _create_networkx_kg(self) -> nx.DiGraph:
|
|
30
|
+
self.KG = nx.DiGraph()
|
|
31
|
+
all_dfs = self._pd.dfs
|
|
32
|
+
node_dfs = [df for df in all_dfs.values() if df.columns.str.contains("node_id").any()]
|
|
33
|
+
edge_dfs = [
|
|
34
|
+
df
|
|
35
|
+
for df in all_dfs.values()
|
|
36
|
+
if df.columns.str.contains("source_id").any() and df.columns.str.contains("target_id").any()
|
|
37
|
+
]
|
|
38
|
+
for df in node_dfs:
|
|
39
|
+
nodes = df.set_index("node_id").to_dict(orient="index")
|
|
40
|
+
self.KG.add_nodes_from(nodes.items())
|
|
41
|
+
for df in edge_dfs:
|
|
42
|
+
edges = df.set_index(["source_id", "target_id"]).to_dict(orient="index")
|
|
43
|
+
self.KG.add_edges_from(((source, target, attrs) for (source, target), attrs in edges.items()))
|
|
44
|
+
return self.KG
|
|
@@ -1,15 +1,25 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
|
|
3
3
|
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
4
|
+
from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
|
|
4
5
|
|
|
5
6
|
|
|
6
|
-
class Pandas:
|
|
7
|
-
def __init__(self,
|
|
8
|
-
|
|
7
|
+
class PandasKG(_InMemoryKG):
|
|
8
|
+
def __init__(self, deduplicator):
|
|
9
|
+
super().__init__() # keeping in spite of ABC not having __init__
|
|
9
10
|
self.deduplicator = deduplicator
|
|
10
11
|
|
|
11
12
|
self.dfs = {}
|
|
12
13
|
|
|
14
|
+
def get_kg(self):
|
|
15
|
+
return self.dfs
|
|
16
|
+
|
|
17
|
+
def add_nodes(self, nodes):
|
|
18
|
+
self.add_tables(nodes)
|
|
19
|
+
|
|
20
|
+
def add_edges(self, edges):
|
|
21
|
+
self.add_tables(edges)
|
|
22
|
+
|
|
13
23
|
def _separate_entity_types(self, entities):
|
|
14
24
|
"""
|
|
15
25
|
Given mixed iterable of BioCypher objects, separate them into lists by
|
|
@@ -23,8 +33,7 @@ class Pandas:
|
|
|
23
33
|
and not isinstance(entity, BioCypherRelAsNode)
|
|
24
34
|
):
|
|
25
35
|
raise TypeError(
|
|
26
|
-
"Expected a BioCypherNode / BioCypherEdge / "
|
|
27
|
-
f"BioCypherRelAsNode, got {type(entity)}."
|
|
36
|
+
"Expected a BioCypherNode / BioCypherEdge / " f"BioCypherRelAsNode, got {type(entity)}."
|
|
28
37
|
)
|
|
29
38
|
|
|
30
39
|
if isinstance(entity, BioCypherNode):
|
|
@@ -43,23 +52,23 @@ class Pandas:
|
|
|
43
52
|
target_edge = entity.get_target_edge()
|
|
44
53
|
|
|
45
54
|
_type = node.get_type()
|
|
46
|
-
if not
|
|
55
|
+
if _type not in lists:
|
|
47
56
|
lists[_type] = []
|
|
48
57
|
lists[_type].append(node)
|
|
49
58
|
|
|
50
59
|
_source_type = source_edge.get_type()
|
|
51
|
-
if not
|
|
60
|
+
if _source_type not in lists:
|
|
52
61
|
lists[_source_type] = []
|
|
53
62
|
lists[_source_type].append(source_edge)
|
|
54
63
|
|
|
55
64
|
_target_type = target_edge.get_type()
|
|
56
|
-
if not
|
|
65
|
+
if _target_type not in lists:
|
|
57
66
|
lists[_target_type] = []
|
|
58
67
|
lists[_target_type].append(target_edge)
|
|
59
68
|
continue
|
|
60
69
|
|
|
61
70
|
_type = entity.get_type()
|
|
62
|
-
if not
|
|
71
|
+
if _type not in lists:
|
|
63
72
|
lists[_type] = []
|
|
64
73
|
lists[_type].append(entity)
|
|
65
74
|
|
|
@@ -76,15 +85,11 @@ class Pandas:
|
|
|
76
85
|
self._add_entity_df(_type, _entities)
|
|
77
86
|
|
|
78
87
|
def _add_entity_df(self, _type, _entities):
|
|
79
|
-
df = pd.DataFrame(
|
|
80
|
-
pd.json_normalize([node.get_dict() for node in _entities])
|
|
81
|
-
)
|
|
88
|
+
df = pd.DataFrame(pd.json_normalize([node.get_dict() for node in _entities]))
|
|
82
89
|
# replace "properties." with "" in column names
|
|
83
90
|
df.columns = [col.replace("properties.", "") for col in df.columns]
|
|
84
91
|
if _type not in self.dfs:
|
|
85
92
|
self.dfs[_type] = df
|
|
86
93
|
else:
|
|
87
|
-
self.dfs[_type] = pd.concat(
|
|
88
|
-
[self.dfs[_type], df], ignore_index=True
|
|
89
|
-
)
|
|
94
|
+
self.dfs[_type] = pd.concat([self.dfs[_type], df], ignore_index=True)
|
|
90
95
|
return self.dfs[_type]
|