biocypher 0.5.19__tar.gz → 0.5.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biocypher might be problematic. Click here for more details.
- {biocypher-0.5.19 → biocypher-0.5.20}/PKG-INFO +4 -2
- {biocypher-0.5.19 → biocypher-0.5.20}/README.md +3 -1
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_connect.py +6 -12
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_core.py +87 -20
- biocypher-0.5.20/biocypher/_deduplicate.py +147 -0
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_metadata.py +1 -1
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_ontology.py +11 -9
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_pandas.py +32 -7
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_translate.py +29 -26
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_write.py +75 -57
- {biocypher-0.5.19 → biocypher-0.5.20}/pyproject.toml +3 -1
- biocypher-0.5.20/setup.py +38 -0
- biocypher-0.5.19/biocypher/_deduplicate.py +0 -105
- biocypher-0.5.19/setup.py +0 -38
- {biocypher-0.5.19 → biocypher-0.5.20}/LICENSE +0 -0
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/__init__.py +0 -0
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_config/__init__.py +0 -0
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_config/biocypher_config.yaml +0 -0
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_config/test_config.yaml +0 -0
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_config/test_schema_config.yaml +0 -0
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_config/test_schema_config_disconnected.yaml +0 -0
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_config/test_schema_config_extended.yaml +0 -0
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_create.py +0 -0
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_logger.py +0 -0
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_mapping.py +0 -0
- {biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_misc.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: biocypher
|
|
3
|
-
Version: 0.5.19
|
|
3
|
+
Version: 0.5.20
|
|
4
4
|
Summary: A unifying framework for biomedical research knowledge graphs
|
|
5
5
|
Home-page: https://github.com/biocypher/biocypher
|
|
6
6
|
License: MIT
|
|
@@ -38,7 +38,9 @@ Description-Content-Type: text/markdown
|
|
|
38
38
|

|
|
39
39
|
[](https://badge.fury.io/py/biocypher)
|
|
40
40
|
[](https://www.repostatus.org/#active)
|
|
41
|
-
](https://github.com/biocypher/biocypher/actions/workflows/ci_cd.yaml)
|
|
42
|
+

|
|
43
|
+
[](https://github.com/biocypher/biocypher/actions/workflows/sphinx_autodoc.yaml)
|
|
42
44
|
[](https://pepy.tech/project/biocypher)
|
|
43
45
|
[](https://github.com/pre-commit/pre-commit)
|
|
44
46
|
[](http://makeapullrequest.com)
|
|
@@ -4,7 +4,9 @@
|
|
|
4
4
|

|
|
5
5
|
[](https://badge.fury.io/py/biocypher)
|
|
6
6
|
[](https://www.repostatus.org/#active)
|
|
7
|
-
](https://github.com/biocypher/biocypher/actions/workflows/ci_cd.yaml)
|
|
8
|
+

|
|
9
|
+
[](https://github.com/biocypher/biocypher/actions/workflows/sphinx_autodoc.yaml)
|
|
8
10
|
[](https://pepy.tech/project/biocypher)
|
|
9
11
|
[](https://github.com/pre-commit/pre-commit)
|
|
10
12
|
[](http://makeapullrequest.com)
|
|
@@ -53,8 +53,6 @@ class _Neo4jDriver:
|
|
|
53
53
|
|
|
54
54
|
increment_version (bool): Whether to increment the version number.
|
|
55
55
|
|
|
56
|
-
ontology (Ontology): The ontology to use for mapping.
|
|
57
|
-
|
|
58
56
|
translator (Translator): The translator to use for mapping.
|
|
59
57
|
|
|
60
58
|
"""
|
|
@@ -66,14 +64,12 @@ class _Neo4jDriver:
|
|
|
66
64
|
user: str,
|
|
67
65
|
password: str,
|
|
68
66
|
multi_db: bool,
|
|
69
|
-
ontology: Ontology,
|
|
70
67
|
translator: Translator,
|
|
71
68
|
wipe: bool = False,
|
|
72
69
|
fetch_size: int = 1000,
|
|
73
70
|
increment_version: bool = True,
|
|
74
71
|
):
|
|
75
|
-
self._ontology = ontology
|
|
76
|
-
self._translator = translator
|
|
72
|
+
self.translator = translator
|
|
77
73
|
|
|
78
74
|
self._driver = neo4j_utils.Driver(
|
|
79
75
|
db_name=database_name,
|
|
@@ -103,7 +99,7 @@ class _Neo4jDriver:
|
|
|
103
99
|
"MATCH (v:BioCypher) " "WHERE NOT (v)-[:PRECEDES]->() " "RETURN v",
|
|
104
100
|
)
|
|
105
101
|
# add version node
|
|
106
|
-
self.add_biocypher_nodes(self._ontology)
|
|
102
|
+
self.add_biocypher_nodes(self.translator.ontology)
|
|
107
103
|
|
|
108
104
|
# connect version node to previous
|
|
109
105
|
if db_version[0]:
|
|
@@ -111,7 +107,7 @@ class _Neo4jDriver:
|
|
|
111
107
|
previous_id = previous["v"]["id"]
|
|
112
108
|
e_meta = BioCypherEdge(
|
|
113
109
|
previous_id,
|
|
114
|
-
self._ontology.get_dict().get("node_id"),
|
|
110
|
+
self.translator.ontology.get_dict().get("node_id"),
|
|
115
111
|
"PRECEDES",
|
|
116
112
|
)
|
|
117
113
|
self.add_biocypher_edges(e_meta)
|
|
@@ -142,7 +138,7 @@ class _Neo4jDriver:
|
|
|
142
138
|
logger.info("Creating constraints for node types in config.")
|
|
143
139
|
|
|
144
140
|
# get structure
|
|
145
|
-
for leaf in self._ontology.extended_schema.items():
|
|
141
|
+
for leaf in self.translator.ontology.mapping.extended_schema.items():
|
|
146
142
|
label = _misc.sentencecase_to_pascalcase(leaf[0])
|
|
147
143
|
if leaf[1]["represented_as"] == "node":
|
|
148
144
|
s = (
|
|
@@ -172,7 +168,7 @@ class _Neo4jDriver:
|
|
|
172
168
|
- second entry: Neo4j summary.
|
|
173
169
|
"""
|
|
174
170
|
|
|
175
|
-
bn = self._translator.translate_nodes(id_type_tuples)
|
|
171
|
+
bn = self.translator.translate_nodes(id_type_tuples)
|
|
176
172
|
return self.add_biocypher_nodes(bn)
|
|
177
173
|
|
|
178
174
|
def add_edges(self, id_src_tar_type_tuples: Iterable[tuple]) -> tuple:
|
|
@@ -204,7 +200,7 @@ class _Neo4jDriver:
|
|
|
204
200
|
- second entry: Neo4j summary.
|
|
205
201
|
"""
|
|
206
202
|
|
|
207
|
-
bn = self._translator.translate_edges(id_src_tar_type_tuples)
|
|
203
|
+
bn = self.translator.translate_edges(id_src_tar_type_tuples)
|
|
208
204
|
return self.add_biocypher_edges(bn)
|
|
209
205
|
|
|
210
206
|
def add_biocypher_nodes(
|
|
@@ -375,7 +371,6 @@ class _Neo4jDriver:
|
|
|
375
371
|
def get_driver(
|
|
376
372
|
dbms: str,
|
|
377
373
|
translator: "Translator",
|
|
378
|
-
ontology: "Ontology",
|
|
379
374
|
):
|
|
380
375
|
"""
|
|
381
376
|
Function to return the writer class.
|
|
@@ -394,7 +389,6 @@ def get_driver(
|
|
|
394
389
|
user=dbms_config["user"],
|
|
395
390
|
password=dbms_config["password"],
|
|
396
391
|
multi_db=dbms_config["multi_db"],
|
|
397
|
-
ontology=ontology,
|
|
398
392
|
translator=translator,
|
|
399
393
|
)
|
|
400
394
|
|
|
@@ -13,8 +13,10 @@ BioCypher core module. Interfaces with the user and distributes tasks to
|
|
|
13
13
|
submodules.
|
|
14
14
|
"""
|
|
15
15
|
from typing import Optional
|
|
16
|
+
import os
|
|
16
17
|
|
|
17
18
|
from more_itertools import peekable
|
|
19
|
+
import yaml
|
|
18
20
|
|
|
19
21
|
import pandas as pd
|
|
20
22
|
|
|
@@ -25,7 +27,7 @@ logger.debug(f"Loading module {__name__}.")
|
|
|
25
27
|
from ._write import get_writer
|
|
26
28
|
from ._config import config as _config
|
|
27
29
|
from ._config import update_from_file as _file_update
|
|
28
|
-
from ._create import BioCypherEdge, BioCypherNode
|
|
30
|
+
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
29
31
|
from ._pandas import Pandas
|
|
30
32
|
from ._connect import get_driver
|
|
31
33
|
from ._mapping import OntologyMapping
|
|
@@ -181,19 +183,6 @@ class BioCypher:
|
|
|
181
183
|
|
|
182
184
|
return self._ontology_mapping
|
|
183
185
|
|
|
184
|
-
def _get_translator(self) -> Translator:
|
|
185
|
-
"""
|
|
186
|
-
Create translator if not exists and return.
|
|
187
|
-
"""
|
|
188
|
-
|
|
189
|
-
if not self._translator:
|
|
190
|
-
self._translator = Translator(
|
|
191
|
-
ontology_mapping=self._get_ontology_mapping(),
|
|
192
|
-
strict_mode=self._strict_mode,
|
|
193
|
-
)
|
|
194
|
-
|
|
195
|
-
return self._translator
|
|
196
|
-
|
|
197
186
|
def _get_ontology(self) -> Ontology:
|
|
198
187
|
"""
|
|
199
188
|
Create ontology if not exists and return.
|
|
@@ -208,17 +197,28 @@ class BioCypher:
|
|
|
208
197
|
|
|
209
198
|
return self._ontology
|
|
210
199
|
|
|
200
|
+
def _get_translator(self) -> Translator:
|
|
201
|
+
"""
|
|
202
|
+
Create translator if not exists and return.
|
|
203
|
+
"""
|
|
204
|
+
|
|
205
|
+
if not self._translator:
|
|
206
|
+
self._translator = Translator(
|
|
207
|
+
ontology=self._get_ontology(),
|
|
208
|
+
strict_mode=self._strict_mode,
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
return self._translator
|
|
212
|
+
|
|
211
213
|
def _get_writer(self):
|
|
212
214
|
"""
|
|
213
215
|
Create writer if not online. Set as instance variable `self._writer`.
|
|
214
216
|
"""
|
|
215
217
|
|
|
216
|
-
# Get worker
|
|
217
218
|
if self._offline:
|
|
218
219
|
self._writer = get_writer(
|
|
219
220
|
dbms=self._dbms,
|
|
220
221
|
translator=self._get_translator(),
|
|
221
|
-
ontology=self._get_ontology(),
|
|
222
222
|
deduplicator=self._get_deduplicator(),
|
|
223
223
|
output_directory=self._output_directory,
|
|
224
224
|
strict_mode=self._strict_mode,
|
|
@@ -235,7 +235,6 @@ class BioCypher:
|
|
|
235
235
|
self._driver = get_driver(
|
|
236
236
|
dbms=self._dbms,
|
|
237
237
|
translator=self._get_translator(),
|
|
238
|
-
ontology=self._get_ontology(),
|
|
239
238
|
deduplicator=self._get_deduplicator(),
|
|
240
239
|
)
|
|
241
240
|
else:
|
|
@@ -318,14 +317,15 @@ class BioCypher:
|
|
|
318
317
|
if not self._pd:
|
|
319
318
|
self._pd = Pandas(
|
|
320
319
|
translator=self._get_translator(),
|
|
321
|
-
ontology=self._get_ontology(),
|
|
322
320
|
deduplicator=self._get_deduplicator(),
|
|
323
321
|
)
|
|
324
322
|
|
|
325
323
|
entities = peekable(entities)
|
|
326
324
|
|
|
327
|
-
if
|
|
328
|
-
entities.peek(),
|
|
325
|
+
if (
|
|
326
|
+
isinstance(entities.peek(), BioCypherNode)
|
|
327
|
+
or isinstance(entities.peek(), BioCypherEdge)
|
|
328
|
+
or isinstance(entities.peek(), BioCypherRelAsNode)
|
|
329
329
|
):
|
|
330
330
|
tentities = entities
|
|
331
331
|
elif len(entities.peek()) < 4:
|
|
@@ -504,6 +504,73 @@ class BioCypher:
|
|
|
504
504
|
|
|
505
505
|
self._writer.write_import_call()
|
|
506
506
|
|
|
507
|
+
def write_schema_info(self) -> None:
|
|
508
|
+
"""
|
|
509
|
+
Write an extended schema info YAML file that extends the
|
|
510
|
+
`schema_config.yaml` with run-time information of the built KG. For
|
|
511
|
+
instance, include information on whether something present in the actual
|
|
512
|
+
knowledge graph, whether it is a relationship (which is important in the
|
|
513
|
+
case of representing relationships as nodes) and the actual sources and
|
|
514
|
+
targets of edges. Since this file can be used in place of the original
|
|
515
|
+
`schema_config.yaml` file, it indicates that it is the extended schema
|
|
516
|
+
by setting `is_schema_info` to `true`.
|
|
517
|
+
|
|
518
|
+
We start by using the `extended_schema` dictionary from the ontology
|
|
519
|
+
class instance, which contains all expanded entities and relationships.
|
|
520
|
+
The information of whether something is a relationship can be gathered
|
|
521
|
+
from the deduplicator instance, which keeps track of all entities that
|
|
522
|
+
have been seen.
|
|
523
|
+
"""
|
|
524
|
+
|
|
525
|
+
if not self._offline:
|
|
526
|
+
raise NotImplementedError(
|
|
527
|
+
"Cannot write schema info in online mode."
|
|
528
|
+
)
|
|
529
|
+
|
|
530
|
+
ontology = self._get_ontology()
|
|
531
|
+
schema = ontology.mapping.extended_schema
|
|
532
|
+
schema["is_schema_info"] = True
|
|
533
|
+
|
|
534
|
+
deduplicator = self._get_deduplicator()
|
|
535
|
+
for node in deduplicator.entity_types:
|
|
536
|
+
if node in schema.keys():
|
|
537
|
+
schema[node]["present_in_knowledge_graph"] = True
|
|
538
|
+
schema[node]["is_relationship"] = False
|
|
539
|
+
else:
|
|
540
|
+
logger.info(
|
|
541
|
+
f"Node {node} not present in extended schema. "
|
|
542
|
+
"Skipping schema info."
|
|
543
|
+
)
|
|
544
|
+
|
|
545
|
+
# find 'label_as_edge' cases in schema entries
|
|
546
|
+
changed_labels = {}
|
|
547
|
+
for k, v in schema.items():
|
|
548
|
+
if not isinstance(v, dict):
|
|
549
|
+
continue
|
|
550
|
+
if "label_as_edge" in v.keys():
|
|
551
|
+
if v["label_as_edge"] in deduplicator.seen_relationships.keys():
|
|
552
|
+
changed_labels[v["label_as_edge"]] = k
|
|
553
|
+
|
|
554
|
+
for edge in deduplicator.seen_relationships.keys():
|
|
555
|
+
if edge in changed_labels.keys():
|
|
556
|
+
edge = changed_labels[edge]
|
|
557
|
+
if edge in schema.keys():
|
|
558
|
+
schema[edge]["present_in_knowledge_graph"] = True
|
|
559
|
+
schema[edge]["is_relationship"] = True
|
|
560
|
+
# TODO information about source and target nodes
|
|
561
|
+
else:
|
|
562
|
+
logger.info(
|
|
563
|
+
f"Edge {edge} not present in extended schema. "
|
|
564
|
+
"Skipping schema info."
|
|
565
|
+
)
|
|
566
|
+
|
|
567
|
+
# write to output directory as YAML file
|
|
568
|
+
path = os.path.join(self._output_directory, "schema_info.yaml")
|
|
569
|
+
with open(path, "w") as f:
|
|
570
|
+
f.write(yaml.dump(schema))
|
|
571
|
+
|
|
572
|
+
return schema
|
|
573
|
+
|
|
507
574
|
# TRANSLATION METHODS ###
|
|
508
575
|
|
|
509
576
|
def translate_term(self, term: str) -> str:
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
from ._logger import logger
|
|
2
|
+
|
|
3
|
+
logger.debug(f"Loading module {__name__}.")
|
|
4
|
+
|
|
5
|
+
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Deduplicator:
|
|
9
|
+
"""
|
|
10
|
+
Singleton class responsible of deduplicating BioCypher inputs. Maintains
|
|
11
|
+
sets/dictionaries of node and edge types and their unique identifiers.
|
|
12
|
+
|
|
13
|
+
Nodes identifiers should be globally unique (represented as a set), while
|
|
14
|
+
edge identifiers are only unique per edge type (represented as a dict of
|
|
15
|
+
sets, keyed by edge type).
|
|
16
|
+
|
|
17
|
+
Stores collection of duplicate node and edge identifiers and types for
|
|
18
|
+
troubleshooting and to avoid overloading the log.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
def __init__(self):
|
|
22
|
+
self.seen_entity_ids = set()
|
|
23
|
+
self.duplicate_entity_ids = set()
|
|
24
|
+
|
|
25
|
+
self.entity_types = set()
|
|
26
|
+
self.duplicate_entity_types = set()
|
|
27
|
+
|
|
28
|
+
self.seen_relationships = {}
|
|
29
|
+
self.duplicate_relationship_ids = set()
|
|
30
|
+
self.duplicate_relationship_types = set()
|
|
31
|
+
|
|
32
|
+
def node_seen(self, entity: BioCypherNode) -> bool:
|
|
33
|
+
"""
|
|
34
|
+
Adds a node to the instance and checks if it has been seen before.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
node: BioCypherNode to be added.
|
|
38
|
+
|
|
39
|
+
Returns:
|
|
40
|
+
True if the node has been seen before, False otherwise.
|
|
41
|
+
"""
|
|
42
|
+
if entity.get_label() not in self.entity_types:
|
|
43
|
+
self.entity_types.add(entity.get_label())
|
|
44
|
+
|
|
45
|
+
if entity.get_id() in self.seen_entity_ids:
|
|
46
|
+
self.duplicate_entity_ids.add(entity.get_id())
|
|
47
|
+
if entity.get_label() not in self.duplicate_entity_types:
|
|
48
|
+
logger.warning(
|
|
49
|
+
f"Duplicate node type {entity.get_label()} found. "
|
|
50
|
+
)
|
|
51
|
+
self.duplicate_entity_types.add(entity.get_label())
|
|
52
|
+
return True
|
|
53
|
+
|
|
54
|
+
self.seen_entity_ids.add(entity.get_id())
|
|
55
|
+
return False
|
|
56
|
+
|
|
57
|
+
def edge_seen(self, relationship: BioCypherEdge) -> bool:
|
|
58
|
+
"""
|
|
59
|
+
Adds an edge to the instance and checks if it has been seen before.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
edge: BioCypherEdge to be added.
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
True if the edge has been seen before, False otherwise.
|
|
66
|
+
"""
|
|
67
|
+
if relationship.get_type() not in self.seen_relationships:
|
|
68
|
+
self.seen_relationships[relationship.get_type()] = set()
|
|
69
|
+
|
|
70
|
+
# concatenate source and target if no id is present
|
|
71
|
+
if not relationship.get_id():
|
|
72
|
+
_id = (
|
|
73
|
+
f"{relationship.get_source_id()}_{relationship.get_target_id()}"
|
|
74
|
+
)
|
|
75
|
+
else:
|
|
76
|
+
_id = relationship.get_id()
|
|
77
|
+
|
|
78
|
+
if _id in self.seen_relationships[relationship.get_type()]:
|
|
79
|
+
self.duplicate_relationship_ids.add(_id)
|
|
80
|
+
if relationship.get_type() not in self.duplicate_relationship_types:
|
|
81
|
+
logger.warning(
|
|
82
|
+
f"Duplicate edge type {relationship.get_type()} found. "
|
|
83
|
+
)
|
|
84
|
+
self.duplicate_relationship_types.add(relationship.get_type())
|
|
85
|
+
return True
|
|
86
|
+
|
|
87
|
+
self.seen_relationships[relationship.get_type()].add(_id)
|
|
88
|
+
return False
|
|
89
|
+
|
|
90
|
+
def rel_as_node_seen(self, rel_as_node: BioCypherRelAsNode) -> bool:
|
|
91
|
+
"""
|
|
92
|
+
Adds a rel_as_node to the instance (one entity and two relationships)
|
|
93
|
+
and checks if it has been seen before. Only the node is relevant for
|
|
94
|
+
identifying the rel_as_node as a duplicate.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
rel_as_node: BioCypherRelAsNode to be added.
|
|
98
|
+
|
|
99
|
+
Returns:
|
|
100
|
+
True if the rel_as_node has been seen before, False otherwise.
|
|
101
|
+
"""
|
|
102
|
+
node = rel_as_node.get_node()
|
|
103
|
+
|
|
104
|
+
if node.get_label() not in self.seen_relationships:
|
|
105
|
+
self.seen_relationships[node.get_label()] = set()
|
|
106
|
+
|
|
107
|
+
# rel as node always has an id
|
|
108
|
+
_id = node.get_id()
|
|
109
|
+
|
|
110
|
+
if _id in self.seen_relationships[node.get_type()]:
|
|
111
|
+
self.duplicate_relationship_ids.add(_id)
|
|
112
|
+
if node.get_type() not in self.duplicate_relationship_types:
|
|
113
|
+
logger.warning(f"Duplicate edge type {node.get_type()} found. ")
|
|
114
|
+
self.duplicate_relationship_types.add(node.get_type())
|
|
115
|
+
return True
|
|
116
|
+
|
|
117
|
+
self.seen_relationships[node.get_type()].add(_id)
|
|
118
|
+
return False
|
|
119
|
+
|
|
120
|
+
def get_duplicate_nodes(self):
|
|
121
|
+
"""
|
|
122
|
+
Function to return a list of duplicate nodes.
|
|
123
|
+
|
|
124
|
+
Returns:
|
|
125
|
+
list: list of duplicate nodes
|
|
126
|
+
"""
|
|
127
|
+
|
|
128
|
+
if self.duplicate_entity_types:
|
|
129
|
+
return (self.duplicate_entity_types, self.duplicate_entity_ids)
|
|
130
|
+
else:
|
|
131
|
+
return None
|
|
132
|
+
|
|
133
|
+
def get_duplicate_edges(self):
|
|
134
|
+
"""
|
|
135
|
+
Function to return a list of duplicate edges.
|
|
136
|
+
|
|
137
|
+
Returns:
|
|
138
|
+
list: list of duplicate edges
|
|
139
|
+
"""
|
|
140
|
+
|
|
141
|
+
if self.duplicate_relationship_types:
|
|
142
|
+
return (
|
|
143
|
+
self.duplicate_relationship_types,
|
|
144
|
+
self.duplicate_relationship_ids,
|
|
145
|
+
)
|
|
146
|
+
else:
|
|
147
|
+
return None
|
|
@@ -269,7 +269,7 @@ class Ontology:
|
|
|
269
269
|
"""
|
|
270
270
|
|
|
271
271
|
self._head_ontology_meta = head_ontology
|
|
272
|
-
self.extended_schema = ontology_mapping.extended_schema
|
|
272
|
+
self.mapping = ontology_mapping
|
|
273
273
|
self._tail_ontology_meta = tail_ontologies
|
|
274
274
|
|
|
275
275
|
self._tail_ontologies = None
|
|
@@ -403,7 +403,7 @@ class Ontology:
|
|
|
403
403
|
if not self._nx_graph:
|
|
404
404
|
self._nx_graph = self._head_ontology.get_nx_graph().copy()
|
|
405
405
|
|
|
406
|
-
for key, value in self.extended_schema.items():
|
|
406
|
+
for key, value in self.mapping.extended_schema.items():
|
|
407
407
|
if not value.get("is_a"):
|
|
408
408
|
if self._nx_graph.has_node(value.get("synonym_for")):
|
|
409
409
|
continue
|
|
@@ -485,7 +485,7 @@ class Ontology:
|
|
|
485
485
|
setting the synonym as the primary node label.
|
|
486
486
|
"""
|
|
487
487
|
|
|
488
|
-
for key, value in self.extended_schema.items():
|
|
488
|
+
for key, value in self.mapping.extended_schema.items():
|
|
489
489
|
if key in self._nx_graph.nodes:
|
|
490
490
|
self._nx_graph.nodes[key].update(value)
|
|
491
491
|
|
|
@@ -541,9 +541,9 @@ class Ontology:
|
|
|
541
541
|
|
|
542
542
|
if not full:
|
|
543
543
|
# set of leaves and their intermediate parents up to the root
|
|
544
|
-
filter_nodes = set(self.extended_schema.keys())
|
|
544
|
+
filter_nodes = set(self.mapping.extended_schema.keys())
|
|
545
545
|
|
|
546
|
-
for node in self.extended_schema.keys():
|
|
546
|
+
for node in self.mapping.extended_schema.keys():
|
|
547
547
|
filter_nodes.update(self.get_ancestors(node).nodes)
|
|
548
548
|
|
|
549
549
|
# filter graph
|
|
@@ -557,11 +557,13 @@ class Ontology:
|
|
|
557
557
|
tree = _misc.create_tree_visualisation(G)
|
|
558
558
|
|
|
559
559
|
# add synonym information
|
|
560
|
-
for node in self.extended_schema:
|
|
561
|
-
if self.extended_schema[node].get("synonym_for"):
|
|
560
|
+
for node in self.mapping.extended_schema:
|
|
561
|
+
if not isinstance(self.mapping.extended_schema[node], dict):
|
|
562
|
+
continue
|
|
563
|
+
if self.mapping.extended_schema[node].get("synonym_for"):
|
|
562
564
|
tree.nodes[node].tag = (
|
|
563
565
|
f"{node} = "
|
|
564
|
-
f"{self.extended_schema[node].get('synonym_for')}"
|
|
566
|
+
f"{self.mapping.extended_schema[node].get('synonym_for')}"
|
|
565
567
|
)
|
|
566
568
|
|
|
567
569
|
tree.show()
|
|
@@ -602,7 +604,7 @@ class Ontology:
|
|
|
602
604
|
"node_id": self._get_current_id(),
|
|
603
605
|
"node_label": "BioCypher",
|
|
604
606
|
"properties": {
|
|
605
|
-
"schema": "self.extended_schema",
|
|
607
|
+
"schema": "self.ontology_mapping.extended_schema",
|
|
606
608
|
},
|
|
607
609
|
}
|
|
608
610
|
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
|
|
3
|
-
from ._create import BioCypherEdge, BioCypherNode
|
|
3
|
+
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
class Pandas:
|
|
7
|
-
def __init__(self,
|
|
8
|
-
self.ontology = ontology
|
|
7
|
+
def __init__(self, translator, deduplicator):
|
|
9
8
|
self.translator = translator
|
|
10
9
|
self.deduplicator = deduplicator
|
|
11
10
|
|
|
@@ -18,22 +17,48 @@ class Pandas:
|
|
|
18
17
|
"""
|
|
19
18
|
lists = {}
|
|
20
19
|
for entity in entities:
|
|
21
|
-
if
|
|
22
|
-
entity,
|
|
20
|
+
if (
|
|
21
|
+
not isinstance(entity, BioCypherNode)
|
|
22
|
+
and not isinstance(entity, BioCypherEdge)
|
|
23
|
+
and not isinstance(entity, BioCypherRelAsNode)
|
|
23
24
|
):
|
|
24
25
|
raise TypeError(
|
|
25
|
-
|
|
26
|
+
"Expected a BioCypherNode / BioCypherEdge / "
|
|
27
|
+
f"BioCypherRelAsNode, got {type(entity)}."
|
|
26
28
|
)
|
|
27
29
|
|
|
28
30
|
if isinstance(entity, BioCypherNode):
|
|
29
31
|
seen = self.deduplicator.node_seen(entity)
|
|
30
32
|
elif isinstance(entity, BioCypherEdge):
|
|
31
33
|
seen = self.deduplicator.edge_seen(entity)
|
|
34
|
+
elif isinstance(entity, BioCypherRelAsNode):
|
|
35
|
+
seen = self.deduplicator.rel_as_node_seen(entity)
|
|
32
36
|
|
|
33
37
|
if seen:
|
|
34
38
|
continue
|
|
35
39
|
|
|
36
|
-
|
|
40
|
+
if isinstance(entity, BioCypherRelAsNode):
|
|
41
|
+
node = entity.get_node()
|
|
42
|
+
source_edge = entity.get_source_edge()
|
|
43
|
+
target_edge = entity.get_target_edge()
|
|
44
|
+
|
|
45
|
+
_type = node.get_type()
|
|
46
|
+
if not _type in lists:
|
|
47
|
+
lists[_type] = []
|
|
48
|
+
lists[_type].append(node)
|
|
49
|
+
|
|
50
|
+
_source_type = source_edge.get_type()
|
|
51
|
+
if not _source_type in lists:
|
|
52
|
+
lists[_source_type] = []
|
|
53
|
+
lists[_source_type].append(source_edge)
|
|
54
|
+
|
|
55
|
+
_target_type = target_edge.get_type()
|
|
56
|
+
if not _target_type in lists:
|
|
57
|
+
lists[_target_type] = []
|
|
58
|
+
lists[_target_type].append(target_edge)
|
|
59
|
+
continue
|
|
60
|
+
|
|
61
|
+
_type = entity.get_type()
|
|
37
62
|
if not _type in lists:
|
|
38
63
|
lists[_type] = []
|
|
39
64
|
lists[_type].append(entity)
|
|
@@ -23,7 +23,7 @@ from more_itertools import peekable
|
|
|
23
23
|
|
|
24
24
|
from . import _misc
|
|
25
25
|
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
26
|
-
from .
|
|
26
|
+
from ._ontology import Ontology
|
|
27
27
|
|
|
28
28
|
__all__ = ["BiolinkAdapter", "Translator"]
|
|
29
29
|
|
|
@@ -41,9 +41,7 @@ class Translator:
|
|
|
41
41
|
and cypher queries.
|
|
42
42
|
"""
|
|
43
43
|
|
|
44
|
-
def __init__(
|
|
45
|
-
self, ontology_mapping: "OntologyMapping", strict_mode: bool = False
|
|
46
|
-
):
|
|
44
|
+
def __init__(self, ontology: "Ontology", strict_mode: bool = False):
|
|
47
45
|
"""
|
|
48
46
|
Args:
|
|
49
47
|
leaves:
|
|
@@ -57,7 +55,7 @@ class Translator:
|
|
|
57
55
|
carry source, licence, and version information.
|
|
58
56
|
"""
|
|
59
57
|
|
|
60
|
-
self.extended_schema = ontology_mapping.extended_schema
|
|
58
|
+
self.ontology = ontology
|
|
61
59
|
self.strict_mode = strict_mode
|
|
62
60
|
|
|
63
61
|
# record nodes without biolink type configured in schema_config.yaml
|
|
@@ -71,7 +69,7 @@ class Translator:
|
|
|
71
69
|
|
|
72
70
|
def translate_nodes(
|
|
73
71
|
self,
|
|
74
|
-
|
|
72
|
+
node_tuples: Iterable,
|
|
75
73
|
) -> Generator[BioCypherNode, None, None]:
|
|
76
74
|
"""
|
|
77
75
|
Translates input node representation to a representation that
|
|
@@ -79,16 +77,16 @@ class Translator:
|
|
|
79
77
|
requires explicit statement of node type on pass.
|
|
80
78
|
|
|
81
79
|
Args:
|
|
82
|
-
|
|
80
|
+
node_tuples (list of tuples): collection of tuples
|
|
83
81
|
representing individual nodes by their unique id and a type
|
|
84
82
|
that is translated from the original database notation to
|
|
85
83
|
the corresponding BioCypher notation.
|
|
86
84
|
|
|
87
85
|
"""
|
|
88
86
|
|
|
89
|
-
self._log_begin_translate(
|
|
87
|
+
self._log_begin_translate(node_tuples, "nodes")
|
|
90
88
|
|
|
91
|
-
for _id, _type, _props in
|
|
89
|
+
for _id, _type, _props in node_tuples:
|
|
92
90
|
# check for strict mode requirements
|
|
93
91
|
required_props = ["source", "licence", "version"]
|
|
94
92
|
|
|
@@ -132,8 +130,9 @@ class Translator:
|
|
|
132
130
|
"""
|
|
133
131
|
|
|
134
132
|
return (
|
|
135
|
-
self.extended_schema[_bl_type]["preferred_id"]
|
|
136
|
-
if "preferred_id"
|
|
133
|
+
self.ontology.mapping.extended_schema[_bl_type]["preferred_id"]
|
|
134
|
+
if "preferred_id"
|
|
135
|
+
in self.ontology.mapping.extended_schema.get(_bl_type, {})
|
|
137
136
|
else "id"
|
|
138
137
|
)
|
|
139
138
|
|
|
@@ -142,7 +141,9 @@ class Translator:
|
|
|
142
141
|
Filters properties for those specified in schema_config if any.
|
|
143
142
|
"""
|
|
144
143
|
|
|
145
|
-
filter_props = self.extended_schema[bl_type].get(
|
|
144
|
+
filter_props = self.ontology.mapping.extended_schema[bl_type].get(
|
|
145
|
+
"properties", {}
|
|
146
|
+
)
|
|
146
147
|
|
|
147
148
|
# strict mode: add required properties (only if there is a whitelist)
|
|
148
149
|
if self.strict_mode and filter_props:
|
|
@@ -150,7 +151,7 @@ class Translator:
|
|
|
150
151
|
{"source": "str", "licence": "str", "version": "str"},
|
|
151
152
|
)
|
|
152
153
|
|
|
153
|
-
exclude_props = self.extended_schema[bl_type].get(
|
|
154
|
+
exclude_props = self.ontology.mapping.extended_schema[bl_type].get(
|
|
154
155
|
"exclude_properties", []
|
|
155
156
|
)
|
|
156
157
|
|
|
@@ -188,7 +189,7 @@ class Translator:
|
|
|
188
189
|
|
|
189
190
|
def translate_edges(
|
|
190
191
|
self,
|
|
191
|
-
|
|
192
|
+
edge_tuples: Iterable,
|
|
192
193
|
) -> Generator[Union[BioCypherEdge, BioCypherRelAsNode], None, None]:
|
|
193
194
|
"""
|
|
194
195
|
Translates input edge representation to a representation that
|
|
@@ -197,7 +198,7 @@ class Translator:
|
|
|
197
198
|
|
|
198
199
|
Args:
|
|
199
200
|
|
|
200
|
-
|
|
201
|
+
edge_tuples (list of tuples):
|
|
201
202
|
|
|
202
203
|
collection of tuples representing source and target of
|
|
203
204
|
an interaction via their unique ids as well as the type
|
|
@@ -206,18 +207,18 @@ class Translator:
|
|
|
206
207
|
Can optionally possess its own ID.
|
|
207
208
|
"""
|
|
208
209
|
|
|
209
|
-
self._log_begin_translate(
|
|
210
|
+
self._log_begin_translate(edge_tuples, "edges")
|
|
210
211
|
|
|
211
212
|
# legacy: deal with 4-tuples (no edge id)
|
|
212
213
|
# TODO remove for performance reasons once safe
|
|
213
|
-
|
|
214
|
-
if len(
|
|
215
|
-
|
|
214
|
+
edge_tuples = peekable(edge_tuples)
|
|
215
|
+
if len(edge_tuples.peek()) == 4:
|
|
216
|
+
edge_tuples = [
|
|
216
217
|
(None, src, tar, typ, props)
|
|
217
|
-
for src, tar, typ, props in
|
|
218
|
+
for src, tar, typ, props in edge_tuples
|
|
218
219
|
]
|
|
219
220
|
|
|
220
|
-
for _id, _src, _tar, _type, _props in
|
|
221
|
+
for _id, _src, _tar, _type, _props in edge_tuples:
|
|
221
222
|
# check for strict mode requirements
|
|
222
223
|
if self.strict_mode:
|
|
223
224
|
if not "source" in _props:
|
|
@@ -239,7 +240,9 @@ class Translator:
|
|
|
239
240
|
# filter properties for those specified in schema_config if any
|
|
240
241
|
_filtered_props = self._filter_props(bl_type, _props)
|
|
241
242
|
|
|
242
|
-
rep = self.extended_schema[bl_type][
|
|
243
|
+
rep = self.ontology.mapping.extended_schema[bl_type][
|
|
244
|
+
"represented_as"
|
|
245
|
+
]
|
|
243
246
|
|
|
244
247
|
if rep == "node":
|
|
245
248
|
if _id:
|
|
@@ -295,9 +298,9 @@ class Translator:
|
|
|
295
298
|
yield BioCypherRelAsNode(n, e_s, e_t)
|
|
296
299
|
|
|
297
300
|
else:
|
|
298
|
-
edge_label = self.extended_schema[
|
|
299
|
-
|
|
300
|
-
)
|
|
301
|
+
edge_label = self.ontology.mapping.extended_schema[
|
|
302
|
+
bl_type
|
|
303
|
+
].get("label_as_edge")
|
|
301
304
|
|
|
302
305
|
if edge_label is None:
|
|
303
306
|
edge_label = bl_type
|
|
@@ -356,7 +359,7 @@ class Translator:
|
|
|
356
359
|
|
|
357
360
|
self._ontology_mapping = {}
|
|
358
361
|
|
|
359
|
-
for key, value in self.extended_schema.items():
|
|
362
|
+
for key, value in self.ontology.mapping.extended_schema.items():
|
|
360
363
|
labels = value.get("input_label") or value.get("label_in_input")
|
|
361
364
|
|
|
362
365
|
if isinstance(labels, str):
|
|
@@ -125,7 +125,6 @@ class _BatchWriter(ABC):
|
|
|
125
125
|
|
|
126
126
|
def __init__(
|
|
127
127
|
self,
|
|
128
|
-
ontology: "Ontology",
|
|
129
128
|
translator: "Translator",
|
|
130
129
|
deduplicator: "Deduplicator",
|
|
131
130
|
delimiter: str,
|
|
@@ -167,10 +166,6 @@ class _BatchWriter(ABC):
|
|
|
167
166
|
- _get_import_script_name
|
|
168
167
|
|
|
169
168
|
Args:
|
|
170
|
-
ontology:
|
|
171
|
-
Instance of :py:class:`Ontology` to enable translation and
|
|
172
|
-
ontology queries
|
|
173
|
-
|
|
174
169
|
translator:
|
|
175
170
|
Instance of :py:class:`Translator` to enable translation of
|
|
176
171
|
nodes and manipulation of properties.
|
|
@@ -251,8 +246,6 @@ class _BatchWriter(ABC):
|
|
|
251
246
|
self.wipe = wipe
|
|
252
247
|
self.strict_mode = strict_mode
|
|
253
248
|
|
|
254
|
-
self.extended_schema = ontology.extended_schema
|
|
255
|
-
self.ontology = ontology
|
|
256
249
|
self.translator = translator
|
|
257
250
|
self.deduplicator = deduplicator
|
|
258
251
|
self.node_property_dict = {}
|
|
@@ -352,34 +345,34 @@ class _BatchWriter(ABC):
|
|
|
352
345
|
bool: The return value. True for success, False otherwise.
|
|
353
346
|
"""
|
|
354
347
|
passed = False
|
|
355
|
-
# unwrap generator in one step
|
|
356
348
|
edges = list(edges) # force evaluation to handle empty generator
|
|
357
349
|
if edges:
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
],
|
|
366
|
-
)
|
|
367
|
-
if isinstance(e, BioCypherRelAsNode)
|
|
368
|
-
else (None, [e])
|
|
369
|
-
for e in edges
|
|
370
|
-
)
|
|
371
|
-
)
|
|
372
|
-
nod, edg = (list(a) for a in z)
|
|
373
|
-
nod = [n for n in nod if n]
|
|
374
|
-
edg = [val for sublist in edg for val in sublist] # flatten
|
|
350
|
+
nodes_flat = []
|
|
351
|
+
edges_flat = []
|
|
352
|
+
for edge in edges:
|
|
353
|
+
if isinstance(edge, BioCypherRelAsNode):
|
|
354
|
+
# check if relationship has already been written, if so skip
|
|
355
|
+
if self.deduplicator.rel_as_node_seen(edge):
|
|
356
|
+
continue
|
|
375
357
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
358
|
+
nodes_flat.append(edge.get_node())
|
|
359
|
+
edges_flat.append(edge.get_source_edge())
|
|
360
|
+
edges_flat.append(edge.get_target_edge())
|
|
361
|
+
|
|
362
|
+
else:
|
|
363
|
+
# check if relationship has already been written, if so skip
|
|
364
|
+
if self.deduplicator.edge_seen(edge):
|
|
365
|
+
continue
|
|
366
|
+
|
|
367
|
+
edges_flat.append(edge)
|
|
368
|
+
|
|
369
|
+
if nodes_flat and edges_flat:
|
|
370
|
+
passed = self.write_nodes(nodes_flat) and self._write_edge_data(
|
|
371
|
+
edges_flat,
|
|
379
372
|
batch_size,
|
|
380
373
|
)
|
|
381
374
|
else:
|
|
382
|
-
passed = self._write_edge_data(
|
|
375
|
+
passed = self._write_edge_data(edges_flat, batch_size)
|
|
383
376
|
|
|
384
377
|
else:
|
|
385
378
|
# is this a problem? if the generator or list is empty, we
|
|
@@ -451,8 +444,12 @@ class _BatchWriter(ABC):
|
|
|
451
444
|
bin_l[label] = 1
|
|
452
445
|
|
|
453
446
|
# get properties from config if present
|
|
454
|
-
cprops =
|
|
455
|
-
|
|
447
|
+
cprops = (
|
|
448
|
+
self.translator.ontology.mapping.extended_schema.get(
|
|
449
|
+
label
|
|
450
|
+
).get(
|
|
451
|
+
"properties",
|
|
452
|
+
)
|
|
456
453
|
)
|
|
457
454
|
if cprops:
|
|
458
455
|
d = dict(cprops)
|
|
@@ -486,7 +483,7 @@ class _BatchWriter(ABC):
|
|
|
486
483
|
|
|
487
484
|
# get label hierarchy
|
|
488
485
|
# multiple labels:
|
|
489
|
-
all_labels = self.ontology.get_ancestors(label)
|
|
486
|
+
all_labels = self.translator.ontology.get_ancestors(label)
|
|
490
487
|
|
|
491
488
|
if all_labels:
|
|
492
489
|
# convert to pascal case
|
|
@@ -682,10 +679,6 @@ class _BatchWriter(ABC):
|
|
|
682
679
|
# for each label to check for consistency and their type
|
|
683
680
|
# for now, relevant for `int`
|
|
684
681
|
for edge in edges:
|
|
685
|
-
# check for duplicates
|
|
686
|
-
if self.deduplicator.edge_seen(edge):
|
|
687
|
-
continue
|
|
688
|
-
|
|
689
682
|
if not (edge.get_source_id() and edge.get_target_id()):
|
|
690
683
|
logger.error(
|
|
691
684
|
"Edge must have source and target node. "
|
|
@@ -706,13 +699,23 @@ class _BatchWriter(ABC):
|
|
|
706
699
|
# (may not be if it is an edge that carries the
|
|
707
700
|
# "label_as_edge" property)
|
|
708
701
|
cprops = None
|
|
709
|
-
if
|
|
710
|
-
|
|
702
|
+
if (
|
|
703
|
+
label
|
|
704
|
+
in self.translator.ontology.mapping.extended_schema
|
|
705
|
+
):
|
|
706
|
+
cprops = self.translator.ontology.mapping.extended_schema.get(
|
|
707
|
+
label
|
|
708
|
+
).get(
|
|
711
709
|
"properties",
|
|
712
710
|
)
|
|
713
711
|
else:
|
|
714
712
|
# try via "label_as_edge"
|
|
715
|
-
for
|
|
713
|
+
for (
|
|
714
|
+
k,
|
|
715
|
+
v,
|
|
716
|
+
) in (
|
|
717
|
+
self.translator.ontology.mapping.extended_schema.items()
|
|
718
|
+
):
|
|
716
719
|
if isinstance(v, dict):
|
|
717
720
|
if v.get("label_as_edge") == label:
|
|
718
721
|
cprops = v.get("properties")
|
|
@@ -873,9 +876,14 @@ class _BatchWriter(ABC):
|
|
|
873
876
|
|
|
874
877
|
if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
|
|
875
878
|
skip_id = True
|
|
876
|
-
elif not self.extended_schema.get(
|
|
879
|
+
elif not self.translator.ontology.mapping.extended_schema.get(
|
|
880
|
+
label
|
|
881
|
+
):
|
|
877
882
|
# find label in schema by label_as_edge
|
|
878
|
-
for
|
|
883
|
+
for (
|
|
884
|
+
k,
|
|
885
|
+
v,
|
|
886
|
+
) in self.translator.ontology.mapping.extended_schema.items():
|
|
879
887
|
if v.get("label_as_edge") == label:
|
|
880
888
|
schema_label = k
|
|
881
889
|
break
|
|
@@ -884,7 +892,9 @@ class _BatchWriter(ABC):
|
|
|
884
892
|
|
|
885
893
|
if schema_label:
|
|
886
894
|
if (
|
|
887
|
-
self.extended_schema.get(
|
|
895
|
+
self.translator.ontology.mapping.extended_schema.get(
|
|
896
|
+
schema_label
|
|
897
|
+
).get("use_id")
|
|
888
898
|
== False
|
|
889
899
|
):
|
|
890
900
|
skip_id = True
|
|
@@ -1009,6 +1019,7 @@ class _Neo4jBatchWriter(_BatchWriter):
|
|
|
1009
1019
|
|
|
1010
1020
|
This class inherits from the abstract class "_BatchWriter" and implements the
|
|
1011
1021
|
Neo4j-specific methods:
|
|
1022
|
+
|
|
1012
1023
|
- _write_node_headers
|
|
1013
1024
|
- _write_edge_headers
|
|
1014
1025
|
- _construct_import_call
|
|
@@ -1181,9 +1192,14 @@ class _Neo4jBatchWriter(_BatchWriter):
|
|
|
1181
1192
|
|
|
1182
1193
|
if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
|
|
1183
1194
|
skip_id = True
|
|
1184
|
-
elif not self.extended_schema.get(
|
|
1195
|
+
elif not self.translator.ontology.mapping.extended_schema.get(
|
|
1196
|
+
label
|
|
1197
|
+
):
|
|
1185
1198
|
# find label in schema by label_as_edge
|
|
1186
|
-
for
|
|
1199
|
+
for (
|
|
1200
|
+
k,
|
|
1201
|
+
v,
|
|
1202
|
+
) in self.translator.ontology.mapping.extended_schema.items():
|
|
1187
1203
|
if v.get("label_as_edge") == label:
|
|
1188
1204
|
schema_label = k
|
|
1189
1205
|
break
|
|
@@ -1194,7 +1210,9 @@ class _Neo4jBatchWriter(_BatchWriter):
|
|
|
1194
1210
|
|
|
1195
1211
|
if schema_label:
|
|
1196
1212
|
if (
|
|
1197
|
-
self.extended_schema.get(
|
|
1213
|
+
self.translator.ontology.mapping.extended_schema.get(
|
|
1214
|
+
schema_label
|
|
1215
|
+
).get("use_id")
|
|
1198
1216
|
== False
|
|
1199
1217
|
):
|
|
1200
1218
|
skip_id = True
|
|
@@ -1352,9 +1370,9 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
|
|
|
1352
1370
|
f.write(row)
|
|
1353
1371
|
|
|
1354
1372
|
# add collection from schema config
|
|
1355
|
-
collection = self.extended_schema[
|
|
1356
|
-
|
|
1357
|
-
)
|
|
1373
|
+
collection = self.translator.ontology.mapping.extended_schema[
|
|
1374
|
+
label
|
|
1375
|
+
].get("db_collection_name", None)
|
|
1358
1376
|
|
|
1359
1377
|
# add file path to neo4 admin import statement
|
|
1360
1378
|
# do once for each part file
|
|
@@ -1433,16 +1451,19 @@ class _ArangoDBBatchWriter(_Neo4jBatchWriter):
|
|
|
1433
1451
|
f.write(row)
|
|
1434
1452
|
|
|
1435
1453
|
# add collection from schema config
|
|
1436
|
-
if not self.extended_schema.get(label):
|
|
1437
|
-
for
|
|
1454
|
+
if not self.translator.ontology.mapping.extended_schema.get(label):
|
|
1455
|
+
for (
|
|
1456
|
+
_,
|
|
1457
|
+
v,
|
|
1458
|
+
) in self.translator.ontology.mapping.extended_schema.items():
|
|
1438
1459
|
if v.get("label_as_edge") == label:
|
|
1439
1460
|
collection = v.get("db_collection_name", None)
|
|
1440
1461
|
break
|
|
1441
1462
|
|
|
1442
1463
|
else:
|
|
1443
|
-
collection = self.extended_schema[
|
|
1444
|
-
|
|
1445
|
-
)
|
|
1464
|
+
collection = self.translator.ontology.mapping.extended_schema[
|
|
1465
|
+
label
|
|
1466
|
+
].get("db_collection_name", None)
|
|
1446
1467
|
|
|
1447
1468
|
# add file path to neo4 admin import statement (import call path
|
|
1448
1469
|
# may be different from actual output path)
|
|
@@ -1520,6 +1541,7 @@ class _PostgreSQLBatchWriter(_BatchWriter):
|
|
|
1520
1541
|
|
|
1521
1542
|
This class inherits from the abstract class "_BatchWriter" and implements the
|
|
1522
1543
|
PostgreSQL-specific methods:
|
|
1544
|
+
|
|
1523
1545
|
- _write_node_headers
|
|
1524
1546
|
- _write_edge_headers
|
|
1525
1547
|
- _construct_import_call
|
|
@@ -1839,7 +1861,6 @@ DBMS_TO_CLASS = {
|
|
|
1839
1861
|
def get_writer(
|
|
1840
1862
|
dbms: str,
|
|
1841
1863
|
translator: "Translator",
|
|
1842
|
-
ontology: "Ontology",
|
|
1843
1864
|
deduplicator: "Deduplicator",
|
|
1844
1865
|
output_directory: str,
|
|
1845
1866
|
strict_mode: bool,
|
|
@@ -1854,8 +1875,6 @@ def get_writer(
|
|
|
1854
1875
|
|
|
1855
1876
|
translator: the Translator object.
|
|
1856
1877
|
|
|
1857
|
-
ontology: the Ontology object.
|
|
1858
|
-
|
|
1859
1878
|
output_directory: the directory to write the output files to.
|
|
1860
1879
|
|
|
1861
1880
|
strict_mode: whether to use strict mode.
|
|
@@ -1879,7 +1898,6 @@ def get_writer(
|
|
|
1879
1898
|
|
|
1880
1899
|
if writer is not None:
|
|
1881
1900
|
return writer(
|
|
1882
|
-
ontology=ontology,
|
|
1883
1901
|
translator=translator,
|
|
1884
1902
|
deduplicator=deduplicator,
|
|
1885
1903
|
delimiter=dbms_config.get("delimiter"),
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "biocypher"
|
|
3
|
-
version = "0.5.
|
|
3
|
+
version = "0.5.20"
|
|
4
4
|
description = "A unifying framework for biomedical research knowledge graphs"
|
|
5
5
|
authors = [
|
|
6
6
|
"Sebastian Lobentanzer <sebastian.lobentanzer@gmail.com>",
|
|
@@ -54,6 +54,8 @@ isort = "^5.10.1"
|
|
|
54
54
|
ipython = "^8.7.0"
|
|
55
55
|
ipykernel = "^6.23.1"
|
|
56
56
|
sphinxext-opengraph = "^0.8.2"
|
|
57
|
+
coverage-badge = "^1.1.0"
|
|
58
|
+
nbsphinx = "^0.9.2"
|
|
57
59
|
|
|
58
60
|
[build-system]
|
|
59
61
|
requires = ["poetry-core>=1.0.0"]
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from setuptools import setup
|
|
3
|
+
|
|
4
|
+
packages = \
|
|
5
|
+
['biocypher', 'biocypher._config']
|
|
6
|
+
|
|
7
|
+
package_data = \
|
|
8
|
+
{'': ['*']}
|
|
9
|
+
|
|
10
|
+
install_requires = \
|
|
11
|
+
['PyYAML>=5.0',
|
|
12
|
+
'appdirs',
|
|
13
|
+
'more_itertools',
|
|
14
|
+
'neo4j-utils==0.0.7',
|
|
15
|
+
'networkx>=3.0,<4.0',
|
|
16
|
+
'pandas>=2.0.1,<3.0.0',
|
|
17
|
+
'rdflib>=6.2.0,<7.0.0',
|
|
18
|
+
'stringcase>=1.2.0,<2.0.0',
|
|
19
|
+
'treelib>=1.6.1,<2.0.0']
|
|
20
|
+
|
|
21
|
+
setup_kwargs = {
|
|
22
|
+
'name': 'biocypher',
|
|
23
|
+
'version': '0.5.20',
|
|
24
|
+
'description': 'A unifying framework for biomedical research knowledge graphs',
|
|
25
|
+
'long_description': '# BioCypher\n[](https://opensource.org/licenses/MIT)\n\n\n[](https://badge.fury.io/py/biocypher)\n[](https://www.repostatus.org/#active)\n[](https://github.com/biocypher/biocypher/actions/workflows/ci_cd.yaml)\n\n[](https://github.com/biocypher/biocypher/actions/workflows/sphinx_autodoc.yaml)\n[](https://pepy.tech/project/biocypher)\n[](https://github.com/pre-commit/pre-commit)\n[](http://makeapullrequest.com)\n[](https://github.com/biopragmatics/bioregistry)\n\n## ❓ Description\nKnowledge graphs (KGs) are an [approach to knowledge\nrepresentation](https://en.wikipedia.org/wiki/Knowledge_graph) that uses graph\nstructure to facilitate exploration and analysis of complex data, often\nleveraging semantic information. They are popular in many research areas,\nincluding the life sciences, due to their versatile use, for instance in data\nstorage, integration, reasoning, and more recently in artificial intelligence.\nThe creation of KGs is a complex task; BioCypher helps you in creating and\nmaintaining your own KG. For more overview, usage notes, and a tutorial, read\nthe docs [here](https://biocypher.org).\n\n<img\n style="display: block;\n margin-left: auto;\n margin-right: auto;\n width: 70%;"\n src="docs/graphical_abstract.png"\n alt="Graphical Abstract">\n</img>\n\n## 📖 Documentation\nTutorial and developer docs at https://biocypher.org. For a quickstart into your\nown pipeline, you can refer to our [project\ntemplate](https://github.com/biocypher/project-template), and for an overview of\nexisting and planned adapters for resources and outputs, as well as other\nfeatures, visit our [GitHub Project\nBoard](https://github.com/orgs/biocypher/projects/3/views/2).\n\n## ⚙️ Installation / Usage\nInstall the package from PyPI using `pip install biocypher`. 
More comprehensive\ninstallation and configuration instructions can be found\n[here](https://biocypher.org/installation.html).\n\nExemplary usage of BioCypher to build a graph database is shown in our tutorial\nand the various pipelines we have created. You can find these on the [Components\nProject Board](https://github.com/orgs/biocypher/projects/3/views/2).\n\n## 🤝 Getting involved\nWe are very happy about contributions from the community, large and small!\nIf you would like to contribute to BioCypher development, please refer to\nour [contribution guidelines](CONTRIBUTING.md). :)\n\nIf you want to ask informal questions, talk about dev things, or just chat,\nplease join our community at https://biocypher.zulipchat.com!\n\n> **Imposter syndrome disclaimer:** We want your help. No, really. There may be a little voice inside your head that is telling you that you\'re not ready, that you aren\'t skilled enough to contribute. We assure you that the little voice in your head is wrong. Most importantly, there are many valuable ways to contribute besides writing code.\n>\n> This disclaimer was adapted from the [Pooch](https://github.com/fatiando/pooch) project.\n\n## ✍️ Citation\nThe BioCypher paper has been peer-reviewed in\n[Nature Biotechnology](https://www.nature.com/articles/s41587-023-01848-y).\nBefore, it was available as a preprint at https://arxiv.org/abs/2212.13543.\n\n## Acknowledgements\nThis project has received funding from the European Union’s Horizon 2020\nresearch and innovation programme under grant agreement No 965193 for DECIDER\nand No 116030 for TransQST.\n',
|
|
26
|
+
'author': 'Sebastian Lobentanzer',
|
|
27
|
+
'author_email': 'sebastian.lobentanzer@gmail.com',
|
|
28
|
+
'maintainer': 'None',
|
|
29
|
+
'maintainer_email': 'None',
|
|
30
|
+
'url': 'https://github.com/biocypher/biocypher',
|
|
31
|
+
'packages': packages,
|
|
32
|
+
'package_data': package_data,
|
|
33
|
+
'install_requires': install_requires,
|
|
34
|
+
'python_requires': '>=3.9,<4.0',
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
setup(**setup_kwargs)
|
|
@@ -1,105 +0,0 @@
|
|
|
1
|
-
from ._logger import logger
|
|
2
|
-
|
|
3
|
-
logger.debug(f"Loading module {__name__}.")
|
|
4
|
-
|
|
5
|
-
from ._create import BioCypherEdge, BioCypherNode
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class Deduplicator:
|
|
9
|
-
"""
|
|
10
|
-
Singleton class responsible of deduplicating BioCypher inputs. Maintains
|
|
11
|
-
sets/dictionaries of node and edge types and their unique identifiers.
|
|
12
|
-
|
|
13
|
-
Nodes identifiers should be globally unique (represented as a set), while
|
|
14
|
-
edge identifiers are only unique per edge type (represented as a dict of
|
|
15
|
-
sets, keyed by edge type).
|
|
16
|
-
|
|
17
|
-
Stores collection of duplicate node and edge identifiers and types for
|
|
18
|
-
troubleshooting and to avoid overloading the log.
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
def __init__(self):
|
|
22
|
-
self.seen_node_ids = set()
|
|
23
|
-
self.duplicate_node_ids = set()
|
|
24
|
-
self.duplicate_node_types = set()
|
|
25
|
-
|
|
26
|
-
self.seen_edges = {}
|
|
27
|
-
self.duplicate_edge_ids = set()
|
|
28
|
-
self.duplicate_edge_types = set()
|
|
29
|
-
|
|
30
|
-
def node_seen(self, node: BioCypherNode) -> bool:
|
|
31
|
-
"""
|
|
32
|
-
Adds a node to the instance and checks if it has been seen before.
|
|
33
|
-
|
|
34
|
-
Args:
|
|
35
|
-
node: BioCypherNode to be added.
|
|
36
|
-
|
|
37
|
-
Returns:
|
|
38
|
-
True if the node has been seen before, False otherwise.
|
|
39
|
-
"""
|
|
40
|
-
if node.get_id() in self.seen_node_ids:
|
|
41
|
-
self.duplicate_node_ids.add(node.get_id())
|
|
42
|
-
if node.get_label() not in self.duplicate_node_types:
|
|
43
|
-
logger.warning(
|
|
44
|
-
f"Duplicate node type {node.get_label()} found. "
|
|
45
|
-
)
|
|
46
|
-
self.duplicate_node_types.add(node.get_label())
|
|
47
|
-
return True
|
|
48
|
-
|
|
49
|
-
self.seen_node_ids.add(node.get_id())
|
|
50
|
-
return False
|
|
51
|
-
|
|
52
|
-
def edge_seen(self, edge: BioCypherEdge) -> bool:
|
|
53
|
-
"""
|
|
54
|
-
Adds an edge to the instance and checks if it has been seen before.
|
|
55
|
-
|
|
56
|
-
Args:
|
|
57
|
-
edge: BioCypherEdge to be added.
|
|
58
|
-
|
|
59
|
-
Returns:
|
|
60
|
-
True if the edge has been seen before, False otherwise.
|
|
61
|
-
"""
|
|
62
|
-
if edge.get_type() not in self.seen_edges:
|
|
63
|
-
self.seen_edges[edge.get_type()] = set()
|
|
64
|
-
|
|
65
|
-
# concatenate source and target if no id is present
|
|
66
|
-
if not edge.get_id():
|
|
67
|
-
_id = f"{edge.get_source_id()}_{edge.get_target_id()}"
|
|
68
|
-
else:
|
|
69
|
-
_id = edge.get_id()
|
|
70
|
-
|
|
71
|
-
if _id in self.seen_edges[edge.get_type()]:
|
|
72
|
-
self.duplicate_edge_ids.add(_id)
|
|
73
|
-
if edge.get_type() not in self.duplicate_edge_types:
|
|
74
|
-
logger.warning(f"Duplicate edge type {edge.get_type()} found. ")
|
|
75
|
-
self.duplicate_edge_types.add(edge.get_type())
|
|
76
|
-
return True
|
|
77
|
-
|
|
78
|
-
self.seen_edges[edge.get_type()].add(_id)
|
|
79
|
-
return False
|
|
80
|
-
|
|
81
|
-
def get_duplicate_nodes(self):
|
|
82
|
-
"""
|
|
83
|
-
Function to return a list of duplicate nodes.
|
|
84
|
-
|
|
85
|
-
Returns:
|
|
86
|
-
list: list of duplicate nodes
|
|
87
|
-
"""
|
|
88
|
-
|
|
89
|
-
if self.duplicate_node_types:
|
|
90
|
-
return (self.duplicate_node_types, self.duplicate_node_ids)
|
|
91
|
-
else:
|
|
92
|
-
return None
|
|
93
|
-
|
|
94
|
-
def get_duplicate_edges(self):
|
|
95
|
-
"""
|
|
96
|
-
Function to return a list of duplicate edges.
|
|
97
|
-
|
|
98
|
-
Returns:
|
|
99
|
-
list: list of duplicate edges
|
|
100
|
-
"""
|
|
101
|
-
|
|
102
|
-
if self.duplicate_edge_types:
|
|
103
|
-
return (self.duplicate_edge_types, self.duplicate_edge_ids)
|
|
104
|
-
else:
|
|
105
|
-
return None
|
biocypher-0.5.19/setup.py
DELETED
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
from setuptools import setup
|
|
3
|
-
|
|
4
|
-
packages = \
|
|
5
|
-
['biocypher', 'biocypher._config']
|
|
6
|
-
|
|
7
|
-
package_data = \
|
|
8
|
-
{'': ['*']}
|
|
9
|
-
|
|
10
|
-
install_requires = \
|
|
11
|
-
['PyYAML>=5.0',
|
|
12
|
-
'appdirs',
|
|
13
|
-
'more_itertools',
|
|
14
|
-
'neo4j-utils==0.0.7',
|
|
15
|
-
'networkx>=3.0,<4.0',
|
|
16
|
-
'pandas>=2.0.1,<3.0.0',
|
|
17
|
-
'rdflib>=6.2.0,<7.0.0',
|
|
18
|
-
'stringcase>=1.2.0,<2.0.0',
|
|
19
|
-
'treelib>=1.6.1,<2.0.0']
|
|
20
|
-
|
|
21
|
-
setup_kwargs = {
|
|
22
|
-
'name': 'biocypher',
|
|
23
|
-
'version': '0.5.19',
|
|
24
|
-
'description': 'A unifying framework for biomedical research knowledge graphs',
|
|
25
|
-
'long_description': '# BioCypher\n[](https://opensource.org/licenses/MIT)\n\n\n[](https://badge.fury.io/py/biocypher)\n[](https://www.repostatus.org/#active)\n\n[](https://pepy.tech/project/biocypher)\n[](https://github.com/pre-commit/pre-commit)\n[](http://makeapullrequest.com)\n[](https://github.com/biopragmatics/bioregistry)\n\n## ❓ Description\nKnowledge graphs (KGs) are an [approach to knowledge\nrepresentation](https://en.wikipedia.org/wiki/Knowledge_graph) that uses graph\nstructure to facilitate exploration and analysis of complex data, often\nleveraging semantic information. They are popular in many research areas,\nincluding the life sciences, due to their versatile use, for instance in data\nstorage, integration, reasoning, and more recently in artificial intelligence.\nThe creation of KGs is a complex task; BioCypher helps you in creating and\nmaintaining your own KG. For more overview, usage notes, and a tutorial, read\nthe docs [here](https://biocypher.org).\n\n<img\n style="display: block;\n margin-left: auto;\n margin-right: auto;\n width: 70%;"\n src="docs/graphical_abstract.png"\n alt="Graphical Abstract">\n</img>\n\n## 📖 Documentation\nTutorial and developer docs at https://biocypher.org. For a quickstart into your\nown pipeline, you can refer to our [project\ntemplate](https://github.com/biocypher/project-template), and for an overview of\nexisting and planned adapters for resources and outputs, as well as other\nfeatures, visit our [GitHub Project\nBoard](https://github.com/orgs/biocypher/projects/3/views/2).\n\n## ⚙️ Installation / Usage\nInstall the package from PyPI using `pip install biocypher`. More comprehensive\ninstallation and configuration instructions can be found\n[here](https://biocypher.org/installation.html).\n\nExemplary usage of BioCypher to build a graph database is shown in our tutorial\nand the various pipelines we have created. 
You can find these on the [Components\nProject Board](https://github.com/orgs/biocypher/projects/3/views/2).\n\n## 🤝 Getting involved\nWe are very happy about contributions from the community, large and small!\nIf you would like to contribute to BioCypher development, please refer to\nour [contribution guidelines](CONTRIBUTING.md). :)\n\nIf you want to ask informal questions, talk about dev things, or just chat,\nplease join our community at https://biocypher.zulipchat.com!\n\n> **Imposter syndrome disclaimer:** We want your help. No, really. There may be a little voice inside your head that is telling you that you\'re not ready, that you aren\'t skilled enough to contribute. We assure you that the little voice in your head is wrong. Most importantly, there are many valuable ways to contribute besides writing code.\n>\n> This disclaimer was adapted from the [Pooch](https://github.com/fatiando/pooch) project.\n\n## ✍️ Citation\nThe BioCypher paper has been peer-reviewed in\n[Nature Biotechnology](https://www.nature.com/articles/s41587-023-01848-y).\nBefore, it was available as a preprint at https://arxiv.org/abs/2212.13543.\n\n## Acknowledgements\nThis project has received funding from the European Union’s Horizon 2020\nresearch and innovation programme under grant agreement No 965193 for DECIDER\nand No 116030 for TransQST.\n',
|
|
26
|
-
'author': 'Sebastian Lobentanzer',
|
|
27
|
-
'author_email': 'sebastian.lobentanzer@gmail.com',
|
|
28
|
-
'maintainer': 'None',
|
|
29
|
-
'maintainer_email': 'None',
|
|
30
|
-
'url': 'https://github.com/biocypher/biocypher',
|
|
31
|
-
'packages': packages,
|
|
32
|
-
'package_data': package_data,
|
|
33
|
-
'install_requires': install_requires,
|
|
34
|
-
'python_requires': '>=3.9,<4.0',
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
setup(**setup_kwargs)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{biocypher-0.5.19 → biocypher-0.5.20}/biocypher/_config/test_schema_config_disconnected.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|