biocypher 0.5.19__tar.gz → 0.5.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biocypher might be problematic. Click here for more details.
- {biocypher-0.5.19 → biocypher-0.5.21}/PKG-INFO +6 -2
- {biocypher-0.5.19 → biocypher-0.5.21}/README.md +3 -1
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_connect.py +6 -12
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_core.py +135 -23
- biocypher-0.5.21/biocypher/_deduplicate.py +147 -0
- biocypher-0.5.21/biocypher/_get.py +299 -0
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_metadata.py +1 -1
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_ontology.py +11 -9
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_pandas.py +32 -7
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_translate.py +29 -26
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_write.py +75 -57
- {biocypher-0.5.19 → biocypher-0.5.21}/pyproject.toml +5 -1
- biocypher-0.5.21/setup.py +40 -0
- biocypher-0.5.19/biocypher/_deduplicate.py +0 -105
- biocypher-0.5.19/setup.py +0 -38
- {biocypher-0.5.19 → biocypher-0.5.21}/LICENSE +0 -0
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/__init__.py +0 -0
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_config/__init__.py +0 -0
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_config/biocypher_config.yaml +0 -0
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_config/test_config.yaml +0 -0
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_config/test_schema_config.yaml +0 -0
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_config/test_schema_config_disconnected.yaml +0 -0
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_config/test_schema_config_extended.yaml +0 -0
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_create.py +0 -0
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_logger.py +0 -0
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_mapping.py +0 -0
- {biocypher-0.5.19 → biocypher-0.5.21}/biocypher/_misc.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: biocypher
|
|
3
|
-
Version: 0.5.19
|
|
3
|
+
Version: 0.5.21
|
|
4
4
|
Summary: A unifying framework for biomedical research knowledge graphs
|
|
5
5
|
Home-page: https://github.com/biocypher/biocypher
|
|
6
6
|
License: MIT
|
|
@@ -25,8 +25,10 @@ Requires-Dist: more_itertools
|
|
|
25
25
|
Requires-Dist: neo4j-utils (==0.0.7)
|
|
26
26
|
Requires-Dist: networkx (>=3.0,<4.0)
|
|
27
27
|
Requires-Dist: pandas (>=2.0.1,<3.0.0)
|
|
28
|
+
Requires-Dist: pooch (>=1.7.0,<2.0.0)
|
|
28
29
|
Requires-Dist: rdflib (>=6.2.0,<7.0.0)
|
|
29
30
|
Requires-Dist: stringcase (>=1.2.0,<2.0.0)
|
|
31
|
+
Requires-Dist: tqdm (>=4.65.0,<5.0.0)
|
|
30
32
|
Requires-Dist: treelib (>=1.6.1,<2.0.0)
|
|
31
33
|
Project-URL: Bug Tracker, https://github.com/biocypher/biocypher/issues
|
|
32
34
|
Project-URL: Repository, https://github.com/biocypher/biocypher
|
|
@@ -38,7 +40,9 @@ Description-Content-Type: text/markdown
|
|
|
38
40
|

|
|
39
41
|
[](https://badge.fury.io/py/biocypher)
|
|
40
42
|
[](https://www.repostatus.org/#active)
|
|
41
|
-
](https://github.com/biocypher/biocypher/actions/workflows/ci_cd.yaml)
|
|
44
|
+

|
|
45
|
+
[](https://github.com/biocypher/biocypher/actions/workflows/sphinx_autodoc.yaml)
|
|
42
46
|
[](https://pepy.tech/project/biocypher)
|
|
43
47
|
[](https://github.com/pre-commit/pre-commit)
|
|
44
48
|
[](http://makeapullrequest.com)
|
|
@@ -4,7 +4,9 @@
|
|
|
4
4
|

|
|
5
5
|
[](https://badge.fury.io/py/biocypher)
|
|
6
6
|
[](https://www.repostatus.org/#active)
|
|
7
|
-
](https://github.com/biocypher/biocypher/actions/workflows/ci_cd.yaml)
|
|
8
|
+

|
|
9
|
+
[](https://github.com/biocypher/biocypher/actions/workflows/sphinx_autodoc.yaml)
|
|
8
10
|
[](https://pepy.tech/project/biocypher)
|
|
9
11
|
[](https://github.com/pre-commit/pre-commit)
|
|
10
12
|
[](http://makeapullrequest.com)
|
|
@@ -53,8 +53,6 @@ class _Neo4jDriver:
|
|
|
53
53
|
|
|
54
54
|
increment_version (bool): Whether to increment the version number.
|
|
55
55
|
|
|
56
|
-
ontology (Ontology): The ontology to use for mapping.
|
|
57
|
-
|
|
58
56
|
translator (Translator): The translator to use for mapping.
|
|
59
57
|
|
|
60
58
|
"""
|
|
@@ -66,14 +64,12 @@ class _Neo4jDriver:
|
|
|
66
64
|
user: str,
|
|
67
65
|
password: str,
|
|
68
66
|
multi_db: bool,
|
|
69
|
-
ontology: Ontology,
|
|
70
67
|
translator: Translator,
|
|
71
68
|
wipe: bool = False,
|
|
72
69
|
fetch_size: int = 1000,
|
|
73
70
|
increment_version: bool = True,
|
|
74
71
|
):
|
|
75
|
-
self.
|
|
76
|
-
self._translator = translator
|
|
72
|
+
self.translator = translator
|
|
77
73
|
|
|
78
74
|
self._driver = neo4j_utils.Driver(
|
|
79
75
|
db_name=database_name,
|
|
@@ -103,7 +99,7 @@ class _Neo4jDriver:
|
|
|
103
99
|
"MATCH (v:BioCypher) " "WHERE NOT (v)-[:PRECEDES]->() " "RETURN v",
|
|
104
100
|
)
|
|
105
101
|
# add version node
|
|
106
|
-
self.add_biocypher_nodes(self.
|
|
102
|
+
self.add_biocypher_nodes(self.translator.ontology)
|
|
107
103
|
|
|
108
104
|
# connect version node to previous
|
|
109
105
|
if db_version[0]:
|
|
@@ -111,7 +107,7 @@ class _Neo4jDriver:
|
|
|
111
107
|
previous_id = previous["v"]["id"]
|
|
112
108
|
e_meta = BioCypherEdge(
|
|
113
109
|
previous_id,
|
|
114
|
-
self.
|
|
110
|
+
self.translator.ontology.get_dict().get("node_id"),
|
|
115
111
|
"PRECEDES",
|
|
116
112
|
)
|
|
117
113
|
self.add_biocypher_edges(e_meta)
|
|
@@ -142,7 +138,7 @@ class _Neo4jDriver:
|
|
|
142
138
|
logger.info("Creating constraints for node types in config.")
|
|
143
139
|
|
|
144
140
|
# get structure
|
|
145
|
-
for leaf in self.
|
|
141
|
+
for leaf in self.translator.ontology.mapping.extended_schema.items():
|
|
146
142
|
label = _misc.sentencecase_to_pascalcase(leaf[0])
|
|
147
143
|
if leaf[1]["represented_as"] == "node":
|
|
148
144
|
s = (
|
|
@@ -172,7 +168,7 @@ class _Neo4jDriver:
|
|
|
172
168
|
- second entry: Neo4j summary.
|
|
173
169
|
"""
|
|
174
170
|
|
|
175
|
-
bn = self.
|
|
171
|
+
bn = self.translator.translate_nodes(id_type_tuples)
|
|
176
172
|
return self.add_biocypher_nodes(bn)
|
|
177
173
|
|
|
178
174
|
def add_edges(self, id_src_tar_type_tuples: Iterable[tuple]) -> tuple:
|
|
@@ -204,7 +200,7 @@ class _Neo4jDriver:
|
|
|
204
200
|
- second entry: Neo4j summary.
|
|
205
201
|
"""
|
|
206
202
|
|
|
207
|
-
bn = self.
|
|
203
|
+
bn = self.translator.translate_edges(id_src_tar_type_tuples)
|
|
208
204
|
return self.add_biocypher_edges(bn)
|
|
209
205
|
|
|
210
206
|
def add_biocypher_nodes(
|
|
@@ -375,7 +371,6 @@ class _Neo4jDriver:
|
|
|
375
371
|
def get_driver(
|
|
376
372
|
dbms: str,
|
|
377
373
|
translator: "Translator",
|
|
378
|
-
ontology: "Ontology",
|
|
379
374
|
):
|
|
380
375
|
"""
|
|
381
376
|
Function to return the writer class.
|
|
@@ -394,7 +389,6 @@ def get_driver(
|
|
|
394
389
|
user=dbms_config["user"],
|
|
395
390
|
password=dbms_config["password"],
|
|
396
391
|
multi_db=dbms_config["multi_db"],
|
|
397
|
-
ontology=ontology,
|
|
398
392
|
translator=translator,
|
|
399
393
|
)
|
|
400
394
|
|
|
@@ -13,8 +13,10 @@ BioCypher core module. Interfaces with the user and distributes tasks to
|
|
|
13
13
|
submodules.
|
|
14
14
|
"""
|
|
15
15
|
from typing import Optional
|
|
16
|
+
import os
|
|
16
17
|
|
|
17
18
|
from more_itertools import peekable
|
|
19
|
+
import yaml
|
|
18
20
|
|
|
19
21
|
import pandas as pd
|
|
20
22
|
|
|
@@ -22,10 +24,11 @@ from ._logger import logger
|
|
|
22
24
|
|
|
23
25
|
logger.debug(f"Loading module {__name__}.")
|
|
24
26
|
|
|
27
|
+
from ._get import Downloader
|
|
25
28
|
from ._write import get_writer
|
|
26
29
|
from ._config import config as _config
|
|
27
30
|
from ._config import update_from_file as _file_update
|
|
28
|
-
from ._create import BioCypherEdge, BioCypherNode
|
|
31
|
+
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
29
32
|
from ._pandas import Pandas
|
|
30
33
|
from ._connect import get_driver
|
|
31
34
|
from ._mapping import OntologyMapping
|
|
@@ -181,19 +184,6 @@ class BioCypher:
|
|
|
181
184
|
|
|
182
185
|
return self._ontology_mapping
|
|
183
186
|
|
|
184
|
-
def _get_translator(self) -> Translator:
|
|
185
|
-
"""
|
|
186
|
-
Create translator if not exists and return.
|
|
187
|
-
"""
|
|
188
|
-
|
|
189
|
-
if not self._translator:
|
|
190
|
-
self._translator = Translator(
|
|
191
|
-
ontology_mapping=self._get_ontology_mapping(),
|
|
192
|
-
strict_mode=self._strict_mode,
|
|
193
|
-
)
|
|
194
|
-
|
|
195
|
-
return self._translator
|
|
196
|
-
|
|
197
187
|
def _get_ontology(self) -> Ontology:
|
|
198
188
|
"""
|
|
199
189
|
Create ontology if not exists and return.
|
|
@@ -208,17 +198,28 @@ class BioCypher:
|
|
|
208
198
|
|
|
209
199
|
return self._ontology
|
|
210
200
|
|
|
201
|
+
def _get_translator(self) -> Translator:
    """
    Return the translator, building it on first use.

    When ``self._translator`` is falsy, a ``Translator`` is constructed from
    the ontology returned by ``_get_ontology()`` and the instance's
    strict-mode flag, and stored on ``self._translator`` for later calls.
    """

    translator = self._translator

    if not translator:
        translator = Translator(
            ontology=self._get_ontology(),
            strict_mode=self._strict_mode,
        )
        self._translator = translator

    return translator
|
|
213
|
+
|
|
211
214
|
def _get_writer(self):
|
|
212
215
|
"""
|
|
213
216
|
Create writer if not online. Set as instance variable `self._writer`.
|
|
214
217
|
"""
|
|
215
218
|
|
|
216
|
-
# Get worker
|
|
217
219
|
if self._offline:
|
|
218
220
|
self._writer = get_writer(
|
|
219
221
|
dbms=self._dbms,
|
|
220
222
|
translator=self._get_translator(),
|
|
221
|
-
ontology=self._get_ontology(),
|
|
222
223
|
deduplicator=self._get_deduplicator(),
|
|
223
224
|
output_directory=self._output_directory,
|
|
224
225
|
strict_mode=self._strict_mode,
|
|
@@ -235,7 +236,6 @@ class BioCypher:
|
|
|
235
236
|
self._driver = get_driver(
|
|
236
237
|
dbms=self._dbms,
|
|
237
238
|
translator=self._get_translator(),
|
|
238
|
-
ontology=self._get_ontology(),
|
|
239
239
|
deduplicator=self._get_deduplicator(),
|
|
240
240
|
)
|
|
241
241
|
else:
|
|
@@ -308,24 +308,33 @@ class BioCypher:
|
|
|
308
308
|
|
|
309
309
|
return self._pd.dfs
|
|
310
310
|
|
|
311
|
-
def add(self, entities):
|
|
311
|
+
def add(self, entities) -> None:
|
|
312
312
|
"""
|
|
313
313
|
Function to add entities to the in-memory database. Accepts an iterable
|
|
314
314
|
of tuples (if given, translates to ``BioCypherNode`` or
|
|
315
315
|
``BioCypherEdge`` objects) or an iterable of ``BioCypherNode`` or
|
|
316
316
|
``BioCypherEdge`` objects.
|
|
317
|
+
|
|
318
|
+
Args:
|
|
319
|
+
entities (iterable): An iterable of entities to add to the database.
|
|
320
|
+
Can be 3-tuples (nodes) or 5-tuples (edges); also accepts
|
|
321
|
+
4-tuples for edges (deprecated).
|
|
322
|
+
|
|
323
|
+
Returns:
|
|
324
|
+
None
|
|
317
325
|
"""
|
|
318
326
|
if not self._pd:
|
|
319
327
|
self._pd = Pandas(
|
|
320
328
|
translator=self._get_translator(),
|
|
321
|
-
ontology=self._get_ontology(),
|
|
322
329
|
deduplicator=self._get_deduplicator(),
|
|
323
330
|
)
|
|
324
331
|
|
|
325
332
|
entities = peekable(entities)
|
|
326
333
|
|
|
327
|
-
if
|
|
328
|
-
entities.peek(),
|
|
334
|
+
if (
|
|
335
|
+
isinstance(entities.peek(), BioCypherNode)
|
|
336
|
+
or isinstance(entities.peek(), BioCypherEdge)
|
|
337
|
+
or isinstance(entities.peek(), BioCypherRelAsNode)
|
|
329
338
|
):
|
|
330
339
|
tentities = entities
|
|
331
340
|
elif len(entities.peek()) < 4:
|
|
@@ -335,10 +344,28 @@ class BioCypher:
|
|
|
335
344
|
|
|
336
345
|
self._pd.add_tables(tentities)
|
|
337
346
|
|
|
338
|
-
def add_nodes(self, nodes):
|
|
347
|
+
def add_nodes(self, nodes) -> None:
|
|
348
|
+
"""
|
|
349
|
+
Wrapper for ``add()`` to add nodes to the in-memory database.
|
|
350
|
+
|
|
351
|
+
Args:
|
|
352
|
+
nodes (iterable): An iterable of node tuples to add to the database.
|
|
353
|
+
|
|
354
|
+
Returns:
|
|
355
|
+
None
|
|
356
|
+
"""
|
|
339
357
|
self.add(nodes)
|
|
340
358
|
|
|
341
|
-
def add_edges(self, edges):
|
|
359
|
+
def add_edges(self, edges) -> None:
|
|
360
|
+
"""
|
|
361
|
+
Wrapper for ``add()`` to add edges to the in-memory database.
|
|
362
|
+
|
|
363
|
+
Args:
|
|
364
|
+
edges (iterable): An iterable of edge tuples to add to the database.
|
|
365
|
+
|
|
366
|
+
Returns:
|
|
367
|
+
None
|
|
368
|
+
"""
|
|
342
369
|
self.add(edges)
|
|
343
370
|
|
|
344
371
|
def merge_nodes(self, nodes) -> bool:
|
|
@@ -389,6 +416,24 @@ class BioCypher:
|
|
|
389
416
|
# write edge files
|
|
390
417
|
return self._driver.add_biocypher_edges(tedges)
|
|
391
418
|
|
|
419
|
+
# DOWNLOAD AND CACHE MANAGEMENT METHODS ###
|
|
420
|
+
|
|
421
|
+
def _get_downloader(self):
    """
    Create downloader if not exists and return.

    Returns:
        The ``Downloader`` stored on ``self._downloader``; a new one is
        created first when that attribute is falsy.
    """

    if not self._downloader:
        self._downloader = Downloader()

    # Return the instance like the other ``_get_*`` accessors do, so callers
    # can use the result directly instead of reaching into the attribute.
    return self._downloader
|
|
428
|
+
|
|
429
|
+
def download(self, force: bool = False) -> None:
    """
    Use the :class:`Downloader` class to download or load from cache the
    resources given by the adapter.

    Args:
        force (bool): Not read anywhere in this method body.
            NOTE(review): presumably meant to force a re-download instead
            of using the cache - confirm once the download logic is wired
            in.

    Returns:
        None
    """

    # NOTE(review): this only makes sure a downloader exists; no download is
    # started from here.
    self._get_downloader()
|
|
436
|
+
|
|
392
437
|
# OVERVIEW AND CONVENIENCE METHODS ###
|
|
393
438
|
|
|
394
439
|
def log_missing_input_labels(self) -> Optional[dict[str, list[str]]]:
|
|
@@ -504,6 +549,73 @@ class BioCypher:
|
|
|
504
549
|
|
|
505
550
|
self._writer.write_import_call()
|
|
506
551
|
|
|
552
|
+
def write_schema_info(self) -> dict:
    """
    Write an extended schema info YAML file that extends the
    `schema_config.yaml` with run-time information of the built KG. For
    instance, include information on whether something is present in the
    actual knowledge graph, whether it is a relationship (which is important
    in the case of representing relationships as nodes) and the actual
    sources and targets of edges. Since this file can be used in place of
    the original `schema_config.yaml` file, it indicates that it is the
    extended schema by setting `is_schema_info` to `true`.

    We start by using the `extended_schema` dictionary from the ontology
    class instance, which contains all expanded entities and relationships.
    The information of whether something is a relationship can be gathered
    from the deduplicator instance, which keeps track of all entities that
    have been seen.

    Returns:
        dict: The annotated schema, i.e. the same content that is written
        to `schema_info.yaml` in the output directory.

    Raises:
        NotImplementedError: If the instance is not in offline mode.
    """

    if not self._offline:
        raise NotImplementedError("Cannot write schema info in online mode.")

    ontology = self._get_ontology()
    # NOTE(review): no copy is taken here, so unless `extended_schema` hands
    # out a fresh dict, the keys added below also land on the ontology
    # mapping itself - confirm that this is intended.
    schema = ontology.mapping.extended_schema
    schema["is_schema_info"] = True

    deduplicator = self._get_deduplicator()

    # mark every node type that was seen while building the graph
    for node in deduplicator.entity_types:
        if node in schema:
            schema[node]["present_in_knowledge_graph"] = True
            schema[node]["is_relationship"] = False
        else:
            logger.info(
                f"Node {node} not present in extended schema. "
                "Skipping schema info."
            )

    # find 'label_as_edge' cases in schema entries: map the label that was
    # seen as an edge type back to the schema key it belongs to
    seen_relationships = deduplicator.seen_relationships
    changed_labels = {}
    for k, v in schema.items():
        # skip non-dict entries such as the `is_schema_info` flag
        if not isinstance(v, dict):
            continue
        if "label_as_edge" in v and v["label_as_edge"] in seen_relationships:
            changed_labels[v["label_as_edge"]] = k

    # mark every relationship type that was seen while building the graph
    for edge in seen_relationships:
        # fall back to the edge type itself when no label was changed
        edge = changed_labels.get(edge, edge)
        if edge in schema:
            schema[edge]["present_in_knowledge_graph"] = True
            schema[edge]["is_relationship"] = True
            # TODO information about source and target nodes
        else:
            logger.info(
                f"Edge {edge} not present in extended schema. "
                "Skipping schema info."
            )

    # write to output directory as YAML file
    path = os.path.join(self._output_directory, "schema_info.yaml")
    with open(path, "w") as f:
        f.write(yaml.dump(schema))

    return schema
|
|
618
|
+
|
|
507
619
|
# TRANSLATION METHODS ###
|
|
508
620
|
|
|
509
621
|
def translate_term(self, term: str) -> str:
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
from ._logger import logger
|
|
2
|
+
|
|
3
|
+
logger.debug(f"Loading module {__name__}.")
|
|
4
|
+
|
|
5
|
+
from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Deduplicator:
    """
    Singleton class responsible for deduplicating BioCypher inputs. Maintains
    sets/dictionaries of node and edge types and their unique identifiers.

    Node identifiers should be globally unique (represented as a set), while
    edge identifiers are only unique per edge type (represented as a dict of
    sets, keyed by edge type).

    Stores collection of duplicate node and edge identifiers and types for
    troubleshooting and to avoid overloading the log.
    """

    def __init__(self):
        # node ids seen at least once / seen more than once
        self.seen_entity_ids = set()
        self.duplicate_entity_ids = set()

        # node labels seen at least once / labels that had a duplicate id
        self.entity_types = set()
        self.duplicate_entity_types = set()

        # edge type -> set of edge ids seen for that type
        self.seen_relationships = {}
        self.duplicate_relationship_ids = set()
        self.duplicate_relationship_types = set()

    def node_seen(self, entity: "BioCypherNode") -> bool:
        """
        Adds a node to the instance and checks if it has been seen before.

        Args:
            entity: BioCypherNode to be added.

        Returns:
            True if the node has been seen before, False otherwise.
        """
        label = entity.get_label()
        node_id = entity.get_id()

        self.entity_types.add(label)

        if node_id in self.seen_entity_ids:
            self.duplicate_entity_ids.add(node_id)
            # warn only once per label to avoid flooding the log
            if label not in self.duplicate_entity_types:
                logger.warning(f"Duplicate node type {label} found. ")
                self.duplicate_entity_types.add(label)
            return True

        self.seen_entity_ids.add(node_id)
        return False

    def edge_seen(self, relationship: "BioCypherEdge") -> bool:
        """
        Adds an edge to the instance and checks if it has been seen before.

        Args:
            relationship: BioCypherEdge to be added.

        Returns:
            True if the edge has been seen before, False otherwise.
        """
        edge_type = relationship.get_type()
        seen_ids = self.seen_relationships.setdefault(edge_type, set())

        # concatenate source and target if no id is present
        _id = relationship.get_id()
        if not _id:
            _id = (
                f"{relationship.get_source_id()}_{relationship.get_target_id()}"
            )

        if _id in seen_ids:
            self.duplicate_relationship_ids.add(_id)
            # warn only once per edge type to avoid flooding the log
            if edge_type not in self.duplicate_relationship_types:
                logger.warning(f"Duplicate edge type {edge_type} found. ")
                self.duplicate_relationship_types.add(edge_type)
            return True

        seen_ids.add(_id)
        return False

    def rel_as_node_seen(self, rel_as_node: "BioCypherRelAsNode") -> bool:
        """
        Adds a rel_as_node to the instance (one entity and two relationships)
        and checks if it has been seen before. Only the node is relevant for
        identifying the rel_as_node as a duplicate.

        Args:
            rel_as_node: BioCypherRelAsNode to be added.

        Returns:
            True if the rel_as_node has been seen before, False otherwise.
        """
        node = rel_as_node.get_node()

        # Use one key for creating, reading and updating the bucket; mixing
        # get_label() and get_type() fails as soon as the two values differ.
        rel_type = node.get_type()
        seen_ids = self.seen_relationships.setdefault(rel_type, set())

        # rel as node always has an id
        _id = node.get_id()

        if _id in seen_ids:
            self.duplicate_relationship_ids.add(_id)
            # warn only once per type to avoid flooding the log
            if rel_type not in self.duplicate_relationship_types:
                logger.warning(f"Duplicate edge type {rel_type} found. ")
                self.duplicate_relationship_types.add(rel_type)
            return True

        seen_ids.add(_id)
        return False

    def get_duplicate_nodes(self):
        """
        Function to return the duplicate nodes recorded so far.

        Returns:
            tuple: (set of duplicate node types, set of duplicate node ids),
            or None if no duplicate node type has been recorded.
        """

        if self.duplicate_entity_types:
            return (self.duplicate_entity_types, self.duplicate_entity_ids)

        return None

    def get_duplicate_edges(self):
        """
        Function to return the duplicate edges recorded so far.

        Returns:
            tuple: (set of duplicate edge types, set of duplicate edge ids),
            or None if no duplicate edge type has been recorded.
        """

        if self.duplicate_relationship_types:
            return (
                self.duplicate_relationship_types,
                self.duplicate_relationship_ids,
            )

        return None
|