biocypher 0.5.41__py3-none-any.whl → 0.5.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of biocypher might be problematic. Click here for more details.
- biocypher/_config/biocypher_config.yaml +15 -0
- biocypher/_core.py +3 -3
- biocypher/_metadata.py +1 -1
- biocypher/_misc.py +6 -1
- biocypher/_ontology.py +133 -53
- biocypher/{_connect.py → output/connect/_neo4j_driver.py} +5 -5
- biocypher/{_pandas.py → output/in_memory/_pandas.py} +2 -1
- biocypher/output/write/__init__.py +0 -0
- biocypher/{write → output/write}/_batch_writer.py +26 -22
- biocypher/{write/_write.py → output/write/_get_writer.py} +19 -11
- biocypher/output/write/_writer.py +200 -0
- biocypher/output/write/graph/__init__.py +0 -0
- biocypher/{write → output/write}/graph/_arangodb.py +1 -1
- biocypher/{write → output/write}/graph/_neo4j.py +2 -4
- biocypher/output/write/graph/_networkx.py +76 -0
- biocypher/output/write/graph/_rdf.py +515 -0
- biocypher/output/write/relational/__init__.py +0 -0
- biocypher/output/write/relational/_csv.py +76 -0
- biocypher/{write → output/write}/relational/_postgresql.py +2 -2
- biocypher/{write → output/write}/relational/_sqlite.py +1 -1
- {biocypher-0.5.41.dist-info → biocypher-0.5.43.dist-info}/METADATA +1 -1
- biocypher-0.5.43.dist-info/RECORD +39 -0
- biocypher-0.5.41.dist-info/RECORD +0 -32
- /biocypher/{write → output}/__init__.py +0 -0
- /biocypher/{write/graph → output/connect}/__init__.py +0 -0
- /biocypher/{write/relational → output/in_memory}/__init__.py +0 -0
- {biocypher-0.5.41.dist-info → biocypher-0.5.43.dist-info}/LICENSE +0 -0
- {biocypher-0.5.41.dist-info → biocypher-0.5.43.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Union, Optional
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
|
|
7
|
+
from biocypher._logger import logger
|
|
8
|
+
from biocypher._translate import Translator
|
|
9
|
+
from biocypher._deduplicate import Deduplicator
|
|
10
|
+
|
|
11
|
+
__all__ = ["_Writer"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class _Writer(ABC):
    """Abstract base class for writing node and edge representations to disk.

    Specifics of the different writers (e.g. neo4j, postgresql, csv, etc.)
    are implemented in the child classes. Any concrete writer needs to
    implement at least:
        - _write_node_data
        - _write_edge_data
        - _construct_import_call
        - _get_import_script_name

    Args:
        translator (Translator): Instance of :py:class:`Translator` to enable
            translation of nodes and manipulation of properties.
        deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to
            enable deduplication of nodes and edges.
        output_directory (str, optional): Path for exporting CSV files.
            Defaults to None.
        strict_mode (bool, optional): Whether to enforce source, version, and
            license properties. Defaults to False.

    Raises:
        NotImplementedError: Writer implementation must override '_write_node_data'
        NotImplementedError: Writer implementation must override '_write_edge_data'
        NotImplementedError: Writer implementation must override '_construct_import_call'
        NotImplementedError: Writer implementation must override '_get_import_script_name'
    """

    def __init__(
        self,
        translator: Translator,
        deduplicator: Deduplicator,
        output_directory: Optional[str] = None,
        strict_mode: bool = False,
        *args,
        **kwargs,
    ):
        """Store the collaborators and prepare the output directory.

        Args:
            translator (Translator): Instance of :py:class:`Translator` to
                enable translation of nodes and manipulation of properties.
            deduplicator (Deduplicator): Instance of :py:class:`Deduplicator`
                to enable deduplication of nodes and edges.
            output_directory (str, optional): Path for exporting CSV files.
                Defaults to None, in which case no directory is checked or
                created.
            strict_mode (bool, optional): Whether to enforce source, version,
                and license properties. Defaults to False.
            **kwargs: Only ``write_to_file`` (default True) is read here; it
                controls whether a warning is logged when the output
                directory already exists.
        """
        self.translator = translator
        self.deduplicator = deduplicator
        self.strict_mode = strict_mode
        self.output_directory = output_directory

        # `output_directory` defaults to None and os.path.exists(None) raises
        # TypeError, so skip all filesystem work when no directory was given.
        if self.output_directory is None:
            return

        if os.path.exists(self.output_directory):
            if kwargs.get("write_to_file", True):
                logger.warning(
                    f"Output directory `{self.output_directory}` already exists. "
                    "If this is not planned, file consistency may be compromised."
                )
        else:
            logger.info(f"Creating output directory `{self.output_directory}`.")
            # exist_ok guards against the directory being created by someone
            # else between the existence check and this call.
            os.makedirs(self.output_directory, exist_ok=True)

    @abstractmethod
    def _write_node_data(
        self,
        nodes: Iterable[
            Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
        ],
    ) -> bool:
        """Implement how to write nodes to disk.

        Args:
            nodes (Iterable): An iterable of BioCypherNode / BioCypherEdge /
                BioCypherRelAsNode objects.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        raise NotImplementedError(
            "Writer implementation must override '_write_node_data'"
        )

    @abstractmethod
    def _write_edge_data(
        self,
        edges: Iterable[
            Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
        ],
    ) -> bool:
        """Implement how to write edges to disk.

        Args:
            edges (Iterable): An iterable of BioCypherNode / BioCypherEdge /
                BioCypherRelAsNode objects.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        raise NotImplementedError(
            "Writer implementation must override '_write_edge_data'"
        )

    @abstractmethod
    def _construct_import_call(self) -> str:
        """Construct the import call for the written output.

        The call details folder and individual node and edge headers and
        data files, as well as delimiters and database name. Built after all
        data has been processed to ensure that nodes are called before any
        edges.

        Returns:
            str: command for importing the output files into a DBMS.
        """
        raise NotImplementedError(
            "Writer implementation must override '_construct_import_call'"
        )

    @abstractmethod
    def _get_import_script_name(self) -> str:
        """Return the name of the import script.

        Returns:
            str: The name of the import script (ending in .sh)
        """
        raise NotImplementedError(
            "Writer implementation must override '_get_import_script_name'"
        )

    def write_nodes(
        self, nodes, batch_size: int = int(1e6), force: bool = False
    ) -> bool:
        """Wrapper for writing nodes.

        Delegates to :py:meth:`_write_node_data` and logs an error when it
        reports failure.

        Args:
            nodes (BioCypherNode): a list or generator of nodes in
                :py:class:`BioCypherNode` format
            batch_size (int): The batch size for writing nodes. Not used by
                this base implementation.
            force (bool): Whether to force writing nodes even if their type
                is not present in the schema. Not used by this base
                implementation.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        if not self._write_node_data(nodes):
            logger.error("Error while writing node data.")
            return False
        return True

    def write_edges(
        self, edges, batch_size: int = int(1e6), force: bool = False
    ) -> bool:
        """Wrapper for writing edges.

        Delegates to :py:meth:`_write_edge_data` and logs an error when it
        reports failure.

        Args:
            edges (BioCypherEdge): a list or generator of edges in
                :py:class:`BioCypherEdge` format
            batch_size (int): The batch size for writing edges. Not used by
                this base implementation.
            force (bool): Whether to force writing edges even if their type
                is not present in the schema. Not used by this base
                implementation.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        if not self._write_edge_data(edges):
            logger.error("Error while writing edge data.")
            return False
        return True

    def write_import_call(self) -> str:
        """Write the import call to a script file in the output directory.

        The file name comes from :py:meth:`_get_import_script_name` and the
        content from :py:meth:`_construct_import_call`. Requires
        ``output_directory`` to be set.

        Returns:
            str: The path of the file holding the import call.
        """
        file_path = os.path.join(
            self.output_directory, self._get_import_script_name()
        )
        logger.info(
            f"Writing {self.__class__.__name__} import call to `{file_path}`."
        )

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(self._construct_import_call())

        return file_path
|
|
File without changes
|
|
@@ -1,9 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
|
-
import re
|
|
3
|
-
import subprocess
|
|
4
2
|
|
|
5
3
|
from biocypher._logger import logger
|
|
6
|
-
from biocypher.write._batch_writer import parse_label, _BatchWriter
|
|
4
|
+
from biocypher.output.write._batch_writer import parse_label, _BatchWriter
|
|
7
5
|
|
|
8
6
|
|
|
9
7
|
class _Neo4jBatchWriter(_BatchWriter):
|
|
@@ -49,7 +47,7 @@ class _Neo4jBatchWriter(_BatchWriter):
|
|
|
49
47
|
|
|
50
48
|
def _write_array_string(self, string_list):
|
|
51
49
|
"""
|
|
52
|
-
Abstract method to write the string representation of an array into a .csv file
|
|
50
|
+
Abstract method to output.write the string representation of an array into a .csv file
|
|
53
51
|
as required by the neo4j admin-import.
|
|
54
52
|
|
|
55
53
|
Args:
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import pickle
|
|
2
|
+
|
|
3
|
+
import networkx as nx
|
|
4
|
+
|
|
5
|
+
from biocypher._logger import logger
|
|
6
|
+
from biocypher.output.write._writer import _Writer
|
|
7
|
+
from biocypher.output.write.relational._csv import _PandasCSVWriter
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class _NetworkXWriter(_Writer):
    """Writer that collects nodes and edges in a networkx DiGraph.

    Entities are first handed to an internal ``_PandasCSVWriter``; its
    ``stored_dfs`` are then converted into nodes and edges of ``self.G``.
    The graph is pickled when the import call is constructed.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Force write_to_file=False for the helper writer. Merging it into
        # kwargs (instead of passing it next to **kwargs) avoids a
        # "multiple values for keyword argument" TypeError when the caller
        # supplies write_to_file themselves.
        csv_kwargs = {**kwargs, "write_to_file": False}
        self.csv_writer = _PandasCSVWriter(*args, **csv_kwargs)
        self.G = nx.DiGraph()

    def _construct_import_call(self) -> str:
        """Pickle the graph and return Python code that loads it again.

        Side effect: writes ``networkx_graph.pkl`` into the output directory.

        Returns:
            str: Python code that unpickles the graph into ``G_loaded``.
        """
        logger.info(
            f"Writing networkx {self.G} to pickle file networkx_graph.pkl."
        )
        with open(f"{self.output_directory}/networkx_graph.pkl", "wb") as f:
            pickle.dump(self.G, f)

        # NOTE(review): the generated script uses pickle.load, which must
        # only ever be run on pickle files from a trusted source.
        import_call = "import pickle\n"
        import_call += "with open('./networkx_graph.pkl', 'rb') as f:\n\tG_loaded = pickle.load(f)"
        return import_call

    def _get_import_script_name(self) -> str:
        """Return the name of the import script."""
        return "import_networkx.py"

    def _write_node_data(self, nodes) -> bool:
        """Store the nodes via the CSV writer and sync them into the graph.

        Returns:
            bool: The success flag reported by the CSV writer.
        """
        passed = self.csv_writer._write_entities_to_file(nodes)
        self.add_to_networkx()
        return passed

    def _write_edge_data(self, edges) -> bool:
        """Store the edges via the CSV writer and sync them into the graph.

        Returns:
            bool: The success flag reported by the CSV writer.
        """
        passed = self.csv_writer._write_entities_to_file(edges)
        self.add_to_networkx()
        return passed

    def add_to_networkx(self) -> bool:
        """Add all data frames held by the CSV writer to the graph.

        Frames with a ``node_id`` column become nodes; frames with both
        ``source_id`` and ``target_id`` columns become edges. The remaining
        columns are attached as attributes.

        Returns:
            bool: Always True.
        """
        all_dfs = self.csv_writer.stored_dfs
        # Exact column membership: set_index below needs these exact names,
        # so a substring match would let near-miss columns through and fail
        # with KeyError (and `.str` breaks on non-string column labels).
        node_dfs = [df for df in all_dfs.values() if "node_id" in df.columns]
        edge_dfs = [
            df
            for df in all_dfs.values()
            if "source_id" in df.columns and "target_id" in df.columns
        ]
        for df in node_dfs:
            nodes = df.set_index("node_id").to_dict(orient="index")
            self.G.add_nodes_from(nodes.items())
        for df in edge_dfs:
            edges = df.set_index(["source_id", "target_id"]).to_dict(
                orient="index"
            )
            self.G.add_edges_from(
                (source, target, attrs)
                for (source, target), attrs in edges.items()
            )
        return True
|