biocypher 0.5.41__py3-none-any.whl → 0.5.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biocypher might be problematic. Click here for more details.

@@ -0,0 +1,200 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Union, Optional
3
+ from collections.abc import Iterable
4
+ import os
5
+
6
+ from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
7
+ from biocypher._logger import logger
8
+ from biocypher._translate import Translator
9
+ from biocypher._deduplicate import Deduplicator
10
+
11
+ __all__ = ["_Writer"]
12
+
13
+
14
class _Writer(ABC):
    """Abstract base class for writing node and edge representations to disk.

    Specifics of the different writers (e.g. neo4j, postgresql, csv, etc.)
    are implemented in the child classes. Any concrete writer needs to
    implement at least:
        - _write_node_data
        - _write_edge_data
        - _construct_import_call
        - _get_import_script_name

    Args:
        translator (Translator): Instance of :py:class:`Translator` to enable
            translation of nodes and manipulation of properties.
        deduplicator (Deduplicator): Instance of :py:class:`Deduplicator` to
            enable deduplication of nodes and edges.
        output_directory (str, optional): Directory the output files are
            written to; created if it does not exist. The signature defaults
            to None, but a path is required (see ``Raises``).
        strict_mode (bool, optional): Whether to enforce source, version, and
            license properties. Defaults to False.

    Raises:
        TypeError: If ``output_directory`` is None.
        NotImplementedError: Writer implementation must override
            '_write_node_data'
        NotImplementedError: Writer implementation must override
            '_write_edge_data'
        NotImplementedError: Writer implementation must override
            '_construct_import_call'
        NotImplementedError: Writer implementation must override
            '_get_import_script_name'
    """

    def __init__(
        self,
        translator: Translator,
        deduplicator: Deduplicator,
        output_directory: Optional[str] = None,
        strict_mode: bool = False,
        *args,
        **kwargs,
    ):
        """Store the shared writer state and make sure the output directory exists.

        Args:
            translator (Translator): Instance of :py:class:`Translator` to
                enable translation of nodes and manipulation of properties.
            deduplicator (Deduplicator): Instance of :py:class:`Deduplicator`
                to enable deduplication of nodes and edges.
            output_directory (str, optional): Directory the output files are
                written to; created if it does not exist. Must not be None.
            strict_mode (bool, optional): Whether to enforce source, version,
                and license properties. Defaults to False.
            **kwargs: Only ``write_to_file`` (default True) is read here; when
                it is falsy, the "directory already exists" warning is
                suppressed.

        Raises:
            TypeError: If ``output_directory`` is None.
        """
        self.translator = translator
        self.deduplicator = deduplicator
        self.strict_mode = strict_mode
        self.output_directory = output_directory

        # Fail early with a readable message: passing None on to
        # `os.path.exists` / `os.makedirs` would otherwise surface as an
        # opaque TypeError from the os module.
        if self.output_directory is None:
            raise TypeError(
                "`output_directory` must be a path to a directory, not None."
            )

        if os.path.exists(self.output_directory):
            # Writers that only keep data in memory pass write_to_file=False
            # and do not need to be warned about pre-existing files.
            if kwargs.get("write_to_file", True):
                logger.warning(
                    f"Output directory `{self.output_directory}` already exists. "
                    "If this is not planned, file consistency may be compromised."
                )
        else:
            logger.info(f"Creating output directory `{self.output_directory}`.")
            os.makedirs(self.output_directory)

    @abstractmethod
    def _write_node_data(
        self,
        nodes: Iterable[
            Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
        ],
    ) -> bool:
        """Implement how to write nodes to disk.

        Args:
            nodes (Iterable): An iterable of BioCypherNode / BioCypherEdge /
                BioCypherRelAsNode objects.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        raise NotImplementedError(
            "Writer implementation must override '_write_node_data'"
        )

    @abstractmethod
    def _write_edge_data(
        self,
        edges: Iterable[
            Union[BioCypherNode, BioCypherEdge, BioCypherRelAsNode]
        ],
    ) -> bool:
        """Implement how to write edges to disk.

        Args:
            edges (Iterable): An iterable of BioCypherNode / BioCypherEdge /
                BioCypherRelAsNode objects.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        raise NotImplementedError(
            "Writer implementation must override '_write_edge_data'"
        )

    @abstractmethod
    def _construct_import_call(self) -> str:
        """Construct the import call for the written output.

        The call details folder and individual node and edge headers and
        data files, as well as delimiters and database name. Built after all
        data has been processed to ensure that nodes are called before any
        edges.

        Returns:
            str: command for importing the output files into a DBMS.
        """
        raise NotImplementedError(
            "Writer implementation must override '_construct_import_call'"
        )

    @abstractmethod
    def _get_import_script_name(self) -> str:
        """Return the name of the import script.

        Returns:
            str: The file name of the import script; it is joined onto the
                output directory by :py:meth:`write_import_call`.
        """
        raise NotImplementedError(
            "Writer implementation must override '_get_import_script_name'"
        )

    def write_nodes(
        self, nodes, batch_size: int = int(1e6), force: bool = False
    ) -> bool:
        """Wrapper for writing nodes.

        Delegates to :py:meth:`_write_node_data` and logs an error when that
        call reports failure.

        Args:
            nodes (BioCypherNode): a list or generator of nodes in
                :py:class:`BioCypherNode` format
            batch_size (int): The batch size for writing nodes. Accepted for
                interface compatibility; not forwarded by this base
                implementation.
            force (bool): Whether to force writing nodes even if their type
                is not present in the schema. Accepted for interface
                compatibility; not forwarded by this base implementation.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        passed = self._write_node_data(nodes)
        if not passed:
            logger.error("Error while writing node data.")
            return False
        return True

    def write_edges(
        self, edges, batch_size: int = int(1e6), force: bool = False
    ) -> bool:
        """Wrapper for writing edges.

        Delegates to :py:meth:`_write_edge_data` and logs an error when that
        call reports failure.

        Args:
            edges (BioCypherEdge): a list or generator of edges in
                :py:class:`BioCypherEdge` format
            batch_size (int): The batch size for writing edges. Accepted for
                interface compatibility; not forwarded by this base
                implementation.
            force (bool): Whether to force writing edges even if their type
                is not present in the schema. Accepted for interface
                compatibility; not forwarded by this base implementation.

        Returns:
            bool: The return value. True for success, False otherwise.
        """
        passed = self._write_edge_data(edges)
        if not passed:
            logger.error("Error while writing edge data.")
            return False
        return True

    def write_import_call(self) -> str:
        """Write the import call to a script file in the output directory.

        The content comes from :py:meth:`_construct_import_call` and the file
        name from :py:meth:`_get_import_script_name`.

        Returns:
            str: The path of the file holding the import call.
        """
        file_path = os.path.join(
            self.output_directory, self._get_import_script_name()
        )
        logger.info(
            f"Writing {self.__class__.__name__} import call to `{file_path}`."
        )

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(self._construct_import_call())

        return file_path
File without changes
@@ -1,7 +1,7 @@
1
1
  import os
2
2
 
3
3
  from biocypher._logger import logger
4
- from biocypher.write.graph._neo4j import _Neo4jBatchWriter
4
+ from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
5
5
 
6
6
 
7
7
  class _ArangoDBBatchWriter(_Neo4jBatchWriter):
@@ -1,9 +1,7 @@
1
1
  import os
2
- import re
3
- import subprocess
4
2
 
5
3
  from biocypher._logger import logger
6
- from biocypher.write._batch_writer import parse_label, _BatchWriter
4
+ from biocypher.output.write._batch_writer import parse_label, _BatchWriter
7
5
 
8
6
 
9
7
  class _Neo4jBatchWriter(_BatchWriter):
@@ -49,7 +47,7 @@ class _Neo4jBatchWriter(_BatchWriter):
49
47
 
50
48
  def _write_array_string(self, string_list):
51
49
  """
52
- Abstract method to write the string representation of an array into a .csv file
50
+ Abstract method to output.write the string representation of an array into a .csv file
53
51
  as required by the neo4j admin-import.
54
52
 
55
53
  Args:
@@ -0,0 +1,76 @@
1
+ import pickle
2
+
3
+ import networkx as nx
4
+
5
+ from biocypher._logger import logger
6
+ from biocypher.output.write._writer import _Writer
7
+ from biocypher.output.write.relational._csv import _PandasCSVWriter
8
+
9
+
10
class _NetworkXWriter(_Writer):
    """Writer that collects nodes and edges in a networkx ``DiGraph``.

    Entities are handed to an internal :py:class:`_PandasCSVWriter` created
    with ``write_to_file=False``; the DataFrames it exposes via
    ``stored_dfs`` are then added to the graph ``self.G``.
    """

    def __init__(self, *args, **kwargs):
        """Set up the base writer, the helper CSV writer and an empty graph.

        All positional and keyword arguments are forwarded to both
        :py:class:`_Writer` and :py:class:`_PandasCSVWriter`.
        """
        super().__init__(*args, **kwargs)
        # Merge instead of passing `write_to_file=False` next to `**kwargs`,
        # which would raise a duplicate-keyword TypeError if the caller
        # supplied `write_to_file` themselves.
        csv_kwargs = {**kwargs, "write_to_file": False}
        self.csv_writer = _PandasCSVWriter(*args, **csv_kwargs)
        self.G = nx.DiGraph()

    def _construct_import_call(self) -> str:
        """Pickle the graph and build the Python code that loads it again.

        Side effect: writes ``self.G`` to ``networkx_graph.pkl`` inside the
        output directory.

        Returns:
            str: Python code that unpickles ``./networkx_graph.pkl`` into
                ``G_loaded``.
        """
        logger.info(
            f"Writing networkx {self.G} to pickle file networkx_graph.pkl."
        )
        with open(f"{self.output_directory}/networkx_graph.pkl", "wb") as f:
            pickle.dump(self.G, f)

        import_call = "import pickle\n"
        import_call += "with open('./networkx_graph.pkl', 'rb') as f:\n\tG_loaded = pickle.load(f)"
        return import_call

    def _get_import_script_name(self) -> str:
        """Return the name of the import script."""
        return "import_networkx.py"

    def _write_node_data(self, nodes) -> bool:
        """Pass nodes to the CSV writer, then refresh the graph.

        Args:
            nodes: Iterable of node entities, forwarded unchanged to
                ``_PandasCSVWriter._write_entities_to_file``.

        Returns:
            bool: The result reported by the CSV writer.
        """
        passed = self.csv_writer._write_entities_to_file(nodes)
        self.add_to_networkx()
        return passed

    def _write_edge_data(self, edges) -> bool:
        """Pass edges to the CSV writer, then refresh the graph.

        Args:
            edges: Iterable of edge entities, forwarded unchanged to
                ``_PandasCSVWriter._write_entities_to_file``.

        Returns:
            bool: The result reported by the CSV writer.
        """
        passed = self.csv_writer._write_entities_to_file(edges)
        self.add_to_networkx()
        return passed

    def add_to_networkx(self) -> bool:
        """Add every DataFrame stored by the CSV writer to ``self.G``.

        A frame with a ``node_id`` column is treated as a node table; a frame
        with both ``source_id`` and ``target_id`` columns is treated as an
        edge table. The remaining columns become node / edge attributes.
        All node tables are processed before any edge table.

        Returns:
            bool: Always True.
        """
        all_dfs = self.csv_writer.stored_dfs
        # Exact label membership rather than a substring match, so a property
        # column such as `parent_node_id` cannot misclassify a frame and
        # trigger a KeyError in `set_index` below.
        node_dfs = [df for df in all_dfs.values() if "node_id" in df.columns]
        edge_dfs = [
            df
            for df in all_dfs.values()
            if "source_id" in df.columns and "target_id" in df.columns
        ]
        for df in node_dfs:
            # {node_id: {attribute: value, ...}}
            nodes = df.set_index("node_id").to_dict(orient="index")
            self.G.add_nodes_from(nodes.items())
        for df in edge_dfs:
            # {(source_id, target_id): {attribute: value, ...}}
            edges = df.set_index(["source_id", "target_id"]).to_dict(
                orient="index"
            )
            self.G.add_edges_from(
                (
                    (source, target, attrs)
                    for (source, target), attrs in edges.items()
                )
            )
        return True