biocypher 0.5.39__py3-none-any.whl → 0.5.41__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

This version of biocypher might be problematic.

--- /dev/null
+++ biocypher/write/relational/_postgresql.py
@@ -0,0 +1,320 @@
+ import os
+ import glob
+
+ from biocypher._logger import logger
+ from biocypher.write._batch_writer import _BatchWriter
+
+
+ class _PostgreSQLBatchWriter(_BatchWriter):
+     """
+     Class for writing node and edge representations to disk using the
+     format specified by PostgreSQL for the use of "COPY FROM...". Each batch
+     writer instance has a fixed representation that needs to be passed
+     at instantiation via the :py:attr:`schema` argument. The instance
+     also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
+     to convert and extend the hierarchy.
+
+     This class inherits from the abstract class "_BatchWriter" and implements the
+     PostgreSQL-specific methods:
+
+         - _write_node_headers
+         - _write_edge_headers
+         - _construct_import_call
+         - _write_array_string
+     """
+
+     DATA_TYPE_LOOKUP = {
+         "str": "VARCHAR",  # VARCHAR needs limit
+         "int": "INTEGER",
+         "long": "BIGINT",
+         "float": "NUMERIC",
+         "double": "NUMERIC",
+         "dbl": "NUMERIC",
+         "boolean": "BOOLEAN",
+         "str[]": "VARCHAR[]",
+         "string[]": "VARCHAR[]",
+     }
+
+     def __init__(self, *args, **kwargs):
+         self._copy_from_csv_commands = set()
+         super().__init__(*args, **kwargs)
+
+     def _get_default_import_call_bin_prefix(self):
+         """
+         Method to provide the default string for the import call bin prefix.
+
+         Returns:
+             str: The default location for the psql command
+         """
+         return ""
+
+     def _get_data_type(self, string) -> str:
+         try:
+             return self.DATA_TYPE_LOOKUP[string]
+         except KeyError:
+             logger.info(
+                 f'Could not determine data type {string}. Using default "VARCHAR"'
+             )
+             return "VARCHAR"
+
+     def _write_array_string(self, string_list) -> str:
+         """
+         Method to write the string representation of an array into a .csv file
+         as required by the PostgreSQL COPY command, with '{','}' brackets and ',' separation.
+
+         Args:
+             string_list (list): list of ontology strings
+
+         Returns:
+             str: The string representation of an array for the postgres COPY command
+         """
+         string = ",".join(string_list)
+         string = f'"{{{string}}}"'
+         return string
+
+     def _get_import_script_name(self) -> str:
+         """
+         Returns the name of the psql import script.
+
+         Returns:
+             str: The name of the import script (ending in .sh)
+         """
+         return f"{self.db_name}-import-call.sh"
+
+     def _adjust_pascal_to_psql(self, string):
+         string = string.replace(".", "_")
+         string = string.lower()
+         return string
+
+     def _write_node_headers(self):
+         """
+         Writes a single table creation (SQL) file for each graph entity that
+         is represented as a node, as per the definition in the
+         `schema_config.yaml`, containing only the header for this type of node.
+
+         Returns:
+             bool: The return value. True for success, False otherwise.
+         """
+         # load headers from data parse
+         if not self.node_property_dict:
+             logger.error(
+                 "Header information not found. Was the data parsed first?",
+             )
+             return False
+
+         for label, props in self.node_property_dict.items():
+             # create header CSV with ID, properties, labels
+
+             # translate label to PascalCase
+             pascal_label = self.translator.name_sentence_to_pascal(label)
+
+             parts = f"{pascal_label}-part*.csv"
+             parts_paths = os.path.join(self.outdir, parts)
+             parts_paths = glob.glob(parts_paths)
+             parts_paths.sort()
+
+             # adjust label for import to psql
+             pascal_label = self._adjust_pascal_to_psql(pascal_label)
+             table_create_command_path = os.path.join(
+                 self.outdir,
+                 f"{pascal_label}-create_table.sql",
+             )
+
+             # check if file already exists
+             if os.path.exists(table_create_command_path):
+                 logger.warning(
+                     f"File {table_create_command_path} already exists. Overwriting.",
+                 )
+
+             # concatenate key:value in props
+             columns = ["_ID VARCHAR"]
+             for col_name, col_type in props.items():
+                 col_type = self._get_data_type(col_type)
+                 col_name = self._adjust_pascal_to_psql(col_name)
+                 columns.append(f"{col_name} {col_type}")
+             columns.append("_LABEL VARCHAR[]")
+
+             with open(table_create_command_path, "w", encoding="utf-8") as f:
+                 command = ""
+                 if self.wipe:
+                     command += f"DROP TABLE IF EXISTS {pascal_label};\n"
+
+                 # table creation requires comma separation
+                 command += (
+                     f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
+                 )
+                 f.write(command)
+
+             for parts_path in parts_paths:
+                 # if import_call_file_prefix is set, replace actual path
+                 # with prefix
+                 if self.import_call_file_prefix != self.outdir:
+                     parts_path = parts_path.replace(
+                         self.outdir,
+                         self.import_call_file_prefix,
+                     )
+
+                 self._copy_from_csv_commands.add(
+                     f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
+                 )
+
+             # add file path to import statement
+             # if import_call_file_prefix is set, replace actual path
+             # with prefix
+             if self.import_call_file_prefix != self.outdir:
+                 table_create_command_path = table_create_command_path.replace(
+                     self.outdir,
+                     self.import_call_file_prefix,
+                 )
+
+             self.import_call_nodes.add(table_create_command_path)
+
+         return True
+
+     def _write_edge_headers(self):
+         """
+         Writes a single table creation (SQL) file for each graph entity that
+         is represented as an edge, as per the definition in the
+         `schema_config.yaml`, containing only the header for this type of edge.
+
+         Returns:
+             bool: The return value. True for success, False otherwise.
+         """
+         # load headers from data parse
+         if not self.edge_property_dict:
+             logger.error(
+                 "Header information not found. Was the data parsed first?",
+             )
+             return False
+
+         for label, props in self.edge_property_dict.items():
+             # translate label to PascalCase
+             pascal_label = self.translator.name_sentence_to_pascal(label)
+
+             parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv")
+             parts_paths = glob.glob(parts_paths)
+             parts_paths.sort()
+
+             # adjust label for import to psql
+             pascal_label = self._adjust_pascal_to_psql(pascal_label)
+             table_create_command_path = os.path.join(
+                 self.outdir,
+                 f"{pascal_label}-create_table.sql",
+             )
+
+             # check whether the file already exists
+             if os.path.exists(table_create_command_path):
+                 logger.warning(
+                     f"File {table_create_command_path} already exists. Overwriting.",
+                 )
+
+             # concatenate key:value in props
+             columns = []
+             for col_name, col_type in props.items():
+                 col_type = self._get_data_type(col_type)
+                 col_name = self._adjust_pascal_to_psql(col_name)
+                 if col_name == "_id":
+                     # should ideally never happen
+                     raise ValueError(
+                         "Column name '_ID' is reserved for internal use, "
+                         "denoting the relationship ID. Please choose a "
+                         "different name for your column."
+                     )
+
+                 columns.append(f"{col_name} {col_type}")
+
+             # create list of lists and flatten
+             # removes need for empty check of property list
+             out_list = [
+                 "_START_ID VARCHAR",
+                 "_ID VARCHAR",
+                 *columns,
+                 "_END_ID VARCHAR",
+                 "_TYPE VARCHAR",
+             ]
+
+             with open(table_create_command_path, "w", encoding="utf-8") as f:
+                 command = ""
+                 if self.wipe:
+                     command += f"DROP TABLE IF EXISTS {pascal_label};\n"
+
+                 # table creation requires comma separation
+                 command += (
+                     f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
+                 )
+                 f.write(command)
+
+             for parts_path in parts_paths:
+                 # if import_call_file_prefix is set, replace actual path
+                 # with prefix
+                 if self.import_call_file_prefix != self.outdir:
+                     parts_path = parts_path.replace(
+                         self.outdir,
+                         self.import_call_file_prefix,
+                     )
+
+                 self._copy_from_csv_commands.add(
+                     f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
+                 )
+
+             # add file path to import statement
+             # if import_call_file_prefix is set, replace actual path
+             # with prefix
+             if self.import_call_file_prefix != self.outdir:
+                 table_create_command_path = table_create_command_path.replace(
+                     self.outdir,
+                     self.import_call_file_prefix,
+                 )
+
+             self.import_call_edges.add(table_create_command_path)
+
+         return True
+
+     def _construct_import_call(self) -> str:
+         """
+         Function to construct the import call detailing folder and
+         individual node and edge headers and data files, as well as
+         delimiters and database name. Built after all data has been
+         processed to ensure that nodes are called before any edges.
+
+         Returns:
+             str: a bash command for PostgreSQL import
+         """
+         import_call = ""
+
+         # create tables
+         # At this point, csv files of nodes and edges do not require differentiation
+         for import_file_path in [
+             *self.import_call_nodes,
+             *self.import_call_edges,
+         ]:
+             import_call += f'echo "Setup {import_file_path}..."\n'
+             if self.db_password:
+                 # set password variable inline
+                 import_call += f"PGPASSWORD={self.db_password} "
+             import_call += (
+                 f"{self.import_call_bin_prefix}psql -f {import_file_path}"
+             )
+             import_call += f" --dbname {self.db_name}"
+             import_call += f" --host {self.db_host}"
+             import_call += f" --port {self.db_port}"
+             import_call += f" --user {self.db_user}"
+             import_call += '\necho "Done!"\n'
+             import_call += "\n"
+
+         # copy data to tables
+         for command in self._copy_from_csv_commands:
+             table_part = command.split(" ")[3]
+             import_call += f'echo "Importing {table_part}..."\n'
+             if self.db_password:
+                 # set password variable inline
+                 import_call += f"PGPASSWORD={self.db_password} "
+             import_call += f'{self.import_call_bin_prefix}psql -c "{command}"'
+             import_call += f" --dbname {self.db_name}"
+             import_call += f" --host {self.db_host}"
+             import_call += f" --port {self.db_port}"
+             import_call += f" --user {self.db_user}"
+             import_call += '\necho "Done!"\n'
+             import_call += "\n"
+
+         return import_call
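
To illustrate what this writer emits, here is a sketch of the generated artifacts for a hypothetical node label "protein" with properties `name: str` and `score: float`, a tab delimiter, and illustrative connection settings (db_name `test`, host `localhost`, port `5432`, user `postgres`, password `secret`); none of these values, nor the `/outdir` path or `Protein-part000.csv` file name, come from the package itself. The table creation script `protein-create_table.sql` would contain roughly:

    -- emitted only when wipe is set
    DROP TABLE IF EXISTS protein;
    CREATE TABLE protein(_ID VARCHAR,name VARCHAR,score NUMERIC,_LABEL VARCHAR[]);

and the generated `test-import-call.sh` would chain, per file, commands of the form:

    echo "Setup /outdir/protein-create_table.sql..."
    PGPASSWORD=secret psql -f /outdir/protein-create_table.sql --dbname test --host localhost --port 5432 --user postgres
    echo "Done!"

    echo "Importing '/outdir/Protein-part000.csv'..."
    PGPASSWORD=secret psql -c "\copy protein FROM '/outdir/Protein-part000.csv' DELIMITER E'\t' CSV;" --dbname test --host localhost --port 5432 --user postgres
    echo "Done!"

Note that paths are rewritten when `import_call_file_prefix` differs from the output directory, and the `PGPASSWORD` prefix is only prepended when a password is configured.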
--- /dev/null
+++ biocypher/write/relational/_sqlite.py
@@ -0,0 +1,51 @@
+ from biocypher.write.relational._postgresql import _PostgreSQLBatchWriter
+
+
+ class _SQLiteBatchWriter(_PostgreSQLBatchWriter):
+     """
+     Class for writing node and edge representations to a SQLite database.
+     It uses the _PostgreSQLBatchWriter class under the hood, which already
+     implements the logic to write the nodes/edges to a relational DBMS.
+     Only the import bash script differs between PostgreSQL and SQLite
+     and is therefore implemented in this class:
+
+         - _construct_import_call
+     """
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def _construct_import_call(self) -> str:
+         """
+         Function to construct the import call detailing folder and
+         individual node and edge headers and data files, as well as
+         delimiters and database name. Built after all data has been
+         processed to ensure that nodes are called before any edges.
+
+         Returns:
+             str: a bash command for SQLite import
+         """
+         import_call = ""
+
+         # create tables
+         # At this point, csv files of nodes and edges do not require differentiation
+         for import_file_path in [
+             *self.import_call_nodes,
+             *self.import_call_edges,
+         ]:
+             import_call += f'echo "Setup {import_file_path}..."\n'
+             import_call += f"{self.import_call_bin_prefix}sqlite3 {self.db_name} < {import_file_path}"
+             import_call += '\necho "Done!"\n'
+             import_call += "\n"
+
+         for command in self._copy_from_csv_commands:
+             table_name = command.split(" ")[1]
+             table_part = command.split(" ")[3].replace("'", "")
+             import_call += f'echo "Importing {table_part}..."\n'
+             separator = self.delim
+             import_part = f".import {table_part} {table_name}"
+             import_call += f"{self.import_call_bin_prefix}sqlite3 -separator $'{separator}' {self.db_name} \"{import_part}\""
+             import_call += '\necho "Done!"\n'
+             import_call += "\n"
+
+         return import_call
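
For comparison, a sketch of the SQLite counterpart under the same illustrative assumptions as above: the script pipes each table creation file into the database, then uses the sqlite3 `.import` dot command, with the table name and file path parsed back out of the `\copy` commands accumulated by the parent class:

    echo "Setup /outdir/protein-create_table.sql..."
    sqlite3 test < /outdir/protein-create_table.sql
    echo "Done!"

    echo "Importing /outdir/Protein-part000.csv..."
    sqlite3 -separator $'\t' test ".import /outdir/Protein-part000.csv protein"
    echo "Done!"

Reusing the parent's `\copy` command set this way avoids duplicating the per-file bookkeeping; only the final command strings differ between the two backends.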
--- biocypher-0.5.39.dist-info/METADATA
+++ biocypher-0.5.41.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: biocypher
- Version: 0.5.39
+ Version: 0.5.41
  Summary: A unifying framework for biomedical research knowledge graphs
  Home-page: https://github.com/biocypher/biocypher
  License: MIT
--- /dev/null
+++ biocypher-0.5.41.dist-info/RECORD
@@ -0,0 +1,32 @@
+ biocypher/__init__.py,sha256=ejNY53vH_pE3ZbIN8G_ZBYxxPG9aERovRLD0XhDvt4k,942
+ biocypher/_config/__init__.py,sha256=fFHRFYxE2MtDAQWL6upe--MJ1vw3Z8CwIPhF2gW8cRU,3698
+ biocypher/_config/biocypher_config.yaml,sha256=TEvIOgRy9WMvsb2CrV1ywuKLZWbedYubCC2bpdBIalU,2713
+ biocypher/_config/test_config.yaml,sha256=Np8jeS5_EP6HHOvMKb7B_Tkyqd5YaYlYz_DVsXypt-A,119
+ biocypher/_config/test_schema_config.yaml,sha256=D1600WgEj3iTXrumVU9LIivJHJO36iaxfkOgyam9zVU,3129
+ biocypher/_config/test_schema_config_disconnected.yaml,sha256=Qm8FLxEn2spHcyj_5F859KjcDvKSxNhxDvi4b4LLkvQ,68
+ biocypher/_config/test_schema_config_extended.yaml,sha256=wn3A76142hhjnImhMF6RODbCFESTJ2TtPvcFdIFsAT0,3309
+ biocypher/_connect.py,sha256=7hk3J03hzZOPE48ISaoB6IgRun8GaUmDtIRnnD7vKiU,13453
+ biocypher/_core.py,sha256=5rZKYie_vSjTYduH8oH-GxLMZuNqLAe3ZYAQ5nUp8Nc,22578
+ biocypher/_create.py,sha256=vpUchUdEpWupZi1LgFLxAWMtqoBwnWbP7PwEDUCBS4A,10202
+ biocypher/_deduplicate.py,sha256=BBvfpXzu6L5YDY5FdtXxnf8YlsbJpbCE8RdUoKsm0n0,4949
+ biocypher/_get.py,sha256=3Kpky3blfNf1JwxKWLsZxTU2aTP_C4sUe8OpiyYj63I,10810
+ biocypher/_logger.py,sha256=NGXe3hZA79WSujfOgpcxHBf8N2QAfrmvM1LFDpsGK2U,3185
+ biocypher/_mapping.py,sha256=ERSNH2Bg19145KytxbFE4BInPaiP-LWW7osOBot29Eo,9304
+ biocypher/_metadata.py,sha256=GGh6YvKYrRWqdyZQYTaLnkYPaHgVHz00V6kpXQdjr2k,1658
+ biocypher/_misc.py,sha256=lUUbF13FdBlYq01C-Vit52IbeRehW0oSUWsQ9tFC-xo,5938
+ biocypher/_ontology.py,sha256=3Wu1ZZYmtLpWfopi-aY9BA8qZ-ltPMXN4Ok_diK1YdA,28410
+ biocypher/_pandas.py,sha256=GVCFM68J7yBjh40MpkNVgD8qT1RFMrrIjMOtD3iKsf4,3040
+ biocypher/_translate.py,sha256=JafvhtVaFSpruRfYh9BzjVbvDF1Mhg7LLKMDZHWkRjg,16496
+ biocypher/write/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ biocypher/write/_batch_writer.py,sha256=Ta2DNjSnJcVtFDMOGTtH5nnbKwyqSGf7xXGpYzi1bDM,36826
+ biocypher/write/_write.py,sha256=HLFQyGqLdkmIoBOjL9m81OUuSsHjvSfK9LY4jtrinv0,3104
+ biocypher/write/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ biocypher/write/graph/_arangodb.py,sha256=du5pivCR7xKs8VyxeegxYsSBIcsXGrfSbM_AffFapwg,8071
+ biocypher/write/graph/_neo4j.py,sha256=qSj1PryD4UmveS7ACs1R3eo2pegi53pVI7d7P0ihOKI,11930
+ biocypher/write/relational/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ biocypher/write/relational/_postgresql.py,sha256=NdI-ULP8valsqlkObOg50od-3-amVj5RzGnZ_7NW2ww,11945
+ biocypher/write/relational/_sqlite.py,sha256=KLQpxQXF1B8qqTtKUFfjWdwHjd1Fhn9syK931Z0dsq0,2066
+ biocypher-0.5.41.dist-info/LICENSE,sha256=SjUaQkq671iQUZOxEUpC4jvJxXOlfSiHTTueyz9kXJM,1065
+ biocypher-0.5.41.dist-info/METADATA,sha256=D23b4n9k_oEmXIMC3eKYJLP5MJVVYHkAeAWB8DAFpYk,10642
+ biocypher-0.5.41.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
+ biocypher-0.5.41.dist-info/RECORD,,
@@ -1,24 +0,0 @@
1
- biocypher/__init__.py,sha256=ejNY53vH_pE3ZbIN8G_ZBYxxPG9aERovRLD0XhDvt4k,942
2
- biocypher/_config/__init__.py,sha256=fFHRFYxE2MtDAQWL6upe--MJ1vw3Z8CwIPhF2gW8cRU,3698
3
- biocypher/_config/biocypher_config.yaml,sha256=H0TKBJun7pQmIfIAgEiMkDDgE3kKoCElBuMt8lkkQcU,2404
4
- biocypher/_config/test_config.yaml,sha256=Np8jeS5_EP6HHOvMKb7B_Tkyqd5YaYlYz_DVsXypt-A,119
5
- biocypher/_config/test_schema_config.yaml,sha256=D1600WgEj3iTXrumVU9LIivJHJO36iaxfkOgyam9zVU,3129
6
- biocypher/_config/test_schema_config_disconnected.yaml,sha256=Qm8FLxEn2spHcyj_5F859KjcDvKSxNhxDvi4b4LLkvQ,68
7
- biocypher/_config/test_schema_config_extended.yaml,sha256=wn3A76142hhjnImhMF6RODbCFESTJ2TtPvcFdIFsAT0,3309
8
- biocypher/_connect.py,sha256=0oSyO6CEIlKL8rHo-HHE7y0FzGfSi4vnEXSDy1TnIUE,12456
9
- biocypher/_core.py,sha256=W3qeuCwG0q5H_RRoYDGfKe1VWTQx_5J_WOfZQqsWQXI,22388
10
- biocypher/_create.py,sha256=vpUchUdEpWupZi1LgFLxAWMtqoBwnWbP7PwEDUCBS4A,10202
11
- biocypher/_deduplicate.py,sha256=BBvfpXzu6L5YDY5FdtXxnf8YlsbJpbCE8RdUoKsm0n0,4949
12
- biocypher/_get.py,sha256=3Kpky3blfNf1JwxKWLsZxTU2aTP_C4sUe8OpiyYj63I,10810
13
- biocypher/_logger.py,sha256=NGXe3hZA79WSujfOgpcxHBf8N2QAfrmvM1LFDpsGK2U,3185
14
- biocypher/_mapping.py,sha256=ERSNH2Bg19145KytxbFE4BInPaiP-LWW7osOBot29Eo,9304
15
- biocypher/_metadata.py,sha256=McndBOmaAbQBpOK_B4FnLKihoAfeiDvmjqJGpzDsX-k,1658
16
- biocypher/_misc.py,sha256=g5B-PO_XJlYEJC7kEVRdCXeB2NW0ZSVr_5KqTEk2ldk,5877
17
- biocypher/_ontology.py,sha256=3Wu1ZZYmtLpWfopi-aY9BA8qZ-ltPMXN4Ok_diK1YdA,28410
18
- biocypher/_pandas.py,sha256=GVCFM68J7yBjh40MpkNVgD8qT1RFMrrIjMOtD3iKsf4,3040
19
- biocypher/_translate.py,sha256=JafvhtVaFSpruRfYh9BzjVbvDF1Mhg7LLKMDZHWkRjg,16496
20
- biocypher/_write.py,sha256=5pW0gYj2QW--FNB4DK53gK6D7dNHhPxYnPBhB8NjaSo,69550
21
- biocypher-0.5.39.dist-info/LICENSE,sha256=SjUaQkq671iQUZOxEUpC4jvJxXOlfSiHTTueyz9kXJM,1065
22
- biocypher-0.5.39.dist-info/METADATA,sha256=-bmzjCWZTR9TYom3XUQoXGOP_X8-gG8H-u_kU7f28cw,10642
23
- biocypher-0.5.39.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
24
- biocypher-0.5.39.dist-info/RECORD,,