biocypher-0.5.39-py3-none-any.whl → biocypher-0.5.41-py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of biocypher might be problematic.

@@ -1,41 +1,17 @@
1
- #!/usr/bin/env python
2
-
3
- #
4
- # Copyright 2021, Heidelberg University Clinic
5
- #
6
- # File author(s): Sebastian Lobentanzer
7
- # Michael Hartung
8
- #
9
- # Distributed under MIT licence, see the file `LICENSE`.
10
- #
11
- """
12
- BioCypher 'offline' module. Handles the writing of node and edge representations
13
- suitable for import into a DBMS.
14
- """
15
-
16
- import re
17
- import glob
18
-
19
- from ._logger import logger
20
-
21
- logger.debug(f"Loading module {__name__}.")
22
-
23
1
  from abc import ABC, abstractmethod
24
2
  from types import GeneratorType
25
- from typing import TYPE_CHECKING, Union, Optional
3
+ from typing import Union, Optional
26
4
  from collections import OrderedDict, defaultdict
27
5
  import os
6
+ import re
7
+ import glob
28
8
 
29
9
  from more_itertools import peekable
30
10
 
31
- from ._config import config as _config
32
- from ._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
33
-
34
- __all__ = ["get_writer"]
35
-
36
- if TYPE_CHECKING:
37
- from ._translate import Translator
38
- from ._deduplicate import Deduplicator
11
+ from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
12
+ from biocypher._logger import logger
13
+ from biocypher._translate import Translator
14
+ from biocypher._deduplicate import Deduplicator
39
15
 
40
16
 
41
17
  class _BatchWriter(ABC):
@@ -1026,295 +1002,6 @@ class _BatchWriter(ABC):
1026
1002
  return file_path
1027
1003
 
1028
1004
 
1029
- class _Neo4jBatchWriter(_BatchWriter):
1030
- """
1031
- Class for writing node and edge representations to disk using the
1032
- format specified by Neo4j for the use of admin import. Each batch
1033
- writer instance has a fixed representation that needs to be passed
1034
- at instantiation via the :py:attr:`schema` argument. The instance
1035
- also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
1036
- to convert and extend the hierarchy.
1037
-
1038
- This class inherits from the abstract class "_BatchWriter" and implements the
1039
- Neo4j-specific methods:
1040
-
1041
- - _write_node_headers
1042
- - _write_edge_headers
1043
- - _construct_import_call
1044
- - _write_array_string
1045
- """
1046
-
1047
- def _get_default_import_call_bin_prefix(self):
1048
- """
1049
- Method to provide the default string for the import call bin prefix.
1050
-
1051
- Returns:
1052
- str: The default location for the neo4j admin import location
1053
- """
1054
- return "bin/"
1055
-
1056
- def _write_array_string(self, string_list):
1057
- """
1058
- Abstract method to write the string representation of an array into a .csv file
1059
- as required by the neo4j admin-import.
1060
-
1061
- Args:
1062
- string_list (list): list of ontology strings
1063
-
1064
- Returns:
1065
- str: The string representation of an array for the neo4j admin import
1066
- """
1067
- string = self.adelim.join(string_list)
1068
- return f"{self.quote}{string}{self.quote}"
1069
-
1070
- def _write_node_headers(self):
1071
- """
1072
- Writes single CSV file for a graph entity that is represented
1073
- as a node as per the definition in the `schema_config.yaml`,
1074
- containing only the header for this type of node.
1075
-
1076
- Returns:
1077
- bool: The return value. True for success, False otherwise.
1078
- """
1079
- # load headers from data parse
1080
- if not self.node_property_dict:
1081
- logger.error(
1082
- "Header information not found. Was the data parsed first?",
1083
- )
1084
- return False
1085
-
1086
- for label, props in self.node_property_dict.items():
1087
- _id = ":ID"
1088
-
1089
- # translate label to PascalCase
1090
- pascal_label = self.translator.name_sentence_to_pascal(
1091
- parse_label(label)
1092
- )
1093
-
1094
- header = f"{pascal_label}-header.csv"
1095
- header_path = os.path.join(
1096
- self.outdir,
1097
- header,
1098
- )
1099
- parts = f"{pascal_label}-part.*"
1100
-
1101
- # check if file already exists
1102
- if os.path.exists(header_path):
1103
- logger.warning(
1104
- f"Header file `{header_path}` already exists. Overwriting.",
1105
- )
1106
-
1107
- # concatenate key:value in props
1108
- props_list = []
1109
- for k, v in props.items():
1110
- if v in ["int", "long", "integer"]:
1111
- props_list.append(f"{k}:long")
1112
- elif v in ["int[]", "long[]", "integer[]"]:
1113
- props_list.append(f"{k}:long[]")
1114
- elif v in ["float", "double", "dbl"]:
1115
- props_list.append(f"{k}:double")
1116
- elif v in ["float[]", "double[]"]:
1117
- props_list.append(f"{k}:double[]")
1118
- elif v in ["bool", "boolean"]:
1119
- # TODO Neo4j boolean support / spelling?
1120
- props_list.append(f"{k}:boolean")
1121
- elif v in ["bool[]", "boolean[]"]:
1122
- props_list.append(f"{k}:boolean[]")
1123
- elif v in ["str[]", "string[]"]:
1124
- props_list.append(f"{k}:string[]")
1125
- else:
1126
- props_list.append(f"{k}")
1127
-
1128
- # create list of lists and flatten
1129
- out_list = [[_id], props_list, [":LABEL"]]
1130
- out_list = [val for sublist in out_list for val in sublist]
1131
-
1132
- with open(header_path, "w", encoding="utf-8") as f:
1133
- # concatenate with delimiter
1134
- row = self.delim.join(out_list)
1135
- f.write(row)
1136
-
1137
- # add file path to neo4 admin import statement (import call file
1138
- # path may be different from actual file path)
1139
- import_call_header_path = os.path.join(
1140
- self.import_call_file_prefix,
1141
- header,
1142
- )
1143
- import_call_parts_path = os.path.join(
1144
- self.import_call_file_prefix,
1145
- parts,
1146
- )
1147
- self.import_call_nodes.add(
1148
- (import_call_header_path, import_call_parts_path)
1149
- )
1150
-
1151
- return True
1152
-
1153
- def _write_edge_headers(self):
1154
- """
1155
- Writes single CSV file for a graph entity that is represented
1156
- as an edge as per the definition in the `schema_config.yaml`,
1157
- containing only the header for this type of edge.
1158
-
1159
- Returns:
1160
- bool: The return value. True for success, False otherwise.
1161
- """
1162
- # load headers from data parse
1163
- if not self.edge_property_dict:
1164
- logger.error(
1165
- "Header information not found. Was the data parsed first?",
1166
- )
1167
- return False
1168
-
1169
- for label, props in self.edge_property_dict.items():
1170
- # translate label to PascalCase
1171
- pascal_label = self.translator.name_sentence_to_pascal(
1172
- parse_label(label)
1173
- )
1174
-
1175
- # paths
1176
- header = f"{pascal_label}-header.csv"
1177
- header_path = os.path.join(
1178
- self.outdir,
1179
- header,
1180
- )
1181
- parts = f"{pascal_label}-part.*"
1182
-
1183
- # check for file exists
1184
- if os.path.exists(header_path):
1185
- logger.warning(
1186
- f"File {header_path} already exists. Overwriting."
1187
- )
1188
-
1189
- # concatenate key:value in props
1190
- props_list = []
1191
- for k, v in props.items():
1192
- if v in ["int", "long", "integer"]:
1193
- props_list.append(f"{k}:long")
1194
- elif v in ["int[]", "long[]", "integer[]"]:
1195
- props_list.append(f"{k}:long[]")
1196
- elif v in ["float", "double"]:
1197
- props_list.append(f"{k}:double")
1198
- elif v in ["float[]", "double[]"]:
1199
- props_list.append(f"{k}:double[]")
1200
- elif v in [
1201
- "bool",
1202
- "boolean",
1203
- ]: # TODO does Neo4j support bool?
1204
- props_list.append(f"{k}:boolean")
1205
- elif v in ["bool[]", "boolean[]"]:
1206
- props_list.append(f"{k}:boolean[]")
1207
- elif v in ["str[]", "string[]"]:
1208
- props_list.append(f"{k}:string[]")
1209
- else:
1210
- props_list.append(f"{k}")
1211
-
1212
- skip_id = False
1213
- schema_label = None
1214
-
1215
- if label in ["IS_SOURCE_OF", "IS_TARGET_OF", "IS_PART_OF"]:
1216
- skip_id = True
1217
- elif not self.translator.ontology.mapping.extended_schema.get(
1218
- label
1219
- ):
1220
- # find label in schema by label_as_edge
1221
- for (
1222
- k,
1223
- v,
1224
- ) in self.translator.ontology.mapping.extended_schema.items():
1225
- if v.get("label_as_edge") == label:
1226
- schema_label = k
1227
- break
1228
- else:
1229
- schema_label = label
1230
-
1231
- out_list = [":START_ID"]
1232
-
1233
- if schema_label:
1234
- if (
1235
- self.translator.ontology.mapping.extended_schema.get(
1236
- schema_label
1237
- ).get("use_id")
1238
- == False
1239
- ):
1240
- skip_id = True
1241
-
1242
- if not skip_id:
1243
- out_list.append("id")
1244
-
1245
- out_list.extend(props_list)
1246
- out_list.extend([":END_ID", ":TYPE"])
1247
-
1248
- with open(header_path, "w", encoding="utf-8") as f:
1249
- # concatenate with delimiter
1250
- row = self.delim.join(out_list)
1251
- f.write(row)
1252
-
1253
- # add file path to neo4 admin import statement (import call file
1254
- # path may be different from actual file path)
1255
- import_call_header_path = os.path.join(
1256
- self.import_call_file_prefix,
1257
- header,
1258
- )
1259
- import_call_parts_path = os.path.join(
1260
- self.import_call_file_prefix,
1261
- parts,
1262
- )
1263
- self.import_call_edges.add(
1264
- (import_call_header_path, import_call_parts_path)
1265
- )
1266
-
1267
- return True
1268
-
1269
- def _get_import_script_name(self) -> str:
1270
- """
1271
- Returns the name of the neo4j admin import script
1272
-
1273
- Returns:
1274
- str: The name of the import script (ending in .sh)
1275
- """
1276
- return "neo4j-admin-import-call.sh"
1277
-
1278
- def _construct_import_call(self) -> str:
1279
- """
1280
- Function to construct the import call detailing folder and
1281
- individual node and edge headers and data files, as well as
1282
- delimiters and database name. Built after all data has been
1283
- processed to ensure that nodes are called before any edges.
1284
-
1285
- Returns:
1286
- str: a bash command for neo4j-admin import
1287
- """
1288
- import_call = (
1289
- f"{self.import_call_bin_prefix}neo4j-admin import "
1290
- f"--database={self.db_name} "
1291
- f'--delimiter="{self.escaped_delim}" '
1292
- f'--array-delimiter="{self.escaped_adelim}" '
1293
- )
1294
-
1295
- if self.quote == "'":
1296
- import_call += f'--quote="{self.quote}" '
1297
- else:
1298
- import_call += f"--quote='{self.quote}' "
1299
-
1300
- if self.wipe:
1301
- import_call += f"--force=true "
1302
- if self.skip_bad_relationships:
1303
- import_call += "--skip-bad-relationships=true "
1304
- if self.skip_duplicate_nodes:
1305
- import_call += "--skip-duplicate-nodes=true "
1306
-
1307
- # append node import calls
1308
- for header_path, parts_path in self.import_call_nodes:
1309
- import_call += f'--nodes="{header_path},{parts_path}" '
1310
-
1311
- # append edge import calls
1312
- for header_path, parts_path in self.import_call_edges:
1313
- import_call += f'--relationships="{header_path},{parts_path}" '
1314
-
1315
- return import_call
1316
-
1317
-
1318
1005
  def parse_label(label: str) -> str:
1319
1006
  """
1320
1007
 
@@ -1350,627 +1037,3 @@ def parse_label(label: str) -> str:
1350
1037
  "Label does not start with an alphabetic character or with $. Removed non compliant characters."
1351
1038
  )
1352
1039
  return "".join(matches).strip()
1353
-
1354
-
1355
- class _ArangoDBBatchWriter(_Neo4jBatchWriter):
1356
- """
1357
- Class for writing node and edge representations to disk using the format
1358
- specified by ArangoDB for the use of "arangoimport". Output files are
1359
- similar to Neo4j, but with a different header format.
1360
- """
1361
-
1362
- def _get_default_import_call_bin_prefix(self):
1363
- """
1364
- Method to provide the default string for the import call bin prefix.
1365
-
1366
- Returns:
1367
- str: The default location for the neo4j admin import location
1368
- """
1369
- return ""
1370
-
1371
- def _get_import_script_name(self) -> str:
1372
- """
1373
- Returns the name of the neo4j admin import script
1374
-
1375
- Returns:
1376
- str: The name of the import script (ending in .sh)
1377
- """
1378
- return "arangodb-import-call.sh"
1379
-
1380
- def _write_node_headers(self):
1381
- """
1382
- Writes single CSV file for a graph entity that is represented
1383
- as a node as per the definition in the `schema_config.yaml`,
1384
- containing only the header for this type of node.
1385
-
1386
- Returns:
1387
- bool: The return value. True for success, False otherwise.
1388
- """
1389
- # load headers from data parse
1390
- if not self.node_property_dict:
1391
- logger.error(
1392
- "Header information not found. Was the data parsed first?",
1393
- )
1394
- return False
1395
-
1396
- for label, props in self.node_property_dict.items():
1397
- # create header CSV with ID, properties, labels
1398
-
1399
- _id = "_key"
1400
-
1401
- # translate label to PascalCase
1402
- pascal_label = self.translator.name_sentence_to_pascal(label)
1403
-
1404
- header = f"{pascal_label}-header.csv"
1405
- header_path = os.path.join(
1406
- self.outdir,
1407
- header,
1408
- )
1409
-
1410
- # check if file already exists
1411
- if os.path.exists(header_path):
1412
- logger.warning(
1413
- f"File {header_path} already exists. Overwriting."
1414
- )
1415
-
1416
- # concatenate key:value in props
1417
- props_list = []
1418
- for k in props.keys():
1419
- props_list.append(f"{k}")
1420
-
1421
- # create list of lists and flatten
1422
- # removes need for empty check of property list
1423
- out_list = [[_id], props_list]
1424
- out_list = [val for sublist in out_list for val in sublist]
1425
-
1426
- with open(header_path, "w", encoding="utf-8") as f:
1427
- # concatenate with delimiter
1428
- row = self.delim.join(out_list)
1429
- f.write(row)
1430
-
1431
- # add collection from schema config
1432
- collection = self.translator.ontology.mapping.extended_schema[
1433
- label
1434
- ].get("db_collection_name", None)
1435
-
1436
- # add file path to neo4 admin import statement
1437
- # do once for each part file
1438
- parts = self.parts.get(label, [])
1439
-
1440
- if not parts:
1441
- raise ValueError(
1442
- f"No parts found for node label {label}. "
1443
- f"Check that the data was parsed first.",
1444
- )
1445
-
1446
- for part in parts:
1447
- import_call_header_path = os.path.join(
1448
- self.import_call_file_prefix,
1449
- header,
1450
- )
1451
- import_call_parts_path = os.path.join(
1452
- self.import_call_file_prefix,
1453
- part,
1454
- )
1455
-
1456
- self.import_call_nodes.add(
1457
- (
1458
- import_call_header_path,
1459
- import_call_parts_path,
1460
- collection,
1461
- )
1462
- )
1463
-
1464
- return True
1465
-
1466
- def _write_edge_headers(self):
1467
- """
1468
- Writes single CSV file for a graph entity that is represented
1469
- as an edge as per the definition in the `schema_config.yaml`,
1470
- containing only the header for this type of edge.
1471
-
1472
- Returns:
1473
- bool: The return value. True for success, False otherwise.
1474
- """
1475
- # load headers from data parse
1476
- if not self.edge_property_dict:
1477
- logger.error(
1478
- "Header information not found. Was the data parsed first?",
1479
- )
1480
- return False
1481
-
1482
- for label, props in self.edge_property_dict.items():
1483
- # translate label to PascalCase
1484
- pascal_label = self.translator.name_sentence_to_pascal(label)
1485
-
1486
- # paths
1487
- header = f"{pascal_label}-header.csv"
1488
- header_path = os.path.join(
1489
- self.outdir,
1490
- header,
1491
- )
1492
- parts = f"{pascal_label}-part.*"
1493
-
1494
- # check for file exists
1495
- if os.path.exists(header_path):
1496
- logger.warning(
1497
- f"Header file {header_path} already exists. Overwriting."
1498
- )
1499
-
1500
- # concatenate key:value in props
1501
- props_list = []
1502
- for k in props.keys():
1503
- props_list.append(f"{k}")
1504
-
1505
- out_list = ["_from", "_key", *props_list, "_to"]
1506
-
1507
- with open(header_path, "w", encoding="utf-8") as f:
1508
- # concatenate with delimiter
1509
- row = self.delim.join(out_list)
1510
- f.write(row)
1511
-
1512
- # add collection from schema config
1513
- if not self.translator.ontology.mapping.extended_schema.get(label):
1514
- for (
1515
- _,
1516
- v,
1517
- ) in self.translator.ontology.mapping.extended_schema.items():
1518
- if v.get("label_as_edge") == label:
1519
- collection = v.get("db_collection_name", None)
1520
- break
1521
-
1522
- else:
1523
- collection = self.translator.ontology.mapping.extended_schema[
1524
- label
1525
- ].get("db_collection_name", None)
1526
-
1527
- # add file path to neo4 admin import statement (import call path
1528
- # may be different from actual output path)
1529
- header_import_call_path = os.path.join(
1530
- self.import_call_file_prefix,
1531
- header,
1532
- )
1533
- parts_import_call_path = os.path.join(
1534
- self.import_call_file_prefix,
1535
- parts,
1536
- )
1537
- self.import_call_edges.add(
1538
- (
1539
- header_import_call_path,
1540
- parts_import_call_path,
1541
- collection,
1542
- )
1543
- )
1544
-
1545
- return True
1546
-
1547
- def _construct_import_call(self) -> str:
1548
- """
1549
- Function to construct the import call detailing folder and
1550
- individual node and edge headers and data files, as well as
1551
- delimiters and database name. Built after all data has been
1552
- processed to ensure that nodes are called before any edges.
1553
-
1554
- Returns:
1555
- str: a bash command for neo4j-admin import
1556
- """
1557
- import_call = (
1558
- f"{self.import_call_bin_prefix}arangoimp "
1559
- f"--type csv "
1560
- f'--separator="{self.escaped_delim}" '
1561
- )
1562
-
1563
- if self.quote == "'":
1564
- import_call += f'--quote="{self.quote}" '
1565
- else:
1566
- import_call += f"--quote='{self.quote}' "
1567
-
1568
- node_lines = ""
1569
-
1570
- # node import calls: one line per node type
1571
- for header_path, parts_path, collection in self.import_call_nodes:
1572
- line = (
1573
- f"{import_call} "
1574
- f"--headers-file {header_path} "
1575
- f"--file= {parts_path} "
1576
- )
1577
-
1578
- if collection:
1579
- line += f"--create-collection --collection {collection} "
1580
-
1581
- node_lines += f"{line}\n"
1582
-
1583
- edge_lines = ""
1584
-
1585
- # edge import calls: one line per edge type
1586
- for header_path, parts_path, collection in self.import_call_edges:
1587
- import_call += f'--relationships="{header_path},{parts_path}" '
1588
-
1589
- return node_lines + edge_lines
1590
-
1591
-
1592
- class _PostgreSQLBatchWriter(_BatchWriter):
1593
- """
1594
- Class for writing node and edge representations to disk using the
1595
- format specified by PostgreSQL for the use of "COPY FROM...". Each batch
1596
- writer instance has a fixed representation that needs to be passed
1597
- at instantiation via the :py:attr:`schema` argument. The instance
1598
- also expects an ontology adapter via :py:attr:`ontology_adapter` to be able
1599
- to convert and extend the hierarchy.
1600
-
1601
- This class inherits from the abstract class "_BatchWriter" and implements the
1602
- PostgreSQL-specific methods:
1603
-
1604
- - _write_node_headers
1605
- - _write_edge_headers
1606
- - _construct_import_call
1607
- - _write_array_string
1608
- """
1609
-
1610
- DATA_TYPE_LOOKUP = {
1611
- "str": "VARCHAR", # VARCHAR needs limit
1612
- "int": "INTEGER",
1613
- "long": "BIGINT",
1614
- "float": "NUMERIC",
1615
- "double": "NUMERIC",
1616
- "dbl": "NUMERIC",
1617
- "boolean": "BOOLEAN",
1618
- "str[]": "VARCHAR[]",
1619
- "string[]": "VARCHAR[]",
1620
- }
1621
-
1622
- def __init__(self, *args, **kwargs):
1623
- self._copy_from_csv_commands = set()
1624
- super().__init__(*args, **kwargs)
1625
-
1626
- def _get_default_import_call_bin_prefix(self):
1627
- """
1628
- Method to provide the default string for the import call bin prefix.
1629
-
1630
- Returns:
1631
- str: The default location for the psql command
1632
- """
1633
- return ""
1634
-
1635
- def _get_data_type(self, string) -> str:
1636
- try:
1637
- return self.DATA_TYPE_LOOKUP[string]
1638
- except KeyError:
1639
- logger.info(
1640
- 'Could not determine data type {string}. Using default "VARCHAR"'
1641
- )
1642
- return "VARCHAR"
1643
-
1644
- def _write_array_string(self, string_list) -> str:
1645
- """
1646
- Abstract method to write the string representation of an array into a .csv file
1647
- as required by the postgresql COPY command, with '{','}' brackets and ',' separation.
1648
-
1649
- Args:
1650
- string_list (list): list of ontology strings
1651
-
1652
- Returns:
1653
- str: The string representation of an array for postgres COPY
1654
- """
1655
- string = ",".join(string_list)
1656
- string = f'"{{{string}}}"'
1657
- return string
1658
-
1659
- def _get_import_script_name(self) -> str:
1660
- """
1661
- Returns the name of the psql import script
1662
-
1663
- Returns:
1664
- str: The name of the import script (ending in .sh)
1665
- """
1666
- return f"{self.db_name}-import-call.sh"
1667
-
1668
- def _adjust_pascal_to_psql(self, string):
1669
- string = string.replace(".", "_")
1670
- string = string.lower()
1671
- return string
1672
-
1673
- def _write_node_headers(self):
1674
- """
1675
- Writes single CSV file for a graph entity that is represented
1676
- as a node as per the definition in the `schema_config.yaml`,
1677
- containing only the header for this type of node.
1678
-
1679
- Returns:
1680
- bool: The return value. True for success, False otherwise.
1681
- """
1682
- # load headers from data parse
1683
- if not self.node_property_dict:
1684
- logger.error(
1685
- "Header information not found. Was the data parsed first?",
1686
- )
1687
- return False
1688
-
1689
- for label, props in self.node_property_dict.items():
1690
- # create header CSV with ID, properties, labels
1691
-
1692
- # translate label to PascalCase
1693
- pascal_label = self.translator.name_sentence_to_pascal(label)
1694
-
1695
- parts = f"{pascal_label}-part*.csv"
1696
- parts_paths = os.path.join(self.outdir, parts)
1697
- parts_paths = glob.glob(parts_paths)
1698
- parts_paths.sort()
1699
-
1700
- # adjust label for import to psql
1701
- pascal_label = self._adjust_pascal_to_psql(pascal_label)
1702
- table_create_command_path = os.path.join(
1703
- self.outdir,
1704
- f"{pascal_label}-create_table.sql",
1705
- )
1706
-
1707
- # check if file already exists
1708
- if os.path.exists(table_create_command_path):
1709
- logger.warning(
1710
- f"File {table_create_command_path} already exists. Overwriting.",
1711
- )
1712
-
1713
- # concatenate key:value in props
1714
- columns = ["_ID VARCHAR"]
1715
- for col_name, col_type in props.items():
1716
- col_type = self._get_data_type(col_type)
1717
- col_name = self._adjust_pascal_to_psql(col_name)
1718
- columns.append(f"{col_name} {col_type}")
1719
- columns.append("_LABEL VARCHAR[]")
1720
-
1721
- with open(table_create_command_path, "w", encoding="utf-8") as f:
1722
- command = ""
1723
- if self.wipe:
1724
- command += f"DROP TABLE IF EXISTS {pascal_label};\n"
1725
-
1726
- # table creation requires comma separation
1727
- command += (
1728
- f'CREATE TABLE {pascal_label}({",".join(columns)});\n'
1729
- )
1730
- f.write(command)
1731
-
1732
- for parts_path in parts_paths:
1733
- # if import_call_file_prefix is set, replace actual path
1734
- # with prefix
1735
- if self.import_call_file_prefix != self.outdir:
1736
- parts_path = parts_path.replace(
1737
- self.outdir,
1738
- self.import_call_file_prefix,
1739
- )
1740
-
1741
- self._copy_from_csv_commands.add(
1742
- f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
1743
- )
1744
-
1745
- # add file path to import statement
1746
- # if import_call_file_prefix is set, replace actual path
1747
- # with prefix
1748
- if self.import_call_file_prefix != self.outdir:
1749
- table_create_command_path = table_create_command_path.replace(
1750
- self.outdir,
1751
- self.import_call_file_prefix,
1752
- )
1753
-
1754
- self.import_call_nodes.add(table_create_command_path)
1755
-
1756
- return True
1757
-
1758
- def _write_edge_headers(self):
1759
- """
1760
- Writes single CSV file for a graph entity that is represented
1761
- as an edge as per the definition in the `schema_config.yaml`,
1762
- containing only the header for this type of edge.
1763
-
1764
- Returns:
1765
- bool: The return value. True for success, False otherwise.
1766
- """
1767
- # load headers from data parse
1768
- if not self.edge_property_dict:
1769
- logger.error(
1770
- "Header information not found. Was the data parsed first?",
1771
- )
1772
- return False
1773
-
1774
- for label, props in self.edge_property_dict.items():
1775
- # translate label to PascalCase
1776
- pascal_label = self.translator.name_sentence_to_pascal(label)
1777
-
1778
- parts_paths = os.path.join(self.outdir, f"{pascal_label}-part*.csv")
1779
- parts_paths = glob.glob(parts_paths)
1780
- parts_paths.sort()
1781
-
1782
- # adjust label for import to psql
1783
- pascal_label = self._adjust_pascal_to_psql(pascal_label)
1784
- table_create_command_path = os.path.join(
1785
- self.outdir,
1786
- f"{pascal_label}-create_table.sql",
1787
- )
1788
-
1789
- # check for file exists
1790
- if os.path.exists(table_create_command_path):
1791
- logger.warning(
1792
- f"File {table_create_command_path} already exists. Overwriting.",
1793
- )
1794
-
1795
- # concatenate key:value in props
1796
- columns = []
1797
- for col_name, col_type in props.items():
1798
- col_type = self._get_data_type(col_type)
1799
- col_name = self._adjust_pascal_to_psql(col_name)
1800
- if col_name == "_ID":
1801
- # should ideally never happen
1802
- raise ValueError(
1803
- "Column name '_ID' is reserved for internal use, "
1804
- "denoting the relationship ID. Please choose a "
1805
- "different name for your column."
1806
- )
1807
-
1808
- columns.append(f"{col_name} {col_type}")
1809
-
1810
- # create list of lists and flatten
1811
- # removes need for empty check of property list
1812
- out_list = [
1813
- "_START_ID VARCHAR",
1814
- "_ID VARCHAR",
1815
- *columns,
1816
- "_END_ID VARCHAR",
1817
- "_TYPE VARCHAR",
1818
- ]
1819
-
1820
- with open(table_create_command_path, "w", encoding="utf-8") as f:
1821
- command = ""
1822
- if self.wipe:
1823
- command += f"DROP TABLE IF EXISTS {pascal_label};\n"
1824
-
1825
- # table creation requires comma separation
1826
- command += (
1827
- f'CREATE TABLE {pascal_label}({",".join(out_list)});\n'
1828
- )
1829
- f.write(command)
1830
-
1831
- for parts_path in parts_paths:
1832
- # if import_call_file_prefix is set, replace actual path
1833
- # with prefix
1834
- if self.import_call_file_prefix != self.outdir:
1835
- parts_path = parts_path.replace(
1836
- self.outdir,
1837
- self.import_call_file_prefix,
1838
- )
1839
-
1840
- self._copy_from_csv_commands.add(
1841
- f"\\copy {pascal_label} FROM '{parts_path}' DELIMITER E'{self.delim}' CSV;"
1842
- )
1843
-
1844
- # add file path to import statement
1845
- # if import_call_file_prefix is set, replace actual path
1846
- # with prefix
1847
- if self.import_call_file_prefix != self.outdir:
1848
- table_create_command_path = table_create_command_path.replace(
1849
- self.outdir,
1850
- self.import_call_file_prefix,
1851
- )
1852
-
1853
- self.import_call_edges.add(table_create_command_path)
1854
-
1855
- return True
1856
-
1857
- def _construct_import_call(self) -> str:
1858
- """
1859
- Function to construct the import call detailing folder and
1860
- individual node and edge headers and data files, as well as
1861
- delimiters and database name. Built after all data has been
1862
- processed to ensure that nodes are called before any edges.
1863
-
1864
- Returns:
1865
- str: a bash command for postgresql import
1866
- """
1867
- import_call = ""
1868
-
1869
- # create tables
1870
- # At this point, csv files of nodes and edges do not require differentiation
1871
- for import_file_path in [
1872
- *self.import_call_nodes,
1873
- *self.import_call_edges,
1874
- ]:
1875
- import_call += f'echo "Setup {import_file_path}..."\n'
1876
- if {self.db_password}:
1877
- # set password variable inline
1878
- import_call += f"PGPASSWORD={self.db_password} "
1879
- import_call += (
1880
- f"{self.import_call_bin_prefix}psql -f {import_file_path}"
1881
- )
1882
- import_call += f" --dbname {self.db_name}"
1883
- import_call += f" --host {self.db_host}"
1884
- import_call += f" --port {self.db_port}"
1885
- import_call += f" --user {self.db_user}"
1886
- import_call += '\necho "Done!"\n'
1887
- import_call += "\n"
1888
-
1889
- # copy data to tables
1890
- for command in self._copy_from_csv_commands:
1891
- table_part = command.split(" ")[3]
1892
- import_call += f'echo "Importing {table_part}..."\n'
1893
- if {self.db_password}:
1894
- # set password variable inline
1895
- import_call += f"PGPASSWORD={self.db_password} "
1896
- import_call += f'{self.import_call_bin_prefix}psql -c "{command}"'
1897
- import_call += f" --dbname {self.db_name}"
1898
- import_call += f" --host {self.db_host}"
1899
- import_call += f" --port {self.db_port}"
1900
- import_call += f" --user {self.db_user}"
1901
- import_call += '\necho "Done!"\n'
1902
- import_call += "\n"
1903
-
1904
- return import_call
1905
-
1906
-
1907
- DBMS_TO_CLASS = {
1908
- "neo": _Neo4jBatchWriter,
1909
- "neo4j": _Neo4jBatchWriter,
1910
- "Neo4j": _Neo4jBatchWriter,
1911
- "postgres": _PostgreSQLBatchWriter,
1912
- "postgresql": _PostgreSQLBatchWriter,
1913
- "PostgreSQL": _PostgreSQLBatchWriter,
1914
- "arango": _ArangoDBBatchWriter,
1915
- "arangodb": _ArangoDBBatchWriter,
1916
- "ArangoDB": _ArangoDBBatchWriter,
1917
- }
1918
-
1919
-
1920
- def get_writer(
1921
- dbms: str,
1922
- translator: "Translator",
1923
- deduplicator: "Deduplicator",
1924
- output_directory: str,
1925
- strict_mode: bool,
1926
- ):
1927
- """
1928
- Function to return the writer class based on the selection in the config
1929
- file.
1930
-
1931
- Args:
1932
-
1933
- dbms: the database management system; for options, see DBMS_TO_CLASS.
1934
-
1935
- translator: the Translator object.
1936
-
1937
- output_directory: the directory to write the output files to.
1938
-
1939
- strict_mode: whether to use strict mode.
1940
-
1941
- Returns:
1942
-
1943
- instance: an instance of the selected writer class.
1944
-
1945
- """
1946
-
1947
- dbms_config = _config(dbms)
1948
-
1949
- writer = DBMS_TO_CLASS[dbms]
1950
-
1951
- if not writer:
1952
- raise ValueError(f"Unknown dbms: {dbms}")
1953
-
1954
- if writer is not None:
1955
- return writer(
1956
- translator=translator,
1957
- deduplicator=deduplicator,
1958
- delimiter=dbms_config.get("delimiter"),
1959
- array_delimiter=dbms_config.get("array_delimiter"),
1960
- quote=dbms_config.get("quote_character"),
1961
- output_directory=output_directory,
1962
- db_name=dbms_config.get("database_name"),
1963
- import_call_bin_prefix=dbms_config.get("import_call_bin_prefix"),
1964
- import_call_file_prefix=dbms_config.get("import_call_file_prefix"),
1965
- wipe=dbms_config.get("wipe"),
1966
- strict_mode=strict_mode,
1967
- skip_bad_relationships=dbms_config.get(
1968
- "skip_bad_relationships"
1969
- ), # neo4j
1970
- skip_duplicate_nodes=dbms_config.get(
1971
- "skip_duplicate_nodes"
1972
- ), # neo4j
1973
- db_user=dbms_config.get("user"), # psql
1974
- db_password=dbms_config.get("password"), # psql
1975
- db_port=dbms_config.get("port"), # psql
1976
- )