xml2db 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,152 @@
1
+ from sqlalchemy import (
2
+ Table,
3
+ Column,
4
+ Integer,
5
+ Index,
6
+ PrimaryKeyConstraint,
7
+ UniqueConstraint,
8
+ Boolean,
9
+ DateTime,
10
+ String,
11
+ LargeBinary,
12
+ select,
13
+ )
14
+
15
+ from .transformed_table import DataModelTableTransformed
16
+
17
+
18
class DataModelTableReused(DataModelTableTransformed):
    """Table data model that stores each distinct record once, keyed by its hash.

    This table model is the default model to store XML nodes. Every record
    carries an ``xml2db_record_hash`` column with a unique constraint, and the
    merge step only inserts temporary records whose hash is not already present
    in the target table. n-n relationships with parent nodes are represented
    with an intermediate relationship table. Although more complicated than the
    duplicated version, this table model stores fewer records in the database.
    """

    is_reused = True

    def build_sqlalchemy_tables(self):
        """Build the sqlalchemy table objects for this model.

        Creates the target table and its temporary (prefixed) counterpart from
        the table fields, asks every n-n relation to build its relation tables,
        and finally applies the database schema to the created tables.

        This method is intended to be called only once: when the target table
        has already been built it returns immediately, so further changes to
        the model are not reflected in the sqlalchemy objects.
        """
        if self.table is not None:
            return

        temp_prefix = f"temp_{self.temp_prefix}_"
        pk_name = f"pk_{self.name}"
        use_columnstore = self.config["as_columnstore"]

        def schema_items(for_temp):
            """Return the columns and constraints shared by target and temp tables."""
            items = []
            for kind, _, field in self.fields:
                if kind in ("col", "rel1"):
                    items.extend(field.get_sqlalchemy_column(for_temp))
            if self.is_root_table:
                # the root table is given additional integration metadata columns
                items.append(
                    Column("xml2db_input_file_path", String(256), nullable=False)
                )
                items.append(
                    Column(
                        "xml2db_processed_at", DateTime(timezone=True), nullable=False
                    )
                )
            items.append(Column("xml2db_record_hash", LargeBinary(20), nullable=False))
            # only the temp table's constraint name carries the temp prefix
            constraint_prefix = temp_prefix if for_temp else ""
            items.append(
                UniqueConstraint(
                    "xml2db_record_hash",
                    name=f"{constraint_prefix}{self.name}_xml2db_record_hash",
                )
            )
            return items

        # target table: autoincremented primary key, clustered unless a
        # columnstore index is requested
        self.table = Table(
            self.name,
            self.metadata,
            Column(pk_name, Integer, primary_key=True, autoincrement=True),
            PrimaryKeyConstraint(
                name=f"cx_pk_{self.name}",
                mssql_clustered=not use_columnstore,
            ),
            *schema_items(False),
        )

        if use_columnstore:
            columnstore_index = Index(
                f"idx_{self.name}_columnstore",
                mssql_clustered=True,
                mssql_columnstore=True,
            )
            self.table.append_constraint(columnstore_index)

        # temporary table: keeps a nullable copy of the target primary key, its
        # own non-autoincremented primary key, and a flag for existing records
        self.temp_table = Table(
            f"{temp_prefix}{self.name}",
            self.metadata,
            Column(pk_name, Integer),
            Column(
                f"temp_pk_{self.name}", Integer, primary_key=True, autoincrement=False
            ),
            *schema_items(True),
            Column("temp_exists", Boolean, default=False),
        )

        # relation tables for n-n relationships
        for relation in self.relations_n.values():
            relation.build_relation_tables()

        self._set_db_schema()

    def get_merge_temp_records_statements(self):
        """Yield the statements merging the temporary table into the target table.

        The statements are yielded in this order:

        1. flag temp records whose hash already exists in the target table;
        2. statements produced by each n-1 relation;
        3. insert the temp records which are not flagged into the target table;
        4. copy the target primary keys back into the temp table, matching on hash;
        5. statements produced by each n-n relation.

        Existing records are thus reused whenever a new record has the same hash.

        This method should not be called directly but through the save_db method
        in the Document class, which will ensure that merge queries are issued in
        the correct order for all the data flow, and which will encapsulate all
        queries in a transaction in order to roll back changes on failure.
        """
        hash_name = "xml2db_record_hash"
        pk_name = f"pk_{self.name}"

        def hashes_match():
            """Build a fresh 'temp hash equals target hash' criterion."""
            return self.temp_table.c[hash_name] == self.table.c[hash_name]

        # 1. find matching record hashes in the target table
        flag_existing = self.temp_table.update().values(temp_exists=True)
        yield flag_existing.where(hashes_match())

        # 2. update foreign keys for n-1 relations tables
        for relation in self.relations_1.values():
            yield from relation.get_merge_temp_records_statements()

        # 3. insert missing records from the temp table into the target table,
        #    leaving out temp-only columns and the (not yet known) primary key
        cols = [
            col_name
            for col_name in self.temp_table.columns.keys()
            if not (col_name.startswith("temp_") or col_name == pk_name)
        ]
        missing_records = select(
            *[self.temp_table.c[col_name] for col_name in cols]
        ).where(
            self.temp_table.c.temp_exists
            == False  # noqa: E712 - SQLAlchemy does not support "is False"
        )
        yield self.table.insert().from_select(cols, missing_records)

        # 4. update primary keys back in the temp table
        copy_pk = self.temp_table.update().values(**{pk_name: self.table.c[pk_name]})
        yield copy_pk.where(hashes_match())

        # 5. update primary keys for n-n relations tables
        for relation in self.relations_n.values():
            yield from relation.get_merge_temp_records_statements()
xml2db/table/table.py ADDED
@@ -0,0 +1,356 @@
1
+ from typing import Iterable, List, Any, Union, TYPE_CHECKING
2
+ import logging
3
+ import sqlalchemy
4
+ from sqlalchemy import Table
5
+ from sqlalchemy.schema import CreateTable, CreateIndex
6
+
7
+ from xml2db.table.column import DataModelColumn
8
+ from xml2db.table.relations import DataModelRelation1, DataModelRelationN
9
+ from xml2db.exceptions import DataModelConfigError
10
+
11
+ if TYPE_CHECKING:
12
+ from xml2db.model import DataModel
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class DataModelTable:
    """A database table translated from an XML schema complex type.

    :param table_name: the table's name
    :param type_name: the XSD complex type name
    :param is_root_table: is this table the root table?
    :param is_virtual_node: was this table created to store multiple root elements?
    :param metadata: :class:`sqlalchemy.MetaData` object to build sqlalchemy models into
    :param config: the table's configuration (an empty dict is used when None)
    :param db_schema: database schema to use
    :param temp_prefix: temp prefix to use for naming temp tables
    :param data_model: the `DataModel` instance
    :ivar model_group: 'choice' or 'sequence', extracted from the XSD. 'choice' means
        that only one field can have a value at the same time
    :ivar is_root_table: is this table the root table?
    :ivar fields: an ordered list of tuples describing all table fields, in the form
        (type, name, object) where type can be "col", "rel1" or "reln", name is the
        name of the column or relation, and object is the column or relationship object
    :ivar columns: a dict of all columns (fields with simple values), keyed by field name
    :ivar relations_1: a dict of 0-1 or 1-1 relations, keyed by field name
    :ivar relations_n: a dict of 0-n or 1-n relations, keyed by field name
    """

    is_reused = None

    def __init__(
        self,
        table_name: str,
        type_name: str,
        is_root_table: bool,
        is_virtual_node: bool,
        metadata: sqlalchemy.MetaData,
        config: dict,
        db_schema: str,
        temp_prefix: str,
        data_model: "DataModel",
    ):
        """Store the table attributes and resolve the ``as_columnstore`` option.

        :raises DataModelConfigError: if ``as_columnstore`` is configured for the
            table with a value which is not a bool
        """
        # config attributes
        self.name = table_name
        self.type_name = type_name
        self.is_root_table = is_root_table
        self.is_virtual_node = is_virtual_node
        self.model_group = "sequence"
        self.config = config if config is not None else {}
        if "as_columnstore" not in self.config:
            # no table-level setting: fall back to the model-level one
            self.config["as_columnstore"] = data_model.model_config["as_columnstore"]
        else:
            requested = self.config["as_columnstore"]
            if not isinstance(requested, bool):
                raise DataModelConfigError("as_columnstore must be a bool")
            if (
                requested
                and data_model.engine
                and data_model.engine.dialect.name != "mssql"
            ):
                # the option is switched off when the engine is not MS SQL Server
                self.config["as_columnstore"] = False
                logger.warning(
                    "Clustered columnstore indexes are only supported with MS SQL Server database"
                )
        self.db_schema = db_schema
        self.temp_prefix = temp_prefix

        # fields (columns and relations)
        self.fields = []
        self.columns = {}
        self.relations_1 = {}
        self.relations_n = {}

        # dependencies logic
        # is the table already simplified? (used in the simplification process)
        self.is_simplified = False
        # a set of 1-1 relations the table is involved in as a child
        self.parents_1 = set()
        # a set of 1-n relations the table is involved in as a child
        self.parents_n = set()
        self.parent = None
        # a set of tables this table depends on (can be children or parents)
        self.dependencies = set()
        self.referenced_as_fk = False

        # sqlalchemy objects
        self.metadata = metadata
        self.table = None
        self.temp_table = None
        self.data_model = data_model

    def add_column(
        self,
        name: str,
        data_type: str,
        occurs: List[int],
        min_length: int,
        max_length: Union[int, None],
        is_attr: bool,
        is_content: bool,
        allow_empty: bool,
        ngroup: Union[str, None],
    ) -> None:
        """Create a column and register it in ``columns`` and ``fields``.

        :param name: name of the column
        :param data_type: data type
        :param occurs: min and max occurrences
        :param min_length: minimum length
        :param max_length: maximum length
        :param is_attr: is XML attribute or element?
        :param is_content: is content of a mixed type element?
        :param allow_empty: is nullable?
        :param ngroup: a string id signaling that the column belongs to a nested sequence
        """
        column = DataModelColumn(
            name,
            [(name, None)],
            data_type,
            occurs,
            min_length,
            max_length,
            is_attr,
            is_content,
            allow_empty,
            ngroup,
            self.config,
            self.data_model,
        )
        self.columns[name] = column
        self.fields.append(("col", name, column))

    def add_relation_1(
        self,
        name: str,
        other_table: "DataModelTable",
        occurs: List[int],
        ngroup: Union[str, None],
    ) -> None:
        """Create a 1-to-1 relationship and register it on both tables.

        The relation is stored in ``relations_1`` and ``fields`` of this table,
        and added to ``parents_1`` of the child table.

        :param name: name of the 1-1 relationship
        :param other_table: the child table of the relationship
        :param occurs: min and max occurs for this relationship
        :param ngroup: a string id signaling that the relation belongs to a nested sequence
        :raises ValueError: if the max occurrences is not 1
        """
        max_occurs = occurs[1]
        if max_occurs != 1:
            raise ValueError(
                "attempting to add a 1-1 relationship with max occurrences different from 1"
            )
        relation = DataModelRelation1(
            name,
            [(name, other_table.type_name)],
            self,
            other_table,
            occurs,
            ngroup,
            self.data_model,
        )
        self.relations_1[name] = relation
        self.fields.append(("rel1", name, relation))
        other_table.parents_1.add(relation)

    def add_relation_n(self, name, other_table, occurs, ngroup):
        """Create a 1-to-many relationship and register it on both tables.

        The relation is stored in ``relations_n`` and ``fields`` of this table,
        and added to ``parents_n`` of the child table.

        :param name: name of the 1-n relationship
        :param other_table: the child table of the relationship
        :param occurs: min and max occurs for this relationship
        :param ngroup: a string id signaling that the relation belongs to a nested sequence
        :raises ValueError: if the max occurrences is equal to 1
        """
        max_occurs = occurs[1]
        if max_occurs == 1:
            raise ValueError(
                "attempting to add a 1-n relationship with max occurrences equal to 1"
            )
        relation = DataModelRelationN(
            name,
            [(name, other_table.type_name)],
            self,
            other_table,
            occurs,
            ngroup,
            self.data_model,
        )
        self.relations_n[name] = relation
        self.fields.append(("reln", name, relation))
        other_table.parents_n.add(relation)

    def compute_dependencies(self) -> None:
        """Compute the table's dependencies according to foreign keys relationships.

        Dependencies are tables that the current table holds foreign keys
        relationships to (i.e. the ones which need to exist before this one can be
        created, for instance). The `dependencies` sets ignore foreign keys
        referenced in relationship tables for n-n relationships, while
        `referenced_as_fk` is more literal and includes those.

        This function should be called after schema simplification because
        dependencies will not be properly updated during the simplification process.

        :raises ValueError: if a child table which is not reused already has a parent
        """
        # parents information is no longer accurate after schema simplification
        self.parents_1 = None
        self.parents_n = None

        for field_type, _, relation in self.fields:
            if field_type not in ("rel1", "reln"):
                continue

            child = relation.other_table
            if child.parent is not None and not child.is_reused:
                raise ValueError(
                    f"unsupported: table {child.name} is not reused and has more than 1 parent"
                )
            child.parent = self

            if not child.is_reused:
                # the child table is recorded as depending on this table
                child.dependencies.add(self.type_name)
                self.referenced_as_fk = True
                continue

            # reused child: this table is recorded as depending on the child
            self.dependencies.add(child.type_name)
            child.referenced_as_fk = True
            if field_type == "reln":
                # the relationship table will create a fk constraint to self
                self.referenced_as_fk = True

    def _set_db_schema(self) -> None:
        """Set db schema value for sqlalchemy tables objects.

        Nothing is done unless the schema, the target table and the temp table
        are all set.
        """
        if self.db_schema is None or self.table is None or self.temp_table is None:
            return
        # sqlalchemy.Table.schema is the db_schema
        self.table.schema = self.db_schema
        self.temp_table.schema = self.db_schema

    def get_create_table_statements(self, temp=False) -> Iterable[CreateTable]:
        """Yield create table statements for the table and its relation tables.

        The main table comes first, followed by the relation table of each n-n
        relation which has one.

        :param temp: if True, yield create table statements for temporary tables (prefixed)
        """
        if temp:
            main_table = self.temp_table
            rel_attr = "temp_rel_table"
        else:
            main_table = self.table
            rel_attr = "rel_table"

        yield CreateTable(main_table)
        for relation in self.relations_n.values():
            rel_table = getattr(relation, rel_attr)
            if rel_table is not None:
                yield CreateTable(rel_table)

    def get_create_index_statements(self) -> Iterable[CreateIndex]:
        """Yield create index statements for the indexes of the table and its relation tables."""

        def indexed_tables() -> Iterable[Table]:
            """Yield the target table, then each existing target relation table."""
            yield self.table
            for relation in self.relations_n.values():
                if relation.rel_table is not None:
                    yield relation.rel_table

        for table in indexed_tables():
            # Indexes are sorted by name so that the statements of a same table are
            # always emitted in the same order; otherwise the order is random, and it
            # may create useless git changes in the output folder
            for index in sorted(table.indexes, key=lambda idx: idx.name):
                yield CreateIndex(index)

    def create_tables(self, engine: sqlalchemy.engine.base.Engine, temp: bool = False):
        """Create tables, either target tables or temp tables used to import data.

        The main table is created only if it does not exist yet (``checkfirst``),
        then each n-n relation is asked to create its own table.

        :param engine: a sqlalchemy engine to use
        :param temp: if True, create temporary (prefixed) tables
        """
        main_table = self.temp_table if temp else self.table
        main_table.create(engine, checkfirst=True)
        for relation in self.relations_n.values():
            relation.create_table(engine, temp)

    def get_insert_temp_records_statements(self, data: dict) -> Iterable[Any]:
        """Yield (insert statement, records) pairs for the temporary tables.

        Nothing is yielded when `data` is None or holds no records. Otherwise the
        pair for the temp table comes first, followed by one pair for each n-n
        relation which has records under ``data["relations_n"]``.

        :param data: a dict with a "records" key and an optional "relations_n" key
            which maps relation table names to dicts holding their own "records"
        """
        if data is None or len(data["records"]) == 0:
            return

        yield self.temp_table.insert(), data["records"]

        rel_data = data.get("relations_n", {})
        for relation in self.relations_n.values():
            rel_name = relation.rel_table_name
            if rel_name not in rel_data:
                continue
            rel_records = rel_data[rel_name]["records"]
            if len(rel_records) > 0:
                yield relation.temp_rel_table.insert(), rel_records

    def drop_tables(self, engine: sqlalchemy.engine.base.Engine) -> None:
        """Drop target (unprefixed) tables (main table and relations).

        Relation tables are dropped before the main table.

        BE CAUTIOUS, THIS METHOD DROPS TABLES WITHOUT FURTHER NOTICE!

        :param engine: a sqlalchemy engine to use
        """
        for relation in self.relations_n.values():
            rel_table = relation.rel_table
            if rel_table is None:
                continue
            rel_table.drop(engine, checkfirst=True)
        self.table.drop(engine, checkfirst=True)

    def drop_temp_tables(self, engine: sqlalchemy.engine.base.Engine) -> None:
        """Drop temporary (prefixed) tables (main table and relations).

        Relation tables are dropped before the main table.

        BE CAUTIOUS, THIS METHOD DROPS TABLES WITHOUT FURTHER NOTICE!

        :param engine: a sqlalchemy engine to use
        """
        for relation in self.relations_n.values():
            temp_rel_table = relation.temp_rel_table
            if temp_rel_table is None:
                continue
            temp_rel_table.drop(engine, checkfirst=True)
        self.temp_table.drop(engine, checkfirst=True)

    def get_entity_rel_diagram(self) -> List:
        """Build ERD representation for a single table and its relationships.

        The string representation is used by mermaid.js to create a visual diagram.
        Lines come in this order: 1-1 relations, 1-n relations (the relation name is
        suffixed with '*' when the child table is reused), then the entity block
        listing the table columns.

        :return: a list of strings (lines)
        """
        lines = []

        for rel in self.relations_1.values():
            marker = "o" if rel.occurs[0] == 0 else "|"
            lines.append(
                f'{self.name} ||--{marker}| {rel.other_table.name} : "{rel.name}"'
            )

        for rel in self.relations_n.values():
            marker = "o" if rel.occurs[0] == 0 else "|"
            reused_mark = "*" if rel.other_table.is_reused else ""
            lines.append(
                f"{self.name} ||--{marker}{{ {rel.other_table.name} : "
                f'"{rel.name}{reused_mark}"'
            )

        lines.append(f"{self.name} {{")
        for field_type, field_name, _ in self.fields:
            if field_type != "col":
                continue
            column = self.columns[field_name]
            many_mark = "-N" if column.occurs[1] is None else ""
            # NOTE(review): the single leading space below (and in the return
            # statement) is copied from the rendered source - confirm the intended
            # indent width against the upstream file
            lines.append(
                f" {column.data_type}{many_mark} {field_name.replace('.', '_')}"
            )
        lines.append("}")

        return [f" {line}" for line in lines]