xml2db 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xml2db/table/column.py ADDED
@@ -0,0 +1,190 @@
1
+ import logging
2
+ from typing import List, Iterable, Any, Union, TYPE_CHECKING
3
+
4
+ from sqlalchemy import (
5
+ Integer,
6
+ Float,
7
+ Boolean,
8
+ BigInteger,
9
+ SmallInteger,
10
+ Column,
11
+ DateTime,
12
+ String,
13
+ )
14
+ from sqlalchemy.dialects import mssql
15
+
16
+ if TYPE_CHECKING:
17
+ from xml2db.model import DataModel
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def types_mapping_default(temp: bool, col: "DataModelColumn") -> Any:
    """Defines the sqlalchemy type to use for given column properties in target tables

    This mapping only relies on generic sqlalchemy types.

    :param temp: are we targeting the temporary tables schema or the final tables?
        This mapping does not read it: the same type is returned in both cases.
    :param col: an object representing a column of a table for which we are determining the SQL type to define
    :return: a sqlalchemy class representing the data type to be used
    """
    # any column whose max occurrences is not exactly 1 gets a wide string column,
    # whatever its declared data type
    if col.occurs[1] != 1:
        return String(8000)
    if col.data_type in ["decimal", "float"]:
        return Float
    if col.data_type == "dateTime":
        return DateTime(timezone=True)
    if col.data_type == "integer" or col.data_type == "int":
        return Integer
    if col.data_type == "boolean":
        return Boolean
    if col.data_type == "byte":
        return SmallInteger
    if col.data_type == "long":
        return BigInteger
    # dates and times are kept as short strings rather than native date/time types
    if col.data_type == "date":
        return String(16)
    if col.data_type == "time":
        return String(18)
    if col.data_type in ["string", "NMTOKEN", "duration", "token"]:
        # no declared max length: fall back to a default size
        if col.max_length is None:
            return String(1000)
        # The generic String type is used whatever the min length is, so there
        # is nothing to decide here beyond the declared max length.
        return String(col.max_length)
    logger.warning(
        f"unknown type '{col.data_type}' for column '{col.name}', defaulting to VARCHAR(1000) "
        f"(this can be overridden by providing a field type in the configuration)"
    )
    return String(1000)
60
+
61
+
62
def types_mapping_mssql(temp: bool, col: "DataModelColumn") -> Any:
    """Pick the MSSQL column type matching the properties of a column

    :param temp: True when the type is for the temporary tables schema, False for the final tables
    :param col: the column description for which a SQL type is needed
    :return: a sqlalchemy type (class or instance) to use for the column
    """
    # columns whose max occurrences is not exactly 1 always get a wide VARCHAR
    if col.occurs[1] != 1:
        return mssql.VARCHAR(8000)

    xsd_type = col.data_type

    # data types which map to a sqlalchemy type without any parameter
    plain_types = {
        "decimal": Float,
        "float": Float,
        "integer": Integer,
        "int": Integer,
        "boolean": Boolean,
        "byte": SmallInteger,
        "long": BigInteger,
    }
    if xsd_type in plain_types:
        return plain_types[xsd_type]

    if xsd_type == "dateTime":
        # using the DATETIMEOFFSET directly in the temporary table caused issues when inserting data in the
        # target table, whereas INSERT INTO SELECT converts datetime VARCHAR to DATETIMEOFFSET without errors
        if temp:
            return mssql.VARCHAR(100)
        return mssql.DATETIMEOFFSET

    # dates and times are stored as short strings
    if xsd_type == "date":
        return mssql.VARCHAR(16)
    if xsd_type == "time":
        return mssql.VARCHAR(18)

    if xsd_type in ("string", "NMTOKEN", "duration", "token"):
        declared_max = col.max_length
        if declared_max is None:
            return mssql.VARCHAR(1000)
        lower_bound = col.min_length if col.min_length is not None else 0
        # CHAR is chosen when the min length is within one character of the max length
        # and empty values are not allowed; VARCHAR otherwise
        use_char = lower_bound >= declared_max - 1 and not col.allow_empty
        if use_char:
            return mssql.CHAR(declared_max)
        return mssql.VARCHAR(declared_max)

    logger.warning(
        f"unknown type '{col.data_type}' for column '{col.name}', defaulting to VARCHAR(1000) "
        f"(this can be overridden by providing a field type in the configuration)"
    )
    return mssql.VARCHAR(1000)
102
+
103
+
104
class DataModelColumn:
    """Description of a single column of a table

    :param name: column name
    :param name_chain: stored on the instance as-is (not read within this class)
    :param data_type: column data type
    :param occurs: min and max occurrences of the field
    :param min_length: min length
    :param max_length: max length
    :param is_attr: stored on the instance as-is (presumably flags an XML attribute; confirm with callers)
    :param is_content: stored on the instance as-is (presumably flags element text content; confirm with callers)
    :param allow_empty: is nullable ?
    :param ngroup: a key used to handle nested sequences
    :param model_config: data model config, may contain column type information
    :param data_model: the DataModel object it belongs to
    :ivar name: the name of the field (i.e. column name)
    :ivar data_type: the data type, extracted from XSD data type
    :ivar occurs: list of int with two elements: min occurrences and max occurrences.
        Max occurrences is None if unbounded
    :ivar types_mapping: the function used to derive the SQL type of the column, chosen
        from the dialect of the data model engine
    """

    def __init__(
        self,
        name: str,
        name_chain: list,
        data_type: str,
        occurs: List[int],
        min_length: int,
        max_length: int,
        is_attr: bool,
        is_content: bool,
        allow_empty: bool,
        ngroup: Union[int, None],
        model_config: dict[str, Any],
        data_model: "DataModel",
    ):
        """Store the column properties and select the type mapping function"""
        # identification of the field
        self.name = name
        self.name_chain = name_chain
        self.is_attr = is_attr
        self.is_content = is_content
        self.ngroup = ngroup

        # type and cardinality constraints
        self.data_type = data_type
        self.occurs = occurs
        self.min_length = min_length
        self.max_length = max_length
        self.allow_empty = allow_empty

        # context
        self.model_config = model_config
        self.data_model = data_model
        self.other_table = None  # just to avoid a linting warning

        # the MSSQL mapping is used only when an engine is set and its dialect is "mssql"
        if data_model.engine and data_model.engine.dialect.name == "mssql":
            self.types_mapping = types_mapping_mssql
        else:
            self.types_mapping = types_mapping_default

    @property
    def can_join_values_as_string(self):
        """Tell whether multiple values can be stored as comma separated values in this column

        :return: True if max occurrences is 1, or if it is unbounded or greater than 1 and the data type
            is compatible with comma separated values; False for any other max occurrences value
        :raises ValueError: if max occurrences is unbounded or greater than 1 and the data type does not
            allow storage as comma separated values
        """
        max_occurs = self.occurs[1]
        if max_occurs == 1:
            return True
        # remaining bounded values which are not above 1 (e.g. 0)
        if max_occurs is not None and not max_occurs > 1:
            return False
        joinable_types = ("string", "date", "dateTime", "NMTOKEN", "time")
        if self.data_type not in joinable_types:
            raise ValueError(
                f"Col type '{self.data_type}' with maxOccur > 1 is not supported."
            )
        return True

    def get_sqlalchemy_column(self, temp: bool = False) -> Iterable[Column]:
        """Yield the sqlalchemy Column object for this column

        :param temp: temp table or target table ?
        """
        # a type given in the config for this field takes precedence over the computed one
        field_config = self.model_config.get("fields", {}).get(self.name, {})
        column_type = field_config.get("type")
        if not column_type:
            column_type = self.types_mapping(temp, self)

        yield Column(self.name, column_type)
@@ -0,0 +1,180 @@
1
+ from typing import Iterable, Any
2
+ from sqlalchemy import (
3
+ Table,
4
+ Column,
5
+ Integer,
6
+ ForeignKey,
7
+ PrimaryKeyConstraint,
8
+ Index,
9
+ Boolean,
10
+ DateTime,
11
+ String,
12
+ select,
13
+ and_,
14
+ )
15
+
16
+ from xml2db.table.transformed_table import DataModelTableTransformed
17
+
18
+
19
class DataModelTableDuplicated(DataModelTableTransformed):
    """A table data model which allows duplicated records in the database.

    This table model is only allowed if this node type is used only once in the schema,
    in a 1-n relationship with its parent node. The 1-n relationship is represented with
    a foreign key relation from this node to its parent node, without intermediate relationship
    table. As such, it is a simpler schema, with the drawback of having duplicated records.
    """

    is_reused = False

    def build_sqlalchemy_tables(self) -> None:
        """Build sqlalchemy table objects.

        Builds the target table and the temporary table for this node, then asks each n-n
        relation to build its own relationship tables, and finally calls ``_set_db_schema``.
        This method is intended to be called only once: if the target table has already been
        built it returns immediately, so later changes to the model are not reflected.
        """
        if self.table is not None:
            return

        temp_table_name = f"temp_{self.temp_prefix}_{self.name}"

        def column_specs(temp=False) -> Iterable[Column]:
            """Generate the sqlalchemy Column objects shared by both tables

            :param temp: are we targeting temp or target table?
            """
            parent_name = self.parent.name

            # temp primary key, also kept in the final table when it is needed
            # to update the target pk back into the temp table
            if temp or self.referenced_as_fk:
                yield Column(
                    f"temp_pk_{self.name}",
                    Integer,
                    primary_key=temp,
                    autoincrement=False,
                )

            # link with the parent: a real foreign key in the target table,
            # plain integer columns (temp and final keys) in the temp table
            if not temp:
                yield Column(
                    f"fk_parent_{parent_name}",
                    Integer,
                    ForeignKey(f"{parent_name}.pk_{parent_name}"),
                    index=True,
                )
            else:
                yield Column(f"temp_fk_parent_{parent_name}", Integer)
                yield Column(f"fk_parent_{parent_name}", Integer)

            # optional row number column
            if self.data_model.model_config["row_numbers"]:
                yield Column("xml2db_row_number", Integer, nullable=False)

            # regular columns and 1-1 relationships
            for field_type, _key, field in self.fields:
                if field_type in ("col", "rel1"):
                    yield from field.get_sqlalchemy_column(temp)

            # the root table carries additional integration metadata columns
            if self.is_root_table:
                yield Column("xml2db_input_file_path", String(256), nullable=False)
                yield Column(
                    "xml2db_processed_at", DateTime(timezone=True), nullable=False
                )

        use_columnstore = self.config["as_columnstore"]

        # target table: the primary key is clustered unless a columnstore index is requested
        primary_key = Column(
            f"pk_{self.name}", Integer, primary_key=True, autoincrement=True
        )
        pk_constraint = PrimaryKeyConstraint(
            name=f"cx_pk_{self.name}",
            mssql_clustered=not use_columnstore,
        )
        self.table = Table(
            self.name,
            self.metadata,
            primary_key,
            pk_constraint,
            *column_specs(),
        )

        if use_columnstore:
            columnstore_index = Index(
                f"idx_{self.name}_columnstore",
                mssql_clustered=True,
                mssql_columnstore=True,
            )
            self.table.append_constraint(columnstore_index)

        # temporary table: same columns, plus a "temp_exists" flag
        self.temp_table = Table(
            temp_table_name,
            self.metadata,
            Column(f"pk_{self.name}", Integer),
            *column_specs(temp=True),
            Column("temp_exists", Boolean, default=False),
        )

        # relationship tables for n-n relations
        for relation in self.relations_n.values():
            relation.build_relation_tables()

        self._set_db_schema()

    def get_merge_temp_records_statements(self) -> Iterable[Any]:
        """Yield insert and update statements to merge temporary tables into target tables

        The statements copy the data of the temporary table (prefixed) into the target table
        (unprefixed). As this kind of node can be duplicated, no unique constraint is used:
        the ``temp_exists`` flag is copied from the parent temp record, and only records whose
        flag is false are inserted.

        This method should not be called directly but through the save_db method in the
        :class:`xml2db.Document` object holding the parsed XML document data, which will ensure
        that merge queries are issued in the correct order, and which will encapsulate all
        queries in a transaction in order to rollback changes on failure.
        """
        staging = self.temp_table
        parent_staging = self.parent.temp_table
        parent_name = self.parent.name
        pk_name = f"pk_{self.name}"
        fk_parent_name = f"fk_parent_{parent_name}"

        # copy the parent's final pk and its temp_exists flag into the temp records
        from_parent = {
            fk_parent_name: getattr(parent_staging.c, f"pk_{parent_name}"),
            "temp_exists": parent_staging.c.temp_exists,
        }
        matches_parent = getattr(
            staging.c, f"temp_fk_parent_{parent_name}"
        ) == getattr(parent_staging.c, f"temp_pk_{parent_name}")
        yield staging.update().values(**from_parent).where(matches_parent)

        # foreign keys of the n-1 relations
        for relation in self.relations_1.values():
            yield from relation.get_merge_temp_records_statements()

        # insert every target column but the autoincremented pk, for new records only
        insert_cols = [
            col_name for col_name in self.table.columns.keys() if col_name != pk_name
        ]
        source_cols = [getattr(staging.c, col_name) for col_name in insert_cols]
        new_records = select(*source_cols).where(
            staging.c.temp_exists == False  # noqa: SQLAlchemy not supporting "is False"
        )
        yield self.table.insert().from_select(insert_cols, new_records)

        # when referenced through a fk, bring the generated primary keys back into the temp table
        if self.referenced_as_fk:
            same_parent = getattr(staging.c, fk_parent_name) == getattr(
                self.table.c, fk_parent_name
            )
            same_temp_pk = getattr(staging.c, f"temp_pk_{self.name}") == getattr(
                self.table.c, f"temp_pk_{self.name}"
            )
            yield staging.update().values(
                **{pk_name: getattr(self.table.c, pk_name)}
            ).where(and_(same_parent, same_temp_pk))

        # records of the n-n relations tables
        for relation in self.relations_n.values():
            yield from relation.get_merge_temp_records_statements()
@@ -0,0 +1,243 @@
1
+ import sqlalchemy.engine
2
+ from sqlalchemy import Table, Column, ForeignKey, Integer, Index, select
3
+ from typing import TYPE_CHECKING, List, Iterable, Any, Union
4
+
5
+ if TYPE_CHECKING:
6
+ from xml2db.table.table import DataModelTable
7
+ from xml2db.model import DataModel
8
+
9
+
10
class DataModelRelation:
    """Base description of a relation between a table and another table

    :param name: the name of the field holding the relation in the parent table
    :param name_chain: stored on the instance as-is (not read within this class)
    :param table: the parent table model in the relation
    :param other_table: the other table model in the relation
    :param occurs: list of int with two elements: min occurrences and max occurrences.
        Max occurrences is None if unbounded
    :param ngroup: a key used to handle nested sequences
    :param data_model: the DataModel object it belongs to
    """

    def __init__(
        self,
        name: str,
        name_chain: list,
        table: "DataModelTable",
        other_table: "DataModelTable",
        occurs: List[int],
        ngroup: Union[str, None],
        data_model: "DataModel",
    ):
        """Store the relation properties; derived names and tables start unset"""
        # both ends of the relation and the model they belong to
        self.table = table
        self.other_table = other_table
        self.data_model = data_model

        # description of the field holding the relation
        self.name = name
        self.name_chain = name_chain
        self.occurs = occurs
        self.ngroup = ngroup

        # names and sqlalchemy tables are not known yet: subclasses fill them in later
        self.field_name = None
        self.rel_table_name = None
        self.rel_table = None
        self.temp_rel_table = None
44
+
45
+
46
class DataModelRelation1(DataModelRelation):
    """A class representing a 1-1 relation with another table"""

    def get_sqlalchemy_column(self, temp: bool = False):
        """Yield the sqlalchemy Column object(s) holding the foreign key of the relation

        As a side effect, this sets ``self.field_name``, which is read by
        ``get_merge_temp_records_statements``.

        :param temp: are we targeting temp or target table?
        """
        other_name = self.other_table.name

        # do not repeat the other table name when the field name already ends with it
        if self.name.endswith(other_name):
            self.field_name = f"fk_{self.name}"
        else:
            self.field_name = f"{self.name}_fk_{other_name}"

        if not temp:
            # target table: a real, indexed foreign key to the other table
            yield Column(
                self.field_name,
                Integer,
                ForeignKey(f"{other_name}.pk_{other_name}"),
                index=True,
            )
            return

        # temp table: plain integer columns for the temp key and the final key
        yield Column(f"temp_{self.field_name}", Integer)
        yield Column(self.field_name, Integer)

    def get_merge_temp_records_statements(self) -> Iterable[Any]:
        """Yield the SQL statement filling the foreign key column of the temp table

        The foreign key column of this table's temp table is set to the primary key found in
        the other table's temp table, for records whose temp foreign key equals the temp
        primary key of the other table.

        :return: iterable of SQL statements
        """
        own_staging = self.table.temp_table
        other_staging = self.other_table.temp_table
        other_name = self.other_table.name

        new_fk_value = getattr(other_staging.c, f"pk_{other_name}")
        same_temp_key = getattr(own_staging.c, f"temp_{self.field_name}") == getattr(
            other_staging.c, f"temp_pk_{other_name}"
        )
        yield own_staging.update().values(**{self.field_name: new_fk_value}).where(
            same_temp_key
        )
87
+
88
+
89
class DataModelRelationN(DataModelRelation):
    """A class representing a 1-N relation with another table"""

    def build_relation_tables(self) -> None:
        """Build the sqlalchemy objects for the intermediate relationship tables

        The relationship table name is always computed. The temp and target relationship
        tables themselves are only built when the other table is flagged as reused.
        """
        own_name = self.table.name
        other_name = self.other_table.name

        # do not repeat the other table name when the field name already ends with it
        if self.name.endswith(other_name):
            self.rel_table_name = f"{own_name}_{other_name}"
        else:
            self.rel_table_name = f"{own_name}_{self.name}_{other_name}"
        prefix = f"temp_{self.table.temp_prefix}_"

        if not self.other_table.is_reused:
            return

        config = self.data_model.model_config

        def row_number_columns():
            """Return a fresh row number column in a tuple if configured, else an empty tuple"""
            if config["row_numbers"]:
                return (Column("xml2db_row_number", Integer, nullable=False),)
            return ()

        # temp relationship table: temp and final keys for both sides
        self.temp_rel_table = Table(
            f"{prefix}{self.rel_table_name}",
            self.table.metadata,
            Column(f"temp_fk_{own_name}", Integer, nullable=False),
            Column(f"fk_{own_name}", Integer),
            Column(f"temp_fk_{other_name}", Integer, nullable=False),
            Column(f"fk_{other_name}", Integer),
            *row_number_columns(),
        )

        clustered_index = ()
        engine = self.data_model.engine
        if engine and engine.dialect.name == "mssql" and not config["as_columnstore"]:
            # n-n relation tables don't have a primary key, so we define a clustered index on the first FK
            clustered_index = (
                Index(
                    f"ix_fk_{self.rel_table_name}",
                    f"fk_{own_name}",
                    mssql_clustered=True,
                ),
            )

        # target relationship table: real foreign keys to both sides
        self.rel_table = Table(
            self.rel_table_name,
            self.table.metadata,
            Column(
                f"fk_{own_name}",
                Integer,
                ForeignKey(f"{own_name}.pk_{own_name}"),
                nullable=False,
            ),
            Column(
                f"fk_{other_name}",
                Integer,
                ForeignKey(f"{other_name}.pk_{other_name}"),
                nullable=False,
                index=True,
            ),
            *row_number_columns(),
            *clustered_index,
        )

        # set columnstore index
        if config["as_columnstore"]:
            self.rel_table.append_constraint(
                Index(
                    f"idx_{self.rel_table.name}_columnstore",
                    mssql_clustered=True,
                    mssql_columnstore=True,
                )
            )

        # both tables live in the same schema as the parent table, when one is set
        schema = self.table.db_schema
        if schema is not None:
            self.rel_table.schema = schema
            self.temp_rel_table.schema = schema

    def create_table(
        self, engine: sqlalchemy.engine.Engine, temp: bool = False
    ) -> None:
        """Create the intermediate relationship table, if it has been built

        :param engine: sqlalchemy engine to use
        :param temp: are we creating temp or target table?
        """
        relation_table = self.temp_rel_table if temp else self.rel_table
        if relation_table is not None:
            relation_table.create(engine, checkfirst=True)

    def get_merge_temp_records_statements(self) -> Iterable[Any]:
        """Yield SQL statements inserting new records in the intermediate relationship table

        Nothing is yielded unless the other table is flagged as reused. Otherwise, the foreign
        keys of the temp relationship table are first updated with the keys found in the temp
        tables of both sides, then the relationship records whose foreign key to this table is
        not null are inserted into the target relationship table.

        :return: sqlalchemy query statements
        """
        if not self.other_table.is_reused:
            return

        rel_tb = self.temp_rel_table
        own_name = self.table.name
        other_name = self.other_table.name
        own_staging = self.table.temp_table
        other_staging = self.other_table.temp_table
        own_fk = f"fk_{own_name}"
        other_fk = f"fk_{other_name}"

        # update foreign key with self, only for records not flagged as already existing
        same_own_key = getattr(rel_tb.c, f"temp_fk_{own_name}") == getattr(
            own_staging.c, f"temp_pk_{own_name}"
        )
        own_is_new = (
            own_staging.c.temp_exists
            == False  # noqa: SQLAlchemy not supporting "is False"
        )
        yield rel_tb.update().values(
            **{own_fk: getattr(own_staging.c, f"pk_{own_name}")}
        ).where(same_own_key).where(own_is_new)

        # update foreign key with other table
        same_other_key = getattr(rel_tb.c, f"temp_fk_{other_name}") == getattr(
            other_staging.c, f"temp_pk_{other_name}"
        )
        yield rel_tb.update().values(
            **{other_fk: getattr(other_staging.c, f"pk_{other_name}")}
        ).where(same_other_key)

        # insert new records
        cols = [own_fk, other_fk]
        if self.data_model.model_config["row_numbers"]:
            cols = cols + ["xml2db_row_number"]
        source_cols = [getattr(rel_tb.c, col) for col in cols]
        new_relations = select(*source_cols).where(
            getattr(rel_tb.c, own_fk)
            != None  # noqa: SQLAlchemy not supporting "is not None"
        )
        yield self.rel_table.insert().from_select(cols, new_relations)