xml2db 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,314 @@
1
+ from typing import Union, List, Tuple
2
+
3
+ from xml2db.exceptions import DataModelConfigError
4
+ from .column import DataModelColumn
5
+ from .relations import DataModelRelation1, DataModelRelationN
6
+ from .table import DataModelTable
7
+
8
+
9
+ class DataModelTableTransformed(DataModelTable):
10
+ """A class extending DataModelTable with transformations
11
+
12
+ This class allows simplifying a DataModelTable object with default or configured transformations in \
13
+ order to reduce final schema complexity.
14
+ """
15
+
16
+ def _can_choice_transform_table(self) -> bool:
17
+ """Check if the table is of type "choice" and can be transformed to type/value fields.
18
+
19
+ :return: True if the table model be converted to type/value choice model, False otherwise
20
+ """
21
+ if self.model_group == "choice":
22
+ col_types = list(set([col.data_type for col in self.columns.values()]))
23
+ return (
24
+ len(self.relations_1) == 0
25
+ and len(self.relations_n) == 0
26
+ and len(col_types) == 1
27
+ )
28
+ return False
29
+
30
+ def _is_table_choice_transform_applicable(self) -> bool:
31
+ """Determine if choice transform should be applied to the whole table.
32
+
33
+ We try the choice_transform value provided in config, if any, and otherwise fall back to default value.
34
+
35
+ :return: True if choice transform is to be applied, False otherwise.
36
+ """
37
+ if "choice_transform" in self.config:
38
+ if isinstance(self.config["choice_transform"], bool):
39
+ if self.config["choice_transform"]:
40
+ if self._can_choice_transform_table():
41
+ return True
42
+ else:
43
+ raise DataModelConfigError(
44
+ f"Choice-transform cannot be applied to table '{self.name}', see conditions in "
45
+ f"DataModelTableTransformed._can_choice_transform_table."
46
+ )
47
+ else:
48
+ return False
49
+ else:
50
+ raise DataModelConfigError(
51
+ f"Unrecognized choice_transform value '{self.config['choice_transform']}'"
52
+ f" for table '{self.name}'. Only boolean values True or False are allowed."
53
+ )
54
+ elif self._can_choice_transform_table() and len(self.columns) > 2:
55
+ # column number isn't reduced if the number of columns = 2, as it would be elevated to 2 columns then
56
+ return True
57
+ return False
58
+
59
def _transform_to_choice(self) -> None:
    """Replace all columns of this table by a generic "type" / "value" pair.

    The "type" column is a string sized after the shortest and longest original column
    names. The "value" column takes the data type of the original columns; its length
    bounds are only set when every original column defines them, and it allows empty
    values as soon as one original column does.
    """
    source_columns = list(self.columns.values())
    value_type = list(set([column.data_type for column in source_columns]))[0]
    name_lengths = [len(column.name) for column in source_columns]
    min_lengths = [column.min_length for column in source_columns]
    max_lengths = [column.max_length for column in source_columns]
    allow_empty = [column.allow_empty for column in source_columns]

    type_column = DataModelColumn(
        "type",
        [("type", None)],
        "string",
        [1, 1],
        min(name_lengths),
        max(name_lengths),
        False,
        False,
        False,
        None,
        self.config,
        self.data_model,
    )
    # a length bound is kept only if it is known for all the merged columns
    value_min = min(min_lengths) if all(e is not None for e in min_lengths) else None
    value_max = max(max_lengths) if all(e is not None for e in max_lengths) else None
    value_column = DataModelColumn(
        "value",
        [("value", None)],
        value_type,
        [1, 1],
        value_min,
        value_max,
        False,
        False,
        any(allow_empty),
        None,
        self.config,
        self.data_model,
    )

    self.columns = {"type": type_column, "value": value_column}
    self.fields = [
        ("col", "type", self.columns["type"]),
        ("col", "value", self.columns["value"]),
    ]
100
+
101
+ def _can_transform_field(
102
+ self, field_type: str, field_name: str, transform: str = "join"
103
+ ) -> bool:
104
+ """Check if a given transformation can be applied to a given field
105
+
106
+ :param field_type: the field type ("col", "rel1" or "reln")
107
+ :param field_name: the field name
108
+ :param transform: the transform to be tested
109
+ :return: True is the field can be transformed
110
+ """
111
+ if field_type == "col" and transform == "join":
112
+ # check if simple columns with max occurrences > 1 can be joined as string
113
+ if self.columns[field_name].can_join_values_as_string:
114
+ return True
115
+ elif field_type == "rel1":
116
+ return transform in ["elevate", "elevate_wo_prefix"]
117
+ # "reln" can never be transformed
118
+ return False
119
+
120
+ def _get_field_transform(
121
+ self, field_type: str, field_name: str
122
+ ) -> Union[str, None]:
123
+ """Get the transformation that should be applied to this field, taking into account user-provided config
124
+
125
+ :param field_type: the field type ("col", "rel1", "reln")
126
+ :param field_name: the field name
127
+ :return: the default transformation that should be applied
128
+ """
129
+ field_config = self.config.get("fields", {}).get(field_name, {})
130
+ if "transform" in field_config:
131
+ if field_config["transform"] is False:
132
+ return None
133
+ if self._can_transform_field(
134
+ field_type, field_name, field_config["transform"]
135
+ ):
136
+ return field_config["transform"]
137
+ else:
138
+ raise DataModelConfigError(
139
+ f"Transform value '{field_config['transform']}' cannot be applied"
140
+ f" to field {field_name} of table '{self.name}'."
141
+ )
142
+ else:
143
+ if field_type == "col":
144
+ if self._can_transform_field("col", field_name, "join"):
145
+ return "join"
146
+ elif field_type == "rel1":
147
+ if (
148
+ self.relations_1[field_name].occurs[0] == 1
149
+ or len(self.relations_1[field_name].other_table.columns) <= 4
150
+ ) and len(self.relations_1[field_name].other_table.parents_n) == 0:
151
+ transform = (
152
+ "elevate_wo_prefix"
153
+ if len(self.columns) == 0 and len(self.relations_1) == 1
154
+ else "elevate"
155
+ )
156
+ if self._can_transform_field("rel1", field_name, transform):
157
+ return transform
158
+
159
def _elevate_relation_1(
    self, rel_name, transform
) -> List[
    Tuple[str, str, Union[DataModelColumn, DataModelRelation1, DataModelRelationN]]
]:
    """Pull the fields of a 1-1 child table up into the current table

    The relation is removed from the current table and every field of the child table is
    re-created on the current table, its name chain being extended with the removed relation.

    :param rel_name: the name of the 1-1 relation to elevate
    :param transform: "elevate_wo_prefix" to keep child field names as is, any other value to
        prefix them with the relation name
    :return: the list of (field type, field name, field object) created on the current table
    """
    rel = self.relations_1[rel_name]
    prefix = "" if transform == "elevate_wo_prefix" else f"{rel.name}_"

    del self.relations_1[rel_name]

    # re-create each child field at the current level
    elevated_fields = []
    for child_field_type, key, child_field in rel.other_table.fields:
        if child_field_type not in ("col", "rel1", "reln"):
            continue
        prefixed_key = f"{prefix}{key}"
        name_chain = [(rel.name, rel.other_table.type_name)] + child_field.name_chain
        # fields of an optional child become optional themselves
        if rel.occurs[0] == 0:
            occurs = [0, child_field.occurs[1]]
        else:
            occurs = child_field.occurs

        if child_field_type == "col":
            elevated = DataModelColumn(
                prefixed_key,
                name_chain,
                child_field.data_type,
                occurs,
                child_field.min_length,
                child_field.max_length,
                child_field.is_attr,
                child_field.is_content,
                child_field.allow_empty,
                child_field.ngroup,
                self.config,
                self.data_model,
            )
            self.columns[prefixed_key] = elevated
        elif child_field_type == "rel1":
            elevated = DataModelRelation1(
                prefixed_key,
                name_chain,
                self,
                child_field.other_table,
                occurs,
                child_field.ngroup,
                self.data_model,
            )
            self.relations_1[prefixed_key] = elevated
        else:
            elevated = DataModelRelationN(
                prefixed_key,
                name_chain,
                self,
                child_field.other_table,
                occurs,
                child_field.ngroup,
                self.data_model,
            )
            self.relations_n[prefixed_key] = elevated
        elevated_fields.append((child_field_type, prefixed_key, elevated))
    return elevated_fields
240
+
241
def simplify_table(self) -> Tuple[dict, dict]:
    """Simplify this table and its children recursively, and report the transforms applied

    Two dicts are returned:
      - the first one maps XML type names to the table-level transform applied, which is
        "choice" when a choice between fields is turned into type / value fields;
      - the second one maps (XML type name, field name) to a tuple made of the child type
        name (None for columns) and the field-level transform, which is one of:
          - "join": multiple values are appended in a single string
          - "elevate": the child type is pulled up to the parent level, child field names
            being prefixed with the field name
          - "elevate_wo_prefix": same as "elevate", without prefixing child field names
          - None: the relation is kept as is
    Columns are only recorded in the second dict when they are joined.

    :return: a tuple of 2 dicts: the first one for type transforms, the second one for fields transforms
    """
    # a table is processed only once
    if self.is_simplified:
        return {}, {}
    self.is_simplified = True

    # a table converted to a choice model needs no further field processing
    if self._is_table_choice_transform_applicable():
        self._transform_to_choice()
        return {self.type_name: "choice"}, {}

    types_transforms = {}
    fields_transforms = {}
    kept_fields = []
    for field_type, field_name, field in self.fields:
        transform_key = (self.type_name, field_name)

        if field_type == "col":
            if self._get_field_transform("col", field_name) == "join":
                fields_transforms[transform_key] = (None, "join")
            kept_fields.append(("col", field_name, field))
            continue

        # relations: the child table is simplified first
        child_table = field.other_table
        child_types, child_fields = child_table.simplify_table()
        types_transforms.update(child_types)
        fields_transforms.update(child_fields)

        # then the child is either elevated into this table or kept as a separate table
        transform = self._get_field_transform(field_type, field_name)
        if transform is None:
            kept_fields.append((field_type, field_name, field))
            fields_transforms[transform_key] = (child_table.type_name, None)
            child_table.keep_table = True
        elif field_type == "rel1":
            kept_fields.extend(self._elevate_relation_1(field_name, transform))
            fields_transforms[transform_key] = (child_table.type_name, transform)

    self.fields = kept_fields
    return types_transforms, fields_transforms
@@ -0,0 +1,258 @@
1
+ import typing
2
+ from datetime import datetime
3
+ from typing import Union
4
+ import logging
5
+ from lxml import etree
6
+ from io import BytesIO
7
+ from itertools import zip_longest
8
+
9
+ if typing.TYPE_CHECKING:
10
+ from xml2db.model import DataModel
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class XMLConverter:
16
+ def __init__(self, data_model: "DataModel", document_tree: dict = None):
17
+ """A class to convert data from document tree format (nested dict) to and from XML.
18
+
19
+ :param data_model: The `DataModel` object
20
+ :param document_tree: Data in the document tree format (optional, can be built by `parse_xml` function)
21
+ """
22
+ self.model = data_model
23
+ self.document_tree = document_tree
24
+
25
def parse_xml(
    self,
    xml_file: Union[str, BytesIO],
    file_path: str = None,
    skip_validation: bool = False,
) -> dict:
    """Parse an XML document into a nested dict and perform the simplifications defined in the
    DataModel object ("pull" child to upper level, transform a choice model into "type" and "value"
    fields or concatenate children as string).

    The result is also stored in `self.document_tree`.

    :param xml_file: An XML file path or file content to be converted
    :param file_path: The file path to be printed in logs and errors. When omitted, `xml_file` is
        used if it is a path, and a generic label otherwise.
    :param skip_validation: If True, the XML document is not validated against the schema before parsing
    :return: The parsed data in the document tree format (nested dict)
    :raises ValueError: if validation is enabled and the document does not conform with the schema
    """
    if file_path is None:
        # fall back on the source itself so that logs and errors never read "None"
        file_path = xml_file if isinstance(xml_file, str) else "<in-memory document>"
    logger.info(f"Parsing XML file: {file_path}")

    xt = etree.parse(xml_file)
    if skip_validation:
        logger.info("Skipping XML file validation")
    else:
        logger.info("Validating XML file against the schema")
        if not self.model.xml_schema.is_valid(xt):
            logger.error(f"XML file {file_path} does not conform with the schema")
            raise ValueError(
                f"XML file {file_path} does not conform with the schema"
            )
        logger.info("XML file conforms with the schema")

    if self.model.tables[self.model.root_table].is_virtual_node:
        # the root table is virtual: wrap the actual document root in an element named after it
        doc = etree.Element(self.model.root_table)
        doc.append(xt.getroot())
    else:
        doc = xt.getroot()
    self.document_tree = self._parse_xml_node(self.model.root_table, doc)
    return self.document_tree
61
+
62
def _parse_xml_node(self, node_type: str, node: etree.Element) -> dict:
    """Parse nodes of an XML document into a dict recursively

    This method is much faster than using xmlschema parse method, but it will not
    check the validity of the document regarding the XSD. It expects to
    deal with a valid XML document.

    Comments and processing instructions found among the children are ignored.

    :param node_type: type of the node
    :param node: lxml node object
    :return: a dict with the node type under "type" and the node content under "content"
    """
    fields_transforms = self.model.fields_transforms
    content = {}
    result = {"type": node_type, "content": content}

    # attributes are stored like single-valued fields, except the schema location hint
    for key, val in node.attrib.items():
        if (
            key
            != "{http://www.w3.org/2001/XMLSchema-instance}noNamespaceSchemaLocation"
        ):
            content[key] = [val]

    if node.text and node.text.strip():
        content["value"] = [node.text.strip()]

    for element in node.iterchildren():
        # iterchildren() also yields comments and processing instructions, whose tag is
        # not a string: they carry no data and would break the tag parsing below
        if not isinstance(element.tag, str):
            continue
        # drop the namespace part of the tag, if any
        key = element.tag.split("}")[1] if "}" in element.tag else element.tag
        node_type_key = (node_type, key)
        value = None
        if element.text and element.text.strip():
            value = element.text
        # fields unknown to the model or joined are kept as text, others are parsed as nodes
        if fields_transforms.get(node_type_key, (None, "join"))[1] != "join":
            value = self._parse_xml_node(fields_transforms[node_type_key][0], element)
        content.setdefault(key, []).append(value)

    # pull the content of elevated children up to this node
    for key in list(content):
        transform = fields_transforms.get((node_type, key), (None, None))[1]
        if transform == "elevate" or transform == "elevate_wo_prefix":
            prefix = f"{key}_" if transform == "elevate" else ""
            child_content = content[key][0]["content"]
            del content[key]
            for child_key, val in child_content.items():
                content[f"{prefix}{child_key}"] = val

    # a choice node is reduced to the type / value pair of its first field
    if self.model.types_transforms.get(node_type) == "choice":
        result["content"] = [
            {"type": [child_key], "value": val}
            for child_key, val in content.items()
        ][0]

    return result
125
+
126
def to_xml(
    self, out_file: str = None, nsmap: dict = None, indent: str = " "
) -> etree.Element:
    """Convert a document tree (nested dict) into an XML file

    :param out_file: If provided, write output to a file, encoded in UTF-8.
    :param nsmap: An optional namespace mapping.
    :param indent: A string used as indent in XML output.
    :return: The etree object corresponding to the root XML node.
    """
    doc = self._make_xml_node(
        self.document_tree,
        self.model.tables[self.document_tree["type"]].name,
        nsmap,
    )
    if self.model.tables[self.model.root_table].is_virtual_node:
        # the virtual root only wraps the actual root: keep its first child (None if it has none)
        doc = next(iter(doc), None)
    if out_file:
        etree.indent(doc, space=indent)
        # the encoding must match the one announced in the XML declaration, whatever the
        # platform default encoding is
        with open(out_file, "wt", encoding="utf-8") as f:
            f.write(
                etree.tostring(
                    doc,
                    pretty_print=True,
                    encoding="utf-8",
                    xml_declaration=True,
                ).decode("utf-8")
            )
    return doc
158
+
159
def _make_xml_node(self, node_data, node_name, nsmap: dict = None):
    """Build an XML element recursively from a document tree node.

    The fields of the table matching `node_data["type"]` are walked in order. Intermediate
    elements described by each field's `name_chain` are opened and closed using a stack, and
    nodes whose type is registered as "choice" in the model are converted back from their
    type / value form.

    :param node_data: a document tree node (dict with "type" and "content" keys)
    :param node_name: the tag name of the XML element created for this node
    :param nsmap: an optional namespace mapping, only applied to the element created for this
        node (it is not passed on to recursive calls)
    :return: the XML element built for this node, or None if the node is of a "choice" type
        and does not have both a "type" and a "value" child
    """

    def check_transformed_node(node_type, element):
        # Nodes of a "choice" type hold "type" and "value" children: rebuild the original
        # element, i.e. a single child named after the type and holding the value.
        # Nodes of any other type are returned unchanged.
        if (
            node_type in self.model.types_transforms
            and self.model.types_transforms[node_type] == "choice"
        ):
            new_node = etree.Element(element.tag)
            extracted = {}
            for child in element:
                extracted[child.tag] = child.text
            if "type" in extracted and "value" in extracted:
                child_node = etree.Element(extracted["type"])
                child_node.text = extracted["value"]
                new_node.append(child_node)
                return new_node
            else:
                return None
        return element

    tb = self.model.tables[node_data["type"]]
    # stack of (node type, element) being built; the first entry is the node itself
    nodes_stack = [(node_data["type"], etree.Element(node_name, nsmap=nsmap))]
    prev_chain = []
    prev_ngroup = None
    # lists of children belonging to the current ngroup, waiting to be interleaved
    ngroup_stack = []
    for field_type, rel_name, rel in tb.fields:
        # all but the last link of the chain are intermediate elements wrapping the field
        # (presumably the child elements removed by "elevate" transforms - to be confirmed)
        name_chain = rel.name_chain[:-1]
        i = len(prev_chain)
        # close the intermediate elements which are not shared with the current chain;
        # a closed element is only attached to its parent if it has at least one child
        while i > 0 and (
            i > len(name_chain) or name_chain[i - 1][0] != prev_chain[i - 1][0]
        ):
            completed_node = check_transformed_node(*nodes_stack.pop())
            if completed_node is not None and len(completed_node) > 0:
                nodes_stack[-1][1].append(completed_node)
            i -= 1
        # open the intermediate elements of the current chain which are not open yet
        while i < len(name_chain):
            node = etree.Element(name_chain[i][0])
            nodes_stack.append(
                (
                    name_chain[i][1],
                    node,
                )
            )
            i += 1
        prev_chain = name_chain
        children = []
        attributes = {}
        text_content = None
        if field_type == "col":
            if rel_name in node_data["content"]:
                # attribute and text content values are used as is (only the first value)
                if rel.is_attr:
                    attributes[rel.name_chain[-1][0]] = node_data["content"][
                        rel_name
                    ][0]
                elif rel.is_content:
                    text_content = node_data["content"][rel_name][0]
                else:
                    # one child element per value
                    for field_value in node_data["content"][rel_name]:
                        child = etree.Element(rel.name_chain[-1][0])
                        if isinstance(field_value, datetime):
                            field_value = field_value.isoformat()
                        # NOTE(review): the text is assigned as UTF-8 bytes, not str; confirm
                        # that lxml accepts non-ASCII bytes here
                        child.text = str(field_value).encode("utf-8")
                        children.append(child)
        elif field_type == "rel1":
            if rel_name in node_data["content"]:
                child = self._make_xml_node(
                    node_data["content"][rel_name][0], rel.name_chain[-1][0]
                )
                children = [child]
        elif field_type == "reln":
            if rel_name in node_data["content"]:
                children = [
                    self._make_xml_node(child_tree, rel.name_chain[-1][0])
                    for child_tree in node_data["content"][rel_name]
                ]
        # when leaving an ngroup, flush it: the children of its fields are interleaved
        # (first child of each field, then second child of each field, and so on)
        # NOTE(review): zip_longest pads shorter lists with None; confirm that fields of a
        # same ngroup always have the same number of children, else None gets appended
        if prev_ngroup and rel.ngroup != prev_ngroup:
            for ngroup_children in zip_longest(*ngroup_stack):
                for child in ngroup_children:
                    nodes_stack[-1][1].append(child)
            ngroup_stack = []
        prev_ngroup = rel.ngroup
        if len(children) > 0:
            # children of ngroup fields are held back until the group is complete
            if rel.ngroup:
                ngroup_stack.append(children)
            else:
                for child in children:
                    nodes_stack[-1][1].append(child)
        for key, val in attributes.items():
            nodes_stack[-1][1].set(key, val)
        if text_content is not None:
            nodes_stack[-1][1].text = text_content
    # flush the last ngroup, if any
    if len(ngroup_stack) > 0:
        for ngroup_children in zip_longest(*ngroup_stack):
            for child in ngroup_children:
                nodes_stack[-1][1].append(child)
    # close the intermediate elements which are still open
    while len(nodes_stack) > 1:
        node = check_transformed_node(*nodes_stack.pop())
        if node is not None and len(node) > 0:
            nodes_stack[-1][1].append(node)

    return check_transformed_node(*nodes_stack[0])
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2023 Commission de régulation de l'énergie
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.1
2
+ Name: xml2db
3
+ Version: 0.9.0
4
+ Summary: Import complex XML files to a relational database
5
+ Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
6
+ Project-URL: Documentation, https://cre-dev.github.io/xml2db
7
+ Project-URL: Repository, https://github.com/cre-dev/xml2db
8
+ Project-URL: Issues page, https://github.com/cre-dev/xml2db/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: sqlalchemy
16
+ Requires-Dist: xmlschema
17
+ Requires-Dist: lxml
18
+ Requires-Dist: graphlib-backport ; python_version < "3.9"
19
+ Provides-Extra: docs
20
+ Requires-Dist: mkdocs-material ; extra == 'docs'
21
+ Requires-Dist: mkdocstrings[python] ; extra == 'docs'
22
+ Provides-Extra: tests
23
+ Requires-Dist: pytest ; extra == 'tests'
24
+
25
+ # Xml2db
26
+
27
+ `xml2db` is a Python package which allows loading XML data into a relational database. It is designed to handle complex
28
+ schemas which cannot be denormalized to a flat table, without any custom code.
29
+
30
+ It builds a data model (i.e. a set of database tables linked with foreign key relationships) based on an XSD schema and
31
+ allows parsing and loading XML files into the database, and getting them back to XML, if needed.
32
+
33
+ It is as simple as:
34
+
35
+ ```python
36
+ from xml2db import DataModel
37
+
38
+ # Create a data model of tables with relations based on the XSD file
39
+ data_model = DataModel(
40
+ xsd_file="path/to/file.xsd",
41
+ connection_string="mssql+pyodbc://server/database?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes",
42
+ )
43
+ # Parse an XML file based on this XSD
44
+ document = data_model.parse_xml(
45
+ xml_file="path/to/file.xml"
46
+ )
47
+ # Insert the document content into the database
48
+ document.insert_into_target_tables()
49
+ ```
50
+
51
+ The data model will adhere closely to the XSD schema, but `xml2db` will perform simplifications aimed at limiting the
52
+ complexity of the resulting data model and the storage footprint.
53
+
54
+ The raw data loaded into the database can then be processed using [DBT](https://www.getdbt.com/), SQL views or
55
+ stored procedures aimed at extracting, correcting and formatting the data into more user-friendly tables.
56
+
57
+ `xml2db` is developed and used at the [French energy regulation authority (CRE)](https://www.cre.fr/) to process XML
58
+ data, notably [REMIT data](https://www.acer.europa.eu/remit/data-collection). There, it handles batches of ~500 MB XML
59
+ files translating into a 20+ tables data model in the database.
60
+
61
+ This package uses `sqlalchemy` to interact with the database, so it should work with different database backends. It has
62
+ been tested against PostgreSQL and MS SQL Server. It currently does not work with SQLite. You may have to install
63
+ additional packages to connect to your database (e.g. `pyodbc` which is the default connector for MS SQL Server, or
64
+ `psycopg2` for PostgreSQL).
65
+
66
+ **Please read the [package documentation website](https://cre-dev.github.io/xml2db) for all the details!**
67
+
68
+ ## Installation
69
+
70
+ The package can be installed, preferably in a virtual environment, using `pip`:
71
+
72
+ ``` bash
73
+ pip install xml2db
74
+ ```
75
+
76
+ ## Testing
77
+
78
+ Running the tests requires installing additional development dependencies, after cloning the repo, with:
79
+
80
+ ```bash
81
+ pip install -e .[tests,docs]
82
+ ```
83
+
84
+ Run all tests with the following command:
85
+
86
+ ```bash
87
+ python -m pytest
88
+ ```
89
+
90
+ Integration tests require write access to a MS SQL server database; the connection string is provided as an environment
91
+ variable `DB_STRING`. If you want to run only conversion tests that do not require a database you can run:
92
+
93
+ ```bash
94
+ pytest -m "not dbtest"
95
+ ```
96
+
97
+ ## Contributing
98
+
99
+ Contributions are more than welcome, as well as bug reports, starting with the project's
100
+ [issue page](https://github.com/cre-dev/xml2db/issues).