xml2db 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debug.py +34 -0
- xml2db/__init__.py +21 -0
- xml2db/document.py +650 -0
- xml2db/exceptions.py +4 -0
- xml2db/model.py +619 -0
- xml2db/table/__init__.py +5 -0
- xml2db/table/column.py +190 -0
- xml2db/table/duplicated_table.py +180 -0
- xml2db/table/relations.py +243 -0
- xml2db/table/reused_table.py +152 -0
- xml2db/table/table.py +356 -0
- xml2db/table/transformed_table.py +314 -0
- xml2db/xml_converter.py +258 -0
- xml2db-0.9.0.dist-info/LICENSE +19 -0
- xml2db-0.9.0.dist-info/METADATA +100 -0
- xml2db-0.9.0.dist-info/RECORD +18 -0
- xml2db-0.9.0.dist-info/WHEEL +5 -0
- xml2db-0.9.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
from typing import Union, List, Tuple
|
|
2
|
+
|
|
3
|
+
from xml2db.exceptions import DataModelConfigError
|
|
4
|
+
from .column import DataModelColumn
|
|
5
|
+
from .relations import DataModelRelation1, DataModelRelationN
|
|
6
|
+
from .table import DataModelTable
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DataModelTableTransformed(DataModelTable):
|
|
10
|
+
"""A class extending DataModelTable with transformations
|
|
11
|
+
|
|
12
|
+
This class allows simplifying a DataModelTable object with default or configured transformations in \
|
|
13
|
+
order to reduce final schema complexity.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
def _can_choice_transform_table(self) -> bool:
|
|
17
|
+
"""Check if the table is of type "choice" and can be transformed to type/value fields.
|
|
18
|
+
|
|
19
|
+
:return: True if the table model be converted to type/value choice model, False otherwise
|
|
20
|
+
"""
|
|
21
|
+
if self.model_group == "choice":
|
|
22
|
+
col_types = list(set([col.data_type for col in self.columns.values()]))
|
|
23
|
+
return (
|
|
24
|
+
len(self.relations_1) == 0
|
|
25
|
+
and len(self.relations_n) == 0
|
|
26
|
+
and len(col_types) == 1
|
|
27
|
+
)
|
|
28
|
+
return False
|
|
29
|
+
|
|
30
|
+
def _is_table_choice_transform_applicable(self) -> bool:
|
|
31
|
+
"""Determine if choice transform should be applied to the whole table.
|
|
32
|
+
|
|
33
|
+
We try the choice_transform value provided in config, if any, and otherwise fall back to default value.
|
|
34
|
+
|
|
35
|
+
:return: True if choice transform is to be applied, False otherwise.
|
|
36
|
+
"""
|
|
37
|
+
if "choice_transform" in self.config:
|
|
38
|
+
if isinstance(self.config["choice_transform"], bool):
|
|
39
|
+
if self.config["choice_transform"]:
|
|
40
|
+
if self._can_choice_transform_table():
|
|
41
|
+
return True
|
|
42
|
+
else:
|
|
43
|
+
raise DataModelConfigError(
|
|
44
|
+
f"Choice-transform cannot be applied to table '{self.name}', see conditions in "
|
|
45
|
+
f"DataModelTableTransformed._can_choice_transform_table."
|
|
46
|
+
)
|
|
47
|
+
else:
|
|
48
|
+
return False
|
|
49
|
+
else:
|
|
50
|
+
raise DataModelConfigError(
|
|
51
|
+
f"Unrecognized choice_transform value '{self.config['choice_transform']}'"
|
|
52
|
+
f" for table '{self.name}'. Only boolean values True or False are allowed."
|
|
53
|
+
)
|
|
54
|
+
elif self._can_choice_transform_table() and len(self.columns) > 2:
|
|
55
|
+
# column number isn't reduced if the number of columns = 2, as it would be elevated to 2 columns then
|
|
56
|
+
return True
|
|
57
|
+
return False
|
|
58
|
+
|
|
59
|
+
def _transform_to_choice(self) -> None:
    """Replace all the columns of this table with a "type" / "value" pair.

    The "type" column is a string column whose length bounds are the shortest
    and longest names of the former columns. The "value" column takes the data
    type of the former columns; its length bounds are only set when every
    former column defines them, and it allows empty values as soon as one
    former column does. `self.columns` and `self.fields` are both overwritten.
    """
    former_columns = list(self.columns.values())
    value_types = list({column.data_type for column in former_columns})
    name_lengths = [len(column.name) for column in former_columns]
    min_lengths = [column.min_length for column in former_columns]
    max_lengths = [column.max_length for column in former_columns]
    empty_flags = [column.allow_empty for column in former_columns]

    type_column = DataModelColumn(
        "type",
        [("type", None)],
        "string",
        [1, 1],
        min(name_lengths),
        max(name_lengths),
        False,
        False,
        False,
        None,
        self.config,
        self.data_model,
    )
    value_column = DataModelColumn(
        "value",
        [("value", None)],
        value_types[0],
        [1, 1],
        # a length bound is dropped as soon as one former column has none
        min(min_lengths) if None not in min_lengths else None,
        max(max_lengths) if None not in max_lengths else None,
        False,
        False,
        any(empty_flags),
        None,
        self.config,
        self.data_model,
    )

    self.columns = {"type": type_column, "value": value_column}
    self.fields = [
        ("col", "type", type_column),
        ("col", "value", value_column),
    ]
|
|
100
|
+
|
|
101
|
+
def _can_transform_field(
|
|
102
|
+
self, field_type: str, field_name: str, transform: str = "join"
|
|
103
|
+
) -> bool:
|
|
104
|
+
"""Check if a given transformation can be applied to a given field
|
|
105
|
+
|
|
106
|
+
:param field_type: the field type ("col", "rel1" or "reln")
|
|
107
|
+
:param field_name: the field name
|
|
108
|
+
:param transform: the transform to be tested
|
|
109
|
+
:return: True is the field can be transformed
|
|
110
|
+
"""
|
|
111
|
+
if field_type == "col" and transform == "join":
|
|
112
|
+
# check if simple columns with max occurrences > 1 can be joined as string
|
|
113
|
+
if self.columns[field_name].can_join_values_as_string:
|
|
114
|
+
return True
|
|
115
|
+
elif field_type == "rel1":
|
|
116
|
+
return transform in ["elevate", "elevate_wo_prefix"]
|
|
117
|
+
# "reln" can never be transformed
|
|
118
|
+
return False
|
|
119
|
+
|
|
120
|
+
def _get_field_transform(
|
|
121
|
+
self, field_type: str, field_name: str
|
|
122
|
+
) -> Union[str, None]:
|
|
123
|
+
"""Get the transformation that should be applied to this field, taking into account user-provided config
|
|
124
|
+
|
|
125
|
+
:param field_type: the field type ("col", "rel1", "reln")
|
|
126
|
+
:param field_name: the field name
|
|
127
|
+
:return: the default transformation that should be applied
|
|
128
|
+
"""
|
|
129
|
+
field_config = self.config.get("fields", {}).get(field_name, {})
|
|
130
|
+
if "transform" in field_config:
|
|
131
|
+
if field_config["transform"] is False:
|
|
132
|
+
return None
|
|
133
|
+
if self._can_transform_field(
|
|
134
|
+
field_type, field_name, field_config["transform"]
|
|
135
|
+
):
|
|
136
|
+
return field_config["transform"]
|
|
137
|
+
else:
|
|
138
|
+
raise DataModelConfigError(
|
|
139
|
+
f"Transform value '{field_config['transform']}' cannot be applied"
|
|
140
|
+
f" to field {field_name} of table '{self.name}'."
|
|
141
|
+
)
|
|
142
|
+
else:
|
|
143
|
+
if field_type == "col":
|
|
144
|
+
if self._can_transform_field("col", field_name, "join"):
|
|
145
|
+
return "join"
|
|
146
|
+
elif field_type == "rel1":
|
|
147
|
+
if (
|
|
148
|
+
self.relations_1[field_name].occurs[0] == 1
|
|
149
|
+
or len(self.relations_1[field_name].other_table.columns) <= 4
|
|
150
|
+
) and len(self.relations_1[field_name].other_table.parents_n) == 0:
|
|
151
|
+
transform = (
|
|
152
|
+
"elevate_wo_prefix"
|
|
153
|
+
if len(self.columns) == 0 and len(self.relations_1) == 1
|
|
154
|
+
else "elevate"
|
|
155
|
+
)
|
|
156
|
+
if self._can_transform_field("rel1", field_name, transform):
|
|
157
|
+
return transform
|
|
158
|
+
|
|
159
|
+
def _elevate_relation_1(
    self, rel_name, transform
) -> List[
    Tuple[str, str, Union[DataModelColumn, DataModelRelation1, DataModelRelationN]]
]:
    """Pull the fields of a 1-1 child table up into the current table

    The relation is removed from `self.relations_1` and each field of the child
    table is re-created on the current table, registered in `self.columns`,
    `self.relations_1` or `self.relations_n` depending on its type.

    :param rel_name: the key of the relation in `self.relations_1`
    :param transform: "elevate_wo_prefix" to keep the child field names as they are,
        any other value to prefix them with the relation name
    :return: the list of (field type, field key, field object) created on this table
    """
    rel = self.relations_1[rel_name]
    prefix = "" if transform == "elevate_wo_prefix" else f"{rel.name}_"

    del self.relations_1[rel_name]

    child_table = rel.other_table
    # relation class and target registry for each kind of relation
    relation_kinds = {
        "rel1": (DataModelRelation1, self.relations_1),
        "reln": (DataModelRelationN, self.relations_n),
    }

    # insert the children fields into the current table
    elevated_fields = []
    for child_field_type, key, child_field in child_table.fields:
        if child_field_type != "col" and child_field_type not in relation_kinds:
            continue
        prefixed_key = f"{prefix}{key}"
        # the elevated field keeps track of the node it comes from
        name_chain = [(rel.name, child_table.type_name)] + child_field.name_chain
        # an optional relation makes all the elevated fields optional
        if rel.occurs[0] == 0:
            occurs = [0, child_field.occurs[1]]
        else:
            occurs = child_field.occurs

        if child_field_type == "col":
            registry = self.columns
            new_field = DataModelColumn(
                prefixed_key,
                name_chain,
                child_field.data_type,
                occurs,
                child_field.min_length,
                child_field.max_length,
                child_field.is_attr,
                child_field.is_content,
                child_field.allow_empty,
                child_field.ngroup,
                self.config,
                self.data_model,
            )
        else:
            relation_class, registry = relation_kinds[child_field_type]
            new_field = relation_class(
                prefixed_key,
                name_chain,
                self,
                child_field.other_table,
                occurs,
                child_field.ngroup,
                self.data_model,
            )

        registry[prefixed_key] = new_field
        elevated_fields.append((child_field_type, prefixed_key, new_field))
    return elevated_fields
|
|
240
|
+
|
|
241
|
+
def simplify_table(self) -> Tuple[dict, dict]:
    """Simplify the table recursively and report the simplifications applied

    The two returned dicts associate XML types and XML fields with transform
    operations and are used at parsing stage. Transformations are described by
    keywords.

    For tables (aka XML complex types), keyed by type name:
        - "choice": the choice between 2+ different fields was turned into
          type / value fields in order to reduce the number of columns.

    For fields, keyed by (type name, field name), as (child type name, transform):
        - "join": applies to fields with multiple values allowed: append all values in a comma separated string
        - "elevate": pull up child type to parent level, appending field name to child field names
        - "elevate_wo_prefix": same as "elevate" but keeping only child's fields names (without prefixing)
        - None: the relation is kept as it is

    Columns which are not joined get no entry in the fields dict.

    :return: a tuple of 2 dicts: the first one for type transforms, the second one for fields transforms
    """
    # if the table is already simplified, stop here
    if self.is_simplified:
        return {}, {}
    # the flag is raised before recursing into the child tables
    self.is_simplified = True

    # a table turned into a choice model has no field left to process
    if self._is_table_choice_transform_applicable():
        self._transform_to_choice()
        return {self.type_name: "choice"}, {}

    kept_fields = []
    types_transforms = {}
    fields_transforms = {}
    for field_type, field_name, field in self.fields:
        transform_key = (self.type_name, field_name)

        if field_type == "col":
            if self._get_field_transform("col", field_name) == "join":
                fields_transforms[transform_key] = (None, "join")
            kept_fields.append(("col", field_name, field))
            continue

        # relations: simplify the child table first
        child_table = field.other_table
        child_types, child_fields = child_table.simplify_table()
        types_transforms.update(child_types)
        fields_transforms.update(child_fields)

        # then check if the children can be "elevated" to the upper level
        transform = self._get_field_transform(field_type, field_name)
        if transform is None:
            kept_fields.append((field_type, field_name, field))
            fields_transforms[transform_key] = (child_table.type_name, None)
            child_table.keep_table = True
        elif field_type == "rel1":
            kept_fields.extend(self._elevate_relation_1(field_name, transform))
            fields_transforms[transform_key] = (child_table.type_name, transform)

    self.fields = kept_fields
    return types_transforms, fields_transforms
|
xml2db/xml_converter.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import Union
|
|
4
|
+
import logging
|
|
5
|
+
from lxml import etree
|
|
6
|
+
from io import BytesIO
|
|
7
|
+
from itertools import zip_longest
|
|
8
|
+
|
|
9
|
+
if typing.TYPE_CHECKING:
|
|
10
|
+
from xml2db.model import DataModel
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class XMLConverter:
|
|
16
|
+
def __init__(self, data_model: "DataModel", document_tree: dict = None):
|
|
17
|
+
"""A class to convert data from document tree format (nested dict) to and from XML.
|
|
18
|
+
|
|
19
|
+
:param data_model: The `DataModel` object
|
|
20
|
+
:param document_tree: Data in the document tree format (optional, can be built by `parse_xml` function)
|
|
21
|
+
"""
|
|
22
|
+
self.model = data_model
|
|
23
|
+
self.document_tree = document_tree
|
|
24
|
+
|
|
25
|
+
def parse_xml(
    self,
    xml_file: Union[str, BytesIO],
    file_path: str = None,
    skip_validation: bool = False,
) -> dict:
    """Parse an XML document into a nested dict, applying the simplifications of the data model

    The simplifications are those defined in the DataModel object: "pull" child
    to upper level, transform a choice model into "type" and "value" fields or
    concatenate children as string. The result is also stored in
    `self.document_tree`.

    :param xml_file: An XML file path or file content to be converted
    :param file_path: The file path to be printed in logs
    :param skip_validation: If True, the XML is not validated against the schema
    :return: The parsed data in the document tree format (nested dict)
    :raises ValueError: if validation is performed and the document does not conform with the schema
    """
    logger.info(f"Parsing XML file: {file_path}")

    xt = etree.parse(xml_file)
    if not skip_validation:
        logger.info("Validating XML file against the schema")
        if not self.model.xml_schema.is_valid(xt):
            logger.error(f"XML file {file_path} does not conform with the schema")
            raise ValueError(
                f"XML file {file_path} does not conform with the schema"
            )
        logger.info("XML file conforms with the schema")
    else:
        logger.info("Skipping XML file validation")

    root_type = self.model.root_table
    doc = xt.getroot()
    if self.model.tables[root_type].is_virtual_node:
        # the document root is wrapped into a node named after the root table
        wrapper = etree.Element(root_type)
        wrapper.append(doc)
        doc = wrapper

    self.document_tree = self._parse_xml_node(root_type, doc)
    return self.document_tree
|
|
61
|
+
|
|
62
|
+
def _parse_xml_node(self, node_type: str, node: etree.Element) -> dict:
|
|
63
|
+
"""Parse nodes of an XML document into a dict recursively
|
|
64
|
+
|
|
65
|
+
This method is much faster than using xmlschema parse method, but it will not
|
|
66
|
+
check the validity of the document regarding the XSD. It expects to
|
|
67
|
+
deal with a valid XML document.
|
|
68
|
+
|
|
69
|
+
:param node_type: type of the node
|
|
70
|
+
:param node: lxml node object
|
|
71
|
+
:return: a dict representing the node content
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
result = {"type": node_type, "content": {}}
|
|
75
|
+
|
|
76
|
+
for key, val in node.attrib.items():
|
|
77
|
+
if (
|
|
78
|
+
key
|
|
79
|
+
!= "{http://www.w3.org/2001/XMLSchema-instance}noNamespaceSchemaLocation"
|
|
80
|
+
):
|
|
81
|
+
result["content"][key] = [val]
|
|
82
|
+
|
|
83
|
+
if node.text and node.text.strip():
|
|
84
|
+
result["content"]["value"] = [node.text.strip()]
|
|
85
|
+
|
|
86
|
+
for element in node.iterchildren():
|
|
87
|
+
key = element.tag.split("}")[1] if "}" in element.tag else element.tag
|
|
88
|
+
node_type_key = (node_type, key)
|
|
89
|
+
value = None
|
|
90
|
+
if element.text and element.text.strip():
|
|
91
|
+
value = element.text
|
|
92
|
+
if (
|
|
93
|
+
self.model.fields_transforms.get(node_type_key, (None, "join"))[1]
|
|
94
|
+
!= "join"
|
|
95
|
+
):
|
|
96
|
+
value = self._parse_xml_node(
|
|
97
|
+
self.model.fields_transforms[node_type_key][0], element
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
if key in result["content"]:
|
|
101
|
+
result["content"][key].append(value)
|
|
102
|
+
else:
|
|
103
|
+
result["content"][key] = [value]
|
|
104
|
+
|
|
105
|
+
for key in list(result["content"]):
|
|
106
|
+
node_type_key = (node_type, key)
|
|
107
|
+
if node_type_key in self.model.fields_transforms:
|
|
108
|
+
transform = self.model.fields_transforms[node_type_key][1]
|
|
109
|
+
if transform == "elevate" or transform == "elevate_wo_prefix":
|
|
110
|
+
prefix = f"{key}_" if transform == "elevate" else ""
|
|
111
|
+
child = result["content"][key][0]
|
|
112
|
+
child_content = child["content"]
|
|
113
|
+
del result["content"][key]
|
|
114
|
+
for child_key, val in child_content.items():
|
|
115
|
+
result["content"][f"{prefix}{child_key}"] = val
|
|
116
|
+
|
|
117
|
+
if node_type in self.model.types_transforms:
|
|
118
|
+
if self.model.types_transforms[node_type] == "choice":
|
|
119
|
+
result["content"] = [
|
|
120
|
+
{"type": [child_key], "value": val}
|
|
121
|
+
for child_key, val in result["content"].items()
|
|
122
|
+
][0]
|
|
123
|
+
|
|
124
|
+
return result
|
|
125
|
+
|
|
126
|
+
def to_xml(
    self, out_file: str = None, nsmap: dict = None, indent: str = " "
) -> "etree._Element":
    """Convert a document tree (nested dict) into an XML file

    :param out_file: If provided, write output to a file (always encoded as UTF-8).
    :param nsmap: An optional namespace mapping.
    :param indent: A string used as indent in XML output.
    :return: The etree object corresponding to the root XML node (None if the root table
        is a virtual node which ends up with no child).
    :raises ValueError: if there is no document tree to convert
    """
    if self.document_tree is None:
        raise ValueError(
            "No document tree to convert: provide one or call parse_xml first"
        )

    doc = self._make_xml_node(
        self.document_tree,
        self.model.tables[self.document_tree["type"]].name,
        nsmap,
    )
    if self.model.tables[self.model.root_table].is_virtual_node:
        # the virtual root is dropped: the actual root node is its first child
        doc = next(iter(doc), None)
    if out_file:
        etree.indent(doc, space=indent)
        # the XML declaration states UTF-8, so the file must not be written
        # with the platform default encoding
        with open(out_file, "wt", encoding="utf-8") as f:
            f.write(
                etree.tostring(
                    doc,
                    pretty_print=True,
                    encoding="utf-8",
                    xml_declaration=True,
                ).decode("utf-8")
            )
    return doc
|
|
158
|
+
|
|
159
|
+
def _make_xml_node(self, node_data, node_name, nsmap: dict = None):
    """Build the XML element of a document tree node, recursively

    The fields of the node table are processed in order. The `name_chain` of
    each field, minus its last item, gives the intermediate nodes which must
    be re-created between this node and the field (typically for fields which
    were elevated from a child table); a stack of open nodes is maintained
    accordingly. Children of fields sharing the same `ngroup` are buffered and
    interleaved, one child of each field at a time.

    :param node_data: the node in the document tree format, as {"type": ..., "content": ...}
    :param node_name: the tag of the XML element to create
    :param nsmap: An optional namespace mapping, passed to the created element
    :return: the XML element, or None if the node type has a "choice" transform and the
        element built does not hold both a "type" and a "value" child
    """

    def check_transformed_node(node_type, element):
        """Convert a completed type/value node back to its original choice form."""
        if (
            node_type in self.model.types_transforms
            and self.model.types_transforms[node_type] == "choice"
        ):
            new_node = etree.Element(element.tag)
            extracted = {}
            for child in element:
                extracted[child.tag] = child.text
            if "type" in extracted and "value" in extracted:
                # the "type" value is the tag of the field originally chosen
                child_node = etree.Element(extracted["type"])
                child_node.text = extracted["value"]
                new_node.append(child_node)
                return new_node
            else:
                return None
        return element

    def flush_ngroup(groups):
        """Append buffered children to the innermost open node, interleaved by field."""
        for ngroup_children in zip_longest(*groups):
            for child in ngroup_children:
                # zip_longest pads the shortest groups with None
                if child is not None:
                    nodes_stack[-1][1].append(child)

    tb = self.model.tables[node_data["type"]]
    # stack of (node type, element) from this node down to the innermost open node
    nodes_stack = [(node_data["type"], etree.Element(node_name, nsmap=nsmap))]
    prev_chain = []
    prev_ngroup = None
    ngroup_stack = []
    for field_type, rel_name, rel in tb.fields:
        name_chain = rel.name_chain[:-1]
        # close the intermediate nodes which are not shared with this field...
        i = len(prev_chain)
        while i > 0 and (
            i > len(name_chain) or name_chain[i - 1][0] != prev_chain[i - 1][0]
        ):
            completed_node = check_transformed_node(*nodes_stack.pop())
            if completed_node is not None and len(completed_node) > 0:
                nodes_stack[-1][1].append(completed_node)
            i -= 1
        # ...and open the ones which are missing
        while i < len(name_chain):
            node = etree.Element(name_chain[i][0])
            nodes_stack.append(
                (
                    name_chain[i][1],
                    node,
                )
            )
            i += 1
        prev_chain = name_chain

        children = []
        attributes = {}
        text_content = None
        if field_type == "col":
            if rel_name in node_data["content"]:
                # NOTE(review): unlike child elements below, attribute and text
                # content values are not converted to str -- confirm that they
                # can only be strings here
                if rel.is_attr:
                    attributes[rel.name_chain[-1][0]] = node_data["content"][
                        rel_name
                    ][0]
                elif rel.is_content:
                    text_content = node_data["content"][rel_name][0]
                else:
                    for field_value in node_data["content"][rel_name]:
                        child = etree.Element(rel.name_chain[-1][0])
                        if isinstance(field_value, datetime):
                            field_value = field_value.isoformat()
                        child.text = str(field_value).encode("utf-8")
                        children.append(child)
        elif field_type == "rel1":
            if rel_name in node_data["content"]:
                child = self._make_xml_node(
                    node_data["content"][rel_name][0], rel.name_chain[-1][0]
                )
                # the recursive call returns None for an unusable choice node
                children = [child] if child is not None else []
        elif field_type == "reln":
            if rel_name in node_data["content"]:
                built = (
                    self._make_xml_node(child_tree, rel.name_chain[-1][0])
                    for child_tree in node_data["content"][rel_name]
                )
                children = [child for child in built if child is not None]

        # leaving a group: write out what was buffered for it
        if prev_ngroup and rel.ngroup != prev_ngroup:
            flush_ngroup(ngroup_stack)
            ngroup_stack = []
        prev_ngroup = rel.ngroup

        if len(children) > 0:
            if rel.ngroup:
                ngroup_stack.append(children)
            else:
                for child in children:
                    nodes_stack[-1][1].append(child)
        for key, val in attributes.items():
            nodes_stack[-1][1].set(key, val)
        if text_content is not None:
            nodes_stack[-1][1].text = text_content

    if len(ngroup_stack) > 0:
        flush_ngroup(ngroup_stack)
    # close the intermediate nodes which are still open
    while len(nodes_stack) > 1:
        node = check_transformed_node(*nodes_stack.pop())
        if node is not None and len(node) > 0:
            nodes_stack[-1][1].append(node)

    return check_transformed_node(*nodes_stack[0])
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2023 Commission de régulation de l'énergie
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: xml2db
|
|
3
|
+
Version: 0.9.0
|
|
4
|
+
Summary: Import complex XML files to a relational database
|
|
5
|
+
Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
|
|
6
|
+
Project-URL: Documentation, https://cre-dev.github.io/xml2db
|
|
7
|
+
Project-URL: Repository, https://github.com/cre-dev/xml2db
|
|
8
|
+
Project-URL: Issues page, https://github.com/cre-dev/xml2db/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: sqlalchemy
|
|
16
|
+
Requires-Dist: xmlschema
|
|
17
|
+
Requires-Dist: lxml
|
|
18
|
+
Requires-Dist: graphlib-backport ; python_version < "3.9"
|
|
19
|
+
Provides-Extra: docs
|
|
20
|
+
Requires-Dist: mkdocs-material ; extra == 'docs'
|
|
21
|
+
Requires-Dist: mkdocstrings[python] ; extra == 'docs'
|
|
22
|
+
Provides-Extra: tests
|
|
23
|
+
Requires-Dist: pytest ; extra == 'tests'
|
|
24
|
+
|
|
25
|
+
# Xml2db
|
|
26
|
+
|
|
27
|
+
`xml2db` is a Python package which allows loading XML data into a relational database. It is designed to handle complex
|
|
28
|
+
schemas which cannot be denormalized to a flat table, without any custom code.
|
|
29
|
+
|
|
30
|
+
It builds a data model (i.e. a set of database tables linked with foreign keys relationships) based on an XSD schema and
|
|
31
|
+
allows parsing and loading XML files into the database, and getting them back to XML, if needed.
|
|
32
|
+
|
|
33
|
+
It is as simple as:
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from xml2db import DataModel
|
|
37
|
+
|
|
38
|
+
# Create a data model of tables with relations based on the XSD file
|
|
39
|
+
data_model = DataModel(
|
|
40
|
+
xsd_file="path/to/file.xsd",
|
|
41
|
+
connection_string="mssql+pyodbc://server/database?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes",
|
|
42
|
+
)
|
|
43
|
+
# Parse an XML file based on this XSD
|
|
44
|
+
document = data_model.parse_xml(
|
|
45
|
+
xml_file="path/to/file.xml"
|
|
46
|
+
)
|
|
47
|
+
# Insert the document content into the database
|
|
48
|
+
document.insert_into_target_tables()
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
The data model will adhere closely to the XSD schema, but `xml2db` will perform simplifications aimed at limiting the
|
|
52
|
+
complexity of the resulting data model and the storage footprint.
|
|
53
|
+
|
|
54
|
+
The raw data loaded into the database can then be processed using [DBT](https://www.getdbt.com/), SQL views or
|
|
55
|
+
stored procedures aimed at extracting, correcting and formatting the data into more user-friendly tables.
|
|
56
|
+
|
|
57
|
+
`xml2db` is developed and used at the [French energy regulation authority (CRE)](https://www.cre.fr/) to process XML
|
|
58
|
+
data, notably [REMIT data](https://www.acer.europa.eu/remit/data-collection). There, it handles batches of ~500 MB XML
|
|
59
|
+
files translating into a 20+ tables data model in the database.
|
|
60
|
+
|
|
61
|
+
This package uses `sqlalchemy` to interact with the database, so it should work with different database backends. It has
|
|
62
|
+
been tested against PostgreSQL and MS SQL Server. It currently does not work with SQLite. You may have to install
|
|
63
|
+
additional packages to connect to your database (e.g. `pyodbc` which is the default connector for MS SQL Server, or
|
|
64
|
+
`psycopg2` for PostgreSQL).
|
|
65
|
+
|
|
66
|
+
**Please read the [package documentation website](https://cre-dev.github.io/xml2db) for all the details!**
|
|
67
|
+
|
|
68
|
+
## Installation
|
|
69
|
+
|
|
70
|
+
The package can be installed, preferably in a virtual environment, using `pip`:
|
|
71
|
+
|
|
72
|
+
``` bash
|
|
73
|
+
pip install xml2db
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Testing
|
|
77
|
+
|
|
78
|
+
Running the tests requires installing additional development dependencies, after cloning the repo, with:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install -e .[tests,docs]
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Run all tests with the following command:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
python -m pytest
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Integration tests require write access to a MS SQL server database; the connection string is provided as an environment
|
|
91
|
+
variable `DB_STRING`. If you want to run only conversion tests that do not require a database you can run:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
pytest -m "not dbtest"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Contributing
|
|
98
|
+
|
|
99
|
+
Contributions are more than welcome, as well as bug reports, starting with the project's
|
|
100
|
+
[issue page](https://github.com/cre-dev/xml2db/issues).
|