xml2db 0.9.0__tar.gz → 0.9.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xml2db-0.9.0 → xml2db-0.9.4}/LICENSE +1 -1
- {xml2db-0.9.0/src/xml2db.egg-info → xml2db-0.9.4}/PKG-INFO +14 -16
- {xml2db-0.9.0 → xml2db-0.9.4}/README.md +6 -7
- {xml2db-0.9.0 → xml2db-0.9.4}/pyproject.toml +7 -8
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db/document.py +8 -5
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db/model.py +32 -8
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db/table/column.py +10 -1
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db/table/reused_table.py +36 -13
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db/xml_converter.py +11 -1
- {xml2db-0.9.0 → xml2db-0.9.4/src/xml2db.egg-info}/PKG-INFO +14 -16
- xml2db-0.9.4/src/xml2db.egg-info/requires.txt +10 -0
- xml2db-0.9.0/src/xml2db.egg-info/requires.txt +0 -13
- {xml2db-0.9.0 → xml2db-0.9.4}/setup.cfg +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/src/debug.py +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db/__init__.py +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db/exceptions.py +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db/table/__init__.py +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db/table/duplicated_table.py +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db/table/relations.py +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db/table/table.py +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db/table/transformed_table.py +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db.egg-info/SOURCES.txt +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db.egg-info/dependency_links.txt +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/src/xml2db.egg-info/top_level.txt +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/tests/test_conversions.py +0 -0
- {xml2db-0.9.0 → xml2db-0.9.4}/tests/test_roundtrip.py +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
Copyright (c)
|
|
1
|
+
Copyright (c) 2024 Commission de régulation de l'énergie
|
|
2
2
|
|
|
3
3
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
4
|
of this software and associated documentation files (the "Software"), to deal
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: xml2db
|
|
3
|
-
Version: 0.9.0
|
|
3
|
+
Version: 0.9.4
|
|
4
4
|
Summary: Import complex XML files to a relational database
|
|
5
5
|
Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
|
|
6
6
|
Project-URL: Documentation, https://cre-dev.github.io/xml2db
|
|
@@ -9,23 +9,22 @@ Project-URL: Issues page, https://github.com/cre-dev/xml2db/issues
|
|
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
|
10
10
|
Classifier: License :: OSI Approved :: MIT License
|
|
11
11
|
Classifier: Operating System :: OS Independent
|
|
12
|
-
Requires-Python: >=3.
|
|
12
|
+
Requires-Python: >=3.9
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
|
-
Requires-Dist: sqlalchemy
|
|
16
|
-
Requires-Dist: xmlschema
|
|
17
|
-
Requires-Dist: lxml
|
|
18
|
-
Requires-Dist: graphlib_backport; python_version < "3.9"
|
|
15
|
+
Requires-Dist: sqlalchemy>1.4
|
|
16
|
+
Requires-Dist: xmlschema==3.1.0
|
|
17
|
+
Requires-Dist: lxml==5.1.0
|
|
19
18
|
Provides-Extra: docs
|
|
20
|
-
Requires-Dist: mkdocs-material; extra == "docs"
|
|
21
|
-
Requires-Dist: mkdocstrings[python]; extra == "docs"
|
|
19
|
+
Requires-Dist: mkdocs-material==9.5.14; extra == "docs"
|
|
20
|
+
Requires-Dist: mkdocstrings[python]==0.24.1; extra == "docs"
|
|
22
21
|
Provides-Extra: tests
|
|
23
|
-
Requires-Dist: pytest; extra == "tests"
|
|
22
|
+
Requires-Dist: pytest>=7.0; extra == "tests"
|
|
24
23
|
|
|
25
24
|
# Xml2db
|
|
26
25
|
|
|
27
26
|
`xml2db` is a Python package which allows loading XML data into a relational database. It is designed to handle complex
|
|
28
|
-
schemas which cannot be denormalized to a flat table, without any custom code.
|
|
27
|
+
schemas which cannot be easily denormalized to a flat table, without any custom code.
|
|
29
28
|
|
|
30
29
|
It builds a data model (i.e. a set of database tables linked with foreign key relationships) based on an XSD schema and
|
|
31
30
|
allows parsing and loading XML files into the database, and get them back to XML, if needed.
|
|
@@ -38,7 +37,7 @@ from xml2db import DataModel
|
|
|
38
37
|
# Create a data model of tables with relations based on the XSD file
|
|
39
38
|
data_model = DataModel(
|
|
40
39
|
xsd_file="path/to/file.xsd",
|
|
41
|
-
connection_string="
|
|
40
|
+
connection_string="postgresql+psycopg2://testuser:testuser@localhost:5432/testdb",
|
|
42
41
|
)
|
|
43
42
|
# Parse an XML file based on this XSD
|
|
44
43
|
document = data_model.parse_xml(
|
|
@@ -52,11 +51,10 @@ The data model will adhere closely to the XSD schema, but `xml2db` will perform
|
|
|
52
51
|
complexity of the resulting data model and the storage footprint.
|
|
53
52
|
|
|
54
53
|
The raw data loaded into the database can then be processed using [DBT](https://www.getdbt.com/), SQL views or
|
|
55
|
-
|
|
54
|
+
other tools aimed at extracting, correcting and formatting the data into more user-friendly tables.
|
|
56
55
|
|
|
57
56
|
`xml2db` is developed and used at the [French energy regulation authority (CRE)](https://www.cre.fr/) to process XML
|
|
58
|
-
data
|
|
59
|
-
files translating into a 20+ tables data model in the database.
|
|
57
|
+
data.
|
|
60
58
|
|
|
61
59
|
This package uses `sqlalchemy` to interact with the database, so it should work with different database backends. It has
|
|
62
60
|
been tested against PostgreSQL and MS SQL Server. It currently does not work with SQLite. You may have to install
|
|
@@ -87,8 +85,8 @@ Run all tests with the following command:
|
|
|
87
85
|
python -m pytest
|
|
88
86
|
```
|
|
89
87
|
|
|
90
|
-
Integration tests require write access to a MS SQL
|
|
91
|
-
variable `DB_STRING`. If you want to run only conversion tests that do not require a database you can run:
|
|
88
|
+
Integration tests require write access to a PostgreSQL or MS SQL Server database; the connection string is provided as an
|
|
89
|
+
environment variable `DB_STRING`. If you want to run only conversion tests that do not require a database you can run:
|
|
92
90
|
|
|
93
91
|
```bash
|
|
94
92
|
pytest -m "not dbtest"
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# Xml2db
|
|
2
2
|
|
|
3
3
|
`xml2db` is a Python package which allows loading XML data into a relational database. It is designed to handle complex
|
|
4
|
-
schemas which cannot be denormalized to a flat table, without any custom code.
|
|
4
|
+
schemas which cannot be easily denormalized to a flat table, without any custom code.
|
|
5
5
|
|
|
6
6
|
It builds a data model (i.e. a set of database tables linked with foreign key relationships) based on an XSD schema and
|
|
7
7
|
allows parsing and loading XML files into the database, and get them back to XML, if needed.
|
|
@@ -14,7 +14,7 @@ from xml2db import DataModel
|
|
|
14
14
|
# Create a data model of tables with relations based on the XSD file
|
|
15
15
|
data_model = DataModel(
|
|
16
16
|
xsd_file="path/to/file.xsd",
|
|
17
|
-
connection_string="
|
|
17
|
+
connection_string="postgresql+psycopg2://testuser:testuser@localhost:5432/testdb",
|
|
18
18
|
)
|
|
19
19
|
# Parse an XML file based on this XSD
|
|
20
20
|
document = data_model.parse_xml(
|
|
@@ -28,11 +28,10 @@ The data model will adhere closely to the XSD schema, but `xml2db` will perform
|
|
|
28
28
|
complexity of the resulting data model and the storage footprint.
|
|
29
29
|
|
|
30
30
|
The raw data loaded into the database can then be processed using [DBT](https://www.getdbt.com/), SQL views or
|
|
31
|
-
|
|
31
|
+
other tools aimed at extracting, correcting and formatting the data into more user-friendly tables.
|
|
32
32
|
|
|
33
33
|
`xml2db` is developed and used at the [French energy regulation authority (CRE)](https://www.cre.fr/) to process XML
|
|
34
|
-
data
|
|
35
|
-
files translating into a 20+ tables data model in the database.
|
|
34
|
+
data.
|
|
36
35
|
|
|
37
36
|
This package uses `sqlalchemy` to interact with the database, so it should work with different database backends. It has
|
|
38
37
|
been tested against PostgreSQL and MS SQL Server. It currently does not work with SQLite. You may have to install
|
|
@@ -63,8 +62,8 @@ Run all tests with the following command:
|
|
|
63
62
|
python -m pytest
|
|
64
63
|
```
|
|
65
64
|
|
|
66
|
-
Integration tests require write access to a MS SQL
|
|
67
|
-
variable `DB_STRING`. If you want to run only conversion tests that do not require a database you can run:
|
|
65
|
+
Integration tests require write access to a PostgreSQL or MS SQL Server database; the connection string is provided as an
|
|
66
|
+
environment variable `DB_STRING`. If you want to run only conversion tests that do not require a database you can run:
|
|
68
67
|
|
|
69
68
|
```bash
|
|
70
69
|
pytest -m "not dbtest"
|
|
@@ -4,28 +4,27 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "xml2db"
|
|
7
|
-
version = "0.9.0"
|
|
7
|
+
version = "0.9.4"
|
|
8
8
|
authors = [
|
|
9
9
|
{ name="Commission de régulation de l'énergie", email="opensource@cre.fr" },
|
|
10
10
|
]
|
|
11
11
|
description = "Import complex XML files to a relational database"
|
|
12
12
|
readme = "README.md"
|
|
13
|
-
requires-python = ">=3.
|
|
13
|
+
requires-python = ">=3.9"
|
|
14
14
|
classifiers = [
|
|
15
15
|
"Programming Language :: Python :: 3",
|
|
16
16
|
"License :: OSI Approved :: MIT License",
|
|
17
17
|
"Operating System :: OS Independent",
|
|
18
18
|
]
|
|
19
19
|
dependencies = [
|
|
20
|
-
"sqlalchemy",
|
|
21
|
-
"xmlschema",
|
|
22
|
-
"lxml",
|
|
23
|
-
"graphlib_backport;python_version<'3.9'",
|
|
20
|
+
"sqlalchemy>1.4",
|
|
21
|
+
"xmlschema==3.1.0",
|
|
22
|
+
"lxml==5.1.0",
|
|
24
23
|
]
|
|
25
24
|
|
|
26
25
|
[project.optional-dependencies]
|
|
27
|
-
docs = ["mkdocs-material", "mkdocstrings[python]"]
|
|
28
|
-
tests = ["pytest"]
|
|
26
|
+
docs = ["mkdocs-material==9.5.14", "mkdocstrings[python]==0.24.1"]
|
|
27
|
+
tests = ["pytest>=7.0"]
|
|
29
28
|
|
|
30
29
|
[project.urls]
|
|
31
30
|
"Documentation" = "https://cre-dev.github.io/xml2db"
|
|
@@ -122,8 +122,8 @@ class Document:
|
|
|
122
122
|
]
|
|
123
123
|
for h_child in sorted(h_children):
|
|
124
124
|
h.update(h_child)
|
|
125
|
-
node["
|
|
126
|
-
return node["
|
|
125
|
+
node["record_hash"] = h.digest()
|
|
126
|
+
return node["record_hash"]
|
|
127
127
|
|
|
128
128
|
def doc_tree_to_flat_data(self, document_tree: dict) -> dict:
|
|
129
129
|
"""Convert document tree (nested dict) to flat tables data model to prepare database import
|
|
@@ -164,7 +164,7 @@ class Document:
|
|
|
164
164
|
}
|
|
165
165
|
data = data_model[node["type"]]
|
|
166
166
|
|
|
167
|
-
hex_hash = str(node["
|
|
167
|
+
hex_hash = str(node["record_hash"])
|
|
168
168
|
|
|
169
169
|
# if node is reused and a record with identical hash is already inserted, return its pk
|
|
170
170
|
if model_table.is_reused:
|
|
@@ -204,7 +204,10 @@ class Document:
|
|
|
204
204
|
else:
|
|
205
205
|
esc_val = [str(v).replace('"', '\\"') for v in val]
|
|
206
206
|
esc_val = [
|
|
207
|
-
f'"{v}"'
|
|
207
|
+
f'"{v}"'
|
|
208
|
+
if "," in v or "\n" in v or "\r" in v or '"' in v
|
|
209
|
+
else v
|
|
210
|
+
for v in esc_val
|
|
208
211
|
]
|
|
209
212
|
record[key] = ",".join(esc_val)
|
|
210
213
|
else:
|
|
@@ -222,7 +225,7 @@ class Document:
|
|
|
222
225
|
else:
|
|
223
226
|
record[f"temp_{rel.field_name}"] = None
|
|
224
227
|
|
|
225
|
-
record["
|
|
228
|
+
record["record_hash"] = bytes(node["record_hash"])
|
|
226
229
|
|
|
227
230
|
# add integration meta data if root table
|
|
228
231
|
if model_table.type_name == self.model.root_table:
|
|
@@ -311,7 +311,11 @@ class DataModel:
|
|
|
311
311
|
)
|
|
312
312
|
elem_type = elem_type[0]
|
|
313
313
|
if elem_type.is_union():
|
|
314
|
-
return
|
|
314
|
+
return (
|
|
315
|
+
recurse_parse_simple_type(elem_type.base_type.member_types)
|
|
316
|
+
if elem_type.base_type
|
|
317
|
+
else recurse_parse_simple_type(elem_type.member_types)
|
|
318
|
+
)
|
|
315
319
|
if elem_type.is_restriction():
|
|
316
320
|
dt = elem_type.base_type.local_name
|
|
317
321
|
mil = elem_type.min_length
|
|
@@ -402,7 +406,12 @@ class DataModel:
|
|
|
402
406
|
)
|
|
403
407
|
)
|
|
404
408
|
ct = child.type
|
|
405
|
-
if
|
|
409
|
+
if (
|
|
410
|
+
ct.is_complex()
|
|
411
|
+
and len(child) == 0
|
|
412
|
+
and len(child.attributes) == 0
|
|
413
|
+
and ct.base_type is not None
|
|
414
|
+
):
|
|
406
415
|
ct = ct.base_type
|
|
407
416
|
if ct.is_simple():
|
|
408
417
|
(
|
|
@@ -427,7 +436,9 @@ class DataModel:
|
|
|
427
436
|
elif ct.is_complex():
|
|
428
437
|
child_table = self._parse_tree(child)
|
|
429
438
|
child_table.model_group = (
|
|
430
|
-
"choice"
|
|
439
|
+
"choice"
|
|
440
|
+
if ct.model_group and ct.model_group.model == "choice"
|
|
441
|
+
else "sequence"
|
|
431
442
|
)
|
|
432
443
|
occurs = get_occurs(child)
|
|
433
444
|
if child.is_single():
|
|
@@ -449,16 +460,29 @@ class DataModel:
|
|
|
449
460
|
else:
|
|
450
461
|
raise ValueError("unknown case; please check (child not an XsdElement)")
|
|
451
462
|
|
|
452
|
-
if hasattr(parent_node, "type") and
|
|
463
|
+
if hasattr(parent_node, "type") and (
|
|
464
|
+
parent_node.type.has_mixed_content()
|
|
465
|
+
or parent_node.type.has_simple_content()
|
|
466
|
+
):
|
|
467
|
+
if parent_node.type.base_type is not None:
|
|
468
|
+
(
|
|
469
|
+
data_type,
|
|
470
|
+
min_length,
|
|
471
|
+
max_length,
|
|
472
|
+
allow_empty,
|
|
473
|
+
) = recurse_parse_simple_type([parent_node.type.base_type])
|
|
474
|
+
else:
|
|
475
|
+
data_type, min_length, max_length, allow_empty = "string", 0, None, True
|
|
476
|
+
|
|
453
477
|
parent_table.add_column(
|
|
454
478
|
"value",
|
|
455
|
-
|
|
479
|
+
data_type,
|
|
456
480
|
[0, 1],
|
|
457
|
-
|
|
458
|
-
|
|
481
|
+
min_length,
|
|
482
|
+
max_length,
|
|
459
483
|
False,
|
|
460
484
|
True,
|
|
461
|
-
|
|
485
|
+
allow_empty,
|
|
462
486
|
None,
|
|
463
487
|
)
|
|
464
488
|
|
|
@@ -10,6 +10,7 @@ from sqlalchemy import (
|
|
|
10
10
|
Column,
|
|
11
11
|
DateTime,
|
|
12
12
|
String,
|
|
13
|
+
LargeBinary,
|
|
13
14
|
)
|
|
14
15
|
from sqlalchemy.dialects import mssql
|
|
15
16
|
|
|
@@ -51,6 +52,8 @@ def types_mapping_default(temp: bool, col: "DataModelColumn") -> Any:
|
|
|
51
52
|
if min_length >= col.max_length - 1 and not col.allow_empty:
|
|
52
53
|
return String(col.max_length)
|
|
53
54
|
return String(col.max_length)
|
|
55
|
+
if col.data_type == "binary":
|
|
56
|
+
return LargeBinary(col.max_length)
|
|
54
57
|
else:
|
|
55
58
|
logger.warning(
|
|
56
59
|
f"unknown type '{col.data_type}' for column '{col.name}', defaulting to VARCHAR(1000) "
|
|
@@ -93,6 +96,10 @@ def types_mapping_mssql(temp: bool, col: "DataModelColumn") -> Any:
|
|
|
93
96
|
if min_length >= col.max_length - 1 and not col.allow_empty:
|
|
94
97
|
return mssql.CHAR(col.max_length)
|
|
95
98
|
return mssql.VARCHAR(col.max_length)
|
|
99
|
+
if col.data_type == "binary":
|
|
100
|
+
if col.max_length == col.min_length:
|
|
101
|
+
return mssql.BINARY(col.max_length)
|
|
102
|
+
return mssql.VARBINARY(col.max_length)
|
|
96
103
|
else:
|
|
97
104
|
logger.warning(
|
|
98
105
|
f"unknown type '{col.data_type}' for column '{col.name}', defaulting to VARCHAR(1000) "
|
|
@@ -109,6 +116,8 @@ class DataModelColumn:
|
|
|
109
116
|
:param occurs: min and max occurrences of the field
|
|
110
117
|
:param min_length: min length
|
|
111
118
|
:param max_length: max length
|
|
119
|
+
:param is_attr: does the column value come from an xml attribute?
|
|
120
|
+
:param is_content: is the column used to store the content value of a mixed complex type?
|
|
112
121
|
:param allow_empty: is nullable ?
|
|
113
122
|
:param ngroup: a key used to handle nested sequences
|
|
114
123
|
:param model_config: data model config, may contain column type information
|
|
@@ -126,7 +135,7 @@ class DataModelColumn:
|
|
|
126
135
|
data_type: str,
|
|
127
136
|
occurs: List[int],
|
|
128
137
|
min_length: int,
|
|
129
|
-
max_length: int,
|
|
138
|
+
max_length: Union[int, None],
|
|
130
139
|
is_attr: bool,
|
|
131
140
|
is_content: bool,
|
|
132
141
|
allow_empty: bool,
|
|
@@ -13,6 +13,7 @@ from sqlalchemy import (
|
|
|
13
13
|
)
|
|
14
14
|
|
|
15
15
|
from .transformed_table import DataModelTableTransformed
|
|
16
|
+
from .column import DataModelColumn
|
|
16
17
|
|
|
17
18
|
|
|
18
19
|
class DataModelTableReused(DataModelTableTransformed):
|
|
@@ -49,13 +50,39 @@ class DataModelTableReused(DataModelTableTransformed):
|
|
|
49
50
|
# Root table is given additional integration metadata columns
|
|
50
51
|
if self.is_root_table:
|
|
51
52
|
yield Column("xml2db_input_file_path", String(256), nullable=False)
|
|
52
|
-
|
|
53
|
-
|
|
53
|
+
# Use DataModelColumn to create record hash column in order to get the right data type
|
|
54
|
+
processed_at_col = DataModelColumn(
|
|
55
|
+
"xml2db_processed_at",
|
|
56
|
+
[],
|
|
57
|
+
"dateTime",
|
|
58
|
+
[1, 1],
|
|
59
|
+
0,
|
|
60
|
+
None,
|
|
61
|
+
False,
|
|
62
|
+
False,
|
|
63
|
+
False,
|
|
64
|
+
None,
|
|
65
|
+
self.config,
|
|
66
|
+
self.data_model,
|
|
54
67
|
)
|
|
55
|
-
|
|
56
|
-
|
|
68
|
+
yield from processed_at_col.get_sqlalchemy_column(temp)
|
|
69
|
+
hash_col = DataModelColumn(
|
|
70
|
+
"record_hash",
|
|
71
|
+
[],
|
|
72
|
+
"binary",
|
|
73
|
+
[1, 1],
|
|
74
|
+
20,
|
|
75
|
+
20,
|
|
76
|
+
False,
|
|
77
|
+
False,
|
|
78
|
+
False,
|
|
79
|
+
None,
|
|
80
|
+
self.config,
|
|
81
|
+
self.data_model,
|
|
82
|
+
)
|
|
83
|
+
yield from hash_col.get_sqlalchemy_column(temp)
|
|
57
84
|
yield UniqueConstraint(
|
|
58
|
-
"
|
|
85
|
+
"record_hash",
|
|
59
86
|
name=f"{prefix if temp else ''}{self.name}_xml2db_record_hash",
|
|
60
87
|
)
|
|
61
88
|
|
|
@@ -115,10 +142,8 @@ class DataModelTableReused(DataModelTableTransformed):
|
|
|
115
142
|
|
|
116
143
|
# find matching records hash in target table
|
|
117
144
|
yield self.temp_table.update().values(temp_exists=True).where(
|
|
118
|
-
getattr( # noqa: Linter puzzled by ==
|
|
119
|
-
|
|
120
|
-
)
|
|
121
|
-
== getattr(self.table.c, "xml2db_record_hash")
|
|
145
|
+
getattr(self.temp_table.c, "record_hash") # noqa: Linter puzzled by ==
|
|
146
|
+
== getattr(self.table.c, "record_hash")
|
|
122
147
|
)
|
|
123
148
|
|
|
124
149
|
# update foreign keys for n-1 relations tables
|
|
@@ -141,10 +166,8 @@ class DataModelTableReused(DataModelTableTransformed):
|
|
|
141
166
|
yield self.temp_table.update().values(
|
|
142
167
|
**{f"pk_{self.name}": getattr(self.table.c, f"pk_{self.name}")}
|
|
143
168
|
).where(
|
|
144
|
-
getattr( # noqa: Linter puzzled by ==
|
|
145
|
-
|
|
146
|
-
)
|
|
147
|
-
== getattr(self.table.c, "xml2db_record_hash")
|
|
169
|
+
getattr(self.temp_table.c, "record_hash") # noqa: Linter puzzled by ==
|
|
170
|
+
== getattr(self.table.c, "record_hash")
|
|
148
171
|
)
|
|
149
172
|
|
|
150
173
|
# update primary keys for n-n relations tables
|
|
@@ -158,6 +158,7 @@ class XMLConverter:
|
|
|
158
158
|
|
|
159
159
|
def _make_xml_node(self, node_data, node_name, nsmap: dict = None):
|
|
160
160
|
def check_transformed_node(node_type, element):
|
|
161
|
+
"""Convert "choice" transformed nodes (type/value) to `<type>value</type>` XML nodes"""
|
|
161
162
|
if (
|
|
162
163
|
node_type in self.model.types_transforms
|
|
163
164
|
and self.model.types_transforms[node_type] == "choice"
|
|
@@ -176,18 +177,27 @@ class XMLConverter:
|
|
|
176
177
|
return element
|
|
177
178
|
|
|
178
179
|
tb = self.model.tables[node_data["type"]]
|
|
180
|
+
# due to "elevated" nodes (i.e. flattened), we need to build a stack of nested nodes to reconstruct the
|
|
181
|
+
# original XML. It is a list of tuples of (node type, node Element).
|
|
179
182
|
nodes_stack = [(node_data["type"], etree.Element(node_name, nsmap=nsmap))]
|
|
180
183
|
prev_chain = []
|
|
181
184
|
prev_ngroup = None
|
|
182
185
|
ngroup_stack = []
|
|
183
186
|
for field_type, rel_name, rel in tb.fields:
|
|
187
|
+
# This part manages the nodes stack, based on `name_chain` attribute which represents a "path".
|
|
188
|
+
# We compare the current path with the previous field path, and manage the nodes stack accordingly
|
|
189
|
+
# (i.e. create new nested nodes, pop the last node when moving to another path, etc.)
|
|
184
190
|
name_chain = rel.name_chain[:-1]
|
|
185
191
|
i = len(prev_chain)
|
|
186
192
|
while i > 0 and (
|
|
187
193
|
i > len(name_chain) or name_chain[i - 1][0] != prev_chain[i - 1][0]
|
|
188
194
|
):
|
|
189
195
|
completed_node = check_transformed_node(*nodes_stack.pop())
|
|
190
|
-
if completed_node is not None and
|
|
196
|
+
if completed_node is not None and (
|
|
197
|
+
len(completed_node) > 0
|
|
198
|
+
or completed_node.text
|
|
199
|
+
or len(completed_node.attrib) > 0
|
|
200
|
+
):
|
|
191
201
|
nodes_stack[-1][1].append(completed_node)
|
|
192
202
|
i -= 1
|
|
193
203
|
while i < len(name_chain):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: xml2db
|
|
3
|
-
Version: 0.9.0
|
|
3
|
+
Version: 0.9.4
|
|
4
4
|
Summary: Import complex XML files to a relational database
|
|
5
5
|
Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
|
|
6
6
|
Project-URL: Documentation, https://cre-dev.github.io/xml2db
|
|
@@ -9,23 +9,22 @@ Project-URL: Issues page, https://github.com/cre-dev/xml2db/issues
|
|
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
|
10
10
|
Classifier: License :: OSI Approved :: MIT License
|
|
11
11
|
Classifier: Operating System :: OS Independent
|
|
12
|
-
Requires-Python: >=3.
|
|
12
|
+
Requires-Python: >=3.9
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
|
-
Requires-Dist: sqlalchemy
|
|
16
|
-
Requires-Dist: xmlschema
|
|
17
|
-
Requires-Dist: lxml
|
|
18
|
-
Requires-Dist: graphlib_backport; python_version < "3.9"
|
|
15
|
+
Requires-Dist: sqlalchemy>1.4
|
|
16
|
+
Requires-Dist: xmlschema==3.1.0
|
|
17
|
+
Requires-Dist: lxml==5.1.0
|
|
19
18
|
Provides-Extra: docs
|
|
20
|
-
Requires-Dist: mkdocs-material; extra == "docs"
|
|
21
|
-
Requires-Dist: mkdocstrings[python]; extra == "docs"
|
|
19
|
+
Requires-Dist: mkdocs-material==9.5.14; extra == "docs"
|
|
20
|
+
Requires-Dist: mkdocstrings[python]==0.24.1; extra == "docs"
|
|
22
21
|
Provides-Extra: tests
|
|
23
|
-
Requires-Dist: pytest; extra == "tests"
|
|
22
|
+
Requires-Dist: pytest>=7.0; extra == "tests"
|
|
24
23
|
|
|
25
24
|
# Xml2db
|
|
26
25
|
|
|
27
26
|
`xml2db` is a Python package which allows loading XML data into a relational database. It is designed to handle complex
|
|
28
|
-
schemas which cannot be denormalized to a flat table, without any custom code.
|
|
27
|
+
schemas which cannot be easily denormalized to a flat table, without any custom code.
|
|
29
28
|
|
|
30
29
|
It builds a data model (i.e. a set of database tables linked with foreign key relationships) based on an XSD schema and
|
|
31
30
|
allows parsing and loading XML files into the database, and get them back to XML, if needed.
|
|
@@ -38,7 +37,7 @@ from xml2db import DataModel
|
|
|
38
37
|
# Create a data model of tables with relations based on the XSD file
|
|
39
38
|
data_model = DataModel(
|
|
40
39
|
xsd_file="path/to/file.xsd",
|
|
41
|
-
connection_string="
|
|
40
|
+
connection_string="postgresql+psycopg2://testuser:testuser@localhost:5432/testdb",
|
|
42
41
|
)
|
|
43
42
|
# Parse an XML file based on this XSD
|
|
44
43
|
document = data_model.parse_xml(
|
|
@@ -52,11 +51,10 @@ The data model will adhere closely to the XSD schema, but `xml2db` will perform
|
|
|
52
51
|
complexity of the resulting data model and the storage footprint.
|
|
53
52
|
|
|
54
53
|
The raw data loaded into the database can then be processed using [DBT](https://www.getdbt.com/), SQL views or
|
|
55
|
-
|
|
54
|
+
other tools aimed at extracting, correcting and formatting the data into more user-friendly tables.
|
|
56
55
|
|
|
57
56
|
`xml2db` is developed and used at the [French energy regulation authority (CRE)](https://www.cre.fr/) to process XML
|
|
58
|
-
data
|
|
59
|
-
files translating into a 20+ tables data model in the database.
|
|
57
|
+
data.
|
|
60
58
|
|
|
61
59
|
This package uses `sqlalchemy` to interact with the database, so it should work with different database backends. It has
|
|
62
60
|
been tested against PostgreSQL and MS SQL Server. It currently does not work with SQLite. You may have to install
|
|
@@ -87,8 +85,8 @@ Run all tests with the following command:
|
|
|
87
85
|
python -m pytest
|
|
88
86
|
```
|
|
89
87
|
|
|
90
|
-
Integration tests require write access to a MS SQL
|
|
91
|
-
variable `DB_STRING`. If you want to run only conversion tests that do not require a database you can run:
|
|
88
|
+
Integration tests require write access to a PostgreSQL or MS SQL Server database; the connection string is provided as an
|
|
89
|
+
environment variable `DB_STRING`. If you want to run only conversion tests that do not require a database you can run:
|
|
92
90
|
|
|
93
91
|
```bash
|
|
94
92
|
pytest -m "not dbtest"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|