xml2db 0.12.0__tar.gz → 0.12.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xml2db-0.12.0/src/xml2db.egg-info → xml2db-0.12.2}/PKG-INFO +13 -13
- {xml2db-0.12.0 → xml2db-0.12.2}/README.md +9 -9
- {xml2db-0.12.0 → xml2db-0.12.2}/pyproject.toml +3 -3
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db/document.py +1 -2
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db/model.py +35 -19
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db/table/duplicated_table.py +15 -1
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db/table/reused_table.py +17 -1
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db/xml_converter.py +1 -3
- {xml2db-0.12.0 → xml2db-0.12.2/src/xml2db.egg-info}/PKG-INFO +13 -13
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db.egg-info/SOURCES.txt +2 -1
- xml2db-0.12.2/src/xml2db.egg-info/requires.txt +10 -0
- {xml2db-0.12.0 → xml2db-0.12.2}/tests/test_roundtrip.py +2 -1
- xml2db-0.12.2/tests/test_validation.py +85 -0
- xml2db-0.12.0/src/xml2db.egg-info/requires.txt +0 -10
- {xml2db-0.12.0 → xml2db-0.12.2}/LICENSE +0 -0
- {xml2db-0.12.0 → xml2db-0.12.2}/setup.cfg +0 -0
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db/__init__.py +0 -0
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db/exceptions.py +0 -0
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db/table/__init__.py +0 -0
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db/table/column.py +0 -0
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db/table/relations.py +0 -0
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db/table/table.py +0 -0
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db/table/transformed_table.py +0 -0
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db.egg-info/dependency_links.txt +0 -0
- {xml2db-0.12.0 → xml2db-0.12.2}/src/xml2db.egg-info/top_level.txt +0 -0
- {xml2db-0.12.0 → xml2db-0.12.2}/tests/test_conversions.py +1 -1
- {xml2db-0.12.0 → xml2db-0.12.2}/tests/test_models_output.py +1 -1
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: xml2db
|
|
3
|
-
Version: 0.12.0
|
|
3
|
+
Version: 0.12.2
|
|
4
4
|
Summary: Import complex XML files to a relational database
|
|
5
5
|
Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
|
|
6
6
|
Project-URL: Documentation, https://cre-dev.github.io/xml2db
|
|
@@ -13,19 +13,18 @@ Requires-Python: >=3.9
|
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
15
|
Requires-Dist: sqlalchemy>1.4
|
|
16
|
-
Requires-Dist: xmlschema==3.
|
|
16
|
+
Requires-Dist: xmlschema==3.3.2
|
|
17
17
|
Requires-Dist: lxml==5.1.0
|
|
18
18
|
Provides-Extra: docs
|
|
19
|
-
Requires-Dist: mkdocs-material==9.5.
|
|
20
|
-
Requires-Dist: mkdocstrings-python==1.
|
|
19
|
+
Requires-Dist: mkdocs-material==9.5.34; extra == "docs"
|
|
20
|
+
Requires-Dist: mkdocstrings-python==1.11.1; extra == "docs"
|
|
21
21
|
Provides-Extra: tests
|
|
22
22
|
Requires-Dist: pytest>=7.0; extra == "tests"
|
|
23
23
|
|
|
24
|
-
# Loading
|
|
24
|
+
# Loading XML files into a relational database
|
|
25
25
|
|
|
26
|
-
`xml2db` is a Python package which allows parsing and loading XML files into a relational database. It
|
|
27
|
-
|
|
28
|
-
mapping rules.
|
|
26
|
+
`xml2db` is a Python package which allows parsing and loading XML files into a relational database. It handles complex
|
|
27
|
+
XML files which cannot be denormalized to flat tables, and works out of the box, without any custom mapping rules.
|
|
29
28
|
|
|
30
29
|
It can be used within an [Extract, Load, Transform](https://docs.getdbt.com/terms/elt) data pipeline pattern as it
|
|
31
30
|
allows loading XML files into a relational data model which is very close to the source data, yet easy to work with.
|
|
@@ -52,7 +51,7 @@ document = data_model.parse_xml(
|
|
|
52
51
|
document.insert_into_target_tables()
|
|
53
52
|
```
|
|
54
53
|
|
|
55
|
-
The
|
|
54
|
+
The data model created by `xml2db` will be close to the XSD schema. However, `xml2db` will perform a few systematic
|
|
56
55
|
simplifications aimed at limiting the complexity of the resulting data model and the storage footprint. The resulting
|
|
57
56
|
data model can be configured, but the above code will work out of the box, with reasonable defaults.
|
|
58
57
|
|
|
@@ -60,9 +59,9 @@ The raw data loaded into the database can then be processed if need be, using fo
|
|
|
60
59
|
SQL views or stored procedures aimed at extracting, correcting and formatting the data into more user-friendly tables.
|
|
61
60
|
|
|
62
61
|
This package uses `sqlalchemy` to interact with the database, so it should work with different database backends.
|
|
63
|
-
Automated integration tests run against PostgreSQL, MySQL
|
|
64
|
-
|
|
65
|
-
`
|
|
62
|
+
Automated integration tests run against PostgreSQL, MySQL, MS SQL Server and DuckDB. You may have to install additional
|
|
63
|
+
packages to connect to your database (e.g. `psycopg2` for PostgreSQL, `pymysql` for MySQL, `pyodbc` for MS SQL Server or
|
|
64
|
+
`duckdb_engine` for DuckDB).
|
|
66
65
|
|
|
67
66
|
**Please read the [package documentation website](https://cre-dev.github.io/xml2db) for all the details!**
|
|
68
67
|
|
|
@@ -97,7 +96,8 @@ pytest -m "not dbtest"
|
|
|
97
96
|
|
|
98
97
|
## Contributing
|
|
99
98
|
|
|
100
|
-
`xml2db` is developed and used at the [French energy regulation authority (CRE)](https://www.cre.fr/) to process complex
|
|
99
|
+
`xml2db` is developed and used at the [French energy regulation authority (CRE)](https://www.cre.fr/) to process complex
|
|
100
|
+
XML data.
|
|
101
101
|
|
|
102
102
|
Contributions are welcome, as well as bug reports, starting on the project's
|
|
103
103
|
[issue page](https://github.com/cre-dev/xml2db/issues).
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
# Loading
|
|
1
|
+
# Loading XML files into a relational database
|
|
2
2
|
|
|
3
|
-
`xml2db` is a Python package which allows parsing and loading XML files into a relational database. It
|
|
4
|
-
|
|
5
|
-
mapping rules.
|
|
3
|
+
`xml2db` is a Python package which allows parsing and loading XML files into a relational database. It handles complex
|
|
4
|
+
XML files which cannot be denormalized to flat tables, and works out of the box, without any custom mapping rules.
|
|
6
5
|
|
|
7
6
|
It can be used within an [Extract, Load, Transform](https://docs.getdbt.com/terms/elt) data pipeline pattern as it
|
|
8
7
|
allows loading XML files into a relational data model which is very close to the source data, yet easy to work with.
|
|
@@ -29,7 +28,7 @@ document = data_model.parse_xml(
|
|
|
29
28
|
document.insert_into_target_tables()
|
|
30
29
|
```
|
|
31
30
|
|
|
32
|
-
The
|
|
31
|
+
The data model created by `xml2db` will be close to the XSD schema. However, `xml2db` will perform a few systematic
|
|
33
32
|
simplifications aimed at limiting the complexity of the resulting data model and the storage footprint. The resulting
|
|
34
33
|
data model can be configured, but the above code will work out of the box, with reasonable defaults.
|
|
35
34
|
|
|
@@ -37,9 +36,9 @@ The raw data loaded into the database can then be processed if need be, using fo
|
|
|
37
36
|
SQL views or stored procedures aimed at extracting, correcting and formatting the data into more user-friendly tables.
|
|
38
37
|
|
|
39
38
|
This package uses `sqlalchemy` to interact with the database, so it should work with different database backends.
|
|
40
|
-
Automated integration tests run against PostgreSQL, MySQL
|
|
41
|
-
|
|
42
|
-
`
|
|
39
|
+
Automated integration tests run against PostgreSQL, MySQL, MS SQL Server and DuckDB. You may have to install additional
|
|
40
|
+
packages to connect to your database (e.g. `psycopg2` for PostgreSQL, `pymysql` for MySQL, `pyodbc` for MS SQL Server or
|
|
41
|
+
`duckdb_engine` for DuckDB).
|
|
43
42
|
|
|
44
43
|
**Please read the [package documentation website](https://cre-dev.github.io/xml2db) for all the details!**
|
|
45
44
|
|
|
@@ -74,7 +73,8 @@ pytest -m "not dbtest"
|
|
|
74
73
|
|
|
75
74
|
## Contributing
|
|
76
75
|
|
|
77
|
-
`xml2db` is developed and used at the [French energy regulation authority (CRE)](https://www.cre.fr/) to process complex
|
|
76
|
+
`xml2db` is developed and used at the [French energy regulation authority (CRE)](https://www.cre.fr/) to process complex
|
|
77
|
+
XML data.
|
|
78
78
|
|
|
79
79
|
Contributions are welcome, as well as bug reports, starting on the project's
|
|
80
80
|
[issue page](https://github.com/cre-dev/xml2db/issues).
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "xml2db"
|
|
7
|
-
version = "0.12.0"
|
|
7
|
+
version = "0.12.2"
|
|
8
8
|
authors = [
|
|
9
9
|
{ name="Commission de régulation de l'énergie", email="opensource@cre.fr" },
|
|
10
10
|
]
|
|
@@ -18,12 +18,12 @@ classifiers = [
|
|
|
18
18
|
]
|
|
19
19
|
dependencies = [
|
|
20
20
|
"sqlalchemy>1.4",
|
|
21
|
-
"xmlschema==3.
|
|
21
|
+
"xmlschema==3.3.2",
|
|
22
22
|
"lxml==5.1.0",
|
|
23
23
|
]
|
|
24
24
|
|
|
25
25
|
[project.optional-dependencies]
|
|
26
|
-
docs = ["mkdocs-material==9.5.
|
|
26
|
+
docs = ["mkdocs-material==9.5.34", "mkdocstrings-python==1.11.1"]
|
|
27
27
|
tests = ["pytest>=7.0"]
|
|
28
28
|
|
|
29
29
|
[project.urls]
|
|
@@ -2,7 +2,7 @@ import csv
|
|
|
2
2
|
import datetime
|
|
3
3
|
import logging
|
|
4
4
|
from io import BytesIO
|
|
5
|
-
from typing import Union, TYPE_CHECKING
|
|
5
|
+
from typing import Union, TYPE_CHECKING
|
|
6
6
|
from zoneinfo import ZoneInfo
|
|
7
7
|
from sqlalchemy import Column, Table, text, select
|
|
8
8
|
from sqlalchemy.engine import Connection
|
|
@@ -12,7 +12,6 @@ from lxml import etree
|
|
|
12
12
|
if TYPE_CHECKING:
|
|
13
13
|
from .model import DataModel
|
|
14
14
|
|
|
15
|
-
from .exceptions import DataModelConfigError
|
|
16
15
|
from .xml_converter import XMLConverter
|
|
17
16
|
|
|
18
17
|
logger = logging.getLogger(__name__)
|
|
@@ -8,8 +8,10 @@ import hashlib
|
|
|
8
8
|
|
|
9
9
|
import xmlschema
|
|
10
10
|
import sqlalchemy
|
|
11
|
+
from lxml import etree
|
|
11
12
|
from sqlalchemy import MetaData, create_engine, inspect
|
|
12
13
|
from sqlalchemy.sql.ddl import CreateIndex, CreateTable
|
|
14
|
+
from sqlalchemy.exc import ProgrammingError
|
|
13
15
|
from graphlib import TopologicalSorter
|
|
14
16
|
|
|
15
17
|
from .document import Document
|
|
@@ -49,6 +51,7 @@ class DataModel:
|
|
|
49
51
|
|
|
50
52
|
Attributes:
|
|
51
53
|
xml_schema: The `xmlschema.XMLSchema` object associated with this data model
|
|
54
|
+
lxml_schema: The `lxml.etree.XMLSchema` object associated with this data model
|
|
52
55
|
data_flow_name: A short identifier used for the data model (`short_name` argument value)
|
|
53
56
|
data_flow_long_name: A longer name for the data model (`long_name` argument value)
|
|
54
57
|
db_schema: A database schema name to store the database tables
|
|
@@ -72,22 +75,22 @@ class DataModel:
|
|
|
72
75
|
base_url: str = None,
|
|
73
76
|
model_config: dict = None,
|
|
74
77
|
connection_string: str = None,
|
|
75
|
-
db_engine:
|
|
78
|
+
db_engine: sqlalchemy.Engine = None,
|
|
76
79
|
db_type: str = None,
|
|
77
80
|
db_schema: str = None,
|
|
78
81
|
temp_prefix: str = None,
|
|
79
82
|
):
|
|
80
83
|
self.model_config = self._validate_config(model_config)
|
|
81
|
-
self.tables_config = model_config.get("tables", {})
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
base_url=(
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
84
|
+
self.tables_config = model_config.get("tables", {}) if model_config else {}
|
|
85
|
+
|
|
86
|
+
xsd_file_name = xsd_file
|
|
87
|
+
if base_url is None:
|
|
88
|
+
base_url = os.path.normpath(os.path.dirname(xsd_file))
|
|
89
|
+
xsd_file_name = os.path.basename(xsd_file)
|
|
90
|
+
|
|
91
|
+
self.xml_schema = xmlschema.XMLSchema(xsd_file_name, base_url=base_url)
|
|
92
|
+
self.lxml_schema = etree.XMLSchema(etree.parse(xsd_file))
|
|
93
|
+
|
|
91
94
|
self.xml_converter = XMLConverter(data_model=self)
|
|
92
95
|
self.data_flow_name = short_name
|
|
93
96
|
self.data_flow_long_name = long_name
|
|
@@ -104,10 +107,12 @@ class DataModel:
|
|
|
104
107
|
else:
|
|
105
108
|
engine_options = {}
|
|
106
109
|
if "mssql" in connection_string:
|
|
107
|
-
engine_options = {
|
|
110
|
+
engine_options = {
|
|
111
|
+
"fast_executemany": True,
|
|
112
|
+
"isolation_level": "SERIALIZABLE",
|
|
113
|
+
}
|
|
108
114
|
self.engine = create_engine(
|
|
109
115
|
connection_string,
|
|
110
|
-
isolation_level="SERIALIZABLE",
|
|
111
116
|
**engine_options,
|
|
112
117
|
)
|
|
113
118
|
self.db_type = self.engine.dialect.name
|
|
@@ -647,13 +652,24 @@ class DataModel:
|
|
|
647
652
|
You do not have to call this method explicitly when using
|
|
648
653
|
[`Document.insert_into_target_tables()`](document.md#xml2db.document.Document.insert_into_target_tables).
|
|
649
654
|
"""
|
|
655
|
+
|
|
656
|
+
def do_create_schema():
|
|
657
|
+
with self.engine.connect() as conn:
|
|
658
|
+
conn.execute(sqlalchemy.schema.CreateSchema(self.db_schema))
|
|
659
|
+
conn.commit()
|
|
660
|
+
|
|
650
661
|
if self.db_schema is not None:
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
662
|
+
if self.db_type == "duckdb":
|
|
663
|
+
try:
|
|
664
|
+
do_create_schema()
|
|
665
|
+
except ProgrammingError:
|
|
666
|
+
pass
|
|
667
|
+
else:
|
|
668
|
+
inspector = inspect(self.engine)
|
|
669
|
+
if self.db_schema not in inspector.get_schema_names():
|
|
670
|
+
do_create_schema()
|
|
671
|
+
|
|
672
|
+
logger.info(f"Created schema: {self.db_schema}")
|
|
657
673
|
|
|
658
674
|
def drop_all_tables(self):
|
|
659
675
|
"""Drop the data model target (unprefixed) tables.
|
|
@@ -9,6 +9,7 @@ from sqlalchemy import (
|
|
|
9
9
|
Boolean,
|
|
10
10
|
select,
|
|
11
11
|
and_,
|
|
12
|
+
Sequence,
|
|
12
13
|
)
|
|
13
14
|
|
|
14
15
|
from .transformed_table import DataModelTableTransformed
|
|
@@ -83,10 +84,23 @@ class DataModelTableDuplicated(DataModelTableTransformed):
|
|
|
83
84
|
if callable(self.config.get("extra_args", []))
|
|
84
85
|
else self.config.get("extra_args", [])
|
|
85
86
|
)
|
|
87
|
+
if self.data_model.db_type == "duckdb":
|
|
88
|
+
pk_sequence = Sequence(f"pk_sequ_{self.name}")
|
|
89
|
+
pk_col = Column(
|
|
90
|
+
f"pk_{self.name}",
|
|
91
|
+
Integer,
|
|
92
|
+
pk_sequence,
|
|
93
|
+
server_default=pk_sequence.next_value(),
|
|
94
|
+
primary_key=True,
|
|
95
|
+
)
|
|
96
|
+
else:
|
|
97
|
+
pk_col = Column(
|
|
98
|
+
f"pk_{self.name}", Integer, primary_key=True, autoincrement=True
|
|
99
|
+
)
|
|
86
100
|
self.table = Table(
|
|
87
101
|
self.name,
|
|
88
102
|
self.metadata,
|
|
89
|
-
|
|
103
|
+
pk_col,
|
|
90
104
|
PrimaryKeyConstraint(
|
|
91
105
|
name=f"cx_pk_{self.name}",
|
|
92
106
|
mssql_clustered=not self.config["as_columnstore"],
|
|
@@ -7,6 +7,7 @@ from sqlalchemy import (
|
|
|
7
7
|
UniqueConstraint,
|
|
8
8
|
Boolean,
|
|
9
9
|
select,
|
|
10
|
+
Sequence,
|
|
10
11
|
)
|
|
11
12
|
|
|
12
13
|
from .transformed_table import DataModelTableTransformed
|
|
@@ -86,10 +87,25 @@ class DataModelTableReused(DataModelTableTransformed):
|
|
|
86
87
|
if callable(self.config.get("extra_args", []))
|
|
87
88
|
else self.config.get("extra_args", [])
|
|
88
89
|
)
|
|
90
|
+
|
|
91
|
+
if self.data_model.db_type == "duckdb":
|
|
92
|
+
pk_sequence = Sequence(f"pk_sequ_{self.name}")
|
|
93
|
+
pk_col = Column(
|
|
94
|
+
f"pk_{self.name}",
|
|
95
|
+
Integer,
|
|
96
|
+
pk_sequence,
|
|
97
|
+
server_default=pk_sequence.next_value(),
|
|
98
|
+
primary_key=True,
|
|
99
|
+
)
|
|
100
|
+
else:
|
|
101
|
+
pk_col = Column(
|
|
102
|
+
f"pk_{self.name}", Integer, primary_key=True, autoincrement=True
|
|
103
|
+
)
|
|
104
|
+
|
|
89
105
|
self.table = Table(
|
|
90
106
|
self.name,
|
|
91
107
|
self.metadata,
|
|
92
|
-
|
|
108
|
+
pk_col,
|
|
93
109
|
PrimaryKeyConstraint(
|
|
94
110
|
name=f"cx_pk_{self.name}",
|
|
95
111
|
mssql_clustered=not self.config["as_columnstore"],
|
|
@@ -6,8 +6,6 @@ from lxml import etree
|
|
|
6
6
|
from io import BytesIO
|
|
7
7
|
from itertools import zip_longest
|
|
8
8
|
|
|
9
|
-
from .exceptions import DataModelConfigError
|
|
10
|
-
|
|
11
9
|
|
|
12
10
|
if typing.TYPE_CHECKING:
|
|
13
11
|
from .model import DataModel
|
|
@@ -76,7 +74,7 @@ class XMLConverter:
|
|
|
76
74
|
logger.info("Skipping XML file validation")
|
|
77
75
|
else:
|
|
78
76
|
logger.info("Validating XML file against the schema")
|
|
79
|
-
if not self.model.
|
|
77
|
+
if not self.model.lxml_schema.validate(xt if xt else etree.parse(xml_file)):
|
|
80
78
|
logger.error(f"XML file {file_path} does not conform with the schema")
|
|
81
79
|
raise ValueError(
|
|
82
80
|
f"XML file {file_path} does not conform with the schema"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: xml2db
|
|
3
|
-
Version: 0.12.0
|
|
3
|
+
Version: 0.12.2
|
|
4
4
|
Summary: Import complex XML files to a relational database
|
|
5
5
|
Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
|
|
6
6
|
Project-URL: Documentation, https://cre-dev.github.io/xml2db
|
|
@@ -13,19 +13,18 @@ Requires-Python: >=3.9
|
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
15
|
Requires-Dist: sqlalchemy>1.4
|
|
16
|
-
Requires-Dist: xmlschema==3.
|
|
16
|
+
Requires-Dist: xmlschema==3.3.2
|
|
17
17
|
Requires-Dist: lxml==5.1.0
|
|
18
18
|
Provides-Extra: docs
|
|
19
|
-
Requires-Dist: mkdocs-material==9.5.
|
|
20
|
-
Requires-Dist: mkdocstrings-python==1.
|
|
19
|
+
Requires-Dist: mkdocs-material==9.5.34; extra == "docs"
|
|
20
|
+
Requires-Dist: mkdocstrings-python==1.11.1; extra == "docs"
|
|
21
21
|
Provides-Extra: tests
|
|
22
22
|
Requires-Dist: pytest>=7.0; extra == "tests"
|
|
23
23
|
|
|
24
|
-
# Loading
|
|
24
|
+
# Loading XML files into a relational database
|
|
25
25
|
|
|
26
|
-
`xml2db` is a Python package which allows parsing and loading XML files into a relational database. It
|
|
27
|
-
|
|
28
|
-
mapping rules.
|
|
26
|
+
`xml2db` is a Python package which allows parsing and loading XML files into a relational database. It handles complex
|
|
27
|
+
XML files which cannot be denormalized to flat tables, and works out of the box, without any custom mapping rules.
|
|
29
28
|
|
|
30
29
|
It can be used within an [Extract, Load, Transform](https://docs.getdbt.com/terms/elt) data pipeline pattern as it
|
|
31
30
|
allows loading XML files into a relational data model which is very close to the source data, yet easy to work with.
|
|
@@ -52,7 +51,7 @@ document = data_model.parse_xml(
|
|
|
52
51
|
document.insert_into_target_tables()
|
|
53
52
|
```
|
|
54
53
|
|
|
55
|
-
The
|
|
54
|
+
The data model created by `xml2db` will be close to the XSD schema. However, `xml2db` will perform a few systematic
|
|
56
55
|
simplifications aimed at limiting the complexity of the resulting data model and the storage footprint. The resulting
|
|
57
56
|
data model can be configured, but the above code will work out of the box, with reasonable defaults.
|
|
58
57
|
|
|
@@ -60,9 +59,9 @@ The raw data loaded into the database can then be processed if need be, using fo
|
|
|
60
59
|
SQL views or stored procedures aimed at extracting, correcting and formatting the data into more user-friendly tables.
|
|
61
60
|
|
|
62
61
|
This package uses `sqlalchemy` to interact with the database, so it should work with different database backends.
|
|
63
|
-
Automated integration tests run against PostgreSQL, MySQL
|
|
64
|
-
|
|
65
|
-
`
|
|
62
|
+
Automated integration tests run against PostgreSQL, MySQL, MS SQL Server and DuckDB. You may have to install additional
|
|
63
|
+
packages to connect to your database (e.g. `psycopg2` for PostgreSQL, `pymysql` for MySQL, `pyodbc` for MS SQL Server or
|
|
64
|
+
`duckdb_engine` for DuckDB).
|
|
66
65
|
|
|
67
66
|
**Please read the [package documentation website](https://cre-dev.github.io/xml2db) for all the details!**
|
|
68
67
|
|
|
@@ -97,7 +96,8 @@ pytest -m "not dbtest"
|
|
|
97
96
|
|
|
98
97
|
## Contributing
|
|
99
98
|
|
|
100
|
-
`xml2db` is developed and used at the [French energy regulation authority (CRE)](https://www.cre.fr/) to process complex
|
|
99
|
+
`xml2db` is developed and used at the [French energy regulation authority (CRE)](https://www.cre.fr/) to process complex
|
|
100
|
+
XML data.
|
|
101
101
|
|
|
102
102
|
Contributions are welcome, as well as bug reports, starting on the project's
|
|
103
103
|
[issue page](https://github.com/cre-dev/xml2db/issues).
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import os
|
|
2
|
+
|
|
2
3
|
import pytest
|
|
3
4
|
from lxml import etree
|
|
4
|
-
from xml2db.xml_converter import XMLConverter, remove_record_hash
|
|
5
5
|
|
|
6
|
+
from xml2db.xml_converter import XMLConverter, remove_record_hash
|
|
6
7
|
from .fixtures import setup_db_model, conn_string
|
|
7
8
|
from .sample_models import models
|
|
8
9
|
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import xml.etree.ElementTree
|
|
2
|
+
|
|
3
|
+
import lxml.etree
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from xml2db import DataModel
|
|
7
|
+
from .sample_models import models
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@pytest.mark.parametrize(
|
|
11
|
+
"args",
|
|
12
|
+
[
|
|
13
|
+
("invalid", True, False, ValueError),
|
|
14
|
+
("invalid", True, True, ValueError),
|
|
15
|
+
("invalid", False, False, ValueError),
|
|
16
|
+
("invalid", False, True, ValueError),
|
|
17
|
+
("malformed_recover", True, False, lxml.etree.XMLSyntaxError),
|
|
18
|
+
("malformed_recover", True, True, None),
|
|
19
|
+
("malformed_recover", False, False, lxml.etree.XMLSyntaxError),
|
|
20
|
+
("malformed_recover", False, True, None),
|
|
21
|
+
("malformed_no_recover", True, False, lxml.etree.XMLSyntaxError),
|
|
22
|
+
("malformed_no_recover", True, True, ValueError),
|
|
23
|
+
("malformed_no_recover", False, False, lxml.etree.XMLSyntaxError),
|
|
24
|
+
("malformed_no_recover", False, True, ValueError),
|
|
25
|
+
],
|
|
26
|
+
)
|
|
27
|
+
def test_invalid_xml(args: tuple):
|
|
28
|
+
|
|
29
|
+
file_name, iterparse, recover, exception = args
|
|
30
|
+
data_model = DataModel(models[0]["xsd_path"])
|
|
31
|
+
|
|
32
|
+
if exception is None:
|
|
33
|
+
data_model.parse_xml(
|
|
34
|
+
f"tests/sample_models/orders/invalid_xml/{file_name}.xml",
|
|
35
|
+
skip_validation=False,
|
|
36
|
+
iterparse=iterparse,
|
|
37
|
+
recover=recover,
|
|
38
|
+
)
|
|
39
|
+
else:
|
|
40
|
+
with pytest.raises(exception):
|
|
41
|
+
data_model.parse_xml(
|
|
42
|
+
f"tests/sample_models/orders/invalid_xml/{file_name}.xml",
|
|
43
|
+
skip_validation=False,
|
|
44
|
+
iterparse=iterparse,
|
|
45
|
+
recover=recover,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@pytest.mark.parametrize(
|
|
50
|
+
"args",
|
|
51
|
+
[
|
|
52
|
+
("invalid", True, False, IndexError),
|
|
53
|
+
("invalid", True, True, IndexError),
|
|
54
|
+
("invalid", False, False, None),
|
|
55
|
+
("invalid", False, True, None),
|
|
56
|
+
("malformed_recover", True, False, lxml.etree.XMLSyntaxError),
|
|
57
|
+
("malformed_recover", True, True, None),
|
|
58
|
+
("malformed_recover", False, False, lxml.etree.XMLSyntaxError),
|
|
59
|
+
("malformed_recover", False, True, None),
|
|
60
|
+
("malformed_no_recover", True, False, lxml.etree.XMLSyntaxError),
|
|
61
|
+
("malformed_no_recover", True, True, IndexError),
|
|
62
|
+
("malformed_no_recover", False, False, lxml.etree.XMLSyntaxError),
|
|
63
|
+
("malformed_no_recover", False, True, None),
|
|
64
|
+
],
|
|
65
|
+
)
|
|
66
|
+
def test_invalid_xml_skip_verify(args: tuple):
|
|
67
|
+
|
|
68
|
+
file_name, iterparse, recover, exception = args
|
|
69
|
+
data_model = DataModel(models[0]["xsd_path"])
|
|
70
|
+
|
|
71
|
+
if exception is None:
|
|
72
|
+
data_model.parse_xml(
|
|
73
|
+
f"tests/sample_models/orders/invalid_xml/{file_name}.xml",
|
|
74
|
+
skip_validation=True,
|
|
75
|
+
iterparse=iterparse,
|
|
76
|
+
recover=recover,
|
|
77
|
+
)
|
|
78
|
+
else:
|
|
79
|
+
with pytest.raises(exception):
|
|
80
|
+
data_model.parse_xml(
|
|
81
|
+
f"tests/sample_models/orders/invalid_xml/{file_name}.xml",
|
|
82
|
+
skip_validation=True,
|
|
83
|
+
iterparse=iterparse,
|
|
84
|
+
recover=recover,
|
|
85
|
+
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|