xml2db 0.13.0__tar.gz → 0.13.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {xml2db-0.13.0/src/xml2db.egg-info → xml2db-0.13.2}/PKG-INFO +1 -1
  2. {xml2db-0.13.0 → xml2db-0.13.2}/pyproject.toml +1 -1
  3. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/dialect/base.py +19 -0
  4. xml2db-0.13.2/src/xml2db/dialect/duckdb.py +166 -0
  5. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/document.py +9 -3
  6. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/model.py +4 -2
  7. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/table/table.py +10 -0
  8. {xml2db-0.13.0 → xml2db-0.13.2/src/xml2db.egg-info}/PKG-INFO +1 -1
  9. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db.egg-info/SOURCES.txt +2 -0
  10. xml2db-0.13.2/tests/test_bulk_insert.py +178 -0
  11. xml2db-0.13.2/tests/test_multiprocessing.py +114 -0
  12. xml2db-0.13.0/src/xml2db/dialect/duckdb.py +0 -50
  13. {xml2db-0.13.0 → xml2db-0.13.2}/LICENSE +0 -0
  14. {xml2db-0.13.0 → xml2db-0.13.2}/README.md +0 -0
  15. {xml2db-0.13.0 → xml2db-0.13.2}/setup.cfg +0 -0
  16. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/__init__.py +0 -0
  17. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/dialect/__init__.py +0 -0
  18. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/dialect/mssql.py +0 -0
  19. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/dialect/mysql.py +0 -0
  20. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/dialect/postgresql.py +0 -0
  21. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/exceptions.py +0 -0
  22. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/table/__init__.py +0 -0
  23. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/table/column.py +0 -0
  24. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/table/duplicated_table.py +0 -0
  25. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/table/relations.py +0 -0
  26. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/table/reused_table.py +0 -0
  27. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/table/transformed_table.py +0 -0
  28. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/xml_converter.py +0 -0
  29. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db.egg-info/dependency_links.txt +0 -0
  30. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db.egg-info/requires.txt +0 -0
  31. {xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db.egg-info/top_level.txt +0 -0
  32. {xml2db-0.13.0 → xml2db-0.13.2}/tests/test_conversions.py +0 -0
  33. {xml2db-0.13.0 → xml2db-0.13.2}/tests/test_models_output.py +0 -0
  34. {xml2db-0.13.0 → xml2db-0.13.2}/tests/test_roundtrip.py +0 -0
  35. {xml2db-0.13.0 → xml2db-0.13.2}/tests/test_validation.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xml2db
3
- Version: 0.13.0
3
+ Version: 0.13.2
4
4
  Summary: Import complex XML files to a relational database
5
5
  Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
6
6
  Project-URL: Documentation, https://cre-dev.github.io/xml2db
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "xml2db"
7
- version = "0.13.0"
7
+ version = "0.13.2"
8
8
  authors = [
9
9
  { name="Commission de régulation de l'énergie", email="opensource@cre.fr" },
10
10
  ]
@@ -313,3 +313,22 @@ class DatabaseDialect:
313
313
  "Clustered columnstore indexes are only supported with MS SQL Server database, noop"
314
314
  )
315
315
  return config
316
+
317
+ # ------------------------------------------------------------------
318
+ # Data loading
319
+ # ------------------------------------------------------------------
320
+
321
def bulk_insert(self, conn: Any, table: Any, records: list) -> None:
    """Load a batch of rows into ``table`` through ``conn``.

    This default strategy hands the whole batch to SQLAlchemy, which runs
    it as a parameterised executemany and is therefore backend-agnostic.
    Dialect subclasses may replace it with a backend-specific bulk loader
    (e.g. COPY FROM CSV).

    Args:
        conn: A SQLAlchemy ``Connection`` already within a transaction.
        table: The SQLAlchemy ``Table`` object to insert into.
        records: A list of dicts mapping column keys to Python values.
    """
    if not records:
        # Nothing to load: do not even build the INSERT statement.
        return
    insert_stmt = table.insert()
    conn.execute(insert_stmt, records)
@@ -0,0 +1,166 @@
1
+ import csv
2
+ import os
3
+ import tempfile
4
+ from typing import Any
5
+
6
+ from sqlalchemy import (
7
+ BigInteger,
8
+ Boolean,
9
+ Column,
10
+ DateTime,
11
+ Double,
12
+ Integer,
13
+ LargeBinary,
14
+ Sequence,
15
+ SmallInteger,
16
+ text,
17
+ )
18
+ from sqlalchemy.exc import ProgrammingError
19
+ import sqlalchemy.schema
20
+
21
+ from .base import DatabaseDialect
22
+
23
+
24
class DuckDBDialect(DatabaseDialect):
    """Dialect for DuckDB.

    DuckDB itself accepts very long identifiers, but identifiers are capped
    at ``MAX_IDENTIFIER_LENGTH`` (63) here: the cap comes from the SQLAlchemy
    integration, not from DuckDB. The dialect overrides the base behaviour in
    three places:

    - **Primary key columns**: DuckDB does not support ``autoincrement`` in the
      same way as other backends. A ``Sequence`` object is used instead.
    - **Schema creation**: DuckDB's inspector does not reliably list schemas
      before they exist, so the existence check is replaced with a try/except
      around ``CREATE SCHEMA``.
    - **Bulk loading**: records are written to a temporary CSV file and loaded
      with ``read_csv`` instead of a parameterised executemany.
    """

    # this limit comes from the implementation with SQLAlchemy and not a constraint of duckdb per se
    MAX_IDENTIFIER_LENGTH: int = 63

    def pk_column(self, table_name: str) -> Column:
        """Return a Sequence-based primary key column for DuckDB.

        Args:
            table_name: Logical table name; the column key is ``pk_<table_name>``
                and the sequence is named after ``pk_sequ_<table_name>``, both
                passed through ``db_identifier``.

        Returns:
            An ``Integer`` primary key ``Column`` whose server default is the
            next value of its dedicated sequence.
        """
        logical = f"pk_{table_name}"
        pk_sequence = Sequence(self.db_identifier(f"pk_sequ_{table_name}"))
        return Column(
            self.db_identifier(logical),
            Integer,
            pk_sequence,
            server_default=pk_sequence.next_value(),
            primary_key=True,
            key=logical,
        )

    def create_schema(self, engine: Any, schema_name: str) -> None:
        """Create a schema using try/except, as required by DuckDB.

        Args:
            engine: SQLAlchemy engine used to open a short-lived connection.
            schema_name: Name of the schema to create.
        """
        try:
            with engine.connect() as conn:
                conn.execute(sqlalchemy.schema.CreateSchema(schema_name))
                conn.commit()
        except ProgrammingError:
            # Deliberate best effort: this replaces an existence check, so a
            # failure here is presumably "schema already exists".
            pass

    # Maps SQLAlchemy column types to DuckDB CAST target type names.
    # String types need no cast; LargeBinary is handled via unhex().
    # Order matters: subclasses (BigInteger, SmallInteger) must appear before
    # their parent (Integer) so that isinstance() matches the most specific type.
    _DUCKDB_CAST: dict = {
        BigInteger: "BIGINT",
        SmallInteger: "SMALLINT",
        Integer: "INTEGER",
        Double: "DOUBLE",
        Boolean: "BOOLEAN",
        DateTime: "TIMESTAMPTZ",  # DateTime(timezone=False) → TIMESTAMP below
    }

    @staticmethod
    def _quote_ident(name: str) -> str:
        """Return *name* as a double-quoted SQL identifier, doubling embedded quotes."""
        return '"' + name.replace('"', '""') + '"'

    @staticmethod
    def _csv_cell(value: Any) -> str:
        """Serialise one Python value to the text written in the CSV file."""
        if value is None:
            return ""
        if isinstance(value, bytes):
            # Binary values travel hex-encoded and are decoded with unhex().
            return value.hex()
        if isinstance(value, bool):
            # Must come before the general str() path: str(True) is "True",
            # whereas the CSV carries lowercase true/false.
            return "true" if value else "false"
        # str() on datetime gives "YYYY-MM-DD HH:MM:SS[.f][+HH:MM]",
        # which DuckDB's CAST accepts without ambiguity.
        return str(value)

    def _select_expr(self, key: str, col: Any) -> str:
        """Return a DuckDB SELECT expression that casts a VARCHAR CSV column.

        Args:
            key: Column key, which is also the CSV header name.
            col: The SQLAlchemy ``Column`` the value is inserted into.

        Returns:
            ``unhex(...)`` for binary columns, ``CAST(... AS <type>)`` for types
            listed in ``_DUCKDB_CAST``, or the bare quoted column otherwise.
        """
        quoted = self._quote_ident(key)
        if isinstance(col.type, LargeBinary):
            return f"unhex({quoted})"
        for sa_type, duckdb_type in self._DUCKDB_CAST.items():
            if isinstance(col.type, sa_type):
                if isinstance(col.type, DateTime) and not col.type.timezone:
                    duckdb_type = "TIMESTAMP"
                return f"CAST({quoted} AS {duckdb_type})"
        return quoted  # String / unknown: keep as VARCHAR

    def bulk_insert(self, conn: Any, table: Any, records: list) -> None:
        """Bulk-insert records via a temporary CSV file and DuckDB's ``read_csv``.

        All CSV columns are read as VARCHAR (``all_varchar=true``) and then
        explicitly cast to their target types in the ``SELECT`` clause.
        Binary columns are hex-encoded in the CSV and decoded with ``unhex()``.

        The set of columns is taken from the first record; keys missing from a
        later record are written as empty fields.

        NOTE(review): ``None`` and the empty string are both written as an
        empty field and the file is read with ``nullstr=''``, so empty strings
        presumably come back as NULL. Confirm this is acceptable for callers.

        Args:
            conn: A SQLAlchemy ``Connection`` already within a transaction.
            table: The SQLAlchemy ``Table`` object to insert into.
            records: A list of dicts mapping column keys to Python values.
        """
        if not records:
            return

        # Map column key -> SQLAlchemy Column object
        col_by_key = {col.key: col for col in table.columns}
        first = records[0]

        # Columns present in the first record that correspond to table columns
        record_keys = [k for k in first if k in col_by_key]

        # SQLAlchemy Python-side scalar defaults (e.g. default=False on temp_exists)
        # are applied automatically by executemany but not by our CSV path.
        default_values: dict = {}
        for col in table.columns:
            if col.key in first:
                continue
            default = col.default
            if default is not None and default.is_scalar:
                default_values[col.key] = default.arg

        all_keys = record_keys + list(default_values)
        quote = self._quote_ident
        to_cell = self._csv_cell

        fd, csv_path = tempfile.mkstemp(suffix=".csv")
        try:
            with os.fdopen(fd, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(all_keys)
                for record in records:
                    writer.writerow(
                        [
                            to_cell(
                                default_values[key]
                                if key in default_values
                                else record.get(key)
                            )
                            for key in all_keys
                        ]
                    )

            full_name = quote(table.name)
            if table.schema:
                full_name = f"{quote(table.schema)}.{full_name}"
            insert_cols = ", ".join(quote(col_by_key[k].name) for k in all_keys)
            select_exprs = ", ".join(
                self._select_expr(k, col_by_key[k]) for k in all_keys
            )
            # DuckDB requires forward slashes in file paths on all platforms.
            # Single quotes are doubled so that a temp directory containing an
            # apostrophe cannot terminate the SQL string literal early.
            safe_path = csv_path.replace("\\", "/").replace("'", "''")
            sql = text(
                f"INSERT INTO {full_name} ({insert_cols}) "
                f"SELECT {select_exprs} "
                f"FROM read_csv('{safe_path}', header=true, nullstr='', all_varchar=true)"
            )
            conn.execute(sql)
        finally:
            if os.path.exists(csv_path):
                os.unlink(csv_path)
@@ -56,8 +56,10 @@ class Document:
56
56
  skip_validation: Should we validate the document against the schema first?
57
57
  iterparse: Parse XML using iterative parsing, which is a bit slower but uses less memory
58
58
  recover: Should we try to parse incorrect XML? (argument passed to lxml parser)
59
- flat_data: A dict containing flat data if we want to add data to another dataset instead of creating
60
- a new one
59
+ flat_data: An existing `document.data` dict from a previously parsed document. When provided, records
60
+ from this XML file are appended to it rather than starting fresh, allowing multiple files to be
61
+ accumulated in memory and inserted together with a single
62
+ [`insert_into_target_tables`][xml2db.document.Document.insert_into_target_tables] call.
61
63
  """
62
64
  self.xml_file_path = xml_file[:255] if isinstance(xml_file, str) else "<stream>"
63
65
 
@@ -391,7 +393,11 @@ class Document:
391
393
  start_idx = 0
392
394
  while start_idx < len(data):
393
395
  with self.model.engine.begin() as conn:
394
- conn.execute(query, data[start_idx : (start_idx + max_lines)])
396
+ self.model.dialect.bulk_insert(
397
+ conn,
398
+ query.table,
399
+ data[start_idx : (start_idx + max_lines)],
400
+ )
395
401
  start_idx = start_idx + max_lines
396
402
 
397
403
  def merge_into_target_tables(self, single_transaction: bool = True) -> int:
@@ -698,8 +698,10 @@ class DataModel:
698
698
  skip_validation: Should we validate the documents against the schema first?
699
699
  iterparse: Parse XML using iterative parsing, which is a bit slower but uses less memory
700
700
  recover: Should we try to parse incorrect XML? (argument passed to lxml parser)
701
- flat_data: A dict containing flat data if we want to add data to another dataset instead of creating
702
- a new one
701
+ flat_data: An existing `document.data` dict from a previously parsed document. When provided, records
702
+ from this XML file are appended to it rather than starting fresh, allowing multiple files to be
703
+ accumulated in memory and inserted together with a single
704
+ [`Document.insert_into_target_tables`][xml2db.document.Document.insert_into_target_tables] call.
703
705
 
704
706
  Returns:
705
707
  A parsed [`Document`](document.md) object
@@ -180,6 +180,11 @@ class DataModelTable:
180
180
  raise ValueError(
181
181
  "attempting to add a 1-1 relationship with max occurrences different from 1"
182
182
  )
183
+ if (
184
+ name in self.relations_1
185
+ and self.relations_1[name].other_table.type_name == other_table.type_name
186
+ ):
187
+ return
183
188
  rel = DataModelRelation1(
184
189
  name,
185
190
  [(name, other_table.type_name)],
@@ -206,6 +211,11 @@ class DataModelTable:
206
211
  raise ValueError(
207
212
  "attempting to add a 1-n relationship with max occurrences equal to 1"
208
213
  )
214
+ if (
215
+ name in self.relations_n
216
+ and self.relations_n[name].other_table.type_name == other_table.type_name
217
+ ):
218
+ return
209
219
  rel = DataModelRelationN(
210
220
  name,
211
221
  [(name, other_table.type_name)],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xml2db
3
- Version: 0.13.0
3
+ Version: 0.13.2
4
4
  Summary: Import complex XML files to a relational database
5
5
  Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
6
6
  Project-URL: Documentation, https://cre-dev.github.io/xml2db
@@ -24,7 +24,9 @@ src/xml2db/table/relations.py
24
24
  src/xml2db/table/reused_table.py
25
25
  src/xml2db/table/table.py
26
26
  src/xml2db/table/transformed_table.py
27
+ tests/test_bulk_insert.py
27
28
  tests/test_conversions.py
28
29
  tests/test_models_output.py
30
+ tests/test_multiprocessing.py
29
31
  tests/test_roundtrip.py
30
32
  tests/test_validation.py
@@ -0,0 +1,178 @@
1
+ """Unit tests for dialect bulk_insert implementations."""
2
+ import datetime
3
+
4
+ import pytest
5
+
6
+ pytest.importorskip("duckdb", reason="duckdb not installed")
7
+
8
+ from sqlalchemy import (
9
+ BigInteger,
10
+ Boolean,
11
+ Column,
12
+ DateTime,
13
+ Double,
14
+ Integer,
15
+ LargeBinary,
16
+ MetaData,
17
+ SmallInteger,
18
+ String,
19
+ Table,
20
+ create_engine,
21
+ select,
22
+ text,
23
+ )
24
+
25
+ from xml2db.dialect.base import DatabaseDialect
26
+ from xml2db.dialect.duckdb import DuckDBDialect
27
+
28
+
29
@pytest.fixture()
def duckdb_engine():
    """Provide an in-memory DuckDB engine and dispose of it once the test is done."""
    engine = create_engine("duckdb:///:memory:")
    yield engine
    # Release pooled connections so engines do not pile up across tests.
    engine.dispose()
32
+
33
+
34
def _make_table(engine, name, *extra_cols):
    """Create table ``name`` with ``id``/``label`` columns plus ``extra_cols``.

    The table is created on ``engine`` and the SQLAlchemy ``Table`` object
    is returned.
    """
    metadata = MetaData()
    base_columns = (
        Column("id", Integer, key="id"),
        Column("label", String(100), key="label"),
    )
    created = Table(name, metadata, *base_columns, *extra_cols)
    metadata.create_all(engine)
    return created
46
+
47
+
48
def _roundtrip(engine, table, records):
    """Load ``records`` with ``DuckDBDialect.bulk_insert`` and return the table rows.

    The insert runs in its own committed transaction; the rows are then read
    back on a separate connection as a list of mappings.
    """
    with engine.begin() as write_conn:
        DuckDBDialect().bulk_insert(write_conn, table, records)

    with engine.connect() as read_conn:
        result = read_conn.execute(select(table))
        return result.mappings().all()
55
+
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # Base dialect falls back to SQLAlchemy executemany
59
+ # ---------------------------------------------------------------------------
60
+
61
+
62
def test_base_dialect_bulk_insert(duckdb_engine):
    """The base dialect inserts every record through SQLAlchemy's executemany."""
    table = _make_table(duckdb_engine, "base_test")
    records = [{"id": 1, "label": "hello"}, {"id": 2, "label": "world"}]

    # Use a managed transaction so the connection is committed and closed,
    # rather than entering the context manager by hand and leaking it.
    with duckdb_engine.begin() as conn:
        DatabaseDialect().bulk_insert(conn, table, records)

    with duckdb_engine.connect() as conn:
        rows = conn.execute(select(table).order_by(table.c.id)).mappings().all()
    assert [(row["id"], row["label"]) for row in rows] == [
        (1, "hello"),
        (2, "world"),
    ]
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # DuckDB dialect: basic types
73
+ # ---------------------------------------------------------------------------
74
+
75
+
76
def test_duckdb_bulk_insert_basic(duckdb_engine):
    """Integer and string values, including a NULL string, survive the CSV path."""
    table = _make_table(duckdb_engine, "basic")
    payload = [
        {"id": 1, "label": "hello"},
        {"id": 2, "label": None},
    ]

    rows = _roundtrip(duckdb_engine, table, payload)

    assert len(rows) == 2
    first, second = rows[0], rows[1]
    assert first["id"] == 1
    assert first["label"] == "hello"
    assert second["label"] is None
84
+
85
+
86
def test_duckdb_bulk_insert_numeric_types(duckdb_engine):
    """Integer, BigInteger, SmallInteger and Double values are cast back correctly."""
    metadata = MetaData()
    numeric_columns = [
        Column("i", Integer, key="i"),
        Column("bi", BigInteger, key="bi"),
        Column("si", SmallInteger, key="si"),
        Column("d", Double, key="d"),
    ]
    table = Table("numeric_types", metadata, *numeric_columns)
    metadata.create_all(duckdb_engine)

    rows = _roundtrip(
        duckdb_engine,
        table,
        [{"i": 1, "bi": 10**15, "si": 32767, "d": 3.14}],
    )

    row = rows[0]
    assert row["i"] == 1
    assert row["bi"] == 10**15
    assert row["si"] == 32767
    # Floating point: compare with a tolerance rather than for equality.
    assert abs(row["d"] - 3.14) < 1e-9
103
+
104
+
105
def test_duckdb_bulk_insert_boolean(duckdb_engine):
    """True, False and NULL booleans each come back as the same Python value."""
    metadata = MetaData()
    table = Table(
        "bool_test",
        metadata,
        Column("id", Integer, key="id"),
        Column("flag", Boolean, key="flag"),
    )
    metadata.create_all(duckdb_engine)

    payload = [
        {"id": 1, "flag": True},
        {"id": 2, "flag": False},
        {"id": 3, "flag": None},
    ]
    rows = _roundtrip(duckdb_engine, table, payload)

    true_row, false_row, null_row = rows[0], rows[1], rows[2]
    assert true_row["flag"] is True
    assert false_row["flag"] is False
    assert null_row["flag"] is None
119
+
120
+
121
def test_duckdb_bulk_insert_datetime(duckdb_engine):
    """A datetime value and a NULL survive the CSV round-trip."""
    meta = MetaData()
    table = Table(
        "dt_test",
        meta,
        Column("id", Integer, key="id"),
        Column("ts", DateTime(timezone=True), key="ts"),
    )
    meta.create_all(duckdb_engine)
    dt = datetime.datetime(2023, 9, 27, 14, 35, 54, 274602)
    records = [{"id": 1, "ts": dt}, {"id": 2, "ts": None}]
    rows = _roundtrip(duckdb_engine, table, records)
    # Value must survive the CSV round-trip and be returned as a datetime
    # object; check the type, not merely that something came back.
    assert isinstance(rows[0]["ts"], datetime.datetime)
    assert rows[1]["ts"] is None
136
+
137
+
138
def test_duckdb_bulk_insert_binary(duckdb_engine):
    """Binary values are hex-encoded on the way in and decoded intact on the way out."""
    metadata = MetaData()
    table = Table(
        "binary_test",
        metadata,
        Column("id", Integer, key="id"),
        Column("hash", LargeBinary(32), key="hash"),
    )
    metadata.create_all(duckdb_engine)

    payload = b"\xde\xad\xbe\xef" * 8
    rows = _roundtrip(
        duckdb_engine,
        table,
        [{"id": 1, "hash": payload}, {"id": 2, "hash": None}],
    )

    stored, missing = rows[0]["hash"], rows[1]["hash"]
    assert bytes(stored) == payload
    assert missing is None
152
+
153
+
154
def test_duckdb_bulk_insert_scalar_column_default(duckdb_engine):
    """Columns with Python-side scalar defaults absent from records must be applied."""
    metadata = MetaData()
    table = Table(
        "default_test",
        metadata,
        Column("id", Integer, key="id"),
        Column("flag", Boolean, default=False, key="flag"),
    )
    metadata.create_all(duckdb_engine)

    # None of the records carries 'flag', so the column default has to fill it in.
    payload = [{"id": 1}, {"id": 2}]
    rows = _roundtrip(duckdb_engine, table, payload)

    for index in (0, 1):
        assert rows[index]["flag"] is False
169
+
170
+
171
def test_duckdb_bulk_insert_empty(duckdb_engine):
    """An empty record list is a no-op and leaves the table empty."""
    table = _make_table(duckdb_engine, "empty_test")
    dialect = DuckDBDialect()
    # Plain managed transaction; the former conditional expression referenced
    # an undefined name in a branch that could never run.
    with duckdb_engine.begin() as conn:
        dialect.bulk_insert(conn, table, [])
    with duckdb_engine.connect() as conn:
        count = conn.execute(text("SELECT COUNT(*) FROM empty_test")).scalar()
    assert count == 0
@@ -0,0 +1,114 @@
1
+ """Tests for concurrent XML loading with multiprocessing and a file-based DuckDB."""
2
+ import multiprocessing
3
+ import os
4
+ import tempfile
5
+
6
+ import pytest
7
+ from lxml import etree
8
+
9
+ pytest.importorskip("duckdb", reason="duckdb not installed")
10
+
11
+ from sqlalchemy import String, create_engine, text
12
+
13
+ from xml2db import DataModel
14
+
15
# Locations of the "orders" sample model shipped next to this test module.
_SAMPLE = os.path.join(os.path.dirname(__file__), "sample_models", "orders")
_XSD = os.path.join(_SAMPLE, "orders.xsd")
_XML_FILES = [
    os.path.join(_SAMPLE, "xml", file_name)
    for file_name in ("order1.xml", "order2.xml", "order3.xml")
]

# Matches orders model version 0 in sample_models/models.py so that the XML
# roundtrip produces byte-for-byte identical output.
_MODEL_CONFIG = {
    "tables": {
        "shiporder": {
            "fields": {
                "orderperson": {"transform": False},
            },
        },
        "item": None,
    },
    "record_hash_column_name": "record_hash",
    "metadata_columns": [
        {"name": "input_file_path", "type": String(256)},
    ],
}
33
+
34
+
35
def _load_xml_file(xml_path: str, xsd_path: str, db_path: str, lock) -> None:
    """Worker entry point: parse one XML file and load it into the shared DuckDB file.

    Every worker process builds its own DataModel (and gets a unique
    temp_prefix UUID), so temporary tables never collide. Parsing runs
    without the lock; the database write is done while holding *lock*
    because DuckDB allows only one active writer at a time.
    """
    connection_string = f"duckdb:///{db_path}"
    model = DataModel(
        xsd_file=xsd_path,
        connection_string=connection_string,
        model_config=_MODEL_CONFIG,
    )

    # CPU-bound step, done outside the lock so workers parse in parallel.
    file_metadata = {"input_file_path": xml_path}
    doc = model.parse_xml(xml_path, metadata=file_metadata)

    # One writer at a time: insert and release the database under the lock.
    with lock:
        doc.insert_into_target_tables()
        # Dispose while still holding the lock so the file handle is gone
        # before the next process opens the database.
        model.engine.dispose()
56
+
57
+
58
def test_multiprocessing_file_duckdb():
    """Three worker processes load XML files concurrently into a file-based DuckDB.

    Parsing happens in parallel; database writes are serialised via a
    multiprocessing.Lock. After all workers finish:
    - the target table must contain one row per XML file, and
    - each file must round-trip back to identical XML (content assertion).
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        db_path = os.path.join(tmpdir, "test.duckdb")
        lock = multiprocessing.Lock()

        processes = [
            multiprocessing.Process(
                target=_load_xml_file,
                args=(xml_path, _XSD, db_path, lock),
            )
            for xml_path in _XML_FILES
        ]
        for p in processes:
            p.start()
        # Join every worker before asserting, so a failing worker cannot leave
        # the others running while the temporary directory is torn down.
        for p in processes:
            p.join()
        for xml_path, p in zip(_XML_FILES, processes):
            assert p.exitcode == 0, (
                f"Worker for {xml_path} "
                f"exited with code {p.exitcode}"
            )

        # --- row count ---
        engine = create_engine(f"duckdb:///{db_path}")
        try:
            with engine.connect() as conn:
                count = conn.execute(text("SELECT COUNT(*) FROM orders")).scalar()
        finally:
            # Always release the database file, even if the query fails.
            engine.dispose()
        assert count == len(_XML_FILES)

        # --- content roundtrip ---
        verify_model = DataModel(
            xsd_file=_XSD,
            connection_string=f"duckdb:///{db_path}",
            model_config=_MODEL_CONFIG,
        )
        try:
            for xml_path in _XML_FILES:
                doc = verify_model.extract_from_database(
                    f"input_file_path='{xml_path}'",
                    force_tz="Europe/Paris",
                )
                src = etree.parse(xml_path).getroot()
                el = doc.to_xml(nsmap=src.nsmap)
                for key, val in src.attrib.items():
                    el.set(key, val)
                actual = etree.tostring(
                    el, pretty_print=True, encoding="utf-8", xml_declaration=True
                ).decode("utf-8")
                # Read with the same encoding the actual output was decoded
                # with, instead of the platform default.
                with open(xml_path, encoding="utf-8") as f:
                    expected = f.read()
                assert actual == expected, f"XML roundtrip failed for {xml_path}"
        finally:
            # Dispose even when an assertion fails so the temporary directory
            # can be removed cleanly.
            verify_model.engine.dispose()
@@ -1,50 +0,0 @@
1
- from typing import Any
2
-
3
- from sqlalchemy import Column, Integer, Sequence
4
- from sqlalchemy.exc import ProgrammingError
5
- import sqlalchemy.schema
6
-
7
- from .base import DatabaseDialect
8
-
9
-
10
- class DuckDBDialect(DatabaseDialect):
11
- """Dialect for DuckDB.
12
-
13
- DuckDB supports very long identifiers (effectively unlimited in practice;
14
- we document 1024 as a safe upper bound). It requires two workarounds:
15
-
16
- - **Primary key columns**: DuckDB does not support ``autoincrement`` in the
17
- same way as other backends. A ``Sequence`` object is used instead.
18
- - **Schema creation**: DuckDB's inspector does not reliably list schemas
19
- before they exist, so the existence check is replaced with a try/except
20
- around ``CREATE SCHEMA``.
21
- """
22
-
23
- # this limit comes from the implementation with SQLAlchemy and not a constraint of duckdb per se
24
- MAX_IDENTIFIER_LENGTH: int = 63
25
-
26
- def pk_column(self, table_name: str) -> Column:
27
- """Return a Sequence-based primary key column for DuckDB."""
28
- logical = f"pk_{table_name}"
29
- pk_sequence = Sequence(self.db_identifier(f"pk_sequ_{table_name}"))
30
- return Column(
31
- self.db_identifier(logical),
32
- Integer,
33
- pk_sequence,
34
- server_default=pk_sequence.next_value(),
35
- primary_key=True,
36
- key=logical,
37
- )
38
-
39
- def create_schema(self, engine: Any, schema_name: str) -> None:
40
- """Create a schema using try/except, as required by DuckDB."""
41
-
42
- def do_create() -> None:
43
- with engine.connect() as conn:
44
- conn.execute(sqlalchemy.schema.CreateSchema(schema_name))
45
- conn.commit()
46
-
47
- try:
48
- do_create()
49
- except ProgrammingError:
50
- pass
File without changes
File without changes
File without changes
File without changes
File without changes