PyPI - xml2db - Versions diffs - 0.13.0__tar.gz → 0.13.2__tar.gz - Mend

xml2db 0.13.0 (tar.gz) → 0.13.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

{xml2db-0.13.0/src/xml2db.egg-info → xml2db-0.13.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xml2db
-Version: 0.13.0
+Version: 0.13.2
 Summary: Import complex XML files to a relational database
 Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
 Project-URL: Documentation, https://cre-dev.github.io/xml2db

{xml2db-0.13.0 → xml2db-0.13.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "xml2db"
-version = "0.13.0"
+version = "0.13.2"
 authors = [
   { name="Commission de régulation de l'énergie", email="opensource@cre.fr" },
 ]

{xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/dialect/base.py RENAMED Viewed

@@ -313,3 +313,22 @@ class DatabaseDialect:
                 "Clustered columnstore indexes are only supported with MS SQL Server database, noop"
             )
         return config
+    # ------------------------------------------------------------------
+    # Data loading
+    # ------------------------------------------------------------------
+    def bulk_insert(self, conn: Any, table: Any, records: list) -> None:
+        """Insert records into a staging table.
+        The base implementation uses SQLAlchemy's parameterised executemany,
+        which is backend-agnostic. Subclasses may override this with a
+        backend-specific bulk-loading strategy (e.g. COPY FROM CSV).
+        Args:
+            conn: A SQLAlchemy ``Connection`` already within a transaction.
+            table: The SQLAlchemy ``Table`` object to insert into.
+            records: A list of dicts mapping column keys to Python values.
+        """
+        if records:
+            conn.execute(table.insert(), records)

xml2db-0.13.2/src/xml2db/dialect/duckdb.py ADDED Viewed

@@ -0,0 +1,166 @@
+import csv
+import os
+import tempfile
+from typing import Any
+from sqlalchemy import (
+    BigInteger,
+    Boolean,
+    Column,
+    DateTime,
+    Double,
+    Integer,
+    LargeBinary,
+    Sequence,
+    SmallInteger,
+    text,
+)
+from sqlalchemy.exc import ProgrammingError
+import sqlalchemy.schema
+from .base import DatabaseDialect
+class DuckDBDialect(DatabaseDialect):
+    """Dialect for DuckDB.
+    DuckDB itself supports very long identifiers (effectively unlimited in
+    practice), but ``MAX_IDENTIFIER_LENGTH`` is set to 63 because of a limit in
+    the SQLAlchemy integration. It requires two workarounds:
+    - **Primary key columns**: DuckDB does not support ``autoincrement`` in the
+      same way as other backends. A ``Sequence`` object is used instead.
+    - **Schema creation**: DuckDB's inspector does not reliably list schemas
+      before they exist, so the existence check is replaced with a try/except
+      around ``CREATE SCHEMA``.
+    """
+    # This limit comes from the SQLAlchemy integration; it is not a constraint of DuckDB itself.
+    MAX_IDENTIFIER_LENGTH: int = 63
+    def pk_column(self, table_name: str) -> Column:
+        """Return a Sequence-based primary key column for DuckDB."""
+        logical = f"pk_{table_name}"
+        pk_sequence = Sequence(self.db_identifier(f"pk_sequ_{table_name}"))
+        return Column(
+            self.db_identifier(logical),
+            Integer,
+            pk_sequence,
+            server_default=pk_sequence.next_value(),
+            primary_key=True,
+            key=logical,
+        )
+    def create_schema(self, engine: Any, schema_name: str) -> None:
+        """Create a schema using try/except, as required by DuckDB."""
+        def do_create() -> None:
+            with engine.connect() as conn:
+                conn.execute(sqlalchemy.schema.CreateSchema(schema_name))
+                conn.commit()
+        try:
+            do_create()
+        except ProgrammingError:
+            pass
+    # Maps SQLAlchemy column types to DuckDB CAST target type names.
+    # String types need no cast; LargeBinary is handled via unhex().
+    # Order matters: subclasses (BigInteger, SmallInteger) must appear before
+    # their parent (Integer) so that isinstance() matches the most specific type.
+    _DUCKDB_CAST: dict = {
+        BigInteger: "BIGINT",
+        SmallInteger: "SMALLINT",
+        Integer: "INTEGER",
+        Double: "DOUBLE",
+        Boolean: "BOOLEAN",
+        DateTime: "TIMESTAMPTZ",  # DateTime(timezone=False) → TIMESTAMP below
+    }
+    def _select_expr(self, key: str, col: Any) -> str:
+        """Return a DuckDB SELECT expression that casts a VARCHAR CSV column."""
+        if isinstance(col.type, LargeBinary):
+            return f'unhex("{key}")'
+        for sa_type, duckdb_type in self._DUCKDB_CAST.items():
+            if isinstance(col.type, sa_type):
+                if isinstance(col.type, DateTime) and not col.type.timezone:
+                    duckdb_type = "TIMESTAMP"
+                return f'CAST("{key}" AS {duckdb_type})'
+        return f'"{key}"'  # String / unknown: keep as VARCHAR
+    def bulk_insert(self, conn: Any, table: Any, records: list) -> None:
+        """Bulk-insert records via a temporary CSV file and DuckDB's ``read_csv``.
+        All CSV columns are read as VARCHAR (``all_varchar=true``) and then
+        explicitly cast to their target types in the ``SELECT`` clause.
+        Binary columns are hex-encoded in the CSV and decoded with ``unhex()``.
+        Args:
+            conn: A SQLAlchemy ``Connection`` already within a transaction.
+            table: The SQLAlchemy ``Table`` object to insert into.
+            records: A list of dicts mapping column keys to Python values.
+        """
+        if not records:
+            return
+        # Map column key -> SQLAlchemy Column object
+        col_by_key = {col.key: col for col in table.columns}
+        # Columns present in the first record that correspond to table columns
+        col_keys = [k for k in records[0] if k in col_by_key]
+        # SQLAlchemy Python-side scalar defaults (e.g. default=False on temp_exists)
+        # are applied automatically by executemany but not by our CSV path.
+        extra_defaults: dict = {}
+        for col in table.columns:
+            if col.key not in records[0] and col.key in col_by_key:
+                d = col.default
+                if d is not None and d.is_scalar:
+                    extra_defaults[col.key] = d.arg
+        all_col_keys = col_keys + list(extra_defaults.keys())
+        fd, csv_path = tempfile.mkstemp(suffix=".csv")
+        try:
+            with os.fdopen(fd, "w", newline="", encoding="utf-8") as f:
+                writer = csv.writer(f)
+                writer.writerow(all_col_keys)
+                for record in records:
+                    row = []
+                    for key in all_col_keys:
+                        v = record.get(key) if key in col_keys else extra_defaults[key]
+                        if v is None:
+                            row.append("")
+                        elif isinstance(v, bytes):
+                            row.append(v.hex())
+                        elif isinstance(v, bool):
+                            # Handled before the general str() path: str(True) would
+                            # write "True"/"False"; emit lowercase literals instead.
+                            row.append("true" if v else "false")
+                        else:
+                            # str() on datetime gives "YYYY-MM-DD HH:MM:SS[.f][+HH:MM]",
+                            # which DuckDB's CAST accepts without ambiguity.
+                            row.append(str(v))
+                    writer.writerow(row)
+            full_name = (
+                f'"{table.schema}"."{table.name}"'
+                if table.schema
+                else f'"{table.name}"'
+            )
+            insert_cols = ", ".join(
+                f'"{col_by_key[k].name}"' for k in all_col_keys
+            )
+            select_exprs = ", ".join(
+                self._select_expr(k, col_by_key[k]) for k in all_col_keys
+            )
+            # DuckDB requires forward slashes in file paths on all platforms.
+            safe_path = csv_path.replace("\\", "/")
+            sql = text(
+                f"INSERT INTO {full_name} ({insert_cols}) "
+                f"SELECT {select_exprs} "
+                f"FROM read_csv('{safe_path}', header=true, nullstr='', all_varchar=true)"
+            )
+            conn.execute(sql)
+        finally:
+            if os.path.exists(csv_path):
+                os.unlink(csv_path)

{xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/document.py RENAMED Viewed

@@ -56,8 +56,10 @@ class Document:
             skip_validation: Should we validate the document against the schema first?
             iterparse: Parse XML using iterative parsing, which is a bit slower but uses less memory
             recover: Should we try to parse incorrect XML? (argument passed to lxml parser)
-            flat_data: A dict containing flat data if we want to add data to another dataset instead of creating
-                a new one
+            flat_data: An existing `document.data` dict from a previously parsed document. When provided, records
+                from this XML file are appended to it rather than starting fresh, allowing multiple files to be
+                accumulated in memory and inserted together with a single
+                [`insert_into_target_tables`][xml2db.document.Document.insert_into_target_tables] call.
         """
         self.xml_file_path = xml_file[:255] if isinstance(xml_file, str) else "<stream>"
@@ -391,7 +393,11 @@ class Document:
                 start_idx = 0
                 while start_idx < len(data):
                     with self.model.engine.begin() as conn:
-                        conn.execute(query, data[start_idx : (start_idx + max_lines)])
+                        self.model.dialect.bulk_insert(
+                            conn,
+                            query.table,
+                            data[start_idx : (start_idx + max_lines)],
+                        )
                     start_idx = start_idx + max_lines
     def merge_into_target_tables(self, single_transaction: bool = True) -> int:

{xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/model.py RENAMED Viewed

@@ -698,8 +698,10 @@ class DataModel:
             skip_validation: Should we validate the documents against the schema first?
             iterparse: Parse XML using iterative parsing, which is a bit slower but uses less memory
             recover: Should we try to parse incorrect XML? (argument passed to lxml parser)
-            flat_data: A dict containing flat data if we want to add data to another dataset instead of creating
-                a new one
+            flat_data: An existing `document.data` dict from a previously parsed document. When provided, records
+                from this XML file are appended to it rather than starting fresh, allowing multiple files to be
+                accumulated in memory and inserted together with a single
+                [`Document.insert_into_target_tables`][xml2db.document.Document.insert_into_target_tables] call.
         Returns:
             A parsed [`Document`](document.md) object

{xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db/table/table.py RENAMED Viewed

@@ -180,6 +180,11 @@ class DataModelTable:
             raise ValueError(
                 "attempting to add a 1-1 relationship with max occurrences different from 1"
             )
+        if (
+            name in self.relations_1
+            and self.relations_1[name].other_table.type_name == other_table.type_name
+        ):
+            return
         rel = DataModelRelation1(
             name,
             [(name, other_table.type_name)],
@@ -206,6 +211,11 @@ class DataModelTable:
             raise ValueError(
                 "attempting to add a 1-n relationship with max occurrences equal to 1"
             )
+        if (
+            name in self.relations_n
+            and self.relations_n[name].other_table.type_name == other_table.type_name
+        ):
+            return
         rel = DataModelRelationN(
             name,
             [(name, other_table.type_name)],

{xml2db-0.13.0 → xml2db-0.13.2/src/xml2db.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xml2db
-Version: 0.13.0
+Version: 0.13.2
 Summary: Import complex XML files to a relational database
 Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
 Project-URL: Documentation, https://cre-dev.github.io/xml2db

{xml2db-0.13.0 → xml2db-0.13.2}/src/xml2db.egg-info/SOURCES.txt RENAMED Viewed

@@ -24,7 +24,9 @@ src/xml2db/table/relations.py
 src/xml2db/table/reused_table.py
 src/xml2db/table/table.py
 src/xml2db/table/transformed_table.py
+tests/test_bulk_insert.py
 tests/test_conversions.py
 tests/test_models_output.py
+tests/test_multiprocessing.py
 tests/test_roundtrip.py
 tests/test_validation.py

xml2db-0.13.2/tests/test_bulk_insert.py ADDED Viewed

@@ -0,0 +1,178 @@
+"""Unit tests for dialect bulk_insert implementations."""
+import datetime
+import pytest
+pytest.importorskip("duckdb", reason="duckdb not installed")
+from sqlalchemy import (
+    BigInteger,
+    Boolean,
+    Column,
+    DateTime,
+    Double,
+    Integer,
+    LargeBinary,
+    MetaData,
+    SmallInteger,
+    String,
+    Table,
+    create_engine,
+    select,
+    text,
+)
+from xml2db.dialect.base import DatabaseDialect
+from xml2db.dialect.duckdb import DuckDBDialect
+@pytest.fixture()
+def duckdb_engine():
+    return create_engine("duckdb:///:memory:")
+def _make_table(engine, name, *extra_cols):
+    """Create a simple test table and return the SQLAlchemy Table object."""
+    meta = MetaData()
+    table = Table(
+        name,
+        meta,
+        Column("id", Integer, key="id"),
+        Column("label", String(100), key="label"),
+        *extra_cols,
+    )
+    meta.create_all(engine)
+    return table
+def _roundtrip(engine, table, records):
+    """Insert records via DuckDBDialect.bulk_insert and read them back."""
+    dialect = DuckDBDialect()
+    with engine.begin() as conn:
+        dialect.bulk_insert(conn, table, records)
+    with engine.connect() as conn:
+        return conn.execute(select(table)).mappings().all()
+# ---------------------------------------------------------------------------
+# Base dialect falls back to SQLAlchemy executemany
+# ---------------------------------------------------------------------------
+def test_base_dialect_bulk_insert(duckdb_engine):
+    table = _make_table(duckdb_engine, "base_test")
+    records = [{"id": 1, "label": "hello"}, {"id": 2, "label": "world"}]
+    DatabaseDialect().bulk_insert(
+        duckdb_engine.connect().__enter__(), table, records
+    )
+    # Smoke test only: checks that the call runs without raising; the inserted rows are not asserted.
+# ---------------------------------------------------------------------------
+# DuckDB dialect: basic types
+# ---------------------------------------------------------------------------
+def test_duckdb_bulk_insert_basic(duckdb_engine):
+    table = _make_table(duckdb_engine, "basic")
+    records = [{"id": 1, "label": "hello"}, {"id": 2, "label": None}]
+    rows = _roundtrip(duckdb_engine, table, records)
+    assert len(rows) == 2
+    assert rows[0]["id"] == 1
+    assert rows[0]["label"] == "hello"
+    assert rows[1]["label"] is None
+def test_duckdb_bulk_insert_numeric_types(duckdb_engine):
+    meta = MetaData()
+    table = Table(
+        "numeric_types",
+        meta,
+        Column("i", Integer, key="i"),
+        Column("bi", BigInteger, key="bi"),
+        Column("si", SmallInteger, key="si"),
+        Column("d", Double, key="d"),
+    )
+    meta.create_all(duckdb_engine)
+    records = [{"i": 1, "bi": 10**15, "si": 32767, "d": 3.14}]
+    rows = _roundtrip(duckdb_engine, table, records)
+    assert rows[0]["i"] == 1
+    assert rows[0]["bi"] == 10**15
+    assert rows[0]["si"] == 32767
+    assert abs(rows[0]["d"] - 3.14) < 1e-9
+def test_duckdb_bulk_insert_boolean(duckdb_engine):
+    meta = MetaData()
+    table = Table(
+        "bool_test",
+        meta,
+        Column("id", Integer, key="id"),
+        Column("flag", Boolean, key="flag"),
+    )
+    meta.create_all(duckdb_engine)
+    records = [{"id": 1, "flag": True}, {"id": 2, "flag": False}, {"id": 3, "flag": None}]
+    rows = _roundtrip(duckdb_engine, table, records)
+    assert rows[0]["flag"] is True
+    assert rows[1]["flag"] is False
+    assert rows[2]["flag"] is None
+def test_duckdb_bulk_insert_datetime(duckdb_engine):
+    meta = MetaData()
+    table = Table(
+        "dt_test",
+        meta,
+        Column("id", Integer, key="id"),
+        Column("ts", DateTime(timezone=True), key="ts"),
+    )
+    meta.create_all(duckdb_engine)
+    dt = datetime.datetime(2023, 9, 27, 14, 35, 54, 274602)
+    records = [{"id": 1, "ts": dt}, {"id": 2, "ts": None}]
+    rows = _roundtrip(duckdb_engine, table, records)
+    # The value must survive the CSV round-trip; only non-NULL-ness is asserted here, not the exact timestamp.
+    assert rows[0]["ts"] is not None
+    assert rows[1]["ts"] is None
+def test_duckdb_bulk_insert_binary(duckdb_engine):
+    meta = MetaData()
+    table = Table(
+        "binary_test",
+        meta,
+        Column("id", Integer, key="id"),
+        Column("hash", LargeBinary(32), key="hash"),
+    )
+    meta.create_all(duckdb_engine)
+    payload = b"\xde\xad\xbe\xef" * 8
+    records = [{"id": 1, "hash": payload}, {"id": 2, "hash": None}]
+    rows = _roundtrip(duckdb_engine, table, records)
+    assert bytes(rows[0]["hash"]) == payload
+    assert rows[1]["hash"] is None
+def test_duckdb_bulk_insert_scalar_column_default(duckdb_engine):
+    """Columns with Python-side scalar defaults absent from records must be applied."""
+    meta = MetaData()
+    table = Table(
+        "default_test",
+        meta,
+        Column("id", Integer, key="id"),
+        Column("flag", Boolean, default=False, key="flag"),
+    )
+    meta.create_all(duckdb_engine)
+    # Records do NOT contain 'flag'; the default must be applied.
+    records = [{"id": 1}, {"id": 2}]
+    rows = _roundtrip(duckdb_engine, table, records)
+    assert rows[0]["flag"] is False
+    assert rows[1]["flag"] is False
+def test_duckdb_bulk_insert_empty(duckdb_engine):
+    table = _make_table(duckdb_engine, "empty_test")
+    dialect = DuckDBDialect()
+    with engine.begin() if False else duckdb_engine.begin() as conn:
+        dialect.bulk_insert(conn, table, [])
+    with duckdb_engine.connect() as conn:
+        count = conn.execute(text("SELECT COUNT(*) FROM empty_test")).scalar()
+    assert count == 0

xml2db-0.13.2/tests/test_multiprocessing.py ADDED Viewed

@@ -0,0 +1,114 @@
+"""Tests for concurrent XML loading with multiprocessing and a file-based DuckDB."""
+import multiprocessing
+import os
+import tempfile
+import pytest
+from lxml import etree
+pytest.importorskip("duckdb", reason="duckdb not installed")
+from sqlalchemy import String, create_engine, text
+from xml2db import DataModel
+_SAMPLE = os.path.join(os.path.dirname(__file__), "sample_models", "orders")
+_XSD = os.path.join(_SAMPLE, "orders.xsd")
+_XML_FILES = [
+    os.path.join(_SAMPLE, "xml", f"order{i}.xml") for i in (1, 2, 3)
+]
+# Matches orders model version 0 in sample_models/models.py so that the XML
+# roundtrip produces byte-for-byte identical output.
+_MODEL_CONFIG = {
+    "tables": {
+        "shiporder": {"fields": {"orderperson": {"transform": False}}},
+        "item": None,
+    },
+    "record_hash_column_name": "record_hash",
+    "metadata_columns": [
+        {"name": "input_file_path", "type": String(256)},
+    ],
+}
+def _load_xml_file(xml_path: str, xsd_path: str, db_path: str, lock) -> None:
+    """Worker function: parse one XML file and load it into a shared DuckDB file.
+    Each process builds its own DataModel (and gets a unique temp_prefix UUID),
+    so temporary tables never collide.  All database I/O is serialised via *lock*
+    because DuckDB allows only one active writer at a time.
+    """
+    model = DataModel(
+        xsd_file=xsd_path,
+        connection_string=f"duckdb:///{db_path}",
+        model_config=_MODEL_CONFIG,
+    )
+    # CPU-bound XML parsing runs in parallel across processes.
+    doc = model.parse_xml(xml_path, metadata={"input_file_path": xml_path})
+    # Serialise all database access: one writer at a time for DuckDB.
+    with lock:
+        doc.insert_into_target_tables()
+        # Dispose inside the lock so the file handle is released before
+        # the next process tries to open the database.
+        model.engine.dispose()
+def test_multiprocessing_file_duckdb():
+    """Three worker processes load XML files concurrently into a file-based DuckDB.
+    Parsing happens in parallel; database writes are serialised via a
+    multiprocessing.Lock.  After all workers finish:
+    - the target table must contain one row per XML file, and
+    - each file must round-trip back to identical XML (content assertion).
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        db_path = os.path.join(tmpdir, "test.duckdb")
+        lock = multiprocessing.Lock()
+        processes = [
+            multiprocessing.Process(
+                target=_load_xml_file,
+                args=(xml_path, _XSD, db_path, lock),
+            )
+            for xml_path in _XML_FILES
+        ]
+        for p in processes:
+            p.start()
+        for p in processes:
+            p.join()
+            assert p.exitcode == 0, (
+                f"Worker for {_XML_FILES[processes.index(p)]} "
+                f"exited with code {p.exitcode}"
+            )
+        # --- row count ---
+        engine = create_engine(f"duckdb:///{db_path}")
+        with engine.connect() as conn:
+            count = conn.execute(text("SELECT COUNT(*) FROM orders")).scalar()
+        engine.dispose()
+        assert count == len(_XML_FILES)
+        # --- content roundtrip ---
+        verify_model = DataModel(
+            xsd_file=_XSD,
+            connection_string=f"duckdb:///{db_path}",
+            model_config=_MODEL_CONFIG,
+        )
+        for xml_path in _XML_FILES:
+            doc = verify_model.extract_from_database(
+                f"input_file_path='{xml_path}'",
+                force_tz="Europe/Paris",
+            )
+            src = etree.parse(xml_path).getroot()
+            el = doc.to_xml(nsmap=src.nsmap)
+            for key, val in src.attrib.items():
+                el.set(key, val)
+            actual = etree.tostring(
+                el, pretty_print=True, encoding="utf-8", xml_declaration=True
+            ).decode("utf-8")
+            with open(xml_path) as f:
+                expected = f.read()
+            assert actual == expected, f"XML roundtrip failed for {xml_path}"
+        verify_model.engine.dispose()

xml2db-0.13.0/src/xml2db/dialect/duckdb.py DELETED Viewed

@@ -1,50 +0,0 @@
-from typing import Any
-from sqlalchemy import Column, Integer, Sequence
-from sqlalchemy.exc import ProgrammingError
-import sqlalchemy.schema
-from .base import DatabaseDialect
-class DuckDBDialect(DatabaseDialect):
-    """Dialect for DuckDB.
-    DuckDB supports very long identifiers (effectively unlimited in practice;
-    we document 1024 as a safe upper bound). It requires two workarounds:
-    - **Primary key columns**: DuckDB does not support ``autoincrement`` in the
-      same way as other backends. A ``Sequence`` object is used instead.
-    - **Schema creation**: DuckDB's inspector does not reliably list schemas
-      before they exist, so the existence check is replaced with a try/except
-      around ``CREATE SCHEMA``.
-    """
-    # this limit comes from the implementation with SQLAlchemy and not a constraint of duckdb per se
-    MAX_IDENTIFIER_LENGTH: int = 63
-    def pk_column(self, table_name: str) -> Column:
-        """Return a Sequence-based primary key column for DuckDB."""
-        logical = f"pk_{table_name}"
-        pk_sequence = Sequence(self.db_identifier(f"pk_sequ_{table_name}"))
-        return Column(
-            self.db_identifier(logical),
-            Integer,
-            pk_sequence,
-            server_default=pk_sequence.next_value(),
-            primary_key=True,
-            key=logical,
-        )
-    def create_schema(self, engine: Any, schema_name: str) -> None:
-        """Create a schema using try/except, as required by DuckDB."""
-        def do_create() -> None:
-            with engine.connect() as conn:
-                conn.execute(sqlalchemy.schema.CreateSchema(schema_name))
-                conn.commit()
-        try:
-            do_create()
-        except ProgrammingError:
-            pass