PyPI - xml2db - Versions diffs - 0.9.0__py3-none-any.whl - Mend

xml2db 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

debug.py +34 -0
xml2db/__init__.py +21 -0
xml2db/document.py +650 -0
xml2db/exceptions.py +4 -0
xml2db/model.py +619 -0
xml2db/table/__init__.py +5 -0
xml2db/table/column.py +190 -0
xml2db/table/duplicated_table.py +180 -0
xml2db/table/relations.py +243 -0
xml2db/table/reused_table.py +152 -0
xml2db/table/table.py +356 -0
xml2db/table/transformed_table.py +314 -0
xml2db/xml_converter.py +258 -0
xml2db-0.9.0.dist-info/LICENSE +19 -0
xml2db-0.9.0.dist-info/METADATA +100 -0
xml2db-0.9.0.dist-info/RECORD +18 -0
xml2db-0.9.0.dist-info/WHEEL +5 -0
xml2db-0.9.0.dist-info/top_level.txt +2 -0

xml2db/table/column.py ADDED Viewed

@@ -0,0 +1,190 @@
+import logging
+from typing import List, Iterable, Any, Union, TYPE_CHECKING
+from sqlalchemy import (
+    Integer,
+    Float,
+    Boolean,
+    BigInteger,
+    SmallInteger,
+    Column,
+    DateTime,
+    String,
+)
+from sqlalchemy.dialects import mssql
+if TYPE_CHECKING:
+    from xml2db.model import DataModel
+logger = logging.getLogger(__name__)
+def types_mapping_default(temp: bool, col: "DataModelColumn") -> Any:
+    """Defines the sqlalchemy type to use for given column properties in target tables
+    :param temp: are we targeting the temporary tables schema or the final tables?
+    :param col: an object representing a column of a table for which we are determining the SQL type to define
+    :return: a sqlalchemy class representing the data type to be used
+    """
+    if col.occurs[1] != 1:
+        return String(8000)
+    if col.data_type in ["decimal", "float"]:
+        return Float
+    if col.data_type == "dateTime":
+        return DateTime(timezone=True)
+    if col.data_type == "integer" or col.data_type == "int":
+        return Integer
+    if col.data_type == "boolean":
+        return Boolean
+    if col.data_type == "byte":
+        return SmallInteger
+    if col.data_type == "long":
+        return BigInteger
+    if col.data_type == "date":
+        return String(16)
+    if col.data_type == "time":
+        return String(18)
+    if col.data_type in ["string", "NMTOKEN", "duration", "token"]:
+        if col.max_length is None:
+            return String(1000)
+        min_length = 0 if col.min_length is None else col.min_length
+        if min_length >= col.max_length - 1 and not col.allow_empty:
+            return String(col.max_length)
+        return String(col.max_length)
+    else:
+        logger.warning(
+            f"unknown type '{col.data_type}' for column '{col.name}', defaulting to VARCHAR(1000) "
+            f"(this can be overridden by providing a field type in the configuration)"
+        )
+        return String(1000)
+def types_mapping_mssql(temp: bool, col: "DataModelColumn") -> Any:
+    """Defines the MSSQL type to use for given column properties in target tables
+    :param temp: are we targeting the temporary tables schema or the final tables?
+    :param col: an object representing a column of a table for which we are determining the SQL type to define
+    :return: a sqlalchemy class representing the data type to be used
+    """
+    if col.occurs[1] != 1:
+        return mssql.VARCHAR(8000)
+    if col.data_type in ["decimal", "float"]:
+        return Float
+    if col.data_type == "dateTime":
+        # using the DATETIMEOFFSET directly in the temporary table caused issues when inserting data in the target
+        # table with INSERT INTO SELECT converts datetime VARCHAR to DATETIMEOFFSET without errors
+        return mssql.VARCHAR(100) if temp else mssql.DATETIMEOFFSET
+    if col.data_type == "integer" or col.data_type == "int":
+        return Integer
+    if col.data_type == "boolean":
+        return Boolean
+    if col.data_type == "byte":
+        return SmallInteger
+    if col.data_type == "long":
+        return BigInteger
+    if col.data_type == "date":
+        return mssql.VARCHAR(16)
+    if col.data_type == "time":
+        return mssql.VARCHAR(18)
+    if col.data_type in ["string", "NMTOKEN", "duration", "token"]:
+        if col.max_length is None:
+            return mssql.VARCHAR(1000)
+        min_length = 0 if col.min_length is None else col.min_length
+        if min_length >= col.max_length - 1 and not col.allow_empty:
+            return mssql.CHAR(col.max_length)
+        return mssql.VARCHAR(col.max_length)
+    else:
+        logger.warning(
+            f"unknown type '{col.data_type}' for column '{col.name}', defaulting to VARCHAR(1000) "
+            f"(this can be overridden by providing a field type in the configuration)"
+        )
+        return mssql.VARCHAR(1000)
+class DataModelColumn:
+    """A class representing a column of a table
+    :param name: column name
+    :param data_type: column data type
+    :param occurs: min and max occurrences of the field
+    :param min_length: min length
+    :param max_length: max length
+    :param allow_empty: is nullable ?
+    :param ngroup: a key used to handle nested sequences
+    :param model_config: data model config, may contain column type information
+    :param data_model: the DataModel object it belongs to
+    :ivar name: the name of the field (i.e. column name)
+    :ivar data_type: the data type, extracted from XSD data type
+    :ivar occurs: list of int with two elements: min occurrences and max occurrences. \
+    Max occurrences is None if unbounded
+    """
+    def __init__(
+        self,
+        name: str,
+        name_chain: list,
+        data_type: str,
+        occurs: List[int],
+        min_length: int,
+        max_length: int,
+        is_attr: bool,
+        is_content: bool,
+        allow_empty: bool,
+        ngroup: Union[int, None],
+        model_config: dict[str, Any],
+        data_model: "DataModel",
+    ):
+        """Constructor method"""
+        self.name = name
+        self.name_chain = name_chain
+        self.data_type = data_type
+        self.occurs = occurs
+        self.min_length = min_length
+        self.max_length = max_length
+        self.is_attr = is_attr
+        self.is_content = is_content
+        self.allow_empty = allow_empty
+        self.ngroup = ngroup
+        self.model_config = model_config
+        self.data_model = data_model
+        self.other_table = None  # just to avoid a linting warning
+        self.types_mapping = (
+            types_mapping_mssql
+            if data_model.engine and data_model.engine.dialect.name == "mssql"
+            else types_mapping_default
+        )
+    @property
+    def can_join_values_as_string(self):
+        """Decide whether multiple values can be stored as comma separated values in this column
+        :return: True if data type is compatible with comma separated values
+        :raises ValueError: if data type does not allow storage as comma separated values
+        """
+        if self.occurs[1] == 1:
+            return True
+        if self.occurs[1] is None or self.occurs[1] > 1:
+            if self.data_type in (
+                "string",
+                "date",
+                "dateTime",
+                "NMTOKEN",
+                "time",
+            ):
+                return True
+            raise ValueError(
+                f"Col type '{self.data_type}' with maxOccur > 1 is not supported."
+            )
+        return False
+    def get_sqlalchemy_column(self, temp: bool = False) -> Iterable[Column]:
+        """Create sqlalchemy Column object
+        :param temp: temp table or target table ?
+        """
+        # use type specified in config if exists
+        column_type = self.model_config.get("fields", {}).get(self.name, {}).get(
+            "type"
+        ) or self.types_mapping(temp, self)
+        yield Column(self.name, column_type)

xml2db/table/duplicated_table.py ADDED Viewed

@@ -0,0 +1,180 @@
+from typing import Iterable, Any
+from sqlalchemy import (
+    Table,
+    Column,
+    Integer,
+    ForeignKey,
+    PrimaryKeyConstraint,
+    Index,
+    Boolean,
+    DateTime,
+    String,
+    select,
+    and_,
+)
+from xml2db.table.transformed_table import DataModelTableTransformed
+class DataModelTableDuplicated(DataModelTableTransformed):
+    """A table data model which allows duplicated records in the database.
+    This table model is only allowed if this node type is used only once in the schema, \
+    in a 1-n relationship with its parent node. The 1-n relationship is represented with \
+    a foreign key relation from this node to its parent node, without intermediate relationship \
+    table. As such, it is a simpler schema, with the drawback of having duplicates records.
+    """
+    is_reused = False
+    def build_sqlalchemy_tables(self) -> None:
+        """Build sqlalchemy table objects.
+        Build the sqlalchemy table objet based on table attributes for the main table, and \
+        relation tables to store n-n relationships with children nodes, for target and temp \
+        tables (so it builds at least 2 tables if there is no relations).
+        This method is intended to be called only once (if it called more than once it will return \
+        immediately) and further changes to the table will not be updated.
+        """
+        if self.table is not None:
+            return
+        prefix = f"temp_{self.temp_prefix}_"
+        def get_col(temp=False) -> Iterable[Column]:
+            """Generator function to build sqlalchemy Column objects
+            :param temp: are we targeting temp or target table?
+            """
+            # temp primary key which is used also in the final table to update back target pk
+            if temp or self.referenced_as_fk:
+                yield Column(
+                    f"temp_pk_{self.name}",
+                    Integer,
+                    primary_key=temp,
+                    autoincrement=False,
+                )
+            # foreign key column to link with parent
+            if temp:
+                yield Column(f"temp_fk_parent_{self.parent.name}", Integer)
+                yield Column(f"fk_parent_{self.parent.name}", Integer)
+            else:
+                yield Column(
+                    f"fk_parent_{self.parent.name}",
+                    Integer,
+                    ForeignKey(f"{self.parent.name}.pk_{self.parent.name}"),
+                    index=True,
+                )
+            # row_number if needed
+            if self.data_model.model_config["row_numbers"]:
+                yield Column(
+                    "xml2db_row_number",
+                    Integer,
+                    nullable=False,
+                )
+            # all other columns and 1-1 relationships
+            for field_type, key, field in self.fields:
+                if field_type == "col" or field_type == "rel1":
+                    yield from field.get_sqlalchemy_column(temp)
+            # root table is given additional integration metadata columns
+            if self.is_root_table:
+                yield Column("xml2db_input_file_path", String(256), nullable=False)
+                yield Column(
+                    "xml2db_processed_at", DateTime(timezone=True), nullable=False
+                )
+        # build target table
+        self.table = Table(
+            self.name,
+            self.metadata,
+            Column(f"pk_{self.name}", Integer, primary_key=True, autoincrement=True),
+            PrimaryKeyConstraint(
+                name=f"cx_pk_{self.name}",
+                mssql_clustered=not self.config["as_columnstore"],
+            ),
+            *get_col(),
+        )
+        # set columnstore index
+        if self.config["as_columnstore"]:
+            self.table.append_constraint(
+                Index(
+                    f"idx_{self.name}_columnstore",
+                    mssql_clustered=True,
+                    mssql_columnstore=True,
+                )
+            )
+        # build temporary table
+        self.temp_table = Table(
+            f"{prefix}{self.name}",
+            self.metadata,
+            Column(f"pk_{self.name}", Integer),
+            *get_col(temp=True),
+            Column("temp_exists", Boolean, default=False),
+        )
+        # build relationship tables
+        for rel in self.relations_n.values():
+            rel.build_relation_tables()
+        self._set_db_schema()
+    def get_merge_temp_records_statements(self) -> Iterable[Any]:
+        """Yield insert and update statements to merge temporary tables into target tables
+        This method yields SQL statements inserting the data of the temporary table (prefixed) \
+        into the target tables (unprefixed). As this kind of node can be duplicated, no unique constraint \
+        is used, but a record is inserted only if its parent record is inserted too.
+        This method should not be called directly but through the save_db method in the :class:`xml2db.Document` \
+        object holding the parsed XML document data, which will ensure that merge queries are issued in the \
+        correct order, and which will encapsulated all queries in a transaction in order to rollback changes on failure.
+        """
+        # update foreign keys and temp_exists based on parent table
+        yield self.temp_table.update().values(
+            **{
+                f"fk_parent_{self.parent.name}": getattr(
+                    self.parent.temp_table.c, f"pk_{self.parent.name}"
+                ),
+                "temp_exists": self.parent.temp_table.c.temp_exists,
+            }
+        ).where(
+            getattr(self.temp_table.c, f"temp_fk_parent_{self.parent.name}")  # noqa
+            == getattr(self.parent.temp_table.c, f"temp_pk_{self.parent.name}")
+        )
+        # update foreign keys for n-1 relations tables
+        for rel in self.relations_1.values():
+            yield from rel.get_merge_temp_records_statements()
+        # insert new records from temp table to target
+        cols = [
+            col_name
+            for col_name in self.table.columns.keys()
+            if col_name != f"pk_{self.name}"
+        ]
+        sel = select(*[getattr(self.temp_table.c, col) for col in cols]).where(
+            self.temp_table.c.temp_exists
+            == False  # noqa: SQLAlchemy not supporting "is False"
+        )
+        yield self.table.insert().from_select(cols, sel)
+        # if table is referenced in a fk relationship, update primary keys back in temp table
+        if self.referenced_as_fk:
+            yield self.temp_table.update().values(
+                **{f"pk_{self.name}": getattr(self.table.c, f"pk_{self.name}")}
+            ).where(
+                and_(
+                    getattr(self.temp_table.c, f"fk_parent_{self.parent.name}")
+                    == getattr(self.table.c, f"fk_parent_{self.parent.name}"),
+                    getattr(self.temp_table.c, f"temp_pk_{self.name}")
+                    == getattr(self.table.c, f"temp_pk_{self.name}"),
+                )
+            )
+        # update records for n-n relations tables
+        for rel in self.relations_n.values():
+            yield from rel.get_merge_temp_records_statements()

xml2db/table/relations.py ADDED Viewed

@@ -0,0 +1,243 @@
+import sqlalchemy.engine
+from sqlalchemy import Table, Column, ForeignKey, Integer, Index, select
+from typing import TYPE_CHECKING, List, Iterable, Any, Union
+if TYPE_CHECKING:
+    from xml2db.table.table import DataModelTable
+    from xml2db.model import DataModel
+class DataModelRelation:
+    """A class representing a relation with another table
+    :param name: the name of the field holding the relation in the parent table
+    :param table: the parent table model in the relation
+    :param other_table: the other table model in the relation
+    :param occurs: list of int with two elements: min occurrences and max occurrences. \
+    Max occurrences is None if unbounded
+    :param ngroup: a key used to handle nested sequences
+    :param data_model: the DataModel object it belongs to
+    """
+    def __init__(
+        self,
+        name: str,
+        name_chain: list,
+        table: "DataModelTable",
+        other_table: "DataModelTable",
+        occurs: List[int],
+        ngroup: Union[str, None],
+        data_model: "DataModel",
+    ):
+        """Constructor method"""
+        self.name = name
+        self.name_chain = name_chain
+        self.table = table
+        self.other_table = other_table
+        self.occurs = occurs
+        self.ngroup = ngroup
+        self.rel_table_name = None
+        self.field_name = None
+        self.rel_table = None
+        self.temp_rel_table = None
+        self.data_model = data_model
+class DataModelRelation1(DataModelRelation):
+    """A class representing a 1-1 relation with another table"""
+    def get_sqlalchemy_column(self, temp: bool = False):
+        """Yields SQLAlchemy object representing the foreign key relation
+        :param temp: are we targeting temp or target table?
+        """
+        self.field_name = (
+            f"{self.name}_fk_{self.other_table.name}"
+            if not self.name.endswith(self.other_table.name)
+            else f"fk_{self.name}"
+        )
+        if temp:
+            yield Column(f"temp_{self.field_name}", Integer)
+            yield Column(self.field_name, Integer)
+        else:
+            yield Column(
+                self.field_name,
+                Integer,
+                ForeignKey(f"{self.other_table.name}.pk_{self.other_table.name}"),
+                index=True,
+            )
+    def get_merge_temp_records_statements(self) -> Iterable[Any]:
+        """A SQL statement to update foreign keys values from target table back to temp table after insert
+        :return: iterable of SQL statements
+        """
+        yield self.table.temp_table.update().values(
+            **{
+                self.field_name: getattr(
+                    self.other_table.temp_table.c, f"pk_{self.other_table.name}"
+                )
+            }
+        ).where(
+            getattr(self.table.temp_table.c, f"temp_{self.field_name}")
+            == getattr(
+                self.other_table.temp_table.c, f"temp_pk_{self.other_table.name}"
+            )
+        )
+class DataModelRelationN(DataModelRelation):
+    """A class representing a 1-N relation with another table"""
+    def build_relation_tables(self) -> None:
+        """Builds sqlalchemy objects for intermediate relationship tables"""
+        self.rel_table_name = (
+            f"{self.table.name}_{self.name}_{self.other_table.name}"
+            if not self.name.endswith(self.other_table.name)
+            else f"{self.table.name}_{self.other_table.name}"
+        )
+        prefix = f"temp_{self.table.temp_prefix}_"
+        if self.other_table.is_reused:
+            self.temp_rel_table = Table(
+                f"{prefix}{self.rel_table_name}",
+                self.table.metadata,
+                Column(f"temp_fk_{self.table.name}", Integer, nullable=False),
+                Column(f"fk_{self.table.name}", Integer),
+                Column(f"temp_fk_{self.other_table.name}", Integer, nullable=False),
+                Column(f"fk_{self.other_table.name}", Integer),
+                *(
+                    (
+                        Column(
+                            "xml2db_row_number",
+                            Integer,
+                            nullable=False,
+                        ),
+                    )
+                    if self.data_model.model_config["row_numbers"]
+                    else ()
+                ),
+            )
+            cl_index = ()
+            if (
+                self.data_model.engine
+                and self.data_model.engine.dialect.name == "mssql"
+                and not self.data_model.model_config["as_columnstore"]
+            ):
+                # n-n relation tables don't have a primary key, so we define a clustered index on the first FK
+                cl_index = (
+                    Index(
+                        f"ix_fk_{self.rel_table_name}",
+                        f"fk_{self.table.name}",
+                        mssql_clustered=True,
+                    ),
+                )
+            self.rel_table = Table(
+                self.rel_table_name,
+                self.table.metadata,
+                Column(
+                    f"fk_{self.table.name}",
+                    Integer,
+                    ForeignKey(f"{self.table.name}.pk_{self.table.name}"),
+                    nullable=False,
+                ),
+                Column(
+                    f"fk_{self.other_table.name}",
+                    Integer,
+                    ForeignKey(f"{self.other_table.name}.pk_{self.other_table.name}"),
+                    nullable=False,
+                    index=True,
+                ),
+                *(
+                    (
+                        Column(
+                            "xml2db_row_number",
+                            Integer,
+                            nullable=False,
+                        ),
+                    )
+                    if self.data_model.model_config["row_numbers"]
+                    else ()
+                ),
+                *cl_index,
+            )
+            # set columnstore index
+            if self.data_model.model_config["as_columnstore"]:
+                self.rel_table.append_constraint(
+                    Index(
+                        f"idx_{self.rel_table.name}_columnstore",
+                        mssql_clustered=True,
+                        mssql_columnstore=True,
+                    )
+                )
+            if self.table.db_schema is not None:
+                self.rel_table.schema = self.table.db_schema
+                self.temp_rel_table.schema = self.table.db_schema
+    def create_table(
+        self, engine: sqlalchemy.engine.Engine, temp: bool = False
+    ) -> None:
+        """Create intermediate relationship table
+        :param engine: sqlalchemy engine to use
+        :param temp: are we creating temp or target table?
+        """
+        if temp:
+            if self.temp_rel_table is not None:
+                self.temp_rel_table.create(engine, checkfirst=True)
+        else:
+            if self.rel_table is not None:
+                self.rel_table.create(engine, checkfirst=True)
+    def get_merge_temp_records_statements(self) -> Iterable[Any]:
+        """Issue SQL statements to insert new records in the intermediate relationship table
+        First, it will update foreign keys in the relationship table to use target tables foreign keys.
+        Then, it will insert new relationship records into the target relationship table
+        :return: sqlalchemy query statements
+        """
+        if self.other_table.is_reused:
+            rel_tb = self.temp_rel_table
+            # update foreign key with self
+            yield rel_tb.update().values(
+                **{
+                    f"fk_{self.table.name}": getattr(
+                        self.table.temp_table.c, f"pk_{self.table.name}"
+                    )
+                }
+            ).where(
+                getattr(  # noqa: Linter puzzled by ==
+                    rel_tb.c, f"temp_fk_{self.table.name}"
+                )
+                == getattr(self.table.temp_table.c, f"temp_pk_{self.table.name}")
+            ).where(
+                self.table.temp_table.c.temp_exists
+                == False  # noqa: SQLAlchemy not supporting "is False"
+            )
+            # update foreign key with other table
+            yield rel_tb.update().values(
+                **{
+                    f"fk_{self.other_table.name}": getattr(
+                        self.other_table.temp_table.c, f"pk_{self.other_table.name}"
+                    )
+                }
+            ).where(
+                getattr(  # noqa: Linter puzzled by ==
+                    rel_tb.c, f"temp_fk_{self.other_table.name}"
+                )
+                == getattr(
+                    self.other_table.temp_table.c, f"temp_pk_{self.other_table.name}"
+                )
+            )
+            # insert new records
+            cols = [f"fk_{self.table.name}", f"fk_{self.other_table.name}"]
+            if self.data_model.model_config["row_numbers"]:
+                cols = cols + ["xml2db_row_number"]
+            sel = select(*[getattr(rel_tb.c, col) for col in cols]).where(
+                getattr(rel_tb.c, f"fk_{self.table.name}")  # noqa
+                != None  # SQLAlchemy not supporting "is not None"
+            )
+            yield self.rel_table.insert().from_select(cols, sel)