PyPI - xml2db - Versions diffs - 0.9.0__py3-none-any.whl - Mend

xml2db 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

debug.py +34 -0
xml2db/__init__.py +21 -0
xml2db/document.py +650 -0
xml2db/exceptions.py +4 -0
xml2db/model.py +619 -0
xml2db/table/__init__.py +5 -0
xml2db/table/column.py +190 -0
xml2db/table/duplicated_table.py +180 -0
xml2db/table/relations.py +243 -0
xml2db/table/reused_table.py +152 -0
xml2db/table/table.py +356 -0
xml2db/table/transformed_table.py +314 -0
xml2db/xml_converter.py +258 -0
xml2db-0.9.0.dist-info/LICENSE +19 -0
xml2db-0.9.0.dist-info/METADATA +100 -0
xml2db-0.9.0.dist-info/RECORD +18 -0
xml2db-0.9.0.dist-info/WHEEL +5 -0
xml2db-0.9.0.dist-info/top_level.txt +2 -0

xml2db/table/reused_table.py ADDED Viewed

@@ -0,0 +1,152 @@
+from sqlalchemy import (
+    Table,
+    Column,
+    Integer,
+    Index,
+    PrimaryKeyConstraint,
+    UniqueConstraint,
+    Boolean,
+    DateTime,
+    String,
+    LargeBinary,
+    select,
+)
+from .transformed_table import DataModelTableTransformed
+class DataModelTableReused(DataModelTableTransformed):
+    """A table data model which de-duplicates records in the database based on their hash value.
+    This table model is the default model to store XML nodes. n-n relationships with parent nodes \
+    are represented with an intermediate relationship table. Although more complicated than the \
+    duplicated version, this table model stores fewer records in the database.
+    """
+    is_reused = True
+    def build_sqlalchemy_tables(self):
+        """Build sqlalchemy table objects.
+        Build the sqlalchemy table objects based on table attributes for the main table, \
+        and relation tables to store n-n relationships, for target and temp tables \
+        (so it builds at least 2 tables if there are no relations).
+        The target table gets an auto-incremented primary key `pk_<name>`, while the temp \
+        table is keyed by `temp_pk_<name>` and carries a plain `pk_<name>` column plus a \
+        `temp_exists` flag, both of which are filled in when merging into the target table.
+        This method is intended to be called only once (if it is called more than once it \
+        will return immediately) and further changes to the table will not be updated.
+        """
+        # tables were already built by a previous call: nothing to do
+        if self.table is not None:
+            return
+        prefix = f"temp_{self.temp_prefix}_"
+        # build target table and n-n relations tables
+        # get_col is a generator which is consumed twice (once per table) because each
+        # sqlalchemy Table needs its own Column / constraint objects
+        def get_col(temp=False):
+            # only simple columns and n-1 relations contribute columns to the main table
+            # (n-n relations are handled by separate relation tables below)
+            for field_type, key, field in self.fields:
+                if field_type == "col" or field_type == "rel1":
+                    yield from field.get_sqlalchemy_column(temp)
+            # Root table is given additional integration metadata columns
+            if self.is_root_table:
+                yield Column("xml2db_input_file_path", String(256), nullable=False)
+                yield Column(
+                    "xml2db_processed_at", DateTime(timezone=True), nullable=False
+                )
+            # the record hash is what records are de-duplicated on, hence the unique constraint;
+            # the constraint name is prefixed for the temp table so that both names differ
+            yield Column("xml2db_record_hash", LargeBinary(20), nullable=False)
+            yield UniqueConstraint(
+                "xml2db_record_hash",
+                name=f"{prefix if temp else ''}{self.name}_xml2db_record_hash",
+            )
+        # build target table
+        self.table = Table(
+            self.name,
+            self.metadata,
+            Column(f"pk_{self.name}", Integer, primary_key=True, autoincrement=True),
+            # the primary key is clustered only when the table is not stored as columnstore,
+            # in which case the clustered index is the columnstore index added below
+            PrimaryKeyConstraint(
+                name=f"cx_pk_{self.name}",
+                mssql_clustered=not self.config["as_columnstore"],
+            ),
+            *get_col(),
+        )
+        # set columnstore index
+        if self.config["as_columnstore"]:
+            self.table.append_constraint(
+                Index(
+                    f"idx_{self.name}_columnstore",
+                    mssql_clustered=True,
+                    mssql_columnstore=True,
+                )
+            )
+        # build temporary table
+        self.temp_table = Table(
+            f"{prefix}{self.name}",
+            self.metadata,
+            # target primary key, not a key here: it is written back by the merge statements
+            Column(f"pk_{self.name}", Integer),
+            # the temp table's own primary key, not auto-incremented
+            Column(
+                f"temp_pk_{self.name}", Integer, primary_key=True, autoincrement=False
+            ),
+            *get_col(temp=True),
+            # flag set by the merge statements when the record hash already exists in the target table
+            Column("temp_exists", Boolean, default=False),
+        )
+        # build relation tables
+        for rel in self.relations_n.values():
+            rel.build_relation_tables()
+        # apply the db schema (if any) to the tables built above
+        self._set_db_schema()
+    def get_merge_temp_records_statements(self):
+        """Yield insert and update statements to merge temporary tables into target tables
+        This method yields SQL statements inserting the data of the temporary table (prefixed)
+        into the target table (unprefixed). It deals with primary keys and foreign keys by
+        first looking up existing records with the same hash in order to reuse already existing
+        records when the new record is identical.
+        Statements are yielded in this order: flag temp records whose hash already exists in the
+        target table, statements of n-1 relations, insert of the missing records, write back of
+        the target primary keys into the temp table, statements of n-n relations.
+        This method should not be called directly but through the save_db method in the Document
+        class, which will ensure that merge queries are issued in the correct order for all the
+        data flow, and which will encapsulate all queries in a transaction in order to rollback
+        changes on failure.
+        """
+        # find matching records hash in target table
+        yield self.temp_table.update().values(temp_exists=True).where(
+            getattr(  # noqa: Linter puzzled by ==
+                self.temp_table.c, "xml2db_record_hash"
+            )
+            == getattr(self.table.c, "xml2db_record_hash")
+        )
+        # update foreign keys for n-1 relations tables
+        for rel in self.relations_1.values():
+            yield from rel.get_merge_temp_records_statements()
+        # insert missing records from temp table to target
+        # columns to copy: everything but the temp-only columns ("temp_" prefix) and the
+        # target primary key, which is not part of the insert
+        cols = [
+            col_name
+            for col_name in self.temp_table.columns.keys()
+            if not col_name.startswith("temp_") and col_name != f"pk_{self.name}"
+        ]
+        # only records which were not flagged as already existing are inserted
+        sel = select(*[getattr(self.temp_table.c, col) for col in cols]).where(
+            self.temp_table.c.temp_exists
+            == False  # noqa: SQLAlchemy not supporting "is False"
+        )
+        yield self.table.insert().from_select(cols, sel)
+        # update primary keys back in temp table
+        # (matched on the record hash, so it covers both pre-existing and newly inserted records)
+        yield self.temp_table.update().values(
+            **{f"pk_{self.name}": getattr(self.table.c, f"pk_{self.name}")}
+        ).where(
+            getattr(  # noqa: Linter puzzled by ==
+                self.temp_table.c, "xml2db_record_hash"
+            )
+            == getattr(self.table.c, "xml2db_record_hash")
+        )
+        # update primary keys for n-n relations tables
+        for rel in self.relations_n.values():
+            yield from rel.get_merge_temp_records_statements()

xml2db/table/table.py ADDED Viewed

@@ -0,0 +1,356 @@
+from typing import Iterable, List, Any, Union, TYPE_CHECKING
+import logging
+import sqlalchemy
+from sqlalchemy import Table
+from sqlalchemy.schema import CreateTable, CreateIndex
+from xml2db.table.column import DataModelColumn
+from xml2db.table.relations import DataModelRelation1, DataModelRelationN
+from xml2db.exceptions import DataModelConfigError
+if TYPE_CHECKING:
+    from xml2db.model import DataModel
+logger = logging.getLogger(__name__)
+class DataModelTable:
+    """A class representing a database table translated from an XML schema complex type
+    :param table_name: the table's name
+    :param type_name: the XSD complex type name
+    :param is_root_table: is this table the root table?
+    :param is_virtual_node: was this table created to store multiple root elements?
+    :param metadata: :class:`sqlalchemy.Metadata` object to build sqlalchemy models into
+    :param config: model's configuration
+    :param db_schema: database schema to use
+    :param temp_prefix: temp prefix to use for naming temp tables
+    :param data_model: the `DataModel` instance
+    :ivar model_group: 'choice' or 'sequence', extracted from the XSD. 'choice' means that only one field \
+    can have a value at the same time
+    :ivar is_root_table: is this table the root table?
+    :ivar fields: a list of tuples describing all table fields, ordered, in the form (type, name, object) where \
+    type can be "col", "rel1" or "reln", name is the name of the column or relation, and object is the column \
+    or relationship object
+    :ivar columns: a dict of all columns (fields with simple values), keyed by field name
+    :ivar relations_1: a dict of 0-1 or 1-1 relations, keyed by field name
+    :ivar relations_n: a dict of 0-n or 1-n relations, keyed by field name
+    """
+    is_reused = None
+    def __init__(
+        self,
+        table_name: str,
+        type_name: str,
+        is_root_table: bool,
+        is_virtual_node: bool,
+        metadata: sqlalchemy.MetaData,
+        config: dict,
+        db_schema: str,
+        temp_prefix: str,
+        data_model: "DataModel",
+    ):
+        """Constructor method
+        Store the table's configuration, resolve the `as_columnstore` option and initialize
+        empty containers for fields and dependencies. No sqlalchemy table is built here.
+        :raises DataModelConfigError: if `as_columnstore` is provided in `config` and is not a bool
+        """
+        # config attributes
+        self.name = table_name
+        self.type_name = type_name
+        self.is_root_table = is_root_table
+        self.is_virtual_node = is_virtual_node
+        self.model_group = "sequence"
+        # NOTE(review): when a config dict is provided it is used as is (not copied), so the
+        # "as_columnstore" key set below is written into the caller's dict - confirm this is intended
+        self.config = {} if config is None else config
+        if "as_columnstore" in self.config:
+            if not isinstance(self.config["as_columnstore"], bool):
+                raise DataModelConfigError("as_columnstore must be a bool")
+            # the option is switched off (with a warning) when an engine is available and its
+            # dialect is not mssql; without an engine the dialect cannot be checked here
+            if (
+                self.config["as_columnstore"]
+                and data_model.engine
+                and not data_model.engine.dialect.name == "mssql"
+            ):
+                self.config["as_columnstore"] = False
+                logger.warning(
+                    "Clustered columnstore indexes are only supported with MS SQL Server database"
+                )
+        else:
+            # no table level value: fall back to the model level configuration
+            self.config["as_columnstore"] = data_model.model_config["as_columnstore"]
+        self.db_schema = db_schema
+        self.temp_prefix = temp_prefix
+        # fields (columns and relations)
+        self.fields = []
+        self.columns = {}
+        self.relations_1 = {}
+        self.relations_n = {}
+        # dependencies logic
+        self.is_simplified = False  # is the table already simplified ? (used in the simplification process)
+        self.parents_1 = (
+            set()
+        )  # a set of 1-1 relations the table is involved in as a child
+        self.parents_n = (
+            set()
+        )  # a set of 1-n relations the table is involved in as a child
+        self.parent = None
+        self.dependencies = (
+            set()
+        )  # a set of tables this table depends on (can be children or parents)
+        self.referenced_as_fk = False
+        # sqlalchemy objects
+        self.metadata = metadata
+        # table objects stay None until they are built
+        self.table = None
+        self.temp_table = None
+        self.data_model = data_model
+    def add_column(
+        self,
+        name: str,
+        data_type: str,
+        occurs: List[int],
+        min_length: int,
+        max_length: Union[int, None],
+        is_attr: bool,
+        is_content: bool,
+        allow_empty: bool,
+        ngroup: Union[str, None],
+    ) -> None:
+        """Helper to add a new column to the model
+        The column is registered in `columns` (keyed by name) and appended to the ordered
+        `fields` list as a ("col", name, column) tuple.
+        :param name: name of the column
+        :param data_type: data type
+        :param occurs: min and max occurrences
+        :param min_length: minimum length
+        :param max_length: maximum length
+        :param is_attr: is XML attribute or element?
+        :param is_content: is content of a mixed type element?
+        :param allow_empty: is nullable?
+        :param ngroup: a string id signaling that the column belongs to a nested sequence
+        """
+        # NOTE(review): adding the same name twice replaces the entry in `columns` but appends
+        # a second tuple to `fields` - callers are presumably expected to use unique names
+        self.columns[name] = DataModelColumn(
+            name,
+            [(name, None)],
+            data_type,
+            occurs,
+            min_length,
+            max_length,
+            is_attr,
+            is_content,
+            allow_empty,
+            ngroup,
+            self.config,
+            self.data_model,
+        )
+        self.fields.append(("col", name, self.columns[name]))
+    def add_relation_1(
+        self,
+        name: str,
+        other_table: "DataModelTable",
+        occurs: List[int],
+        ngroup: Union[str, None],
+    ) -> None:
+        """Helper to add a 1-to-1 relationship
+        The relation is registered in `relations_1` (keyed by name), appended to the ordered
+        `fields` list as a ("rel1", name, relation) tuple, and added to the `parents_1` set
+        of the child table.
+        :param name: name of the 1-1 relationship
+        :param other_table: the child table of the relationship
+        :param occurs: min and max occurs for this relationship
+        :param ngroup: a string id signaling that the relation belongs to a nested sequence
+        :raises ValueError: if max occurrences (`occurs[1]`) is not 1
+        """
+        if occurs[1] != 1:
+            raise ValueError(
+                "attempting to add a 1-1 relationship with max occurrences different from 1"
+            )
+        rel = DataModelRelation1(
+            name,
+            [(name, other_table.type_name)],
+            self,
+            other_table,
+            occurs,
+            ngroup,
+            self.data_model,
+        )
+        self.relations_1[name] = rel
+        self.fields.append(("rel1", name, rel))
+        # keep track of the relation on the child side too
+        other_table.parents_1.add(rel)
+    def add_relation_n(self, name, other_table, occurs, ngroup):
+        """Helper to add a 1-to-many relationship
+        The relation is registered in `relations_n` (keyed by name), appended to the ordered
+        `fields` list as a ("reln", name, relation) tuple, and added to the `parents_n` set
+        of the child table.
+        :param name: name of the 1-n relationship
+        :param other_table: the child table of the relationship
+        :param occurs: min and max occurs for this relationship
+        :param ngroup: a string id signaling that the relation belongs to a nested sequence
+        :raises ValueError: if max occurrences (`occurs[1]`) is equal to 1
+        """
+        if occurs[1] == 1:
+            raise ValueError(
+                "attempting to add a 1-n relationship with max occurrences equal to 1"
+            )
+        rel = DataModelRelationN(
+            name,
+            [(name, other_table.type_name)],
+            self,
+            other_table,
+            occurs,
+            ngroup,
+            self.data_model,
+        )
+        self.relations_n[name] = rel
+        self.fields.append(("reln", name, rel))
+        # keep track of the relation on the child side too
+        other_table.parents_n.add(rel)
+    def compute_dependencies(self) -> None:
+        """Compute the table's dependencies according to foreign keys relationships.
+        Dependencies are tables that the current table holds foreign keys relationships to (i.e. the ones which need
+        to exist before this one can be created, for instance). To compute the `dependencies` set, it ignores fk referenced
+        in relationship tables for n-n relationships. For `referenced_as_fk` it is more literal and includes those.
+        Dependencies are stored as type names (`type_name`), not as table objects.
+        This function should be called after schema simplification because dependencies will not \
+        be properly updated during the simplification process.
+        :raises ValueError: if a child table which is not reused already has a parent
+        """
+        # we drop parents information which is no longer accurate after schema simplification
+        self.parents_1 = None
+        self.parents_n = None
+        for field_type, rel_name, relation in self.fields:
+            if field_type == "rel1" or field_type == "reln":
+                # a table which is not reused can only be the child of a single parent table
+                if (
+                    relation.other_table.parent is not None
+                    and not relation.other_table.is_reused
+                ):
+                    raise ValueError(
+                        f"unsupported: table {relation.other_table.name} is not reused and has more than 1 parent"
+                    )
+                relation.other_table.parent = self
+                if relation.other_table.is_reused:
+                    # reused child: the parent (self) depends on the child
+                    self.dependencies.add(relation.other_table.type_name)
+                    relation.other_table.referenced_as_fk = True
+                    if (
+                        field_type == "reln"
+                    ):  # the relationship table will create a fk constraint to self
+                        self.referenced_as_fk = True
+                else:
+                    # child which is not reused: the child depends on the parent (self)
+                    relation.other_table.dependencies.add(self.type_name)
+                    self.referenced_as_fk = True
+    def _set_db_schema(self) -> None:
+        """Set db schema value for sqlalchemy tables objects
+        Does nothing unless a db schema is configured and both the target and the temp
+        tables have been built. Only the main tables are updated here, not relation tables.
+        """
+        if (
+            self.db_schema is not None
+            and self.table is not None
+            and self.temp_table is not None
+        ):
+            # sqlalchemy.Table.schema is the db_schema
+            self.table.schema = self.db_schema
+            self.temp_table.schema = self.db_schema
+    def get_create_table_statements(self, temp=False) -> Iterable[CreateTable]:
+        """Yield create table statements for the table and the rel tables
+        The statement for the main table is yielded first, followed by one statement for each
+        n-n relation which has a relation table.
+        :param temp: if True, yield create table statements for temporary tables (prefixed)
+        """
+        if temp:
+            yield CreateTable(self.temp_table)
+            for relation in self.relations_n.values():
+                # relations without a relation table are skipped
+                if relation.temp_rel_table is not None:
+                    yield CreateTable(relation.temp_rel_table)
+        else:
+            yield CreateTable(self.table)
+            for relation in self.relations_n.values():
+                # relations without a relation table are skipped
+                if relation.rel_table is not None:
+                    yield CreateTable(relation.rel_table)
+    def get_create_index_statements(self) -> Iterable[CreateIndex]:
+        """Yield create index statements for the indexes of the table and its relation tables
+        Only target (unprefixed) tables are covered, temporary tables are not. For each table,
+        statements are yielded sorted by index name.
+        """
+        def yield_indexes(table: Table) -> Iterable[CreateIndex]:
+            indexes = [index for index in table.indexes]
+            # Sort to guarantee indexes statements of a same table are printed in the same order everytime, otherwise
+            # the order is random, and it may create useless git changes in the output folder
+            indexes.sort(key=lambda index: index.name)
+            for index in indexes:
+                yield CreateIndex(index)
+        yield from yield_indexes(self.table)
+        for relation in self.relations_n.values():
+            # relations without a relation table are skipped
+            if relation.rel_table is not None:
+                yield from yield_indexes(relation.rel_table)
+    def create_tables(self, engine: sqlalchemy.engine.base.Engine, temp: bool = False):
+        """Create tables, either target tables or temp tables used to import data
+        The main table is created first, then the creation of relation tables is delegated to
+        each n-n relation.
+        :param engine: a sqlalchemy engine to use
+        :param temp: if True, create temporary (prefixed) tables
+        """
+        # checkfirst=True: the table is created only if it does not already exist
+        if temp:
+            self.temp_table.create(engine, checkfirst=True)
+        else:
+            self.table.create(engine, checkfirst=True)
+        for relation in self.relations_n.values():
+            relation.create_table(engine, temp)
+    def get_insert_temp_records_statements(self, data: dict) -> Iterable[Any]:
+        """Yield insert statements, along with their records, for temporary tables
+        Each yielded item is a tuple made of an insert statement and the list of records to
+        insert: first for the temporary main table, then for the temporary relation table of
+        each n-n relation which has records. Nothing is yielded if `data` is None or has no
+        records for the main table (relations records are not considered in this case).
+        :param data: a dict with a "records" key (records of the main table) and an optional \
+        "relations_n" key, which is a dict keyed by relation table name whose values are dicts \
+        with a "records" key
+        """
+        if data is not None and len(data["records"]) > 0:
+            yield self.temp_table.insert(), data["records"]
+            data_rel = data.get("relations_n", {})
+            for relation in self.relations_n.values():
+                # skip relations without data or with an empty list of records
+                if (
+                    relation.rel_table_name in data_rel
+                    and len(data_rel[relation.rel_table_name]["records"]) > 0
+                ):
+                    yield relation.temp_rel_table.insert(), data_rel[
+                        relation.rel_table_name
+                    ]["records"]
+    def drop_tables(self, engine: sqlalchemy.engine.base.Engine) -> None:
+        """Drop target (unprefixed) tables (main table and relations)
+        BE CAUTIOUS, THIS METHOD DROPS TABLES WITHOUT FURTHER NOTICE!
+        Relation tables are dropped before the main table. Tables which do not exist are
+        skipped (checkfirst=True).
+        :param engine: a sqlalchemy engine to use
+        """
+        # relation tables first - presumably because they hold foreign keys to the main table
+        for rel in self.relations_n.values():
+            if rel.rel_table is not None:
+                rel.rel_table.drop(engine, checkfirst=True)
+        self.table.drop(engine, checkfirst=True)
+    def drop_temp_tables(self, engine: sqlalchemy.engine.base.Engine) -> None:
+        """Drop temporary (prefixed) tables (main table and relations)
+        BE CAUTIOUS, THIS METHOD DROPS TABLES WITHOUT FURTHER NOTICE!
+        Temporary relation tables are dropped before the temporary main table. Tables which
+        do not exist are skipped (checkfirst=True).
+        :param engine: a sqlalchemy engine to use
+        """
+        # relation tables first, same order as for target tables
+        for rel in self.relations_n.values():
+            if rel.temp_rel_table is not None:
+                rel.temp_rel_table.drop(engine, checkfirst=True)
+        self.temp_table.drop(engine, checkfirst=True)
+    def get_entity_rel_diagram(self) -> List:
+        """Build ERD representation for a single table and its relationships
+        The string representation is used by mermaid.js to create a visual diagram.
+        Lines are, in this order: one line per 1-1 relation, one line per 1-n relation (the
+        relation name is suffixed with "*" when the child table is reused), then the entity
+        block listing the columns of the table (relations are not listed in the block).
+        Every line is indented with 4 spaces.
+        :return: a list of strings (lines)
+        """
+        out = (
+            # 1-1 relations: "o" marks an optional child (min occurs is 0), "|" a mandatory one
+            [
+                f"{self.name} ||--{'o' if rel.occurs[0] == 0 else '|'}| {rel.other_table.name} : "
+                f'"{rel.name}"'
+                for rel in self.relations_1.values()
+            ]
+            # 1-n relations: same as above with a "{" ending ("{{" is an escaped brace in a f-string)
+            + [
+                f"{self.name} ||--{'o' if rel.occurs[0] == 0 else '|'}{{ {rel.other_table.name} : "
+                f"\"{rel.name}{'*' if rel.other_table.is_reused else ''}\""
+                for rel in self.relations_n.values()
+            ]
+            # entity block: one "<type> <name>" line per column, in fields order
+            + [f"{self.name} {{"]
+            + [
+                (
+                    # "-N" is appended to the type when max occurs is None, and dots in the
+                    # column name are replaced with underscores
+                    f"    {self.columns[field[1]].data_type}{'-N' if self.columns[field[1]].occurs[1] is None else ''} "
+                    f"{field[1].replace('.', '_')}"
+                )
+                for field in self.fields
+                if field[0] == "col"
+            ]
+            + ["}"]
+        )
+        return [f"    {line}" for line in out]