PyPI - xml2db - Versions diffs - 0.12.2__tar.gz → 0.12.4__tar.gz - Mend

xml2db 0.12.2 (tar.gz) → 0.12.4 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

{xml2db-0.12.2/src/xml2db.egg-info → xml2db-0.12.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: xml2db
-Version: 0.12.2
+Version: 0.12.4
 Summary: Import complex XML files to a relational database
 Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
 Project-URL: Documentation, https://cre-dev.github.io/xml2db
@@ -13,11 +13,11 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: sqlalchemy>1.4
-Requires-Dist: xmlschema==3.3.2
-Requires-Dist: lxml==5.1.0
+Requires-Dist: xmlschema>=3.3.2
+Requires-Dist: lxml>=5.1.0
 Provides-Extra: docs
-Requires-Dist: mkdocs-material==9.5.34; extra == "docs"
-Requires-Dist: mkdocstrings-python==1.11.1; extra == "docs"
+Requires-Dist: mkdocs-material>=9.5.34; extra == "docs"
+Requires-Dist: mkdocstrings-python>=1.11.1; extra == "docs"
 Provides-Extra: tests
 Requires-Dist: pytest>=7.0; extra == "tests"

{xml2db-0.12.2 → xml2db-0.12.4}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "xml2db"
-version = "0.12.2"
+version = "0.12.4"
 authors = [
   { name="Commission de régulation de l'énergie", email="opensource@cre.fr" },
 ]
@@ -18,12 +18,12 @@ classifiers = [
 ]
 dependencies = [
     "sqlalchemy>1.4",
-    "xmlschema==3.3.2",
-    "lxml==5.1.0",
+    "xmlschema>=3.3.2",
+    "lxml>=5.1.0",
 ]
 [project.optional-dependencies]
-docs = ["mkdocs-material==9.5.34", "mkdocstrings-python==1.11.1"]
+docs = ["mkdocs-material>=9.5.34", "mkdocstrings-python>=1.11.1"]
 tests = ["pytest>=7.0"]
 [project.urls]

{xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/document.py RENAMED Viewed

@@ -171,17 +171,15 @@ class Document:
                     record["xml2db_row_number"] = row_number
             # build record from fields for columns and n-1 relations
-            for field_type, key, _ in model_table.fields:
+            for field_type, key, field in model_table.fields:
                 if field_type == "col":
-                    if key in content:
-                        if model_table.columns[key].data_type in ["decimal", "float"]:
-                            val = [float(v) for v in content[key]]
-                        elif model_table.columns[key].data_type == "integer":
-                            val = [int(v) for v in content[key]]
-                        elif model_table.columns[key].data_type == "boolean":
-                            val = [v == "true" or v == "1" for v in content[key]]
-                        else:
-                            val = content[key]
+                    content_key = (
+                        (f"{key[:-5]}__attr" if field.has_suffix else f"{key}__attr")
+                        if field.is_attr
+                        else key
+                    )
+                    if content_key in content:
+                        val = content[content_key]
                         if len(val) == 1:
                             record[key] = val[0]
@@ -320,25 +318,26 @@ class Document:
             record = data_index[node_type]["records"][node_pk]
             for field_type, rel_name, rel in tb.fields:
                 if field_type == "col" and record[rel_name] is not None:
-                    if rel.data_type in [
-                        "decimal",
-                        "float",
-                    ]:  # remove trailing ".0" for decimal and float
-                        content[rel_name] = [
-                            value.rstrip("0").rstrip(".") if "." in value else value
-                            for value in str(record[rel_name]).split(",")
-                        ]
-                    elif isinstance(record[rel_name], datetime.datetime):
-                        content[rel_name] = [
+                    content_key = (
+                        (
+                            f"{rel_name[:-5]}__attr"
+                            if rel.has_suffix
+                            else f"{rel_name}__attr"
+                        )
+                        if rel.is_attr
+                        else rel_name
+                    )
+                    if isinstance(record[rel_name], datetime.datetime):
+                        content[content_key] = [
                             record[rel_name].isoformat(timespec="milliseconds")
                         ]
                     else:
-                        content[rel_name] = (
+                        content[content_key] = (
                             list(csv.reader([str(record[rel_name])], escapechar="\\"))[
                                 0
                             ]
                             if "," in str(record[rel_name])
-                            else [str(record[rel_name])]
+                            else [record[rel_name]]
                         )
                 elif (
                     field_type == "rel1"

{xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/model.py RENAMED Viewed

@@ -70,7 +70,7 @@ class DataModel:
     def __init__(
         self,
         xsd_file: str,
-        short_name: str = None,
+        short_name: str = "DocumentRoot",
         long_name: str = None,
         base_url: str = None,
         model_config: dict = None,
@@ -226,8 +226,7 @@ class DataModel:
         """
         # parse the XML schema recursively and hold a reference to the head table
         root_table = self._parse_tree(
-            self.xml_schema[0] if len(self.xml_schema) == 1 else self.xml_schema,
-            is_root_table=True,
+            self.xml_schema[0] if len(self.xml_schema) == 1 else self.xml_schema
         )
         self.root_table = root_table.type_name
         # compute a text representation of the original data model and store it
@@ -273,9 +272,7 @@ class DataModel:
         for tb in self.fk_ordered_tables:
             tb.build_sqlalchemy_tables()
-    def _parse_tree(
-        self, parent_node: xmlschema.XsdElement, is_root_table: bool = False
-    ):
+    def _parse_tree(self, parent_node: xmlschema.XsdElement, nodes_path: list = None):
         """Parse a node of an XML schema recursively and create a target data model without any simplification
         We parse the XSD tree recursively to create for each node (basically a complex type in the XSD) an equivalent \
@@ -289,7 +286,7 @@ class DataModel:
         Args:
             parent_node: the current XSD node being parsed
-            is_root_table: True if this is the root table
+            nodes_path: a list of nodes types from the root node
         """
         # find current node type and name and returns corresponding table if it already exists
@@ -301,12 +298,16 @@ class DataModel:
         if parent_type is None:
             parent_type = parent_node.local_name
+        nodes_path = (nodes_path if nodes_path else []) + [parent_type]
         # if this type has already been encountered, stop here and return existing table
         if parent_type in self.tables:
             parent_table = self.tables[parent_type]
             return parent_table
-        # elements names and types should be bijective. If an element name is used for different types,
+        # For database tables we use element names rather than XSD types, under the assumption that they are often
+        # more meaningful given that they are the one which appear in XML documents. However, same names can be used
+        # for different XSD types, so if an element name is used for different types,
         # we add a suffix to the name to make it unique again (using a dict to keep the name/type association)
         parent_name = (
             parent_node.local_name
@@ -324,7 +325,7 @@ class DataModel:
         parent_table = self._create_table_model(
             parent_name,
             parent_type,
-            is_root_table,
+            len(nodes_path) == 1,
             isinstance(parent_node, xmlschema.XMLSchema),
         )
         self.tables[parent_type] = parent_table
@@ -363,6 +364,13 @@ class DataModel:
                     if elem_type.base_type
                     else recurse_parse_simple_type(elem_type.member_types)
                 )
+            if elem_type.is_list():
+                return (
+                    "string",
+                    0,
+                    None,
+                    elem_type.allow_empty,
+                )
             if elem_type.is_restriction():
                 dt = elem_type.base_type.local_name
                 mil = elem_type.min_length
@@ -384,7 +392,12 @@ class DataModel:
                         else None
                     )
                     ae = ae and bt_ae if ae is not None and bt_ae is not None else None
-                if elem_type.enumeration is not None:
+                if elem_type.enumeration is not None and dt in [
+                    "string",
+                    "NMTOKEN",
+                    "duration",
+                    "token",
+                ]:
                     mil = min([len(val) for val in elem_type.enumeration])
                     mal = max([len(val) for val in elem_type.enumeration])
                 return dt, mil, mal, ae
@@ -410,25 +423,31 @@ class DataModel:
                 ),
             ]
-        # go through item attributes and add them as columns
+        # go through item attributes and add them as columns, adding a suffix if an element with the same name exists
+        children_names = None
         for attrib_name, attrib in parent_node.attributes.items():
+            if children_names is None:
+                children_names = [child.local_name for child in parent_node]
             (
                 data_type,
                 min_length,
                 max_length,
                 allow_empty,
             ) = recurse_parse_simple_type([attrib.type])
+            suffix = attrib_name in children_names
             parent_table.add_column(
-                f"{attrib_name}",
+                f"{attrib_name}{'_attr' if suffix else ''}",
                 data_type,
                 [0, 1],
                 min_length,
                 max_length,
                 True,
+                suffix,
                 False,
                 allow_empty,
                 None,
             )
         nested_containers = []
         # go through the children to add either arguments either relations to the current element
         for child in parent_node:
@@ -454,6 +473,7 @@ class DataModel:
                                 if child.parent
                                 and child.parent.max_occurs != 1
                                 and child.parent.model != "choice"
+                                and child.max_occurs == 1
                                 else None
                             ),
                         )
@@ -482,32 +502,39 @@ class DataModel:
                         max_length,
                         False,
                         False,
+                        False,
                         allow_empty,
                         nested_containers[-1][1],
                     )
                 elif ct.is_complex():
-                    child_table = self._parse_tree(child)
-                    child_table.model_group = (
-                        "choice"
-                        if ct.model_group and ct.model_group.model == "choice"
-                        else "sequence"
-                    )
-                    occurs = get_occurs(child)
-                    if child.is_single():
-                        parent_table.add_relation_1(
-                            child.local_name,
-                            child_table,
-                            occurs,
-                            nested_containers[-1][1],
+                    # ignoring recursive definitions by skipping these fields
+                    if child.type.local_name in nodes_path:
+                        logger.warning(
+                            f"type '{child.type.local_name}' contains a recursive definition"
                         )
                     else:
-                        parent_table.add_relation_n(
-                            child.local_name,
-                            child_table,
-                            occurs,
-                            nested_containers[-1][1],
+                        child_table = self._parse_tree(child, nodes_path)
+                        child_table.model_group = (
+                            "choice"
+                            if ct.model_group and ct.model_group.model == "choice"
+                            else "sequence"
                         )
+                        occurs = get_occurs(child)
+                        if occurs[1] == 1:
+                            parent_table.add_relation_1(
+                                child.local_name,
+                                child_table,
+                                occurs,
+                                nested_containers[-1][1],
+                            )
+                        else:
+                            parent_table.add_relation_n(
+                                child.local_name,
+                                child_table,
+                                occurs,
+                                nested_containers[-1][1],
+                            )
                 else:
                     raise ValueError("unknown case; please check")
             else:
@@ -534,6 +561,7 @@ class DataModel:
                 min_length,
                 max_length,
                 False,
+                False,
                 True,
                 allow_empty,
                 None,
@@ -544,31 +572,19 @@ class DataModel:
     def _repr_tree(
         self,
         parent_table: Union[DataModelTableReused, DataModelTableDuplicated],
-        visited_nodes: Union[set, None] = None,
     ):
         """Build a text representation of the data model tree
         Args:
             parent_table: the current data model table object
         """
-        if visited_nodes is None:
-            visited_nodes = set()
-        else:
-            visited_nodes = {item for item in visited_nodes}
-        visited_nodes.add(parent_table.name)
         for field_type, name, field in parent_table.fields:
             if field_type == "col":
                 yield f"{field.name}{field.occurs}: {field.data_type}"
-            elif field_type == "rel1":
-                mg = " (choice)" if field.other_table.model_group == "choice" else ""
-                yield f"{field.name}{field.occurs}{mg}:{' ...' if field_type in visited_nodes else ''}"
-                if field.other_table.name not in visited_nodes:
-                    for line in self._repr_tree(field.other_table, visited_nodes):
-                        yield f"    {line}"
-            elif field_type == "reln":
+            else:
                 mg = " (choice)" if field.other_table.model_group == "choice" else ""
-                yield f"{field.name}{field.occurs}{mg}:{' ...' if field_type in visited_nodes else ''}"
-                for line in self._repr_tree(field.other_table, visited_nodes):
+                yield f"{field.name}{field.occurs}{mg}:"
+                for line in self._repr_tree(field.other_table):
                     yield f"    {line}"
     def get_entity_rel_diagram(self, text_context: bool = True) -> str:

{xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/table/column.py RENAMED Viewed

@@ -32,15 +32,22 @@ def types_mapping_default(temp: bool, col: "DataModelColumn") -> Any:
     """
     if col.occurs[1] != 1:
         return String(8000)
-    if col.data_type in ["decimal", "float"]:
+    if col.data_type in ["decimal", "float", "double"]:
         return Double
     if col.data_type == "dateTime":
         return DateTime(timezone=True)
-    if col.data_type == "integer" or col.data_type == "int":
+    if col.data_type in [
+        "integer",
+        "int",
+        "nonPositiveInteger",
+        "nonNegativeInteger",
+        "positiveInteger",
+        "negativeInteger",
+    ]:
         return Integer
     if col.data_type == "boolean":
         return Boolean
-    if col.data_type == "byte":
+    if col.data_type in ["short", "byte"]:
         return SmallInteger
     if col.data_type == "long":
         return BigInteger
@@ -77,20 +84,10 @@ def types_mapping_mssql(temp: bool, col: "DataModelColumn") -> Any:
     """
     if col.occurs[1] != 1:
         return mssql.VARCHAR(8000)
-    if col.data_type in ["decimal", "float"]:
-        return Double
     if col.data_type == "dateTime":
         # using the DATETIMEOFFSET directly in the temporary table caused issues when inserting data in the target
         # table with INSERT INTO SELECT converts datetime VARCHAR to DATETIMEOFFSET without errors
         return mssql.VARCHAR(100) if temp else mssql.DATETIMEOFFSET
-    if col.data_type == "integer" or col.data_type == "int":
-        return Integer
-    if col.data_type == "boolean":
-        return Boolean
-    if col.data_type == "byte":
-        return SmallInteger
-    if col.data_type == "long":
-        return BigInteger
     if col.data_type == "date":
         return mssql.VARCHAR(16)
     if col.data_type == "time":
@@ -106,12 +103,7 @@ def types_mapping_mssql(temp: bool, col: "DataModelColumn") -> Any:
         if col.max_length == col.min_length:
             return mssql.BINARY(col.max_length)
         return mssql.VARBINARY(col.max_length)
-    else:
-        logger.warning(
-            f"unknown type '{col.data_type}' for column '{col.name}', defaulting to VARCHAR(1000) "
-            f"(this can be overridden by providing a field type in the configuration)"
-        )
-        return mssql.VARCHAR(1000)
+    return types_mapping_default(temp, col)
 def types_mapping_mysql(temp: bool, col: "DataModelColumn") -> Any:
@@ -167,6 +159,7 @@ class DataModelColumn:
         min_length: int,
         max_length: Union[int, None],
         is_attr: bool,
+        has_suffix: bool,
         is_content: bool,
         allow_empty: bool,
         ngroup: Union[int, None],
@@ -181,6 +174,7 @@ class DataModelColumn:
         self.min_length = min_length
         self.max_length = max_length
         self.is_attr = is_attr
+        self.has_suffix = has_suffix
         self.is_content = is_content
         self.allow_empty = allow_empty
         self.ngroup = ngroup

{xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/table/reused_table.py RENAMED Viewed

@@ -71,6 +71,7 @@ class DataModelTableReused(DataModelTableTransformed):
                 False,
                 False,
                 False,
+                False,
                 None,
                 self.config,
                 self.data_model,

{xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/table/table.py RENAMED Viewed

@@ -130,6 +130,7 @@ class DataModelTable:
         min_length: int,
         max_length: Union[int, None],
         is_attr: bool,
+        has_suffix: bool,
         is_content: bool,
         allow_empty: bool,
         ngroup: Union[str, None],
@@ -143,6 +144,7 @@ class DataModelTable:
             min_length: minimum length
             max_length: maximum length
             is_attr: is XML attribute or element?
+            has_suffix: for an attribute, do we need the '_attr' suffix?
             is_content: is content of a mixed type element?
             allow_empty: is nullable?
             ngroup: a string id signaling that the column belongs to a nested sequence
@@ -155,6 +157,7 @@ class DataModelTable:
             min_length,
             max_length,
             is_attr,
+            has_suffix,
             is_content,
             allow_empty,
             ngroup,

{xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/table/transformed_table.py RENAMED Viewed

@@ -76,6 +76,7 @@ class DataModelTableTransformed(DataModelTable):
                 False,
                 False,
                 False,
+                False,
                 None,
                 self.config,
                 self.data_model,
@@ -89,6 +90,7 @@ class DataModelTableTransformed(DataModelTable):
                 max(max_lengths) if all(e is not None for e in max_lengths) else None,
                 False,
                 False,
+                False,
                 any(allow_empty),
                 None,
                 self.config,
@@ -193,6 +195,7 @@ class DataModelTableTransformed(DataModelTable):
                     child_field.min_length,
                     child_field.max_length,
                     child_field.is_attr,
+                    child_field.has_suffix,
                     child_field.is_content,
                     child_field.allow_empty,
                     child_field.ngroup,
@@ -276,9 +279,12 @@ class DataModelTableTransformed(DataModelTable):
         # if the table can be transformed, stop here
         if self._is_table_choice_transform_applicable():
+            fields_transform = {}
+            for col in self.columns.values():
+                fields_transform[(self.type_name, col.name)] = (None, "join")
             self._transform_to_choice()
             self.is_simplified = True
-            return {self.type_name: "choice"}, {}
+            return {self.type_name: "choice"}, fields_transform
         # loop through field to transform them if need be
         out_fields = []

{xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/xml_converter.py RENAMED Viewed

@@ -128,31 +128,36 @@ class XMLConverter:
                 key
                 != "{http://www.w3.org/2001/XMLSchema-instance}noNamespaceSchemaLocation"
             ):
-                content[key] = [val]
+                content[f"{key}__attr"] = [val.strip() if val.strip() else val]
         if node.text and node.text.strip():
             content["value"] = [node.text.strip()]
         for element in node.iterchildren():
-            key = element.tag.split("}")[1] if "}" in element.tag else element.tag
-            node_type_key = (node_type, key)
-            value = None
-            if element.text and element.text.strip():
-                value = element.text
-            transform = self.model.fields_transforms.get(node_type_key, (None, "join"))[
-                1
-            ]
-            if transform != "join":
-                value = self._parse_xml_node(
-                    self.model.fields_transforms[node_type_key][0],
-                    element,
-                    transform not in ["elevate", "elevate_wo_prefix"],
-                    hash_maps,
-                )
-            if key in content:
-                content[key].append(value)
-            else:
-                content[key] = [value]
+            if isinstance(element.tag, str):
+                key = element.tag.split("}")[1] if "}" in element.tag else element.tag
+                node_type_key = (node_type, key)
+                value = None
+                if element.text:
+                    value = (
+                        element.text.strip() if element.text.strip() else element.text
+                    )
+                if node_type_key not in self.model.fields_transforms:
+                    # skip the node if it is not in the data model
+                    continue
+                transform = self.model.fields_transforms[node_type_key][1]
+                if transform != "join":
+                    value = self._parse_xml_node(
+                        self.model.fields_transforms[node_type_key][0],
+                        element,
+                        transform not in ["elevate", "elevate_wo_prefix"],
+                        hash_maps,
+                    )
+                if value is not None:
+                    if key in content:
+                        content[key].append(value)
+                    else:
+                        content[key] = [value]
         node = self._transform_node(node_type, content)
@@ -189,6 +194,7 @@ class XMLConverter:
         hash_maps = {}
         joined_values = False
+        skipped_nodes = 0
         for event, element in etree.iterparse(
             xml_file,
             recover=recover,
@@ -196,12 +202,17 @@ class XMLConverter:
             remove_blank_text=True,
         ):
             key = element.tag.split("}")[1] if "}" in element.tag else element.tag
-            if event == "start":
+            if event == "start" and skipped_nodes > 0:
+                skipped_nodes += 1
+            elif event == "start":
                 if nodes_stack[-1][0]:
                     node_type_key = (nodes_stack[-1][0], key)
-                    node_type, transform = self.model.fields_transforms.get(
-                        node_type_key, (None, "join")
-                    )
+                    if node_type_key not in self.model.fields_transforms:
+                        skipped_nodes += 1
+                        continue
+                    node_type, transform = self.model.fields_transforms[node_type_key]
                 else:
                     node_type, transform = self.model.root_table, None
                 joined_values = transform == "join"
@@ -212,28 +223,41 @@ class XMLConverter:
                             attrib_key
                             != "{http://www.w3.org/2001/XMLSchema-instance}noNamespaceSchemaLocation"
                         ):
-                            content[attrib_key] = [attrib_val]
+                            content[f"{attrib_key}__attr"] = [
+                                attrib_val.strip() if attrib_val.strip() else attrib_val
+                            ]
                     nodes_stack.append((node_type, content))
+            elif event == "end" and skipped_nodes > 0:
+                skipped_nodes -= 1
             elif event == "end":
-                # joined_values was set with the previous "start" event just before
+                # joined_values was set with the previous "start" event just before and corresponds to lists of simple
+                # type elements
                 if joined_values:
+                    value = None
                     if element.text:
-                        if key in nodes_stack[-1][1]:
-                            nodes_stack[-1][1][key].append(element.text)
+                        if element.text.strip():
+                            value = element.text.strip()
                         else:
-                            nodes_stack[-1][1][key] = [element.text]
+                            value = element.text
+                    if key in nodes_stack[-1][1]:
+                        nodes_stack[-1][1][key].append(value)
+                    else:
+                        nodes_stack[-1][1][key] = [value]
+                # else, we have completed a complex type node
                 else:
                     node = nodes_stack.pop()
                     if nodes_stack[-1][0]:
                         node_type_key = (nodes_stack[-1][0], key)
-                        node_type, transform = self.model.fields_transforms.get(
-                            node_type_key, (None, "join")
-                        )
+                        node_type, transform = self.model.fields_transforms[
+                            node_type_key
+                        ]
                     else:
                         node_type, transform = self.model.root_table, None
-                    if element.text:
-                        node[1]["value"] = [element.text]
+                    if element.text and element.text.strip():
+                        node[1]["value"] = [element.text.strip()]
                     node = self._transform_node(*node)
                     if transform not in ["elevate", "elevate_wo_prefix"]:
                         node = self._compute_hash_deduplicate(node, hash_maps)
@@ -278,6 +302,39 @@ class XMLConverter:
                 child_key, val = list(content.items())[0]
                 content = {"type": [child_key], "value": val}
+        # convert some simple types to python types
+        if node_type in self.model.tables:
+            table = self.model.tables[node_type]
+            for key in table.columns:
+                content_key = (
+                    (
+                        f"{key[:-5]}__attr"
+                        if table.columns[key].has_suffix
+                        else f"{key}__attr"
+                    )
+                    if table.columns[key].is_attr
+                    else key
+                )
+                if content_key in content:
+                    if table.columns[key].data_type in ["decimal", "float"]:
+                        content[content_key] = [float(v) for v in content[content_key]]
+                    elif table.columns[key].data_type in [
+                        "integer",
+                        "int",
+                        "nonPositiveInteger",
+                        "nonNegativeInteger",
+                        "positiveInteger",
+                        "negativeInteger",
+                        "short",
+                        "byte",
+                        "long",
+                    ]:
+                        content[content_key] = [int(v) for v in content[content_key]]
+                    elif table.columns[key].data_type == "boolean":
+                        content[content_key] = [
+                            v == "true" or v == "1" for v in content[content_key]
+                        ]
         return node_type, content
     def _compute_hash_deduplicate(self, node: tuple, hash_maps: dict) -> tuple:
@@ -292,12 +349,28 @@ class XMLConverter:
             A tuple of (node_type, content, hash) representing a node after deduplication
         """
         node_type, content = node
+        if node_type not in self.model.tables:
+            return "", None, b""
         table = self.model.tables[node_type]
         h = self.model.model_config["record_hash_constructor"]()
-        for field_type, name, _ in table.fields:
+        for field_type, name, field in table.fields:
             if field_type == "col":
-                h.update(str(content.get(name, None)).encode("utf-8"))
+                if field.is_attr:
+                    h.update(
+                        str(
+                            content.get(
+                                (
+                                    f"{name[:-5]}__attr"
+                                    if field.has_suffix
+                                    else f"{name}__attr"
+                                ),
+                                None,
+                            )
+                        ).encode("utf-8")
+                    )
+                else:
+                    h.update(str(content.get(name, None)).encode("utf-8"))
             elif field_type == "rel1":
                 h.update(content[name][0][2] if name in content else b"")
             elif field_type == "reln":
@@ -419,13 +492,37 @@ class XMLConverter:
             attributes = {}
             text_content = None
             if field_type == "col":
-                if rel_name in content:
+                content_key = (
+                    (
+                        f"{rel_name[:-5]}__attr"
+                        if rel.has_suffix
+                        else f"{rel_name}__attr"
+                    )
+                    if rel.is_attr
+                    else rel_name
+                )
+                if content_key in content:
+                    if rel.data_type in [
+                        "decimal",
+                        "float",
+                    ]:  # remove trailing ".0" for decimal and float
+                        val = str(content[content_key][0])
+                        val = [val.rstrip("0").rstrip(".") if "." in val else val]
+                    elif isinstance(content[content_key][0], datetime):
+                        val = [
+                            content[content_key][0].isoformat(timespec="milliseconds")
+                        ]
+                    else:
+                        val = content[content_key]
                     if rel.is_attr:
-                        attributes[rel.name_chain[-1][0]] = content[rel_name][0]
+                        if rel.has_suffix:
+                            attributes[rel.name_chain[-1][0][:-5]] = val[0]
+                        else:
+                            attributes[rel.name_chain[-1][0]] = val[0]
                     elif rel.is_content:
-                        text_content = content[rel_name][0]
+                        text_content = val[0]
                     else:
-                        for field_value in content[rel_name]:
+                        for field_value in val:
                             child = etree.Element(rel.name_chain[-1][0])
                             if isinstance(field_value, datetime):
                                 field_value = field_value.isoformat()
@@ -446,7 +543,8 @@ class XMLConverter:
             if prev_ngroup and rel.ngroup != prev_ngroup:
                 for ngroup_children in zip_longest(*ngroup_stack):
                     for child in ngroup_children:
-                        nodes_stack[-1][1].append(child)
+                        if child is not None:
+                            nodes_stack[-1][1].append(child)
                 ngroup_stack = []
             prev_ngroup = rel.ngroup
             if len(children) > 0:

{xml2db-0.12.2 → xml2db-0.12.4/src/xml2db.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: xml2db
-Version: 0.12.2
+Version: 0.12.4
 Summary: Import complex XML files to a relational database
 Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
 Project-URL: Documentation, https://cre-dev.github.io/xml2db
@@ -13,11 +13,11 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: sqlalchemy>1.4
-Requires-Dist: xmlschema==3.3.2
-Requires-Dist: lxml==5.1.0
+Requires-Dist: xmlschema>=3.3.2
+Requires-Dist: lxml>=5.1.0
 Provides-Extra: docs
-Requires-Dist: mkdocs-material==9.5.34; extra == "docs"
-Requires-Dist: mkdocstrings-python==1.11.1; extra == "docs"
+Requires-Dist: mkdocs-material>=9.5.34; extra == "docs"
+Requires-Dist: mkdocstrings-python>=1.11.1; extra == "docs"
 Provides-Extra: tests
 Requires-Dist: pytest>=7.0; extra == "tests"

xml2db-0.12.4/src/xml2db.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,10 @@
+sqlalchemy>1.4
+xmlschema>=3.3.2
+lxml>=5.1.0
+[docs]
+mkdocs-material>=9.5.34
+mkdocstrings-python>=1.11.1
+[tests]
+pytest>=7.0

{xml2db-0.12.2 → xml2db-0.12.4}/tests/test_conversions.py RENAMED Viewed

@@ -1,10 +1,12 @@
 import os
+import pprint
 import pytest
 from lxml import etree
 from xml2db import DataModel
 from xml2db.xml_converter import XMLConverter, remove_record_hash
+from .conftest import list_xml_path, models_path
 from .sample_models import models
@@ -13,19 +15,20 @@ from .sample_models import models
     [
         {**model, **version, "xml_file": xml_file}
         for model in models
-        for xml_file in os.listdir(model["xml_path"])
+        for xml_file in list_xml_path(model, "xml")
+        + list_xml_path(model, "equivalent_xml")
         for version in model["versions"]
     ],
 )
-def test_document_tree_parsing(test_config):
+def test_iterative_recursive_parsing(test_config):
     """Test whether iterative and recursive parsing give same results"""
     model = DataModel(
-        test_config["xsd_path"],
+        str(os.path.join(models_path, test_config["id"], test_config["xsd"])),
         short_name=test_config["id"],
         model_config=test_config["config"],
     )
     converter = XMLConverter(model)
-    file_path = os.path.join(test_config["xml_path"], test_config["xml_file"])
+    file_path = test_config["xml_file"]
     parsed_recursive = converter.parse_xml(
         file_path, file_path, skip_validation=True, iterparse=False
@@ -42,7 +45,7 @@ def test_document_tree_parsing(test_config):
     [
         {**model, **version, "xml_file": xml_file}
         for model in models
-        for xml_file in os.listdir(model["xml_path"])
+        for xml_file in list_xml_path(model, "xml")
         for version in model["versions"]
     ],
 )
@@ -50,22 +53,22 @@ def test_document_tree_to_flat_data(test_config):
     """A test for document tree to flat data conversion and back"""
     model = DataModel(
-        test_config["xsd_path"],
+        str(os.path.join(models_path, test_config["id"], test_config["xsd"])),
         short_name=test_config["id"],
         model_config=test_config["config"],
     )
     converter = XMLConverter(model)
-    file_path = os.path.join(test_config["xml_path"], test_config["xml_file"])
+    file_path = test_config["xml_file"]
     # parse XML to document tree
     converter.parse_xml(file_path, file_path)
-    exp_doc_tree = remove_record_hash(converter.document_tree)
+    exp_doc_tree = pprint.pformat(remove_record_hash(converter.document_tree))
     # parse XML to document tree and then flat data model
     doc = model.parse_xml(file_path)
     # and convert it back to document tree
-    act_doc_tree = doc.flat_data_to_doc_tree()
+    act_doc_tree = pprint.pformat(doc.flat_data_to_doc_tree())
     assert act_doc_tree == exp_doc_tree
@@ -75,7 +78,7 @@ def test_document_tree_to_flat_data(test_config):
     [
         {**model, **version, "xml_file": xml_file}
         for model in models
-        for xml_file in os.listdir(model["xml_path"])
+        for xml_file in list_xml_path(model, "xml")
         for version in model["versions"]
     ],
 )
@@ -83,13 +86,13 @@ def test_document_tree_to_xml(test_config):
     """A test for document tree to xml conversion and back"""
     model = DataModel(
-        test_config["xsd_path"],
+        str(os.path.join(models_path, test_config["id"], test_config["xsd"])),
         short_name=test_config["id"],
         model_config=test_config["config"],
     )
     converter = XMLConverter(model)
-    file_path = os.path.join(test_config["xml_path"], test_config["xml_file"])
+    file_path = test_config["xml_file"]
     # parse XML to document tree
     converter.parse_xml(file_path, file_path)
@@ -112,3 +115,29 @@ def test_document_tree_to_xml(test_config):
         ref_xml = f.read()
     assert xml == ref_xml
+@pytest.mark.parametrize(
+    "test_config",
+    [
+        {**model, **version}
+        for model in models
+        for version in model["versions"]
+        if os.path.isdir(os.path.join(models_path, model["id"], "equivalent_xml"))
+    ],
+)
+def test_equivalent_xml(test_config):
+    """A test for xml documents which should result in the same extracted data"""
+    xml_files = list_xml_path(test_config, "equivalent_xml")
+    if len(xml_files) > 1:
+        model = DataModel(
+            str(os.path.join(models_path, test_config["id"], test_config["xsd"])),
+            short_name=test_config["id"],
+            model_config=test_config["config"],
+        )
+        ref_data = model.parse_xml(xml_files[0])
+        for xml_file in xml_files[1:]:
+            equ_data = model.parse_xml(xml_file)
+            assert ref_data.data == equ_data.data

{xml2db-0.12.2 → xml2db-0.12.4}/tests/test_models_output.py RENAMED Viewed

@@ -5,6 +5,7 @@ from sqlalchemy.dialects import postgresql, mssql, mysql
 from xml2db import DataModel
 from .sample_models import models
+from .conftest import models_path
 @pytest.mark.parametrize(
@@ -19,14 +20,15 @@ def test_model_erd(test_config):
     """A test to check if generated ERD matches saved output"""
     model = DataModel(
-        test_config["xsd_path"],
+        str(os.path.join(models_path, test_config["id"], test_config["xsd"])),
         short_name=test_config["id"],
         model_config=test_config["config"],
     )
     expected = open(
         os.path.join(
-            os.path.dirname(test_config["xsd_path"]),
+            models_path,
+            test_config["id"],
             f"{test_config['id']}_erd_version{test_config['version_id']}.md",
         ),
         "r",
@@ -49,7 +51,7 @@ def test_model_ddl(test_config):
     """A test to check if generated SQL DDL matches saved output"""
     model = DataModel(
-        test_config["xsd_path"],
+        str(os.path.join(models_path, test_config["id"], test_config["xsd"])),
         short_name=test_config["id"],
         model_config=test_config["config"],
         db_type=test_config["dialect"].name,
@@ -57,7 +59,8 @@ def test_model_ddl(test_config):
     expected = open(
         os.path.join(
-            os.path.dirname(test_config["xsd_path"]),
+            models_path,
+            test_config["id"],
             f"{test_config['id']}_ddl_{test_config['dialect'].name}_version{test_config['version_id']}.sql",
         ),
         "r",

{xml2db-0.12.2 → xml2db-0.12.4}/tests/test_roundtrip.py RENAMED Viewed

@@ -4,7 +4,7 @@ import pytest
 from lxml import etree
 from xml2db.xml_converter import XMLConverter, remove_record_hash
-from .fixtures import setup_db_model, conn_string
+from .conftest import list_xml_path
 from .sample_models import models
@@ -17,10 +17,7 @@ def test_database_xml_roundtrip(setup_db_model, model_config):
     """A test for roundtrip insert to the database from and to XML"""
     model = setup_db_model
-    xml_files = [
-        os.path.join(model_config["xml_path"], file)
-        for file in os.listdir(model_config["xml_path"])
-    ]
+    xml_files = list_xml_path(model_config, "xml")
     for file in xml_files:
         # do parse and insert into the database
@@ -59,10 +56,7 @@ def test_database_document_tree_roundtrip(setup_db_model, model_config):
     """A test for roundtrip insert to the database from and to document tree"""
     model = setup_db_model
-    xml_files = [
-        os.path.join(model_config["xml_path"], file)
-        for file in os.listdir(model_config["xml_path"])
-    ]
+    xml_files = list_xml_path(model_config, "xml")
     for file in xml_files:
         # do parse and insert into the database
@@ -92,10 +86,7 @@ def test_database_document_tree_roundtrip_single_load(setup_db_model, model_conf
     """A test for roundtrip insert to the database from and to document tree"""
     model = setup_db_model
-    xml_files = [
-        os.path.join(model_config["xml_path"], file)
-        for file in os.listdir(model_config["xml_path"])
-    ]
+    xml_files = list_xml_path(model_config, "xml")
     flat_data = None
     doc = None
@@ -129,7 +120,7 @@ def test_database_document_tree_roundtrip_single_load(setup_db_model, model_conf
     [
         {**model, **version, "xml_file": xml_file}
         for model in models
-        for xml_file in os.listdir(model["xml_path"])
+        for xml_file in list_xml_path(model, "xml")
         for version in model["versions"]
     ],
 )

{xml2db-0.12.2 → xml2db-0.12.4}/tests/test_validation.py RENAMED Viewed

@@ -1,10 +1,10 @@
-import xml.etree.ElementTree
 import lxml.etree
 import pytest
+import os
 from xml2db import DataModel
 from .sample_models import models
+from .conftest import models_path
 @pytest.mark.parametrize(
@@ -27,7 +27,9 @@ from .sample_models import models
 def test_invalid_xml(args: tuple):
     file_name, iterparse, recover, exception = args
-    data_model = DataModel(models[0]["xsd_path"])
+    data_model = DataModel(
+        str(os.path.join(models_path, models[0]["id"], models[0]["xsd"]))
+    )
     if exception is None:
         data_model.parse_xml(
@@ -49,8 +51,8 @@ def test_invalid_xml(args: tuple):
 @pytest.mark.parametrize(
     "args",
     [
-        ("invalid", True, False, IndexError),
-        ("invalid", True, True, IndexError),
+        ("invalid", True, False, None),
+        ("invalid", True, True, None),
         ("invalid", False, False, None),
         ("invalid", False, True, None),
         ("malformed_recover", True, False, lxml.etree.XMLSyntaxError),
@@ -58,7 +60,7 @@ def test_invalid_xml(args: tuple):
         ("malformed_recover", False, False, lxml.etree.XMLSyntaxError),
         ("malformed_recover", False, True, None),
         ("malformed_no_recover", True, False, lxml.etree.XMLSyntaxError),
-        ("malformed_no_recover", True, True, IndexError),
+        ("malformed_no_recover", True, True, None),
         ("malformed_no_recover", False, False, lxml.etree.XMLSyntaxError),
         ("malformed_no_recover", False, True, None),
     ],
@@ -66,7 +68,9 @@ def test_invalid_xml(args: tuple):
 def test_invalid_xml_skip_verify(args: tuple):
     file_name, iterparse, recover, exception = args
-    data_model = DataModel(models[0]["xsd_path"])
+    data_model = DataModel(
+        str(os.path.join(models_path, models[0]["id"], models[0]["xsd"]))
+    )
     if exception is None:
         data_model.parse_xml(