xml2db 0.12.2__tar.gz → 0.12.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {xml2db-0.12.2/src/xml2db.egg-info → xml2db-0.12.4}/PKG-INFO +6 -6
  2. {xml2db-0.12.2 → xml2db-0.12.4}/pyproject.toml +4 -4
  3. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/document.py +21 -22
  4. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/model.py +61 -45
  5. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/table/column.py +13 -19
  6. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/table/reused_table.py +1 -0
  7. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/table/table.py +3 -0
  8. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/table/transformed_table.py +7 -1
  9. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/xml_converter.py +139 -41
  10. {xml2db-0.12.2 → xml2db-0.12.4/src/xml2db.egg-info}/PKG-INFO +6 -6
  11. xml2db-0.12.4/src/xml2db.egg-info/requires.txt +10 -0
  12. {xml2db-0.12.2 → xml2db-0.12.4}/tests/test_conversions.py +41 -12
  13. {xml2db-0.12.2 → xml2db-0.12.4}/tests/test_models_output.py +7 -4
  14. {xml2db-0.12.2 → xml2db-0.12.4}/tests/test_roundtrip.py +5 -14
  15. {xml2db-0.12.2 → xml2db-0.12.4}/tests/test_validation.py +11 -7
  16. xml2db-0.12.2/src/xml2db.egg-info/requires.txt +0 -10
  17. {xml2db-0.12.2 → xml2db-0.12.4}/LICENSE +0 -0
  18. {xml2db-0.12.2 → xml2db-0.12.4}/README.md +0 -0
  19. {xml2db-0.12.2 → xml2db-0.12.4}/setup.cfg +0 -0
  20. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/__init__.py +0 -0
  21. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/exceptions.py +0 -0
  22. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/table/__init__.py +0 -0
  23. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/table/duplicated_table.py +0 -0
  24. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db/table/relations.py +0 -0
  25. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db.egg-info/SOURCES.txt +0 -0
  26. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db.egg-info/dependency_links.txt +0 -0
  27. {xml2db-0.12.2 → xml2db-0.12.4}/src/xml2db.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: xml2db
3
- Version: 0.12.2
3
+ Version: 0.12.4
4
4
  Summary: Import complex XML files to a relational database
5
5
  Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
6
6
  Project-URL: Documentation, https://cre-dev.github.io/xml2db
@@ -13,11 +13,11 @@ Requires-Python: >=3.9
13
13
  Description-Content-Type: text/markdown
14
14
  License-File: LICENSE
15
15
  Requires-Dist: sqlalchemy>1.4
16
- Requires-Dist: xmlschema==3.3.2
17
- Requires-Dist: lxml==5.1.0
16
+ Requires-Dist: xmlschema>=3.3.2
17
+ Requires-Dist: lxml>=5.1.0
18
18
  Provides-Extra: docs
19
- Requires-Dist: mkdocs-material==9.5.34; extra == "docs"
20
- Requires-Dist: mkdocstrings-python==1.11.1; extra == "docs"
19
+ Requires-Dist: mkdocs-material>=9.5.34; extra == "docs"
20
+ Requires-Dist: mkdocstrings-python>=1.11.1; extra == "docs"
21
21
  Provides-Extra: tests
22
22
  Requires-Dist: pytest>=7.0; extra == "tests"
23
23
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "xml2db"
7
- version = "0.12.2"
7
+ version = "0.12.4"
8
8
  authors = [
9
9
  { name="Commission de régulation de l'énergie", email="opensource@cre.fr" },
10
10
  ]
@@ -18,12 +18,12 @@ classifiers = [
18
18
  ]
19
19
  dependencies = [
20
20
  "sqlalchemy>1.4",
21
- "xmlschema==3.3.2",
22
- "lxml==5.1.0",
21
+ "xmlschema>=3.3.2",
22
+ "lxml>=5.1.0",
23
23
  ]
24
24
 
25
25
  [project.optional-dependencies]
26
- docs = ["mkdocs-material==9.5.34", "mkdocstrings-python==1.11.1"]
26
+ docs = ["mkdocs-material>=9.5.34", "mkdocstrings-python>=1.11.1"]
27
27
  tests = ["pytest>=7.0"]
28
28
 
29
29
  [project.urls]
@@ -171,17 +171,15 @@ class Document:
171
171
  record["xml2db_row_number"] = row_number
172
172
 
173
173
  # build record from fields for columns and n-1 relations
174
- for field_type, key, _ in model_table.fields:
174
+ for field_type, key, field in model_table.fields:
175
175
  if field_type == "col":
176
- if key in content:
177
- if model_table.columns[key].data_type in ["decimal", "float"]:
178
- val = [float(v) for v in content[key]]
179
- elif model_table.columns[key].data_type == "integer":
180
- val = [int(v) for v in content[key]]
181
- elif model_table.columns[key].data_type == "boolean":
182
- val = [v == "true" or v == "1" for v in content[key]]
183
- else:
184
- val = content[key]
176
+ content_key = (
177
+ (f"{key[:-5]}__attr" if field.has_suffix else f"{key}__attr")
178
+ if field.is_attr
179
+ else key
180
+ )
181
+ if content_key in content:
182
+ val = content[content_key]
185
183
 
186
184
  if len(val) == 1:
187
185
  record[key] = val[0]
@@ -320,25 +318,26 @@ class Document:
320
318
  record = data_index[node_type]["records"][node_pk]
321
319
  for field_type, rel_name, rel in tb.fields:
322
320
  if field_type == "col" and record[rel_name] is not None:
323
- if rel.data_type in [
324
- "decimal",
325
- "float",
326
- ]: # remove trailing ".0" for decimal and float
327
- content[rel_name] = [
328
- value.rstrip("0").rstrip(".") if "." in value else value
329
- for value in str(record[rel_name]).split(",")
330
- ]
331
- elif isinstance(record[rel_name], datetime.datetime):
332
- content[rel_name] = [
321
+ content_key = (
322
+ (
323
+ f"{rel_name[:-5]}__attr"
324
+ if rel.has_suffix
325
+ else f"{rel_name}__attr"
326
+ )
327
+ if rel.is_attr
328
+ else rel_name
329
+ )
330
+ if isinstance(record[rel_name], datetime.datetime):
331
+ content[content_key] = [
333
332
  record[rel_name].isoformat(timespec="milliseconds")
334
333
  ]
335
334
  else:
336
- content[rel_name] = (
335
+ content[content_key] = (
337
336
  list(csv.reader([str(record[rel_name])], escapechar="\\"))[
338
337
  0
339
338
  ]
340
339
  if "," in str(record[rel_name])
341
- else [str(record[rel_name])]
340
+ else [record[rel_name]]
342
341
  )
343
342
  elif (
344
343
  field_type == "rel1"
@@ -70,7 +70,7 @@ class DataModel:
70
70
  def __init__(
71
71
  self,
72
72
  xsd_file: str,
73
- short_name: str = None,
73
+ short_name: str = "DocumentRoot",
74
74
  long_name: str = None,
75
75
  base_url: str = None,
76
76
  model_config: dict = None,
@@ -226,8 +226,7 @@ class DataModel:
226
226
  """
227
227
  # parse the XML schema recursively and hold a reference to the head table
228
228
  root_table = self._parse_tree(
229
- self.xml_schema[0] if len(self.xml_schema) == 1 else self.xml_schema,
230
- is_root_table=True,
229
+ self.xml_schema[0] if len(self.xml_schema) == 1 else self.xml_schema
231
230
  )
232
231
  self.root_table = root_table.type_name
233
232
  # compute a text representation of the original data model and store it
@@ -273,9 +272,7 @@ class DataModel:
273
272
  for tb in self.fk_ordered_tables:
274
273
  tb.build_sqlalchemy_tables()
275
274
 
276
- def _parse_tree(
277
- self, parent_node: xmlschema.XsdElement, is_root_table: bool = False
278
- ):
275
+ def _parse_tree(self, parent_node: xmlschema.XsdElement, nodes_path: list = None):
279
276
  """Parse a node of an XML schema recursively and create a target data model without any simplification
280
277
 
281
278
  We parse the XSD tree recursively to create for each node (basically a complex type in the XSD) an equivalent \
@@ -289,7 +286,7 @@ class DataModel:
289
286
 
290
287
  Args:
291
288
  parent_node: the current XSD node being parsed
292
- is_root_table: True if this is the root table
289
+ nodes_path: a list of nodes types from the root node
293
290
  """
294
291
 
295
292
  # find current node type and name and returns corresponding table if it already exists
@@ -301,12 +298,16 @@ class DataModel:
301
298
  if parent_type is None:
302
299
  parent_type = parent_node.local_name
303
300
 
301
+ nodes_path = (nodes_path if nodes_path else []) + [parent_type]
302
+
304
303
  # if this type has already been encountered, stop here and return existing table
305
304
  if parent_type in self.tables:
306
305
  parent_table = self.tables[parent_type]
307
306
  return parent_table
308
307
 
309
- # elements names and types should be bijective. If an element name is used for different types,
308
+ # For database tables we use element names rather than XSD types, under the assumption that they are often
309
+ # more meaningful given that they are the one which appear in XML documents. However, same names can be used
310
+ # for different XSD types, so if an element name is used for different types,
310
311
  # we add a suffix to the name to make it unique again (using a dict to keep the name/type association)
311
312
  parent_name = (
312
313
  parent_node.local_name
@@ -324,7 +325,7 @@ class DataModel:
324
325
  parent_table = self._create_table_model(
325
326
  parent_name,
326
327
  parent_type,
327
- is_root_table,
328
+ len(nodes_path) == 1,
328
329
  isinstance(parent_node, xmlschema.XMLSchema),
329
330
  )
330
331
  self.tables[parent_type] = parent_table
@@ -363,6 +364,13 @@ class DataModel:
363
364
  if elem_type.base_type
364
365
  else recurse_parse_simple_type(elem_type.member_types)
365
366
  )
367
+ if elem_type.is_list():
368
+ return (
369
+ "string",
370
+ 0,
371
+ None,
372
+ elem_type.allow_empty,
373
+ )
366
374
  if elem_type.is_restriction():
367
375
  dt = elem_type.base_type.local_name
368
376
  mil = elem_type.min_length
@@ -384,7 +392,12 @@ class DataModel:
384
392
  else None
385
393
  )
386
394
  ae = ae and bt_ae if ae is not None and bt_ae is not None else None
387
- if elem_type.enumeration is not None:
395
+ if elem_type.enumeration is not None and dt in [
396
+ "string",
397
+ "NMTOKEN",
398
+ "duration",
399
+ "token",
400
+ ]:
388
401
  mil = min([len(val) for val in elem_type.enumeration])
389
402
  mal = max([len(val) for val in elem_type.enumeration])
390
403
  return dt, mil, mal, ae
@@ -410,25 +423,31 @@ class DataModel:
410
423
  ),
411
424
  ]
412
425
 
413
- # go through item attributes and add them as columns
426
+ # go through item attributes and add them as columns, adding a suffix if an element with the same name exists
427
+ children_names = None
414
428
  for attrib_name, attrib in parent_node.attributes.items():
429
+ if children_names is None:
430
+ children_names = [child.local_name for child in parent_node]
415
431
  (
416
432
  data_type,
417
433
  min_length,
418
434
  max_length,
419
435
  allow_empty,
420
436
  ) = recurse_parse_simple_type([attrib.type])
437
+ suffix = attrib_name in children_names
421
438
  parent_table.add_column(
422
- f"{attrib_name}",
439
+ f"{attrib_name}{'_attr' if suffix else ''}",
423
440
  data_type,
424
441
  [0, 1],
425
442
  min_length,
426
443
  max_length,
427
444
  True,
445
+ suffix,
428
446
  False,
429
447
  allow_empty,
430
448
  None,
431
449
  )
450
+
432
451
  nested_containers = []
433
452
  # go through the children to add either arguments either relations to the current element
434
453
  for child in parent_node:
@@ -454,6 +473,7 @@ class DataModel:
454
473
  if child.parent
455
474
  and child.parent.max_occurs != 1
456
475
  and child.parent.model != "choice"
476
+ and child.max_occurs == 1
457
477
  else None
458
478
  ),
459
479
  )
@@ -482,32 +502,39 @@ class DataModel:
482
502
  max_length,
483
503
  False,
484
504
  False,
505
+ False,
485
506
  allow_empty,
486
507
  nested_containers[-1][1],
487
508
  )
488
509
 
489
510
  elif ct.is_complex():
490
- child_table = self._parse_tree(child)
491
- child_table.model_group = (
492
- "choice"
493
- if ct.model_group and ct.model_group.model == "choice"
494
- else "sequence"
495
- )
496
- occurs = get_occurs(child)
497
- if child.is_single():
498
- parent_table.add_relation_1(
499
- child.local_name,
500
- child_table,
501
- occurs,
502
- nested_containers[-1][1],
511
+ # ignoring recursive definitions by skipping these fields
512
+ if child.type.local_name in nodes_path:
513
+ logger.warning(
514
+ f"type '{child.type.local_name}' contains a recursive definition"
503
515
  )
504
516
  else:
505
- parent_table.add_relation_n(
506
- child.local_name,
507
- child_table,
508
- occurs,
509
- nested_containers[-1][1],
517
+ child_table = self._parse_tree(child, nodes_path)
518
+ child_table.model_group = (
519
+ "choice"
520
+ if ct.model_group and ct.model_group.model == "choice"
521
+ else "sequence"
510
522
  )
523
+ occurs = get_occurs(child)
524
+ if occurs[1] == 1:
525
+ parent_table.add_relation_1(
526
+ child.local_name,
527
+ child_table,
528
+ occurs,
529
+ nested_containers[-1][1],
530
+ )
531
+ else:
532
+ parent_table.add_relation_n(
533
+ child.local_name,
534
+ child_table,
535
+ occurs,
536
+ nested_containers[-1][1],
537
+ )
511
538
  else:
512
539
  raise ValueError("unknown case; please check")
513
540
  else:
@@ -534,6 +561,7 @@ class DataModel:
534
561
  min_length,
535
562
  max_length,
536
563
  False,
564
+ False,
537
565
  True,
538
566
  allow_empty,
539
567
  None,
@@ -544,31 +572,19 @@ class DataModel:
544
572
  def _repr_tree(
545
573
  self,
546
574
  parent_table: Union[DataModelTableReused, DataModelTableDuplicated],
547
- visited_nodes: Union[set, None] = None,
548
575
  ):
549
576
  """Build a text representation of the data model tree
550
577
 
551
578
  Args:
552
579
  parent_table: the current data model table object
553
580
  """
554
- if visited_nodes is None:
555
- visited_nodes = set()
556
- else:
557
- visited_nodes = {item for item in visited_nodes}
558
- visited_nodes.add(parent_table.name)
559
581
  for field_type, name, field in parent_table.fields:
560
582
  if field_type == "col":
561
583
  yield f"{field.name}{field.occurs}: {field.data_type}"
562
- elif field_type == "rel1":
563
- mg = " (choice)" if field.other_table.model_group == "choice" else ""
564
- yield f"{field.name}{field.occurs}{mg}:{' ...' if field_type in visited_nodes else ''}"
565
- if field.other_table.name not in visited_nodes:
566
- for line in self._repr_tree(field.other_table, visited_nodes):
567
- yield f" {line}"
568
- elif field_type == "reln":
584
+ else:
569
585
  mg = " (choice)" if field.other_table.model_group == "choice" else ""
570
- yield f"{field.name}{field.occurs}{mg}:{' ...' if field_type in visited_nodes else ''}"
571
- for line in self._repr_tree(field.other_table, visited_nodes):
586
+ yield f"{field.name}{field.occurs}{mg}:"
587
+ for line in self._repr_tree(field.other_table):
572
588
  yield f" {line}"
573
589
 
574
590
  def get_entity_rel_diagram(self, text_context: bool = True) -> str:
@@ -32,15 +32,22 @@ def types_mapping_default(temp: bool, col: "DataModelColumn") -> Any:
32
32
  """
33
33
  if col.occurs[1] != 1:
34
34
  return String(8000)
35
- if col.data_type in ["decimal", "float"]:
35
+ if col.data_type in ["decimal", "float", "double"]:
36
36
  return Double
37
37
  if col.data_type == "dateTime":
38
38
  return DateTime(timezone=True)
39
- if col.data_type == "integer" or col.data_type == "int":
39
+ if col.data_type in [
40
+ "integer",
41
+ "int",
42
+ "nonPositiveInteger",
43
+ "nonNegativeInteger",
44
+ "positiveInteger",
45
+ "negativeInteger",
46
+ ]:
40
47
  return Integer
41
48
  if col.data_type == "boolean":
42
49
  return Boolean
43
- if col.data_type == "byte":
50
+ if col.data_type in ["short", "byte"]:
44
51
  return SmallInteger
45
52
  if col.data_type == "long":
46
53
  return BigInteger
@@ -77,20 +84,10 @@ def types_mapping_mssql(temp: bool, col: "DataModelColumn") -> Any:
77
84
  """
78
85
  if col.occurs[1] != 1:
79
86
  return mssql.VARCHAR(8000)
80
- if col.data_type in ["decimal", "float"]:
81
- return Double
82
87
  if col.data_type == "dateTime":
83
88
  # using the DATETIMEOFFSET directly in the temporary table caused issues when inserting data in the target
84
89
  # table with INSERT INTO SELECT converts datetime VARCHAR to DATETIMEOFFSET without errors
85
90
  return mssql.VARCHAR(100) if temp else mssql.DATETIMEOFFSET
86
- if col.data_type == "integer" or col.data_type == "int":
87
- return Integer
88
- if col.data_type == "boolean":
89
- return Boolean
90
- if col.data_type == "byte":
91
- return SmallInteger
92
- if col.data_type == "long":
93
- return BigInteger
94
91
  if col.data_type == "date":
95
92
  return mssql.VARCHAR(16)
96
93
  if col.data_type == "time":
@@ -106,12 +103,7 @@ def types_mapping_mssql(temp: bool, col: "DataModelColumn") -> Any:
106
103
  if col.max_length == col.min_length:
107
104
  return mssql.BINARY(col.max_length)
108
105
  return mssql.VARBINARY(col.max_length)
109
- else:
110
- logger.warning(
111
- f"unknown type '{col.data_type}' for column '{col.name}', defaulting to VARCHAR(1000) "
112
- f"(this can be overridden by providing a field type in the configuration)"
113
- )
114
- return mssql.VARCHAR(1000)
106
+ return types_mapping_default(temp, col)
115
107
 
116
108
 
117
109
  def types_mapping_mysql(temp: bool, col: "DataModelColumn") -> Any:
@@ -167,6 +159,7 @@ class DataModelColumn:
167
159
  min_length: int,
168
160
  max_length: Union[int, None],
169
161
  is_attr: bool,
162
+ has_suffix: bool,
170
163
  is_content: bool,
171
164
  allow_empty: bool,
172
165
  ngroup: Union[int, None],
@@ -181,6 +174,7 @@ class DataModelColumn:
181
174
  self.min_length = min_length
182
175
  self.max_length = max_length
183
176
  self.is_attr = is_attr
177
+ self.has_suffix = has_suffix
184
178
  self.is_content = is_content
185
179
  self.allow_empty = allow_empty
186
180
  self.ngroup = ngroup
@@ -71,6 +71,7 @@ class DataModelTableReused(DataModelTableTransformed):
71
71
  False,
72
72
  False,
73
73
  False,
74
+ False,
74
75
  None,
75
76
  self.config,
76
77
  self.data_model,
@@ -130,6 +130,7 @@ class DataModelTable:
130
130
  min_length: int,
131
131
  max_length: Union[int, None],
132
132
  is_attr: bool,
133
+ has_suffix: bool,
133
134
  is_content: bool,
134
135
  allow_empty: bool,
135
136
  ngroup: Union[str, None],
@@ -143,6 +144,7 @@ class DataModelTable:
143
144
  min_length: minimum length
144
145
  max_length: maximum length
145
146
  is_attr: is XML attribute or element?
147
+ has_suffix: for an attribute, do we need the '_attr' suffix?
146
148
  is_content: is content of a mixed type element?
147
149
  allow_empty: is nullable?
148
150
  ngroup: a string id signaling that the column belongs to a nested sequence
@@ -155,6 +157,7 @@ class DataModelTable:
155
157
  min_length,
156
158
  max_length,
157
159
  is_attr,
160
+ has_suffix,
158
161
  is_content,
159
162
  allow_empty,
160
163
  ngroup,
@@ -76,6 +76,7 @@ class DataModelTableTransformed(DataModelTable):
76
76
  False,
77
77
  False,
78
78
  False,
79
+ False,
79
80
  None,
80
81
  self.config,
81
82
  self.data_model,
@@ -89,6 +90,7 @@ class DataModelTableTransformed(DataModelTable):
89
90
  max(max_lengths) if all(e is not None for e in max_lengths) else None,
90
91
  False,
91
92
  False,
93
+ False,
92
94
  any(allow_empty),
93
95
  None,
94
96
  self.config,
@@ -193,6 +195,7 @@ class DataModelTableTransformed(DataModelTable):
193
195
  child_field.min_length,
194
196
  child_field.max_length,
195
197
  child_field.is_attr,
198
+ child_field.has_suffix,
196
199
  child_field.is_content,
197
200
  child_field.allow_empty,
198
201
  child_field.ngroup,
@@ -276,9 +279,12 @@ class DataModelTableTransformed(DataModelTable):
276
279
 
277
280
  # if the table can be transformed, stop here
278
281
  if self._is_table_choice_transform_applicable():
282
+ fields_transform = {}
283
+ for col in self.columns.values():
284
+ fields_transform[(self.type_name, col.name)] = (None, "join")
279
285
  self._transform_to_choice()
280
286
  self.is_simplified = True
281
- return {self.type_name: "choice"}, {}
287
+ return {self.type_name: "choice"}, fields_transform
282
288
 
283
289
  # loop through field to transform them if need be
284
290
  out_fields = []
@@ -128,31 +128,36 @@ class XMLConverter:
128
128
  key
129
129
  != "{http://www.w3.org/2001/XMLSchema-instance}noNamespaceSchemaLocation"
130
130
  ):
131
- content[key] = [val]
131
+ content[f"{key}__attr"] = [val.strip() if val.strip() else val]
132
132
 
133
133
  if node.text and node.text.strip():
134
134
  content["value"] = [node.text.strip()]
135
135
 
136
136
  for element in node.iterchildren():
137
- key = element.tag.split("}")[1] if "}" in element.tag else element.tag
138
- node_type_key = (node_type, key)
139
- value = None
140
- if element.text and element.text.strip():
141
- value = element.text
142
- transform = self.model.fields_transforms.get(node_type_key, (None, "join"))[
143
- 1
144
- ]
145
- if transform != "join":
146
- value = self._parse_xml_node(
147
- self.model.fields_transforms[node_type_key][0],
148
- element,
149
- transform not in ["elevate", "elevate_wo_prefix"],
150
- hash_maps,
151
- )
152
- if key in content:
153
- content[key].append(value)
154
- else:
155
- content[key] = [value]
137
+ if isinstance(element.tag, str):
138
+ key = element.tag.split("}")[1] if "}" in element.tag else element.tag
139
+ node_type_key = (node_type, key)
140
+ value = None
141
+ if element.text:
142
+ value = (
143
+ element.text.strip() if element.text.strip() else element.text
144
+ )
145
+ if node_type_key not in self.model.fields_transforms:
146
+ # skip the node if it is not in the data model
147
+ continue
148
+ transform = self.model.fields_transforms[node_type_key][1]
149
+ if transform != "join":
150
+ value = self._parse_xml_node(
151
+ self.model.fields_transforms[node_type_key][0],
152
+ element,
153
+ transform not in ["elevate", "elevate_wo_prefix"],
154
+ hash_maps,
155
+ )
156
+ if value is not None:
157
+ if key in content:
158
+ content[key].append(value)
159
+ else:
160
+ content[key] = [value]
156
161
 
157
162
  node = self._transform_node(node_type, content)
158
163
 
@@ -189,6 +194,7 @@ class XMLConverter:
189
194
  hash_maps = {}
190
195
 
191
196
  joined_values = False
197
+ skipped_nodes = 0
192
198
  for event, element in etree.iterparse(
193
199
  xml_file,
194
200
  recover=recover,
@@ -196,12 +202,17 @@ class XMLConverter:
196
202
  remove_blank_text=True,
197
203
  ):
198
204
  key = element.tag.split("}")[1] if "}" in element.tag else element.tag
199
- if event == "start":
205
+
206
+ if event == "start" and skipped_nodes > 0:
207
+ skipped_nodes += 1
208
+
209
+ elif event == "start":
200
210
  if nodes_stack[-1][0]:
201
211
  node_type_key = (nodes_stack[-1][0], key)
202
- node_type, transform = self.model.fields_transforms.get(
203
- node_type_key, (None, "join")
204
- )
212
+ if node_type_key not in self.model.fields_transforms:
213
+ skipped_nodes += 1
214
+ continue
215
+ node_type, transform = self.model.fields_transforms[node_type_key]
205
216
  else:
206
217
  node_type, transform = self.model.root_table, None
207
218
  joined_values = transform == "join"
@@ -212,28 +223,41 @@ class XMLConverter:
212
223
  attrib_key
213
224
  != "{http://www.w3.org/2001/XMLSchema-instance}noNamespaceSchemaLocation"
214
225
  ):
215
- content[attrib_key] = [attrib_val]
226
+ content[f"{attrib_key}__attr"] = [
227
+ attrib_val.strip() if attrib_val.strip() else attrib_val
228
+ ]
216
229
  nodes_stack.append((node_type, content))
217
230
 
231
+ elif event == "end" and skipped_nodes > 0:
232
+ skipped_nodes -= 1
233
+
218
234
  elif event == "end":
219
- # joined_values was set with the previous "start" event just before
235
+ # joined_values was set with the previous "start" event just before and corresponds to lists of simple
236
+ # type elements
220
237
  if joined_values:
238
+ value = None
221
239
  if element.text:
222
- if key in nodes_stack[-1][1]:
223
- nodes_stack[-1][1][key].append(element.text)
240
+ if element.text.strip():
241
+ value = element.text.strip()
224
242
  else:
225
- nodes_stack[-1][1][key] = [element.text]
243
+ value = element.text
244
+ if key in nodes_stack[-1][1]:
245
+ nodes_stack[-1][1][key].append(value)
246
+ else:
247
+ nodes_stack[-1][1][key] = [value]
248
+
249
+ # else, we have completed a complex type node
226
250
  else:
227
251
  node = nodes_stack.pop()
228
252
  if nodes_stack[-1][0]:
229
253
  node_type_key = (nodes_stack[-1][0], key)
230
- node_type, transform = self.model.fields_transforms.get(
231
- node_type_key, (None, "join")
232
- )
254
+ node_type, transform = self.model.fields_transforms[
255
+ node_type_key
256
+ ]
233
257
  else:
234
258
  node_type, transform = self.model.root_table, None
235
- if element.text:
236
- node[1]["value"] = [element.text]
259
+ if element.text and element.text.strip():
260
+ node[1]["value"] = [element.text.strip()]
237
261
  node = self._transform_node(*node)
238
262
  if transform not in ["elevate", "elevate_wo_prefix"]:
239
263
  node = self._compute_hash_deduplicate(node, hash_maps)
@@ -278,6 +302,39 @@ class XMLConverter:
278
302
  child_key, val = list(content.items())[0]
279
303
  content = {"type": [child_key], "value": val}
280
304
 
305
+ # convert some simple types to python types
306
+ if node_type in self.model.tables:
307
+ table = self.model.tables[node_type]
308
+ for key in table.columns:
309
+ content_key = (
310
+ (
311
+ f"{key[:-5]}__attr"
312
+ if table.columns[key].has_suffix
313
+ else f"{key}__attr"
314
+ )
315
+ if table.columns[key].is_attr
316
+ else key
317
+ )
318
+ if content_key in content:
319
+ if table.columns[key].data_type in ["decimal", "float"]:
320
+ content[content_key] = [float(v) for v in content[content_key]]
321
+ elif table.columns[key].data_type in [
322
+ "integer",
323
+ "int",
324
+ "nonPositiveInteger",
325
+ "nonNegativeInteger",
326
+ "positiveInteger",
327
+ "negativeInteger",
328
+ "short",
329
+ "byte",
330
+ "long",
331
+ ]:
332
+ content[content_key] = [int(v) for v in content[content_key]]
333
+ elif table.columns[key].data_type == "boolean":
334
+ content[content_key] = [
335
+ v == "true" or v == "1" for v in content[content_key]
336
+ ]
337
+
281
338
  return node_type, content
282
339
 
283
340
  def _compute_hash_deduplicate(self, node: tuple, hash_maps: dict) -> tuple:
@@ -292,12 +349,28 @@ class XMLConverter:
292
349
  A tuple of (node_type, content, hash) representing a node after deduplication
293
350
  """
294
351
  node_type, content = node
352
+ if node_type not in self.model.tables:
353
+ return "", None, b""
295
354
  table = self.model.tables[node_type]
296
355
 
297
356
  h = self.model.model_config["record_hash_constructor"]()
298
- for field_type, name, _ in table.fields:
357
+ for field_type, name, field in table.fields:
299
358
  if field_type == "col":
300
- h.update(str(content.get(name, None)).encode("utf-8"))
359
+ if field.is_attr:
360
+ h.update(
361
+ str(
362
+ content.get(
363
+ (
364
+ f"{name[:-5]}__attr"
365
+ if field.has_suffix
366
+ else f"{name}__attr"
367
+ ),
368
+ None,
369
+ )
370
+ ).encode("utf-8")
371
+ )
372
+ else:
373
+ h.update(str(content.get(name, None)).encode("utf-8"))
301
374
  elif field_type == "rel1":
302
375
  h.update(content[name][0][2] if name in content else b"")
303
376
  elif field_type == "reln":
@@ -419,13 +492,37 @@ class XMLConverter:
419
492
  attributes = {}
420
493
  text_content = None
421
494
  if field_type == "col":
422
- if rel_name in content:
495
+ content_key = (
496
+ (
497
+ f"{rel_name[:-5]}__attr"
498
+ if rel.has_suffix
499
+ else f"{rel_name}__attr"
500
+ )
501
+ if rel.is_attr
502
+ else rel_name
503
+ )
504
+ if content_key in content:
505
+ if rel.data_type in [
506
+ "decimal",
507
+ "float",
508
+ ]: # remove trailing ".0" for decimal and float
509
+ val = str(content[content_key][0])
510
+ val = [val.rstrip("0").rstrip(".") if "." in val else val]
511
+ elif isinstance(content[content_key][0], datetime):
512
+ val = [
513
+ content[content_key][0].isoformat(timespec="milliseconds")
514
+ ]
515
+ else:
516
+ val = content[content_key]
423
517
  if rel.is_attr:
424
- attributes[rel.name_chain[-1][0]] = content[rel_name][0]
518
+ if rel.has_suffix:
519
+ attributes[rel.name_chain[-1][0][:-5]] = val[0]
520
+ else:
521
+ attributes[rel.name_chain[-1][0]] = val[0]
425
522
  elif rel.is_content:
426
- text_content = content[rel_name][0]
523
+ text_content = val[0]
427
524
  else:
428
- for field_value in content[rel_name]:
525
+ for field_value in val:
429
526
  child = etree.Element(rel.name_chain[-1][0])
430
527
  if isinstance(field_value, datetime):
431
528
  field_value = field_value.isoformat()
@@ -446,7 +543,8 @@ class XMLConverter:
446
543
  if prev_ngroup and rel.ngroup != prev_ngroup:
447
544
  for ngroup_children in zip_longest(*ngroup_stack):
448
545
  for child in ngroup_children:
449
- nodes_stack[-1][1].append(child)
546
+ if child is not None:
547
+ nodes_stack[-1][1].append(child)
450
548
  ngroup_stack = []
451
549
  prev_ngroup = rel.ngroup
452
550
  if len(children) > 0:
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: xml2db
3
- Version: 0.12.2
3
+ Version: 0.12.4
4
4
  Summary: Import complex XML files to a relational database
5
5
  Author-email: Commission de régulation de l'énergie <opensource@cre.fr>
6
6
  Project-URL: Documentation, https://cre-dev.github.io/xml2db
@@ -13,11 +13,11 @@ Requires-Python: >=3.9
13
13
  Description-Content-Type: text/markdown
14
14
  License-File: LICENSE
15
15
  Requires-Dist: sqlalchemy>1.4
16
- Requires-Dist: xmlschema==3.3.2
17
- Requires-Dist: lxml==5.1.0
16
+ Requires-Dist: xmlschema>=3.3.2
17
+ Requires-Dist: lxml>=5.1.0
18
18
  Provides-Extra: docs
19
- Requires-Dist: mkdocs-material==9.5.34; extra == "docs"
20
- Requires-Dist: mkdocstrings-python==1.11.1; extra == "docs"
19
+ Requires-Dist: mkdocs-material>=9.5.34; extra == "docs"
20
+ Requires-Dist: mkdocstrings-python>=1.11.1; extra == "docs"
21
21
  Provides-Extra: tests
22
22
  Requires-Dist: pytest>=7.0; extra == "tests"
23
23
 
@@ -0,0 +1,10 @@
1
+ sqlalchemy>1.4
2
+ xmlschema>=3.3.2
3
+ lxml>=5.1.0
4
+
5
+ [docs]
6
+ mkdocs-material>=9.5.34
7
+ mkdocstrings-python>=1.11.1
8
+
9
+ [tests]
10
+ pytest>=7.0
@@ -1,10 +1,12 @@
1
1
  import os
2
+ import pprint
2
3
 
3
4
  import pytest
4
5
  from lxml import etree
5
6
 
6
7
  from xml2db import DataModel
7
8
  from xml2db.xml_converter import XMLConverter, remove_record_hash
9
+ from .conftest import list_xml_path, models_path
8
10
  from .sample_models import models
9
11
 
10
12
 
@@ -13,19 +15,20 @@ from .sample_models import models
13
15
  [
14
16
  {**model, **version, "xml_file": xml_file}
15
17
  for model in models
16
- for xml_file in os.listdir(model["xml_path"])
18
+ for xml_file in list_xml_path(model, "xml")
19
+ + list_xml_path(model, "equivalent_xml")
17
20
  for version in model["versions"]
18
21
  ],
19
22
  )
20
- def test_document_tree_parsing(test_config):
23
+ def test_iterative_recursive_parsing(test_config):
21
24
  """Test whether iterative and recursive parsing give same results"""
22
25
  model = DataModel(
23
- test_config["xsd_path"],
26
+ str(os.path.join(models_path, test_config["id"], test_config["xsd"])),
24
27
  short_name=test_config["id"],
25
28
  model_config=test_config["config"],
26
29
  )
27
30
  converter = XMLConverter(model)
28
- file_path = os.path.join(test_config["xml_path"], test_config["xml_file"])
31
+ file_path = test_config["xml_file"]
29
32
 
30
33
  parsed_recursive = converter.parse_xml(
31
34
  file_path, file_path, skip_validation=True, iterparse=False
@@ -42,7 +45,7 @@ def test_document_tree_parsing(test_config):
42
45
  [
43
46
  {**model, **version, "xml_file": xml_file}
44
47
  for model in models
45
- for xml_file in os.listdir(model["xml_path"])
48
+ for xml_file in list_xml_path(model, "xml")
46
49
  for version in model["versions"]
47
50
  ],
48
51
  )
@@ -50,22 +53,22 @@ def test_document_tree_to_flat_data(test_config):
50
53
  """A test for document tree to flat data conversion and back"""
51
54
 
52
55
  model = DataModel(
53
- test_config["xsd_path"],
56
+ str(os.path.join(models_path, test_config["id"], test_config["xsd"])),
54
57
  short_name=test_config["id"],
55
58
  model_config=test_config["config"],
56
59
  )
57
60
  converter = XMLConverter(model)
58
61
 
59
- file_path = os.path.join(test_config["xml_path"], test_config["xml_file"])
62
+ file_path = test_config["xml_file"]
60
63
 
61
64
  # parse XML to document tree
62
65
  converter.parse_xml(file_path, file_path)
63
- exp_doc_tree = remove_record_hash(converter.document_tree)
66
+ exp_doc_tree = pprint.pformat(remove_record_hash(converter.document_tree))
64
67
 
65
68
  # parse XML to document tree and then flat data model
66
69
  doc = model.parse_xml(file_path)
67
70
  # and convert it back to document tree
68
- act_doc_tree = doc.flat_data_to_doc_tree()
71
+ act_doc_tree = pprint.pformat(doc.flat_data_to_doc_tree())
69
72
 
70
73
  assert act_doc_tree == exp_doc_tree
71
74
 
@@ -75,7 +78,7 @@ def test_document_tree_to_flat_data(test_config):
75
78
  [
76
79
  {**model, **version, "xml_file": xml_file}
77
80
  for model in models
78
- for xml_file in os.listdir(model["xml_path"])
81
+ for xml_file in list_xml_path(model, "xml")
79
82
  for version in model["versions"]
80
83
  ],
81
84
  )
@@ -83,13 +86,13 @@ def test_document_tree_to_xml(test_config):
83
86
  """A test for document tree to xml conversion and back"""
84
87
 
85
88
  model = DataModel(
86
- test_config["xsd_path"],
89
+ str(os.path.join(models_path, test_config["id"], test_config["xsd"])),
87
90
  short_name=test_config["id"],
88
91
  model_config=test_config["config"],
89
92
  )
90
93
  converter = XMLConverter(model)
91
94
 
92
- file_path = os.path.join(test_config["xml_path"], test_config["xml_file"])
95
+ file_path = test_config["xml_file"]
93
96
 
94
97
  # parse XML to document tree
95
98
  converter.parse_xml(file_path, file_path)
@@ -112,3 +115,29 @@ def test_document_tree_to_xml(test_config):
112
115
  ref_xml = f.read()
113
116
 
114
117
  assert xml == ref_xml
118
+
119
+
120
+ @pytest.mark.parametrize(
121
+ "test_config",
122
+ [
123
+ {**model, **version}
124
+ for model in models
125
+ for version in model["versions"]
126
+ if os.path.isdir(os.path.join(models_path, model["id"], "equivalent_xml"))
127
+ ],
128
+ )
129
+ def test_equivalent_xml(test_config):
130
+ """A test for xml documents which should result in the same extracted data"""
131
+
132
+ xml_files = list_xml_path(test_config, "equivalent_xml")
133
+
134
+ if len(xml_files) > 1:
135
+ model = DataModel(
136
+ str(os.path.join(models_path, test_config["id"], test_config["xsd"])),
137
+ short_name=test_config["id"],
138
+ model_config=test_config["config"],
139
+ )
140
+ ref_data = model.parse_xml(xml_files[0])
141
+ for xml_file in xml_files[1:]:
142
+ equ_data = model.parse_xml(xml_file)
143
+ assert ref_data.data == equ_data.data
@@ -5,6 +5,7 @@ from sqlalchemy.dialects import postgresql, mssql, mysql
5
5
 
6
6
  from xml2db import DataModel
7
7
  from .sample_models import models
8
+ from .conftest import models_path
8
9
 
9
10
 
10
11
  @pytest.mark.parametrize(
@@ -19,14 +20,15 @@ def test_model_erd(test_config):
19
20
  """A test to check if generated ERD matches saved output"""
20
21
 
21
22
  model = DataModel(
22
- test_config["xsd_path"],
23
+ str(os.path.join(models_path, test_config["id"], test_config["xsd"])),
23
24
  short_name=test_config["id"],
24
25
  model_config=test_config["config"],
25
26
  )
26
27
 
27
28
  expected = open(
28
29
  os.path.join(
29
- os.path.dirname(test_config["xsd_path"]),
30
+ models_path,
31
+ test_config["id"],
30
32
  f"{test_config['id']}_erd_version{test_config['version_id']}.md",
31
33
  ),
32
34
  "r",
@@ -49,7 +51,7 @@ def test_model_ddl(test_config):
49
51
  """A test to check if generated SQL DDL matches saved output"""
50
52
 
51
53
  model = DataModel(
52
- test_config["xsd_path"],
54
+ str(os.path.join(models_path, test_config["id"], test_config["xsd"])),
53
55
  short_name=test_config["id"],
54
56
  model_config=test_config["config"],
55
57
  db_type=test_config["dialect"].name,
@@ -57,7 +59,8 @@ def test_model_ddl(test_config):
57
59
 
58
60
  expected = open(
59
61
  os.path.join(
60
- os.path.dirname(test_config["xsd_path"]),
62
+ models_path,
63
+ test_config["id"],
61
64
  f"{test_config['id']}_ddl_{test_config['dialect'].name}_version{test_config['version_id']}.sql",
62
65
  ),
63
66
  "r",
@@ -4,7 +4,7 @@ import pytest
4
4
  from lxml import etree
5
5
 
6
6
  from xml2db.xml_converter import XMLConverter, remove_record_hash
7
- from .fixtures import setup_db_model, conn_string
7
+ from .conftest import list_xml_path
8
8
  from .sample_models import models
9
9
 
10
10
 
@@ -17,10 +17,7 @@ def test_database_xml_roundtrip(setup_db_model, model_config):
17
17
  """A test for roundtrip insert to the database from and to XML"""
18
18
 
19
19
  model = setup_db_model
20
- xml_files = [
21
- os.path.join(model_config["xml_path"], file)
22
- for file in os.listdir(model_config["xml_path"])
23
- ]
20
+ xml_files = list_xml_path(model_config, "xml")
24
21
 
25
22
  for file in xml_files:
26
23
  # do parse and insert into the database
@@ -59,10 +56,7 @@ def test_database_document_tree_roundtrip(setup_db_model, model_config):
59
56
  """A test for roundtrip insert to the database from and to document tree"""
60
57
 
61
58
  model = setup_db_model
62
- xml_files = [
63
- os.path.join(model_config["xml_path"], file)
64
- for file in os.listdir(model_config["xml_path"])
65
- ]
59
+ xml_files = list_xml_path(model_config, "xml")
66
60
 
67
61
  for file in xml_files:
68
62
  # do parse and insert into the database
@@ -92,10 +86,7 @@ def test_database_document_tree_roundtrip_single_load(setup_db_model, model_conf
92
86
  """A test for roundtrip insert to the database from and to document tree"""
93
87
 
94
88
  model = setup_db_model
95
- xml_files = [
96
- os.path.join(model_config["xml_path"], file)
97
- for file in os.listdir(model_config["xml_path"])
98
- ]
89
+ xml_files = list_xml_path(model_config, "xml")
99
90
 
100
91
  flat_data = None
101
92
  doc = None
@@ -129,7 +120,7 @@ def test_database_document_tree_roundtrip_single_load(setup_db_model, model_conf
129
120
  [
130
121
  {**model, **version, "xml_file": xml_file}
131
122
  for model in models
132
- for xml_file in os.listdir(model["xml_path"])
123
+ for xml_file in list_xml_path(model, "xml")
133
124
  for version in model["versions"]
134
125
  ],
135
126
  )
@@ -1,10 +1,10 @@
1
- import xml.etree.ElementTree
2
-
3
1
  import lxml.etree
4
2
  import pytest
3
+ import os
5
4
 
6
5
  from xml2db import DataModel
7
6
  from .sample_models import models
7
+ from .conftest import models_path
8
8
 
9
9
 
10
10
  @pytest.mark.parametrize(
@@ -27,7 +27,9 @@ from .sample_models import models
27
27
  def test_invalid_xml(args: tuple):
28
28
 
29
29
  file_name, iterparse, recover, exception = args
30
- data_model = DataModel(models[0]["xsd_path"])
30
+ data_model = DataModel(
31
+ str(os.path.join(models_path, models[0]["id"], models[0]["xsd"]))
32
+ )
31
33
 
32
34
  if exception is None:
33
35
  data_model.parse_xml(
@@ -49,8 +51,8 @@ def test_invalid_xml(args: tuple):
49
51
  @pytest.mark.parametrize(
50
52
  "args",
51
53
  [
52
- ("invalid", True, False, IndexError),
53
- ("invalid", True, True, IndexError),
54
+ ("invalid", True, False, None),
55
+ ("invalid", True, True, None),
54
56
  ("invalid", False, False, None),
55
57
  ("invalid", False, True, None),
56
58
  ("malformed_recover", True, False, lxml.etree.XMLSyntaxError),
@@ -58,7 +60,7 @@ def test_invalid_xml(args: tuple):
58
60
  ("malformed_recover", False, False, lxml.etree.XMLSyntaxError),
59
61
  ("malformed_recover", False, True, None),
60
62
  ("malformed_no_recover", True, False, lxml.etree.XMLSyntaxError),
61
- ("malformed_no_recover", True, True, IndexError),
63
+ ("malformed_no_recover", True, True, None),
62
64
  ("malformed_no_recover", False, False, lxml.etree.XMLSyntaxError),
63
65
  ("malformed_no_recover", False, True, None),
64
66
  ],
@@ -66,7 +68,9 @@ def test_invalid_xml(args: tuple):
66
68
  def test_invalid_xml_skip_verify(args: tuple):
67
69
 
68
70
  file_name, iterparse, recover, exception = args
69
- data_model = DataModel(models[0]["xsd_path"])
71
+ data_model = DataModel(
72
+ str(os.path.join(models_path, models[0]["id"], models[0]["xsd"]))
73
+ )
70
74
 
71
75
  if exception is None:
72
76
  data_model.parse_xml(
@@ -1,10 +0,0 @@
1
- sqlalchemy>1.4
2
- xmlschema==3.3.2
3
- lxml==5.1.0
4
-
5
- [docs]
6
- mkdocs-material==9.5.34
7
- mkdocstrings-python==1.11.1
8
-
9
- [tests]
10
- pytest>=7.0
File without changes
File without changes
File without changes
File without changes