xml2db 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xml2db/model.py ADDED
@@ -0,0 +1,619 @@
1
+ import logging
2
+ import os
3
+ from datetime import datetime
4
+ from io import BytesIO
5
+ from typing import Iterable, Union
6
+ from uuid import uuid4
7
+
8
+ import xmlschema
9
+ import sqlalchemy
10
+ from sqlalchemy import MetaData, create_engine, inspect
11
+ from sqlalchemy.sql.ddl import CreateIndex, CreateTable
12
+ from graphlib import TopologicalSorter
13
+
14
+ from xml2db import document
15
+ from xml2db.exceptions import DataModelConfigError
16
+ from xml2db.table import (
17
+ DataModelTableReused,
18
+ DataModelTableDuplicated,
19
+ )
20
+ from xml2db.xml_converter import XMLConverter
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class DataModel:
26
+ """A class to manage a data model based on an XML schema and its database equivalent.
27
+
28
+ This class allows parsing a set of XSD files to build a representation of
29
+ the XML schema, simplify it and convert it into a set of database tables.
30
+ It also allows parsing XML documents that fit this XML schema in order to import
31
+ the data into the database.
32
+
33
+ :param xsd_file: A path to a XSD file
34
+ :param short_name: A short name for the schema
35
+ :param long_name: A longer name for the schema
36
+ :param base_url: The root folder to find other dependent XSD files (by default, the location of the \
37
+ provided XSD file)
38
+ :param model_config: A config dict to provide options for building the model
39
+ :param connection_string: A database connection string (optional if you will not be loading data)
40
+ :param db_schema: A schema name to use in the database
41
+ :param temp_prefix: A prefix to use for temporary tables (if `None`, will be generated randomly)
42
+
43
+ :ivar xml_schema: The `xmlschema.XMLSchema` object associated with this data model
44
+ :ivar data_flow_name: A short identifier used for the data model (`short_name` argument value)
45
+ :ivar data_flow_long_name: A longer name for the data model (`long_name` argument value)
46
+ :ivar db_schema: A database schema name to store the database tables
47
+ :ivar source_tree: A text representation of the source data model tree
48
+ :ivar target_tree: A text representation of the simplified data model tree \
49
+ which will be used to create target tables
50
+ """
51
+
52
def __init__(
    self,
    xsd_file: str,
    short_name: str = None,
    long_name: str = None,
    base_url: str = None,
    model_config: dict = None,
    connection_string: str = None,
    db_schema: str = None,
    temp_prefix: str = None,
):
    """Load the XSD schema, validate the global options and build the model.

    :param xsd_file: A path to a XSD file
    :param short_name: A short name for the schema, stored as `data_flow_name`
    :param long_name: A longer name for the schema, stored as `data_flow_long_name`
    :param base_url: The root folder used to resolve the schema; when `None`, the
        folder of `xsd_file` is used and only the file name is passed to xmlschema
    :param model_config: A config dict; it is shallow-copied, so the defaults and
        adjustments applied here never leak back into the caller's dict
    :param connection_string: A database connection string; when `None` no engine
        is created and a warning is logged
    :param db_schema: A schema name to use in the database
    :param temp_prefix: A prefix for temporary tables (random 8 characters if `None`)
    :raises DataModelConfigError: if `row_numbers` or `as_columnstore` is provided
        in the config and is not a bool
    """
    if base_url is None:
        schema_source = os.path.basename(xsd_file)
        schema_base_url = os.path.normpath(os.path.dirname(xsd_file))
    else:
        schema_source = xsd_file
        schema_base_url = base_url
    self.xml_schema = xmlschema.XMLSchema(schema_source, base_url=schema_base_url)
    self.xml_converter = XMLConverter(data_model=self)
    self.data_flow_name = short_name
    self.data_flow_long_name = long_name

    if connection_string is None:
        logger.warning(
            "DataModel created without connection string cannot do actual imports"
        )
        self.engine = None
    else:
        engine_options = {}
        if "mssql" in connection_string:
            engine_options = {
                "fast_executemany": True,
            }
        self.engine = create_engine(
            connection_string,
            isolation_level="SERIALIZABLE",
            **engine_options,
        )

    # Work on a shallow copy: the global options below are written into the
    # config, and writing them into the caller's dict would silently alter any
    # other DataModel later built from the same dict (e.g. "as_columnstore"
    # forced to False by a non-MSSQL engine).
    self.model_config = {} if model_config is None else dict(model_config)

    # validate row_numbers global option value
    if "row_numbers" in self.model_config:
        if not isinstance(self.model_config["row_numbers"], bool):
            raise DataModelConfigError("row_numbers must be a bool")
    else:
        self.model_config["row_numbers"] = False

    # as_columnstore global option available only for MSSQL database
    if "as_columnstore" in self.model_config:
        if not isinstance(self.model_config["as_columnstore"], bool):
            raise DataModelConfigError("as_columnstore must be a bool")
        if (
            self.model_config["as_columnstore"]
            and self.engine
            and not self.engine.dialect.name == "mssql"
        ):
            self.model_config["as_columnstore"] = False
            logger.info(
                "Clustered columnstore indexes are only supported with MS SQL Server database, noop"
            )
    else:
        self.model_config["as_columnstore"] = False

    self.db_schema = db_schema
    self.temp_prefix = str(uuid4())[:8] if temp_prefix is None else temp_prefix

    # containers filled by _build_model()
    self.tables = {}
    self.names_types_map = {}
    self.root_table = None
    self.types_transforms = {}
    self.fields_transforms = {}
    self.ordered_tables_keys = []
    self.source_tree = ""
    self.target_tree = ""
    self.metadata = MetaData()
    self.processed_at = datetime.now()

    self._build_model()
130
+
131
+ @property
132
+ def fk_ordered_tables(
133
+ self,
134
+ ) -> Iterable[Union[DataModelTableDuplicated, DataModelTableReused]]:
135
+ """Yields tables in create/insert order (tables referenced in foreign keys first)"""
136
+ for key in self.ordered_tables_keys:
137
+ yield self.tables[key]
138
+
139
+ @property
140
+ def fk_ordered_tables_reversed(
141
+ self,
142
+ ) -> Iterable[Union[DataModelTableDuplicated, DataModelTableReused]]:
143
+ """Yields tables in drop/delete order (tables referencing foreign keys first)"""
144
+ for key in reversed(self.ordered_tables_keys):
145
+ yield self.tables[key]
146
+
147
def _create_table_model(
    self,
    table_name: str,
    type_name: str,
    is_root_table: bool = False,
    is_virtual_node: bool = False,
) -> Union[DataModelTableReused, DataModelTableDuplicated]:
    """Instantiate the table model matching the table's configuration.

    The table's entry in `model_config["tables"]` is looked up by `table_name`;
    its `reuse` flag (default `True`) selects the class to instantiate.

    :param table_name: name of the table
    :param type_name: type of the table
    :param is_root_table: is this table the root table?
    :param is_virtual_node: was this table created to store multiple root elements?
    :return: a `DataModelTableReused` when `reuse` is truthy, otherwise a
        `DataModelTableDuplicated`
    """
    table_config = self.model_config.get("tables", {}).get(table_name, {})
    # both classes are called with exactly the same positional arguments
    if table_config.get("reuse", True):
        table_class = DataModelTableReused
    else:
        table_class = DataModelTableDuplicated
    return table_class(
        table_name,
        type_name,
        is_root_table,
        is_virtual_node,
        self.metadata,
        table_config,
        self.db_schema,
        self.temp_prefix,
        self,
    )
187
+
188
def _build_model(self):
    """Build the whole model from the loaded XSD schema and the config.

    Steps, in order: parse the schema into table models, check the per-table
    config, simplify the model, drop the tables not flagged to be kept, sort
    the remaining tables by dependencies and build the sqlalchemy objects.

    :raises DataModelConfigError: if the config mentions a table name that was
        not produced while parsing the schema
    """
    # a schema holding exactly one item is parsed from that item, otherwise the
    # schema object itself is used as the starting node
    if len(self.xml_schema) == 1:
        start_node = self.xml_schema[0]
    else:
        start_node = self.xml_schema
    root_table = self._parse_tree(start_node, is_root_table=True)
    self.root_table = root_table.type_name

    # text representation of the original (not simplified) data model
    self.source_tree = "\n".join(self._repr_tree(root_table))

    # every table named in the user-provided config must exist in the parsed model
    for tb_config in self.model_config.get("tables", {}):
        if tb_config in self.names_types_map:
            continue
        raise DataModelConfigError(
            f"Table '{tb_config}' provided in config does not exist"
        )

    # simplify the data model recursively, starting from the root table
    self.types_transforms, self.fields_transforms = root_table.simplify_table()

    # only tables carrying a "keep_table" attribute survive; the root always does
    root_table.keep_table = True
    kept_tables = {}
    for type_key, table in self.tables.items():
        if hasattr(table, "keep_table"):
            kept_tables[type_key] = table
    self.tables = kept_tables

    # text representation of the simplified data model
    self.target_tree = "\n".join(self._repr_tree(root_table))

    # let each remaining table compute its dependencies
    for table in self.tables.values():
        table.compute_dependencies()

    # order table keys so that dependencies come before the tables needing them
    sorter = TopologicalSorter()
    for type_key, table in self.tables.items():
        sorter.add(type_key, *sorted(table.dependencies))
    self.ordered_tables_keys = list(sorter.static_order())

    # build the sqlalchemy tables last (it cannot be done before simplification,
    # because recreating tables that already exist in the sqlalchemy metadata fails)
    for table in self.fk_ordered_tables:
        table.build_sqlalchemy_tables()
229
+
230
def _parse_tree(
    self, parent_node: xmlschema.XsdElement, is_root_table: bool = False
):
    """Recursively turn an XSD node into a table model, without simplification.

    Each node type gets one table model, registered in `self.tables` under its
    type name; meeting the same type again returns the table already built.
    Tables are named after the node's local name. As different types may share
    an element name, a numeric suffix (`_1`, `_2`, ...) is appended until the
    name is unused, and the name/type association is recorded in
    `self.names_types_map`. Attributes and simple-typed children become
    columns, complex-typed children become relations to recursively parsed
    tables.

    :param parent_node: the current XSD node being parsed
    :param is_root_table: True if this is the root table
    :return: the table model for this node (new or previously created)
    :raises ValueError: if a child is not exactly an `xmlschema.XsdElement`, or
        if its type is neither simple nor complex
    """
    # type name of the node: its type's local name, the data flow name when the
    # node has no "type" attribute, or the node's own local name as a fallback
    if hasattr(parent_node, "type"):
        node_type = parent_node.type.local_name
    else:
        node_type = self.data_flow_name
    if node_type is None:
        node_type = parent_node.local_name

    # a type already encountered maps to its existing table
    if node_type in self.tables:
        return self.tables[node_type]

    # pick a table name which is not yet associated with another type
    if hasattr(parent_node, "local_name"):
        table_name = parent_node.local_name
    else:
        table_name = self.data_flow_name
    if table_name in self.names_types_map:
        suffix = 1
        while "_".join((table_name, str(suffix))) in self.names_types_map:
            suffix += 1
        table_name = "_".join((table_name, str(suffix)))
    self.names_types_map[table_name] = node_type

    # create and register the table before visiting children, so that
    # a recursive reference to this type finds it
    table = self._create_table_model(
        table_name,
        node_type,
        is_root_table,
        isinstance(parent_node, xmlschema.XMLSchema),
    )
    self.tables[node_type] = table

    def describe_simple_type(candidates):
        """Return (data type, min length, max length, allow empty) for a list of simple types"""
        if len(candidates) > 1:
            # several candidates: describe each one alone, then merge
            described = [describe_simple_type([candidate]) for candidate in candidates]
            type_names, lower_bounds, upper_bounds, empties = zip(*described)
            if len(set(type_names)) == 1:
                merged_type = type_names[0]
            else:
                merged_type = "string"
            if all(bound is not None for bound in lower_bounds):
                merged_min = min(lower_bounds)
            else:
                merged_min = None
            if all(bound is not None for bound in upper_bounds):
                merged_max = max(upper_bounds)
            else:
                merged_max = None
            return merged_type, merged_min, merged_max, any(empties)

        simple_type = candidates[0]
        if simple_type.is_union():
            return describe_simple_type(simple_type.base_type.member_types)
        if not simple_type.is_restriction():
            return (
                simple_type.local_name,
                simple_type.min_length,
                simple_type.max_length,
                simple_type.allow_empty,
            )

        type_name = simple_type.base_type.local_name
        shortest = simple_type.min_length
        longest = simple_type.max_length
        empty_ok = simple_type.allow_empty
        if simple_type.base_type.is_restriction():
            # restriction of a restriction: combine with the base type's description
            base_name, base_shortest, base_longest, base_empty_ok = describe_simple_type(
                [simple_type.base_type]
            )
            type_name = base_name
            if shortest is not None and base_shortest is not None:
                shortest = min(shortest, base_shortest)
            else:
                shortest = None
            if longest is not None and base_longest is not None:
                longest = max(longest, base_longest)
            else:
                longest = None
            if empty_ok is not None and base_empty_ok is not None:
                empty_ok = empty_ok and base_empty_ok
            else:
                empty_ok = None
        if simple_type.enumeration is not None:
            # enumerated values set the length bounds
            value_lengths = [len(val) for val in simple_type.enumeration]
            shortest = min(value_lengths)
            longest = max(value_lengths)
        return type_name, shortest, longest, empty_ok

    def cumulated_occurs(particle):
        """Return [min, max] occurrences of a particle, combined with its parents having a "model" attribute"""
        inherited = [1, 1]
        if particle.parent and hasattr(particle.parent, "model"):
            inherited = cumulated_occurs(particle.parent)
        lower = min(inherited[0], particle.min_occurs)
        if inherited[1] is not None and particle.max_occurs is not None:
            upper = max(inherited[1], particle.max_occurs)
        else:
            # None on either side is kept as None
            upper = None
        return [lower, upper]

    # attributes of the node become columns
    for attrib_name, attrib in parent_node.attributes.items():
        attr_type, attr_min, attr_max, attr_empty = describe_simple_type(
            [attrib.type]
        )
        table.add_column(
            f"{attrib_name}",
            attr_type,
            [0, 1],
            attr_min,
            attr_max,
            True,
            False,
            attr_empty,
            None,
        )

    # "container_stack" allows ordering nodes in mostly correct order in case of
    # nested sequences with multiple occurrences when generating XML. For instance,
    # for a sequence A, B with max occur > 1, we want to generate A, B, A, B and not
    # A, A, B, B, thus A and B are marked as members of the same "ngroup".
    container_stack = []
    for child in parent_node:
        if type(child) is not xmlschema.XsdElement:
            raise ValueError("unknown case; please check (child not an XsdElement)")

        container = child.parent
        if len(container_stack) > 1 and container == container_stack[-2][0]:
            # back in the enclosing container: leave the innermost one
            container_stack.pop()
        elif len(container_stack) == 0 or container != container_stack[-1][0]:
            # entering a new container; it only gets a group key when it can
            # occur a number of times other than 1 and is not a choice
            if container and container.max_occurs != 1 and container.model != "choice":
                group_key = str(hash(container))
            else:
                group_key = None
            container_stack.append((container, group_key))

        child_type = child.type
        if (
            child_type.is_complex()
            and len(child) == 0
            and child_type.base_type is not None
        ):
            # complex type without child: use its base type instead
            child_type = child_type.base_type

        if child_type.is_simple():
            col_type, col_min, col_max, col_empty = describe_simple_type([child_type])
            occurs = cumulated_occurs(child)
            table.add_column(
                child.local_name,
                col_type,
                occurs,
                col_min,
                col_max,
                False,
                False,
                col_empty,
                container_stack[-1][1],
            )
        elif child_type.is_complex():
            child_table = self._parse_tree(child)
            if child_type.model_group.model == "choice":
                child_table.model_group = "choice"
            else:
                child_table.model_group = "sequence"
            occurs = cumulated_occurs(child)
            if child.is_single():
                add_relation = table.add_relation_1
            else:
                add_relation = table.add_relation_n
            add_relation(
                child.local_name,
                child_table,
                occurs,
                container_stack[-1][1],
            )
        else:
            raise ValueError("unknown case; please check")

    # a node whose type has mixed content gets an additional "value" string column
    if hasattr(parent_node, "type") and parent_node.type.has_mixed_content():
        table.add_column(
            "value",
            "string",
            [0, 1],
            0,
            None,
            False,
            True,
            True,
            None,
        )

    return table
466
+
467
+ def _repr_tree(
468
+ self,
469
+ parent_table: Union[DataModelTableReused, DataModelTableDuplicated],
470
+ visited_nodes=None,
471
+ ):
472
+ """Build a text representation of the data model tree
473
+
474
+ :param parent_table: the current data model table object
475
+ """
476
+ if visited_nodes is None:
477
+ visited_nodes = set()
478
+ else:
479
+ visited_nodes = {item for item in visited_nodes}
480
+ visited_nodes.add(parent_table.name)
481
+ for field_type, name, field in parent_table.fields:
482
+ if field_type == "col":
483
+ yield f"{field.name}{field.occurs}: {field.data_type}"
484
+ elif field_type == "rel1":
485
+ mg = " (choice)" if field.other_table.model_group == "choice" else ""
486
+ yield f"{field.name}{field.occurs}{mg}:{' ...' if field_type in visited_nodes else ''}"
487
+ if field.other_table.name not in visited_nodes:
488
+ for line in self._repr_tree(field.other_table, visited_nodes):
489
+ yield f" {line}"
490
+ elif field_type == "reln":
491
+ mg = " (choice)" if field.other_table.model_group == "choice" else ""
492
+ yield f"{field.name}{field.occurs}{mg}:{' ...' if field_type in visited_nodes else ''}"
493
+ for line in self._repr_tree(field.other_table, visited_nodes):
494
+ yield f" {line}"
495
+
496
+ def get_entity_rel_diagram(self, text_context=True) -> str:
497
+ """Build an entity relationship diagram for the data model
498
+
499
+ The ERD syntax is used by mermaid.js to create a visual representation of the diagram, which is supported
500
+ by Pycharm IDE or GitHub in markdown files, among others
501
+
502
+ :param text_context: Should we add a title, a text explanation, etc. or just the ERD?
503
+ :return: A string representation of the ERD
504
+ """
505
+ out = ["erDiagram"]
506
+ for tb in self.fk_ordered_tables_reversed:
507
+ out += tb.get_entity_rel_diagram()
508
+
509
+ if text_context:
510
+ out = (
511
+ [
512
+ f"# {self.data_flow_long_name}\n",
513
+ f"### Data model name: `{self.data_flow_name}`\n",
514
+ (
515
+ "The following *Entity Relationships Diagram* represents the target data model, after the "
516
+ "simplification of the source data model, but before the transformations performed to optimize "
517
+ "data storage (transformation of `1-1` and `1-n` relationships into `n-1` and `n-n` "
518
+ "relationships, respectively, as described [here](../../docs/recycling_nodes.md)).\n"
519
+ ),
520
+ (
521
+ "As a consequence, not all tables of the actual data model used in the database are shown. "
522
+ "Specifically, `1-n` relationships presented may be stored in the database using an additional "
523
+ "relationship table (noted with an asterisk in the relationship name).\n"
524
+ ),
525
+ "```mermaid",
526
+ ]
527
+ + out
528
+ + [
529
+ "```",
530
+ (
531
+ "`-N` suffix in field type indicates that the field can have multiple values, which will be "
532
+ "stored as comma separated values."
533
+ ),
534
+ ]
535
+ )
536
+ return "\n".join(out)
537
+
538
+ def get_all_create_table_statements(self, temp=False) -> Iterable[CreateTable]:
539
+ """Yield create table statements for all tables
540
+
541
+ :param temp: If True, yield create table statements for temporary tables (prefixed)
542
+ """
543
+ for tb in self.fk_ordered_tables:
544
+ yield from tb.get_create_table_statements(temp)
545
+
546
+ def get_all_create_index_statements(self) -> Iterable[CreateIndex]:
547
+ """Yield create index statements for all tables"""
548
+ for tb in self.fk_ordered_tables:
549
+ yield from tb.get_create_index_statements()
550
+
551
+ def create_all_tables(self, temp: bool = False) -> None:
552
+ """Create tables for the data model, either target tables or temp tables used to import data
553
+
554
+ :param temp: If True, create temporary (prefixed) tables
555
+ """
556
+ for tb in self.fk_ordered_tables:
557
+ tb.create_tables(self.engine, temp)
558
+
559
def create_db_schema(self) -> None:
    """Create the database schema `db_schema`, unless it is unset or already listed.

    Nothing is done when `db_schema` is `None`, or when the schema name is
    already among the names reported by the engine's inspector.
    """
    if self.db_schema is None:
        return
    existing_schemas = inspect(self.engine).get_schema_names()
    if self.db_schema in existing_schemas:
        return
    with self.engine.connect() as connection:
        connection.execute(sqlalchemy.schema.CreateSchema(self.db_schema))
        connection.commit()
    logger.info(f"Created schema: {self.db_schema}")
568
+
569
+ def drop_all_tables(self):
570
+ """Drop the data model target (unprefixed) tables.
571
+
572
+ BE CAUTIOUS, THIS METHOD DROPS TABLES WITHOUT FURTHER NOTICE!
573
+
574
+ """
575
+ for tb in self.fk_ordered_tables_reversed:
576
+ tb.drop_tables(self.engine)
577
+
578
+ def drop_all_temp_tables(self):
579
+ """Drop the data model temporary (prefixed) tables.
580
+
581
+ BE CAUTIOUS, THIS METHOD DROPS TABLES WITHOUT FURTHER NOTICE!
582
+
583
+ """
584
+ for tb in self.fk_ordered_tables_reversed:
585
+ tb.drop_temp_tables(self.engine)
586
+
587
def parse_xml(
    self,
    xml_file: Union[str, BytesIO],
    xml_file_path: str = None,
    skip_validation: bool = True,
) -> document.Document:
    """Parse an XML document with this data model and return it.

    A new `Document` bound to this data model is created, and all arguments
    are forwarded unchanged to its `parse_xml` method.

    :param xml_file: The path or the file object of an XML file to parse
    :param xml_file_path: The path of the XML file, mandatory if xml_file is file object in order to fill the \
        'xml2db_input_file_path' column of the root table.
    :param skip_validation: Defaults to `True`; presumably the validation of the
        document against the schema is skipped when true (to be confirmed in
        `Document.parse_xml`)
    :return: A parsed `Document` object
    """
    parsed_document = document.Document(self)
    parsed_document.parse_xml(xml_file, xml_file_path, skip_validation)
    return parsed_document
606
+
607
def extract_from_database(
    self,
    root_select_where: str,
) -> document.Document:
    """Load a document from the database, filtering the rows of the root table.

    A new `Document` bound to this data model is created, and its
    `extract_from_database` method is called with `self.root_table` and the
    provided where clause. For instance, the column `xml2db_input_file_path`
    can be used to filter the data loaded from a specific file.

    :param root_select_where: A where clause to apply to this root table, as a string
    :return: A `Document` object containing extracted data
    """
    extracted_document = document.Document(self)
    extracted_document.extract_from_database(self.root_table, root_select_where)
    return extracted_document
@@ -0,0 +1,5 @@
1
+ from .table import DataModelTable
2
+ from .reused_table import DataModelTableReused
3
+ from .duplicated_table import DataModelTableDuplicated
4
+ from .relations import DataModelRelation1, DataModelRelationN
5
+ from .column import DataModelColumn