icsDataValidation-1.0.358-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. icsDataValidation/configuration.py +19 -0
  2. icsDataValidation/connection_setups/__init__.py +0 -0
  3. icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
  4. icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
  5. icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
  6. icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
  7. icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
  8. icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
  9. icsDataValidation/core/__init__.py +0 -0
  10. icsDataValidation/core/database_objects.py +18 -0
  11. icsDataValidation/core/object_comparison.py +239 -0
  12. icsDataValidation/input_parameters/__init__.py +0 -0
  13. icsDataValidation/input_parameters/testing_tool_params.py +81 -0
  14. icsDataValidation/main.py +250 -0
  15. icsDataValidation/output_parameters/__init__.py +0 -0
  16. icsDataValidation/output_parameters/result_params.py +94 -0
  17. icsDataValidation/services/__init__.py +0 -0
  18. icsDataValidation/services/comparison_service.py +582 -0
  19. icsDataValidation/services/database_services/__init__.py +0 -0
  20. icsDataValidation/services/database_services/azure_service.py +320 -0
  21. icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
  22. icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
  23. icsDataValidation/services/database_services/exasol_service.py +261 -0
  24. icsDataValidation/services/database_services/oracle_service.py +713 -0
  25. icsDataValidation/services/database_services/snowflake_service.py +1100 -0
  26. icsDataValidation/services/database_services/teradata_service.py +665 -0
  27. icsDataValidation/services/initialization_service.py +103 -0
  28. icsDataValidation/services/result_service.py +573 -0
  29. icsDataValidation/services/system_service.py +61 -0
  30. icsDataValidation/services/testset_service.py +257 -0
  31. icsDataValidation/utils/__init__.py +0 -0
  32. icsDataValidation/utils/file_util.py +96 -0
  33. icsDataValidation/utils/logger_util.py +96 -0
  34. icsDataValidation/utils/pandas_util.py +159 -0
  35. icsDataValidation/utils/parallelization_util.py +52 -0
  36. icsDataValidation/utils/sql_util.py +14 -0
  37. icsDataValidation-1.0.358.dist-info/METADATA +21 -0
  38. icsDataValidation-1.0.358.dist-info/RECORD +40 -0
  39. icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
  40. icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
icsDataValidation/services/database_services/databricks_hive_metastore_service.py
@@ -0,0 +1,1694 @@
1
+ from databricks import sql as databricks_sqlconnect
2
+ import pandas as pd
3
+ import logging
4
+ import re
5
+ from datetime import datetime
6
+
7
+ from typing import Union, List, Dict
8
+ from pathlib import PurePath
9
+
10
+ from icsDataValidation.utils.logger_util import configure_dev_ops_logger
11
+ from icsDataValidation.core.database_objects import DatabaseObject
12
+
13
+ #########################################################################################
14
+ #########################################################################################
15
+
16
+ # Configure Dev Ops Logger
17
+
18
+ logger = logging.getLogger("Databricks_Hive_Metastore_Service")
19
+ logger.setLevel(logging.INFO)
20
+ configure_dev_ops_logger(logger)
21
+
22
+
23
+ class DatabricksHiveMetastoreService(object):
24
+ def __init__(self, connection_params: dict):
25
+ self.connection_params = connection_params
26
+ self.databricks_connection = None
27
+ self.databricks_datatype_mapping = {
28
+ "string": ["string", "array", "map", "struct"],
29
+ "numeric": [
30
+ "int",
31
+ "bigint",
32
+ "double",
33
+ "decimal",
34
+ "float",
35
+ "smallint",
36
+ "tinyint",
37
+ ],
38
+ "date_and_time": [
39
+ "timestamp",
40
+ "date",
41
+ "interval",
42
+ "timestamp_ntz",
43
+ "timestamp_tz",
44
+ "timestamp_ltz",
45
+ ],
46
+ "binary": ["binary"],
47
+ "boolean": ["boolean"],
48
+ }
49
+
50
+ def __enter__(self):
51
+ return self
52
+
53
+ def __exit__(self, exception_type, exception_value, traceback):
54
+ if self.databricks_connection is not None:
55
+ self.databricks_connection.close()
56
+
57
+ def __del__(self):
58
+ if self.databricks_connection is not None:
59
+ self.databricks_connection.close()
60
+
61
+ def _connect_to_databricks(self):
62
+ self.databricks_connection = databricks_sqlconnect.connect(
63
+ **self.connection_params
64
+ )
65
+ return self.databricks_connection
66
+
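For orientation, a minimal usage sketch (not part of the package): the connection parameters below are placeholders for the standard databricks-sql-connector arguments, which the service forwards unchanged to databricks.sql.connect(); the context-manager methods above close the connection on exit.

from icsDataValidation.services.database_services.databricks_hive_metastore_service import (
    DatabricksHiveMetastoreService,
)

# Placeholder values - any keyword accepted by databricks.sql.connect() can be used here.
connection_params = {
    "server_hostname": "adb-1234567890123456.7.azuredatabricks.net",
    "http_path": "/sql/1.0/warehouses/abc123",
    "access_token": "<personal-access-token>",
}

with DatabricksHiveMetastoreService(connection_params) as service:
    rows = service.execute_queries("SELECT 1 AS ONE")  # connects lazily on first use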
67
+ @staticmethod
68
+ def _get_error_message(exception: Exception, statement: str) -> str:
69
+ """
70
+ Compose error message if the execution of a statement or query fails.
71
+ """
72
+ if hasattr(exception, "raw_msg"):
73
+ message = exception.raw_msg.replace("\n", " ")
74
+ else:
75
+ message = str(
76
+ exception
77
+ ) # ensures every kind of error yields a message, even without a raw_msg attribute
78
+ if hasattr(exception, "sfqid"):
79
+ message = message + f"\nQuery ID: {exception.sfqid}"
80
+ return f"Databricks ERROR: {message}\nFailed statement:\n{statement}"
81
+
82
+ @staticmethod
83
+ def _get_in_clause(
84
+ key_filters: list,
85
+ numeric_columns: list,
86
+ numeric_scale: int,
87
+ where_exists: bool = True,
88
+ ) -> str:
89
+ """generates in_clause from list ready to expand the where clause, numeric values are rounded
90
+
91
+ Args:
92
+ key_filters (dict): mapping of key columns to their expected values
93
+ numeric_columns (list): list of all numeric columns
94
+ numeric_scale (int): number of decimal places after rounding
95
+
96
+ Returns:
97
+ str: in clause as string
98
+ """
99
+ values = list(key_filters.values())
100
+ in_clause_values = "('"
101
+ for j in range(len(values[0])):
102
+ for value in values:
103
+ in_clause_values += str(value[j]) + "','"
104
+ in_clause_values = in_clause_values[:-2] + "),('"
105
+ in_clause_values = in_clause_values[:-3] + ")"
106
+
107
+ if where_exists:
108
+ in_clause_cols = f" AND (("
109
+ else:
110
+ in_clause_cols = f" WHERE (("
111
+ for key in key_filters.keys():
112
+ if key in numeric_columns:
113
+ in_clause_cols += f"""ROUND({key.replace("'", "")},2)""" + ","
114
+ else:
115
+ in_clause_cols += key.replace("'", "") + ","
116
+ in_clause_cols = in_clause_cols[:-1] + ")"
117
+ in_clause = in_clause_cols + " in (" + in_clause_values + ")"
118
+ return in_clause
119
+
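A small sketch of what this helper builds, assuming two made-up key columns with two sampled key tuples (note that the ROUND scale is currently hardcoded to 2, independent of numeric_scale):

key_filters = {"ID": [1, 2], "REGION": ["EU", "US"]}

in_clause = DatabricksHiveMetastoreService._get_in_clause(
    key_filters, numeric_columns=["ID"], numeric_scale=2, where_exists=True
)
# Roughly: " AND ((ROUND(ID,2),REGION) in (('1','EU'),('2','US')))"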
120
+ def _get_column_clause(
121
+ self, column_list: list, columns_datatype: list, numeric_scale, key_columns
122
+ ) -> tuple:
123
+ """turns list of desired columns into a sql compatible string
124
+
125
+ Args:
126
+ column_list (list): list of all columns
127
+ columns_datatype (list): datatypes of given columns
128
+ numeric_scale (int): number of decimal places for numeric columns
129
+ key_columns (list): list of key columns of interest
130
+
131
+ Returns:
132
+ tuple: column clause as string, list of numeric columns, list of used columns
133
+ """
134
+ column_intersections_new = []
135
+ used_columns = []
136
+ numeric_columns = []
137
+ for column in column_list:
138
+ column_datatype = next(
139
+ x for x in columns_datatype if x["COLUMN_NAME"] == column
140
+ )["DATA_TYPE"]
141
+
142
+ if column in key_columns or not (
143
+ column_datatype.lower()
144
+ in self.databricks_datatype_mapping["date_and_time"]
145
+ ):
146
+ if (
147
+ column_datatype.lower()
148
+ in self.databricks_datatype_mapping["numeric"]
149
+ ):
150
+ if numeric_scale:
151
+ column_intersections_new.append(
152
+ f"CAST(ROUND({column}, {numeric_scale}) as decimal(38,{numeric_scale})) as {column}"
153
+ )
154
+ else:
155
+ column_intersections_new.append(f"{column} as {column}")
156
+ used_columns.append(column)
157
+ numeric_columns.append(column)
158
+ elif (
159
+ column_datatype.lower()
160
+ in self.databricks_datatype_mapping["string"]
161
+ ):
162
+ column_intersections_new.append(f"{column} AS {column}")
163
+ used_columns.append(column)
164
+ else:
165
+ column_intersections_new.append(column)
166
+ used_columns.append(column)
167
+
168
+ column_intersections = column_intersections_new.copy()
169
+ column_clause = str(column_intersections)[1:-1].replace("'", "")
170
+ return column_clause, numeric_columns, used_columns
171
+
172
+ def get_database_objects(
173
+ self,
174
+ database: str,
175
+ schema: str = None,
176
+ object_type_restriction: str = "include_all",
177
+ ) -> dict:
178
+ if self.databricks_connection is None:
179
+ self._connect_to_databricks()
180
+
181
+ all_database_tables = []
182
+ all_database_views = []
183
+
184
+ if (
185
+ object_type_restriction == "include_all"
186
+ or object_type_restriction == "include_only_tables"
187
+ ):
188
+ if schema:
189
+ query_db_tables = f"SHOW TABLES IN {database}.{schema}"
190
+ else:
191
+ logger.error(
192
+ "Query defined as null - please check input for execute_queries function."
193
+ )
194
+ exit()
195
+
196
+ all_database_tables = self.execute_queries(query_db_tables)
197
+
198
+ if (
199
+ object_type_restriction == "include_all"
200
+ or object_type_restriction == "include_only_views"
201
+ ):
202
+ if schema:
203
+ query_db_views = f"SHOW VIEWS IN {schema}"
204
+ else:
205
+ logger.error(
206
+ "Query defined as null - please check input for execute_queries function."
207
+ )
208
+ exit()
209
+
210
+ all_database_views = self.execute_queries(query_db_views)
211
+
212
+ database_objects = []
213
+ for row in all_database_tables:
214
+ database_table = (
215
+ f'hive_metastore.{row["database"]}.{row["tableName"]}'.upper()
216
+ )
217
+ database_objects.append(
218
+ {"object_identifier": database_table, "object_type": "table"}
219
+ )
220
+ for row in all_database_views:
221
+ database_view = f'{row["TABLE_CATALOG"]}.{row["TABLE_SCHEMA"]}.{row["TABLE_NAME"]}'.upper()
222
+ database_objects.append(
223
+ {"object_identifier": database_view, "object_type": "view"}
224
+ )
225
+ return database_objects
226
+
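A usage sketch, reusing the service instance from the connection example above; the database and schema names are placeholders:

objects = service.get_database_objects(
    database="hive_metastore", schema="sales", object_type_restriction="include_only_tables"
)
# Each entry has the shape:
#   {"object_identifier": "HIVE_METASTORE.SALES.ORDERS", "object_type": "table"}
for obj in objects:
    print(obj["object_identifier"], obj["object_type"])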
227
+ def get_last_altered_timestamp_from_object(self, object: DatabaseObject) -> str:
228
+ """queries last_altered timestamp for given object
229
+
230
+ Args:
231
+ object (str): object for comparison
232
+
233
+ Returns:
234
+ str: last_altered timestamp
235
+ """
236
+ if self.databricks_connection is None:
237
+ self._connect_to_databricks()
238
+
239
+ self.execute_statement("ALTER SESSION SET TIMEZONE = 'Europe/London';")
240
+
241
+ query_get_last_altered = f"SELECT LAST_ALTERED FROM {object.database}.INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = '{object.name}' AND TABLE_SCHEMA = '{object.schema}';"
242
+
243
+ last_altered = self.execute_queries(query_get_last_altered)[0]
244
+
245
+ return last_altered
246
+
247
+ def get_columns_from_object(self, object: DatabaseObject) -> list:
248
+ """returns all columns from given object
249
+
250
+ Args:
251
+ object (DatabaseObject): table or view
252
+
253
+ Returns:
254
+ list: list of all columns
255
+ """
256
+
257
+ if self.databricks_connection is None:
258
+ self._connect_to_databricks()
259
+
260
+ query_show_columns = (
261
+ f"SHOW COLUMNS IN {object.database}.{object.schema}.{object.name};"
262
+ )
263
+
264
+ all_columns = self.execute_queries(query_show_columns)
265
+ columns = []
266
+
267
+ for row in all_columns:
268
+ columns.append(row["col_name"])
269
+
270
+ return columns
271
+
272
+ def get_row_count_from_object(self, object: DatabaseObject, where_clause: str = "") -> tuple:
273
+ """gets row count from given object
274
+
275
+ Args:
276
+ object (DatabaseObject): table or view
277
+
278
+ Returns:
279
+ tuple: number of rows in the object, and a list of errors from the failed count query
280
+ """
281
+
282
+ if self.databricks_connection is None:
283
+ self._connect_to_databricks()
284
+
285
+ # is it more efficient to select the information_schema.table view to get the rows?
286
+ query_get_row_count = f"SELECT COUNT(*) AS ROW_COUNT FROM {object.database}.{object.schema}.{object.name} {where_clause};"
287
+ row_count = -1
288
+ error_list = []
289
+
290
+ try:
291
+ row_count = self.execute_queries(query_get_row_count)[0]["ROW_COUNT"]
292
+
293
+ except Exception as err:
294
+ error_list.append(str(err))
295
+ error_list.append(query_get_row_count)
296
+
297
+ return row_count, error_list
298
+
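A sketch of the two-part return value; my_table stands for a DatabaseObject built elsewhere and the filter is illustrative:

row_count, errors = service.get_row_count_from_object(
    my_table, where_clause="WHERE REGION = 'EU'"
)
# On success: (actual count, []); on failure: (-1, [error message, failed query]).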
299
+ def get_data_types_from_object(
300
+ self, object: DatabaseObject, column_intersections: list
301
+ ) -> dict:
302
+ """returns datatypes for all intersection columns in a database object
303
+
304
+ Args:
305
+ object (DatabaseObject): table or view
306
+ column_intersections (list): columns for which the data type is queried
307
+
308
+ Returns:
309
+ dict: columns and their datatype
310
+ """
311
+
312
+ if self.databricks_connection is None:
313
+ self._connect_to_databricks()
314
+
315
+ column_intersections = str(column_intersections)[1:-1]
316
+ if column_intersections == "":
317
+ column_intersections = "''"
318
+
319
+ query_get_data_types_from_object = (
320
+ f"DESCRIBE TABLE {object.database}.{object.schema}.{object.name};"
321
+ )
322
+
323
+ table_description = self.execute_queries(query_get_data_types_from_object)
324
+
325
+ dict_colummns_datatype = []
326
+
327
+ for row in table_description:
328
+ dict_colummns_datatype.append(
329
+ {"COLUMN_NAME": row["col_name"], "DATA_TYPE": row["data_type"]}
330
+ )
331
+ return dict_colummns_datatype
332
+
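Illustrative shape of the DESCRIBE-based result (my_table is again a hypothetical DatabaseObject; the list contains one entry per row that DESCRIBE TABLE emits):

dtypes = service.get_data_types_from_object(my_table, ["AMOUNT", "STATUS"])
# e.g. [{"COLUMN_NAME": "amount", "DATA_TYPE": "decimal(18,2)"},
#       {"COLUMN_NAME": "status", "DATA_TYPE": "string"}, ...]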
333
+ def get_count_distincts_from_object(
334
+ self,
335
+ object: DatabaseObject,
336
+ column_intersections: list,
337
+ where_clause: str = "",
338
+ exclude_columns: list = [],
339
+ ) -> dict:
340
+ """get distinct count for every column in a database object that is in column intersections list
341
+
342
+ Args:
343
+ object (DatabaseObject): table or view
344
+ column_intersections (list): columns that are used for distinct count
345
+ where_clause (str, optional): optional further filter. Defaults to "".
346
+ exclude_columns (list, optional): columns to exclude from distinct count. Defaults to [].
347
+
348
+ Returns:
349
+ dict: distinct counts for columns
350
+ error_list: list of failed executions for distinct counts
351
+ """
352
+
353
+ if self.databricks_connection is None:
354
+ self._connect_to_databricks()
355
+
356
+ unions = ""
357
+
358
+ for column in column_intersections:
359
+ if column not in exclude_columns:
360
+ unions += f" UNION SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.database}.{object.schema}.{object.name} {where_clause}"
361
+
362
+ query_get_count_distincts_from_object = f"{unions[6:]} ORDER BY COUNT_DISTINCT;"
363
+ error_list = []
364
+ try:
365
+ dict_count_distincts = self.execute_queries(
366
+ query_get_count_distincts_from_object
367
+ )
368
+
369
+ except Exception as err:
370
+ # raise err
371
+ dict_count_distincts = [{"COUNT_DISTINCT": 0}]
372
+ error_list.append(
373
+ ["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]]
374
+ )
375
+
376
+ return dict_count_distincts, error_list
377
+
378
+ def get_table_size(self, object: DatabaseObject) -> int:
379
+ """returns size of given object
380
+
381
+ Args:
382
+ object (DatabaseObject): table or view
383
+
384
+ Returns:
385
+ int: size of object
386
+ """
387
+
388
+ if self.databricks_connection is None:
389
+ self._connect_to_databricks()
390
+
391
+ query_analyze_table = f"ANALYZE TABLE {object.database}.{object.schema}.{object.name} COMPUTE STATISTICS NOSCAN"
392
+ self.execute_queries(query_analyze_table)
393
+
394
+ query_get_table_size = (
395
+ f"DESC EXTENDED {object.database}.{object.schema}.{object.name}"
396
+ )
397
+
398
+ table_description = self.execute_queries(query_get_table_size)
399
+ size_string = [
400
+ row["data_type"]
401
+ for row in table_description
402
+ if row["col_name"] == "Statistics"
403
+ ][0]
404
+ size = int(re.search(r"\d+", size_string).group())
405
+
406
+ return size
407
+
408
+ def create_checksums(
409
+ self,
410
+ object: DatabaseObject,
411
+ column_intersections: list,
412
+ where_clause: str = "",
413
+ exclude_columns: list = [],
414
+ numeric_scale: int = None,
415
+ ) -> List[Dict]:
416
+ """creates checksums for given object in compliance with given conditions
417
+
418
+ Args:
419
+ object (DatabaseObject): table or view
420
+ column_intersections (list): columns that are used for checksums
421
+ where_clause (str, optional): Optional filter criteria given as sql-usable string. Defaults to "".
422
+ exclude_columns (list, optional): columns to exclude from the calculation. Defaults to [].
423
+ numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.
424
+
425
+ Returns:
426
+ List[Dict]: checksums for columns of object
427
+ """
428
+
429
+ if self.databricks_connection is None:
430
+ self._connect_to_databricks()
431
+
432
+ column_intersections = [
433
+ f"{x.upper()}" for x in column_intersections if x not in exclude_columns
434
+ ]
435
+
436
+ dict_colummns_datatype = self.get_data_types_from_object(
437
+ object, column_intersections
438
+ )
439
+
440
+ aggregates = ""
441
+ count_nulls = ""
442
+
443
+ for column in column_intersections:
444
+ column_datatype = next(
445
+ x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column
446
+ )["DATA_TYPE"]
447
+
448
+ count_nulls += f", SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END) AS COUNTNULLS_{column}"
449
+
450
+ if column_datatype.lower() in self.databricks_datatype_mapping["numeric"]:
451
+ if numeric_scale:
452
+ aggregates += f", CAST(ROUND(SUM({column}), {numeric_scale}) as decimal(38, {numeric_scale})) AS sum_{column}"
453
+ else:
454
+ aggregates += (
455
+ f", CAST(SUM({column}) as decimal(38)) AS sum_{column}"
456
+ )
457
+
458
+ elif (
459
+ column_datatype.lower() in self.databricks_datatype_mapping["string"]
460
+ or column_datatype.lower()
461
+ in self.databricks_datatype_mapping["date_and_time"]
462
+ ):
463
+ aggregates += (
464
+ f", COUNT(DISTINCT LOWER({column})) AS countdistinct_{column}"
465
+ )
466
+
467
+ elif column_datatype.lower() in self.databricks_datatype_mapping["binary"]:
468
+ aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS countdistinct_{column}"
469
+
470
+ elif column_datatype.lower() in self.databricks_datatype_mapping["boolean"]:
471
+ aggregates += f", MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)) || '_' || MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false)) AS aggregateboolean_{column}"
472
+
473
+ # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
474
+
475
+ query_checksums = f"SELECT {aggregates[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
476
+
477
+ query_countnulls = f"SELECT {count_nulls[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
478
+
479
+ error_list = []
480
+ checksums={}
481
+
482
+ try:
483
+ checksums_results = self.execute_queries(
484
+ [query_checksums, query_countnulls]
485
+ )
486
+
487
+ aggregation_results = checksums_results[0][0]
488
+ countnulls_results = checksums_results[1][0]
489
+
490
+ checksums = {}
491
+ for key in aggregation_results.asDict().keys():
492
+ aggregation = key.split("_", 1)[0].upper()
493
+ col_name = key.split("_", 1)[1]
494
+ value = aggregation_results[key]
495
+ cnt_nulls = countnulls_results[f"COUNTNULLS_{col_name}"]
496
+ checksums[col_name] = [aggregation, value, cnt_nulls]
497
+
498
+ except Exception as err:
499
+ # TODO: Improve error formatting
500
+ error_list.append(["ERROR",query_checksums, str(err)])
501
+
502
+ checksums["TESTATM_ERRORS"] = error_list
503
+
504
+ return checksums
505
+
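A hedged sketch of the checksum payload, assuming a table whose columns are named AMOUNT (numeric) and STATUS (string); my_table is a hypothetical DatabaseObject and the numbers are invented:

checksums = service.create_checksums(
    my_table,
    column_intersections=["AMOUNT", "STATUS"],
    numeric_scale=2,
)
# Roughly:
# {
#     "AMOUNT": ["SUM", 12345.67, 0],   # [aggregation type, aggregate value, null count]
#     "STATUS": ["COUNTDISTINCT", 4, 2],
#     "TESTATM_ERRORS": [],             # filled with [tag, query, message] entries on failure
# }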
506
+ def create_pandas_df_from_group_by(
507
+ self,
508
+ object: DatabaseObject,
509
+ column_intersections: list,
510
+ group_by_columns: list,
511
+ group_by_aggregation_columns: list,
512
+ group_by_aggregation_type: str,
513
+ only_numeric: bool,
514
+ where_clause: str,
515
+ exclude_columns: list,
516
+ numeric_scale: int = None,
517
+ ) -> List[Dict]:
518
+ """execution of multiple aggregations at once
519
+
520
+ Args:
521
+ object (DatabaseObject): table or view
522
+ column_intersections (list): columns existing in src and trgt
523
+ group_by_columns (list): columns for grouping the aggregations
524
+ group_by_aggregation_columns (list): list of columns that are supposed to be aggregated
525
+ group_by_aggregation_type (str): choice between: only_min_max, various, various_and_min_max
526
+ only_numeric (bool): whether to also include distinct counts or only do numeric aggregations
527
+ where_clause (str): optional filter for aggregations, given as sql compatible where-string
528
+ exclude_columns (list): columns to exclude from comparisons
529
+ numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.
530
+
531
+ Returns:
532
+ List[Dict]: list of pandas dataframes with results from aggregations, used sql queries
533
+ """
534
+
535
+ if self.databricks_connection is None:
536
+ self._connect_to_databricks()
537
+
538
+ if group_by_aggregation_columns == ["all"]:
539
+ aggregation_columns = [
540
+ f"{column.upper()}"
541
+ for column in column_intersections
542
+ if (column not in group_by_columns and column not in exclude_columns)
543
+ ]
544
+ else:
545
+ aggregation_columns = [
546
+ f"{column.upper()}"
547
+ for column in column_intersections
548
+ if (
549
+ column in group_by_aggregation_columns
550
+ and column not in exclude_columns
551
+ )
552
+ ]
553
+
554
+ group_by_query_columns_string = " "
555
+ grouping_columns_final = []
556
+ error_dict = {}
557
+
558
+ try:
559
+ for column in group_by_columns:
560
+ if column in column_intersections and column not in exclude_columns:
561
+ group_by_query_columns_string += f"{column} ,"
562
+ grouping_columns_final.append(column)
563
+
564
+ group_by_query_columns_string = group_by_query_columns_string[:-1]
565
+
566
+ dict_colummns_datatype = self.get_data_types_from_object(
567
+ object, aggregation_columns
568
+ )
569
+
570
+ aggregates = ""
571
+ aggregates_min = ""
572
+
573
+ for column in aggregation_columns:
574
+ column_datatype = next(
575
+ x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column
576
+ )["DATA_TYPE"]
577
+
578
+ if (
579
+ column_datatype.lower()
580
+ in self.databricks_datatype_mapping["numeric"]
581
+ ):
582
+ if numeric_scale:
583
+ aggregates_min += f", CAST(ROUND(MIN({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MIN_{column}, CAST(ROUND(max({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MAX_{column}"
584
+ aggregates += f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS SUM_{column}"
585
+
586
+ else:
587
+ aggregates_min += f", MIN({column}) AS MIN_{column}, MAX({column}) AS MAX_{column}"
588
+ aggregates += f", SUM({column}) AS SUM_{column}"
589
+
590
+ elif not only_numeric and (
591
+ column_datatype.lower()
592
+ in self.databricks_datatype_mapping["string"]
593
+ or column_datatype.lower()
594
+ in self.databricks_datatype_mapping["date_and_time"]
595
+ ):
596
+ aggregates += (
597
+ f", COUNT(DISTINCT LOWER({column})) AS COUNTDISTINCT_{column}"
598
+ )
599
+
600
+ elif (
601
+ not only_numeric
602
+ and column_datatype.lower()
603
+ in self.databricks_datatype_mapping["binary"]
604
+ ):
605
+ aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS COUNTDISTINCT_{column}"
606
+
607
+ elif (
608
+ not only_numeric
609
+ and column_datatype.lower()
610
+ in self.databricks_datatype_mapping["boolean"]
611
+ ):
612
+ aggregates += f", MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)) || '_' || MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false)) AS AGGREGATEBOOLEAN_{column}"
613
+
614
+ # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
615
+
616
+ # CASE 1: min_max
617
+ if group_by_aggregation_type == "only_min_max":
618
+ group_by_query_aggregation_string = aggregates_min[1:]
619
+
620
+ # CASE 2; sum, count_distinct, aggregate_boolean
621
+ elif group_by_aggregation_type == "various":
622
+ group_by_query_aggregation_string = aggregates[1:]
623
+
624
+ # CASE 3: sum, count_distinct, aggregate_boolean, min_max
625
+ elif group_by_aggregation_type == "various_and_min_max":
626
+ group_by_query_aggregation_string = f"{aggregates_min[1:]}{aggregates}"
627
+
628
+ query_group_by_aggregation = f"SELECT {group_by_query_columns_string}, COUNT(*) AS COUNT_OF_GROUP_BY_VALUE, {group_by_query_aggregation_string} FROM {object.database}.{object.schema}.{object.name} {where_clause} GROUP BY {group_by_query_columns_string} ORDER BY {group_by_query_columns_string};"
629
+
630
+ group_by_aggregation_pdf = self.execute_queries(
631
+ query_group_by_aggregation, True
632
+ )
633
+ except Exception as err:
634
+ group_by_aggregation_pdf = pd.DataFrame()
635
+ group_by_aggregation_pdf["TESTATM_ERROR"] = [1]
636
+ if not grouping_columns_final:
637
+ error_dict = {
638
+ "QUERY": "NO Group-BY Columns found in the Columns Intersection. Please check if the configurated Group-By Columns exist in the Table",
639
+ "ERROR": "NO Group-BY Columns found in the Columns Intersection. Please check if the configurated Group-By Columns exist in the Table"
640
+ }
641
+ group_by_query_aggregation_string = ""
642
+ elif "|||" in str(err):
643
+ error_dict = {
644
+ "QUERY": str(err).split("|||")[0],
645
+ "ERROR": str(err).split("|||")[1],
646
+ }
647
+ else:
648
+ error_dict = {
649
+ "QUERY": "NO Query generated. Please check if the configurated Grouping Columns exist in the Table",
650
+ "ERROR": str(err),
651
+ }
652
+ group_by_query_aggregation_string = ""
653
+
654
+ return (
655
+ group_by_aggregation_pdf,
656
+ group_by_query_aggregation_string,
657
+ group_by_query_columns_string,
658
+ grouping_columns_final,
659
+ error_dict
660
+ )
661
+
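A usage sketch for the grouped comparison, with invented column names; my_table is a hypothetical DatabaseObject:

pdf, agg_sql, group_cols_sql, group_cols, errors = service.create_pandas_df_from_group_by(
    my_table,
    column_intersections=["REGION", "AMOUNT", "STATUS"],
    group_by_columns=["REGION"],
    group_by_aggregation_columns=["all"],   # aggregate every non-grouping column
    group_by_aggregation_type="various",    # SUM / COUNT DISTINCT / boolean aggregates
    only_numeric=False,
    where_clause="",
    exclude_columns=[],
    numeric_scale=2,
)
# pdf holds one row per REGION with COUNT_OF_GROUP_BY_VALUE plus SUM_/COUNTDISTINCT_ columns;
# errors stays {} unless the generated query failed.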
662
+ def create_pandas_df(
663
+ self, object: DatabaseObject, intersection_columns_trgt_src: list, where_clause:str="", exclude_columns:list=[]
664
+ ) -> pd.DataFrame:
665
+ """creates pandas dataframes with all data from given object in given columns
666
+
667
+ Args:
668
+ object (DatabaseObject): table or view
669
+ intersection_columns_trgt_src (list): columns existing in source and target
670
+
671
+ Returns:
672
+ pd.DataFrame: direct result of sql query
673
+ """
674
+ if self.databricks_connection is None:
675
+ self._connect_to_databricks()
676
+
677
+ intersection_columns_trgt_src_ = ', '.join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
678
+
679
+ df_query = f"SELECT {intersection_columns_trgt_src_} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
680
+
681
+ src_pdf = self.execute_queries(df_query, True)
682
+
683
+ return src_pdf
684
+
685
+ def create_pandas_df_from_sample(
686
+ self,
687
+ object: DatabaseObject,
688
+ column_intersections: list,
689
+ key_columns: list,
690
+ where_clause: str = "",
691
+ exclude_columns: list = [],
692
+ key_filters: dict = {},
693
+ dedicated_columns: list = [],
694
+ sample_count: int = 10,
695
+ numeric_scale: int = None,
696
+ ) -> List[Dict]:
697
+ if self.databricks_connection is None:
698
+ self._connect_to_databricks()
699
+
700
+ where_exists = True
701
+ if not where_clause:
702
+ where_exists = False
703
+
704
+ sample_count = str(sample_count)
705
+ key_intersection = list(
706
+ (set(column_intersections) & set(key_columns)) - set(exclude_columns)
707
+ )
708
+ filter_intersection = list(
709
+ (set(column_intersections) & set(key_filters.keys())) - set(exclude_columns)
710
+ )
711
+ dedicated_intersection = list(
712
+ (set(column_intersections) & set(dedicated_columns)) - set(exclude_columns)
713
+ )
714
+
715
+ key_intersection.sort()
716
+ filter_intersection.sort()
717
+ dedicated_intersection.sort()
718
+
719
+ if dedicated_intersection != []:
720
+ is_dedicated = True
721
+
722
+ dict_colummns_datatype = self.get_data_types_from_object(
723
+ object, dedicated_intersection
724
+ )
725
+
726
+ else:
727
+ is_dedicated = False
728
+
729
+ dict_colummns_datatype = self.get_data_types_from_object(
730
+ object, column_intersections
731
+ )
732
+
733
+ if key_intersection != [] and is_dedicated:
734
+ keys = str(key_intersection)[1:-1].replace("'", "")
735
+ column_clause, numeric_columns, used_columns = self._get_column_clause(
736
+ dedicated_intersection,
737
+ dict_colummns_datatype,
738
+ numeric_scale,
739
+ key_columns,
740
+ )
741
+ if (key_filters != {}) & (filter_intersection != []):
742
+ values = list(key_filters.values())
743
+ if values[0] != []:
744
+ in_clause = self._get_in_clause(
745
+ key_filters, numeric_columns, numeric_scale, where_exists
746
+ )
747
+ else:
748
+ in_clause = ""
749
+ sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause}{in_clause} ORDER BY {keys};"
750
+ elif key_intersection != [] and not is_dedicated:
751
+ keys = str(key_intersection)[1:-1].replace("'", "")
752
+ column_clause, numeric_columns, used_columns = self._get_column_clause(
753
+ column_intersections, dict_colummns_datatype, numeric_scale, key_columns
754
+ )
755
+ if (key_filters != {}) & (filter_intersection != []):
756
+ values = list(key_filters.values())
757
+ if values[0] != []:
758
+ in_clause = self._get_in_clause(
759
+ key_filters, numeric_columns, numeric_scale, where_exists
760
+ )
761
+ else:
762
+ in_clause = ""
763
+ sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause}{in_clause} ORDER BY {keys};"
764
+ else:
765
+ column_intersections = list(
766
+ set(column_intersections) - set(exclude_columns)
767
+ )
768
+ column_intersections.sort()
769
+ column_clause, numeric_columns, used_columns = self._get_column_clause(
770
+ column_intersections, dict_colummns_datatype, numeric_scale, key_columns
771
+ )
772
+ sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause};"
773
+
774
+ error_dict = {}
775
+ key_dict = {}
776
+ try:
777
+ sample_pdf = self.execute_queries(sample_query, return_as_pdf=True)
778
+ for key in key_intersection:
779
+ if pd.api.types.is_datetime64_any_dtype(sample_pdf[key]):
780
+ key_dict[key] = list(sample_pdf[key].astype(str))
781
+ else:
782
+ key_dict[key] = list(sample_pdf[key])
783
+
784
+ except Exception as err:
785
+ sample_pdf = pd.DataFrame()
786
+ sample_pdf["TESTATM_ERROR"] = [1]
787
+ if "|||" in str(err):
788
+ error_dict = {
789
+ "QUERY": str(err).split("|||")[0],
790
+ "ERROR": str(err).split("|||")[1],
791
+ }
792
+ else:
793
+ error_dict = {"QUERY": "No SQL Error", "ERROR": str(err)}
794
+
795
+ return_list = []
796
+ return_list.append(sample_pdf)
797
+ return_list.append(error_dict)
798
+
799
+ return return_list, key_dict, used_columns, sample_query
800
+
801
+ def execute_queries(
802
+ self,
803
+ query: Union[str, List[str]],
804
+ return_as_pdf: bool = False,
805
+ return_query_ids: bool = False,
806
+ ) -> Union[List[Dict], List[List[Dict]]]:
807
+ """actual execution of defined queries
808
+
809
+ Args:
810
+ query (Union[str, List[str]]): queries to be executed
811
+ return_as_pdf (bool, optional): If true, results are returned as pandas DataFrames. Defaults to False.
812
+ return_query_ids (bool, optional): If true, results and query ids are returned, otherwise only results. Defaults to False.
813
+
814
+ Raises:
815
+ Exception: Raises exception if single query cannot be executed.
816
+
817
+ Returns:
818
+ Union[List[Dict], List[List[Dict]]]: returns results or results with query-ids
819
+ """
820
+ if self.databricks_connection is None:
821
+ self._connect_to_databricks()
822
+
823
+ if query:
824
+ query_list: List[str] = query if isinstance(query, list) else [query]
825
+ else:
826
+ logger.error(
827
+ "Query defined as null - please check input for execute_queries function."
828
+ )
+ exit()
829
+
830
+ cursor = self.databricks_connection.cursor()
831
+
832
+ results = []
833
+ query_ids = []
834
+
835
+ for single_query in query_list:
836
+ try:
837
+ query_result = cursor.execute(single_query).fetchall()
838
+ if return_as_pdf:
839
+ columns = [col[0] for col in cursor.description]
840
+ query_result = pd.DataFrame(query_result, columns=columns)
841
+
842
+ results.append(query_result)
843
+ query_ids.append(0) # there is no query id returned by databricks
844
+
845
+ except Exception as err:
846
+ raise Exception(single_query + "|||" + str(err))
847
+
848
+ if return_query_ids:
849
+ return (
850
+ (results[0], query_ids[0])
851
+ if not isinstance(query, list)
852
+ else (results, query_ids)
853
+ )
854
+
855
+ else:
856
+ return results[0] if not isinstance(query, list) else results
857
+
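A usage sketch for the query runner; the SQL text and object names are illustrative:

# A single query returns one result set (a list of rows, or a DataFrame with return_as_pdf=True).
tables = service.execute_queries("SHOW TABLES IN hive_metastore.sales")

# A list of queries returns the result sets in the same order.
counts, nulls = service.execute_queries([
    "SELECT COUNT(*) AS ROW_COUNT FROM hive_metastore.sales.orders",
    "SELECT SUM(CASE WHEN AMOUNT IS NULL THEN 1 ELSE 0 END) AS N FROM hive_metastore.sales.orders",
])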
858
+ def execute_statement(self, statement: Union[str, List[str]]) -> None:
859
+ """
860
+ Executes a simple statement against Databricks.
861
+ Schema and Database settings must be set beforehand
862
+ Args:
863
+ statement Union[str, List[str]] - a sql statement or a list of sql statements to execute
864
+ """
865
+ if self.databricks_connection is None:
866
+ self._connect_to_databricks()
867
+
868
+ statement_list: List[str] = (
869
+ statement if isinstance(statement, list) else [statement]
870
+ )
871
+
872
+ cursor = self.databricks_connection.cursor()
873
+
874
+ for single_statement in statement_list:
875
+ try:
876
+ stripped_statement = single_statement.strip()
877
+ _ = cursor.execute(stripped_statement)
878
+
879
+ except Exception as err:
880
+ raise Exception(self._get_error_message(err, single_statement)) from err
881
+
882
+ def create_schemas(self, database_name: str, schemas: List):
883
+ statement_list = []
884
+
885
+ for schema in schemas:
886
+ statement_list.append(f"CREATE SCHEMA IF NOT EXISTS {schema}")
887
+
888
+ self.execute_statement(statement_list)
889
+
890
+ def insert_json_results(
891
+ self,
892
+ run_guid: str,
893
+ pipeline_name: str,
894
+ pipeline_id: str,
895
+ start_time_utc: str,
896
+ result_table: str,
897
+ results: dict,
898
+ ) -> None:
899
+ """
900
+ insert into - result table for json results
901
+ """
902
+
903
+ statement = f"CREATE TABLE IF NOT EXISTS {result_table} (RUN_GUID STRING, PIPELINE_NAME STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, RESULT STRING, CREATION_TIME_UTC STRING)"
904
+
905
+ self.execute_statement(statement)
906
+
907
+ statement = (
908
+ "INSERT INTO {} VALUES ('{}', '{}', '{}', '{}', '{}', '{}');".format(
909
+ result_table,
910
+ run_guid,
911
+ pipeline_name,
912
+ pipeline_id,
913
+ start_time_utc,
914
+ str(results).replace("'", '"'),
915
+ datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S"),
916
+ )
917
+ )
918
+
919
+ self.execute_statement(statement)
920
+
921
+ def insert_json_results_live(
922
+ self,
923
+ run_guid: str,
924
+ pipeline_name: str,
925
+ pipeline_id: str,
926
+ result_table: str,
927
+ stage_name: str,
928
+ source_system: str,
929
+ target_system: str,
930
+ database: str,
931
+ schema: str,
932
+ object: str,
933
+ ) -> None:
934
+ """
935
+ copy into - result table for json results live
936
+ """
937
+ result_database = result_table.split(".", 1)[0]
938
+
939
+ statement = f"COPY INTO {result_table} (RUN_GUID, PIPELINE_NAME, PIPELINE_ID, SOURCE_SYSTEM, TARGET_SYSTEM, DATABASE_NAME, SCHEMA_NAME, OBJECT_NAME ,RESULT, CREATION_TS) FROM (SELECT '{run_guid}', '{pipeline_name}', '{pipeline_id}', '{source_system}', '{target_system}', '{database}', '{schema}', '{object}', $1, SYSDATE() from @{stage_name} (file_format => {result_database}.meta_data.ff_json ));"
940
+
941
+ self.execute_statement(statement)
942
+
943
+ def insert_highlevel_results(
944
+ self,
945
+ results: dict,
946
+ run_guid: str,
947
+ pipeline_name: str,
948
+ pipeline_id: str,
949
+ result_table_highlevel: str,
950
+ ) -> None:
951
+ """
952
+ insert into - highlevel results per "pipeline run" / "generic testing tool execution"
953
+ """
954
+
955
+ statement = f"CREATE TABLE IF NOT EXISTS {result_table_highlevel} (RUN_GUID STRING, PIPELINE_NAME STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, SOURCE_SYSTEM STRING, TARGET_SYSTEM STRING, DATABASE_NAME STRING, TESTSET STRING, ALL_OBJECTS_MATCHING BOOLEAN, ALL_COLUMNS_EQUAL BOOLEAN, ALL_ROWCOUNTS_EQUAL BOOLEAN, ALL_CHECKSUMS_EQUAL BOOLEAN, ALL_SAMPLES_EQUAL BOOLEAN, ALL_OBJECTS_EQUAL BOOLEAN, OBJECTS_TO_COMPARE_SRC STRING, OBJECTS_TO_COMPARE_TRGT STRING, NUMBER_OF_OBJECTS_TO_COMPARE INT, SRC_MINUS_TRGT STRING, TRGT_MINUS_SRC STRING, CREATION_TS_UTC STRING)"
956
+
957
+ self.execute_statement(statement)
958
+
959
+ TESTSET_ = ", ".join(results["TESTSET"])
960
+
961
+ OBJECTS_TO_COMPARE_SRC_ = ", ".join(results["OBJECTS_TO_COMPARE_SRC"])
962
+
963
+ OBJECTS_TO_COMPARE_TRGT_ = ", ".join(results["OBJECTS_TO_COMPARE_TRGT"])
964
+
965
+ SRC_MINUS_TRGT_ = ", ".join(results["SRC_MINUS_TRGT"])
966
+
967
+ TRGT_MINUS_SRC_ = ", ".join(results["TRGT_MINUS_SRC"])
968
+
969
+ date_utc = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")
970
+
971
+ insert_statement = f"INSERT INTO {result_table_highlevel} ( \
972
+ RUN_GUID, \
973
+ PIPELINE_NAME, \
974
+ PIPELINE_ID, \
975
+ START_TIME_UTC, \
976
+ SOURCE_SYSTEM, \
977
+ TARGET_SYSTEM, \
978
+ DATABASE_NAME, \
979
+ TESTSET, \
980
+ ALL_OBJECTS_MATCHING, \
981
+ ALL_COLUMNS_EQUAL, \
982
+ ALL_DATATYPES_EQUAL, \
983
+ ALL_ROWCOUNTS_EQUAL, \
984
+ ALL_CHECKSUMS_EQUAL, \
985
+ ALL_SAMPLES_EQUAL, \
986
+ ALL_OBJECTS_EQUAL, \
987
+ OBJECTS_TO_COMPARE_SRC, \
988
+ OBJECTS_TO_COMPARE_TRGT, \
989
+ NUMBER_OF_OBJECTS_TO_COMPARE, \
990
+ SRC_MINUS_TRGT, \
991
+ TRGT_MINUS_SRC, \
992
+ CREATION_TS_UTC) \
993
+ VALUES \
994
+ ('{run_guid}', \
995
+ '{pipeline_name}', \
996
+ '{pipeline_id}', \
997
+ '{results['START_TIME_UTC']}', \
998
+ '{results['SOURCE_SYSTEM']}', \
999
+ '{results['TARGET_SYSTEM']}', \
1000
+ '{results['DATABASE_NAME']}', \
1001
+ '{TESTSET_}', \
1002
+ '{results['ALL_OBJECTS_MATCHING']}', \
1003
+ '{results['ALL_COLUMNS_EQUAL']}', \
1004
+ '{results['ALL_DATATYPES_EQUAL']}', \
1005
+ '{results['ALL_ROWCOUNTS_EQUAL']}', \
1006
+ '{results['ALL_CHECKSUMS_EQUAL']}', \
1007
+ NULLIF('{results['ALL_SAMPLES_EQUAL']}', 'None'), \
1008
+ NULLIF('{results['ALL_OBJECTS_EQUAL']}', 'None'), \
1009
+ '{OBJECTS_TO_COMPARE_SRC_}', \
1010
+ '{OBJECTS_TO_COMPARE_TRGT_}', \
1011
+ '{results['NUMBER_OF_OBJECTS_TO_COMPARE']}', \
1012
+ '{SRC_MINUS_TRGT_}', \
1013
+ '{TRGT_MINUS_SRC_}', \
1014
+ '{date_utc}')"
1015
+
1016
+ self.execute_statement(insert_statement)
1017
+
1018
+ def insert_objectlevel_results(
1019
+ self,
1020
+ result_table: str,
1021
+ result_table_objectlevel: str,
1022
+ run_guid: str,
1023
+ results: dict,
1024
+ ) -> None:
1025
+ """
1026
+ insert into - detailed results per object
1027
+ """
1028
+ date_utc = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")
1029
+
1030
+ statement = f"CREATE TABLE IF NOT EXISTS {result_table_objectlevel} (RUN_GUID STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, SRC_DATABASE_NAME STRING, SRC_SCHEMA_NAME STRING, SRC_OBJECT_NAME STRING, SRC_OBJECT_TYPE STRING, TRGT_DATABASE_NAME STRING, TRGT_SCHEMA_NAME STRING, TRGT_OBJECT_NAME STRING, TRGT_OBJECT_TYPE STRING, SRC_FILTER STRING, TRGT_FILTER STRING, EXCLUDED_COLUMNS STRING, COLUMNS_EQUAL BOOLEAN, COLUMN_INTERSECTION STRING,SRC_COLUMNS_MINUS_TRGT_COLUMNS STRING, TRGT_COLUMNS_MINUS_SRC_COLUMNS STRING, ROW_COUNTS_EQUAL BOOLEAN, SRC_ROW_COUNT INT, TRGT_ROW_COUNT INT, ALL_COUNT_NULLS_EQUAL BOOLEAN, AGGREGATIONS_EQUAL BOOLEAN, SRC_ERROR_QUERY STRING, TRGT_ERROR_QUERY STRING, SRC_ERROR_MSG STRING, TRGT_ERROR_MSG STRING, GROUP_BY_COLUMNS STRING,GROUP_BY_EQUAL BOOLEAN, GROUP_BY_VALUES_WITH_MISMATCHES STRING, COLUMNS_WITH_MISMATCH STRING, SRC_GROUP_BY_QUERY STRING, TRGT_GROUP_BY_QUERY STRING, SRC_GROUP_BY_ERROR STRING, TRGT_GROUP_BY_ERROR STRING, SAMPLES_COMPARED BOOLEAN,SAMPLES_EQUAL BOOLEAN, SAMPLE_KEYS STRING, SRC_SAMPLE STRING, TRGT_SAMPLE STRING, SRC_SAMPLE_QUERY STRING, TRGT_SAMPLE_QUERY STRING, SRC_SAMPLE_ERROR_MSG STRING, TRGT_SAMPLE_ERROR_MSG STRING, PANDAS_DATAFRAME_COMPARED BOOLEAN, PANDAS_DATAFRAME_EQUAL BOOLEAN, SRC_NOT_ALTERED_DURING_COMPARISON BOOLEAN, TRGT_NOT_ALTERED_DURING_COMPARISON BOOLEAN, SRC_LAST_ALTERED STRING, TRGT_LAST_ALTERED STRING, CREATION_TS_UTC STRING)"
1031
+
1032
+ self.execute_statement(statement)
1033
+
1034
+ dict_list = self.get_objects_in_result_column(result_table, run_guid)
1035
+
1036
+ """
1037
+ Now, we have to extract all the information in the dicts manually to
1038
+ insert them in the query. We write one line for each object one by one.
1039
+ """
1040
+
1041
+ for element in dict_list:
1042
+ elem = element
1043
+ help_str, elem = elem.split(",", 1)
1044
+ src_database_name = re.sub(
1045
+ r"(.*)\"SRC_DATABASE_NAME\":\s\"(.*)\"", r"\2", help_str
1046
+ )
1047
+ help_str, elem = elem.split(",", 1)
1048
+ src_schema_name = re.sub(
1049
+ r"(.*)\"SRC_SCHEMA_NAME\":\s\"(.*)\"", r"\2", help_str
1050
+ )
1051
+ help_str, elem = elem.split(",", 1)
1052
+ src_object_name = re.sub(
1053
+ r"(.*)\"SRC_OBJECT_NAME\":\s\"(.*)\"", r"\2", help_str
1054
+ )
1055
+ help_str, elem = elem.split(",", 1)
1056
+ src_object_type = re.sub(
1057
+ r"(.*)\"SRC_OBJECT_TYPE\":\s\"(.*)\"", r"\2", help_str
1058
+ )
1059
+ help_str, elem = elem.split(",", 1)
1060
+ trgt_database_name = re.sub(
1061
+ r"(.*)\"TRGT_DATABASE_NAME\":\s\"(.*)\"", r"\2", help_str
1062
+ )
1063
+ help_str, elem = elem.split(",", 1)
1064
+ trgt_schema_name = re.sub(
1065
+ r"(.*)\"TRGT_SCHEMA_NAME\":\s\"(.*)\"", r"\2", help_str
1066
+ )
1067
+ help_str, elem = elem.split(",", 1)
1068
+ trgt_object_name = re.sub(
1069
+ r"(.*)\"TRGT_OBJECT_NAME\":\s\"(.*)\"", r"\2", help_str
1070
+ )
1071
+ help_str, elem = elem.split(",", 1)
1072
+ trgt_object_type = re.sub(
1073
+ r"(.*)\"TRGT_OBJECT_TYPE\":\s\"(.*)\"", r"\2", help_str
1074
+ )
1075
+ help_str, elem = elem.split(",", 1)
1076
+ src_filter = re.sub(
1077
+ r"(.*)\"SRC_FILTER\":\s(.*)", r"\2", help_str
1078
+ )
1079
+ help_str, elem = elem.split(",", 1)
1080
+ trgt_filter = re.sub(
1081
+ r"(.*)\"TRGT_FILTER\":\s(.*)", r"\2", help_str
1082
+ )
1083
+ help_str, elem = elem.split("],", 1)
1084
+ help_str = help_str + "]"
1085
+ excluded_columns = re.sub(
1086
+ r"(.*)\"EXCLUDED_COLUMNS\":\s(.*)", r"\2", help_str
1087
+ )
1088
+ help_str, elem = elem.split(",", 1)
1089
+ columns_equal = re.sub(r"(.*)\"COLUMNS_EQUAL\":\s(.*)", r"\2", help_str)
1090
+ help_str, elem = elem.split("],", 1)
1091
+ help_str = help_str + "]"
1092
+ column_intersection = re.sub(
1093
+ r"(.*)\"COLUMN_INTERSECTION\":\s(.*)", r"\2", help_str
1094
+ )
1095
+ help_str, elem = elem.split("],", 1)
1096
+ help_str = help_str + "]"
1097
+ src_columns_minus_trgt_columns = re.sub(
1098
+ r"(.*)\"SRC_COLUMNS_MINUS_TRGT_COLUMNS\":\s(.*)", r"\2", help_str
1099
+ )
1100
+ help_str, elem = elem.split("],", 1)
1101
+ help_str = help_str + "]"
1102
+ trgt_columns_minus_src_columns = re.sub(
1103
+ r"(.*)\"TRGT_COLUMNS_MINUS_SRC_COLUMNS\":\s(.*)", r"\2", help_str
1104
+ )
1105
+ help_str, elem = elem.split(",", 1)
1106
+ row_counts_equal = re.sub(
1107
+ r"(.*)\"ROW_COUNTS_EQUAL\":\s(.*)", r"\2", help_str
1108
+ )
1109
+ help_str, elem = elem.split(",", 1)
1110
+ src_row_count = re.sub(
1111
+ r"(.*)\"SRC_ROW_COUNT\":\s(.*)", r"\2", help_str
1112
+ )
1113
+ help_str, elem = elem.split(",", 1)
1114
+ trgt_row_count = re.sub(
1115
+ r"(.*)\"TRGT_ROW_COUNT\":\s(.*)", r"\2", help_str
1116
+ )
1117
+ help_str, elem = elem.split(",", 1)
1118
+ all_count_nulls_equal = re.sub(
1119
+ r"(.*)\"ALL_COUNT_NULLS_EQUAL\":\s(.*)", r"\2", help_str
1120
+ )
1121
+ help_str, elem = elem.split(",", 1)
1122
+ aggregations_equal = re.sub(
1123
+ r"(.*)\"AGGREGATIONS_EQUAL\":\s(.*)", r"\2", help_str
1124
+ )
1125
+ help_str, elem = elem.split("},", 1)
1126
+ help_str = help_str + "}"
1127
+ src_error = re.sub(
1128
+ r"(.*)\"SRC_ERROR\":\s(.*)", r"\2", help_str
1129
+ )
1130
+ help_str, elem = elem.split("},", 1)
1131
+ help_str = help_str + "}"
1132
+ trgt_error = re.sub(
1133
+ r"(.*)\"TRGT_ERROR\":\s(.*)", r"\2", help_str
1134
+ )
1135
+ help_str, elem = elem.split(', "SRC_GROUP_BY_QUERY', 1)
1136
+ elem = '"SRC_GROUP_BY_QUERY' + elem
1137
+ group_by_columns = re.sub(
1138
+ r"(.*)\"GROUP_BY_COLUMNS\":\s(.*)", r"\2", help_str
1139
+ )
1140
+ help_str, elem = elem.split(",", 1)
1141
+ src_group_by_query = re.sub(
1142
+ r"(.*)\"SRC_GROUP_BY_QUERY\":\s(.*)", r"\2", help_str
1143
+ )
1144
+ help_str, elem = elem.split(",", 1)
1145
+ trgt_group_by_query = re.sub(
1146
+ r"(.*)\"TRGT_GROUP_BY_QUERY\":\s(.*)", r"\2", help_str
1147
+ )
1148
+ help_str, elem = elem.split(",", 1)
1149
+ group_by_equal = re.sub(r"(.*)\"GROUP_BY_EQUAL\":\s(.*)", r"\2", help_str)
1150
+ help_str, elem = elem.split(', "COLUMNS_WITH_MISMATCH', 1)
1151
+ elem = '"COLUMNS_WITH_MISMATCH' + elem
1152
+ group_by_values_with_mismatches = re.sub(
1153
+ r"(.*)\"GROUP_BY_VALUES_WITH_MISMATCHES\":\s(.*)", r"\2", help_str
1154
+ )
1155
+ help_str, elem = elem.split(', "SRC_GROUP_BY_ERROR', 1)
1156
+ elem = '"SRC_GROUP_BY_ERROR' + elem
1157
+ columns_with_mismatch = re.sub(
1158
+ r"(.*)\"COLUMNS_WITH_MISMATCH\":\s(.*)", r"\2", help_str
1159
+ )
1160
+ help_str, elem = elem.split(', "TRGT_GROUP_BY_ERROR', 1)
1161
+ elem = '"TRGT_GROUP_BY_ERROR' + elem
1162
+ src_group_by_error = re.sub(
1163
+ r"(.*)\"SRC_GROUP_BY_ERROR\":\s(.*)", r"\2", help_str
1164
+ )
1165
+ help_str, elem = elem.split(', "SAMPLES_COMPARED', 1)
1166
+ elem = '"SAMPLES_COMPARED' + elem
1167
+ trgt_group_by_error = re.sub(
1168
+ r"(.*)\"TRGT_GROUP_BY_ERROR\":\s(.*)", r"\2", help_str
1169
+ )
1170
+ help_str, elem = elem.split(",", 1)
1171
+ samples_compared = re.sub(
1172
+ r"(.*)\"SAMPLES_COMPARED\":\s(.*)", r"\2", help_str
1173
+ )
1174
+ help_str, elem = elem.split(",", 1)
1175
+ samples_equal = re.sub(
1176
+ r"(.*)\"SAMPLES_EQUAL\":\s(.*)", r"\2", help_str
1177
+ )
1178
+ help_str, elem = elem.split("},", 1)
1179
+ help_str = help_str + "}"
1180
+ sample_keys = re.sub(
1181
+ r"(.*)\"SAMPLE_KEYS\":\s(.*)", r"\2", help_str
1182
+ )
1183
+ help_str, elem = elem.split("}},", 1)
1184
+ help_str = help_str + "}}"
1185
+ src_sample = re.sub(
1186
+ r"(.*)\"SRC_SAMPLE\":\s(.*)", r"\2", help_str
1187
+ )
1188
+ help_str, elem = elem.split("}},", 1)
1189
+ help_str = help_str + "}}"
1190
+ trgt_sample = re.sub(
1191
+ r"(.*)\"TRGT_SAMPLE\":\s(.*)", r"\2", help_str
1192
+ )
1193
+ help_str, elem = elem.split(';",', 1)
1194
+ help_str = help_str + ';"'
1195
+ src_sample_query = re.sub(
1196
+ r"(.*)\"SRC_SAMPLE_QUERY\":\s(.*)", r"\2", help_str
1197
+ )
1198
+ help_str, elem = elem.split(';",', 1)
1199
+ help_str = help_str + ';"'
1200
+ trgt_sample_query = re.sub(
1201
+ r"(.*)\"TRGT_SAMPLE_QUERY\":\s(.*)", r"\2", help_str
1202
+ )
1203
+ help_str, elem = elem.split("},", 1)
1204
+ help_str = help_str + "}"
1205
+ src_sample_error_dict = re.sub(
1206
+ r"(.*)\"SRC_SAMPLE_ERROR_DICT\":\s(.*)", r"\2", help_str
1207
+ )
1208
+ help_str, elem = elem.split("},", 1)
1209
+ help_str = help_str + "}"
1210
+ trgt_sample_error_dict = re.sub(
1211
+ r"(.*)\"TRGT_SAMPLE_ERROR_DICT\":\s(.*)", r"\2", help_str
1212
+ )
1213
+ help_str, elem = elem.split(",", 1)
1214
+ pandas_dataframe_compared = re.sub(
1215
+ r"(.*)\"PANDAS_DATAFRAME_COMPARED\":\s(.*)", r"\2", help_str
1216
+ )
1217
+ help_str, elem = elem.split(",", 1)
1218
+ pandas_dataframe_equal = re.sub(
1219
+ r"(.*)\"PANDAS_DATAFRAME_EQUAL\":\s(.*)", r"\2", help_str
1220
+ )
1221
+ help_str, elem = elem.split(",", 1)
1222
+ src_not_altered_during_comparison = re.sub(
1223
+ r"(.*)\"SRC_NOT_ALTERED_DURING_COMPARISON\":\s(.*)", r"\2", help_str
1224
+ )
1225
+ help_str, elem = elem.split(",", 1)
1226
+ trgt_not_altered_during_comparison = re.sub(
1227
+ r"(.*)\"TRGT_NOT_ALTERED_DURING_COMPARISON\":\s(.*)", r"\2", help_str
1228
+ )
1229
+ help_str, elem = elem.split(",", 1)
1230
+ src_last_altered = re.sub(
1231
+ r"(.*)\"SRC_LAST_ALTERED\":\s(.*)", r"\2", help_str
1232
+ )
1233
+ help_str, elem = elem.split(",", 1)
1234
+ trgt_last_altered = re.sub(r"(.*)\"TRGT_LAST_ALTERED\":\s(.*)", r"\2", help_str)
1235
+
1236
+ # the rest in elem is not used for this table
1237
+
1238
+ insert_statement = f"INSERT INTO {result_table_objectlevel} ( \
1239
+ RUN_GUID, \
1240
+ PIPELINE_ID, \
1241
+ START_TIME_UTC, \
1242
+ SRC_DATABASE_NAME, \
1243
+ SRC_SCHEMA_NAME, \
1244
+ SRC_OBJECT_NAME, \
1245
+ SRC_OBJECT_TYPE, \
1246
+ TRGT_DATABASE_NAME, \
1247
+ TRGT_SCHEMA_NAME, \
1248
+ TRGT_OBJECT_NAME, \
1249
+ TRGT_OBJECT_TYPE, \
1250
+ SRC_FILTER, \
1251
+ TRGT_FILTER, \
1252
+ EXCLUDED_COLUMNS, \
1253
+ COLUMNS_EQUAL, \
1254
+ COLUMN_INTERSECTION, \
1255
+ SRC_COLUMNS_MINUS_TRGT_COLUMNS, \
1256
+ TRGT_COLUMNS_MINUS_SRC_COLUMNS, \
1257
+ ROW_COUNTS_EQUAL, \
1258
+ SRC_ROW_COUNT, \
1259
+ TRGT_ROW_COUNT, \
1260
+ ALL_COUNT_NULLS_EQUAL, \
1261
+ AGGREGATIONS_EQUAL, \
1262
+ SRC_ERROR_QUERY , \
1263
+ TRGT_ERROR_QUERY, \
1264
+ SRC_ERROR_MSG, \
1265
+ TRGT_ERROR_MSG, \
1266
+ GROUP_BY_COLUMNS, \
1267
+ GROUP_BY_EQUAL, \
1268
+ GROUP_BY_VALUES_WITH_MISMATCHES, \
1269
+ COLUMNS_WITH_MISMATCH, \
1270
+ SRC_GROUP_BY_QUERY, \
1271
+ TRGT_GROUP_BY_QUERY, \
1272
+ SRC_GROUP_BY_ERROR, \
1273
+ TRGT_GROUP_BY_ERROR, \
1274
+ SAMPLES_COMPARED, \
1275
+ SAMPLES_EQUAL, \
1276
+ SAMPLE_KEYS, \
1277
+ SRC_SAMPLE, \
1278
+ TRGT_SAMPLE, \
1279
+ SRC_SAMPLE_QUERY, \
1280
+ TRGT_SAMPLE_QUERY, \
1281
+ SRC_SAMPLE_ERROR_MSG, \
1282
+ TRGT_SAMPLE_ERROR_MSG, \
1283
+ PANDAS_DATAFRAME_COMPARED, \
1284
+ PANDAS_DATAFRAME_EQUAL, \
1285
+ SRC_NOT_ALTERED_DURING_COMPARISON, \
1286
+ TRGT_NOT_ALTERED_DURING_COMPARISON, \
1287
+ SRC_LAST_ALTERED, \
1288
+ TRGT_LAST_ALTERED, \
1289
+ CREATION_TS_UTC) \
1290
+ WITH group_error_src AS (SELECT\
1291
+ json_tuple('{src_group_by_error}', 'QUERY', 'ERROR') AS (grouping_errors_src_query, grouping_errors_src_error)\
1292
+ ),\
1293
+ group_error_trgt AS (SELECT\
1294
+ json_tuple('{trgt_group_by_error}', 'QUERY', 'ERROR') AS (grouping_errors_trgt_query, grouping_errors_trgt_error)\
1295
+ ),\
1296
+ src_error AS (SELECT\
1297
+ json_tuple('{src_error}', 'QUERY', 'ERROR') AS (src_error_query, src_error_error)\
1298
+ ),\
1299
+ trgt_error AS (SELECT\
1300
+ json_tuple('{trgt_error}', 'QUERY', 'ERROR') AS (trgt_error_query, trgt_error_error)\
1301
+ ),\
1302
+ src_sample_error AS (SELECT\
1303
+ json_tuple('{src_sample_error_dict}', 'QUERY', 'ERROR') AS (src_sample_error_dict_query, src_sample_error_dict_error)\
1304
+ ),\
1305
+ trgt_sample_error AS (SELECT\
1306
+ json_tuple('{trgt_sample_error_dict}', 'QUERY', 'ERROR') AS (trgt_sample_error_dict_query, trgt_sample_error_dict_error)\
1307
+ )\
1308
+ SELECT\
1309
+ RESULTS.RUN_GUID AS RUN_GUID, \
1310
+ RESULTS.PIPELINE_ID AS PIPELINE_ID, \
1311
+ RESULTS.START_TIME_UTC::STRING AS START_TIME_UTC, \
1312
+ '{src_database_name}' AS SRC_DATABASE_NAME, \
1313
+ '{src_schema_name}' AS SRC_SCHEMA_NAME, \
1314
+ '{src_object_name}' AS SRC_OBJECT_NAME, \
1315
+ '{src_object_type}' AS SRC_OBJECT_TYPE, \
1316
+ '{trgt_database_name}' AS TRGT_DATABASE_NAME, \
1317
+ '{trgt_schema_name}' AS TRGT_SCHEMA_NAME, \
1318
+ '{trgt_object_name}' AS TRGT_OBJECT_NAME, \
1319
+ '{trgt_object_type}' AS TRGT_OBJECT_TYPE, \
1320
+ '{src_filter}' AS SRC_FILTER, \
1321
+ '{trgt_filter}' AS TRGT_FILTER, \
1322
+ '{excluded_columns}' AS EXCLUDED_COLUMNS, \
1323
+ '{columns_equal}'::BOOLEAN AS COLUMNS_EQUAL, \
1324
+ '{column_intersection}'::BOOLEAN AS COLUMN_INTERSECTION, \
1325
+ '{src_columns_minus_trgt_columns}' AS SRC_COLUMNS_MINUS_TRGT_COLUMNS, \
1326
+ '{trgt_columns_minus_src_columns}' AS TRGT_COLUMNS_MINUS_SRC_COLUMNS, \
1327
+ '{row_counts_equal}'::BOOLEAN AS ROW_COUNTS_EQUAL, \
1328
+ '{src_row_count}'::INT AS SRC_ROW_COUNT, \
1329
+ '{trgt_row_count}'::INT AS TRGT_ROW_COUNT, \
1330
+ '{all_count_nulls_equal}'::BOOLEAN AS ALL_COUNT_NULLS_EQUAL, \
1331
+ '{aggregations_equal}'::BOOLEAN AS AGGREGATIONS_EQUAL, \
1332
+ src_error_query::STRING AS SRC_ERROR_QUERY, \
1333
+ trgt_error_query::STRING AS TRGT_ERROR_QUERY, \
1334
+ src_error_error::STRING AS SRC_ERROR_MSG, \
1335
+ trgt_error_error::STRING AS TRGT_ERROR_MSG, \
1336
+ '{group_by_columns}' AS GROUP_BY_COLUMNS, \
1337
+ '{group_by_equal}'::BOOLEAN AS GROUP_BY_EQUAL, \
1338
+ '{group_by_values_with_mismatches}' AS GROUP_BY_VALUES_WITH_MISMATCHES, \
1339
+ '{columns_with_mismatch}' AS COLUMNS_WITH_MISMATCH, \
1340
+ CASE WHEN '{src_group_by_error}'::STRING = '{{}}' \
1341
+ THEN NULLIF('{src_group_by_query}'::STRING, '') \
1342
+ WHEN'{src_group_by_error}'::STRING != '{{}}' \
1343
+ THEN NULLIF(grouping_errors_src_query::STRING, '') \
1344
+ END AS SRC_GROUP_BY_QUERY, \
1345
+ CASE WHEN '{trgt_group_by_error}'::STRING = '{{}}' \
1346
+ THEN NULLIF('{trgt_group_by_query}'::STRING, '') \
1347
+ WHEN '{trgt_group_by_error}'::STRING != '{{}}' \
1348
+ THEN NULLIF(grouping_errors_trgt_query::STRING, '') \
1349
+ END AS TRGT_GROUP_BY_QUERY, \
1350
+ CASE WHEN '{src_group_by_error}'::STRING = '{{}}' \
1351
+ THEN NULL \
1352
+ ELSE '{src_group_by_error}'::STRING \
1353
+ END AS SRC_GROUP_BY_ERROR, \
1354
+ CASE WHEN '{trgt_group_by_error}'::STRING = '{{}}' \
1355
+ THEN NULL \
1356
+ ELSE '{trgt_group_by_error}'::STRING \
1357
+ END AS TRGT_GROUP_BY_ERROR, \
1358
+ '{samples_compared}'::BOOLEAN AS SAMPLES_COMPARED, \
1359
+ '{samples_equal}'::BOOLEAN AS SAMPLES_EQUAL, \
1360
+ '{sample_keys}' AS SAMPLE_KEYS, \
1361
+ '{src_sample}' AS SRC_SAMPLE, \
1362
+ '{trgt_sample}' AS TRGT_SAMPLE, \
1363
+ '{src_sample_query}' AS SRC_SAMPLE_QUERY, \
1364
+ '{trgt_sample_query}' AS TRGT_SAMPLE_QUERY, \
1365
+ src_sample_error_dict_error::STRING AS SRC_SAMPLE_ERROR_MSG, \
1366
+ trgt_sample_error_dict_error::STRING AS TRGT_SAMPLE_ERROR_MSG, \
1367
+ '{pandas_dataframe_compared}'::BOOLEAN AS PANDAS_DATAFRAME_COMPARED, \
1368
+ '{pandas_dataframe_equal}'::BOOLEAN AS PANDAS_DATAFRAME_EQUAL, \
1369
+ '{src_not_altered_during_comparison}'::BOOLEAN AS SRC_NOT_ALTERED_DURING_COMPARISON, \
1370
+ '{trgt_not_altered_during_comparison}'::BOOLEAN AS TRGT_NOT_ALTERED_DURING_COMPARISON, \
1371
+ '{src_last_altered}'::STRING AS SRC_LAST_ALTERED, \
1372
+ '{trgt_last_altered}'::STRING AS TRGT_LAST_ALTERED, \
1373
+ '{date_utc}' \
1374
+ FROM {result_table} RESULTS, group_error_src, group_error_trgt, src_error, trgt_error, src_sample_error, trgt_sample_error \
1375
+ WHERE RUN_GUID = '{run_guid}'\
1376
+ ;"
1377
+
1378
+ self.execute_statement(insert_statement)
1379
+
1380
+ def insert_columnlevel_results(
1381
+ self,
1382
+ result_table: str,
1383
+ result_table_columnlevel: str,
1384
+ run_guid: str,
1385
+ ) -> None:
1386
+ """
1387
+ insert into - detailed results per column
1388
+ """
1389
+
1390
+ date_utc = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")
1391
+
1392
+ statement = f"CREATE TABLE IF NOT EXISTS {result_table_columnlevel} (RUN_GUID STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, SRC_DATABASE_NAME STRING, SRC_SCHEMA_NAME STRING, SRC_OBJECT_NAME STRING, SRC_OBJECT_TYPE STRING, TRGT_DATABASE_NAME STRING, TRGT_SCHEMA_NAME STRING, TRGT_OBJECT_NAME STRING, TRGT_OBJECT_TYPE STRING, COLUMN_NAME STRING, IN_SRC BOOLEAN, IN_TRGT BOOLEAN, IN_SYNC BOOLEAN, IN_EXCLUDED BOOLEAN, SRC_DATATYPE STRING, TRGT_DATATYPE STRING, AGGREGATION_TYPE STRING, AGGREGATION_EQUAL BOOLEAN, AGGREGATION_RESULT_SRC STRING, AGGREGATION_RESULT_TRGT STRING, AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC STRING, COUNT_NULLS_EQUAL BOOLEAN, COUNT_NULLS_SRC STRING, COUNT_NULLS_TRGT STRING, COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC STRING, ERROR_QUERY_SRC STRING, ERROR_MSG_SRC STRING, ERROR_QUERY_TRGT STRING, ERROR_MSG_TRGT STRING, ERROR_FLAG BOOLEAN, CREATION_TS_UTC STRING);"
1393
+
1394
+ self.execute_statement(statement)
1395
+
1396
+ dict_list = self.get_objects_in_result_column(result_table, run_guid)
1397
+
1398
+ # extract the information needed for the table on object level
1399
+ for element in dict_list:
1400
+ elem = element
1401
+ help_str, elem = elem.split(",", 1)
1402
+ src_database_name = re.sub(
1403
+ r"(.*)\"SRC_DATABASE_NAME\":\s\"(.*)\"", r"\2", help_str
1404
+ )
1405
+ help_str, elem = elem.split(",", 1)
1406
+ src_schema_name = re.sub(
1407
+ r"(.*)\"SRC_SCHEMA_NAME\":\s\"(.*)\"", r"\2", help_str
1408
+ )
1409
+ help_str, elem = elem.split(",", 1)
1410
+ src_object_name = re.sub(
1411
+ r"(.*)\"SRC_OBJECT_NAME\":\s\"(.*)\"", r"\2", help_str
1412
+ )
1413
+ help_str, elem = elem.split(",", 1)
1414
+ src_object_type = re.sub(
1415
+ r"(.*)\"SRC_OBJECT_TYPE\":\s\"(.*)\"", r"\2", help_str
1416
+ )
1417
+ help_str, elem = elem.split(",", 1)
1418
+ trgt_database_name = re.sub(
1419
+ r"(.*)\"TRGT_DATABASE_NAME\":\s\"(.*)\"", r"\2", help_str
1420
+ )
1421
+ help_str, elem = elem.split(",", 1)
1422
+ trgt_schema_name = re.sub(
1423
+ r"(.*)\"TRGT_SCHEMA_NAME\":\s\"(.*)\"", r"\2", help_str
1424
+ )
1425
+ help_str, elem = elem.split(",", 1)
1426
+ trgt_object_name = re.sub(
1427
+ r"(.*)\"TRGT_OBJECT_NAME\":\s\"(.*)\"", r"\2", help_str
1428
+ )
1429
+ help_str, elem = elem.split(",", 1)
1430
+ trgt_object_type = re.sub(
1431
+ r"(.*)\"TRGT_OBJECT_TYPE\":\s\"(.*)\"", r"\2", help_str
1432
+ )
1433
+ help_str, elem = elem.split(",", 1)
1434
+ src_filter = re.sub(
1435
+ r"(.*)\"SRC_FILTER\":\s(.*)", r"\2", help_str
1436
+ )
1437
+ help_str, elem = elem.split(",", 1)
1438
+ trgt_filter = re.sub(
1439
+ r"(.*)\"TRGT_FILTER\":\s(.*)", r"\2", help_str
1440
+ )
1441
+ help_str, elem = elem.split("],", 1)
1442
+ help_str = help_str + "]" # EXCLUDED_COLUMNS not needed for column level table
1443
+ help_str, elem = elem.split(",", 1) # COLUMNS_EQUAL not needed for column level table
1444
+ help_str, elem = elem.split("],", 1) # COLUMN_INTERSECTION not needed for column level table
1445
+ help_str, elem = elem.split("],", 1) # SRC_COLUMNS_MINUS_TRGT_COLUMNS not needed for column level table
1446
+ help_str = help_str + "]" # SRC_COLUMNS_MINUS_TRGT_COLUMNS not needed for column level table
1447
+ help_str, elem = elem.split("],", 1) # TRGT_COLUMNS_MINUS_SRC_COLUMNS not needed for column level table
1448
+ help_str, elem = elem.split(",", 1) # ROW_COUNTS_EQUAL not needed for column level table
1449
+ help_str, elem = elem.split(",", 1) # SRC_ROW_COUNT not needed for column level table
1450
+ help_str, elem = elem.split(",", 1) # TRGT_ROW_COUNT not needed for column level table
1451
+ help_str, elem = elem.split(",", 1) # ALL_COUNT_NULLS_EQUAL not needed for column level table
1452
+ help_str, elem = elem.split(",", 1) # AGGREGATIONS_EQUAL not needed for column level table
1453
+ help_str, elem = elem.split("},", 1)
1454
+ help_str = help_str + "}"
1455
+ src_error = re.sub(
1456
+ r"(.*)\"SRC_ERROR\":\s(.*)", r"\2", help_str
1457
+ )
1458
+ help_str, elem = elem.split("},", 1)
1459
+ help_str = help_str + "}"
1460
+ trgt_error = re.sub(
1461
+ r"(.*)\"TRGT_ERROR\":\s(.*)", r"\2", help_str
1462
+ )
1463
+ help_str, elem = elem.split(', "SRC_GROUP_BY_QUERY', 1) # GROUP_BY_COLUMNS not needed for column level table
1464
+ elem = '"SRC_GROUP_BY_QUERY' + elem
1465
+ help_str, elem = elem.split(",", 1) # SRC_GROUP_BY_QUERY not needed for column level table
1466
+ help_str, elem = elem.split(",", 1) # TRGT_GROUP_BY_QUERY not needed for column level table
1467
+ help_str, elem = elem.split(",", 1) # GROUP_BY_EQUAL not needed for column level table
1468
+ help_str, elem = elem.split(', "COLUMNS_WITH_MISMATCH', 1) # GROUP_BY_VALUES_WITH_MISMATCHES not needed for column level table
1469
+ elem = '"COLUMNS_WITH_MISMATCH' + elem
1470
+ help_str, elem = elem.split(', "SRC_GROUP_BY_ERROR', 1) # COLUMNS_WITH_MISMATCH not needed for column level table
1471
+ elem = '"SRC_GROUP_BY_ERROR' + elem
1472
+ help_str, elem = elem.split(', "TRGT_GROUP_BY_ERROR', 1) # SRC_GROUP_BY_ERROR not needed for column level table
1473
+ elem = '"TRGT_GROUP_BY_ERROR' + elem
1474
+ help_str, elem = elem.split(', "SAMPLES_COMPARED', 1) # TRGT_GROUP_BY_ERROR not needed for column level table
1475
+ elem = '"SAMPLES_COMPARED' + elem
1476
+ help_str, elem = elem.split(",", 1) # SAMPLES_COMPARED not needed for column level table
1477
+ help_str, elem = elem.split(",", 1) # SAMPLES_EQUAL not needed for column level table
1478
+ help_str, elem = elem.split("},", 1) # SAMPLE_KEYS not needed for column level table
1479
+ help_str, elem = elem.split("}},", 1) # SRC_SAMPLE not needed for column level table
1480
+ help_str, elem = elem.split("}},", 1) # TRGT_SAMPLE not needed for column level table
1481
+ help_str, elem = elem.split(';",', 1) # SRC_SAMPLE_QUERY not needed for column level table
1482
+ help_str, elem = elem.split(';",', 1) # TRGT_SAMPLE_QUERY not needed for column level table
1483
+ help_str, elem = elem.split("},", 1) # SRC_SAMPLE_ERROR_DICT not needed for column level table
1484
+ help_str, elem = elem.split("},", 1) # TRGT_SAMPLE_ERROR_DICT not needed for column level table
1485
+ help_str, elem = elem.split(",", 1) # PANDAS_DATAFRAME_COMPARED not needed for column level table
1486
+ help_str, elem = elem.split(",", 1) # PANDAS_DATAFRAME_EQUAL not needed for column level table
1487
+ help_str, elem = elem.split(",", 1) # SRC_NOT_ALTERED_DURING_COMPARISON not needed for column level table
1488
+ help_str, elem = elem.split(",", 1) # TRGT_NOT_ALTERED_DURING_COMPARISON not needed for column level table
1489
+ help_str, elem = elem.split(",", 1) # SRC_LAST_ALTERED not needed for column level table
1490
+ help_str, elem = elem.split(",", 1) # TRGT_LAST_ALTERED not needed for column level table
1491
+ help_str, elem = elem.split("],", 1) # ALL_COLUMNS not needed for column level table
1492
+ help_str, elem = elem.split("}]}", 1)
1493
+ help_str = help_str + "}]"
1494
+ columns_liststr = re.search(r'(.*)"COLUMNS":\s\[(.*)\]', help_str).group(2)
1495
+ columns_dictlist = columns_liststr.split("}")
1496
+ columns_dictlist = [
1497
+ dictionary + "}"
1498
+ for dictionary in columns_dictlist
1499
+ if len(dictionary) > 0
1500
+ ]
1501
+
1502
+ # extract the information needed for the table on column level
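+ # Illustrative shape of one entry in columns_dictlist (abbreviated):
+ #   {"COLUMN_NAME": "ID", "IN_SRC": True, "IN_TRGT": True, "IN_SYNC": True,
+ #    "IN_EXCLUDED": False, "SRC_DATATYPE": "int", "TRGT_DATATYPE": "int", ...}
+ # As on object level, the values are extracted positionally with split(",", 1).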
1503
+ for column in columns_dictlist:
1504
+ col = re.sub(r"^,", "", column)
1505
+ help_str, col = col.split(",", 1)
1506
+ column_name = re.sub(r"(.*)\"COLUMN_NAME\":\s\"(.*)\"", r"\2", help_str)
1507
+ help_str, col = col.split(",", 1)
1508
+ in_src = re.sub(r"(.*)\"IN_SRC\":\s(.*)", r"\2", help_str)
1509
+ help_str, col = col.split(",", 1)
1510
+ in_trgt = re.sub(r"(.*)\"IN_TRGT\":\s(.*)", r"\2", help_str)
1511
+ help_str, col = col.split(",", 1)
1512
+ in_sync = re.sub(r"(.*)\"IN_SYNC\":\s(.*)", r"\2", help_str)
1513
+ help_str, col = col.split(",", 1)
1514
+ in_excluded = re.sub(r"(.*)\"IN_EXCLUDED\":\s(.*)", r"\2", help_str)
1515
+ help_str, col = col.split(",", 1)
1516
+ if help_str == ' "SRC_DATATYPE": None':
1517
+ src_datatype = "None"
1518
+ else:
1519
+ src_datatype = re.sub(
1520
+ r"(.*)\"SRC_DATATYPE\":\s\"(.*)\"", r"\2", help_str
1521
+ )
1522
+ help_str, col = col.split(",", 1)
1523
+ if help_str == ' "TRGT_DATATYPE": None':
1524
+ trgt_datatype = "None"
1525
+ else:
1526
+ trgt_datatype = re.sub(
1527
+ r"(.*)\"TRGT_DATATYPE\":\s\"(.*)\"", r"\2", help_str
1528
+ )
1529
+ help_str, col = col.split(",", 1)
1530
+ if help_str == ' "AGGREGATION_TYPE": None':
1531
+ aggregation_type = "None"
1532
+ else:
1533
+ aggregation_type = re.sub(
1534
+ r"(.*)\"AGGREGATION_TYPE\":\s\"(.*)\"", r"\2", help_str
1535
+ )
1536
+ help_str, col = col.split(",", 1)
1537
+ aggregation_equal = re.sub(
1538
+ r"(.*)\"AGGREGATION_EQUAL\":\s(.*)", r"\2", help_str
1539
+ )
1540
+ help_str, col = col.split(",", 1)
1541
+ aggregation_result_src = re.sub(
1542
+ r"(.*)\"AGGREGATION_RESULT_SRC\":\s(.*)", r"\2", help_str
1543
+ )
1544
+ help_str, col = col.split(",", 1)
1545
+ aggregation_result_trgt = re.sub(
1546
+ r"(.*)\"AGGREGATION_RESULT_TRGT\":\s(.*)", r"\2", help_str
1547
+ )
1548
+ help_str, col = col.split(",", 1)
1549
+ if help_str == ' "AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC": None':
1550
+ aggregation_difference_trgt_minus_src = "None"
1551
+ else:
1552
+ aggregation_difference_trgt_minus_src = re.sub(
1553
+ r"(.*)\"AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC\":\s\"(.*)\"",
1554
+ r"\2",
1555
+ help_str,
1556
+ )
1557
+ help_str, col = col.split(",", 1)
1558
+ count_nulls_equal = re.sub(
1559
+ r"(.*)\"COUNT_NULLS_EQUAL\":\s(.*)", r"\2", help_str
1560
+ )
1561
+ help_str, col = col.split(",", 1)
1562
+ count_nulls_src = re.sub(
1563
+ r"(.*)\"COUNT_NULLS_SRC\":\s(.*)", r"\2", help_str
1564
+ )
1565
+ help_str, col = col.split(",", 1)
1566
+ count_nulls_trgt = re.sub(
1567
+ r"(.*)\"COUNT_NULLS_TRGT\":\s(.*)", r"\2", help_str
1568
+ )
1569
+ if col == ' "COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC": None}':
1570
+ count_nulls_difference_trgt_minus_src = "None"
1571
+ else:
1572
+ count_nulls_difference_trgt_minus_src = re.sub(
1573
+ r"(.*)\"COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC\":\s\"(.*)\"\}",
1574
+ r"\2",
1575
+ col,
1576
+ )
1577
+
1578
+ insert_statement = f"INSERT INTO {result_table_columnlevel} ( \
1579
+ RUN_GUID,\
1580
+ PIPELINE_ID,\
1581
+ START_TIME_UTC,\
1582
+ SRC_DATABASE_NAME, \
1583
+ SRC_SCHEMA_NAME, \
1584
+ SRC_OBJECT_NAME, \
1585
+ SRC_OBJECT_TYPE, \
1586
+ TRGT_DATABASE_NAME, \
1587
+ TRGT_SCHEMA_NAME, \
1588
+ TRGT_OBJECT_NAME, \
1589
+ TRGT_OBJECT_TYPE, \
1590
+ COLUMN_NAME,\
1591
+ IN_SRC,\
1592
+ IN_TRGT,\
1593
+ IN_SYNC,\
1594
+ IN_EXCLUDED, \
1595
+ SRC_DATATYPE,\
1596
+ TRGT_DATATYPE,\
1597
+ AGGREGATION_TYPE,\
1598
+ AGGREGATION_EQUAL,\
1599
+ AGGREGATION_RESULT_SRC,\
1600
+ AGGREGATION_RESULT_TRGT,\
1601
+ AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC,\
1602
+ COUNT_NULLS_EQUAL,\
1603
+ COUNT_NULLS_SRC,\
1604
+ COUNT_NULLS_TRGT,\
1605
+ COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC,\
1606
+ ERROR_QUERY_SRC,\
1607
+ ERROR_MSG_SRC,\
1608
+ ERROR_QUERY_TRGT,\
1609
+ ERROR_MSG_TRGT,\
1610
+ ERROR_FLAG,\
1611
+ CREATION_TS_UTC)\
1612
+ WITH errors_src AS (SELECT\
1613
+ json_tuple('{src_error}', 'QUERY', 'ERROR') AS (ERROR_QUERY_SRC, ERROR_MSG_SRC)\
1614
+ ),\
1615
+ errors_trgt AS (SELECT\
1616
+ json_tuple('{trgt_error}', 'QUERY', 'ERROR') AS (ERROR_QUERY_TRGT, ERROR_MSG_TRGT)\
1617
+ )\
1618
+ SELECT\
1619
+ RESULTS.RUN_GUID AS RUN_GUID,\
1620
+ RESULTS.PIPELINE_ID AS PIPELINE_ID,\
1621
+ RESULTS.START_TIME_UTC::STRING AS START_TIME_UTC,\
1622
+ '{src_database_name}' AS SRC_DATABASE_NAME,\
1623
+ '{src_schema_name}' AS SRC_SCHEMA_NAME,\
1624
+ '{src_object_name}' AS SRC_OBJECT_NAME,\
1625
+ '{src_object_type}' AS SRC_OBJECT_TYPE,\
1626
+ '{trgt_database_name}' AS TRGT_DATABASE_NAME,\
1627
+ '{trgt_schema_name}' AS TRGT_SCHEMA_NAME,\
1628
+ '{trgt_object_name}' AS TRGT_OBJECT_NAME,\
1629
+ '{trgt_object_type}' AS TRGT_OBJECT_TYPE,\
1630
+ '{column_name}' AS COLUMN_NAME,\
1631
+ '{in_src}'::BOOLEAN AS IN_SRC,\
1632
+ '{in_trgt}'::BOOLEAN AS IN_TRGT,\
1633
+ '{in_sync}'::BOOLEAN AS IN_SYNC,\
1634
+ '{in_excluded}'::BOOLEAN AS IN_EXCLUDED,\
1635
+ '{src_datatype}' AS SRC_DATATYPE,\
1636
+ '{trgt_datatype}' AS TRGT_DATATYPE,\
1637
+ '{aggregation_type}' AS AGGREGATION_TYPE,\
1638
+ '{aggregation_equal}'::BOOLEAN AS AGGREGATION_EQUAL,\
1639
+ '{aggregation_result_src}' AS AGGREGATION_RESULT_SRC,\
1640
+ '{aggregation_result_trgt}' AS AGGREGATION_RESULT_TRGT,\
1641
+ '{aggregation_difference_trgt_minus_src}' AS AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC,\
1642
+ '{count_nulls_equal}'::BOOLEAN AS COUNT_NULLS_EQUAL,\
1643
+ '{count_nulls_src}'::INT AS COUNT_NULLS_SRC,\
1644
+ '{count_nulls_trgt}'::INT AS COUNT_NULLS_TRGT,\
1645
+ '{count_nulls_difference_trgt_minus_src}' AS COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC,\
1646
+ ERROR_QUERY_SRC,\
1647
+ ERROR_MSG_SRC,\
1648
+ ERROR_QUERY_TRGT,\
1649
+ ERROR_MSG_TRGT,\
1650
+ CASE WHEN ERROR_MSG_SRC IS NULL AND ERROR_MSG_TRGT IS NULL THEN FALSE ELSE TRUE END AS ERROR_FLAG,\
1651
+ '{date_utc}'\
1652
+ FROM {result_table} RESULTS, errors_src, errors_trgt\
1653
+ WHERE RUN_GUID = '{run_guid}';"
1654
+
1655
+ self.execute_statement(insert_statement)
1656
+
1657
+ def get_objects_in_result_column(
1658
+ self,
1659
+ result_table: str,
1660
+ run_guid: str,
1661
+ ):
1662
+ """
1663
+ The RESULT column stores the comparison output as an almost-JSON string
1664
+ (quotes were replaced when it was written back), and converting it back
1665
+ into a dictionary is not possible because the string is also used in
1666
+ other contexts. This method therefore extracts the list of per-object
1667
+ dictionaries (one per compared object) by plain string parsing.
1668
+ """
1669
+
1670
+ select_statement = (
1671
+ f"SELECT RESULT FROM {result_table} WHERE RUN_GUID = '{run_guid}'"
1672
+ )
1673
+
1674
+ results_dict = self.execute_queries(select_statement)[0][0]
1675
+ result_string = re.search(
1676
+ r'"OBJECTS":(.*)', results_dict, flags=re.DOTALL
1677
+ ).group(1)
1678
+ result_string = re.sub(
1679
+ "}$", "", result_string
1680
+ ) # remove the trailing } of the outer dictionary that wraps the OBJECTS list
1681
+ result_dictstr = re.sub(r"^\s\[(.*)]$", r"\1", result_string, flags=re.DOTALL)
1682
+ dict_list = result_dictstr.split(
1683
+ '{"SRC_DATABASE_NAME"'
1684
+ ) # cannot split dictionaries at } because there are dicts in the dict
1685
+ dict_list = [
1686
+ '{"SRC_DATABASE_NAME"' + dictionary
1687
+ for dictionary in dict_list
1688
+ if len(dictionary) > 0
1689
+ ] # add the string used for splitting
1690
+ dict_list = [
1691
+ re.sub(r",\s$", "", dictionary) for dictionary in dict_list
1692
+ ] # remove ', ' at the end for those dicts not at the end of the list
1693
+
1694
+ return dict_list
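+ # Usage sketch (hypothetical; the class instantiation and table identifiers
+ # below are assumptions for illustration, only the method signatures above
+ # are taken from this module):
+ #
+ #   service = DatabricksHiveMetastoreService(...)  # assumed class name/constructor
+ #   service.insert_columnlevel_results(
+ #       result_table="validation_db.results",
+ #       result_table_columnlevel="validation_db.results_columnlevel",
+ #       run_guid="<run guid written by the main pipeline>",
+ #   )
+ #   # get_objects_in_result_column() is called internally to fetch one
+ #   # JSON-like string per compared object before the per-column insert.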