icsDataValidation-1.0.358-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. icsDataValidation/configuration.py +19 -0
  2. icsDataValidation/connection_setups/__init__.py +0 -0
  3. icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
  4. icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
  5. icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
  6. icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
  7. icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
  8. icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
  9. icsDataValidation/core/__init__.py +0 -0
  10. icsDataValidation/core/database_objects.py +18 -0
  11. icsDataValidation/core/object_comparison.py +239 -0
  12. icsDataValidation/input_parameters/__init__.py +0 -0
  13. icsDataValidation/input_parameters/testing_tool_params.py +81 -0
  14. icsDataValidation/main.py +250 -0
  15. icsDataValidation/output_parameters/__init__.py +0 -0
  16. icsDataValidation/output_parameters/result_params.py +94 -0
  17. icsDataValidation/services/__init__.py +0 -0
  18. icsDataValidation/services/comparison_service.py +582 -0
  19. icsDataValidation/services/database_services/__init__.py +0 -0
  20. icsDataValidation/services/database_services/azure_service.py +320 -0
  21. icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
  22. icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
  23. icsDataValidation/services/database_services/exasol_service.py +261 -0
  24. icsDataValidation/services/database_services/oracle_service.py +713 -0
  25. icsDataValidation/services/database_services/snowflake_service.py +1100 -0
  26. icsDataValidation/services/database_services/teradata_service.py +665 -0
  27. icsDataValidation/services/initialization_service.py +103 -0
  28. icsDataValidation/services/result_service.py +573 -0
  29. icsDataValidation/services/system_service.py +61 -0
  30. icsDataValidation/services/testset_service.py +257 -0
  31. icsDataValidation/utils/__init__.py +0 -0
  32. icsDataValidation/utils/file_util.py +96 -0
  33. icsDataValidation/utils/logger_util.py +96 -0
  34. icsDataValidation/utils/pandas_util.py +159 -0
  35. icsDataValidation/utils/parallelization_util.py +52 -0
  36. icsDataValidation/utils/sql_util.py +14 -0
  37. icsDataValidation-1.0.358.dist-info/METADATA +21 -0
  38. icsDataValidation-1.0.358.dist-info/RECORD +40 -0
  39. icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
  40. icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
icsDataValidation/services/database_services/databricks_unity_catalog_service.py
@@ -0,0 +1,1379 @@
1
+ from databricks import sql as databricks_sqlconnect
2
+ import pandas as pd
3
+ import logging
4
+ import re
5
+ from datetime import datetime
6
+
7
+ from typing import Union, List, Dict
8
+ from pathlib import PurePath
9
+
10
+ from icsDataValidation.utils.logger_util import configure_dev_ops_logger
11
+ from icsDataValidation.core.database_objects import DatabaseObject
12
+
13
+ #########################################################################################
14
+ #########################################################################################
15
+
16
+ # Configure Dev Ops Logger
17
+
18
+ logger = logging.getLogger("Databricks_Unity_Catalog_Service")
19
+ logger.setLevel(logging.INFO)
20
+ configure_dev_ops_logger(logger)
21
+
22
+
23
+ class DatabricksUnityCatalogService(object):
24
+ def __init__(self, connection_params: dict):
25
+ self.connection_params = connection_params
26
+ self.databricks_connection = None
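+ # maps generic datatype categories to the Databricks data types they cover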
27
+ self.databricks_datatype_mapping = {
28
+ "string": ["string", "array", "map", "struct"],
29
+ "numeric": [
30
+ "int",
31
+ "bigint",
32
+ "double",
33
+ "decimal",
34
+ "float",
35
+ "smallint",
36
+ "tinyint",
37
+ ],
38
+ "date_and_time": [
39
+ "timestamp",
40
+ "date",
41
+ "interval",
42
+ "timestamp_ntz",
43
+ "timestamp_tz",
44
+ "timestamp_ltz",
45
+ ],
46
+ "binary": ["binary"],
47
+ "boolean": ["boolean"],
48
+ }
49
+
50
+ def __enter__(self):
51
+ return self
52
+
53
+ def __exit__(self, exception_type, exception_value, traceback):
54
+ if self.databricks_connection is not None:
55
+ self.databricks_connection.close()
56
+
57
+ def __del__(self):
58
+ if self.databricks_connection is not None:
59
+ self.databricks_connection.close()
60
+
61
+ def _connect_to_databricks(self):
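+ """opens a connection to Databricks via databricks.sql.connect using the stored connection parameters"""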
62
+ self.databricks_connection = databricks_sqlconnect.connect(
63
+ **self.connection_params
64
+ )
65
+ return self.databricks_connection
66
+
67
+ @staticmethod
68
+ def _get_error_message(exception: Exception, statement: str) -> str:
69
+ """
70
+ Compose error message if the execution of a statement or query fails.
71
+ """
72
+ if hasattr(exception, "raw_msg"):
73
+ message = exception.raw_msg.replace("\n", " ")
74
+ else:
75
+ message = str(
76
+ exception
77
+ )  # this makes sure that all kinds of errors can have a message, even if they do not have a raw_msg attribute
78
+ if hasattr(exception, "sfqid"):
79
+ message = message + f"\nQuery ID: {exception.sfqid}"
80
+ return f"Databricks ERROR: {message}\nFailed statement:\n{statement}"
81
+
82
+ @staticmethod
83
+ def _get_in_clause(
84
+ key_filters: list,
85
+ numeric_columns: list,
86
+ numeric_scale: int,
87
+ where_exists: bool = True,
88
+ ) -> str:
89
+ """generates in_clause from list ready to expand the where clause, numeric values are rounded
90
+
91
+ Args:
92
+ key_filters (dict): dict mapping key columns to their expected values
93
+ numeric_columns (list): list of all numeric columns
94
+ numeric_scale (int): number of decimal places after rounding
95
+
96
+ Returns:
97
+ str: in clause as string
98
+ """
99
+ values = list(key_filters.values())
100
+ in_clause_values = "('"
101
+ for j in range(len(values[0])):
102
+ for value in values:
103
+ in_clause_values += str(value[j]) + "','"
104
+ in_clause_values = in_clause_values[:-2] + "),('"
105
+ in_clause_values = in_clause_values[:-3] + ")"
106
+
107
+ if where_exists:
108
+ in_clause_cols = f" AND (("
109
+ else:
110
+ in_clause_cols = f" WHERE (("
111
+ for key in key_filters.keys():
112
+ if key in numeric_columns:
113
+ in_clause_cols += f"""ROUND({key.replace("'", "")},2)""" + ","
114
+ else:
115
+ in_clause_cols += key.replace("'", "") + ","
116
+ in_clause_cols = in_clause_cols[:-1] + ")"
117
+ in_clause = in_clause_cols + " in (" + in_clause_values + ")"
118
+ return in_clause
119
+
120
+ def _get_column_clause(
121
+ self, column_list: list, columns_datatype: list, numeric_scale, key_columns
122
+ ) -> tuple:
123
+ """turns list of desired columns into a sql compatible string
124
+
125
+ Args:
126
+ column_list (list): list of all columns
127
+ columns_datatype (list): datatypes of given columns
128
+ numeric_scale (int): number of decimal places for numeric columns
129
+ key_columns (list): list of key columns of interest
130
+
131
+ Returns:
132
+ tuple: column clause string, list of numeric columns, list of used columns
133
+ """
134
+ column_intersections_new = []
135
+ used_columns = []
136
+ numeric_columns = []
137
+ for column in column_list:
138
+ column_datatype = next(
139
+ x for x in columns_datatype if x["COLUMN_NAME"] == column
140
+ )["DATA_TYPE"]
141
+
142
+ if column in key_columns or not (
143
+ column_datatype.lower()
144
+ in self.databricks_datatype_mapping["date_and_time"]
145
+ ):
146
+ if (
147
+ column_datatype.lower()
148
+ in self.databricks_datatype_mapping["numeric"]
149
+ ):
150
+ if numeric_scale:
151
+ column_intersections_new.append(
152
+ f"CAST(ROUND({column}, {numeric_scale}) as decimal(38,{numeric_scale})) as {column}"
153
+ )
154
+ else:
155
+ column_intersections_new.append(f"{column} AS {column}")
156
+ used_columns.append(column)
157
+ numeric_columns.append(column)
158
+ elif (
159
+ column_datatype.lower()
160
+ in self.databricks_datatype_mapping["string"]
161
+ ):
162
+ column_intersections_new.append(f"{column} AS {column}")
163
+ used_columns.append(column)
164
+ else:
165
+ column_intersections_new.append(column)
166
+ used_columns.append(column)
167
+
168
+ column_intersections = column_intersections_new.copy()
169
+ column_clause = str(column_intersections)[1:-1].replace("'", "")
170
+ return column_clause, numeric_columns, used_columns
171
+
172
+ def get_database_objects(
173
+ self,
174
+ database: str,
175
+ schema: str = None,
176
+ object_type_restriction: str = "include_all",
177
+ ) -> list:
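+ """lists the tables and/or views of a database, restricted to one schema
+
+ Args:
+ database (str): database (catalog) name
+ schema (str, optional): schema name; required by the current implementation. Defaults to None.
+ object_type_restriction (str, optional): "include_all", "include_only_tables" or "include_only_views". Defaults to "include_all".
+
+ Returns:
+ list: one dict per object with its object_identifier and object_type ("table" or "view")
+ """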
178
+ if self.databricks_connection is None:
179
+ self._connect_to_databricks()
180
+
181
+ all_database_tables = []
182
+ all_database_views = []
183
+
184
+ if (
185
+ object_type_restriction == "include_all"
186
+ or object_type_restriction == "include_only_tables"
187
+ ):
188
+ if schema:
189
+ query_db_tables = f"SELECT table_schema, table_name FROM {database}.information_schema.tables WHERE table_schema == '{schema.lower()}' and table_type != 'VIEW'"
190
+ else:
191
+ logger.error(
192
+ "Query defined as null - please check input for execute_queries function."
193
+ )
194
+ exit()
195
+
196
+ all_database_tables = self.execute_queries(query_db_tables)
197
+
198
+ if (
199
+ object_type_restriction == "include_all"
200
+ or object_type_restriction == "include_only_views"
201
+ ):
202
+ if schema:
203
+ query_db_views = f"SELECT table_schema, table_name FROM {database}.information_schema.tables WHERE table_schema == '{schema.lower()}' and table_type == 'VIEW'"
204
+ else:
205
+ logger.error(
206
+ "Query defined as null - please check input for execute_queries function."
207
+ )
208
+ exit()
209
+
210
+ all_database_views = self.execute_queries(query_db_views)
211
+
212
+ database_objects = []
213
+ for row in all_database_tables:
214
+ database_table = (
215
+ f'{database}.{row["table_schema"]}.{row["table_name"]}'.upper()
216
+ )
217
+ database_objects.append(
218
+ {"object_identifier": database_table, "object_type": "table"}
219
+ )
220
+ for row in all_database_views:
221
+ database_view = (
222
+ f'{database}.{row["table_schema"]}.{row["table_name"]}'.upper()
223
+ )
224
+ database_objects.append(
225
+ {"object_identifier": database_view, "object_type": "view"}
226
+ )
227
+ return database_objects
228
+
229
+ def get_last_altered_timestamp_from_object(self, object: DatabaseObject) -> str:
230
+ """queries last_altered timestamp for given object
231
+
232
+ Args:
233
+ object (str): object for comparison
234
+
235
+ Returns:
236
+ str: last_altered timestamp
237
+ """
238
+ if self.databricks_connection is None:
239
+ self._connect_to_databricks()
240
+
241
+ self.execute_statement("ALTER SESSION SET TIMEZONE = 'Europe/London';")
242
+
243
+ query_get_last_altered = f"SELECT LAST_ALTERED FROM {object.database}.INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = '{object.name}' AND TABLE_SCHEMA = '{object.schema}';"
244
+
245
+ last_altered = self.execute_queries(query_get_last_altered)[0]
246
+
247
+ return last_altered
248
+
249
+ def get_columns_from_object(self, object: DatabaseObject) -> list:
250
+ """returns all columns from given object
251
+
252
+ Args:
253
+ object (DatabaseObject): table or view
254
+
255
+ Returns:
256
+ list: list of all columns
257
+ """
258
+
259
+ if self.databricks_connection is None:
260
+ self._connect_to_databricks()
261
+
262
+ query_show_columns = f"SELECT column_name FROM {object.database}.information_schema.columns WHERE table_schema == '{object.schema.lower()}' and table_name == '{object.name.lower()}'"
263
+
264
+ all_columns = self.execute_queries(query_show_columns)
265
+ columns = []
266
+
267
+ for row in all_columns:
268
+ columns.append(row["column_name"])
269
+
270
+ return columns
271
+
272
+ def get_row_count_from_object(self, object: DatabaseObject, where_clause: str = "") -> tuple:
273
+ """gets row count from given object
274
+
275
+ Args:
276
+ object (DatabaseObject): table or view
277
+
278
+ Returns:
279
+ tuple: number of rows in the object and a list of errors from failed queries
280
+ """
281
+
282
+ if self.databricks_connection is None:
283
+ self._connect_to_databricks()
284
+
285
+ # is it more efficient to select the information_schema.table view to get the rows?
286
+ query_get_row_count = f"SELECT COUNT(*) AS ROW_COUNT FROM {object.database}.{object.schema}.{object.name} {where_clause};"
287
+ row_count = -1
288
+ error_list = []
289
+
290
+ try:
291
+ row_count = self.execute_queries(query_get_row_count)[0]["ROW_COUNT"]
292
+
293
+ except Exception as err:
294
+ error_list.append(str(err))
295
+ error_list.append(query_get_row_count)
296
+
297
+ return row_count, error_list
298
+
299
+ def get_data_types_from_object(
300
+ self, object: DatabaseObject, column_intersections: list
301
+ ) -> list:
302
+ """returns datatypes for all intersection columns in a database object
303
+
304
+ Args:
305
+ object (DatabaseObject): table or view
306
+ column_intersections (list): columns for which the data type is queried
307
+
308
+ Returns:
309
+ list: one dict per column with its COLUMN_NAME and DATA_TYPE
310
+ """
311
+
312
+ if self.databricks_connection is None:
313
+ self._connect_to_databricks()
314
+
315
+ column_intersections = str(column_intersections)[1:-1]
316
+ if column_intersections == "":
317
+ column_intersections = "''"
318
+
319
+ query_get_data_types_from_object = f"SELECT column_name, data_type FROM {object.database}.information_schema.columns WHERE table_schema == '{object.schema.lower()}' and table_name == '{object.name.lower()}'"
320
+
321
+ table_description = self.execute_queries(query_get_data_types_from_object)
322
+
323
+ dict_colummns_datatype = []
324
+
325
+ for row in table_description:
326
+ dict_colummns_datatype.append(
327
+ {"COLUMN_NAME": row["column_name"], "DATA_TYPE": row["data_type"]}
328
+ )
329
+ return dict_colummns_datatype
330
+
331
+ def get_count_distincts_from_object(
332
+ self,
333
+ object: DatabaseObject,
334
+ column_intersections: list,
335
+ where_clause: str = "",
336
+ exclude_columns: list = [],
337
+ ) -> tuple:
338
+ """get distinct count for every column in a database object that is in column intersections list
339
+
340
+ Args:
341
+ object (DatabaseObject): table or view
342
+ column_intersections (list): columns that are used for distinct count
343
+ where_clause (str, optional): optional further filter. Defaults to "".
344
+ exclude_columns (list, optional): columns to exclude from distinct count. Defaults to [].
345
+
346
+ Returns:
347
+ dict: distinct counts for columns
348
+ error_list: list of failed executions for distinct counts
349
+ """
350
+
351
+ if self.databricks_connection is None:
352
+ self._connect_to_databricks()
353
+
354
+ unions = ""
355
+
356
+ for column in column_intersections:
357
+ if column not in exclude_columns:
358
+ unions += f" UNION SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.database}.{object.schema}.{object.name} {where_clause}"
359
+
360
+ query_get_count_distincts_from_object = f"{unions[6:]} ORDER BY COUNT_DISTINCT;"
361
+ error_list = []
362
+ try:
363
+ dict_count_distincts = self.execute_queries(
364
+ query_get_count_distincts_from_object
365
+ )
366
+
367
+ except Exception as err:
368
+ # raise err
369
+ dict_count_distincts = [{"COUNT_DISTINCT": 0}]
370
+ error_list.append(
371
+ ["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]]
372
+ )
373
+
374
+ return dict_count_distincts, error_list
375
+
376
+ def get_table_size(self, object: DatabaseObject) -> int:
377
+ """returns size of given object
378
+
379
+ Args:
380
+ object (DatabaseObject): table or view
381
+
382
+ Returns:
383
+ int: size of object
384
+ """
385
+
386
+ if self.databricks_connection is None:
387
+ self._connect_to_databricks()
388
+
389
+ query_analyze_table = f"ANALYZE TABLE {object.database}.{object.schema}.{object.name} COMPUTE STATISTICS NOSCAN"
390
+ self.execute_queries(query_analyze_table)
391
+
392
+ query_get_table_size = (
393
+ f"DESC EXTENDED {object.database}.{object.schema}.{object.name}"
394
+ )
395
+
396
+ table_description = self.execute_queries(query_get_table_size)
397
+ size_string = [
398
+ row["data_type"]
399
+ for row in table_description
400
+ if row["col_name"] == "Statistics"
401
+ ][0]
402
+ size = int(re.search(r"\d+", size_string).group())
403
+
404
+ return size
405
+
406
+ def create_checksums(
407
+ self,
408
+ object: DatabaseObject,
409
+ column_intersections: list,
410
+ where_clause: str = "",
411
+ exclude_columns: list = [],
412
+ numeric_scale: int = None,
413
+ ) -> Dict:
414
+ """creates checksums for given object in compliance with given conditions
415
+
416
+ Args:
417
+ object (DatabaseObject): table or view
418
+ column_intersections (list): columns that are used for checksums
419
+ where_clause (str, optional): Optional filter criteria given as sql-usable string. Defaults to "".
420
+ exclude_columns (list, optional): columns to exclude from calculation. Defaults to [].
421
+ numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.
422
+
423
+ Returns:
424
+ Dict: checksums per column of the object; errors are collected under the key "TESTATM_ERRORS"
425
+ """
426
+
427
+ if self.databricks_connection is None:
428
+ self._connect_to_databricks()
429
+
430
+ column_intersections = [f"{x.upper()}" for x in column_intersections if x not in exclude_columns]
431
+
432
+ dict_colummns_datatype = self.get_data_types_from_object(object, column_intersections)
433
+
434
+ aggregates = ""
435
+ count_nulls = ""
436
+
437
+ for column in column_intersections:
438
+ column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
439
+
440
+ count_nulls += f", SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END) AS COUNTNULLS_{column}"
441
+
442
+ if column_datatype.lower() in self.databricks_datatype_mapping["numeric"]:
443
+
444
+ if numeric_scale:
445
+ aggregates += f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS sum_{column}"
446
+ else:
447
+ aggregates += f", CAST(SUM({column}) AS DECIMAL(38)) AS sum_{column}"
448
+
449
+ elif (
450
+ column_datatype.lower() in self.databricks_datatype_mapping["string"]
451
+ or column_datatype.lower() in self.databricks_datatype_mapping["date_and_time"]
452
+ ):
453
+
454
+ aggregates += f", COUNT(DISTINCT LOWER({column})) AS countdistinct_{column}"
455
+
456
+ elif column_datatype.lower() in self.databricks_datatype_mapping["binary"]:
457
+
458
+ aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS countdistinct_{column}"
459
+
460
+ elif column_datatype.lower() in self.databricks_datatype_mapping["boolean"]:
461
+ aggregates += f", MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)) || '_' || MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false)) AS aggregateboolean_{column}"
462
+
463
+ # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
464
+
465
+ query_checksums = f"SELECT {aggregates[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
466
+
467
+ query_countnulls = f"SELECT {count_nulls[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
468
+
469
+ error_list = []
470
+ checksums={}
471
+
472
+ try:
473
+ checksums_results = self.execute_queries(
474
+ [query_checksums, query_countnulls]
475
+ )
476
+
477
+ aggregation_results = checksums_results[0][0]
478
+ countnulls_results = checksums_results[1][0]
479
+
480
+ checksums = {}
481
+ for key in aggregation_results.asDict().keys():
482
+ aggregation = key.split("_", 1)[0].upper()
483
+ col_name = key.split("_", 1)[1]
484
+ value = aggregation_results[key]
485
+ cnt_nulls = countnulls_results[f"COUNTNULLS_{col_name}"]
486
+ checksums[col_name] = [aggregation, value, cnt_nulls]
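+ # e.g. checksums["AMOUNT"] = ["SUM", 1234.56, 0] (hypothetical column): aggregation type, aggregated value, number of NULLs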
487
+
488
+ except Exception as err:
489
+ # TODO: Improve error formatting
490
+ error_list.append(["ERROR", query_checksums, str(err)])
491
+
492
+ checksums["TESTATM_ERRORS"] = error_list
493
+
494
+ return checksums
495
+
496
+ def create_pandas_df_from_group_by(
497
+ self,
498
+ object: DatabaseObject,
499
+ column_intersections: list,
500
+ group_by_columns: list,
501
+ group_by_aggregation_columns: list,
502
+ group_by_aggregation_type: str,
503
+ only_numeric: bool,
504
+ where_clause: str,
505
+ exclude_columns: list,
506
+ numeric_scale: int = None,
507
+ ) -> tuple:
508
+ """execution of multiple aggregations at once
509
+
510
+ Args:
511
+ object (DatabaseObject): table or view
512
+ column_intersections (list): columns existing in src and trgt
513
+ group_by_columns (list): columns for grouping the aggregations
514
+ group_by_aggregation_columns (list): list of columns that are supposed to be aggregated
515
+ group_by_aggregation_type (str): choice between: only_min_max, various, various_and_min_max
516
+ only_numeric (bool): whether to also include distinct counts or only do numeric aggregations
517
+ where_clause (str): optional filter for aggregations, given as sql compatible where-string
518
+ exclude_columns (list): columns to exclude from comparisons
519
+ numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.
520
+
521
+ Returns:
522
+ tuple: pandas dataframe with the aggregation results, the generated aggregation and group-by column strings, the final grouping columns, and an error dict
523
+ """
524
+
525
+ if self.databricks_connection is None:
526
+ self._connect_to_databricks()
527
+
528
+ if group_by_aggregation_columns == ["all"]:
529
+ aggregation_columns = [
530
+ f"{column.upper()}"
531
+ for column in column_intersections
532
+ if (column not in group_by_columns and column not in exclude_columns)
533
+ ]
534
+ else:
535
+ aggregation_columns = [
536
+ f"{column.upper()}"
537
+ for column in column_intersections
538
+ if (
539
+ column in group_by_aggregation_columns
540
+ and column not in exclude_columns
541
+ )
542
+ ]
543
+
544
+ group_by_query_columns_string = " "
545
+ grouping_columns_final = []
546
+ error_dict = {}
547
+
548
+ try:
549
+ for column in group_by_columns:
550
+ if column in column_intersections and column not in exclude_columns:
551
+ group_by_query_columns_string += f"{column} ,"
552
+ grouping_columns_final.append(column)
553
+
554
+ group_by_query_columns_string = group_by_query_columns_string[:-1]
555
+
556
+ dict_colummns_datatype = self.get_data_types_from_object(
557
+ object, aggregation_columns
558
+ )
559
+
560
+ aggregates = ""
561
+ aggregates_min = ""
562
+
563
+ for column in aggregation_columns:
564
+ column_datatype = next(
565
+ x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column
566
+ )["DATA_TYPE"]
567
+
568
+ if (
569
+ column_datatype.lower()
570
+ in self.databricks_datatype_mapping["numeric"]
571
+ ):
572
+ if numeric_scale:
573
+ aggregates_min += f", CAST(ROUND(MIN({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MIN_{column}, CAST(ROUND(max({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MAX_{column}"
574
+ aggregates += f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS SUM_{column}"
575
+
576
+ else:
577
+ aggregates_min += f", MIN({column}) AS MIN_{column}, MAX({column}) AS MAX_{column}"
578
+ aggregates += f", SUM({column}) AS SUM_{column}"
579
+
580
+ elif not only_numeric and (
581
+ column_datatype.lower()
582
+ in self.databricks_datatype_mapping["string"]
583
+ or column_datatype.lower()
584
+ in self.databricks_datatype_mapping["date_and_time"]
585
+ ):
586
+ aggregates += (
587
+ f", COUNT(DISTINCT LOWER({column})) AS COUNTDISTINCT_{column}"
588
+ )
589
+
590
+ elif (
591
+ not only_numeric
592
+ and column_datatype.lower()
593
+ in self.databricks_datatype_mapping["binary"]
594
+ ):
595
+ aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS COUNTDISTINCT_{column}"
596
+
597
+ elif (
598
+ not only_numeric
599
+ and column_datatype.lower()
600
+ in self.databricks_datatype_mapping["boolean"]
601
+ ):
602
+ aggregates += f", MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)) || '_' || MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false)) AS AGGREGATEBOOLEAN_{column}"
603
+
604
+ # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
605
+
606
+ # CASE 1: min_max
607
+ if group_by_aggregation_type == "only_min_max":
608
+ group_by_query_aggregation_string = aggregates_min[1:]
609
+
610
+ # CASE 2: sum, count_distinct, aggregate_boolean
611
+ elif group_by_aggregation_type == "various":
612
+ group_by_query_aggregation_string = aggregates[1:]
613
+
614
+ # CASE 3: sum, count_distinct, aggregate_boolean, min_max
615
+ elif group_by_aggregation_type == "various_and_min_max":
616
+ group_by_query_aggregation_string = f"{aggregates_min[1:]}{aggregates}"
617
+
618
+ query_group_by_aggregation = f"SELECT {group_by_query_columns_string}, COUNT(*) AS COUNT_OF_GROUP_BY_VALUE, {group_by_query_aggregation_string} FROM {object.database}.{object.schema}.{object.name} {where_clause} GROUP BY {group_by_query_columns_string} ORDER BY {group_by_query_columns_string};"
619
+
620
+ group_by_aggregation_pdf = self.execute_queries(
621
+ query_group_by_aggregation, True
622
+ )
623
+ except Exception as err:
624
+ group_by_aggregation_pdf = pd.DataFrame()
625
+ group_by_aggregation_pdf["TESTATM_ERROR"] = [1]
626
+ if not grouping_columns_final:
627
+ error_dict = {
628
+ "QUERY": "NO Group-BY Columns found in the Columns Intersection. Please check if the configurated Group-By Columns exist in the Table",
629
+ "ERROR": "NO Group-BY Columns found in the Columns Intersection. Please check if the configurated Group-By Columns exist in the Table"
630
+ }
631
+ group_by_query_aggregation_string = ""
632
+ elif "|||" in str(err):
633
+ error_dict = {
634
+ "QUERY": str(err).split("|||")[0],
635
+ "ERROR": str(err).split("|||")[1],
636
+ }
637
+ else:
638
+ error_dict = {
639
+ "QUERY": "NO Query generated. Please check if the configurated Grouping Columns exist in the Table",
640
+ "ERROR": str(err),
641
+ }
642
+ group_by_query_aggregation_string = ""
643
+
644
+ return (
645
+ group_by_aggregation_pdf,
646
+ group_by_query_aggregation_string,
647
+ group_by_query_columns_string,
648
+ grouping_columns_final,
649
+ error_dict
650
+ )
651
+
652
+
653
+ def create_pandas_df(
654
+ self, object: DatabaseObject, intersection_columns_trgt_src: list, where_clause:str="", exclude_columns:list=[]
655
+ ) -> pd.DataFrame:
656
+ """creates pandas dataframes with all data from given object in given columns
657
+
658
+ Args:
659
+ object (DatabaseObject): table or view
660
+ intersection_columns_trgt_src (list): columns existing in source and target
661
+
662
+ Returns:
663
+ pd.DataFrame: direct result of sql query
664
+ """
665
+ if self.databricks_connection is None:
666
+ self._connect_to_databricks()
667
+
668
+ intersection_columns_trgt_src_ = ', '.join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
669
+
670
+ df_query = f"SELECT {intersection_columns_trgt_src_} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
671
+
672
+ src_pdf = self.execute_queries(df_query, True)
673
+
674
+ return src_pdf
675
+
676
+ def create_pandas_df_from_sample(
677
+ self,
678
+ object: DatabaseObject,
679
+ column_intersections: list,
680
+ key_columns: list,
681
+ where_clause: str = "",
682
+ exclude_columns: list = [],
683
+ key_filters: dict = {},
684
+ dedicated_columns: list = [],
685
+ sample_count: int = 10,
686
+ numeric_scale: int = None,
687
+ ) -> tuple:
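+ """creates a pandas dataframe from a row sample of the given object
+
+ Args:
+ object (DatabaseObject): table or view
+ column_intersections (list): columns existing in src and trgt
+ key_columns (list): key columns used to order and align the sample
+ where_clause (str, optional): optional filter given as sql-usable string. Defaults to "".
+ exclude_columns (list, optional): columns to exclude from the sample. Defaults to [].
+ key_filters (dict, optional): expected key values used to filter the sample. Defaults to {}.
+ dedicated_columns (list, optional): if given, restricts the sample to these columns. Defaults to [].
+ sample_count (int, optional): number of rows to sample. Defaults to 10.
+ numeric_scale (int, optional): number of decimal places for numeric columns. Defaults to None.
+
+ Returns:
+ tuple: list with the sample dataframe and an error dict, the sampled key values, the used columns, and the sample query
+ """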
688
+ if self.databricks_connection is None:
689
+ self._connect_to_databricks()
690
+
691
+ where_exists = True
692
+ if not where_clause:
693
+ where_exists = False
694
+
695
+ sample_count = str(sample_count)
696
+ key_intersection = list(
697
+ (set(column_intersections) & set(key_columns)) - set(exclude_columns)
698
+ )
699
+ filter_intersection = list(
700
+ (set(column_intersections) & set(key_filters.keys())) - set(exclude_columns)
701
+ )
702
+ dedicated_intersection = list(
703
+ (set(column_intersections) & set(dedicated_columns)) - set(exclude_columns)
704
+ )
705
+
706
+ key_intersection.sort()
707
+ filter_intersection.sort()
708
+ dedicated_intersection.sort()
709
+
710
+ if dedicated_intersection != []:
711
+ is_dedicated = True
712
+
713
+ dict_colummns_datatype = self.get_data_types_from_object(
714
+ object, dedicated_intersection
715
+ )
716
+
717
+ else:
718
+ is_dedicated = False
719
+
720
+ dict_colummns_datatype = self.get_data_types_from_object(
721
+ object, column_intersections
722
+ )
723
+
724
+ if key_intersection != [] and is_dedicated:
725
+ keys = str(key_intersection)[1:-1].replace("'", "")
726
+ column_clause, numeric_columns, used_columns = self._get_column_clause(
727
+ dedicated_intersection,
728
+ dict_colummns_datatype,
729
+ numeric_scale,
730
+ key_columns,
731
+ )
732
+ in_clause = ""
+ if (key_filters != {}) & (filter_intersection != []):
733
+ values = list(key_filters.values())
734
+ if values[0] != []:
735
+ in_clause = self._get_in_clause(
736
+ key_filters, numeric_columns, numeric_scale, where_exists
737
+ )
738
+ else:
739
+ in_clause = ""
740
+ sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause}{in_clause} ORDER BY {keys};"
741
+ elif key_intersection != [] and not is_dedicated:
742
+ keys = str(key_intersection)[1:-1].replace("'", "")
743
+ column_clause, numeric_columns, used_columns = self._get_column_clause(
744
+ column_intersections, dict_colummns_datatype, numeric_scale, key_columns
745
+ )
746
+ in_clause = ""
+ if (key_filters != {}) & (filter_intersection != []):
747
+ values = list(key_filters.values())
748
+ if values[0] != []:
749
+ in_clause = self._get_in_clause(
750
+ key_filters, numeric_columns, numeric_scale, where_exists
751
+ )
752
+ else:
753
+ in_clause = ""
754
+ sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause}{in_clause} ORDER BY {keys};"
755
+ else:
756
+ column_intersections = list(
757
+ set(column_intersections) - set(exclude_columns)
758
+ )
759
+ column_intersections.sort()
760
+ column_clause, numeric_columns, used_columns = self._get_column_clause(
761
+ column_intersections, dict_colummns_datatype, numeric_scale, key_columns
762
+ )
763
+ sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause};"
764
+
765
+ error_dict = {}
766
+ key_dict = {}
767
+ try:
768
+ sample_pdf = self.execute_queries(sample_query, return_as_pdf=True)
769
+ for key in key_intersection:
770
+ if pd.api.types.is_datetime64_any_dtype(sample_pdf[key]):
771
+ key_dict[key] = list(sample_pdf[key].astype(str))
772
+ else:
773
+ key_dict[key] = list(sample_pdf[key])
774
+
775
+ except Exception as err:
776
+ sample_pdf = pd.DataFrame()
777
+ sample_pdf["TESTATM_ERROR"] = [1]
778
+ if "|||" in str(err):
779
+ error_dict = {
780
+ "QUERY": str(err).split("|||")[0],
781
+ "ERROR": str(err).split("|||")[1],
782
+ }
783
+ else:
784
+ error_dict = {"QUERY": "No SQL Error", "ERROR": str(err)}
785
+
786
+ return_list = []
787
+ return_list.append(sample_pdf)
788
+ return_list.append(error_dict)
789
+
790
+ return return_list, key_dict, used_columns, sample_query
791
+
792
+ def execute_queries(
793
+ self,
794
+ query: Union[str, List[str]],
795
+ return_as_pdf: bool = False,
796
+ return_query_ids: bool = False,
797
+ ) -> Union[List[Dict], List[List[Dict]]]:
798
+ """actual execution of defined queries
799
+
800
+ Args:
801
+ query (Union[str, List[str]]): queries to be executed
802
+ return_as_pdf (bool, optional): If true, queries returned as pandas data frames. Defaults to False.
803
+ return_query_ids (bool, optional): If true, results and query ids are returned, otherwise only results. Defaults to False.
804
+
805
+ Raises:
806
+ Exception: Raises exception if single query cannot be executed.
807
+
808
+ Returns:
809
+ Union[List[Dict], List[List[Dict]]]: returns results or results with query-ids
810
+ """
811
+ if self.databricks_connection is None:
812
+ self._connect_to_databricks()
813
+
814
+ if query:
815
+ query_list: List[str] = query if isinstance(query, list) else [query]
816
+ else:
817
+ logger.error(
818
+ "Query defined as null - please check input for execute_queries function."
819
+ )
820
+
821
+ cursor = self.databricks_connection.cursor()
822
+
823
+ results = []
824
+ query_ids = []
825
+
826
+ for single_query in query_list:
827
+ try:
828
+ try:
829
+ query_result = cursor.execute(single_query).fetchall()
830
+ except:
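+ # fall back to Arrow-based fetching if the default fetchall() fails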
831
+ query_result = cursor.execute(single_query).fetchall_arrow().to_pylist()
832
+ if return_as_pdf:
833
+ columns = [col[0] for col in cursor.description]
834
+ query_result = pd.DataFrame(query_result, columns=columns)
835
+
836
+ results.append(query_result)
837
+ query_ids.append(0) # there is no query id returned by databricks
838
+
839
+ except Exception as err:
840
+ raise Exception(single_query + "|||" + str(err))
841
+
842
+ if return_query_ids:
843
+ return (
844
+ (results[0], query_ids[0])
845
+ if not isinstance(query, list)
846
+ else (results, query_ids)
847
+ )
848
+
849
+ else:
850
+ return results[0] if not isinstance(query, list) else results
851
+
852
+ def execute_statement(self, statement: Union[str, List[str]]) -> None:
853
+ """
854
+ Executes a simple statement against Databricks
855
+ Schema and Database settings must be set beforehand
856
+ Args:
857
+ statement Union[str, List[str]] - a sql statement or a list of sql statements to execute
858
+ """
859
+ if self.databricks_connection is None:
860
+ self._connect_to_databricks()
861
+
862
+ statement_list: List[str] = (
863
+ statement if isinstance(statement, list) else [statement]
864
+ )
865
+
866
+ cursor = self.databricks_connection.cursor()
867
+
868
+ for single_statement in statement_list:
869
+ try:
870
+ stripped_statement = single_statement.strip()
871
+ _ = cursor.execute(stripped_statement)
872
+
873
+ except Exception as err:
874
+ raise Exception(self._get_error_message(err, single_statement)) from err
875
+
876
+ def create_schemas(self, database_name: str, schemas: List):
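+ """creates each of the given schemas in the database if it does not exist yet"""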
877
+ statement_list = []
878
+
879
+ for schema in schemas:
880
+ statement_list.append(
881
+ f"CREATE SCHEMA IF NOT EXISTS {database_name}.{schema}"
882
+ )
883
+
884
+ self.execute_statement(statement_list)
885
+
886
+ def insert_json_results(
887
+ self,
888
+ run_guid: str,
889
+ pipeline_name: str,
890
+ pipeline_id: str,
891
+ start_time_utc: str,
892
+ result_table: str,
893
+ results: dict,
894
+ ) -> None:
895
+ """
896
+ insert into - result table for json results
897
+ """
898
+
899
+ statement = f"CREATE TABLE IF NOT EXISTS {result_table} (RUN_GUID STRING, PIPELINE_NAME STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, RESULT STRING, CREATION_TIME_UTC STRING)"
900
+
901
+ self.execute_statement(statement)
902
+
903
+ statement = (
904
+ "INSERT INTO {} VALUES ('{}', '{}', '{}', '{}', '{}', '{}');".format(
905
+ result_table,
906
+ run_guid,
907
+ pipeline_name,
908
+ pipeline_id,
909
+ start_time_utc,
910
+ str(results).replace("'", '"'),
911
+ datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S"),
912
+ )
913
+ )
914
+
915
+ self.execute_statement(statement)
916
+
917
+ def insert_json_results_live(
918
+ self,
919
+ run_guid: str,
920
+ pipeline_name: str,
921
+ pipeline_id: str,
922
+ result_table: str,
923
+ stage_name: str,
924
+ source_system: str,
925
+ target_system: str,
926
+ database: str,
927
+ schema: str,
928
+ object: str,
929
+ ) -> None:
930
+ """
931
+ copy into - result table for json results live
932
+ """
933
+ result_database = result_table.split(".", 1)[0]
934
+
935
+ statement = f"COPY INTO {result_table} (RUN_GUID, PIPELINE_NAME, PIPELINE_ID, SOURCE_SYSTEM, TARGET_SYSTEM, DATABASE_NAME, SCHEMA_NAME, OBJECT_NAME ,RESULT, CREATION_TS) FROM (SELECT '{run_guid}', '{pipeline_name}', '{pipeline_id}', '{source_system}', '{target_system}', '{database}', '{schema}', '{object}', $1, SYSDATE() from @{stage_name} (file_format => {result_database}.meta_data.ff_json ));"
936
+
937
+ self.execute_statement(statement)
938
+
939
+ def insert_highlevel_results(
940
+ self,
941
+ results: dict,
942
+ run_guid: str,
943
+ pipeline_name: str,
944
+ pipeline_id: str,
945
+ result_table_highlevel: str,
946
+ ) -> None:
947
+ """
948
+ insert into - highlevel results per "pipeline run" / "generic testing tool execution"
949
+ """
950
+
951
+ statement = f"CREATE TABLE IF NOT EXISTS {result_table_highlevel} (RUN_GUID STRING, PIPELINE_NAME STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, SOURCE_SYSTEM STRING, TARGET_SYSTEM STRING, DATABASE_NAME STRING, TESTSET STRING, ALL_OBJECTS_MATCHING BOOLEAN, ALL_COLUMNS_EQUAL BOOLEAN, ALL_ROWCOUNTS_EQUAL BOOLEAN, ALL_CHECKSUMS_EQUAL BOOLEAN, ALL_SAMPLES_EQUAL BOOLEAN, ALL_OBJECTS_EQUAL BOOLEAN, OBJECTS_TO_COMPARE_SRC STRING, OBJECTS_TO_COMPARE_TRGT STRING, NUMBER_OF_OBJECTS_TO_COMPARE INT, SRC_MINUS_TRGT STRING, TRGT_MINUS_SRC STRING, CREATION_TS_UTC STRING)"
952
+
953
+ self.execute_statement(statement)
954
+
955
+ TESTSET_ = ", ".join(results["TESTSET"])
956
+
957
+ OBJECTS_TO_COMPARE_SRC_ = ", ".join(results["OBJECTS_TO_COMPARE_SRC"])
958
+
959
+ OBJECTS_TO_COMPARE_TRGT_ = ", ".join(results["OBJECTS_TO_COMPARE_TRGT"])
960
+
961
+ SRC_MINUS_TRGT_ = ", ".join(results["SRC_MINUS_TRGT"])
962
+
963
+ TRGT_MINUS_SRC_ = ", ".join(results["TRGT_MINUS_SRC"])
964
+
965
+ date_utc = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")
966
+
967
+ insert_statement = f"INSERT INTO {result_table_highlevel} ( \
968
+ RUN_GUID, \
969
+ PIPELINE_NAME, \
970
+ PIPELINE_ID, \
971
+ START_TIME_UTC, \
972
+ SOURCE_SYSTEM, \
973
+ TARGET_SYSTEM, \
974
+ DATABASE_NAME, \
975
+ TESTSET, \
976
+ ALL_OBJECTS_MATCHING, \
977
+ ALL_COLUMNS_EQUAL, \
978
+ ALL_DATATYPES_EQUAL, \
979
+ ALL_ROWCOUNTS_EQUAL, \
980
+ ALL_CHECKSUMS_EQUAL, \
981
+ ALL_SAMPLES_EQUAL, \
982
+ ALL_OBJECTS_EQUAL, \
983
+ OBJECTS_TO_COMPARE_SRC, \
984
+ OBJECTS_TO_COMPARE_TRGT, \
985
+ NUMBER_OF_OBJECTS_TO_COMPARE, \
986
+ SRC_MINUS_TRGT, \
987
+ TRGT_MINUS_SRC, \
988
+ CREATION_TS_UTC) \
989
+ VALUES \
990
+ ('{run_guid}', \
991
+ '{pipeline_name}', \
992
+ '{pipeline_id}', \
993
+ '{results['START_TIME_UTC']}', \
994
+ '{results['SOURCE_SYSTEM']}', \
995
+ '{results['TARGET_SYSTEM']}', \
996
+ '{results['DATABASE_NAME']}', \
997
+ '{TESTSET_}', \
998
+ '{results['ALL_OBJECTS_MATCHING']}', \
999
+ '{results['ALL_COLUMNS_EQUAL']}', \
1000
+ '{results['ALL_DATATYPES_EQUAL']}', \
1001
+ '{results['ALL_ROWCOUNTS_EQUAL']}', \
1002
+ '{results['ALL_CHECKSUMS_EQUAL']}', \
1003
+ NULLIF('{results['ALL_SAMPLES_EQUAL']}', 'None'), \
1004
+ NULLIF('{results['ALL_OBJECTS_EQUAL']}', 'None'), \
1005
+ '{OBJECTS_TO_COMPARE_SRC_}', \
1006
+ '{OBJECTS_TO_COMPARE_TRGT_}', \
1007
+ '{results['NUMBER_OF_OBJECTS_TO_COMPARE']}', \
1008
+ '{SRC_MINUS_TRGT_}', \
1009
+ '{TRGT_MINUS_SRC_}', \
1010
+ '{date_utc}')"
1011
+
1012
+ self.execute_statement(insert_statement)
1013
+
1014
+ def insert_objectlevel_results(
1015
+ self,
1016
+ result_table: str,
1017
+ result_table_objectlevel: str,
1018
+ run_guid: str,
1019
+ results: dict,
1020
+ ) -> None:
1021
+ """
1022
+ insert into - detailed results per object
1023
+ """
1024
+ date_utc = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")
1025
+
1026
+ statement = f"""
1027
+ CREATE TABLE IF NOT EXISTS {result_table_objectlevel} (
1028
+ RUN_GUID STRING,
1029
+ PIPELINE_ID STRING,
1030
+ START_TIME_UTC STRING,
1031
+ SRC_DATABASE_NAME STRING,
1032
+ SRC_SCHEMA_NAME STRING,
1033
+ SRC_OBJECT_NAME STRING,
1034
+ SRC_OBJECT_TYPE STRING,
1035
+ TRGT_DATABASE_NAME STRING,
1036
+ TRGT_SCHEMA_NAME STRING,
1037
+ TRGT_OBJECT_NAME STRING,
1038
+ TRGT_OBJECT_TYPE STRING,
1039
+ SRC_FILTER STRING,
1040
+ TRGT_FILTER STRING,
1041
+ EXCLUDED_COLUMNS STRING,
1042
+ COLUMNS_EQUAL BOOLEAN,
1043
+ COLUMN_INTERSECTION STRING,
1044
+ SRC_COLUMNS_MINUS_TRGT_COLUMNS STRING,
1045
+ TRGT_COLUMNS_MINUS_SRC_COLUMNS STRING,
+ DATATYPES_EQUAL BOOLEAN,
1046
+ ROW_COUNTS_EQUAL BOOLEAN,
1047
+ SRC_ROW_COUNT INT,
1048
+ TRGT_ROW_COUNT INT,
1049
+ ALL_COUNT_NULLS_EQUAL BOOLEAN,
1050
+ AGGREGATIONS_EQUAL BOOLEAN,
1051
+ SRC_ERROR_QUERY STRING,
1052
+ TRGT_ERROR_QUERY STRING,
1053
+ SRC_ERROR_MSG STRING,
1054
+ TRGT_ERROR_MSG STRING,
1055
+ GROUP_BY_COLUMNS STRING,
1056
+ GROUP_BY_EQUAL BOOLEAN,
1057
+ GROUP_BY_VALUES_WITH_MISMATCHES STRING,
1058
+ COLUMNS_WITH_MISMATCH STRING,
1059
+ SRC_GROUP_BY_QUERY STRING,
1060
+ TRGT_GROUP_BY_QUERY STRING,
1061
+ SRC_GROUP_BY_ERROR STRING,
1062
+ TRGT_GROUP_BY_ERROR STRING,
1063
+ SAMPLES_COMPARED BOOLEAN,
1064
+ SAMPLES_EQUAL BOOLEAN,
1065
+ SAMPLE_KEYS STRING,
1066
+ SRC_SAMPLE STRING,
1067
+ TRGT_SAMPLE STRING,
1068
+ SRC_SAMPLE_QUERY STRING,
1069
+ TRGT_SAMPLE_QUERY STRING,
1070
+ SRC_SAMPLE_ERROR_MSG STRING,
1071
+ TRGT_SAMPLE_ERROR_MSG STRING,
1072
+ PANDAS_DATAFRAME_COMPARED BOOLEAN,
1073
+ PANDAS_DATAFRAME_EQUAL BOOLEAN,
1074
+ SRC_NOT_ALTERED_DURING_COMPARISON BOOLEAN,
1075
+ TRGT_NOT_ALTERED_DURING_COMPARISON BOOLEAN,
1076
+ SRC_LAST_ALTERED STRING,
1077
+ TRGT_LAST_ALTERED STRING,
1078
+ CREATION_TS_UTC STRING)
1079
+ """
1080
+
1081
+ self.execute_statement(statement)
1082
+
1083
+ for object_result in results['OBJECTS']:
1084
+
1085
+ pipeline_id = results['PIPELINE_ID']
1086
+ start_time_utc = results['START_TIME_UTC']
1087
+ src_database_name = object_result['SRC_DATABASE_NAME']
1088
+ src_schema_name = object_result['SRC_SCHEMA_NAME']
1089
+ src_object_name = object_result['SRC_OBJECT_NAME']
1090
+ src_object_type = object_result['SRC_OBJECT_TYPE']
1091
+ trgt_database_name = object_result['TRGT_DATABASE_NAME']
1092
+ trgt_schema_name = object_result['TRGT_SCHEMA_NAME']
1093
+ trgt_object_name = object_result['TRGT_OBJECT_NAME']
1094
+ trgt_object_type = object_result['TRGT_OBJECT_TYPE']
1095
+ src_filter = object_result['SRC_FILTER']
1096
+ trgt_filter = object_result['TRGT_FILTER']
1097
+ excluded_columns = object_result['EXCLUDED_COLUMNS']
1098
+ columns_equal = object_result['COLUMNS_EQUAL']
1099
+ column_intersection = str(object_result['COLUMN_INTERSECTION'])
1100
+ src_columns_minus_trgt_columns = object_result['SRC_COLUMNS_MINUS_TRGT_COLUMNS']
1101
+ trgt_columns_minus_src_columns = object_result['TRGT_COLUMNS_MINUS_SRC_COLUMNS']
1102
+ datatypes_equal = object_result['DATATYPES_EQUAL']
1103
+ row_counts_equal = object_result['ROW_COUNTS_EQUAL']
1104
+ src_row_count = object_result['SRC_ROW_COUNT']
1105
+ trgt_row_count = object_result['TRGT_ROW_COUNT']
1106
+ all_count_nulls_equal = object_result['ALL_COUNT_NULLS_EQUAL']
1107
+ aggregations_equal = object_result['AGGREGATIONS_EQUAL']
1108
+ src_error_query = object_result['SRC_ERROR']['QUERY']
1109
+ trgt_error_query = object_result['TRGT_ERROR']['QUERY']
1110
+ src_error_msg = object_result['SRC_ERROR']['ERROR']
1111
+ trgt_error_msg = object_result['TRGT_ERROR']['ERROR']
1112
+ group_by_columns = object_result['GROUP_BY_COLUMNS']
1113
+ group_by_equal = object_result['GROUP_BY_EQUAL']
1114
+ group_by_values_with_mismatches = object_result['GROUP_BY_VALUES_WITH_MISMATCHES']
1115
+ columns_with_mismatch = object_result['COLUMNS_WITH_MISMATCH']
1116
+ src_group_by_query = object_result['SRC_GROUP_BY_QUERY']
1117
+ trgt_group_by_query = object_result['TRGT_GROUP_BY_QUERY']
1118
+ src_group_by_error = object_result['SRC_GROUP_BY_ERROR']
1119
+ trgt_group_by_error = object_result['TRGT_GROUP_BY_ERROR']
1120
+ samples_compared = object_result['SAMPLES_COMPARED']
1121
+ samples_equal = object_result['SAMPLES_EQUAL']
1122
+ sample_keys = object_result['SAMPLE_KEYS']
1123
+ src_sample = object_result['SRC_SAMPLE']
1124
+ trgt_sample = object_result['TRGT_SAMPLE']
1125
+ src_sample_query = object_result['SRC_SAMPLE_QUERY']
1126
+ trgt_sample_query = object_result['TRGT_SAMPLE_QUERY']
1127
+ src_sample_error_msg = object_result['SRC_SAMPLE_ERROR_DICT']
1128
+ trgt_sample_error_msg = object_result['TRGT_SAMPLE_ERROR_DICT']
1129
+ pandas_dataframe_compared = object_result['PANDAS_DATAFRAME_COMPARED']
1130
+ pandas_dataframe_equal = object_result['PANDAS_DATAFRAME_EQUAL']
1131
+ src_not_altered_during_comparison = object_result['SRC_NOT_ALTERED_DURING_COMPARISON']
1132
+ trgt_not_altered_during_comparison = object_result['TRGT_NOT_ALTERED_DURING_COMPARISON']
1133
+ src_last_altered = object_result['SRC_LAST_ALTERED']
1134
+ trgt_last_altered = object_result['TRGT_LAST_ALTERED']
1135
+
1136
+ # the rest in elem is not used for this table
1137
+
1138
+ insert_statement = f"""INSERT INTO {result_table_objectlevel} ( \
1139
+ RUN_GUID, \
1140
+ PIPELINE_ID, \
1141
+ START_TIME_UTC, \
1142
+ SRC_DATABASE_NAME, \
1143
+ SRC_SCHEMA_NAME, \
1144
+ SRC_OBJECT_NAME, \
1145
+ SRC_OBJECT_TYPE, \
1146
+ TRGT_DATABASE_NAME, \
1147
+ TRGT_SCHEMA_NAME, \
1148
+ TRGT_OBJECT_NAME, \
1149
+ TRGT_OBJECT_TYPE, \
1150
+ SRC_FILTER, \
1151
+ TRGT_FILTER, \
1152
+ EXCLUDED_COLUMNS, \
1153
+ COLUMNS_EQUAL, \
1154
+ COLUMN_INTERSECTION, \
1155
+ SRC_COLUMNS_MINUS_TRGT_COLUMNS, \
1156
+ TRGT_COLUMNS_MINUS_SRC_COLUMNS, \
1157
+ DATATYPES_EQUAL, \
1158
+ ROW_COUNTS_EQUAL, \
1159
+ SRC_ROW_COUNT, \
1160
+ TRGT_ROW_COUNT, \
1161
+ ALL_COUNT_NULLS_EQUAL, \
1162
+ AGGREGATIONS_EQUAL, \
1163
+ SRC_ERROR_QUERY , \
1164
+ TRGT_ERROR_QUERY, \
1165
+ SRC_ERROR_MSG, \
1166
+ TRGT_ERROR_MSG, \
1167
+ GROUP_BY_COLUMNS, \
1168
+ GROUP_BY_EQUAL, \
1169
+ GROUP_BY_VALUES_WITH_MISMATCHES, \
1170
+ COLUMNS_WITH_MISMATCH, \
1171
+ SRC_GROUP_BY_QUERY, \
1172
+ TRGT_GROUP_BY_QUERY, \
1173
+ SRC_GROUP_BY_ERROR, \
1174
+ TRGT_GROUP_BY_ERROR, \
1175
+ SAMPLES_COMPARED, \
1176
+ SAMPLES_EQUAL, \
1177
+ SAMPLE_KEYS, \
1178
+ SRC_SAMPLE, \
1179
+ TRGT_SAMPLE, \
1180
+ SRC_SAMPLE_QUERY, \
1181
+ TRGT_SAMPLE_QUERY, \
1182
+ SRC_SAMPLE_ERROR_MSG, \
1183
+ TRGT_SAMPLE_ERROR_MSG, \
1184
+ PANDAS_DATAFRAME_COMPARED, \
1185
+ PANDAS_DATAFRAME_EQUAL, \
1186
+ SRC_NOT_ALTERED_DURING_COMPARISON, \
1187
+ TRGT_NOT_ALTERED_DURING_COMPARISON, \
1188
+ SRC_LAST_ALTERED, \
1189
+ TRGT_LAST_ALTERED, \
1190
+ CREATION_TS_UTC) \
1191
+ SELECT\
1192
+ '{run_guid}' AS RUN_GUID, \
1193
+ '{pipeline_id}' AS PIPELINE_ID, \
1194
+ '{start_time_utc}' AS START_TIME_UTC, \
1195
+ '{src_database_name}' AS SRC_DATABASE_NAME, \
1196
+ '{src_schema_name}' AS SRC_SCHEMA_NAME, \
1197
+ '{src_object_name}' AS SRC_OBJECT_NAME, \
1198
+ '{src_object_type}' AS SRC_OBJECT_TYPE, \
1199
+ '{trgt_database_name}' AS TRGT_DATABASE_NAME, \
1200
+ '{trgt_schema_name}' AS TRGT_SCHEMA_NAME, \
1201
+ '{trgt_object_name}' AS TRGT_OBJECT_NAME, \
1202
+ '{trgt_object_type}' AS TRGT_OBJECT_TYPE, \
1203
+ '{src_filter}' AS SRC_FILTER, \
1204
+ '{trgt_filter}' AS TRGT_FILTER, \
1205
+ '{excluded_columns}' AS EXCLUDED_COLUMNS, \
1206
+ try_cast('{columns_equal}' AS BOOLEAN) AS COLUMNS_EQUAL, \
1207
+ "{column_intersection}"::STRING AS COLUMN_INTERSECTION, \
1208
+ '{src_columns_minus_trgt_columns}' AS SRC_COLUMNS_MINUS_TRGT_COLUMNS, \
1209
+ '{trgt_columns_minus_src_columns}' AS TRGT_COLUMNS_MINUS_SRC_COLUMNS, \
1210
+ try_cast('{datatypes_equal}' AS BOOLEAN) AS DATATYPES_EQUAL, \
1211
+ try_cast('{row_counts_equal}' AS BOOLEAN) AS ROW_COUNTS_EQUAL, \
1212
+ '{src_row_count}'::INT AS SRC_ROW_COUNT, \
1213
+ '{trgt_row_count}'::INT AS TRGT_ROW_COUNT, \
1214
+ try_cast('{all_count_nulls_equal}' AS BOOLEAN) AS ALL_COUNT_NULLS_EQUAL, \
1215
+ try_cast('{aggregations_equal}' AS BOOLEAN) AS AGGREGATIONS_EQUAL, \
1216
+ '{src_error_query}'::STRING AS SRC_ERROR_QUERY, \
1217
+ '{trgt_error_query}'::STRING AS TRGT_ERROR_QUERY, \
1218
+ '{src_error_msg}'::STRING AS SRC_ERROR_MSG, \
1219
+ '{trgt_error_msg}'::STRING AS TRGT_ERROR_MSG, \
1220
+ "{group_by_columns}" AS GROUP_BY_COLUMNS, \
1221
+ try_cast('{group_by_equal}' AS BOOLEAN) AS GROUP_BY_EQUAL, \
1222
+ "{group_by_values_with_mismatches}" AS GROUP_BY_VALUES_WITH_MISMATCHES, \
1223
+ "{columns_with_mismatch}" AS COLUMNS_WITH_MISMATCH, \
1224
+ '{src_group_by_query}'::STRING AS SRC_GROUP_BY_QUERY, \
1225
+ '{trgt_group_by_query}'::STRING AS TRGT_GROUP_BY_QUERY, \
1226
+ '{src_group_by_error}'::STRING AS SRC_GROUP_BY_ERROR, \
1227
+ '{trgt_group_by_error}'::STRING AS TRGT_GROUP_BY_ERROR, \
1228
+ try_cast('{samples_compared}' AS BOOLEAN) AS SAMPLES_COMPARED, \
1229
+ try_cast('{samples_equal}' AS BOOLEAN) AS SAMPLES_EQUAL, \
1230
+ '{sample_keys}' AS SAMPLE_KEYS, \
1231
+ '{src_sample}' AS SRC_SAMPLE, \
1232
+ '{trgt_sample}' AS TRGT_SAMPLE, \
1233
+ '{src_sample_query}' AS SRC_SAMPLE_QUERY, \
1234
+ '{trgt_sample_query}' AS TRGT_SAMPLE_QUERY, \
1235
+ '{src_sample_error_msg}'::STRING AS SRC_SAMPLE_ERROR_MSG, \
1236
+ '{trgt_sample_error_msg}'::STRING AS TRGT_SAMPLE_ERROR_MSG, \
1237
+ try_cast('{pandas_dataframe_compared}' AS BOOLEAN) AS PANDAS_DATAFRAME_COMPARED, \
1238
+ try_cast('{pandas_dataframe_equal}' AS BOOLEAN) AS PANDAS_DATAFRAME_EQUAL, \
1239
+ try_cast('{src_not_altered_during_comparison}' AS BOOLEAN) AS SRC_NOT_ALTERED_DURING_COMPARISON, \
1240
+ try_cast('{trgt_not_altered_during_comparison}' AS BOOLEAN) AS TRGT_NOT_ALTERED_DURING_COMPARISON, \
1241
+ '{src_last_altered}'::STRING AS SRC_LAST_ALTERED, \
1242
+ '{trgt_last_altered}'::STRING AS TRGT_LAST_ALTERED, \
1243
+ '{date_utc}' \
1244
+ ;"""
1245
+
1246
+ self.execute_statement(insert_statement)
1247
+
1248
+ def insert_columnlevel_results(
1249
+ self,
1250
+ result_table: str,
1251
+ result_table_columnlevel: str,
1252
+ run_guid: str,
1253
+ results: dict,
1254
+ ) -> None:
1255
+ """
1256
+ insert into - detailed results per column
1257
+ """
1258
+
1259
+ date_utc = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")
1260
+
1261
+ statement = f"CREATE TABLE IF NOT EXISTS {result_table_columnlevel} (RUN_GUID STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, SRC_DATABASE_NAME STRING, SRC_SCHEMA_NAME STRING, SRC_OBJECT_NAME STRING, SRC_OBJECT_TYPE STRING, TRGT_DATABASE_NAME STRING, TRGT_SCHEMA_NAME STRING, TRGT_OBJECT_NAME STRING, TRGT_OBJECT_TYPE STRING, COLUMN_NAME STRING, IN_SRC BOOLEAN, IN_TRGT BOOLEAN, IN_SYNC BOOLEAN, IN_EXCLUDED BOOLEAN, SRC_DATATYPE STRING, TRGT_DATATYPE STRING, AGGREGATION_TYPE STRING, AGGREGATION_EQUAL BOOLEAN, AGGREGATION_RESULT_SRC STRING, AGGREGATION_RESULT_TRGT STRING, AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC STRING, COUNT_NULLS_EQUAL BOOLEAN, COUNT_NULLS_SRC STRING, COUNT_NULLS_TRGT STRING, COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC STRING, ERROR_QUERY_SRC STRING, ERROR_MSG_SRC STRING, ERROR_QUERY_TRGT STRING, ERROR_MSG_TRGT STRING, ERROR_FLAG BOOLEAN, CREATION_TS_UTC STRING);"
1262
+
1263
+ self.execute_statement(statement)
1264
+
1265
+
1266
+
1267
+ # extract the information needed for the table on object level
1268
+ for object_result in results['OBJECTS']:
1269
+ for column_result in object_result['COLUMNS']:
1270
+ pipeline_id = results['PIPELINE_ID']
1271
+ start_time_utc = results['START_TIME_UTC']
1272
+ src_database_name = object_result['SRC_DATABASE_NAME']
1273
+ src_schema_name = object_result['SRC_SCHEMA_NAME']
1274
+ src_object_name = object_result['SRC_OBJECT_NAME']
1275
+ src_object_type = object_result['SRC_OBJECT_TYPE']
1276
+ trgt_database_name = object_result['TRGT_DATABASE_NAME']
1277
+ trgt_schema_name = object_result['TRGT_SCHEMA_NAME']
1278
+ trgt_object_name = object_result['TRGT_OBJECT_NAME']
1279
+ trgt_object_type = object_result['TRGT_OBJECT_TYPE']
1280
+ column_name = column_result['COLUMN_NAME']
1281
+ in_src = column_result['IN_SRC']
1282
+ in_trgt = column_result['IN_TRGT']
1283
+ in_sync = column_result['IN_SYNC']
1284
+ in_excluded = column_result['IN_EXCLUDED']
1285
+ src_datatype = column_result['SRC_DATATYPE']
1286
+ trgt_datatype = column_result['TRGT_DATATYPE']
1287
+ datatype_equal = column_result['DATATYPE_EQUAL']
1288
+ aggregation_type = column_result['AGGREGATION_TYPE']
1289
+ aggregation_equal = column_result['AGGREGATION_EQUAL']
1290
+ aggregation_result_src = column_result['AGGREGATION_RESULT_SRC']
1291
+ aggregation_result_trgt = column_result['AGGREGATION_RESULT_TRGT']
1292
+ aggregation_difference_trgt_minus_src = column_result['AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC']
1293
+ count_nulls_equal = column_result['COUNT_NULLS_EQUAL']
1294
+ count_nulls_src = column_result['COUNT_NULLS_SRC']
1295
+ count_nulls_trgt = column_result['COUNT_NULLS_TRGT']
1296
+ count_nulls_difference_trgt_minus_src = column_result['COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC']
1297
+ error_query_src = object_result['SRC_ERROR']['QUERY']
1298
+ error_msg_src = object_result['SRC_ERROR']['ERROR']
1299
+ error_query_trgt = object_result['TRGT_ERROR']['QUERY']
1300
+ error_msg_trgt = object_result['TRGT_ERROR']['ERROR']
1301
+ if not (object_result['SRC_ERROR']['ERROR'] and object_result['TRGT_ERROR']['ERROR']):
1302
+ error_flag = False
1303
+ else:
1304
+ error_flag = True
1305
+ insert_statement = f"""INSERT INTO {result_table_columnlevel} ( \
1306
+ RUN_GUID,\
1307
+ PIPELINE_ID,\
1308
+ START_TIME_UTC,\
1309
+ SRC_DATABASE_NAME, \
1310
+ SRC_SCHEMA_NAME, \
1311
+ SRC_OBJECT_NAME, \
1312
+ SRC_OBJECT_TYPE, \
1313
+ TRGT_DATABASE_NAME, \
1314
+ TRGT_SCHEMA_NAME, \
1315
+ TRGT_OBJECT_NAME, \
1316
+ TRGT_OBJECT_TYPE, \
1317
+ COLUMN_NAME,\
1318
+ IN_SRC,\
1319
+ IN_TRGT,\
1320
+ IN_SYNC,\
1321
+ IN_EXCLUDED, \
1322
+ SRC_DATATYPE,\
1323
+ TRGT_DATATYPE,\
1324
+ DATATYPE_EQUAL,\
1325
+ AGGREGATION_TYPE,\
1326
+ AGGREGATION_EQUAL,\
1327
+ AGGREGATION_RESULT_SRC,\
1328
+ AGGREGATION_RESULT_TRGT,\
1329
+ AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC,\
1330
+ COUNT_NULLS_EQUAL,\
1331
+ COUNT_NULLS_SRC,\
1332
+ COUNT_NULLS_TRGT,\
1333
+ COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC,\
1334
+ ERROR_QUERY_SRC ,\
1335
+ ERROR_MSG_SRC ,\
1336
+ ERROR_QUERY_TRGT ,\
1337
+ ERROR_MSG_TRGT ,\
1338
+ ERROR_FLAG,\
1339
+ CREATION_TS_UTC)\
1340
+ SELECT\
1341
+ '{run_guid}' AS RUN_GUID,\
1342
+ '{pipeline_id}' AS PIPELINE_ID,\
1343
+ '{start_time_utc}'::STRING AS START_TIME_UTC,\
1344
+ '{src_database_name}' AS SRC_DATABASE_NAME,\
1345
+ '{src_schema_name}' AS SRC_SCHEMA_NAME,\
1346
+ '{src_object_name}' AS SRC_OBJECT_NAME,\
1347
+ '{src_object_type}' AS SRC_OBJECT_TYPE,\
1348
+ '{trgt_database_name}' AS TRGT_DATABASE_NAME,\
1349
+ '{trgt_schema_name}' AS TRGT_SCHEMA_NAME,\
1350
+ '{trgt_object_name}' AS TRGT_OBJECT_NAME,\
1351
+ '{trgt_object_type}' AS TRGT_OBJECT_TYPE,\
1352
+ '{column_name}' AS COLUMN_NAME,\
1353
+ try_cast('{in_src}' AS BOOLEAN) AS IN_SRC,\
1354
+ try_cast('{in_trgt}' AS BOOLEAN) AS IN_TRGT,\
1355
+ try_cast('{in_sync}' AS BOOLEAN) AS IN_SYNC,\
1356
+ try_cast('{in_excluded}' AS BOOLEAN) AS IN_EXCLUDED,\
1357
+ '{src_datatype}' AS SRC_DATATYPE,\
1358
+ '{trgt_datatype}' AS TRGT_DATATYPE,\
1359
+ '{datatype_equal}' AS DATATYPE_EQUAL,\
1360
+ '{aggregation_type}' AS AGGREGATION_TYPE,\
1361
+ try_cast('{aggregation_equal}' AS BOOLEAN) AS AGGREGATION_EQUAL,\
1362
+ '{aggregation_result_src}' AS AGGREGATION_RESULT_SRC,\
1363
+ '{aggregation_result_trgt}' AS AGGREGATION_RESULT_TRGT,\
1364
+ '{aggregation_difference_trgt_minus_src}' AS AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC,\
1365
+ try_cast('{count_nulls_equal}' AS BOOLEAN) AS COUNT_NULLS_EQUAL,\
1366
+ '{count_nulls_src}'::INT AS COUNT_NULLS_SRC,\
1367
+ '{count_nulls_trgt}'::INT AS COUNT_NULLS_TRGT,\
1368
+ '{count_nulls_difference_trgt_minus_src}' AS COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC,\
1369
+ '{error_query_src}' AS ERROR_QUERY_SRC,\
1370
+ '{error_msg_src}' AS ERROR_MSG_SRC,\
1371
+ '{error_query_trgt}' AS ERROR_QUERY_TRGT,\
1372
+ '{error_msg_trgt}' AS ERROR_MSG_TRGT,\
1373
+ try_cast('{error_flag}' AS BOOLEAN) AS ERROR_FLAG,\
1374
+ '{date_utc}'\
1375
+ ;"""
1376
+
1377
+ self.execute_statement(insert_statement)
1378
+
1379
+
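For reference, a minimal usage sketch of the DatabricksUnityCatalogService class shown above. It is not part of the released package; the catalog, schema, and connection values are placeholders, and it assumes the connection parameters accepted by databricks.sql.connect (server_hostname, http_path, access_token).

from icsDataValidation.services.database_services.databricks_unity_catalog_service import (
    DatabricksUnityCatalogService,
)

# Placeholder connection parameters; they are passed straight to databricks.sql.connect().
connection_params = {
    "server_hostname": "<workspace-host>",
    "http_path": "<sql-warehouse-http-path>",
    "access_token": "<personal-access-token>",
}

# The service opens the connection lazily and closes it when the context exits.
with DatabricksUnityCatalogService(connection_params) as service:
    objects = service.get_database_objects("my_catalog", schema="my_schema")
    for database_object in objects:
        print(database_object["object_identifier"], database_object["object_type"])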