icsDataValidation-1.0.358-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. icsDataValidation/configuration.py +19 -0
  2. icsDataValidation/connection_setups/__init__.py +0 -0
  3. icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
  4. icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
  5. icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
  6. icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
  7. icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
  8. icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
  9. icsDataValidation/core/__init__.py +0 -0
  10. icsDataValidation/core/database_objects.py +18 -0
  11. icsDataValidation/core/object_comparison.py +239 -0
  12. icsDataValidation/input_parameters/__init__.py +0 -0
  13. icsDataValidation/input_parameters/testing_tool_params.py +81 -0
  14. icsDataValidation/main.py +250 -0
  15. icsDataValidation/output_parameters/__init__.py +0 -0
  16. icsDataValidation/output_parameters/result_params.py +94 -0
  17. icsDataValidation/services/__init__.py +0 -0
  18. icsDataValidation/services/comparison_service.py +582 -0
  19. icsDataValidation/services/database_services/__init__.py +0 -0
  20. icsDataValidation/services/database_services/azure_service.py +320 -0
  21. icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
  22. icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
  23. icsDataValidation/services/database_services/exasol_service.py +261 -0
  24. icsDataValidation/services/database_services/oracle_service.py +713 -0
  25. icsDataValidation/services/database_services/snowflake_service.py +1100 -0
  26. icsDataValidation/services/database_services/teradata_service.py +665 -0
  27. icsDataValidation/services/initialization_service.py +103 -0
  28. icsDataValidation/services/result_service.py +573 -0
  29. icsDataValidation/services/system_service.py +61 -0
  30. icsDataValidation/services/testset_service.py +257 -0
  31. icsDataValidation/utils/__init__.py +0 -0
  32. icsDataValidation/utils/file_util.py +96 -0
  33. icsDataValidation/utils/logger_util.py +96 -0
  34. icsDataValidation/utils/pandas_util.py +159 -0
  35. icsDataValidation/utils/parallelization_util.py +52 -0
  36. icsDataValidation/utils/sql_util.py +14 -0
  37. icsDataValidation-1.0.358.dist-info/METADATA +21 -0
  38. icsDataValidation-1.0.358.dist-info/RECORD +40 -0
  39. icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
  40. icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
icsDataValidation/services/database_services/snowflake_service.py
@@ -0,0 +1,1100 @@
+ import logging
+ from pathlib import PurePath
+
+ import pandas as pd
+ import snowflake.connector
+ from cloe_util_snowflake_connector import connection_parameters, snowflake_interface
+
+ from icsDataValidation.core.database_objects import DatabaseObject
+ from icsDataValidation.utils.logger_util import configure_dev_ops_logger
+
+ #########################################################################################
+ #########################################################################################
+
+ # Configure Dev Ops Logger
+
+ logger = logging.getLogger("Snowflake_Service")
+ logger.setLevel(logging.INFO)
+ configure_dev_ops_logger(logger)
+
+
+ class SnowflakeService:
+     def __init__(self, connection_params: connection_parameters.ConnectionParameters):
+         self.connection_params = connection_params
+         self.snowflake_connection = None
+         self.snowflake_datatype_mapping = {
+             "string": ["text"],
+             "numeric": ["number", "float"],
+             "date_and_time": ["date", "time", "timestamp_ntz", "timestamp_tz", "timestamp_ltz"],
+             "binary": ["binary"],
+             "boolean": ["boolean"],
+         }
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exception_type, exception_value, traceback):
+         if self.snowflake_connection is not None:
+             self.snowflake_connection.close()
+
+     def __del__(self):
+         if self.snowflake_connection is not None:
+             self.snowflake_connection.close()
+
+     def _connect_to_snowflake(self):
+         self.snowflake_connection = snowflake_interface.SnowflakeInterface(self.connection_params)
+         return self.snowflake_connection
+
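Since the class implements __enter__/__exit__, it is intended to be used as a context manager so the connection is always closed. A minimal usage sketch; how a ConnectionParameters instance is built depends on cloe_util_snowflake_connector and is not shown by this package, so the setup below is an assumption:

    # `params` is assumed to be a ready connection_parameters.ConnectionParameters
    # instance; its construction is defined by cloe_util_snowflake_connector.
    with SnowflakeService(params) as service:
        # Connection is opened lazily on first use and closed on exit.
        objects = service.get_database_objects("MY_DB", schema="MY_SCHEMA")  # names illustrative
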
+     @staticmethod
+     def _get_error_message(exception: Exception, statement: str) -> str:
+         """
+         Compose an error message if the execution of a statement or query fails.
+         """
+         if hasattr(exception, "raw_msg"):
+             message = exception.raw_msg.replace("\n", " ")
+         else:
+             # Ensure every kind of error yields a message, even without a raw_msg attribute.
+             message = str(exception)
+         if hasattr(exception, "sfqid"):
+             message = message + f"\nQuery ID: {exception.sfqid}"
+         return f"Snowflake ERROR: {message}\nFailed statement:\n{statement}"
+
+     @staticmethod
+     def _get_in_clause(key_filters: dict, numeric_columns: list, numeric_scale: int) -> str:
+         """Generates an IN clause, ready to extend the WHERE clause; numeric key columns are rounded.
+
+         Args:
+             key_filters (dict): expected values per key column
+             numeric_columns (list): list of all numeric columns
+             numeric_scale (int): number of decimal places after rounding
+
+         Returns:
+             str: IN clause as string
+         """
+         # Fall back to 2 decimal places when no scale is configured
+         # (this was previously hard-coded).
+         round_scale = numeric_scale if numeric_scale is not None else 2
+         values = list(key_filters.values())
+         in_clause_values = "('"
+         for j in range(len(values[0])):
+             for value in values:
+                 in_clause_values += str(value[j]) + "','"
+             in_clause_values = in_clause_values[:-2] + "),('"
+         in_clause_values = in_clause_values[:-3] + ")"
+
+         in_clause_cols = " AND (("
+         for key in key_filters.keys():
+             if key in numeric_columns:
+                 in_clause_cols += f"""ROUND({key.replace("'", "")},{round_scale})""" + ","
+             else:
+                 in_clause_cols += key.replace("'", "") + ","
+         in_clause_cols = in_clause_cols[:-1] + ")"
+         in_clause = in_clause_cols + " in (" + in_clause_values + ")"
+         return in_clause
+
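A worked example of the clause this helper assembles (column names and values are illustrative). Note that every value is rendered as a quoted string and relies on Snowflake's implicit casting:

    key_filters = {"ID": [1, 3], "AMOUNT": [2, 4]}  # two key columns, two sample rows
    clause = SnowflakeService._get_in_clause(key_filters, numeric_columns=["AMOUNT"], numeric_scale=2)
    # clause == " AND ((ID,ROUND(AMOUNT,2)) in (('1','2'),('3','4')))"
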
+     def _get_column_clause(self, column_list: list, columns_datatype: list, numeric_scale: int, key_columns: list) -> tuple:
+         """
+         Turns a list of desired columns into a SQL-compatible string.
+         Columns with a date or time data type are omitted unless they are key columns.
+
+         Args:
+             column_list (list): list of all columns
+             columns_datatype (list): datatypes of the given columns
+             numeric_scale (int): number of decimal places for numeric columns
+             key_columns (list): list of columns of interest
+
+         Returns:
+             tuple: column clause as string, list of numeric columns, list of used columns
+         """
+         column_intersections_new = []
+         used_columns = []
+         numeric_columns = []
+         for column in column_list:
+             column_datatype = next(x for x in columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+
+             if column in key_columns or column_datatype.lower() not in self.snowflake_datatype_mapping["date_and_time"]:
+                 if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
+                     if numeric_scale:
+                         column_intersections_new.append(
+                             f"CAST(ROUND({column}, {numeric_scale}) as decimal(38,{numeric_scale})) as {column}"
+                         )
+                     else:
+                         column_intersections_new.append(f"{column} as {column}")
+                     used_columns.append(column)
+                     numeric_columns.append(column)
+                 elif column_datatype.lower() in self.snowflake_datatype_mapping["string"]:
+                     column_intersections_new.append(f"{column} AS {column}")
+                     used_columns.append(column)
+                 else:
+                     column_intersections_new.append(column)
+                     used_columns.append(column)
+
+         column_intersections = column_intersections_new.copy()
+         column_clause = str(column_intersections)[1:-1].replace("'", "")
+         return column_clause, numeric_columns, used_columns
+
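For illustration (identifiers are not from the package): with a numeric AMOUNT, a text NAME, and a timestamp CREATED_AT that is not a key column (and is therefore dropped), and numeric_scale=2, the helper returns:

    # column_list = ["AMOUNT", "NAME", "CREATED_AT"], key_columns = ["ID"]
    # column_clause   -> "CAST(ROUND(AMOUNT, 2) as decimal(38,2)) as AMOUNT, NAME AS NAME"
    # numeric_columns -> ["AMOUNT"]
    # used_columns    -> ["AMOUNT", "NAME"]
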
+     def get_database_objects(
+         self, database: str, schema: str | None = None, object_type_restriction: str = "include_all"
+     ) -> list[dict]:
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         all_database_tables = []
+         all_database_views = []
+
+         if object_type_restriction in ("include_all", "include_only_tables"):
+             if schema:
+                 query_db_tables = f"SELECT * FROM {database}.INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = '{schema.upper()}' AND TABLE_SCHEMA != 'INFORMATION_SCHEMA' AND TABLE_TYPE ='BASE TABLE';"
+             else:
+                 query_db_tables = f"SELECT * FROM {database}.INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA != 'INFORMATION_SCHEMA' AND TABLE_TYPE ='BASE TABLE';"
+
+             all_database_tables = self.execute_queries(query_db_tables)
+
+         if object_type_restriction in ("include_all", "include_only_views"):
+             if schema:
+                 query_db_views = f"SELECT * FROM {database}.INFORMATION_SCHEMA.VIEWS WHERE TABLE_SCHEMA = '{schema.upper()}' AND TABLE_SCHEMA != 'INFORMATION_SCHEMA';"
+             else:
+                 query_db_views = (
+                     f"SELECT * FROM {database}.INFORMATION_SCHEMA.VIEWS WHERE TABLE_SCHEMA != 'INFORMATION_SCHEMA';"
+                 )
+
+             all_database_views = self.execute_queries(query_db_views)
+
+         database_objects = []
+         for row in all_database_tables:
+             table_identifier = f"{row['TABLE_CATALOG']}.{row['TABLE_SCHEMA']}.{row['TABLE_NAME']}"
+             database_objects.append({"object_identifier": table_identifier, "object_type": "table"})
+         for row in all_database_views:
+             view_identifier = f"{row['TABLE_CATALOG']}.{row['TABLE_SCHEMA']}.{row['TABLE_NAME']}"
+             database_objects.append({"object_identifier": view_identifier, "object_type": "view"})
+         return database_objects
+
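The returned list carries one entry per table or view, e.g. (identifiers illustrative):

    # [
    #     {"object_identifier": "MY_DB.MY_SCHEMA.ORDERS", "object_type": "table"},
    #     {"object_identifier": "MY_DB.MY_SCHEMA.V_ORDERS", "object_type": "view"},
    # ]
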
+     def get_last_altered_timestamp_from_object(self, object: DatabaseObject) -> dict:
+         """Queries the LAST_ALTERED timestamp for the given object.
+
+         Args:
+             object (DatabaseObject): object for comparison
+
+         Returns:
+             dict: result row containing the LAST_ALTERED timestamp
+         """
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         self.execute_statement("ALTER SESSION SET TIMEZONE = 'Europe/London';")
+
+         query_get_last_altered = f"SELECT LAST_ALTERED FROM {object.database}.INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = '{object.name}' AND TABLE_SCHEMA = '{object.schema}';"
+
+         last_altered = self.execute_queries(query_get_last_altered)[0]
+
+         return last_altered
+
+     def get_columns_from_object(self, object: DatabaseObject) -> list:
+         """Returns all columns of the given object.
+
+         Args:
+             object (DatabaseObject): table or view
+
+         Returns:
+             list: list of all columns
+         """
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         # SHOW COLUMNS works for tables and views alike; the column name is the
+         # third column of the result scan.
+         entity_type = "TABLE" if object.type == "table" else "VIEW"
+         query_show_columns = f"SHOW COLUMNS IN {entity_type} {object.database}.{object.schema}.{object.name};"
+
+         _, query_id = self.execute_queries(query_show_columns, return_as_pdf=False, return_query_ids=True)
+
+         query_get_columns = f"SELECT $3 AS COLUMN_NAME FROM TABLE(result_scan('{query_id}'));"
+
+         all_columns = self.execute_queries(query_get_columns)
+         columns = []
+
+         for row in all_columns:
+             columns.append(row["COLUMN_NAME"])
+
+         return columns
+
+     def get_row_count_from_object(self, object: DatabaseObject, where_clause: str = "") -> tuple[int, list]:
+         """Gets the row count of the given object.
+
+         Args:
+             object (DatabaseObject): table or view
+
+         Returns:
+             tuple[int, list]: number of rows in the object (-1 on failure) and a list of errors
+         """
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         # TODO is it more efficient to select the information_schema.table view to get the rows?
+         query_get_row_count = (
+             f"SELECT COUNT(*) AS ROW_COUNT FROM {object.database}.{object.schema}.{object.name} {where_clause};"
+         )
+         row_count = -1
+         error_list = []
+
+         try:
+             row_count = self.execute_queries(query_get_row_count)[0]["ROW_COUNT"]
+
+         except Exception as err:
+             error_list.append(str(err))
+             error_list.append(query_get_row_count)
+
+         return row_count, error_list
+
+     def get_data_types_from_object(self, object: DatabaseObject, column_intersections: list) -> list[dict]:
+         """Returns the datatypes of all intersection columns of a database object.
+
+         Args:
+             object (DatabaseObject): table or view
+             column_intersections (list): columns for which the data type is queried
+
+         Returns:
+             list[dict]: columns and their datatypes
+         """
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         column_intersections = str(column_intersections)[1:-1]
+         if column_intersections == "":
+             column_intersections = "''"
+
+         query_get_data_types_from_object = f"SELECT COLUMN_NAME , DATA_TYPE \
+             FROM {object.database.upper()}.INFORMATION_SCHEMA.COLUMNS \
+             WHERE TABLE_NAME='{object.name.upper()}' \
+             AND TABLE_SCHEMA = '{object.schema.upper()}' \
+             AND COLUMN_NAME IN ({column_intersections}) \
+             ;"
+
+         dict_columns_datatype = self.execute_queries(query_get_data_types_from_object)
+         return dict_columns_datatype
+
+     def get_count_distincts_from_object(
+         self, object: DatabaseObject, column_intersections: list, where_clause: str = "", exclude_columns: list = []
+     ) -> tuple:
+         """Gets the distinct count for every column of a database object that is in the column intersections list.
+
+         Args:
+             object (DatabaseObject): table or view
+             column_intersections (list): columns that are used for the distinct count
+             where_clause (str, optional): optional further filter. Defaults to "".
+             exclude_columns (list, optional): columns to exclude from the distinct count. Defaults to [].
+
+         Returns:
+             dict: distinct counts per column
+             error_list: list of failed executions of distinct counts
+         """
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         unions = ""
+
+         for column in column_intersections:
+             if column not in exclude_columns:
+                 unions += f" UNION SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.database}.{object.schema}.{object.name} {where_clause}"
+
+         query_get_count_distincts_from_object = f"{unions[6:]} ORDER BY COUNT_DISTINCT;"
+         error_list = []
+         try:
+             dict_count_distincts = self.execute_queries(query_get_count_distincts_from_object)
+
+         except Exception as err:
+             dict_count_distincts = [{"COUNT_DISTINCT": 0}]
+             error_list.append(["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]])
+
+         return dict_count_distincts, error_list
+
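The builder emits one UNION branch per column and slices off the leading " UNION" (6 characters). For two columns A and B on DB.SCH.T (all names illustrative), the generated query has this shape:

    # SELECT 'A' AS COLUMN_NAME, COUNT(DISTINCT A) AS COUNT_DISTINCT FROM DB.SCH.T
    # UNION SELECT 'B' AS COLUMN_NAME, COUNT(DISTINCT B) AS COUNT_DISTINCT FROM DB.SCH.T
    # ORDER BY COUNT_DISTINCT;
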
+     def get_table_size(self, object: DatabaseObject) -> int:
+         """Returns the size of the given object in bytes.
+
+         Args:
+             object (DatabaseObject): table or view
+
+         Returns:
+             int: size of the object in bytes
+         """
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         query_get_table_size = f"SELECT BYTES FROM {object.database.upper()}.INFORMATION_SCHEMA.TABLES WHERE TABLE_CATALOG = '{object.database.upper()}' AND TABLE_SCHEMA = '{object.schema.upper()}' AND TABLE_NAME = '{object.name.upper()}' AND BYTES IS NOT NULL;"
+
+         size = self.execute_queries(query_get_table_size)[0]["BYTES"]
+
+         return size
+
+     def create_checksums(
+         self,
+         object: DatabaseObject,
+         column_intersections: list,
+         where_clause: str = "",
+         exclude_columns: list = [],
+         numeric_scale: int = None,
+     ) -> dict:
+         """Creates checksums for the given object in compliance with the given conditions.
+
+         Args:
+             object (DatabaseObject): table or view
+             column_intersections (list): columns that are used for the checksums
+             where_clause (str, optional): optional filter criteria given as a SQL-usable string. Defaults to "".
+             exclude_columns (list, optional): columns to exclude from the calculation. Defaults to [].
+             numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.
+
+         Returns:
+             dict: checksums per column of the object
+         """
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         column_intersections = [f"{x.upper()}" for x in column_intersections if x not in exclude_columns]
+
+         dict_columns_datatype = self.get_data_types_from_object(object, column_intersections)
+
+         aggregates = ""
+         count_nulls = ""
+
+         for column in column_intersections:
+             column_datatype = next(x for x in dict_columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+
+             count_nulls += f", SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END) AS COUNTNULLS_{column}"
+
+             if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
+                 if numeric_scale:
+                     aggregates += (
+                         f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS sum_{column}"
+                     )
+                 else:
+                     aggregates += f", CAST(SUM({column}) AS DECIMAL(38)) AS sum_{column}"
+
+             elif (
+                 column_datatype.lower() in self.snowflake_datatype_mapping["string"]
+                 or column_datatype.lower() in self.snowflake_datatype_mapping["date_and_time"]
+             ):
+                 aggregates += f", COUNT(DISTINCT LOWER({column})) AS countdistinct_{column}"
+
+             elif column_datatype.lower() in self.snowflake_datatype_mapping["binary"]:
+                 aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS countdistinct_{column}"
+
+             elif column_datatype.lower() in self.snowflake_datatype_mapping["boolean"]:
+                 aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false) :: VARCHAR AS aggregateboolean_{column}"
+
+             # else: additional data types: VARIANT OBJECT ARRAY GEOGRAPHY
+
+         query_checksums = (
+             f"SELECT {aggregates[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
+         )
+
+         query_countnulls = (
+             f"SELECT {count_nulls[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
+         )
+
+         error_list = []
+         test_list = []
+         aggregation_results = {}
+
+         try:
+             checksums_results = self.execute_queries([query_checksums, query_countnulls])
+
+             aggregation_results = checksums_results[0][0]
+
+             countnulls_results = checksums_results[1][0]
+
+             for i in range(len(aggregation_results)):
+                 if list(aggregation_results.values())[i] is None:
+                     agg_result = 0
+                 else:
+                     agg_result = list(aggregation_results.values())[i]
+
+                 if list(countnulls_results.values())[i] is None:
+                     cnt_result = 0
+                 else:
+                     cnt_result = list(countnulls_results.values())[i]
+
+                 test_list.append(
+                     [[item.split("_", 1)[0] for item in list(aggregation_results.keys())][i], agg_result, cnt_result]
+                 )
+
+         except Exception as err:
+             error_list.append(["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]])
+
+         checksums = dict(zip([item.split("_", 1)[1] for item in aggregation_results.keys()], test_list))
+         checksums["TESTATM_ERRORS"] = error_list
+
+         return checksums
+
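The returned dict maps each column to a triple of aggregation type, aggregation result, and null count, plus an error collector. A sketch of the shape (column names and values illustrative; the type prefixes come from the SQL aliases, uppercased by Snowflake):

    # {
    #     "AMOUNT": ["SUM", 1234.56, 0],              # numeric -> (rounded) SUM
    #     "NAME": ["COUNTDISTINCT", 42, 3],           # string/date -> COUNT(DISTINCT LOWER(...))
    #     "ACTIVE": ["AGGREGATEBOOLEAN", "7_3", 0],   # boolean -> "<true count>_<false count>"
    #     "TESTATM_ERRORS": [],                       # failed queries, if any
    # }
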
+     def create_pandas_df_from_group_by(
+         self,
+         object: DatabaseObject,
+         column_intersections: list,
+         group_by_columns: list,
+         group_by_aggregation_columns: list,
+         group_by_aggregation_type: str,
+         only_numeric: bool,
+         where_clause: str,
+         exclude_columns: list,
+         numeric_scale: int = None,
+     ) -> tuple:
+         """Executes multiple aggregations at once, grouped by the given columns.
+
+         Args:
+             object (DatabaseObject): table or view
+             column_intersections (list): columns existing in source and target
+             group_by_columns (list): columns for grouping the aggregations
+             group_by_aggregation_columns (list): list of columns that are supposed to be aggregated
+             group_by_aggregation_type (str): choice between: only_min_max, various, various_and_min_max
+             only_numeric (bool): whether to only do numeric aggregations or to also include distinct counts
+             where_clause (str): optional filter for the aggregations, given as a SQL-compatible WHERE string
+             exclude_columns (list): columns to exclude from the comparisons
+             numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.
+
+         Returns:
+             tuple: pandas dataframe with the aggregation results, the generated SQL fragments,
+             the final grouping columns, and an error dict
+         """
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         if group_by_aggregation_columns == ["all"]:
+             aggregation_columns = [
+                 f"{column.upper()}"
+                 for column in column_intersections
+                 if (column not in group_by_columns and column not in exclude_columns)
+             ]
+         else:
+             aggregation_columns = [
+                 f"{column.upper()}"
+                 for column in column_intersections
+                 if (column in group_by_aggregation_columns and column not in exclude_columns)
+             ]
+
+         group_by_query_columns_string = " "
+         grouping_columns_final = []
+         error_dict = {}
+
+         try:
+             for column in group_by_columns:
+                 if column in column_intersections and column not in exclude_columns:
+                     group_by_query_columns_string += f"{column} ,"
+                     grouping_columns_final.append(column)
+
+             group_by_query_columns_string = group_by_query_columns_string[:-1]
+
+             dict_columns_datatype = self.get_data_types_from_object(object, aggregation_columns)
+
+             aggregates = ""
+             aggregates_min = ""
+
+             for column in aggregation_columns:
+                 column_datatype = next(x for x in dict_columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+
+                 if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
+                     if numeric_scale:
+                         aggregates_min += f", CAST(ROUND(MIN({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MIN_{column}, CAST(ROUND(MAX({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MAX_{column}"
+                         aggregates += f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS SUM_{column}"
+                     else:
+                         aggregates_min += f", MIN({column}) AS MIN_{column}, MAX({column}) AS MAX_{column}"
+                         aggregates += f", SUM({column}) AS SUM_{column}"
+
+                 elif not only_numeric and (
+                     column_datatype.lower() in self.snowflake_datatype_mapping["string"]
+                     or column_datatype.lower() in self.snowflake_datatype_mapping["date_and_time"]
+                 ):
+                     aggregates += f", COUNT(DISTINCT LOWER({column})) AS COUNTDISTINCT_{column}"
+
+                 elif not only_numeric and column_datatype.lower() in self.snowflake_datatype_mapping["binary"]:
+                     aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS COUNTDISTINCT_{column}"
+
+                 elif not only_numeric and column_datatype.lower() in self.snowflake_datatype_mapping["boolean"]:
+                     aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false) :: VARCHAR AS AGGREGATEBOOLEAN_{column}"
+
+                 # else: additional data types: VARIANT OBJECT ARRAY GEOGRAPHY
+
+             # CASE 1: min_max
+             if group_by_aggregation_type == "only_min_max":
+                 group_by_query_aggregation_string = aggregates_min[1:]
+
+             # CASE 2: sum, count_distinct, aggregate_boolean
+             elif group_by_aggregation_type == "various":
+                 group_by_query_aggregation_string = aggregates[1:]
+
+             # CASE 3: sum, count_distinct, aggregate_boolean, min_max
+             elif group_by_aggregation_type == "various_and_min_max":
+                 group_by_query_aggregation_string = f"{aggregates_min[1:]}{aggregates}"
+
+             query_group_by_aggregation = f"SELECT {group_by_query_columns_string}, COUNT(*) AS COUNT_OF_GROUP_BY_VALUE, {group_by_query_aggregation_string} FROM {object.database}.{object.schema}.{object.name} {where_clause} GROUP BY {group_by_query_columns_string} ORDER BY {group_by_query_columns_string};"
+
+             group_by_aggregation_pdf = self.execute_queries(query_group_by_aggregation, True)
+         except Exception as err:
+             group_by_aggregation_pdf = pd.DataFrame()
+             group_by_aggregation_pdf["TESTATM_ERROR"] = [1]
+             if not grouping_columns_final:
+                 error_dict = {
+                     "QUERY": "No group-by columns found in the column intersection. Please check whether the configured group-by columns exist in the table.",
+                     "ERROR": "No group-by columns found in the column intersection. Please check whether the configured group-by columns exist in the table.",
+                 }
+                 group_by_query_aggregation_string = ""
+             elif "|||" in str(err):
+                 error_dict = {"QUERY": str(err).split("|||")[0], "ERROR": str(err).split("|||")[1]}
+             else:
+                 error_dict = {
+                     "QUERY": "No query generated. Please check whether the configured grouping columns exist in the table.",
+                     "ERROR": str(err),
+                 }
+                 group_by_query_aggregation_string = ""
+
+         return (
+             group_by_aggregation_pdf,
+             group_by_query_aggregation_string,
+             group_by_query_columns_string,
+             grouping_columns_final,
+             error_dict,
+         )
+
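For group_by_aggregation_type="various" with grouping column REGION and numeric column AMOUNT (identifiers illustrative, empty where_clause), the generated statement has this shape; the doubled spaces are a side effect of how the column string is assembled:

    # SELECT  REGION , COUNT(*) AS COUNT_OF_GROUP_BY_VALUE, SUM(AMOUNT) AS SUM_AMOUNT
    # FROM DB.SCH.T  GROUP BY  REGION  ORDER BY  REGION ;
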
+     def create_pandas_df(
+         self,
+         object: DatabaseObject,
+         intersection_columns_trgt_src: list,
+         where_clause: str = "",
+         exclude_columns: list = [],
+     ) -> pd.DataFrame:
+         """Creates a pandas dataframe with all data of the given object in the given columns.
+
+         Args:
+             object (DatabaseObject): table or view
+             intersection_columns_trgt_src (list): columns existing in source and target
+
+         Returns:
+             pd.DataFrame: direct result of the SQL query
+         """
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         intersection_columns_trgt_src_ = ", ".join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
+
+         df_query = f"SELECT {intersection_columns_trgt_src_} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
+
+         src_pdf = self.execute_queries(df_query, True)
+
+         return src_pdf
+
+     def create_pandas_df_from_sample(
+         self,
+         object: DatabaseObject,
+         column_intersections: list,
+         key_columns: list,
+         where_clause: str = "",
+         exclude_columns: list = [],
+         key_filters: dict = {},
+         dedicated_columns: list = [],
+         sample_count: int = 10,
+         numeric_scale: int = None,
+     ) -> tuple:
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         sample_count = str(sample_count)
+         key_intersection = list((set(column_intersections) & set(key_columns)) - set(exclude_columns))
+         filter_intersection = list((set(column_intersections) & set(key_filters.keys())) - set(exclude_columns))
+         dedicated_intersection = list((set(column_intersections) & set(dedicated_columns)) - set(exclude_columns))
+
+         key_intersection.sort()
+         filter_intersection.sort()
+         dedicated_intersection.sort()
+
+         if not where_clause:
+             where_clause = "WHERE 1=1 "
+
+         is_dedicated = bool(dedicated_intersection)
+         if is_dedicated:
+             dict_columns_datatype = self.get_data_types_from_object(object, dedicated_intersection)
+         else:
+             dict_columns_datatype = self.get_data_types_from_object(object, column_intersections)
+
+         if key_intersection:
+             keys = str(key_intersection)[1:-1].replace("'", "")
+             # Use the dedicated columns when configured, otherwise the full column intersection.
+             clause_columns = dedicated_intersection if is_dedicated else column_intersections
+             column_clause, numeric_columns, used_columns = self._get_column_clause(
+                 clause_columns, dict_columns_datatype, numeric_scale, key_columns
+             )
+             in_clause = ""
+             if key_filters and filter_intersection:
+                 values = list(key_filters.values())
+                 if values[0]:
+                     in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale)
+             sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} SAMPLE ({sample_count} ROWS) {where_clause}{in_clause} ORDER BY {keys};"
+         else:
+             column_intersections = list(set(column_intersections) - set(exclude_columns))
+             column_intersections.sort()
+             column_clause, numeric_columns, used_columns = self._get_column_clause(
+                 column_intersections, dict_columns_datatype, numeric_scale, key_columns
+             )
+             sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} SAMPLE ({sample_count} ROWS) {where_clause};"
+
+         error_dict = {}
+         key_dict = {}
+         try:
+             sample_pdf = self.execute_queries(sample_query, return_as_pdf=True)
+             for key in key_intersection:
+                 if pd.api.types.is_datetime64_any_dtype(sample_pdf[key]):
+                     key_dict[key] = list(sample_pdf[key].astype(str))
+                 else:
+                     key_dict[key] = list(sample_pdf[key])
+
+         except Exception as err:
+             sample_pdf = pd.DataFrame()
+             sample_pdf["TESTATM_ERROR"] = [1]
+             if "|||" in str(err):
+                 error_dict = {"QUERY": str(err).split("|||")[0], "ERROR": str(err).split("|||")[1]}
+             else:
+                 error_dict = {"QUERY": "No SQL Error", "ERROR": str(err)}
+
+         return_list = [sample_pdf, error_dict]
+
+         return return_list, key_dict, used_columns, sample_query
+
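With key column ID, numeric column AMOUNT, numeric_scale=2, and no configured filters, a generated sample query looks roughly like this (identifiers illustrative; the column order follows the intersection list):

    # SELECT CAST(ROUND(AMOUNT, 2) as decimal(38,2)) as AMOUNT, ID
    # FROM DB.SCH.T SAMPLE (10 ROWS) WHERE 1=1  ORDER BY ID;
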
+     def execute_queries(
+         self, query: str | list[str], return_as_pdf: bool = False, return_query_ids: bool = False
+     ) -> list[dict] | list[list[dict]]:
+         """Executes the given queries.
+
+         Args:
+             query (str | list[str]): queries to be executed
+             return_as_pdf (bool, optional): if True, query results are returned as pandas dataframes. Defaults to False.
+             return_query_ids (bool, optional): if True, results and query ids are returned, otherwise only results. Defaults to False.
+
+         Raises:
+             Exception: raised if a single query cannot be executed.
+
+         Returns:
+             list[dict] | list[list[dict]]: results, or results together with query ids
+         """
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         if query:
+             query_list: list[str] = query if isinstance(query, list) else [query]
+         else:
+             logger.error("Query defined as null - please check input for execute_queries function.")
+             raise ValueError("Query defined as null - please check input for execute_queries function.")
+
+         cursor = self.snowflake_connection.get_connection_object().cursor(snowflake.connector.DictCursor)
+
+         results = []
+         query_ids = []
+
+         for single_query in query_list:
+             try:
+                 query_result = cursor.execute(single_query).fetchall()
+                 if return_as_pdf:
+                     query_result = pd.DataFrame(query_result)
+
+                 results.append(query_result)
+                 query_ids.append(cursor.sfqid)
+
+             except Exception as err:
+                 raise Exception(single_query + "|||" + str(err))
+
+         if return_query_ids:
+             # Parenthesized so that (results, ids) pairs are returned; the unparenthesized
+             # form would bind the conditional to the middle element only.
+             return (results[0], query_ids[0]) if not isinstance(query, list) else (results, query_ids)
+
+         else:
+             return results[0] if not isinstance(query, list) else results
+
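The return shape depends on whether a single query or a list is passed. A short sketch, assuming `service` is a connected SnowflakeService:

    rows = service.execute_queries("SELECT 1 AS X;")                        # [{'X': 1}]
    rows_a, rows_b = service.execute_queries(["SELECT 1 AS X;", "SELECT 2 AS Y;"])
    rows, query_id = service.execute_queries("SELECT 1 AS X;", return_query_ids=True)
    pdf = service.execute_queries("SELECT 1 AS X;", return_as_pdf=True)    # pandas DataFrame
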
+     def execute_statement(self, statement: str | list[str]) -> None:
+         """
+         Executes a simple statement against Snowflake.
+         Schema and database settings must be set beforehand.
+         Args:
+             statement (str | list[str]): a SQL statement or a list of SQL statements to execute
+         """
+         if self.snowflake_connection is None:
+             self._connect_to_snowflake()
+
+         statement_list: list[str] = statement if isinstance(statement, list) else [statement]
+
+         try:
+             for single_statement in statement_list:
+                 stripped_statement = single_statement.strip()
+                 _ = self.snowflake_connection.get_connection_object().execute_string(stripped_statement)
+
+         except Exception as err:
+             raise Exception(self._get_error_message(err, single_statement)) from err
+
+     def upload_to_stage(self, stage_name: str, folder_path: str, file_name: str, is_temporary: bool):
+         file_path = PurePath(folder_path).joinpath(PurePath(file_name))
+
+         if is_temporary:
+             create_query = f"CREATE TEMPORARY STAGE IF NOT EXISTS {stage_name};"
+         else:
+             create_query = f"CREATE STAGE IF NOT EXISTS {stage_name};"
+
+         put_query = rf"PUT 'file://{file_path}' @{stage_name};"
+
+         # Escape backslashes so Windows paths survive statement parsing.
+         put_query = put_query.replace("\\", "\\\\")
+
+         self.execute_statement(create_query)
+
+         self.execute_statement(put_query)
+
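A usage sketch (stage and path names illustrative); a temporary stage only lives for the duration of the Snowflake session:

    service.upload_to_stage(
        stage_name="TESTATM_RESULTS_STAGE",  # illustrative
        folder_path="/tmp/results",
        file_name="run_0001.json",
        is_temporary=True,                   # stage is dropped when the session ends
    )
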
+     def insert_json_results(
+         self,
+         run_guid: str,
+         pipeline_name: str,
+         pipeline_id: str,
+         start_time_utc: str,
+         result_table: str,
+         stage_name: str,
+     ) -> None:
+         """
+         COPY INTO - result table for JSON results.
+         """
+         result_database = result_table.split(".")[0]
+         meta_data_schema = result_table.split(".")[1]
+
+         statement = f"COPY INTO {result_table} (RUN_GUID, PIPELINE_NAME, PIPELINE_ID, START_TIME_UTC, RESULT, CREATION_TIME_UTC) FROM (SELECT '{run_guid}', '{pipeline_name}', '{pipeline_id}', '{start_time_utc}', $1, SYSDATE() from @{stage_name} (file_format => {result_database}.{meta_data_schema}.ff_json ));"
+
+         self.execute_statement(statement)
+
+     def insert_json_results_live(
+         self,
+         run_guid: str,
+         pipeline_name: str,
+         pipeline_id: str,
+         result_table: str,
+         stage_name: str,
+         source_system: str,
+         target_system: str,
+         database: str,
+         schema: str,
+         object: str,
+     ) -> None:
+         """
+         COPY INTO - result table for live JSON results.
+         """
+         result_database = result_table.split(".")[0]
+         meta_data_schema = result_table.split(".")[1]
+
+         statement = f"COPY INTO {result_table} (RUN_GUID, PIPELINE_NAME, PIPELINE_ID, SOURCE_SYSTEM, TARGET_SYSTEM, DATABASE_NAME, SCHEMA_NAME, OBJECT_NAME, RESULT, CREATION_TS) FROM (SELECT '{run_guid}', '{pipeline_name}', '{pipeline_id}', '{source_system}', '{target_system}', '{database}', '{schema}', '{object}', $1, SYSDATE() from @{stage_name} (file_format => {result_database}.{meta_data_schema}.ff_json ));"
+
+         self.execute_statement(statement)
+
+     def insert_highlevel_results(
+         self, results: dict, run_guid: str, pipeline_name: str, pipeline_id: str, result_table_highlevel: str
+     ) -> None:
+         """
+         INSERT INTO - high-level results per pipeline run / icsDataValidation execution.
+         """
+         TESTSET_ = ", ".join(results["TESTSET"])
+
+         OBJECTS_TO_COMPARE_SRC_ = ", ".join(results["OBJECTS_TO_COMPARE_SRC"])
+
+         OBJECTS_TO_COMPARE_TRGT_ = ", ".join(results["OBJECTS_TO_COMPARE_TRGT"])
+
+         SRC_MINUS_TRGT_ = ", ".join(results["SRC_MINUS_TRGT"])
+
+         TRGT_MINUS_SRC_ = ", ".join(results["TRGT_MINUS_SRC"])
+
+         insert_statement = f"INSERT INTO {result_table_highlevel} ( \
+             RUN_GUID, \
+             PIPELINE_NAME, \
+             PIPELINE_ID, \
+             START_TIME_UTC, \
+             SOURCE_SYSTEM, \
+             TARGET_SYSTEM, \
+             DATABASE_NAME, \
+             TESTSET, \
+             ALL_OBJECTS_MATCHING, \
+             ALL_COLUMNS_EQUAL, \
+             ALL_DATATYPES_EQUAL, \
+             ALL_ROWCOUNTS_EQUAL, \
+             ALL_CHECKSUMS_EQUAL, \
+             ALL_SAMPLES_EQUAL, \
+             ALL_OBJECTS_EQUAL, \
+             OBJECTS_TO_COMPARE_SRC, \
+             OBJECTS_TO_COMPARE_TRGT, \
+             NUMBER_OF_OBJECTS_TO_COMPARE, \
+             SRC_MINUS_TRGT, \
+             TRGT_MINUS_SRC, \
+             CREATION_TS) \
+             VALUES \
+             ('{run_guid}', \
+             '{pipeline_name}', \
+             '{pipeline_id}', \
+             '{results['START_TIME_UTC']}', \
+             '{results['SOURCE_SYSTEM']}', \
+             '{results['TARGET_SYSTEM']}', \
+             '{results['DATABASE_NAME']}', \
+             '{TESTSET_}', \
+             '{results['ALL_OBJECTS_MATCHING']}', \
+             '{results['ALL_COLUMNS_EQUAL']}', \
+             '{results['ALL_DATATYPES_EQUAL']}', \
+             '{results['ALL_ROWCOUNTS_EQUAL']}', \
+             '{results['ALL_CHECKSUMS_EQUAL']}', \
+             NULLIF('{results['ALL_SAMPLES_EQUAL']}', 'None'), \
+             NULLIF('{results['ALL_OBJECTS_EQUAL']}', 'None'), \
+             '{OBJECTS_TO_COMPARE_SRC_}', \
+             '{OBJECTS_TO_COMPARE_TRGT_}', \
+             '{results['NUMBER_OF_OBJECTS_TO_COMPARE']}', \
+             '{SRC_MINUS_TRGT_}', \
+             '{TRGT_MINUS_SRC_}', \
+             SYSDATE())"
+
+         self.execute_statement(insert_statement)
+
+     def insert_objectlevel_results(self, result_table: str, result_table_objectlevel: str, run_guid: str) -> None:
+         """
+         INSERT INTO - detailed results per object.
+         """
+         insert_statement = f"INSERT INTO {result_table_objectlevel} ( \
+             RUN_GUID, \
+             PIPELINE_ID, \
+             START_TIME_UTC,\
+             SRC_DATABASE_NAME, \
+             SRC_SCHEMA_NAME, \
+             SRC_OBJECT_NAME, \
+             SRC_OBJECT_TYPE, \
+             TRGT_DATABASE_NAME, \
+             TRGT_SCHEMA_NAME, \
+             TRGT_OBJECT_NAME, \
+             TRGT_OBJECT_TYPE, \
+             SRC_FILTER, \
+             TRGT_FILTER, \
+             EXCLUDED_COLUMNS, \
+             COLUMNS_EQUAL, \
+             COLUMN_INTERSECTION, \
+             SRC_COLUMNS_MINUS_TRGT_COLUMNS, \
+             TRGT_COLUMNS_MINUS_SRC_COLUMNS, \
+             DATATYPES_EQUAL, \
+             ROW_COUNTS_EQUAL, \
+             SRC_ROW_COUNT, \
+             TRGT_ROW_COUNT, \
+             ALL_COUNT_NULLS_EQUAL, \
+             AGGREGATIONS_EQUAL, \
+             AGGREGATIONS_EQUAL_TOLERATED,\
+             SRC_ERROR_QUERY, \
+             TRGT_ERROR_QUERY, \
+             SRC_ERROR_MSG, \
+             TRGT_ERROR_MSG, \
+             GROUP_BY_COLUMNS, \
+             GROUP_BY_EQUAL, \
+             GROUP_BY_VALUES_WITH_MISMATCHES, \
+             COLUMNS_WITH_MISMATCH, \
+             GROUP_BY_DIFF_DICT, \
+             SRC_GROUP_BY_QUERY, \
+             TRGT_GROUP_BY_QUERY, \
+             SRC_GROUP_BY_ERROR, \
+             TRGT_GROUP_BY_ERROR, \
+             SAMPLES_COMPARED, \
+             SAMPLES_EQUAL, \
+             SAMPLE_KEYS, \
+             SRC_SAMPLE, \
+             TRGT_SAMPLE, \
+             SRC_SAMPLE_QUERY, \
+             TRGT_SAMPLE_QUERY, \
+             SRC_SAMPLE_ERROR_MSG, \
+             TRGT_SAMPLE_ERROR_MSG, \
+             PANDAS_DATAFRAME_COMPARED, \
+             PANDAS_DATAFRAME_EQUAL, \
+             SRC_NOT_ALTERED_DURING_COMPARISON, \
+             TRGT_NOT_ALTERED_DURING_COMPARISON, \
+             SRC_LAST_ALTERED, \
+             TRGT_LAST_ALTERED, \
+             CREATION_TS) \
+             SELECT\
+             RESULTS.RUN_GUID AS RUN_GUID, \
+             RESULTS.PIPELINE_ID AS PIPELINE_ID, \
+             RESULTS.START_TIME_UTC::VARCHAR AS START_TIME_UTC, \
+             F1.VALUE:SRC_DATABASE_NAME::VARCHAR AS SRC_DATABASE_NAME, \
+             F1.VALUE:SRC_SCHEMA_NAME::VARCHAR AS SRC_SCHEMA_NAME, \
+             F1.VALUE:SRC_OBJECT_NAME::VARCHAR AS SRC_OBJECT_NAME, \
+             F1.VALUE:SRC_OBJECT_TYPE::VARCHAR AS SRC_OBJECT_TYPE, \
+             F1.VALUE:TRGT_DATABASE_NAME::VARCHAR AS TRGT_DATABASE_NAME, \
+             F1.VALUE:TRGT_SCHEMA_NAME::VARCHAR AS TRGT_SCHEMA_NAME, \
+             F1.VALUE:TRGT_OBJECT_NAME::VARCHAR AS TRGT_OBJECT_NAME, \
+             F1.VALUE:TRGT_OBJECT_TYPE::VARCHAR AS TRGT_OBJECT_TYPE, \
+             F1.VALUE:SRC_FILTER::VARCHAR AS SRC_FILTER, \
+             F1.VALUE:TRGT_FILTER::VARCHAR AS TRGT_FILTER, \
+             F1.VALUE:EXCLUDED_COLUMNS AS EXCLUDED_COLUMNS, \
+             F1.VALUE:COLUMNS_EQUAL::BOOLEAN AS COLUMNS_EQUAL, \
+             F1.VALUE:COLUMN_INTERSECTION AS COLUMN_INTERSECTION, \
+             F1.VALUE:SRC_COLUMNS_MINUS_TRGT_COLUMNS AS SRC_COLUMNS_MINUS_TRGT_COLUMNS, \
+             F1.VALUE:TRGT_COLUMNS_MINUS_SRC_COLUMNS AS TRGT_COLUMNS_MINUS_SRC_COLUMNS, \
+             F1.VALUE:DATATYPES_EQUAL::BOOLEAN AS DATATYPES_EQUAL, \
+             F1.VALUE:ROW_COUNTS_EQUAL::BOOLEAN AS ROW_COUNTS_EQUAL, \
+             F1.VALUE:SRC_ROW_COUNT::INT AS SRC_ROW_COUNT, \
+             F1.VALUE:TRGT_ROW_COUNT::INT AS TRGT_ROW_COUNT, \
+             F1.VALUE:ALL_COUNT_NULLS_EQUAL::BOOLEAN AS ALL_COUNT_NULLS_EQUAL, \
+             F1.VALUE:AGGREGATIONS_EQUAL::BOOLEAN AS AGGREGATIONS_EQUAL, \
+             F1.VALUE:AGGREGATIONS_EQUAL_TOLERATED::BOOLEAN AS AGGREGATIONS_EQUAL_TOLERATED,\
+             F1.VALUE:SRC_ERROR:QUERY::VARCHAR AS SRC_ERROR_QUERY, \
+             F1.VALUE:TRGT_ERROR:QUERY::VARCHAR AS TRGT_ERROR_QUERY, \
+             F1.VALUE:SRC_ERROR:ERROR::VARCHAR AS SRC_ERROR_MSG, \
+             F1.VALUE:TRGT_ERROR:ERROR::VARCHAR AS TRGT_ERROR_MSG, \
+             F1.VALUE:GROUP_BY_COLUMNS AS GROUP_BY_COLUMNS, \
+             F1.VALUE:GROUP_BY_EQUAL::BOOLEAN AS GROUP_BY_EQUAL, \
+             F1.VALUE:GROUP_BY_VALUES_WITH_MISMATCHES AS GROUP_BY_VALUES_WITH_MISMATCHES, \
+             F1.VALUE:COLUMNS_WITH_MISMATCH AS COLUMNS_WITH_MISMATCH, \
+             F1.VALUE:GROUP_BY_DIFF_DICT AS GROUP_BY_DIFF_DICT, \
+             CASE WHEN F1.VALUE:SRC_GROUP_BY_ERROR::VARCHAR = '{{}}' \
+                 THEN NULLIF(F1.VALUE:SRC_GROUP_BY_QUERY::VARCHAR, '') \
+                 WHEN F1.VALUE:SRC_GROUP_BY_ERROR::VARCHAR != '{{}}' \
+                 THEN NULLIF(F1.VALUE:SRC_GROUP_BY_ERROR:QUERY::VARCHAR, '') \
+             END AS SRC_GROUP_BY_QUERY, \
+             CASE WHEN F1.VALUE:TRGT_GROUP_BY_ERROR::VARCHAR = '{{}}' \
+                 THEN NULLIF(F1.VALUE:TRGT_GROUP_BY_QUERY::VARCHAR, '') \
+                 WHEN F1.VALUE:TRGT_GROUP_BY_ERROR::VARCHAR != '{{}}' \
+                 THEN NULLIF(F1.VALUE:TRGT_GROUP_BY_ERROR:QUERY::VARCHAR, '') \
+             END AS TRGT_GROUP_BY_QUERY, \
+             CASE WHEN F1.VALUE:SRC_GROUP_BY_ERROR::VARCHAR = '{{}}' \
+                 THEN NULL \
+                 ELSE F1.VALUE:SRC_GROUP_BY_ERROR::VARCHAR \
+             END AS SRC_GROUP_BY_ERROR, \
+             CASE WHEN F1.VALUE:TRGT_GROUP_BY_ERROR::VARCHAR = '{{}}' \
+                 THEN NULL \
+                 ELSE F1.VALUE:TRGT_GROUP_BY_ERROR::VARCHAR \
+             END AS TRGT_GROUP_BY_ERROR, \
+             F1.VALUE:SAMPLES_COMPARED::BOOLEAN AS SAMPLES_COMPARED, \
+             F1.VALUE:SAMPLES_EQUAL::BOOLEAN AS SAMPLES_EQUAL, \
+             F1.VALUE:SAMPLE_KEYS AS SAMPLE_KEYS, \
+             F1.VALUE:SRC_SAMPLE AS SRC_SAMPLE, \
+             F1.VALUE:TRGT_SAMPLE AS TRGT_SAMPLE, \
+             F1.VALUE:SRC_SAMPLE_QUERY AS SRC_SAMPLE_QUERY, \
+             F1.VALUE:TRGT_SAMPLE_QUERY AS TRGT_SAMPLE_QUERY, \
+             F1.VALUE:SRC_SAMPLE_ERROR_DICT:ERROR::VARCHAR AS SRC_SAMPLE_ERROR_MSG, \
+             F1.VALUE:TRGT_SAMPLE_ERROR_DICT:ERROR::VARCHAR AS TRGT_SAMPLE_ERROR_MSG, \
+             F1.VALUE:PANDAS_DATAFRAME_COMPARED::BOOLEAN AS PANDAS_DATAFRAME_COMPARED, \
+             F1.VALUE:PANDAS_DATAFRAME_EQUAL::BOOLEAN AS PANDAS_DATAFRAME_EQUAL, \
+             F1.VALUE:SRC_NOT_ALTERED_DURING_COMPARISON::BOOLEAN AS SRC_NOT_ALTERED_DURING_COMPARISON, \
+             F1.VALUE:TRGT_NOT_ALTERED_DURING_COMPARISON::BOOLEAN AS TRGT_NOT_ALTERED_DURING_COMPARISON, \
+             F1.VALUE:SRC_LAST_ALTERED::VARCHAR AS SRC_LAST_ALTERED, \
+             F1.VALUE:TRGT_LAST_ALTERED::VARCHAR AS TRGT_LAST_ALTERED, \
+             SYSDATE() \
+             FROM {result_table} RESULTS \
+             CROSS JOIN LATERAL FLATTEN(INPUT => RESULT:OBJECTS) F1\
+             WHERE RUN_GUID = '{run_guid}'\
+             ;"
+
+         self.execute_statement(insert_statement)
+
+     def insert_columnlevel_results(self, result_table: str, result_table_columnlevel: str, run_guid: str) -> None:
+         """
+         INSERT INTO - detailed results per column.
+         """
+         insert_statement = f"INSERT INTO {result_table_columnlevel} ( \
+             RUN_GUID,\
+             PIPELINE_ID,\
+             START_TIME_UTC,\
+             SRC_DATABASE_NAME, \
+             SRC_SCHEMA_NAME, \
+             SRC_OBJECT_NAME, \
+             SRC_OBJECT_TYPE, \
+             TRGT_DATABASE_NAME, \
+             TRGT_SCHEMA_NAME, \
+             TRGT_OBJECT_NAME, \
+             TRGT_OBJECT_TYPE, \
+             COLUMN_NAME,\
+             IN_SRC,\
+             IN_TRGT,\
+             IN_SYNC,\
+             IN_EXCLUDED,\
+             SRC_DATATYPE,\
+             TRGT_DATATYPE,\
+             DATATYPE_EQUAL,\
+             AGGREGATION_TYPE,\
+             AGGREGATION_EQUAL,\
+             AGGREGATION_RESULT_SRC,\
+             AGGREGATION_RESULT_TRGT,\
+             AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC,\
+             AGGREGATION_EQUAL_TOLERATED,\
+             COUNT_NULLS_EQUAL,\
+             COUNT_NULLS_SRC,\
+             COUNT_NULLS_TRGT,\
+             COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC,\
+             ERROR_QUERY_SRC,\
+             ERROR_MSG_SRC,\
+             ERROR_QUERY_TRGT,\
+             ERROR_MSG_TRGT,\
+             ERROR_FLAG,\
+             CREATION_TS)\
+             SELECT\
+             RESULTS.RUN_GUID AS RUN_GUID,\
+             RESULTS.PIPELINE_ID AS PIPELINE_ID,\
+             RESULTS.START_TIME_UTC::VARCHAR AS START_TIME_UTC,\
+             F1.VALUE:SRC_DATABASE_NAME::VARCHAR AS SRC_DATABASE_NAME,\
+             F1.VALUE:SRC_SCHEMA_NAME::VARCHAR AS SRC_SCHEMA_NAME,\
+             F1.VALUE:SRC_OBJECT_NAME::VARCHAR AS SRC_OBJECT_NAME,\
+             F1.VALUE:SRC_OBJECT_TYPE::VARCHAR AS SRC_OBJECT_TYPE,\
+             F1.VALUE:TRGT_DATABASE_NAME::VARCHAR AS TRGT_DATABASE_NAME,\
+             F1.VALUE:TRGT_SCHEMA_NAME::VARCHAR AS TRGT_SCHEMA_NAME,\
+             F1.VALUE:TRGT_OBJECT_NAME::VARCHAR AS TRGT_OBJECT_NAME,\
+             F1.VALUE:TRGT_OBJECT_TYPE::VARCHAR AS TRGT_OBJECT_TYPE,\
+             F2.VALUE:COLUMN_NAME::VARCHAR AS COLUMN_NAME,\
+             F2.VALUE:IN_SRC::BOOLEAN AS IN_SRC,\
+             F2.VALUE:IN_TRGT::BOOLEAN AS IN_TRGT,\
+             F2.VALUE:IN_SYNC::BOOLEAN AS IN_SYNC,\
+             F2.VALUE:IN_EXCLUDED::BOOLEAN AS IN_EXCLUDED,\
+             F2.VALUE:SRC_DATATYPE::VARCHAR AS SRC_DATATYPE,\
+             F2.VALUE:TRGT_DATATYPE::VARCHAR AS TRGT_DATATYPE,\
+             F2.VALUE:DATATYPE_EQUAL::BOOLEAN AS DATATYPE_EQUAL,\
+             F2.VALUE:AGGREGATION_TYPE::VARCHAR AS AGGREGATION_TYPE,\
+             F2.VALUE:AGGREGATION_EQUAL::BOOLEAN AS AGGREGATION_EQUAL,\
+             F2.VALUE:AGGREGATION_RESULT_SRC::VARCHAR AS AGGREGATION_RESULT_SRC,\
+             F2.VALUE:AGGREGATION_RESULT_TRGT::VARCHAR AS AGGREGATION_RESULT_TRGT,\
+             F2.VALUE:AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC::VARCHAR AS AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC,\
+             F2.VALUE:AGGREGATION_EQUAL_TOLERATED::BOOLEAN AS AGGREGATION_EQUAL_TOLERATED,\
+             F2.VALUE:COUNT_NULLS_EQUAL::BOOLEAN AS COUNT_NULLS_EQUAL,\
+             F2.VALUE:COUNT_NULLS_SRC::VARCHAR AS COUNT_NULLS_SRC,\
+             F2.VALUE:COUNT_NULLS_TRGT::VARCHAR AS COUNT_NULLS_TRGT,\
+             F2.VALUE:COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC::VARCHAR AS COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC,\
+             F1.VALUE:SRC_ERROR:QUERY::VARCHAR AS ERROR_QUERY_SRC,\
+             F1.VALUE:SRC_ERROR:ERROR::VARCHAR AS ERROR_MSG_SRC,\
+             F1.VALUE:TRGT_ERROR:QUERY::VARCHAR AS ERROR_QUERY_TRGT,\
+             F1.VALUE:TRGT_ERROR:ERROR::VARCHAR AS ERROR_MSG_TRGT,\
+             CASE WHEN ERROR_MSG_SRC IS NULL AND ERROR_MSG_TRGT IS NULL THEN FALSE ELSE TRUE END AS ERROR_FLAG,\
+             SYSDATE()\
+             FROM {result_table} RESULTS\
+             CROSS JOIN LATERAL FLATTEN(INPUT => RESULT:OBJECTS) F1\
+             CROSS JOIN LATERAL FLATTEN(INPUT => F1.VALUE:COLUMNS) F2\
+             WHERE RUN_GUID = '{run_guid}';"
+
+         self.execute_statement(insert_statement)