icsDataValidation 1.0.358__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- icsDataValidation/configuration.py +19 -0
- icsDataValidation/connection_setups/__init__.py +0 -0
- icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
- icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
- icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
- icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
- icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
- icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
- icsDataValidation/core/__init__.py +0 -0
- icsDataValidation/core/database_objects.py +18 -0
- icsDataValidation/core/object_comparison.py +239 -0
- icsDataValidation/input_parameters/__init__.py +0 -0
- icsDataValidation/input_parameters/testing_tool_params.py +81 -0
- icsDataValidation/main.py +250 -0
- icsDataValidation/output_parameters/__init__.py +0 -0
- icsDataValidation/output_parameters/result_params.py +94 -0
- icsDataValidation/services/__init__.py +0 -0
- icsDataValidation/services/comparison_service.py +582 -0
- icsDataValidation/services/database_services/__init__.py +0 -0
- icsDataValidation/services/database_services/azure_service.py +320 -0
- icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
- icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
- icsDataValidation/services/database_services/exasol_service.py +261 -0
- icsDataValidation/services/database_services/oracle_service.py +713 -0
- icsDataValidation/services/database_services/snowflake_service.py +1100 -0
- icsDataValidation/services/database_services/teradata_service.py +665 -0
- icsDataValidation/services/initialization_service.py +103 -0
- icsDataValidation/services/result_service.py +573 -0
- icsDataValidation/services/system_service.py +61 -0
- icsDataValidation/services/testset_service.py +257 -0
- icsDataValidation/utils/__init__.py +0 -0
- icsDataValidation/utils/file_util.py +96 -0
- icsDataValidation/utils/logger_util.py +96 -0
- icsDataValidation/utils/pandas_util.py +159 -0
- icsDataValidation/utils/parallelization_util.py +52 -0
- icsDataValidation/utils/sql_util.py +14 -0
- icsDataValidation-1.0.358.dist-info/METADATA +21 -0
- icsDataValidation-1.0.358.dist-info/RECORD +40 -0
- icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
- icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1694 @@
|
|
|
1
|
+
from databricks import sql as databricks_sqlconnect
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
|
|
7
|
+
from typing import Union, List, Dict
|
|
8
|
+
from pathlib import PurePath
|
|
9
|
+
|
|
10
|
+
from icsDataValidation.utils.logger_util import configure_dev_ops_logger
|
|
11
|
+
from icsDataValidation.core.database_objects import DatabaseObject
|
|
12
|
+
|
|
13
|
+
#########################################################################################
|
|
14
|
+
#########################################################################################
|
|
15
|
+
|
|
16
|
+
# Configure Dev Ops Logger
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger("Databricks_Hive_Metastore_Service")
|
|
19
|
+
logger.setLevel(logging.INFO)
|
|
20
|
+
configure_dev_ops_logger(logger)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DatabricksHiveMetastoreService(object):
|
|
24
|
+
def __init__(self, connection_params: dict):
    """Store the connection parameters and prepare a lazily-opened connection.

    Args:
        connection_params (dict): keyword arguments passed verbatim to
            databricks.sql.connect() when the connection is first needed.
    """
    self.connection_params = connection_params
    # Connection is opened lazily by _connect_to_databricks() on first use.
    self.databricks_connection = None
    # Maps coarse type categories to Databricks data-type names; used by the
    # query-building methods to decide which SQL aggregation fits a column.
    self.databricks_datatype_mapping = {
        "string": ["string", "array", "map", "struct"],
        "numeric": [
            "int",
            "bigint",
            "double",
            "decimal",
            "float",
            "smallint",
            "tinyint",
        ],
        "date_and_time": [
            "timestamp",
            "date",
            "interval",
            "timestamp_ntz",
            "timestamp_tz",
            "timestamp_ltz",
        ],
        "binary": ["binary"],
        "boolean": ["boolean"],
    }
|
|
49
|
+
|
|
50
|
+
def __enter__(self):
    """Enter the context manager; the connection itself is opened lazily."""
    return self
|
|
52
|
+
|
|
53
|
+
def __exit__(self, exception_type, exception_value, traceback):
    """Close the Databricks connection (if one was opened) on context exit."""
    if self.databricks_connection is not None:
        self.databricks_connection.close()
|
|
56
|
+
|
|
57
|
+
def __del__(self):
    """Best-effort close of the connection when the instance is finalized."""
    # NOTE(review): assumes __init__ ran far enough to set
    # self.databricks_connection; if construction failed earlier this would
    # raise AttributeError during garbage collection — confirm.
    if self.databricks_connection is not None:
        self.databricks_connection.close()
|
|
60
|
+
|
|
61
|
+
def _connect_to_databricks(self):
    """Open a Databricks SQL connection from the stored parameters, cache it on
    the instance, and return it."""
    connection = databricks_sqlconnect.connect(**self.connection_params)
    self.databricks_connection = connection
    return connection
|
|
66
|
+
|
|
67
|
+
@staticmethod
|
|
68
|
+
def _get_error_message(excepction: Exception, statement: str) -> None:
|
|
69
|
+
"""
|
|
70
|
+
Compose error message if the execution of a statement or query fails.
|
|
71
|
+
"""
|
|
72
|
+
if hasattr(excepction, "raw_msg"):
|
|
73
|
+
message = excepction.raw_msg.replace("\n", " ")
|
|
74
|
+
else:
|
|
75
|
+
message = str(
|
|
76
|
+
excepction
|
|
77
|
+
) # this makes sure that all kinds of errors can have a message, even if they do not have raw_msg attribute
|
|
78
|
+
if hasattr(excepction, "sfqid"):
|
|
79
|
+
message = message + f"\nQuery ID: {excepction.sfqid}"
|
|
80
|
+
return f"Databricks ERROR: {message}\nFailed statement:\n{statement}"
|
|
81
|
+
|
|
82
|
+
@staticmethod
|
|
83
|
+
def _get_in_clause(
|
|
84
|
+
key_filters: list,
|
|
85
|
+
numeric_columns: list,
|
|
86
|
+
numeric_scale: int,
|
|
87
|
+
where_exists: bool = True,
|
|
88
|
+
) -> str:
|
|
89
|
+
"""generates in_clause from list ready to expand the where clause, numeric values are rounded
|
|
90
|
+
|
|
91
|
+
Args:
|
|
92
|
+
key_filters (list): list of given expected values
|
|
93
|
+
numeric_columns (list): list of all numeric columns
|
|
94
|
+
numeric_scale (int): number of decimal places after rounding
|
|
95
|
+
|
|
96
|
+
Returns:
|
|
97
|
+
str: in clause as string
|
|
98
|
+
"""
|
|
99
|
+
values = list(key_filters.values())
|
|
100
|
+
in_clause_values = "('"
|
|
101
|
+
for j in range(len(values[0])):
|
|
102
|
+
for value in values:
|
|
103
|
+
in_clause_values += str(value[j]) + "','"
|
|
104
|
+
in_clause_values = in_clause_values[:-2] + "),('"
|
|
105
|
+
in_clause_values = in_clause_values[:-3] + ")"
|
|
106
|
+
|
|
107
|
+
if where_exists:
|
|
108
|
+
in_clause_cols = f" AND (("
|
|
109
|
+
else:
|
|
110
|
+
in_clause_cols = f" WHERE (("
|
|
111
|
+
for key in key_filters.keys():
|
|
112
|
+
if key in numeric_columns:
|
|
113
|
+
in_clause_cols += f"""ROUND({key.replace("'", "")},2)""" + ","
|
|
114
|
+
else:
|
|
115
|
+
in_clause_cols += key.replace("'", "") + ","
|
|
116
|
+
in_clause_cols = in_clause_cols[:-1] + ")"
|
|
117
|
+
in_clause = in_clause_cols + " in (" + in_clause_values + ")"
|
|
118
|
+
return in_clause
|
|
119
|
+
|
|
120
|
+
def _get_column_clause(
    self, column_list: list, columns_datatype: list, numeric_scale, key_columns
) -> tuple:
    """turns list of desired columns into a sql compatible string

    Non-key date/time columns are skipped entirely; numeric columns are
    optionally rounded/cast to a fixed decimal scale.

    Args:
        column_list (list): list of all columns
        columns_datatype (list): dicts with COLUMN_NAME/DATA_TYPE for the columns
        numeric_scale (int | None): number of decimal places for numeric columns
        key_columns (list): list of columns of interest (always kept)

    Returns:
        tuple: (column_clause str, numeric column names, columns actually used)
    """
    column_intersecions_new = []
    used_columns = []
    numeric_columns = []
    for column in column_list:
        # Raises StopIteration if the column has no datatype entry.
        column_datatype = next(
            x for x in columns_datatype if x["COLUMN_NAME"] == column
        )["DATA_TYPE"]

        # Keep the column if it is a key column, or if it is not a
        # date/time column (non-key date/time columns are excluded).
        if column in key_columns or not (
            column_datatype.lower()
            in self.databricks_datatype_mapping["date_and_time"]
        ):
            if (
                column_datatype.lower()
                in self.databricks_datatype_mapping["numeric"]
            ):
                if numeric_scale:
                    column_intersecions_new.append(
                        f"CAST(ROUND({column}, {numeric_scale}) as decimal(38,{numeric_scale})) as {column}"
                    )
                else:
                    column_intersecions_new.append(f"{column} as {column}")
                used_columns.append(column)
                numeric_columns.append(column)
            elif (
                column_datatype.lower()
                in self.databricks_datatype_mapping["string"]
            ):
                column_intersecions_new.append(f"{column} AS {column}")
                used_columns.append(column)
            else:
                column_intersecions_new.append(column)
                used_columns.append(column)

    column_intersections = column_intersecions_new.copy()
    # Render the python list as a comma-separated SQL select list by
    # stripping the brackets and quotes from its repr.
    column_clause = str(column_intersections)[1:-1].replace("'", "")
    return column_clause, numeric_columns, used_columns
|
|
171
|
+
|
|
172
|
+
def get_database_objects(
    self,
    database: str,
    schema: str = None,
    object_type_restriction: str = "include_all",
) -> dict:
    """List tables and/or views of a schema as object descriptors.

    Args:
        database (str): database (catalog) name.
        schema (str, optional): schema name; required in practice — the
            process exits if it is missing.
        object_type_restriction (str): "include_all", "include_only_tables"
            or "include_only_views".

    Returns:
        list: dicts with "object_identifier" and "object_type" keys.
    """
    if self.databricks_connection is None:
        self._connect_to_databricks()

    all_database_tables = []
    all_database_views = []

    if (
        object_type_restriction == "include_all"
        or object_type_restriction == "include_only_tables"
    ):
        if schema:
            query_db_tables = f"SHOW TABLES IN {database}.{schema}"
        else:
            # NOTE(review): exit() aborts the whole process on a missing
            # schema; an exception would be easier for callers to handle.
            logger.error(
                "Query defined as null - please check input for execute_queries function."
            )
            exit()

        all_database_tables = self.execute_queries(query_db_tables)

    if (
        object_type_restriction == "include_all"
        or object_type_restriction == "include_only_views"
    ):
        if schema:
            # NOTE(review): unlike the tables query above, this one omits the
            # database qualifier — confirm whether "SHOW VIEWS IN {schema}"
            # targets the intended catalog.
            query_db_views = f"SHOW VIEWS IN {schema}"
        else:
            logger.error(
                "Query defined as null - please check input for execute_queries function."
            )
            exit()

        all_database_views = self.execute_queries(query_db_views)

    database_objects = []
    for row in all_database_tables:
        # Identifiers are normalized to upper case and hard-prefixed with the
        # hive_metastore catalog.
        database_table = (
            f'hive_metastore.{row["database"]}.{row["tableName"]}'.upper()
        )
        database_objects.append(
            {"object_identifier": database_table, "object_type": "table"}
        )
    for row in all_database_views:
        # NOTE(review): these keys (TABLE_CATALOG/...) differ from the
        # lower-case keys used for tables above — verify they match the
        # actual SHOW VIEWS result columns.
        database_view = f'{row["TABLE_CATALOG"]}.{row["TABLE_SCHEMA"]}.{row["TABLE_NAME"]}'.upper()
        database_objects.append(
            {"object_identifier": database_view, "object_type": "view"}
        )
    return database_objects
|
|
226
|
+
|
|
227
|
+
def get_last_altered_timestamp_from_object(self, object: DatabaseObject) -> str:
    """queries last_altered timestamp for given object

    Args:
        object (DatabaseObject): object for comparison

    Returns:
        str: row containing the LAST_ALTERED value
    """
    if self.databricks_connection is None:
        self._connect_to_databricks()

    # NOTE(review): "ALTER SESSION SET TIMEZONE" is Snowflake syntax — confirm
    # the Databricks SQL endpoint accepts it (Databricks uses
    # "SET TIME ZONE ...").
    self.execute_statement("ALTER SESSION SET TIMEZONE = 'Europe/London';")

    query_get_last_altered = f"SELECT LAST_ALTERED FROM {object.database}.INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = '{object.name}' AND TABLE_SCHEMA = '{object.schema}';"

    # Returns the first result row; raises IndexError if the object is not
    # present in INFORMATION_SCHEMA.TABLES.
    last_altered = self.execute_queries(query_get_last_altered)[0]

    return last_altered
|
|
246
|
+
|
|
247
|
+
def get_columns_from_object(self, object: DatabaseObject) -> list:
    """returns all columns from given object

    Args:
        object (DatabaseObject): table or view

    Returns:
        list: list of all columns
    """

    if self.databricks_connection is None:
        self._connect_to_databricks()

    query_show_columns = (
        f"SHOW COLUMNS IN {object.database}.{object.schema}.{object.name};"
    )

    # Each result row carries the column name under "col_name".
    return [row["col_name"] for row in self.execute_queries(query_show_columns)]
|
|
271
|
+
|
|
272
|
+
def get_row_count_from_object(self, object: DatabaseObject, where_clause: str = "") -> tuple:
    """gets row count from given object

    Args:
        object (DatabaseObject): table or view
        where_clause (str, optional): SQL filter appended to the query.

    Returns:
        tuple: (row count, or -1 if the query failed; list with error text
        and the failed query when an error occurred)
    """

    if self.databricks_connection is None:
        self._connect_to_databricks()

    # is it more efficient to select the information_schema.table view to get the rows?
    query_get_row_count = f"SELECT COUNT(*) AS ROW_COUNT FROM {object.database}.{object.schema}.{object.name} {where_clause};"
    # -1 acts as the sentinel for "count could not be determined".
    row_count = -1
    error_list = []

    try:
        row_count = self.execute_queries(query_get_row_count)[0]["ROW_COUNT"]

    except Exception as err:
        error_list.append(str(err))
        error_list.append(query_get_row_count)

    return row_count, error_list
|
|
298
|
+
|
|
299
|
+
def get_data_types_from_object(
    self, object: DatabaseObject, column_intersections: list
) -> list:
    """returns datatypes for all columns of a database object

    Args:
        object (DatabaseObject): table or view
        column_intersections (list): kept for interface compatibility; the
            DESCRIBE query always returns every column, so this argument is
            currently not used for filtering.

    Returns:
        list: dicts with COLUMN_NAME and DATA_TYPE per column
    """

    if self.databricks_connection is None:
        self._connect_to_databricks()

    # (Removed dead code: the stringified column_intersections value was
    # computed but never referenced by the DESCRIBE query below.)
    query_get_data_types_from_object = (
        f"DESCRIBE TABLE {object.database}.{object.schema}.{object.name};"
    )

    table_description = self.execute_queries(query_get_data_types_from_object)

    dict_colummns_datatype = [
        {"COLUMN_NAME": row["col_name"], "DATA_TYPE": row["data_type"]}
        for row in table_description
    ]
    return dict_colummns_datatype
|
|
332
|
+
|
|
333
|
+
def get_count_distincts_from_object(
    self,
    object: DatabaseObject,
    column_intersections: list,
    where_clause: str = "",
    exclude_columns: list = [],
) -> dict:
    """get distinct count for every column in a database object that is in column intersections list

    Args:
        object (DatabaseObject): table or view
        column_intersections (list): columns that are used for distinct count
        where_clause (str, optional): optional further filter. Defaults to "".
        exclude_columns (list, optional): columns to exclude from distinct count. Defaults to [].

    Returns:
        dict: distinct counts for columns
        error_list: list of failed executions for distinct counts
    """

    if self.databricks_connection is None:
        self._connect_to_databricks()

    unions = ""

    # One SELECT per column, stitched together with UNION so a single query
    # returns every distinct count.
    for column in column_intersections:
        if column not in exclude_columns:
            unions += f" UNION SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.database}.{object.schema}.{object.name} {where_clause}"

    # unions[6:] strips the leading " UNION" from the first branch.
    query_get_count_distincts_from_object = f"{unions[6:]} ORDER BY COUNT_DISTINCT;"
    error_list = []
    try:
        dict_count_distincts = self.execute_queries(
            query_get_count_distincts_from_object
        )

    except Exception as err:
        # Fall back to a zero count; error details are passed back to the
        # caller instead of raising.
        # NOTE(review): assumes error text contains a "|||" separator —
        # str(err).split("|||")[1] raises IndexError otherwise; confirm the
        # error format produced by execute_queries.
        dict_count_distincts = [{"COUNT_DISTINCT": 0}]
        error_list.append(
            ["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]]
        )

    return dict_count_distincts, error_list
|
|
377
|
+
|
|
378
|
+
def get_table_size(self, object: DatabaseObject) -> int:
    """returns size of given object

    Args:
        object (DatabaseObject): table or view

    Returns:
        int: size of object (first number found in the Statistics row of
        DESC EXTENDED, i.e. the byte count)
    """

    if self.databricks_connection is None:
        self._connect_to_databricks()

    # Refresh table statistics first so DESC EXTENDED reports a size;
    # NOSCAN computes size without reading the data.
    query_analyze_table = f"ANALYZE TABLE {object.database}.{object.schema}.{object.name} COMPUTE STATISTICS NOSCAN"
    self.execute_queries(query_analyze_table)

    query_get_table_size = (
        f"DESC EXTENDED {object.database}.{object.schema}.{object.name}"
    )

    table_description = self.execute_queries(query_get_table_size)
    # The "Statistics" row holds a string like "123456 bytes"; raises
    # IndexError if statistics are missing from the description.
    size_string = [
        row["data_type"]
        for row in table_description
        if row["col_name"] == "Statistics"
    ][0]
    size = int(re.search(r"\d+", size_string).group())

    return size
|
|
407
|
+
|
|
408
|
+
def create_checksums(
    self,
    object: DatabaseObject,
    column_intersections: list,
    where_clause: str = "",
    exclude_columns: list = [],
    numeric_scale: int = None,
) -> dict:
    """creates checksums for given object in compliance with given conditions

    Args:
        object (DatabaseObject): table or view
        column_intersections (list): columns that are used for checksums
        where_clause (str, optional): Optional filter criteria given as sql-usable string. Defaults to "".
        exclude_columns (list, optional): columns to exlude from calculation. Defaults to [].
        numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.

    Returns:
        dict: per-column [aggregation, value, null-count] triples, plus a
        "TESTATM_ERRORS" entry with any failed-query details.
    """

    if self.databricks_connection is None:
        self._connect_to_databricks()

    # Upper-case the surviving columns; datatype lookup below matches on the
    # upper-cased names.
    column_intersections = [
        f"{x.upper()}" for x in column_intersections if x not in exclude_columns
    ]

    dict_colummns_datatype = self.get_data_types_from_object(
        object, column_intersections
    )

    aggregates = ""
    count_nulls = ""

    for column in column_intersections:
        column_datatype = next(
            x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column
        )["DATA_TYPE"]

        # Null counts are collected for every column regardless of type.
        count_nulls += f", SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END) AS COUNTNULLS_{column}"

        # Pick the aggregation by type category: SUM for numerics,
        # COUNT(DISTINCT ...) for strings/dates/binary, true/false counts
        # for booleans.
        if column_datatype.lower() in self.databricks_datatype_mapping["numeric"]:
            if numeric_scale:
                aggregates += f", CAST(ROUND(SUM({column}), {numeric_scale}) as decimal(38, {numeric_scale})) AS sum_{column}"
            else:
                aggregates += (
                    f", CAST(SUM({column}) as decimal(38)) AS sum_{column}"
                )

        elif (
            column_datatype.lower() in self.databricks_datatype_mapping["string"]
            or column_datatype.lower()
            in self.databricks_datatype_mapping["date_and_time"]
        ):
            aggregates += (
                f", COUNT(DISTINCT LOWER({column})) AS countdistinct_{column}"
            )

        elif column_datatype.lower() in self.databricks_datatype_mapping["binary"]:
            # NOTE(review): TRY_TO_NUMBER and ::VARCHAR casts are Snowflake
            # idioms — confirm they are valid on this Databricks endpoint.
            aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS countdistinct_{column}"

        elif column_datatype.lower() in self.databricks_datatype_mapping["boolean"]:
            aggregates += f", MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)) || '_' || MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false)) AS aggregateboolean_{column}"

        # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY

    # [1:] strips the leading comma from the accumulated select lists.
    query_checksums = f"SELECT {aggregates[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"

    query_countnulls = f"SELECT {count_nulls[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"

    error_list = []
    checksums = {}

    try:
        checksums_results = self.execute_queries(
            [query_checksums, query_countnulls]
        )

        # First row of each result set; .asDict() indicates rows are
        # databricks-connector Row objects.
        aggregation_results = checksums_results[0][0]
        countnulls_results = checksums_results[1][0]

        checksums = {}
        for key in aggregation_results.asDict().keys():
            # Result aliases look like "sum_COLNAME": split once to recover
            # the aggregation kind and the column name.
            aggregation = key.split("_", 1)[0].upper()
            col_name = key.split("_", 1)[1]
            value = aggregation_results[key]
            cnt_nulls = countnulls_results[f"COUNTNULLS_{col_name}"]
            checksums[col_name] = [aggregation, value, cnt_nulls]

    except Exception as err:
        # TODO: Improve error formatting
        error_list.append(["ERROR", query_checksums, str(err)])

    # Always present, empty when no error occurred.
    checksums["TESTATM_ERRORS"] = error_list

    return checksums
|
|
505
|
+
|
|
506
|
+
def create_pandas_df_from_group_by(
    self,
    object: DatabaseObject,
    column_intersections: list,
    group_by_columns: list,
    group_by_aggregation_columns: list,
    group_by_aggregation_type: str,
    only_numeric: bool,
    where_clause: str,
    exclude_columns: list,
    numeric_scale: int = None,
) -> tuple:
    """execution of multiple aggregations at once

    Args:
        object (DatabaseObject): table or view
        column_intersections (list): columns existing in src and trgt
        group_by_columns (list): columns for grouping the aggregations
        group_by_aggregation_columns (list): list of columns that are supposed to be aggregated,
            or ["all"] to aggregate every non-grouping column
        group_by_aggregation_type (str): choice between: only_min_max, various, various_and_min_max
        only_numeric (bool): whether to also include distinct counts or only do numeric aggregations
        where_clause (str): optional filter for aggregations, given as sql compatible where-string
        exclude_columns (list): columns to exclude from comparisons
        numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.

    Returns:
        tuple: (result dataframe, aggregation select-list string,
        grouping-columns string, final grouping columns, error dict)
    """

    if self.databricks_connection is None:
        self._connect_to_databricks()

    # Determine which columns get aggregated: all eligible columns, or the
    # explicitly requested subset.
    if group_by_aggregation_columns == ["all"]:
        aggregation_columns = [
            f"{column.upper()}"
            for column in column_intersections
            if (column not in group_by_columns and column not in exclude_columns)
        ]
    else:
        aggregation_columns = [
            f"{column.upper()}"
            for column in column_intersections
            if (
                column in group_by_aggregation_columns
                and column not in exclude_columns
            )
        ]

    group_by_query_columns_string = " "
    grouping_columns_final = []
    error_dict = {}

    try:
        for column in group_by_columns:
            if column in column_intersections and column not in exclude_columns:
                group_by_query_columns_string += f"{column} ,"
                grouping_columns_final.append(column)

        # Drop the trailing comma.
        group_by_query_columns_string = group_by_query_columns_string[:-1]

        dict_colummns_datatype = self.get_data_types_from_object(
            object, aggregation_columns
        )

        aggregates = ""
        aggregates_min = ""

        # Build two select lists: MIN/MAX pairs and the "various"
        # aggregations; which one(s) end up in the query depends on
        # group_by_aggregation_type below.
        for column in aggregation_columns:
            column_datatype = next(
                x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column
            )["DATA_TYPE"]

            if (
                column_datatype.lower()
                in self.databricks_datatype_mapping["numeric"]
            ):
                if numeric_scale:
                    aggregates_min += f", CAST(ROUND(MIN({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MIN_{column}, CAST(ROUND(max({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MAX_{column}"
                    aggregates += f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS SUM_{column}"

                else:
                    aggregates_min += f", MIN({column}) AS MIN_{column}, MAX({column}) AS MAX_{column}"
                    aggregates += f", SUM({column}) AS SUM_{column}"

            elif not only_numeric and (
                column_datatype.lower()
                in self.databricks_datatype_mapping["string"]
                or column_datatype.lower()
                in self.databricks_datatype_mapping["date_and_time"]
            ):
                aggregates += (
                    f", COUNT(DISTINCT LOWER({column})) AS COUNTDISTINCT_{column}"
                )

            elif (
                not only_numeric
                and column_datatype.lower()
                in self.databricks_datatype_mapping["binary"]
            ):
                # NOTE(review): TRY_TO_NUMBER / ::VARCHAR are Snowflake
                # idioms — confirm validity on Databricks.
                aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS COUNTDISTINCT_{column}"

            elif (
                not only_numeric
                and column_datatype.lower()
                in self.databricks_datatype_mapping["boolean"]
            ):
                aggregates += f", MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)) || '_' || MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false)) AS AGGREGATEBOOLEAN_{column}"

            # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY

        # CASE 1: min_max
        if group_by_aggregation_type == "only_min_max":
            group_by_query_aggregation_string = aggregates_min[1:]

        # CASE 2; sum, count_distinct, aggregate_boolean
        elif group_by_aggregation_type == "various":
            group_by_query_aggregation_string = aggregates[1:]

        # CASE 3: sum, count_distinct, aggregate_boolean, min_max
        elif group_by_aggregation_type == "various_and_min_max":
            group_by_query_aggregation_string = f"{aggregates_min[1:]}{aggregates}"

        query_group_by_aggregation = f"SELECT {group_by_query_columns_string}, COUNT(*) AS COUNT_OF_GROUP_BY_VALUE, {group_by_query_aggregation_string} FROM {object.database}.{object.schema}.{object.name} {where_clause} GROUP BY {group_by_query_columns_string} ORDER BY {group_by_query_columns_string};"

        group_by_aggregation_pdf = self.execute_queries(
            query_group_by_aggregation, True
        )
    except Exception as err:
        # On failure return a one-row marker dataframe plus error details.
        group_by_aggregation_pdf = pd.DataFrame()
        group_by_aggregation_pdf["TESTATM_ERROR"] = [1]
        if not grouping_columns_final:
            error_dict = {
                "QUERY": "NO Group-BY Columns found in the Columns Intersection. Please check if the configurated Group-By Columns exist in the Table",
                "ERROR": "NO Group-BY Columns found in the Columns Intersection. Please check if the configurated Group-By Columns exist in the Table"
            }
            group_by_query_aggregation_string = ""
        elif "|||" in str(err):
            # NOTE(review): this branch does not assign
            # group_by_query_aggregation_string; if the exception was raised
            # before that variable was set, the return below raises
            # UnboundLocalError — confirm whether that path can occur.
            error_dict = {
                "QUERY": str(err).split("|||")[0],
                "ERROR": str(err).split("|||")[1],
            }
        else:
            error_dict = {
                "QUERY": "NO Query generated. Please check if the configurated Grouping Columns exist in the Table",
                "ERROR": str(err),
            }
            group_by_query_aggregation_string = ""

    return (
        group_by_aggregation_pdf,
        group_by_query_aggregation_string,
        group_by_query_columns_string,
        grouping_columns_final,
        error_dict
    )
|
|
661
|
+
|
|
662
|
+
def create_pandas_df(
    self, object: DatabaseObject, intersection_columns_trgt_src: list, where_clause: str = "", exclude_columns: list = []
) -> pd.DataFrame:
    """creates pandas dataframes with all data from given object in given columns

    Args:
        object (DatabaseObject): table or view
        intersection_columns_trgt_src (list): columns existing in source and target
        where_clause (str, optional): SQL filter appended to the query.
        exclude_columns (list, optional): columns removed from the selection.

    Returns:
        pd.DataFrame: direct result of sql query
    """
    if self.databricks_connection is None:
        self._connect_to_databricks()

    # Select-list order follows set semantics, matching the original behavior.
    selected_columns = set(intersection_columns_trgt_src) - set(exclude_columns)
    column_list_sql = ', '.join(list(selected_columns))

    df_query = f"SELECT {column_list_sql} FROM {object.database}.{object.schema}.{object.name} {where_clause};"

    return self.execute_queries(df_query, True)
|
|
684
|
+
|
|
685
|
+
def create_pandas_df_from_sample(
    self,
    object: DatabaseObject,
    column_intersections: list,
    key_columns: list,
    where_clause: str = "",
    exclude_columns: list = None,
    key_filters: dict = None,
    dedicated_columns: list = None,
    sample_count: int = 10,
    numeric_scale: int = None,
) -> List[Dict]:
    """Fetch a TABLESAMPLE of the object and return it with key values and diagnostics.

    Args:
        object (DatabaseObject): table or view to sample from.
        column_intersections (list): columns existing in both source and target.
        key_columns (list): key columns used for ordering and for the key dict.
        where_clause (str, optional): complete WHERE clause to filter the sample. Defaults to "".
        exclude_columns (list, optional): columns to drop everywhere. Defaults to None.
        key_filters (dict, optional): column -> value-list filters applied via an IN clause. Defaults to None.
        dedicated_columns (list, optional): if set, restrict the selection to these columns. Defaults to None.
        sample_count (int, optional): number of rows for TABLESAMPLE. Defaults to 10.
        numeric_scale (int, optional): rounding scale handed to the column clause builder. Defaults to None.

    Returns:
        tuple: ([sample_pdf, error_dict], key_dict, used_columns, sample_query)
    """
    # Fix: the original declared []/{} defaults — shared mutable default arguments.
    exclude_columns = [] if exclude_columns is None else exclude_columns
    key_filters = {} if key_filters is None else key_filters
    dedicated_columns = [] if dedicated_columns is None else dedicated_columns

    if self.databricks_connection is None:
        self._connect_to_databricks()

    where_exists = True
    if not where_clause:
        where_exists = False

    sample_count = str(sample_count)
    key_intersection = list(
        (set(column_intersections) & set(key_columns)) - set(exclude_columns)
    )
    filter_intersection = list(
        (set(column_intersections) & set(key_filters.keys())) - set(exclude_columns)
    )
    dedicated_intersection = list(
        (set(column_intersections) & set(dedicated_columns)) - set(exclude_columns)
    )

    # sort for deterministic query text (set operations above are unordered)
    key_intersection.sort()
    filter_intersection.sort()
    dedicated_intersection.sort()

    if dedicated_intersection != []:
        is_dedicated = True

        dict_colummns_datatype = self.get_data_types_from_object(
            object, dedicated_intersection
        )

    else:
        is_dedicated = False

        dict_colummns_datatype = self.get_data_types_from_object(
            object, column_intersections
        )

    # Fix: in_clause was only assigned inside the key-filter branches; when
    # key_filters was empty (or no filter column intersected) the f-strings
    # below raised UnboundLocalError. Default it to "" up front.
    in_clause = ""

    if key_intersection != [] and is_dedicated:
        # str([...])[1:-1] turns the list into a comma-separated ORDER BY list
        keys = str(key_intersection)[1:-1].replace("'", "")
        column_clause, numeric_columns, used_columns = self._get_column_clause(
            dedicated_intersection,
            dict_colummns_datatype,
            numeric_scale,
            key_columns,
        )
        if (key_filters != {}) & (filter_intersection != []):
            values = list(key_filters.values())
            if values[0] != []:
                in_clause = self._get_in_clause(
                    key_filters, numeric_columns, numeric_scale, where_exists
                )
            else:
                in_clause = ""
        sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause}{in_clause} ORDER BY {keys};"
    elif key_intersection != [] and not is_dedicated:
        keys = str(key_intersection)[1:-1].replace("'", "")
        column_clause, numeric_columns, used_columns = self._get_column_clause(
            column_intersections, dict_colummns_datatype, numeric_scale, key_columns
        )
        if (key_filters != {}) & (filter_intersection != []):
            values = list(key_filters.values())
            if values[0] != []:
                in_clause = self._get_in_clause(
                    key_filters, numeric_columns, numeric_scale, where_exists
                )
            else:
                in_clause = ""
        sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause}{in_clause} ORDER BY {keys};"
    else:
        # no usable key columns: sample without ORDER BY
        column_intersections = list(
            set(column_intersections) - set(exclude_columns)
        )
        column_intersections.sort()
        column_clause, numeric_columns, used_columns = self._get_column_clause(
            column_intersections, dict_colummns_datatype, numeric_scale, key_columns
        )
        sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause};"

    error_dict = {}
    key_dict = {}
    try:
        sample_pdf = self.execute_queries(sample_query, return_as_pdf=True)
        for key in key_intersection:
            # datetime columns are stringified so key values serialize cleanly
            if pd.api.types.is_datetime64_any_dtype(sample_pdf[key]):
                key_dict[key] = list(sample_pdf[key].astype(str))
            else:
                key_dict[key] = list(sample_pdf[key])

    except Exception as err:
        # marker frame signalling a failed sample; "|||" separates query from error
        sample_pdf = pd.DataFrame()
        sample_pdf["TESTATM_ERROR"] = [1]
        if "|||" in str(err):
            error_dict = {
                "QUERY": str(err).split("|||")[0],
                "ERROR": str(err).split("|||")[1],
            }
        else:
            error_dict = {"QUERY": "No SQL Error", "ERROR": str(err)}

    return_list = []
    return_list.append(sample_pdf)
    return_list.append(error_dict)

    return return_list, key_dict, used_columns, sample_query
|
|
800
|
+
|
|
801
|
+
def execute_queries(
    self,
    query: Union[str, List[str]],
    return_as_pdf: bool = False,
    return_query_ids: bool = False,
) -> Union[List[Dict], List[List[Dict]]]:
    """actual execution of defined queries

    Args:
        query (Union[str, List[str]]): queries to be executed
        return_as_pdf (bool, optional): If true, queries returned as pandas data frames. Defaults to False.
        return_query_ids (bool, optional): If true, results and query ids are returned, otherwise only results. Defaults to False.

    Raises:
        ValueError: if query is empty/None.
        Exception: raised as "<query>|||<error>" if a single query cannot be executed.

    Returns:
        Union[List[Dict], List[List[Dict]]]: returns results or results with query-ids
    """
    if self.databricks_connection is None:
        self._connect_to_databricks()

    if query:
        query_list: List[str] = query if isinstance(query, list) else [query]
    else:
        logger.error(
            "Query defined as null - please check input for execute_queries function."
        )
        # Fix: the original only logged and fell through, crashing below with an
        # UnboundLocalError on query_list. Fail fast with a clear error instead.
        raise ValueError(
            "Query defined as null - please check input for execute_queries function."
        )

    cursor = self.databricks_connection.cursor()

    results = []
    query_ids = []

    for single_query in query_list:
        try:
            query_result = cursor.execute(single_query).fetchall()
            if return_as_pdf:
                columns = [col[0] for col in cursor.description]
                query_result = pd.DataFrame(query_result, columns=columns)

            results.append(query_result)
            query_ids.append(0)  # there is no query id returned by databricks

        except Exception as err:
            # "|||" lets callers split the failing query from the error message
            raise Exception(single_query + "|||" + str(err)) from err

    if return_query_ids:
        # NOTE(review): for a list input this yields (results[0], results, query_ids),
        # which looks unintended, but is preserved for caller compatibility.
        return (
            results[0],
            query_ids[0] if not isinstance(query, list) else results,
            query_ids,
        )

    else:
        return results[0] if not isinstance(query, list) else results
|
|
857
|
+
|
|
858
|
+
def execute_statement(self, statement: Union[str, List[str]]) -> None:
    """
    Executes one or more simple SQL statements against Databricks, discarding results.
    (The original docstring said "snowflake" - copy/paste artifact; this is the Databricks service.)
    A connection is opened lazily if none exists yet; schema/database context is
    whatever the session provides.
    Args:
        statement Union[str, List[str]] - a sql statement or a list of sql statements to execute
    Raises:
        Exception - wraps the underlying driver error via self._get_error_message,
        chaining the original exception as the cause.
    """
    if self.databricks_connection is None:
        self._connect_to_databricks()

    # normalize to a list so single statements and batches share one code path
    statement_list: List[str] = (
        statement if isinstance(statement, list) else [statement]
    )

    cursor = self.databricks_connection.cursor()

    for single_statement in statement_list:
        try:
            # strip surrounding whitespace so padded statements execute cleanly
            stripped_statement = single_statement.strip()
            _ = cursor.execute(stripped_statement)

        except Exception as err:
            raise Exception(self._get_error_message(err, single_statement)) from err
|
|
881
|
+
|
|
882
|
+
def create_schemas(self, database_name: str, schemas: List):
    """Create every schema in *schemas* if it does not already exist.

    Args:
        database_name (str): database name. NOTE(review): not referenced by the
            generated DDL - presumably the session context resolves the database.
        schemas (List): schema names to create.
    """
    ddl_statements = [f"CREATE SCHEMA IF NOT EXISTS {schema}" for schema in schemas]
    self.execute_statement(ddl_statements)
|
|
889
|
+
|
|
890
|
+
def insert_json_results(
    self,
    run_guid: str,
    pipeline_name: str,
    pipeline_id: str,
    start_time_utc: str,
    result_table: str,
    results: dict,
) -> None:
    """
    copy into - result table for json results

    Ensures the result table exists, then inserts one row carrying the run
    metadata, the stringified results (single quotes swapped for double quotes
    to approximate JSON) and a UTC creation timestamp.
    """

    ddl = f"CREATE TABLE IF NOT EXISTS {result_table} (RUN_GUID STRING, PIPELINE_NAME STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, RESULT STRING, CREATION_TIME_UTC STRING)"
    self.execute_statement(ddl)

    row_values = (
        result_table,
        run_guid,
        pipeline_name,
        pipeline_id,
        start_time_utc,
        str(results).replace("'", '"'),
        datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S"),
    )
    insert_statement = "INSERT INTO {} VALUES ('{}', '{}', '{}', '{}', '{}', '{}');".format(
        *row_values
    )
    self.execute_statement(insert_statement)
|
|
920
|
+
|
|
921
|
+
def insert_json_results_live(
    self,
    run_guid: str,
    pipeline_name: str,
    pipeline_id: str,
    result_table: str,
    stage_name: str,
    source_system: str,
    target_system: str,
    database: str,
    schema: str,
    object: str,
) -> None:
    """
    copy into - result table for json results live

    Loads staged JSON files from *stage_name* into *result_table*, stamping each
    row with the run/pipeline metadata and the compared object's coordinates.
    The JSON file format is looked up in the result table's own database
    (<result_database>.meta_data.ff_json).
    """
    # everything before the first "." is the database of the result table
    result_database = result_table.partition(".")[0]

    statement = f"COPY INTO {result_table} (RUN_GUID, PIPELINE_NAME, PIPELINE_ID, SOURCE_SYSTEM, TARGET_SYSTEM, DATABASE_NAME, SCHEMA_NAME, OBJECT_NAME ,RESULT, CREATION_TS) FROM (SELECT '{run_guid}', '{pipeline_name}', '{pipeline_id}', '{source_system}', '{target_system}', '{database}', '{schema}', '{object}', $1, SYSDATE() from @{stage_name} (file_format => {result_database}.meta_data.ff_json ));"

    self.execute_statement(statement)
|
|
942
|
+
|
|
943
|
+
def insert_highlevel_results(
    self,
    results: dict,
    run_guid: str,
    pipeline_name: str,
    pipeline_id: str,
    result_table_highlevel: str,
) -> None:
    """
    insert into - highlevel results per "pipeline run" / "generic testing tool execution"

    Fix: the CREATE TABLE statement was missing the ALL_DATATYPES_EQUAL column
    even though the INSERT below writes it, so inserting into a freshly created
    table failed. The column is now part of the DDL.
    """

    statement = f"CREATE TABLE IF NOT EXISTS {result_table_highlevel} (RUN_GUID STRING, PIPELINE_NAME STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, SOURCE_SYSTEM STRING, TARGET_SYSTEM STRING, DATABASE_NAME STRING, TESTSET STRING, ALL_OBJECTS_MATCHING BOOLEAN, ALL_COLUMNS_EQUAL BOOLEAN, ALL_DATATYPES_EQUAL BOOLEAN, ALL_ROWCOUNTS_EQUAL BOOLEAN, ALL_CHECKSUMS_EQUAL BOOLEAN, ALL_SAMPLES_EQUAL BOOLEAN, ALL_OBJECTS_EQUAL BOOLEAN, OBJECTS_TO_COMPARE_SRC STRING, OBJECTS_TO_COMPARE_TRGT STRING, NUMBER_OF_OBJECTS_TO_COMPARE INT, SRC_MINUS_TRGT STRING, TRGT_MINUS_SRC STRING, CREATION_TS_UTC STRING)"

    self.execute_statement(statement)

    # flatten list-valued result entries into comma-separated strings
    TESTSET_ = ", ".join(results["TESTSET"])

    OBJECTS_TO_COMPARE_SRC_ = ", ".join(results["OBJECTS_TO_COMPARE_SRC"])

    OBJECTS_TO_COMPARE_TRGT_ = ", ".join(results["OBJECTS_TO_COMPARE_TRGT"])

    SRC_MINUS_TRGT_ = ", ".join(results["SRC_MINUS_TRGT"])

    TRGT_MINUS_SRC_ = ", ".join(results["TRGT_MINUS_SRC"])

    date_utc = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")

    # NULLIF(..., 'None') maps Python None (stringified) back to SQL NULL
    insert_statement = f"INSERT INTO {result_table_highlevel} ( \
        RUN_GUID, \
        PIPELINE_NAME, \
        PIPELINE_ID, \
        START_TIME_UTC, \
        SOURCE_SYSTEM, \
        TARGET_SYSTEM, \
        DATABASE_NAME, \
        TESTSET, \
        ALL_OBJECTS_MATCHING, \
        ALL_COLUMNS_EQUAL, \
        ALL_DATATYPES_EQUAL, \
        ALL_ROWCOUNTS_EQUAL, \
        ALL_CHECKSUMS_EQUAL, \
        ALL_SAMPLES_EQUAL, \
        ALL_OBJECTS_EQUAL, \
        OBJECTS_TO_COMPARE_SRC, \
        OBJECTS_TO_COMPARE_TRGT, \
        NUMBER_OF_OBJECTS_TO_COMPARE, \
        SRC_MINUS_TRGT, \
        TRGT_MINUS_SRC, \
        CREATION_TS_UTC) \
        VALUES \
        ('{run_guid}', \
        '{pipeline_name}', \
        '{pipeline_id}', \
        '{results['START_TIME_UTC']}', \
        '{results['SOURCE_SYSTEM']}', \
        '{results['TARGET_SYSTEM']}', \
        '{results['DATABASE_NAME']}', \
        '{TESTSET_}', \
        '{results['ALL_OBJECTS_MATCHING']}', \
        '{results['ALL_COLUMNS_EQUAL']}', \
        '{results['ALL_DATATYPES_EQUAL']}', \
        '{results['ALL_ROWCOUNTS_EQUAL']}', \
        '{results['ALL_CHECKSUMS_EQUAL']}', \
        NULLIF('{results['ALL_SAMPLES_EQUAL']}', 'None'), \
        NULLIF('{results['ALL_OBJECTS_EQUAL']}', 'None'), \
        '{OBJECTS_TO_COMPARE_SRC_}', \
        '{OBJECTS_TO_COMPARE_TRGT_}', \
        '{results['NUMBER_OF_OBJECTS_TO_COMPARE']}', \
        '{SRC_MINUS_TRGT_}', \
        '{TRGT_MINUS_SRC_}', \
        '{date_utc}')"

    self.execute_statement(insert_statement)
|
|
1017
|
+
|
|
1018
|
+
def insert_objectlevel_results(
    self,
    result_table: str,
    result_table_objectlevel: str,
    run_guid: str,
    results: dict,
) -> None:
    """
    insert into - detailed results per object

    Reads the raw JSON result strings for this run from ``result_table`` and
    writes one row per compared object into ``result_table_objectlevel``.

    Args:
        result_table: fully qualified name of the raw (JSON) result table.
        result_table_objectlevel: fully qualified target table for per-object rows.
        run_guid: identifier of the run whose results are exploded.
        results: NOTE(review) - never referenced in this method; presumably kept
            for signature parity with sibling services. TODO confirm.
    """
    date_utc = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")

    # target table is created on demand so the INSERT below never hits a missing table
    statement = f"CREATE TABLE IF NOT EXISTS {result_table_objectlevel} (RUN_GUID STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, SRC_DATABASE_NAME STRING, SRC_SCHEMA_NAME STRING, SRC_OBJECT_NAME STRING, SRC_OBJECT_TYPE STRING, TRGT_DATABASE_NAME STRING, TRGT_SCHEMA_NAME STRING, TRGT_OBJECT_NAME STRING, TRGT_OBJECT_TYPE STRING, SRC_FILTER STRING, TRGT_FILTER STRING, EXCLUDED_COLUMNS STRING, COLUMNS_EQUAL BOOLEAN, COLUMN_INTERSECTION STRING,SRC_COLUMNS_MINUS_TRGT_COLUMNS STRING, TRGT_COLUMNS_MINUS_SRC_COLUMNS STRING, ROW_COUNTS_EQUAL BOOLEAN, SRC_ROW_COUNT INT, TRGT_ROW_COUNT INT, ALL_COUNT_NULLS_EQUAL BOOLEAN, AGGREGATIONS_EQUAL BOOLEAN, SRC_ERROR_QUERY STRING, TRGT_ERROR_QUERY STRING, SRC_ERROR_MSG STRING, TRGT_ERROR_MSG STRING, GROUP_BY_COLUMNS STRING,GROUP_BY_EQUAL BOOLEAN, GROUP_BY_VALUES_WITH_MISMATCHES STRING, COLUMNS_WITH_MISMATCH STRING, SRC_GROUP_BY_QUERY STRING, TRGT_GROUP_BY_QUERY STRING, SRC_GROUP_BY_ERROR STRING, TRGT_GROUP_BY_ERROR STRING, SAMPLES_COMPARED BOOLEAN,SAMPLES_EQUAL BOOLEAN, SAMPLE_KEYS STRING, SRC_SAMPLE STRING, TRGT_SAMPLE STRING, SRC_SAMPLE_QUERY STRING, TRGT_SAMPLE_QUERY STRING, SRC_SAMPLE_ERROR_MSG STRING, TRGT_SAMPLE_ERROR_MSG STRING, PANDAS_DATAFRAME_COMPARED BOOLEAN, PANDAS_DATAFRAME_EQUAL BOOLEAN, SRC_NOT_ALTERED_DURING_COMPARISON BOOLEAN, TRGT_NOT_ALTERED_DURING_COMPARISON BOOLEAN, SRC_LAST_ALTERED STRING, TRGT_LAST_ALTERED STRING, CREATION_TS_UTC STRING)"

    self.execute_statement(statement)

    dict_list = self.get_objects_in_result_column(result_table, run_guid)

    """
    Now, we have to extract all the information in the dicts manually to
    insert them in the query. We write one line for each object one by one.
    """

    # WARNING(review): hand-rolled JSON parsing below. Each field is peeled off
    # the raw string by splitting on its textual delimiter (",", "],", "},", or
    # a literal ', "KEY' marker) and then regex-extracting the value. The key
    # order inside the stored result string is therefore load-bearing; any new
    # or reordered key in the result payload silently breaks the extraction.
    for element in dict_list:
        elem = element
        # simple '"KEY": "value"' fields: split off one comma-terminated chunk at a time
        help_str, elem = elem.split(",", 1)
        src_database_name = re.sub(
            r"(.*)\"SRC_DATABASE_NAME\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        src_schema_name = re.sub(
            r"(.*)\"SRC_SCHEMA_NAME\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        src_object_name = re.sub(
            r"(.*)\"SRC_OBJECT_NAME\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        src_object_type = re.sub(
            r"(.*)\"SRC_OBJECT_TYPE\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_database_name = re.sub(
            r"(.*)\"TRGT_DATABASE_NAME\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_schema_name = re.sub(
            r"(.*)\"TRGT_SCHEMA_NAME\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_object_name = re.sub(
            r"(.*)\"TRGT_OBJECT_NAME\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_object_type = re.sub(
            r"(.*)\"TRGT_OBJECT_TYPE\":\s\"(.*)\"", r"\2", help_str
        )
        # unquoted scalar fields: value captured without surrounding quotes
        help_str, elem = elem.split(",", 1)
        src_filter = re.sub(
            r"(.*)\"SRC_FILTER\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_filter = re.sub(
            r"(.*)\"TRGT_FILTER\":\s(.*)", r"\2", help_str
        )
        # list-valued fields: split on "]," and restore the closing bracket
        help_str, elem = elem.split("],", 1)
        help_str = help_str + "]"
        excluded_columns = re.sub(
            r"(.*)\"EXCLUDED_COLUMNS\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        columns_equal = re.sub(r"(.*)\"COLUMNS_EQUAL\":\s(.*)", r"\2", help_str)
        help_str, elem = elem.split("],", 1)
        help_str = help_str + "]"
        column_intersection = re.sub(
            r"(.*)\"COLUMN_INTERSECTION\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split("],", 1)
        help_str = help_str + "]"
        src_columns_minus_trgt_columns = re.sub(
            r"(.*)\"SRC_COLUMNS_MINUS_TRGT_COLUMNS\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split("],", 1)
        help_str = help_str + "]"
        trgt_columns_minus_src_columns = re.sub(
            r"(.*)\"TRGT_COLUMNS_MINUS_SRC_COLUMNS\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        row_counts_equal = re.sub(
            r"(.*)\"ROW_COUNTS_EQUAL\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        src_row_count = re.sub(
            r"(.*)\"SRC_ROW_COUNT\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_row_count = re.sub(
            r"(.*)\"TRGT_ROW_COUNT\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        all_count_nulls_equal = re.sub(
            r"(.*)\"ALL_COUNT_NULLS_EQUAL\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        aggregations_equal = re.sub(
            r"(.*)\"AGGREGATIONS_EQUAL\":\s(.*)", r"\2", help_str
        )
        # dict-valued fields: split on "}," and restore the closing brace
        help_str, elem = elem.split("},", 1)
        help_str = help_str + "}"
        src_error = re.sub(
            r"(.*)\"SRC_ERROR\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split("},", 1)
        help_str = help_str + "}"
        trgt_error = re.sub(
            r"(.*)\"TRGT_ERROR\":\s(.*)", r"\2", help_str
        )
        # fields with ambiguous delimiters: split on the literal next key and
        # re-prepend it so the remainder stays parseable
        help_str, elem = elem.split(', "SRC_GROUP_BY_QUERY', 1)
        elem = '"SRC_GROUP_BY_QUERY' + elem
        group_by_columns = re.sub(
            r"(.*)\"GROUP_BY_COLUMNS\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        src_group_by_query = re.sub(
            r"(.*)\"SRC_GROUP_BY_QUERY\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_group_by_query = re.sub(
            r"(.*)\"TRGT_GROUP_BY_QUERY\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        group_by_equal = re.sub(r"(.*)\"GROUP_BY_EQUAL\":\s(.*)", r"\2", help_str)
        help_str, elem = elem.split(', "COLUMNS_WITH_MISMATCH', 1)
        elem = '"COLUMNS_WITH_MISMATCH' + elem
        group_by_values_with_mismatches = re.sub(
            r"(.*)\"GROUP_BY_VALUES_WITH_MISMATCHES\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(', "SRC_GROUP_BY_ERROR', 1)
        elem = '"SRC_GROUP_BY_ERROR' + elem
        columns_with_mismatch = re.sub(
            r"(.*)\"COLUMNS_WITH_MISMATCH\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(', "TRGT_GROUP_BY_ERROR', 1)
        elem = '"TRGT_GROUP_BY_ERROR' + elem
        src_group_by_error = re.sub(
            r"(.*)\"SRC_GROUP_BY_ERROR\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(', "SAMPLES_COMPARED', 1)
        elem = '"SAMPLES_COMPARED' + elem
        trgt_group_by_error = re.sub(
            r"(.*)\"TRGT_GROUP_BY_ERROR\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        samples_compared = re.sub(
            r"(.*)\"SAMPLES_COMPARED\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        samples_equal = re.sub(
            r"(.*)\"SAMPLES_EQUAL\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split("},", 1)
        help_str = help_str + "}"
        sample_keys = re.sub(
            r"(.*)\"SAMPLE_KEYS\":\s(.*)", r"\2", help_str
        )
        # nested-dict samples terminate with "}},"
        help_str, elem = elem.split("}},", 1)
        help_str = help_str + "}}"
        src_sample = re.sub(
            r"(.*)\"SRC_SAMPLE\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split("}},", 1)
        help_str = help_str + "}}"
        trgt_sample = re.sub(
            r"(.*)\"TRGT_SAMPLE\":\s(.*)", r"\2", help_str
        )
        # sample queries end with a literal ';"' inside the payload
        help_str, elem = elem.split(';",', 1)
        help_str = help_str + ';"'
        src_sample_query = re.sub(
            r"(.*)\"SRC_SAMPLE_QUERY\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(';",', 1)
        help_str = help_str + ';"'
        trgt_sample_query = re.sub(
            r"(.*)\"TRGT_SAMPLE_QUERY\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split("},", 1)
        help_str = help_str + "}"
        src_sample_error_dict = re.sub(
            r"(.*)\"SRC_SAMPLE_ERROR_DICT\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split("},", 1)
        help_str = help_str + "}"
        trgt_sample_error_dict = re.sub(
            r"(.*)\"TRGT_SAMPLE_ERROR_DICT\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        pandas_dataframe_compared = re.sub(
            r"(.*)\"PANDAS_DATAFRAME_COMPARED\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        pandas_dataframe_equal = re.sub(
            r"(.*)\"PANDAS_DATAFRAME_EQUAL\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        src_not_altered_during_comparison = re.sub(
            r"(.*)\"SRC_NOT_ALTERED_DURING_COMPARISON\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_not_altered_during_comparison = re.sub(
            r"(.*)\"TRGT_NOT_ALTERED_DURING_COMPARISON\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        src_last_altered = re.sub(
            r"(.*)\"SRC_LAST_ALTERED\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_last_altered = re.sub(r"(.*)\"TRGT_LAST_ALTERED\":\s(.*)", r"\2", help_str)

        # the rest in elem is not used for this table

        # One INSERT per object. The CTEs unpack the QUERY/ERROR pairs of the
        # extracted error dicts via json_tuple; the CASE expressions prefer the
        # error's own query text when an error dict is present ('{}' means none).
        insert_statement = f"INSERT INTO {result_table_objectlevel} ( \
            RUN_GUID, \
            PIPELINE_ID, \
            START_TIME_UTC, \
            SRC_DATABASE_NAME, \
            SRC_SCHEMA_NAME, \
            SRC_OBJECT_NAME, \
            SRC_OBJECT_TYPE, \
            TRGT_DATABASE_NAME, \
            TRGT_SCHEMA_NAME, \
            TRGT_OBJECT_NAME, \
            TRGT_OBJECT_TYPE, \
            SRC_FILTER, \
            TRGT_FILTER, \
            EXCLUDED_COLUMNS, \
            COLUMNS_EQUAL, \
            COLUMN_INTERSECTION, \
            SRC_COLUMNS_MINUS_TRGT_COLUMNS, \
            TRGT_COLUMNS_MINUS_SRC_COLUMNS, \
            ROW_COUNTS_EQUAL, \
            SRC_ROW_COUNT, \
            TRGT_ROW_COUNT, \
            ALL_COUNT_NULLS_EQUAL, \
            AGGREGATIONS_EQUAL, \
            SRC_ERROR_QUERY , \
            TRGT_ERROR_QUERY, \
            SRC_ERROR_MSG, \
            TRGT_ERROR_MSG, \
            GROUP_BY_COLUMNS, \
            GROUP_BY_EQUAL, \
            GROUP_BY_VALUES_WITH_MISMATCHES, \
            COLUMNS_WITH_MISMATCH, \
            SRC_GROUP_BY_QUERY, \
            TRGT_GROUP_BY_QUERY, \
            SRC_GROUP_BY_ERROR, \
            TRGT_GROUP_BY_ERROR, \
            SAMPLES_COMPARED, \
            SAMPLES_EQUAL, \
            SAMPLE_KEYS, \
            SRC_SAMPLE, \
            TRGT_SAMPLE, \
            SRC_SAMPLE_QUERY, \
            TRGT_SAMPLE_QUERY, \
            SRC_SAMPLE_ERROR_MSG, \
            TRGT_SAMPLE_ERROR_MSG, \
            PANDAS_DATAFRAME_COMPARED, \
            PANDAS_DATAFRAME_EQUAL, \
            SRC_NOT_ALTERED_DURING_COMPARISON, \
            TRGT_NOT_ALTERED_DURING_COMPARISON, \
            SRC_LAST_ALTERED, \
            TRGT_LAST_ALTERED, \
            CREATION_TS_UTC) \
            WITH group_error_src AS (SELECT\
            json_tuple('{src_group_by_error}', 'QUERY', 'ERROR') AS (grouping_errors_src_query, grouping_errors_src_error)\
            ),\
            group_error_trgt AS (SELECT\
            json_tuple('{trgt_group_by_error}', 'QUERY', 'ERROR') AS (grouping_errors_trgt_query, grouping_errors_trgt_error)\
            ),\
            src_error AS (SELECT\
            json_tuple('{src_error}', 'QUERY', 'ERROR') AS (src_error_query, src_error_error)\
            ),\
            trgt_error AS (SELECT\
            json_tuple('{trgt_error}', 'QUERY', 'ERROR') AS (trgt_error_query, trgt_error_error)\
            ),\
            src_sample_error AS (SELECT\
            json_tuple('{src_sample_error_dict}', 'QUERY', 'ERROR') AS (src_sample_error_dict_query, src_sample_error_dict_error)\
            ),\
            trgt_sample_error AS (SELECT\
            json_tuple('{trgt_sample_error_dict}', 'QUERY', 'ERROR') AS (trgt_sample_error_dict_query, trgt_sample_error_dict_error)\
            )\
            SELECT\
            RESULTS.RUN_GUID AS RUN_GUID, \
            RESULTS.PIPELINE_ID AS PIPELINE_ID, \
            RESULTS.START_TIME_UTC::STRING AS START_TIME_UTC, \
            '{src_database_name}' AS SRC_DATABASE_NAME, \
            '{src_schema_name}' AS SRC_SCHEMA_NAME, \
            '{src_object_name}' AS SRC_OBJECT_NAME, \
            '{src_object_type}' AS SRC_OBJECT_TYPE, \
            '{trgt_database_name}' AS TRGT_DATABASE_NAME, \
            '{trgt_schema_name}' AS TRGT_SCHEMA_NAME, \
            '{trgt_object_name}' AS TRGT_OBJECT_NAME, \
            '{trgt_object_type}' AS TRGT_OBJECT_TYPE, \
            '{src_filter}' AS SRC_FILTER, \
            '{trgt_filter}' AS TRGT_FILTER, \
            '{excluded_columns}' AS EXCLUDED_COLUMNS, \
            '{columns_equal}'::BOOLEAN AS COLUMNS_EQUAL, \
            '{column_intersection}'::BOOLEAN AS COLUMN_INTERSECTION, \
            '{src_columns_minus_trgt_columns}' AS SRC_COLUMNS_MINUS_TRGT_COLUMNS, \
            '{trgt_columns_minus_src_columns}' AS TRGT_COLUMNS_MINUS_SRC_COLUMNS, \
            '{row_counts_equal}'::BOOLEAN AS ROW_COUNTS_EQUAL, \
            '{src_row_count}'::INT AS SRC_ROW_COUNT, \
            '{trgt_row_count}'::INT AS TRGT_ROW_COUNT, \
            '{all_count_nulls_equal}'::BOOLEAN AS ALL_COUNT_NULLS_EQUAL, \
            '{aggregations_equal}'::BOOLEAN AS AGGREGATIONS_EQUAL, \
            src_error_query::STRING AS SRC_ERROR_QUERY, \
            trgt_error_query::STRING AS TRGT_ERROR_QUERY, \
            src_error_error::STRING AS SRC_ERROR_MSG, \
            trgt_error_error::STRING AS TRGT_ERROR_MSG, \
            '{group_by_columns}' AS GROUP_BY_COLUMNS, \
            '{group_by_equal}'::BOOLEAN AS GROUP_BY_EQUAL, \
            '{group_by_values_with_mismatches}' AS GROUP_BY_VALUES_WITH_MISMATCHES, \
            '{columns_with_mismatch}' AS COLUMNS_WITH_MISMATCH, \
            CASE WHEN '{src_group_by_error}'::STRING = '{{}}' \
            THEN NULLIF('{src_group_by_query}'::STRING, '') \
            WHEN'{src_group_by_error}'::STRING != '{{}}' \
            THEN NULLIF(grouping_errors_src_query::STRING, '') \
            END AS SRC_GROUP_BY_QUERY, \
            CASE WHEN '{trgt_group_by_error}'::STRING = '{{}}' \
            THEN NULLIF('{trgt_group_by_query}'::STRING, '') \
            WHEN '{trgt_group_by_error}'::STRING != '{{}}' \
            THEN NULLIF(grouping_errors_trgt_query::STRING, '') \
            END AS TRGT_GROUP_BY_QUERY, \
            CASE WHEN '{src_group_by_error}'::STRING = '{{}}' \
            THEN NULL \
            ELSE '{src_group_by_error}'::STRING \
            END AS SRC_GROUP_BY_ERROR, \
            CASE WHEN '{trgt_group_by_error}'::STRING = '{{}}' \
            THEN NULL \
            ELSE '{trgt_group_by_error}'::STRING \
            END AS TRGT_GROUP_BY_ERROR, \
            '{samples_compared}'::BOOLEAN AS SAMPLES_COMPARED, \
            '{samples_equal}'::BOOLEAN AS SAMPLES_EQUAL, \
            '{sample_keys}' AS SAMPLE_KEYS, \
            '{src_sample}' AS SRC_SAMPLE, \
            '{trgt_sample}' AS TRGT_SAMPLE, \
            '{src_sample_query}' AS SRC_SAMPLE_QUERY, \
            '{trgt_sample_query}' AS TRGT_SAMPLE_QUERY, \
            src_sample_error_dict_error::STRING AS SRC_SAMPLE_ERROR_MSG, \
            trgt_sample_error_dict_error::STRING AS TRGT_SAMPLE_ERROR_MSG, \
            '{pandas_dataframe_compared}'::BOOLEAN AS PANDAS_DATAFRAME_COMPARED, \
            '{pandas_dataframe_equal}'::BOOLEAN AS PANDAS_DATAFRAME_EQUAL, \
            '{src_not_altered_during_comparison}'::BOOLEAN AS SRC_NOT_ALTERED_DURING_COMPARISON, \
            '{trgt_not_altered_during_comparison}'::BOOLEAN AS TRGT_NOT_ALTERED_DURING_COMPARISON, \
            '{src_last_altered}'::STRING AS SRC_LAST_ALTERED, \
            '{trgt_last_altered}'::STRING AS TRGT_LAST_ALTERED, \
            '{date_utc}' \
            FROM {result_table} RESULTS, group_error_src, group_error_trgt, src_error, trgt_error, src_sample_error, trgt_sample_error \
            WHERE RUN_GUID = '{run_guid}'\
            ;"

        self.execute_statement(insert_statement)
|
|
1379
|
+
|
|
1380
|
+
def insert_columnlevel_results(
    self,
    result_table: str,
    result_table_columnlevel: str,
    run_guid: str,
) -> None:
    """
    Insert detailed results per column into the column-level result table.

    Fetches the object-level result strings for the given run from
    ``result_table`` (via ``get_objects_in_result_column``), string-parses
    each object's "almost JSON" result dictionary to extract the object
    metadata and the per-column comparison entries, and inserts one row per
    column into ``result_table_columnlevel``. The target table is created
    first if it does not exist.

    Note: the parsing below is positional string surgery — each ``split``
    consumes exactly one key/value pair of the serialized result dict, so
    the order of the split calls must match the serialization order.

    Args:
        result_table: fully qualified name of the object-level result table.
        result_table_columnlevel: fully qualified name of the column-level
            result table to create/fill.
        run_guid: identifier of the comparison run whose results are expanded.
    """

    # Timestamp recorded in CREATION_TS_UTC for all rows of this call.
    date_utc = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")

    statement = f"CREATE TABLE IF NOT EXISTS {result_table_columnlevel} (RUN_GUID STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, SRC_DATABASE_NAME STRING, SRC_SCHEMA_NAME STRING, SRC_OBJECT_NAME STRING, SRC_OBJECT_TYPE STRING, TRGT_DATABASE_NAME STRING, TRGT_SCHEMA_NAME STRING, TRGT_OBJECT_NAME STRING, TRGT_OBJECT_TYPE STRING, COLUMN_NAME STRING, IN_SRC BOOLEAN, IN_TRGT BOOLEAN, IN_SYNC BOOLEAN, IN_EXCLUDED BOOLEAN, SRC_DATATYPE STRING, TRGT_DATATYPE STRING, AGGREGATION_TYPE STRING, AGGREGATION_EQUAL BOOLEAN, AGGREGATION_RESULT_SRC STRING, AGGREGATION_RESULT_TRGT STRING, AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC STRING, COUNT_NULLS_EQUAL BOOLEAN, COUNT_NULLS_SRC STRING, COUNT_NULLS_TRGT STRING, COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC STRING, ERROR_QUERY_SRC STRING, ERROR_MSG_SRC STRING, ERROR_QUERY_TRGT STRING, ERROR_MSG_TRGT STRING, ERROR_FLAG BOOLEAN, CREATION_TS_UTC STRING);"

    self.execute_statement(statement)

    dict_list = self.get_objects_in_result_column(result_table, run_guid)

    # extract the information needed for the table on object level
    for element in dict_list:
        elem = element
        help_str, elem = elem.split(",", 1)
        src_database_name = re.sub(
            r"(.*)\"SRC_DATABASE_NAME\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        src_schema_name = re.sub(
            r"(.*)\"SRC_SCHEMA_NAME\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        src_object_name = re.sub(
            r"(.*)\"SRC_OBJECT_NAME\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        src_object_type = re.sub(
            r"(.*)\"SRC_OBJECT_TYPE\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_database_name = re.sub(
            r"(.*)\"TRGT_DATABASE_NAME\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_schema_name = re.sub(
            r"(.*)\"TRGT_SCHEMA_NAME\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_object_name = re.sub(
            r"(.*)\"TRGT_OBJECT_NAME\":\s\"(.*)\"", r"\2", help_str
        )
        help_str, elem = elem.split(",", 1)
        trgt_object_type = re.sub(
            r"(.*)\"TRGT_OBJECT_TYPE\":\s\"(.*)\"", r"\2", help_str
        )
        # SRC_FILTER / TRGT_FILTER are not needed for the column level table;
        # the splits only advance the parse cursor past those pairs.
        help_str, elem = elem.split(",", 1)  # SRC_FILTER not needed for column level table
        help_str, elem = elem.split(",", 1)  # TRGT_FILTER not needed for column level table
        help_str, elem = elem.split("],", 1)
        help_str = help_str + "]"  # EXCLUDED_COLUMNS not needed for column level table
        help_str, elem = elem.split(",", 1)  # COLUMNS_EQUAL not needed for column level table
        help_str, elem = elem.split("],", 1)  # COLUMN_INTERSECTION not needed for column level table
        help_str, elem = elem.split("],", 1)  # SRC_COLUMNS_MINUS_TRGT_COLUMNS not needed for column level table
        help_str = help_str + "]"  # SRC_COLUMNS_MINUS_TRGT_COLUMNS not needed for column level table
        help_str, elem = elem.split("],", 1)  # TRGT_COLUMNS_MINUS_SRC_COLUMNS not needed for column level table
        help_str, elem = elem.split(",", 1)  # ROW_COUNTS_EQUAL not needed for column level table
        help_str, elem = elem.split(",", 1)  # SRC_ROW_COUNT not needed for column level table
        help_str, elem = elem.split(",", 1)  # TRGT_ROW_COUNT not needed for column level table
        help_str, elem = elem.split(",", 1)  # ALL_COUNT_NULLS_EQUAL not needed for column level table
        help_str, elem = elem.split(",", 1)  # AGGREGATIONS_EQUAL not needed for column level table
        # SRC_ERROR / TRGT_ERROR are nested dicts -> split at "}," and restore
        # the closing brace that the split consumed.
        help_str, elem = elem.split("},", 1)
        help_str = help_str + "}"
        src_error = re.sub(
            r"(.*)\"SRC_ERROR\":\s(.*)", r"\2", help_str
        )
        help_str, elem = elem.split("},", 1)
        help_str = help_str + "}"
        trgt_error = re.sub(
            r"(.*)\"TRGT_ERROR\":\s(.*)", r"\2", help_str
        )
        # The following values may themselves contain commas, so split at the
        # next known key and re-attach it to the remainder.
        help_str, elem = elem.split(', "SRC_GROUP_BY_QUERY', 1)  # GROUP_BY_COLUMNS not needed for column level table
        elem = '"SRC_GROUP_BY_QUERY' + elem
        help_str, elem = elem.split(",", 1)  # SRC_GROUP_BY_QUERY not needed for column level table
        help_str, elem = elem.split(",", 1)  # TRGT_GROUP_BY_QUERY not needed for column level table
        help_str, elem = elem.split(",", 1)  # GROUP_BY_EQUAL not needed for column level table
        help_str, elem = elem.split(', "COLUMNS_WITH_MISMATCH', 1)  # GROUP_BY_VALUES_WITH_MISMATCHES not needed for column level table
        elem = '"COLUMNS_WITH_MISMATCH' + elem
        help_str, elem = elem.split(', "SRC_GROUP_BY_ERROR', 1)  # COLUMNS_WITH_MISMATCH not needed for column level table
        elem = '"SRC_GROUP_BY_ERROR' + elem
        help_str, elem = elem.split(', "TRGT_GROUP_BY_ERROR', 1)  # SRC_GROUP_BY_ERROR not needed for column level table
        elem = '"TRGT_GROUP_BY_ERROR' + elem
        help_str, elem = elem.split(', "SAMPLES_COMPARED', 1)  # TRGT_GROUP_BY_ERROR not needed for column level table
        elem = '"SAMPLES_COMPARED' + elem
        help_str, elem = elem.split(",", 1)  # SAMPLES_COMPARED not needed for column level table
        help_str, elem = elem.split(",", 1)  # SAMPLES_EQUAL not needed for column level table
        help_str, elem = elem.split("},", 1)  # SAMPLE_KEYS not needed for column level table
        help_str, elem = elem.split("}},", 1)  # SRC_SAMPLE not needed for column level table
        help_str, elem = elem.split("}},", 1)  # TRGT_SAMPLE not needed for column level table
        help_str, elem = elem.split(';",', 1)  # SRC_SAMPLE_QUERY not needed for column level table
        help_str, elem = elem.split(';",', 1)  # TRGT_SAMPLE_QUERY not needed for column level table
        help_str, elem = elem.split("},", 1)  # SRC_SAMPLE_ERROR_DICT not needed for column level table
        help_str, elem = elem.split("},", 1)  # TRGT_SAMPLE_ERROR_DICT not needed for column level table
        help_str, elem = elem.split(",", 1)  # PANDAS_DATAFRAME_COMPARED not needed for column level table
        help_str, elem = elem.split(",", 1)  # PANDAS_DATAFRAME_EQUAL not needed for column level table
        help_str, elem = elem.split(",", 1)  # SRC_NOT_ALTERED_DURING_COMPARISON not needed for column level table
        help_str, elem = elem.split(",", 1)  # TRGT_NOT_ALTERED_DURING_COMPARISON not needed for column level table
        help_str, elem = elem.split(",", 1)  # SRC_LAST_ALTERED not needed for column level table
        help_str, elem = elem.split(",", 1)  # TRGT_LAST_ALTERED not needed for column level table
        help_str, elem = elem.split("],", 1)  # ALL_COLUMNS not needed for column level table
        help_str, elem = elem.split("}]}", 1)
        help_str = help_str + "}]"
        # Pull the list of per-column dicts out of the COLUMNS entry and cut
        # it into one string per column (re-appending the "}" consumed by split).
        columns_liststr = re.search(r'(.*)"COLUMNS":\s\[(.*)\]', help_str).group(2)
        columns_dictlist = columns_liststr.split("}")
        columns_dictlist = [
            dictionary + "}"
            for dictionary in columns_dictlist
            if len(dictionary) > 0
        ]

        # extract the information needed for the table on column level
        for column in columns_dictlist:
            col = re.sub(r"^,", "", column)  # drop the leading comma of non-first entries
            help_str, col = col.split(",", 1)
            column_name = re.sub(r"(.*)\"COLUMN_NAME\":\s\"(.*)\"", r"\2", help_str)
            help_str, col = col.split(",", 1)
            in_src = re.sub(r"(.*)\"IN_SRC\":\s(.*)", r"\2", help_str)
            help_str, col = col.split(",", 1)
            in_trgt = re.sub(r"(.*)\"IN_TRGT\":\s(.*)", r"\2", help_str)
            help_str, col = col.split(",", 1)
            in_sync = re.sub(r"(.*)\"IN_SYNC\":\s(.*)", r"\2", help_str)
            help_str, col = col.split(",", 1)
            in_excluded = re.sub(r"(.*)\"IN_EXCLUDED\":\s(.*)", r"\2", help_str)
            help_str, col = col.split(",", 1)
            # None values are serialized unquoted, so the quoted-value regex
            # would not match them; handle them explicitly.
            if help_str == ' "SRC_DATATYPE": None':
                src_datatype = "None"
            else:
                src_datatype = re.sub(
                    r"(.*)\"SRC_DATATYPE\":\s\"(.*)\"", r"\2", help_str
                )
            help_str, col = col.split(",", 1)
            if help_str == ' "TRGT_DATATYPE": None':
                trgt_datatype = "None"
            else:
                trgt_datatype = re.sub(
                    r"(.*)\"TRGT_DATATYPE\":\s\"(.*)\"", r"\2", help_str
                )
            help_str, col = col.split(",", 1)
            if help_str == ' "AGGREGATION_TYPE": None':
                aggregation_type = "None"
            else:
                aggregation_type = re.sub(
                    r"(.*)\"AGGREGATION_TYPE\":\s\"(.*)\"", r"\2", help_str
                )
            help_str, col = col.split(",", 1)
            aggregation_equal = re.sub(
                r"(.*)\"AGGREGATION_EQUAL\":\s(.*)", r"\2", help_str
            )
            help_str, col = col.split(",", 1)
            aggregation_result_src = re.sub(
                r"(.*)\"AGGREGATION_RESULT_SRC\":\s(.*)", r"\2", help_str
            )
            help_str, col = col.split(",", 1)
            aggregation_result_trgt = re.sub(
                r"(.*)\"AGGREGATION_RESULT_TRGT\":\s(.*)", r"\2", help_str
            )
            help_str, col = col.split(",", 1)
            if help_str == ' "AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC": None':
                aggregation_difference_trgt_minus_src = "None"
            else:
                aggregation_difference_trgt_minus_src = re.sub(
                    r"(.*)\"AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC\":\s\"(.*)\"",
                    r"\2",
                    help_str,
                )
            help_str, col = col.split(",", 1)
            count_nulls_equal = re.sub(
                r"(.*)\"COUNT_NULLS_EQUAL\":\s(.*)", r"\2", help_str
            )
            help_str, col = col.split(",", 1)
            count_nulls_src = re.sub(
                r"(.*)\"COUNT_NULLS_SRC\":\s(.*)", r"\2", help_str
            )
            help_str, col = col.split(",", 1)
            count_nulls_trgt = re.sub(
                r"(.*)\"COUNT_NULLS_TRGT\":\s(.*)", r"\2", help_str
            )
            # Last pair of the column dict: no further split, match against col.
            if col == ' "COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC": None}':
                count_nulls_difference_trgt_minus_src = "None"
            else:
                count_nulls_difference_trgt_minus_src = re.sub(
                    r"(.*)\"COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC\":\s\"(.*)\"\}",
                    r"\2",
                    col,
                )

            # One INSERT per column: the errors_src/errors_trgt CTEs unpack the
            # object-level error dicts via json_tuple; the object metadata is
            # taken from the RESULTS row of this run.
            # Fixed: '{in_excluded}' was previously aliased AS IN_SYNC (duplicate
            # alias); it feeds the IN_EXCLUDED column and is aliased accordingly.
            insert_statement = f"INSERT INTO {result_table_columnlevel} ( \
                RUN_GUID,\
                PIPELINE_ID,\
                START_TIME_UTC,\
                SRC_DATABASE_NAME, \
                SRC_SCHEMA_NAME, \
                SRC_OBJECT_NAME, \
                SRC_OBJECT_TYPE, \
                TRGT_DATABASE_NAME, \
                TRGT_SCHEMA_NAME, \
                TRGT_OBJECT_NAME, \
                TRGT_OBJECT_TYPE, \
                COLUMN_NAME,\
                IN_SRC,\
                IN_TRGT,\
                IN_SYNC,\
                IN_EXCLUDED, \
                SRC_DATATYPE,\
                TRGT_DATATYPE,\
                AGGREGATION_TYPE,\
                AGGREGATION_EQUAL,\
                AGGREGATION_RESULT_SRC,\
                AGGREGATION_RESULT_TRGT,\
                AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC,\
                COUNT_NULLS_EQUAL,\
                COUNT_NULLS_SRC,\
                COUNT_NULLS_TRGT,\
                COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC,\
                ERROR_QUERY_SRC ,\
                ERROR_MSG_SRC ,\
                ERROR_QUERY_TRGT ,\
                ERROR_MSG_TRGT ,\
                ERROR_FLAG,\
                CREATION_TS_UTC)\
                WITH errors_src AS (SELECT\
                json_tuple('{src_error}', 'QUERY', 'ERROR') AS (ERROR_QUERY_SRC, ERROR_MSG_SRC)\
                ),\
                errors_trgt AS (SELECT\
                json_tuple('{trgt_error}', 'QUERY', 'ERROR') AS (ERROR_QUERY_TRGT, ERROR_MSG_TRGT)\
                )\
                SELECT\
                RESULTS.RUN_GUID AS RUN_GUID,\
                RESULTS.PIPELINE_ID AS PIPELINE_ID,\
                RESULTS.START_TIME_UTC::STRING AS START_TIME_UTC,\
                '{src_database_name}' AS SRC_DATABASE_NAME,\
                '{src_schema_name}' AS SRC_SCHEMA_NAME,\
                '{src_object_name}' AS SRC_OBJECT_NAME,\
                '{src_object_type}' AS SRC_OBJECT_TYPE,\
                '{trgt_database_name}' AS TRGT_DATABASE_NAME,\
                '{trgt_schema_name}' AS TRGT_SCHEMA_NAME,\
                '{trgt_object_name}' AS TRGT_OBJECT_NAME,\
                '{trgt_object_type}' AS TRGT_OBJECT_TYPE,\
                '{column_name}' AS COLUMN_NAME,\
                '{in_src}'::BOOLEAN AS IN_SRC,\
                '{in_trgt}'::BOOLEAN AS IN_TRGT,\
                '{in_sync}'::BOOLEAN AS IN_SYNC,\
                '{in_excluded}'::BOOLEAN AS IN_EXCLUDED,\
                '{src_datatype}' AS SRC_DATATYPE,\
                '{trgt_datatype}' AS TRGT_DATATYPE,\
                '{aggregation_type}' AS AGGREGATION_TYPE,\
                '{aggregation_equal}'::BOOLEAN AS AGGREGATION_EQUAL,\
                '{aggregation_result_src}' AS AGGREGATION_RESULT_SRC,\
                '{aggregation_result_trgt}' AS AGGREGATION_RESULT_TRGT,\
                '{aggregation_difference_trgt_minus_src}' AS AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC,\
                '{count_nulls_equal}'::BOOLEAN AS COUNT_NULLS_EQUAL,\
                '{count_nulls_src}'::INT AS COUNT_NULLS_SRC,\
                '{count_nulls_trgt}'::INT AS COUNT_NULLS_TRGT,\
                '{count_nulls_difference_trgt_minus_src}' AS COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC,\
                ERROR_QUERY_SRC,\
                ERROR_MSG_SRC,\
                ERROR_QUERY_TRGT,\
                ERROR_MSG_TRGT,\
                CASE WHEN ERROR_MSG_SRC IS NULL AND ERROR_MSG_TRGT IS NULL THEN FALSE ELSE TRUE END AS ERROR_FLAG,\
                '{date_utc}'\
                FROM {result_table} RESULTS, errors_src, errors_trgt\
                WHERE RUN_GUID = '{run_guid}';"

            self.execute_statement(insert_statement)
|
|
1656
|
+
|
|
1657
|
+
def get_objects_in_result_column(
    self,
    result_table: str,
    run_guid: str,
):
    """
    Extract the per-object result dictionaries for a run from the result table.

    The RESULT column holds an "almost dictionary" string (quotes were
    replaced on write and it is reused elsewhere), so it cannot be parsed
    back as JSON. Instead, this method slices the "OBJECTS" list out of the
    string and cuts it into one string per compared object.

    Returns a list of dictionary-like strings, one per object.
    """

    query = (
        f"SELECT RESULT FROM {result_table} WHERE RUN_GUID = '{run_guid}'"
    )

    raw_result = self.execute_queries(query)[0][0]

    # Everything after the "OBJECTS": key, across newlines.
    objects_part = re.search(
        r'"OBJECTS":(.*)', raw_result, flags=re.DOTALL
    ).group(1)
    # Strip the closing } of the outer dictionary the objects string sits in.
    objects_part = re.sub(
        "}$", "", objects_part
    )
    # Drop the surrounding " [" ... "]" of the objects list.
    list_body = re.sub(r"^\s\[(.*)]$", r"\1", objects_part, flags=re.DOTALL)

    # Splitting at "}" would break on nested dicts, so split at the marker
    # that starts every object entry instead, restore the marker on each
    # piece, and trim the ", " separator left on all but the last entry.
    marker = '{"SRC_DATABASE_NAME"'
    return [
        re.sub(r",\s$", "", marker + piece)
        for piece in list_body.split(marker)
        if len(piece) > 0
    ]
|