icsDataValidation-1.0.358-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- icsDataValidation/configuration.py +19 -0
- icsDataValidation/connection_setups/__init__.py +0 -0
- icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
- icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
- icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
- icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
- icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
- icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
- icsDataValidation/core/__init__.py +0 -0
- icsDataValidation/core/database_objects.py +18 -0
- icsDataValidation/core/object_comparison.py +239 -0
- icsDataValidation/input_parameters/__init__.py +0 -0
- icsDataValidation/input_parameters/testing_tool_params.py +81 -0
- icsDataValidation/main.py +250 -0
- icsDataValidation/output_parameters/__init__.py +0 -0
- icsDataValidation/output_parameters/result_params.py +94 -0
- icsDataValidation/services/__init__.py +0 -0
- icsDataValidation/services/comparison_service.py +582 -0
- icsDataValidation/services/database_services/__init__.py +0 -0
- icsDataValidation/services/database_services/azure_service.py +320 -0
- icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
- icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
- icsDataValidation/services/database_services/exasol_service.py +261 -0
- icsDataValidation/services/database_services/oracle_service.py +713 -0
- icsDataValidation/services/database_services/snowflake_service.py +1100 -0
- icsDataValidation/services/database_services/teradata_service.py +665 -0
- icsDataValidation/services/initialization_service.py +103 -0
- icsDataValidation/services/result_service.py +573 -0
- icsDataValidation/services/system_service.py +61 -0
- icsDataValidation/services/testset_service.py +257 -0
- icsDataValidation/utils/__init__.py +0 -0
- icsDataValidation/utils/file_util.py +96 -0
- icsDataValidation/utils/logger_util.py +96 -0
- icsDataValidation/utils/pandas_util.py +159 -0
- icsDataValidation/utils/parallelization_util.py +52 -0
- icsDataValidation/utils/sql_util.py +14 -0
- icsDataValidation-1.0.358.dist-info/METADATA +21 -0
- icsDataValidation-1.0.358.dist-info/RECORD +40 -0
- icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
- icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1379 @@
from databricks import sql as databricks_sqlconnect
import pandas as pd
import logging
import re
from datetime import datetime

from typing import Union, List, Dict, Tuple
from pathlib import PurePath

from icsDataValidation.utils.logger_util import configure_dev_ops_logger
from icsDataValidation.core.database_objects import DatabaseObject

#########################################################################################
#########################################################################################

# Configure Dev Ops Logger

logger = logging.getLogger("Databricks_Unity_Catalog_Service")
logger.setLevel(logging.INFO)
configure_dev_ops_logger(logger)


class DatabricksUnityCatalogService(object):
    def __init__(self, connection_params: dict):
        self.connection_params = connection_params
        self.databricks_connection = None
        self.databricks_datatype_mapping = {
            "string": ["string", "array", "map", "struct"],
            "numeric": [
                "int",
                "bigint",
                "double",
                "decimal",
                "float",
                "smallint",
                "tinyint",
            ],
            "date_and_time": [
                "timestamp",
                "date",
                "interval",
                "timestamp_ntz",
                "timestamp_tz",
                "timestamp_ltz",
            ],
            "binary": ["binary"],
            "boolean": ["boolean"],
        }

    def __enter__(self):
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        if self.databricks_connection is not None:
            self.databricks_connection.close()

    def __del__(self):
        if self.databricks_connection is not None:
            self.databricks_connection.close()

    def _connect_to_databricks(self):
        self.databricks_connection = databricks_sqlconnect.connect(
            **self.connection_params
        )
        return self.databricks_connection

    @staticmethod
    def _get_error_message(exception: Exception, statement: str) -> str:
        """
        Compose error message if the execution of a statement or query fails.
        """
        if hasattr(exception, "raw_msg"):
            message = exception.raw_msg.replace("\n", " ")
        else:
            # makes sure that all kinds of errors get a message, even without a raw_msg attribute
            message = str(exception)
        if hasattr(exception, "sfqid"):
            message = message + f"\nQuery ID: {exception.sfqid}"
        return f"Databricks ERROR: {message}\nFailed statement:\n{statement}"

    @staticmethod
    def _get_in_clause(
        key_filters: dict,
        numeric_columns: list,
        numeric_scale: int,
        where_exists: bool = True,
    ) -> str:
        """generates an IN clause ready to extend the where clause; numeric values are rounded

        Args:
            key_filters (dict): expected values per key column
            numeric_columns (list): list of all numeric columns
            numeric_scale (int): number of decimal places after rounding

        Returns:
            str: in clause as string
        """
        values = list(key_filters.values())
        in_clause_values = "('"
        for j in range(len(values[0])):
            for value in values:
                in_clause_values += str(value[j]) + "','"
            in_clause_values = in_clause_values[:-2] + "),('"
        in_clause_values = in_clause_values[:-3] + ")"

        if where_exists:
            in_clause_cols = " AND (("
        else:
            in_clause_cols = " WHERE (("
        for key in key_filters.keys():
            if key in numeric_columns:
                # match the rounding applied in the select clause; fall back to 2 decimal places if no scale is set
                scale = numeric_scale if numeric_scale else 2
                in_clause_cols += f"""ROUND({key.replace("'", "")},{scale})""" + ","
            else:
                in_clause_cols += key.replace("'", "") + ","
        in_clause_cols = in_clause_cols[:-1] + ")"
        in_clause = in_clause_cols + " in (" + in_clause_values + ")"
        return in_clause

    def _get_column_clause(
        self, column_list: list, columns_datatype: list, numeric_scale, key_columns
    ) -> tuple:
        """turns a list of desired columns into a sql compatible string

        Args:
            column_list (list): list of all columns
            columns_datatype (list): datatypes of the given columns
            numeric_scale (int): number of decimal places for numeric columns
            key_columns (list): list of columns of interest

        Returns:
            tuple: column clause as string, list of numeric columns, list of used columns
        """
        column_intersections_new = []
        used_columns = []
        numeric_columns = []
        for column in column_list:
            column_datatype = next(
                x for x in columns_datatype if x["COLUMN_NAME"] == column
            )["DATA_TYPE"]

            if column in key_columns or not (
                column_datatype.lower()
                in self.databricks_datatype_mapping["date_and_time"]
            ):
                if (
                    column_datatype.lower()
                    in self.databricks_datatype_mapping["numeric"]
                ):
                    if numeric_scale:
                        column_intersections_new.append(
                            f"CAST(ROUND({column}, {numeric_scale}) as decimal(38,{numeric_scale})) as {column}"
                        )
                    else:
                        column_intersections_new.append(f"{column} as {column}")
                    used_columns.append(column)
                    numeric_columns.append(column)
                elif (
                    column_datatype.lower()
                    in self.databricks_datatype_mapping["string"]
                ):
                    column_intersections_new.append(f"{column} AS {column}")
                    used_columns.append(column)
                else:
                    column_intersections_new.append(column)
                    used_columns.append(column)

        column_intersections = column_intersections_new.copy()
        column_clause = str(column_intersections)[1:-1].replace("'", "")
        return column_clause, numeric_columns, used_columns

    def get_database_objects(
        self,
        database: str,
        schema: str = None,
        object_type_restriction: str = "include_all",
    ) -> list:
        if self.databricks_connection is None:
            self._connect_to_databricks()

        all_database_tables = []
        all_database_views = []

        if (
            object_type_restriction == "include_all"
            or object_type_restriction == "include_only_tables"
        ):
            if schema:
                query_db_tables = f"SELECT table_schema, table_name FROM {database}.information_schema.tables WHERE table_schema == '{schema.lower()}' and table_type != 'VIEW'"
            else:
                logger.error(
                    "No schema provided - please check the input for the get_database_objects function."
                )
                exit()

            all_database_tables = self.execute_queries(query_db_tables)

        if (
            object_type_restriction == "include_all"
            or object_type_restriction == "include_only_views"
        ):
            if schema:
                query_db_views = f"SELECT table_schema, table_name FROM {database}.information_schema.tables WHERE table_schema == '{schema.lower()}' and table_type == 'VIEW'"
            else:
                logger.error(
                    "No schema provided - please check the input for the get_database_objects function."
                )
                exit()

            all_database_views = self.execute_queries(query_db_views)

        database_objects = []
        for row in all_database_tables:
            database_table = (
                f'{database}.{row["table_schema"]}.{row["table_name"]}'.upper()
            )
            database_objects.append(
                {"object_identifier": database_table, "object_type": "table"}
            )
        for row in all_database_views:
            database_view = (
                f'{database}.{row["table_schema"]}.{row["table_name"]}'.upper()
            )
            database_objects.append(
                {"object_identifier": database_view, "object_type": "view"}
            )
        return database_objects

    def get_last_altered_timestamp_from_object(self, object: DatabaseObject) -> str:
        """queries last_altered timestamp for given object

        Args:
            object (str): object for comparison

        Returns:
            str: last_altered timestamp
        """
        if self.databricks_connection is None:
            self._connect_to_databricks()

        # Databricks SQL uses SET TIME ZONE; ALTER SESSION is Snowflake syntax
        self.execute_statement("SET TIME ZONE 'Europe/London';")

        query_get_last_altered = f"SELECT LAST_ALTERED FROM {object.database}.INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = '{object.name}' AND TABLE_SCHEMA = '{object.schema}';"

        last_altered = self.execute_queries(query_get_last_altered)[0]

        return last_altered

    def get_columns_from_object(self, object: DatabaseObject) -> list:
        """returns all columns from given object

        Args:
            object (DatabaseObject): table or view

        Returns:
            list: list of all columns
        """

        if self.databricks_connection is None:
            self._connect_to_databricks()

        query_show_columns = f"SELECT column_name FROM {object.database}.information_schema.columns WHERE table_schema == '{object.schema.lower()}' and table_name == '{object.name.lower()}'"

        all_columns = self.execute_queries(query_show_columns)
        columns = []

        for row in all_columns:
            columns.append(row["column_name"])

        return columns

    def get_row_count_from_object(
        self, object: DatabaseObject, where_clause: str = ""
    ) -> Tuple[int, list]:
        """gets row count from given object

        Args:
            object (DatabaseObject): table or view

        Returns:
            Tuple[int, list]: number of rows in object, list of errors
        """

        if self.databricks_connection is None:
            self._connect_to_databricks()

        # is it more efficient to select the information_schema.table view to get the rows?
        query_get_row_count = f"SELECT COUNT(*) AS ROW_COUNT FROM {object.database}.{object.schema}.{object.name} {where_clause};"
        row_count = -1
        error_list = []

        try:
            row_count = self.execute_queries(query_get_row_count)[0]["ROW_COUNT"]

        except Exception as err:
            error_list.append(str(err))
            error_list.append(query_get_row_count)

        return row_count, error_list

    def get_data_types_from_object(
        self, object: DatabaseObject, column_intersections: list
    ) -> List[Dict]:
        """returns datatypes for all intersection columns in a database object

        Args:
            object (DatabaseObject): table or view
            column_intersections (list): columns for which the data type is queried

        Returns:
            List[Dict]: columns and their datatype
        """

        if self.databricks_connection is None:
            self._connect_to_databricks()

        column_intersections = str(column_intersections)[1:-1]
        if column_intersections == "":
            column_intersections = "''"

        query_get_data_types_from_object = f"SELECT column_name, data_type FROM {object.database}.information_schema.columns WHERE table_schema == '{object.schema.lower()}' and table_name == '{object.name.lower()}'"

        table_description = self.execute_queries(query_get_data_types_from_object)

        dict_columns_datatype = []

        for row in table_description:
            dict_columns_datatype.append(
                {"COLUMN_NAME": row["column_name"], "DATA_TYPE": row["data_type"]}
            )
        return dict_columns_datatype

    def get_count_distincts_from_object(
        self,
        object: DatabaseObject,
        column_intersections: list,
        where_clause: str = "",
        exclude_columns: list = [],
    ) -> Tuple[List[Dict], list]:
        """get distinct count for every column in a database object that is in the column intersections list

        Args:
            object (DatabaseObject): table or view
            column_intersections (list): columns that are used for distinct count
            where_clause (str, optional): optional further filter. Defaults to "".
            exclude_columns (list, optional): columns to exclude from distinct count. Defaults to [].

        Returns:
            dict: distinct counts for columns
            error_list: list of failed executions for distinct counts
        """

        if self.databricks_connection is None:
            self._connect_to_databricks()

        unions = ""

        for column in column_intersections:
            if column not in exclude_columns:
                unions += f" UNION SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.database}.{object.schema}.{object.name} {where_clause}"

        query_get_count_distincts_from_object = f"{unions[6:]} ORDER BY COUNT_DISTINCT;"
        error_list = []
        try:
            dict_count_distincts = self.execute_queries(
                query_get_count_distincts_from_object
            )

        except Exception as err:
            # raise err
            dict_count_distincts = [{"COUNT_DISTINCT": 0}]
            error_list.append(
                ["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]]
            )

        return dict_count_distincts, error_list

    def get_table_size(self, object: DatabaseObject) -> int:
        """returns size of given object

        Args:
            object (DatabaseObject): table or view

        Returns:
            int: size of object
        """

        if self.databricks_connection is None:
            self._connect_to_databricks()

        query_analyze_table = f"ANALYZE TABLE {object.database}.{object.schema}.{object.name} COMPUTE STATISTICS NOSCAN"
        self.execute_queries(query_analyze_table)

        query_get_table_size = (
            f"DESC EXTENDED {object.database}.{object.schema}.{object.name}"
        )

        table_description = self.execute_queries(query_get_table_size)
        size_string = [
            row["data_type"]
            for row in table_description
            if row["col_name"] == "Statistics"
        ][0]
        size = int(re.search(r"\d+", size_string).group())

        return size

    def create_checksums(
        self,
        object: DatabaseObject,
        column_intersections: list,
        where_clause: str = "",
        exclude_columns: list = [],
        numeric_scale: int = None,
    ) -> List[Dict]:
        """creates checksums for given object in compliance with given conditions

        Args:
            object (DatabaseObject): table or view
            column_intersections (list): columns that are used for checksums
            where_clause (str, optional): Optional filter criteria given as sql-usable string. Defaults to "".
            exclude_columns (list, optional): columns to exclude from calculation. Defaults to [].
            numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.

        Returns:
            List[Dict]: checksums for columns of object
        """

        if self.databricks_connection is None:
            self._connect_to_databricks()

        column_intersections = [
            f"{x.upper()}" for x in column_intersections if x not in exclude_columns
        ]

        dict_columns_datatype = self.get_data_types_from_object(
            object, column_intersections
        )

        aggregates = ""
        count_nulls = ""

        for column in column_intersections:
            column_datatype = next(
                x for x in dict_columns_datatype if x["COLUMN_NAME"] == column
            )["DATA_TYPE"]

            count_nulls += f", SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END) AS COUNTNULLS_{column}"

            if column_datatype.lower() in self.databricks_datatype_mapping["numeric"]:

                if numeric_scale:
                    aggregates += f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS sum_{column}"
                else:
                    aggregates += f", CAST(SUM({column}) AS DECIMAL(38)) AS sum_{column}"

            elif (
                column_datatype.lower() in self.databricks_datatype_mapping["string"]
                or column_datatype.lower() in self.databricks_datatype_mapping["date_and_time"]
            ):

                aggregates += f", COUNT(DISTINCT LOWER({column})) AS countdistinct_{column}"

            elif column_datatype.lower() in self.databricks_datatype_mapping["binary"]:

                aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS countdistinct_{column}"

            elif column_datatype.lower() in self.databricks_datatype_mapping["boolean"]:
                aggregates += f", MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)) || '_' || MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false)) AS aggregateboolean_{column}"

            # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY

        query_checksums = f"SELECT {aggregates[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"

        query_countnulls = f"SELECT {count_nulls[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"

        error_list = []
        checksums = {}

        try:
            checksums_results = self.execute_queries(
                [query_checksums, query_countnulls]
            )

            aggregation_results = checksums_results[0][0]
            countnulls_results = checksums_results[1][0]

            for key in aggregation_results.asDict().keys():
                aggregation = key.split("_", 1)[0].upper()
                col_name = key.split("_", 1)[1]
                value = aggregation_results[key]
                cnt_nulls = countnulls_results[f"COUNTNULLS_{col_name}"]
                checksums[col_name] = [aggregation, value, cnt_nulls]

        except Exception as err:
            # TODO: Improve error formatting
            error_list.append(["ERROR", query_checksums, str(err)])

        checksums["TESTATM_ERRORS"] = error_list

        return checksums

    def create_pandas_df_from_group_by(
        self,
        object: DatabaseObject,
        column_intersections: list,
        group_by_columns: list,
        group_by_aggregation_columns: list,
        group_by_aggregation_type: str,
        only_numeric: bool,
        where_clause: str,
        exclude_columns: list,
        numeric_scale: int = None,
    ) -> Tuple[pd.DataFrame, str, str, list, dict]:
        """execution of multiple aggregations at once

        Args:
            object (DatabaseObject): table or view
            column_intersections (list): columns existing in src and trgt
            group_by_columns (list): columns for grouping the aggregations
            group_by_aggregation_columns (list): list of columns that are supposed to be aggregated
            group_by_aggregation_type (str): choice between: only_min_max, various, various_and_min_max
            only_numeric (bool): whether to also include distinct counts or only do numeric aggregations
            where_clause (str): optional filter for aggregations, given as sql compatible where-string
            exclude_columns (list): columns to exclude from comparisons
            numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.

        Returns:
            Tuple: pandas dataframe with the aggregation results, the aggregation and grouping strings of the used sql query, the final grouping columns, and an error dict
        """

        if self.databricks_connection is None:
            self._connect_to_databricks()

        if group_by_aggregation_columns == ["all"]:
            aggregation_columns = [
                f"{column.upper()}"
                for column in column_intersections
                if (column not in group_by_columns and column not in exclude_columns)
            ]
        else:
            aggregation_columns = [
                f"{column.upper()}"
                for column in column_intersections
                if (
                    column in group_by_aggregation_columns
                    and column not in exclude_columns
                )
            ]

        group_by_query_columns_string = " "
        group_by_query_aggregation_string = ""
        grouping_columns_final = []
        error_dict = {}

        try:
            for column in group_by_columns:
                if column in column_intersections and column not in exclude_columns:
                    group_by_query_columns_string += f"{column} ,"
                    grouping_columns_final.append(column)

            group_by_query_columns_string = group_by_query_columns_string[:-1]

            dict_columns_datatype = self.get_data_types_from_object(
                object, aggregation_columns
            )

            aggregates = ""
            aggregates_min = ""

            for column in aggregation_columns:
                column_datatype = next(
                    x for x in dict_columns_datatype if x["COLUMN_NAME"] == column
                )["DATA_TYPE"]

                if (
                    column_datatype.lower()
                    in self.databricks_datatype_mapping["numeric"]
                ):
                    if numeric_scale:
                        aggregates_min += f", CAST(ROUND(MIN({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MIN_{column}, CAST(ROUND(MAX({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MAX_{column}"
                        aggregates += f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS SUM_{column}"

                    else:
                        aggregates_min += f", MIN({column}) AS MIN_{column}, MAX({column}) AS MAX_{column}"
                        aggregates += f", SUM({column}) AS SUM_{column}"

                elif not only_numeric and (
                    column_datatype.lower()
                    in self.databricks_datatype_mapping["string"]
                    or column_datatype.lower()
                    in self.databricks_datatype_mapping["date_and_time"]
                ):
                    aggregates += (
                        f", COUNT(DISTINCT LOWER({column})) AS COUNTDISTINCT_{column}"
                    )

                elif (
                    not only_numeric
                    and column_datatype.lower()
                    in self.databricks_datatype_mapping["binary"]
                ):
                    aggregates += f", COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column}::VARCHAR))) AS COUNTDISTINCT_{column}"

                elif (
                    not only_numeric
                    and column_datatype.lower()
                    in self.databricks_datatype_mapping["boolean"]
                ):
                    aggregates += f", MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = true)) || '_' || MAX((SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column} = false)) AS AGGREGATEBOOLEAN_{column}"

                # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY

            # CASE 1: min_max
            if group_by_aggregation_type == "only_min_max":
                group_by_query_aggregation_string = aggregates_min[1:]

            # CASE 2: sum, count_distinct, aggregate_boolean
            elif group_by_aggregation_type == "various":
                group_by_query_aggregation_string = aggregates[1:]

            # CASE 3: sum, count_distinct, aggregate_boolean, min_max
            elif group_by_aggregation_type == "various_and_min_max":
                group_by_query_aggregation_string = f"{aggregates_min[1:]}{aggregates}"

            query_group_by_aggregation = f"SELECT {group_by_query_columns_string}, COUNT(*) AS COUNT_OF_GROUP_BY_VALUE, {group_by_query_aggregation_string} FROM {object.database}.{object.schema}.{object.name} {where_clause} GROUP BY {group_by_query_columns_string} ORDER BY {group_by_query_columns_string};"

            group_by_aggregation_pdf = self.execute_queries(
                query_group_by_aggregation, True
            )
        except Exception as err:
            group_by_aggregation_pdf = pd.DataFrame()
            group_by_aggregation_pdf["TESTATM_ERROR"] = [1]
            if not grouping_columns_final:
                error_dict = {
                    "QUERY": "NO Group-By Columns found in the Columns Intersection. Please check if the configured Group-By Columns exist in the Table",
                    "ERROR": "NO Group-By Columns found in the Columns Intersection. Please check if the configured Group-By Columns exist in the Table",
                }
                group_by_query_aggregation_string = ""
            elif "|||" in str(err):
                error_dict = {
                    "QUERY": str(err).split("|||")[0],
                    "ERROR": str(err).split("|||")[1],
                }
            else:
                error_dict = {
                    "QUERY": "NO Query generated. Please check if the configured Grouping Columns exist in the Table",
                    "ERROR": str(err),
                }
                group_by_query_aggregation_string = ""

        return (
            group_by_aggregation_pdf,
            group_by_query_aggregation_string,
            group_by_query_columns_string,
            grouping_columns_final,
            error_dict,
        )

    def create_pandas_df(
        self,
        object: DatabaseObject,
        intersection_columns_trgt_src: list,
        where_clause: str = "",
        exclude_columns: list = [],
    ) -> pd.DataFrame:
        """creates a pandas dataframe with all data from the given object in the given columns

        Args:
            object (DatabaseObject): table or view
            intersection_columns_trgt_src (list): columns existing in source and target

        Returns:
            pd.DataFrame: direct result of the sql query
        """
        if self.databricks_connection is None:
            self._connect_to_databricks()

        intersection_columns_trgt_src_ = ", ".join(
            list(set(intersection_columns_trgt_src) - set(exclude_columns))
        )

        df_query = f"SELECT {intersection_columns_trgt_src_} FROM {object.database}.{object.schema}.{object.name} {where_clause};"

        src_pdf = self.execute_queries(df_query, True)

        return src_pdf

    def create_pandas_df_from_sample(
        self,
        object: DatabaseObject,
        column_intersections: list,
        key_columns: list,
        where_clause: str = "",
        exclude_columns: list = [],
        key_filters: dict = {},
        dedicated_columns: list = [],
        sample_count: int = 10,
        numeric_scale: int = None,
    ) -> Tuple[list, dict, list, str]:
        if self.databricks_connection is None:
            self._connect_to_databricks()

        where_exists = True
        if not where_clause:
            where_exists = False

        sample_count = str(sample_count)
        in_clause = ""  # stays empty unless key filters apply
        key_intersection = list(
            (set(column_intersections) & set(key_columns)) - set(exclude_columns)
        )
        filter_intersection = list(
            (set(column_intersections) & set(key_filters.keys())) - set(exclude_columns)
        )
        dedicated_intersection = list(
            (set(column_intersections) & set(dedicated_columns)) - set(exclude_columns)
        )

        key_intersection.sort()
        filter_intersection.sort()
        dedicated_intersection.sort()

        if dedicated_intersection != []:
            is_dedicated = True

            dict_columns_datatype = self.get_data_types_from_object(
                object, dedicated_intersection
            )

        else:
            is_dedicated = False

            dict_columns_datatype = self.get_data_types_from_object(
                object, column_intersections
            )

        if key_intersection != [] and is_dedicated:
            keys = str(key_intersection)[1:-1].replace("'", "")
            column_clause, numeric_columns, used_columns = self._get_column_clause(
                dedicated_intersection,
                dict_columns_datatype,
                numeric_scale,
                key_columns,
            )
            if (key_filters != {}) & (filter_intersection != []):
                values = list(key_filters.values())
                if values[0] != []:
                    in_clause = self._get_in_clause(
                        key_filters, numeric_columns, numeric_scale, where_exists
                    )
                else:
                    in_clause = ""
            sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause}{in_clause} ORDER BY {keys};"
        elif key_intersection != [] and not is_dedicated:
            keys = str(key_intersection)[1:-1].replace("'", "")
            column_clause, numeric_columns, used_columns = self._get_column_clause(
                column_intersections, dict_columns_datatype, numeric_scale, key_columns
            )
            if (key_filters != {}) & (filter_intersection != []):
                values = list(key_filters.values())
                if values[0] != []:
                    in_clause = self._get_in_clause(
                        key_filters, numeric_columns, numeric_scale, where_exists
                    )
                else:
                    in_clause = ""
            sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause}{in_clause} ORDER BY {keys};"
        else:
            column_intersections = list(
                set(column_intersections) - set(exclude_columns)
            )
            column_intersections.sort()
            column_clause, numeric_columns, used_columns = self._get_column_clause(
                column_intersections, dict_columns_datatype, numeric_scale, key_columns
            )
            sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause};"

        error_dict = {}
        key_dict = {}
        try:
            sample_pdf = self.execute_queries(sample_query, return_as_pdf=True)
            for key in key_intersection:
                if pd.api.types.is_datetime64_any_dtype(sample_pdf[key]):
                    key_dict[key] = list(sample_pdf[key].astype(str))
                else:
                    key_dict[key] = list(sample_pdf[key])

        except Exception as err:
            sample_pdf = pd.DataFrame()
            sample_pdf["TESTATM_ERROR"] = [1]
            if "|||" in str(err):
                error_dict = {
                    "QUERY": str(err).split("|||")[0],
                    "ERROR": str(err).split("|||")[1],
                }
            else:
                error_dict = {"QUERY": "No SQL Error", "ERROR": str(err)}

        return_list = []
        return_list.append(sample_pdf)
        return_list.append(error_dict)

        return return_list, key_dict, used_columns, sample_query

    def execute_queries(
        self,
        query: Union[str, List[str]],
        return_as_pdf: bool = False,
        return_query_ids: bool = False,
    ) -> Union[List[Dict], List[List[Dict]]]:
        """actual execution of the defined queries

        Args:
            query (Union[str, List[str]]): queries to be executed
            return_as_pdf (bool, optional): If true, query results are returned as pandas dataframes. Defaults to False.
            return_query_ids (bool, optional): If true, results and query ids are returned, otherwise only results. Defaults to False.

        Raises:
            Exception: Raises exception if a single query cannot be executed.

        Returns:
            Union[List[Dict], List[List[Dict]]]: returns results or results with query-ids
        """
        if self.databricks_connection is None:
            self._connect_to_databricks()

        if query:
            query_list: List[str] = query if isinstance(query, list) else [query]
        else:
            logger.error(
                "Query defined as null - please check input for execute_queries function."
            )
            exit()

        cursor = self.databricks_connection.cursor()

        results = []
        query_ids = []

        for single_query in query_list:
            try:
                try:
                    query_result = cursor.execute(single_query).fetchall()
                except Exception:
                    query_result = cursor.execute(single_query).fetchall_arrow().to_pylist()
                if return_as_pdf:
                    columns = [col[0] for col in cursor.description]
                    query_result = pd.DataFrame(query_result, columns=columns)

                results.append(query_result)
                query_ids.append(0)  # there is no query id returned by databricks

            except Exception as err:
                raise Exception(single_query + "|||" + str(err))

        if return_query_ids:
            return (
                results[0],
                query_ids[0] if not isinstance(query, list) else results,
                query_ids,
            )

        else:
            return results[0] if not isinstance(query, list) else results

    def execute_statement(self, statement: Union[str, List[str]]) -> None:
        """
        Executes a simple statement against Databricks
        Schema and Database settings must be set beforehand
        Args:
            statement Union[str, List[str]] - a sql statement or a list of sql statements to execute
        """
        if self.databricks_connection is None:
            self._connect_to_databricks()

        statement_list: List[str] = (
            statement if isinstance(statement, list) else [statement]
        )

        cursor = self.databricks_connection.cursor()

        for single_statement in statement_list:
            try:
                stripped_statement = single_statement.strip()
                _ = cursor.execute(stripped_statement)

            except Exception as err:
                raise Exception(self._get_error_message(err, single_statement)) from err

    def create_schemas(self, database_name: str, schemas: List):
        statement_list = []

        for schema in schemas:
            statement_list.append(
                f"CREATE SCHEMA IF NOT EXISTS {database_name}.{schema}"
            )

        self.execute_statement(statement_list)

    def insert_json_results(
        self,
        run_guid: str,
        pipeline_name: str,
        pipeline_id: str,
        start_time_utc: str,
        result_table: str,
        results: dict,
    ) -> None:
        """
        insert into - result table for json results
        """

        statement = f"CREATE TABLE IF NOT EXISTS {result_table} (RUN_GUID STRING, PIPELINE_NAME STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, RESULT STRING, CREATION_TIME_UTC STRING)"

        self.execute_statement(statement)

        statement = (
            "INSERT INTO {} VALUES ('{}', '{}', '{}', '{}', '{}', '{}');".format(
                result_table,
                run_guid,
                pipeline_name,
                pipeline_id,
                start_time_utc,
                str(results).replace("'", '"'),
                datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S"),
            )
        )

        self.execute_statement(statement)

    def insert_json_results_live(
        self,
        run_guid: str,
        pipeline_name: str,
        pipeline_id: str,
        result_table: str,
        stage_name: str,
        source_system: str,
        target_system: str,
        database: str,
        schema: str,
        object: str,
    ) -> None:
        """
        copy into - result table for json results live
        """
        result_database = result_table.split(".", 1)[0]

        statement = f"COPY INTO {result_table} (RUN_GUID, PIPELINE_NAME, PIPELINE_ID, SOURCE_SYSTEM, TARGET_SYSTEM, DATABASE_NAME, SCHEMA_NAME, OBJECT_NAME ,RESULT, CREATION_TS) FROM (SELECT '{run_guid}', '{pipeline_name}', '{pipeline_id}', '{source_system}', '{target_system}', '{database}', '{schema}', '{object}', $1, SYSDATE() from @{stage_name} (file_format => {result_database}.meta_data.ff_json ));"

        self.execute_statement(statement)

    def insert_highlevel_results(
        self,
        results: dict,
        run_guid: str,
        pipeline_name: str,
        pipeline_id: str,
        result_table_highlevel: str,
    ) -> None:
        """
        insert into - highlevel results per "pipeline run" / "generic testing tool execution"
        """

        statement = f"CREATE TABLE IF NOT EXISTS {result_table_highlevel} (RUN_GUID STRING, PIPELINE_NAME STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, SOURCE_SYSTEM STRING, TARGET_SYSTEM STRING, DATABASE_NAME STRING, TESTSET STRING, ALL_OBJECTS_MATCHING BOOLEAN, ALL_COLUMNS_EQUAL BOOLEAN, ALL_DATATYPES_EQUAL BOOLEAN, ALL_ROWCOUNTS_EQUAL BOOLEAN, ALL_CHECKSUMS_EQUAL BOOLEAN, ALL_SAMPLES_EQUAL BOOLEAN, ALL_OBJECTS_EQUAL BOOLEAN, OBJECTS_TO_COMPARE_SRC STRING, OBJECTS_TO_COMPARE_TRGT STRING, NUMBER_OF_OBJECTS_TO_COMPARE INT, SRC_MINUS_TRGT STRING, TRGT_MINUS_SRC STRING, CREATION_TS_UTC STRING)"

        self.execute_statement(statement)

        TESTSET_ = ", ".join(results["TESTSET"])

        OBJECTS_TO_COMPARE_SRC_ = ", ".join(results["OBJECTS_TO_COMPARE_SRC"])

        OBJECTS_TO_COMPARE_TRGT_ = ", ".join(results["OBJECTS_TO_COMPARE_TRGT"])

        SRC_MINUS_TRGT_ = ", ".join(results["SRC_MINUS_TRGT"])

        TRGT_MINUS_SRC_ = ", ".join(results["TRGT_MINUS_SRC"])

        date_utc = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")

        insert_statement = f"INSERT INTO {result_table_highlevel} ( \
            RUN_GUID, \
            PIPELINE_NAME, \
            PIPELINE_ID, \
            START_TIME_UTC, \
            SOURCE_SYSTEM, \
            TARGET_SYSTEM, \
            DATABASE_NAME, \
            TESTSET, \
            ALL_OBJECTS_MATCHING, \
            ALL_COLUMNS_EQUAL, \
            ALL_DATATYPES_EQUAL, \
            ALL_ROWCOUNTS_EQUAL, \
            ALL_CHECKSUMS_EQUAL, \
            ALL_SAMPLES_EQUAL, \
            ALL_OBJECTS_EQUAL, \
            OBJECTS_TO_COMPARE_SRC, \
            OBJECTS_TO_COMPARE_TRGT, \
            NUMBER_OF_OBJECTS_TO_COMPARE, \
            SRC_MINUS_TRGT, \
            TRGT_MINUS_SRC, \
            CREATION_TS_UTC) \
            VALUES \
            ('{run_guid}', \
            '{pipeline_name}', \
            '{pipeline_id}', \
            '{results['START_TIME_UTC']}', \
            '{results['SOURCE_SYSTEM']}', \
            '{results['TARGET_SYSTEM']}', \
            '{results['DATABASE_NAME']}', \
            '{TESTSET_}', \
            '{results['ALL_OBJECTS_MATCHING']}', \
            '{results['ALL_COLUMNS_EQUAL']}', \
            '{results['ALL_DATATYPES_EQUAL']}', \
            '{results['ALL_ROWCOUNTS_EQUAL']}', \
            '{results['ALL_CHECKSUMS_EQUAL']}', \
            NULLIF('{results['ALL_SAMPLES_EQUAL']}', 'None'), \
            NULLIF('{results['ALL_OBJECTS_EQUAL']}', 'None'), \
            '{OBJECTS_TO_COMPARE_SRC_}', \
            '{OBJECTS_TO_COMPARE_TRGT_}', \
            '{results['NUMBER_OF_OBJECTS_TO_COMPARE']}', \
            '{SRC_MINUS_TRGT_}', \
            '{TRGT_MINUS_SRC_}', \
            '{date_utc}')"

        self.execute_statement(insert_statement)

    def insert_objectlevel_results(
        self,
        result_table: str,
        result_table_objectlevel: str,
        run_guid: str,
        results: dict,
    ) -> None:
        """
        insert into - detailed results per object
        """
        date_utc = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")

        statement = f"""
            CREATE TABLE IF NOT EXISTS {result_table_objectlevel} (
                RUN_GUID STRING,
                PIPELINE_ID STRING,
                START_TIME_UTC STRING,
                SRC_DATABASE_NAME STRING,
                SRC_SCHEMA_NAME STRING,
                SRC_OBJECT_NAME STRING,
                SRC_OBJECT_TYPE STRING,
                TRGT_DATABASE_NAME STRING,
                TRGT_SCHEMA_NAME STRING,
                TRGT_OBJECT_NAME STRING,
                TRGT_OBJECT_TYPE STRING,
                SRC_FILTER STRING,
                TRGT_FILTER STRING,
                EXCLUDED_COLUMNS STRING,
                COLUMNS_EQUAL BOOLEAN,
                COLUMN_INTERSECTION STRING,
                SRC_COLUMNS_MINUS_TRGT_COLUMNS STRING,
                TRGT_COLUMNS_MINUS_SRC_COLUMNS STRING,
                DATATYPES_EQUAL BOOLEAN,
                ROW_COUNTS_EQUAL BOOLEAN,
                SRC_ROW_COUNT INT,
                TRGT_ROW_COUNT INT,
                ALL_COUNT_NULLS_EQUAL BOOLEAN,
                AGGREGATIONS_EQUAL BOOLEAN,
                SRC_ERROR_QUERY STRING,
                TRGT_ERROR_QUERY STRING,
                SRC_ERROR_MSG STRING,
                TRGT_ERROR_MSG STRING,
                GROUP_BY_COLUMNS STRING,
                GROUP_BY_EQUAL BOOLEAN,
                GROUP_BY_VALUES_WITH_MISMATCHES STRING,
                COLUMNS_WITH_MISMATCH STRING,
                SRC_GROUP_BY_QUERY STRING,
                TRGT_GROUP_BY_QUERY STRING,
                SRC_GROUP_BY_ERROR STRING,
                TRGT_GROUP_BY_ERROR STRING,
                SAMPLES_COMPARED BOOLEAN,
                SAMPLES_EQUAL BOOLEAN,
                SAMPLE_KEYS STRING,
                SRC_SAMPLE STRING,
                TRGT_SAMPLE STRING,
                SRC_SAMPLE_QUERY STRING,
                TRGT_SAMPLE_QUERY STRING,
                SRC_SAMPLE_ERROR_MSG STRING,
                TRGT_SAMPLE_ERROR_MSG STRING,
                PANDAS_DATAFRAME_COMPARED BOOLEAN,
                PANDAS_DATAFRAME_EQUAL BOOLEAN,
                SRC_NOT_ALTERED_DURING_COMPARISON BOOLEAN,
                TRGT_NOT_ALTERED_DURING_COMPARISON BOOLEAN,
                SRC_LAST_ALTERED STRING,
                TRGT_LAST_ALTERED STRING,
                CREATION_TS_UTC STRING)
            """

        self.execute_statement(statement)

        for object_result in results['OBJECTS']:

            pipeline_id = results['PIPELINE_ID']
            start_time_utc = results['START_TIME_UTC']
            src_database_name = object_result['SRC_DATABASE_NAME']
            src_schema_name = object_result['SRC_SCHEMA_NAME']
            src_object_name = object_result['SRC_OBJECT_NAME']
            src_object_type = object_result['SRC_OBJECT_TYPE']
            trgt_database_name = object_result['TRGT_DATABASE_NAME']
            trgt_schema_name = object_result['TRGT_SCHEMA_NAME']
            trgt_object_name = object_result['TRGT_OBJECT_NAME']
            trgt_object_type = object_result['TRGT_OBJECT_TYPE']
            src_filter = object_result['SRC_FILTER']
            trgt_filter = object_result['TRGT_FILTER']
            excluded_columns = object_result['EXCLUDED_COLUMNS']
            columns_equal = object_result['COLUMNS_EQUAL']
            column_intersection = str(object_result['COLUMN_INTERSECTION'])
            src_columns_minus_trgt_columns = object_result['SRC_COLUMNS_MINUS_TRGT_COLUMNS']
            trgt_columns_minus_src_columns = object_result['TRGT_COLUMNS_MINUS_SRC_COLUMNS']
            datatypes_equal = object_result['DATATYPES_EQUAL']
            row_counts_equal = object_result['ROW_COUNTS_EQUAL']
            src_row_count = object_result['SRC_ROW_COUNT']
            trgt_row_count = object_result['TRGT_ROW_COUNT']
            all_count_nulls_equal = object_result['ALL_COUNT_NULLS_EQUAL']
            aggregations_equal = object_result['AGGREGATIONS_EQUAL']
            src_error_query = object_result['SRC_ERROR']['QUERY']
            trgt_error_query = object_result['TRGT_ERROR']['QUERY']
            src_error_msg = object_result['SRC_ERROR']['ERROR']
            trgt_error_msg = object_result['TRGT_ERROR']['ERROR']
            group_by_columns = object_result['GROUP_BY_COLUMNS']
            group_by_equal = object_result['GROUP_BY_EQUAL']
            group_by_values_with_mismatches = object_result['GROUP_BY_VALUES_WITH_MISMATCHES']
            columns_with_mismatch = object_result['COLUMNS_WITH_MISMATCH']
            src_group_by_query = object_result['SRC_GROUP_BY_QUERY']
            trgt_group_by_query = object_result['TRGT_GROUP_BY_QUERY']
            src_group_by_error = object_result['SRC_GROUP_BY_ERROR']
            trgt_group_by_error = object_result['TRGT_GROUP_BY_ERROR']
            samples_compared = object_result['SAMPLES_COMPARED']
            samples_equal = object_result['SAMPLES_EQUAL']
            sample_keys = object_result['SAMPLE_KEYS']
            src_sample = object_result['SRC_SAMPLE']
            trgt_sample = object_result['TRGT_SAMPLE']
            src_sample_query = object_result['SRC_SAMPLE_QUERY']
            trgt_sample_query = object_result['TRGT_SAMPLE_QUERY']
            src_sample_error_msg = object_result['SRC_SAMPLE_ERROR_DICT']
            trgt_sample_error_msg = object_result['TRGT_SAMPLE_ERROR_DICT']
            pandas_dataframe_compared = object_result['PANDAS_DATAFRAME_COMPARED']
            pandas_dataframe_equal = object_result['PANDAS_DATAFRAME_EQUAL']
            src_not_altered_during_comparison = object_result['SRC_NOT_ALTERED_DURING_COMPARISON']
            trgt_not_altered_during_comparison = object_result['TRGT_NOT_ALTERED_DURING_COMPARISON']
            src_last_altered = object_result['SRC_LAST_ALTERED']
            trgt_last_altered = object_result['TRGT_LAST_ALTERED']

            # the rest of the object result is not used for this table

            insert_statement = f"""INSERT INTO {result_table_objectlevel} ( \
                RUN_GUID, \
                PIPELINE_ID, \
                START_TIME_UTC, \
                SRC_DATABASE_NAME, \
                SRC_SCHEMA_NAME, \
                SRC_OBJECT_NAME, \
                SRC_OBJECT_TYPE, \
                TRGT_DATABASE_NAME, \
                TRGT_SCHEMA_NAME, \
                TRGT_OBJECT_NAME, \
                TRGT_OBJECT_TYPE, \
                SRC_FILTER, \
                TRGT_FILTER, \
                EXCLUDED_COLUMNS, \
                COLUMNS_EQUAL, \
                COLUMN_INTERSECTION, \
                SRC_COLUMNS_MINUS_TRGT_COLUMNS, \
                TRGT_COLUMNS_MINUS_SRC_COLUMNS, \
                DATATYPES_EQUAL, \
                ROW_COUNTS_EQUAL, \
                SRC_ROW_COUNT, \
                TRGT_ROW_COUNT, \
                ALL_COUNT_NULLS_EQUAL, \
                AGGREGATIONS_EQUAL, \
                SRC_ERROR_QUERY, \
                TRGT_ERROR_QUERY, \
                SRC_ERROR_MSG, \
                TRGT_ERROR_MSG, \
                GROUP_BY_COLUMNS, \
                GROUP_BY_EQUAL, \
                GROUP_BY_VALUES_WITH_MISMATCHES, \
                COLUMNS_WITH_MISMATCH, \
                SRC_GROUP_BY_QUERY, \
                TRGT_GROUP_BY_QUERY, \
                SRC_GROUP_BY_ERROR, \
                TRGT_GROUP_BY_ERROR, \
                SAMPLES_COMPARED, \
                SAMPLES_EQUAL, \
                SAMPLE_KEYS, \
                SRC_SAMPLE, \
                TRGT_SAMPLE, \
                SRC_SAMPLE_QUERY, \
                TRGT_SAMPLE_QUERY, \
                SRC_SAMPLE_ERROR_MSG, \
                TRGT_SAMPLE_ERROR_MSG, \
                PANDAS_DATAFRAME_COMPARED, \
                PANDAS_DATAFRAME_EQUAL, \
                SRC_NOT_ALTERED_DURING_COMPARISON, \
                TRGT_NOT_ALTERED_DURING_COMPARISON, \
                SRC_LAST_ALTERED, \
                TRGT_LAST_ALTERED, \
                CREATION_TS_UTC) \
                SELECT \
                '{run_guid}' AS RUN_GUID, \
                '{pipeline_id}' AS PIPELINE_ID, \
                '{start_time_utc}' AS START_TIME_UTC, \
                '{src_database_name}' AS SRC_DATABASE_NAME, \
                '{src_schema_name}' AS SRC_SCHEMA_NAME, \
                '{src_object_name}' AS SRC_OBJECT_NAME, \
                '{src_object_type}' AS SRC_OBJECT_TYPE, \
                '{trgt_database_name}' AS TRGT_DATABASE_NAME, \
                '{trgt_schema_name}' AS TRGT_SCHEMA_NAME, \
                '{trgt_object_name}' AS TRGT_OBJECT_NAME, \
                '{trgt_object_type}' AS TRGT_OBJECT_TYPE, \
                '{src_filter}' AS SRC_FILTER, \
                '{trgt_filter}' AS TRGT_FILTER, \
                '{excluded_columns}' AS EXCLUDED_COLUMNS, \
                try_cast('{columns_equal}' AS BOOLEAN) AS COLUMNS_EQUAL, \
                "{column_intersection}"::STRING AS COLUMN_INTERSECTION, \
                '{src_columns_minus_trgt_columns}' AS SRC_COLUMNS_MINUS_TRGT_COLUMNS, \
                '{trgt_columns_minus_src_columns}' AS TRGT_COLUMNS_MINUS_SRC_COLUMNS, \
                try_cast('{datatypes_equal}' AS BOOLEAN) AS DATATYPES_EQUAL, \
                try_cast('{row_counts_equal}' AS BOOLEAN) AS ROW_COUNTS_EQUAL, \
                '{src_row_count}'::INT AS SRC_ROW_COUNT, \
                '{trgt_row_count}'::INT AS TRGT_ROW_COUNT, \
                try_cast('{all_count_nulls_equal}' AS BOOLEAN) AS ALL_COUNT_NULLS_EQUAL, \
                try_cast('{aggregations_equal}' AS BOOLEAN) AS AGGREGATIONS_EQUAL, \
                '{src_error_query}'::STRING AS SRC_ERROR_QUERY, \
                '{trgt_error_query}'::STRING AS TRGT_ERROR_QUERY, \
                '{src_error_msg}'::STRING AS SRC_ERROR_MSG, \
                '{trgt_error_msg}'::STRING AS TRGT_ERROR_MSG, \
                "{group_by_columns}" AS GROUP_BY_COLUMNS, \
                try_cast('{group_by_equal}' AS BOOLEAN) AS GROUP_BY_EQUAL, \
                "{group_by_values_with_mismatches}" AS GROUP_BY_VALUES_WITH_MISMATCHES, \
                "{columns_with_mismatch}" AS COLUMNS_WITH_MISMATCH, \
                '{src_group_by_query}'::STRING AS SRC_GROUP_BY_QUERY, \
                '{trgt_group_by_query}'::STRING AS TRGT_GROUP_BY_QUERY, \
                '{src_group_by_error}'::STRING AS SRC_GROUP_BY_ERROR, \
                '{trgt_group_by_error}'::STRING AS TRGT_GROUP_BY_ERROR, \
                try_cast('{samples_compared}' AS BOOLEAN) AS SAMPLES_COMPARED, \
                try_cast('{samples_equal}' AS BOOLEAN) AS SAMPLES_EQUAL, \
                '{sample_keys}' AS SAMPLE_KEYS, \
                '{src_sample}' AS SRC_SAMPLE, \
                '{trgt_sample}' AS TRGT_SAMPLE, \
                '{src_sample_query}' AS SRC_SAMPLE_QUERY, \
                '{trgt_sample_query}' AS TRGT_SAMPLE_QUERY, \
                '{src_sample_error_msg}'::STRING AS SRC_SAMPLE_ERROR_MSG, \
                '{trgt_sample_error_msg}'::STRING AS TRGT_SAMPLE_ERROR_MSG, \
                try_cast('{pandas_dataframe_compared}' AS BOOLEAN) AS PANDAS_DATAFRAME_COMPARED, \
                try_cast('{pandas_dataframe_equal}' AS BOOLEAN) AS PANDAS_DATAFRAME_EQUAL, \
                try_cast('{src_not_altered_during_comparison}' AS BOOLEAN) AS SRC_NOT_ALTERED_DURING_COMPARISON, \
                try_cast('{trgt_not_altered_during_comparison}' AS BOOLEAN) AS TRGT_NOT_ALTERED_DURING_COMPARISON, \
                '{src_last_altered}'::STRING AS SRC_LAST_ALTERED, \
                '{trgt_last_altered}'::STRING AS TRGT_LAST_ALTERED, \
                '{date_utc}' \
                ;"""

            self.execute_statement(insert_statement)

    def insert_columnlevel_results(
        self,
        result_table: str,
        result_table_columnlevel: str,
        run_guid: str,
        results: dict,
    ) -> None:
        """
        insert into - detailed results per column
        """

        date_utc = datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")

        statement = f"CREATE TABLE IF NOT EXISTS {result_table_columnlevel} (RUN_GUID STRING, PIPELINE_ID STRING, START_TIME_UTC STRING, SRC_DATABASE_NAME STRING, SRC_SCHEMA_NAME STRING, SRC_OBJECT_NAME STRING, SRC_OBJECT_TYPE STRING, TRGT_DATABASE_NAME STRING, TRGT_SCHEMA_NAME STRING, TRGT_OBJECT_NAME STRING, TRGT_OBJECT_TYPE STRING, COLUMN_NAME STRING, IN_SRC BOOLEAN, IN_TRGT BOOLEAN, IN_SYNC BOOLEAN, IN_EXCLUDED BOOLEAN, SRC_DATATYPE STRING, TRGT_DATATYPE STRING, DATATYPE_EQUAL BOOLEAN, AGGREGATION_TYPE STRING, AGGREGATION_EQUAL BOOLEAN, AGGREGATION_RESULT_SRC STRING, AGGREGATION_RESULT_TRGT STRING, AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC STRING, COUNT_NULLS_EQUAL BOOLEAN, COUNT_NULLS_SRC STRING, COUNT_NULLS_TRGT STRING, COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC STRING, ERROR_QUERY_SRC STRING, ERROR_MSG_SRC STRING, ERROR_QUERY_TRGT STRING, ERROR_MSG_TRGT STRING, ERROR_FLAG BOOLEAN, CREATION_TS_UTC STRING);"

        self.execute_statement(statement)

        # extract the information needed for the table on column level
        for object_result in results['OBJECTS']:
            for column_result in object_result['COLUMNS']:
                pipeline_id = results['PIPELINE_ID']
                start_time_utc = results['START_TIME_UTC']
                src_database_name = object_result['SRC_DATABASE_NAME']
                src_schema_name = object_result['SRC_SCHEMA_NAME']
                src_object_name = object_result['SRC_OBJECT_NAME']
                src_object_type = object_result['SRC_OBJECT_TYPE']
                trgt_database_name = object_result['TRGT_DATABASE_NAME']
                trgt_schema_name = object_result['TRGT_SCHEMA_NAME']
                trgt_object_name = object_result['TRGT_OBJECT_NAME']
                trgt_object_type = object_result['TRGT_OBJECT_TYPE']
                column_name = column_result['COLUMN_NAME']
                in_src = column_result['IN_SRC']
                in_trgt = column_result['IN_TRGT']
                in_sync = column_result['IN_SYNC']
                in_excluded = column_result['IN_EXCLUDED']
                src_datatype = column_result['SRC_DATATYPE']
                trgt_datatype = column_result['TRGT_DATATYPE']
                datatype_equal = column_result['DATATYPE_EQUAL']
                aggregation_type = column_result['AGGREGATION_TYPE']
                aggregation_equal = column_result['AGGREGATION_EQUAL']
                aggregation_result_src = column_result['AGGREGATION_RESULT_SRC']
                aggregation_result_trgt = column_result['AGGREGATION_RESULT_TRGT']
                aggregation_difference_trgt_minus_src = column_result['AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC']
                count_nulls_equal = column_result['COUNT_NULLS_EQUAL']
                count_nulls_src = column_result['COUNT_NULLS_SRC']
                count_nulls_trgt = column_result['COUNT_NULLS_TRGT']
                count_nulls_difference_trgt_minus_src = column_result['COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC']
                error_query_src = object_result['SRC_ERROR']['QUERY']
                error_msg_src = object_result['SRC_ERROR']['ERROR']
                error_query_trgt = object_result['TRGT_ERROR']['QUERY']
                error_msg_trgt = object_result['TRGT_ERROR']['ERROR']
                if not (object_result['SRC_ERROR']['ERROR'] and object_result['TRGT_ERROR']['ERROR']):
                    error_flag = False
                else:
                    error_flag = True
                insert_statement = f"""INSERT INTO {result_table_columnlevel} ( \
                    RUN_GUID, \
                    PIPELINE_ID, \
                    START_TIME_UTC, \
                    SRC_DATABASE_NAME, \
                    SRC_SCHEMA_NAME, \
                    SRC_OBJECT_NAME, \
                    SRC_OBJECT_TYPE, \
                    TRGT_DATABASE_NAME, \
                    TRGT_SCHEMA_NAME, \
                    TRGT_OBJECT_NAME, \
                    TRGT_OBJECT_TYPE, \
                    COLUMN_NAME, \
                    IN_SRC, \
                    IN_TRGT, \
                    IN_SYNC, \
                    IN_EXCLUDED, \
                    SRC_DATATYPE, \
                    TRGT_DATATYPE, \
                    DATATYPE_EQUAL, \
                    AGGREGATION_TYPE, \
                    AGGREGATION_EQUAL, \
                    AGGREGATION_RESULT_SRC, \
                    AGGREGATION_RESULT_TRGT, \
                    AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC, \
                    COUNT_NULLS_EQUAL, \
                    COUNT_NULLS_SRC, \
                    COUNT_NULLS_TRGT, \
                    COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC, \
                    ERROR_QUERY_SRC, \
                    ERROR_MSG_SRC, \
                    ERROR_QUERY_TRGT, \
                    ERROR_MSG_TRGT, \
                    ERROR_FLAG, \
                    CREATION_TS_UTC) \
                    SELECT \
                    '{run_guid}' AS RUN_GUID, \
                    '{pipeline_id}' AS PIPELINE_ID, \
                    '{start_time_utc}'::STRING AS START_TIME_UTC, \
                    '{src_database_name}' AS SRC_DATABASE_NAME, \
                    '{src_schema_name}' AS SRC_SCHEMA_NAME, \
                    '{src_object_name}' AS SRC_OBJECT_NAME, \
                    '{src_object_type}' AS SRC_OBJECT_TYPE, \
                    '{trgt_database_name}' AS TRGT_DATABASE_NAME, \
                    '{trgt_schema_name}' AS TRGT_SCHEMA_NAME, \
                    '{trgt_object_name}' AS TRGT_OBJECT_NAME, \
                    '{trgt_object_type}' AS TRGT_OBJECT_TYPE, \
                    '{column_name}' AS COLUMN_NAME, \
                    try_cast('{in_src}' AS BOOLEAN) AS IN_SRC, \
                    try_cast('{in_trgt}' AS BOOLEAN) AS IN_TRGT, \
                    try_cast('{in_sync}' AS BOOLEAN) AS IN_SYNC, \
                    try_cast('{in_excluded}' AS BOOLEAN) AS IN_EXCLUDED, \
                    '{src_datatype}' AS SRC_DATATYPE, \
                    '{trgt_datatype}' AS TRGT_DATATYPE, \
                    '{datatype_equal}' AS DATATYPE_EQUAL, \
                    '{aggregation_type}' AS AGGREGATION_TYPE, \
                    try_cast('{aggregation_equal}' AS BOOLEAN) AS AGGREGATION_EQUAL, \
                    '{aggregation_result_src}' AS AGGREGATION_RESULT_SRC, \
                    '{aggregation_result_trgt}' AS AGGREGATION_RESULT_TRGT, \
                    '{aggregation_difference_trgt_minus_src}' AS AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC, \
                    try_cast('{count_nulls_equal}' AS BOOLEAN) AS COUNT_NULLS_EQUAL, \
                    '{count_nulls_src}'::INT AS COUNT_NULLS_SRC, \
                    '{count_nulls_trgt}'::INT AS COUNT_NULLS_TRGT, \
                    '{count_nulls_difference_trgt_minus_src}' AS COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC, \
                    '{error_query_src}' AS ERROR_QUERY_SRC, \
                    '{error_msg_src}' AS ERROR_MSG_SRC, \
                    '{error_query_trgt}' AS ERROR_QUERY_TRGT, \
                    '{error_msg_trgt}' AS ERROR_MSG_TRGT, \
                    try_cast('{error_flag}' AS BOOLEAN) AS ERROR_FLAG, \
                    '{date_utc}' \
                    ;"""

                self.execute_statement(insert_statement)
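
For orientation, a minimal usage sketch of the service defined above, assuming the standard databricks-sql-connector connection parameters (`server_hostname`, `http_path`, `access_token`) are passed through `connection_params` to `databricks.sql.connect`; the hostname, HTTP path, and token below are placeholders, not values from the package.

# Hypothetical usage sketch - not part of the wheel contents above.
from icsDataValidation.services.database_services.databricks_unity_catalog_service import (
    DatabricksUnityCatalogService,
)

connection_params = {
    "server_hostname": "adb-1234567890123456.7.azuredatabricks.net",  # placeholder
    "http_path": "/sql/1.0/warehouses/abc123",  # placeholder
    "access_token": "dapi-...",  # placeholder
}

# __enter__/__exit__ make the service usable as a context manager,
# closing the underlying Databricks SQL connection on exit.
with DatabricksUnityCatalogService(connection_params) as service:
    objects = service.get_database_objects("my_catalog", schema="my_schema")
    for obj in objects:
        print(obj["object_identifier"], obj["object_type"])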