icsDataValidation 1.0.358__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- icsDataValidation/configuration.py +19 -0
- icsDataValidation/connection_setups/__init__.py +0 -0
- icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
- icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
- icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
- icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
- icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
- icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
- icsDataValidation/core/__init__.py +0 -0
- icsDataValidation/core/database_objects.py +18 -0
- icsDataValidation/core/object_comparison.py +239 -0
- icsDataValidation/input_parameters/__init__.py +0 -0
- icsDataValidation/input_parameters/testing_tool_params.py +81 -0
- icsDataValidation/main.py +250 -0
- icsDataValidation/output_parameters/__init__.py +0 -0
- icsDataValidation/output_parameters/result_params.py +94 -0
- icsDataValidation/services/__init__.py +0 -0
- icsDataValidation/services/comparison_service.py +582 -0
- icsDataValidation/services/database_services/__init__.py +0 -0
- icsDataValidation/services/database_services/azure_service.py +320 -0
- icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
- icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
- icsDataValidation/services/database_services/exasol_service.py +261 -0
- icsDataValidation/services/database_services/oracle_service.py +713 -0
- icsDataValidation/services/database_services/snowflake_service.py +1100 -0
- icsDataValidation/services/database_services/teradata_service.py +665 -0
- icsDataValidation/services/initialization_service.py +103 -0
- icsDataValidation/services/result_service.py +573 -0
- icsDataValidation/services/system_service.py +61 -0
- icsDataValidation/services/testset_service.py +257 -0
- icsDataValidation/utils/__init__.py +0 -0
- icsDataValidation/utils/file_util.py +96 -0
- icsDataValidation/utils/logger_util.py +96 -0
- icsDataValidation/utils/pandas_util.py +159 -0
- icsDataValidation/utils/parallelization_util.py +52 -0
- icsDataValidation/utils/sql_util.py +14 -0
- icsDataValidation-1.0.358.dist-info/METADATA +21 -0
- icsDataValidation-1.0.358.dist-info/RECORD +40 -0
- icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
- icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,713 @@
|
|
|
1
|
+
|
|
2
|
+
import oracledb
|
|
3
|
+
oracledb.defaults.fetch_decimals = True
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from typing import Union, List, Dict
|
|
8
|
+
|
|
9
|
+
from icsDataValidation.utils.logger_util import configure_dev_ops_logger
|
|
10
|
+
from icsDataValidation.core.database_objects import DatabaseObject
|
|
11
|
+
|
|
12
|
+
#########################################################################################
|
|
13
|
+
#########################################################################################
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger('Oracle_Service')
|
|
16
|
+
logger.setLevel(logging.INFO)
|
|
17
|
+
configure_dev_ops_logger(logger)
|
|
18
|
+
|
|
19
|
+
class OracleService(object):
|
|
20
|
+
def __init__(self, connection_params: dict):
    """Remember the connection settings and define the data type groups.

    No database connection is opened here: ``oracle_connection`` starts as
    ``None`` and is only filled in by ``_connect_to_oracle``.

    Args:
        connection_params (dict): keyword arguments that are later unpacked
            into ``oracledb.connect``
    """
    numeric_types = [
        'number',
        'float',
        'long',
        'binary_float',
        'binary_double',
        'numeric',
        'decimal',
        'int',
        'integer',
        'smallint',
        'real',
    ]
    date_and_time_types = ['date', 'time', 'datetime', 'timestamp', 'year']

    self.oracle_connection = None
    self.connection_params = connection_params
    # Lower-case type names; the other methods compare them against DATA_TYPE.lower().
    self.oracle_datatype_mapping = dict(
        string=['text'],
        numeric=numeric_types,
        binary=['binary'],
        boolean=['boolean'],
        date_and_time=date_and_time_types,
    )
|
|
42
|
+
|
|
43
|
+
def __enter__(self):
    """Enter the ``with`` block and hand back this very service object."""
    service = self
    return service
|
|
45
|
+
|
|
46
|
+
def __exit__(self, exception_type, exception_value, traceback):
    """Close the Oracle connection (if one was opened) when the ``with`` block ends.

    The stored handle is reset to ``None`` so that the ``is None`` guards used
    by the other methods trigger a fresh connect instead of reusing a closed
    connection.

    Args:
        exception_type: type of the exception raised inside the block, or None
        exception_value: the exception instance, or None
        traceback: the traceback object, or None

    Returns:
        None: exceptions raised inside the ``with`` block are not suppressed.
    """
    connection = self.oracle_connection
    if connection is None:
        return None

    # Drop the reference first: even if close() raises, the service must not
    # keep pointing at a connection that is in an unknown state.
    self.oracle_connection = None
    connection.close()
    return None
|
|
49
|
+
|
|
50
|
+
#def __del__(self):
|
|
51
|
+
# if self.oracle_connection is not None:
|
|
52
|
+
# self.oracle_connection.close()
|
|
53
|
+
|
|
54
|
+
def _connect_to_oracle(self):
    """Open a connection with the stored parameters and keep it on the instance.

    Returns:
        the connection object returned by ``oracledb.connect``; the same
        object is stored in ``self.oracle_connection``
    """
    # A SYSDBA variant (mode=oracledb.SYSDBA) was tried here once and is intentionally not used.
    connection = oracledb.connect(**self.connection_params)
    self.oracle_connection = connection
    return connection
|
|
58
|
+
|
|
59
|
+
@staticmethod
def _get_error_message(excepction: Exception, statement: str) -> str:
    """
    Compose error message if the execution of a statement or query fails.

    Args:
        excepction (Exception): the error that was raised (parameter name kept
            as-is so existing callers keep working)
        statement (str): the statement that failed

    Returns:
        str: message of the form
            ``Oracle ERROR: <message>\\nFailed statement:\\n<statement>``
    """
    if hasattr(excepction, "raw_msg"):
        # Flatten multi-line driver messages onto a single line.
        message = excepction.raw_msg.replace("\n", " ")
    else:
        # Every kind of error gets a message, even without a raw_msg attribute.
        message = str(excepction)

    # NOTE(review): `sfqid` looks like a query-id attribute of another
    # connector; it is only appended when the exception happens to carry it.
    if hasattr(excepction, "sfqid"):
        message = message + f"\nQuery ID: {excepction.sfqid}"

    return f"Oracle ERROR: {message}\nFailed statement:\n{statement}"
|
|
73
|
+
|
|
74
|
+
@staticmethod
def _get_in_clause(key_filters: dict, numeric_columns: list, numeric_scale: int) -> str:
    """ generates in_clause from key filters ready to expand the where clause, numeric columns are rounded

    Args:
        key_filters (dict): column name -> list of expected values; all lists
            are read position by position, so they are expected to have the
            same length as the first one
        numeric_columns (list): list of all numeric columns
        numeric_scale (int): number of decimal places used for ROUND(); when it
            is empty (None or 0) the former fixed scale of 2 is used

    Returns:
        str: in clause as string, e.g. `` AND ((ROUND(ID,2),NAME) in (('1','a'),('2','b')))``
    """
    # The scale used to be hard-coded to 2 and the parameter was ignored.
    scale = numeric_scale if numeric_scale else 2

    value_columns = list(key_filters.values())
    row_count = len(value_columns[0])

    # One parenthesised tuple per row; every value is rendered as a quoted string.
    row_tuples = []
    for row_index in range(row_count):
        quoted_values = ",".join(f"'{column_values[row_index]}'" for column_values in value_columns)
        row_tuples.append(f"({quoted_values})")
    in_clause_values = ",".join(row_tuples) + ")"

    column_expressions = []
    for key in key_filters:
        column_name = key.replace("'", "")
        if key in numeric_columns:
            column_expressions.append(f"ROUND({column_name},{scale})")
        else:
            column_expressions.append(column_name)
    in_clause_cols = " AND ((" + ",".join(column_expressions) + ")"

    return in_clause_cols + " in (" + in_clause_values + ")"
|
|
103
|
+
|
|
104
|
+
def _get_column_clause(self, column_list: list, columns_datatype: list, numeric_scale, key_columns) -> tuple:
    """
    Turns list of desired columns into a sql compatible string.
    Columns with a date or time data type are omitted unless they are key columns.

    Args:
        column_list (list): list of all columns
        columns_datatype (list): dicts with "COLUMN_NAME" and "DATA_TYPE" for the given columns
        numeric_scale (int): number of decimal places for numeric columns; no rounding when empty
        key_columns (list): key columns, which are always kept

    Returns:
        tuple: (column_clause, numeric_columns, used_columns) - the select
            list as a string, the numeric columns and all columns that were kept

    Raises:
        KeyError: if a column has no entry in columns_datatype
    """
    type_groups = self.oracle_datatype_mapping

    # First entry per column wins, mirroring a first-match search.
    datatype_by_column = {}
    for entry in columns_datatype:
        datatype_by_column.setdefault(entry["COLUMN_NAME"], entry["DATA_TYPE"])

    select_items = []
    used_columns = []
    numeric_columns = []

    for column in column_list:
        if column not in datatype_by_column:
            raise KeyError(f"No data type found for column {column}")
        data_type = datatype_by_column[column].lower()

        # Types carrying a precision or time zone suffix (e.g. "timestamp(6)")
        # never equal a plain list entry, so they are matched by prefix as well.
        is_date_or_time = data_type in type_groups["date_and_time"] or data_type.startswith("timestamp")
        if is_date_or_time and column not in key_columns:
            continue

        if data_type in type_groups["numeric"]:
            if numeric_scale:
                select_items.append(f"CAST(ROUND({column}, {numeric_scale}) as decimal(38,{numeric_scale})) as {column}")
            else:
                select_items.append(f"{column} as {column}")
            numeric_columns.append(column)
        elif data_type in type_groups["string"]:
            select_items.append(f'{column} AS {column}')
        else:
            select_items.append(column)
        used_columns.append(column)

    column_clause = ", ".join(select_items).replace("'", "")
    return column_clause, numeric_columns, used_columns
|
|
142
|
+
|
|
143
|
+
def get_database_objects(self, database: str, schema: str=None, object_type_restriction: str='include_all') -> dict:
    """lists tables and/or views as identifier/type records

    Args:
        database (str): name used (upper-cased) as first part of each identifier
        schema (str, optional): owner to restrict the lookup to. Defaults to None (no restriction).
        object_type_restriction (str, optional): 'include_all', 'include_only_tables'
            or 'include_only_views'. Defaults to 'include_all'.

    Returns:
        list of dicts with "object_identifier" (DATABASE.OWNER.NAME) and
        "object_type" ("table" or "view"); tables come first, then views
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    def _rows_from(dictionary_view):
        # Same statement for tables and views, only the dictionary view differs.
        if schema:
            statement = f"SELECT * FROM {dictionary_view} WHERE OWNER = '{schema.upper()}'"
        else:
            statement = f"SELECT * FROM {dictionary_view} "
        return self.execute_queries(statement)

    table_rows = []
    if object_type_restriction in ('include_all', 'include_only_tables'):
        table_rows = _rows_from("all_tables")

    view_rows = []
    if object_type_restriction in ('include_all', 'include_only_views'):
        view_rows = _rows_from("all_views")

    found_tables = [
        {"object_identifier": f'{database.upper()}.{row["OWNER"]}.{row["TABLE_NAME"]}', "object_type": "table"}
        for row in table_rows
    ]
    found_views = [
        {"object_identifier": f'{database.upper()}.{row["OWNER"]}.{row["VIEW_NAME"]}', "object_type": "view"}
        for row in view_rows
    ]
    return found_tables + found_views
|
|
176
|
+
|
|
177
|
+
def get_last_altered_timestamp_from_object(
    self,
    object: DatabaseObject
) -> str:
    """queries last_altered timestamp for given object

    The previous implementation read LAST_ALTERED from
    ``<database>.INFORMATION_SCHEMA.TABLES`` after an
    ``ALTER SESSION SET TIMEZONE`` statement; Oracle offers neither, so the
    lookup now uses the ALL_OBJECTS dictionary view.

    NOTE(review): LAST_DDL_TIME reflects DDL changes to the object, not data
    changes - confirm this is what the caller expects.

    Args:
        object (DatabaseObject): object for comparison

    Returns:
        the first result row, a dict with the key "LAST_ALTERED"; its value is
        None when no matching table or view is visible
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    # MAX() without GROUP BY always yields exactly one row, so taking [0] is safe.
    query_get_last_altered = (
        "SELECT MAX(LAST_DDL_TIME) AS LAST_ALTERED FROM ALL_OBJECTS "
        f"WHERE OWNER = '{object.schema.upper()}' "
        f"AND OBJECT_NAME = '{object.name.upper()}' "
        "AND OBJECT_TYPE IN ('TABLE', 'VIEW')"
    )

    last_altered = self.execute_queries(query_get_last_altered)[0]

    return last_altered
|
|
199
|
+
|
|
200
|
+
def get_columns_from_object(self, object: DatabaseObject) -> list:
    """returns all columns from given object

    Owner and table name are upper-cased for the dictionary lookup, the same
    way get_data_types_from_object and get_table_size do it.

    Args:
        object (DatabaseObject): table or view

    Returns:
        list: list of all column names, in the order the query returns them
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    query_get_columns = (
        "SELECT COLUMN_NAME FROM SYS.ALL_TAB_COLUMNS "
        f"WHERE OWNER = '{object.schema.upper()}' AND TABLE_NAME = '{object.name.upper()}'"
    )

    all_columns = self.execute_queries(query_get_columns)

    return [row["COLUMN_NAME"] for row in all_columns]
|
|
222
|
+
|
|
223
|
+
def get_row_count_from_object(self, object: DatabaseObject, where_clause: str="") -> int:
    """ counts the rows of the given object

    Args:
        object (DatabaseObject): table or view
        where_clause (str, optional): text appended after the object name. Defaults to "".

    Returns:
        two values: the row count (-1 if the query failed) and an error list,
        which is empty on success and otherwise holds the error text followed
        by the failed query
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    count_query = f"SELECT COUNT(*) AS ROW_COUNT FROM {object.schema}.{object.name} {where_clause}"
    failures = []

    try:
        first_row = self.execute_queries(count_query)[0]
        total = first_row["ROW_COUNT"]
    except Exception as err:
        # Report the failure to the caller instead of raising.
        total = -1
        failures.extend([str(err), count_query])

    return total, failures
|
|
248
|
+
|
|
249
|
+
def get_data_types_from_object(self, object: DatabaseObject, column_intersections: list) -> dict:
    """ looks up the data type of the given columns in sys.all_tab_columns

    Args:
        object (DatabaseObject): table or view; name and schema are upper-cased for the lookup
        column_intersections (list): columns for which the data type is queried

    Returns:
        whatever execute_queries returns for the lookup: one record per
        matching column with COLUMN_NAME and DATA_TYPE
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    # The list repr without its brackets is already a quoted, comma separated
    # IN-list; an empty list falls back to a single empty string literal.
    in_list = str(column_intersections)[1:-1] or "''"

    lookup_query = (
        "SELECT COLUMN_NAME , DATA_TYPE "
        "FROM sys.all_tab_columns "
        f"WHERE TABLE_NAME='{object.name.upper()}' "
        f"AND OWNER = '{object.schema.upper()}' "
        f"AND COLUMN_NAME IN ({in_list}) "
    )

    return self.execute_queries(lookup_query)
|
|
276
|
+
|
|
277
|
+
def get_count_distincts_from_object(self, object: DatabaseObject, column_intersections: list, where_clause: str="", exclude_columns: list=[]) -> dict:
    """get distinct count for every column in a database object that is in column intersections list

    Args:
        object (DatabaseObject): table or view
        column_intersections (list): columns that are used for distinct count
        where_clause (str, optional): optional further filter. Defaults to "".
        exclude_columns (list, optional): columns to exclude from distinct count. Defaults to [] (never modified here).

    Returns:
        two values: the query result (one record per column with COLUMN_NAME
        and COUNT_DISTINCT, ordered by COUNT_DISTINCT) and an error list.
        On failure the result is ``[{'COUNT_DISTINCT': 0}]`` and the error
        list holds one ``["ERROR", <query>, <message>]`` entry.
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    unions = "".join(
        f" UNION SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.schema}.{object.name} {where_clause}"
        for column in column_intersections
        if column not in exclude_columns
    )

    # Drop the leading " UNION" (6 characters) of the first sub-select.
    query_get_count_distincts_from_object = f"{unions[6:]} ORDER BY COUNT_DISTINCT"
    error_list = []

    try:
        dict_count_distincts = self.execute_queries(query_get_count_distincts_from_object)
    except Exception as err:
        dict_count_distincts = [{'COUNT_DISTINCT': 0}]
        # execute_queries reports failures as "<query>|||<message>"; other
        # errors (e.g. a failing connection) carry no separator and used to
        # raise an IndexError right here, hiding the real problem.
        parts = str(err).split('|||')
        if len(parts) > 1:
            error_list.append(["ERROR", parts[0], parts[1]])
        else:
            error_list.append(["ERROR", query_get_count_distincts_from_object, str(err)])

    return dict_count_distincts, error_list
|
|
312
|
+
|
|
313
|
+
def get_table_size(self, object: DatabaseObject) -> int:
    """ reads the summed segment bytes of the given object from DBA_SEGMENTS

    Only segments of type TABLE are considered; schema and name are upper-cased
    for the lookup.

    Args:
        object (DatabaseObject): table or view

    Returns:
        int: BYTES value of the first result row, or 0 if the query returned no rows
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    size_query = f"SELECT SEGMENT_NAME,SUM(BYTES) BYTES FROM DBA_SEGMENTS WHERE OWNER = '{object.schema.upper()}' AND SEGMENT_TYPE='TABLE' AND SEGMENT_NAME='{object.name.upper()}' GROUP BY SEGMENT_NAME"

    segments = self.execute_queries(size_query)

    return segments[0]["BYTES"] if segments else 0
|
|
336
|
+
|
|
337
|
+
def create_checksums(self, object: DatabaseObject , column_intersections: list, where_clause: str="", exclude_columns:list=[], numeric_scale: int = None) -> Dict:
    """ creates checksums for given object in compliance with given conditions

    Numeric columns are summed; columns whose type name contains "char" or
    "raw" get a case-insensitive distinct count; date, timestamp and interval
    columns get a distinct count. Columns of any other type get no aggregate.
    Null counts are queried for every column and matched to the aggregates by
    column name.

    Args:
        object (DatabaseObject): table or view
        column_intersections (list): columns that are used for checksums
        where_clause (str, optional): Optional filter criteria given as sql-usable string. Defaults to "".
        exclude_columns (list, optional): columns to exclude from calculation. Defaults to [] (never modified here).
        numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.

    Returns:
        Dict: column name -> [aggregation type, aggregate value, null count]
            (None values are reported as 0), plus the key 'TESTATM_ERRORS'
            holding a list of ["ERROR", <query>, <message>] entries

    Raises:
        KeyError: if no data type was returned for one of the columns
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    column_intersections = [f"{x.upper()}" for x in column_intersections if x not in exclude_columns]

    dict_colummns_datatype = self.get_data_types_from_object(object, column_intersections)
    datatype_by_column = {}
    for entry in dict_colummns_datatype:
        datatype_by_column.setdefault(entry["COLUMN_NAME"], entry["DATA_TYPE"])

    aggregate_parts = []
    count_null_parts = []

    for column in column_intersections:
        if column not in datatype_by_column:
            raise KeyError(f"No data type found for column {column}")
        column_datatype = datatype_by_column[column].lower()

        count_null_parts.append(f"SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END) AS COUNTNULLS_{column}")

        if column_datatype in self.oracle_datatype_mapping["numeric"]:
            if numeric_scale:
                aggregate_parts.append(f"CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS sum_{column}")
            else:
                aggregate_parts.append(f"CAST(SUM({column}) AS DECIMAL(38)) AS sum_{column}")
        elif 'char' in column_datatype or 'raw' in column_datatype:
            aggregate_parts.append(f"COUNT(DISTINCT LOWER({column})) AS countdistinct_{column}")
        elif column_datatype == 'date' or 'timestamp' in column_datatype or 'interval' in column_datatype:
            aggregate_parts.append(f"COUNT(DISTINCT {column}) AS countdistinct_{column}")
        # any other data type: only the null count is collected

    aggregates = " " + ", ".join(aggregate_parts) if aggregate_parts else ""
    count_nulls = " " + ", ".join(count_null_parts) if count_null_parts else ""

    query_checksums = f"SELECT {aggregates} FROM {object.schema}.{object.name} {where_clause}"
    query_countnulls = f"SELECT {count_nulls} FROM {object.schema}.{object.name} {where_clause}"

    error_list = []
    checksums = {}

    try:
        checksums_results = self.execute_queries([query_checksums, query_countnulls])
        aggregation_results = checksums_results[0][0]
        countnulls_results = checksums_results[1][0]

        # Null counts exist for every column, aggregates only for supported
        # types, so the two result rows can differ in length. Pair them by
        # column name (case-insensitively) instead of by position.
        nulls_by_alias = {alias.upper(): value for alias, value in countnulls_results.items()}

        for alias, value in aggregation_results.items():
            aggregation_type, column_name = alias.split("_", 1)
            null_count = nulls_by_alias.get(f"COUNTNULLS_{column_name}".upper())
            checksums[column_name] = [
                aggregation_type,
                0 if value is None else value,
                0 if null_count is None else null_count,
            ]

    except Exception as err:
        # execute_queries reports failures as "<query>|||<message>"; guard the
        # split so that other errors are reported instead of raising IndexError.
        parts = str(err).split('|||')
        if len(parts) > 1:
            error_list.append(["ERROR", parts[0], parts[1]])
        else:
            error_list.append(["ERROR", query_checksums, str(err)])

    checksums['TESTATM_ERRORS'] = error_list

    return checksums
|
|
419
|
+
|
|
420
|
+
def create_pandas_df_from_group_by(
    self,
    object: DatabaseObject,
    column_intersections: list,
    group_by_columns: list,
    group_by_aggregation_columns: list,
    group_by_aggregation_type: str,
    only_numeric: bool,
    where_clause: str,
    exclude_columns: list,
    numeric_scale: int = None
) -> tuple:
    """execution of multiple aggregations at once

    Args:
        object (DatabaseObject): table or view
        column_intersections (list): columns existing in src and trgt
        group_by_columns (list): columns for grouping the aggregations
        group_by_aggregation_columns (list): list of columns that are supposed to be aggregated, or ["all"]
        group_by_aggregation_type (str): choice between: only_min_max, various, various_and_min_max
        only_numeric (bool): when True only numeric aggregations are built, otherwise distinct counts are included as well
        where_clause (str): optional filter for aggregations, given as sql compatible where-string
        exclude_columns (list): columns to exclude from comparisons
        numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.

    Returns:
        tuple: (result dataframe, aggregation part of the select list,
            grouping part of the select list, grouping columns that were used,
            error dict). On failure the dataframe only holds a TESTATM_ERROR
            column and the error dict has the keys "QUERY" and "ERROR".
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    if group_by_aggregation_columns == ["all"]:
        aggregation_columns = [f"{column.upper()}" for column in column_intersections if (column not in group_by_columns and column not in exclude_columns)]
    else:
        aggregation_columns = [f"{column.upper()}" for column in column_intersections if (column in group_by_aggregation_columns and column not in exclude_columns)]

    group_by_query_columns_string = " "
    # Defined up front: the return statement needs it even when a failure
    # happens before the aggregation list has been assembled.
    group_by_query_aggregation_string = ""
    grouping_columns_final = []
    error_dict = {}

    try:
        for column in group_by_columns:
            if column in column_intersections and column not in exclude_columns:
                group_by_query_columns_string += f"{column} ,"
                grouping_columns_final.append(column)

        group_by_query_columns_string = group_by_query_columns_string[:-1]

        if not grouping_columns_final:
            # Without grouping columns the statement could only fail in the
            # database; stop here, the handler below reports it.
            raise ValueError("No group-by columns found in the column intersection")

        dict_colummns_datatype = self.get_data_types_from_object(object, aggregation_columns)

        min_max_parts = []
        other_parts = []

        for column in aggregation_columns:
            column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"].lower()

            if column_datatype in self.oracle_datatype_mapping["numeric"]:
                if numeric_scale:
                    min_max_parts.append(f"CAST(ROUND(MIN({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MIN_{column}, CAST(ROUND(max({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MAX_{column}")
                    other_parts.append(f"CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS SUM_{column}")
                else:
                    min_max_parts.append(f"MIN({column}) AS MIN_{column}, MAX({column}) AS MAX_{column}")
                    other_parts.append(f"SUM({column}) AS SUM_{column}")
            elif only_numeric:
                # Distinct counts are skipped when only numeric aggregations are requested.
                continue
            elif 'char' in column_datatype or 'raw' in column_datatype:
                other_parts.append(f"COUNT(DISTINCT LOWER({column})) AS COUNTDISTINCT_{column}")
            elif column_datatype == 'date' or 'timestamp' in column_datatype or 'interval' in column_datatype:
                other_parts.append(f"COUNT(DISTINCT {column}) AS COUNTDISTINCT_{column}")
            # any other data type is not aggregated

        # CASE 1: min_max
        if group_by_aggregation_type == "only_min_max":
            selected_parts = min_max_parts
        # CASE 2: sum, count_distinct
        elif group_by_aggregation_type == "various":
            selected_parts = other_parts
        # CASE 3: sum, count_distinct, min_max
        elif group_by_aggregation_type == "various_and_min_max":
            selected_parts = min_max_parts + other_parts
        else:
            raise ValueError(f"Unsupported group_by_aggregation_type: {group_by_aggregation_type}")

        # Joining the fragments avoids a stray leading or trailing comma when
        # one of the fragment lists is empty.
        select_list = f"{group_by_query_columns_string}, COUNT(*) AS COUNT_OF_GROUP_BY_VALUE"
        if selected_parts:
            group_by_query_aggregation_string = " " + ", ".join(selected_parts)
            select_list += f", {group_by_query_aggregation_string}"

        query_group_by_aggregation = f"SELECT {select_list} FROM {object.schema}.{object.name} {where_clause} GROUP BY {group_by_query_columns_string} ORDER BY {group_by_query_columns_string}"

        group_by_aggregation_pdf = self.execute_queries(query_group_by_aggregation, True)
    except Exception as err:
        group_by_aggregation_pdf = pd.DataFrame()
        group_by_aggregation_pdf["TESTATM_ERROR"] = [1]
        if not grouping_columns_final:
            error_dict = {
                "QUERY": "NO Group-BY Columns found in the Columns Intersection. Please check if the configurated Group-By Columns exist in the Table",
                "ERROR": "NO Group-BY Columns found in the Columns Intersection. Please check if the configurated Group-By Columns exist in the Table"
            }
            group_by_query_aggregation_string = ""
        elif '|||' in str(err):
            error_dict = {
                "QUERY": str(err).split('|||')[0],
                "ERROR": str(err).split('|||')[1]
            }
        else:
            error_dict = {
                "QUERY": "NO Query generated. Please check if the configurated Grouping Columns exist in the Table",
                "ERROR": str(err)
            }
            group_by_query_aggregation_string = ""

    return group_by_aggregation_pdf, group_by_query_aggregation_string, group_by_query_columns_string, grouping_columns_final, error_dict
|
|
532
|
+
|
|
533
|
+
def create_pandas_df(self, object: DatabaseObject, intersection_columns_trgt_src: list, where_clause:str="", exclude_columns:list=[]) -> pd.DataFrame:
    """ creates pandas dataframes with all data from given object in given columns

    The select list keeps the order of ``intersection_columns_trgt_src``
    (duplicates removed); it used to be derived from a set difference, whose
    order is not stable between runs.

    Args:
        object (DatabaseObject): table or view
        intersection_columns_trgt_src (list): columns existing in source and target
        where_clause (str, optional): text appended after the object name. Defaults to "".
        exclude_columns (list, optional): columns left out of the select list. Defaults to [] (never modified here).

    Returns:
        pd.DataFrame: direct result of sql query
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    excluded = set(exclude_columns)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    selected_columns = [column for column in dict.fromkeys(intersection_columns_trgt_src) if column not in excluded]
    intersection_columns_trgt_src_ = ', '.join(selected_columns)

    df_query = f"SELECT {intersection_columns_trgt_src_} FROM {object.schema}.{object.name} {where_clause}"

    src_pdf = self.execute_queries(df_query, True)

    return src_pdf
|
|
554
|
+
|
|
555
|
+
def create_pandas_df_from_sample(self, object: DatabaseObject, column_intersections: list, key_columns: list, where_clause:str="", exclude_columns:list=[], key_filters: dict={}, dedicated_columns: list=[], sample_count :int=10, numeric_scale: int = None) -> List[Dict]:
    """ draws a random sample of rows from the given object

    The rows come from a sub-select ordered by DBMS_RANDOM.VALUE and are capped
    with ``rownum <= sample_count``. When key columns are present the result is
    ordered by them and, if key filters with values are given, restricted by an
    additional in-clause.

    Args:
        object (DatabaseObject): table or view
        column_intersections (list): columns existing in source and target
        key_columns (list): key columns of the object
        where_clause (str, optional): filter as sql string. Defaults to "" (then 'WHERE 1=1 ' is used).
        exclude_columns (list, optional): columns to leave out. Defaults to [].
        key_filters (dict, optional): column name -> list of values to filter on. Defaults to {}.
        dedicated_columns (list, optional): if any of them is in the intersection, only those are selected. Defaults to [].
        sample_count (int, optional): maximum number of rows. Defaults to 10.
        numeric_scale (int, optional): number of decimal places for numeric columns. Defaults to None.

    Returns:
        four values: [sample dataframe, error dict], dict of key column values,
        list of used columns, the sample query
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    sample_count = str(sample_count)

    excluded = set(exclude_columns)
    shared_columns = set(column_intersections)
    key_intersection = sorted((shared_columns & set(key_columns)) - excluded)
    filter_intersection = sorted((shared_columns & set(key_filters.keys())) - excluded)
    dedicated_intersection = sorted((shared_columns & set(dedicated_columns)) - excluded)

    if not where_clause:
        where_clause = 'WHERE 1=1 '

    # Data types are only needed for the columns that end up in the select list.
    is_dedicated = dedicated_intersection != []
    if is_dedicated:
        dict_colummns_datatype = self.get_data_types_from_object(object, dedicated_intersection)
    else:
        dict_colummns_datatype = self.get_data_types_from_object(object, column_intersections)

    random_rows = f"(SELECT * FROM {object.schema}.{object.name} ORDER BY DBMS_RANDOM.VALUE)"

    if key_intersection != []:
        # Dedicated and regular sampling differ only in the columns handed to the column clause.
        clause_columns = dedicated_intersection if is_dedicated else column_intersections
        keys = str(key_intersection)[1:-1].replace("'", "")
        column_clause, numeric_columns, used_columns = self._get_column_clause(clause_columns, dict_colummns_datatype, numeric_scale, key_columns)

        in_clause = ""
        if key_filters != {} and filter_intersection != []:
            if list(key_filters.values())[0] != []:
                in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale)

        sample_query = f"SELECT {column_clause} FROM {random_rows} {where_clause} AND rownum <= {sample_count} {in_clause} ORDER BY {keys}"
    else:
        column_intersections = sorted(set(column_intersections) - excluded)
        column_clause, numeric_columns, used_columns = self._get_column_clause(column_intersections, dict_colummns_datatype, numeric_scale, key_columns)
        sample_query = f"SELECT {column_clause} FROM {random_rows} {where_clause} AND rownum <= {sample_count}"

    error_dict = {}
    key_dict = {}
    try:
        sample_pdf = self.execute_queries(sample_query, return_as_pdf=True)
        for key in key_intersection:
            key_values = sample_pdf[key]
            # Datetime keys are handed back as strings.
            if pd.api.types.is_datetime64_any_dtype(key_values):
                key_values = key_values.astype(str)
            key_dict[key] = list(key_values)
    except Exception as err:
        sample_pdf = pd.DataFrame()
        sample_pdf["TESTATM_ERROR"] = [1]
        message = str(err)
        if '|||' in message:
            message_parts = message.split('|||')
            error_dict = {
                "QUERY": message_parts[0],
                "ERROR": message_parts[1]
            }
        else:
            error_dict = {
                "QUERY": 'No SQL Error',
                "ERROR": message
            }

    return_list = [sample_pdf, error_dict]

    return return_list , key_dict, used_columns, sample_query
|
|
643
|
+
|
|
644
|
+
def execute_queries(self, query: Union[str, List[str]],return_as_pdf:bool=False, return_query_ids:bool=False) -> Union[List[Dict], List[List[Dict]]]:
    """ actual execution of defined queries

    Args:
        query (Union[str, List[str]]): queries to be executed
        return_as_pdf (bool, optional): If true, each result is returned as pandas data frame,
            otherwise as list of dicts keyed by column name. Defaults to False.
        return_query_ids (bool, optional): accepted for interface compatibility; it is not used here. Defaults to False.

    Raises:
        ValueError: if no query was passed.
        Exception: if a single query cannot be executed; the message has the
            form "<query>|||<original error>", which callers split on.

    Returns:
        Union[List[Dict], List[List[Dict]]]: the result of the query if a single
            query was passed, otherwise a list with one result per query
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    if not query:
        # Previously this case was only logged and then failed on an unbound variable.
        message = 'Query defined as null - please check input for execute_queries function.'
        logger.error(message)
        raise ValueError(message)

    query_list: List[str] = query if isinstance(query, list) else [query]

    cursor = self.oracle_connection.cursor()
    results = []

    try:
        for single_query in query_list:
            try:
                cursor.execute(single_query)
                columns = [col[0] for col in cursor.description]
                if return_as_pdf:
                    query_result = pd.DataFrame(cursor.fetchall(), columns=columns)
                else:
                    # Bind the column names of this statement to the factory so
                    # every fetched row becomes a dict keyed by column name.
                    cursor.rowfactory = lambda *args, columns=columns: dict(zip(columns, args))
                    query_result = cursor.fetchall()
            except Exception as err:
                raise Exception(single_query + "|||" + str(err)) from err

            results.append(query_result)
    finally:
        # Release the cursor on success and on failure alike.
        cursor.close()

    return results[0] if not isinstance(query, list) else results
|
|
690
|
+
|
|
691
|
+
def execute_statement(self, statement: Union[str, List[str]]) -> None:
    """
    Executes simple statement against oracle
    Schema and Database settings must be set beforehand

    The statements are run one after another on a single cursor, which is
    closed afterwards. The former call to ``execute_string`` on the connection
    was replaced because python-oracledb connections do not provide such a
    method.

    Args:
        statement Union[str, List[str]] - a sql statement or a list of sql statements to execute

    Raises:
        Exception: if a statement fails; the message is built by
            ``_get_error_message`` and the original error is chained.
    """
    if self.oracle_connection is None:
        self._connect_to_oracle()

    statement_list: List[str] = (
        statement if isinstance(statement, list) else [statement]
    )

    cursor = self.oracle_connection.cursor()
    try:
        for single_statement in statement_list:
            try:
                # Surrounding whitespace is stripped before the statement is sent.
                cursor.execute(single_statement.strip())
            except Exception as err:
                raise Exception(self._get_error_message(err, single_statement)) from err
    finally:
        cursor.close()
|