icsDataValidation-1.0.358-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- icsDataValidation/configuration.py +19 -0
- icsDataValidation/connection_setups/__init__.py +0 -0
- icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
- icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
- icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
- icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
- icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
- icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
- icsDataValidation/core/__init__.py +0 -0
- icsDataValidation/core/database_objects.py +18 -0
- icsDataValidation/core/object_comparison.py +239 -0
- icsDataValidation/input_parameters/__init__.py +0 -0
- icsDataValidation/input_parameters/testing_tool_params.py +81 -0
- icsDataValidation/main.py +250 -0
- icsDataValidation/output_parameters/__init__.py +0 -0
- icsDataValidation/output_parameters/result_params.py +94 -0
- icsDataValidation/services/__init__.py +0 -0
- icsDataValidation/services/comparison_service.py +582 -0
- icsDataValidation/services/database_services/__init__.py +0 -0
- icsDataValidation/services/database_services/azure_service.py +320 -0
- icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
- icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
- icsDataValidation/services/database_services/exasol_service.py +261 -0
- icsDataValidation/services/database_services/oracle_service.py +713 -0
- icsDataValidation/services/database_services/snowflake_service.py +1100 -0
- icsDataValidation/services/database_services/teradata_service.py +665 -0
- icsDataValidation/services/initialization_service.py +103 -0
- icsDataValidation/services/result_service.py +573 -0
- icsDataValidation/services/system_service.py +61 -0
- icsDataValidation/services/testset_service.py +257 -0
- icsDataValidation/utils/__init__.py +0 -0
- icsDataValidation/utils/file_util.py +96 -0
- icsDataValidation/utils/logger_util.py +96 -0
- icsDataValidation/utils/pandas_util.py +159 -0
- icsDataValidation/utils/parallelization_util.py +52 -0
- icsDataValidation/utils/sql_util.py +14 -0
- icsDataValidation-1.0.358.dist-info/METADATA +21 -0
- icsDataValidation-1.0.358.dist-info/RECORD +40 -0
- icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
- icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0

icsDataValidation/services/database_services/azure_service.py

@@ -0,0 +1,320 @@
import logging

from typing import Union, List, Dict

import pyodbc
import pandas as pd
import pandas.io.sql

from icsDataValidation.utils.logger_util import configure_dev_ops_logger
from icsDataValidation.core.database_objects import DatabaseObject

#########################################################################################
#########################################################################################

# Configure Dev Ops Logger

logger = logging.getLogger('Azure_Service')
logger.setLevel(logging.INFO)
configure_dev_ops_logger(logger)

class AzureService(object):
    def __init__(self, connection_params: dict):
        self.connection_params = connection_params
        self.azure_connection = None

    def __enter__(self):
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        if self.azure_connection is not None:
            self.azure_connection.close()

    def __del__(self):
        if self.azure_connection is not None:
            self.azure_connection.close()

    def _connect_to_azure(self):
        # 1433 is the default port for SQL Server / Azure SQL Database.
        azure_connection_string = (
            f"DRIVER={self.connection_params['Driver']};"
            f"SERVER={self.connection_params['Server']};"
            f"PORT=1433;"
            f"DATABASE={self.connection_params['Database']};"
            f"UID={self.connection_params['User']};"
            f"PWD={self.connection_params['Password']}"
        )
        self.azure_connection = pyodbc.connect(azure_connection_string)
        return self.azure_connection
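
    # With typical parameters the assembled string looks roughly like this
    # (driver name, server, and credentials below are hypothetical):
    #   DRIVER={ODBC Driver 17 for SQL Server};SERVER=myserver.database.windows.net;
    #   PORT=1433;DATABASE=mydb;UID=user;PWD=***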

    @staticmethod
    def _get_error_message(exception: Exception, statement: str) -> str:
        """
        Compose an error message if the execution of a statement or query fails.
        """
        if hasattr(exception, "raw_msg"):
            message = exception.raw_msg.replace("\n", " ")
        else:
            # Ensures all kinds of errors yield a message, even without a raw_msg attribute.
            message = str(exception)
        if hasattr(exception, "sfqid"):
            message = message + f"\nQuery ID: {exception.sfqid}"
        return f"Azure ERROR: {message}\nFailed statement:\n{statement}"

    def get_database_objects(self, database: str, schema: str = None, object_type_restriction: str = 'include_all') -> list:
        if self.azure_connection is None:
            self._connect_to_azure()

        all_database_tables = []
        all_database_views = []

        if object_type_restriction in ('include_all', 'include_only_tables'):
            if schema:
                query_db_tables = f"select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.tables t where schema_name(t.schema_id) = '{schema}' order by schema_name;"
            else:
                query_db_tables = "select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.tables t order by schema_name;"

            all_database_tables = self.execute_queries(query_db_tables)

        # A second independent "if" (not "elif") so that 'include_all' collects views as well.
        if object_type_restriction in ('include_all', 'include_only_views'):
            if schema:
                query_db_views = f"select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.views t where schema_name(t.schema_id) = '{schema}' order by schema_name;"
            else:
                query_db_views = "select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.views t order by schema_name;"

            all_database_views = self.execute_queries(query_db_views)

        database_objects = []
        for row in all_database_tables:
            database_table = f'{database}.{row[0].upper()}.{row[1].upper()}'
            database_objects.append({"object_identifier": database_table, "object_type": "table"})
        for row in all_database_views:
            database_view = f'{database}.{row[0].upper()}.{row[1].upper()}'
            database_objects.append({"object_identifier": database_view, "object_type": "view"})
        return database_objects
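
    # Illustrative shape of the return value (identifiers hypothetical):
    #   [{"object_identifier": "MYDB.DBO.CUSTOMERS", "object_type": "table"},
    #    {"object_identifier": "MYDB.DBO.V_ORDERS", "object_type": "view"}]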

    def get_columns_from_object(self, object: DatabaseObject) -> list:
        if self.azure_connection is None:
            self._connect_to_azure()

        queries_get_columns = [f"select col.name from sys.tables as tab inner join sys.columns as col on (tab.object_id = col.object_id and upper(tab.name) = '{object.name.upper()}') inner join (select object_id, schema_id from sys.objects) as obj on (tab.object_id = obj.object_id and schema_name(obj.schema_id) = '{object.schema.upper()}');"]

        all_columns = self.execute_queries(queries_get_columns)[0]

        columns = []
        for row in all_columns:
            columns.append(row[0])

        return columns

    def get_row_count_from_object(self, object: DatabaseObject) -> int:
        if self.azure_connection is None:
            self._connect_to_azure()

        query_get_row_count = f"select count(*) as ROW_COUNT from {object.schema}.{object.name};"

        row_count = self.execute_queries(query_get_row_count).fetchall()[0][0]

        return row_count

    def get_data_types_from_object(self, object: DatabaseObject, column_intersections: list) -> list:
        # Note: column_intersections is accepted for interface parity with the other
        # database services; the query returns the data types of all columns of the object.
        results = []

        if self.azure_connection is None:
            self._connect_to_azure()

        query_get_data_types_from_object = f"select col.name, t.name as data_type from sys.tables as tab inner join sys.columns as col on tab.object_id = col.object_id left join sys.types as t on col.user_type_id = t.user_type_id where tab.name = '{object.name}' and schema_name(tab.schema_id) = '{object.schema}'"
        columns_datatype = self.execute_queries(query_get_data_types_from_object).fetchall()

        for row in columns_datatype:
            row_to_list = [elem for elem in row]
            results.append({"COLUMN_NAME": row_to_list[0], "DATA_TYPE": row_to_list[1]})

        return results

    def get_count_distincts_from_object(self, object: DatabaseObject, column_intersections: list) -> list:
        if self.azure_connection is None:
            self._connect_to_azure()

        selects = []
        for column in column_intersections:
            selects.append(f"SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.schema}.{object.name}")

        # Join the per-column selects with an explicit separator; plain string
        # concatenation would leave no space between one table name and the next
        # UNION keyword.
        query_get_count_distincts_from_object = f"{' UNION '.join(selects)} ORDER BY COUNT_DISTINCT;"
        count_distincts = self.execute_queries(query_get_count_distincts_from_object).fetchall()

        return count_distincts
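
    # Illustrative generated query for two columns C1, C2 on schema S, table T
    # (names hypothetical):
    #   SELECT 'C1' AS COLUMN_NAME, COUNT(DISTINCT C1) AS COUNT_DISTINCT FROM S.T
    #   UNION SELECT 'C2' AS COLUMN_NAME, COUNT(DISTINCT C2) AS COUNT_DISTINCT FROM S.T
    #   ORDER BY COUNT_DISTINCT;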

    def get_table_size(self, object: DatabaseObject) -> int:
        # used_pages counts 8-KB pages; the expression converts pages to megabytes
        # and then scales to approximate (decimal) bytes.
        query_get_table_size = f"select cast(sum(spc.used_pages * 8)/1024.00 *1000000 as integer) as BYTES from sys.tables tab inner join sys.indexes ind on tab.object_id = ind.object_id inner join sys.partitions part on ind.object_id = part.object_id and ind.index_id = part.index_id inner join sys.allocation_units spc on part.partition_id = spc.container_id where schema_name(tab.schema_id) = '{object.schema}' and tab.name = '{object.name}' group by schema_name(tab.schema_id) + '.' + tab.name order by sum(spc.used_pages) desc;"

        size = self.execute_queries(query_get_table_size).fetchall()[0][0]

        return size

    def create_checksums(self, object: DatabaseObject, column_intersections: list) -> List[Dict]:
        if self.azure_connection is None:
            self._connect_to_azure()

        columns_datatype = self.get_data_types_from_object(object, column_intersections)

        aggregates = ""
        count_nulls = ""

        for column in column_intersections:
            column_datatype = next(x for x in columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
            column_datatype = column_datatype.split('(')[0]

            count_nulls += f", sum(case when {column} is null then 1 else 0 end) as countnulls_{column}"

            if column_datatype.lower() in ('tinyint', 'smallint', 'int', 'bigint', 'decimal', 'numeric', 'smallmoney', 'money', 'float', 'real'):
                aggregates += f", sum({column}) as sum_{column}"

            elif column_datatype.lower() in ('char', 'varchar', 'text', 'nchar', 'nvarchar', 'ntext', 'binary', 'varbinary', 'datetime', 'datetime2', 'smalldatetime', 'date', 'time', 'datetimeoffset', 'timestamp'):
                aggregates += f", count(distinct lower({column})) as countdistinct_{column}"

            elif column_datatype.lower() == 'bit':
                aggregates += f", (SELECT CONCAT ((select count(*) as val FROM {object.schema}.{object.name} WHERE {column} = 1),'_',(select count(*) as val from {object.schema}.{object.name} WHERE {column} = 0))) AS aggregateboolean_{column}"

            # else: additional data types are not aggregated: image, sql_variant, uniqueidentifier, xml, cursor, table

        query_checksums = f"select {aggregates[1:]} from {object.schema}.{object.name};"
        query_countnulls = f"select {count_nulls[1:]} from {object.schema}.{object.name};"

        aggregation_cursor = self.execute_queries(query_checksums)
        aggregation_columns = [column[0].upper() for column in aggregation_cursor.description]
        aggregation_results = aggregation_cursor.fetchall()[0]

        countnulls_cursor = self.execute_queries(query_countnulls)
        countnulls_results = countnulls_cursor.fetchall()[0]

        # The alias prefix (e.g. SUM_, COUNTDISTINCT_) encodes the aggregation type.
        aggregation_types = [item.split("_", 1)[0] for item in aggregation_columns]

        checksum_values = []
        for i in range(len(aggregation_results)):
            agg_result = aggregation_results[i] if aggregation_results[i] is not None else 0
            cnt_result = countnulls_results[i] if countnulls_results[i] is not None else 0
            checksum_values.append([aggregation_types[i], agg_result, cnt_result])

        # Map each column name to [aggregation type, aggregate value, null count].
        checksums = dict(zip([item.split("_", 1)[1] for item in aggregation_columns], checksum_values))

        return checksums
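
    # Illustrative result for a numeric column AMOUNT and a text column NAME
    # (column names and values hypothetical):
    #   {"AMOUNT": ["SUM", 12345.67, 0], "NAME": ["COUNTDISTINCT", 42, 3]}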

    def create_pandas_df_from_group_by(self, object: DatabaseObject, column_intersections: list, group_by_column: str) -> pd.DataFrame:
        if self.azure_connection is None:
            self._connect_to_azure()

        aggregation_columns = [f"{column.upper()}" for column in column_intersections if column != group_by_column]

        columns_datatype = self.get_data_types_from_object(object, aggregation_columns)

        aggregates = ""

        for column in column_intersections:
            column_datatype = next(x for x in columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
            column_datatype = column_datatype.split('(')[0]

            if column_datatype.lower() in ('tinyint', 'smallint', 'int', 'bigint', 'decimal', 'numeric', 'smallmoney', 'money', 'float', 'real'):
                aggregates += f", sum({column}) as sum_{column}"

            elif column_datatype.lower() in ('char', 'varchar', 'text', 'nchar', 'nvarchar', 'ntext', 'binary', 'varbinary', 'datetime', 'datetime2', 'smalldatetime', 'date', 'time', 'datetimeoffset', 'timestamp'):
                aggregates += f", count(distinct lower({column})) as countdistinct_{column}"

            elif column_datatype.lower() == 'bit':
                aggregates += f", (SELECT CONCAT ((select count(*) as val FROM {object.schema}.{object.name} WHERE {column} = 1),'_',(select count(*) as val from {object.schema}.{object.name} WHERE {column} = 0))) AS aggregateboolean_{column}"

            # else: additional data types are not aggregated: image, sql_variant, uniqueidentifier, xml, cursor, table

        query_group_by_aggregation = f"select {group_by_column}, count(*) as COUNT_OF_GROUP_BY_VALUE, {aggregates[1:]} from {object.schema}.{object.name} group by {group_by_column};"

        group_by_aggregation_pdf = self.execute_queries(query_group_by_aggregation, True)

        return group_by_aggregation_pdf
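
    # Illustrative generated query for group_by_column REGION with one numeric
    # column AMOUNT (names hypothetical):
    #   select REGION, count(*) as COUNT_OF_GROUP_BY_VALUE, sum(AMOUNT) as sum_AMOUNT
    #   from S.T group by REGION;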

    def create_pandas_df(self, object: DatabaseObject, intersection_columns_trgt_src: list, where_clause: str = "", exclude_columns: list = None) -> pd.DataFrame:
        if self.azure_connection is None:
            self._connect_to_azure()

        # Avoid a mutable default argument; None stands in for an empty exclude list.
        exclude_columns = exclude_columns or []

        intersection_columns_trgt_src_ = ', '.join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))

        df_query = f"select {intersection_columns_trgt_src_} from {object.schema}.{object.name} {where_clause};"

        pdf = self.execute_queries(df_query, True)

        return pdf

    def execute_queries(self, query: Union[str, List[str]], return_as_pdf: bool = False) -> Union[List[Dict], List[List[Dict]]]:
        if self.azure_connection is None:
            self._connect_to_azure()

        query_list: List[str] = query if isinstance(query, list) else [query]

        results = []

        try:
            for single_query in query_list:
                if return_as_pdf:
                    query_result = pandas.io.sql.read_sql(single_query, self.azure_connection)
                else:
                    query_result = self.azure_connection.execute(single_query)

                results.append(query_result)

        except Exception as err:
            # Wrap the failure with the composed error message and the failing statement.
            raise Exception(self._get_error_message(err, single_query)) from err

        return results[0] if not isinstance(query, list) else results
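
    # Note: a single query string returns a single result (a pyodbc cursor, or a
    # pandas DataFrame when return_as_pdf=True); a list of queries returns a list
    # of such results, in order.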

    def execute_statement(self, statement: Union[str, List[str]]) -> None:
        """
        Executes a simple statement against Azure.

        Args:
            statement (Union[str, List[str]]): a SQL statement or a list of SQL statements to execute
        """
        if self.azure_connection is None:
            self._connect_to_azure()

        statement_list: List[str] = (
            statement if isinstance(statement, list) else [statement]
        )

        try:
            for single_statement in statement_list:
                stripped_statement = single_statement.strip()
                _ = self.azure_connection.execute(stripped_statement)

        except Exception as err:
            raise Exception(self._get_error_message(err, single_statement)) from err
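
For orientation, a minimal usage sketch of the service above; the connection parameters and object names are hypothetical, and the dictionary keys mirror the ones read in _connect_to_azure:

    from icsDataValidation.services.database_services.azure_service import AzureService

    connection_params = {
        "Driver": "{ODBC Driver 17 for SQL Server}",
        "Server": "myserver.database.windows.net",
        "Database": "mydb",
        "User": "user",
        "Password": "***",
    }

    # The context manager closes the pyodbc connection on exit.
    with AzureService(connection_params) as azure_service:
        objects = azure_service.get_database_objects("MYDB", schema="dbo")
        print(objects)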