icsDataValidation-1.0.358-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- icsDataValidation/configuration.py +19 -0
- icsDataValidation/connection_setups/__init__.py +0 -0
- icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
- icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
- icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
- icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
- icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
- icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
- icsDataValidation/core/__init__.py +0 -0
- icsDataValidation/core/database_objects.py +18 -0
- icsDataValidation/core/object_comparison.py +239 -0
- icsDataValidation/input_parameters/__init__.py +0 -0
- icsDataValidation/input_parameters/testing_tool_params.py +81 -0
- icsDataValidation/main.py +250 -0
- icsDataValidation/output_parameters/__init__.py +0 -0
- icsDataValidation/output_parameters/result_params.py +94 -0
- icsDataValidation/services/__init__.py +0 -0
- icsDataValidation/services/comparison_service.py +582 -0
- icsDataValidation/services/database_services/__init__.py +0 -0
- icsDataValidation/services/database_services/azure_service.py +320 -0
- icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
- icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
- icsDataValidation/services/database_services/exasol_service.py +261 -0
- icsDataValidation/services/database_services/oracle_service.py +713 -0
- icsDataValidation/services/database_services/snowflake_service.py +1100 -0
- icsDataValidation/services/database_services/teradata_service.py +665 -0
- icsDataValidation/services/initialization_service.py +103 -0
- icsDataValidation/services/result_service.py +573 -0
- icsDataValidation/services/system_service.py +61 -0
- icsDataValidation/services/testset_service.py +257 -0
- icsDataValidation/utils/__init__.py +0 -0
- icsDataValidation/utils/file_util.py +96 -0
- icsDataValidation/utils/logger_util.py +96 -0
- icsDataValidation/utils/pandas_util.py +159 -0
- icsDataValidation/utils/parallelization_util.py +52 -0
- icsDataValidation/utils/sql_util.py +14 -0
- icsDataValidation-1.0.358.dist-info/METADATA +21 -0
- icsDataValidation-1.0.358.dist-info/RECORD +40 -0
- icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
- icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
icsDataValidation/services/database_services/exasol_service.py
@@ -0,0 +1,261 @@
+import pyexasol as px
+from typing import Union, List, Dict
+import pandas as pd
+
+from icsDataValidation.core.database_objects import DatabaseObject
+
+#########################################################################################
+#########################################################################################
+
+class ExasolService(object):
+    def __init__(self, connection_params: dict):
+        self.connection_params = connection_params
+        self.exasol_connection = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exception_type, exception_value, traceback):
+        if self.exasol_connection is not None:
+            self.exasol_connection.close()
+
+    def __del__(self):
+        if self.exasol_connection is not None:
+            self.exasol_connection.close()
+
+    def _connect_to_exasol(self):
+        self.exasol_connection = px.connect(**self.connection_params, fetch_dict=True)
+        return self.exasol_connection
+
+    #@staticmethod
+    #def _get_error_message(exception: Exception, statement: str) -> None:
+    #    """
+    #    Compose error message if the execution of a statement or query fails.
+    #    """
+    #    return
+
+    def get_database_objects(self, database: str, schema: str=None, object_type_restriction: str='include_all') -> list:
+        if self.exasol_connection is None:
+            self._connect_to_exasol()
+
+        all_database_tables = []
+        all_database_views = []
+
+        if object_type_restriction == 'include_all' or object_type_restriction == 'include_only_tables':
+            if schema:
+                query_db_tables = f"select * from EXA_ALL_OBJECTS where root_name='{schema}' and object_type='TABLE';"
+            else:
+                query_db_tables = "select * from EXA_ALL_OBJECTS where object_type='TABLE';"
+
+            all_database_tables = self.execute_queries(query_db_tables)
+
+        # Note: this must not be an elif, otherwise 'include_all' would skip views.
+        if object_type_restriction == 'include_all' or object_type_restriction == 'include_only_views':
+            if schema:
+                query_db_views = f"select * from EXA_ALL_OBJECTS where root_name='{schema}' and object_type='VIEW';"
+            else:
+                query_db_views = "select * from EXA_ALL_OBJECTS where object_type='VIEW';"
+
+            all_database_views = self.execute_queries(query_db_views)
+
+        database_objects = []
+        for row in all_database_tables:
+            table_identifier = f'{database.upper()}.{row["ROOT_NAME"]}.{row["OBJECT_NAME"]}'
+            database_objects.append({"object_identifier": table_identifier, "object_type": "table"})
+        for row in all_database_views:
+            view_identifier = f'{database.upper()}.{row["ROOT_NAME"]}.{row["OBJECT_NAME"]}'
+            database_objects.append({"object_identifier": view_identifier, "object_type": "view"})
+        return database_objects
+
+
+    def get_columns_from_object(self, object: DatabaseObject) -> list:
+
+        if self.exasol_connection is None:
+            self._connect_to_exasol()
+
+        # select system table and filter on current table to get column names
+        queries_get_columns = [f"SELECT COLUMN_NAME FROM EXA_ALL_COLUMNS WHERE COLUMN_TABLE = '{object.name}';"]
+
+        all_columns = self.execute_queries(queries_get_columns)[0]
+
+        columns = []
+
+        for row in all_columns:
+            columns.append(row["COLUMN_NAME"])
+
+        return columns
+
+    def get_row_count_from_table(self, object: DatabaseObject, where_clause: str="") -> int:
+
+        if self.exasol_connection is None:
+            self._connect_to_exasol()
+
+        query_get_row_count = f"select count(*) as ROW_COUNT from {object.schema}.{object.name} {where_clause};"
+
+        row_count = self.execute_queries(query_get_row_count).fetchall()[0]["ROW_COUNT"]
+
+        return row_count
+
+    def get_data_types_from_object(self, object: DatabaseObject, column_intersections: list) -> List[Dict]:
+
+        if self.exasol_connection is None:
+            self._connect_to_exasol()
+
+        column_intersections = str(column_intersections)[1:-1]
+        query_get_data_types_from_table = f"select COLUMN_NAME, COLUMN_TYPE from EXA_ALL_COLUMNS where column_table='{object.name}' AND column_schema = '{object.schema}' and COLUMN_NAME in ({column_intersections});"
+        dict_columns_datatype = self.execute_queries(query_get_data_types_from_table).fetchall()
+        return dict_columns_datatype
+
+    def get_count_distincts_from_object(self, object: DatabaseObject, column_intersections: list, where_clause: str="") -> List[Dict]:
+
+        if self.exasol_connection is None:
+            self._connect_to_exasol()
+
+        unions = ""
+        for column in column_intersections:
+            unions += f"UNION SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.schema}.{object.name} {where_clause} "
+
+        query_get_count_distincts_from_object = f"{unions[5:]} ORDER BY COUNT_DISTINCT;"
+        dict_count_distincts = self.execute_queries(query_get_count_distincts_from_object).fetchall()
+        return dict_count_distincts
+
+    def create_checksums(self, object: DatabaseObject, column_intersections: list, where_clause: str="") -> Dict:
+
+        if self.exasol_connection is None:
+            self._connect_to_exasol()
+
+        column_intersections = [f"{x.upper()}" for x in column_intersections]
+
+        dict_columns_datatype = self.get_data_types_from_object(object, column_intersections)
+
+        aggregates = ""
+        count_nulls = ""
+
+        for column in column_intersections:
+            column_datatype = next(x for x in dict_columns_datatype if x["COLUMN_NAME"] == column)["COLUMN_TYPE"]
+            column_datatype = column_datatype.split('(')[0]
+
+            count_nulls += f", sum(case when {column} is null then 1 else 0 end) countnulls_{column}"
+
+            if column_datatype.lower() == 'decimal' or column_datatype.lower() == 'double':
+
+                aggregates += f", sum({column}) as sum_{column}"
+
+            elif column_datatype.lower() == 'char' or column_datatype.lower() == 'varchar' or column_datatype.lower() == 'date' or column_datatype.lower() == 'timestamp':
+
+                aggregates += f", count(distinct lower({column})) as countdistinct_{column}"
+
+            elif column_datatype.lower() == 'boolean':
+
+                aggregates += f", max((select count(*) FROM {object.schema}.{object.name} WHERE {column} = true))::varchar || '_' || max((select count(*) FROM {object.schema}.{object.name} WHERE {column} = false))::varchar as aggregateboolean_{column}"
+
+            #else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
+
+        query_checksums = f"select {aggregates[1:]} from {object.schema}.{object.name} {where_clause};"
+
+        query_countnulls = f"select {count_nulls[1:]} from {object.schema}.{object.name} {where_clause};"
+
+        aggregation_cursor = self.execute_queries(query_checksums)
+
+        aggregation_columns = [column[0].upper() for column in aggregation_cursor.description]
+
+        aggregation_results = aggregation_cursor.fetchall()[0]
+
+        countnulls_cursor = self.execute_queries(query_countnulls)
+
+        countnulls_results = countnulls_cursor.fetchall()[0]
+
+        test_list = []
+
+        for i in range(0, len(aggregation_results)):
+
+            if aggregation_results[i] is None:
+                agg_result = 0
+            else:
+                agg_result = aggregation_results[i]
+
+            if countnulls_results[i] is None:
+                cnt_result = 0
+            else:
+                cnt_result = countnulls_results[i]
+
+            test_list.append([[item.split("_", 1)[0] for item in aggregation_columns][i], agg_result, cnt_result])
+
+        checksums = dict(zip([item.split("_", 1)[1] for item in aggregation_columns], test_list))
+
+        return checksums
+
+
+    def create_pandas_df_from_group_by(self, object: DatabaseObject, object_type: str, column_intersections: list, group_by_column: str, where_clause: str="") -> pd.DataFrame:
+
+        if self.exasol_connection is None:
+            self._connect_to_exasol()
+
+        aggregation_columns = [f"{column.upper()}" for column in column_intersections if column != group_by_column]
+
+        dict_columns_datatype = self.get_data_types_from_object(object, aggregation_columns)
+
+        aggregates = ""
+
+        # Iterate over the aggregation columns only; the group-by column has no datatype entry.
+        for column in aggregation_columns:
+            column_datatype = next(x for x in dict_columns_datatype if x["COLUMN_NAME"] == column)["COLUMN_TYPE"]
+            column_datatype = column_datatype.split('(')[0]
+
+            if column_datatype.lower() == 'decimal' or column_datatype.lower() == 'double':
+
+                aggregates += f", sum({column}) as sum_{column}"
+
+            elif column_datatype.lower() == 'char' or column_datatype.lower() == 'varchar' or column_datatype.lower() == 'date' or column_datatype.lower() == 'timestamp':
+
+                aggregates += f", count(distinct lower({column})) as countdistinct_{column}"
+
+            elif column_datatype.lower() == 'boolean':
+
+                aggregates += f", max((select count(*) FROM {object.schema}.{object.name} WHERE {column} = true))::varchar || '_' || max((select count(*) FROM {object.schema}.{object.name} WHERE {column} = false))::varchar as aggregateboolean_{column}"
+
+            #else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
+
+        query_group_by_aggregation = f"select {group_by_column}, count(*) as COUNT_OF_GROUP_BY_VALUE, {aggregates[1:]} from {object.schema}.{object.name} {where_clause} group by {group_by_column};"
+
+        group_by_aggregation_pdf = self.execute_queries(query_group_by_aggregation, True)
+
+        return group_by_aggregation_pdf
+
+
+    def create_pandas_df(self, object: DatabaseObject, intersection_columns_trgt_src: list, where_clause: str="", exclude_columns: list=[]) -> pd.DataFrame:
+        if self.exasol_connection is None:
+            self._connect_to_exasol()
+
+        intersection_columns_trgt_src_ = ', '.join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
+
+        df_query = f"select {intersection_columns_trgt_src_} from {object.schema}.{object.name} {where_clause};"
+
+        pdf = self.execute_queries(df_query, True)
+
+        return pdf
+
+
+    def execute_queries(self, query: Union[str, List[str]], return_as_pdf: bool=False) -> Union[List[Dict], List[List[Dict]]]:
+        if self.exasol_connection is None:
+            self._connect_to_exasol()
+
+        query_list: List[str] = query if isinstance(query, list) else [query]
+
+        results = []
+
+        try:
+            for single_query in query_list:
+                if return_as_pdf:
+                    query_result = self.exasol_connection.export_to_pandas(single_query)
+                else:
+                    query_result = self.exasol_connection.execute(single_query)
+
+                results.append(query_result)
+
+        except Exception as err:
+            raise Exception(f"Failed to execute statement: {single_query}") from err
+
+        return results[0] if not isinstance(query, list) else results
+
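For orientation, here is a minimal usage sketch of the ExasolService class added in this version. The connection parameters are hypothetical placeholders (pyexasol's connect() accepts dsn, user, and password, among others); the method call and returned dictionary keys correspond to get_database_objects as defined in the diff above.

    from icsDataValidation.services.database_services.exasol_service import ExasolService

    # Hypothetical connection parameters for an Exasol instance.
    connection_params = {
        "dsn": "localhost:8563",  # host:port of the Exasol cluster
        "user": "sys",
        "password": "exasol",
    }

    # ExasolService is a context manager; __exit__ closes the connection.
    with ExasolService(connection_params) as exasol_service:
        # The connection is opened lazily on the first query.
        tables = exasol_service.get_database_objects(
            database="MY_DB", object_type_restriction="include_only_tables"
        )
        for obj in tables:
            print(obj["object_identifier"], obj["object_type"])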