icsDataValidation-1.0.358-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. icsDataValidation/configuration.py +19 -0
  2. icsDataValidation/connection_setups/__init__.py +0 -0
  3. icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
  4. icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
  5. icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
  6. icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
  7. icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
  8. icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
  9. icsDataValidation/core/__init__.py +0 -0
  10. icsDataValidation/core/database_objects.py +18 -0
  11. icsDataValidation/core/object_comparison.py +239 -0
  12. icsDataValidation/input_parameters/__init__.py +0 -0
  13. icsDataValidation/input_parameters/testing_tool_params.py +81 -0
  14. icsDataValidation/main.py +250 -0
  15. icsDataValidation/output_parameters/__init__.py +0 -0
  16. icsDataValidation/output_parameters/result_params.py +94 -0
  17. icsDataValidation/services/__init__.py +0 -0
  18. icsDataValidation/services/comparison_service.py +582 -0
  19. icsDataValidation/services/database_services/__init__.py +0 -0
  20. icsDataValidation/services/database_services/azure_service.py +320 -0
  21. icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
  22. icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
  23. icsDataValidation/services/database_services/exasol_service.py +261 -0
  24. icsDataValidation/services/database_services/oracle_service.py +713 -0
  25. icsDataValidation/services/database_services/snowflake_service.py +1100 -0
  26. icsDataValidation/services/database_services/teradata_service.py +665 -0
  27. icsDataValidation/services/initialization_service.py +103 -0
  28. icsDataValidation/services/result_service.py +573 -0
  29. icsDataValidation/services/system_service.py +61 -0
  30. icsDataValidation/services/testset_service.py +257 -0
  31. icsDataValidation/utils/__init__.py +0 -0
  32. icsDataValidation/utils/file_util.py +96 -0
  33. icsDataValidation/utils/logger_util.py +96 -0
  34. icsDataValidation/utils/pandas_util.py +159 -0
  35. icsDataValidation/utils/parallelization_util.py +52 -0
  36. icsDataValidation/utils/sql_util.py +14 -0
  37. icsDataValidation-1.0.358.dist-info/METADATA +21 -0
  38. icsDataValidation-1.0.358.dist-info/RECORD +40 -0
  39. icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
  40. icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
icsDataValidation/services/database_services/exasol_service.py
@@ -0,0 +1,261 @@
+ import pyexasol as px
+ from typing import Union, List, Dict
+ import pandas as pd
+
+ from icsDataValidation.core.database_objects import DatabaseObject
+
+ #########################################################################################
+ #########################################################################################
+
+ class ExasolService(object):
+     def __init__(self, connection_params: dict):
+         self.connection_params = connection_params
+         self.exasol_connection = None
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exception_type, exception_value, traceback):
+         if self.exasol_connection is not None:
+             self.exasol_connection.close()
+
+     def __del__(self):
+         if self.exasol_connection is not None:
+             self.exasol_connection.close()
+
+     def _connect_to_exasol(self):
+         self.exasol_connection = px.connect(**self.connection_params, fetch_dict=True)
+         return self.exasol_connection
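+     # Note (assumption): pyexasol's connect() takes keyword arguments such as dsn, user,
+     # password, and schema; fetch_dict=True makes every fetch return rows as dicts,
+     # which the helper methods below rely on.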
+
+     #@staticmethod
+     #def _get_error_message(exception: Exception, statement: str) -> None:
+     #    """
+     #    Compose error message if the execution of a statement or query fails.
+     #    """
+     #    return
+
+     def get_database_objects(self, database: str, schema: str = None, object_type_restriction: str = 'include_all') -> list:
+         if self.exasol_connection is None:
+             self._connect_to_exasol()
+
+         all_database_tables = []
+         all_database_views = []
+
+         if object_type_restriction == 'include_all' or object_type_restriction == 'include_only_tables':
+             if schema:
+                 query_db_tables = f"select * from EXA_ALL_OBJECTS where root_name='{schema}' and object_type='TABLE';"
+             else:
+                 query_db_tables = "select * from EXA_ALL_OBJECTS where object_type='TABLE';"
+
+             all_database_tables = self.execute_queries(query_db_tables)
+
+         # a second "if" (not "elif") so that 'include_all' collects both tables and views
+         if object_type_restriction == 'include_all' or object_type_restriction == 'include_only_views':
+             if schema:
+                 query_db_views = f"select * from EXA_ALL_OBJECTS where root_name='{schema}' and object_type='VIEW';"
+             else:
+                 query_db_views = "select * from EXA_ALL_OBJECTS where object_type='VIEW';"
+
+             all_database_views = self.execute_queries(query_db_views)
+
+         database_objects = []
+         for row in all_database_tables:
+             table_identifier = f'{database.upper()}.{row["ROOT_NAME"]}.{row["OBJECT_NAME"]}'
+             database_objects.append({"object_identifier": table_identifier, "object_type": "table"})
+         for row in all_database_views:
+             view_identifier = f'{database.upper()}.{row["ROOT_NAME"]}.{row["OBJECT_NAME"]}'
+             database_objects.append({"object_identifier": view_identifier, "object_type": "view"})
+         return database_objects
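+
+     # Usage sketch (hypothetical database and schema names):
+     #   service.get_database_objects("STAGE_DB", schema="SALES", object_type_restriction="include_only_tables")
+     #   -> [{"object_identifier": "STAGE_DB.SALES.ORDERS", "object_type": "table"}, ...]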
+
+     def get_columns_from_object(self, object: DatabaseObject) -> list:
+
+         if self.exasol_connection is None:
+             self._connect_to_exasol()
+
+         # select from the system table and filter on the current table to get the column names
+         queries_get_columns = [f"SELECT COLUMN_NAME FROM EXA_ALL_COLUMNS WHERE COLUMN_TABLE = '{object.name}';"]
+
+         all_columns = self.execute_queries(queries_get_columns)[0]
+
+         columns = []
+         for row in all_columns:
+             columns.append(row["COLUMN_NAME"])
+
+         return columns
+
+     def get_row_count_from_table(self, object: DatabaseObject, where_clause: str = "") -> int:
+
+         if self.exasol_connection is None:
+             self._connect_to_exasol()
+
+         query_get_row_count = f"select count(*) as ROW_COUNT from {object.schema}.{object.name} {where_clause};"
+
+         row_count = self.execute_queries(query_get_row_count).fetchall()[0]["ROW_COUNT"]
+
+         return row_count
+
+     def get_data_types_from_object(self, object: DatabaseObject, column_intersections: list) -> List[Dict]:
+
+         if self.exasol_connection is None:
+             self._connect_to_exasol()
+
+         # str() renders the list as "['COL_A', 'COL_B']"; stripping the brackets yields a quoted CSV for the IN clause
+         column_intersections = str(column_intersections)[1:-1]
+         query_get_data_types_from_table = f"select COLUMN_NAME, COLUMN_TYPE from EXA_ALL_COLUMNS where column_table='{object.name}' and column_schema='{object.schema}' and COLUMN_NAME in ({column_intersections});"
+         dict_columns_datatype = self.execute_queries(query_get_data_types_from_table).fetchall()
+         return dict_columns_datatype
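+
+     # Illustrative result shape (rows come back as dicts because of fetch_dict=True):
+     #   [{"COLUMN_NAME": "ID", "COLUMN_TYPE": "DECIMAL(18,0)"}, {"COLUMN_NAME": "NAME", "COLUMN_TYPE": "VARCHAR(100) UTF8"}]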
+
+     def get_count_distincts_from_object(self, object: DatabaseObject, column_intersections: list, where_clause: str = "") -> List[Dict]:
+
+         if self.exasol_connection is None:
+             self._connect_to_exasol()
+
+         # one SELECT per column, combined via UNION; the literal column name keeps the rows distinct
+         union_selects = [
+             f"SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.schema}.{object.name} {where_clause}"
+             for column in column_intersections
+         ]
+         query_get_count_distincts_from_object = f"{' UNION '.join(union_selects)} ORDER BY COUNT_DISTINCT;"
+         dict_count_distincts = self.execute_queries(query_get_count_distincts_from_object).fetchall()
+         return dict_count_distincts
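+
+     # Generated SQL (illustrative, columns A and B, empty where clause):
+     #   SELECT 'A' AS COLUMN_NAME, COUNT(DISTINCT A) AS COUNT_DISTINCT FROM S.T
+     #   UNION SELECT 'B' AS COLUMN_NAME, COUNT(DISTINCT B) AS COUNT_DISTINCT FROM S.T ORDER BY COUNT_DISTINCT;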
+
+     def create_checksums(self, object: DatabaseObject, column_intersections: list, where_clause: str = "") -> Dict[str, list]:
+
+         if self.exasol_connection is None:
+             self._connect_to_exasol()
+
+         column_intersections = [f"{x.upper()}" for x in column_intersections]
+
+         dict_columns_datatype = self.get_data_types_from_object(object, column_intersections)
+
+         aggregates = ""
+         count_nulls = ""
+
+         for column in column_intersections:
+             column_datatype = next(x for x in dict_columns_datatype if x["COLUMN_NAME"] == column)["COLUMN_TYPE"]
+             column_datatype = column_datatype.split('(')[0]
+
+             count_nulls += f", sum(case when {column} is null then 1 else 0 end) countnulls_{column}"
+
+             if column_datatype.lower() in ('decimal', 'double'):
+                 aggregates += f", sum({column}) as sum_{column}"
+             elif column_datatype.lower() in ('char', 'varchar', 'date', 'timestamp'):
+                 aggregates += f", count(distinct lower({column})) as countdistinct_{column}"
+             elif column_datatype.lower() == 'boolean':
+                 # concatenate the true count and the false count, e.g. '120_35'
+                 aggregates += f", to_char(sum(case when {column} then 1 else 0 end)) || '_' || to_char(sum(case when not {column} then 1 else 0 end)) as aggregateboolean_{column}"
+             # else: additional data types (e.g. GEOMETRY, INTERVAL, HASHTYPE) are not aggregated
+
+         query_checksums = f"select {aggregates[1:]} from {object.schema}.{object.name} {where_clause};"
+
+         query_countnulls = f"select {count_nulls[1:]} from {object.schema}.{object.name} {where_clause};"
+
+         aggregation_cursor = self.execute_queries(query_checksums)
+
+         aggregation_columns = [name.upper() for name in aggregation_cursor.column_names()]
+
+         # fetch_dict=True returns each row as a dict, so take the values in column order
+         aggregation_results = list(aggregation_cursor.fetchall()[0].values())
+
+         countnulls_cursor = self.execute_queries(query_countnulls)
+
+         countnulls_results = list(countnulls_cursor.fetchall()[0].values())
+
+         test_list = []
+
+         # note: assumes every intersecting column has one of the handled data types,
+         # so that the aggregate and null-count lists stay aligned
+         for i in range(0, len(aggregation_results)):
+
+             if aggregation_results[i] is None:
+                 agg_result = 0
+             else:
+                 agg_result = aggregation_results[i]
+
+             if countnulls_results[i] is None:
+                 cnt_result = 0
+             else:
+                 cnt_result = countnulls_results[i]
+
+             # 'sum_COL' -> aggregation type 'sum', paired with the aggregate value and the null count
+             test_list.append([aggregation_columns[i].split("_", 1)[0], agg_result, cnt_result])
+
+         # key the result by column name ('sum_COL' -> 'COL')
+         checksums = dict(zip([item.split("_", 1)[1] for item in aggregation_columns], test_list))
+
+         return checksums
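+
+     # Resulting structure (illustrative):
+     #   {"PRICE": ["sum", 1234.56, 0], "NAME": ["countdistinct", 87, 2]}
+     # i.e. column name -> [aggregation type, aggregate value, number of NULLs].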
+
+
+     def create_pandas_df_from_group_by(self, object: DatabaseObject, object_type: str, column_intersections: list, group_by_column: str, where_clause: str = "") -> pd.DataFrame:
+
+         if self.exasol_connection is None:
+             self._connect_to_exasol()
+
+         # aggregate every intersecting column except the one that is grouped by
+         aggregation_columns = [f"{column.upper()}" for column in column_intersections if column != group_by_column]
+
+         dict_columns_datatype = self.get_data_types_from_object(object, aggregation_columns)
+
+         aggregates = ""
+
+         for column in aggregation_columns:
+             column_datatype = next(x for x in dict_columns_datatype if x["COLUMN_NAME"] == column)["COLUMN_TYPE"]
+             column_datatype = column_datatype.split('(')[0]
+
+             if column_datatype.lower() in ('decimal', 'double'):
+                 aggregates += f", sum({column}) as sum_{column}"
+             elif column_datatype.lower() in ('char', 'varchar', 'date', 'timestamp'):
+                 aggregates += f", count(distinct lower({column})) as countdistinct_{column}"
+             elif column_datatype.lower() == 'boolean':
+                 aggregates += f", to_char(sum(case when {column} then 1 else 0 end)) || '_' || to_char(sum(case when not {column} then 1 else 0 end)) as aggregateboolean_{column}"
+             # else: additional data types (e.g. GEOMETRY, INTERVAL, HASHTYPE) are not aggregated
+
+         query_group_by_aggregation = f"select {group_by_column}, count(*) as COUNT_OF_GROUP_BY_VALUE{aggregates} from {object.schema}.{object.name} {where_clause} group by {group_by_column};"
+
+         group_by_aggregation_pdf = self.execute_queries(query_group_by_aggregation, True)
+
+         return group_by_aggregation_pdf
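+
+     # The resulting DataFrame has one row per distinct group_by value: the group column,
+     # COUNT_OF_GROUP_BY_VALUE, and one aggregate column per remaining intersecting column.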
+
+
+     def create_pandas_df(self, object: DatabaseObject, intersection_columns_trgt_src: list, where_clause: str = "", exclude_columns: list = None) -> pd.DataFrame:
+         if self.exasol_connection is None:
+             self._connect_to_exasol()
+
+         # avoid a mutable default argument; fall back to an empty exclusion list
+         exclude_columns = exclude_columns if exclude_columns is not None else []
+
+         intersection_columns_trgt_src_ = ', '.join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
+
+         df_query = f"select {intersection_columns_trgt_src_} from {object.schema}.{object.name} {where_clause};"
+
+         pdf = self.execute_queries(df_query, True)
+
+         return pdf
+
+
+     def execute_queries(self, query: Union[str, List[str]], return_as_pdf: bool = False) -> Union[List[Dict], List[List[Dict]]]:
+         if self.exasol_connection is None:
+             self._connect_to_exasol()
+
+         query_list: List[str] = query if isinstance(query, list) else [query]
+
+         results = []
+
+         try:
+             for single_query in query_list:
+                 if return_as_pdf:
+                     query_result = self.exasol_connection.export_to_pandas(single_query)
+                 else:
+                     query_result = self.exasol_connection.execute(single_query)
+
+                 results.append(query_result)
+
+         except Exception as err:
+             raise Exception(f"Failed to execute statement: {single_query}") from err
+
+         # a single query in yields a single result; a list of queries yields a list of results
+         return results[0] if not isinstance(query, list) else results
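
A minimal usage sketch (hypothetical DSN and credentials; ExasolService as added above):

    from icsDataValidation.services.database_services.exasol_service import ExasolService

    connection_params = {"dsn": "exasol.example.com:8563", "user": "validator", "password": "***"}
    with ExasolService(connection_params) as service:
        for obj in service.get_database_objects("STAGE_DB", schema="SALES"):
            print(obj["object_identifier"], obj["object_type"])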