icsDataValidation-1.0.358-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. icsDataValidation/configuration.py +19 -0
  2. icsDataValidation/connection_setups/__init__.py +0 -0
  3. icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
  4. icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
  5. icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
  6. icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
  7. icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
  8. icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
  9. icsDataValidation/core/__init__.py +0 -0
  10. icsDataValidation/core/database_objects.py +18 -0
  11. icsDataValidation/core/object_comparison.py +239 -0
  12. icsDataValidation/input_parameters/__init__.py +0 -0
  13. icsDataValidation/input_parameters/testing_tool_params.py +81 -0
  14. icsDataValidation/main.py +250 -0
  15. icsDataValidation/output_parameters/__init__.py +0 -0
  16. icsDataValidation/output_parameters/result_params.py +94 -0
  17. icsDataValidation/services/__init__.py +0 -0
  18. icsDataValidation/services/comparison_service.py +582 -0
  19. icsDataValidation/services/database_services/__init__.py +0 -0
  20. icsDataValidation/services/database_services/azure_service.py +320 -0
  21. icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
  22. icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
  23. icsDataValidation/services/database_services/exasol_service.py +261 -0
  24. icsDataValidation/services/database_services/oracle_service.py +713 -0
  25. icsDataValidation/services/database_services/snowflake_service.py +1100 -0
  26. icsDataValidation/services/database_services/teradata_service.py +665 -0
  27. icsDataValidation/services/initialization_service.py +103 -0
  28. icsDataValidation/services/result_service.py +573 -0
  29. icsDataValidation/services/system_service.py +61 -0
  30. icsDataValidation/services/testset_service.py +257 -0
  31. icsDataValidation/utils/__init__.py +0 -0
  32. icsDataValidation/utils/file_util.py +96 -0
  33. icsDataValidation/utils/logger_util.py +96 -0
  34. icsDataValidation/utils/pandas_util.py +159 -0
  35. icsDataValidation/utils/parallelization_util.py +52 -0
  36. icsDataValidation/utils/sql_util.py +14 -0
  37. icsDataValidation-1.0.358.dist-info/METADATA +21 -0
  38. icsDataValidation-1.0.358.dist-info/RECORD +40 -0
  39. icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
  40. icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
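The single hunk shown below corresponds to file 20 in this list, icsDataValidation/services/database_services/azure_service.py, added in full (+320 lines).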
@@ -0,0 +1,320 @@
+ import pyodbc
+ import pandas.io.sql
+ import pandas as pd
+ import logging
+ 
+ from typing import Union, List, Dict
+ 
+ from icsDataValidation.utils.logger_util import configure_dev_ops_logger
+ from icsDataValidation.core.database_objects import DatabaseObject
+ 
+ #########################################################################################
+ #########################################################################################
+ 
+ # Configure Dev Ops Logger
+ logger = logging.getLogger('Azure_Service')
+ logger.setLevel(logging.INFO)
+ configure_dev_ops_logger(logger)
+ 
+ class AzureService(object):
+     def __init__(self, connection_params: dict):
+         self.connection_params = connection_params
+         self.azure_connection = None
+ 
+     def __enter__(self):
+         return self
+ 
+     def __exit__(self, exception_type, exception_value, traceback):
+         if self.azure_connection is not None:
+             self.azure_connection.close()
+ 
+     def __del__(self):
+         if self.azure_connection is not None:
+             self.azure_connection.close()
+ 
+     def _connect_to_azure(self):
+         # 'DRIVER='+driver+';SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+password
+         azure_connection_string = f"DRIVER={self.connection_params['Driver']};SERVER={self.connection_params['Server']};PORT=1433;DATABASE={self.connection_params['Database']};UID={self.connection_params['User']};PWD={self.connection_params['Password']}"
+         self.azure_connection = pyodbc.connect(azure_connection_string)
+         return self.azure_connection
+ 
+     @staticmethod
+     def _get_error_message(exception: Exception, statement: str) -> str:
+         """
+         Compose an error message if the execution of a statement or query fails.
+         """
+         if hasattr(exception, "raw_msg"):
+             message = exception.raw_msg.replace("\n", " ")
+         else:
+             # Ensures all kinds of errors yield a message, even without a raw_msg attribute.
+             message = str(exception)
+         if hasattr(exception, "sfqid"):
+             message = message + f"\nQuery ID: {exception.sfqid}"
+         return f"Azure ERROR: {message}\nFailed statement:\n{statement}"
+ 
+     def get_database_objects(self, database: str, schema: str = None, object_type_restriction: str = 'include_all') -> list:
+         if self.azure_connection is None:
+             self._connect_to_azure()
+ 
+         all_database_tables = []
+         all_database_views = []
+ 
+         if object_type_restriction in ('include_all', 'include_only_tables'):
+             if schema:
+                 query_db_tables = f"select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.tables t where schema_name(t.schema_id) = '{schema}' order by schema_name;"
+             else:
+                 query_db_tables = "select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.tables t order by schema_name;"
+ 
+             all_database_tables = self.execute_queries(query_db_tables)
+ 
+         # A separate if (not elif) so that 'include_all' collects both tables and views.
+         if object_type_restriction in ('include_all', 'include_only_views'):
+             if schema:
+                 query_db_views = f"select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.views t where schema_name(t.schema_id) = '{schema}' order by schema_name;"
+             else:
+                 query_db_views = "select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.views t order by schema_name;"
+ 
+             all_database_views = self.execute_queries(query_db_views)
+ 
+         database_objects = []
+         for row in all_database_tables:
+             database_table = f'{database}.{row[0].upper()}.{row[1].upper()}'
+             database_objects.append({"object_identifier": database_table, "object_type": "table"})
+         for row in all_database_views:
+             database_view = f'{database}.{row[0].upper()}.{row[1].upper()}'
+             database_objects.append({"object_identifier": database_view, "object_type": "view"})
+         return database_objects
+ 
+     def get_columns_from_object(self, object: DatabaseObject) -> list:
+         if self.azure_connection is None:
+             self._connect_to_azure()
+ 
+         queries_get_columns = [f"select col.name from sys.tables as tab inner join sys.columns as col on (tab.object_id = col.object_id and upper(tab.name) = '{object.name.upper()}') inner join (select object_id, schema_id from sys.objects) as obj on (tab.object_id = obj.object_id and schema_name(obj.schema_id) = '{object.schema.upper()}');"]
+ 
+         all_columns = self.execute_queries(queries_get_columns)[0]
+ 
+         columns = []
+         for row in all_columns:
+             columns.append(row[0])
+ 
+         return columns
+ 
+     def get_row_count_from_object(self, object: DatabaseObject) -> int:
+         if self.azure_connection is None:
+             self._connect_to_azure()
+ 
+         query_get_row_count = f"select count(*) as ROW_COUNT from {object.schema}.{object.name};"
+ 
+         row_count = self.execute_queries(query_get_row_count).fetchall()[0][0]
+ 
+         return row_count
+ 
+     def get_data_types_from_object(self, object: DatabaseObject, column_intersections: list) -> list:
+         results = []
+ 
+         if self.azure_connection is None:
+             self._connect_to_azure()
+ 
+         # Note: the query returns the data types of all columns of the object;
+         # column_intersections is not used to filter it.
+         query_get_data_types_from_object = f"select col.name, t.name as data_type from sys.tables as tab inner join sys.columns as col on tab.object_id = col.object_id left join sys.types as t on col.user_type_id = t.user_type_id where tab.name = '{object.name}' and schema_name(tab.schema_id) = '{object.schema}'"
+         columns_datatype = self.execute_queries(query_get_data_types_from_object).fetchall()
+ 
+         for row in columns_datatype:
+             row_to_list = [elem for elem in row]
+             results.append({"COLUMN_NAME": row_to_list[0], "DATA_TYPE": row_to_list[1]})
+ 
+         return results
+ 
+     def get_count_distincts_from_object(self, object: DatabaseObject, column_intersections: list) -> list:
+         if self.azure_connection is None:
+             self._connect_to_azure()
+ 
+         unions = ""
+         for column in column_intersections:
+             unions += f" UNION SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.schema}.{object.name}"
+ 
+         # Strip the leading ' UNION ' before the first SELECT.
+         query_get_count_distincts_from_object = f"{unions[7:]} ORDER BY COUNT_DISTINCT;"
+         count_distincts = self.execute_queries(query_get_count_distincts_from_object).fetchall()
+ 
+         return count_distincts
+ 
+     def get_table_size(self, object: DatabaseObject) -> int:
+         query_get_table_size = f"select cast(sum(spc.used_pages * 8)/1024.00 * 1000000 as integer) as BYTES from sys.tables tab inner join sys.indexes ind on tab.object_id = ind.object_id inner join sys.partitions part on ind.object_id = part.object_id and ind.index_id = part.index_id inner join sys.allocation_units spc on part.partition_id = spc.container_id where schema_name(tab.schema_id) = '{object.schema}' and tab.name = '{object.name}' group by schema_name(tab.schema_id) + '.' + tab.name order by sum(spc.used_pages) desc;"
+ 
+         size = self.execute_queries(query_get_table_size).fetchall()[0][0]
+ 
+         return size
+ 
+     def create_checksums(self, object: DatabaseObject, column_intersections: list) -> Dict[str, list]:
+         if self.azure_connection is None:
+             self._connect_to_azure()
+ 
+         columns_datatype = self.get_data_types_from_object(object, column_intersections)
+ 
+         aggregates = ""
+         count_nulls = ""
+ 
+         for column in column_intersections:
+             column_datatype = next(x for x in columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+             column_datatype = column_datatype.split('(')[0]
+ 
+             count_nulls += f", sum(case when {column} is null then 1 else 0 end) as countnulls_{column}"
+ 
+             if column_datatype.lower() in ('tinyint', 'smallint', 'int', 'bigint', 'decimal', 'numeric', 'smallmoney', 'money', 'float', 'real'):
+                 aggregates += f", sum({column}) as sum_{column}"
+             elif column_datatype.lower() in ('char', 'varchar', 'text', 'nchar', 'nvarchar', 'ntext', 'binary', 'varbinary', 'datetime', 'datetime2', 'smalldatetime', 'date', 'time', 'datetimeoffset', 'timestamp'):
+                 aggregates += f", count(distinct lower({column})) as countdistinct_{column}"
+             elif column_datatype.lower() == 'bit':
+                 aggregates += f", (SELECT CONCAT ((select count(*) as val FROM {object.schema}.{object.name} WHERE {column} = 1),'_',(select count(*) as val from {object.schema}.{object.name} WHERE {column} = 0))) AS aggregateboolean_{column}"
+             # else: additional data types (image, sql_variant, uniqueidentifier, xml, cursor, table) are not aggregated
+ 
+         query_checksums = f"select {aggregates[1:]} from {object.schema}.{object.name};"
+         query_countnulls = f"select {count_nulls[1:]} from {object.schema}.{object.name};"
+ 
+         aggregation_cursor = self.execute_queries(query_checksums)
+         aggregation_columns = [column[0].upper() for column in aggregation_cursor.description]
+         aggregation_results = aggregation_cursor.fetchall()[0]
+ 
+         countnulls_cursor = self.execute_queries(query_countnulls)
+         countnulls_results = countnulls_cursor.fetchall()[0]
+ 
+         # For each column, collect [aggregation_type, aggregation_result, null_count],
+         # treating NULL aggregates and null counts as 0.
+         aggregation_types = [item.split("_", 1)[0] for item in aggregation_columns]
+         checksum_values = []
+         for i in range(len(aggregation_results)):
+             agg_result = aggregation_results[i] if aggregation_results[i] is not None else 0
+             cnt_result = countnulls_results[i] if countnulls_results[i] is not None else 0
+             checksum_values.append([aggregation_types[i], agg_result, cnt_result])
+ 
+         checksums = dict(zip([item.split("_", 1)[1] for item in aggregation_columns], checksum_values))
+ 
+         return checksums
+ 
+     def create_pandas_df_from_group_by(self, object: DatabaseObject, column_intersections: list, group_by_column: str) -> pd.DataFrame:
+         if self.azure_connection is None:
+             self._connect_to_azure()
+ 
+         aggregation_columns = [f"{column.upper()}" for column in column_intersections if column != group_by_column]
+ 
+         columns_datatype = self.get_data_types_from_object(object, aggregation_columns)
+ 
+         aggregates = ""
+ 
+         for column in column_intersections:
+             column_datatype = next(x for x in columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+             column_datatype = column_datatype.split('(')[0]
+ 
+             if column_datatype.lower() in ('tinyint', 'smallint', 'int', 'bigint', 'decimal', 'numeric', 'smallmoney', 'money', 'float', 'real'):
+                 aggregates += f", sum({column}) as sum_{column}"
+             elif column_datatype.lower() in ('char', 'varchar', 'text', 'nchar', 'nvarchar', 'ntext', 'binary', 'varbinary', 'datetime', 'datetime2', 'smalldatetime', 'date', 'time', 'datetimeoffset', 'timestamp'):
+                 aggregates += f", count(distinct lower({column})) as countdistinct_{column}"
+             elif column_datatype.lower() == 'bit':
+                 aggregates += f", (SELECT CONCAT ((select count(*) as val FROM {object.schema}.{object.name} WHERE {column} = 1),'_',(select count(*) as val from {object.schema}.{object.name} WHERE {column} = 0))) AS aggregateboolean_{column}"
+             # else: additional data types (image, sql_variant, uniqueidentifier, xml, cursor, table) are not aggregated
+ 
+         query_group_by_aggregation = f"select {group_by_column}, count(*) as COUNT_OF_GROUP_BY_VALUE, {aggregates[1:]} from {object.schema}.{object.name} group by {group_by_column};"
+ 
+         group_by_aggregation_pdf = self.execute_queries(query_group_by_aggregation, True)
+ 
+         return group_by_aggregation_pdf
+ 
+     def create_pandas_df(self, object: DatabaseObject, intersection_columns_trgt_src: list, where_clause: str = "", exclude_columns: list = []) -> pd.DataFrame:
+         if self.azure_connection is None:
+             self._connect_to_azure()
+ 
+         intersection_columns_trgt_src_ = ', '.join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
+ 
+         df_query = f"select {intersection_columns_trgt_src_} from {object.schema}.{object.name} {where_clause};"
+ 
+         pdf = self.execute_queries(df_query, True)
+ 
+         return pdf
+ 
+     def execute_queries(self, query: Union[str, List[str]], return_as_pdf: bool = False) -> Union[List[Dict], List[List[Dict]]]:
+         if self.azure_connection is None:
+             self._connect_to_azure()
+ 
+         query_list: List[str] = query if isinstance(query, list) else [query]
+ 
+         results = []
+ 
+         try:
+             for single_query in query_list:
+                 if return_as_pdf:
+                     query_result = pandas.io.sql.read_sql(single_query, self.azure_connection)
+                 else:
+                     query_result = self.azure_connection.execute(single_query)
+ 
+                 results.append(query_result)
+ 
+         except Exception as err:
+             raise Exception(self._get_error_message(err, single_query)) from err
+ 
+         # For a single query, return its result directly; for a list, return the list of results.
+         return results[0] if not isinstance(query, list) else results
+ 
+     def execute_statement(self, statement: Union[str, List[str]]) -> None:
+         """
+         Executes a simple statement (or a list of statements) against Azure SQL.
+         Args:
+             statement Union[str, List[str]] - a sql statement or a list of sql statements to execute
+         """
+         if self.azure_connection is None:
+             self._connect_to_azure()
+ 
+         statement_list: List[str] = (
+             statement if isinstance(statement, list) else [statement]
+         )
+ 
+         try:
+             for single_statement in statement_list:
+                 stripped_statement = single_statement.strip()
+                 _ = self.azure_connection.execute(stripped_statement)
+ 
+         except Exception as err:
+             raise Exception(self._get_error_message(err, single_statement)) from err
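For orientation, here is a minimal usage sketch of the AzureService class shown above; it is not part of the package. The connection parameter keys (Driver, Server, Database, User, Password) are the ones read in _connect_to_azure; all driver, server, database, schema, and credential values below are hypothetical placeholders.

from icsDataValidation.services.database_services.azure_service import AzureService

# Hypothetical connection parameters; the keys match those read in _connect_to_azure.
connection_params = {
    "Driver": "{ODBC Driver 18 for SQL Server}",
    "Server": "example-server.database.windows.net",
    "Database": "EXAMPLE_DB",
    "User": "example_user",
    "Password": "example_password",
}

# AzureService implements __enter__/__exit__, so the pyodbc connection is
# closed automatically when the with-block exits.
with AzureService(connection_params) as azure_service:
    # get_database_objects returns a list of dicts with the keys
    # "object_identifier" (DATABASE.SCHEMA.NAME) and "object_type" (table/view).
    for obj in azure_service.get_database_objects("EXAMPLE_DB", schema="dbo"):
        print(obj["object_identifier"], obj["object_type"])

Note on return shapes: create_checksums returns a dict keyed by upper-cased column name, where each value is a three-element list [aggregation_type, aggregation_result, null_count] and aggregation_type is SUM, COUNTDISTINCT, or AGGREGATEBOOLEAN depending on the column's data type.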