icsDataValidation-1.0.358-py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in those registries.
Files changed (40)
  1. icsDataValidation/configuration.py +19 -0
  2. icsDataValidation/connection_setups/__init__.py +0 -0
  3. icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
  4. icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
  5. icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
  6. icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
  7. icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
  8. icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
  9. icsDataValidation/core/__init__.py +0 -0
  10. icsDataValidation/core/database_objects.py +18 -0
  11. icsDataValidation/core/object_comparison.py +239 -0
  12. icsDataValidation/input_parameters/__init__.py +0 -0
  13. icsDataValidation/input_parameters/testing_tool_params.py +81 -0
  14. icsDataValidation/main.py +250 -0
  15. icsDataValidation/output_parameters/__init__.py +0 -0
  16. icsDataValidation/output_parameters/result_params.py +94 -0
  17. icsDataValidation/services/__init__.py +0 -0
  18. icsDataValidation/services/comparison_service.py +582 -0
  19. icsDataValidation/services/database_services/__init__.py +0 -0
  20. icsDataValidation/services/database_services/azure_service.py +320 -0
  21. icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
  22. icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
  23. icsDataValidation/services/database_services/exasol_service.py +261 -0
  24. icsDataValidation/services/database_services/oracle_service.py +713 -0
  25. icsDataValidation/services/database_services/snowflake_service.py +1100 -0
  26. icsDataValidation/services/database_services/teradata_service.py +665 -0
  27. icsDataValidation/services/initialization_service.py +103 -0
  28. icsDataValidation/services/result_service.py +573 -0
  29. icsDataValidation/services/system_service.py +61 -0
  30. icsDataValidation/services/testset_service.py +257 -0
  31. icsDataValidation/utils/__init__.py +0 -0
  32. icsDataValidation/utils/file_util.py +96 -0
  33. icsDataValidation/utils/logger_util.py +96 -0
  34. icsDataValidation/utils/pandas_util.py +159 -0
  35. icsDataValidation/utils/parallelization_util.py +52 -0
  36. icsDataValidation/utils/sql_util.py +14 -0
  37. icsDataValidation-1.0.358.dist-info/METADATA +21 -0
  38. icsDataValidation-1.0.358.dist-info/RECORD +40 -0
  39. icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
  40. icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
icsDataValidation/services/comparison_service.py
@@ -0,0 +1,582 @@
import logging
import pandas as pd
import datetime
import numpy as np

from pandas._testing import assert_frame_equal
from decimal import Decimal

from icsDataValidation.utils.logger_util import configure_dev_ops_logger
from icsDataValidation.utils.pandas_util import get_diff_dataframes, get_diff_dict_from_diff_dataframes
from icsDataValidation.input_parameters.testing_tool_params import TestingToolParams
from icsDataValidation.core.database_objects import DatabaseObject
from icsDataValidation.output_parameters.result_params import ResultParams

#########################################################################################

# Configure Dev Ops Logger

logger = logging.getLogger('ComparisonService')
logger.setLevel(logging.INFO)
configure_dev_ops_logger(logger)

#########################################################################################
#########################################################################################


class ComparisonService(TestingToolParams):
    """
    Class to compare an object between a source and a target system.
    """
    def __init__(self, src_object: DatabaseObject, trgt_object: DatabaseObject, db_service_src, db_service_trgt, src_filter: list, trgt_filter: list, exclude_columns: list, comp_id: int):
        super().__init__()
        self.result_params = ResultParams()
        self.src_object = src_object
        self.trgt_object = trgt_object
        self.db_service_src = db_service_src
        self.db_service_trgt = db_service_trgt
        self.src_filter = src_filter
        self.trgt_filter = trgt_filter
        self.exclude_columns = exclude_columns
        self.comp_id = comp_id

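    # Helper: returns the first candidate column whose distinct-value counts on source and
    # target make it usable as a group-by key (more than one distinct value, fewer distinct
    # values than rows, within the configured min/max bounds, and an expected group-by result
    # size below max_group_by_size). Returns None if no candidate qualifies.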
    def _get_group_by_column_by_validation(self, group_by_column_candidates: list, src_column_count_distincts, trgt_column_count_distincts):
        object_group_by_column = None
        for object_group_by_column in group_by_column_candidates:

            src_group_by_column_count_distinct = next(item["COUNT_DISTINCT"] for item in src_column_count_distincts if item["COLUMN_NAME"].upper() == object_group_by_column)
            trgt_group_by_column_count_distinct = next(item["COUNT_DISTINCT"] for item in trgt_column_count_distincts if item["COLUMN_NAME"].upper() == object_group_by_column)

            if (trgt_group_by_column_count_distinct <= 1 or src_group_by_column_count_distinct <= 1):
                logger.info(f"[{self.comp_id}] The GROUP_BY_COLUMN {object_group_by_column} does not satisfy the necessary criteria.")
                logger.info(f"[{self.comp_id}] Number of distinct values <= 1 on src or trgt.")
                continue
            elif (trgt_group_by_column_count_distinct == self.result_params.trgt_row_count or src_group_by_column_count_distinct == self.result_params.src_row_count):
                logger.info(f"[{self.comp_id}] The GROUP_BY_COLUMN {object_group_by_column} does not satisfy the necessary criteria.")
                logger.info(f"[{self.comp_id}] Number of distinct values equal to rowcount of object on src or trgt.")
                continue
            elif (trgt_group_by_column_count_distinct < self.min_group_by_count_distinct or src_group_by_column_count_distinct < self.min_group_by_count_distinct):
                logger.info(f"[{self.comp_id}] The GROUP_BY_COLUMN {object_group_by_column} does not satisfy the necessary criteria.")
                logger.info(f"[{self.comp_id}] Number of distinct values falls below the min_group_by_count_distinct {self.min_group_by_count_distinct} on src or trgt.")
                continue
            elif (trgt_group_by_column_count_distinct > self.max_group_by_count_distinct or src_group_by_column_count_distinct > self.max_group_by_count_distinct):
                logger.info(f"[{self.comp_id}] The GROUP_BY_COLUMN {object_group_by_column} does not satisfy the necessary criteria.")
                logger.info(f"[{self.comp_id}] Number of distinct values exceeds the max_group_by_count_distinct {self.max_group_by_count_distinct} on src or trgt.")
                continue
            elif (trgt_group_by_column_count_distinct*len(self.result_params.intersection_columns_trgt_src) > self.max_group_by_size or src_group_by_column_count_distinct*len(self.result_params.intersection_columns_trgt_src) > self.max_group_by_size):
                logger.info(f"[{self.comp_id}] The GROUP_BY_COLUMN {object_group_by_column} does not satisfy the necessary criteria.")
                logger.info(f"[{self.comp_id}] The size of the expected result of the group-by-query exceeds the max_group_by_size {self.max_group_by_size} on src or trgt.")
                continue

            logger.info(f"[{self.comp_id}] USING Column {object_group_by_column} for group by aggregation")
            return object_group_by_column

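    # Compare the row counts of the source and target object, applying the configured filters.
    # row_counts_equal is set to None if either count query reported an error.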
    def row_count_comparison(self):
        logger.info(f"[{self.comp_id}] START Row-Count-Comparison")
        # row count comparison
        self.result_params.src_row_count, self.result_params.error_list_rows_src = self.db_service_src.get_row_count_from_object(self.src_object, self.src_filter)
        self.result_params.trgt_row_count, self.result_params.error_list_rows_trgt = self.db_service_trgt.get_row_count_from_object(self.trgt_object, self.trgt_filter)
        self.result_params.src_row_count = int(self.result_params.src_row_count)
        self.result_params.trgt_row_count = int(self.result_params.trgt_row_count)
        self.result_params.src_row_count_minus_trgt_row_count = self.result_params.src_row_count - self.result_params.trgt_row_count

        self.result_params.row_counts_equal = True
        if self.result_params.error_list_rows_src or self.result_params.error_list_rows_trgt:
            self.result_params.row_counts_equal = None
        elif self.result_params.src_row_count_minus_trgt_row_count != 0:
            self.result_params.row_counts_equal = False

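    # Compare column names case-insensitively and store the set differences as well as the
    # column intersection, which the subsequent comparison steps operate on.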
89
+ def column_names_comparison(self):
90
+ logger.info(f"[{self.comp_id}] START Column-Names-Comparison")
91
+ src_columns = self.db_service_src.get_columns_from_object(self.src_object)
92
+ trgt_columns = self.db_service_trgt.get_columns_from_object(self.trgt_object)
93
+ src_columns.sort()
94
+ trgt_columns.sort()
95
+
96
+ src_columns_upper=[src_column.upper() for src_column in src_columns]
97
+ trgt_columns_upper=[trgt_column.upper() for trgt_column in trgt_columns]
98
+
99
+ src_columns_minus_trgt_columns = list(set(src_columns_upper) - set(trgt_columns_upper))
100
+ trgt_columns_minus_src_columns = list(set(trgt_columns_upper) - set(src_columns_upper))
101
+ src_columns_minus_trgt_columns.sort()
102
+ trgt_columns_minus_src_columns.sort()
103
+
104
+ columns_equal = True
105
+ if src_columns_minus_trgt_columns:
106
+ columns_equal = False
107
+
108
+ if trgt_columns_minus_src_columns:
109
+ columns_equal = False
110
+
111
+ intersection_columns_trgt_src = list(set(src_columns_upper) & set(trgt_columns_upper))
112
+ intersection_columns_trgt_src.sort()
113
+
114
+ all_columns_trgt_src = list(set(src_columns_upper) | set(trgt_columns_upper))
115
+ all_columns_trgt_src.sort()
116
+
117
+ #save results
118
+ self.result_params.src_columns = src_columns
119
+ self.result_params.trgt_columns = trgt_columns
120
+ self.result_params.src_columns_upper = src_columns_upper
121
+ self.result_params.trgt_columns_upper = trgt_columns_upper
122
+ self.result_params.src_columns_minus_trgt_columns = src_columns_minus_trgt_columns
123
+ self.result_params.trgt_columns_minus_src_columns = trgt_columns_minus_src_columns
124
+ self.result_params.columns_equal = columns_equal
125
+ self.result_params.intersection_columns_trgt_src = intersection_columns_trgt_src
126
+ self.result_params.all_columns_trgt_src = all_columns_trgt_src
127
+
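    # Compare per-column checksums produced by each database service; differences are
    # reported per column as target minus source.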
    def aggregation_comparison(self):
        logger.info(f"[{self.comp_id}] START Aggregation-Comparison")
        src_column_datatypes = self.db_service_src.get_data_types_from_object(self.src_object, self.result_params.src_columns)
        src_columns_aggregate = self.db_service_src.create_checksums(self.src_object, self.result_params.src_columns, self.src_filter, self.exclude_columns, self.numeric_scale)

        trgt_column_datatypes = self.db_service_trgt.get_data_types_from_object(self.trgt_object, self.result_params.trgt_columns)
        trgt_columns_aggregate = self.db_service_trgt.create_checksums(self.trgt_object, self.result_params.trgt_columns, self.trgt_filter, self.exclude_columns, self.numeric_scale)

        src_aggregations_error = src_columns_aggregate['TESTATM_ERRORS']
        trgt_aggregations_error = trgt_columns_aggregate['TESTATM_ERRORS']

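        # Capture the query text and error message of a failed row-count or checksum query
        # (if any) per system so they can be reported with the comparison result.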
        if self.result_params.error_list_rows_src != []:
            src_error_dict = {
                'QUERY': self.result_params.error_list_rows_src[1]
                , 'ERROR': self.result_params.error_list_rows_src[0]
            }
        elif src_aggregations_error != []:
            src_error_dict = {
                'QUERY': src_aggregations_error[0][1]
                , 'ERROR': src_aggregations_error[0][2]
            }
        else:
            src_error_dict = {'QUERY': None, 'ERROR': None}

        if self.result_params.error_list_rows_trgt != []:
            trgt_error_dict = {
                'QUERY': self.result_params.error_list_rows_trgt[1]
                , 'ERROR': self.result_params.error_list_rows_trgt[0]
            }
        elif trgt_aggregations_error != []:
            trgt_error_dict = {
                'QUERY': trgt_aggregations_error[0][1]
                , 'ERROR': trgt_aggregations_error[0][2]
            }
        else:
            trgt_error_dict = {'QUERY': None, 'ERROR': None}

        del src_columns_aggregate['TESTATM_ERRORS']
        del trgt_columns_aggregate['TESTATM_ERRORS']

        if self.result_params.src_row_count != 0 and self.result_params.trgt_row_count != 0:
            aggregation_differences_trgt_minus_src_not_boolean = {
                k: round(Decimal(trgt_columns_aggregate[k][1])
                         - Decimal(src_columns_aggregate[k][1]), self.numeric_scale)
                for k in src_columns_aggregate.keys()
                if k in trgt_columns_aggregate
                and str(src_columns_aggregate[k][1]) != str(trgt_columns_aggregate[k][1])
                and src_columns_aggregate[k][1] != trgt_columns_aggregate[k][1]
                and src_columns_aggregate[k][0].upper() != 'AGGREGATEBOOLEAN'
                and trgt_columns_aggregate[k][0].upper() != 'AGGREGATEBOOLEAN'
            }
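            # Boolean checksums (aggregation type 'AggregateBoolean') appear to be encoded as
            # underscore-separated count pairs; their differences are computed per part.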
            aggregation_differences_trgt_minus_src_boolean = {
                k: str(
                    int(trgt_columns_aggregate[k][1].split('_',1)[0])
                    - int(src_columns_aggregate[k][1].split('_',1)[0])
                )
                + '_'
                + str(
                    int(trgt_columns_aggregate[k][1].split('_',1)[1])
                    - int(src_columns_aggregate[k][1].split('_',1)[1])
                )
                for k in src_columns_aggregate.keys()
                if k in trgt_columns_aggregate
                and str(src_columns_aggregate[k][1]) != str(trgt_columns_aggregate[k][1])
                and src_columns_aggregate[k][1] != trgt_columns_aggregate[k][1]
                and src_columns_aggregate[k][0].upper() == 'AGGREGATEBOOLEAN'
                and trgt_columns_aggregate[k][0].upper() == 'AGGREGATEBOOLEAN'
            }
            aggregation_differences_trgt_minus_src = aggregation_differences_trgt_minus_src_not_boolean
            aggregation_differences_trgt_minus_src.update(aggregation_differences_trgt_minus_src_boolean)
        elif self.result_params.src_row_count != 0 and self.result_params.trgt_row_count == 0:
            aggregation_differences_trgt_minus_src_not_boolean = {
                k: -src_columns_aggregate[k][1]
                for k in src_columns_aggregate.keys()
                if k in trgt_columns_aggregate
                and str(src_columns_aggregate[k][1]) != str(trgt_columns_aggregate[k][1])
                and src_columns_aggregate[k][1] != trgt_columns_aggregate[k][1]
                and src_columns_aggregate[k][0].upper() != 'AGGREGATEBOOLEAN'
                and trgt_columns_aggregate[k][0].upper() != 'AGGREGATEBOOLEAN'
            }
            aggregation_differences_trgt_minus_src_boolean = {
                k: str(
                    - int(src_columns_aggregate[k][1].split('_',1)[0])
                )
                + '_'
                + str(
                    - int(src_columns_aggregate[k][1].split('_',1)[1])
                )
                for k in src_columns_aggregate.keys()
                if k in trgt_columns_aggregate
                and str(src_columns_aggregate[k][1]) != str(trgt_columns_aggregate[k][1])
                and src_columns_aggregate[k][1] != trgt_columns_aggregate[k][1]
                and src_columns_aggregate[k][0].upper() == 'AGGREGATEBOOLEAN'
                and trgt_columns_aggregate[k][0].upper() == 'AGGREGATEBOOLEAN'
            }
            aggregation_differences_trgt_minus_src = aggregation_differences_trgt_minus_src_not_boolean
            aggregation_differences_trgt_minus_src.update(aggregation_differences_trgt_minus_src_boolean)
        elif self.result_params.src_row_count == 0 and self.result_params.trgt_row_count != 0:
            aggregation_differences_trgt_minus_src = {
                k: trgt_columns_aggregate[k][1]
                for k in src_columns_aggregate.keys()
                if k in trgt_columns_aggregate
                and str(src_columns_aggregate[k][1]) != str(trgt_columns_aggregate[k][1])
            }

        else:
            aggregation_differences_trgt_minus_src = {}

        aggregations_equal = True
        if src_aggregations_error or trgt_aggregations_error:
            aggregations_equal = None
        else:
            for aggregation_diff in aggregation_differences_trgt_minus_src.values():
                if aggregation_diff and not aggregation_diff == 0.0:
                    aggregations_equal = False
                    break

        # save results
        self.result_params.src_column_datatypes = src_column_datatypes
        self.result_params.src_columns_aggregate = src_columns_aggregate
        self.result_params.trgt_column_datatypes = trgt_column_datatypes
        self.result_params.trgt_columns_aggregate = trgt_columns_aggregate
        self.result_params.src_aggregations_error = src_aggregations_error
        self.result_params.trgt_aggregations_error = trgt_aggregations_error
        self.result_params.aggregation_differences_trgt_minus_src = aggregation_differences_trgt_minus_src
        self.result_params.src_error_dict = src_error_dict
        self.result_params.trgt_error_dict = trgt_error_dict
        self.result_params.aggregations_equal = aggregations_equal

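    # Compare group-by aggregations. The group-by columns are taken from the migration
    # config (per table or globally) or, if none are configured, derived from distinct-value
    # statistics via _get_group_by_column_by_validation.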
    def group_by_comparison(self):

        object_group_by_columns = []
        group_by_columns_src = []
        group_by_columns_trgt = []
        src_group_by_error = {}
        trgt_group_by_error = {}
        src_group_by_query_aggregation_string = ''
        src_group_by_query_columns_string = ''
        trgt_group_by_query_aggregation_string = ''
        trgt_group_by_query_columns_string = ''
        group_by_values_with_mismatches = {}
        group_by_query_where_filter = ''
        columns_with_mismatch = []
        group_by_diff_dict = {}
        src_group_by_query = ''
        trgt_group_by_query = ''

        src_pdf_from_group_by_sorted = None
        trgt_pdf_from_group_by_sorted = None
        diff_src_pdf_from_group_by_sorted = None
        diff_trgt_pdf_from_group_by_sorted = None
        pandas_df_from_group_by_is_equal = None

        if not "GROUP_BY_AGGREGATION" in self.migration_config:
            raise ValueError(f"The GROUP_BY_AGGREGATION key is missing in the migration_config.json. Please add the key and the parameters GROUP_BY_COLUMNS and GROUP_BY_COLUMNS_PER_TABLE to the config or disable the execute_group_by_comparison parameter.")
        elif self.use_group_by_columns:
            if not "GROUP_BY_COLUMNS" in self.migration_config["GROUP_BY_AGGREGATION"]:
                raise ValueError(f"The GROUP_BY_COLUMNS key is missing in the migration_config.json. Please add the key to the config under GROUP_BY_AGGREGATION or disable the use_group_by_columns parameter or the execute_group_by_comparison parameter.")
            if not "GROUP_BY_COLUMNS_PER_TABLE" in self.migration_config["GROUP_BY_AGGREGATION"]:
                raise ValueError(f"The GROUP_BY_COLUMNS_PER_TABLE key is missing in the migration_config.json. Please add the key to the config under GROUP_BY_AGGREGATION or disable the use_group_by_columns parameter or the execute_group_by_comparison parameter.")

        # group-by only if tables not empty
        if self.result_params.src_row_count == 0:
            logger.info(f"[{self.comp_id}] Source table {self.src_object.database}.{self.src_object.schema}.{self.src_object.name} is empty, Group-By-Comparison will be skipped")
        elif self.result_params.trgt_row_count == 0:
            logger.info(f"[{self.comp_id}] Target table {self.trgt_object.database}.{self.trgt_object.schema}.{self.trgt_object.name} is empty, Group-By-Comparison will be skipped")

        # group-by option 1 - group_by_columns defined as multiple lists for specific tables
        elif self.use_group_by_columns and f"{self.src_object.database}.{self.src_object.schema}.{self.src_object.name}" in self.migration_config["GROUP_BY_AGGREGATION"]["GROUP_BY_COLUMNS_PER_TABLE"].keys():
            logger.info(f"[{self.comp_id}] START Group-By-Comparison - with option 1 (group_by_columns defined for specific object)")
            group_by_configuration_current_object = self.migration_config["GROUP_BY_AGGREGATION"]["GROUP_BY_COLUMNS_PER_TABLE"][f"{self.src_object.database}.{self.src_object.schema}.{self.src_object.name}"]
            object_group_by_columns = group_by_configuration_current_object["GROUP_BY_COLUMNS"]
            object_group_by_aggregation_columns = group_by_configuration_current_object["GROUP_BY_AGGREGATION_COLUMNS"]
            object_group_by_aggregation_type = group_by_configuration_current_object["GROUP_BY_AGGREGATION_TYPE"]

        # group-by option 2 - group_by_columns defined as one list for all tables
        elif self.use_group_by_columns and self.migration_config["GROUP_BY_AGGREGATION"]["GROUP_BY_COLUMNS"]:
            logger.info(f"[{self.comp_id}] START Group-By-Comparison - with option 2 (group_by_columns defined as a list for all objects)")
            global_group_by_columns = self.migration_config["GROUP_BY_AGGREGATION"]["GROUP_BY_COLUMNS"]
            global_group_by_columns_in_object = [group_by_column for group_by_column in global_group_by_columns if group_by_column in self.result_params.intersection_columns_trgt_src]
            if global_group_by_columns_in_object:
                object_group_by_columns = [global_group_by_columns_in_object]
                object_group_by_aggregation_columns = ["all"]
                object_group_by_aggregation_type = 'various'

        # group-by option 3 - group_by_columns NOT defined as a list
        elif (not self.use_group_by_columns or not object_group_by_columns):
            logger.info(f"[{self.comp_id}] START Group-By-Comparison - with option 3 (group_by_columns NOT defined -> retrieve group_by_columns by defined criteria)")
            src_column_count_distincts, error_list = self.db_service_src.get_count_distincts_from_object(self.src_object, self.result_params.src_columns)
            trgt_column_count_distincts, error_list = self.db_service_trgt.get_count_distincts_from_object(self.trgt_object, self.result_params.trgt_columns)
            if src_column_count_distincts and trgt_column_count_distincts:
                object_group_by_column = self._get_group_by_column_by_validation(self.result_params.intersection_columns_trgt_src, src_column_count_distincts, trgt_column_count_distincts)
                if object_group_by_column:
                    object_group_by_columns = [object_group_by_column]
                    object_group_by_aggregation_columns = ["all"]
                    object_group_by_aggregation_type = 'various'

        if not object_group_by_columns:
            logger.info(f"[{self.comp_id}] No Group-By-Columns found")
        else:
            logger.info(f"[{self.comp_id}] USING Column(s) {str(object_group_by_columns)} for Group-By-Comparison")
            src_pdf_from_group_by, src_group_by_query_aggregation_string, src_group_by_query_columns_string, group_by_columns_src, src_group_by_error = self.db_service_src.create_pandas_df_from_group_by(self.src_object, self.result_params.intersection_columns_trgt_src, object_group_by_columns, object_group_by_aggregation_columns, object_group_by_aggregation_type, False, self.src_filter, self.exclude_columns, self.numeric_scale)
            trgt_pdf_from_group_by, trgt_group_by_query_aggregation_string, trgt_group_by_query_columns_string, group_by_columns_trgt, trgt_group_by_error = self.db_service_trgt.create_pandas_df_from_group_by(self.trgt_object, self.result_params.intersection_columns_trgt_src, object_group_by_columns, object_group_by_aggregation_columns, object_group_by_aggregation_type, False, self.trgt_filter, self.exclude_columns, self.numeric_scale)

            # check if Group-By-Aggregation was actually performed
            if src_group_by_error == {} and trgt_group_by_error == {}:
                diff_src_pdf_from_group_by_sorted, diff_trgt_pdf_from_group_by_sorted, src_pdf_from_group_by_sorted, trgt_pdf_from_group_by_sorted = get_diff_dataframes(src_pdf_from_group_by, trgt_pdf_from_group_by, group_by_columns_src, group_by_columns_trgt)
                if not diff_src_pdf_from_group_by_sorted.empty:
                    logger.debug(f"[{self.comp_id}] diff_src_pdf_from_group_by_sorted:\n {diff_src_pdf_from_group_by_sorted}")
                    logger.debug(f"[{self.comp_id}] diff_trgt_pdf_from_group_by_sorted:\n {diff_trgt_pdf_from_group_by_sorted}")

                    for object_group_by_column in object_group_by_columns:
                        # creating Group-By-Values with mismatches
                        if object_group_by_column in diff_src_pdf_from_group_by_sorted and object_group_by_column in diff_trgt_pdf_from_group_by_sorted:
                            group_by_values_with_mismatches[object_group_by_column] = list(set(diff_src_pdf_from_group_by_sorted[object_group_by_column].tolist()).union(set(diff_trgt_pdf_from_group_by_sorted[object_group_by_column].tolist())))
                        elif object_group_by_column in diff_src_pdf_from_group_by_sorted:
                            group_by_values_with_mismatches[object_group_by_column] = diff_src_pdf_from_group_by_sorted[object_group_by_column].tolist()
                        elif object_group_by_column in diff_trgt_pdf_from_group_by_sorted:
                            group_by_values_with_mismatches[object_group_by_column] = diff_trgt_pdf_from_group_by_sorted[object_group_by_column].tolist()
                        else:
                            continue

                        if len(group_by_values_with_mismatches) > self.max_group_by_values_with_mismatches:
                            group_by_values_with_mismatches = [f"Warning: There are more than {self.max_group_by_values_with_mismatches} entries."]
                        else:
                            # creating Group-By-Query where filter
                            group_by_values_with_mismatches_string = ', '.join(f"'{c}'" for c in group_by_values_with_mismatches[object_group_by_column])
                            group_by_query_where_filter += f" AND {object_group_by_column} IN ({group_by_values_with_mismatches_string})"

                    # creating list of columns with mismatches
                    for column in diff_src_pdf_from_group_by_sorted.columns:
                        if column in diff_trgt_pdf_from_group_by_sorted.columns and column not in object_group_by_columns:
                            if (diff_src_pdf_from_group_by_sorted[column].equals(diff_trgt_pdf_from_group_by_sorted[column])):
                                continue
                            try:
                                pd.testing.assert_series_equal(diff_src_pdf_from_group_by_sorted[column], diff_trgt_pdf_from_group_by_sorted[column], check_dtype=False, check_index_type=False, check_series_type=False, check_names=False)
                                continue
                            except Exception:
                                columns_with_mismatch.append(column)
                        elif column not in diff_trgt_pdf_from_group_by_sorted.columns and column not in object_group_by_columns:
                            columns_with_mismatch.append(column)
                    for column in diff_trgt_pdf_from_group_by_sorted.columns:
                        if column not in diff_src_pdf_from_group_by_sorted.columns and column not in object_group_by_columns:
                            columns_with_mismatch.append(column)

        if group_by_columns_src and group_by_columns_trgt and diff_src_pdf_from_group_by_sorted is not None and diff_trgt_pdf_from_group_by_sorted is not None:
            group_by_diff_dict = get_diff_dict_from_diff_dataframes(diff_src_pdf_from_group_by_sorted, diff_trgt_pdf_from_group_by_sorted, group_by_columns_src, group_by_columns_trgt, group_by_values_with_mismatches, self.numeric_scale)

        # creating final Group-By-Queries TODO write as function
        if src_group_by_query_columns_string and src_group_by_query_aggregation_string and group_by_query_where_filter and not len(group_by_values_with_mismatches) > self.max_group_by_values_with_mismatches:
            src_group_by_query = f"SELECT {src_group_by_query_columns_string}, COUNT(*) AS COUNT_OF_GROUP_BY_VALUE, {src_group_by_query_aggregation_string} FROM {self.src_object.database}.{self.src_object.schema}.{self.src_object.name} WHERE 1=1 {group_by_query_where_filter} GROUP BY {src_group_by_query_columns_string};"

        if trgt_group_by_query_columns_string and trgt_group_by_query_aggregation_string and group_by_query_where_filter and not len(group_by_values_with_mismatches) > self.max_group_by_values_with_mismatches:
            trgt_group_by_query = f"SELECT {trgt_group_by_query_columns_string}, COUNT(*) AS COUNT_OF_GROUP_BY_VALUE, {trgt_group_by_query_aggregation_string} FROM {self.trgt_object.database}.{self.trgt_object.schema}.{self.trgt_object.name} WHERE 1=1 {group_by_query_where_filter} GROUP BY {trgt_group_by_query_columns_string};"

        # additional evaluation of the pandas_df_from_group_by TODO check if this is really necessary and write as a function
        if src_pdf_from_group_by_sorted is not None and trgt_pdf_from_group_by_sorted is not None:
            try:
                pandas_df_from_group_by_is_equal = src_pdf_from_group_by_sorted.equals(trgt_pdf_from_group_by_sorted)
            except:
                pandas_df_from_group_by_is_equal = False

        ## RE-EVALUATE
        if src_group_by_error == {} and trgt_group_by_error == {} and src_pdf_from_group_by_sorted is not None and trgt_pdf_from_group_by_sorted is not None:

            eq_frame = src_pdf_from_group_by_sorted.eq(trgt_pdf_from_group_by_sorted)
            if not pandas_df_from_group_by_is_equal:
                all_equal_columns = eq_frame.all()
                if all_equal_columns.all():
                    pandas_df_from_group_by_is_equal = True
                else:
                    pandas_df_from_group_by_is_equal = False

            src_number_of_rows = len(src_pdf_from_group_by_sorted.index)
            trgt_number_of_rows = len(trgt_pdf_from_group_by_sorted.index)
            logger.info(f"[{self.comp_id}] ROWS src_pdf_from_group_by_sorted: {str(src_number_of_rows)}")
            logger.info(f"[{self.comp_id}] ROWS trgt_pdf_from_group_by_sorted: {str(trgt_number_of_rows)}")
            diff_rows = abs(trgt_number_of_rows - src_number_of_rows)
            logger.info(f"[{self.comp_id}] ROW DIFF: {str(diff_rows)}")

            src_number_of_columns = len(src_pdf_from_group_by_sorted.columns)
            trgt_number_of_columns = len(trgt_pdf_from_group_by_sorted.columns)
            logger.info(f"[{self.comp_id}] COLUMNS src_pdf_from_group_by_sorted: {str(src_number_of_columns)}")
            logger.info(f"[{self.comp_id}] COLUMNS trgt_pdf_from_group_by_sorted: {str(trgt_number_of_columns)}")

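            # If both frames share the same columns, an outer merge with indicator=True flags
            # rows that exist only in one frame ('left_only'); empty deltas on both sides mean
            # the grouped results match.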
            if set(src_pdf_from_group_by_sorted.columns.values) == set(trgt_pdf_from_group_by_sorted.columns.values):
                src_delta_pdf_pre = src_pdf_from_group_by_sorted.merge(trgt_pdf_from_group_by_sorted, indicator=True, how='outer').query('_merge not in ("both", "right_only")')

                trgt_delta_pdf_pre = trgt_pdf_from_group_by_sorted.merge(src_pdf_from_group_by_sorted, indicator=True, how='outer').query('_merge not in ("both", "right_only")')

                ## RE-EVALUATE
                eq_frame = src_pdf_from_group_by_sorted.eq(trgt_pdf_from_group_by_sorted)
                if not pandas_df_from_group_by_is_equal:
                    if src_delta_pdf_pre.empty and trgt_delta_pdf_pre.empty:
                        pandas_df_from_group_by_is_equal = True
                    else:
                        pandas_df_from_group_by_is_equal = False

        #### save self.result_params data
        self.result_params.src_group_by_query = src_group_by_query
        self.result_params.trgt_group_by_query = trgt_group_by_query
        self.result_params.src_group_by_error = src_group_by_error
        self.result_params.trgt_group_by_error = trgt_group_by_error
        self.result_params.object_group_by_columns = object_group_by_columns
        self.result_params.group_by_equal = pandas_df_from_group_by_is_equal
        self.result_params.group_by_values_with_mismatches = group_by_values_with_mismatches
        self.result_params.columns_with_mismatch = columns_with_mismatch
        self.result_params.group_by_diff_dict = group_by_diff_dict

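    # Full dataframe comparison: only executed when the object sizes and row counts stay
    # within the MAX_OBJECT_SIZE and MAX_ROW_NUMBER input parameters.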
    def pandas_dataframe_comparison(self):

        if self.max_object_size > -1:

            if self.src_object.type == 'view':
                src_tbl_size = -1
            else:
                src_tbl_size = self.db_service_src.get_table_size(self.src_object)

            if self.trgt_object.type == 'view':
                trgt_tbl_size = -1
            else:
                trgt_tbl_size = self.db_service_trgt.get_table_size(self.trgt_object)
        else:
            src_tbl_size = None
            trgt_tbl_size = None

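        # Skip the comparison if a table size is unknown or zero, or if the size or
        # row-count thresholds are exceeded.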
        if (
            src_tbl_size is None
            or trgt_tbl_size is None
            or src_tbl_size == 0
            or trgt_tbl_size == 0
            or src_tbl_size > self.max_object_size
            or trgt_tbl_size > self.max_object_size
            or self.result_params.src_row_count > self.max_row_number
            or self.result_params.trgt_row_count > self.max_row_number
        ):
            pandas_df_compared = False
            pandas_df_is_equal = None
            pandas_df_mismatch = f"Pandas Dataframes not compared!"
            if src_tbl_size == 0:
                logger.info(f"[{self.comp_id}] Pandas Dataframes not compared -> Source table empty")
            elif trgt_tbl_size == 0:
                logger.info(f"[{self.comp_id}] Pandas Dataframes not compared -> Target table empty")
            else:
                logger.info(f"[{self.comp_id}] Pandas Dataframes not compared -> restricted by input parameters MAX_OBJECT_SIZE and MAX_ROW_NUMBER")
        else:
            logger.info(f"[{self.comp_id}] START Pandas-Dataframe-Comparison")
            src_pdf = self.db_service_src.create_pandas_df(self.src_object, self.result_params.intersection_columns_trgt_src, self.src_filter, self.exclude_columns)
            trgt_pdf = self.db_service_trgt.create_pandas_df(self.trgt_object, self.result_params.intersection_columns_trgt_src, self.trgt_filter, self.exclude_columns)

            # sorting the dataframes using the intersecting columns minus excluded columns
            src_pdf_sorted = src_pdf.sort_values(by=list(set(self.result_params.intersection_columns_trgt_src) - set(self.exclude_columns))).reset_index(drop=True)
            trgt_pdf_sorted = trgt_pdf.sort_values(by=list(set(self.result_params.intersection_columns_trgt_src) - set(self.exclude_columns))).reset_index(drop=True)

            pandas_df_compared = True
            pandas_df_is_equal = True
            pandas_df_mismatch = ""

            try:
                assert_frame_equal(src_pdf_sorted, trgt_pdf_sorted, check_dtype=False, check_names=False, check_index_type=False, check_column_type=False, check_exact=False)
            except Exception as err:
                pandas_df_is_equal = False
                pandas_df_mismatch = err

        self.result_params.pandas_df_compared = pandas_df_compared
        self.result_params.pandas_df_is_equal = pandas_df_is_equal
        self.result_params.pandas_df_mismatch = pandas_df_mismatch
        self.result_params.src_tbl_size = src_tbl_size
        self.result_params.trgt_tbl_size = trgt_tbl_size

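    # Compare a sample of rows selected via the key columns configured under SAMPLE_KEYS
    # in the migration config; values are compared as strings after NaN replacement.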
    def sample_comparison(self):
        logger.info(f"[{self.comp_id}] START Sample-Comparison")
        samples_compared = False
        trgt_key_filters = {}
        trgt_used_columns = []
        src_sample_query = None
        trgt_sample_query = None

        src_sample_pdf = [pd.DataFrame(), {}]
        trgt_sample_pdf = [pd.DataFrame(), {}]
        samples_equal = None
        src_sample_dict = {}
        trgt_sample_dict = {}
        src_sample_error_dict = {}
        trgt_sample_error_dict = {}
        if "SAMPLE_KEYS" in self.migration_config.keys():
            sample_comparison_config = self.migration_config["SAMPLE_KEYS"]
            if f"{self.src_object.database}.{self.src_object.schema}.{self.src_object.name}" in sample_comparison_config.keys():
                logger.info(f"[{self.comp_id}] START Sample-Check for: {self.src_object.database}.{self.src_object.schema}.{self.src_object.name}")
                samples_compared = True
                key_columns = sample_comparison_config[f"{self.src_object.database}.{self.src_object.schema}.{self.src_object.name}"]
                trgt_sample_pdf, trgt_key_filters, trgt_used_columns, trgt_sample_query = self.db_service_trgt.create_pandas_df_from_sample(
                    object=self.trgt_object,
                    column_intersections=self.result_params.intersection_columns_trgt_src,
                    key_columns=key_columns,
                    where_clause=self.trgt_filter,
                    exclude_columns=self.exclude_columns
                )
                src_sample_pdf, src_key_filters, src_used_columns, src_sample_query = self.db_service_src.create_pandas_df_from_sample(
                    object=self.src_object,
                    column_intersections=self.result_params.intersection_columns_trgt_src,
                    key_columns=key_columns,
                    where_clause=self.src_filter,
                    exclude_columns=self.exclude_columns,
                    key_filters=trgt_key_filters,
                    dedicated_columns=trgt_used_columns
                )
                ## Handle Datetime Datatypes -> transform into readable string
                for key in trgt_key_filters:
                    if any(isinstance(x, (datetime.date, datetime.datetime)) for x in trgt_key_filters[key]):
                        new_value = []
                        for element in trgt_key_filters[key]:
                            new_value.append(str(element))
                        trgt_key_filters[key] = new_value

                # TODO Apply the rounding only at this point - first return unrounded values from create_pandas_df_from_sample and then round them here in a separate function
                if trgt_key_filters:
                    logger.info(f"[{self.comp_id}] Sample-Check Keys: {trgt_key_filters}")
                else:
                    logger.info(f"[{self.comp_id}] Sample-Check Keys not found in column intersection or excluded in ADDITIONAL_CONFIGURATION.")

        src_sample_error_dict = src_sample_pdf[1]
        trgt_sample_error_dict = trgt_sample_pdf[1]
        if samples_compared and src_sample_error_dict == {} and trgt_sample_error_dict == {}:
            # sorting the dataframes using the intersecting columns
            src_sample_pdf_sorted = src_sample_pdf[0] #.sort_values(by=intersection_columns_trgt_src).reset_index(drop=True)
            src_sample_pdf_sorted = src_sample_pdf_sorted.replace(np.nan, None)
            src_sample_pdf_sorted = src_sample_pdf_sorted.astype(str)

            trgt_sample_pdf_sorted = trgt_sample_pdf[0] #.sort_values(by=intersection_columns_trgt_src).reset_index(drop=True)
            trgt_sample_pdf_sorted = trgt_sample_pdf_sorted.replace(np.nan, None)
            trgt_sample_pdf_sorted = trgt_sample_pdf_sorted.astype(str)

            src_sample_dict = src_sample_pdf_sorted.to_dict()
            trgt_sample_dict = trgt_sample_pdf_sorted.to_dict()

            try:
                pd.testing.assert_frame_equal(src_sample_pdf_sorted, trgt_sample_pdf_sorted, check_dtype=False, check_names=False, check_index_type=False, check_column_type=False, check_exact=False)
                samples_equal = True
            except:
                samples_equal = False

        # save results
        self.result_params.src_sample_query = src_sample_query
        self.result_params.trgt_sample_query = trgt_sample_query
        self.result_params.src_sample_dict = src_sample_dict
        self.result_params.trgt_sample_dict = trgt_sample_dict
        self.result_params.samples_equal = samples_equal
        self.result_params.src_sample_error_dict = src_sample_error_dict
        self.result_params.trgt_sample_error_dict = trgt_sample_error_dict
        self.result_params.samples_compared = samples_compared
        self.result_params.trgt_key_filters = trgt_key_filters
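
For orientation, a minimal usage sketch of the class above: the constructor signature, the method names, and the result_params attributes come from this file, while the DatabaseObject instances, the connected database services, and the inherited TestingToolParams settings (migration_config, numeric_scale, size limits, and so on) are set up elsewhere in the package, so they appear only as placeholders here. The call order matters because the later steps rely on results stored by the earlier ones (row counts, column intersection).

# Illustrative sketch only - not part of the package. src_object, trgt_object,
# db_service_src, db_service_trgt and the filter values are placeholders whose
# construction is handled by the rest of the tool and is not shown in this file.
from icsDataValidation.services.comparison_service import ComparisonService

comparison = ComparisonService(
    src_object=src_object,            # DatabaseObject describing the source table/view
    trgt_object=trgt_object,          # DatabaseObject describing the target table/view
    db_service_src=db_service_src,    # connected database service for the source system
    db_service_trgt=db_service_trgt,  # connected database service for the target system
    src_filter=src_filter,            # filter applied to the source-side queries
    trgt_filter=trgt_filter,          # filter applied to the target-side queries
    exclude_columns=[],               # columns to leave out of checksums and dataframe compares
    comp_id=1,                        # identifier used to tag the log output
)

comparison.row_count_comparison()
comparison.column_names_comparison()
comparison.aggregation_comparison()
comparison.group_by_comparison()
comparison.pandas_dataframe_comparison()
comparison.sample_comparison()

results = comparison.result_params
print(results.row_counts_equal, results.columns_equal, results.aggregations_equal,
      results.group_by_equal, results.pandas_df_is_equal, results.samples_equal)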