icsDataValidation 1.0.358__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. icsDataValidation/configuration.py +19 -0
  2. icsDataValidation/connection_setups/__init__.py +0 -0
  3. icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
  4. icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
  5. icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
  6. icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
  7. icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
  8. icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
  9. icsDataValidation/core/__init__.py +0 -0
  10. icsDataValidation/core/database_objects.py +18 -0
  11. icsDataValidation/core/object_comparison.py +239 -0
  12. icsDataValidation/input_parameters/__init__.py +0 -0
  13. icsDataValidation/input_parameters/testing_tool_params.py +81 -0
  14. icsDataValidation/main.py +250 -0
  15. icsDataValidation/output_parameters/__init__.py +0 -0
  16. icsDataValidation/output_parameters/result_params.py +94 -0
  17. icsDataValidation/services/__init__.py +0 -0
  18. icsDataValidation/services/comparison_service.py +582 -0
  19. icsDataValidation/services/database_services/__init__.py +0 -0
  20. icsDataValidation/services/database_services/azure_service.py +320 -0
  21. icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
  22. icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
  23. icsDataValidation/services/database_services/exasol_service.py +261 -0
  24. icsDataValidation/services/database_services/oracle_service.py +713 -0
  25. icsDataValidation/services/database_services/snowflake_service.py +1100 -0
  26. icsDataValidation/services/database_services/teradata_service.py +665 -0
  27. icsDataValidation/services/initialization_service.py +103 -0
  28. icsDataValidation/services/result_service.py +573 -0
  29. icsDataValidation/services/system_service.py +61 -0
  30. icsDataValidation/services/testset_service.py +257 -0
  31. icsDataValidation/utils/__init__.py +0 -0
  32. icsDataValidation/utils/file_util.py +96 -0
  33. icsDataValidation/utils/logger_util.py +96 -0
  34. icsDataValidation/utils/pandas_util.py +159 -0
  35. icsDataValidation/utils/parallelization_util.py +52 -0
  36. icsDataValidation/utils/sql_util.py +14 -0
  37. icsDataValidation-1.0.358.dist-info/METADATA +21 -0
  38. icsDataValidation-1.0.358.dist-info/RECORD +40 -0
  39. icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
  40. icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
@@ -0,0 +1,103 @@
1
+ import logging
2
+ import os
3
+
4
+ from pathlib import PurePath
5
+
6
+ from icsDataValidation.utils.logger_util import configure_dev_ops_logger
7
+ from icsDataValidation.input_parameters.testing_tool_params import TestingToolParams
8
+
9
+ #########################################################################################
10
+ # Configure Dev Ops Logger
11
+
12
+ logger = logging.getLogger('InitializationService')
13
+ logger.setLevel(logging.INFO)
14
+ configure_dev_ops_logger(logger)
15
+
16
+ #########################################################################################
17
+ #########################################################################################
18
+
19
class InitializationService:
    """
    Initialize the icsDataValidation Tool with the input parameters.

    Processes the TestingToolParams and extends them with additional derived
    parameters (testset lists, result table identifiers, file paths).
    """

    def __init__(self, testing_tool_params: TestingToolParams, current_working_dir: str, start_time_utc: str):
        self.testing_tool_params = testing_tool_params
        self.current_working_dir = current_working_dir
        self.start_time_utc = start_time_utc

    def create_list_of_testset_file_names(self):
        """Split the comma-separated testset file names into a list of stripped names."""
        raw_names = self.testing_tool_params.testset_file_names
        # Sentinel values mean "not configured" and yield an empty list.
        if raw_names and raw_names not in ('testset_file_names env variable not found', 'null'):
            self.testing_tool_params.testset_file_names = [name.strip() for name in raw_names.split(',')]
        else:
            self.testing_tool_params.testset_file_names = []

    def create_result_table_identifiers(self):
        """Build fully qualified result table identifiers (database.schema.table)."""
        params = self.testing_tool_params
        meta_prefix = f"{params.result_database_name}.{params.result_meta_data_schema_name}"
        data_prefix = f"{params.result_database_name}.{params.result_schema_name}"
        params.result_table = f"{meta_prefix}.{params.result_table_name}"
        params.result_table_highlevel = f"{data_prefix}.{params.result_table_highlevel_name}"
        params.result_table_objectlevel = f"{data_prefix}.{params.result_table_objectlevel_name}"
        params.result_table_columnlevel = f"{data_prefix}.{params.result_table_columnlevel_name}"

    def get_config_file_paths(self):
        """
        Build OS-independent config file paths rooted at the current working directory.

        Returns:
            tuple: (configuration file path, migration configuration file path).
        """
        params = self.testing_tool_params
        config_dir = PurePath(self.current_working_dir) / params.config_folder_name
        config_file_path = config_dir / params.configuration_file_name
        migration_config_file_path = config_dir / params.migration_configuration_file_name

        return config_file_path, migration_config_file_path

    def get_testset_file_paths(self):
        """Build OS-independent paths for every configured testset file."""
        testset_dir = PurePath(self.current_working_dir) / self.testing_tool_params.testset_folder_name

        return [testset_dir / file_name for file_name in self.testing_tool_params.testset_file_names]

    def create_result_file_paths(self):
        """Derive result folder path, result file name, result file path, and stage name."""
        params = self.testing_tool_params

        params.result_folder_path = PurePath(self.current_working_dir) / params.result_folder_name
        params.result_file_name = (
            f"Comparison_Result_{params.source_system_selection}_{params.target_system_selection}"
            f"_{params.database_name}_{self.start_time_utc}.json"
        )
        params.result_file_path = params.result_folder_path / params.result_file_name
        # Stage name is quoted because it embeds the run GUID.
        params.stage_name = f'{params.result_database_name}.{params.stage_schema}."{params.stage_name_prefix}_{params.run_guid}"'

    def create_live_result_file_path(self):
        """
        Derive the live-result folder path and live result table identifier.

        Creates the live result folder on disk if it does not exist.
        """
        params = self.testing_tool_params

        live_folder_name = (
            f"Live_Result_{params.source_system_selection}_{params.target_system_selection}"
            f"_{params.database_name}_{self.start_time_utc}"
        )
        params.live_result_folder_path = params.result_folder_path / live_folder_name

        if not os.path.exists(params.live_result_folder_path):
            os.makedirs(params.live_result_folder_path)

        params.result_live_table = f"{params.result_database_name}.{params.result_meta_data_schema_name}.{params.result_live_table_name}"

    def create_remaining_mapping_objects_file_path(self):
        """Build the OS-independent path for the remaining-mapping-objects JSON file."""
        params = self.testing_tool_params

        folder_path = PurePath(self.current_working_dir) / params.remaining_mapping_objects_folder_name
        file_name = f"Remaining_Mapping_Objects_{params.database_name}_{self.start_time_utc}.json"
        params.remaining_mapping_objects_file_path = folder_path / file_name
@@ -0,0 +1,573 @@
1
+ import logging
2
+ import boto3
3
+ import subprocess
4
+ import json
5
+
6
+ from azure.storage.blob import BlobServiceClient
7
+ from decimal import Decimal
8
+
9
+ from icsDataValidation.services.system_service import SystemService
10
+ from icsDataValidation.utils.logger_util import configure_dev_ops_logger
11
+ from icsDataValidation.input_parameters.testing_tool_params import TestingToolParams
12
+ from icsDataValidation.output_parameters.result_params import ResultParams
13
+ from icsDataValidation.core.database_objects import DatabaseObject
14
+ from icsDataValidation.utils.file_util import write_json_to_file, CustomJSONEncoder
15
+
16
+ #########################################################################################
17
+ # Configure Dev Ops Logger
18
+
19
+ logger = logging.getLogger('ResultService')
20
+ logger.setLevel(logging.INFO)
21
+ configure_dev_ops_logger(logger)
22
+
23
+ #########################################################################################
24
+ #########################################################################################
25
+
26
+
27
class ResultService(TestingToolParams):
    """
    Class to process comparison results and save the results in various formats.
    """

    def __init__(
        self, start_time_utc: str,
        remaining_mapping_objects: dict,
        object_identifiers_src_minus_trgt: list,
        object_identifiers_trgt_minus_src: list,
        object_identifiers_to_compare_src: list,
        object_identifiers_to_compare_trgt: list,
        objects_to_compare: list[dict],
        all_objects_matching: bool,
        object_level_comparison_results: list[dict]
    ):
        super().__init__()

        # Run-level result document. Key order is preserved when serialized
        # to JSON, so do not reorder entries. The ALL_*-flags start as None
        # and are filled in later by determine_highlevel_results().
        self.results = {
            "PIPELINE_NAME": TestingToolParams.pipeline_name,
            "PIPELINE_ID": TestingToolParams.pipeline_id,
            "START_TIME_UTC": start_time_utc,
            "SOURCE_SYSTEM": TestingToolParams.source_system_selection,
            "TARGET_SYSTEM": TestingToolParams.target_system_selection,
            "DATABASE_NAME": TestingToolParams.database_name,
            "TESTSET": TestingToolParams.testset_file_names,
            "SRC_MINUS_TRGT": object_identifiers_src_minus_trgt,
            "TRGT_MINUS_SRC": object_identifiers_trgt_minus_src,
            "OBJECTS_TO_COMPARE_SRC": object_identifiers_to_compare_src,
            "OBJECTS_TO_COMPARE_TRGT": object_identifiers_to_compare_trgt,
            "NUMBER_OF_OBJECTS_TO_COMPARE": len(objects_to_compare),
            "ALL_OBJECTS_MATCHING": all_objects_matching,
            "ALL_COLUMNS_EQUAL": None,
            "ALL_DATATYPES_EQUAL": None,
            "ALL_ROWCOUNTS_EQUAL": None,
            "ALL_CHECKSUMS_EQUAL": None,
            "ALL_SAMPLES_EQUAL" : None,
            "ALL_OBJECTS_EQUAL": None,
            #"ALL_OBJECTS_NOT_ALTERED_DURING_COMPARISON": True,
            "OBJECTS": object_level_comparison_results
        }

        self.remaining_mapping_objects = remaining_mapping_objects
        self.start_time_utc = start_time_utc

        # Dispatch table: result-system DATABASE_TYPE (upper-cased) -> loader.
        # None means loading results into that system type is not supported.
        self.load_results_function_mapping = {
            "SNOWFLAKE": self.load_results_to_snowflake,
            "EXASOL": None,
            "AZURE": None,
            "TERADATA": None,
            "ORACLE": None,
            "DATABRICKS_HIVE_METASTORE": self.load_results_to_databricks,
            "DATABRICKS_UNITY_CATALOG": self.load_results_to_databricks,
        }
81
+
82
+ @staticmethod
83
+ def _compare_column_datatypes(
84
+ src_datatype: str,
85
+ trgt_datatype: str
86
+ ):
87
+ """
88
+ Compare the data types of a source- and a target-column.
89
+ Uses data-type-mapping defined in the migration_config.json.
90
+ """
91
+ if not src_datatype or not trgt_datatype:
92
+ datatype_equal = None
93
+
94
+ if src_datatype.lower() == trgt_datatype.lower():
95
+ datatype_equal = True
96
+ elif "DATATYPE_MAPPING" in TestingToolParams.migration_config["MAPPING"] and TestingToolParams.migration_config["MAPPING"]["DATATYPE_MAPPING"]:
97
+ datatype_equal = False
98
+ for datatype_mapping in TestingToolParams.migration_config["MAPPING"]["DATATYPE_MAPPING"]:
99
+ if src_datatype in datatype_mapping["src_datatypes"] and trgt_datatype in datatype_mapping["trgt_datatypes"]:
100
+ datatype_equal = True
101
+ else:
102
+ datatype_equal = False
103
+
104
+ return datatype_equal
105
+
106
+
107
    @staticmethod
    def prepare_column_level_result(
        column: str,
        exclude_columns: list,
        result_params: ResultParams
    ) -> dict:
        """
        Get column level result dictionary from the result parameters.

        Compares one column (given by its upper-case name) between source and
        target: presence on both sides, data type, aggregation checksum, and
        NULL counts. A value of None in the returned dict means the
        corresponding check was not performed or not applicable.
        """
        # Defaults: None = not checked / not applicable.
        in_sync = False
        datatype_equal = None
        aggregation_type_src = None
        aggregation_type_trgt = None
        aggregation_type = None
        aggregation_result_src = None
        aggregation_result_trgt = None
        aggregation_equal = None
        aggregation_tolerated = None
        aggregation_difference_trgt_minus_src = None
        count_nulls_src = None
        count_nulls_trgt = None
        count_nulls_equal = None
        count_nulls_difference_trgt_minus_src = None

        # Presence and datatype on the source side (column names compared
        # case-insensitively via their upper-case form).
        if column in result_params.src_columns_upper:
            in_src = True
            src_datatype = next(item["DATA_TYPE"] for item in result_params.src_column_datatypes if item["COLUMN_NAME"].upper() == column)
        else:
            in_src = False
            src_datatype = None

        # Presence and datatype on the target side.
        if column in result_params.trgt_columns_upper:
            in_trgt = True
            trgt_datatype = next(item["DATA_TYPE"] for item in result_params.trgt_column_datatypes if item["COLUMN_NAME"].upper() == column)
        else:
            in_trgt = False
            trgt_datatype = None

        if column.upper() in exclude_columns:
            in_excluded = True
        else:
            in_excluded = False

        # Detailed checks only make sense when the column exists on both sides.
        if in_src and in_trgt :
            in_sync=True
            if result_params.src_columns_aggregate != {}:

                # Aggregate tuples are (aggregation type, aggregation result,
                # null count) — presumably produced by the comparison service;
                # TODO confirm tuple layout against ComparisonService.
                if column in result_params.src_columns_aggregate:
                    aggregation_type_src = result_params.src_columns_aggregate[column][0]
                    aggregation_result_src = result_params.src_columns_aggregate[column][1]
                    count_nulls_src = result_params.src_columns_aggregate[column][2]

                if column in result_params.trgt_columns_aggregate:
                    aggregation_type_trgt = result_params.trgt_columns_aggregate[column][0]
                    aggregation_result_trgt = result_params.trgt_columns_aggregate[column][1]
                    count_nulls_trgt = result_params.trgt_columns_aggregate[column][2]

                # A non-empty difference other than the sentinel '0_0' means
                # the aggregations differ.
                if column in result_params.aggregation_differences_trgt_minus_src and result_params.aggregation_differences_trgt_minus_src[column] and not result_params.aggregation_differences_trgt_minus_src[column] == '0_0':
                    aggregation_equal = False
                    aggregation_difference_trgt_minus_src = result_params.aggregation_differences_trgt_minus_src[column]

                elif aggregation_result_src is not None and aggregation_result_trgt is not None and aggregation_type_src and aggregation_type_trgt and aggregation_type_src == aggregation_type_trgt:
                    aggregation_equal = True

                    # Preserve the '0_0' sentinel when present, else plain '0'.
                    if column in result_params.aggregation_differences_trgt_minus_src and result_params.aggregation_differences_trgt_minus_src[column] == '0_0':
                        aggregation_difference_trgt_minus_src='0_0'

                    else:
                        aggregation_difference_trgt_minus_src='0'

                if aggregation_type_src and aggregation_type_trgt and aggregation_type_src == aggregation_type_trgt:
                    aggregation_type = aggregation_type_src

                '''
                Comparison Based on Decimal Places
                Logic is defined in migration_config.json
                '''
                aggregation_tolerated = aggregation_equal

                # With a DATATYPE_TOLERANCE configured, a SUM difference within
                # the tolerance still counts as tolerated; without the config
                # the tolerated flag stays unset (None).
                if 'DATATYPE_TOLERANCE' in TestingToolParams.migration_config['MAPPING'].keys():
                    if (
                        src_datatype in TestingToolParams.migration_config['MAPPING']['DATATYPE_TOLERANCE'].keys()
                        and aggregation_type == 'SUM'
                        and abs(Decimal(aggregation_difference_trgt_minus_src)) <= Decimal(TestingToolParams.migration_config['MAPPING']['DATATYPE_TOLERANCE'][src_datatype])
                    ):
                        aggregation_tolerated = True
                else :
                    aggregation_tolerated = None

                # NULL counts: equal -> string '0'; unequal -> int difference.
                # NOTE(review): the difference is str in one branch and int in
                # the other — confirm downstream consumers accept both.
                if count_nulls_src is not None and count_nulls_trgt is not None and count_nulls_src==count_nulls_trgt:
                    count_nulls_equal = True
                    count_nulls_difference_trgt_minus_src = '0'
                elif count_nulls_src is not None and count_nulls_trgt is not None:
                    count_nulls_equal = False
                    count_nulls_difference_trgt_minus_src = int(count_nulls_trgt)-int(count_nulls_src)

            datatype_equal = ResultService._compare_column_datatypes(src_datatype, trgt_datatype)

        column_comparison_result = {
            "COLUMN_NAME": column,
            "IN_SRC": in_src,
            "IN_TRGT": in_trgt,
            "IN_SYNC": in_sync,
            "IN_EXCLUDED": in_excluded,
            "SRC_DATATYPE": src_datatype,
            "TRGT_DATATYPE": trgt_datatype,
            "DATATYPE_EQUAL": datatype_equal,
            "AGGREGATION_TYPE": aggregation_type,
            "AGGREGATION_EQUAL": aggregation_equal,
            "AGGREGATION_EQUAL_TOLERATED": aggregation_tolerated,
            "AGGREGATION_RESULT_SRC": aggregation_result_src,
            "AGGREGATION_RESULT_TRGT": aggregation_result_trgt,
            "AGGREGATION_DIFFERENCE_TRGT_MINUS_SRC": aggregation_difference_trgt_minus_src,
            "COUNT_NULLS_EQUAL": count_nulls_equal,
            "COUNT_NULLS_SRC": count_nulls_src,
            "COUNT_NULLS_TRGT": count_nulls_trgt,
            "COUNT_NULLS_DIFFERENCE_TRGT_MINUS_SRC": count_nulls_difference_trgt_minus_src
        }

        return column_comparison_result
227
+
228
    @staticmethod
    def prepare_object_level_result(
        src_object: DatabaseObject,
        trgt_object: DatabaseObject,
        src_filter: str,
        trgt_filter: str,
        exclude_columns: list,
        result_params: ResultParams,
        column_level_comparison_result: dict
    ) -> dict:
        """
        Get object level result dictionary from the result parameters and from
        the column level result.

        The AGGREGATIONS_EQUAL_TOLERATED flag is derived here: None when no
        DATATYPE_TOLERANCE is configured, True when the aggregations already
        match or every column-level tolerated flag (that was set) is True,
        otherwise False.
        """

        if 'DATATYPE_TOLERANCE' not in TestingToolParams.migration_config['MAPPING']:
            aggregations_equal_tolerated = None
        elif result_params.aggregations_equal:
            aggregations_equal_tolerated = True
        elif all([column['AGGREGATION_EQUAL_TOLERATED'] for column in column_level_comparison_result if column['AGGREGATION_EQUAL_TOLERATED'] is not None]):
            aggregations_equal_tolerated = True
        else:
            aggregations_equal_tolerated = False

        # Key order is preserved on JSON serialization — do not reorder.
        object_level_comparison_result = {
            "SRC_DATABASE_NAME": src_object.database,
            "SRC_SCHEMA_NAME": src_object.schema,
            "SRC_OBJECT_NAME": src_object.name,
            "SRC_OBJECT_TYPE": src_object.type,
            "TRGT_DATABASE_NAME": trgt_object.database,
            "TRGT_SCHEMA_NAME": trgt_object.schema,
            "TRGT_OBJECT_NAME": trgt_object.name,
            "TRGT_OBJECT_TYPE": trgt_object.type,
            "SRC_FILTER": src_filter,
            "TRGT_FILTER": trgt_filter,
            "EXCLUDED_COLUMNS": exclude_columns,
            "COLUMNS_EQUAL": result_params.columns_equal,
            "COLUMN_INTERSECTION": result_params.intersection_columns_trgt_src,
            "SRC_COLUMNS_MINUS_TRGT_COLUMNS": result_params.src_columns_minus_trgt_columns,
            "TRGT_COLUMNS_MINUS_SRC_COLUMNS": result_params.trgt_columns_minus_src_columns,
            "DATATYPES_EQUAL": result_params.datatypes_equal,
            "ROW_COUNTS_EQUAL": result_params.row_counts_equal,
            "SRC_ROW_COUNT": result_params.src_row_count,
            "TRGT_ROW_COUNT": result_params.trgt_row_count,
            "ALL_COUNT_NULLS_EQUAL": result_params.all_count_nulls_equal,
            "AGGREGATIONS_EQUAL": result_params.aggregations_equal,
            "AGGREGATIONS_EQUAL_TOLERATED": aggregations_equal_tolerated,
            "SRC_ERROR": result_params.src_error_dict,
            "TRGT_ERROR": result_params.trgt_error_dict,
            "GROUP_BY_COLUMNS": result_params.object_group_by_columns,
            "SRC_GROUP_BY_QUERY": result_params.src_group_by_query,
            "TRGT_GROUP_BY_QUERY": result_params.trgt_group_by_query,
            "GROUP_BY_EQUAL": result_params.group_by_equal,
            "GROUP_BY_VALUES_WITH_MISMATCHES": result_params.group_by_values_with_mismatches,
            "COLUMNS_WITH_MISMATCH": result_params.columns_with_mismatch,
            "GROUP_BY_DIFF_DICT": result_params.group_by_diff_dict,
            "SRC_GROUP_BY_ERROR": result_params.src_group_by_error,
            "TRGT_GROUP_BY_ERROR": result_params.trgt_group_by_error,
            "SAMPLES_COMPARED": result_params.samples_compared,
            "SAMPLES_EQUAL": result_params.samples_equal,
            "SAMPLE_KEYS": result_params.trgt_key_filters,
            "SRC_SAMPLE": result_params.src_sample_dict,
            "TRGT_SAMPLE": result_params.trgt_sample_dict,
            "SRC_SAMPLE_QUERY": result_params.src_sample_query,
            "TRGT_SAMPLE_QUERY": result_params.trgt_sample_query,
            "SRC_SAMPLE_ERROR_DICT": result_params.src_sample_error_dict,
            "TRGT_SAMPLE_ERROR_DICT": result_params.trgt_sample_error_dict,
            "PANDAS_DATAFRAME_COMPARED" : result_params.pandas_df_compared,
            "PANDAS_DATAFRAME_EQUAL": result_params.pandas_df_is_equal,
            "SRC_NOT_ALTERED_DURING_COMPARISON": result_params.not_altered_during_comparison_src,
            "TRGT_NOT_ALTERED_DURING_COMPARISON": result_params.not_altered_during_comparison_trgt,
            "SRC_LAST_ALTERED": result_params.last_altered_src,
            "TRGT_LAST_ALTERED": result_params.last_altered_trgt,
            "ALL_COLUMNS": result_params.all_columns_trgt_src,
            "COLUMNS": column_level_comparison_result

            #"PANDAS_DATAFRAME_MISMATCH": str(err_msg)
        }

        return object_level_comparison_result
307
+
308
+ @staticmethod
309
+ def prepare_object_level_live_result(
310
+ object_level_comparison_result: dict,
311
+ testing_tool_params: TestingToolParams,
312
+ ) -> dict:
313
+ """
314
+ Get object level live result dictionary from the object level comparison result and from the testing tool parameters.
315
+ """
316
+ live_object_level_comparison_result = {
317
+ "PIPELINE_NAME": testing_tool_params.pipeline_name,
318
+ "PIPELINE_ID": testing_tool_params.pipeline_id,
319
+ "RUN_GUID": testing_tool_params.run_guid,
320
+ "SOURCE_SYSTEM": testing_tool_params.source_system_selection,
321
+ "TARGET_SYSTEM": testing_tool_params.target_system_selection,
322
+ "DATABASE_NAME": testing_tool_params.database_name,
323
+ #"ALL_OBJECTS_NOT_ALTERED_DURING_COMPARISON": True,
324
+ "OBJECTS": object_level_comparison_result
325
+ }
326
+
327
+ return live_object_level_comparison_result
328
+
329
+ def determine_highlevel_results(self):
330
+ """
331
+ Determine highlevel results based on all object level results.
332
+ """
333
+ logger.info(f"++++++++++++++++ DETERMINE highlevel results")
334
+
335
+ if any(not object_level_comparison_result['COLUMNS_EQUAL'] for object_level_comparison_result in self.results["OBJECTS"]):
336
+ self.results["ALL_COLUMNS_EQUAL"] = False
337
+ else:
338
+ self.results["ALL_COLUMNS_EQUAL"] = True
339
+
340
+ if any(not object_level_comparison_result['DATATYPES_EQUAL'] for object_level_comparison_result in self.results["OBJECTS"]):
341
+ self.results["ALL_DATATYPES_EQUAL"] = False
342
+ else:
343
+ self.results["ALL_DATATYPES_EQUAL"] = True
344
+
345
+ if any(not object_level_comparison_result['ROW_COUNTS_EQUAL'] for object_level_comparison_result in self.results["OBJECTS"]):
346
+ self.results["ALL_ROWCOUNTS_EQUAL"] = False
347
+ else:
348
+ self.results["ALL_ROWCOUNTS_EQUAL"] = True
349
+
350
+ if any(not object_level_comparison_result['AGGREGATIONS_EQUAL'] for object_level_comparison_result in self.results["OBJECTS"]):
351
+ self.results["ALL_CHECKSUMS_EQUAL"] = False
352
+ else:
353
+ self.results["ALL_CHECKSUMS_EQUAL"] = True
354
+
355
+ if all(not object_level_comparison_result['SAMPLES_COMPARED'] for object_level_comparison_result in self.results["OBJECTS"]):
356
+ self.results["ALL_SAMPLES_EQUAL"] = None
357
+ elif any(not object_level_comparison_result['SAMPLES_EQUAL'] for object_level_comparison_result in self.results["OBJECTS"]):
358
+ self.results["ALL_SAMPLES_EQUAL"] = False
359
+ else:
360
+ self.results["ALL_SAMPLES_EQUAL"] = True
361
+
362
+ if all(not object_level_comparison_result['PANDAS_DATAFRAME_COMPARED'] for object_level_comparison_result in self.results["OBJECTS"]):
363
+ self.results["ALL_OBJECTS_EQUAL"] = None
364
+ elif any(not object_level_comparison_result['PANDAS_DATAFRAME_EQUAL'] for object_level_comparison_result in self.results["OBJECTS"]):
365
+ self.results["ALL_OBJECTS_EQUAL"] = False
366
+ else:
367
+ self.results["ALL_OBJECTS_EQUAL"] = True
368
+
369
+ # TODO add ALL_OBJECTS_NOT_ALTERED_DURING_COMPARISON flag
370
+ #if any(object_level_comparison_result['NOT_ALTERED_DURING_COMPARISON_SRC'] == False for object_level_comparison_result in self.results["OBJECTS"]) or any(object_level_comparison_result['NOT_ALTERED_DURING_COMPARISON_TRGT'] == False for object_level_comparison_result in self.results["OBJECTS"]):
371
+ # self.results["ALL_OBJECTS_NOT_ALTERED_DURING_COMPARISON"] = False
372
+ #else:
373
+ # self.results["ALL_OBJECTS_NOT_ALTERED_DURING_COMPARISON"] = True
374
+
375
+ logger.info("\n****************************************************")
376
+ logger.info(f"++++++++++++++++ Highlevel results ++++++++++++++++")
377
+ logger.info(f"RUN_GUID: {self.run_guid}")
378
+ logger.info(f"NUMBER_OF_OBJECTS_TO_COMPARE: {self.results['NUMBER_OF_OBJECTS_TO_COMPARE']}")
379
+ logger.info(f"ALL_OBJECTS_MATCHING: {self.results['ALL_OBJECTS_MATCHING']}")
380
+ logger.info(f"ALL_COLUMNS_EQUAL: {self.results['ALL_COLUMNS_EQUAL']}")
381
+ logger.info(f"ALL_DATATYPES_EQUAL: {self.results['ALL_DATATYPES_EQUAL']}")
382
+ logger.info(f"ALL_ROWCOUNTS_EQUAL: {self.results['ALL_ROWCOUNTS_EQUAL']}")
383
+ logger.info(f"ALL_CHECKSUMS_EQUAL: {self.results['ALL_CHECKSUMS_EQUAL']}")
384
+ logger.info(f"ALL_SAMPLES_EQUAL: {self.results['ALL_SAMPLES_EQUAL']}")
385
+ logger.info(f"ALL_OBJECTS_EQUAL: {self.results['ALL_OBJECTS_EQUAL']}")
386
+ logger.info("****************************************************\n")
387
+
388
+
389
+ def load_results_to_result_database(self):
390
+ """
391
+ Initialize database service for result-system.
392
+ Load results to result database.
393
+ """
394
+ result_system_selection_type=self.systems[self.result_system_selection]["DATABASE_TYPE"]
395
+
396
+ result_system = SystemService(self.result_system_selection, self.systems)
397
+ self.connection_params_result = result_system.get_connection_params()
398
+ database_service_result=result_system.initialize_database_service(self.connection_params_result)
399
+
400
+ with database_service_result as db_service_result:
401
+
402
+ load_results_function = self.load_results_function_mapping[result_system_selection_type.upper()]
403
+
404
+ if load_results_function:
405
+ load_results_function(db_service_result, self.results)
406
+ else:
407
+ raise ValueError(f"Result system selection of type '{result_system_selection_type}' not supported!")
408
+
409
    def load_results_to_snowflake(self, db_service_result, results: dict):
        """
        Load results to Snowflake.

        Order matters: the result JSON file is first uploaded to a temporary
        stage, then inserted as raw JSON; the object- and column-level tables
        are derived from the raw JSON result table afterwards.
        """
        logger.info(f"++++++++++++++++ LOAD comparison results to Snowflake")

        # Stage the local result file (temporary stage is dropped with session).
        db_service_result.upload_to_stage(self.stage_name, self.result_folder_path, self.result_file_name, is_temporary=True)

        # Raw JSON results, loaded from the stage.
        db_service_result.insert_json_results(self.run_guid, self.pipeline_name, self.pipeline_id, self.start_time_utc, self.result_table, self.stage_name)

        # Highlevel flags come from the in-memory results dict.
        db_service_result.insert_highlevel_results(results, self.run_guid, self.pipeline_name, self.pipeline_id, self.result_table_highlevel)

        # Object- and column-level rows are derived from the raw result table.
        db_service_result.insert_objectlevel_results(self.result_table, self.result_table_objectlevel, self.run_guid)

        db_service_result.insert_columnlevel_results(self.result_table, self.result_table_columnlevel, self.run_guid)
424
+
425
    def load_results_to_databricks(self, db_service_result, results: dict):
        """
        Load results to Databricks Hive Metastore or Unity Catalog.

        Unlike the Snowflake loader there is no stage: the in-memory results
        are passed to the database service directly, and the result schemas
        are created first if missing.
        """
        logger.info(f"++++++++++++++++ LOAD comparison results to Databricks")

        # Ensure both the metadata schema and the result schema exist.
        db_service_result.create_schemas(
            database_name=self.database_name,
            schemas=[self.result_meta_data_schema_name, self.result_schema_name],
        )

        # NOTE(review): this method receives `results` as a parameter but
        # passes the class-level `self.results` below — presumably the same
        # object; confirm and unify.
        db_service_result.insert_json_results(
            self.run_guid,
            self.pipeline_name,
            self.pipeline_id,
            self.start_time_utc,
            self.result_table,
            self.results,
        )

        db_service_result.insert_highlevel_results(
            results,
            self.run_guid,
            self.pipeline_name,
            self.pipeline_id,
            self.result_table_highlevel,
        )

        db_service_result.insert_objectlevel_results(
            self.result_table,
            self.result_table_objectlevel,
            self.run_guid,
            self.results,
        )

        db_service_result.insert_columnlevel_results(
            self.result_table, self.result_table_columnlevel, self.run_guid, self.results
        )
463
+
464
+ def upload_json_result_to_blob(self, start_time_utc:str) -> str:
465
+
466
+ """
467
+ Upload the comparison result (JSON) to a blob storage and return the full blob url. If blob container does not exist create it before uploading the blob.
468
+ """
469
+ logger.info(f"++++++++++++++++ LOAD comparison results to Azure Blob Storage")
470
+
471
+ prep_result_json = json.dumps(self.results, indent = 4, cls=CustomJSONEncoder)
472
+
473
+ blob_file_prefix = start_time_utc[0:10]
474
+ blob_file_name = f"comparison_results_{start_time_utc}_{self.pipeline_name}_{self.pipeline_id}_{self.run_guid}.json"
475
+ blob_name = f"{blob_file_prefix}/{blob_file_name}"
476
+
477
+ try:
478
+ blob_service_client = BlobServiceClient.from_connection_string(conn_str=self.azure_storage_connection_string)
479
+ except Exception as error:
480
+ logger.info(f"FAILED to connect to Azure Blob Storage with error '{str(error)}'")
481
+ raise error
482
+
483
+ container_client = blob_service_client.get_container_client(self.container_name)
484
+
485
+ if not container_client.exists():
486
+
487
+ container_client = blob_service_client.create_container(self.container_name)
488
+
489
+ blob_client = blob_service_client.get_blob_client(container=self.container_name, blob=blob_name)
490
+
491
+ blob_url = blob_client.url
492
+
493
+ try:
494
+ logger.info(f"Upload comparison result (JSON) for run_guid {self.run_guid} and pipeline_id {self.pipeline_id} to Azure Blob Storage under '{blob_url}'")
495
+ blob_client.upload_blob(prep_result_json)
496
+ except Exception as error:
497
+ logger.info(f"FAILED comparison result (JSON) upload to Azure Blob Storage under '{blob_url}' with error '{str(error)}'")
498
+ raise error
499
+
500
+ def upload_json_result_to_bucket(self, start_time_utc:str) -> str:
501
+
502
+ """
503
+ Upload the comparison result (JSON) to an AWS S3 bucket.
504
+ """
505
+ logger.info(f"++++++++++++++++ LOAD comparison results to AWS Bucket")
506
+
507
+ prep_result_json = json.dumps(self.results, indent = 4, cls=CustomJSONEncoder)
508
+
509
+ bucket_file_prefix = start_time_utc[0:10]
510
+ bucket_file_info = f"comparison_results_{start_time_utc}_{self.pipeline_name}_{self.pipeline_id}_{self.run_guid}.json"
511
+ bucket_file_name = f"{bucket_file_prefix}_-_{bucket_file_info}"
512
+
513
+ try:
514
+ s3_service_client = boto3.client(
515
+ 's3',
516
+ aws_access_key_id=self.aws_bucket_access_key,
517
+ aws_secret_access_key=self.aws_bucket_secret_key
518
+ )
519
+ except Exception as error:
520
+ logger.info(f"FAILED to connect to AWS S3 bucket with error '{str(error)}'")
521
+ raise error
522
+
523
+ try:
524
+ logger.info(f"Upload comparison result (JSON) for run_guid {self.run_guid} and pipeline_id {self.pipeline_id} to AWS S3 bucket")
525
+ s3_service_client.put_object(
526
+ Body=prep_result_json,
527
+ Bucket=self.bucket_name,
528
+ Key=bucket_file_name
529
+ )
530
+ except Exception as error:
531
+ logger.info(f"FAILED comparison result (JSON) upload to AWS S3 bucket with error '{str(error)}'")
532
+ raise error
533
+
534
    def write_results_to_git(self):
        """
        Write comparison results to GIT repository.
        In case of a remote pipeline run: Pull latest changes from GIT before
        writing to the local repository, and push to the remote repository at
        the end.
        """
        logger.info(f"++++++++++++++++ WRITE comparison results to GIT repository")

        # pipeline_id set -> remote pipeline run: sync with origin first.
        # NOTE(review): subprocess.run return codes are not checked anywhere
        # in this method; failures are silently ignored — confirm intended.
        if self.pipeline_id:
            logger.info(f"++++++ Pull latest changes from GIT")
            subprocess.run(["git", "checkout", f"origin/{self.branch_name}"])
            subprocess.run(["git", "pull", "--no-rebase"])

        logger.info(f"++++++++++++++++ WRITE to local GIT repository")

        write_json_to_file(self.results, self.result_file_path)

        if self.remaining_mapping_objects:
            logger.info(f"++++++++++++++++ WRITE remaining mapping objects to local GIT repository")

            write_json_to_file( self.remaining_mapping_objects, self.remaining_mapping_objects_file_path)

        if self.pipeline_id:
            logger.info(f"++++++++++++++++ PUSH latest changes to GIT Source Branch: {self.source_branch}; Branch: {self.branch_name}")

            # Azure DevOps: commit the result folders and push to the source branch.
            if self.azure_devops_pipeline:
                subprocess.run(["git", "add", f"{self.remaining_mapping_objects_folder_name}"])
                subprocess.run(["git", "add", f"{self.result_folder_name}"])
                subprocess.run(["git", "commit", "-m", f"Added icsDataValidation Tool comparison results of the {self.pipeline_name} Pipeline run with ID {self.pipeline_id}"])
                subprocess.run(["git", "push", "-u","origin", f"HEAD:{self.source_branch}"])

            # GitLab: same flow, plus an authenticated push with ci.skip so the
            # result commit does not trigger a new pipeline.
            # NOTE(review): the access token is embedded in the push URL and may
            # leak into process listings/logs — consider a credential helper.
            if self.gitlab_pipeline:
                branches = subprocess.run(["git", "branch"], stdout=subprocess.PIPE, text=True)
                logger.info('+++ BEGIN BRANCHES')
                logger.info(branches.stdout)
                logger.info('+++ END BRANCHES')
                subprocess.run(["git", "add", f"{self.remaining_mapping_objects_folder_name}"])
                subprocess.run(["git", "add", f"{self.result_folder_name}"])
                subprocess.run(["git", "commit", "-m", f"Added icsDataValidation Tool comparison results of the {self.pipeline_name} Pipeline run with ID {self.pipeline_id}"])
                subprocess.run(["git", "push", "-u","origin", f"HEAD:{self.source_branch}"])
                subprocess.run(["git", "push", f"https://user:{self.testatm_access_token}@{self.gitlab_ci_server_host}/{self.gitlab_ci_project_path}.git/", "-u","origin", f"HEAD:origin/{self.branch_name}", "-o", "ci.skip"])