icsDataValidation 1.0.358__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- icsDataValidation/configuration.py +19 -0
- icsDataValidation/connection_setups/__init__.py +0 -0
- icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
- icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
- icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
- icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
- icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
- icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
- icsDataValidation/core/__init__.py +0 -0
- icsDataValidation/core/database_objects.py +18 -0
- icsDataValidation/core/object_comparison.py +239 -0
- icsDataValidation/input_parameters/__init__.py +0 -0
- icsDataValidation/input_parameters/testing_tool_params.py +81 -0
- icsDataValidation/main.py +250 -0
- icsDataValidation/output_parameters/__init__.py +0 -0
- icsDataValidation/output_parameters/result_params.py +94 -0
- icsDataValidation/services/__init__.py +0 -0
- icsDataValidation/services/comparison_service.py +582 -0
- icsDataValidation/services/database_services/__init__.py +0 -0
- icsDataValidation/services/database_services/azure_service.py +320 -0
- icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
- icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
- icsDataValidation/services/database_services/exasol_service.py +261 -0
- icsDataValidation/services/database_services/oracle_service.py +713 -0
- icsDataValidation/services/database_services/snowflake_service.py +1100 -0
- icsDataValidation/services/database_services/teradata_service.py +665 -0
- icsDataValidation/services/initialization_service.py +103 -0
- icsDataValidation/services/result_service.py +573 -0
- icsDataValidation/services/system_service.py +61 -0
- icsDataValidation/services/testset_service.py +257 -0
- icsDataValidation/utils/__init__.py +0 -0
- icsDataValidation/utils/file_util.py +96 -0
- icsDataValidation/utils/logger_util.py +96 -0
- icsDataValidation/utils/pandas_util.py +159 -0
- icsDataValidation/utils/parallelization_util.py +52 -0
- icsDataValidation/utils/sql_util.py +14 -0
- icsDataValidation-1.0.358.dist-info/METADATA +21 -0
- icsDataValidation-1.0.358.dist-info/RECORD +40 -0
- icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
- icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
icsDataValidation/services/system_service.py
@@ -0,0 +1,61 @@
+from icsDataValidation.connection_setups.snowflake_connection_setup import load_snowflake_credentials
+from icsDataValidation.connection_setups.exasol_connection_setup import load_exasol_credentials
+from icsDataValidation.connection_setups.azure_connection_setup import load_azure_credentials
+from icsDataValidation.connection_setups.teradata_connection_setup import load_teradata_credentials
+from icsDataValidation.connection_setups.oracle_connection_setup import load_oracle_credentials
+from icsDataValidation.connection_setups.databricks_connection_setup import load_databricks_credentials
+from icsDataValidation.services.database_services.snowflake_service import SnowflakeService
+from icsDataValidation.services.database_services.teradata_service import TeradataService
+from icsDataValidation.services.database_services.exasol_service import ExasolService
+from icsDataValidation.services.database_services.azure_service import AzureService
+from icsDataValidation.services.database_services.oracle_service import OracleService
+from icsDataValidation.services.database_services.databricks_hive_metastore_service import DatabricksHiveMetastoreService
+from icsDataValidation.services.database_services.databricks_unity_catalog_service import DatabricksUnityCatalogService
+
+#########################################################################################
+#########################################################################################
+
+class SystemService:
+    """
+    Class to initialize database services dependent on the system selection.
+    """
+
+    def __init__(self, system_selection: str, database_config: dict):
+        self.database_config = database_config
+        self.system_selection = system_selection
+        self.system_type = database_config[system_selection]["DATABASE_TYPE"].upper()
+
+    def get_connection_params(self):
+        """
+        Get the connection parameters dependent on the system type.
+        """
+        credentials_function_mapping = {
+            "SNOWFLAKE": load_snowflake_credentials,
+            "EXASOL": load_exasol_credentials,
+            "AZURE": load_azure_credentials,
+            "TERADATA": load_teradata_credentials,
+            "ORACLE": load_oracle_credentials,
+            "DATABRICKS_HIVE_METASTORE": load_databricks_credentials,
+            "DATABRICKS_UNITY_CATALOG": load_databricks_credentials,
+        }
+
+        connection_params = credentials_function_mapping[self.system_type](
+            self.database_config, self.system_selection
+        )
+        return connection_params
+
+    def initialize_database_service(self, connection_params: dict):
+        """
+        Initialize the database service dependent on the system type.
+        """
+        database_service_mapping = {
+            "SNOWFLAKE": SnowflakeService,
+            "EXASOL": ExasolService,
+            "AZURE": AzureService,
+            "TERADATA": TeradataService,
+            "ORACLE": OracleService,
+            "DATABRICKS_HIVE_METASTORE": DatabricksHiveMetastoreService,
+            "DATABRICKS_UNITY_CATALOG": DatabricksUnityCatalogService,
+        }
+        database_service = database_service_mapping[self.system_type](connection_params)
+        return database_service
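
A minimal usage sketch for SystemService (not part of the package): the config entry names are invented and the credential fields each load_*_credentials function expects are omitted; only the DATABASE_TYPE key and the two-step get_connection_params / initialize_database_service flow come from the code above.

    from icsDataValidation.services.system_service import SystemService

    # Hypothetical config: entry names invented, credential fields omitted.
    database_config = {
        "SRC_SNOWFLAKE": {"DATABASE_TYPE": "snowflake"},
        "TRGT_DATABRICKS": {"DATABASE_TYPE": "databricks_unity_catalog"},
    }

    system_service = SystemService("SRC_SNOWFLAKE", database_config)
    connection_params = system_service.get_connection_params()        # dispatches to load_snowflake_credentials (needs the real credential fields)
    database_service = system_service.initialize_database_service(connection_params)  # builds a SnowflakeService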
icsDataValidation/services/testset_service.py
@@ -0,0 +1,257 @@
+import logging
+
+from icsDataValidation.utils.logger_util import configure_dev_ops_logger
+
+#########################################################################################
+# Configure Dev Ops Logger
+
+logger = logging.getLogger('TestsetService')
+logger.setLevel(logging.INFO)
+configure_dev_ops_logger(logger)
+
+#########################################################################################
+#########################################################################################
+
+class TestsetService:
+    """
+    Class to prepare the set of objects for the comparison.
+    Maps schemas and objects between source and target.
+    Handles blacklists and whitelists.
+    """
+
+    def __init__(self, testset_mapping: dict, testset_blacklist: dict, testset_whitelist: dict=None):
+        self.testset_mapping = testset_mapping
+        self.testset_blacklist = testset_blacklist
+        self.testset_whitelist = testset_whitelist
+
+    def handle_database_mapping(self, source_database_name: str = None) -> str:
+        """
+        Map the source and the target database.
+        Note: Case-insensitive and returns upper-case target database name.
+        """
+        target_database_name=source_database_name.upper()
+
+        if self.testset_mapping and "DATABASE_MAPPING" in self.testset_mapping:
+            for database_mapping in self.testset_mapping["DATABASE_MAPPING"]:
+                if source_database_name.upper() == database_mapping["src_database_name"].upper():
+                    target_database_name = database_mapping["trgt_database_name"].upper()
+
+        return target_database_name
+
+    def handle_schema_mapping(self, source_schema_name: str = None, source_database_name: str = None) -> str:
+        """
+        Map the source and the target schema.
+        Note: Case-insensitive and returns upper-case target schema name.
+        """
+        target_schema_name=source_schema_name.upper()
+        found_schema_mapping = False
+
+        if self.testset_mapping and "SCHEMA_MAPPING" in self.testset_mapping:
+            for schema_mapping in self.testset_mapping["SCHEMA_MAPPING"]:
+
+                if f"{source_database_name.upper()}.{source_schema_name.upper()}" == schema_mapping["src_schema_identifier"].upper():
+                    target_schema_name = schema_mapping["trgt_schema_name"].upper()
+                    found_schema_mapping = True
+
+        return target_schema_name, found_schema_mapping
+
+    def handle_schema_replace_mapping(self, source_schema_name: str = None) -> str:
+        """
+        Map the source and the target schema by replacing a subset of the target schema string.
+        Note: Case-insensitive and returns upper-case target schema name.
+        """
+
+        if self.testset_mapping and "SCHEMA_REPLACE_MAPPING" in self.testset_mapping:
+            replace_mapping = self.testset_mapping["SCHEMA_REPLACE_MAPPING"]
+            for replace_object in replace_mapping:
+                target_schema_name = source_schema_name.upper().replace(
+                    replace_object["src_replace_value"].upper(),
+                    replace_object["trgt_replace_value"].upper(),
+                )
+        else:
+            target_schema_name=source_schema_name.upper()
+
+        return target_schema_name
+
+    def handle_blacklist(self, database_objects: dict, src_trgt: str)-> dict:
+        """
+        Handle the blacklist from the migration_config to restrict database objects.
+        Use src_trgt="SRC" for source and src_trgt="TRGT" for target.
+        """
+        blacklist_objects=[object_blacklisted.upper() for object_blacklisted in self.testset_blacklist[f"BLACKLIST_OBJECTS_{src_trgt}"]]
+        blacklist_schemas=[schema_blacklisted.upper() for schema_blacklisted in self.testset_blacklist[f"BLACKLIST_SCHEMAS_{src_trgt}"]]
+        blacklist_databases=[database_blacklisted.upper() for database_blacklisted in self.testset_blacklist[f"BLACKLIST_DATABASES_{src_trgt}"]]
+
+        database_objects_=database_objects.copy()
+
+        for db_object in database_objects_:
+            database_name = db_object["object_identifier"].split(".",1)[0]
+            schema_identifier = ".".join(db_object["object_identifier"].split(".",2)[:2])
+            if database_name in blacklist_databases:
+                database_objects.remove(db_object)
+            elif schema_identifier in blacklist_schemas:
+                database_objects.remove(db_object)
+            elif db_object["object_identifier"] in blacklist_objects:
+                database_objects.remove(db_object)
+
+        return database_objects
+
+    def handle_whitelist(self, database_objects: dict, src_trgt: str)-> dict:
+        """
+        Handle the whitelist which is defined as a testset to restrict database objects.
+        Use src_trgt="SRC" for source and src_trgt="TRGT" for target.
+        """
+        whitelist_objects=[object_whitelisted.upper() for object_whitelisted in self.testset_whitelist[f"WHITELIST_OBJECTS_{src_trgt}"]]
+        whitelist_schemas=[schema_whitelisted.upper() for schema_whitelisted in self.testset_whitelist[f"WHITELIST_SCHEMAS_{src_trgt}"]]
+        whitelist_databases=[database_whitelisted.upper() for database_whitelisted in self.testset_whitelist[f"WHITELIST_DATABASES_{src_trgt}"]]
+
+        database_objects_=database_objects.copy()
+
+        for db_object in database_objects_:
+            database_name = db_object["object_identifier"].split(".",1)[0]
+            schema_identifier = ".".join(db_object["object_identifier"].split(".",2)[:2])
+            if not db_object["object_identifier"].upper() in whitelist_objects and schema_identifier.upper() not in whitelist_schemas and database_name.upper() not in whitelist_databases:
+                database_objects.remove(db_object)
+
+        return database_objects
+
+    def map_objects(self, database_objects_src: list, database_objects_trgt: list):
+        """
+        Maps objects between source and target by using the mapping defined in the migration_config.json.
+        Handles object "1:1"-mapping and object "replace"-mapping.
+        Returns remaining_mapping_objects which differ between source and target and cannot be mapped.
+        Returns a flag all_objects_matching which indicates if there exist remaining_mapping_objects.
+        """
+        intersection_objects_mapped_trgt_src = []
+        remaining_mapping_objects = []
+        src_objects_minus_trgt_objects = [object for object in database_objects_src if object not in database_objects_trgt]
+        trgt_objects_minus_src_objects = [object for object in database_objects_trgt if object not in database_objects_src]
+
+        trgt_objects_minus_src_table_identifiers = [object["object_identifier"] for object in database_objects_trgt if object not in database_objects_src and object["object_type"] == 'table']
+        trgt_objects_minus_src_view_identifiers = [object["object_identifier"] for object in database_objects_trgt if object not in database_objects_src and object["object_type"] == 'view']
+
+        if database_objects_src != database_objects_trgt and self.testset_mapping:
+
+            src_objects_minus_trgt_objects_ = src_objects_minus_trgt_objects.copy()
+
+            trgt_objects_minus_src_object_identifiers=[object["object_identifier"] for object in trgt_objects_minus_src_objects]
+
+            for n_db_object, db_object in enumerate(src_objects_minus_trgt_objects_):
+                logger.info(f"Object {n_db_object+1} of {len(src_objects_minus_trgt_objects_)}: {db_object}")
+                continue_flag = True
+
+                #########################################################################################
+                # Object-Mapping
+                for mapping in self.testset_mapping["OBJECT_MAPPING"]:
+
+                    if (
+                        db_object["object_identifier"] == mapping["src_object_identifier"].upper()
+                        and db_object["object_type"] == mapping["src_object_type"]
+                        and mapping['trgt_object_identifier'].upper() in trgt_objects_minus_src_object_identifiers
+                    ):
+                        logger.info(f" -> mapping object found: {mapping}")
+                        intersection_objects_mapped_trgt_src.append({"src_object_identifier": db_object["object_identifier"],"src_object_type": db_object["object_type"], "trgt_object_identifier": mapping["trgt_object_identifier"],"trgt_object_type": mapping["trgt_object_type"]})
+                        src_objects_minus_trgt_objects.remove(db_object)
+
+                        for trgt_object in trgt_objects_minus_src_objects:
+                            if trgt_object["object_identifier"] == mapping["trgt_object_identifier"].upper():
+                                trgt_objects_minus_src_objects.remove(trgt_object)
+                        logger.info(" -> added by 1:1 mapping")
+
+                        # set continue_flag to false because this object has been covered by the mapping
+                        continue_flag = False
+                        break
+
+                ##########################################################################################
+                # Database-Mapping and Schema-Mapping
+
+                if continue_flag == True:
+
+                    src_database_name = db_object["object_identifier"].split(".",1)[0]
+                    src_schema_name = db_object["object_identifier"].split(".",2)[1]
+                    src_object_name = db_object["object_identifier"].split(".",2)[2]
+
+                    trgt_database_name=self.handle_database_mapping(src_database_name)
+                    trgt_schema_name, _ =self.handle_schema_mapping(src_schema_name,src_database_name)
+
+                    trgt_object_identifier=f"{trgt_database_name}.{trgt_schema_name}.{src_object_name}".upper()
+
+                    if (db_object["object_type"] == 'table' and trgt_object_identifier in trgt_objects_minus_src_table_identifiers) or (db_object["object_type"] == 'view' and trgt_object_identifier in trgt_objects_minus_src_view_identifiers):
+                        intersection_objects_mapped_trgt_src.append({"src_object_identifier": db_object["object_identifier"],"src_object_type": db_object["object_type"], "trgt_object_identifier": trgt_object_identifier,"trgt_object_type": db_object["object_type"]})
+                        src_objects_minus_trgt_objects.remove(db_object)
+
+                        for trgt_object in trgt_objects_minus_src_objects:
+                            if trgt_object["object_identifier"] == trgt_object_identifier:
+                                trgt_objects_minus_src_objects.remove(trgt_object)
+
+                        logger.info(" -> added by database/schema-mapping")
+
+                        # set continue_flag to false because this object has been covered by the replacements
+                        continue_flag = False
+
+                ##########################################################################################
+                # Replace-Mapping
+
+                if continue_flag == True:
+
+                    src_database_name = db_object["object_identifier"].split(".",1)[0]
+                    src_schema_name = db_object["object_identifier"].split(".",2)[1]
+                    src_object_name = db_object["object_identifier"].split(".",2)[2]
+
+                    #TODO rework!!!!
+
+                    ## replace the values from the migration_config.json to create a potential_match which can be looked for in the trgt_objects_minus_src_objects list
+                    #potential_match = db_object["object_identifier"].upper().replace(f'{substitute["src_replace_value"].upper()}',f'{substitute["trgt_replace_value"].upper()}')
+                    #
+                    ## the potential_match is contained within the trgt_objects_minus_src_objects list
+                    #if potential_match in trgt_objects_minus_src_object_identifiers:
+                    #    logger.info(f" -> replace mapping found: {substitute}")
+                    #    intersection_objects_mapped_trgt_src.append({"src_object_identifier": db_object["object_identifier"],"src_object_type": db_object["object_type"], "trgt_object_identifier": potential_match,"trgt_object_type": db_object["object_type"]})
+                    #    src_objects_minus_trgt_objects.remove(db_object)
+                    #
+                    #    for trgt_object in trgt_objects_minus_src_objects:
+                    #        if trgt_object["object_identifier"] == potential_match:
+                    #            trgt_objects_minus_src_objects.remove(trgt_object)
+                    #            logger.info(" -> added by replace mapping")
+                    #
+                    #    # set continue_flag to false because this object has been covered by the replacements
+                    #    continue_flag = False
+                    #    break
+
+                #####################################################################
+                # Remaining objects
+                if continue_flag == True:
+                    remaining_mapping_objects.append({"src_object_identifier": db_object["object_identifier"],"trgt_object_identifier": '',"src_object_type": db_object["object_type"],"trgt_object_type": ''})
+                    logger.info(" -> no mapping found -> added to remaining_mapping_objects")
+
+        object_identifiers_src_minus_trgt= [object["object_identifier"] for object in src_objects_minus_trgt_objects]
+        object_identifiers_trgt_minus_src= [object["object_identifier"] for object in trgt_objects_minus_src_objects]
+
+        if src_objects_minus_trgt_objects:
+            logger.warning('There are database objects in the source db that are not in the target db and for which no mapping exists:')
+            logger.warning(f"{object_identifiers_src_minus_trgt}")
+        if trgt_objects_minus_src_objects:
+            logger.warning('There are database objects in the target db that are not in the source db and for which no mapping exists:')
+            logger.warning(f"{object_identifiers_trgt_minus_src}")
+
+        if not (src_objects_minus_trgt_objects and trgt_objects_minus_src_objects):
+            all_objects_matching=True
+        else:
+            all_objects_matching=False
+
+        return intersection_objects_mapped_trgt_src, object_identifiers_src_minus_trgt, object_identifiers_trgt_minus_src, remaining_mapping_objects, all_objects_matching
+
+    @staticmethod
+    def get_intersection_objects_trgt_src(database_objects_src: list, database_objects_trgt: list, intersection_objects_mapped_trgt_src: list):
+        """
+        Get intersection of all database objects from source db and target db - including mapped objects.
+        """
+
+        intersection_objects_trgt_src_without_mapping =[{"src_object_identifier": object["object_identifier"],"src_object_type": object["object_type"],"trgt_object_identifier": object["object_identifier"],"trgt_object_type": object["object_type"]} for object in database_objects_src if object in database_objects_trgt]
+
+        intersection_objects_trgt_src= intersection_objects_trgt_src_without_mapping + intersection_objects_mapped_trgt_src
+
+        return intersection_objects_trgt_src
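
A small sketch of how a mapping config drives TestsetService (the concrete database and schema names are invented; the DATABASE_MAPPING / SCHEMA_MAPPING key shapes are taken from the code above):

    from icsDataValidation.services.testset_service import TestsetService

    testset_mapping = {
        "DATABASE_MAPPING": [{"src_database_name": "raw_db", "trgt_database_name": "lake_db"}],
        "SCHEMA_MAPPING": [{"src_schema_identifier": "RAW_DB.SALES", "trgt_schema_name": "SALES_CURATED"}],
        "SCHEMA_REPLACE_MAPPING": [],
        "OBJECT_MAPPING": [],
    }
    testset_service = TestsetService(testset_mapping, testset_blacklist={}, testset_whitelist=None)

    testset_service.handle_database_mapping("raw_db")          # -> 'LAKE_DB'
    testset_service.handle_schema_mapping("sales", "raw_db")   # -> ('SALES_CURATED', True)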
File without changes
icsDataValidation/utils/file_util.py
@@ -0,0 +1,96 @@
+import errno
+import os
+from typing import Union
+import json
+import decimal
+import numpy as np
+class CustomJSONEncoder(json.JSONEncoder):
+    def default(self, o):
+        if isinstance(o, decimal.Decimal):
+            return str(o)
+        if isinstance(o, np.integer):
+            return int(o)
+        if isinstance(o, np.floating):
+            return float(o)
+        if isinstance(o, np.ndarray):
+            return o.tolist()
+        try:
+            super(CustomJSONEncoder, self).default(o)
+        except:
+            return str(o)
+
+        return super(CustomJSONEncoder, self).default(o)
+
+def load_file(file_path: str, encoding: str = "utf-8-sig") -> str:
+    """
+    Reads and returns the file content of the given path.
+    Encodings tried in the following order:
+        utf-8-sig - default
+        utf-8
+        utf-16-le
+        utf-16
+        cp1252
+    Args:
+        file_path: absolute file path to a file
+        encoding: specific code page name
+    Raises:
+        EnvironmentError - if the file could not be read with the stated encodings
+    Returns:
+        File content as string representation
+    """
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)
+    else:
+        try:
+            with open(file_path, "r", encoding=encoding) as file:
+                return file.read()
+        except:
+            try:
+                encoding = "utf-8"
+                with open(file_path, "r", encoding=encoding) as file:
+                    return file.read()
+            except:
+                try:
+                    encoding = "utf-16-le"
+                    with open(file_path, "r", encoding=encoding) as file:
+                        return file.read()
+                except:
+                    try:
+                        encoding = "utf-16"
+                        with open(file_path, "r", encoding=encoding) as file:
+                            return file.read()
+                    except:
+                        try:
+                            encoding = "cp1252"
+                            with open(file_path, "r", encoding=encoding) as file:
+                                return file.read()
+                        except:
+                            raise EnvironmentError(
+                                f"Can not read file {file_path}. Tried utf-8-sig (BOM), utf-8, utf-16, utf-16-le and cp1252."
+                            )
+
+def load_json(file_path: str, encoding: str = "utf-8-sig") -> Union[dict, None]:
+    """
+    Reads and returns a given JSON file. Content must be valid JSON.
+    Valid JSON should not have any trailing commas.
+    Encodings tried in the following order:
+        utf-8-sig - default
+        utf-8
+        utf-16-le
+        utf-16
+        cp1252
+    Args:
+        file_path: absolute file path to a file
+        encoding: specific code page name
+    Raises:
+        EnvironmentError - if the file could not be read with the stated encodings
+    Returns:
+        File content as dictionary.
+        If the file path does not exist, None will be returned.
+    """
+    return json.loads(load_file(file_path, encoding))
+
+
+def write_json_to_file(json_object: dict, file_path: str) -> None:
+    with open(f"{file_path}", 'w') as f:
+        json.dump(json_object, f, indent=4, cls=CustomJSONEncoder)
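
A minimal round-trip sketch for the JSON helpers above (the file name is illustrative):

    from icsDataValidation.utils.file_util import load_json, write_json_to_file

    config = {"threshold": 0.01, "systems": ["SRC", "TRGT"]}
    write_json_to_file(config, "migration_config_example.json")
    loaded = load_json("migration_config_example.json")  # retries utf-8, utf-16-le, utf-16, cp1252 if utf-8-sig fails
    assert loaded == config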
icsDataValidation/utils/logger_util.py
@@ -0,0 +1,96 @@
+import logging
+import re
+
+class DevOpsFormatter(logging.Formatter):
+    error_format = "##vso[task.logissue type=error][%(name)s] %(message)s"
+    warning_format = "##vso[task.logissue type=warning][%(name)s] %(message)s"
+    dbg_fmt = "DBG: %(module)s: %(lineno)d: %(msg)s"
+    info_format = "%(message)s"
+    section_format = "%(message)s"
+
+    def __init__(self, fmt="%(levelno)s: %(msg)s", section_info = False):
+        super().__init__(fmt=fmt)
+        self._section_info = section_info
+
+    def parse_progress(self, message: str) -> str:
+        """
+        Parses the message for progress information
+        """
+        progress_pattern = re.compile(r".*PROGRESS\s\[\s*'(?P<x>\d+)\/(?P<y>\d+)'\s*\].*", re.IGNORECASE | re.MULTILINE)
+
+        if (progress_pattern.match(message)):
+            progress_match = progress_pattern.search(message)
+            x = int(progress_match.group('x'))
+            y = int(progress_match.group('y'))
+            progress_value = round((x / y)*100)
+            return f"##vso[task.setprogress value={progress_value};]script progress\n"
+
+        return ""
+
+    def parse_group_start(self, message: str) -> str:
+        """
+        Parses whether the message is a group start and prepends a command string to the message
+        """
+        start_pattern = re.compile(r".*START\s(?P<gname>.+?\s\[\s*'.*'\s*\]).*", re.IGNORECASE | re.MULTILINE)
+
+        if (start_pattern.match(message)):
+            start_match = start_pattern.search(message)
+            return f"##[group]{start_match.group('gname')}\n"
+
+        return ""
+
+    def parse_group_end(self, message: str) -> str:
+        """
+        Parses whether the message is a group end and appends a command string to the message
+        """
+        end_pattern = re.compile(r".*FINISHED\s.+?\s\[\s*'.*'\s*\].*", re.IGNORECASE | re.MULTILINE)
+
+        if (end_pattern.match(message)):
+            return "\n##[endgroup]"
+
+        return ""
+
+    def format(self, record):
+
+        # Save the original format configured by the user
+        # when the logger formatter was instantiated
+        format_orig = self._style._fmt
+
+        if (record.levelno == logging.INFO):
+            record_message = f"{record.msg}"
+
+            return f"{self.parse_progress(record.msg)}{self.parse_group_start(record.msg)}{record_message}{self.parse_group_end(record.msg)}"
+
+        # Replace the original format with one customized by logging level
+        if record.levelno == logging.DEBUG:
+            self._fmt = DevOpsFormatter.dbg_fmt
+        elif record.levelno == logging.INFO and not self._section_info:
+            self._style._fmt = DevOpsFormatter.info_format
+
+        elif record.levelno == logging.INFO and self._section_info:
+            self._style._fmt = DevOpsFormatter.section_format
+
+        elif record.levelno == logging.ERROR:
+            self._style._fmt = DevOpsFormatter.error_format
+
+        elif record.levelno == logging.WARNING:
+            self._style._fmt = DevOpsFormatter.warning_format
+
+        # Call the original formatter class to do the grunt work
+        result = logging.Formatter.format(self, record)
+
+        # Restore the original format configured by the user
+        self._style._fmt = format_orig
+
+        return result
+
+def configure_dev_ops_logger(logger: logging.Logger) -> None:
+    """
+    Configure logging for Azure DevOps
+    """
+    snf_logger = logging.getLogger('snowflake.connector.ocsp_snowflake')
+    snf_logger.disabled = True
+    section_formatter = DevOpsFormatter(section_info=True)
+    section_handler = logging.StreamHandler()
+    section_handler.setFormatter(section_formatter)
+    logger.addHandler(section_handler)
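
A sketch of the marker strings the formatter reacts to once a logger is configured with configure_dev_ops_logger (the logger name and messages are illustrative):

    import logging
    from icsDataValidation.utils.logger_util import configure_dev_ops_logger

    logger = logging.getLogger("Example")
    logger.setLevel(logging.INFO)
    configure_dev_ops_logger(logger)

    logger.info("START COMPARISON [ 'ORDERS' ]")     # prepends ##[group]COMPARISON [ 'ORDERS' ]
    logger.info("PROGRESS [ '3/10' ]")               # prepends ##vso[task.setprogress value=30;]script progress
    logger.info("FINISHED COMPARISON [ 'ORDERS' ]")  # appends ##[endgroup]
    logger.warning("row count mismatch")             # formatted as ##vso[task.logissue type=warning][Example] row count mismatch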
icsDataValidation/utils/pandas_util.py
@@ -0,0 +1,159 @@
+import numpy as np
+from decimal import Decimal
+
+
+
+def get_diff_dataframes(df_1, df_2, key_columns_1, key_columns_2):
+    """
+    Get the difference between two Pandas Dataframes by sorting over specific key-columns.
+    Returns the two dataframes containing only the rows with differences from the input dataframes and the sorted dataframes.
+    """
+    df_1_sorted = df_1.sort_values(by=key_columns_1).reset_index(drop=True)
+    df_2_sorted = df_2.sort_values(by=key_columns_2).reset_index(drop=True)
+
+    diff_1 = df_1_sorted[~df_1_sorted.apply(tuple,1).isin(df_2_sorted.apply(tuple,1))]
+    diff_2 = df_2_sorted[~df_2_sorted.apply(tuple,1).isin(df_1_sorted.apply(tuple,1))]
+
+    diff_1 = diff_1.reset_index(drop=True)
+    diff_2 = diff_2.reset_index(drop=True)
+
+    return diff_1, diff_2, df_1_sorted, df_2_sorted
+
+
+def get_diff_dict_from_diff_dataframes(diff_1, diff_2, key_columns_1, key_columns_2, key_column_values_with_mismatches, numeric_scale):
+    """
+    Get the dictionary of value-level differences between the two diff dataframes.
+    """
+    diff_dict = {}
+
+    #TODO support a list of key_columns_1 (and key_columns_2) and a dictionary of key_column_values_with_mismatches
+    key_column_1=key_columns_1[0]
+    key_column_2=key_columns_2[0]
+    key_column_values_with_mismatches=key_column_values_with_mismatches[key_column_1]
+
+    for value_with_mismatch in key_column_values_with_mismatches:
+
+        if value_with_mismatch is None:
+            row_1_with_mismatch = diff_1.loc[diff_1[key_column_1].isnull()]
+            row_2_with_mismatch = diff_2.loc[diff_2[key_column_2].isnull()]
+            value_with_mismatch = 'NULL'
+        else:
+            row_1_with_mismatch = diff_1.loc[diff_1[key_column_1] == value_with_mismatch]
+            row_2_with_mismatch = diff_2.loc[diff_2[key_column_2] == value_with_mismatch]
+            value_with_mismatch = str(value_with_mismatch)
+
+        diff_dict[value_with_mismatch] = {}
+
+        for column in row_1_with_mismatch:
+            if column == 'group_by_column' or column not in row_2_with_mismatch or column in key_columns_1:
+                continue
+
+            if row_1_with_mismatch[column].values.size > 0:
+                src_value=row_1_with_mismatch[column].values[0]
+            elif column=='COUNT_OF_GROUP_BY_VALUE':
+                src_value=0
+            else:
+                src_value= None
+
+            if row_2_with_mismatch[column].values.size > 0:
+                trgt_value=row_2_with_mismatch[column].values[0]
+            elif column=='COUNT_OF_GROUP_BY_VALUE':
+                trgt_value=0
+            else:
+                trgt_value= None
+
+            try:
+                src_value= src_value.item()
+            except Exception:
+                pass
+
+            try:
+                trgt_value= trgt_value.item()
+            except Exception:
+                pass
+
+            if src_value != trgt_value:
+                if src_value is None:
+                    diff_trgt_minus_src = trgt_value
+                elif trgt_value is None:
+                    if isinstance(src_value, str) or isinstance(trgt_value, str):
+                        diff_trgt_minus_src = f"{-int(src_value.split('_',1)[0])}_{-int(src_value.split('_',1)[1])}"
+                    else:
+                        diff_trgt_minus_src = -round(float(src_value), numeric_scale)
+                else:
+                    if isinstance(src_value, str) or isinstance(trgt_value, str):
+                        diff_trgt_minus_src = f"{int(trgt_value.split('_',1)[0])-int(src_value.split('_',1)[0])}_{int(trgt_value.split('_',1)[1])-int(src_value.split('_',1)[1])}"
+                    else:
+                        diff_trgt_minus_src = round(float(trgt_value)-float(src_value), numeric_scale)
+
+                if diff_trgt_minus_src:
+                    diff_dict[value_with_mismatch][column] = {
+                        "SRC_VALUE": src_value,
+                        "TRGT_VALUE": trgt_value,
+                        "DIFF_TRGT_MINUS_SRC": diff_trgt_minus_src
+                    }
+        if not diff_dict[value_with_mismatch]:
+            diff_dict.pop(value_with_mismatch)
+
+    if not diff_dict:
+        diff_dict = None
+
+    return diff_dict
+
+
+
+#########################################################################################################
+#TODO write as pytest
+# Test Space
+import pandas as pd
+
+# *** TEST 1 ***
+#df1 = pd.DataFrame({'group_by_column': [1, 2,3,6], 'A': [1, 1,9,2], 'B': [1, 3,9,2], 'C': [1, 4,9,2],'D': ['1_1', '1_1','8_3','1_1']})
+#df2 = pd.DataFrame({'group_by_column': [2, 1,3,5], 'A': [2, 1,9,1], 'B': [3, 1,5,1], 'C': [5, 1,9,1],'D': ['1_1', '1_1','5_5','1_1']})
+#key_column_values_with_mismatches={'group_by_column':[None,3,5,6]}
+#numeric_scale=2
+
+#########
+
+# *** TEST 2 ***
+#df1 = pd.DataFrame({'group_by_column': [1, None,3,6], 'A': [1, 1,9,2], 'B': [1, 3,9,2], 'C': [1, 4,9,2],'D': ['1_1', '1_1','8_3','1_1']})
+#df2 = pd.DataFrame({'group_by_column': [None, 1,3,5], 'A': [2, 1,9,1], 'B': [3, 1,5,1], 'C': [5, 1,9,1],'D': ['1_1', '1_1','5_5','1_1']})
+#key_column_values_with_mismatches={'group_by_column':[None,3,5,6]}
+#numeric_scale=2
+
+#########
+
+# *** TEST 3 ***
+#df1 = pd.DataFrame({'group_by_column': [1, 2,3,4], 'A': [1, 1,9,2.001], 'B': [1, 3,9,2.0004], 'C': [1, 4,9,2.00000000001],'D': ['1_1', '1_1','8_3','1_1']})
+#df2 = pd.DataFrame({'group_by_column': [1, 2,3,4], 'A': [2, 1,9,Decimal(2.001)], 'B': [3, 3,9,2.0005], 'C': [1, 4,9,2.00000000004],'D': ['1_1', '1_1','8_3','1_1']})
+#key_column_values_with_mismatches={'group_by_column':[1,2,3,4]}
+#numeric_scale=7
+#
+##########
+#
+#diff_1, diff_2, df_1_sorted, df_2_sorted =get_diff_dataframes(df1, df2, ['group_by_column'], ['group_by_column'])
+#diff_dict = get_diff_dict_from_diff_dataframes(df1, df2, ['group_by_column'], ['group_by_column'], key_column_values_with_mismatches, numeric_scale)
+#import json
+#import decimal
+#import numpy as np
+#
+#class CustomJSONEncoder(json.JSONEncoder):
+#    def default(self, o):
+#        if isinstance(o, decimal.Decimal):
+#            return str(o)
+#        if isinstance(o, np.integer):
+#            return int(o)
+#        if isinstance(o, np.floating):
+#            return float(o)
+#        if isinstance(o, np.ndarray):
+#            return o.tolist()
+#        try:
+#            super(CustomJSONEncoder, self).default(o)
+#        except:
+#            return str(o)
#
#        return super(CustomJSONEncoder, self).default(o)
+#
+#diff_json = json.dumps(diff_dict, indent=4, cls=CustomJSONEncoder)
+#
+#print(diff_json)